# NLP with Disaster Tweets

In [1]:

import re, string, os

import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Flatten, concatenate, Dropout 
from tensorflow.keras import Model
import tensorflow_hub as hub


from symspellpy import SymSpell, Verbosity

import spacy 

from sentence_transformers import SentenceTransformer 

from sklearn.metrics.pairwise import cosine_similarity 
from sklearn.utils import shuffle

import matplotlib.pyplot as plt


  from .autonotebook import tqdm as notebook_tqdm


## Importing data

In [2]:
train = pd.read_csv('nlp-getting-started/train.csv')
test = pd.read_csv('nlp-getting-started/test.csv')


## Preprocessing

### Lowercasing

In [7]:
train[ "keyword"] = train[ "keyword" ].apply(lambda x: str. lower(x) if pd.isna(x) != True else x)
train[ "location"] = train["location"].apply(lambda x: str.lower(x) if pd.isna(x) != True else x)
train[ "text"] = train["text"].apply(lambda x: str. lower (x) )

test[ "keyword" ] = test["keyword"].apply(lambda x: str. lower(x) if pd.isna(x) != True else x)
test[ "location"] = test["location"].apply(lambda x: str. lower(x) if pd.isna(x) != True else x)
test["text"] = test["text"].apply(lambda x: str. lower (x))

### Entities, URL, Links and Punctuation Removal

In [8]:
def remove_entities (text) :
    entity_prefixes = ['@', '#']
    for separator in string.punctuation:
        if separator not in entity_prefixes :
            text = text. replace (separator,' ')
    words = []
    for word in text.split():
        word = word.strip()
        if word:
            if word[0] not in entity_prefixes:
                words. append (word)
    return ''. join (words)


this_lambda = lambda x: remove_entities(x) if pd.isna(x) != True else x

train["keyword"] = train[ "keyword" ].apply(this_lambda) 
train["location"] = train[ "location" ].apply(this_lambda)

test["keyword"] = test ["keyword"].apply(this_lambda)
test["location"] = test["location"].apply(this_lambda)


this_lambda = lambda x: remove_entities(x)

train["text"] = train["text"].apply(this_lambda)

test["text"] = test["text"].apply(this_lambda)

In [9]:
# remove URL links

this_lambda = lambda x: re.sub(r"(?:\@|http?\://|https?\://|www)\S+",' ', x) if pd.isna (x) != True else x

train["keyword"] = train["keyword"].apply(this_lambda)
train["location"] = train["location"].apply(this_lambda)

test["keyword"] = test ["keyword"].apply (this_lambda)
test["location"] = test["location"].apply(this_lambda)


this_lambda = lambda x: re.sub(r"(?:\@|http?\://|https?\://|www)\S+",' ', x)

train[ "text"] = train[ "text"].apply(this_lambda)

test[ "text"] = test[ "text"].apply(this_lambda)



In [10]:
# remove punctuation

this_lambda = lambda x: re.sub(r'[^\w\s]',' ', x) if pd.isna(x) != True else x

train["keyword"] = train["keyword"].apply(this_lambda)
train["location"] = train["location"].apply(this_lambda)

test["keyword"] = test["keyword"].apply(this_lambda)
test["location"] = test["location"].apply(this_lambda)


this_lambda = lambda x: re.sub(r'[^\w\s]',' ', x)

train[ "text"] = train[ "text"].apply(this_lambda)

test[ "text"] = test[ "text"].apply(this_lambda)


### Spelling Correction

In [11]:
sym_spell = SymSpell ()

dictionary_path = "./frequency_dictionary_en_82_765.txt"

sym_spell.load_dictionary(dictionary_path, 0, 1)

def spelling_correction(sent):
    doc_w_correct_spelling=[]
    
    for tok in sent.split(" "):
        x = sym_spell.lookup(tok, Verbosity.CLOSEST, max_edit_distance=2, include_unknown=True)[0].__str__()
        y = x.split(',')[0]
        doc_w_correct_spelling.append(y)
    
    return " ".join(doc_w_correct_spelling)


this_lambda = lambda x: spelling_correction(x) if pd.isna(x) != True else x

train ["keyword"] = train["keyword"].apply(this_lambda)
train["location"] = train["location"].apply(this_lambda)

test ["keyword"] = test["keyword"].apply(this_lambda)
test["location"] = test["location"] .apply(this_lambda)


this_lambda = lambda x: spelling_correction (x)

train[ "text"] = train["text"].apply(this_lambda)

test[ "text"] = test["text"].apply(this_lambda)

2024-03-05 17:58:49,730: E symspellpy.symspellpy] Dictionary file not found at frequency_dictionary_en_82_765.txt.


### Filling Missing Data
#### Keyword Extraction

In [12]:
os. system('python -m spacy download en')
nlp = spacy.load("en_core_web_sm")


[38;5;3m⚠ As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the
full pipeline package name 'en_core_web_sm' instead.[0m
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [13]:
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

def extract_keywords (nlp=nlp, doc="", no_of_keywords=5, model=model) :
    
    doc = doc.lower ( )
    
    doc = re.sub(r'(?:\@|http?\://|https?\://|www)\S+',' ', doc)
    doc = re.sub(r'[^\w\s]',' ', doc)
    doc = re.sub(' \d+', ' ', doc)
    
    doc_ = nlp (doc)
    
    # costum list of parts-of-speech (pos) tags we are interested in
    pos_tag = [ 'VERB', 'NOUN', 'ADJ', 'PROPN' ]
    result = []
    
    # if the token pos tag matches one of the pos_tag, then add the text form of the token to result list
    for token in doc_: 
        if (token.pos_ in pos_tag) :
            result. append (token.text)
            
    doc_embedding = model.encode([doc])
    results_embeddings = model.encode(result)
    
    # calculate the similarity between document and results embeddings
    distances = cosine_similarity(doc_embedding, results_embeddings)
    
    # get the top similar keywords
    keywords = [result[index] for index in distances.argsort()[0][-no_of_keywords:]]
    
    return keywords

In [14]:
# fill the empty entries in the train keyword column
for i in range(len(train["keyword"])):
    if pd.isnull(train['keyword'].iloc[i]) :
        try:
            train['keyword'].iloc[i] = extract_keywords(nlp=nlp, doc=train.text.iloc[i], no_of_keywords=1, model=model)[0]
        except:
            train['keyword'].iloc[i] = "NaN" # incase of an unexpected error add "NaN"

# fill the empty entries in the train keyword column
for i in range(len(test ["keyword" ])) :
    if pd.isnull(test[ 'keyword' ].iloc[i]):
        try:
            test['keyword'].iloc[i] = extract_keywords(nlp=nlp, doc=test.text.iloc[i], no_of__keywords=1, model=model)[0]
        except:
            test['keyword'].iloc[i] = "NaN" # incase of an unexpected error add "NaN"

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  train['keyword'].iloc[i] = extract_keywords(nlp=nlp, doc=train.text.iloc[i], no_of_keywords=1, model=model)[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide

#### Entity Recognition
Check the entities labels
If they are Geopolitical Entities (GPE) - i.e. country, city, state or Organizations (ORG)
Append them to the location string

In [15]:
def get_location (nlp=nlp, doc=""):
    doc_ = nlp(doc)
    location = ""

    for ent in doc_.ents:
        if ent.label_ in [ "GPE", "ORG"]:
            location = location + ent.text + " "

    return location


In [16]:

# fill the empty entries in the train location column
for i in range(len(train["location"])) :
    if pd. isnull(train['location'].iloc[i]):
        try:
            train['location'].iloc[i] = get_location(nlp=nlp, doc=train. text. iloc[i])
        except:
            test['location'].iloc[i] = "NaN" # incase of an unexpected error add "NaN"

# fill the empty entries in the test location column
for i in range(len(test["location"])) :
    if pd. isnull(test['location'].iloc[i]):
        try:
            test['location'].iloc[i] = get_location(nlp=nlp, doc=test.text.iloc[i])
        except:
            test['location'].iloc[i] = "NaN" # incase of an unexpected error add "NaN"

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  train['location'].iloc[i] = get_location(nlp=nlp, doc=train. text. iloc[i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ver

### Lemmatization

In [17]:
def lemmatize (sentence) :
    doc = nlp(sentence) # tokenize the text and produce a Doc Object
    lemmas = [token.lemma_ for token in doc]
    return " ".join(lemmas)


this_lambda = lambda x: lemmatize(x) if pd.isna(x) != True else x

train["keyword"] = train["keyword"].apply(this_lambda)
train["location"] = train["location"].apply(this_lambda)

test["keyword"] = test["keyword"].apply(this_lambda)
test["location"] = test["location"].apply(this_lambda)


this_lambda = lambda x: lemmatize(x)

train["text"] = train["text"].apply(this_lambda)

test["text"] = test["text"].apply(this_lambda)


### Stop Words Removal

In [18]:
def remove_stopwords (sentence) :
    doc = nlp(sentence) # tokenize the text and produce a Doc Object
    all_stopwords = nlp.Defaults.stop_words
    doc_tokens = [token.text for token in doc]
    tokens_without_sw = [word for word in doc_tokens if not word in all_stopwords]
    return " ".join(tokens_without_sw)

this_lambda = lambda x: remove_stopwords(x) if pd.isna(x) != True else x

train["keyword"] = train["keyword"].apply(this_lambda)
train["location"] = train["location"].apply(this_lambda)

test["keyword"] = test["keyword"].apply (this_lambda)
test["location"] = test["location"].apply(this_lambda)


this_lambda = lambda x: remove_stopwords (x) 

train["text"] = train["text"].apply(this_lambda)

test["text"] = test["text"].apply(this_lambda)

## Model

In [19]:
model = "https://tfhub.dev/google/nnlm-en-dim50/2"
embed = hub.load(model)

2024-03-05 18:01:41.715107: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2024-03-05 18:01:41.715151: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-03-05 18:01:41.715154: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-03-05 18:01:41.715450: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-03-05 18:01:41.716086: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2024-03-05 18:01:42.116658: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


In [20]:

train = shuffle(train, random_state=42).reset_index(drop=True) # shuffle the train data
y = np.array(train["target"].tolist()) # convert the target column into a numpy array

key_embed = embed(train.keyword.to_list()) # keyword embeddings
loc_embed = embed(train. location.to_list()) # location embeddings
text_embed = embed(train.text.to_list()) # text embeddings

In [21]:

keyword_input = Input(shape=(key_embed.shape[1],) )
location_input = Input(shape= (loc_embed.shape [1], ))
text_input = Input(shape=(text_embed.shape[1], ))


# create keyword model
key_model = Flatten()(keyword_input)
key_model = Dense(1024, activation='relu')(key_model)
key_model = Dropout(0.5)(key_model)


# create location model
loc_model = Flatten()(location_input)
loc_model = Dense(1024, activation='relu')(loc_model)
loc_model = Dropout(0.5)(loc_model)

# create text model
text_model = Flatten()(text_input)
text_model = Dense(1024, activation='relu')(text_model)
text_model = Dropout(0.5)(text_model)

# concatenate the three models to create the final model
merged = concatenate([  key_model,
                        loc_model,
                        text_model], axis=1)

merged = Dense(1024, activation='relu')(merged)
merged = Dropout(0.5)(merged)

final = Dense(1, activation = 'sigmoid') (merged)
final = Model(inputs = [keyword_input, location_input, text_input], outputs=final) 

In [22]:
lr = 0.1 # learning rate
epochs = 100 # number of epochs

decay_rate = 0.1  # Decay rate per step
decay_steps = 100  # Number of steps for one decay

#opt = tf.keras.optimizers.SGD(lr = lr, momentum = 0.8, decay = lr/epochs) # optimizer  # deprecated
opt = tf.keras.optimizers.SGD(learning_rate = tf.keras.optimizers.schedules.ExponentialDecay(
                                                            initial_learning_rate=lr,
                                                            decay_steps=decay_steps,
                                                            decay_rate=decay_rate,
                                                            staircase=True  # Optionally set to False for continuous decay
                                                        ),  
                              momentum = 0.8)

final.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

earlystop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', 
    mode = 'min', 
    patience=100, 
    verbose = 1
)

history = final.fit(
    x = [key_embed, loc_embed, text_embed],
    y = y,
    batch_size = 32,
    epochs = epochs,
    validation_split = 0.1,
    callbacks=[earlystop]
)



Epoch 1/100


2024-03-05 18:01:43.465210: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node SGD/AssignVariableOp.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

In [23]:

test = shuffle(test, random_state=42).reset_index(drop=True) # shuffle the train data
y = np.array(train["target"].tolist()) # convert the target column into a numpy array

new_key_embed = embed(test.keyword.to_list()) # keyword embeddings
new_loc_embed = embed(test.location.to_list()) # location embeddings
new_text_embed = embed(test.text.to_list()) # text embeddings

new_data = [new_key_embed, new_loc_embed, new_text_embed]

# Make predictions
predictions = final.predict(new_data)

# Interpret predictions (assuming binary classification with sigmoid activation)
if predictions[0][0] > 0.5:  # Threshold of 0.5 is a common choice
    print("Predicted class: Positive")
else:
    print("Predicted class: Negative")

Predicted class: Positive


In [24]:
predictions

array([[0.6035562 ],
       [0.22574411],
       [0.5452347 ],
       ...,
       [0.20711838],
       [0.54979354],
       [0.48183706]], dtype=float32)

In [25]:
np.shape(predictions)

(3263, 1)

In [26]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,3796,destruction,,soyouhaveanewweaponthatcancauseunimaginabledes...,1
1,3185,deluge,,thefampthingsidoforjustgotsoakedinadelugegoing...,0
2,7769,police,uk,dtrt ûïthecolpolicecancatchapickpocketinliverp...,1
3,191,aftershock,,aftershockbacktoschoolkickoffwasgreatiwanttoth...,0
4,9810,trauma,montgomerycountymd,inresponsetotraumachildrenofaddictsdevelopadef...,0


In [27]:
test.head()

Unnamed: 0,id,keyword,location,text
0,8051,refugee,,refugeesascitizensthehinduhttptcogjsaaf3u6k
1,425,apocalypse,currentlysomewhereonearth,honestlyhecouldsayanapocalypseiscomingandiwoul...
2,1330,blown20up,scoutteam,ifyouboredasshitdontnobodyfuckwityoubutwhenyou...
3,663,attack,,yesterdayihadaheatattackandwhatsfunnyourrelati...
4,2930,danger,leed,thedevilwearspradaisstilloneofmyfavouritefilm


In [28]:
test.shape

(3263, 4)

In [29]:
test['target'] = predictions

In [30]:
test.target = test['target'].apply(lambda x: 1 if x >= 0.5 else 0)

In [32]:
subm = test.drop(['keyword', 'location', 'text'], axis = 1)

In [33]:
subm.to_csv('submission.csv', index = False)