# Natural Language Processing with Disaster Tweets

## 1 Imports

In [1]:
!/anaconda/envs/azureml_py38_PT_TF/bin/python -m pip install -r requirements.txt



In [2]:
%matplotlib inline
import numpy as np
from numpy import argmax
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

from bs4 import BeautifulSoup
import re

from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.callbacks import EarlyStopping

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
import joblib


2024-06-12 11:23:52.588865: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-12 11:23:52.750801: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-12 11:23:52.755595: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-06-12 11:23:52.755625: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore 

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## 2 Data exploration / plots

Reading in train.csv & test.csv.

In [4]:
df_train = pd.read_csv('CSV/train.csv')
df_test = pd.read_csv('CSV/test.csv')

In [5]:
X_train = df_train['text']
y_train = df_train['target']

X_train, y_train = shuffle(X_train, y_train, random_state=42)
id = df_test['id']

In [6]:
X_test = df_test['text']

## 3 Text-preprocessing

In [7]:
def text_preprocessing(text, language, minWordSize):

    text_no_html = BeautifulSoup(text, "html.parser").get_text()

    text_no_url = re.sub(r'http\S+', ' ', text_no_html)

    text_no_at = ''
    for word in text_no_url.split():
        if not word.startswith('@'):  
            text_no_at = text_no_at + word + ' '
            
    text_no_RT = ''
    for word in text_no_at.split():
        if not word.startswith('RT'):  
            text_no_RT = text_no_RT + word + ' '         

    text_alpha_chars = re.sub("[^a-zA-Z']", " ", str(text_no_RT))

    text_lower = text_alpha_chars.lower()

    stops = set(stopwords.words(language)) 
    text_no_stop_words = ' '
    
    whitelist = ["n't", "not", "no"]
    for word in text_lower.split():
        if word not in stops or word in whitelist:  
            text_no_stop_words = text_no_stop_words + word + ' '

    text_stemmer = ' '
    stemmer = SnowballStemmer(language)
    for w in text_no_stop_words.split():
        text_stemmer = text_stemmer + stemmer.stem(w) + ' '

    text_no_short_words = ' '
    for w in text_stemmer.split(): 
        if len(w) >=minWordSize:
            text_no_short_words = text_no_short_words + w + ' '
 

    return text_no_short_words

In [8]:
language = 'english'
minWordLength = 2
text_prep = np.empty

for i in range(X_train.shape[0]):
    X_train.iloc[i] = text_preprocessing(X_train.iloc[i], language, minWordLength)
    
    
for i in range(X_test.shape[0]):
    X_test.iloc[i] = text_preprocessing(X_test.iloc[i], language, minWordLength)

  text_no_html = BeautifulSoup(text, "html.parser").get_text()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test.iloc[i] = text_preprocessing(X_test.iloc[i], language, minWordLength)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test.iloc[i] = text_preprocessing(X_test.iloc[i], language, minWordLength)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test.iloc[i] = text_preprocessing(X_test.iloc[i], language, minWordLength)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats

In [9]:
X_train

2644                  new weapon caus un imagin destruct 
2227     ing thing gishwh got soak delug go pad tampon...
5448       dt col polic catch pickpocket liverpool stree 
132      aftershock back school kick great want thank ...
6845     respons trauma children addict develop defens...
                              ...                        
5226                 mani obliter server alway like play 
5390     panic attack bc enough money drug alcohol wan...
860      omron hem automat blood pressur monitor stand...
7603     offici say quarantin place alabama home possi...
7270     move england five year ago today whirlwind time 
Name: text, Length: 7613, dtype: object

In [10]:
X_train_clean = X_train.copy()
X_test_clean = X_test.copy()

In [11]:
count_vect = CountVectorizer()
X_train_bag_of_words = count_vect.fit_transform(X_train)
X_test_bag_of_words = count_vect.transform(X_test)


tfidf_transformer = TfidfTransformer()
tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_bag_of_words)
X_train_tf = tf_transformer.transform(X_train_bag_of_words)
X_test_tf = tf_transformer.transform(X_test_bag_of_words)

## 4 Building Models

### 4.1 LSTM Model


In [12]:
model = LogisticRegression()
paramaters = [{'C' : [0.001, 0.01, 0.1, 1, 10, 100, 1000,10000, 100000]}                                       
             ]
                            
random_search = RandomizedSearchCV(estimator = model,  n_iter = 9, 
                           param_distributions = paramaters,
                           scoring = 'accuracy',
                           cv = 4,
                           n_jobs = -1)

random_search = random_search.fit(X_train_tf, y_train)

best_accuracy = random_search.best_score_ 
best_parameters = random_search.best_params_  

print('Best accuracy : ', random_search.best_score_)
print('Best parameters :', random_search.best_params_  )

Best accuracy :  0.8017863214650022
Best parameters : {'C': 1}


In [13]:
model = LogisticRegression()
paramaters = [{'C' : [0.9,0.91,0.92,0.93,0.94,0.95,0.96,0.97,0.98,0.99,1,1.01,1.02,1.03,1.04,1.05,1.06,1.07,1.08,1.09,1.1]}                                       
             ]
                            
grid_search = GridSearchCV(estimator = model, 
                           param_grid = paramaters,
                           scoring = 'accuracy',
                           cv = 4,
                           n_jobs = -1)

grid_search = grid_search.fit(X_train_tf, y_train)

best_accuracy = grid_search.best_score_ 
best_parameters = grid_search.best_params_  

print('Best accuracy : ', grid_search.best_score_)
print('Best parameters :', grid_search.best_params_  )

Best accuracy :  0.8021802980256738
Best parameters : {'C': 1.02}


In [14]:
max_fatures = 2000

tokenizer = Tokenizer(oov_token= True, num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(X_train_clean)
X_train_tok = tokenizer.texts_to_sequences(X_train_clean)
X_test_tok = tokenizer.texts_to_sequences(X_test_clean)

X_train_tok = pad_sequences(X_train_tok)
X_test_tok = pad_sequences(X_test_tok,maxlen=X_train_tok.shape[1])

In [15]:
X_train_tok.shape

(7613, 23)

In [16]:
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_train_class = to_categorical(y_train)

In [17]:
early_stopping =  EarlyStopping(patience=8,  restore_best_weights=True)

model = Sequential()
model.add(Embedding(input_dim=2000,output_dim =128, input_length = X_train_tok.shape[1]))
model.add(LSTM(100, return_sequences=True, dropout=0.2, recurrent_dropout=0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))

model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])

history = model.fit(X_train_tok, y_train_class, epochs = 100, batch_size=16, verbose = 2, validation_split= 0.2,
                   callbacks=[early_stopping])

2024-06-12 11:24:17.261031: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:267] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2024-06-12 11:24:17.261098: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (mlopsprojectcompute): /proc/driver/nvidia/version does not exist
2024-06-12 11:24:17.262079: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/100
381/381 - 28s - loss: 0.5054 - accuracy: 0.7542 - val_loss: 0.4750 - val_accuracy: 0.7971 - 28s/epoch - 74ms/step
Epoch 2/100
381/381 - 24s - loss: 0.3790 - accuracy: 0.8386 - val_loss: 0.4988 - val_accuracy: 0.7938 - 24s/epoch - 63ms/step
Epoch 3/100
381/381 - 25s - loss: 0.3408 - accuracy: 0.8565 - val_loss: 0.4710 - val_accuracy: 0.7958 - 25s/epoch - 66ms/step
Epoch 4/100
381/381 - 24s - loss: 0.2946 - accuracy: 0.8800 - val_loss: 0.5288 - val_accuracy: 0.7971 - 24s/epoch - 63ms/step
Epoch 5/100
381/381 - 24s - loss: 0.2406 - accuracy: 0.9007 - val_loss: 0.6277 - val_accuracy: 0.7728 - 24s/epoch - 64ms/step
Epoch 6/100
381/381 - 24s - loss: 0.1988 - accuracy: 0.9204 - val_loss: 0.7717 - val_accuracy: 0.7859 - 24s/epoch - 63ms/step
Epoch 7/100
381/381 - 23s - loss: 0.1688 - accuracy: 0.9266 - val_loss: 0.8502 - val_accuracy: 0.7623 - 23s/epoch - 61ms/step
Epoch 8/100
381/381 - 23s - loss: 0.1382 - accuracy: 0.9396 - val_loss: 1.1249 - val_accuracy: 0.7669 - 23s/epoch - 62

In [None]:
y_pred = model.predict(X_train_tok[int(len(X_train_tok)*0.8):])
y_pred = argmax(y_pred, axis=1)

y_true = argmax(y_train_class[int(len(y_train_class)*0.8):], axis=1)

print(f'Accuracy: {accuracy_score(y_true, y_pred)}')
print(f'Precision: {precision_score(y_true, y_pred)}')
print(f'Recall: {recall_score(y_true, y_pred)}')
print(f'F1 score: {f1_score(y_true, y_pred)}')

In [None]:
predictions_test = grid_search.predict(X_test_tf)

In [None]:
le = preprocessing.LabelEncoder()
le.fit(predictions_test)
predictions_test = le.transform(predictions_test)
predictions_test

In [None]:
df = pd.DataFrame({
    'id': id,
    'target': predictions_test
})

df['target'] = encoder.inverse_transform(df['target'])

print(df)

In [None]:
df.to_csv('CSV/predictionsLSTM.csv', index=False)

In [None]:
# Save tokenizer
joblib.dump(tokenizer, 'tokenizer.pkl')

# Save model
model.save('disaster_tweet_lstm_model.h5')
