# Disaster Tweet Prediction NLP

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn import model_selection
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

import re

%matplotlib inline

## Load the Disaster Tweets Dataset

We use the Disaster Tweets dataset from kaggle: <a href="https://www.kaggle.com/competitions/nlp-getting-started/data">https://www.kaggle.com/competitions/nlp-getting-started/data</a>

In [2]:
df = pd.read_csv("./nlp-getting-started/train.csv")
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


## Prepare the Dataset

In [3]:
df = df.drop(['id','keyword','location'], axis=1)
df.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    7613 non-null   object
 1   target  7613 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 119.1+ KB


In [5]:
# Check if there are missing values
df.isnull().sum()

text      0
target    0
dtype: int64

In [6]:
# Check the frequency of each class in the dataset
print("Number of positive labels in the dataset (target=1): ", len(df[df.target==1]))
print("Number of negative labels in the dataset (target=0): ", len(df[df.target==0]))

Number of positive labels in the dataset (target=1):  3271
Number of negative labels in the dataset (target=0):  4342


In [7]:
X = df['text']
y = df['target']

In [8]:
X

0       Our Deeds are the Reason of this #earthquake M...
1                  Forest fire near La Ronge Sask. Canada
2       All residents asked to 'shelter in place' are ...
3       13,000 people receive #wildfires evacuation or...
4       Just got sent this photo from Ruby #Alaska as ...
                              ...                        
7608    Two giant cranes holding a bridge collapse int...
7609    @aria_ahrary @TheTawniest The out of control w...
7610    M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...
7611    Police investigating after an e-bike collided ...
7612    The Latest: More Homes Razed by Northern Calif...
Name: text, Length: 7613, dtype: object

In [9]:
y

0       1
1       1
2       1
3       1
4       1
       ..
7608    1
7609    1
7610    1
7611    1
7612    1
Name: target, Length: 7613, dtype: int64

## Preprocess

In [10]:
def expand_contractions(text):
    text = re.sub(r"won\'t", "will not", text)
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    return text

In [11]:
def clean_text(df):
    
    len_df = df.shape[0]
    
    for i in range(len_df):
        
        # lowercase letters
        df[i] = df[i].lower()
        
        # expand contractions
        df[i] = expand_contractions(df[i])
        
        # eliminate links
        df[i] = re.sub(r'http\S+', ' ', df[i])
        
        # eliminate special characters
        df[i] = re.sub(r'[^a-z]+',' ', df[i])
        
    return df

In [12]:
X = clean_text(X)
X

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[i] = df[i].lower()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[i] = expand_contractions(df[i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[i] = re.sub(r'http\S+', ' ', df[i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[i] = re.sub(r'[^a-z]+',' ', df[i])


0       our deeds are the reason of this earthquake ma...
1                   forest fire near la ronge sask canada
2       all residents asked to ishelter in place are b...
3        people receive wildfires evacuation orders in...
4       just got sent this photo from ruby alaska as s...
                              ...                        
7608    two giant cranes holding a bridge collapse int...
7609     aria ahrary thetawniest the out of control wi...
7610                        m utc km s of volcano hawaii 
7611    police investigating after an e bike collided ...
7612    the latest more homes razed by northern califo...
Name: text, Length: 7613, dtype: object

## Pipeline Building

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## 1. Multinomial Naive Bayes Classifier

In [14]:
# Multinomial Naive Bayes
text_clf_mnb = Pipeline([('cv', TfidfVectorizer()),
                     ('mnb', MultinomialNB())])

text_clf_mnb.fit(X_train, y_train)

y_pred_mnb = text_clf_mnb.predict(X_test)
f1_mnb = f1_score(y_pred_mnb, y_test)
print("Accuracy Multinomial Naive Bayes: ", np.mean(y_pred_mnb == y_test))
print("F1 Score Multinomial Naive Bayes: ", f1_mnb)

Accuracy Multinomial Naive Bayes:  0.7859487852921865
F1 Score Multinomial Naive Bayes:  0.72787979966611


In [15]:
print(classification_report(y_pred_mnb, y_test))

              precision    recall  f1-score   support

           0       0.93      0.74      0.82      1026
           1       0.62      0.88      0.73       497

    accuracy                           0.79      1523
   macro avg       0.77      0.81      0.78      1523
weighted avg       0.83      0.79      0.79      1523



In [16]:
# Cross Validation
scores = model_selection.cross_val_score(text_clf_mnb, X, y, cv=3, scoring='f1')
print("Scores Multinomial Naive Bayes: ", scores)
print("Mean F1 Scores Multinomial Naive Bayes: ", np.mean(scores))

Scores Multinomial Naive Bayes:  [0.64745011 0.61183866 0.72472472]
Mean F1 Scores Multinomial Naive Bayes:  0.6613378315244103


## 2. Logistic Regression  Classifier

In [17]:
# Logistic Regression
text_clf_lr = Pipeline([('cv', TfidfVectorizer()),
                     ('lr', LogisticRegression())])

text_clf_lr.fit(X_train, y_train)

y_pred_lr = text_clf_lr.predict(X_test)
f1_lr = f1_score(y_pred_lr, y_test)

print("Accuracy Logistic Regression: ", np.mean(y_pred_lr == y_test))

print("F1 Score Logistic Regression: ", f1_lr)

Accuracy Logistic Regression:  0.7925147734734077
F1 Score Logistic Regression:  0.7463884430176565


In [18]:
print(classification_report(y_pred_lr, y_test))

              precision    recall  f1-score   support

           0       0.90      0.76      0.82       978
           1       0.66      0.85      0.75       545

    accuracy                           0.79      1523
   macro avg       0.78      0.81      0.79      1523
weighted avg       0.82      0.79      0.80      1523



In [19]:
scores_lr = model_selection.cross_val_score(text_clf_lr, X, y, cv=3, scoring='f1')
print("Scores Logistic Regression: ", scores_lr)
print("Mean F1 Scores Logistic Regression: ", np.mean(scores_lr))

Scores Logistic Regression:  [0.66155531 0.62033195 0.70128586]
Mean F1 Scores Logistic Regression:  0.6610577059845723


## 3. Support Vector Classifier

In [20]:
svc = SVC()

text_clf_svc = Pipeline([("tfidf", TfidfVectorizer()), 
                         ("clf", SVC())])
text_clf_svc.fit(X_train, y_train)

y_pred_svc = text_clf_svc.predict(X_test)
f1_svc = f1_score(y_pred_svc, y_test)

print("Accuracy SVC: ", np.mean(y_pred_svc == y_test))

print("F1 Score SVC: ", f1_svc)

Accuracy SVC:  0.7944845699277742
F1 Score SVC:  0.746147607461476


In [22]:
print(classification_report(y_pred_svc, y_test))

              precision    recall  f1-score   support

           0       0.92      0.78      0.85      1029
           1       0.65      0.86      0.74       494

    accuracy                           0.81      1523
   macro avg       0.79      0.82      0.79      1523
weighted avg       0.83      0.81      0.81      1523



In [23]:
scores_svc = model_selection.cross_val_score(text_clf_svc, X, y, cv=3, scoring='f1')
print("Scores SVC: ", scores_svc)
print("Mean F1 Scores SVC: ", np.mean(scores_svc))

Scores SVC:  [0.63352601 0.59581498 0.68500259]
Mean F1 Scores SVC:  0.6381145280803534


## Select Best Model

In [21]:
f1_scores = {"mnb": f1_mnb, "lr": f1_lr, "svc": f1_svc}
print(sorted(f1_scores.items(), key = lambda x: x[1], reverse=True))

[('lr', 0.7463884430176565), ('svc', 0.746147607461476), ('mnb', 0.72787979966611)]


In [22]:
# test prediction of one data point
test_0 = "just got sent this photo from ruby alaska"
test_1 = "forest fire near la ronge sask canada"

print(text_clf_lr.predict([test_0]))
print(text_clf_lr.predict([test_1]))

[0]
[1]


## Pickle The Model 

In [23]:
import pickle

In [34]:
pickle.dump(text_clf_lr, open('text_clf_model.pkl', 'wb'))

In [36]:
pickled_model = pickle.load(open('text_clf_model.pkl', 'rb'))

In [49]:
print(pickled_model.predict([test_0]))
print(pickled_model.predict([test_1]))

[0]
[1]
