# Disaster Tweets

Initial plan is to use this notebook for modeling of preprocessed data.

In [1]:
# imports

# data
import pandas as pd
import numpy as np

# modeling
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import f1_score, make_scorer
from sklearn.naive_bayes import MultinomialNB
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV, StratifiedKFold, RandomizedSearchCV
from sklearn import svm

import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub

import tokenization
from nltk.corpus import stopwords

import pickle

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Read the cleaned training data (the test set is loaded in a later cell).
# The path is relative to the notebook, matching the '../data/' paths used for
# the output files elsewhere in this notebook, so it is not tied to one machine.
train_df = pd.read_csv('../data/cleaned_train.csv')

In [3]:
# Preview the first rows of the cleaned training data.
train_df.head()

Unnamed: 0,id,keyword,text,target
0,1,nokeyword,our deeds are the reason of this earthquake ma...,1
1,4,nokeyword,forest fire near la ronge sask canada,1
2,5,nokeyword,all residents asked to shelter in place are be...,1
3,6,nokeyword,people receive wildfires evacuation orders in ...,1
4,7,nokeyword,just got sent this photo from ruby alaska as s...,1


In [4]:
# Read the stemmed variant of the training data.
# Relative path, matching the '../data/' convention used for output files in this notebook.
stemmed_train_df = pd.read_csv('../data/stemmed_train.csv')

In [5]:
# Preview the first rows of the stemmed training data.
stemmed_train_df.head()

Unnamed: 0,id,keyword,text,target
0,1,nokeyword,our deed are the reason of thi earthquak may a...,1
1,4,nokeyword,forest fire near la rong sask canada,1
2,5,nokeyword,all resid ask to shelter in place are be notif...,1
3,6,nokeyword,peopl receiv wildfir evacu order in california,1
4,7,nokeyword,just got sent thi photo from rubi alaska as sm...,1


In [6]:
# Read the lemmatized variant of the training data.
# Relative path, matching the '../data/' convention used for output files in this notebook.
lemmatized_train_df = pd.read_csv('../data/lemmatized_train.csv')

In [7]:
# Preview the first rows of the lemmatized training data.
lemmatized_train_df.head()

Unnamed: 0,id,keyword,text,target
0,1,nokeyword,our deed are the reason of this earthquake may...,1
1,4,nokeyword,forest fire near la ronge sask canada,1
2,5,nokeyword,all resident asked to shelter in place are bei...,1
3,6,nokeyword,people receive wildfire evacuation order in ca...,1
4,7,nokeyword,just got sent this photo from ruby alaska a sm...,1


In [8]:
# Read the cleaned test data (no target column; used only for the final predictions).
# Relative path, matching the '../data/' convention used for output files in this notebook.
test_df = pd.read_csv('../data/cleaned_test.csv')

In [9]:
# Preview the first rows of the cleaned test data.
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,nokeyword,nolocation,just happened a terrible car crash
1,2,nokeyword,nolocation,heard about earthquake is different city stay ...
2,3,nokeyword,nolocation,there is a forest fire at spot pond goose are ...
3,9,nokeyword,nolocation,apocalypse lighting spokane wildfire
4,11,nokeyword,nolocation,typhoon soudelor kill in china and taiwan


#### How good does my model have to be to outperform the naive approach (i.e., no tweet is about a disaster)?

In [10]:
# Share of each target class in the training labels. The class-0 share is the
# accuracy that a "no tweet is about a disaster" predictor would reach.
p_classes = dict(train_df['target'].value_counts(normalize=True))
naive_approach = p_classes[0]
rounded_naive = np.round(naive_approach, decimals = 4)
print('Class probabilities: ', p_classes,
      '\nChance tweet is not about a real disaster: ', rounded_naive)

Class probabilities:  {0: 0.5737136763529725, 1: 0.42628632364702745} 
Chance tweet is not about a real disaster:  0.5737


#### Set up a DataFrame to hold scoring information, for final model selection.

In [11]:
# Empty results table; each experiment below adds one row to it.
scoring_columns = ['Model', 'Vectorizer', 'Text_Treatment', 'Mean_F1_Score', 'F1_Std_Dev']
scoring_df = pd.DataFrame(columns=scoring_columns)

### Bag-of-words features using sklearn CountVectorizer

First set of experiments will include stop words.

In [12]:
# Binary (presence/absence) features over unigrams and bigrams; stop words are kept.
count_vectorizer = CountVectorizer(
    strip_accents='unicode',
    ngram_range=(1, 2),
    binary=True,
)

In [13]:
# Fit the vectorizer on the cleaned text and encode it.
# Note: despite the `_df` suffix, this holds the matrix returned by fit_transform.
train_vector_df = count_vectorizer.fit_transform(train_df['text'])

In [14]:
# Inspect the dimensions of the vectorized cleaned text.
train_vector_df.shape

(7502, 70574)

In [15]:
# Refit the same vectorizer object on the stemmed text; this replaces the
# vocabulary learned from the previous fit.
train_vector_stemmed_df = count_vectorizer.fit_transform(stemmed_train_df['text'])

In [16]:
# Inspect the dimensions of the vectorized stemmed text.
train_vector_stemmed_df.shape

(7502, 65602)

In [17]:
# Refit the same vectorizer object on the lemmatized text; this replaces the
# vocabulary learned from the previous fit.
train_vector_lemma_df = count_vectorizer.fit_transform(lemmatized_train_df['text'])

In [18]:
# Inspect the dimensions of the vectorized lemmatized text.
train_vector_lemma_df.shape

(7502, 68135)

Second set of experiments will remove stop words, to see if that improves performance.

In [19]:
# Same binary unigram/bigram setup as before, now dropping NLTK's English stop words.
# This rebinds `count_vectorizer`, replacing the stop-word-retaining instance.
count_vectorizer = CountVectorizer(
    strip_accents='unicode',
    stop_words=stopwords.words('english'),
    ngram_range=(1, 2),
    binary=True,
)

In [20]:
# Fit the stop-word-removing vectorizer on the cleaned text and encode it.
train_vector_no_stops_df = count_vectorizer.fit_transform(train_df['text'])

In [21]:
# Inspect the dimensions of the vectorized cleaned text (stop words removed).
train_vector_no_stops_df.shape

(7502, 57913)

In [22]:
# Refit the stop-word-removing vectorizer on the stemmed text (overwrites the previous fit).
train_vector_stemmed_no_stops_df = count_vectorizer.fit_transform(stemmed_train_df['text'])

In [23]:
# Inspect the dimensions of the vectorized stemmed text (stop words removed).
train_vector_stemmed_no_stops_df.shape

(7502, 55077)

In [24]:
# Refit the stop-word-removing vectorizer on the lemmatized text (overwrites the previous fit).
train_vector_lemma_no_stops_df = count_vectorizer.fit_transform(lemmatized_train_df['text'])

In [25]:
# Inspect the dimensions of the vectorized lemmatized text (stop words removed).
train_vector_lemma_no_stops_df.shape

(7502, 56481)

#### Logistic Regression on CountVectorizer treated training data

In [26]:
# basic LogReg
# 5-fold cross-validated F1 for class-balanced logistic regression on the
# CountVectorizer features of the cleaned text.
clf_logreg = LogisticRegression(class_weight='balanced')

scores = model_selection.cross_val_score(clf_logreg,
                                         train_vector_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")

mean_score = scores.mean()
stability = scores.std()  # fold-to-fold spread of the F1 score
print(mean_score, '+/-', stability)

# Log the result. pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
score_row = pd.DataFrame({'Model': ['LogisticRegression'],
                          'Vectorizer': ['CountVectorizer'],
                          'Text_Treatment': ['None'],
                          'Mean_F1_Score': [mean_score],
                          'F1_Std_Dev': [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index=True)

0.6290542858178572 +/- 0.05930569871502275


In [27]:
# LogRegCV
# 5-fold cross-validated F1 for class-balanced LogisticRegressionCV on the
# CountVectorizer features of the cleaned text.
clf_logreg_cv = LogisticRegressionCV(class_weight='balanced')

scores = model_selection.cross_val_score(clf_logreg_cv,
                                         train_vector_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")

mean_score = scores.mean()
stability = scores.std()  # fold-to-fold spread of the F1 score
print(mean_score, '+/-', stability)

# Log the result. pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
score_row = pd.DataFrame({'Model': ['LogisticRegressionCV'],
                          'Vectorizer': ['CountVectorizer'],
                          'Text_Treatment': ['None'],
                          'Mean_F1_Score': [mean_score],
                          'F1_Std_Dev': [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index=True)

0.6382107580107526 +/- 0.05755892959272567


In [28]:
# LogReg, stemmed
# 5-fold cross-validated F1 on the CountVectorizer features of the stemmed text.
# NOTE(review): labels come from train_df; assumes its row order matches the stemmed data - confirm.
clf = LogisticRegression(class_weight='balanced')
scores = model_selection.cross_val_score(clf,
                                         train_vector_stemmed_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()  # fold-to-fold spread of the F1 score
print(mean_score, '+/-', stability)

# Log the result. pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
score_row = pd.DataFrame({'Model': ['LogisticRegression'],
                          'Vectorizer': ['CountVectorizer'],
                          'Text_Treatment': ['Stemmed'],
                          'Mean_F1_Score': [mean_score],
                          'F1_Std_Dev': [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index=True)

0.6282724542006161 +/- 0.05454954304699235


In [29]:
# LogReg, lemmatized
# 5-fold cross-validated F1 on the CountVectorizer features of the lemmatized text.
# NOTE(review): labels come from train_df; assumes its row order matches the lemmatized data - confirm.
clf = LogisticRegression(class_weight='balanced')
scores = model_selection.cross_val_score(clf,
                                         train_vector_lemma_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()  # fold-to-fold spread of the F1 score
print(mean_score, '+/-', stability)

# Log the result. pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
score_row = pd.DataFrame({'Model': ['LogisticRegression'],
                          'Vectorizer': ['CountVectorizer'],
                          'Text_Treatment': ['Lemmatized'],
                          'Mean_F1_Score': [mean_score],
                          'F1_Std_Dev': [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index=True)

0.6377161392519037 +/- 0.054316053736676334


In [30]:
# LogReg, no stop words
# 5-fold cross-validated F1 on the CountVectorizer features of the cleaned text, stop words removed.
clf = LogisticRegression(class_weight='balanced')
scores = model_selection.cross_val_score(clf,
                                         train_vector_no_stops_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()  # fold-to-fold spread of the F1 score
print(mean_score, '+/-', stability)

# Log the result. pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
score_row = pd.DataFrame({'Model': ['LogisticRegression'],
                          'Vectorizer': ['CountVectorizer'],
                          'Text_Treatment': ['Removed stopwords'],
                          'Mean_F1_Score': [mean_score],
                          'F1_Std_Dev': [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index=True)

0.5804990476888585 +/- 0.07410121362572519


In [31]:
# LogReg, stemmed, no stop words
# 5-fold cross-validated F1 on the CountVectorizer features of the stemmed text, stop words removed.
# NOTE(review): labels come from train_df; assumes its row order matches the stemmed data - confirm.
clf = LogisticRegression(class_weight='balanced')
scores = model_selection.cross_val_score(clf,
                                         train_vector_stemmed_no_stops_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()  # fold-to-fold spread of the F1 score
print(mean_score, '+/-', stability)

# Log the result. pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
score_row = pd.DataFrame({'Model': ['LogisticRegression'],
                          'Vectorizer': ['CountVectorizer'],
                          'Text_Treatment': ['Removed stopwords, stemmed'],
                          'Mean_F1_Score': [mean_score],
                          'F1_Std_Dev': [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index=True)

0.5907999024775348 +/- 0.06233658166058654


In [32]:
# LogReg, lemmatized, no stop words
# 5-fold cross-validated F1 on the CountVectorizer features of the lemmatized text, stop words removed.
# NOTE(review): labels come from train_df; assumes its row order matches the lemmatized data - confirm.
clf = LogisticRegression(class_weight='balanced')
scores = model_selection.cross_val_score(clf,
                                         train_vector_lemma_no_stops_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()  # fold-to-fold spread of the F1 score
print(mean_score, '+/-', stability)

# Log the result. pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
score_row = pd.DataFrame({'Model': ['LogisticRegression'],
                          'Vectorizer': ['CountVectorizer'],
                          'Text_Treatment': ['Removed stopwords, lemmatized'],
                          'Mean_F1_Score': [mean_score],
                          'F1_Std_Dev': [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index=True)

0.5888661506439731 +/- 0.06521271412619133


#### Multinomial Naive Bayes on CountVectorizer treated training data

In [33]:
# basic Multinomial Bayes
# 5-fold cross-validated F1 for MultinomialNB on the CountVectorizer features of the cleaned text.
clf = MultinomialNB()
scores = model_selection.cross_val_score(clf,
                                         train_vector_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()  # fold-to-fold spread of the F1 score
print(mean_score, '+/-', stability)

# Log the result. pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
score_row = pd.DataFrame({'Model': ['MultinomialNB'],
                          'Vectorizer': ['CountVectorizer'],
                          'Text_Treatment': ['None'],
                          'Mean_F1_Score': [mean_score],
                          'F1_Std_Dev': [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index=True)

0.6749567532885051 +/- 0.0388777822155517


In [34]:
# Multinomial Bayes, stemmed
# 5-fold cross-validated F1 on the CountVectorizer features of the stemmed text.
# NOTE(review): labels come from train_df; assumes its row order matches the stemmed data - confirm.
clf = MultinomialNB()
scores = model_selection.cross_val_score(clf,
                                         train_vector_stemmed_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()  # fold-to-fold spread of the F1 score
print(mean_score, '+/-', stability)

# Log the result. pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
score_row = pd.DataFrame({'Model': ['MultinomialNB'],
                          'Vectorizer': ['CountVectorizer'],
                          'Text_Treatment': ['Stemmed'],
                          'Mean_F1_Score': [mean_score],
                          'F1_Std_Dev': [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index=True)

0.679516177528353 +/- 0.04514210200517351


In [35]:
# Multinomial Bayes, lemmatized
# 5-fold cross-validated F1 on the CountVectorizer features of the lemmatized text.
# NOTE(review): labels come from train_df; assumes its row order matches the lemmatized data - confirm.
clf = MultinomialNB()
scores = model_selection.cross_val_score(clf,
                                         train_vector_lemma_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()  # fold-to-fold spread of the F1 score
print(mean_score, '+/-', stability)

# Log the result. pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
score_row = pd.DataFrame({'Model': ['MultinomialNB'],
                          'Vectorizer': ['CountVectorizer'],
                          'Text_Treatment': ['Lemmatized'],
                          'Mean_F1_Score': [mean_score],
                          'F1_Std_Dev': [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index=True)

0.6838672747769288 +/- 0.04188014881745761


In [36]:
# Multinomial Bayes, no stop words
# 5-fold cross-validated F1 on the CountVectorizer features of the cleaned text, stop words removed.
clf = MultinomialNB()
scores = model_selection.cross_val_score(clf,
                                         train_vector_no_stops_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()  # fold-to-fold spread of the F1 score
print(mean_score, '+/-', stability)

# Log the result. pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
score_row = pd.DataFrame({'Model': ['MultinomialNB'],
                          'Vectorizer': ['CountVectorizer'],
                          'Text_Treatment': ['Removed stopwords'],
                          'Mean_F1_Score': [mean_score],
                          'F1_Std_Dev': [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index=True)

0.6600585905094238 +/- 0.04066299988918634


In [37]:
# Multinomial Bayes, stemmed, no stop words
# 5-fold cross-validated F1 on the CountVectorizer features of the stemmed text, stop words removed.
# NOTE(review): labels come from train_df; assumes its row order matches the stemmed data - confirm.
clf = MultinomialNB()
scores = model_selection.cross_val_score(clf,
                                         train_vector_stemmed_no_stops_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()  # fold-to-fold spread of the F1 score
print(mean_score, '+/-', stability)

# Log the result. pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
score_row = pd.DataFrame({'Model': ['MultinomialNB'],
                          'Vectorizer': ['CountVectorizer'],
                          'Text_Treatment': ['Removed stopwords, stemmed'],
                          'Mean_F1_Score': [mean_score],
                          'F1_Std_Dev': [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index=True)

0.6593484695899581 +/- 0.04559206635857508


In [38]:
# Multinomial Bayes, lemmatized, no stop words
# 5-fold cross-validated F1 on the CountVectorizer features of the lemmatized text, stop words removed.
# NOTE(review): labels come from train_df; assumes its row order matches the lemmatized data - confirm.
clf = MultinomialNB()
scores = model_selection.cross_val_score(clf,
                                         train_vector_lemma_no_stops_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()  # fold-to-fold spread of the F1 score
print(mean_score, '+/-', stability)

# Log the result. pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
score_row = pd.DataFrame({'Model': ['MultinomialNB'],
                          'Vectorizer': ['CountVectorizer'],
                          'Text_Treatment': ['Removed stopwords, lemmatized'],
                          'Mean_F1_Score': [mean_score],
                          'F1_Std_Dev': [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index=True)

0.6607353953919963 +/- 0.04315073975283043


### Bag-of-words and term frequency weighting using TF-IDF vectorization

First set will retain stop words.

In [39]:
# Unigram TF-IDF; stop words are kept. max_df/min_df prune terms by document
# frequency (above 50% of documents, or in fewer than 2 documents).
tf_idf = TfidfVectorizer(
    ngram_range=(1, 1),
    max_df=0.5,
    min_df=2,
)

In [40]:
# Fit the TF-IDF vectorizer on the cleaned text and encode it.
train_tfidf_df = tf_idf.fit_transform(train_df['text'])

In [41]:
# Inspect the dimensions of the TF-IDF matrix for the cleaned text.
train_tfidf_df.shape

(7502, 6063)

In [42]:
# Refit the same TF-IDF vectorizer on the stemmed text (overwrites the previous fit).
train_tfidf_stemmed_df = tf_idf.fit_transform(stemmed_train_df['text'])

In [43]:
# Inspect the dimensions of the TF-IDF matrix for the stemmed text.
train_tfidf_stemmed_df.shape

(7502, 5000)

In [44]:
# Refit the same TF-IDF vectorizer on the lemmatized text (overwrites the previous fit).
train_tfidf_lemmatized_df = tf_idf.fit_transform(lemmatized_train_df['text'])

In [45]:
# Snapshot the vectorizer in its current state (just fitted on the lemmatized text)
# as bytes, because `tf_idf` is rebound to a new vectorizer further down. The
# snapshot is restored later to vectorize the test set.
tf_idf_pickled = pickle.dumps(tf_idf)

In [46]:
# Inspect the dimensions of the TF-IDF matrix for the lemmatized text.
train_tfidf_lemmatized_df.shape

(7502, 5578)

Second set will remove stop words.

In [47]:
# Same unigram TF-IDF setup, now dropping NLTK's English stop words.
# This rebinds `tf_idf`, replacing the stop-word-retaining instance.
tf_idf = TfidfVectorizer(
    stop_words=stopwords.words('english'),
    ngram_range=(1, 1),
    max_df=0.5,
    min_df=2,
)

In [48]:
# Fit the stop-word-removing TF-IDF vectorizer on the cleaned text and encode it.
train_tfidf_no_stops_df = tf_idf.fit_transform(train_df['text'])

In [49]:
# Inspect the dimensions of the TF-IDF matrix for the cleaned text (stop words removed).
train_tfidf_no_stops_df.shape

(7502, 5938)

In [50]:
# Refit the stop-word-removing TF-IDF vectorizer on the stemmed text (overwrites the previous fit).
train_tfidf_stemmed_no_stops_df = tf_idf.fit_transform(stemmed_train_df['text'])

In [51]:
# Inspect the dimensions of the TF-IDF matrix for the stemmed text (stop words removed).
train_tfidf_stemmed_no_stops_df.shape

(7502, 4898)

In [52]:
# Refit the stop-word-removing TF-IDF vectorizer on the lemmatized text (overwrites the previous fit).
train_tfidf_lemmatized_no_stops_df = tf_idf.fit_transform(lemmatized_train_df['text'])

In [53]:
# Inspect the dimensions of the TF-IDF matrix for the lemmatized text (stop words removed).
train_tfidf_lemmatized_no_stops_df.shape

(7502, 5457)

#### Logistic Regression on TF-IDF treated training data

In [54]:
# basic LogReg
# 5-fold cross-validated F1 for class-balanced logistic regression on the TF-IDF features of the cleaned text.
clf = LogisticRegression(class_weight='balanced')
scores = model_selection.cross_val_score(clf,
                                         train_tfidf_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()  # fold-to-fold spread of the F1 score
print(mean_score, '+/-', stability)

# Log the result. pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
score_row = pd.DataFrame({'Model': ['LogisticRegression'],
                          'Vectorizer': ['TfidfVectorizer'],
                          'Text_Treatment': ['None'],
                          'Mean_F1_Score': [mean_score],
                          'F1_Std_Dev': [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index=True)

0.661170084735614 +/- 0.04248994235989204


In [55]:
# LogReg, stemmed
# 5-fold cross-validated F1 on the TF-IDF features of the stemmed text.
# NOTE(review): labels come from train_df; assumes its row order matches the stemmed data - confirm.
clf = LogisticRegression(class_weight='balanced')
scores = model_selection.cross_val_score(clf,
                                         train_tfidf_stemmed_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()  # fold-to-fold spread of the F1 score
print(mean_score, '+/-', stability)

# Log the result. pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
score_row = pd.DataFrame({'Model': ['LogisticRegression'],
                          'Vectorizer': ['TfidfVectorizer'],
                          'Text_Treatment': ['Stemmed'],
                          'Mean_F1_Score': [mean_score],
                          'F1_Std_Dev': [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index=True)

0.6649059406545451 +/- 0.045688643692764035


In [56]:
# LogReg, lemmatized
# 5-fold cross-validated F1 on the TF-IDF features of the lemmatized text.
# NOTE(review): labels come from train_df; assumes its row order matches the lemmatized data - confirm.
clf = LogisticRegression(class_weight='balanced')
scores = model_selection.cross_val_score(clf,
                                         train_tfidf_lemmatized_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()  # fold-to-fold spread of the F1 score
print(mean_score, '+/-', stability)

# Log the result. pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
score_row = pd.DataFrame({'Model': ['LogisticRegression'],
                          'Vectorizer': ['TfidfVectorizer'],
                          'Text_Treatment': ['Lemmatized'],
                          'Mean_F1_Score': [mean_score],
                          'F1_Std_Dev': [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index=True)

0.6675572000390637 +/- 0.04109215054227096


In [57]:
# LogRegCV, lemmatized
# 5-fold cross-validated F1 for LogisticRegressionCV on the TF-IDF features of the lemmatized text.
# NOTE(review): labels come from train_df; assumes its row order matches the lemmatized data - confirm.
clf = LogisticRegressionCV(class_weight='balanced')
scores = model_selection.cross_val_score(clf,
                                         train_tfidf_lemmatized_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()  # fold-to-fold spread of the F1 score
print(mean_score, '+/-', stability)

# Log the result. pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
score_row = pd.DataFrame({'Model': ['LogisticRegressionCV'],
                          'Vectorizer': ['TfidfVectorizer'],
                          'Text_Treatment': ['Lemmatized'],
                          'Mean_F1_Score': [mean_score],
                          'F1_Std_Dev': [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index=True)

0.6741968290118585 +/- 0.04037677659541278


In [58]:
# LogReg, no stop words
# 5-fold cross-validated F1 on the TF-IDF features of the cleaned text, stop words removed.
clf = LogisticRegression(class_weight='balanced')
scores = model_selection.cross_val_score(clf,
                                         train_tfidf_no_stops_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()  # fold-to-fold spread of the F1 score
print(mean_score, '+/-', stability)

# Log the result. pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
score_row = pd.DataFrame({'Model': ['LogisticRegression'],
                          'Vectorizer': ['TfidfVectorizer'],
                          'Text_Treatment': ['Removed stopwords'],
                          'Mean_F1_Score': [mean_score],
                          'F1_Std_Dev': [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index=True)

0.6239710913054827 +/- 0.04929118579662498


In [59]:
# LogReg, stemmed, no stop words
# 5-fold cross-validated F1 on the TF-IDF features of the stemmed text, stop words removed.
# NOTE(review): labels come from train_df; assumes its row order matches the stemmed data - confirm.
clf = LogisticRegression(class_weight='balanced')
scores = model_selection.cross_val_score(clf,
                                         train_tfidf_stemmed_no_stops_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()  # fold-to-fold spread of the F1 score
print(mean_score, '+/-', stability)

# Log the result. pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
score_row = pd.DataFrame({'Model': ['LogisticRegression'],
                          'Vectorizer': ['TfidfVectorizer'],
                          'Text_Treatment': ['Removed stopwords, stemmed'],
                          'Mean_F1_Score': [mean_score],
                          'F1_Std_Dev': [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index=True)

0.6297679507030749 +/- 0.048122300969477526


In [60]:
# LogReg, lemmatized, no stop words
# 5-fold cross-validated F1 on the TF-IDF features of the lemmatized text, stop words removed.
# NOTE(review): labels come from train_df; assumes its row order matches the lemmatized data - confirm.
clf = LogisticRegression(class_weight='balanced')
scores = model_selection.cross_val_score(clf,
                                         train_tfidf_lemmatized_no_stops_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()  # fold-to-fold spread of the F1 score
print(mean_score, '+/-', stability)

# Log the result. pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
score_row = pd.DataFrame({'Model': ['LogisticRegression'],
                          'Vectorizer': ['TfidfVectorizer'],
                          'Text_Treatment': ['Removed stopwords, lemmatized'],
                          'Mean_F1_Score': [mean_score],
                          'F1_Std_Dev': [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index=True)

0.6309529869872104 +/- 0.04725685211221847


#### Multinomial Naive Bayes on TF-IDF treated training data

In [61]:
# basic Multinomial Bayes
# 5-fold cross-validated F1 for MultinomialNB on the TF-IDF features of the cleaned text.
clf = MultinomialNB()
scores = model_selection.cross_val_score(clf,
                                         train_tfidf_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()  # fold-to-fold spread of the F1 score
print(mean_score, '+/-', stability)

# Log the result. pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
score_row = pd.DataFrame({'Model': ['MultinomialNB'],
                          'Vectorizer': ['TfidfVectorizer'],
                          'Text_Treatment': ['None'],
                          'Mean_F1_Score': [mean_score],
                          'F1_Std_Dev': [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index=True)

0.6554771472834363 +/- 0.05697076934045421


In [62]:
# Multinomial Bayes, stemmed
# 5-fold cross-validated F1 on the TF-IDF features of the stemmed text.
# NOTE(review): labels come from train_df; assumes its row order matches the stemmed data - confirm.
clf = MultinomialNB()
scores = model_selection.cross_val_score(clf,
                                         train_tfidf_stemmed_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()  # fold-to-fold spread of the F1 score
print(mean_score, '+/-', stability)

# Log the result. pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
score_row = pd.DataFrame({'Model': ['MultinomialNB'],
                          'Vectorizer': ['TfidfVectorizer'],
                          'Text_Treatment': ['Stemmed'],
                          'Mean_F1_Score': [mean_score],
                          'F1_Std_Dev': [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index=True)

0.6601653543759294 +/- 0.06010001540846695


In [63]:
# Multinomial Bayes, lemmatized
# 5-fold cross-validated F1 on the TF-IDF features of the lemmatized text.
# NOTE(review): labels come from train_df; assumes its row order matches the lemmatized data - confirm.
clf = MultinomialNB()
scores = model_selection.cross_val_score(clf,
                                         train_tfidf_lemmatized_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()  # fold-to-fold spread of the F1 score
print(mean_score, '+/-', stability)

# Log the result. pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
score_row = pd.DataFrame({'Model': ['MultinomialNB'],
                          'Vectorizer': ['TfidfVectorizer'],
                          'Text_Treatment': ['Lemmatized'],
                          'Mean_F1_Score': [mean_score],
                          'F1_Std_Dev': [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index=True)

0.6617217901213649 +/- 0.055086839634722246


In [64]:
# Multinomial Bayes, no stop words
# 5-fold cross-validated F1 on the TF-IDF features of the cleaned text, stop words removed.
clf = MultinomialNB()
scores = model_selection.cross_val_score(clf,
                                         train_tfidf_no_stops_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()  # fold-to-fold spread of the F1 score
print(mean_score, '+/-', stability)

# Log the result. pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
score_row = pd.DataFrame({'Model': ['MultinomialNB'],
                          'Vectorizer': ['TfidfVectorizer'],
                          'Text_Treatment': ['Removed stopwords'],
                          'Mean_F1_Score': [mean_score],
                          'F1_Std_Dev': [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index=True)

0.6478170987485605 +/- 0.054678210710489006


In [65]:
# Multinomial Bayes, stemmed, no stop words
# 5-fold cross-validated F1 on the TF-IDF features of the stemmed text, stop words removed.
# NOTE(review): labels come from train_df; assumes its row order matches the stemmed data - confirm.
clf = MultinomialNB()
scores = model_selection.cross_val_score(clf,
                                         train_tfidf_stemmed_no_stops_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()  # fold-to-fold spread of the F1 score
print(mean_score, '+/-', stability)

# Log the result. pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
score_row = pd.DataFrame({'Model': ['MultinomialNB'],
                          'Vectorizer': ['TfidfVectorizer'],
                          'Text_Treatment': ['Removed stopwords, stemmed'],
                          'Mean_F1_Score': [mean_score],
                          'F1_Std_Dev': [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index=True)

0.6466233450531924 +/- 0.05253238685817721


In [66]:
# Multinomial Bayes, lemmatized, no stop words
# 5-fold cross-validated F1 on the TF-IDF features of the lemmatized text, stop words removed.
# NOTE(review): labels come from train_df; assumes its row order matches the lemmatized data - confirm.
clf = MultinomialNB()
scores = model_selection.cross_val_score(clf,
                                         train_tfidf_lemmatized_no_stops_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()  # fold-to-fold spread of the F1 score
print(mean_score, '+/-', stability)

# Log the result. pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
score_row = pd.DataFrame({'Model': ['MultinomialNB'],
                          'Vectorizer': ['TfidfVectorizer'],
                          'Text_Treatment': ['Removed stopwords, lemmatized'],
                          'Mean_F1_Score': [mean_score],
                          'F1_Std_Dev': [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index=True)

0.6514290195889771 +/- 0.05372415943963887


In [67]:
# my_params = {'C': [1e-3, 1e-2, 0.1, 0.5, 1, 10, 100, 1000],
#              'kernel': ['rbf', 'poly', 'sigmoid'],
#              'degree': [2, 3, 4],
#              'gamma' : ['auto', 'scale', 1e-3, 1e-2, 0.1, 1, 10, 100, 1000],
#              'class_weight' : ['balanced'],
#              'random_state' : [42],
#              'probability' : [False, True],
#              'shrinking' : [False, True],
#              'coef0' : [1e-3, 1e-2, 0.1, 1, 10, 100, 1000]}

In [68]:
# scorer = make_scorer(f1_score)
# gs_clf = GridSearchCV(svm.SVC(), my_params, scoring = scorer)
# gs_clf.fit(train_tfidf_lemmatized_df, train_df["target"])
# print(gs_clf.best_params_, gs_clf.best_score_)

In [69]:
#try SVM: params determined by GridSearchCV
# 5-fold cross-validated F1 for a sigmoid-kernel SVC on the TF-IDF features of the lemmatized text.
# NOTE(review): per the scikit-learn docs `degree` only applies to the 'poly'
# kernel, so it should have no effect with kernel='sigmoid' - confirm.
# NOTE(review): labels come from train_df; assumes its row order matches the lemmatized data - confirm.
clf = svm.SVC(C = 0.5,
              kernel = 'sigmoid',
              degree = 2,
              gamma = 'scale',
              class_weight = 'balanced',
              random_state = 42)

scores = model_selection.cross_val_score(clf,
                                         train_tfidf_lemmatized_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()  # fold-to-fold spread of the F1 score
print(mean_score, '+/-', stability)

# Log the result. pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
score_row = pd.DataFrame({'Model': ['SVM'],
                          'Vectorizer': ['TfidfVectorizer'],
                          'Text_Treatment': ['Lemmatized'],
                          'Mean_F1_Score': [mean_score],
                          'F1_Std_Dev': [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index=True)

0.6587775553284982 +/- 0.051150000199746234


In [70]:
# Rank all recorded experiments, best mean F1 first (display only; scoring_df itself is not reordered).
scoring_df.sort_values(by = 'Mean_F1_Score', ascending = False)

Unnamed: 0,Model,Vectorizer,Text_Treatment,Mean_F1_Score,F1_Std_Dev
9,MultinomialNB,CountVectorizer,Lemmatized,0.683867,0.04188
8,MultinomialNB,CountVectorizer,Stemmed,0.679516,0.045142
7,MultinomialNB,CountVectorizer,,0.674957,0.038878
16,LogisticRegressionCV,TfidfVectorizer,Lemmatized,0.674197,0.040377
15,LogisticRegression,TfidfVectorizer,Lemmatized,0.667557,0.041092
14,LogisticRegression,TfidfVectorizer,Stemmed,0.664906,0.045689
22,MultinomialNB,TfidfVectorizer,Lemmatized,0.661722,0.055087
13,LogisticRegression,TfidfVectorizer,,0.66117,0.04249
12,MultinomialNB,CountVectorizer,"Removed stopwords, lemmatized",0.660735,0.043151
21,MultinomialNB,TfidfVectorizer,Stemmed,0.660165,0.0601


In [71]:
# save scoring for future reference
# scoring_df.to_csv('../data/first_round_scores.csv', index = False)

I chose the LogisticRegressionCV model, using TF-IDF term weighting, trained on the lemmatized training data, because its mean F1 score was only slightly lower than Multinomial Bayes, while its stability was slightly better (as measured by the F1 score standard deviation).

In [72]:
# Final model - LogRegCV, lemmatized
# Repeat the 5-fold F1 check for the selected configuration; `clf` is reused
# further down, where it is fitted on the full training matrix.
clf = LogisticRegressionCV(class_weight='balanced')
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_tfidf_lemmatized_df,
    y=train_df["target"],
    cv=5,
    scoring="f1",
)
mean_score, stability = scores.mean(), scores.std()
print(mean_score, '+/-', stability)

0.6741968290118585 +/- 0.04037677659541278


In [73]:
# Restore the TF-IDF vectorizer that was snapshotted right after fitting on the lemmatized text.
vectorizer = pickle.loads(tf_idf_pickled)
# Refit on the lemmatized training text so the vocabulary matches train_tfidf_lemmatized_df.
# NOTE(review): the snapshot was already fitted on this same text, so this refit looks redundant - confirm.
vectorizer.fit(lemmatized_train_df['text'])
# Encode the test tweets with the training vocabulary (transform only, no fitting on test data).
# NOTE(review): test_df holds the cleaned test text, not a lemmatized version, while the
# vectorizer and model are built on lemmatized training text - confirm this is intended.
test_tfidf_df = vectorizer.transform(test_df['text'])

In [74]:
# Training matrix dimensions, for comparison with the test matrix below.
train_tfidf_lemmatized_df.shape

(7502, 5578)

In [75]:
# The column count should match the training matrix, since both come from the same vocabulary.
test_tfidf_df.shape

(3263, 5578)

In [76]:
# Fit the selected model on the full lemmatized TF-IDF training matrix, then
# predict labels for the vectorized test tweets (fit returns the estimator itself).
preds = clf.fit(train_tfidf_lemmatized_df, train_df["target"]).predict(test_tfidf_df)
preds

array([1, 1, 1, ..., 1, 1, 0])

### Create submission file and validate.

In [77]:
# model_sub = pd.read_csv('../data/sample_submission.csv')
# model_sub['target'] = preds
# model_sub.to_csv('../data/prediction_submission.csv', index = False)

In [78]:
# check_sub = pd.read_csv('../data/prediction_submission.csv')
# check_sub.head().append(check_sub.tail())

In [79]:
# check_sub['target'].value_counts()

Attempting to use TensorFlow & BERT

The cells below are commented out because training takes too long on my laptop; I will try running them in a Kaggle notebook with a GPU.

In [80]:
# def bert_encode(texts, tokenizer, max_len=512):
#     all_tokens = []
#     all_masks = []
#     all_segments = []
    
#     for text in texts:
#         text = tokenizer.tokenize(text)
            
#         text = text[:max_len-2]
#         input_sequence = ["[CLS]"] + text + ["[SEP]"]
#         pad_len = max_len - len(input_sequence)
        
#         tokens = tokenizer.convert_tokens_to_ids(input_sequence)
#         tokens += [0] * pad_len
#         pad_masks = [1] * len(input_sequence) + [0] * pad_len
#         segment_ids = [0] * max_len
        
#         all_tokens.append(tokens)
#         all_masks.append(pad_masks)
#         all_segments.append(segment_ids)
    
#     return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [81]:
# def build_model(bert_layer, max_len=512):
#     input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
#     input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
#     segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

#     _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
#     clf_output = sequence_output[:, 0, :]
#     out = Dense(1, activation='sigmoid')(clf_output)
    
#     model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
#     model.compile(Adam(learning_rate=2e-6), loss='binary_crossentropy', metrics=['accuracy'])
    
#     return model

In [82]:
# %%time
# module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
# bert_layer = hub.KerasLayer(module_url, trainable=True)

In [83]:
# vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
# do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
# tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [84]:
# train_input = bert_encode(train_df.text.values, tokenizer, max_len=160)
# test_input = bert_encode(test_df.text.values, tokenizer, max_len=160)
# train_labels = train_df.target.values

In [85]:
# model = build_model(bert_layer, max_len=160)
# model.summary()

In [86]:
# train_history = model.fit(train_input, train_labels,
#                           validation_split=0.2,
#                           epochs=5)

In [87]:
# test_pred = model.predict(test_input)

In [88]:
# submission['target'] = test_pred.round().astype(int)
# submission.to_csv('disaster_tweet_submission.csv', index=False)