# Disaster Tweets

Initial plan is to use this notebook for modeling of preprocessed data.

In [1]:
# imports

# data
import pandas as pd
import numpy as np

# modeling
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import f1_score, make_scorer
from sklearn.naive_bayes import MultinomialNB
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV, StratifiedKFold, RandomizedSearchCV
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier

import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub

import tokenization
from nltk.corpus import stopwords

import pickle

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Read the cleaned training tweets (the test set is loaded in a later cell).
# NOTE(review): absolute, machine-specific path -- this cell only runs where this exact directory exists.
train_df = pd.read_csv('/Users/davidwalkup/ds-course/projects/Mod4/disaster_tweet_prediction/data/cleaned_train.csv')

In [3]:
train_df.head()

Unnamed: 0,id,keyword,text,target
0,1,nokeyword,our deeds are the reason of this earthquake ma...,1
1,4,nokeyword,forest fire near la ronge sask canada,1
2,5,nokeyword,all residents asked to shelter in place are be...,1
3,6,nokeyword,people receive wildfires evacuation orders in ...,1
4,7,nokeyword,just got sent this photo from ruby alaska as s...,1


In [4]:
# Read the stemmed variant of the training tweets.
# NOTE(review): absolute, machine-specific path -- this cell only runs where this exact directory exists.
stemmed_train_df = pd.read_csv('/Users/davidwalkup/ds-course/projects/Mod4/disaster_tweet_prediction/data/stemmed_train.csv')

In [5]:
stemmed_train_df.head()

Unnamed: 0,id,keyword,text,target
0,1,nokeyword,our deed are the reason of thi earthquak may a...,1
1,4,nokeyword,forest fire near la rong sask canada,1
2,5,nokeyword,all resid ask to shelter in place are be notif...,1
3,6,nokeyword,peopl receiv wildfir evacu order in california,1
4,7,nokeyword,just got sent thi photo from rubi alaska as sm...,1


In [6]:
# Read the lemmatized variant of the training tweets.
# NOTE(review): absolute, machine-specific path -- this cell only runs where this exact directory exists.
lemmatized_train_df = pd.read_csv('/Users/davidwalkup/ds-course/projects/Mod4/disaster_tweet_prediction/data/lemmatized_train.csv')

In [7]:
lemmatized_train_df.head()

Unnamed: 0,id,keyword,text,target
0,1,nokeyword,our deed are the reason of this earthquake may...,1
1,4,nokeyword,forest fire near la ronge sask canada,1
2,5,nokeyword,all resident asked to shelter in place are bei...,1
3,6,nokeyword,people receive wildfire evacuation order in ca...,1
4,7,nokeyword,just got sent this photo from ruby alaska a sm...,1


In [8]:
# Read the cleaned test tweets.
# NOTE(review): absolute, machine-specific path -- this cell only runs where this exact directory exists.
test_df = pd.read_csv('/Users/davidwalkup/ds-course/projects/Mod4/disaster_tweet_prediction/data/cleaned_test.csv')

In [9]:
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,nokeyword,nolocation,just happened a terrible car crash
1,2,nokeyword,nolocation,heard about earthquake is different city stay ...
2,3,nokeyword,nolocation,there is a forest fire at spot pond goose are ...
3,9,nokeyword,nolocation,apocalypse lighting spokane wildfire
4,11,nokeyword,nolocation,typhoon soudelor kill in china and taiwan


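#### How good does my model have to be to outperform the naive approach (i.e., predicting that no tweet is about a disaster)?

Note: the figure computed below is the *accuracy* of that naive approach. The models later in this notebook are scored with F1 on the positive class, for which an all-negative prediction scores 0, so the two numbers are not directly comparable.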

In [10]:
# Share of each target label in the training set, keyed by label.
p_classes = dict(train_df['target'].value_counts(normalize=True))

# Always answering "not a disaster" (label 0) is correct this often.
naive_approach = p_classes[0]

print(
    'Class probabilities: ',
    p_classes,
    '\nChance tweet is not about a real disaster: ',
    np.round(naive_approach, decimals=4),
)

Class probabilities:  {0: 0.5737136763529725, 1: 0.42628632364702745} 
Chance tweet is not about a real disaster:  0.5737


#### Set up a DataFrame to hold scoring information, for final model selection.

In [11]:
# Results table: one row per experiment (model x vectorizer x text treatment),
# filled in by the modeling cells and consulted for final model selection.
scoring_df = pd.DataFrame(
    columns=[
        'Model',
        'Vectorizer',
        'Text_Treatment',
        'Mean_F1_Score',
        'F1_Std_Dev',
    ]
)

### Bag-of-words features using sklearn CountVectorizer

First set of experiments will include stop words.

In [12]:
# Binary (presence/absence) bag-of-words over unigrams and bigrams, with
# accented characters folded via unicode normalisation. Stop words are kept.
count_vectorizer = CountVectorizer(
    binary=True,
    ngram_range=(1, 2),
    strip_accents='unicode',
)

In [13]:
# Learn the vocabulary from the cleaned tweets and build the document-term matrix.
# Despite the `_df` suffix, fit_transform returns a sparse matrix, not a DataFrame.
train_vector_df = count_vectorizer.fit_transform(train_df['text'])

In [14]:
train_vector_df.shape

(7502, 70574)

In [15]:
# Refit the same vectorizer object on the stemmed text; this replaces the
# vocabulary learned from the previous fit (the earlier matrix is unaffected).
train_vector_stemmed_df = count_vectorizer.fit_transform(stemmed_train_df['text'])

In [16]:
train_vector_stemmed_df.shape

(7502, 65602)

In [17]:
# Refit the same vectorizer object on the lemmatized text; from here on
# `count_vectorizer` carries the lemmatized vocabulary.
train_vector_lemma_df = count_vectorizer.fit_transform(lemmatized_train_df['text'])

In [18]:
# Snapshot the vectorizer as in-memory bytes (nothing is written to disk here).
# It was last fit on the lemmatized text, so that is the vocabulary captured.
vector_pickled = pickle.dumps(count_vectorizer)

In [19]:
train_vector_lemma_df.shape

(7502, 68135)

Second set of experiments will remove stop words, to see if that improves performance.

In [20]:
# Same binary unigram+bigram bag-of-words as before, but with NLTK's English
# stop-word list removed. This rebinds `count_vectorizer` to a fresh object.
count_vectorizer = CountVectorizer(
    binary=True,
    ngram_range=(1, 2),
    stop_words=stopwords.words('english'),
    strip_accents='unicode',
)

In [21]:
# Document-term matrix for the cleaned tweets with stop words removed.
train_vector_no_stops_df = count_vectorizer.fit_transform(train_df['text'])

In [22]:
train_vector_no_stops_df.shape

(7502, 57913)

In [23]:
# Refit the stop-word-removing vectorizer on the stemmed text (vocabulary is relearned).
train_vector_stemmed_no_stops_df = count_vectorizer.fit_transform(stemmed_train_df['text'])

In [24]:
train_vector_stemmed_no_stops_df.shape

(7502, 55077)

In [25]:
# Refit the stop-word-removing vectorizer on the lemmatized text (vocabulary is relearned).
train_vector_lemma_no_stops_df = count_vectorizer.fit_transform(lemmatized_train_df['text'])

In [26]:
train_vector_lemma_no_stops_df.shape

(7502, 56481)

#### Logistic Regression on CountVectorizer treated training data

In [27]:
# basic LogReg: 5-fold cross-validated F1 on the raw CountVectorizer features.
clf_logreg = LogisticRegression(class_weight = 'balanced')

scores = model_selection.cross_val_score(clf_logreg,
                                         train_vector_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")

mean_score = scores.mean()
stability = scores.std()   # spread of F1 across the folds
print(mean_score, '+/-', stability)

# Record the result. DataFrame.append was removed in pandas 2.0, so the row is
# added with pd.concat, which behaves the same on old and new pandas.
score_row = pd.DataFrame({'Model' : ['LogisticRegression'],
                          'Vectorizer' : ['CountVectorizer'],
                          'Text_Treatment' : ['None'],
                          'Mean_F1_Score' : [mean_score],
                          'F1_Std_Dev' : [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index = True)

0.6290542858178572 +/- 0.05930569871502275


In [28]:
# LogRegCV: 5-fold cross-validated F1 on the raw CountVectorizer features.
# LogisticRegressionCV tunes its regularisation strength by its own internal
# cross-validation inside each outer training fold.
clf_logreg_cv = LogisticRegressionCV(class_weight = 'balanced')

scores = model_selection.cross_val_score(clf_logreg_cv,
                                         train_vector_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")

mean_score = scores.mean()
stability = scores.std()   # spread of F1 across the folds
print(mean_score, '+/-', stability)

# Record the result. DataFrame.append was removed in pandas 2.0, so the row is
# added with pd.concat, which behaves the same on old and new pandas.
score_row = pd.DataFrame({'Model' : ['LogisticRegressionCV'],
                          'Vectorizer' : ['CountVectorizer'],
                          'Text_Treatment' : ['None'],
                          'Mean_F1_Score' : [mean_score],
                          'F1_Std_Dev' : [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index = True)

0.6382107580107526 +/- 0.05755892959272567


In [29]:
# LogReg, stemmed: 5-fold cross-validated F1 on the stemmed CountVectorizer features.
# Labels come from train_df; assumes stemmed_train_df has the same row order -- TODO confirm.
clf = LogisticRegression(class_weight = 'balanced')
scores = model_selection.cross_val_score(clf,
                                         train_vector_stemmed_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()   # spread of F1 across the folds
print(mean_score, '+/-', stability)

# Record the result. DataFrame.append was removed in pandas 2.0, so the row is
# added with pd.concat, which behaves the same on old and new pandas.
score_row = pd.DataFrame({'Model' : ['LogisticRegression'],
                          'Vectorizer' : ['CountVectorizer'],
                          'Text_Treatment' : ['Stemmed'],
                          'Mean_F1_Score' : [mean_score],
                          'F1_Std_Dev' : [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index = True)

0.6282724542006161 +/- 0.05454954304699235


In [30]:
# LogReg, lemmatized: 5-fold cross-validated F1 on the lemmatized CountVectorizer features.
# Labels come from train_df; assumes lemmatized_train_df has the same row order -- TODO confirm.
clf = LogisticRegression(class_weight = 'balanced')
scores = model_selection.cross_val_score(clf,
                                         train_vector_lemma_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()   # spread of F1 across the folds
print(mean_score, '+/-', stability)

# Record the result. DataFrame.append was removed in pandas 2.0, so the row is
# added with pd.concat, which behaves the same on old and new pandas.
score_row = pd.DataFrame({'Model' : ['LogisticRegression'],
                          'Vectorizer' : ['CountVectorizer'],
                          'Text_Treatment' : ['Lemmatized'],
                          'Mean_F1_Score' : [mean_score],
                          'F1_Std_Dev' : [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index = True)

0.6377161392519037 +/- 0.054316053736676334


In [31]:
# LogReg, no stop words: 5-fold cross-validated F1 on CountVectorizer features
# built with stop words removed.
clf = LogisticRegression(class_weight = 'balanced')
scores = model_selection.cross_val_score(clf,
                                         train_vector_no_stops_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()   # spread of F1 across the folds
print(mean_score, '+/-', stability)

# Record the result. DataFrame.append was removed in pandas 2.0, so the row is
# added with pd.concat, which behaves the same on old and new pandas.
score_row = pd.DataFrame({'Model' : ['LogisticRegression'],
                          'Vectorizer' : ['CountVectorizer'],
                          'Text_Treatment' : ['Removed stopwords'],
                          'Mean_F1_Score' : [mean_score],
                          'F1_Std_Dev' : [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index = True)

0.5804990476888585 +/- 0.07410121362572519


In [32]:
# LogReg, stemmed, no stop words: 5-fold cross-validated F1 on stemmed
# CountVectorizer features built with stop words removed.
# Labels come from train_df; assumes stemmed_train_df has the same row order -- TODO confirm.
clf = LogisticRegression(class_weight = 'balanced')
scores = model_selection.cross_val_score(clf,
                                         train_vector_stemmed_no_stops_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()   # spread of F1 across the folds
print(mean_score, '+/-', stability)

# Record the result. DataFrame.append was removed in pandas 2.0, so the row is
# added with pd.concat, which behaves the same on old and new pandas.
score_row = pd.DataFrame({'Model' : ['LogisticRegression'],
                          'Vectorizer' : ['CountVectorizer'],
                          'Text_Treatment' : ['Removed stopwords, stemmed'],
                          'Mean_F1_Score' : [mean_score],
                          'F1_Std_Dev' : [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index = True)

0.5907999024775348 +/- 0.06233658166058654


In [33]:
# LogReg, lemmatized, no stop words: 5-fold cross-validated F1 on lemmatized
# CountVectorizer features built with stop words removed.
# Labels come from train_df; assumes lemmatized_train_df has the same row order -- TODO confirm.
clf = LogisticRegression(class_weight = 'balanced')
scores = model_selection.cross_val_score(clf,
                                         train_vector_lemma_no_stops_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()   # spread of F1 across the folds
print(mean_score, '+/-', stability)

# Record the result. DataFrame.append was removed in pandas 2.0, so the row is
# added with pd.concat, which behaves the same on old and new pandas.
score_row = pd.DataFrame({'Model' : ['LogisticRegression'],
                          'Vectorizer' : ['CountVectorizer'],
                          'Text_Treatment' : ['Removed stopwords, lemmatized'],
                          'Mean_F1_Score' : [mean_score],
                          'F1_Std_Dev' : [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index = True)

0.5888661506439731 +/- 0.06521271412619133


#### Multinomial Bayes on CountVectorizer treated training data

In [34]:
# basic Multinomial Bayes: 5-fold cross-validated F1 on the raw CountVectorizer features.
clf = MultinomialNB()
scores = model_selection.cross_val_score(clf,
                                         train_vector_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()   # spread of F1 across the folds
print(mean_score, '+/-', stability)

# Record the result. DataFrame.append was removed in pandas 2.0, so the row is
# added with pd.concat, which behaves the same on old and new pandas.
score_row = pd.DataFrame({'Model' : ['MultinomialNB'],
                          'Vectorizer' : ['CountVectorizer'],
                          'Text_Treatment' : ['None'],
                          'Mean_F1_Score' : [mean_score],
                          'F1_Std_Dev' : [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index = True)

0.6749567532885051 +/- 0.0388777822155517


In [35]:
# Multinomial Bayes, stemmed: 5-fold cross-validated F1 on the stemmed CountVectorizer features.
# Labels come from train_df; assumes stemmed_train_df has the same row order -- TODO confirm.
clf = MultinomialNB()
scores = model_selection.cross_val_score(clf,
                                         train_vector_stemmed_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()   # spread of F1 across the folds
print(mean_score, '+/-', stability)

# Record the result. DataFrame.append was removed in pandas 2.0, so the row is
# added with pd.concat, which behaves the same on old and new pandas.
score_row = pd.DataFrame({'Model' : ['MultinomialNB'],
                          'Vectorizer' : ['CountVectorizer'],
                          'Text_Treatment' : ['Stemmed'],
                          'Mean_F1_Score' : [mean_score],
                          'F1_Std_Dev' : [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index = True)

0.679516177528353 +/- 0.04514210200517351


In [36]:
# Multinomial Bayes, lemmatized: 5-fold cross-validated F1 on the lemmatized CountVectorizer features.
# Labels come from train_df; assumes lemmatized_train_df has the same row order -- TODO confirm.
clf = MultinomialNB()
scores = model_selection.cross_val_score(clf,
                                         train_vector_lemma_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()   # spread of F1 across the folds
print(mean_score, '+/-', stability)

# Record the result. DataFrame.append was removed in pandas 2.0, so the row is
# added with pd.concat, which behaves the same on old and new pandas.
score_row = pd.DataFrame({'Model' : ['MultinomialNB'],
                          'Vectorizer' : ['CountVectorizer'],
                          'Text_Treatment' : ['Lemmatized'],
                          'Mean_F1_Score' : [mean_score],
                          'F1_Std_Dev' : [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index = True)

0.6838672747769288 +/- 0.04188014881745761


In [37]:
# Multinomial Bayes, no stop words: 5-fold cross-validated F1 on CountVectorizer
# features built with stop words removed.
clf = MultinomialNB()
scores = model_selection.cross_val_score(clf,
                                         train_vector_no_stops_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()   # spread of F1 across the folds
print(mean_score, '+/-', stability)

# Record the result. DataFrame.append was removed in pandas 2.0, so the row is
# added with pd.concat, which behaves the same on old and new pandas.
score_row = pd.DataFrame({'Model' : ['MultinomialNB'],
                          'Vectorizer' : ['CountVectorizer'],
                          'Text_Treatment' : ['Removed stopwords'],
                          'Mean_F1_Score' : [mean_score],
                          'F1_Std_Dev' : [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index = True)

0.6600585905094238 +/- 0.04066299988918634


In [38]:
# Multinomial Bayes, stemmed, no stop words: 5-fold cross-validated F1 on stemmed
# CountVectorizer features built with stop words removed.
# Labels come from train_df; assumes stemmed_train_df has the same row order -- TODO confirm.
clf = MultinomialNB()
scores = model_selection.cross_val_score(clf,
                                         train_vector_stemmed_no_stops_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()   # spread of F1 across the folds
print(mean_score, '+/-', stability)

# Record the result. DataFrame.append was removed in pandas 2.0, so the row is
# added with pd.concat, which behaves the same on old and new pandas.
score_row = pd.DataFrame({'Model' : ['MultinomialNB'],
                          'Vectorizer' : ['CountVectorizer'],
                          'Text_Treatment' : ['Removed stopwords, stemmed'],
                          'Mean_F1_Score' : [mean_score],
                          'F1_Std_Dev' : [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index = True)

0.6593484695899581 +/- 0.04559206635857508


In [39]:
# Multinomial Bayes, lemmatized, no stop words: 5-fold cross-validated F1 on
# lemmatized CountVectorizer features built with stop words removed.
# Labels come from train_df; assumes lemmatized_train_df has the same row order -- TODO confirm.
clf = MultinomialNB()
scores = model_selection.cross_val_score(clf,
                                         train_vector_lemma_no_stops_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()   # spread of F1 across the folds
print(mean_score, '+/-', stability)

# Record the result. DataFrame.append was removed in pandas 2.0, so the row is
# added with pd.concat, which behaves the same on old and new pandas.
score_row = pd.DataFrame({'Model' : ['MultinomialNB'],
                          'Vectorizer' : ['CountVectorizer'],
                          'Text_Treatment' : ['Removed stopwords, lemmatized'],
                          'Mean_F1_Score' : [mean_score],
                          'F1_Std_Dev' : [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index = True)

0.6607353953919963 +/- 0.04315073975283043


### Bag-of-words and term-frequency weighting using TF-IDF vectorization

For the first set of experiments, I did not remove stopwords from the tweets to get a baseline for comparison.

In [40]:
# Unigram TF-IDF. Terms appearing in more than half of the tweets (max_df) or
# in fewer than two tweets (min_df) are dropped. Stop words are kept.
tf_idf = TfidfVectorizer(
    min_df=2,
    max_df=0.5,
    ngram_range=(1, 1),
)

In [41]:
# Learn vocabulary and IDF weights from the cleaned tweets and build the TF-IDF matrix.
# Despite the `_df` suffix, fit_transform returns a sparse matrix, not a DataFrame.
train_tfidf_df = tf_idf.fit_transform(train_df['text'])

In [42]:
train_tfidf_df.shape

(7502, 6063)

In [43]:
# Refit the same vectorizer object on the stemmed text; this replaces the
# vocabulary and IDF weights learned from the previous fit.
train_tfidf_stemmed_df = tf_idf.fit_transform(stemmed_train_df['text'])

In [44]:
train_tfidf_stemmed_df.shape

(7502, 5000)

In [45]:
# Refit the same vectorizer object on the lemmatized text; from here on
# `tf_idf` carries the lemmatized vocabulary and IDF weights.
train_tfidf_lemmatized_df = tf_idf.fit_transform(lemmatized_train_df['text'])

In [46]:
# Snapshot the TF-IDF vectorizer as in-memory bytes (nothing is written to disk here).
# It was last fit on the lemmatized text, so that is the state captured.
tf_idf_pickled = pickle.dumps(tf_idf)

In [47]:
train_tfidf_lemmatized_df.shape

(7502, 5578)

For the second set of experiments using TF-IDF term weighting, I removed the stopwords.

In [48]:
# Same unigram TF-IDF settings as before, but with NLTK's English stop-word
# list removed. This rebinds `tf_idf` to a fresh, unfitted object.
tf_idf = TfidfVectorizer(
    min_df=2,
    max_df=0.5,
    ngram_range=(1, 1),
    stop_words=stopwords.words('english'),
)

In [49]:
# TF-IDF matrix for the cleaned tweets with stop words removed.
train_tfidf_no_stops_df = tf_idf.fit_transform(train_df['text'])

In [50]:
train_tfidf_no_stops_df.shape

(7502, 5938)

In [51]:
# Refit the stop-word-removing TF-IDF vectorizer on the stemmed text (state is relearned).
train_tfidf_stemmed_no_stops_df = tf_idf.fit_transform(stemmed_train_df['text'])

In [52]:
train_tfidf_stemmed_no_stops_df.shape

(7502, 4898)

In [53]:
# Refit the stop-word-removing TF-IDF vectorizer on the lemmatized text (state is relearned).
train_tfidf_lemmatized_no_stops_df = tf_idf.fit_transform(lemmatized_train_df['text'])

In [54]:
train_tfidf_lemmatized_no_stops_df.shape

(7502, 5457)

#### Logistic Regression on TF-IDF treated training data

In [55]:
# basic LogReg: 5-fold cross-validated F1 on the raw TF-IDF features.
clf = LogisticRegression(class_weight = 'balanced')
scores = model_selection.cross_val_score(clf,
                                         train_tfidf_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()   # spread of F1 across the folds
print(mean_score, '+/-', stability)

# Record the result. DataFrame.append was removed in pandas 2.0, so the row is
# added with pd.concat, which behaves the same on old and new pandas.
score_row = pd.DataFrame({'Model' : ['LogisticRegression'],
                          'Vectorizer' : ['TfidfVectorizer'],
                          'Text_Treatment' : ['None'],
                          'Mean_F1_Score' : [mean_score],
                          'F1_Std_Dev' : [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index = True)

0.661170084735614 +/- 0.04248994235989204


In [56]:
# LogReg, stemmed: 5-fold cross-validated F1 on the stemmed TF-IDF features.
# Labels come from train_df; assumes stemmed_train_df has the same row order -- TODO confirm.
clf = LogisticRegression(class_weight = 'balanced')
scores = model_selection.cross_val_score(clf,
                                         train_tfidf_stemmed_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()   # spread of F1 across the folds
print(mean_score, '+/-', stability)

# Record the result. DataFrame.append was removed in pandas 2.0, so the row is
# added with pd.concat, which behaves the same on old and new pandas.
score_row = pd.DataFrame({'Model' : ['LogisticRegression'],
                          'Vectorizer' : ['TfidfVectorizer'],
                          'Text_Treatment' : ['Stemmed'],
                          'Mean_F1_Score' : [mean_score],
                          'F1_Std_Dev' : [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index = True)

0.6649059406545451 +/- 0.045688643692764035


In [57]:
# LogReg, lemmatized: 5-fold cross-validated F1 on the lemmatized TF-IDF features.
# Labels come from train_df; assumes lemmatized_train_df has the same row order -- TODO confirm.
clf = LogisticRegression(class_weight = 'balanced')
scores = model_selection.cross_val_score(clf,
                                         train_tfidf_lemmatized_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()   # spread of F1 across the folds
print(mean_score, '+/-', stability)

# Record the result. DataFrame.append was removed in pandas 2.0, so the row is
# added with pd.concat, which behaves the same on old and new pandas.
score_row = pd.DataFrame({'Model' : ['LogisticRegression'],
                          'Vectorizer' : ['TfidfVectorizer'],
                          'Text_Treatment' : ['Lemmatized'],
                          'Mean_F1_Score' : [mean_score],
                          'F1_Std_Dev' : [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index = True)

0.6675572000390637 +/- 0.04109215054227096


In [58]:
# LogRegCV, lemmatized: 5-fold cross-validated F1 on the lemmatized TF-IDF features.
# LogisticRegressionCV tunes its regularisation strength by its own internal
# cross-validation inside each outer training fold.
# Labels come from train_df; assumes lemmatized_train_df has the same row order -- TODO confirm.
clf = LogisticRegressionCV(class_weight = 'balanced')
scores = model_selection.cross_val_score(clf,
                                         train_tfidf_lemmatized_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()   # spread of F1 across the folds
print(mean_score, '+/-', stability)

# Record the result. DataFrame.append was removed in pandas 2.0, so the row is
# added with pd.concat, which behaves the same on old and new pandas.
score_row = pd.DataFrame({'Model' : ['LogisticRegressionCV'],
                          'Vectorizer' : ['TfidfVectorizer'],
                          'Text_Treatment' : ['Lemmatized'],
                          'Mean_F1_Score' : [mean_score],
                          'F1_Std_Dev' : [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index = True)

0.6741968290118585 +/- 0.04037677659541278


In [59]:
# LogReg, no stop words: 5-fold cross-validated F1 on TF-IDF features built
# with stop words removed.
clf = LogisticRegression(class_weight = 'balanced')
scores = model_selection.cross_val_score(clf,
                                         train_tfidf_no_stops_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()   # spread of F1 across the folds
print(mean_score, '+/-', stability)

# Record the result. DataFrame.append was removed in pandas 2.0, so the row is
# added with pd.concat, which behaves the same on old and new pandas.
score_row = pd.DataFrame({'Model' : ['LogisticRegression'],
                          'Vectorizer' : ['TfidfVectorizer'],
                          'Text_Treatment' : ['Removed stopwords'],
                          'Mean_F1_Score' : [mean_score],
                          'F1_Std_Dev' : [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index = True)

0.6239710913054827 +/- 0.04929118579662498


In [60]:
# LogReg, stemmed, no stop words: 5-fold cross-validated F1 on stemmed TF-IDF
# features built with stop words removed.
# Labels come from train_df; assumes stemmed_train_df has the same row order -- TODO confirm.
clf = LogisticRegression(class_weight = 'balanced')
scores = model_selection.cross_val_score(clf,
                                         train_tfidf_stemmed_no_stops_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()   # spread of F1 across the folds
print(mean_score, '+/-', stability)

# Record the result. DataFrame.append was removed in pandas 2.0, so the row is
# added with pd.concat, which behaves the same on old and new pandas.
score_row = pd.DataFrame({'Model' : ['LogisticRegression'],
                          'Vectorizer' : ['TfidfVectorizer'],
                          'Text_Treatment' : ['Removed stopwords, stemmed'],
                          'Mean_F1_Score' : [mean_score],
                          'F1_Std_Dev' : [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index = True)

0.6297679507030749 +/- 0.048122300969477526


In [61]:
# LogReg, lemmatized, no stop words: 5-fold cross-validated F1 on lemmatized
# TF-IDF features built with stop words removed.
# Labels come from train_df; assumes lemmatized_train_df has the same row order -- TODO confirm.
clf = LogisticRegression(class_weight = 'balanced')
scores = model_selection.cross_val_score(clf,
                                         train_tfidf_lemmatized_no_stops_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()   # spread of F1 across the folds
print(mean_score, '+/-', stability)

# Record the result. DataFrame.append was removed in pandas 2.0, so the row is
# added with pd.concat, which behaves the same on old and new pandas.
score_row = pd.DataFrame({'Model' : ['LogisticRegression'],
                          'Vectorizer' : ['TfidfVectorizer'],
                          'Text_Treatment' : ['Removed stopwords, lemmatized'],
                          'Mean_F1_Score' : [mean_score],
                          'F1_Std_Dev' : [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index = True)

0.6309529869872104 +/- 0.04725685211221847


#### Multinomial Bayes on TF-IDF treated training data

In [62]:
# basic Multinomial Bayes: 5-fold cross-validated F1 on the raw TF-IDF features.
clf = MultinomialNB()
scores = model_selection.cross_val_score(clf,
                                         train_tfidf_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()   # spread of F1 across the folds
print(mean_score, '+/-', stability)

# Record the result. DataFrame.append was removed in pandas 2.0, so the row is
# added with pd.concat, which behaves the same on old and new pandas.
score_row = pd.DataFrame({'Model' : ['MultinomialNB'],
                          'Vectorizer' : ['TfidfVectorizer'],
                          'Text_Treatment' : ['None'],
                          'Mean_F1_Score' : [mean_score],
                          'F1_Std_Dev' : [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index = True)

0.6554771472834363 +/- 0.05697076934045421


In [63]:
# Multinomial Bayes, stemmed: 5-fold cross-validated F1 on the stemmed TF-IDF features.
# Labels come from train_df; assumes stemmed_train_df has the same row order -- TODO confirm.
clf = MultinomialNB()
scores = model_selection.cross_val_score(clf,
                                         train_tfidf_stemmed_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()   # spread of F1 across the folds
print(mean_score, '+/-', stability)

# Record the result. DataFrame.append was removed in pandas 2.0, so the row is
# added with pd.concat, which behaves the same on old and new pandas.
score_row = pd.DataFrame({'Model' : ['MultinomialNB'],
                          'Vectorizer' : ['TfidfVectorizer'],
                          'Text_Treatment' : ['Stemmed'],
                          'Mean_F1_Score' : [mean_score],
                          'F1_Std_Dev' : [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index = True)

0.6601653543759294 +/- 0.06010001540846695


In [64]:
# Multinomial Bayes, lemmatized: 5-fold cross-validated F1 on the lemmatized TF-IDF features.
# Labels come from train_df; assumes lemmatized_train_df has the same row order -- TODO confirm.
clf = MultinomialNB()
scores = model_selection.cross_val_score(clf,
                                         train_tfidf_lemmatized_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()   # spread of F1 across the folds
print(mean_score, '+/-', stability)

# Record the result. DataFrame.append was removed in pandas 2.0, so the row is
# added with pd.concat, which behaves the same on old and new pandas.
score_row = pd.DataFrame({'Model' : ['MultinomialNB'],
                          'Vectorizer' : ['TfidfVectorizer'],
                          'Text_Treatment' : ['Lemmatized'],
                          'Mean_F1_Score' : [mean_score],
                          'F1_Std_Dev' : [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index = True)

0.6617217901213649 +/- 0.055086839634722246


In [65]:
# Multinomial Bayes, no stop words: 5-fold cross-validated F1 on TF-IDF features
# built with stop words removed.
clf = MultinomialNB()
scores = model_selection.cross_val_score(clf,
                                         train_tfidf_no_stops_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()   # spread of F1 across the folds
print(mean_score, '+/-', stability)

# Record the result. DataFrame.append was removed in pandas 2.0, so the row is
# added with pd.concat, which behaves the same on old and new pandas.
score_row = pd.DataFrame({'Model' : ['MultinomialNB'],
                          'Vectorizer' : ['TfidfVectorizer'],
                          'Text_Treatment' : ['Removed stopwords'],
                          'Mean_F1_Score' : [mean_score],
                          'F1_Std_Dev' : [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index = True)

0.6478170987485605 +/- 0.054678210710489006


In [66]:
# Multinomial Bayes, stemmed, no stop words: 5-fold cross-validated F1 on stemmed
# TF-IDF features built with stop words removed.
# Labels come from train_df; assumes stemmed_train_df has the same row order -- TODO confirm.
clf = MultinomialNB()
scores = model_selection.cross_val_score(clf,
                                         train_tfidf_stemmed_no_stops_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()   # spread of F1 across the folds
print(mean_score, '+/-', stability)

# Record the result. DataFrame.append was removed in pandas 2.0, so the row is
# added with pd.concat, which behaves the same on old and new pandas.
score_row = pd.DataFrame({'Model' : ['MultinomialNB'],
                          'Vectorizer' : ['TfidfVectorizer'],
                          'Text_Treatment' : ['Removed stopwords, stemmed'],
                          'Mean_F1_Score' : [mean_score],
                          'F1_Std_Dev' : [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index = True)

0.6466233450531924 +/- 0.05253238685817721


In [67]:
# Multinomial Bayes, lemmatized, no stop words: 5-fold cross-validated F1 on
# lemmatized TF-IDF features built with stop words removed.
# Labels come from train_df; assumes lemmatized_train_df has the same row order -- TODO confirm.
clf = MultinomialNB()
scores = model_selection.cross_val_score(clf,
                                         train_tfidf_lemmatized_no_stops_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()   # spread of F1 across the folds
print(mean_score, '+/-', stability)

# Record the result. DataFrame.append was removed in pandas 2.0, so the row is
# added with pd.concat, which behaves the same on old and new pandas.
score_row = pd.DataFrame({'Model' : ['MultinomialNB'],
                          'Vectorizer' : ['TfidfVectorizer'],
                          'Text_Treatment' : ['Removed stopwords, lemmatized'],
                          'Mean_F1_Score' : [mean_score],
                          'F1_Std_Dev' : [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index = True)

0.6514290195889771 +/- 0.05372415943963887


#### Support Vector Machine Models
I used cross-validation to determine the parameters for all SVM models.

In [68]:
# Set up parameter grid for GridSearchCV testing

# my_params = {'C': [0.1, 0.3, 0.5, 0.7],
#              'kernel': ['rbf', 'poly', 'sigmoid'],
#              'degree': [2, 3],
#              'gamma' : ['auto', 'scale'],
#              'class_weight' : ['balanced'],
#              'random_state' : [42],
#              'probability' : [False, True],
# #              'shrinking' : [False, True],
#              'coef0' : [1e2, 0.1, 1, 10]}

In [69]:
# GridSearchCV testing to find best parameters for SVM model

# scorer = make_scorer(f1_score)
# gs_clf = GridSearchCV(svm.SVC(),
#                       param_grid = my_params,
#                       scoring = scorer,
#                       verbose = 1,
#                       n_jobs = -1)
# gs_clf.fit(train_tfidf_lemmatized_df, train_df["target"])
# print(gs_clf.best_params_, gs_clf.best_score_)

# results:
# {'C': 0.7,
#  'class_weight': 'balanced',
#  'coef0': 1,
#  'degree': 2,
#  'gamma': 'scale',
#  'kernel': 'sigmoid',
#  'probability': False,
#  'random_state': 42}
# 0.6660730647063914

In [115]:
# LSA -> SVM: CountVectorizer, raw
# Pipeline: truncated SVD down to 100 components, per-sample normalisation,
# then a sigmoid-kernel SVM. Scored with 5-fold cross-validated F1.
svd = decomposition.TruncatedSVD(n_components = 100, random_state = 42)
normalizer = preprocessing.Normalizer()

# NOTE(review): the commented-out grid search earlier in this notebook reported
# C = 0.7 as best; 0.5 is used here -- confirm this is intentional.
clf_svc = svm.SVC(C = 0.5,
                  kernel = 'sigmoid',
                  degree = 2,   # only used by the 'poly' kernel; ignored for 'sigmoid'
                  gamma = 'scale',
                  class_weight = 'balanced',
                  random_state = 42)

pipe = pipeline.make_pipeline(svd, normalizer, clf_svc)

scores = model_selection.cross_val_score(pipe,
                                         train_vector_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()   # spread of F1 across the folds
print(mean_score, '+/-', stability)

# Record the result. The Vectorizer label names both the base vectorizer and
# the LSA step: the previous bare 'LSA' label did not say which features the
# SVD was applied to, so it could not be told apart from LSA over TF-IDF.
# DataFrame.append was removed in pandas 2.0, hence pd.concat.
score_row = pd.DataFrame({'Model' : ['SVM'],
                          'Vectorizer' : ['CountVectorizer + LSA'],
                          'Text_Treatment' : ['None'],
                          'Mean_F1_Score' : [mean_score],
                          'F1_Std_Dev' : [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index = True)

0.6124490891205137 +/- 0.03292917904554958


In [70]:
# LSA -> SVM: TF-IDF, raw
# Pipeline: truncated SVD down to 100 components, per-sample normalisation,
# then a sigmoid-kernel SVM. Scored with 5-fold cross-validated F1.
svd = decomposition.TruncatedSVD(n_components = 100, random_state = 42)
normalizer = preprocessing.Normalizer()

# NOTE(review): the commented-out grid search earlier in this notebook reported
# C = 0.7 as best; 0.5 is used here -- confirm this is intentional.
clf_svc = svm.SVC(C = 0.5,
                  kernel = 'sigmoid',
                  degree = 2,   # only used by the 'poly' kernel; ignored for 'sigmoid'
                  gamma = 'scale',
                  class_weight = 'balanced',
                  random_state = 42)

pipe = pipeline.make_pipeline(svd, normalizer, clf_svc)

scores = model_selection.cross_val_score(pipe,
                                         train_tfidf_df, train_df["target"],
                                         cv=5,
                                         scoring="f1")
mean_score = scores.mean()
stability = scores.std()   # spread of F1 across the folds
print(mean_score, '+/-', stability)

# Record the result. The previous label, plain 'TfidfVectorizer', hid the fact
# that this model runs on SVD-reduced (LSA) features; the label now names both
# the base vectorizer and the LSA step.
# DataFrame.append was removed in pandas 2.0, hence pd.concat.
score_row = pd.DataFrame({'Model' : ['SVM'],
                          'Vectorizer' : ['TfidfVectorizer + LSA'],
                          'Text_Treatment' : ['None'],
                          'Mean_F1_Score' : [mean_score],
                          'F1_Std_Dev' : [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index = True)

0.6422426503451424 +/- 0.027939842727464533


In [71]:
# LSA -> SVM: TF-IDF, stemmed
# Same pipeline as the raw TF-IDF run, applied to the stemmed-text features:
# 100-component truncated SVD -> row normalization -> sigmoid-kernel SVM.
svd = decomposition.TruncatedSVD(n_components = 100, random_state = 42)
normalizer = preprocessing.Normalizer()

# `degree` only affects the 'poly' kernel; scikit-learn ignores it for 'sigmoid'.
clf_svc = svm.SVC(C = 0.5,
                  kernel = 'sigmoid',
                  degree = 2,
                  gamma = 'scale',
                  class_weight = 'balanced',
                  random_state = 42)

pipe = pipeline.make_pipeline(svd, normalizer, clf_svc)

# 5-fold cross-validated F1 for the whole pipeline.
scores = model_selection.cross_val_score(pipe,
                                         train_tfidf_stemmed_df, train_df["target"],
                                         cv = 5,
                                         scoring = "f1")
mean_score = scores.mean()
stability = scores.std()
print(mean_score, '+/-', stability)

# Record the result in the running score table.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the row
# is added with pd.concat, which gives the same table on every pandas version.
score_row = pd.DataFrame({'Model' : ['SVM'],
                          'Vectorizer' : ['LSA'],
                          'Text_Treatment' : ['Stemmed'],
                          'Mean_F1_Score' : [mean_score],
                          'F1_Std_Dev' : [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index = True)

0.6516378542457344 +/- 0.030410924927612156


In [72]:
# LSA -> SVM: TF-IDF, lemmatized
# Same pipeline as the raw TF-IDF run, applied to the lemmatized-text features:
# 100-component truncated SVD -> row normalization -> sigmoid-kernel SVM.
svd = decomposition.TruncatedSVD(n_components = 100, random_state = 42)
normalizer = preprocessing.Normalizer()

# `degree` only affects the 'poly' kernel; scikit-learn ignores it for 'sigmoid'.
clf_svc = svm.SVC(C = 0.5,
                  kernel = 'sigmoid',
                  degree = 2,
                  gamma = 'scale',
                  class_weight = 'balanced',
                  random_state = 42)

pipe = pipeline.make_pipeline(svd, normalizer, clf_svc)

# 5-fold cross-validated F1 for the whole pipeline.
scores = model_selection.cross_val_score(pipe,
                                         train_tfidf_lemmatized_df, train_df["target"],
                                         cv = 5,
                                         scoring = "f1")
mean_score = scores.mean()
stability = scores.std()
print(mean_score, '+/-', stability)

# Record the result in the running score table.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the row
# is added with pd.concat, which gives the same table on every pandas version.
score_row = pd.DataFrame({'Model' : ['SVM'],
                          'Vectorizer' : ['LSA'],
                          'Text_Treatment' : ['Lemmatized'],
                          'Mean_F1_Score' : [mean_score],
                          'F1_Std_Dev' : [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index = True)

0.653453506250657 +/- 0.029915676849887145


In [73]:
# LSA -> SVM: TF-IDF, raw, no stopwords
# Same pipeline as the raw TF-IDF run, applied to the stopword-free features:
# 100-component truncated SVD -> row normalization -> sigmoid-kernel SVM.
svd = decomposition.TruncatedSVD(n_components = 100, random_state = 42)
normalizer = preprocessing.Normalizer()

# `degree` only affects the 'poly' kernel; scikit-learn ignores it for 'sigmoid'.
clf_svc = svm.SVC(C = 0.5,
                  kernel = 'sigmoid',
                  degree = 2,
                  gamma = 'scale',
                  class_weight = 'balanced',
                  random_state = 42)

pipe = pipeline.make_pipeline(svd, normalizer, clf_svc)

# 5-fold cross-validated F1 for the whole pipeline.
scores = model_selection.cross_val_score(pipe,
                                         train_tfidf_no_stops_df, train_df["target"],
                                         cv = 5,
                                         scoring = "f1")
mean_score = scores.mean()
stability = scores.std()
print(mean_score, '+/-', stability)

# Record the result in the running score table.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the row
# is added with pd.concat, which gives the same table on every pandas version.
score_row = pd.DataFrame({'Model' : ['SVM'],
                          'Vectorizer' : ['LSA'],
                          'Text_Treatment' : ['Removed stopwords'],
                          'Mean_F1_Score' : [mean_score],
                          'F1_Std_Dev' : [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index = True)

0.6462697248132283 +/- 0.03027529085221108


In [74]:
# LSA -> SVM: TF-IDF, stemmed, no stopwords
# Same pipeline as the raw TF-IDF run, applied to the stemmed, stopword-free
# features: 100-component truncated SVD -> row normalization -> sigmoid SVM.
svd = decomposition.TruncatedSVD(n_components = 100, random_state = 42)
normalizer = preprocessing.Normalizer()

# `degree` only affects the 'poly' kernel; scikit-learn ignores it for 'sigmoid'.
clf_svc = svm.SVC(C = 0.5,
                  kernel = 'sigmoid',
                  degree = 2,
                  gamma = 'scale',
                  class_weight = 'balanced',
                  random_state = 42)

pipe = pipeline.make_pipeline(svd, normalizer, clf_svc)

# 5-fold cross-validated F1 for the whole pipeline.
scores = model_selection.cross_val_score(pipe,
                                         train_tfidf_stemmed_no_stops_df, train_df["target"],
                                         cv = 5,
                                         scoring = "f1")
mean_score = scores.mean()
stability = scores.std()
print(mean_score, '+/-', stability)

# Record the result in the running score table.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the row
# is added with pd.concat, which gives the same table on every pandas version.
score_row = pd.DataFrame({'Model' : ['SVM'],
                          'Vectorizer' : ['LSA'],
                          'Text_Treatment' : ['Removed stopwords, stemmed'],
                          'Mean_F1_Score' : [mean_score],
                          'F1_Std_Dev' : [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index = True)

0.6382746296233381 +/- 0.02709286556980339


In [75]:
# LSA -> SVM: TF-IDF, lemmatized, no stopwords
# Same pipeline as the raw TF-IDF run, applied to the lemmatized, stopword-free
# features: 100-component truncated SVD -> row normalization -> sigmoid SVM.
svd = decomposition.TruncatedSVD(n_components = 100, random_state = 42)
normalizer = preprocessing.Normalizer()

# `degree` only affects the 'poly' kernel; scikit-learn ignores it for 'sigmoid'.
clf_svc = svm.SVC(C = 0.5,
                  kernel = 'sigmoid',
                  degree = 2,
                  gamma = 'scale',
                  class_weight = 'balanced',
                  random_state = 42)

pipe = pipeline.make_pipeline(svd, normalizer, clf_svc)

# 5-fold cross-validated F1 for the whole pipeline.
scores = model_selection.cross_val_score(pipe,
                                         train_tfidf_lemmatized_no_stops_df, train_df["target"],
                                         cv = 5,
                                         scoring = "f1")
mean_score = scores.mean()
stability = scores.std()
print(mean_score, '+/-', stability)

# Record the result in the running score table.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the row
# is added with pd.concat, which gives the same table on every pandas version.
score_row = pd.DataFrame({'Model' : ['SVM'],
                          'Vectorizer' : ['LSA'],
                          'Text_Treatment' : ['Removed stopwords, lemmatized'],
                          'Mean_F1_Score' : [mean_score],
                          'F1_Std_Dev' : [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index = True)

0.6357309894803729 +/- 0.03266720780230194


#### KNN Models

I also tried a couple of KNN models, but the results weren't promising.

In [76]:
# LSA -> KNN: TF-IDF, no stopwords
# Reduce the stopword-free TF-IDF features to 100 SVD components, rescale each
# row to unit norm (in place, copy = False), then classify with a 5-nearest-
# neighbour vote using brute-force cosine distance.
svd = decomposition.TruncatedSVD(n_components = 100, random_state = 42)
normalizer = preprocessing.Normalizer(copy = False)

clf_knn = KNeighborsClassifier(n_neighbors = 5,
                               algorithm = 'brute',
                               metric = 'cosine')

pipe = pipeline.make_pipeline(svd, normalizer, clf_knn)

# 5-fold cross-validated F1 for the whole pipeline.
scores = model_selection.cross_val_score(pipe,
                                         train_tfidf_no_stops_df, train_df['target'],
                                         cv = 5,
                                         scoring = "f1")
mean_score = scores.mean()
stability = scores.std()
print(mean_score, '+/-', stability)

# Record the result in the running score table.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the row
# is added with pd.concat, which gives the same table on every pandas version.
score_row = pd.DataFrame({'Model' : ['KNN'],
                          'Vectorizer' : ['LSA'],
                          'Text_Treatment' : ['Removed stopwords'],
                          'Mean_F1_Score' : [mean_score],
                          'F1_Std_Dev' : [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index = True)

0.5589305021316724 +/- 0.041042963510106854


In [77]:
# LSA -> KNN: TF-IDF, lemmatized, no stopwords
# Reduce the lemmatized, stopword-free TF-IDF features to 100 SVD components,
# rescale each row to unit norm (in place, copy = False), then classify with a
# 5-nearest-neighbour vote using brute-force cosine distance.
svd = decomposition.TruncatedSVD(n_components = 100, random_state = 42)
normalizer = preprocessing.Normalizer(copy = False)

clf_knn = KNeighborsClassifier(n_neighbors = 5,
                               algorithm = 'brute',
                               metric = 'cosine')

pipe = pipeline.make_pipeline(svd, normalizer, clf_knn)

# 5-fold cross-validated F1 for the whole pipeline.
scores = model_selection.cross_val_score(pipe,
                                         train_tfidf_lemmatized_no_stops_df, train_df['target'],
                                         cv = 5,
                                         scoring = "f1")
mean_score = scores.mean()
stability = scores.std()
print(mean_score, '+/-', stability)

# Record the result in the running score table.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the row
# is added with pd.concat, which gives the same table on every pandas version.
score_row = pd.DataFrame({'Model' : ['KNN'],
                          'Vectorizer' : ['LSA'],
                          'Text_Treatment' : ['Removed stopwords, lemmatized'],
                          'Mean_F1_Score' : [mean_score],
                          'F1_Std_Dev' : [stability]})
scoring_df = pd.concat([scoring_df, score_row], ignore_index = True)

0.5416760287898673 +/- 0.04708368449040268


---

### Table of Model Scores
This table is sorted by the mean F1 score of each model, to help me select the final model(s) for submission to the contest.

In [78]:
# Best mean F1 first
scoring_df.sort_values('Mean_F1_Score', ascending=False)

Unnamed: 0,Model,Vectorizer,Text_Treatment,Mean_F1_Score,F1_Std_Dev
9,MultinomialNB,CountVectorizer,Lemmatized,0.683867,0.04188
8,MultinomialNB,CountVectorizer,Stemmed,0.679516,0.045142
7,MultinomialNB,CountVectorizer,,0.674957,0.038878
16,LogisticRegressionCV,TfidfVectorizer,Lemmatized,0.674197,0.040377
15,LogisticRegression,TfidfVectorizer,Lemmatized,0.667557,0.041092
14,LogisticRegression,TfidfVectorizer,Stemmed,0.664906,0.045689
22,MultinomialNB,TfidfVectorizer,Lemmatized,0.661722,0.055087
13,LogisticRegression,TfidfVectorizer,,0.66117,0.04249
12,MultinomialNB,CountVectorizer,"Removed stopwords, lemmatized",0.660735,0.043151
21,MultinomialNB,TfidfVectorizer,Stemmed,0.660165,0.0601


In [79]:
# save scoring for future reference
# scoring_df.to_csv('../data/first_round_scores.csv', index = False)

---

### Final models
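I used the best-scoring configuration of each of three model types (LogisticRegressionCV, Multinomial Naive Bayes, and LSA -> SVM, all on lemmatized text) to create submissions for the Kaggle contest.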

In [80]:
# Final models - LogRegCV, lemmatized
# Re-check the 5-fold F1 of the class-balanced LogisticRegressionCV on the
# lemmatized TF-IDF features before using it for a submission.
clf_lr = LogisticRegressionCV(class_weight='balanced')

scores = model_selection.cross_val_score(
    clf_lr,
    train_tfidf_lemmatized_df,
    train_df["target"],
    scoring="f1",
    cv=5,
)
mean_score, stability = scores.mean(), scores.std()
print(mean_score, '+/-', stability)

0.6741968290118585 +/- 0.04037677659541278


In [81]:
# Restore the pickled vectorizer, fit it on the lemmatized training text and
# encode the test tweets with the resulting vocabulary.
# NOTE(review): test_df is defined outside this section — confirm its text was
# lemmatized the same way as lemmatized_train_df.
vectorizer = pickle.loads(tf_idf_pickled)
test_tfidf_df = vectorizer.fit(lemmatized_train_df['text']).transform(test_df['text'])

In [82]:
# Sanity check: the column count here must match test_tfidf_df for prediction to work.
train_tfidf_lemmatized_df.shape

(7502, 5578)

In [83]:
# Sanity check: the column count here must match train_tfidf_lemmatized_df.
test_tfidf_df.shape

(3263, 5578)

In [84]:
# Train on the full lemmatized TF-IDF training matrix, then label the test tweets.
lr_preds = clf_lr.fit(train_tfidf_lemmatized_df, train_df["target"]).predict(test_tfidf_df)
lr_preds

array([1, 1, 1, ..., 1, 1, 0])

In [85]:
# Final models - Multinomial Bayes, lemmatized
# this was the best model (Kaggle score: 0.80777)
# Re-check the 5-fold F1 of MultinomialNB on the lemmatized count features
# before using it for a submission.
clf_mnb = MultinomialNB()

scores = model_selection.cross_val_score(
    clf_mnb,
    train_vector_lemma_df,
    train_df["target"],
    scoring="f1",
    cv=5,
)
mean_score, stability = scores.mean(), scores.std()
print(mean_score, '+/-', stability)

0.6838672747769288 +/- 0.04188014881745761


In [86]:
# Shape of the lemmatized count-vector training matrix (rows, vocabulary size).
train_vector_lemma_df.shape

(7502, 68135)

In [87]:
# Restore the pickled vectorizer, fit it on the lemmatized training text and
# encode the test tweets with the resulting vocabulary.
# NOTE(review): test_df is defined outside this section — confirm its text was
# lemmatized the same way as lemmatized_train_df.
vectorizer = pickle.loads(vector_pickled)
test_vector_df = vectorizer.fit(lemmatized_train_df['text']).transform(test_df['text'])

In [88]:
# Sanity check: the column count here must match test_vector_df for prediction to work.
train_vector_lemma_df.shape

(7502, 68135)

In [89]:
# Sanity check: the column count here must match train_vector_lemma_df.
test_vector_df.shape

(3263, 68135)

In [90]:
# Train on the full lemmatized count-vector training matrix, then label the test tweets.
mnb_preds = clf_mnb.fit(train_vector_lemma_df, train_df["target"]).predict(test_vector_df)
mnb_preds

array([1, 1, 1, ..., 1, 1, 1])

In [113]:
# Final models -  LSA -> SVM, lemmatized
# Pipeline: 100-component truncated SVD -> row normalization -> sigmoid SVM,
# scored with 5-fold cross-validated F1 on the lemmatized TF-IDF features.
svd = decomposition.TruncatedSVD(random_state=42, n_components=100)
normalizer = preprocessing.Normalizer()

clf_svc = svm.SVC(
    C=0.7,
    kernel='sigmoid',
    degree=2,
    gamma='auto',
    class_weight='balanced',
    random_state=42,
)

pipe = pipeline.make_pipeline(svd, normalizer, clf_svc)

scores = model_selection.cross_val_score(
    pipe,
    train_tfidf_lemmatized_df,
    train_df["target"],
    scoring="f1",
    cv=5,
)
mean_score, stability = scores.mean(), scores.std()
print(mean_score, '+/-', stability)
# Scores noted from earlier runs with other settings:
# C = 0.7 : 0.6500167132804899 +/- 0.02834759351127173
# C = 0.5 : 0.6546081437077995 +/- 0.03106318759242463
# C = 0.5, gamma = 'auto' : 0.6515706143093366 +/- 0.01981111029829437

0.6407234003477653 +/- 0.02511603509552794


In [92]:
# Restore the pickled vectorizer, fit it on the lemmatized training text and
# encode the test tweets with the resulting vocabulary.
# NOTE(review): test_df is defined outside this section — confirm its text was
# lemmatized the same way as lemmatized_train_df.
vectorizer = pickle.loads(tf_idf_pickled)
test_tfidf_df = vectorizer.fit(lemmatized_train_df['text']).transform(test_df['text'])

In [93]:
# Sanity check: the column count here must match test_tfidf_df for prediction to work.
train_tfidf_lemmatized_df.shape

(7502, 5578)

In [94]:
# Sanity check: the column count here must match train_tfidf_lemmatized_df.
test_tfidf_df.shape

(3263, 5578)

In [95]:
# Fit and predict with the full LSA -> SVM pipeline (`pipe`: TruncatedSVD ->
# Normalizer -> SVC) that was cross-validated above. Fitting the bare `clf_svc`
# on the TF-IDF matrix would skip the SVD and normalization steps, so the
# submitted predictions would come from a different model than the one scored.
pipe.fit(train_tfidf_lemmatized_df, train_df["target"])
svc_preds = pipe.predict(test_tfidf_df)
svc_preds

array([1, 1, 1, ..., 1, 1, 0])

---

### Create submission file and validate.

In [96]:
# submission for Logistic Regression predictions
# model_sub = pd.read_csv('../data/sample_submission.csv')
# model_sub['target'] = lr_preds
# model_sub.to_csv('../data/lr_prediction_submission.csv', index = False)

In [97]:
# submission for Multinomial Naive Bayes predictions
# this one got the best Kaggle score: 0.80777
# model_sub = pd.read_csv('../data/sample_submission.csv')
# model_sub['target'] = mnb_preds
# model_sub.to_csv('../data/mnb_prediction_submission.csv', index = False)

In [98]:
# submission for SVM predictions
# model_sub = pd.read_csv('../data/sample_submission.csv')
# model_sub['target'] = svc_preds
# model_sub.to_csv('../data/svc_prediction_submission.csv', index = False)

In [99]:
# Read a saved submission back in and show its first and last rows.
# NOTE(review): 'prediction_submission.csv' does not match any file name written
# by the submission cells above (lr_/mnb_/svc_prediction_submission.csv) —
# confirm the path before re-enabling this cell.
# check_sub = pd.read_csv('../data/prediction_submission.csv')
# check_sub.head().append(check_sub.tail())

In [100]:
# check_sub['target'].value_counts()

### Next Steps
There are still some things I can try in order to improve these models:
* Vary the number of components in the LSA models
* Tune the hyperparameters of all models more thoroughly

I would also like to test TensorFlow and BERT in a Kaggle notebook with the GPU turned on to see how they perform on this problem.

---

Attempting to use TensorFlow & BERT

The cells below are commented out because they take too long to run on my laptop; I will try running them in a Kaggle notebook with a GPU.

In [101]:
# bert_encode: turn an iterable of texts into three equal-length integer arrays,
# returned as a tuple of NumPy arrays (token ids, padding masks, segment ids).
# Each text is tokenized, truncated to max_len - 2 tokens so that [CLS] and
# [SEP] fit, converted to ids and zero-padded up to max_len. The mask is 1 for
# real tokens and 0 for padding; segment ids are all 0.
# def bert_encode(texts, tokenizer, max_len=512):
#     all_tokens = []
#     all_masks = []
#     all_segments = []
    
#     for text in texts:
#         text = tokenizer.tokenize(text)
            
#         text = text[:max_len-2]
#         input_sequence = ["[CLS]"] + text + ["[SEP]"]
#         pad_len = max_len - len(input_sequence)
        
#         tokens = tokenizer.convert_tokens_to_ids(input_sequence)
#         tokens += [0] * pad_len
#         pad_masks = [1] * len(input_sequence) + [0] * pad_len
#         segment_ids = [0] * max_len
        
#         all_tokens.append(tokens)
#         all_masks.append(pad_masks)
#         all_segments.append(segment_ids)
    
#     return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [102]:
# build_model: wrap a BERT Keras layer in a binary classifier. The three int32
# inputs (word ids, mask, segment ids) each have length max_len; the sequence
# output at position 0 feeds a single sigmoid unit, and the model is compiled
# with Adam and binary cross-entropy, tracking accuracy.
# NOTE(review): `Adam(lr=...)` — newer Keras releases name this argument
# `learning_rate`; confirm against the installed TensorFlow version.
# def build_model(bert_layer, max_len=512):
#     input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
#     input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
#     segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

#     _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
#     clf_output = sequence_output[:, 0, :]
#     out = Dense(1, activation='sigmoid')(clf_output)
    
#     model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
#     model.compile(Adam(lr=2e-6), loss='binary_crossentropy', metrics=['accuracy'])
    
#     return model

In [103]:
# %%time
# module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
# bert_layer = hub.KerasLayer(module_url, trainable=True)

In [104]:
# vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
# do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
# tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [105]:
# train_input = bert_encode(train_df.text.values, tokenizer, max_len=160)
# test_input = bert_encode(test_df.text.values, tokenizer, max_len=160)
# train_labels = train_df.target.values

In [106]:
# model = build_model(bert_layer, max_len=160)
# model.summary()

In [107]:
# train_history = model.fit(train_input, train_labels,
#                           validation_split=0.2,
#                           epochs=5)

In [108]:
# test_pred = model.predict(test_input)

In [109]:
# Round the predicted probabilities to 0/1 labels and write the submission file.
# NOTE(review): `submission` is not defined in the visible cells of this section
# — presumably it should be read from the sample submission CSV first; confirm.
# submission['target'] = test_pred.round().astype(int)
# submission.to_csv('disaster_tweet_submission.csv', index=False)