# 4.modeling

Label tweets as 0 and 1 based on key words to train classification models.

## 4.1 Label tweets based on key words

In [1]:
# read data
%store -r disaster_tweets
%store -r self_defined_stop_words
%store -r final_disaster_words

import pandas as pd
import numpy as np
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, LogisticRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction import stop_words 
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer 
from sklearn.metrics import mean_squared_error, f1_score, confusion_matrix
from imblearn.over_sampling import RandomOverSampler
from collections import Counter


In [2]:
disaster_tweets.reset_index(inplace=True, drop=True)

In [3]:
# we tried self defined words to label tweets as well
# but our final_disaster_words list works better

# words=[
#  'need shelter',
#  'help me',
#  'needhelp',
#  'help us',
#  'pleasehelp',
#  'casualties',
#  'assistance',
#  'need help',
#  'send help',
#  'please help',
#  'shelter',
#  'emergency']

words = final_disaster_words

disaster_tweet_index = []
words = final_disaster_words

for i in words:
    for x in range(len(disaster_tweets['text'])):
        if i in disaster_tweets['text'][x]:
            disaster_tweet_index.append(x)
print(f'Number of tweets related to disaster is {len(disaster_tweet_index)}')
print(f'Number of overall tweets is {len(disaster_tweets)}')


Number of tweets related to disaster is 942
Number of overall tweets is 1397


In [4]:
# label tweets as 0 or 1
disaster_tweets['label'] = 0
disaster_tweets['label'][disaster_tweet_index] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


## 4.2 Build models

In [5]:
X = disaster_tweets['text']
y = disaster_tweets['label']

# resampling to create a balanced class
ros = RandomOverSampler(random_state=0) # Instantiate a random oversampler in order to oversample our training set
X_resampled, y_resampled = ros.fit_resample(X.values.reshape(-1, 1), y) # Fit that oversampler to our X_sc (scaled) and y data
print(sorted(Counter(y_resampled).items())) # Show the balance btw classes

[(0, 834), (1, 834)]


In [6]:
# train test data split in training set
# test data here is used to evaluate
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state=42)

In [7]:
# count vectorizer words
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        tokenizer = RegexpTokenizer('(?u)\\b\\w\\w+\\b')
        return [self.wnl.lemmatize(t, 'v') for t in tokenizer.tokenize(doc)]

In [8]:
from sklearn.feature_extraction import text 
stop_words = text.ENGLISH_STOP_WORDS.union(self_defined_stop_words)
vectorizer = CountVectorizer(tokenizer = LemmaTokenizer(),
                            preprocessor = None,
                            stop_words = stop_words,
                            max_features = 1500,
                            ngram_range= (1,2),
                            analyzer = 'word', 
                            min_df=3) 
vectorizer.fit(X_train.ravel())
X_train = vectorizer.transform(X_train.ravel())
X_test = vectorizer.transform(X_test.ravel())
X_train_df = pd.DataFrame(X_train.toarray(), columns=vectorizer.get_feature_names())
X_test_df = pd.DataFrame(X_test.toarray(), columns=vectorizer.get_feature_names())
y_train_df = pd.DataFrame(y_train)


  'stop_words.' % sorted(inconsistent))


In [9]:
# use best params to build classification model
lr_class = LogisticRegression(penalty='l1', C=40, solver='liblinear')
# knn_class = KNeighborsClassifier(n_neighbors=3, p=4, leaf_size=10, metric='minkowski')
tree_class = DecisionTreeClassifier(max_features='auto', min_samples_leaf=3, min_samples_split=4, random_state=100)
bag_class = BaggingClassifier(bootstrap=False, max_features=8, max_samples=100, n_estimators=100, random_state=100)
forest_class = RandomForestClassifier(bootstrap=True, max_leaf_nodes=None, min_samples_leaf=3, min_samples_split=8, n_estimators=9, random_state=100)
ada_class = AdaBoostClassifier(learning_rate=0.78, n_estimators=100, random_state=100)
svc = SVC(degree=8, C=2.5, gamma=0.1, kernel='poly', random_state=100)
grad_class = GradientBoostingClassifier(learning_rate=0.06464371632490062, n_estimators=100, min_samples_leaf=2, min_samples_split=5, max_depth=7)

In [10]:
# use below code to find the best params for each model
# params = {
#     'max_depth' : [5, 6, 7],
#     'min_samples_leaf' : [2, 3],
#     'min_samples_split' : [4, 5, 6],
#     'n_estimators' : [50, 100, 125],
#     "learning_rate" : (np.logspace(-1.6, -1, 20))
# }
# gs = GridSearchCV(
#     GradientBoostingClassifier(),
#     params,
#     cv=3,
#     verbose=1,
#     return_train_score=False,
#     n_jobs=2)
# gs.fit(X_train, y_train)
# print(gs.best_score_)
# print()
# print(gs.best_params_)
# print()
# print(gs.score(X_test, y_test))
# pred = model.predict(X_test)
# f1_score(y_test, pred)

In [11]:
class_models = {
            'lr_class': lr_class, 
            'forest_class': forest_class, 
            'tree_class': tree_class,
            'ada_class': ada_class,            
            'bag_class': bag_class, 
            'svc': svc,
            "grad": grad_class
                } 

In [12]:
y_pred_testc = []
y_pred_trainc = []

for model in class_models.values():
    model.fit(X_train, y_train)
    y_pred_testc.append(model.predict(X_test))
    y_pred_trainc.append(model.predict(X_train))

y_pred_testc_df = pd.DataFrame(y_pred_testc, index=class_models.keys()).T
y_pred_trainc_df = pd.DataFrame(y_pred_trainc, index=class_models.keys()).T
print(y_pred_testc_df.shape)
print(y_pred_trainc_df.shape)

(417, 7)
(1251, 7)


In [13]:
# create summary chart
accuracy = {'train': [], 'test': [], 'F1-train': [], 'F1-test': [], 'true_neg': [], 'fal_pos': [], 'fal_neg': [], 'true_po': []}
for model in class_models.values():
    accuracy['train'].append(model.score(X_train, y_train))
    accuracy['test'].append(model.score(X_test, y_test))
for col in y_pred_testc_df:
    accuracy['F1-train'].append(f1_score(y_train, y_pred_trainc_df[col]))
    accuracy['F1-test'].append(f1_score(y_test, y_pred_testc_df[col]))
for col in y_pred_testc_df:
    accuracy['true_neg'].append(confusion_matrix(y_test, y_pred_testc_df[col])[0][0])
    accuracy['fal_pos'].append(confusion_matrix(y_test, y_pred_testc_df[col])[0][1])
    accuracy['fal_neg'].append(confusion_matrix(y_test, y_pred_testc_df[col])[1][0])
    accuracy['true_po'].append(confusion_matrix(y_test, y_pred_testc_df[col])[1][1])
    
accuracy_df = pd.DataFrame(accuracy, index=class_models.keys())
accuracy_df

Unnamed: 0,train,test,F1-train,F1-test,true_neg,fal_pos,fal_neg,true_po
lr_class,0.998401,0.868106,0.998379,0.874715,170,31,24,192
forest_class,0.901679,0.846523,0.900566,0.848341,174,27,37,179
tree_class,0.884093,0.832134,0.886807,0.84375,158,43,27,189
ada_class,0.926459,0.846523,0.924466,0.843137,181,20,44,172
bag_class,0.59952,0.522782,0.332889,0.167364,198,3,196,20
svc,0.808153,0.633094,0.759036,0.451613,201,0,153,63
grad,0.948841,0.880096,0.947282,0.881517,181,20,30,186


## 4.3 Apply models on new tweets

In [14]:
# read new tweets
campfire_new_tweets = pd.read_csv('./data/campfire_new_tweets.csv')
campfire_new_tweets_2 = pd.read_csv('./data/campfire_new_tweets_2.csv')
carrfire_new_tweets = pd.read_csv('./data/carrfire_new_tweets.csv')

In [15]:
# combine results
new_tweets = pd.concat([campfire_new_tweets,campfire_new_tweets_2,carrfire_new_tweets])

In [16]:
# remove duplicate tweets
new_tweets.drop_duplicates(subset ="text", keep = False, inplace = True)
new_tweets.reset_index(inplace=True, drop=True)

In [17]:
# we have 832 tweets to predict
len(new_tweets)

832

In [18]:
# apply countvectorizer on X_pred
X_pred = new_tweets['text']
X_pred = vectorizer.transform(X_pred.ravel())
X_pred_df = pd.DataFrame(X_pred.toarray(), columns=vectorizer.get_feature_names())


In [19]:
# apply models on X_pred_df
prediction = pd.DataFrame(index=X_pred_df.index)

for (model_name, model) in class_models.items():
    prediction[model_name] = model.predict(X_pred_df)
prediction['total'] = prediction['lr_class'] + prediction['forest_class'] + prediction['tree_class'] + \
        prediction['ada_class'] + prediction['bag_class'] + prediction['svc'] + prediction['grad'] 


In [20]:
# combine all predict together
prediction.sort_values(by='total', ascending=False).head(10)

Unnamed: 0,lr_class,forest_class,tree_class,ada_class,bag_class,svc,grad,total
496,1,1,1,1,1,0,1,6
0,1,1,1,1,0,0,1,5
130,1,1,1,1,0,0,1,5
598,1,1,1,1,0,0,1,5
769,1,1,1,1,0,0,1,5
1,1,1,1,1,0,0,1,5
124,1,1,1,1,0,0,1,5
126,1,1,1,1,0,0,1,5
127,1,1,1,1,0,0,1,5
131,1,1,1,1,0,0,1,5


In [21]:
# number in the array means how many model predicted it as 1
prediction['total'].unique()

array([5, 0, 1, 2, 4, 3, 6])

In [22]:
# number of tweets marked as 1 by 3 or more models
len(prediction[~((prediction['total']==0)|(prediction['total']==1)|(prediction['total']==2))])

148

In [23]:
# index of rows that are predicted as "related to disaster and need help"
index_true= prediction[~((prediction['total']==0)|(prediction['total']==1)|(prediction['total']==1))].index

In [24]:
# quick check of related tweets
new_tweets.iloc[index_true]['text'].head(20)

0     Found on Neal please help... Call AC 949680884...
1     #HappyHolidays from #ParadiseCalifornia help m...
2     We are here to help. #LTMA #LessTalkMoreAction...
5     More pics from @viralwebstudio to go with my l...
6     Yesterday we returned to Butte County to help ...
9     "Hey. Are all the arrangements made? I think J...
11    Huge THANK YOU  to all who #donate #volunteer ...
12    Shaila worked nonstop to help organize. Jamie ...
14    As many of you have seen and donated to our ef...
16    Heartbreak for miles in Chico. Grateful to so ...
17    Been working on this non stop so happy it came...
22    Sunday evening by the fire. @ Oroville East, C...
23    We’re headed to Chico, California tomorrow to ...
25    Paradise, California: January 25, 2019 \nYeste...
26    Mod MOVERS is 30 minutes away from what’s refe...
28    Such a joy to see a group of @challengecsuc st...
37    #Chico!! TONITE 9pm at sierranevadachico big r...
39    #CHICO #CA!! TOMORROW 1/13 we rock the sie

In [25]:
# pull longitute and latitute
index_true= index_true.to_list()
coord_true = new_tweets.iloc[index_true][['latitute','longitute']]

In [26]:
coord_true.head()

Unnamed: 0,latitute,longitute
0,39.7617,-121.609
1,39.7617,-121.609
2,39.7617,-121.609
5,39.7263,-121.8358
6,39.51878,-121.567266


In [27]:
# export coordinates as csv file to plot
coord_true.to_csv('./data/coord_to_plot.csv', index=False)