In [28]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier

In [3]:
# Read the cleaned training data from disk and preview its first five rows.
train = pd.read_csv(filepath_or_buffer='./data/train_clean.csv')
train.head(n=5)

Unnamed: 0,id,keyword,location,text,target,text_nourl,keywords_stemmed
0,1,earthquake,,our deeds are the reason of this #earthquake m...,1,our deeds are the reason of this #earthquake m...,earthquak
1,4,forest fire,,forest fire near la ronge sask. canada,1,forest fire near la ronge sask. canada,forest fir
2,5,evacuation,,all residents asked to 'shelter in place' are ...,1,all residents asked to 'shelter in place' are ...,evacu
3,6,wildfire,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive #wildfires evacuation or...",wildfir
4,7,wildfire,,just got sent this photo from ruby #alaska as ...,1,just got sent this photo from ruby #alaska as ...,wildfir


In [9]:
# Features are the 'text_nourl' and 'keywords_stemmed' columns; the label is 'target'.
X = train[['text_nourl', 'keywords_stemmed']]
y = train['target']

# Hold out 25% of the rows, stratified on the label, with a fixed seed so the
# split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.25,
    stratify=y,
    random_state=42,
)

In [10]:
# Baseline: token counts of the stemmed keyword fed to multinomial naive Bayes.
# `tfidf` is only created here; the following cell fits it.
cvec, tfidf = CountVectorizer(), TfidfVectorizer()

X_train_cv = cvec.fit_transform(X_train.loc[:, 'keywords_stemmed'])

# fit() returns the estimator itself, so the cell still displays its repr.
mnb_cv = MultinomialNB().fit(X_train_cv, y_train)
mnb_cv

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [11]:
# Same baseline as the previous cell, but with TF-IDF weights instead of raw counts.
X_train_tfidf = tfidf.fit_transform(X_train.loc[:, 'keywords_stemmed'])

# fit() returns the estimator itself, so the cell still displays its repr.
mnb_tf = MultinomialNB().fit(X_train_tfidf, y_train)
mnb_tf

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [17]:
# inspiration from https://stackoverflow.com/questions/38555650/try-multiple-estimator-in-one-grid-search

# The dicts hold estimator *classes*, not instances. Pipeline fits the objects
# it is handed in place, so each pipeline must get its own fresh vectorizer and
# classifier. With one shared instance per entry, the tfidf pass would refit
# the very classifiers already stored in the cvec rows, leaving those stored
# pipelines with a classifier trained on the wrong features.
vects = {
    'cvec' : CountVectorizer,
    'tfidf' : TfidfVectorizer
}
models = {
    'mnb' : MultinomialNB,
    'knn' : KNeighborsClassifier,
    'logreg' : LogisticRegression,
    'dectree' : DecisionTreeClassifier,
    'svc' : SVC,
    'randomforest' : RandomForestClassifier,
    'adaboost' : AdaBoostClassifier
}

# Fit every vectorizer/classifier pairing on the stemmed keywords and record
# its .score() on the train and test splits.
# The list is called `modeling_results` (not `modeling`) so it cannot shadow,
# or be shadowed by, the `modeling` function defined later in this notebook.
modeling_results = []
for vec in vects:
    for mode in models:
        # Instantiate here so no fitted state is shared between pipelines.
        mod = Pipeline([(vec, vects[vec]()),
                        (mode, models[mode]())])
        curmod = mod.fit(X_train['keywords_stemmed'], y_train)
        out = {
            'name' : f'{vec} + {mode}',
            'model' : curmod,
            'train_score' : curmod.score(X_train['keywords_stemmed'],
                                         y_train),
            'test_score' : curmod.score(X_test['keywords_stemmed'],
                                        y_test)
        }
        modeling_results.append(out)
all_mods_keywords = pd.DataFrame(modeling_results)

In [29]:
def modeling(X, y):
    """Fit every vectorizer/classifier pairing with default settings and score it.

    The data is split 75/25 (stratified on ``y``, ``random_state=42``). Each
    pairing is wrapped in a two-step ``Pipeline`` and fitted on the training
    part.

    Parameters
    ----------
    X : sequence of str (e.g. a pandas Series)
        Raw documents handed directly to the vectorizer.
    y : array-like
        Class labels aligned with ``X``.

    Returns
    -------
    pandas.DataFrame
        One row per pairing with columns ``name`` ("<vectorizer> + <model>"),
        ``model`` (the fitted Pipeline), ``train_score`` and ``test_score``
        (the pipeline's ``.score()`` on each split).
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=42,
                                                        test_size=.25,
                                                        stratify=y)
    # Classes rather than instances: Pipeline fits its steps in place, so a
    # single shared classifier instance would be refit by the tfidf pass and
    # silently overwrite the classifier inside the already-returned cvec
    # pipelines. A fresh instance per pipeline keeps every stored model intact.
    vects = {
        'cvec' : CountVectorizer,
        'tfidf' : TfidfVectorizer
    }
    models = {
        'mnb' : MultinomialNB,
        'knn' : KNeighborsClassifier,
        'logreg' : LogisticRegression,
        'dectree' : DecisionTreeClassifier,
        'svc' : SVC,
        'randomforest' : RandomForestClassifier,
        'adaboost' : AdaBoostClassifier,
        'mlpclass' : MLPClassifier
    }

    outp = []
    for vec, make_vec in vects.items():
        for mode, make_model in models.items():
            mod = Pipeline([(vec, make_vec()),
                            (mode, make_model())])
            curmod = mod.fit(X_train, y_train)
            outp.append({
                'name' : f'{vec} + {mode}',
                'model' : curmod,
                'train_score' : curmod.score(X_train, y_train),
                'test_score' : curmod.score(X_test, y_test)
            })
    return pd.DataFrame(outp)

In [30]:
# Score every default vectorizer/classifier pairing on the 'text_nourl' column.
models_text = modeling(X=X['text_nourl'], y=y)
models_text

Unnamed: 0,name,model,train_score,test_score
0,cvec + mnb,"(CountVectorizer(analyzer='word', binary=False...",0.904186,0.8125
1,cvec + knn,"(CountVectorizer(analyzer='word', binary=False...",0.740235,0.679097
2,cvec + logreg,"(CountVectorizer(analyzer='word', binary=False...",0.965668,0.808298
3,cvec + dectree,"(CountVectorizer(analyzer='word', binary=False...",0.990016,0.747899
4,cvec + svc,"(CountVectorizer(analyzer='word', binary=False...",0.945349,0.804622
5,cvec + randomforest,"(CountVectorizer(analyzer='word', binary=False...",0.989841,0.797794
6,cvec + adaboost,"(CountVectorizer(analyzer='word', binary=False...",0.759503,0.753151
7,cvec + mlpclass,"(CountVectorizer(analyzer='word', binary=False...",0.990016,0.76208
8,tfidf + mnb,"(TfidfVectorizer(analyzer='word', binary=False...",0.88632,0.815126
9,tfidf + knn,"(TfidfVectorizer(analyzer='word', binary=False...",0.830443,0.778361


In [31]:
# Gap between train and test score for each pairing; smallest gap listed first.
models_text['delta'] = models_text['train_score'].sub(models_text['test_score'])
models_text.sort_values(by='delta')

Unnamed: 0,name,model,train_score,test_score,delta
6,cvec + adaboost,"(CountVectorizer(analyzer='word', binary=False...",0.759503,0.753151,0.006351
14,tfidf + adaboost,"(TfidfVectorizer(analyzer='word', binary=False...",0.761955,0.741071,0.020883
9,tfidf + knn,"(TfidfVectorizer(analyzer='word', binary=False...",0.830443,0.778361,0.052082
1,cvec + knn,"(CountVectorizer(analyzer='word', binary=False...",0.740235,0.679097,0.061138
10,tfidf + logreg,"(TfidfVectorizer(analyzer='word', binary=False...",0.885444,0.819328,0.066116
8,tfidf + mnb,"(TfidfVectorizer(analyzer='word', binary=False...",0.88632,0.815126,0.071194
0,cvec + mnb,"(CountVectorizer(analyzer='word', binary=False...",0.904186,0.8125,0.091686
4,cvec + svc,"(CountVectorizer(analyzer='word', binary=False...",0.945349,0.804622,0.140728
12,tfidf + svc,"(TfidfVectorizer(analyzer='word', binary=False...",0.966369,0.814076,0.152293
2,cvec + logreg,"(CountVectorizer(analyzer='word', binary=False...",0.965668,0.808298,0.15737


In [57]:
# Hyper-parameter grids for GridSearchCV. Keys are '<pipeline step>__<param>'.
#
# Every *_params value is a LIST of dicts so the grids can be concatenated with
# `+`. GridSearchCV treats each dict as a separate grid: parameters that sit in
# different dicts are varied one at a time (the others stay at their defaults),
# not in combination.
vec_params = [
    {'vec__ngram_range' : [(1, 1), (1, 2)]},
    {'vec__stop_words' : [None, 'english']}
]
lr_params = [
    # The 'l1' penalty is not supported by LogisticRegression's default solver,
    # so every 'l1' candidate would fail to fit. Pair it with liblinear, which
    # does support it, so the candidate is actually evaluated.
    {'logreg__penalty' : ['l1'], 'logreg__solver' : ['liblinear']},
    # NOTE(review): 'none' is the string spelling of "no penalty"; newer
    # scikit-learn releases expect None instead - confirm against the
    # installed version.
    {'logreg__penalty' : ['l2', 'none']},
    {'logreg__C' : np.linspace(0.01, 1, 5)}
]
knn_params = [
    {'knn__n_neighbors' : [5, 9, 15]},
    {'knn__p' : [1, 2]}
]
ada_params = [
    {'adaboost__n_estimators' : [10, 50, 100]}
]
rf_params = [
    {'rf__n_estimators' : [10, 50, 100, 200]},
    {'rf__max_depth' : [None, 5, 25, 50]},
    # 'sqrt' replaces 'auto': assumed to be the same setting for a classifier
    # and still accepted by newer scikit-learn releases - confirm.
    {'rf__max_features' : ['sqrt', 20, 50, 80]}
]

sklearn.linear_model._logistic.LogisticRegression

In [60]:
%%time
def modl(X, y):
    """Grid-search each vectorizer/classifier pipeline and report the winners.

    The data is split 75/25 (stratified on ``y``, ``random_state=42``). For
    every pairing a 5-fold ``GridSearchCV`` is run on the training part using
    the module-level grids (``vec_params`` plus the classifier's own grid), and
    the refitted best estimator is scored on both splits.

    Parameters
    ----------
    X : sequence of str (e.g. a pandas Series)
        Raw documents handed directly to the vectorizer.
    y : array-like
        Class labels aligned with ``X``.

    Returns
    -------
    pandas.DataFrame
        One row per pairing with columns ``name``, ``model`` (the search's
        ``best_estimator_``), ``train_score`` and ``test_score``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=.25,
        stratify=y,
        random_state=42,
    )

    # Both vectorizers are registered under the step name 'vec' so that the
    # 'vec__*' keys in vec_params apply to whichever one is in the pipeline.
    vectorizers = {
        'cvec': CountVectorizer(),
        'tfidf': TfidfVectorizer(),
    }
    # classifier step name -> (estimator, its parameter grids)
    classifiers = {
        'knn': (KNeighborsClassifier(), knn_params),
        'logreg': (LogisticRegression(), lr_params),
        'rf': (RandomForestClassifier(), rf_params),
        'adaboost': (AdaBoostClassifier(), ada_params),
    }

    rows = []
    for vec_name, vectorizer in vectorizers.items():
        for clf_name, (classifier, clf_grid) in classifiers.items():
            pipeline = Pipeline([('vec', vectorizer), (clf_name, classifier)])
            search = GridSearchCV(
                pipeline,
                vec_params + clf_grid,
                cv=5,
                n_jobs=-1,
            )
            search.fit(X_train, y_train)
            best = search.best_estimator_
            rows.append({
                'name': f'{vec_name} + {clf_name}',
                'model': best,
                'train_score': best.score(X_train, y_train),
                'test_score': best.score(X_test, y_test),
            })
    return pd.DataFrame(rows)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.72 µs


In [61]:
# Grid-search every pipeline using only the stemmed keyword as input.
models_keys_gs = modl(X=X['keywords_stemmed'], y=y)
models_keys_gs

Unnamed: 0,name,model,train_score,test_score
0,cvec + knn,"(CountVectorizer(analyzer='word', binary=False...",0.722719,0.727941
1,cvec + logreg,"(CountVectorizer(analyzer='word', binary=False...",0.72955,0.733718
2,cvec + rf,"(CountVectorizer(analyzer='word', binary=False...",0.729725,0.734769
3,cvec + adaboost,"(CountVectorizer(analyzer='word', binary=False...",0.722719,0.735819
4,tfidf + knn,"(TfidfVectorizer(analyzer='word', binary=False...",0.725171,0.732143
5,tfidf + logreg,"(TfidfVectorizer(analyzer='word', binary=False...",0.72955,0.735819
6,tfidf + rf,"(TfidfVectorizer(analyzer='word', binary=False...",0.729725,0.73687
7,tfidf + adaboost,"(TfidfVectorizer(analyzer='word', binary=False...",0.725871,0.733718


In [62]:
# Gap between train and test score for each tuned keyword pipeline.
models_keys_gs['delta'] = models_keys_gs['train_score'].sub(models_keys_gs['test_score'])
models_keys_gs

Unnamed: 0,name,model,train_score,test_score,delta
0,cvec + knn,"(CountVectorizer(analyzer='word', binary=False...",0.722719,0.727941,-0.005223
1,cvec + logreg,"(CountVectorizer(analyzer='word', binary=False...",0.72955,0.733718,-0.004169
2,cvec + rf,"(CountVectorizer(analyzer='word', binary=False...",0.729725,0.734769,-0.005044
3,cvec + adaboost,"(CountVectorizer(analyzer='word', binary=False...",0.722719,0.735819,-0.013101
4,tfidf + knn,"(TfidfVectorizer(analyzer='word', binary=False...",0.725171,0.732143,-0.006972
5,tfidf + logreg,"(TfidfVectorizer(analyzer='word', binary=False...",0.72955,0.735819,-0.006269
6,tfidf + rf,"(TfidfVectorizer(analyzer='word', binary=False...",0.729725,0.73687,-0.007145
7,tfidf + adaboost,"(TfidfVectorizer(analyzer='word', binary=False...",0.725871,0.733718,-0.007847


In [63]:
# Row label 6 is the 'tfidf + rf' pipeline in the table displayed above.
best_model = models_keys_gs.at[6, 'model']

In [67]:
# Grid-search every pipeline on the 'text_nourl' column and add the train/test gap.
models_text_gs = modl(X=X['text_nourl'], y=y)
models_text_gs['delta'] = models_text_gs['train_score'].sub(models_text_gs['test_score'])
models_text_gs

Unnamed: 0,name,model,train_score,test_score,delta
0,cvec + knn,"(CountVectorizer(analyzer='word', binary=False...",0.748642,0.68645,0.062193
1,cvec + logreg,"(CountVectorizer(analyzer='word', binary=False...",0.913295,0.813025,0.10027
2,cvec + rf,"(CountVectorizer(analyzer='word', binary=False...",0.990016,0.794118,0.195898
3,cvec + adaboost,"(CountVectorizer(analyzer='word', binary=False...",0.798739,0.771008,0.02773
4,tfidf + knn,"(TfidfVectorizer(analyzer='word', binary=False...",0.807497,0.785189,0.022308
5,tfidf + logreg,"(TfidfVectorizer(analyzer='word', binary=False...",0.885444,0.819328,0.066116
6,tfidf + rf,"(TfidfVectorizer(analyzer='word', binary=False...",0.990016,0.799895,0.190121
7,tfidf + adaboost,"(TfidfVectorizer(analyzer='word', binary=False...",0.801191,0.769958,0.031233


In [71]:
# Start a side-by-side score table from the keyword results. Selecting the
# columns and renaming both produce new frames, so models_keys_gs is untouched.
scores = models_keys_gs[['name', 'train_score', 'test_score']].rename(
    columns={'train_score' : 'keywords_train', 'test_score' : 'keywords_test'}
)
scores

Unnamed: 0,name,keywords_train,keywords_test
0,cvec + knn,0.722719,0.727941
1,cvec + logreg,0.72955,0.733718
2,cvec + rf,0.729725,0.734769
3,cvec + adaboost,0.722719,0.735819
4,tfidf + knn,0.725171,0.732143
5,tfidf + logreg,0.72955,0.735819
6,tfidf + rf,0.729725,0.73687
7,tfidf + adaboost,0.725871,0.733718


In [77]:
# Row label 6 is the 'tfidf + rf' pipeline in the text grid-search table.
best_model_tx = models_text_gs.at[6, 'model']
# Predict on the held-out rows of the notebook-level split.
preds = best_model_tx.predict(X_test.loc[:, 'text_nourl'])

In [73]:
# Append the text-model scores next to the keyword-model scores (aligned on row label).
scores = scores.assign(
    text_train=models_text_gs['train_score'],
    text_test=models_text_gs['test_score'],
)
scores

Unnamed: 0,name,keywords_train,keywords_test,text_train,text_test
0,cvec + knn,0.722719,0.727941,0.748642,0.68645
1,cvec + logreg,0.72955,0.733718,0.913295,0.813025
2,cvec + rf,0.729725,0.734769,0.990016,0.794118
3,cvec + adaboost,0.722719,0.735819,0.798739,0.771008
4,tfidf + knn,0.725171,0.732143,0.807497,0.785189
5,tfidf + logreg,0.72955,0.735819,0.885444,0.819328
6,tfidf + rf,0.729725,0.73687,0.990016,0.799895
7,tfidf + adaboost,0.725871,0.733718,0.801191,0.769958


In [None]:
# Tune a TF-IDF + random-forest pipeline on the 'text_nourl' column.
#
# Pipeline steps must be (name, estimator) tuples, not dicts. The step names
# chosen here ('vec', 'rf') are the prefixes the grid keys below rely on.
pipe_tf = Pipeline([
    ('vec', TfidfVectorizer()),
    ('rf', RandomForestClassifier())
])
# A list of dicts: GridSearchCV searches each dict as its own grid, so these
# parameters are varied one at a time rather than in combination.
pipe_params = [
    {'vec__ngram_range' : [(1, 1), (1, 2)]},
    {'vec__stop_words' : [None, 'english']},
    {'rf__n_estimators' : [10, 50, 100, 200]},
    {'rf__max_depth' : [None, 5, 25, 50]},
    # 'sqrt' replaces 'auto': assumed to be the same setting for a classifier
    # and still accepted by newer scikit-learn releases - confirm.
    {'rf__max_features' : ['sqrt', 20, 50, 80]}
]
gs_rf = GridSearchCV(pipe_tf,
                     pipe_params,
                     cv=7,
                     n_jobs=-1)
gs_rf.fit(X_train['text_nourl'], y_train)
print(gs_rf.best_score_)
gs_rf_best = gs_rf.best_estimator_

In [78]:
# Read the cleaned test data from disk and preview its first five rows.
test = pd.read_csv(filepath_or_buffer='./data/test_clean.csv')
test.head(n=5)

Unnamed: 0,id,keyword,location,text,target,text_nourl,keywords_stemmed
0,1,earthquake,,our deeds are the reason of this #earthquake m...,1,our deeds are the reason of this #earthquake m...,earthquak
1,4,forest fire,,forest fire near la ronge sask. canada,1,forest fire near la ronge sask. canada,forest fir
2,5,evacuation,,all residents asked to 'shelter in place' are ...,1,all residents asked to 'shelter in place' are ...,evacu
3,6,wildfire,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive #wildfires evacuation or...",wildfir
4,7,wildfire,,just got sent this photo from ruby #alaska as ...,1,just got sent this photo from ruby #alaska as ...,wildfir


In [81]:
# Predict labels for the test file and write an id/target submission CSV.
preds = best_model_tx.predict(test['text_nourl'])
pred_1 = pd.DataFrame({'id': test['id'], 'target': preds})
pred_1.to_csv('./data/submissions/rftext.csv', index=False)