In [11]:
# Basics
import pandas as pd
import numpy as np
import time

# Modeling toolbox
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler

# NLP specific
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Model imports
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.svm import SVC

In [12]:
train = pd.read_csv('./data/train_clean.csv')
test = pd.read_csv('./data/test_clean.csv')

In [13]:
train.head()

Unnamed: 0,id,keyword,location,text,target,text_nourl,keywords_stemmed
0,1,earthquake,,our deeds are the reason of this #earthquake m...,1,our deeds are the reason of this #earthquake m...,earthquak
1,4,forest fire,,forest fire near la ronge sask. canada,1,forest fire near la ronge sask. canada,forest fir
2,5,evacuation,,all residents asked to 'shelter in place' are ...,1,all residents asked to 'shelter in place' are ...,evacu
3,6,wildfire,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive #wildfires evacuation or...",wildfir
4,7,wildfire,,just got sent this photo from ruby #alaska as ...,1,just got sent this photo from ruby #alaska as ...,wildfir


In [25]:
train.isna().sum()

id                     0
keyword                0
location            2533
text                   0
target                 0
text_nourl             0
keywords_stemmed       0
dtype: int64

In [26]:
train.shape

(7613, 7)

In [27]:
test.shape

(7613, 7)

In [28]:
train.dtypes

id                   int64
keyword             object
location            object
text                object
target               int64
text_nourl          object
keywords_stemmed    object
dtype: object

In [29]:
test.dtypes

id                   int64
keyword             object
location            object
text                object
target               int64
text_nourl          object
keywords_stemmed    object
dtype: object

In [14]:
# scoring function
def score(model, X, y):
    preds = model.predict(X)

    print(f'{model} test score:', np.round(model.score(X, y),2))

    # Get confusion matrix and unravel
    tn, fp, fn, tp = confusion_matrix(y, preds).ravel() # 1-dimension array
    print(f'\nAccuracy: {round((tp+tn)/len(y),3)}')
    print(f'Sensitivity: {round(tp/(tp+fn),3)}')
    print(f'Specificity: {round(tn/(tn+fp),3)}')
    print(f'Precision: {round(tp/(tp+fp),3)}')
    print('------------')

In [15]:
features1 = [
    'keywords_stemmed'
]
features2 = [
    'text_nourl',
    'keywords_stemmed'
]

#### train test split

In [35]:
X = train[features2]
y = train['target']
X_train, X_test, y_train, y_test = train_test_split(X,
                                                   y,
                                                   test_size = 0.25,
                                                   random_state=42,
                                                   stratify = y)


#### Vectorize

In [45]:
cvec = CountVectorizer()
xtrain_cv = cvec.fit_transform(X_train['keywords_stemmed'])
xtest_cv = cvec.transform(X_test['keywords_stemmed'])

tfidf = TfidfVectorizer()
xtrain_tf = tfidf.fit_transform(X_train['keywords_stemmed'])
xtest_tf = tfidf.transform(X_test['keywords_stemmed'])

#### Naive-Bayes model

In [60]:
%%time

mnb_cv = MultinomialNB()
mnb_tf = MultinomialNB()


mnb_cv.fit(xtrain_cv, y_train)
mnb_tf.fit(xtrain_tf, y_train)

score(mnb_cv, xtrain_cv, y_train)
score(mnb_cv, xtest_cv, y_test)
score(mnb_tf, xtrain_tf, y_train)
score(mnb_tf, xtest_tf, y_test)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) test score: 0.73

Accuracy: 0.726
Sensitivity: 0.662
Specificity: 0.774
Precision: 0.688
------------
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) test score: 0.73

Accuracy: 0.729
Sensitivity: 0.659
Specificity: 0.782
Precision: 0.695
------------
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) test score: 0.73

Accuracy: 0.726
Sensitivity: 0.667
Specificity: 0.771
Precision: 0.687
------------
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) test score: 0.73

Accuracy: 0.729
Sensitivity: 0.664
Specificity: 0.778
Precision: 0.693
------------
CPU times: user 23.1 ms, sys: 11 ms, total: 34.1 ms
Wall time: 32.8 ms


#### Support Vector Model

In [50]:
%%time
from sklearn.metrics import accuracy_score
svm_cv = SVC()
svm_tf = SVC()
svm_cv.fit(xtrain_cv, y_train)
svm_tf.fit(xtrain_tf, y_train)
y_pred_svmcv = svm_cv.predict(xtest_cv)
y_pred_svmtf = svm_tf.predict(xtest_tf)
print(accuracy_score(y_test, y_pred_svmcv))
print(accuracy_score(y_test, y_pred_svmtf))

0.7337184873949579
0.7337184873949579
CPU times: user 950 ms, sys: 33.3 ms, total: 983 ms
Wall time: 993 ms


#### Logistic Regressor

In [55]:
logreg_cv = LogisticRegression()
logreg_tf = LogisticRegression()

logreg_cv.fit(xtrain_cv, y_train)
logreg_tf.fit(xtrain_tf, y_train)

score(logreg_cv(xtest_cv, y_test))
score(logreg_tf(xtest_tf, y_test))

TypeError: 'LogisticRegression' object is not callable

In [56]:
from sklearn.neural_network import MLPClassifier

mlp_cv = MLPClassifier()
mlp_tf = MLPClassifier()
mlp_cv.fit(xtrain_cv, y_train)
mlp_tf.fit(xtrain_tf, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [58]:
# score(mlp_cv(xtrain_cv, y_train))
# score(mlp_tf(xtest_cv, y_test))

In [59]:
mlp_cv.score(xtrain_cv, y_train)
mlp_tf.score(xtest_cv, y_test)

0.7373949579831933

Create CSV:

In [None]:
to_submit.to_csv('to_submit_outliers.csv', index=False)

In [61]:
scores = []
scores.append('MNB CVEC train accuracy:' + str(mnb_cv.score(xtrain_cv, y_train)))
scores.append('MNB CVEC test accuracy:' + str(mnb_cv.score(xtest_cv, y_train)))
scores.append('MNB TFIDF train accuracy:' + str(mnb_tf.score(xtrain_tf, y_train)))
scores.append('MNB TFIDF test accuracy:' + str(mnb_tf.score(xtest_tf, y_train)))
print('SVM CVEC test accuracy:' + str(accuracy_score(y_test, y_pred_svmcv)))
print('SVM TFIDF test accuracy:' + str(accuracy_score(y_test, y_pred_svmcv)))

AWS_scores = pd.DataFrame(scores)
# AWS_scores.to_csv('AWS_scores.csv', index=False)


ValueError: Found input variables with inconsistent numbers of samples: [5709, 1904]