# Modeling: Naive Bayes

In [17]:
import pandas as pd
import matplotlib.pyplot as plt
from eda_functions import split_data

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

## Attack

In [5]:
# Load data
df_attack = pd.read_csv('../data/attack_clean.csv', index_col='rev_id')
df_attack.info()
df_attack.head(3)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 115563 entries, 37675 to 699897151
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   comment  115563 non-null  object
 1   target   115563 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.6+ MB


Unnamed: 0_level_0,comment,target
rev_id,Unnamed: 1_level_1,Unnamed: 2_level_1
37675,This is not creative Those are the dictionar...,0
44816,the term standard model is itself less NPOV...,0
49851,True or false the situation as of March 2002...,0


In [38]:
# Split into training/testing
# Use custom function to balance classes by downsampling (training data only)

X_train, X_test, y_train, y_test = split_data(
    data=df_attack,
    test_size=5_000,
    train_size=15_000,
    pct_positive=0.5)

In [39]:
# Check number of observations and class proportions 
pd.DataFrame({
    f'Train (n={y_train.shape[0]})': y_train.value_counts(normalize=True),
    f'Test (n={y_test.shape[0]})': y_test.value_counts(normalize=True)})

Unnamed: 0,Train (n=15000),Test (n=5000)
0,0.5,0.862
1,0.5,0.138


In [40]:
# Start with a basic pipeline and see how it performs
attack_pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('nb', MultinomialNB())
])

attack_pipe.fit(X_train, y_train)

print('Train: ', round(attack_pipe.score(X_train, y_train), 4))
print('Test: ', round(attack_pipe.score(X_test, y_test), 4))

Train:  0.8905
Test:  0.8646


In [21]:
# Grid search over some cvec parameters to see if we can do better
attack_params = {
    'cvec__stop_words': [None, 'english'],
    'cvec__ngram_range': [(1, 1), (1, 2), (2, 2)]                         
}

attack_gs = GridSearchCV(attack_pipe, attack_params, cv=3)

In [22]:
%%time
attack_gs.fit(X_train, y_train)

Wall time: 43.6 s


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('nb', MultinomialNB())]),
             param_grid={'cvec__ngram_range': [(1, 1), (1, 2), (2, 2)],
                         'cvec__stop_words': [None, 'english']})

In [26]:
print(f'Best score of {attack_gs.best_score_}')
print(f'with params {attack_gs.best_params_}')

Best score of 0.836
with params {'cvec__ngram_range': (1, 1), 'cvec__stop_words': None}


In [41]:
# So basically the defaults.
# Let's try this with character ngrams split at word boundaries.
attack_params = {
    'cvec__analyzer': ['char_wb'],
    'cvec__ngram_range': [(2, 2), (3, 3), (4, 4), (5, 5)]
}

attack_gs = GridSearchCV(attack_pipe, attack_params, cv=3)

In [42]:
%%time
attack_gs.fit(X_train, y_train)

Wall time: 1min 3s


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('nb', MultinomialNB())]),
             param_grid={'cvec__analyzer': ['char_wb'],
                         'cvec__ngram_range': [(2, 2), (3, 3), (4, 4), (5, 5)]})

In [45]:
print(f'Best score of {attack_gs.best_score_}')
print(f'with params {attack_gs.best_params_}')

Best score of 0.8285333333333332
with params {'cvec__analyzer': 'char_wb', 'cvec__ngram_range': (5, 5)}


In [46]:
# So what was the best Naive Bayes model? The default.