# Modeling: Naive Bayes

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from eda_functions import split_data

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

## Attack

In [2]:
# Read the cleaned Attack comments, indexed by revision id
attack_csv = '../data/attack_clean.csv'
df_attack = pd.read_csv(attack_csv, index_col='rev_id')

# Column dtypes and non-null counts, followed by a three-row preview
df_attack.info()
df_attack.head(n=3)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 115563 entries, 37675 to 699897151
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   comment  115563 non-null  object
 1   target   115563 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.6+ MB


Unnamed: 0_level_0,comment,target
rev_id,Unnamed: 1_level_1,Unnamed: 2_level_1
37675,This is not creative Those are the dictionar...,0
44816,the term standard model is itself less NPOV...,0
49851,True or false the situation as of March 2002...,0


In [3]:
# Build the train/test sets with the project's split_data helper.
# Downsampling balances the classes in the training data only (50% positive here).
X_train, X_test, y_train, y_test = split_data(
    data=df_attack, train_size=15_000, test_size=5_000, pct_positive=0.5
)

In [4]:
# Sanity check: sample sizes and the share of each class in both splits
n_train = y_train.shape[0]
n_test = y_test.shape[0]
train_share = y_train.value_counts(normalize=True)
test_share = y_test.value_counts(normalize=True)

pd.DataFrame({
    f'Train (n={n_train})': train_share,
    f'Test (n={n_test})': test_share,
})

Unnamed: 0,Train (n=15000),Test (n=5000)
0,0.5,0.8672
1,0.5,0.1328


In [5]:
# Baseline: default bag-of-words counts fed into multinomial Naive Bayes
steps = [
    ('cvec', CountVectorizer()),
    ('nb', MultinomialNB()),
]
pipe = Pipeline(steps=steps)
pipe.fit(X_train, y_train)

# Mean accuracy on each split, rounded to four decimals
train_accuracy = pipe.score(X_train, y_train)
test_accuracy = pipe.score(X_test, y_test)
print('Train: ', round(train_accuracy, 4))
print('Test: ', round(test_accuracy, 4))

Train:  0.892
Test:  0.8906


In [6]:
# Search a few word-level CountVectorizer settings to see if any beat the baseline
stop_word_options = [None, 'english']
word_ngram_ranges = [(1, 1), (1, 2), (2, 2)]
params = {
    'cvec__stop_words': stop_word_options,
    'cvec__ngram_range': word_ngram_ranges,
}

# 3-fold cross-validation over every combination in the grid
gs = GridSearchCV(estimator=pipe, param_grid=params, cv=3)

In [7]:
%%time
# Run the word-level grid search on the Attack training split.
# %%time reports the wall time; the fitted GridSearchCV is shown as the cell result.
gs.fit(X_train, y_train)

Wall time: 1min 6s


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('nb', MultinomialNB())]),
             param_grid={'cvec__ngram_range': [(1, 1), (1, 2), (2, 2)],
                         'cvec__stop_words': [None, 'english']})

In [8]:
# Report the best mean cross-validated score and the settings that produced it
best_score, best_params = gs.best_score_, gs.best_params_
print(f'Best score of {best_score}')
print(f'with params {best_params}')

Best score of 0.8328000000000001
with params {'cvec__ngram_range': (1, 1), 'cvec__stop_words': None}


In [9]:
# The word-level search settled on the default settings.
# Next: character n-grams confined to word boundaries (char_wb), sizes 2 through 5.
char_ngram_ranges = [(n, n) for n in range(2, 6)]
params = {
    'cvec__analyzer': ['char_wb'],
    'cvec__ngram_range': char_ngram_ranges,
}

gs = GridSearchCV(estimator=pipe, param_grid=params, cv=3)

In [10]:
%%time
# Run the character n-gram grid search on the Attack training split.
# %%time reports the wall time; the fitted GridSearchCV is shown as the cell result.
gs.fit(X_train, y_train)

Wall time: 1min 23s


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('nb', MultinomialNB())]),
             param_grid={'cvec__analyzer': ['char_wb'],
                         'cvec__ngram_range': [(2, 2), (3, 3), (4, 4), (5, 5)]})

In [11]:
# Report the best mean cross-validated score and the settings that produced it
best_score, best_params = gs.best_score_, gs.best_params_
print(f'Best score of {best_score}')
print(f'with params {best_params}')

Best score of 0.8191333333333333
with params {'cvec__analyzer': 'char_wb', 'cvec__ngram_range': (5, 5)}


In [12]:
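# Best Naive Bayes model for Attack: the default CountVectorizer + MultinomialNB pipeline.
# Neither grid search beat it: the word-level search picked the default settings,
# and the character n-gram search scored lower in cross-validation.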

## Aggression

In [13]:
# Read the cleaned Aggression comments, indexed by revision id
aggression_csv = '../data/aggression_clean_data.csv'
df_aggression = pd.read_csv(aggression_csv, index_col='rev_id')

# Column dtypes and non-null counts, followed by a three-row preview
df_aggression.info()
df_aggression.head(n=3)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 115493 entries, 37675 to 699897151
Data columns (total 9 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   comment           115493 non-null  object 
 1   year              115493 non-null  int64  
 2   logged_in         115493 non-null  bool   
 3   ns                115493 non-null  object 
 4   sample            115493 non-null  object 
 5   split             115493 non-null  object 
 6   aggression        115493 non-null  float64
 7   aggression_score  115493 non-null  float64
 8   label             115493 non-null  int64  
dtypes: bool(1), float64(2), int64(2), object(4)
memory usage: 8.0+ MB


Unnamed: 0_level_0,comment,year,logged_in,ns,sample,split,aggression,aggression_score,label
rev_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
37675,This is not creative Those are the dictionary...,2002,True,article,random,train,0.1,0.0,0
44816,the term standard model is itself less NPOV t...,2002,True,article,random,train,0.0,0.111111,0
49851,True or false the situation as of March 2002 w...,2002,True,article,random,train,0.0,0.1,0


In [14]:
# Rename label --> target, the column name the Attack data already uses.
# Reassign rather than mutate with inplace=True: the result is the same, but the
# step reads as an explicit transformation and stays chainable.
# NOTE(review): presumably split_data expects a `target` column — confirm in eda_functions.
df_aggression = df_aggression.rename(columns={'label': 'target'})

In [15]:
# Build the train/test sets with the project's split_data helper.
# Downsampling balances the classes in the training data only (50% positive here).
X_train, X_test, y_train, y_test = split_data(
    data=df_aggression, train_size=15_000, test_size=5_000, pct_positive=0.5
)

In [16]:
# Sanity check: sample sizes and the share of each class in both splits
n_train = y_train.shape[0]
n_test = y_test.shape[0]
train_share = y_train.value_counts(normalize=True)
test_share = y_test.value_counts(normalize=True)

pd.DataFrame({
    f'Train (n={n_train})': train_share,
    f'Test (n={n_test})': test_share,
})

Unnamed: 0,Train (n=15000),Test (n=5000)
0,0.5,0.8512
1,0.5,0.1488


In [17]:
# Baseline: default bag-of-words counts fed into multinomial Naive Bayes
steps = [
    ('cvec', CountVectorizer()),
    ('nb', MultinomialNB()),
]
pipe = Pipeline(steps=steps)
pipe.fit(X_train, y_train)

# Mean accuracy on each split, rounded to four decimals
train_accuracy = pipe.score(X_train, y_train)
test_accuracy = pipe.score(X_test, y_test)
print('Train: ', round(train_accuracy, 4))
print('Test: ', round(test_accuracy, 4))

Train:  0.8913
Test:  0.869


In [18]:
# Search a few word-level CountVectorizer settings to see if any beat the baseline
stop_word_options = [None, 'english']
word_ngram_ranges = [(1, 1), (1, 2), (2, 2)]
params = {
    'cvec__stop_words': stop_word_options,
    'cvec__ngram_range': word_ngram_ranges,
}

# 3-fold cross-validation over every combination in the grid
gs = GridSearchCV(estimator=pipe, param_grid=params, cv=3)

In [19]:
%%time
# Run the word-level grid search on the Aggression training split.
# %%time reports the wall time; the fitted GridSearchCV is shown as the cell result.
gs.fit(X_train, y_train)

Wall time: 55.9 s


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('nb', MultinomialNB())]),
             param_grid={'cvec__ngram_range': [(1, 1), (1, 2), (2, 2)],
                         'cvec__stop_words': [None, 'english']})

In [20]:
# Report the best mean cross-validated score and the settings that produced it
best_score, best_params = gs.best_score_, gs.best_params_
print(f'Best score of {best_score}')
print(f'with params {best_params}')

Best score of 0.8280666666666666
with params {'cvec__ngram_range': (1, 1), 'cvec__stop_words': None}


In [21]:
# The word-level search settled on the default settings.
# Next: character n-grams confined to word boundaries (char_wb), sizes 2 through 5.
char_ngram_ranges = [(n, n) for n in range(2, 6)]
params = {
    'cvec__analyzer': ['char_wb'],
    'cvec__ngram_range': char_ngram_ranges,
}

gs = GridSearchCV(estimator=pipe, param_grid=params, cv=3)

In [24]:
%%time
# Run the character n-gram grid search on the Aggression training split.
# %%time reports the wall time; the fitted GridSearchCV is shown as the cell result.
gs.fit(X_train, y_train)

Wall time: 1min 28s


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('nb', MultinomialNB())]),
             param_grid={'cvec__analyzer': ['char_wb'],
                         'cvec__ngram_range': [(2, 2), (3, 3), (4, 4), (5, 5)]})

In [25]:
# Report the best mean cross-validated score and the settings that produced it
best_score, best_params = gs.best_score_, gs.best_params_
print(f'Best score of {best_score}')
print(f'with params {best_params}')

Best score of 0.8186
with params {'cvec__analyzer': 'char_wb', 'cvec__ngram_range': (5, 5)}


## Toxicity

In [32]:
# Read the cleaned Toxicity comments, indexed by revision id
toxicity_csv = '../data/toxicity_cleaned.csv'
df_toxicity = pd.read_csv(toxicity_csv, index_col='rev_id')

# Column dtypes and non-null counts, followed by a three-row preview
df_toxicity.info()
df_toxicity.head(n=3)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 159676 entries, 2232 to 699897151
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   comment   159666 non-null  object
 1   toxicity  159676 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 3.7+ MB


Unnamed: 0_level_0,comment,toxicity
rev_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2232,This::One can make an analogy in mathematical ...,0
4216,":Clarification for you (and Zundark's right, ...",0
8953,Elected or Electoral? JHK,0


In [33]:
# Rename column toxicity --> target, the column name the Attack data already uses.
# Reassign rather than mutate with inplace=True: the result is the same, but the
# step reads as an explicit transformation and stays chainable.
# NOTE(review): presumably split_data expects a `target` column — confirm in eda_functions.
df_toxicity = df_toxicity.rename(columns={'toxicity': 'target'})

In [35]:
# Count the missing values in each column
df_toxicity.isnull().sum()

comment    10
target      0
dtype: int64

In [36]:
# Drop the rows that contain a missing value (the NaN check shows 10, all in `comment`).
# Reassign rather than mutate with inplace=True: the result is the same, but the
# step reads as an explicit transformation and stays chainable.
df_toxicity = df_toxicity.dropna()

In [37]:
# Build the train/test sets with the project's split_data helper.
# Downsampling balances the classes in the training data only (50% positive here).
X_train, X_test, y_train, y_test = split_data(
    data=df_toxicity, train_size=15_000, test_size=5_000, pct_positive=0.5
)

In [38]:
# Sanity check: sample sizes and the share of each class in both splits
n_train = y_train.shape[0]
n_test = y_test.shape[0]
train_share = y_train.value_counts(normalize=True)
test_share = y_test.value_counts(normalize=True)

pd.DataFrame({
    f'Train (n={n_train})': train_share,
    f'Test (n={n_test})': test_share,
})

Unnamed: 0,Train (n=15000),Test (n=5000)
0,0.5,0.8886
1,0.5,0.1114


In [39]:
# Baseline: default bag-of-words counts fed into multinomial Naive Bayes
steps = [
    ('cvec', CountVectorizer()),
    ('nb', MultinomialNB()),
]
pipe = Pipeline(steps=steps)
pipe.fit(X_train, y_train)

# Mean accuracy on each split, rounded to four decimals
train_accuracy = pipe.score(X_train, y_train)
test_accuracy = pipe.score(X_test, y_test)
print('Train: ', round(train_accuracy, 4))
print('Test: ', round(test_accuracy, 4))

Train:  0.9058
Test:  0.8864


In [40]:
# Search a few word-level CountVectorizer settings to see if any beat the baseline
stop_word_options = [None, 'english']
word_ngram_ranges = [(1, 1), (1, 2), (2, 2)]
params = {
    'cvec__stop_words': stop_word_options,
    'cvec__ngram_range': word_ngram_ranges,
}

# 3-fold cross-validation over every combination in the grid
gs = GridSearchCV(estimator=pipe, param_grid=params, cv=3)

In [41]:
%%time
# Run the word-level grid search on the Toxicity training split.
# %%time reports the wall time; the fitted GridSearchCV is shown as the cell result.
gs.fit(X_train, y_train)

Wall time: 1min 3s


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('nb', MultinomialNB())]),
             param_grid={'cvec__ngram_range': [(1, 1), (1, 2), (2, 2)],
                         'cvec__stop_words': [None, 'english']})

In [42]:
# Report the best mean cross-validated score and the settings that produced it
best_score, best_params = gs.best_score_, gs.best_params_
print(f'Best score of {best_score}')
print(f'with params {best_params}')

Best score of 0.8515999999999999
with params {'cvec__ngram_range': (1, 1), 'cvec__stop_words': None}


In [43]:
# The word-level search settled on the default settings.
# Next: character n-grams confined to word boundaries (char_wb), sizes 2 through 5.
char_ngram_ranges = [(n, n) for n in range(2, 6)]
params = {
    'cvec__analyzer': ['char_wb'],
    'cvec__ngram_range': char_ngram_ranges,
}

gs = GridSearchCV(estimator=pipe, param_grid=params, cv=3)

In [46]:
%%time
# Run the character n-gram grid search on the Toxicity training split.
# %%time reports the wall time; the fitted GridSearchCV is shown as the cell result.
gs.fit(X_train, y_train)

Wall time: 1min 30s


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('nb', MultinomialNB())]),
             param_grid={'cvec__analyzer': ['char_wb'],
                         'cvec__ngram_range': [(2, 2), (3, 3), (4, 4), (5, 5)]})

In [47]:
# Report the best mean cross-validated score and the settings that produced it
best_score, best_params = gs.best_score_, gs.best_params_
print(f'Best score of {best_score}')
print(f'with params {best_params}')

Best score of 0.8386
with params {'cvec__analyzer': 'char_wb', 'cvec__ngram_range': (5, 5)}


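## Summary

Accuracy of the default `CountVectorizer` + `MultinomialNB` pipeline on each dataset. The training sets are balanced 50/50, while the test sets are imbalanced (about 87%, 85% and 89% negative for Attack, Aggression and Toxicity), so the test scores should be read against those majority-class baselines.

|           | Attack | Aggression | Toxicity |
| --------- | ------ | ---------- | -------- |
| **Train** | 0.89   | 0.89       | 0.91     |
| **Test**  | 0.89   | 0.87       | 0.89     |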