# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [44]:
# Read the posts dataset, then discard every row that contains a missing value
df = pd.read_csv('../data/posts.csv')
df = df.dropna()

In [3]:
# Display the loaded DataFrame
df

Unnamed: 0,post,class
0,billy butcherson back hocus pocus doug jones s...,0
1,go,1
2,lost city open domestically,1
3,thirsty go watch video,0
4,wanted see imax dolby digital theyre showing s...,0
...,...,...
19182,dc film next cross domestically opening weekend,1
19183,first image cillian murphy j robert oppenheime...,0
19184,stunt mean stunt pulled actor sense owni know ...,1
19185,maybe thats play,1


In [4]:
# Preview the first rows of the data
df.head()  # class labels: 0 == movies, 1 == boxoffice

Unnamed: 0,post,class
0,billy butcherson back hocus pocus doug jones s...,0
1,go,1
2,lost city open domestically,1
3,thirsty go watch video,0
4,wanted see imax dolby digital theyre showing s...,0


In [24]:
# Empty table that collects one row of accuracy scores per trial
# (columns ending in _c / _t hold the two pipelines' train and test scores)
naive_res = pd.DataFrame(
    columns=[
        'train_score_c',
        'test_score_c',
        'train_score_t',
        'test_score_t',
    ]
)


In [26]:
# function to add results

def results(train_c, test_c, train_t, test_t):
    """Append one row of scores to the module-level ``naive_res`` table.

    Each score is rounded to two decimals and the four values are stored,
    in argument order, as a new row whose label is the current row count.

    Parameters
    ----------
    train_c, test_c : float
        Scores written to the first two columns of ``naive_res``.
    train_t, test_t : float
        Scores written to the last two columns of ``naive_res``.

    Returns
    -------
    pandas.DataFrame
        The same ``naive_res`` object, now including the new row.
    """
    scores = (train_c, test_c, train_t, test_t)
    next_row = len(naive_res)
    naive_res.loc[next_row] = [round(score, 2) for score in scores]
    return naive_res
                  
                  

# Data Splitting

In [49]:
# Features come from the 'post' column; the target is the 'class' column
X, y = df['post'], df['class']

In [60]:
# Share of each class in the target; the larger share serves as the baseline score
y.value_counts(normalize=True) # baseline score

1    0.505924
0    0.494076
Name: class, dtype: float64

In [50]:
# Stratified hold-out split (default test fraction) with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [9]:
# Two Naive Bayes pipelines that differ only in the vectorizer step:
# pipe1 -> CountVectorizer ('cvec'), pipe2 -> TfidfVectorizer ('tfidf')
pipe1 = Pipeline(steps=[('cvec', CountVectorizer()), ('model', MultinomialNB())])
pipe2 = Pipeline(steps=[('tfidf', TfidfVectorizer()), ('model', MultinomialNB())])

In [10]:
# Empty parameter grid: a search over it evaluates a single candidate,
# i.e. the pipeline exactly as it was constructed
params = dict()

# Trial 1

In [11]:
# Trial 1 search for the CountVectorizer pipeline (pipe1), 5-fold cross-validation
gs1c = GridSearchCV(
    estimator=pipe1,
    param_grid=params,
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [12]:
# Fit the Trial 1 CountVectorizer search on the training split
gs1c.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('model', MultinomialNB())]),
             n_jobs=-1, param_grid={}, verbose=1)

In [14]:
# Trial 1 search for the TfidfVectorizer pipeline (pipe2), 5-fold cross-validation
gs1t = GridSearchCV(
    estimator=pipe2,
    param_grid=params,
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [15]:
# Fit the Trial 1 TfidfVectorizer search on the training split
gs1t.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('model', MultinomialNB())]),
             n_jobs=-1, param_grid={}, verbose=1)

In [27]:
# Record Trial 1 train/test scores for both searches
# (every execution of this cell appends another row to naive_res)
results(
    train_c=gs1c.score(X_train, y_train),
    test_c=gs1c.score(X_test, y_test),
    train_t=gs1t.score(X_train, y_train),
    test_t=gs1t.score(X_test, y_test),
)

Unnamed: 0,train_score_c,test_score_c,train_score_t,test_score_t
0,0.85,0.76,0.87,0.76


* The model and the vectorizers used their default parameters
* Test accuracy (0.76 for both vectorizers) is well above the baseline of about 0.51
* The model overfits the training data (train 0.85 / 0.87 vs. test 0.76)

# Trial 2

In [17]:
# Trial 2 grid for the CountVectorizer pipeline
params1 = {
    'cvec__max_features': [1500, 2000, 2500],
    'cvec__max_df': [0.8, 0.7],
    'cvec__binary': [True],
    'cvec__ngram_range': [(1, 1), (1, 2), (2, 2)],
    # 100 smoothing values spaced logarithmically from 1 down to 1e-9
    'model__alpha': np.logspace(0, -9, num=100),
}

# Trial 2 grid for the TfidfVectorizer pipeline (same values, 'tfidf' step)
params2 = {
    'tfidf__max_features': [1500, 2000, 2500],
    'tfidf__max_df': [0.8, 0.7],
    'tfidf__binary': [True],
    'tfidf__ngram_range': [(1, 1), (1, 2), (2, 2)],
    # 100 smoothing values spaced logarithmically from 1 down to 1e-9
    'model__alpha': np.logspace(0, -9, num=100),
}



In [18]:
# Trial 2 searches, 10-fold cross-validation: gs2c tunes pipe1, gs2t tunes pipe2
gs2c = GridSearchCV(
    estimator=pipe1,
    param_grid=params1,
    cv=10,
    n_jobs=-1,
    verbose=1,
)
gs2t = GridSearchCV(
    estimator=pipe2,
    param_grid=params2,
    cv=10,
    n_jobs=-1,
    verbose=1,
)

In [19]:
# Fit the Trial 2 CountVectorizer search on the training split
gs2c.fit(X_train, y_train)

Fitting 10 folds for each of 1800 candidates, totalling 18000 fits


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('model', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'cvec__binary': [True], 'cvec__max_df': [0.8, 0.7],
                         'cvec__max_features': [1500, 2000, 2500],
                         'cvec__ngram_range': [(1, 1), (1, 2), (2, 2)],
                         'model__alpha': array([1.00000000e+00, 8.11130831e-01, 6.57933225e-01, 5.33669923e-01,
       4.32876128e-01, 3.5111...
       1.23284674e-07, 1.00000000e-07, 8.11130831e-08, 6.57933225e-08,
       5.33669923e-08, 4.32876128e-08, 3.51119173e-08, 2.84803587e-08,
       2.31012970e-08, 1.87381742e-08, 1.51991108e-08, 1.23284674e-08,
       1.00000000e-08, 8.11130831e-09, 6.57933225e-09, 5.33669923e-09,
       4.32876128e-09, 3.51119173e-09, 2.84803587e-09, 2.31012970e-09,
       1.87381742e-09, 1.51991108e-09, 1.23284674e-09, 1.00000000e-09])},
             verbose=1)

In [22]:
# Fit the Trial 2 TfidfVectorizer search on the training split
gs2t.fit(X_train, y_train)

Fitting 10 folds for each of 1800 candidates, totalling 18000 fits


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('model', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'model__alpha': array([1.00000000e+00, 8.11130831e-01, 6.57933225e-01, 5.33669923e-01,
       4.32876128e-01, 3.51119173e-01, 2.84803587e-01, 2.31012970e-01,
       1.87381742e-01, 1.51991108e-01, 1.23284674e-01, 1.00000000e-01,
       8.11130831e-02, 6.57933225...
       2.31012970e-08, 1.87381742e-08, 1.51991108e-08, 1.23284674e-08,
       1.00000000e-08, 8.11130831e-09, 6.57933225e-09, 5.33669923e-09,
       4.32876128e-09, 3.51119173e-09, 2.84803587e-09, 2.31012970e-09,
       1.87381742e-09, 1.51991108e-09, 1.23284674e-09, 1.00000000e-09]),
                         'tfidf__binary': [True], 'tfidf__max_df': [0.8, 0.7],
                         'tfidf__max_features': [1500, 2000, 2500],
                         'tfidf__ngram_range': [(1, 1), (1, 2), (2, 2)]},
             verbose

In [28]:
# Record Trial 2 train/test scores for both searches
# (every execution of this cell appends another row to naive_res)
results(
    train_c=gs2c.score(X_train, y_train),
    test_c=gs2c.score(X_test, y_test),
    train_t=gs2t.score(X_train, y_train),
    test_t=gs2t.score(X_test, y_test),
)

Unnamed: 0,train_score_c,test_score_c,train_score_t,test_score_t
0,0.85,0.76,0.87,0.76
1,0.77,0.75,0.78,0.75


In [29]:
# Parameter combination selected by the Trial 2 CountVectorizer search
gs2c.best_params_

{'cvec__binary': True,
 'cvec__max_df': 0.8,
 'cvec__max_features': 2500,
 'cvec__ngram_range': (1, 1),
 'model__alpha': 0.02848035868435802}

In [30]:
# Parameter combination selected by the Trial 2 TfidfVectorizer search
gs2t.best_params_

{'model__alpha': 1.0,
 'tfidf__binary': True,
 'tfidf__max_df': 0.8,
 'tfidf__max_features': 2500,
 'tfidf__ngram_range': (1, 1)}

* Test accuracy decreased slightly from the previous trial (0.76 → 0.75)
* Both the model and the vectorizers were tuned
* The model overfits the training data less than before (train 0.77 / 0.78 vs. test 0.75)

# Trial 3

In [40]:
# Trial 3 grid for the CountVectorizer pipeline
params1 = {
    'cvec__max_features': [2000, 2500, 3000, 3500],
    'cvec__max_df': [0.8, 0.7, 0.9, 1.0],
    'cvec__binary': [True, False],
    'cvec__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'model__alpha': [0.001, 0.01, 0.02, 0.03],
    'model__fit_prior': [True, False],
}

# Trial 3 grid for the TfidfVectorizer pipeline (different alpha values)
params2 = {
    'tfidf__max_features': [2000, 2500, 3000, 3500],
    'tfidf__max_df': [0.8, 0.7, 0.9, 1.0],
    'tfidf__binary': [True, False],
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'model__alpha': [0.8, 0.9, 1.0],
    'model__fit_prior': [True, False],
}



In [41]:
# Trial 3 searches, 5-fold cross-validation: gs3c tunes pipe1, gs3t tunes pipe2
gs3c = GridSearchCV(
    estimator=pipe1,
    param_grid=params1,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
gs3t = GridSearchCV(
    estimator=pipe2,
    param_grid=params2,
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [42]:
# Fit the Trial 3 CountVectorizer search on the training split
gs3c.fit(X_train, y_train)

Fitting 5 folds for each of 768 candidates, totalling 3840 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('model', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'cvec__binary': [True, False],
                         'cvec__max_df': [0.8, 0.7, 0.9, 1.0],
                         'cvec__max_features': [2000, 2500, 3000, 3500],
                         'cvec__ngram_range': [(1, 1), (1, 2), (1, 3)],
                         'model__alpha': [0.001, 0.01, 0.02, 0.03],
                         'model__fit_prior': [True, False]},
             verbose=1)

In [43]:
# Fit the Trial 3 TfidfVectorizer search on the training split
gs3t.fit(X_train, y_train)

Fitting 5 folds for each of 576 candidates, totalling 2880 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('model', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'model__alpha': [0.8, 0.9, 1.0],
                         'model__fit_prior': [True, False],
                         'tfidf__binary': [True, False],
                         'tfidf__max_df': [0.8, 0.7, 0.9, 1.0],
                         'tfidf__max_features': [2000, 2500, 3000, 3500],
                         'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)]},
             verbose=1)

In [45]:
# Record Trial 3 train/test scores for both searches
# (every execution of this cell appends another row to naive_res)
results(
    train_c=gs3c.score(X_train, y_train),
    test_c=gs3c.score(X_test, y_test),
    train_t=gs3t.score(X_train, y_train),
    test_t=gs3t.score(X_test, y_test),
)

Unnamed: 0,train_score_c,test_score_c,train_score_t,test_score_t
0,0.85,0.76,0.87,0.76
1,0.77,0.75,0.78,0.75
2,0.8,0.76,0.81,0.76


In [46]:
# Parameter combination selected by the Trial 3 CountVectorizer search
gs3c.best_params_

{'cvec__binary': False,
 'cvec__max_df': 0.8,
 'cvec__max_features': 3500,
 'cvec__ngram_range': (1, 1),
 'model__alpha': 0.03,
 'model__fit_prior': False}

In [47]:
# Parameter combination selected by the Trial 3 TfidfVectorizer search
gs3t.best_params_

{'model__alpha': 1.0,
 'model__fit_prior': False,
 'tfidf__binary': False,
 'tfidf__max_df': 0.8,
 'tfidf__max_features': 3500,
 'tfidf__ngram_range': (1, 1)}

* Test accuracy increased slightly, back to the Trial 1 score (0.76)
* The model overfits the training data more than in Trial 2 (train 0.80 / 0.81 vs. test 0.76)

# Trial 4

In [48]:
# Trial 4 grid for the CountVectorizer pipeline
# NOTE(review): float min_df values are read by the vectorizer as a proportion
# of documents (0.1 -> a term must occur in 10% of posts), not as a count.
# Confirm this is intended; integer counts may have been meant.
params1 = {
    'cvec__max_features': [3000, 3500, 4000, 5000],
    'cvec__min_df': [0.1, 0.2, 0.01],
    'cvec__binary': [True, False],
    'cvec__ngram_range': [(1, 1)],
    'model__alpha': [0.02, 0.03, 0.04, 0.05],
    'model__fit_prior': [True, False],
}

# Trial 4 grid for the TfidfVectorizer pipeline (same min_df caveat applies)
params2 = {
    'tfidf__max_features': [3000, 3500, 4000, 5000],
    'tfidf__min_df': [0.1, 0.2, 0.01],
    'tfidf__binary': [True, False],
    'tfidf__ngram_range': [(1, 1)],
    'model__alpha': [0.8, 0.9, 0.6],
    'model__fit_prior': [True, False],
}



In [51]:
# Trial 4 searches, 5-fold cross-validation: gs4c tunes pipe1, gs4t tunes pipe2
gs4c = GridSearchCV(
    estimator=pipe1,
    param_grid=params1,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
gs4t = GridSearchCV(
    estimator=pipe2,
    param_grid=params2,
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [53]:
# Fit the Trial 4 CountVectorizer search on the training split
gs4c.fit(X_train, y_train)

Fitting 5 folds for each of 192 candidates, totalling 960 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('model', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'cvec__binary': [True, False],
                         'cvec__max_features': [3000, 3500, 4000, 5000],
                         'cvec__min_df': [0.1, 0.2, 0.01],
                         'cvec__ngram_range': [(1, 1)],
                         'model__alpha': [0.02, 0.03, 0.04, 0.05],
                         'model__fit_prior': [True, False]},
             verbose=1)

In [54]:
# Fit the Trial 4 TfidfVectorizer search on the training split
gs4t.fit(X_train, y_train)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('model', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'model__alpha': [0.8, 0.9, 0.6],
                         'model__fit_prior': [True, False],
                         'tfidf__binary': [True, False],
                         'tfidf__max_features': [3000, 3500, 4000, 5000],
                         'tfidf__min_df': [0.1, 0.2, 0.01],
                         'tfidf__ngram_range': [(1, 1)]},
             verbose=1)

In [55]:
# Record Trial 4 train/test scores for both searches
# (every execution of this cell appends another row to naive_res)
results(
    train_c=gs4c.score(X_train, y_train),
    test_c=gs4c.score(X_test, y_test),
    train_t=gs4t.score(X_train, y_train),
    test_t=gs4t.score(X_test, y_test),
)


Unnamed: 0,train_score_c,test_score_c,train_score_t,test_score_t
0,0.85,0.76,0.87,0.76
1,0.77,0.75,0.78,0.75
2,0.8,0.76,0.81,0.76
3,0.7,0.69,0.71,0.7


I choose the 3rd trial as the best of the four: its test accuracy (0.76) ties Trial 1 for the highest, while its gap between train and test accuracy (0.04–0.05) is much smaller than Trial 1's (0.09–0.11), so the model overfits the training data less.