# Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
# import data
df = pd.read_csv('../data/posts.csv').dropna()

In [4]:
df.head() # 0 movies, 1 == boxoffice

Unnamed: 0,post,class
0,billy butcherson back hocus pocus doug jones s...,0
1,go,1
2,lost city open domestically,1
3,thirsty go watch video,0
4,wanted see imax dolby digital theyre showing s...,0


In [46]:
# create a dataframe to store results for countvectorizer and tfidf
res = pd.DataFrame(columns=['train_score_c', 'test_score_c', 'train_score_t', 'test_score_t'])


In [30]:
# function to add to results

def results(train_c, test_c, train_t, test_t):
    res.loc[len(res)] = [round(train_c,3), round(test_c, 3), round(train_t,3), round(test_t, 3)]
    return res
                  
                  

# Creating Train and Test Datasets

In [7]:
# create X and y
X = df['post']
y = df['class']

In [56]:
y.value_counts(normalize=True)

1    0.505869
0    0.494131
Name: class, dtype: float64

In [8]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [9]:
# create pipes, for countvectorizer and tfidfvectorizer
pipe1 = Pipeline([
    ('cvec', CountVectorizer()),
    ('model', RandomForestClassifier())
])

pipe2 = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('model', RandomForestClassifier())
])

In [10]:
params = {}

# Trial 1

In [11]:
gs1c = GridSearchCV(pipe1, param_grid=params, verbose=1, n_jobs=-1, cv=10)
gs1t = GridSearchCV(pipe2, param_grid=params, verbose=1, n_jobs=-1, cv=10)

In [12]:
gs1c.fit(X_train, y_train)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('model', RandomForestClassifier())]),
             n_jobs=-1, param_grid={}, verbose=1)

In [13]:
gs1t.fit(X_train, y_train)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('model', RandomForestClassifier())]),
             n_jobs=-1, param_grid={}, verbose=1)

In [47]:
results(gs1c.score(X_train, y_train), gs1c.score(X_test, y_test), gs1t.score(X_train, y_train), gs1t.score(X_test, y_test))


Unnamed: 0,train_score_c,test_score_c,train_score_t,test_score_t
0,0.995,0.738,0.995,0.743


* Default parameters were used
* The model is highly overfitting the training data
* The score is definitely better than the baseline

# Trial 2

In [35]:
params = {
     'model__max_depth': [None, 10, 15],
    'model__min_samples_split': [2, 5, 7],
    'model__min_samples_leaf': [1, 3, 5, 7],
    'model__max_features': [2,3,4,5],
}


In [36]:
gs2c = GridSearchCV(pipe1, param_grid=params, verbose=1, n_jobs=-1, cv=10)
gs2t = GridSearchCV(pipe2, param_grid=params, verbose=1, n_jobs=-1, cv=10)

In [37]:
gs2c.fit(X_train, y_train)

Fitting 10 folds for each of 144 candidates, totalling 1440 fits


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('model', RandomForestClassifier())]),
             n_jobs=-1,
             param_grid={'model__max_depth': [None, 10, 15],
                         'model__max_features': [2, 3, 4, 5],
                         'model__min_samples_leaf': [1, 3, 5, 7],
                         'model__min_samples_split': [2, 5, 7]},
             verbose=1)

In [38]:
gs2t.fit(X_train, y_train)

Fitting 10 folds for each of 144 candidates, totalling 1440 fits


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('model', RandomForestClassifier())]),
             n_jobs=-1,
             param_grid={'model__max_depth': [None, 10, 15],
                         'model__max_features': [2, 3, 4, 5],
                         'model__min_samples_leaf': [1, 3, 5, 7],
                         'model__min_samples_split': [2, 5, 7]},
             verbose=1)

In [48]:
results(gs2c.score(X_train, y_train), gs2c.score(X_test, y_test), gs2t.score(X_train, y_train), gs2t.score(X_test, y_test))


Unnamed: 0,train_score_c,test_score_c,train_score_t,test_score_t
0,0.995,0.738,0.995,0.743
1,0.99,0.768,0.994,0.766


* Only the model was tuned while default params were used for the vectorizers
* The score has improved from the previous trial
* The model does not overfit as much as previously

# Trial 3

In [40]:
params1 = {
     'cvec__max_features': [1500, 2000, 2500],
    'cvec__max_df': [.8, .7],
    'cvec__binary': [True],
    'cvec__lowercase': [True, False],
     'model__max_depth': [None, 5, 10, 15],
    'model__min_samples_split': [2, 5, 7],
    'model__min_samples_leaf': [1, 3, 5, 7],
    'model__max_features': [2,3,4,5],
    'cvec__ngram_range': [(1,1), (1,2), (2,2)],
}

params2 = {
     'tfidf__max_features': [1500, 2000, 2500],
    'tfidf__max_df': [.8, .7],
    'tfidf__binary': [True],
    'tfidf__lowercase': [True, False],
     'model__max_depth': [None, 5, 10, 15],
    'model__min_samples_split': [2, 5, 7],
    'model__min_samples_leaf': [1, 3, 5, 7],
    'model__max_features': [2,3,4,5],
    'tfidf__ngram_range': [(1,1), (1,2), (2,2)],
}

In [51]:
gs3c = GridSearchCV(pipe1, param_grid=params1, verbose=1, n_jobs=-1, cv=5)
gs3t = GridSearchCV(pipe2, param_grid=params2, verbose=1, n_jobs=-1, cv=5)

In [52]:
gs3c.fit(X_train, y_train)

Fitting 5 folds for each of 6912 candidates, totalling 34560 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('model', RandomForestClassifier())]),
             n_jobs=-1,
             param_grid={'cvec__binary': [True],
                         'cvec__lowercase': [True, False],
                         'cvec__max_df': [0.8, 0.7],
                         'cvec__max_features': [1500, 2000, 2500],
                         'cvec__ngram_range': [(1, 1), (1, 2), (2, 2)],
                         'model__max_depth': [None, 5, 10, 15],
                         'model__max_features': [2, 3, 4, 5],
                         'model__min_samples_leaf': [1, 3, 5, 7],
                         'model__min_samples_split': [2, 5, 7]},
             verbose=1)

In [53]:
gs3t.fit(X_train, y_train)

Fitting 5 folds for each of 6912 candidates, totalling 34560 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('model', RandomForestClassifier())]),
             n_jobs=-1,
             param_grid={'model__max_depth': [None, 5, 10, 15],
                         'model__max_features': [2, 3, 4, 5],
                         'model__min_samples_leaf': [1, 3, 5, 7],
                         'model__min_samples_split': [2, 5, 7],
                         'tfidf__binary': [True],
                         'tfidf__lowercase': [True, False],
                         'tfidf__max_df': [0.8, 0.7],
                         'tfidf__max_features': [1500, 2000, 2500],
                         'tfidf__ngram_range': [(1, 1), (1, 2), (2, 2)]},
             verbose=1)

In [54]:
results(gs3c.score(X_train, y_train), gs3c.score(X_test, y_test), gs3t.score(X_train, y_train), gs3t.score(X_test, y_test))


Unnamed: 0,train_score_c,test_score_c,train_score_t,test_score_t
0,0.995,0.738,0.995,0.743
1,0.99,0.768,0.994,0.766
2,0.973,0.775,0.843,0.765


In [55]:
gs3c.best_params_

{'cvec__binary': True,
 'cvec__lowercase': False,
 'cvec__max_df': 0.7,
 'cvec__max_features': 2500,
 'cvec__ngram_range': (1, 1),
 'model__max_depth': None,
 'model__max_features': 3,
 'model__min_samples_leaf': 1,
 'model__min_samples_split': 7}

* The models as well as the vectorizers were tuned
* There isn't that much of a difference in the scores
* the model is less overfit on the train data as compared to the previous trial
* The best model would be the 3rd one which used tfidf vectorizer since the overfitting less and at the same time the accuracy score is high