# Imports

In [11]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
# import data
df = pd.read_csv('../data/posts.csv').dropna()

In [3]:
df.head()  # 0 movies, 1 == boxoffice

Unnamed: 0,post,class
0,billy butcherson back hocus pocus doug jones s...,0
1,go,1
2,lost city open domestically,1
3,thirsty go watch video,0
4,wanted see imax dolby digital theyre showing s...,0


In [5]:
# create a dataframe to store results
res = pd.DataFrame(columns=['train_score_c', 'test_score_c', 'train_score_t', 'test_score_t'])


In [6]:
# function to add results

def results(train_c, test_c, train_t, test_t):
    res.loc[len(res)] = [round(train_c,2), round(test_c,2), round(train_t,2), round(test_t,2)]
    return res

# Data Splitting

In [7]:
# create X and y
X = df['post']
y = df['class']

In [41]:
y.value_counts(normalize=True)

1    0.505869
0    0.494131
Name: class, dtype: float64

In [8]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [12]:
# create pipes, for countvectorizer and tfidfvectorizer
pipe1 = Pipeline([
    ('cvec', CountVectorizer()),
    ('model', DecisionTreeClassifier())
])

pipe2 = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('model', DecisionTreeClassifier())
])

In [13]:
# params
params = {}

# Trial 1

In [15]:
# instantiate gridsearch with two different pipes
gs1c = GridSearchCV(pipe1, param_grid=params, cv=5, verbose=1, n_jobs=-1)
gs1t = GridSearchCV(pipe2, param_grid=params, cv=5, verbose=1, n_jobs=-1)

In [16]:
gs1c.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('model', DecisionTreeClassifier())]),
             n_jobs=-1, param_grid={}, verbose=1)

In [17]:
gs1t.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('model', DecisionTreeClassifier())]),
             n_jobs=-1, param_grid={}, verbose=1)

In [18]:
results(gs1c.score(X_train, y_train), gs1c.score(X_test, y_test), gs1t.score(X_train, y_train), gs1t.score(X_test, y_test))

Unnamed: 0,train_score_c,test_score_c,train_score_t,test_score_t
0,1.0,0.68,1.0,0.69


* Default params were used
* The scores are not that good
* There is too much overfitting on the training data

# Trial 2

In [19]:
params = {
    'model__max_depth': [None, 10, 15],
    'model__min_samples_split': [2, 5, 7],
    'model__min_samples_leaf': [1, 3, 5, 7],
    'model__max_features': [2,3,4,5],
    
}

In [20]:
# instantiate gridsearch with two different pipes
gs2c = GridSearchCV(pipe1, param_grid=params, cv=5, verbose=1, n_jobs=-1)
gs2t = GridSearchCV(pipe2, param_grid=params, cv=5, verbose=1, n_jobs=-1)

In [21]:
gs2c.fit(X_train, y_train)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('model', DecisionTreeClassifier())]),
             n_jobs=-1,
             param_grid={'model__max_depth': [None, 10, 15],
                         'model__max_features': [2, 3, 4, 5],
                         'model__min_samples_leaf': [1, 3, 5, 7],
                         'model__min_samples_split': [2, 5, 7]},
             verbose=1)

In [22]:
gs2t.fit(X_train, y_train)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('model', DecisionTreeClassifier())]),
             n_jobs=-1,
             param_grid={'model__max_depth': [None, 10, 15],
                         'model__max_features': [2, 3, 4, 5],
                         'model__min_samples_leaf': [1, 3, 5, 7],
                         'model__min_samples_split': [2, 5, 7]},
             verbose=1)

In [23]:
results(gs2c.score(X_train, y_train), gs2c.score(X_test, y_test), gs2t.score(X_train, y_train), gs2t.score(X_test, y_test))

Unnamed: 0,train_score_c,test_score_c,train_score_t,test_score_t
0,1.0,0.68,1.0,0.69
1,0.91,0.64,0.96,0.64


In [24]:
gs2c.best_params_

{'model__max_depth': None,
 'model__max_features': 2,
 'model__min_samples_leaf': 1,
 'model__min_samples_split': 7}

In [25]:
gs2t.best_params_

{'model__max_depth': None,
 'model__max_features': 5,
 'model__min_samples_leaf': 1,
 'model__min_samples_split': 7}

* Only the model was tuned
* The score is less than before, no improvement
* The model is still overfitting but slightly less than before especially for the count vectorizer

# Trial 3

In [26]:
params1 = {
    'model__max_depth': [None, 10, 15],
    'model__min_samples_split': [2, 5, 7],
    'model__min_samples_leaf': [1, 3, 5, 7],
    'model__max_features': [2,3,4,5],
    'cvec__max_features': [1500, 2000, 2500],
    'cvec__max_df': [.8, .7],
    'cvec__binary': [True],
    'cvec__ngram_range': [(1,1), (1,2), (2,2)],
    
}

params2 = {
    'model__max_depth': [None, 10, 15],
    'model__min_samples_split': [2, 5, 7],
    'model__min_samples_leaf': [1, 3, 5, 7],
    'model__max_features': [2,3,4,5],
       'tfidf__max_features': [1500, 2000, 2500],
    'tfidf__max_df': [.8, .7],
    'tfidf__binary': [True],
    'tfidf__ngram_range': [(1,1), (1,2), (2,2)],
    
}

In [27]:
# instantiate gridsearch with two different pipes
gs3c = GridSearchCV(pipe1, param_grid=params1, cv=5, verbose=1, n_jobs=-1)
gs3t = GridSearchCV(pipe2, param_grid=params2, cv=5, verbose=1, n_jobs=-1)

In [28]:
gs3c.fit(X_train, y_train)

Fitting 5 folds for each of 2592 candidates, totalling 12960 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('model', DecisionTreeClassifier())]),
             n_jobs=-1,
             param_grid={'cvec__binary': [True], 'cvec__max_df': [0.8, 0.7],
                         'cvec__max_features': [1500, 2000, 2500],
                         'cvec__ngram_range': [(1, 1), (1, 2), (2, 2)],
                         'model__max_depth': [None, 10, 15],
                         'model__max_features': [2, 3, 4, 5],
                         'model__min_samples_leaf': [1, 3, 5, 7],
                         'model__min_samples_split': [2, 5, 7]},
             verbose=1)

In [29]:
gs3t.fit(X_train, y_train)

Fitting 5 folds for each of 2592 candidates, totalling 12960 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('model', DecisionTreeClassifier())]),
             n_jobs=-1,
             param_grid={'model__max_depth': [None, 10, 15],
                         'model__max_features': [2, 3, 4, 5],
                         'model__min_samples_leaf': [1, 3, 5, 7],
                         'model__min_samples_split': [2, 5, 7],
                         'tfidf__binary': [True], 'tfidf__max_df': [0.8, 0.7],
                         'tfidf__max_features': [1500, 2000, 2500],
                         'tfidf__ngram_range': [(1, 1), (1, 2), (2, 2)]},
             verbose=1)

In [30]:
results(gs3c.score(X_train, y_train), gs3c.score(X_test, y_test), gs3t.score(X_train, y_train), gs3t.score(X_test, y_test))

Unnamed: 0,train_score_c,test_score_c,train_score_t,test_score_t
0,1.0,0.68,1.0,0.69
1,0.91,0.64,0.96,0.64
2,0.73,0.69,0.76,0.69


In [31]:
gs3c.best_params_

{'cvec__binary': True,
 'cvec__max_df': 0.7,
 'cvec__max_features': 2000,
 'cvec__ngram_range': (1, 2),
 'model__max_depth': None,
 'model__max_features': 5,
 'model__min_samples_leaf': 3,
 'model__min_samples_split': 5}

In [32]:
gs3t.best_params_

{'model__max_depth': None,
 'model__max_features': 5,
 'model__min_samples_leaf': 3,
 'model__min_samples_split': 7,
 'tfidf__binary': True,
 'tfidf__max_df': 0.8,
 'tfidf__max_features': 2000,
 'tfidf__ngram_range': (1, 2)}

* Both the model as well as the vectorizers were tuned
* The score is better than the previous and similar to the first trial
* The overfitting has greatly reduced

# Trial 4

In [33]:
params1 = {
    'model__max_depth': [None],
    'model__min_samples_split': [4, 5, 7],
    'model__min_samples_leaf': [1,2, 3,4, 5],
    'model__max_features': [2,3,4,5],
    'cvec__max_features': [1500, 2000, 2500],
    'cvec__max_df': [0.6, 0.7],
    'cvec__binary': [True],
    'cvec__ngram_range': [(1,1), (1,2), (1,3)],
    
}

params2 = {
    'model__max_depth': [None],
    'model__min_samples_split': [5, 7, 9, 10],
    'model__min_samples_leaf': [1, 2, 3, 4, 5],
    'model__max_features': [2,3,4,5],
       'tfidf__max_features': [1500, 2000, 2500],
    'tfidf__max_df': [0.8, 0.9],
    'tfidf__binary': [True],
    'tfidf__ngram_range': [(1,1), (1,2), (1,3)],
    
}

In [35]:
# instantiate gridsearch with two different pipes
gs4c = GridSearchCV(pipe1, param_grid=params1, cv=5, verbose=1, n_jobs=-1)
gs4t = GridSearchCV(pipe2, param_grid=params2, cv=5, verbose=1, n_jobs=-1)

In [36]:
gs4c.fit(X_train, y_train)

Fitting 5 folds for each of 1080 candidates, totalling 5400 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('model', DecisionTreeClassifier())]),
             n_jobs=-1,
             param_grid={'cvec__binary': [True], 'cvec__max_df': [0.6, 0.7],
                         'cvec__max_features': [1500, 2000, 2500],
                         'cvec__ngram_range': [(1, 1), (1, 2), (1, 3)],
                         'model__max_depth': [None],
                         'model__max_features': [2, 3, 4, 5],
                         'model__min_samples_leaf': [1, 2, 3, 4, 5],
                         'model__min_samples_split': [4, 5, 7]},
             verbose=1)

In [37]:
gs4t.fit(X_train, y_train)

Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('model', DecisionTreeClassifier())]),
             n_jobs=-1,
             param_grid={'model__max_depth': [None],
                         'model__max_features': [2, 3, 4, 5],
                         'model__min_samples_leaf': [1, 2, 3, 4, 5],
                         'model__min_samples_split': [5, 7, 9, 10],
                         'tfidf__binary': [True], 'tfidf__max_df': [0.8, 0.9],
                         'tfidf__max_features': [1500, 2000, 2500],
                         'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)]},
             verbose=1)

In [38]:
results(gs4c.score(X_train, y_train), gs4c.score(X_test, y_test), gs4t.score(X_train, y_train), gs4t.score(X_test, y_test))

Unnamed: 0,train_score_c,test_score_c,train_score_t,test_score_t
0,1.0,0.68,1.0,0.69
1,0.91,0.64,0.96,0.64
2,0.73,0.69,0.76,0.69
3,0.73,0.69,0.79,0.71


In [39]:
gs4c.best_params_

{'cvec__binary': True,
 'cvec__max_df': 0.7,
 'cvec__max_features': 2500,
 'cvec__ngram_range': (1, 2),
 'model__max_depth': None,
 'model__max_features': 5,
 'model__min_samples_leaf': 2,
 'model__min_samples_split': 5}

In [40]:
gs4t.best_params_

{'model__max_depth': None,
 'model__max_features': 5,
 'model__min_samples_leaf': 2,
 'model__min_samples_split': 5,
 'tfidf__binary': True,
 'tfidf__max_df': 0.9,
 'tfidf__max_features': 2000,
 'tfidf__ngram_range': (1, 1)}

* The score for the countvectorizer is still similar to the previous one whereas the score for the tfidf vectorizer has increased slightly
* The overfitting is still similar to the previous trial
* Out of the models done, this model has not performed as good as the others