# Imports

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [3]:
# import data
df = pd.read_csv('../data/posts.csv').dropna()

In [6]:
df.head() # 0 movies, 1 == boxoffice

Unnamed: 0,post,class
0,billy butcherson back hocus pocus doug jones s...,0
1,go,1
2,lost city open domestically,1
3,thirsty go watch video,0
4,wanted see imax dolby digital theyre showing s...,0


In [42]:
# create a dataframe to store results
res = pd.DataFrame(columns=['train_score_c', 'test_score_c', 'train_score_t', 'test_score_t'])


In [18]:
# function to add results

def results(train_c, test_c, train_t, test_t):
    res.loc[len(naive_res)] = [round(train_c,3), round(test_c, 3), round(train_t,3), round(test_t, 3)]
    return res
                  
                  

# Splitting Data

In [9]:
# create X and y
X = df['post']
y = df['class']

In [61]:
y.value_counts(normalize=True) # base line score

1    0.505869
0    0.494131
Name: class, dtype: float64

In [10]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [11]:
# create pipes, for countvectorizer and tfidfvectorizer
pipe1 = Pipeline([
    ('cvec', CountVectorizer()),
    ('model', LogisticRegression())
])

pipe2 = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('model', LogisticRegression())
])

In [12]:
params = {
    'model__solver': ['liblinear']
}

# Trial 1

In [13]:
gs1c = GridSearchCV(pipe1, param_grid=params, verbose=1, n_jobs=-1, cv=10)
gs1t = GridSearchCV(pipe2, param_grid=params, verbose=1, n_jobs=-1, cv=10)

In [14]:
gs1c.fit(X_train, y_train)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('model', LogisticRegression())]),
             n_jobs=-1, param_grid={'model__solver': ['liblinear']}, verbose=1)

In [15]:
gs1t.fit(X_train, y_train)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('model', LogisticRegression())]),
             n_jobs=-1, param_grid={'model__solver': ['liblinear']}, verbose=1)

In [43]:
results(gs1c.score(X_train, y_train), gs1c.score(X_test, y_test), gs1t.score(X_train, y_train), gs1t.score(X_test, y_test))


Unnamed: 0,train_score_c,test_score_c,train_score_t,test_score_t
0,0.915,0.768,0.86,0.769


* The score is definitely better than the baseline score
* Default params were used
* The model overfits on the training data more for the countvectorizer than the tfidfvectorizer

# Trial 2

In [20]:
params = {
    'model__C': [100, 10, 1.0, 0.1, 0.01],
    'model__penalty': ['l1', 'l2'],
    'model__solver': ['liblinear'], 
}

In [21]:
gs2c = GridSearchCV(pipe1, param_grid=params, verbose=1, n_jobs=-1, cv=10)
gs2t = GridSearchCV(pipe2, param_grid=params, verbose=1, n_jobs=-1, cv=10)

In [22]:
gs2c.fit(X_train, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('model', LogisticRegression())]),
             n_jobs=-1,
             param_grid={'model__C': [100, 10, 1.0, 0.1, 0.01],
                         'model__penalty': ['l1', 'l2'],
                         'model__solver': ['liblinear']},
             verbose=1)

In [23]:
gs2t.fit(X_train, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('model', LogisticRegression())]),
             n_jobs=-1,
             param_grid={'model__C': [100, 10, 1.0, 0.1, 0.01],
                         'model__penalty': ['l1', 'l2'],
                         'model__solver': ['liblinear']},
             verbose=1)

In [44]:
results(gs2c.score(X_train, y_train), gs2c.score(X_test, y_test), gs2t.score(X_train, y_train), gs2t.score(X_test, y_test))


Unnamed: 0,train_score_c,test_score_c,train_score_t,test_score_t
0,0.915,0.768,0.86,0.769
1,0.834,0.765,0.86,0.769


In [28]:
gs2c.best_params_

{'model__C': 0.1, 'model__penalty': 'l2', 'model__solver': 'liblinear'}

In [29]:
gs2t.best_params_

{'model__C': 1.0, 'model__penalty': 'l2', 'model__solver': 'liblinear'}

* The scores are similar to the previous trial
* For the countvectorizer, the model fits less on the training data
* Only the model was tuned

# Trial 3

In [30]:
params1 = {
     'cvec__max_features': [1500, 2000, 2500],
    'cvec__max_df': [.8, .7],
    'cvec__ngram_range': [(1,1)],
    'cvec__binary': [True],
    'model__C': [0.1, 0.001, 5, 10],
    'model__solver': ['liblinear']
}

params2 = {
     'tfidf__max_features': [1500, 2000, 2500],
    'tfidf__max_df': [.8, .7],
    'tfidf__ngram_range': [(1,1)],
    'tfidf__binary': [True],
    'model__C': [1, 100, 0.1],
    'model__solver': ['liblinear']
}

In [31]:
gs3c = GridSearchCV(pipe1, param_grid=params1, verbose=1, n_jobs=-1, cv=10)
gs3t = GridSearchCV(pipe2, param_grid=params2, verbose=1, n_jobs=-1, cv=10)

In [32]:
gs3c.fit(X_train, y_train)

Fitting 10 folds for each of 24 candidates, totalling 240 fits


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('model', LogisticRegression())]),
             n_jobs=-1,
             param_grid={'cvec__binary': [True], 'cvec__max_df': [0.8, 0.7],
                         'cvec__max_features': [1500, 2000, 2500],
                         'cvec__ngram_range': [(1, 1)],
                         'model__C': [0.1, 0.001, 5, 10],
                         'model__solver': ['liblinear']},
             verbose=1)

In [33]:
gs3t.fit(X_train, y_train)

Fitting 10 folds for each of 18 candidates, totalling 180 fits


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('model', LogisticRegression())]),
             n_jobs=-1,
             param_grid={'model__C': [1, 100, 0.1],
                         'model__solver': ['liblinear'],
                         'tfidf__binary': [True], 'tfidf__max_df': [0.8, 0.7],
                         'tfidf__max_features': [1500, 2000, 2500],
                         'tfidf__ngram_range': [(1, 1)]},
             verbose=1)

In [45]:
results(gs3c.score(X_train, y_train), gs3c.score(X_test, y_test), gs3t.score(X_train, y_train), gs3t.score(X_test, y_test))


Unnamed: 0,train_score_c,test_score_c,train_score_t,test_score_t
0,0.915,0.768,0.86,0.769
1,0.834,0.765,0.86,0.769
2,0.807,0.761,0.825,0.765


In [35]:
gs3c.best_params_

{'cvec__binary': True,
 'cvec__max_df': 0.8,
 'cvec__max_features': 2500,
 'cvec__ngram_range': (1, 1),
 'model__C': 0.1,
 'model__solver': 'liblinear'}

In [36]:
gs3t.best_params_

{'model__C': 1,
 'model__solver': 'liblinear',
 'tfidf__binary': True,
 'tfidf__max_df': 0.8,
 'tfidf__max_features': 2500,
 'tfidf__ngram_range': (1, 1)}

* The model and vectorizers were tuned
* The scores are still similar to previous scores
* The model is less overfitting

# Trial 4

In [37]:
params1 = {
     'cvec__max_features': [2000, 2500, 3000, 3500],
    'cvec__max_df': [.8, .7, 0.9],
    'cvec__ngram_range': [(1,1), (1,2)],
    'cvec__binary': [True],
    'model__C': [0.1, 0.001, 1],
    'model__solver': ['liblinear']
}

params2 = {
     'tfidf__max_features': [2000, 2500, 300, 3500],
    'tfidf__max_df': [.8, .7, 0.9],
    'tfidf__ngram_range': [(1,1), (1,2)],
    'tfidf__binary': [True],
    'model__C': [1,  0.1, 0.01],
    'model__solver': ['liblinear']
}

In [38]:
gs4c = GridSearchCV(pipe1, param_grid=params1, verbose=1, n_jobs=-1, cv=10)
gs4t = GridSearchCV(pipe2, param_grid=params2, verbose=1, n_jobs=-1, cv=10)

In [39]:
gs4c.fit(X_train, y_train)

Fitting 10 folds for each of 72 candidates, totalling 720 fits


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('model', LogisticRegression())]),
             n_jobs=-1,
             param_grid={'cvec__binary': [True],
                         'cvec__max_df': [0.8, 0.7, 0.9],
                         'cvec__max_features': [2000, 2500, 3000, 3500],
                         'cvec__ngram_range': [(1, 1), (1, 2)],
                         'model__C': [0.1, 0.001, 1],
                         'model__solver': ['liblinear']},
             verbose=1)

In [40]:
gs4t.fit(X_train, y_train)

Fitting 10 folds for each of 72 candidates, totalling 720 fits


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('model', LogisticRegression())]),
             n_jobs=-1,
             param_grid={'model__C': [1, 0.1, 0.01],
                         'model__solver': ['liblinear'],
                         'tfidf__binary': [True],
                         'tfidf__max_df': [0.8, 0.7, 0.9],
                         'tfidf__max_features': [2000, 2500, 300, 3500],
                         'tfidf__ngram_range': [(1, 1), (1, 2)]},
             verbose=1)

In [46]:
results(gs4c.score(X_train, y_train), gs4c.score(X_test, y_test), gs4t.score(X_train, y_train), gs4t.score(X_test, y_test))


Unnamed: 0,train_score_c,test_score_c,train_score_t,test_score_t
0,0.915,0.768,0.86,0.769
1,0.834,0.765,0.86,0.769
2,0.807,0.761,0.825,0.765
3,0.816,0.76,0.834,0.762


In [47]:
gs4c.best_params_

{'cvec__binary': True,
 'cvec__max_df': 0.8,
 'cvec__max_features': 3500,
 'cvec__ngram_range': (1, 2),
 'model__C': 0.1,
 'model__solver': 'liblinear'}

In [48]:
gs4t.best_params_

{'model__C': 1,
 'model__solver': 'liblinear',
 'tfidf__binary': True,
 'tfidf__max_df': 0.8,
 'tfidf__max_features': 3500,
 'tfidf__ngram_range': (1, 1)}

* There isn't really a clearcut answer to which hyperparameter combinations could be better as all the scores are similar
* It could be the first 2 models using tfidf vectorizer as it has a higher accuracy score and at the same time has less overfitting