# Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
# import data: read the posts CSV and drop every row that contains a missing
# value, so only complete rows are kept for modelling
df = pd.read_csv('../data/posts.csv').dropna()

In [3]:
df.head()  # class labels: 0 == movies, 1 == boxoffice

Unnamed: 0,post,class
0,billy butcherson back hocus pocus doug jones s...,0
1,go,1
2,lost city open domestically,1
3,thirsty go watch video,0
4,wanted see imax dolby digital theyre showing s...,0


In [23]:
# Running score table, one row per trial.
# Column suffixes: _c = CountVectorizer pipeline, _t = TfidfVectorizer pipeline.
res = pd.DataFrame(
    columns=[
        'train_score_c', 'test_score_c',  # count vectorizer
        'train_score_t', 'test_score_t',  # tf-idf vectorizer
    ]
)


In [5]:
# function to add results
def results(train_c, test_c, train_t, test_t):
    """Append one trial's scores to the global ``res`` table and return it.

    Each score is rounded to two decimals before being stored.

    Parameters
    ----------
    train_c, test_c : float
        Train / test score of the CountVectorizer pipeline.
    train_t, test_t : float
        Train / test score of the TfidfVectorizer pipeline.

    Returns
    -------
    pandas.DataFrame
        The module-level ``res`` frame (mutated in place) including the new row.
    """
    # The new row's label is the current length of `res` itself, so successive
    # calls append rows 0, 1, 2, ... The previous code used `len(naive_res)`,
    # a name that is not defined in this notebook (NameError on a fresh kernel).
    res.loc[len(res)] = [round(train_c, 2), round(test_c, 2), round(train_t, 2), round(test_t, 2)]
    return res
                  
                  

# Splitting Data

In [6]:
# create X and y: the post text is the only feature, 'class' is the target
# (the two labels 0 and 1 shown in df.head())
X = df['post']
y = df['class']

In [36]:
# class balance: always predicting the majority class gives the baseline
# accuracy that every model must beat (~0.51 in the recorded output)
y.value_counts(normalize=True) # baseline model

1    0.505869
0    0.494131
Name: class, dtype: float64

In [7]:
# train test split: stratify=y keeps the class proportions the same in the
# train and test parts; random_state pins the shuffle so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [9]:
# Two pipelines that differ only in the text vectorizer. Both end in a default
# SVC registered under the step name 'model', which is the prefix used by the
# parameter grids ('model__C', 'cvec__...', 'tfidf__...').
pipe1 = Pipeline(steps=[('cvec', CountVectorizer()), ('model', SVC())])

pipe2 = Pipeline(steps=[('tfidf', TfidfVectorizer()), ('model', SVC())])

In [10]:
# params: an empty grid, so each search evaluates exactly one candidate —
# the pipeline with its default settings
params = dict()

# Trial 1

In [11]:
# One 5-fold grid search per pipeline (count vectorizer / tf-idf), both using
# the same parameter grid
gs1c = GridSearchCV(estimator=pipe1, param_grid=params, cv=5, n_jobs=-1, verbose=1)
gs1t = GridSearchCV(estimator=pipe2, param_grid=params, cv=5, n_jobs=-1, verbose=1)

In [12]:
# fit the count-vectorizer search on the training split only; X_test / y_test
# stay held out for the scores recorded afterwards
gs1c.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('model', SVC())]),
             n_jobs=-1, param_grid={}, verbose=1)

In [13]:
# fit the tf-idf search on the training split only; X_test / y_test stay held
# out for the scores recorded afterwards
gs1t.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('model', SVC())]),
             n_jobs=-1, param_grid={}, verbose=1)

In [24]:
# Trial 1: record train/test scores for both pipelines
results(
    gs1c.score(X_train, y_train),
    gs1c.score(X_test, y_test),
    gs1t.score(X_train, y_train),
    gs1t.score(X_test, y_test),
)

Unnamed: 0,train_score_c,test_score_c,train_score_t,test_score_t
0,0.9,0.77,0.97,0.77


* Default params were used
* The score is better than the baseline model
* The model overfits the training data more for the tfidf vectorizer than the count vectorizer

# Trial 2

In [16]:
# Trial 2 grid: tune only the SVC. 5 values of C x 5 values of gamma with the
# RBF kernel = 25 candidates.
params = dict(
    model__C=[0.1, 1, 10, 100, 1000],
    model__gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    model__kernel=['rbf'],
)

In [19]:
# One 5-fold grid search per pipeline (count vectorizer / tf-idf), both using
# the same parameter grid
gs2c = GridSearchCV(estimator=pipe1, param_grid=params, cv=5, n_jobs=-1, verbose=1)
gs2t = GridSearchCV(estimator=pipe2, param_grid=params, cv=5, n_jobs=-1, verbose=1)

In [20]:
# Trial 2, count vectorizer: fit on the training split only
# (the recorded log reports 25 candidates x 5 folds = 125 fits)
gs2c.fit(X_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('model', SVC())]),
             n_jobs=-1,
             param_grid={'model__C': [0.1, 1, 10, 100, 1000],
                         'model__gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'model__kernel': ['rbf']},
             verbose=1)

In [21]:
# Trial 2, tf-idf: fit on the training split only
# (the recorded log reports 25 candidates x 5 folds = 125 fits)
gs2t.fit(X_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('model', SVC())]),
             n_jobs=-1,
             param_grid={'model__C': [0.1, 1, 10, 100, 1000],
                         'model__gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'model__kernel': ['rbf']},
             verbose=1)

In [25]:
# Trial 2: record train/test scores for both pipelines
results(
    gs2c.score(X_train, y_train),
    gs2c.score(X_test, y_test),
    gs2t.score(X_train, y_train),
    gs2t.score(X_test, y_test),
)

Unnamed: 0,train_score_c,test_score_c,train_score_t,test_score_t
0,0.9,0.77,0.97,0.77
1,0.9,0.77,0.97,0.77


In [26]:
# hyperparameters selected by the Trial 2 search for the count-vectorizer pipeline
gs2c.best_params_

{'model__C': 10, 'model__gamma': 0.01, 'model__kernel': 'rbf'}

In [27]:
# hyperparameters selected by the Trial 2 search for the tf-idf pipeline
# NOTE(review): in the recorded run gamma=1 is the largest gamma in the Trial 2
# grid, so the optimum may lie above the searched range
gs2t.best_params_

{'model__C': 1, 'model__gamma': 1, 'model__kernel': 'rbf'}

* Only the model was tuned
* There are no improvements/changes from the previous trial

# Trial 3

In [28]:
# Trial 3 grids: tune the vectorizer together with the SVC.
# params1 -> CountVectorizer pipeline: 3 x 2 x 1 x 1 x 6 x 3 = 108 candidates
params1 = dict(
    cvec__max_features=[1500, 2000, 2500],
    cvec__max_df=[.8, .7],
    cvec__ngram_range=[(1,1)],
    cvec__binary=[True],
    model__C=[0.9, 1, 5, 15, 10, 20],
    model__gamma=[0.01,0.1, 0.02],
)

# params2 -> TfidfVectorizer pipeline: same vectorizer settings, with two extra
# gamma values (1 and 2): 3 x 2 x 1 x 1 x 6 x 5 = 180 candidates
params2 = dict(
    tfidf__max_features=[1500, 2000, 2500],
    tfidf__max_df=[.8, .7],
    tfidf__ngram_range=[(1,1)],
    tfidf__binary=[True],
    model__C=[0.9, 1, 5, 15, 10, 20],
    model__gamma=[0.01,0.1, 0.02, 1, 2],
)

In [30]:
# One 5-fold grid search per pipeline; each pipeline gets its own grid because
# the vectorizer step names ('cvec' / 'tfidf') differ
gs3c = GridSearchCV(estimator=pipe1, param_grid=params1, cv=5, n_jobs=-1, verbose=1)
gs3t = GridSearchCV(estimator=pipe2, param_grid=params2, cv=5, n_jobs=-1, verbose=1)

In [31]:
# Trial 3, count vectorizer: fit on the training split only
# (the recorded log reports 108 candidates x 5 folds = 540 fits)
gs3c.fit(X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('model', SVC())]),
             n_jobs=-1,
             param_grid={'cvec__binary': [True], 'cvec__max_df': [0.8, 0.7],
                         'cvec__max_features': [1500, 2000, 2500],
                         'cvec__ngram_range': [(1, 1)],
                         'model__C': [0.9, 1, 5, 15, 10, 20],
                         'model__gamma': [0.01, 0.1, 0.02]},
             verbose=1)

In [32]:
# Trial 3, tf-idf: fit on the training split only
# (the recorded log reports 180 candidates x 5 folds = 900 fits)
gs3t.fit(X_train, y_train)

Fitting 5 folds for each of 180 candidates, totalling 900 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('model', SVC())]),
             n_jobs=-1,
             param_grid={'model__C': [0.9, 1, 5, 15, 10, 20],
                         'model__gamma': [0.01, 0.1, 0.02, 1, 2],
                         'tfidf__binary': [True], 'tfidf__max_df': [0.8, 0.7],
                         'tfidf__max_features': [1500, 2000, 2500],
                         'tfidf__ngram_range': [(1, 1)]},
             verbose=1)

In [33]:
 results(
     gs3c.score(X_train, y_train),  # count vectorizer, train
     gs3c.score(X_test, y_test),    # count vectorizer, test
     gs3t.score(X_train, y_train),  # tf-idf, train
     gs3t.score(X_test, y_test),    # tf-idf, test
 )

Unnamed: 0,train_score_c,test_score_c,train_score_t,test_score_t
0,0.9,0.77,0.97,0.77
1,0.9,0.77,0.97,0.77
2,0.86,0.76,0.99,0.77


In [34]:
# hyperparameters selected by the Trial 3 search for the count-vectorizer pipeline
# NOTE(review): in the recorded run max_features=2500 is the largest value in
# its grid, so a larger vocabulary has not been ruled out
gs3c.best_params_

{'cvec__binary': True,
 'cvec__max_df': 0.8,
 'cvec__max_features': 2500,
 'cvec__ngram_range': (1, 1),
 'model__C': 5,
 'model__gamma': 0.02}

In [35]:
# hyperparameters selected by the Trial 3 search for the tf-idf pipeline
# NOTE(review): in the recorded run gamma=2 and max_features=2500 are both the
# largest values in their grids, so the optimum may lie outside the searched range
gs3t.best_params_

{'model__C': 5,
 'model__gamma': 2,
 'tfidf__binary': True,
 'tfidf__max_df': 0.8,
 'tfidf__max_features': 2500,
 'tfidf__ngram_range': (1, 1)}

* The model along with the vectorizers were tuned
* The scores were once again similar to the previous scores
* The model overfits less for the count vectorizer compared to the tfidf vectorizer
* The best model would probably be the 3rd one using the count vectorizer, as it overfits less than the others (train score 0.86 vs. test score 0.76); its test accuracy is about 76%, essentially the same as the other trials (77%)