# Amazon Notebook

## Variables

In [57]:
import pandas as pd
import requests
import time
import numpy as np
import matplotlib.pyplot as plt

from nltk.corpus import stopwords

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, BaggingClassifier,\
                                RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, VotingClassifier

In [26]:
# Reading in the data

df = pd.read_csv('./data/df_export.csv')

In [27]:
df.isna().sum()

subreddit     0
body         72
dtype: int64

In [28]:
# Dropping null values

df = df.dropna()

In [29]:
# Assigning variables

X = df['body']
y = df['subreddit']

In [30]:
# Splitting the data for training and testing

X_train, X_test, y_train, y_test = train_test_split(X, y)

### Data Logging

In [None]:
model_scores = []

## Models

### Simple Logistic Regression

In [11]:
log_pipeline = Pipeline([
    ('tf', TfidfVectorizer()),
    ('logreg', LogisticRegression(solver = 'lbfgs'))
])
params = [{
    'tf__max_features' : [1000, 1250],
    'tf__stop_words'   : [None, 'english'],
    'tf__ngram_range'  : [(1, 1), (1, 2)],
    'logreg__C'         : [.5, 1, 2]
}]
gs_log = GridSearchCV(log_pipeline, param_grid = params, 
                  cv = 5, n_jobs = -1, verbose = 1)

In [12]:
gs_log.fit(X_train, y_train)
log_score = {'estimator' : 'Logistic Regression/Tfidf',
            'train score' : gs_log.score(X_train, y_train),
            'test score' : gs_log.score(X_test, y_test),
            'best params'    : gs_log.best_params_
}
model_scores.append(log_score)
print(log_score)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:   15.1s finished


{'estimator': 'Logistic Regression/Tfidf', 'train score': 0.7597721886165058, 'test score': 0.7273692030139022, 'best params': {'logreg__C': 1, 'tf__max_features': 1250, 'tf__ngram_range': (1, 1), 'tf__stop_words': None}}


In [13]:
log_pipeline = Pipeline([
    ('vec', CountVectorizer()),
    ('logreg', LogisticRegression(solver = 'lbfgs'))
])
params = [{
    'vec__max_features' : [1000, 1250],
    'vec__stop_words'   : [None, 'english'],
    'vec__ngram_range'  : [(1, 1), (1, 2)],
    'logreg__C'         : [.5, 1, 2]
}]
gs_log = GridSearchCV(log_pipeline, param_grid = params, 
                  cv = 5, n_jobs = -1, verbose = 1)

In [14]:
gs_log.fit(X_train, y_train)
log_score = {'estimator' : 'Logistic Regression',
            'train score' : gs_log.score(X_train, y_train),
            'test score' : gs_log.score(X_test, y_test),
            'best params'    : gs_log.best_params_
}
model_scores.append(log_score)
print(log_score)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:   14.4s finished


{'estimator': 'Logistic Regression', 'train score': 0.7582510877639818, 'test score': 0.7223814071951608, 'best params': {'logreg__C': 0.5, 'vec__max_features': 1250, 'vec__ngram_range': (1, 1), 'vec__stop_words': None}}


It looks like the test scores for Logistic Regression with both CountVectorizer and TFIDFVectorizer are almost identical. I'm going to continue using CountVectorizer because I am more familiar with its parameters. Also of note, both models liked the highest max_features parameter, so I will increase that hyperparameter to see if it yields better results

### Multinomial Naive Bayes

In [17]:
nb_pipeline = Pipeline([
    ('vec', CountVectorizer()),
    ('nb', MultinomialNB())
])
params = [{
    'vec__max_features' : [1250, 1500],
    'vec__stop_words'   : [None, 'english'],
    'vec__ngram_range'  : [(1, 1), (1, 2)],
    'nb__alpha'         : [.5, 1, 2]
}]
gs_nb = GridSearchCV(nb_pipeline, param_grid = params, 
                  cv = 5, n_jobs = -1, verbose = 2)

In [18]:
gs_nb.fit(X_train, y_train)
nb_score = {'estimator' : 'Multinomial Naive Bayes',
            'train score' : gs_nb.score(X_train, y_train),
            'test score' : gs_nb.score(X_test, y_test),
            'params'    : gs_nb.best_params_
}
model_scores.append(nb_score)
print(nb_score)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done  86 out of 120 | elapsed:    9.0s remaining:    3.6s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:   12.1s finished


{'estimator': 'Multinomial Naive Bayes', 'train score': 0.7301991580883653, 'test score': 0.7212140507269447, 'params': {'nb__alpha': 0.5, 'vec__max_features': 1500, 'vec__ngram_range': (1, 1), 'vec__stop_words': 'english'}}


Again, the model seemed to do best with the maximum hyperparameter for max_features. I am going to keep increasing it until the performance boost bottoms out. Also of note, all three models performed best with the base ngram range of (1,1). I will continue using that ngram range from here on out.

### Random Forest

In [19]:
rf_pipeline = Pipeline([
    ('vec', CountVectorizer()),
    ('rf', RandomForestClassifier()),
])

params = [{
    'vec__max_features': [1500, 1750],
    'vec__stop_words'  : [None],
    'vec__ngram_range' : [(1, 1)],
    'rf__n_estimators' : [100, 250, 500],
    'rf__max_features': ['auto'],
    'rf__max_depth': [None, 3, 5],
    'rf__min_samples_split' : [2, 3, 5]
}]
gs_rf = GridSearchCV(rf_pipeline, param_grid = params, 
                  cv = 3, n_jobs = -1, verbose = 5)

In [20]:
gs_rf.fit(X_train, y_train)
rf_score = {'estimator' : 'Random Forest',
            'train score' : gs_rf.score(X_train, y_train),
            'test score' : gs_rf.score(X_test, y_test),
            'best params'    : gs_rf.best_params_
}
model_scores.append(rf_score)
print(rf_score)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 294 out of 324 | elapsed:  4.5min remaining:   27.4s
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed:  5.4min finished


{'estimator': 'Random Forest', 'train score': 0.9762991262513707, 'test score': 0.8150270614453996, 'best params': {'rf__max_depth': None, 'rf__max_features': 'auto', 'rf__min_samples_split': 5, 'rf__n_estimators': 250, 'vec__max_features': 1750, 'vec__ngram_range': (1, 1), 'vec__stop_words': None}}


### Extra Trees

In [21]:
et_pipeline = Pipeline([
    ('vec', CountVectorizer()),
    ('et', ExtraTreesClassifier()),
])

params = [{
    'vec__max_features': [1750, 2000],
    'vec__stop_words'  : [None],
    'vec__ngram_range' : [(1, 1)],
    'et__n_estimators' : [100, 250, 500],
    'et__max_features': ['auto'],
    'et__max_depth': [None, 3, 5], 
    'et__min_samples_split' : [2, 3, 5]
}]
gs_et = GridSearchCV(et_pipeline, param_grid = params, 
                  cv = 3, n_jobs = -1, verbose = 3)

In [22]:
gs_et.fit(X_train, y_train)
et_score = {'estimator' : 'Extra Trees',
            'train score' : gs_et.score(X_train, y_train),
            'test score' : gs_et.score(X_test, y_test),
            'best params' : gs_et.best_params_
}
model_scores.append(et_score)
print(et_score)

Fitting 3 folds for each of 54 candidates, totalling 162 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done  32 tasks      | elapsed:   54.4s
[Parallel(n_jobs=-1)]: Done 122 out of 162 | elapsed:  1.5min remaining:   29.9s
[Parallel(n_jobs=-1)]: Done 162 out of 162 | elapsed:  3.1min finished


{'estimator': 'Extra Trees', 'train score': 0.9808624288089427, 'test score': 0.8256393929746365, 'best params': {'et__max_depth': None, 'et__max_features': 'auto', 'et__min_samples_split': 2, 'et__n_estimators': 500, 'vec__max_features': 2000, 'vec__ngram_range': (1, 1), 'vec__stop_words': None}}


### K Nearest Neighbor

In [26]:
knn_pipeline = Pipeline([
    ('vec', CountVectorizer()),
    ('ss', StandardScaler(with_mean = False)),
    ('knn', KNeighborsClassifier())
])

params = [{
    'vec__max_features': [1750, 2000],
    'vec__stop_words'  : [None],
    'vec__ngram_range' : [(1, 1)],
    'knn__n_neighbors' : [3, 4, 5],
    'knn__p'           : [1, 2]
}]
gs_knn = GridSearchCV(knn_pipeline, param_grid = params, 
                  cv = 3, n_jobs = -1, verbose = 5)

In [27]:
gs_knn.fit(X_train, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  36 | elapsed:   32.3s remaining:  3.3min
[Parallel(n_jobs=-1)]: Done  13 out of  36 | elapsed:   39.5s remaining:  1.2min
[Parallel(n_jobs=-1)]: Done  21 out of  36 | elapsed:  2.8min remaining:  2.0min
[Parallel(n_jobs=-1)]: Done  29 out of  36 | elapsed:  2.9min remaining:   42.4s
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:  3.1min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_...ki',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid=[{'vec__max_features': [1750, 2000], 'vec__stop_words': [None], 'vec__ngram_range': [(1, 1)], 'knn__n_neighbors': [3, 4, 5], 'knn__p': [1, 2]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=5)

In [28]:
knn_score = {'estimator' : 'K Nearest Neighbor',
            'train score' : gs_knn.score(X_train, y_train),
            'test score' : gs_knn.score(X_test, y_test),
            'best params' : gs_knn.best_params_
}
model_scores.append(knn_score)
print(knn_score)



{'estimator': 'K Nearest Neighbor', 'train score': 0.827903357034207, 'test score': 0.6747320386288868, 'best params': {'knn__n_neighbors': 3, 'knn__p': 2, 'vec__max_features': 1750, 'vec__ngram_range': (1, 1), 'vec__stop_words': None}}


### Bagging Classifier

In [35]:
bc_pipeline = Pipeline([
    ('vec', CountVectorizer()),
    ('bc', BaggingClassifier()),
])

params = [{
    'vec__max_features': [1750, 2000],
    'vec__stop_words'  : [None],
    'vec__ngram_range' : [(1, 1)],
    'bc__n_estimators' : [10, 50, 100]
}]
gs_bc = GridSearchCV(bc_pipeline, param_grid = params, 
                  cv = 3, n_jobs = -1, verbose = 5)

In [36]:
gs_bc.fit(X_train, y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  18 | elapsed:   32.8s remaining:  2.7min
[Parallel(n_jobs=-1)]: Done   7 out of  18 | elapsed:  2.5min remaining:  3.9min
[Parallel(n_jobs=-1)]: Done  11 out of  18 | elapsed:  2.6min remaining:  1.6min
[Parallel(n_jobs=-1)]: Done  15 out of  18 | elapsed:  5.0min remaining:   59.7s
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:  5.1min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_...imators=10, n_jobs=None, oob_score=False, random_state=None,
         verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid=[{'vec__max_features': [1750, 2000], 'vec__stop_words': [None], 'vec__ngram_range': [(1, 1)], 'bc__n_estimators': [10, 50, 100]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=5)

In [38]:
bc_score = {'estimator' : 'Bagging Classifier',
            'train score' : gs_bc.score(X_train, y_train),
            'test score' : gs_bc.score(X_test, y_test),
            'best params' : gs_bc.best_params_
}
model_scores.append(bc_score)
print(bc_score)

{'estimator': 'Bagging Classifier', 'train score': 0.9808624288089427, 'test score': 0.8070678127984718, 'best params': {'bc__n_estimators': 100, 'vec__max_features': 2000, 'vec__ngram_range': (1, 1), 'vec__stop_words': None}}


### Gradient Booster

In [39]:
gb_pipeline = Pipeline([
    ('vec', CountVectorizer()),
    ('gb', GradientBoostingClassifier())
])

params = [{
    'vec__max_features': [1750, 2000],
    'vec__stop_words'  : [None],
    'vec__ngram_range' : [(1, 1)],
    'gb__n_estimators' : [100, 500, 750],
    'gb__min_samples_split' : [2, 3, 5]
}]
gs_gb = GridSearchCV(gb_pipeline, param_grid = params, 
                  cv = 3, n_jobs = -1, verbose = 2)

In [40]:
gs_gb.fit(X_train, y_train)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  54 | elapsed:   11.6s remaining:   30.2s
[Parallel(n_jobs=-1)]: Done  43 out of  54 | elapsed:   39.3s remaining:   10.1s
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:   46.0s finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_...    subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid=[{'vec__max_features': [1750, 2000], 'vec__stop_words': [None], 'vec__ngram_range': [(1, 1)], 'gb__n_estimators': [100, 500, 750], 'gb__min_samples_split': [2, 3, 5]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [41]:
gb_score = {'estimator' : 'Gradient Booster',
            'train score' : gs_gb.score(X_train, y_train),
            'test score' : gs_gb.score(X_test, y_test),
            'best params' : gs_gb.best_params_
}
model_scores.append(gb_score)
print(gb_score)

{'estimator': 'Gradient Booster', 'train score': 0.7886377303760302, 'test score': 0.748275496126499, 'best params': {'gb__min_samples_split': 5, 'gb__n_estimators': 750, 'vec__max_features': 2000, 'vec__ngram_range': (1, 1), 'vec__stop_words': None}}


### Support Vector Classifier

In [45]:
svc_pipeline = Pipeline([
    ('vec', CountVectorizer()),
    ('ss', StandardScaler(with_mean = False)),
    ('svc', SVC()),
])

params = [{
    'vec__max_features': [2000],
    'vec__stop_words'  : [None],
    'vec__ngram_range' : [(1, 1)],
    'svc__kernel'      : ['rbf', 'poly'],
    'svc__degree'      : [2],
    'svc__C'           : [.5, 1]
}]
gs_svc = GridSearchCV(svc_pipeline, param_grid = params, 
                  cv = 3, n_jobs = -1, verbose = 5)

In [46]:
gs_svc.fit(X_train, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  12 | elapsed:  2.9min remaining:  5.8min
[Parallel(n_jobs=-1)]: Done   7 out of  12 | elapsed:  3.3min remaining:  2.3min
[Parallel(n_jobs=-1)]: Done  10 out of  12 | elapsed:  3.5min remaining:   42.3s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  3.9min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_...f', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid=[{'vec__max_features': [2000], 'vec__stop_words': [None], 'vec__ngram_range': [(1, 1)], 'svc__kernel': ['rbf', 'poly'], 'svc__degree': [2], 'svc__C': [0.5, 1]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=5)

In [48]:
svc_score = {'estimator' : 'Support Vector Classifier',
            'train score' : gs_svc.score(X_train, y_train),
            'test score' : gs_svc.score(X_test, y_test),
             'best params' : gs_svc.best_params_
}
model_scores.append(svc_score)
print(svc_score)

{'estimator': 'Support Vector Classifier', 'train score': 0.8452368318652942, 'test score': 0.7516714422158548, 'best params': {'svc__C': 1, 'svc__degree': 2, 'svc__kernel': 'rbf', 'vec__max_features': 2000, 'vec__ngram_range': (1, 1), 'vec__stop_words': None}}


## Comprehensive Gridsearch

The random forest, while overfit, performed the best on the test dataset. I will instantiate a random forest model with a more comprehensive gridsearch through its hyperparameters

In [31]:
rf_large_pipeline = Pipeline([
    ('vec', CountVectorizer()),
    ('rf', RandomForestClassifier()),
])

params = [{
    'vec__max_features': [2250, 2500],
    'vec__stop_words'  : [None],
    'vec__ngram_range' : [(1, 1)],
    'rf__n_estimators' : [250, 300],
    'rf__max_features': ['auto', 'log2'],
    'rf__max_depth': [None],
    'rf__min_samples_split' : [4, 5, 6],
    'rf__min_samples_leaf': [1, 2, 3]
    
}]
gs_rf_large = GridSearchCV(rf_large_pipeline, param_grid = params, 
                  cv = 3, n_jobs = 4, verbose = 5)

In [32]:
gs_rf_large.fit(X_train, y_train)

Fitting 3 folds for each of 72 candidates, totalling 216 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:  2.8min
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed: 11.9min
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed: 22.5min
[Parallel(n_jobs=4)]: Done 216 out of 216 | elapsed: 24.5min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prepr

In [33]:
rf_large_score = {'estimator' : 'Large Random Forest',
            'train score' : gs_rf_large.score(X_train, y_train),
            'test score' : gs_rf_large.score(X_test, y_test),
            'best params'    : gs_rf_large.best_params_
}
model_scores.append(rf_large_score)
print(rf_large_score)

{'estimator': 'Large Random Forest', 'train score': 0.979624323463865, 'test score': 0.8313700520004245, 'best params': {'rf__max_depth': None, 'rf__max_features': 'log2', 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 6, 'rf__n_estimators': 300, 'vec__max_features': 2500, 'vec__ngram_range': (1, 1), 'vec__stop_words': None}}


## Exporting Results

In [35]:
df_scores = pd.DataFrame(model_scores)
df_scores = df_scores.drop(columns = ['best params', 'params'])

In [65]:
importances = gs_rf_large.best_estimator_['rf'].feature_importances_

In [69]:
important_words = gs_rf_large.best_estimator_.named_steps.vec.get_feature_names()

In [140]:
coefs = pd.DataFrame(importances, important_words)

In [141]:
df_scores.to_csv('./results/scores_dataframe.csv')
coefs.to_csv('./results/word_coefs.csv')