# Model Comparison
## Find the Best Model



In [88]:
from pprint import pprint
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
from glob import glob
from collections import Counter, namedtuple
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
!pwd

/Users/chris/github/reddit_nlp


In [None]:
# === TODO === #
# use postgres to store and retrieve the compare df rows

In [11]:
reports = glob('data/compare_df/*.csv')
reports

['data/compare_df/2020-05-09_0938.csv', 'data/compare_df/2020-05-09_1427.csv']

In [12]:
df = pd.DataFrame()

In [19]:
for report in reports:
    print(report)
    data = pd.read_csv(report)
    df = df.append(data)
    

data/compare_df/2020-05-09_0938.csv
data/compare_df/2020-05-09_1427.csv


In [21]:
df.drop(columns='Unnamed: 0', inplace=True)
df.head(10)

Unnamed: 0,preprocessor,estimator,best_train_score,best_test_score,time_weighted_score,roc_auc,train_test_variance,fit_time_seconds,predict_time_seconds,best_params,subreddits,date
0,TfidVectorizer,XGBoost Classifier,0.801,0.565,34.935,0.874,0.295,16.033,0.138,"{'xgbclassifier__n_estimators': 100, 'xgbclass...","deeplearning, tensorflow, html, javascript, le...",2020-05-09 09:38:28.644278
1,TfidVectorizer,Multi Layer Percetpron Classifier,0.826,0.608,29.133,0.902,0.264,20.83,0.025,"{'tfidfvectorizer__use_idf': False, 'tfidfvect...","deeplearning, tensorflow, html, javascript, le...",2020-05-09 09:38:28.644278
2,TfidVectorizer,Logistic Regression,0.818,0.611,437.804,0.887,0.253,1.366,0.031,"{'tfidfvectorizer__use_idf': True, 'tfidfvecto...","deeplearning, tensorflow, html, javascript, le...",2020-05-09 09:38:28.644278
3,TfidVectorizer,Random Forest,0.737,0.561,165.529,0.884,0.239,3.002,0.389,"{'tfidfvectorizer__use_idf': False, 'tfidfvect...","deeplearning, tensorflow, html, javascript, le...",2020-05-09 09:38:28.644278
4,TfidVectorizer,K Nearest Neighbors,0.541,0.416,881.227,0.755,0.232,0.083,0.389,"{'tfidfvectorizer__use_idf': True, 'tfidfvecto...","deeplearning, tensorflow, html, javascript, le...",2020-05-09 09:38:28.644278
5,TfidVectorizer,Multinomial Bayes Classifier,0.844,0.614,3128.69,0.89,0.272,0.163,0.033,"{'tfidfvectorizer__use_idf': False, 'tfidfvect...","deeplearning, tensorflow, html, javascript, le...",2020-05-09 09:38:28.644278
6,TfidVectorizer,Support Vector Classifier,0.933,0.603,97.823,0.89,0.354,5.35,0.81,"{'tfidfvectorizer__use_idf': False, 'tfidfvect...","deeplearning, tensorflow, html, javascript, le...",2020-05-09 09:38:28.644278
7,TfidVectorizer,Bagging Classifier Logistic Regression,0.837,0.622,3.535,0.9,0.257,175.725,0.217,"{'tfidfvectorizer__use_idf': True, 'tfidfvecto...","deeplearning, tensorflow, html, javascript, le...",2020-05-09 09:38:28.644278
8,TfidVectorizer,Bagging Classifier MultinomalNB,0.977,0.544,5.707,0.855,0.443,94.761,0.488,"{'tfidfvectorizer__use_idf': False, 'tfidfvect...","deeplearning, tensorflow, html, javascript, le...",2020-05-09 09:38:28.644278
9,TfidVectorizer,Extra Trees Classifier,0.977,0.594,30.411,0.884,0.392,18.568,0.952,"{'tfidfvectorizer__use_idf': True, 'tfidfvecto...","deeplearning, tensorflow, html, javascript, le...",2020-05-09 09:38:28.644278


In [24]:
columns_of_interest = ['preprocessor','estimator','best_test_score', 'roc_auc', 'fit_time_seconds','time_weighted_score']

In [25]:
df.sort_values(by='time_weighted_score', ascending=False)[columns_of_interest]

Unnamed: 0,preprocessor,estimator,best_test_score,roc_auc,fit_time_seconds,time_weighted_score
5,TfidVectorizer,Multinomial Bayes Classifier,0.614,0.89,0.163,3128.69
12,TfidVectorizer,Stochastic Gradient Descent Classifier,0.606,0.866,0.211,2510.941
11,TfidVectorizer,Passive Agressive Classifier,0.582,0.866,0.305,1749.729
14,TfidVectorizer,Linear SVC,0.592,0.866,0.364,1501.224
4,TfidVectorizer,K Nearest Neighbors,0.416,0.755,0.083,881.227
2,TfidVectorizer,Logistic Regression,0.611,0.887,1.366,437.804
3,TfidVectorizer,Random Forest,0.561,0.884,3.002,165.529
6,TfidVectorizer,Support Vector Classifier,0.603,0.89,5.35,97.823
13,TfidVectorizer,NuSVC,0.613,0.866,5.86,92.366
0,TfidVectorizer,XGBoost Classifier,0.565,0.874,16.033,34.935


In [26]:
df.sort_values(by='best_test_score', ascending=False)[columns_of_interest]

Unnamed: 0,preprocessor,estimator,best_test_score,roc_auc,fit_time_seconds,time_weighted_score
7,TfidVectorizer,Bagging Classifier Logistic Regression,0.622,0.9,175.725,3.535
5,TfidVectorizer,Multinomial Bayes Classifier,0.614,0.89,0.163,3128.69
13,TfidVectorizer,NuSVC,0.613,0.866,5.86,92.366
2,TfidVectorizer,Logistic Regression,0.611,0.887,1.366,437.804
1,TfidVectorizer,Multi Layer Percetpron Classifier,0.608,0.902,20.83,29.133
12,TfidVectorizer,Stochastic Gradient Descent Classifier,0.606,0.866,0.211,2510.941
6,TfidVectorizer,Support Vector Classifier,0.603,0.89,5.35,97.823
9,TfidVectorizer,Extra Trees Classifier,0.594,0.884,18.568,30.411
14,TfidVectorizer,Linear SVC,0.592,0.866,0.364,1501.224
11,TfidVectorizer,Passive Agressive Classifier,0.582,0.866,0.305,1749.729


In [35]:
df.groupby(by='estimator')[columns_of_interest].mean()

Unnamed: 0_level_0,best_test_score,roc_auc,fit_time_seconds,time_weighted_score
estimator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bagging Classifier Logistic Regression,0.622,0.9,175.725,3.535
Bagging Classifier MultinomalNB,0.544,0.855,94.761,5.707
Extra Trees Classifier,0.594,0.884,18.568,30.411
Gradient Boosting Classifier,0.576,0.866,19.174,29.892
K Nearest Neighbors,0.416,0.755,0.083,881.227
Linear SVC,0.592,0.866,0.364,1501.224
Logistic Regression,0.611,0.887,1.366,437.804
Multi Layer Percetpron Classifier,0.608,0.902,20.83,29.133
Multinomial Bayes Classifier,0.614,0.89,0.163,3128.69
NuSVC,0.613,0.866,5.86,92.366


# Best Params

In [51]:
eval(df.best_params

{'xgbclassifier__n_estimators': 100,
 'xgbclassifier__max_depth': 10,
 'xgbclassifier__hidden_layer_sizes': 10,
 'tfidfvectorizer__use_idf': False,
 'tfidfvectorizer__strip_accents': None,
 'tfidfvectorizer__norm': 'l1',
 'tfidfvectorizer__ngram_range': (1, 1),
 'tfidfvectorizer__min_df': 2,
 'tfidfvectorizer__max_features': 7000,
 'tfidfvectorizer__max_df': 0.8}

In [89]:
Param = namedtuple('Param', ['key', 'value', 'count'])

In [93]:
paramlist = []
for i in range(4):
    paramlist.append(Param('key','val', i))


In [94]:
paramlist

[Param(key='key', value='val', count=0),
 Param(key='key', value='val', count=1),
 Param(key='key', value='val', count=2),
 Param(key='key', value='val', count=3)]

In [95]:
stored_keys = dict()
key_counter = dict()

for param_grid in df.best_params:
    for k, v in eval(param_grid).items():
        if (k, v) in stored_keys.keys():
            print('yes')
            key_counter[(k, v)] += 1
        else:
            print('storing')
            stored_keys[(k, v)] = True
            key_counter[(k, v)] = 1

storing
storing
storing
storing
storing
storing
storing
storing
storing
storing
yes
storing
storing
yes
yes
storing
storing
storing
storing
storing
storing
yes
yes
storing
yes
storing
yes
storing
storing
storing
yes
yes
yes
yes
yes
yes
storing
storing
storing
storing
storing
yes
yes
yes
yes
storing
yes
yes
storing
storing
yes
yes
yes
yes
yes
yes
yes
storing
storing
yes
yes
yes
yes
yes
yes
yes
storing
storing
yes
yes
yes
yes
yes
yes
yes
storing
yes
storing
yes
yes
yes
yes
storing
yes
yes
yes
yes
yes
yes
yes
storing
storing
storing
yes
yes
yes
yes
yes
yes
yes
storing
storing
yes
yes
yes
yes
yes
yes
yes
storing
storing
yes
yes
yes
yes
yes
yes
yes
storing
storing
storing
storing
yes
yes
yes
yes
yes
yes
yes
storing
storing
storing
yes
yes
yes
yes
yes
yes
yes
storing
storing


In [97]:
pprint(key_counter)

{('baggingclassifier__n_estimators', 200): 2,
 ('extratreesclassifier__bootstrap', True): 1,
 ('extratreesclassifier__n_estimators', 300): 1,
 ('gradientboostingclassifier__max_depth', 3): 1,
 ('gradientboostingclassifier__n_estimators', 300): 1,
 ('kneighborsclassifier__metric', 'manhattan'): 1,
 ('kneighborsclassifier__n_neighbors', 7): 1,
 ('linearsvc__C', 5.0): 1,
 ('linearsvc__fit_intercept', False): 1,
 ('logisticregression__C', 3): 1,
 ('logisticregression__penalty', 'l2'): 1,
 ('logisticregression__solver', 'lbfgs'): 1,
 ('mlpclassifier__activation', 'relu'): 1,
 ('mlpclassifier__alpha', 0.50005): 1,
 ('mlpclassifier__hidden_layer_sizes', (100,)): 1,
 ('multinomialnb__alpha', 0.1): 1,
 ('multinomialnb__fit_prior', False): 1,
 ('nusvc__decision_function_shape', 'ovr'): 1,
 ('nusvc__degree', 5): 1,
 ('nusvc__nu', 0.45): 1,
 ('passiveaggressiveclassifier__C', 0.10526315789473684): 1,
 ('passiveaggressiveclassifier__fit_intercept', False): 1,
 ('randomforestclassifier__max_depth', 