# Model Comparison
## Find the Best Model



In [1]:
from pprint import pprint
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
from glob import glob
from collections import Counter, namedtuple
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
!pwd

/Users/chris/github/reddit_nlp


In [3]:
# === TODO === #
# use postgres to store and retrieve the compare df rows

In [4]:
reports = glob('data/compare_df/*.csv')

['data/compare_df/2020-05-09_1428.csv',
 'data/compare_df/2020-05-09_0938.csv',
 'data/compare_df/2020-05-09_1427.csv']

In [5]:
df = pd.DataFrame()

In [6]:
for report in reports:
    print(report)
    data = pd.read_csv(report)
    df = df.append(data)
    

data/compare_df/2020-05-09_1428.csv
data/compare_df/2020-05-09_0938.csv
data/compare_df/2020-05-09_1427.csv


In [7]:
df.drop(columns='Unnamed: 0', inplace=True)
df.head(10)

Unnamed: 0,preprocessor,estimator,best_train_score,best_test_score,time_weighted_score,roc_auc,train_test_variance,fit_time_seconds,predict_time_seconds,best_params,subreddits,date
0,TfidfVectorizer,XGBoost Classifier,0.898,0.719,34.703,0.906,0.199,20.474,0.255,"{'xgbclassifier__n_estimators': 100, 'xgbclass...","scikit_learn, aws, awscertifications, java, sq...",2020-05-09 14:28:46.143364
1,TfidfVectorizer,MLPClassifier,0.822,0.73,8.41,0.911,0.112,86.753,0.042,"{'tfidfvectorizer__use_idf': False, 'tfidfvect...","scikit_learn, aws, awscertifications, java, sq...",2020-05-09 14:28:46.143364
2,TfidfVectorizer,Logistic Regression,0.864,0.74,2526.104,0.919,0.143,0.261,0.032,"{'tfidfvectorizer__use_idf': False, 'tfidfvect...","scikit_learn, aws, awscertifications, java, sq...",2020-05-09 14:28:46.143364
3,TfidfVectorizer,Random Forest,0.763,0.69,251.516,0.898,0.096,2.449,0.294,"{'tfidfvectorizer__use_idf': True, 'tfidfvecto...","scikit_learn, aws, awscertifications, java, sq...",2020-05-09 14:28:46.143364
4,TfidfVectorizer,K Nearest Neighbors,0.659,0.555,961.899,0.79,0.157,0.157,0.42,"{'tfidfvectorizer__use_idf': True, 'tfidfvecto...","scikit_learn, aws, awscertifications, java, sq...",2020-05-09 14:28:46.143364
5,TfidfVectorizer,Multinomial Bayes Classifier,0.855,0.757,8279.138,0.927,0.115,0.072,0.019,"{'tfidfvectorizer__use_idf': False, 'tfidfvect...","scikit_learn, aws, awscertifications, java, sq...",2020-05-09 14:28:46.143364
6,TfidfVectorizer,Support Vector Classifier,0.927,0.706,187.48,0.927,0.238,3.221,0.544,"{'tfidfvectorizer__use_idf': False, 'tfidfvect...","scikit_learn, aws, awscertifications, java, sq...",2020-05-09 14:28:46.143364
7,TfidfVectorizer,AdaBoost Classifier,0.474,0.448,389.966,0.74,0.055,1.021,0.127,"{'tfidfvectorizer__use_idf': False, 'tfidfvect...","scikit_learn, aws, awscertifications, java, sq...",2020-05-09 14:28:46.143364
8,TfidfVectorizer,Bagging Classifier Logistic Regression,0.862,0.751,14.776,0.928,0.129,50.706,0.097,"{'tfidfvectorizer__use_idf': True, 'tfidfvecto...","scikit_learn, aws, awscertifications, java, sq...",2020-05-09 14:28:46.143364
9,TfidfVectorizer,Bagging Classifier,0.99,0.693,7.299,0.884,0.299,94.589,0.422,"{'tfidfvectorizer__use_idf': True, 'tfidfvecto...","scikit_learn, aws, awscertifications, java, sq...",2020-05-09 14:28:46.143364


In [8]:
columns_of_interest = ['preprocessor','estimator','best_test_score', 'roc_auc', 'fit_time_seconds','time_weighted_score']

In [9]:
df.sort_values(by='time_weighted_score', ascending=False)[columns_of_interest]

Unnamed: 0,preprocessor,estimator,best_test_score,roc_auc,fit_time_seconds,time_weighted_score
5,TfidfVectorizer,Multinomial Bayes Classifier,0.757,0.927,0.072,8279.138
13,TfidfVectorizer,Stochastic Gradient Descent Classifier,0.721,0.88,0.186,3296.15
5,TfidVectorizer,Multinomial Bayes Classifier,0.614,0.89,0.163,3128.69
12,TfidfVectorizer,Passive Agressive Classifier,0.753,0.88,0.257,2617.406
2,TfidfVectorizer,Logistic Regression,0.74,0.919,0.261,2526.104
12,TfidVectorizer,Stochastic Gradient Descent Classifier,0.606,0.866,0.211,2510.941
14,TfidfVectorizer,Linear SVC,0.759,0.88,0.317,2245.237
11,TfidVectorizer,Passive Agressive Classifier,0.582,0.866,0.305,1749.729
14,TfidVectorizer,Linear SVC,0.592,0.866,0.364,1501.224
4,TfidfVectorizer,K Nearest Neighbors,0.555,0.79,0.157,961.899


In [10]:
df.sort_values(by='best_test_score', ascending=False)[columns_of_interest]

Unnamed: 0,preprocessor,estimator,best_test_score,roc_auc,fit_time_seconds,time_weighted_score
14,TfidfVectorizer,Linear SVC,0.759,0.88,0.317,2245.237
5,TfidfVectorizer,Multinomial Bayes Classifier,0.757,0.927,0.072,8279.138
12,TfidfVectorizer,Passive Agressive Classifier,0.753,0.88,0.257,2617.406
8,TfidfVectorizer,Bagging Classifier Logistic Regression,0.751,0.928,50.706,14.776
2,TfidfVectorizer,Logistic Regression,0.74,0.919,0.261,2526.104
1,TfidfVectorizer,MLPClassifier,0.73,0.911,86.753,8.41
10,TfidfVectorizer,Extra Trees Classifier,0.728,0.893,20.907,33.665
11,TfidfVectorizer,Gradient Boosting Classifier,0.725,0.88,501.25,1.435
13,TfidfVectorizer,Stochastic Gradient Descent Classifier,0.721,0.88,0.186,3296.15
0,TfidfVectorizer,XGBoost Classifier,0.719,0.906,20.474,34.703


In [23]:
df.groupby(by='estimator')[columns_of_interest].agg(np.mean)

# [columns_of_interest].mean()

Unnamed: 0_level_0,best_test_score,roc_auc,fit_time_seconds,time_weighted_score
estimator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AdaBoost Classifier,0.448,0.74,1.021,389.966
Bagging Classifier,0.693,0.884,94.589,7.299
Bagging Classifier Logistic Regression,0.6865,0.914,113.2155,9.1555
Bagging Classifier MultinomalNB,0.544,0.855,94.761,5.707
Extra Trees Classifier,0.661,0.8885,19.7375,32.038
Gradient Boosting Classifier,0.6505,0.873,260.212,15.6635
K Nearest Neighbors,0.4855,0.7725,0.12,921.563
Linear SVC,0.6755,0.873,0.3405,1873.2305
Logistic Regression,0.6755,0.903,0.8135,1481.954
MLPClassifier,0.73,0.911,86.753,8.41


In [33]:
df.groupby(by='estimator')[columns_of_interest].mean().sort_values(by='best_test_score', ascending=False)



Unnamed: 0_level_0,best_test_score,roc_auc,fit_time_seconds,time_weighted_score
estimator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MLPClassifier,0.73,0.911,86.753,8.41
Bagging Classifier,0.693,0.884,94.589,7.299
Bagging Classifier Logistic Regression,0.6865,0.914,113.2155,9.1555
Multinomial Bayes Classifier,0.6855,0.9085,0.1175,5703.914
Linear SVC,0.6755,0.873,0.3405,1873.2305
Logistic Regression,0.6755,0.903,0.8135,1481.954
Passive Agressive Classifier,0.6675,0.873,0.281,2183.5675
Stochastic Gradient Descent Classifier,0.6635,0.873,0.1985,2903.5455
Extra Trees Classifier,0.661,0.8885,19.7375,32.038
Support Vector Classifier,0.6545,0.9085,4.2855,142.6515


# Best Params

In [34]:
Param = namedtuple('Param', ['key', 'value', 'count'])

In [36]:
stored_keys = dict()
key_counter = dict()

for param_grid in df.best_params:
    for k, v in eval(param_grid).items():
        if (k, v) in stored_keys.keys():
            print('yes')
            key_counter[(k, v)] += 1
        else:
            print('storing')
            stored_keys[(k, v)] = True
            key_counter[(k, v)] = 1

storing
storing
storing
storing
storing
storing
storing
storing
storing
storing
yes
storing
storing
yes
storing
storing
yes
storing
storing
storing
yes
yes
yes
yes
yes
storing
storing
storing
storing
storing
storing
yes
yes
yes
yes
yes
storing
storing
storing
storing
storing
yes
yes
yes
yes
storing
yes
yes
storing
storing
yes
yes
yes
storing
yes
yes
storing
storing
storing
yes
storing
yes
yes
storing
yes
yes
storing
storing
yes
yes
yes
yes
yes
yes
yes
storing
storing
yes
yes
yes
yes
yes
yes
yes
storing
yes
yes
yes
yes
yes
yes
storing
storing
yes
yes
yes
yes
yes
yes
yes
storing
storing
yes
yes
yes
yes
yes
yes
yes
storing
storing
yes
yes
yes
yes
yes
yes
yes
storing
storing
yes
yes
yes
yes
yes
yes
yes
storing
storing
storing
storing
yes
yes
yes
yes
yes
yes
yes
storing
storing
yes
storing
storing
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
storing
storing
yes
yes
yes
yes
yes
yes
yes
yes
storing
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
storing
storing
storing
yes
yes
yes
yes
stor

In [37]:
pprint(key_counter)

{('adaboostclassifier__learning_rate', 0.1): 1,
 ('adaboostclassifier__n_estimators', 200): 1,
 ('baggingclassifier__n_estimators', 50): 1,
 ('baggingclassifier__n_estimators', 200): 3,
 ('extratreesclassifier__bootstrap', False): 1,
 ('extratreesclassifier__bootstrap', True): 1,
 ('extratreesclassifier__n_estimators', 300): 2,
 ('gradientboostingclassifier__max_depth', None): 1,
 ('gradientboostingclassifier__max_depth', 3): 1,
 ('gradientboostingclassifier__n_estimators', 100): 1,
 ('gradientboostingclassifier__n_estimators', 300): 1,
 ('kneighborsclassifier__metric', 'manhattan'): 2,
 ('kneighborsclassifier__n_neighbors', 7): 2,
 ('linearsvc__C', 5.0): 1,
 ('linearsvc__C', 5.05): 1,
 ('linearsvc__fit_intercept', False): 1,
 ('linearsvc__fit_intercept', True): 1,
 ('logisticregression__C', 3): 2,
 ('logisticregression__penalty', 'l2'): 2,
 ('logisticregression__solver', 'lbfgs'): 1,
 ('logisticregression__solver', 'saga'): 1,
 ('mlpclassifier__activation', 'relu'): 2,
 ('mlpclassifie