# Model Comparison
## Find the Best Model

This notebook loads all of the comparison CSVs generated by `compare_models.py`, which runs five times per day on a random subset of eight subreddits.

It shows examples of different ways to group the classifiers and sort them by a chosen metric.

Each model's results also include a `best_params` column that gives the best parameters found for that model on each run.
A dictionary keyed by estimator, parameter, and value records how many times each value was chosen as the best.
A function to plot these best-parameter counts is also included.

In [None]:
from pprint import pprint
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
from glob import glob
from collections import Counter, namedtuple
import CONFIG
from pathlib import Path
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
# Show the current working directory. Path.cwd() is plain Python, so it does not
# depend on a `pwd` shell command being available on the host platform.
Path.cwd()

In [None]:
# Collect the comparison CSVs into a sorted list rather than keeping the raw
# result of .glob(): a list can be iterated more than once (a glob iterator is
# exhausted after the first pass, so re-running the loading cell would silently
# read nothing), and sorting makes the load order independent of the filesystem.
# NOTE(review): assumes CONFIG.MODEL_COMPARE_DIR is a pathlib.Path — confirm.
reports = sorted(CONFIG.MODEL_COMPARE_DIR.glob('*.csv'))

In [None]:
# Starting value for `df`, so it is defined (and empty) even when no report
# files are found.
df = pd.DataFrame()

In [None]:
# Read every report once and combine them in a single concat call.
# DataFrame.append was removed in pandas 2.0, and appending inside a loop
# re-copies the accumulated frame on every iteration.
frames = [pd.read_csv(report) for report in reports]
if frames:
    # Guarded because pd.concat raises ValueError on an empty list; when no
    # report was read, `df` keeps the value it already had.
    df = pd.concat(frames)
len(df)

In [None]:
# Remove the 'Unnamed: 0' column (presumably the index saved into each CSV —
# confirm against compare_models.py). Reassigning with errors='ignore' instead
# of dropping in place keeps this cell safe to re-run: a second run no longer
# raises KeyError because the column is already gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')
df.head(10)

In [None]:
# Columns selected for the per-estimator summary tables.
columns_of_interest = [
    'preprocessor',
    'estimator',
    'best_test_score',
    'roc_auc',
    'fit_time_seconds',
    'time_weighted_score',
]

In [None]:
# Per-estimator mean of each metric, highest mean test score first.
# .mean(numeric_only=True) skips any non-numeric columns in columns_of_interest;
# pandas >= 2.0 raises on them instead of silently dropping them, and passing
# np.mean to .agg() triggers a FutureWarning in newer pandas.
(
    df.groupby(by='estimator')[columns_of_interest]
    .mean(numeric_only=True)
    .sort_values(by='best_test_score', ascending=False)
)

In [None]:
# Per-estimator mean of each metric, highest mean ROC AUC first.
# .mean(numeric_only=True) skips any non-numeric columns in columns_of_interest;
# pandas >= 2.0 raises on them instead of silently dropping them, and passing
# np.mean to .agg() triggers a FutureWarning in newer pandas.
(
    df.groupby(by='estimator')[columns_of_interest]
    .mean(numeric_only=True)
    .sort_values(by='roc_auc', ascending=False)
)

In [None]:
# Per-estimator mean of each metric, highest mean time-weighted score first.
# .mean(numeric_only=True) skips any non-numeric columns in columns_of_interest;
# pandas >= 2.0 raises on them instead of silently dropping them, and passing
# np.mean to .agg() triggers a FutureWarning in newer pandas.
(
    df.groupby(by='estimator')[columns_of_interest]
    .mean(numeric_only=True)
    .sort_values(by='time_weighted_score', ascending=False)
)

# Best Params

In [None]:
# Tally how often each (estimator, parameter, value) combination appears across
# the best_params entry of every run.
params_tuple_list = []
for param_grid in df.best_params:
    # SECURITY NOTE(review): eval() executes whatever code is stored in the CSV
    # text, so only run this on reports you trust. It is kept because the stored
    # reprs may not all be plain literals — if they are, switch to
    # ast.literal_eval.
    for key, value in eval(param_grid).items():
        # Split on the first '__' only, so a nested name such as
        # 'step__sub_estimator__param' still unpacks into two parts instead of
        # raising ValueError.
        estimator, parameter = key.split('__', 1)
        params_tuple_list.append((estimator, parameter, value))

# Plain dict of (estimator, parameter, value) -> number of occurrences.
counted = dict(Counter(params_tuple_list))
counted

In [None]:
def plot_best_params(estimator, parameter=None, cmap='Blues_r'):
    """Plot how often each parameter value was counted for an estimator.

    Reads the notebook-level ``counted`` mapping, whose keys are
    ``(estimator, parameter, value)`` tuples and whose values are counts, and
    draws a horizontal bar chart of the matching entries, largest count first.

    Parameters
    ----------
    estimator : str
        Estimator name; matched against the first element of each key.
    parameter : str, optional
        When given, only entries whose second key element equals this name are
        plotted. When omitted, every parameter of the estimator is plotted.
    cmap : str
        Passed to ``sns.barplot`` as its ``palette``.

    Raises
    ------
    ValueError
        If no entry in ``counted`` matches the request. Without this check the
        function would go on to create a zero-sized figure.
    """
    # Sort once by count (descending) and keep only the requested entries; the
    # label and its count stay paired so the two plotted lists cannot drift apart.
    matches = [
        (f'{key[1]}: {key[2]}', count)
        for key, count in sorted(counted.items(), key=lambda item: item[1], reverse=True)
        if key[0] == estimator and (parameter is None or key[1] == parameter)
    ]
    if not matches:
        raise ValueError(
            f'No best-parameter counts found for estimator={estimator!r}, '
            f'parameter={parameter!r}'
        )

    string_keys = [label for label, _ in matches]
    string_values = [count for _, count in matches]

    if parameter is None:
        title = f'{estimator.upper()}'
    else:
        title = f'{estimator.upper()} {parameter.upper()}'

    # The figure is square and grows with the number of bars.
    plt.figure(figsize=(2 * len(string_keys), 2 * len(string_keys)))
    plt.title(title)

    sns.barplot(y=string_keys, x=string_values, orient='h', palette=cmap)



In [None]:
# Counts of each `max_df` value recorded for the TF-IDF vectorizer.
plot_best_params(estimator='tfidfvectorizer', parameter='max_df')

In [None]:
# Counts of each `strip_accents` value recorded for the TF-IDF vectorizer.
plot_best_params(estimator='tfidfvectorizer', parameter='strip_accents')

In [None]:
# Every recorded parameter value for logistic regression, in a single chart.
plot_best_params(estimator='logisticregression')

In [None]:
# One chart per estimator name present in `counted`. Iterating in sorted order
# keeps the charts in the same sequence on every run; plain set iteration over
# strings can change order between kernel sessions.
estimators = set(key[0] for key in counted.keys())
for estimator in sorted(estimators):
    plot_best_params(estimator, cmap='Purples_r')