# Data Science Workflow
## Find the Best Model

This notebook shows how to use some of the functions located in `reddit_functions` to compare the performance of different models on the data.

A second workflow then takes the parameters of the best model, builds a new model from them, fits it on the entire dataset, and measures the improvement.

In [None]:
from pprint import pprint
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
from helpers import databases
from helpers import dataloader
from helpers import grid_models
from helpers.reddit_functions import Reddit

In [None]:
# subreddit_list = ['css', 'html', 'javascript', 'php', 'perl', 'java', 'datascience', 'machinelearning', 'etl', 'python', 'dataengineering']

In [None]:
# Subreddits whose posts are loaded and compared in this notebook.
subreddit_list = [
    'datascience',
    'machinelearning',
    'dataengineering',
    'python',
    'aws',
    'sql',
]

In [None]:
# Load the posts for the selected subreddits through the project's data loader,
# using the 'sqlite' source. Later cells read the `subreddit` column of the result.
df = dataloader.data_selector(subreddit_list, 'sqlite')

In [None]:
# Get rid of list items with no data retrieved. The retrieved subreddit names
# are collected into a set once, instead of calling `unique()` again for every
# element of the list.
retrieved_subs = set(df.subreddit.unique())
subreddit_list = [sub for sub in subreddit_list if sub in retrieved_subs]
subreddit_list

In [None]:
# Encode the subreddit labels; presumably this adds the `sub_code` column that
# is used as the target below — confirm in helpers.dataloader.
# NOTE(review): this rebinds `df`, so re-running the cell feeds an already
# encoded frame back into `subreddit_encoder` — confirm that is harmless.
df = dataloader.subreddit_encoder(df)

In [None]:
# Peek at a reproducible random sample (same seed as the train/test split).
# The size is capped at the frame length so a small result set does not make
# `sample` raise ValueError.
df.sample(min(10, len(df)), random_state=7)

In [None]:
# Features are the post titles; the target is the `sub_code` column.
X, y = df['title'], df['sub_code']

In [None]:
# Hold out a test set with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=7,
)

In [None]:
# Extra words to drop in addition to scikit-learn's built-in English list.
useless_words = {
    'using', 'help', 'new', 'data', 'science',
    'machine', 'learning', 'use', 'need',
}

# Combine the built-in list, the subreddit names, and the extra words into a
# single stop-word collection.
custom_stop_words = ENGLISH_STOP_WORDS | set(subreddit_list) | useless_words

In [None]:
# Instantiate the project's Reddit helper; its `compare_models` method drives
# the model comparison further down.
redfuncs = Reddit()

In [None]:
# Local aliases for the preprocessor and estimator registries in `grid_models`.
# These are references, not copies, so changes made through them also change
# the objects held by the module.
preprocessors, estimators = grid_models.preprocessors, grid_models.estimators

In [None]:
# Offer the custom stop words as an extra candidate in the CountVectorizer grid.
# They are passed as a sorted list because scikit-learn documents `stop_words`
# as 'english', a list, or None. The membership check keeps this cell
# idempotent: re-running it does not add a duplicate candidate.
count_vec_stop_words = preprocessors['count_vec']['pipe_params']['count_vec__stop_words']
custom_stop_word_list = sorted(custom_stop_words)
if custom_stop_word_list not in count_vec_stop_words:
    count_vec_stop_words.append(custom_stop_word_list)
# Optional: search over the custom list only.
# preprocessors['count_vec']['pipe_params']['count_vec__stop_words'].remove('english')

In [None]:
# Offer the custom stop words as an extra candidate in the TfidfVectorizer grid.
# They are passed as a sorted list because scikit-learn documents `stop_words`
# as 'english', a list, or None. The membership check keeps this cell
# idempotent: re-running it does not add a duplicate candidate.
tfidf_stop_words = preprocessors['tfidf']['pipe_params']['tfidf__stop_words']
custom_stop_word_list = sorted(custom_stop_words)
if custom_stop_word_list not in tfidf_stop_words:
    tfidf_stop_words.append(custom_stop_word_list)
# Optional: search over the custom list only.
# preprocessors['tfidf']['pipe_params']['tfidf__stop_words'].remove('english')

### Compare All Models

In [None]:
# Run the model comparison on the train/test split with cv=3 (presumably the
# number of cross-validation folds) and verbose progress output.
# NOTE(review): the grids are not passed in here — presumably `compare_models`
# reads `grid_models.preprocessors` / `grid_models.estimators` itself; confirm
# in helpers.reddit_functions.
compare_df = redfuncs.compare_models(X_train, X_test, y_train, y_test, cv=3, verbose=1)

In [None]:
# Rank the compared models, highest `best_test_score` first.
compare_df.sort_values(by='best_test_score', ascending=False)

In [None]:
# Save the comparison under a timestamped file name. The timestamp is formatted
# without spaces or colons (colons are not allowed in Windows file names), and
# the file gets a `.csv` extension so tools recognise its format.
date = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
compare_df.to_csv(f'data/compare_df/{date}.csv')

In [None]:
# [pprint(params) for params in compare_df.sort_values(by='best_test_score', ascending=False)['best_params']]

In [None]:
# Take the top-ranked row of the comparison as a plain dict keyed by column name.
best_model = (
    compare_df
    .sort_values(by='best_test_score', ascending=False)
    .iloc[0]
    .to_dict()
)
best_model

## Make a new model with the best params from the search

In [None]:
# Rebuild a two-step pipeline from the winning preprocessor/estimator pair and
# apply the hyper-parameters recorded for it in the comparison.
best_pipe = Pipeline([
    (best_model['prep_code'], preprocessors[best_model['prep_code']]['processor']),
    (best_model['est_code'], estimators[best_model['est_code']]['estimator'])
])
best_pipe.set_params(**best_model['best_params'])
# fit on entire dataset
best_pipe.fit(X, y)
# Keep the score in a variable: the "Model Improvement" cells read
# `best_pipe_score`. It is measured on the same data the pipeline was just
# fitted on, so treat it as an optimistic figure.
best_pipe_score = best_pipe.score(X, y)
best_pipe_score

In [None]:
# Cross-validate the tuned pipeline on the full dataset and show the per-fold
# scores together with their mean.
cross_score = cross_val_score(estimator=best_pipe, X=X, y=y)
print(cross_score, cross_score.mean())


### Model Improvement

In [None]:
# Baseline: the share of rows in each class. Always predicting the most common
# class would score the largest of these values.
y.value_counts(normalize=True)

In [None]:
# How much improvement over baseline. The baseline is the share of the most
# common class; `.max()` selects it directly, whereas indexing with `[0]` looks
# up the class labelled 0, which need not be the largest one.
best_pipe_score - y.value_counts(normalize=True).max()

In [None]:
# Gap between the weakest and the strongest model: the refitted pipeline's score
# minus the lowest test score in the comparison. The column key is the same
# 'best_test_score' name used when sorting `compare_df`.
best_pipe_score - compare_df['best_test_score'].min()

In [None]:
# How much improvement from retraining on the entire dataset: the refitted
# pipeline's score minus the test score the same model had in the search. The
# key is the 'best_test_score' column name that `best_model` was built from.
best_pipe_score - best_model['best_test_score']