# Data Science Workflow
## Find the Best Model

This notebook shows how to use some of the functions located in `reddit_functions` to compare the performance of different models on the data.

A second workflow then takes the parameters of the best model, builds a new model from them, fits it on the entire dataset, and measures the improvement.

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from pprint import pprint

In [1]:
import databases
import dataloader
import grid_models
from reddit_functions import Reddit

ModuleNotFoundError: No module named 'databases'

In [None]:
# subreddit_list = ['css', 'html', 'javascript', 'php', 'perl', 'java', 'datascience', 'machinelearning', 'etl', 'python', 'dataengineering']

In [None]:
# Subreddits whose posts are requested from the data loader.
subreddit_list = [
    'datascience',
    'machinelearning',
    'dataengineering',
    'python',
    'aws',
    'sql',
]

In [None]:
# Fetch the data for the requested subreddits through the project's dataloader.
# 'sqlite' is passed as the second argument (presumably the storage backend to
# read from — confirm against dataloader.data_selector).
df = dataloader.data_selector(subreddit_list, 'sqlite')

In [None]:
# get rid of list items with no data retrieved
subreddit_list = [sub for sub in subreddit_list if sub in df.subreddit.unique()]
subreddit_list

In [None]:
# Pass the frame through dataloader.subreddit_encoder and rebind df to the result.
# NOTE(review): presumably this is what adds the `sub_code` column used as the
# target later in the notebook — confirm in dataloader.
df = dataloader.subreddit_encoder(df)

In [None]:
# Eyeball 10 randomly chosen rows; no random_state is passed, so a different
# sample is shown each time the cell runs.
df.sample(10)

In [None]:
# Model input is the `title` column; the target is the `sub_code` column.
X, y = df['title'], df['sub_code']

In [None]:
# Hold out a test split; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=7,
)

In [None]:
# Extra filler terms to drop in addition to the built-in English stop words.
useless_words = {
    'using', 'help', 'new', 'data', 'science',
    'machine', 'learning', 'use', 'need',
}

# Built-in stop words + the subreddit names themselves + the filler terms above.
custom_stop_words = ENGLISH_STOP_WORDS | set(subreddit_list) | useless_words

In [None]:
# Create the project's Reddit helper (imported from reddit_functions) with its
# default constructor arguments.
redfun = Reddit()

In [None]:
# Pull the candidate preprocessor and estimator definitions out of grid_models.
preprocessors, estimators = grid_models.preprocessors, grid_models.estimators

In [None]:
# Pretty-print the preprocessor definitions taken from grid_models for inspection.
pprint(preprocessors)

In [None]:
# Pretty-print the estimator definitions taken from grid_models for inspection.
pprint(estimators)

### Compare Subset of Models

In [None]:
# esty = {'logreg': estimators['logreg']}

# compare_df = redfun.compare_models(X_train, X_test, y_train, y_test, estimators=esty, cv=3, verbose=0)

### Compare All Models

In [None]:
# Run the comparison across every estimator defined in grid_models.
# compare_models is a method of the Reddit helper (there is no bare
# `compare_models` name in this notebook) and it takes the train/test splits
# explicitly, mirroring the subset call shown in the previous section.
compare_df = redfun.compare_models(
    X_train, X_test, y_train, y_test,
    estimators=estimators,
    cv=3,
    verbose=0,
)

In [None]:
# Show the comparison table ordered by 'Best Test Score', highest first.
compare_df.sort_values(by='Best Test Score', ascending=False)

In [None]:
# Rank the results by test score (descending) and keep the top row as a plain dict.
ranked_results = compare_df.sort_values(by='Best Test Score', ascending=False)
best_model = ranked_results.iloc[0].to_dict()
best_model

## Make a new model with the best params from the search

In [None]:
# Step names reuse the keys recorded for the winning row of the comparison table.
prep_key = best_model['Preprocessor']
est_key = best_model['Estimator']

pipeline_steps = [
    (prep_key, preprocessors[prep_key]['processor']),
    (est_key, estimators[est_key]['estimator']),
]
best_pipe = Pipeline(pipeline_steps)

# Apply the hyper-parameters stored under 'Best Params' for that row.
best_pipe.set_params(**best_model['Best Params'])

# fit on entire dataset
best_pipe.fit(X, y)

In [None]:
# Score the refit pipeline on X, y.
# NOTE(review): X, y is the same data the pipeline was fit on, so this is a
# training-set score and will tend to be optimistic compared with the held-out
# 'Best Test Score' values in the comparison table.
best_pipe_score = best_pipe.score(X, y)
best_pipe_score

### Model Improvement

In [None]:
# baseline
y.value_counts(normalize=True)

In [None]:
# how much improvement over baseline
best_pipe_score - y.value_counts(normalize=True)[0]

In [None]:
# how much difference from the best worst model to the best best model
best_pipe_score - min(compare_df['Best Test Score'])

In [None]:
# how much improvement from retraining on entire dataset
best_pipe_score - best_model['Best Test Score']