# Special Model Comparison

### TODO

1. Add keras model

---

### This is gridsearch for each model

https://stackabuse.com/grid-search-optimization-algorithm-in-python/

In [None]:
## DO THIS BEFORE IMPORTING KERAS OR TENSOR TO USE PLAIDML
import plaidml.keras
plaidml.keras.install_backend()

# Help MacOS be able to use Keras
import os
os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"

# Gets rid of the processor warning.
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [26]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve, classification_report
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import time
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from glob import glob
from pprint import pprint
from xgboost import XGBClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.optimizers import Adam

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
from util import databases
from util import dataloader
from util import grid_models
from util.reddit_functions import Labeler
from util.reddit_functions import plot_confusion_matrix
from util.grid_models import custom_stop_words

In [29]:
def create_keras_model(learn_rate, dropout_rate, input_dim):

    model = Sequential()
    model.add(Dense(8, input_dim=input_dim, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(4, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid'))

    adam = Adam(lr=learn_rate)
    model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
    return model

In [3]:
def build_and_train_model(preprocessor, classifier, cv=3, verbose=1):
    '''
    Takes a dictionary with params and outputs a fitted model
    '''
    pipe = Pipeline(
    [('prep', preprocessor.get('preprocessor')),
     ('clf', classifier.get('clf'))])
    
    pipe_params = dict()
    pipe_params.update(preprocessor.get('params'))
    pipe_params.update(classifier.get('params'))
    
    model = GridSearchCV(pipe, param_grid=pipe_params, cv=cv, verbose=verbose, n_jobs=-1)
    model.fit(X_train, y_train)
    return model

In [4]:
def score_model(model):
    
    print(f'Train Score: {model.score(X_train, y_train)}')
    print(f'Test Score: {model.score(X_test, y_test)}')
    print(f'AUC Score: {roc_auc_score(y_test, y_proba, multi_class="ovr")}')
    print(classification_report(y_test, y_pred, digits=3))

In [5]:
# subreddit_list = ['css', 'html', 'javascript', 'php', 'perl', 'java', 'datascience', 'machinelearning', 'etl', 'python', 'dataengineering']

In [6]:
subreddit_list = ['datascience','machinelearning','dataengineering','python','aws']

In [7]:
df = dataloader.data_selector(subreddit_list, 'sqlite')

Connection to SQLite DB successful


In [8]:
df.sample(10)

Unnamed: 0,title,subreddit,date
47674,How to design a SQL database,dataengineering,2020-04-25
34840,should I get a PhD in computer vision?,datascience,2020-04-23
36424,VCS vs PyPi on production,python,2020-04-23
35974,[P] A guide to fine-tuning XLNet for abstracti...,machinelearning,2020-04-23
33263,Facebook DE onsite interview in two weeks. Wha...,dataengineering,2020-04-22
22539,This weekend I released a Django/Vue.js/GraphQ...,python,2020-04-20
13521,Would this project come under Data Engineering?,dataengineering,2020-04-10
40153,[P] Stylegan2 training hardware,machinelearning,2020-04-24
41503,I made a Dash app that visually compares the F...,python,2020-04-24
54025,It’s never too early,datascience,2020-04-28


In [9]:
X = df['title']
y = df['subreddit']

In [10]:
labeler = Labeler()
labeler.fit(y)
y = labeler.transform(y)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=7)

In [12]:
tfidf = {
    'preprocessor': TfidfVectorizer(stop_words=custom_stop_words),
    'params': {
        "prep__ngram_range": [(1, 2)],
#         "prep__max_df": [.7, .8, .9],
        "prep__use_idf": [True],
        "prep__norm": ["l2"]
    }
}

In [13]:
lr = {
    'name': 'lr',
    'clf': LogisticRegression(max_iter=1000),
    'params': {
        "clf__C": [5]
    }
    
}

In [14]:
mlp = {
    'name': 'mlp',
    'clf': MLPClassifier(),
    'params': {
        "clf__hidden_layer_sizes": [50, 100, 200]
    }
}

In [15]:
xgb = {
    'name': 'xgb',
    'clf': XGBClassifier(),
    'params': {
        "clf__hidden_layer_sizes": [10, 25, 50],
        "clf__n_estimators": [50, 100, 200],
        "clf__max_depth": [5, 10, 20]
    }
}

In [16]:
onevrest = {
    'name': 'onevrest',
    'clf': OneVsRestClassifier(RandomForestClassifier()),
    'params': {
        "clf__estimator__n_estimators": [200, 300]
    }
}

In [40]:
keras = {
    'name': 'keras',
    'clf': KerasClassifier(build_fn=create_keras_model),
    'params': {
        'clf__input_dim': [X_train.shape[1]],
        'clf__learn_rate': [0.001, 0.02, 0.2],
        'clf__dropout_rate': [0.0, 0.2, 0.4],
        'clf__batch_size': [32, 64, 128],
        'clf__epochs': [10, 20, 50]
    }
}

In [17]:
fitted_models = dict()

In [28]:
# for estimator in [lr, mlp, xgb, onevrest, keras]:
for estimator in [keras]:
    
    model = build_and_train_model(preprocessor=tfidf, classifier=estimator)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)
    score_model(model)
    plot_confusion_matrix(model, y_true=y_test, y_pred=y_pred, classes=labeler.classes_)
    fitted_models[estimator.get('name')] = model

Fitting 3 folds for each of 81 candidates, totalling 243 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   33.1s
[Parallel(n_jobs=-1)]: Done 243 out of 243 | elapsed:   40.0s finished


ValueError: Error when checking input: expected dense_input to have shape (8,) but got array with shape (43840,)

In [19]:
fitted_models

Pipeline(memory=None,
         steps=[('prep',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=frozenset({'a', 'about', 'above',
                                                       'across', 'after',
                                                       'afterwards...
                                 strip_accents=None, sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
