# Best Model Validation


---

In [20]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve, classification_report
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import time
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from glob import glob
from pprint import pprint
from xgboost import XGBClassifier

%load_ext autoreload
%autoreload 2

In [2]:
from util import databases
from util import dataloader
from util import grid_models
from util.reddit_functions import Labeler
from util.reddit_functions import plot_confusion_matrix
from util.grid_models import custom_stop_words

In [3]:
def build_and_train_model(preprocessor, classifier, cv=3, verbose=1, X=None, y=None):
    '''
    Grid-search a two-step ``prep`` -> ``clf`` pipeline and return the fitted search.

    Parameters
    ----------
    preprocessor : dict
        ``{'preprocessor': <transformer>, 'params': {'prep__<name>': [values, ...]}}``.
    classifier : dict
        ``{'clf': <estimator>, 'params': {'clf__<name>': [values, ...]}}``.
    cv : int, default 3
        Forwarded to ``GridSearchCV`` as ``cv``.
    verbose : int, default 1
        Forwarded to ``GridSearchCV`` as ``verbose``.
    X, y : optional
        Training inputs and labels. When left as ``None`` the notebook-level
        ``X_train`` / ``y_train`` are used, which is what existing calls rely on.

    Returns
    -------
    GridSearchCV
        The search object after ``fit`` has been called on the training data.
    '''
    pipe = Pipeline([
        ('prep', preprocessor.get('preprocessor')),
        ('clf', classifier.get('clf')),
    ])

    # Merge both grids into a fresh dict (the caller's spec dicts are not modified).
    # A spec with no 'params' entry contributes nothing instead of raising TypeError.
    pipe_params = {
        **(preprocessor.get('params') or {}),
        **(classifier.get('params') or {}),
    }

    # Fall back to the notebook globals only when no data was passed explicitly.
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    model = GridSearchCV(pipe, param_grid=pipe_params, cv=cv, verbose=verbose, n_jobs=-1)
    model.fit(X, y)
    return model

In [4]:
def score_model(model):
    '''
    Print train/test scores, one-vs-rest ROC AUC and a classification report.

    Predictions are derived from ``model`` inside the function, so the AUC and
    the report always describe the model that was passed in rather than
    whatever ``y_pred`` / ``y_proba`` happen to exist in the notebook.

    Reads the notebook-level splits ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. ``model`` must expose ``score``, ``predict`` and
    ``predict_proba``. Returns ``None``; all output goes to stdout.
    '''
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)

    print(f'Train Score: {model.score(X_train, y_train)}')
    print(f'Test Score: {model.score(X_test, y_test)}')
    print(f'AUC Score: {roc_auc_score(y_test, y_proba, multi_class="ovr")}')
    print(classification_report(y_test, y_pred, digits=3))

In [5]:
# subreddit_list = ['css', 'html', 'javascript', 'php', 'perl', 'java', 'datascience', 'machinelearning', 'etl', 'python', 'dataengineering']

In [6]:
# Subreddits whose post titles are classified in this notebook.
subreddit_list = [
    'datascience',
    'machinelearning',
    'dataengineering',
    'python',
    'aws',
]

In [7]:
# Load the posts for the selected subreddits through the project's dataloader helper.
# NOTE(review): the second argument presumably selects the storage backend — confirm in util.dataloader.
df = dataloader.data_selector(subreddit_list, 'sqlite')

Connection to SQLite DB successful


In [8]:
# Eyeball ten randomly drawn rows (unseeded, so the rows differ between runs).
df.sample(n=10)

Unnamed: 0,title,subreddit,date
7056,[R] Reservoir memory machines,machinelearning,2020-04-02
35028,[Discussion] [ML UTD 4] Machine Learning Up-To...,datascience,2020-04-23
19270,How to query dynamoDB table with Java? Specifi...,aws,2020-04-20
31345,"[P] Peaple.IO, the synthetic social network",machinelearning,2020-04-22
45779,[Project] dataget: a framework-agnostic datase...,machinelearning,2020-04-25
42903,"Big Data Project with Hadoop, Tajo, and Spark",dataengineering,2020-04-24
43734,Large scale NLU menu management,aws,2020-04-25
50674,I made a script to automatically play given no...,python,2020-04-27
51493,Is Visual Studio 2019 any good for python deve...,python,2020-04-27
45203,[D] Video Analysis - Imputer: Sequence Modelli...,machinelearning,2020-04-25


In [9]:
# Model input is the post title; the target is the subreddit it was posted in.
X, y = df['title'], df['subreddit']

In [10]:
# Encode the subreddit column with the project's Labeler.
# Fit and transform read the raw column from `df` instead of `y`, so re-running
# this cell never feeds already-transformed labels back into the labeler.
labeler = Labeler()
labeler.fit(df['subreddit'])
y = labeler.transform(df['subreddit'])

In [11]:
# Hold-out split using train_test_split's default proportions; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=7,
)

In [12]:
# TF-IDF preprocessing spec: the vectorizer plus the grid searched over its 'prep__' parameters.
tfidf = dict(
    preprocessor=TfidfVectorizer(stop_words=custom_stop_words),
    params={
        'prep__ngram_range': [(1, 2)],
        'prep__max_df': [0.7, 0.8, 0.9],
        'prep__use_idf': [True],
        'prep__norm': ['l2'],
    },
)

In [13]:
# Logistic-regression spec: the estimator plus the grid over its regularisation parameter C.
lr = dict(
    clf=LogisticRegression(max_iter=1000),
    params={'clf__C': [0.01, 0.1, 1, 5]},
)

In [14]:
# Multi-layer perceptron spec: the estimator plus the grid over its hidden layer size.
mlp = dict(
    clf=MLPClassifier(),
    params={'clf__hidden_layer_sizes': [50, 100, 200]},
)

In [15]:
# XGBoost spec: the estimator plus the grid over tree count and tree depth.
# `hidden_layer_sizes` is an MLPClassifier parameter, not an XGBClassifier one; it was
# dropped from this grid because it only multiplied the number of fits without
# changing the model.
xgb = {
    'clf': XGBClassifier(),
    'params': {
        "clf__n_estimators": [50, 100, 200],
        "clf__max_depth": [5, 10, 20]
    }
}

In [16]:
# One-vs-rest random-forest spec: the wrapped estimator plus the grid over its forest size.
onevrest = dict(
    clf=OneVsRestClassifier(RandomForestClassifier()),
    params={'clf__estimator__n_estimators': [200, 300]},
)

In [17]:
# Train, predict and report for every classifier spec in the list (currently only `lr`).
for estimator in [lr]:
    
    # Grid-search the TF-IDF + classifier pipeline with the function's default cv / verbose.
    model = build_and_train_model(preprocessor=tfidf, classifier=estimator)
    # Predicted labels and class probabilities on the held-out split, kept as notebook-level variables.
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)
    score_model(model)
    # Project helper imported from util.reddit_functions; class names come from the fitted labeler.
    plot_confusion_matrix(model, y_true=y_test, y_pred=y_pred, classes=labeler.classes_)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:  1.3min finished


Train Score: 0.9933051280859175
Test Score: 0.9836122733612274
AUC Score: 0.9979727821180688
              precision    recall  f1-score   support

           0      0.993     0.992     0.992      2962
           1      0.975     0.982     0.979      2613
           2      0.985     0.990     0.988      2998
           3      0.984     0.971     0.977      2784
           4      0.981     0.982     0.981      2983

    accuracy                          0.984     14340
   macro avg      0.983     0.983     0.983     14340
weighted avg      0.984     0.984     0.984     14340



In [None]:
# Validation curve: how training and cross-validated scores move with the
# regularisation strength C of the TF-IDF + logistic-regression pipeline.
# NOTE(review): this cell previously held an unexecuted polynomial-regression snippet
# that could not run in this notebook; it has been adapted to the model validated
# here. Confirm that C is the hyper-parameter you want to sweep.
from sklearn.model_selection import validation_curve

C_range = np.logspace(-2, 1, 7)
curve_pipe = Pipeline([
    ('prep', TfidfVectorizer(stop_words=custom_stop_words, ngram_range=(1, 2))),
    ('clf', LogisticRegression(max_iter=1000)),
])

# param_name / param_range are keyword-only in current scikit-learn.
train_score, val_score = validation_curve(
    curve_pipe,
    X_train,
    y_train,
    param_name='clf__C',
    param_range=C_range,
    cv=3,
    n_jobs=-1,
)

# Each score array has one row per C value and one column per CV fold;
# plot the per-value median across folds.
fig, ax = plt.subplots()
ax.plot(C_range, np.median(train_score, 1), color='blue', label='training score')
ax.plot(C_range, np.median(val_score, 1), color='red', label='validation score')
ax.set_xscale('log')
ax.set_ylim(0, 1)
ax.set_xlabel('C')
ax.set_ylabel('score')
ax.legend(loc='best');