# Thorough Gridsearch Model


This notebook contains code to easily add a model and preprocessor(s) and gridsearch through every possible combination to find the best

## Future:
Compare bagging, boosting, gradient, with base model.

---

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, validation_curve
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve, classification_report
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import time
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from glob import glob
from pprint import pprint
from xgboost import XGBClassifier
import joblib
import CONFIG


%load_ext autoreload
%autoreload 2

In [2]:
from util import databases
from util import dataloader
from util import grid_models
from util.reddit_functions import Labeler
from util.reddit_functions import plot_confusion_matrix
from util.grid_models import custom_stop_words, get_random_class_labels

In [3]:
labels = get_random_class_labels(8)
print(labels)

df = dataloader.data_selector(labels, data_source='sqlite')

df.sample(10)

['softwarearchitecture' 'postgresql' 'css' 'apachespark' 'mongodb'
 'datascience' 'scala' 'deeplearning']
Connection to SQLite DB successful


Unnamed: 0,title,subreddit,date
2081,"Postgresql Integrations with Flask - By an ""En...",postgresql,2020-03-29
21439,How much does the title matter?,datascience,2020-04-10
81758,TF2.0 Auto encoder for beginner: 10min,deeplearning,2020-04-28
7353,Made this when I first got into data science. ...,datascience,2020-03-29
4426,Scala event if you're in the DC/NOVA Metro Area:,scala,2020-03-29
1855,Using Flexbox to arrange images,css,2020-03-29
6124,What do you guys think of the schedule for thi...,mongodb,2020-03-29
4290,Scale by the bay - Let's meet up ?,scala,2020-03-29
38138,CSS Grid Generator,css,2020-04-21
6316,How to make an INSTAGRAM clone - Profile Vue -...,mongodb,2020-03-29


In [4]:
X = df['title']
y = df['subreddit']

In [5]:
### TODO ### use scikit-learn labeler

In [6]:
labeler = Labeler()
labeler.fit(y)
y = labeler.transform(y)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=7)

In [8]:
tfidf = {
    'preprocessor': TfidfVectorizer(stop_words=custom_stop_words),
    'params': {
        "prep__ngram_range": [(1, 2)],
        "prep__max_df": [.9],
        "prep__use_idf": [True],
        "prep__norm": ["l2"]
    }
}

In [9]:
lr = {
    'clf': LogisticRegression(max_iter=1000),
    'params': {
        "clf__C": [.01, .1, 1, 5]
    }
    
}

In [10]:
xgb = {
    'clf': XGBClassifier(n_jobs=-1),
    'params': {
        "clf__max_depth": [3, 5, 10, 20, 50],
        "clf__learning_rate": np.linspace(.001, .99, 5),
        "clf__n_estimators": [50, 100, 200],
        "clf__objective": ['binary:logistic', 'multi:softprob'],
        "clf__booster": ['gbtree', 'gblinear', 'dart'],
        "clf__gamma": np.linspace(0, 1, 5),
        "clf__subsample": np.linspace(.5, 1, 5),
        "clf__reg_alpha": np.linspace(0, 1, 5),
        "clf__reg_lambda": np.linspace(0, 1, 5),
        "clf__importance_type": ['gain', 'weight', 'cover', 'total_gain', 'total_cover'],
        "clf__hidden_layer_sizes": [10, 20, 30, 40, 50],
    }
}

In [11]:
xgb_orig = {
    'clf': XGBClassifier(n_jobs=-1, tree_method='gpu_hist', gpu_id=0),
    'params': {}
}

In [12]:
random = {
    'clf': RandomForestClassifier(),
    'params': {
        "clf__n_estimators": [200, 300]
    }
}

In [13]:
onevrest = {
    'clf': OneVsRestClassifier(RandomForestClassifier()),
    'params': {
        "clf__estimator__n_estimators": [200, 300]
    }
}

In [14]:
def build_and_train_model(preprocessor, classifier, cv=3, verbose=1):
    '''
    Takes a dictionary with params and outputs a fitted model
    '''
    pipe = Pipeline(
    [('prep', preprocessor.get('preprocessor')),
     ('clf', classifier.get('clf'))])
    
    pipe_params = dict()
    pipe_params.update(preprocessor.get('params'))
    pipe_params.update(classifier.get('params'))
    
    model = GridSearchCV(pipe, param_grid=pipe_params, cv=cv, verbose=verbose, n_jobs=-1)
    model.fit(X_train, y_train)
    return model

In [15]:
for estimator in [xgb_orig]:
    
    model = build_and_train_model(preprocessor=tfidf, classifier=estimator, verbose=5)
    
    date = str(datetime.datetime.now().strftime('%Y-%m-%d_%H%M'))
    estimator_name = type(model.estimator.named_steps.clf).__name__
    joblib_file = f'{estimator_name}_best_model_{date}.pkl'
    joblib.dump(model, CONFIG.DATA_DIR / joblib_file)
    
    print(f'Train Score: {model.score(X_train, y_train)}')
    print(f'Test Score: {model.score(X_test, y_test)}')
    
    if hasattr(model, 'predict_proba'):
        y_proba = model.predict_proba(X_test)
        print(f'AUC Score: {roc_auc_score(y_test, y_proba, multi_class="ovr")}')
        
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred, digits=3))

    plot_confusion_matrix(model, y_test, y_pred, classes=labeler.classes_)
    plt.savefig(CONFIG.DATA_DIR / f'{estimator_name}_confusion_matrix_{date}.png')
    plt.show()

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.1s finished


XGBoostError: [16:08:46] src/learner.cc:180: XGBoost version not compiled with GPU support.
Stack trace:
  [bt] (0) 1   libxgboost.dylib                    0x0000001a21a17aa9 dmlc::LogMessageFatal::~LogMessageFatal() + 57
  [bt] (1) 2   libxgboost.dylib                    0x0000001a21a1b384 xgboost::LearnerImpl::ConfigureUpdaters() + 2132
  [bt] (2) 3   libxgboost.dylib                    0x0000001a21a128cf xgboost::LearnerImpl::Configure(std::__1::vector<std::__1::pair<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >, std::__1::allocator<std::__1::pair<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > > > > const&) + 3519
  [bt] (3) 4   libxgboost.dylib                    0x0000001a21a32373 XGBoosterUpdateOneIter + 131
  [bt] (4) 5   libffi.6.dylib                      0x000000010f363884 ffi_call_unix64 + 76
  [bt] (5) 6   ???                                 0x00007ffee1431be0 0x0 + 140732677692384

