# Thorough Gridsearch Model


This notebook contains code to easily add a model and preprocessor(s) and gridsearch through every possible combination to find the best

## Future:
Compare bagging, boosting, gradient, with base model.

Use GPU XGBoost with all parameters

XGBoost Things to reference:

https://github.com/dmlc/xgboost/blob/master/demo/gpu_acceleration/cover_type.py

https://xgboost.readthedocs.io/en/latest/tutorials/input_format.html

https://www.kaggle.com/c/expedia-hotel-recommendations/discussion/21439

---

In [2]:
# --- Absolute Import Fix --- #
# Use this if script needs to be run standalone
import os
import sys
from dotenv import find_dotenv
sys.path.append(os.path.dirname(find_dotenv()))

In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, validation_curve
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve, classification_report
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import time
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from glob import glob
from pprint import pprint
from xgboost import XGBClassifier
import joblib
from textron import CONFIG


%load_ext autoreload
%autoreload 2

In [8]:
from textron.util import databases
from textron.util import dataloader
from textron.autocompare import grid_models
from textron.util.helpers import Labeler
from textron.util.helpers import plot_confusion_matrix
from textron.autocompare.grid_models import custom_stop_words, get_random_class_labels

In [9]:
labels = get_random_class_labels(8)
print(labels)

df = dataloader.data_selector(labels, data_source='sqlite')

df.sample(10)

['shittyprogramming' 'java' 'deeplearning' 'tensorflow' 'css'
 'linux4noobs' 'datascience' 'sql']
Connection to SQLite DB successful


Unnamed: 0,title,subreddit,date
2870,This problem on my algebra test... this is wha...,shittyprogramming,2020-03-29
182550,OpenSUSE NVIDIA prime Driver issues,linux4noobs,2020-05-15
69278,What might be causing this funny unrequired pa...,css,2020-04-25
2061,4 Ways to Animate the Color of a Text Link on ...,css,2020-03-29
5699,How do I move files between Linux and Windows?,linux4noobs,2020-03-29
2611,CSS Selector - how can I only select p element...,css,2020-03-29
3593,"I lost my vcard, how can I get it back?",shittyprogramming,2020-03-29
49550,MS SQL Access 365,sql,2020-04-22
20723,help executing 'nitrogen --restore' automatica...,linux4noobs,2020-04-10
1375,omg I hate cython,tensorflow,2020-03-29


In [10]:
X = df['title']
y = df['subreddit']

In [11]:
### TODO ### use scikit-learn labeler

In [12]:
labeler = Labeler()
labeler.fit(y)
y = labeler.transform(y)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=7)

In [14]:
tfidf = {
    'preprocessor': TfidfVectorizer(stop_words=custom_stop_words),
    'params': {
        "prep__ngram_range": [(1, 2)],
        "prep__max_df": [.9],
        "prep__use_idf": [True],
        "prep__norm": ["l2"]
    }
}

In [15]:
lr = {
    'clf': LogisticRegression(max_iter=1000),
    'params': {
        "clf__C": [.01, .1, 1, 5]
    }
    
}

In [16]:
xgb = {
    'clf': XGBClassifier(n_jobs=-1),
    'params': {
        "clf__max_depth": [3, 5, 10, 20, 50],
        "clf__learning_rate": np.linspace(.001, .99, 5),
        "clf__n_estimators": [50, 100, 200],
        "clf__objective": ['binary:logistic', 'multi:softprob'],
        "clf__booster": ['gbtree', 'gblinear', 'dart'],
        "clf__gamma": np.linspace(0, 1, 5),
        "clf__subsample": np.linspace(.5, 1, 5),
        "clf__reg_alpha": np.linspace(0, 1, 5),
        "clf__reg_lambda": np.linspace(0, 1, 5),
        "clf__importance_type": ['gain', 'weight', 'cover', 'total_gain', 'total_cover'],
        "clf__hidden_layer_sizes": [10, 20, 30, 40, 50],
    }
}

In [17]:
xgb_orig = {
    'clf': XGBClassifier(n_jobs=-1),
    'params': {'objective': ['multi:softmax'], # Specify multiclass classification
             'num_class': [8], # Number of possible output classes
             'tree_method': ['gpu_hist'] # Use GPU accelerated algorithm
             }
}

In [18]:
random = {
    'clf': RandomForestClassifier(),
    'params': {
        "clf__n_estimators": [200, 300]
    }
}

In [19]:
onevrest = {
    'clf': OneVsRestClassifier(RandomForestClassifier()),
    'params': {
        "clf__estimator__n_estimators": [200, 300]
    }
}

In [20]:
def build_and_train_model(preprocessor, classifier, cv=3, verbose=1):
    '''
    Takes a dictionary with params and outputs a fitted model
    '''
    pipe = Pipeline(
    [('prep', preprocessor.get('preprocessor')),
     ('clf', classifier.get('clf'))])
    
    pipe_params = dict()
    pipe_params.update(preprocessor.get('params'))
    pipe_params.update(classifier.get('params'))
    
    model = GridSearchCV(pipe, param_grid=pipe_params, cv=cv, verbose=verbose, n_jobs=-1)
    model.fit(X_train, y_train)
    return model

In [21]:
for estimator in [xgb_orig]:
    
    model = build_and_train_model(preprocessor=tfidf, classifier=estimator, verbose=5)
    
    date = str(datetime.datetime.now().strftime('%Y-%m-%d_%H%M'))
    estimator_name = type(model.estimator.named_steps.clf).__name__
    joblib_file = f'{estimator_name}_best_model_{date}.pkl'
    joblib.dump(model, CONFIG.DATA_DIR / joblib_file)
    
    print(f'Train Score: {model.score(X_train, y_train)}')
    print(f'Test Score: {model.score(X_test, y_test)}')
    
    if hasattr(model, 'predict_proba'):
        y_proba = model.predict_proba(X_test)
        print(f'AUC Score: {roc_auc_score(y_test, y_proba, multi_class="ovr")}')
        
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred, digits=3))

    plot_confusion_matrix(model, y_test, y_pred, classes=labeler.classes_)
    plt.savefig(CONFIG.DATA_DIR / f'{estimator_name}_confusion_matrix_{date}.png')
    plt.show()

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    5.5s remaining:    0.0s


ValueError: Invalid parameter num_class for estimator Pipeline(memory=None,
         steps=[('prep',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=frozenset({'a', 'about', 'above',
                                                       'across', 'after',
                                                       'afterwards...
                               interaction_constraints=None, learning_rate=None,
                               max_delta_step=None, max_depth=None,
                               min_child_weight=None, missing=nan,
                               monotone_constraints=None, n_estimators=100,
                               n_jobs=-1, num_parallel_tree=None,
                               objective='binary:logistic', random_state=None,
                               reg_alpha=None, reg_lambda=None,
                               scale_pos_weight=None, subsample=None,
                               tree_method=None, validate_parameters=False,
                               verbosity=None))],
         verbose=False). Check the list of available parameters with `estimator.get_params().keys()`.