# Libraries

In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from __future__ import print_function

# Versions

**Disclaimer: This notebook uses Python 3.5. Sections may not work if you try this with Python 2.X.**

In [2]:
items = [("Numpy", np), ("Pandas", pd), ("Matplotlib", matplotlib), ("Seaborn", sns)]
for item in items:
    print(item[0] + " version " + str(item[1].__version__))

Numpy version 1.13.0
Pandas version 0.20.1
Matplotlib version 2.0.2
Seaborn version 0.7.1


---

# Get Data

In [3]:
X_train = pd.read_hdf('/Users/davidziganto/Repositories/Synthetic_Dataset_Generation/data/py35/simulated_cleaned_training_data_py35.h5', 'table')
y_train = X_train.pop('hired')

# Get X_test & y_test

In [4]:
import pickle

with open("/Users/davidziganto/Repositories/Synthetic_Dataset_Generation/pickle_files/py35/X_test_py35.pkl", 'rb') as picklefile: 
    X_test = pickle.load(picklefile)
    
with open("/Users/davidziganto/Repositories/Synthetic_Dataset_Generation/pickle_files/py35/y_test_py35.pkl", 'rb') as picklefile: 
    y_test = pickle.load(picklefile)

# Machine Learning

This section shows the machine learning pipeline used to generate scoring thresholds. A further discussion of how this information is used will be included in the next notebook.

### Check Class Weights

In [5]:
pos_class_prop = y_train.mean()
pos_class_prop

0.6514772158237356

### ML Libraries

In [6]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.dummy import DummyClassifier
from sklearn.metrics import log_loss, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

### Instantiate Models

In [7]:
dt = DecisionTreeClassifier(criterion='gini',
                            splitter='best', 
                            max_depth=None, 
                            min_samples_split=2, 
                            min_samples_leaf=1, 
                            min_weight_fraction_leaf=0.0, 
                            max_features=None, 
                            random_state=13, 
                            max_leaf_nodes=None, 
                            min_impurity_split=1e-07, 
                            class_weight=None, 
                            presort=False)

dummy = DummyClassifier(strategy='most_frequent', 
                        random_state=99, 
                        constant=None)

gbc = GradientBoostingClassifier(loss='deviance', 
                                 learning_rate=0.1, 
                                 n_estimators=50, 
                                 subsample=1.0, 
                                 criterion='friedman_mse', 
                                 min_samples_split=2, 
                                 min_samples_leaf=1, 
                                 min_weight_fraction_leaf=0.0, 
                                 max_depth=3, 
                                 min_impurity_split=1e-07, 
                                 init=None, 
                                 random_state=123, 
                                 max_features=None, 
                                 verbose=0, 
                                 max_leaf_nodes=None, 
                                 warm_start=False, 
                                 presort='auto')

knn = KNeighborsClassifier(n_neighbors=5, 
                           weights='uniform', 
                           algorithm='auto', 
                           leaf_size=30, 
                           p=2, 
                           metric='euclidean', 
                           metric_params=None, 
                           n_jobs=-1)

lr = LogisticRegression(penalty='l2', 
                        dual=False, 
                        tol=0.0001, 
                        C=1.0, 
                        fit_intercept=True, 
                        intercept_scaling=1, 
                        class_weight=None, 
                        random_state=10, 
                        solver='liblinear', 
                        max_iter=100, 
                        multi_class='ovr', 
                        verbose=0, 
                        warm_start=False, 
                        n_jobs=-1)


nb = MultinomialNB(alpha=1.0, 
                   fit_prior=True, 
                   class_prior=None)

rf = RandomForestClassifier(n_estimators=50, 
                            criterion='gini', 
                            max_depth=None, 
                            min_samples_split=2, 
                            min_samples_leaf=1, 
                            min_weight_fraction_leaf=0.0, 
                            max_features='auto', 
                            max_leaf_nodes=None, 
                            min_impurity_split=1e-07, 
                            bootstrap=True, 
                            oob_score=False, 
                            n_jobs=-1, 
                            random_state=17, 
                            verbose=0, 
                            warm_start=False, 
                            class_weight=None)

### Setup Parameter Grid For RandomCV

In [8]:
dt_param_grid = dict(max_depth=[None, 6, 8], 
                     min_samples_leaf=range(1,5),
                     class_weight=[None, 'balanced'])

gbc_param_grid = dict(loss=['deviance','exponential'], 
                      max_depth=range(2,5), 
                      learning_rate=[0.001, 0.01, 0.1])

knn_param_grid = dict(n_neighbors=range(1, 15, 2), 
                      weights=('uniform', 'distance'))

lr_param_grid = dict(penalty=['l1', 'l2'], 
                     C=np.geomspace(0.001, 10, num=5),
                     class_weight=[None, 'balanced'])

rf_param_grid = dict(n_estimators=[50, 100], 
                     max_depth=[None, 8, 10],
                     class_weight=[None, 'balanced'])

### Setup Dictionary For Algo_Report()

In [9]:
from collections import OrderedDict

algo_dict = OrderedDict(
    (
        ("dt",(dt, dt_param_grid)),
        ("dummy",(dummy)),
        ("gbc",(gbc, gbc_param_grid)),
        ("knn",(knn, knn_param_grid)),
        ("lr",(lr, lr_param_grid)), 
        ("nb",(nb)), 
        ("rf",(rf, rf_param_grid))
    )
)

### Algo_Report()

In [10]:
def algo_report(algo_dict, cv=5, search_flag=1):
    '''
    Function that generates in-sample and out-of-sample metrics for numerous machine learning algorithms.
    
    Input:
        algo_dict = dictionary with algorithm name as key and model object & parameter grid as values
        cv = number of folds for cross validation
        search_flag = {0: use default model paramters; 1: use randomized search; 2: use grid search}
    Output:
        prints a report showing:
            1) in-sample negative log-loss value or accuracy (dependent on score function)
            2) out-of-sample negative log-loss value or accuracy (dependent on score function)
            3) out-of-sample log loss value
            4) confusion matrix
    '''
    for k, v in algo_dict.items():  
        if k == "nb" or k == "dummy":
            model = v.fit(X_train, y_train)
        else:
            if search_flag:
                model = RandomizedSearchCV(v[0], v[1], cv=cv, scoring='neg_log_loss')
                model.fit(X_train, y_train)
            elif search_flag==2:
                model = GridSearchCV(v[0], v[1], cv=cv, scoring='neg_log_loss')
                model.fit(X_train, y_train)
            else: 
                model = v[0].fit(X_train, y_train)

        print("[%s]" % k)
        print("In-Sample:     {}\nOut-of_Sample: {}\nLog_loss:      {}".format(
            round(model.score(X_train, y_train),3), 
            round(model.score(X_test, y_test),3), 
            round(log_loss(y_test, model.predict_proba(X_test)),3)))
        print("\nConfusion Matrix:")
        print(confusion_matrix(y_test, model.predict(X_test)))
        print("\n-----------------\n")

### Results
Results for all three scenarios - using only defaults, using random search, and using grid search - can be found below. 

#### Defaults

In [11]:
algo_report(algo_dict, cv=10, search_flag=0)

[dt]
In-Sample:     1.0
Out-of_Sample: 0.592
Log_loss:      14.092

Confusion Matrix:
[[167 189]
 [219 425]]

-----------------

[dummy]
In-Sample:     0.651
Out-of_Sample: 0.644
Log_loss:      12.296

Confusion Matrix:
[[  0 356]
 [  0 644]]

-----------------

[gbc]
In-Sample:     0.71
Out-of_Sample: 0.68
Log_loss:      0.606

Confusion Matrix:
[[108 248]
 [ 72 572]]

-----------------

[knn]
In-Sample:     0.742
Out-of_Sample: 0.613
Log_loss:      2.053

Confusion Matrix:
[[119 237]
 [150 494]]

-----------------

[lr]
In-Sample:     0.666
Out-of_Sample: 0.659
Log_loss:      0.62

Confusion Matrix:
[[ 61 295]
 [ 46 598]]

-----------------

[nb]
In-Sample:     0.614
Out-of_Sample: 0.614
Log_loss:      1.101

Confusion Matrix:
[[176 180]
 [206 438]]

-----------------

[rf]
In-Sample:     1.0
Out-of_Sample: 0.625
Log_loss:      0.657

Confusion Matrix:
[[115 241]
 [134 510]]

-----------------



#### Random Search

In [12]:
algo_report(algo_dict, cv=10, search_flag=1)

[dt]
In-Sample:     -0.566
Out-of_Sample: -0.881
Log_loss:      0.881

Confusion Matrix:
[[151 205]
 [143 501]]

-----------------

[dummy]
In-Sample:     0.651
Out-of_Sample: 0.644
Log_loss:      12.296

Confusion Matrix:
[[  0 356]
 [  0 644]]

-----------------

[gbc]
In-Sample:     -0.59
Out-of_Sample: -0.605
Log_loss:      0.605

Confusion Matrix:
[[ 92 264]
 [ 62 582]]

-----------------

[knn]
In-Sample:     -0.0
Out-of_Sample: -0.697
Log_loss:      0.697

Confusion Matrix:
[[ 85 271]
 [100 544]]

-----------------

[lr]
In-Sample:     -0.621
Out-of_Sample: -0.62
Log_loss:      0.62

Confusion Matrix:
[[ 62 294]
 [ 45 599]]

-----------------

[nb]
In-Sample:     0.614
Out-of_Sample: 0.614
Log_loss:      1.101

Confusion Matrix:
[[176 180]
 [206 438]]

-----------------

[rf]
In-Sample:     -0.491
Out-of_Sample: -0.615
Log_loss:      0.615

Confusion Matrix:
[[102 254]
 [ 73 571]]

-----------------



#### Grid Search

In [13]:
algo_report(algo_dict, cv=10, search_flag=2)

[dt]
In-Sample:     -0.566
Out-of_Sample: -0.881
Log_loss:      0.881

Confusion Matrix:
[[151 205]
 [143 501]]

-----------------

[dummy]
In-Sample:     0.651
Out-of_Sample: 0.644
Log_loss:      12.296

Confusion Matrix:
[[  0 356]
 [  0 644]]

-----------------

[gbc]
In-Sample:     -0.574
Out-of_Sample: -0.608
Log_loss:      0.608

Confusion Matrix:
[[105 251]
 [ 73 571]]

-----------------

[knn]
In-Sample:     -0.0
Out-of_Sample: -0.697
Log_loss:      0.697

Confusion Matrix:
[[ 85 271]
 [100 544]]

-----------------

[lr]
In-Sample:     -0.621
Out-of_Sample: -0.62
Log_loss:      0.62

Confusion Matrix:
[[ 62 294]
 [ 45 599]]

-----------------

[nb]
In-Sample:     0.614
Out-of_Sample: 0.614
Log_loss:      1.101

Confusion Matrix:
[[176 180]
 [206 438]]

-----------------

[rf]
In-Sample:     -0.491
Out-of_Sample: -0.615
Log_loss:      0.615

Confusion Matrix:
[[102 254]
 [ 73 571]]

-----------------



# Create Models For Pickling

In [14]:
# Needs Improvement Model
knn_randcv = RandomizedSearchCV(knn, knn_param_grid, cv=10, scoring='neg_log_loss', random_state=42)
knn_randcv.fit(X_train, y_train)
log_loss(y_test, knn_randcv.predict_proba(X_test))

0.69509316338861982

In [15]:
# Satisfactory Model
rf_randcv = RandomizedSearchCV(rf, rf_param_grid, cv=10, scoring='neg_log_loss', random_state=42)
rf_randcv.fit(X_train, y_train)
log_loss(y_test, rf_randcv.predict_proba(X_test))

0.61533035544578418

In [16]:
# Proficient Model
gbc_randcv = RandomizedSearchCV(gbc, gbc_param_grid, cv=10, scoring='neg_log_loss', random_state=42)
gbc_randcv.fit(X_train, y_train)
log_loss(y_test, gbc_randcv.predict_proba(X_test))

0.60513366403065194

# Pickle Models & Test Set Data For Auto-Scoring

In [17]:
path = '/Users/davidziganto/Repositories/Synthetic_Dataset_Generation/pickle_files/py35/'

# Save KNN model
with open('/Users/davidziganto/Repositories/Synthetic_Dataset_Generation/pickle_files/py35/knn_needs_improvement_py35.pkl', 'wb') as picklefile:
    pickle.dump(knn_randcv, picklefile)
    
# Save Random Forest model
with open('/Users/davidziganto/Repositories/Synthetic_Dataset_Generation/pickle_files/py35/rf_satisfactory_py35.pkl', 'wb') as picklefile:
    pickle.dump(rf_randcv, picklefile)
    
# Save Gradient Boosted Classifier model
with open('/Users/davidziganto/Repositories/Synthetic_Dataset_Generation/pickle_files/py35/gbc_proficient_py35.pkl', 'wb') as picklefile:
    pickle.dump(gbc_randcv, picklefile)