# Libraries

In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from copy import deepcopy
from __future__ import print_function

# Versions

In [2]:
!python --version

Python 2.7.13 :: Anaconda 4.4.0 (x86_64)


In [3]:
items = [("Numpy", np), ("Pandas", pd), ("Matplotlib", matplotlib), ("Seaborn", sns)]
for item in items:
    print(item[0] + " version " + str(item[1].__version__))

Numpy version 1.13.0
Pandas version 0.20.1
Matplotlib version 2.0.2
Seaborn version 0.7.1


---

# Get Data

In [4]:
# Read from disk
data = pd.read_hdf('/Users/davidziganto/Repositories/Synthetic_Dataset_Generation/data/py27/simulated_cleaned_data_py27.h5', 'table')

# Machine Learning

In [5]:
# Setup
X = deepcopy(data)
y = X.pop('hired')

from sklearn.metrics import log_loss, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Check Class Weights

In [6]:
pos_class_prop = y_train.mean()
pos_class_prop

0.690863579474343

### ML Libraries

In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.dummy import DummyClassifier

### Instantiate Models

In [8]:
dt = DecisionTreeClassifier(criterion='gini',
                            splitter='best', 
                            max_depth=None, 
                            min_samples_split=2, 
                            min_samples_leaf=1, 
                            min_weight_fraction_leaf=0.0, 
                            max_features=None, 
                            random_state=13, 
                            max_leaf_nodes=None, 
                            min_impurity_split=1e-07, 
                            class_weight={0:(1-pos_class_prop), 1:pos_class_prop}, 
                            presort=False)

dummy = DummyClassifier(strategy='most_frequent', 
                        random_state=99, 
                        constant=None)

gbc = GradientBoostingClassifier(loss='deviance', 
                                 learning_rate=0.1, 
                                 n_estimators=50, 
                                 subsample=1.0, 
                                 criterion='friedman_mse', 
                                 min_samples_split=2, 
                                 min_samples_leaf=1, 
                                 min_weight_fraction_leaf=0.0, 
                                 max_depth=3, 
                                 min_impurity_split=1e-07, 
                                 init=None, 
                                 random_state=123, 
                                 max_features=None, 
                                 verbose=0, 
                                 max_leaf_nodes=None, 
                                 warm_start=False, 
                                 presort='auto')

knn = KNeighborsClassifier(n_neighbors=5, 
                           weights='uniform', 
                           algorithm='auto', 
                           leaf_size=30, 
                           p=2, 
                           metric='euclidean', 
                           metric_params=None, 
                           n_jobs=-1)

lr = LogisticRegression(penalty='l2', 
                        dual=False, 
                        tol=0.0001, 
                        C=1.0, 
                        fit_intercept=True, 
                        intercept_scaling=1, 
                        class_weight={0:(1-pos_class_prop), 1:pos_class_prop}, 
                        random_state=10, 
                        solver='liblinear', 
                        max_iter=100, 
                        multi_class='ovr', 
                        verbose=0, 
                        warm_start=False, 
                        n_jobs=-1)


nb = MultinomialNB(alpha=1.0, 
                   fit_prior=True, 
                   class_prior=None)

rf = RandomForestClassifier(n_estimators=50, 
                            criterion='gini', 
                            max_depth=None, 
                            min_samples_split=2, 
                            min_samples_leaf=1, 
                            min_weight_fraction_leaf=0.0, 
                            max_features='auto', 
                            max_leaf_nodes=None, 
                            min_impurity_split=1e-07, 
                            bootstrap=True, 
                            oob_score=False, 
                            n_jobs=-1, 
                            random_state=17, 
                            verbose=0, 
                            warm_start=False, 
                            class_weight={0:(1-pos_class_prop), 1:pos_class_prop})

### Setup Parameter Grid For RandomCV

In [9]:
dt_param_grid = dict(criterion=['gini','entropy'], max_depth=range(2,7), min_samples_leaf=range(1,5))
gbc_param_grid = dict(loss=['deviance','exponential'], max_depth=range(2,5), learning_rate=[0.001, 0.01, 0.1])
knn_param_grid = dict(n_neighbors=range(1, 15, 2), weights=('uniform', 'distance'))
lr_param_grid = dict(penalty=['l1', 'l2'], C=np.geomspace(0.001, 10, num=5))
rf_param_grid = dict(n_estimators=[50,200,500,1000], max_depth=range(6,11,2))

### Setup Dictionary For Algo_Report()

In [10]:
from collections import OrderedDict

algo_dict = OrderedDict(
    (
        ("dt",(dt, dt_param_grid)),
        ("dummy",(dummy)),
        ("gbc",(gbc, gbc_param_grid)),
        ("knn",(knn, knn_param_grid)),
        ("lr",(lr, lr_param_grid)), 
        ("nb",(nb)), 
        ("rf",(rf, rf_param_grid))
    )
)

### Algo_Report()

In [11]:
def algo_report(algo_dict, cv=5, random_search_flag=1):
    '''
    Function that generates in-sample and out-of-sample metrics for numerous machine learning algorithms.
    
    Input:
        algo_dict = dictionary with algorithm name as key and model object & parameter grid as values
        cv = number of folds for cross validation
        random_search_flag = {0: use default model paramters; 1: use randomized search}
    Output:
        prints a report showing:
            1) in-sample negative log-loss value or accuracy (dependent on score function)
            2) out-of-sample negative log-loss value or accuracy (dependent on score function)
            3) out-of-sample log loss value
            4) confusion matrix
    '''
    for k, v in algo_dict.iteritems():  
        if k == "nb" or k == "dummy":
            model = v.fit(X_train, y_train)
        else:
            if random_search_flag:
                model = RandomizedSearchCV(v[0], v[1], cv=cv, scoring='neg_log_loss')
                model.fit(X_train, y_train)
            else: 
                model = v[0].fit(X_train, y_train)

        print("[%s]" % k)
        print("In-Sample:     {}\nOut-of_Sample: {}\nLog_loss:      {}".format(
            round(model.score(X_train, y_train),3), 
            round(model.score(X_test, y_test),3), 
            round(log_loss(y_test, model.predict_proba(X_test)),3)))
        print("\nConfusion Matrix:")
        print(confusion_matrix(y_test, model.predict(X_test)))
        print("\n-----------------\n")

### Results

In [12]:
algo_report(algo_dict, cv=10, random_search_flag=0)

[dt]
In-Sample:     1.0
Out-of_Sample: 0.616
Log_loss:      13.276

Confusion Matrix:
[[143 169]
 [215 472]]

-----------------

[dummy]
In-Sample:     0.691
Out-of_Sample: 0.688
Log_loss:      10.787

Confusion Matrix:
[[  0 312]
 [  0 687]]

-----------------

[gbc]
In-Sample:     0.746
Out-of_Sample: 0.734
Log_loss:      0.543

Confusion Matrix:
[[105 207]
 [ 59 628]]

-----------------

[knn]
In-Sample:     0.764
Out-of_Sample: 0.661
Log_loss:      2.348

Confusion Matrix:
[[ 82 230]
 [109 578]]

-----------------

[lr]
In-Sample:     0.693
Out-of_Sample: 0.687
Log_loss:      0.64

Confusion Matrix:
[[  1 311]
 [  2 685]]

-----------------

[nb]
In-Sample:     0.647
Out-of_Sample: 0.658
Log_loss:      1.032

Confusion Matrix:
[[145 167]
 [175 512]]

-----------------

[rf]
In-Sample:     1.0
Out-of_Sample: 0.712
Log_loss:      0.608

Confusion Matrix:
[[117 195]
 [ 93 594]]

-----------------



In [13]:
algo_report(algo_dict, cv=10, random_search_flag=1)

[dt]
In-Sample:     -0.605
Out-of_Sample: -0.613
Log_loss:      0.613

Confusion Matrix:
[[  5 307]
 [  4 683]]

-----------------

[dummy]
In-Sample:     0.691
Out-of_Sample: 0.688
Log_loss:      10.787

Confusion Matrix:
[[  0 312]
 [  0 687]]

-----------------

[gbc]
In-Sample:     -0.542
Out-of_Sample: -0.542
Log_loss:      0.542

Confusion Matrix:
[[ 84 228]
 [ 37 650]]

-----------------

[knn]
In-Sample:     -0.541
Out-of_Sample: -0.772
Log_loss:      0.772

Confusion Matrix:
[[ 68 244]
 [ 62 625]]

-----------------

[lr]
In-Sample:     -0.641
Out-of_Sample: -0.639
Log_loss:      0.639

Confusion Matrix:
[[  1 311]
 [  2 685]]

-----------------

[nb]
In-Sample:     0.647
Out-of_Sample: 0.658
Log_loss:      1.032

Confusion Matrix:
[[145 167]
 [175 512]]

-----------------

[rf]
In-Sample:     -0.361
Out-of_Sample: -0.575
Log_loss:      0.575

Confusion Matrix:
[[ 51 261]
 [ 28 659]]

-----------------



# Create Models For Pickling

In [14]:
# Needs Improvement Model
knn_randcv = RandomizedSearchCV(knn, knn_param_grid, cv=10, scoring='neg_log_loss', random_state=42)
knn_randcv.fit(X_train, y_train)
log_loss(y_test, knn_randcv.predict_proba(X_test))

0.77172596402024607

In [15]:
# Satisfactory Model
rf_randcv = RandomizedSearchCV(rf, rf_param_grid, cv=10, scoring='neg_log_loss', random_state=42)
rf_randcv.fit(X_train, y_train)
log_loss(y_test, rf_randcv.predict_proba(X_test))

0.57484754370751456

In [16]:
# Proficient Model
gbc_randcv = RandomizedSearchCV(gbc, gbc_param_grid, cv=10, scoring='neg_log_loss', random_state=42)
gbc_randcv.fit(X_train, y_train)
log_loss(y_test, gbc_randcv.predict_proba(X_test))

0.54224558841354176

# Pickle Models & Test Set Data For Auto-Scoring

In [17]:
import pickle

# Save X_test
with open('/Users/davidziganto/Repositories/Synthetic_Dataset_Generation/pickle_files/py27/X_test_py27.pkl', 'wb') as picklefile:
    pickle.dump(X_test, picklefile)
    
# Save y_test
with open('/Users/davidziganto/Repositories/Synthetic_Dataset_Generation/pickle_files/py27/y_test_py27.pkl', 'wb') as picklefile:
    pickle.dump(y_test, picklefile)

# Save KNN model
with open('/Users/davidziganto/Repositories/Synthetic_Dataset_Generation/pickle_files/py27/knn_needs_improvement_py27.pkl', 'wb') as picklefile:
    pickle.dump(knn_randcv, picklefile)
    
# Save Random Forest model
with open('/Users/davidziganto/Repositories/Synthetic_Dataset_Generation/pickle_files/py27/rf_satisfactory_py27.pkl', 'wb') as picklefile:
    pickle.dump(rf_randcv, picklefile)
    
# Save Gradient Boosted Classifier model
with open('/Users/davidziganto/Repositories/Synthetic_Dataset_Generation/pickle_files/py27/gbc_proficient_py27.pkl', 'wb') as picklefile:
    pickle.dump(gbc_randcv, picklefile)