# Model Selection and Experimentation
November 01, 2017 

> In this notebook we attempt to apply several classification algorithms to our data and experiment with model hyperparameters. We choose the best model in build_model.py.

Chakshu Tandon [ <chakshutandon@gmail.com> ]

In [1]:
# Put the parent directory (resolved to an absolute path) on the import path
# so that modules located there can be imported by later cells.
import os
import sys

module_path = os.path.abspath(os.pardir)
# Only append when the entry is missing, so re-running the cell adds no duplicates.
if sys.path.count(module_path) == 0:
    sys.path += [module_path]

In [2]:
import numpy as np
import pandas as pd

import sklearn

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA

from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier

import configparser
import utils

In [3]:
# Load the project settings from ../config.ini and keep only the DEFAULT
# section, which the cells below index by key (e.g. config['n_KFolds']).
config_parser = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Check that result so a wrong path fails here
# with a clear message instead of as a KeyError in a later cell.
if not config_parser.read('../config.ini'):
    raise FileNotFoundError("Could not read configuration file '../config.ini'")
# Use a separate name for the parser so `config` always refers to the section.
config = config_parser['DEFAULT']

In [4]:
def printMetrics(scores, confMatrix):
    """Print a summary of a set of scores followed by a confusion matrix.

    The mean and standard deviation of ``scores`` are each formatted to two
    significant digits, followed by a blank line, a "Confusion Matrix:"
    heading, and the matrix itself.

    Args:
        scores: Object exposing ``mean()`` and ``std()`` (for example the
            array returned by ``cross_val_score``).
        confMatrix: Confusion matrix; printed using its ``str()`` form.
    """
    mean_score, score_spread = scores.mean(), scores.std()
    summary = "Accuracy (F1 Score): {:.2} (+/- {:.2})\n".format(mean_score, score_spread)
    # One print call with a newline separator emits the same three lines.
    print(summary, "Confusion Matrix:", confMatrix, sep="\n")

In [5]:
# Read the training set from CSV (path is relative to this notebook's directory).
training_csv_path = '../Data/train_potus_by_county.csv'
training_data = pd.read_csv(training_csv_path)

In [6]:
# Split the training table into features and labels. The label column is the
# one named by the ClassLabelsColumn config entry.
label_column = config['ClassLabelsColumn']
features_, labels_ = utils.getFeatureLabelColumns(training_data, label_column)

In [7]:
# Scale features to zero mean and unit variance. Encode labels into binary values.
# Each helper returns a pair: the transformed data plus a second object bound
# here to scaler_ / encoder_ (presumably the fitted transformer — TODO confirm
# against utils). Neither second object is used again in this notebook.
features, scaler_ = utils.scaleUnitMean(features_)
labels, encoder_ = utils.labelEncode(labels_)

In [8]:
# Compute a PCA on the scaled features, with the number of components taken
# from the n_PCAComponents config entry, then apply it to those same features.
# NOTE(review): pca_transformed_features is not referenced by any later cell
# visible in this notebook; the models below are fitted on `features`.
n_pca_components = int(config['n_PCAComponents'])
pca_ = utils.computePCA(features, n_components=n_pca_components)
pca_transformed_features = utils.applyPCA(features, pca_)

In [9]:
# Gaussian Naive Bayes
nbc = GaussianNB()
nbc.fit(features, labels)

# Per-fold F1 scores; the number of folds comes from the n_KFolds config entry.
n_folds = int(config['n_KFolds'])
nbc_scores = cross_val_score(nbc, features, labels, scoring="f1", cv=n_folds)

# NOTE(review): predictions are requested for the same `features` the model
# was fitted on, so this confusion matrix presumably reflects in-sample fit
# rather than held-out performance — confirm against utils.getPredictedLabels.
nbc_predictedLabels = utils.getPredictedLabels(nbc, features)
nbc_confMatrix = confusion_matrix(labels, nbc_predictedLabels)

printMetrics(nbc_scores, nbc_confMatrix)

Accuracy (F1 Score): 0.88 (+/- 0.022)

Confusion Matrix:
[[110 154]
 [ 66 883]]


In [10]:
# Random Forest Classifier
# Seed the forest: without a fixed random_state every run grows different
# trees, so the scores below (and any search that clones `rfc`) would not be
# reproducible after a kernel restart.
RFC_RANDOM_STATE = 42
rfc = RandomForestClassifier(
    n_estimators=int(config['n_Estimators']),
    n_jobs=int(config['n_CPUCores']),
    random_state=RFC_RANDOM_STATE,
).fit(features, labels)
# Per-fold F1 scores; the number of folds comes from the n_KFolds config entry.
rfc_scores = cross_val_score(rfc, features, labels, scoring="f1", cv=int(config['n_KFolds']))

# NOTE(review): predictions are requested for the same `features` the model
# was fitted on, so this confusion matrix presumably reflects in-sample fit
# rather than held-out performance — confirm against utils.getPredictedLabels.
rfc_predictedLabels = utils.getPredictedLabels(rfc, features)
rfc_confMatrix = confusion_matrix(labels, rfc_predictedLabels)

printMetrics(rfc_scores, rfc_confMatrix)

Accuracy (F1 Score): 0.89 (+/- 0.021)

Confusion Matrix:
[[264   0]
 [  0 949]]


In [13]:
# Gaussian Process Classifier with RBF Kernel (the estimator's default kernel)
gpcRBFF = GaussianProcessClassifier()
gpcRBFF.fit(features, labels)

# Per-fold F1 scores; the number of folds comes from the n_KFolds config entry.
n_folds = int(config['n_KFolds'])
gpcRBFF_scores = cross_val_score(gpcRBFF, features, labels, scoring="f1", cv=n_folds)

# NOTE(review): predictions are requested for the same `features` the model
# was fitted on, so this confusion matrix presumably reflects in-sample fit
# rather than held-out performance — confirm against utils.getPredictedLabels.
gpcRBFF_predictedLabels = utils.getPredictedLabels(gpcRBFF, features)
gpcRBFF_confMatrix = confusion_matrix(labels, gpcRBFF_predictedLabels)

printMetrics(gpcRBFF_scores, gpcRBFF_confMatrix)

Accuracy (F1 Score): 0.89 (+/- 0.011)

Confusion Matrix:
[[175  89]
 [  2 947]]


## Optimization

In [14]:
# Optimize the hyperparameters of the random forest classifier.
# This is an exhaustive search over the whole grid below and takes a long
# time (10+ minutes).
parameters = {"max_depth": [None, 3],
              # 'sqrt' is the explicit spelling of what 'auto' meant for
              # classifiers; 'auto' is deprecated/removed in newer scikit-learn.
              "max_features": ['sqrt', 0.1, 0.2],
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}
# return_train_score must be the boolean False. The string 'false' is truthy,
# so it requested the (costly) training scores instead of disabling them, and
# newer scikit-learn versions reject non-boolean values outright.
# NOTE(review): no `scoring` or `cv` is passed, so the search uses
# GridSearchCV's defaults rather than the F1 / n_KFolds setup used by the
# cross_val_score calls in the other cells — confirm this is intended.
optimizer = GridSearchCV(rfc, parameters, return_train_score=False)
optimizer.fit(features, labels)
# Last expression of the cell: display the winning parameter combination.
optimizer.best_params_

{'bootstrap': True,
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': 0.2,
 'min_samples_leaf': 1,
 'min_samples_split': 10}

In [15]:
# Random Forest Classifier (Optimized)
# Take the best estimator found by the grid search and fit it on the full data.
rfcOpt = optimizer.best_estimator_
rfcOpt.fit(features, labels)

# Per-fold F1 scores; the number of folds comes from the n_KFolds config entry.
n_folds = int(config['n_KFolds'])
rfcOpt_scores = cross_val_score(rfcOpt, features, labels, scoring="f1", cv=n_folds)

# NOTE(review): predictions are requested for the same `features` the model
# was fitted on, so this confusion matrix presumably reflects in-sample fit
# rather than held-out performance — confirm against utils.getPredictedLabels.
rfcOpt_predictedLabels = utils.getPredictedLabels(rfcOpt, features)
rfcOpt_confMatrix = confusion_matrix(labels, rfcOpt_predictedLabels)

printMetrics(rfcOpt_scores, rfcOpt_confMatrix)

Accuracy (F1 Score): 0.9 (+/- 0.017)

Confusion Matrix:
[[207  57]
 [  1 948]]


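__NOTE__: While the optimized model has a slightly higher cross-validation score, its confusion matrix shows substantially more false positives and false negatives than that of the unoptimized random forest (58 misclassified samples versus 0). Note, however, that the confusion matrices in this notebook appear to be computed on the same data the models were fitted on, so they measure in-sample fit rather than generalization. A perfect in-sample matrix combined with a lower cross-validation score may be an indication that the unoptimized model is the one overfitting; a held-out evaluation is needed to confirm this.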

<BR><BR>Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

> http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.