# Model Building
This notebook takes the features extracted in feature_extraction.ipynb and uses them to train and test various models with various parameters to establish the 'best-fit' model that can then be saved and used for the prediction pipeline

In [10]:
#imports
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import numpy as np
import cv2
import time
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import random
import pickle
%matplotlib inline

## model 101: build a 'bog-standard' linear svm

In [2]:
# Load the pre-processed training data (features extracted in feature_extraction.ipynb).
filename = 'data/training_data'

# np.load opens the archive itself, so no separate file handle is required.
# Using it as a context manager closes the archive once both arrays have been
# read into memory.
with np.load(filename) as training_data:
    X = training_data['X']
    y = training_data['y']

# some stats
print('we have {} feature vectors with {} features per image'.format(len(X), len(X[0])))

we have 17760 feature vectors with 8412 features per image


In [3]:
# build a simple model
def split_and_scale_data(X, y, test_size=0.2, random_state=None):
    """Split the data into train/test sets and standardise the features.

    The scaler is fitted on the training split only and then applied to both
    splits, so no statistics from the test split influence the scaling.

    Args:
        X: feature vectors, one row per sample.
        y: labels aligned with ``X``.
        test_size: forwarded to ``train_test_split`` (default 0.2).
        random_state: seed for the shuffle/split. When ``None`` (the default)
            a seed is drawn from ``np.random.randint(0, 100)``, so every call
            produces a different split; pass an int for a reproducible split.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test, X_scaler)`` - note the
        order differs from ``train_test_split``. ``X_scaler`` is the
        ``StandardScaler`` fitted on the training split.
    """
    if random_state is None:
        random_state = np.random.randint(0, 100)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    # Fit a per-column scaler on the training split only
    X_scaler = StandardScaler().fit(X_train)
    # Apply the same fitted scaler to both splits
    X_train = X_scaler.transform(X_train)
    X_test = X_scaler.transform(X_test)

    print('Feature vector length:', len(X_train[0]))

    return X_train, y_train, X_test, y_test, X_scaler

def build_model(clf, X_train, y_train):
    """Fit ``clf`` on the training data, print the elapsed fit time, return ``clf``.

    Works with any object exposing ``fit(X, y)``; the object passed in is the
    one returned (not the return value of ``fit``).
    """
    started = time.time()
    clf.fit(X_train, y_train)
    elapsed = time.time() - started
    print(round(elapsed, 2), 'Seconds to train...')
    return clf

def test_model(clf, X_test, y_test):
    # Check the score of the model
    t=time.time()
    print('Test Accuracy of model = ', round(clf.score(X_test, y_test), 4))
    # Check the prediction time for a single sample
    t2=time.time()
    print(round(t2-t, 2), 'Seconds to predict all test image vectors...')
    
X_train, y_train, X_test, y_test, X_scaler = split_and_scale_data(X, y)

# Baseline comparison with default hyper-parameters:
# linear SVC, RBF-kernel SVC, Gaussian naive Bayes and a decision tree.

# Linear SVC (kept in `lsvc`)
print('Linear SVC...')
lsvc = build_model(LinearSVC(), X_train, y_train)
test_model(lsvc, X_test, y_test)

# Non-linear SVC with an RBF kernel (kept in `svc`)
print('SVC...')
svc = build_model(svm.SVC(kernel='rbf'), X_train, y_train)
test_model(svc, X_test, y_test)

# Gaussian naive Bayes (held in the scratch name `clf`)
print('Gaussian Bayes...')
clf = build_model(GaussianNB(), X_train, y_train)
test_model(clf, X_test, y_test)

# Decision tree (overwrites `clf`)
print('Decision Tree...')
clf = build_model(tree.DecisionTreeClassifier(), X_train, y_train)
test_model(clf, X_test, y_test)

Feature vector length: 8412
Linear SVC...
44.58 Seconds to train...
Test Accuracy of model =  0.9856
0.16 Seconds to predict all test image vectors...
SVC...
327.12 Seconds to train...
Test Accuracy of model =  0.9907
67.51 Seconds to predict all test image vectors...
Gaussian Bayes...
2.96 Seconds to train...
Test Accuracy of model =  0.9155
1.13 Seconds to predict all test image vectors...
Decision Tree...
268.21 Seconds to train...
Test Accuracy of model =  0.9699
0.13 Seconds to predict all test image vectors...


The important points here are that the feature vectors were shuffled into a training and test set at random (although the time series data in the training set calls for a more robust method of splitting). The training data was scaled using scikit-learn's StandardScaler (zero mean and unit variance). The test data was then scaled with that same scaler, fitted on the training data only.

In addition, it's clear that out of the box, the linear svm has a really good initial testing accuracy and is relatively quick to train and very quick to predict. The rbf kernel for svc has even better initial accuracy, but takes quite a lot longer to train and longer to predict all the test data. The Gaussian Bayes accuracy is lower, whilst the decision tree has good accuracy but takes a long time to train compared with the linear svm.

It's time to tune the models and create a pickle output for the best model so it can be loaded back in at a later date with the feature extraction parameters and scaler for prediction.

## model tuning and pickling

In [5]:
def pickle_model(clf, scaler, params, filename='model.pkl'):
    """Pickle a model together with its scaler and parameters into one file.

    Args:
        clf: the classifier to store under the key ``'model'``.
        scaler: the feature scaler to store under the key ``'scaler'``.
        params: extra parameters to store under the key ``'params'``
            (may be ``None``).
        filename: output path; defaults to ``'model.pkl'``. An existing file
            at that path is overwritten.

    Returns:
        The path the dictionary was written to.
    """
    # create a dictionary to pickle
    pickled_objects = {
        'model': clf,
        'scaler': scaler,
        'params': params
    }

    # Write with pickle's default protocol; the context manager guarantees
    # the file is closed even if pickling fails part-way through.
    with open(filename, 'wb') as output:
        pickle.dump(pickled_objects, output)

    return filename

In [6]:
# Sanity check: pickle the linear SVC with its scaler, then read the file back.

filename = pickle_model(lsvc, X_scaler, None)

# NOTE: pickle.load can execute arbitrary code, so only load files that this
# notebook (or another trusted source) produced. The context manager closes
# the file even if loading fails.
with open(filename, 'rb') as pkl_file:
    data1 = pickle.load(pkl_file)
print(data1)

{'scaler': StandardScaler(copy=True, with_mean=True, with_std=True), 'model': LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0), 'params': None}


Tuning a model involves cycling through combinations of hyper-parameters for each classifier.

In [13]:
# Grid-search the C parameter of the linear SVC.
# Flip `tune` to False to skip the search.
tune = True
parameters = {
    'C': [0.001, 0.01, 0.05, 0.1, 0.5, 1, 5],
}
if tune:
    lsvc = LinearSVC()
    clf = build_model(GridSearchCV(lsvc, parameters), X_train, y_train)
    # names of the per-candidate cross-validation statistics that were recorded
    print(sorted(clf.cv_results_))
    test_model(clf, X_test, y_test)
    print(clf.best_params_)

440.44 Seconds to train...
['mean_fit_time', 'mean_score_time', 'mean_test_score', 'mean_train_score', 'param_C', 'params', 'rank_test_score', 'split0_test_score', 'split0_train_score', 'split1_test_score', 'split1_train_score', 'split2_test_score', 'split2_train_score', 'std_fit_time', 'std_score_time', 'std_test_score', 'std_train_score']
Test Accuracy of model =  0.987
0.19 Seconds to predict all test image vectors...
{'C': 0.001}


In [6]:
# SVC tuning, part 1 of 2: sigmoid kernel over a small C / gamma grid.
# Already done, so the search is switched off (`tune = False`).
tune = False
parameters = {
    'kernel': ['sigmoid'],
    'C': [1, 10],
    'gamma': [0.0001, 0.005],
}
if tune:
    svc = svm.SVC()
    clf = build_model(GridSearchCV(svc, parameters), X_train, y_train)
    print(sorted(clf.cv_results_))
    test_model(clf, X_test, y_test)
    print(clf.best_params_)

In [None]:
# SVC tuning, part 2 of 2: RBF kernel over the same C / gamma grid.
# The search is currently switched off (`tune = False`).
tune = False
parameters = {
    'kernel': ['rbf'],
    'C': [1, 10],
    'gamma': [0.0001, 0.005],
}
if tune:
    svc = svm.SVC()
    clf = build_model(GridSearchCV(svc, parameters), X_train, y_train)
    print(sorted(clf.cv_results_))
    test_model(clf, X_test, y_test)
    print(clf.best_params_)

In [7]:
# Decision-tree tuning, run 1 of 3: max_depth fixed at 4,
# searching over split criterion and min_samples_split.
tune = True
parameters = {
    'criterion': ('gini', 'entropy'),
    'max_depth': [4],
    'min_samples_split': [2, 5, 10],
}
if tune:
    tree_clf = tree.DecisionTreeClassifier()
    clf = build_model(GridSearchCV(tree_clf, parameters), X_train, y_train)
    print(sorted(clf.cv_results_))
    test_model(clf, X_test, y_test)
    print(clf.best_params_)

1086.57 Seconds to train...
['mean_fit_time', 'mean_score_time', 'mean_test_score', 'mean_train_score', 'param_criterion', 'param_max_depth', 'param_min_samples_split', 'params', 'rank_test_score', 'split0_test_score', 'split0_train_score', 'split1_test_score', 'split1_train_score', 'split2_test_score', 'split2_train_score', 'std_fit_time', 'std_score_time', 'std_test_score', 'std_train_score']
Test Accuracy of model =  0.9578
0.14 Seconds to predict all test image vectors...
{'max_depth': 4, 'criterion': 'gini', 'min_samples_split': 10}


In [11]:
# Decision-tree tuning, run 2 of 3: max_depth fixed at 8,
# searching over split criterion and min_samples_split.
tune = True
parameters = {
    'criterion': ('gini', 'entropy'),
    'max_depth': [8],
    'min_samples_split': [2, 5, 10],
}
if tune:
    tree_clf = tree.DecisionTreeClassifier()
    clf = build_model(GridSearchCV(tree_clf, parameters), X_train, y_train)
    print(sorted(clf.cv_results_))
    test_model(clf, X_test, y_test)
    print(clf.best_params_)

1739.73 Seconds to train...
['mean_fit_time', 'mean_score_time', 'mean_test_score', 'mean_train_score', 'param_criterion', 'param_max_depth', 'param_min_samples_split', 'params', 'rank_test_score', 'split0_test_score', 'split0_train_score', 'split1_test_score', 'split1_train_score', 'split2_test_score', 'split2_train_score', 'std_fit_time', 'std_score_time', 'std_test_score', 'std_train_score']
Test Accuracy of model =  0.9679
0.13 Seconds to predict all test image vectors...
{'max_depth': 8, 'criterion': 'entropy', 'min_samples_split': 5}


In [12]:
# Decision-tree tuning, run 3 of 3: max_depth fixed at 10,
# searching over split criterion and min_samples_split.
tune = True
parameters = {
    'criterion': ('gini', 'entropy'),
    'max_depth': [10],
    'min_samples_split': [2, 5, 10],
}
if tune:
    tree_clf = tree.DecisionTreeClassifier()
    clf = build_model(GridSearchCV(tree_clf, parameters), X_train, y_train)
    print(sorted(clf.cv_results_))
    test_model(clf, X_test, y_test)
    print(clf.best_params_)

1929.62 Seconds to train...
['mean_fit_time', 'mean_score_time', 'mean_test_score', 'mean_train_score', 'param_criterion', 'param_max_depth', 'param_min_samples_split', 'params', 'rank_test_score', 'split0_test_score', 'split0_train_score', 'split1_test_score', 'split1_train_score', 'split2_test_score', 'split2_train_score', 'std_fit_time', 'std_score_time', 'std_test_score', 'std_train_score']
Test Accuracy of model =  0.9648
0.12 Seconds to predict all test image vectors...
{'max_depth': 10, 'criterion': 'entropy', 'min_samples_split': 5}


This gives us...