# Model Validation

Wherein we take the model as trained and tuned on the full training set and check it against the hold-out test set split off at the beginning of development.

In [1]:
import sqlite3
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import r2_score

train_conn = sqlite3.connect('./sqlite/training_incidents.sqlite')
test_conn  = sqlite3.connect('./sqlite/validation_incidents.sqlite')

r_seed = 38

train_inputs = []
train_labels = []
test_inputs = []
test_labels = []

train_results = test_conn.execute("SELECT * from incidents")
test_results = test_conn.execute("SELECT * from incidents")

for rec in train_results:
    train_labels.append(rec[1])
    train_inputs.append(rec[2:])

for rec in test_results:
    test_labels.append(rec[1])
    test_inputs.append(rec[2:])


inputs_train = np.array(train_inputs)
inputs_test = np.array(test_inputs)
labels_train = np.array(train_labels)
labels_test = np.array(test_labels)


At this point all the data is loaded, both training and test, so now we take the parameters
from the tuned algorithm and train it on the full set before scoring it against the test data

In [2]:
from sklearn.ensemble import GradientBoostingRegressor

gbr_clf = GradientBoostingRegressor(alpha=0.9, init=None, learning_rate=1.0, loss='ls',
             max_depth=3, max_features=None, max_leaf_nodes=None,
             min_samples_leaf=5, min_samples_split=5,
             min_weight_fraction_leaf=0.0, n_estimators=100,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False)

gbr_clf.fit(inputs_train, labels_train)

labels_predict = gbr_clf.predict(inputs_test)

print "EVS", explained_variance_score(labels_test, labels_predict)
print "MAE", mean_absolute_error(labels_test, labels_predict)
print "MSE", mean_squared_error(labels_test, labels_predict)
print "MedAE", median_absolute_error(labels_test, labels_predict)
print "r^2", r2_score(labels_test, labels_predict)
print gbr_clf.feature_importances_

EVS 0.676676560513
MAE 0.809925635314
MSE 1.43182316638
MedAE 0.534865737451
r^2 0.676676560513
[ 0.00026007  0.00373238  0.00668474  0.01518905  0.00782811  0.00524282
  0.01540893  0.01322664  0.01419353  0.00949979  0.00301603  0.00472313
  0.00881173  0.          0.00084628  0.00423763  0.          0.00421173
  0.05877099  0.19075471  0.07403105  0.06327598  0.23100903  0.00361041
  0.00122255  0.001185    0.          0.          0.00117129  0.01239872
  0.          0.00435039  0.00876331  0.00184721  0.          0.00821141
  0.01078589  0.00091745  0.00777092  0.00363756  0.01790475  0.01256453
  0.00304351  0.01009335  0.00289442  0.01075914  0.00200405  0.00126793
  0.00289501  0.00097596  0.00325585  0.0057671   0.00162331  0.00378119
  0.00321996  0.00856725  0.09188335  0.00419415  0.00391523  0.0060819
  0.0024816 ]


This is quite promising; here I'm showing an r^2 score of 0.68 against a quite large dataset (30k) that has never been seen by the model before either in training or tuning.  The next step is to persist it so it can be incorporated into other programs without paying a retraining penalty.  This is where we'll check the benchmark for correct record order greater than 75% of the time.

In [14]:
import random
import math


for trial in range(10):
    correct_order_count = 0
    incorrect_order_count = 0
    actual_difference_correct = []
    actual_differences_incorrect = []
    for i in range(100):
        index1 = random.randint(0,len(inputs_test)-1) 
        index2 = random.randint(0,len(inputs_test)-1) 
        inputs = [inputs_test[index1], inputs_test[index2]]
        labels = [labels_test[index1], labels_test[index2]]
        predictions = gbr_clf.predict(inputs)
        delta = math.fabs(math.exp(labels[0]) - math.exp(labels[1]))
        if labels[0] > labels[1]:
            if predictions[0] > predictions[1]:
                correct_order_count += 1
                actual_difference_correct.append(delta)
            else:
                incorrect_order_count += 1
                actual_differences_incorrect.append(delta)
        else:
            if predictions[0] > predictions[1]:
                incorrect_order_count += 1
                actual_differences_incorrect.append(delta)
            else:
                correct_order_count += 1
                actual_difference_correct.append(delta)

    print "TRIAL %s" % (trial + 1)
    print "CORRECT %s, AVG DELTA %s" % (correct_order_count, sum(actual_difference_correct)/len(actual_difference_correct))
    print "INCORRECT %s, AVG DELTA %s" % (incorrect_order_count, sum(actual_differences_incorrect)/len(actual_differences_incorrect))
    print "------------"

TRIAL 1
CORRECT 74, AVG DELTA 11179.5675676
INCORRECT 26, AVG DELTA 5272.69230769
------------
TRIAL 2
CORRECT 78, AVG DELTA 12798.0769231
INCORRECT 22, AVG DELTA 8534.04545455
------------
TRIAL 3
CORRECT 85, AVG DELTA 14021.6352941
INCORRECT 15, AVG DELTA 3511.06666667
------------
TRIAL 4
CORRECT 77, AVG DELTA 13019.4805195
INCORRECT 23, AVG DELTA 5962.60869565
------------
TRIAL 5
CORRECT 81, AVG DELTA 11966.8148148
INCORRECT 19, AVG DELTA 4231.52631579
------------
TRIAL 6
CORRECT 77, AVG DELTA 14181.9480519
INCORRECT 23, AVG DELTA 4515.2173913
------------
TRIAL 7
CORRECT 79, AVG DELTA 13047.4556962
INCORRECT 21, AVG DELTA 7590.47619048
------------
TRIAL 8
CORRECT 80, AVG DELTA 11675.6
INCORRECT 20, AVG DELTA 3385.0
------------
TRIAL 9
CORRECT 83, AVG DELTA 16899.9518072
INCORRECT 17, AVG DELTA 6104.70588235
------------
TRIAL 10
CORRECT 78, AVG DELTA 10835.1794872
INCORRECT 22, AVG DELTA 2274.09090909
------------


In [3]:
import pickle

pickle.dump(gbr_clf, open('final_model.pickle', 'wb'))