# Model Validation

Wherein we take the model as trained and tuned on the full training set and check it against the hold-out test set split off at the beginning of development.

In [1]:
import sqlite3
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import r2_score

# Seed intended for any randomised step in this notebook.
r_seed = 38


def _split_records(records):
    """Split an iterable of incident rows into parallel label/feature lists.

    Column 1 of each row is taken as the label and columns 2 onward as the
    feature tuple.  Column 0 is skipped (presumably a record id -- TODO
    confirm against the ``incidents`` table schema).

    Returns a ``(labels, inputs)`` tuple of lists in row order.
    """
    labels = []
    inputs = []
    for rec in records:
        labels.append(rec[1])
        inputs.append(rec[2:])
    return labels, inputs


# Load the training rows, then release the database handle as soon as the
# data is in memory so the file is not held open for the kernel's lifetime.
train_conn = sqlite3.connect('./sqlite/training_incidents.sqlite')
try:
    train_results = train_conn.execute("SELECT * from incidents")
    train_labels, train_inputs = _split_records(train_results)
finally:
    train_conn.close()

# Same procedure for the hold-out (validation) rows.
test_conn = sqlite3.connect('./sqlite/validation_incidents.sqlite')
try:
    test_results = test_conn.execute("SELECT * from incidents")
    test_labels, test_inputs = _split_records(test_results)
finally:
    test_conn.close()

# Array forms consumed by the model-fitting and scoring cells below.
inputs_train = np.array(train_inputs)
inputs_test = np.array(test_inputs)
labels_train = np.array(train_labels)
labels_test = np.array(test_labels)


At this point all the data is loaded, both training and test, so now we take the parameters
from the tuned algorithm, train it on the full training set, and score it against the test data.

In [2]:
from sklearn.ensemble import GradientBoostingRegressor

# Rebuild the regressor with the hyper-parameters found during tuning.
# random_state is tied to the notebook-wide seed (r_seed, set in the loading
# cell) instead of None so that re-running the notebook yields the same model.
gbr_clf = GradientBoostingRegressor(alpha=0.9, init=None, learning_rate=1.0, loss='ls',
             max_depth=3, max_features=None, max_leaf_nodes=None,
             min_samples_leaf=5, min_samples_split=5,
             min_weight_fraction_leaf=0.0, n_estimators=100,
             presort='auto', random_state=r_seed, subsample=1.0, verbose=0,
             warm_start=False)

# Fit on the full training set, then predict the never-before-seen hold-out set.
gbr_clf.fit(inputs_train, labels_train)

labels_predict = gbr_clf.predict(inputs_test)

# Report the regression metrics on the hold-out set.  Each print takes a single
# pre-formatted argument, so the output is the same under Python 2's print
# statement and Python 3's print function.
print("EVS %s" % explained_variance_score(labels_test, labels_predict))
print("MAE %s" % mean_absolute_error(labels_test, labels_predict))
print("MSE %s" % mean_squared_error(labels_test, labels_predict))
print("MedAE %s" % median_absolute_error(labels_test, labels_predict))
print("r^2 %s" % r2_score(labels_test, labels_predict))
# One importance value per input column, in column order.
print(gbr_clf.feature_importances_)

EVS 0.640268570775
MAE 0.839164971066
MSE 1.5930784843
MedAE 0.534785447124
r^2 0.640263108594
[ 0.00420535  0.00673082  0.00572975  0.0068716   0.01373218  0.00204348
  0.0198523   0.03353582  0.0133058   0.0095292   0.00923145  0.00923595
  0.00125699  0.0011511   0.00262994  0.00352596  0.          0.00515529
  0.04151542  0.20061566  0.04990149  0.05841704  0.29183182  0.00475986
  0.          0.          0.00112327  0.          0.00082123  0.          0.
  0.00080877  0.00429323  0.00106518  0.          0.00356953  0.00655799
  0.00240636  0.01228248  0.00599683  0.02701666  0.01698954  0.00385986
  0.00567016  0.00350229  0.01129049  0.00268853  0.00172596  0.00278653
  0.00333687  0.00410317  0.          0.00251664  0.00362095  0.00164701
  0.00124087  0.05934776  0.0052449   0.00787634  0.00184634  0.        ]


This is quite promising: the model achieves an r^2 score of 0.64 against a fairly large dataset (30k records) that it has never seen before, either in training or in tuning.  Before persisting the model so it can be incorporated into other programs without paying a retraining penalty, we check it against the benchmark: given two randomly chosen test records, the model should predict their correct relative order more than 75% of the time.

In [3]:
import random
import math


def _mean_or_nan(values):
    """Return the arithmetic mean of ``values``, or NaN when it is empty.

    A trial can finish with no incorrect (or no correct) pairs; returning NaN
    keeps the report printing instead of raising ZeroDivisionError.
    """
    if not values:
        return float('nan')
    return sum(values) / float(len(values))


TRIAL_COUNT = 10        # number of independent benchmark trials
PAIRS_PER_TRIAL = 100   # random record pairs compared in each trial

test_size = len(inputs_test)
if test_size < 2:
    raise ValueError("need at least two test records to compare orderings")

# Private generator seeded with the notebook-wide seed so the benchmark is
# repeatable without disturbing the global ``random`` state.
pair_rng = random.Random(r_seed)

# Predictions do not depend on which pair is drawn, so score the whole test
# set once instead of calling predict() for every pair.
test_predictions = gbr_clf.predict(inputs_test)

for trial in range(TRIAL_COUNT):
    correct_order_count = 0
    incorrect_order_count = 0
    actual_difference_correct = []
    actual_differences_incorrect = []
    for i in range(PAIRS_PER_TRIAL):
        # Draw two *distinct* positions: shifting the second draw past the
        # first keeps the pair uniform and never compares a record to itself.
        index1 = pair_rng.randrange(test_size)
        index2 = pair_rng.randrange(test_size - 1)
        if index2 >= index1:
            index2 += 1
        labels = [labels_test[index1], labels_test[index2]]
        predictions = [test_predictions[index1], test_predictions[index2]]
        # Labels are exponentiated before differencing (presumably they are
        # log-transformed targets -- TODO confirm), so delta is reported on
        # the exponentiated scale.
        delta = math.fabs(math.exp(labels[0]) - math.exp(labels[1]))
        # The pair is ordered correctly when "first is strictly larger" holds
        # (or fails) for both the actual labels and the predictions.
        # NOTE(review): pairs whose labels are exactly equal are still scored
        # by this rule, as in the original benchmark.
        actual_first_larger = labels[0] > labels[1]
        predicted_first_larger = predictions[0] > predictions[1]
        if actual_first_larger == predicted_first_larger:
            correct_order_count += 1
            actual_difference_correct.append(delta)
        else:
            incorrect_order_count += 1
            actual_differences_incorrect.append(delta)

    print("TRIAL %s" % (trial + 1))
    print("CORRECT %s, AVG DELTA %s" % (correct_order_count, _mean_or_nan(actual_difference_correct)))
    print("INCORRECT %s, AVG DELTA %s" % (incorrect_order_count, _mean_or_nan(actual_differences_incorrect)))
    print("------------")

TRIAL 1
CORRECT 84, AVG DELTA 11276.3690476
INCORRECT 16, AVG DELTA 6631.25
------------
TRIAL 2
CORRECT 76, AVG DELTA 13588.9605263
INCORRECT 24, AVG DELTA 4934.04166667
------------
TRIAL 3
CORRECT 83, AVG DELTA 16471.9156627
INCORRECT 17, AVG DELTA 7952.88235294
------------
TRIAL 4
CORRECT 85, AVG DELTA 14023.7647059
INCORRECT 15, AVG DELTA 5026.66666667
------------
TRIAL 5
CORRECT 81, AVG DELTA 14575.6790123
INCORRECT 19, AVG DELTA 7829.68421053
------------
TRIAL 6
CORRECT 81, AVG DELTA 15550.7777778
INCORRECT 19, AVG DELTA 5907.89473684
------------
TRIAL 7
CORRECT 81, AVG DELTA 14842.0864198
INCORRECT 19, AVG DELTA 7233.21052632
------------
TRIAL 8
CORRECT 84, AVG DELTA 12543.3690476
INCORRECT 16, AVG DELTA 7277.8125
------------
TRIAL 9
CORRECT 84, AVG DELTA 10724.702381
INCORRECT 16, AVG DELTA 7703.0625
------------
TRIAL 10
CORRECT 85, AVG DELTA 13098.0117647
INCORRECT 15, AVG DELTA 7410.0
------------


In [3]:
import pickle

# Persist the fitted model so other programs can load it without retraining.
# The with-block guarantees the file is flushed and closed even if pickling
# fails part-way through.
with open('final_model.pickle', 'wb') as model_file:
    pickle.dump(gbr_clf, model_file)