# Blending Experiment

See if we can get better performance on regression by combining output of the other high performing regressions post tuning.

In [1]:
import sqlite3
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import r2_score

conn = sqlite3.connect('./sqlite/training_incidents.sqlite')
r_seed = 38

inputs = []
labels = []
db_results = conn.execute("SELECT * from incidents")
for rec in db_results:
# split data into inputs and labels
    labels.append(rec[1])
    inputs.append(rec[2:])

all_inputs = np.array(inputs)
all_labels = np.array(labels)

inputs_train, inputs_test, labels_train, labels_test = train_test_split(all_inputs, all_labels,
                                                                        test_size=0.10, random_state=r_seed)


Build all the models based on tuned parameters which will then be blended together

In [2]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor

gbr_clf = GradientBoostingRegressor(alpha=0.9, init=None, learning_rate=1.0, loss='ls',
             max_depth=3, max_features=None, max_leaf_nodes=None,
             min_samples_leaf=5, min_samples_split=5,
             min_weight_fraction_leaf=0.0, n_estimators=100,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False)
rfr_clf = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
             max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
             min_samples_split=9, min_weight_fraction_leaf=0.0,
             n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
             verbose=0, warm_start=False)
bag_clf = BaggingRegressor(base_estimator=None, bootstrap=True,
             bootstrap_features=False, max_features=0.75, max_samples=0.5,
             n_estimators=50, n_jobs=1, oob_score=False, random_state=None,
             verbose=0, warm_start=False)

fit each classifier to the training data

In [3]:
gbr_clf.fit(inputs_train, labels_train)
rfr_clf.fit(inputs_train, labels_train)
bag_clf.fit(inputs_train, labels_train)

BaggingRegressor(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=0.75, max_samples=0.5,
         n_estimators=50, n_jobs=1, oob_score=False, random_state=None,
         verbose=0, warm_start=False)

Generate predictions on the training set to use them as inputs to the blended regressor

In [11]:
gbr_labels = gbr_clf.predict(inputs_train)
rfr_labels = rfr_clf.predict(inputs_train)
bag_labels = bag_clf.predict(inputs_train)

test_gbr_labels = gbr_clf.predict(inputs_test)
test_rfr_labels = rfr_clf.predict(inputs_test)
test_bag_labels = bag_clf.predict(inputs_test)

zip together outputs to make a new dataset for blended regressor


In [12]:
data = zip(gbr_labels, rfr_labels, bag_labels)
test_data = zip(test_gbr_labels,test_rfr_labels,test_bag_labels)
blended_inputs_train = [np.array(x) for x in data]
blended_inputs_test = [np.array(x) for x in test_data]
print blended_inputs_train[0]
print blended_inputs_test[0]


[ 9.36496417  8.62245751  9.18318413]
[ 5.98553209  6.75720119  4.71217228]


In [9]:
from sklearn.dummy import DummyRegressor

clf = DummyRegressor()
clf.fit(blended_inputs_train, labels_train)

labels_predict = clf.predict(inputs_test)

print "EVS", explained_variance_score(labels_test, labels_predict)
print "MAE", mean_absolute_error(labels_test, labels_predict)
print "MSE", mean_squared_error(labels_test, labels_predict)
print "MedAE", median_absolute_error(labels_test, labels_predict)
print "r^2", r2_score(labels_test, labels_predict)

EVS 0.0
MAE 1.55574313291
MSE 4.3253531783
MedAE 1.38961661763
r^2 -2.74564991654e-06


In [13]:
from sklearn import linear_model

clf = linear_model.LinearRegression()
clf.fit(blended_inputs_train, labels_train)
labels_predict = clf.predict(blended_inputs_test)
print labels_test
print labels_predict
print "EVS", explained_variance_score(labels_test, labels_predict)
print "MAE", mean_absolute_error(labels_test, labels_predict)
print "MSE", mean_squared_error(labels_test, labels_predict)
print "MedAE", median_absolute_error(labels_test, labels_predict)
print "r^2", r2_score(labels_test, labels_predict)

[ 5.01063529  4.60517019  9.21034037 ...,  6.68461173  3.91202301
  8.51719319]
[ 3.31679848  5.28711275  8.6740758  ...,  8.63889784  6.26953742
  7.95708678]
EVS 0.580413442938
MAE 0.932081745381
MSE 1.81569446431
MedAE 0.638782319444
r^2 0.580219377533


In [14]:
clf = linear_model.SGDRegressor(loss='squared_loss', penalty='l2', alpha=0.001, l1_ratio=0.15, n_iter=50,
                                learning_rate='invscaling', eta0=0.01, power_t=0.25)
clf.fit(blended_inputs_train, labels_train)
labels_predict = clf.predict(blended_inputs_test)
print labels_test
print labels_predict
print "EVS", explained_variance_score(labels_test, labels_predict)
print "MAE", mean_absolute_error(labels_test, labels_predict)
print "MSE", mean_squared_error(labels_test, labels_predict)
print "MedAE", median_absolute_error(labels_test, labels_predict)
print "r^2", r2_score(labels_test, labels_predict)

[ 5.01063529  4.60517019  9.21034037 ...,  6.68461173  3.91202301
  8.51719319]
[ 3.34898518  5.30973964  8.72350539 ...,  8.67960349  6.29156451
  7.9985858 ]
EVS 0.581258752526
MAE 0.927121441206
MSE 1.8112996324
MedAE 0.632324419481
r^2 0.58123544346


In [15]:
from sklearn.tree import DecisionTreeRegressor
clf = DecisionTreeRegressor(min_samples_leaf=2, min_samples_split=5, random_state=31)

clf.fit(blended_inputs_train, labels_train)
labels_predict = clf.predict(blended_inputs_test)
print labels_test
print labels_predict
print "EVS", explained_variance_score(labels_test, labels_predict)
print "MAE", mean_absolute_error(labels_test, labels_predict)
print "MSE", mean_squared_error(labels_test, labels_predict)
print "MedAE", median_absolute_error(labels_test, labels_predict)
print "r^2", r2_score(labels_test, labels_predict)

[ 5.01063529  4.60517019  9.21034037 ...,  6.68461173  3.91202301
  8.51719319]
[ 3.3755437   4.60517019  9.28536652 ...,  8.56277358  4.60517019
  7.25432887]
EVS 0.526854897912
MAE 0.939259893313
MSE 2.04676837916
MedAE 0.621226662447
r^2 0.526796098608


In [16]:
from sklearn.ensemble import AdaBoostRegressor

clf = AdaBoostRegressor(n_estimators=50, learning_rate=1.0, loss='linear', random_state=31)
clf.fit(blended_inputs_train, labels_train)
labels_predict = clf.predict(blended_inputs_test)
print labels_test
print labels_predict
print "EVS", explained_variance_score(labels_test, labels_predict)
print "MAE", mean_absolute_error(labels_test, labels_predict)
print "MSE", mean_squared_error(labels_test, labels_predict)
print "MedAE", median_absolute_error(labels_test, labels_predict)
print "r^2", r2_score(labels_test, labels_predict)

[ 5.01063529  4.60517019  9.21034037 ...,  6.68461173  3.91202301
  8.51719319]
[ 3.58444601  3.89091456  9.14985783 ...,  8.6228675   5.44525979
  7.75105997]
EVS 0.566775816383
MAE 0.980299967448
MSE 1.93813463702
MedAE 0.714255629273
r^2 0.551911744877
