## Imports

Importing Python packages.

In [28]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt

from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge, Lasso

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

%matplotlib inline

## Get the Data

Load the .csv file containing the iron, copper, chloride, and lead results.

In [7]:
# load data from .csv file
metal_data = pd.read_csv('LSPC_data.csv')

# Keep only the rows where every feature AND the target are present.
# Dropping NaNs column-by-column does not work: building a DataFrame from the
# separately filtered Series re-aligns them on the index, which re-introduces
# the NaNs, and an unfiltered target would no longer line up row-for-row with
# the features.
complete_rows = metal_data.dropna(subset=['Copper Result', 'Iron Result', 'Chloride Result', 'Lead Result'])

# for now, let's just try to predict lead result from copper, iron, and chloride results
x = pd.DataFrame(dict(copper_result=complete_rows['Copper Result'], iron_result=complete_rows['Iron Result'], chloride_result=complete_rows['Chloride Result']))

# targets as a column vector; -1 lets numpy infer the row count instead of
# hard-coding it, so the cell keeps working if the file changes size
y = complete_rows['Lead Result'].values.reshape((-1, 1))

# split up data (approx. 80% / 20% split for now) for training, testing datasets
# NOTE: the split is positional (first n_train rows train, the rest test), not shuffled.
n_train = 1095
train_data, test_data = (x[:n_train], y[:n_train]), (x[n_train:], y[n_train:])

In [8]:
# number of feature rows in the (training, test) splits
tuple(len(split[0]) for split in (train_data, test_data))

(1095, 270)

## Fit a Linear Regression Model

Using off-the-shelf scikit-learn linear regression model.

In [10]:
# fit the model
model = LinearRegression(n_jobs=8)
model.fit(train_data[0], train_data[1])

# report results
print 'Linear regression R^2 score on training data:', model.score(train_data[0], train_data[1])
print 'Linear regression R^2 score on test data:', model.score(test_data[0], test_data[1])

Linear regression R^2 score on training data: 0.0693502104944
Linear regression R^2 score on test data: 0.0631283681763


## Fit a Kernel SVM Model

We'll try fitting this data with support vector regressors with different kernels.

In [14]:
# fit the model
linear_svr = SVR(kernel='linear')
linear_svr.fit(train_data[0], train_data[1])

# report results
print 'Linear kernel SVM R^2 score on training data:', linear_svr.score(train_data[0], train_data[1])
print 'Linear kernel SVM R^2 score on test data:', linear_svr.score(test_data[0], test_data[1])

Linear kernel SVM R^2 score on training data: 0.0347459816975
Linear kernel SVM R^2 score on test data: 0.448114217216


In [15]:
# fit the model
rbf_svr = SVR(kernel='rbf')
rbf_svr.fit(train_data[0], train_data[1])

# report results
print 'RBF kernel SVM R^2 score on training data:', rbf_svr.score(train_data[0], train_data[1])
print 'RBF kernel SVM R^2 score on test data:', rbf_svr.score(test_data[0], test_data[1])

RBF kernel SVM R^2 score on training data: -0.0142377561435
RBF kernel SVM R^2 score on test data: -0.135591978017


In [16]:
# fit the model
sigmoid_svr = SVR(kernel='sigmoid')
sigmoid_svr.fit(train_data[0], train_data[1])

# report the results
print 'Sigmoid kernel SVM R^2 score on training data:', sigmoid_svr.score(train_data[0], train_data[1])
print 'Sigmoid kernel SVM R^2 score on test data:', sigmoid_svr.score(test_data[0], test_data[1])

Sigmoid kernel SVM R^2 score on training data: -0.0189081186182
Sigmoid kernel SVM R^2 score on test data: -0.155749767966


## Neural Network Regression

In [41]:
model = MLPRegressor()

grid_search = GridSearchCV(model, { 'hidden_layer_sizes' : [ 2 ], 'activation' : [ 'relu' ], 'solver' : [ 'lbfgs', 'adam' ], 'alpha' : [ 1e-5, 3e-5, 5e-5, 7e-5, 1e-4 ] }, verbose=1)
grid_search.fit(train_data[0], train_data[1])

train_score = grid_search.score(train_data[0], train_data[1])
test_score = grid_search.score(test_data[0], test_data[1])

print '2 hidden neuron MLP regressor R^2 score on training data:', train_score
print '2 hidden neuron MLP regressor R^2 score on test data:', test_score

print grid_search.best_estimator_

Fitting 3 folds for each of 10 candidates, totalling 30 fits
2 hidden neuron MLP regressor R^2 score on training data: 0.049395606011
2 hidden neuron MLP regressor R^2 score on test data: -0.0234165436244
MLPRegressor(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=2, learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='lbfgs', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    2.7s finished


In [42]:
model = MLPRegressor()

grid_search = GridSearchCV(model, { 'hidden_layer_sizes' : [ 4 ], 'activation' : [ 'relu' ], 'solver' : [ 'lbfgs', 'adam' ], 'alpha' : [ 1e-5, 3e-5, 5e-5, 7e-5, 1e-4 ] }, verbose=1)
grid_search.fit(train_data[0], train_data[1])

train_score = grid_search.score(train_data[0], train_data[1])
test_score = grid_search.score(test_data[0], test_data[1])

print '2 hidden neuron MLP regressor R^2 score on training data:', train_score
print '2 hidden neuron MLP regressor R^2 score on test data:', test_score

print grid_search.best_estimator_

Fitting 3 folds for each of 10 candidates, totalling 30 fits
2 hidden neuron MLP regressor R^2 score on training data: 0.0799692187852
2 hidden neuron MLP regressor R^2 score on test data: 0.266669673111
MLPRegressor(activation='relu', alpha=3e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=4, learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='lbfgs', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    3.1s finished


In [44]:
model = MLPRegressor()

grid_search = GridSearchCV(model, { 'hidden_layer_sizes' : [ 8 ], 'activation' : [ 'relu' ], 'solver' : [ 'lbfgs', 'adam' ], 'alpha' : [ 1e-5, 3e-5, 5e-5, 7e-5, 1e-4 ] }, verbose=1)
grid_search.fit(train_data[0], train_data[1])

train_score = grid_search.score(train_data[0], train_data[1])
test_score = grid_search.score(test_data[0], test_data[1])

print '2 hidden neuron MLP regressor R^2 score on training data:', train_score
print '2 hidden neuron MLP regressor R^2 score on test data:', test_score

print grid_search.best_estimator_

Fitting 3 folds for each of 10 candidates, totalling 30 fits
2 hidden neuron MLP regressor R^2 score on training data: 0.0692073413925
2 hidden neuron MLP regressor R^2 score on test data: 0.0703476968022
MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=8, learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='lbfgs', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    2.8s finished


In [46]:
model = MLPRegressor()

grid_search = GridSearchCV(model, { 'hidden_layer_sizes' : [ 16 ], 'activation' : [ 'relu' ], 'solver' : [ 'lbfgs', 'adam' ], 'alpha' : [ 1e-5, 3e-5, 5e-5, 7e-5, 1e-4 ] }, verbose=1)
grid_search.fit(train_data[0], train_data[1])

train_score = grid_search.score(train_data[0], train_data[1])
test_score = grid_search.score(test_data[0], test_data[1])

print '2 hidden neuron MLP regressor R^2 score on training data:', train_score
print '2 hidden neuron MLP regressor R^2 score on test data:', test_score

print grid_search.best_estimator_

Fitting 3 folds for each of 10 candidates, totalling 30 fits
2 hidden neuron MLP regressor R^2 score on training data: 0.069283222442
2 hidden neuron MLP regressor R^2 score on test data: 0.085969949657
MLPRegressor(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=16, learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='lbfgs', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.8s finished


In [48]:
model = MLPRegressor()

grid_search = GridSearchCV(model, { 'hidden_layer_sizes' : [ 50 ], 'activation' : [ 'relu' ], 'solver' : [ 'lbfgs', 'adam' ], 'alpha' : [ 1e-5, 3e-5, 5e-5, 7e-5, 1e-4 ] }, verbose=1)
grid_search.fit(train_data[0], train_data[1])

train_score = grid_search.score(train_data[0], train_data[1])
test_score = grid_search.score(test_data[0], test_data[1])

print '2 hidden neuron MLP regressor R^2 score on training data:', train_score
print '2 hidden neuron MLP regressor R^2 score on test data:', test_score

print grid_search.best_estimator_

Fitting 3 folds for each of 10 candidates, totalling 30 fits
2 hidden neuron MLP regressor R^2 score on training data: 0.0844730934378
2 hidden neuron MLP regressor R^2 score on test data: 0.23679250186
MLPRegressor(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=50, learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='lbfgs', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    2.4s finished


In [52]:
model = MLPRegressor()

grid_search = GridSearchCV(model, { 'hidden_layer_sizes' : [ 100 ], 'activation' : [ 'relu' ], 'solver' : [ 'lbfgs', 'adam' ], 'alpha' : [ 1e-5, 3e-5, 5e-5, 7e-5, 1e-4 ] }, verbose=1)
grid_search.fit(train_data[0], train_data[1])

train_score = grid_search.score(train_data[0], train_data[1])
test_score = grid_search.score(test_data[0], test_data[1])

print '2 hidden neuron MLP regressor R^2 score on training data:', train_score
print '2 hidden neuron MLP regressor R^2 score on test data:', test_score

print grid_search.best_estimator_

Fitting 3 folds for each of 10 candidates, totalling 30 fits
2 hidden neuron MLP regressor R^2 score on training data: 0.0820279178969
2 hidden neuron MLP regressor R^2 score on test data: 0.291353105597
MLPRegressor(activation='relu', alpha=3e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=100, learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='lbfgs', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    2.5s finished


## Decision Tree Regression

In [56]:
model = DecisionTreeRegressor()
train_scores, test_scores = [], []

for i in range(10):
    model.fit(train_data[0], train_data[1])
    
    train_scores.append(model.score(train_data[0], train_data[1]))
    test_scores.append(model.score(test_data[0], test_data[1]))

print 'Average MLP regressor R^2 score on training data:', sum(train_scores) / float(len(train_scores))

print 'Average MLP regressor R^2 score on test data:', sum(test_scores) / float(len(test_scores))

Average MLP regressor R^2 score on training data: 0.999981184422
Average MLP regressor R^2 score on test data: -74.3504649278


## Ridge Regression

In [58]:
model = Ridge()
model.fit(train_data[0], train_data[1])

print 'Ridge regression R^2 score on training data:', model.score(train_data[0], train_data[1])
print 'Ridge regression R^2 score on test data:', model.score(test_data[0], test_data[1])

Ridge regression R^2 score on training data: 0.0693502104944
Ridge regression R^2 score on test data: 0.0631284100986


## Ridge Regression with Hyperparameter Randomized Search

In [61]:
# setting parameter distribution
param_dist = { 'alpha' : sp.stats.expon(scale=1), 'solver' : [ 'svd', 'cholesky', 'sparse_cg', 'lsqr', 'sag' ] }

# creating Ridge Regression model
model = Ridge()

# creating randomized search CV object and fit it to the training data
random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=25)
random_search.fit(train_data[0], train_data[1])

# report results
print 'best Ridge regression R^2 score on training data:', random_search.score(train_data[0], train_data[1])
print 'best Ridge regression R^2 score on test data:', random_search.score(test_data[0], test_data[1])

best Ridge regression R^2 score on training data: 0.0693496459424
best Ridge regression R^2 score on test data: 0.0647013331384
