## Imports

Importing Python packages to use for data exploration.

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge

from sklearn.model_selection import RandomizedSearchCV

%matplotlib inline

## Get the Data

Loading up the iron, copper, and lead data worksheet. We did some preprocessing on this to remove redundancy.

In [2]:
# load data from .csv file
metal_data = pd.read_csv('LSP_data.csv')

# for now, let's just try to predict lead result from copper and iron results
x = pd.DataFrame(dict(copper_result=metal_data['Copper Result'].dropna(), iron_result=metal_data['Iron Result'].dropna()))
y = metal_data['Lead Result'].values.reshape((1443, 1))

# split up data (approx. 80% / 20% split for now) for training, testing datsets
train_data, test_data = (x[:1150], y[:1150]), (x[1150:], y[1150:])

In [3]:
len(train_data[0]), len(test_data[0])

(1150, 293)

## Fit a Linear Regression Model

Using off-the-shelf scikit-learn linear regression model.

In [None]:
# fit the models
copper_model = LinearRegression(n_jobs=8)
copper_model.fit(train_data[0]['copper_result'].values.reshape((1150, 1)), train_data[1])

iron_model = LinearRegression(n_jobs=8)
iron_model.fit(train_data[0]['iron_result'].values.reshape((1150, 1)), train_data[1])

model = LinearRegression(n_jobs=8)
model.fit(train_data[0], train_data[1])

## Make Predictions

In [None]:
plt.scatter(test_data[0]['copper_result'], test_data[1]);
plt.title('Copper vs. Lead Results')
plt.plot(test_data[0]['copper_result'].values.reshape((293, 1)), copper_model.predict(test_data[0]['copper_result'].values.reshape((293, 1))))
plt.title('Model Predictions on Copper vs. Lead Results')
plt.show()

plt.scatter(test_data[0]['iron_result'], test_data[1]);
plt.title('Iron vs. Lead Results')
plt.plot(test_data[0]['iron_result'].values.reshape((293, 1)), iron_model.predict(test_data[0]['iron_result'].values.reshape((293, 1))))
plt.title('Model Predictions on Iron vs. Lead Results')
plt.show()

In [None]:
# get R^2 score of composite model on training data
print 'R^2 score of linear regression copper model on training data:', copper_model.score(train_data[0]['copper_result'].values.reshape(1150, 1), train_data[1])

# get R^2 score of composite model on test data
print 'R^2 score of linear regression copper model on test data:', copper_model.score(test_data[0]['copper_result'].values.reshape(293, 1), test_data[1])
# get R^2 score of composite model on training data

print 'R^2 score of linear regression iron model on training data:', iron_model.score(train_data[0]['iron_result'].values.reshape(1150, 1), train_data[1])

# get R^2 score of composite model on test data
print 'R^2 score of linear regression iron model on test data:', iron_model.score(test_data[0]['iron_result'].values.reshape(293, 1), test_data[1])

# get R^2 score of composite model on training data
print 'R^2 score of linear regression model on training data:', model.score(train_data[0], train_data[1])

# get R^2 score of composite model on test data
print 'R^2 score of linear regression model on test data:', model.score(test_data[0], test_data[1])

## Fit a Kernel SVM Model

We'll try fitting this data with support vector regressors with different kernels.

In [13]:
linear_svr = SVR(kernel='linear')

linear_svr.fit(train_data[0], train_data[1])

print 'Linear kernel SVM R^2 score on training data:', linear_svr.score(train_data[0], train_data[1])

print 'Linear kernel SVM R^2 score on test data:', linear_svr.score(test_data[0], test_data[1])

Linear kernel SVM R^2 score on training data: 0.0203199287037
Linear kernel SVM R^2 score on test data: 0.197837270486


In [16]:
rbf_svr = SVR(kernel='rbf')

rbf_svr.fit(train_data[0], train_data[1])

print 'RBF kernel SVM R^2 score on training data:', rbf_svr.score(train_data[0], train_data[1])

print 'RBF kernel SVM R^2 score on test data:', rbf_svr.score(test_data[0], test_data[1])

RBF kernel SVM R^2 score on training data: -0.0102193408219
RBF kernel SVM R^2 score on test data: -0.0989261126424


In [None]:
poly_svr = SVR(kernel='poly', degree=2)

poly_svr.fit(train_data[0][:10], train_data[1][:10])

print 'Polynomial kernel SVM R^2 score on training data:', poly_svr.score(train_data[0], train_data[1])

print 'Polynomial kernel SVM R^2 score on test data:', poly_svr.score(test_data[0], test_data[1])

In [6]:
sigmoid_svr = SVR(kernel='sigmoid')

sigmoid_svr.fit(train_data[0], train_data[1])

print 'Sigmoid kernel SVM R^2 score on training data:', sigmoid_svr.score(train_data[0], train_data[1])

print 'Sigmoid kernel SVM R^2 score on test data:', sigmoid_svr.score(test_data[0], test_data[1])

 Sigmoid kernel SVM R^2 score on training data: -0.018775728086
Sigmoid kernel SVM R^2 score on test data: -0.153926739469


## Neural Network Regression

Fuck it.

In [31]:
model = MLPRegressor(hidden_layer_sizes=2, activation='relu', solver='adam')

train_scores, test_scores = [], []

for i in range(10):
    model.fit(train_data[0], train_data[1])
    
    train_scores.append(model.score(train_data[0], train_data[1]))
    test_scores.append(model.score(test_data[0], test_data[1]))

print 'Average MLP regressor R^2 score on training data:', sum(train_scores) / float(len(train_scores))

print 'Average MLP regressor R^2 score on test data:', sum(test_scores) / float(len(test_scores))

Average MLP regressor R^2 score on training data: -0.0947082419425
Average MLP regressor R^2 score on test data: -0.60921001624


In [32]:
model = MLPRegressor(hidden_layer_sizes=4, activation='relu', solver='adam')

train_scores, test_scores = [], []

for i in range(10):
    model.fit(train_data[0], train_data[1])
    
    train_scores.append(model.score(train_data[0], train_data[1]))
    test_scores.append(model.score(test_data[0], test_data[1]))

print 'Average MLP regressor R^2 score on training data:', sum(train_scores) / float(len(train_scores))

print 'Average MLP regressor R^2 score on test data:', sum(test_scores) / float(len(test_scores))

Average MLP regressor R^2 score on training data: 0.0190313378963
Average MLP regressor R^2 score on test data: 0.0568200425328


In [33]:
model = MLPRegressor(hidden_layer_sizes=8, activation='relu', solver='adam')

train_scores, test_scores = [], []

for i in range(10):
    model.fit(train_data[0], train_data[1])
    
    train_scores.append(model.score(train_data[0], train_data[1]))
    test_scores.append(model.score(test_data[0], test_data[1]))

print 'Average MLP regressor R^2 score on training data:', sum(train_scores) / float(len(train_scores))

print 'Average MLP regressor R^2 score on test data:', sum(test_scores) / float(len(test_scores))

Average MLP regressor R^2 score on training data: 0.0663714178264
Average MLP regressor R^2 score on test data: 0.117112233342


In [34]:
model = MLPRegressor(hidden_layer_sizes=16, activation='relu', solver='adam')

train_scores, test_scores = [], []

for i in range(10):
    model.fit(train_data[0], train_data[1])
    
    train_scores.append(model.score(train_data[0], train_data[1]))
    test_scores.append(model.score(test_data[0], test_data[1]))

print 'Average MLP regressor R^2 score on training data:', sum(train_scores) / float(len(train_scores))

print 'Average MLP regressor R^2 score on test data:', sum(test_scores) / float(len(test_scores))

Average MLP regressor R^2 score on training data: 0.0682431039035
Average MLP regressor R^2 score on test data: 0.117773768907


In [35]:
model = MLPRegressor(hidden_layer_sizes=[16, 8], activation='relu', solver='adam')

train_scores, test_scores = [], []

for i in range(10):
    model.fit(train_data[0], train_data[1])
    
    train_scores.append(model.score(train_data[0], train_data[1]))
    test_scores.append(model.score(test_data[0], test_data[1]))

print 'Average MLP regressor R^2 score on training data:', sum(train_scores) / float(len(train_scores))

print 'Average MLP regressor R^2 score on test data:', sum(test_scores) / float(len(test_scores))

Average MLP regressor R^2 score on training data: 0.0601306614141
Average MLP regressor R^2 score on test data: 0.0688075013875


In [36]:
model = MLPRegressor(hidden_layer_sizes=[32, 16], activation='relu', solver='adam')

train_scores, test_scores = [], []

for i in range(10):
    model.fit(train_data[0], train_data[1])
    
    train_scores.append(model.score(train_data[0], train_data[1]))
    test_scores.append(model.score(test_data[0], test_data[1]))

print 'Average MLP regressor R^2 score on training data:', sum(train_scores) / float(len(train_scores))

print 'Average MLP regressor R^2 score on test data:', sum(test_scores) / float(len(test_scores))

Average MLP regressor R^2 score on training data: 0.0713761157549
Average MLP regressor R^2 score on test data: 0.0504977739232


## Decision Tree Regression

Last one for tonight.

In [10]:
model = DecisionTreeRegressor()

train_scores, test_scores = [], []

for i in range(10):
    model.fit(train_data[0], train_data[1])
    
    train_scores.append(model.score(train_data[0], train_data[1]))
    test_scores.append(model.score(test_data[0], test_data[1]))

print 'Average MLP regressor R^2 score on training data:', sum(train_scores) / float(len(train_scores))

print 'Average MLP regressor R^2 score on test data:', sum(test_scores) / float(len(test_scores))

Average MLP regressor R^2 score on training data: 0.592138630647
Average MLP regressor R^2 score on test data: -18.8791380323
<sklearn.tree._tree.Tree object at 0x7f803800a510>


## Ridge Regression

In [18]:
model = Ridge()

model.fit(train_data[0], train_data[1])

print 'Average Ridge regression R^2 score on training data:', model.score(train_data[0], train_data[1])

print 'Average Ridge regression R^2 score on test data:', model.score(test_data[0], test_data[1])

 Average Ridge regression R^2 score on training data: 0.0693886537958
Average Ridge regression R^2 score on test data: 0.0854539723189


## Ridge Regression with Hyperparameter Grid Search

In [20]:
# setting parameter distribution
param_dist = { 'alpha' : scipy.stats.expon(scale=1), 'solver' : [ 'svd', 'cholesky', 'sparse_cg', 'lsqr', 'sag' ] }

# creating Ridge Regression model
model = Ridge()

# creating randomized search CV object and fit it to the training data
random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=25)
random_search.fit(train_data[0], train_data[1])

# report results
report(random_search.cv_results_)

ValueError: The total space of parameters 10 is smaller than n_iter=25. For exhaustive searches, use GridSearchCV.