In [1]:
# Imports
import numpy as np
import scipy.io as sio
from sklearn import decomposition
from sklearn import preprocessing
from sklearn.pipeline import FeatureUnion
from sklearn.feature_selection import SelectKBest, f_classif, SelectPercentile
from sklearn import linear_model
from sklearn import cross_validation
from sklearn.cross_decomposition import PLSRegression
from sklearn import tree

In [2]:
'''
Import your data 
'''
# Load the .mat files
events_1000 = sio.loadmat('Data/events_1000.mat')
missIdx = sio.loadmat('Data/missIdx.mat')
provideData_1000 = sio.loadmat('Data/provideData_1000.mat')
provideIdx = sio.loadmat('Data/provideIdx.mat')
trainData = sio.loadmat('Data/Train.mat')
testData = sio.loadmat('Data/Test.mat')
# 
events = events_1000.get('events')
missidx = missIdx.get('missIdx')
provideData = provideData_1000.get('provideData')
provideidx = provideIdx.get('provideIdx')

Xtrain = trainData.get('Xtrain')
Ytrain = trainData.get('Ytrain')
Xtest = testData.get('Xtest')

In [3]:
'''
Get full training data
'''
# Concatenate the Xtrain and Xtest
x = np.vstack((Xtrain, Xtest))
Xtrain = x[:, provideidx[0,0]-1]
Ytrain = x[:, missidx[0,0]-1]

for i in xrange(1, np.shape(x)[1] + 1):
    if( i in provideidx and i != provideidx[0,0] ):
        Xtrain = np.vstack(( Xtrain, x[:,i-1] ))
    elif( i in missidx and i != missidx[0,0] ):
        Ytrain = np.vstack(( Ytrain, x[:,i-1] ))

Xtrain = Xtrain.T
Ytrain = Ytrain.T

In [6]:
'''
Set up dimensionality reduction 
'''
# Parameters
comp = 1600
k_ = 800
percentile_ = 4
# PCA
pca = decomposition.PCA(n_components=comp)
# Feature selection
def f_regression(X,Y):
    import sklearn
    return sklearn.feature_selection.f_regression(X,Y,center=False)
selection = SelectKBest(k=k_, score_func=f_regression)
# Percentile selection
class_stuff = SelectPercentile(f_classif, percentile=percentile_)
# Feature Union
combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])

In [7]:
'''
Reduce dimensions of Xtrain
'''
# Scale Xtrain
Xscale = preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True).fit_transform(Xtrain)
# Reduce dimensions
Xpca = pca.fit_transform(Xscale)

In [28]:
'''
Use Feature Agglomeration
'''
# Imports
from sklearn.pipeline import Pipeline
from sklearn.linear_model import BayesianRidge
from sklearn.cluster import FeatureAgglomeration
# Do feature agglomeration on the data
ward = FeatureAgglomeration(n_clusters=10)
Xfa = ward.fit_transform(Xpca)

In [8]:
'''
Set up the test data
'''
# Get the test data
testX = provideData
# Scale the test data
testXScale = preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True).fit_transform(testX)
# Run PCA on the data
testXPCA = pca.transform(testXScale)

In [None]:
'''
Setup and use linear regression
'''
# Setup linear regression
lr = linear_model.LinearRegression(fit_intercept=True, normalize=True)
# Train the model using the training set
lr.fit(Xpca, Ytrain)
# Print CV score
print "Linear Regression:"
print np.mean(cross_validation.cross_val_score(lr, Xpca, Ytrain, scoring="mean_squared_error", cv=10))

In [40]:
'''
Setup and use the PLS Regression
'''
# Setup PLS Regression
comp_ = 800
max_iter_ = -1
tol_ = 1e-06
pls = PLSRegression(n_components=comp_, scale = True, max_iter=max_iter_, tol=tol_)
# Train the model
pls.fit(Xtrain, Ytrain)
# Print CV score
print "PLS Regression:"
print np.mean(cross_validation.cross_val_score(pls, Xtrain, Ytrain, scoring="mean_squared_error", cv=10))

PLS Regression:


KeyboardInterrupt: 

In [13]:
'''
Setup and use the Elastic Net Regression
'''
# Setup the parameters
alpha_ = 1 # 0 is Linear Regression
l1_ratio_ = 0.7 # 1 is L1, 0 = L2
max_iter_ = -1
tol_ = 1e-06
# Setup the Reggressor
enet = linear_model.ElasticNet(alpha=alpha_, l1_ratio=l1_ratio_, 
                 max_iter = max_iter_, tol=tol_)
# Train the model
enet.fit(Xtrain, Ytrain)
# Print CV score
print "Elastic Net"
print np.mean(cross_validation.cross_val_score(enet, Xtrain, Ytrain, scoring="mean_squared_error", cv=10))

Elastic Net
-0.480420401975


In [17]:
'''
Setup Regression trees
'''
# Parameters
max_depth_ = 3
random_state_ = 0
# Setup the Regressor
rt = tree.DecisionTreeRegressor(random_state=random_state_, max_depth=max_depth_)
# Train the model
rt.fit(Xpca, Ytrain)
# Print CV score
print np.mean(cross_validation.cross_val_score(rt, Xpca, Ytrain, scoring="mean_squared_error", cv=10))

-0.357238316694


In [None]:
# Do Bayesian Ridge
# Parameters
n_iter_ = -1
tol_ = 0.0001
alpha_1_ = 1e-6
alpha_2_ = 1e-6
lambda_1_ = 1e-6
lambda_2_ = 13-6
br = BayesianRidge(n_iter=n_iter_, tol=tol_, alpha_1=alpha_1_, alpha_2=alpha_2_,
                  lambda_1=lambda_1_, lambda_2=lambda_2_)
br.fit(Xfa, Ytrain)

In [35]:
'''
Setup Lasso Regression
'''
# Parameters
alpha_ = 0.0001
max_iter_ = -1
tol_= 0.0001
# Setup Lasso regression
lasso = linear_model.Lasso(alpha=alpha_, fit_intercept=True,
                          normalize=False, max_iter=max_iter_,tol=tol_,
                          random_state=None, positive=False, precompute=False)
# Train Lasso
lasso.fit(Xpca, Ytrain)
# Print CV score
print np.mean(cross_validation.cross_val_score(rt, Xpca, Ytrain, scoring="mean_squared_error", cv=10))

-0.357238316694


In [38]:
'''
Predict the values
'''
# Predict
yTest = lasso.predict(testXPCA)
# Print the shape
print np.shape(yTest)

(1000, 2731)


In [39]:
'''
Put the test values into a csv file
'''
np.savetxt('prediction.csv', yTest, delimiter=",",fmt='%1.7f' )