In [6]:
import numpy as np
import numpy.linalg as LA
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_diabetes

In [52]:
def generateTrainTest():
    sample_set = load_diabetes()
    train_X, test_X, train_Y, test_Y = train_test_split(sample_set.data, sample_set.target, test_size=.2)
    return train_X, test_X, train_Y, test_Y

In [53]:
train_X, test_X, train_Y, test_Y = generateTrainTest()

In [57]:
ols_model = LinearRegression(fit_intercept=False).fit(train_X, train_Y)
print('Train score: {}'.format(ols_model.score(train_X, train_Y)))
print('Test score: {}'.format(ols_model.score(test_X, test_Y)))

Train score: -3.4760061474
Test score: -3.20657964569


In [58]:
def pcr(X, Y, k):
    assert k <= X.shape[1]
    u, s, v = np.linalg.svd(X)
    vk = v[:, :k]
    Wk = np.dot(X, vk)
    gamma_hat = np.dot(np.dot(np.linalg.inv(np.dot(Wk.T, Wk)), Wk.T), Y)
    return np.dot(vk, gamma_hat)

In [66]:
pcr(train_X, train_Y, 10)

array([ -93.18315689, -235.3122308 ,  658.66324423,  183.56446806,
       -878.25360546,  674.24579341,  220.561198  , -103.7852363 ,
        993.00027854,   53.13006703])

In [60]:
ols_model.coef_

array([ -93.18315689, -235.3122308 ,  658.66324423,  183.56446806,
       -878.25360546,  674.24579341,  220.561198  , -103.7852363 ,
        993.00027854,   53.13006703])

In [61]:
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator

In [62]:
class MyPCREstimator(BaseEstimator):
    def __init__(self, directions=1):
        self.bhat_ = None
        self.directions=directions
        
    def fit(self, X, y):
        self.bhat_ = pcr(X, y, self.directions)
    
    def predict(self, X):
        assert self.bhat_ is not None, 'Estimator not fit'
        return np.dot(X, self.bhat_)

fit_cv = GridSearchCV(MyPCREstimator(), {'directions': range(1, train_X.shape[1] + 1)}, 'r2').fit(train_X, train_Y)

In [63]:
print('Train_score: {}'.format(r2_score(train_Y, fit_cv.predict(train_X))))
print('Test_score: {}'.format(r2_score(test_Y, fit_cv.predict(test_X))))

Train_score: -3.48534434528
Test_score: -3.11237061038
