In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import linear_model

In [None]:
# Load the training and test splits. `skip_header=1` makes genfromtxt skip the
# first line of each CSV (presumably the column-name row), so that line never
# reaches the arrays and no extra row needs to be dropped afterwards.
data, d_test = (
    np.genfromtxt(csv_path, delimiter=',', skip_header=1)
    for csv_path in ('../data/kc_house_train_data.csv',
                     '../data/kc_house_test_data.csv')
)

In [None]:
def polynomial_feat(feature, degree):
    """Build a matrix of polynomial features from a single 1-D feature.

    Parameters
    ----------
    feature : array-like, shape (n,)
        Values of the input feature. They are converted to a float array so
        that high powers of integer input do not overflow silently.
    degree : int or numpy integer
        Highest power to generate; must be strictly positive.

    Returns
    -------
    numpy.ndarray, shape (n, degree)
        Column ``j`` (0-based) holds ``feature ** (j + 1)``, so column 0 is
        the feature itself.

    Raises
    ------
    AssertionError
        If ``degree`` is not an integer or is not positive.
    """
    # Accept NumPy integers as well (e.g. a degree taken from np.arange).
    assert isinstance(degree, (int, np.integer))
    assert degree > 0
    feature = np.asarray(feature, dtype=float)
    # Broadcast the (n, 1) column against the exponents 1..degree.
    exponents = np.arange(1, degree + 1)
    return np.power(feature[:, np.newaxis], exponents)

In [None]:
# Column 5 of the training array is the single input feature and column 2 is
# the regression target.
inp, output = data[:, 5], data[:, 2]

In [None]:
# Permutation that orders the samples by increasing input value; applying it
# to both vectors keeps each input paired with its own output.
idx = np.argsort(inp)
inp_sorted, output_sorted = inp[idx], output[idx]

In [None]:
# Highest power of the input used as a regression feature.
degree = 15
# One column per power 1..degree of the sorted input.
cc = polynomial_feat(feature=inp_sorted, degree=degree)

In [None]:
# sklearn expects a 2-D design matrix; force a single column when only the
# first power is used.
if degree == 1:
    cc = cc.reshape(-1, 1)

# Ridge regression with a very small L2 penalty.
l2_small_penalty = 1.5e-5
regr1 = linear_model.Ridge(alpha=l2_small_penalty, normalize=True)
regr1.fit(cc, output_sorted)

# Predictions on the training inputs themselves.
yp1 = regr1.predict(cc)

# Report the learned coefficients.
print('Coefficients for regression of degree {} are {}'.format(degree, regr1.coef_))

# Train and fit on different subsets

In [None]:
# Load the four subset files the same way as the main data: comma-separated,
# with the first line of each file skipped.
d1, d2, d3, d4 = (
    np.genfromtxt(csv_path, delimiter=',', skip_header=1)
    for csv_path in (
        '../data/wk3_kc_house_set_1_data.csv',
        '../data/wk3_kc_house_set_2_data.csv',
        '../data/wk3_kc_house_set_3_data.csv',
        '../data/wk3_kc_house_set_4_data.csv',
    )
)

In [None]:
def aux_prepare_data(d, l2_penalty, degree=15, inp=5, out=2, verbose=True):
    """Fit a polynomial ridge regression on one data set and optionally plot it.

    Parameters
    ----------
    d : numpy.ndarray, 2-D
        Data set with one sample per row.
    l2_penalty : float
        Value passed to ``Ridge`` as ``alpha``.
    degree : int, optional
        Highest power of the input feature used as a regressor.
    inp : int, optional
        Column index of the input feature in ``d``.
    out : int, optional
        Column index of the regression target in ``d``.
    verbose : bool, optional
        When true, print the intercept and coefficients and plot the data
        points together with the fitted curve.

    Returns
    -------
    tuple
        ``(cc, yp, output_sorted, regr1)``: the polynomial feature matrix, the
        predictions on it, the targets ordered by increasing input, and the
        fitted ``Ridge`` model.
    """
    # Use local names so the column-index parameters are not overwritten.
    feature = d[:, inp]
    target = d[:, out]

    # Sort by the input so the fitted curve is drawn left to right.
    order = np.argsort(feature)
    inp_sorted = feature[order]
    output_sorted = target[order]

    # polynomial_feat already returns a 2-D (n, degree) matrix, which is the
    # layout sklearn expects, so no reshape is needed even for degree == 1.
    cc = polynomial_feat(inp_sorted, degree)

    # NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed
    # in 1.2, so this call needs an older release -- confirm the pinned version.
    regr1 = linear_model.Ridge(alpha=l2_penalty, normalize=True)
    regr1.fit(cc, output_sorted)
    yp = regr1.predict(cc)

    if verbose:
        plt.figure()
        print(regr1.intercept_, regr1.coef_)
        # Plot against the raw feature only. Passing the whole feature matrix
        # as x would draw one series per power column on wildly different
        # scales instead of the data and the fitted curve.
        plt.plot(inp_sorted, output_sorted, '.', inp_sorted, yp, '-')

    return cc, yp, output_sorted, regr1

In [None]:
# Fit each of the four subsets with a (nearly) vanishing penalty.
l2_small_penalty = 1e-9
(cc1, yp1, out1, regr1), (cc2, yp2, out2, regr2), \
    (cc3, yp3, out3, regr3), (cc4, yp4, out4, regr4) = (
        aux_prepare_data(subset, l2_small_penalty) for subset in (d1, d2, d3, d4)
    )

In [None]:
# Repeat the four fits, this time with a large penalty.
l2_large_penalty = 1.23e2
(cc1, yp1, out1, regr1), (cc2, yp2, out2, regr2), \
    (cc3, yp3, out3, regr3), (cc4, yp4, out4, regr4) = (
        aux_prepare_data(subset, l2_large_penalty) for subset in (d1, d2, d3, d4)
    )

# Cross validation for selecting L2 penalty

In [None]:
def compute_rss_ridge(y_pred, y_true, w):
    """Return the residual sum of squares plus the 2-norm of the weights.

    Parameters
    ----------
    y_pred : numpy.ndarray
        Predicted values.
    y_true : numpy.ndarray
        Observed values, same shape as ``y_pred``.
    w : numpy.ndarray
        Model coefficients.

    Returns
    -------
    float
        ``sum((y_pred - y_true) ** 2) + ||w||_2``.
    """
    # NOTE(review): the norm is added un-squared and without the penalty
    # factor, which differs from the usual ridge objective -- confirm intent.
    residuals = y_pred - y_true
    squared_error = np.square(residuals).sum()
    weight_norm = np.linalg.norm(w, 2)
    return squared_error + weight_norm

In [None]:
def train_valid(d, d_valid, l2_penalty, degree=1, inp=5, out=2):
    """Fit polynomial ridge regression on ``d`` and score it on ``d_valid``.

    Parameters
    ----------
    d : numpy.ndarray, 2-D
        Training data, one sample per row.
    d_valid : numpy.ndarray, 2-D
        Validation data with the same column layout as ``d``.
    l2_penalty : float
        Value passed to ``Ridge`` as ``alpha``.
    degree : int, optional
        Highest power of the input feature used as a regressor.
    inp : int, optional
        Column index of the input feature (same default as
        ``aux_prepare_data``).
    out : int, optional
        Column index of the regression target (same default as
        ``aux_prepare_data``).

    Returns
    -------
    float
        The value of ``compute_rss_ridge`` for the validation predictions,
        the validation targets and the fitted coefficients.
    """
    # Use every training row. The header line is already skipped when the CSV
    # files are loaded, and the cross-validation folds contain data rows only,
    # so dropping row 0 here would discard a real sample.
    x_train = polynomial_feat(d[:, inp], degree)
    y_train = d[:, out]
    assert x_train.shape[1] == degree

    # NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed
    # in 1.2, so this call needs an older release -- confirm the pinned version.
    regr1 = linear_model.Ridge(alpha=l2_penalty, normalize=True)
    regr1.fit(x_train, y_train)

    # Build the same features for the validation set and predict on them.
    # polynomial_feat always returns a 2-D (n, degree) matrix, so no reshape
    # is required for degree == 1.
    x_valid = polynomial_feat(d_valid[:, inp], degree)
    y_valid = d_valid[:, out]
    assert x_valid.shape[1] == degree
    yp = regr1.predict(x_valid)

    return compute_rss_ridge(yp, y_valid, regr1.coef_)

In [None]:
def one_fold_exec(k, n, train_valid_shuffled, l2_penalty):
    """Run k-fold cross-validation for one penalty and return the mean cost.

    Parameters
    ----------
    k : int
        Number of folds.
    n : int
        Number of rows in ``train_valid_shuffled``.
    train_valid_shuffled : numpy.ndarray, 2-D
        Data to split into folds, one sample per row.
    l2_penalty : float
        Penalty forwarded to ``train_valid``.

    Returns
    -------
    float
        Average over the ``k`` folds of the cost returned by ``train_valid``
        (called with ``degree=15``).
    """
    sum_rss = 0
    # `range` and floor division behave the same on Python 2 and 3; true
    # division would produce float slice indices on Python 3.
    for i in range(k):
        start = (n * i) // k
        stop = (n * (i + 1)) // k  # exclusive end of the validation fold
        val = train_valid_shuffled[start:stop, :]  # validation set
        # Training set: every row before and after the validation fold.
        tr = np.concatenate(
            (train_valid_shuffled[:start, :], train_valid_shuffled[stop:, :]),
            axis=0,
        )
        assert (tr.shape[1] == val.shape[1] and
                tr.shape[0] + val.shape[0] == train_valid_shuffled.shape[0])
        # Feature extraction, ridge fit and validation cost for this fold.
        sum_rss += train_valid(tr, val, l2_penalty, degree=15)
    return sum_rss / k

In [None]:
# Data used for cross-validation; the first line of the CSV is skipped.
train_valid_shuffled = np.genfromtxt('../data/wk3_kc_house_train_valid_shuffled.csv',
                                     delimiter=',', skip_header=1)
n = len(train_valid_shuffled)
k = 10  # 10-fold cross-validation
l2_best = -1
# Start from +inf so the first finite cost always becomes the current best,
# whatever the magnitude of the costs.
cost = np.inf

# Try 13 penalties spaced evenly on a log scale between 1e3 and 1e9 and keep
# the one with the lowest average validation cost.
for l2_penalty in np.logspace(3, 9, num=13):
    c1 = one_fold_exec(k, n, train_valid_shuffled, l2_penalty)
    print('penalty: {}, cost: {}'.format(np.round(l2_penalty), c1))
    if c1 < cost:
        cost = c1
        l2_best = l2_penalty
# l2_best keeps its -1 placeholder only if no cost compared below +inf
# (e.g. every cost was NaN).
assert l2_best > 0, 'no penalty produced a valid cross-validation cost'

In [None]:
# Refit on the full cross-validation data with the selected penalty, score the
# model on the test set, and show the cost divided by 1e14.
train_valid(d=train_valid_shuffled, d_valid=d_test, l2_penalty=l2_best, degree=15) / 1e14