In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import linear_model

In [None]:
# Load the kc_house train and test CSVs as numeric arrays. The header row is
# text, so it may come back as nan; the cells that slice these arrays start
# at row 1 to leave it out.
data, d_test = (
    np.genfromtxt(csv_path, delimiter=',')
    for csv_path in (
        '../data/kc_house_train_data.csv',
        '../data/kc_house_test_data.csv',
    )
)

In [None]:
def polynomial_feat(feature, degree):
    """Build a matrix whose columns are successive powers of a 1-D feature.

    Parameters
    ----------
    feature : array-like, shape (n,)
        Values of a single input feature. Converted to floating point
        before any power is taken.
    degree : int
        Highest power to generate; must be at least 1. NumPy integer
        scalars are accepted as well as Python ints.

    Returns
    -------
    numpy.ndarray, shape (n, degree), float
        Column ``k`` (0-based) holds ``feature ** (k + 1)``.

    Raises
    ------
    AssertionError
        If ``degree`` is not an integer or is smaller than 1.
    """
    assert isinstance(degree, (int, np.integer)), 'degree must be an integer'
    assert degree > 0, 'degree must be at least 1'
    # Work in floating point: an integer-typed input would otherwise be raised
    # to the power in integer arithmetic, which silently wraps around for
    # large values / high degrees before being stored in the float result.
    feature = np.asarray(feature, dtype=float)
    final_ft = np.empty((feature.shape[0], degree))
    final_ft[:, 0] = feature
    for i in range(2, degree + 1):
        final_ft[:, i - 1] = np.power(feature, i)
    return final_ft

In [None]:
# Row 0 (the header row) is left out. Column 5 is used as the regression
# input and column 2 as the regression output.
inp, output = data[1:, 5], data[1:, 2]

In [None]:
# Permutation that puts the inputs in ascending order; applying the same
# permutation to the outputs keeps every (input, output) pair aligned.
idx = inp.argsort()
inp_sorted, output_sorted = inp[idx], output[idx]

In [None]:
# Cubic model: expand the sorted input into the columns x, x**2 and x**3.
degree = 3
cc = polynomial_feat(inp_sorted, degree=degree)

In [None]:
# polynomial_feat always returns a 2-D (n_samples, degree) matrix, which is
# already the layout LinearRegression.fit expects, so no reshape is needed
# for any degree (including degree == 1).
regr1 = linear_model.LinearRegression()
regr1.fit(cc, output_sorted)
# In-sample predictions, used to draw the fitted curve.
yp1 = regr1.predict(cc)

In [None]:
# Use only the first column of cc (the sorted input itself) as the x-axis.
# Passing the whole (n, degree) matrix as x makes matplotlib draw one series
# per column, i.e. the same y-values plotted against x, x**2, x**3, ...
plt.plot(cc[:, 0], output_sorted, '.', cc[:, 0], yp1, '-');

# Use the different sets of data

In [None]:
# The four wk3 subsets, loaded in order; each one gets its own fit below.
d1, d2, d3, d4 = (
    np.genfromtxt(csv_path, delimiter=',')
    for csv_path in (
        '../data/wk3_kc_house_set_1_data.csv',
        '../data/wk3_kc_house_set_2_data.csv',
        '../data/wk3_kc_house_set_3_data.csv',
        '../data/wk3_kc_house_set_4_data.csv',
    )
)

In [None]:
def aux_prepare_data(d, degree=15, inp=5, out=2, verbose=True):
    """Fit a polynomial regression of one column of ``d`` on another.

    Row 0 of ``d`` is skipped. The remaining rows are sorted by the input
    column, expanded with ``polynomial_feat`` and fitted with
    ``linear_model.LinearRegression``.

    Parameters
    ----------
    d : numpy.ndarray, 2-D
        Data table; row 0 is ignored.
    degree : int, default 15
        Polynomial degree passed to ``polynomial_feat``.
    inp : int, default 5
        Column index of the input feature.
    out : int, default 2
        Column index of the output.
    verbose : bool, default True
        When true, print the fitted intercept and coefficients and draw the
        data points together with the fitted curve in a new figure.

    Returns
    -------
    cc : numpy.ndarray, shape (n, degree)
        Polynomial features of the sorted input.
    yp : numpy.ndarray
        Predictions of the fitted model on ``cc``.
    output_sorted : numpy.ndarray
        Output values in the same row order as ``cc``.
    regr1 : linear_model.LinearRegression
        The fitted estimator.
    """
    # Separate local names so the column-index parameters are not overwritten.
    x_raw = d[1:, inp]
    y_raw = d[1:, out]

    # Sort by the input so the '-' curve below is drawn left to right.
    idx = np.argsort(x_raw)
    inp_sorted = x_raw[idx]
    output_sorted = y_raw[idx]

    # Already 2-D with shape (n, degree); no reshape is needed for sklearn.
    cc = polynomial_feat(inp_sorted, degree)

    regr1 = linear_model.LinearRegression()
    regr1.fit(cc, output_sorted)
    yp = regr1.predict(cc)

    if verbose:
        plt.figure()
        print(regr1.intercept_, regr1.coef_)
        # x-axis is the raw sorted input; using the full feature matrix as x
        # would draw one series per power column.
        plt.plot(inp_sorted, output_sorted, '.', inp_sorted, yp, '-')

    return cc, yp, output_sorted, regr1

In [None]:
# Fit each subset with the default arguments of aux_prepare_data (degree 15,
# verbose on), so every call prints its coefficients and opens its own figure.
[
    (cc1, yp1, out1, regr1),
    (cc2, yp2, out2, regr2),
    (cc3, yp3, out3, regr3),
    (cc4, yp4, out4, regr4),
] = (aux_prepare_data(subset) for subset in (d1, d2, d3, d4))

# Selecting a polynomial degree

In [None]:
# Training, validation and test splits used to choose the polynomial degree.
tr, val, test = (
    np.genfromtxt(csv_path, delimiter=',')
    for csv_path in (
        '../data/wk3_kc_house_train_data.csv',
        '../data/wk3_kc_house_valid_data.csv',
        '../data/wk3_kc_house_test_data.csv',
    )
)

In [None]:
def compute_rss(y_pred, y_true):
    """Return the residual sum of squares between predictions and targets.

    The element-wise differences ``y_pred - y_true`` are squared and summed
    over all elements.
    """
    residual = y_pred - y_true
    squared_residual = residual * residual
    return np.sum(squared_residual)

In [None]:
def _poly_design(table, degree, inp, out):
    """Return (polynomial features, output) for ``table``, skipping row 0."""
    features = polynomial_feat(table[1:, inp], degree)
    # Sanity check: one column per power.
    assert features.shape[1] == degree
    return features, table[1:, out]


def train_valid(d, d_valid, degree=1, inp=5, out=2):
    """Fit a polynomial model on ``d`` and return its RSS on ``d_valid``.

    Parameters
    ----------
    d : numpy.ndarray, 2-D
        Table the model is fitted on; row 0 is ignored.
    d_valid : numpy.ndarray, 2-D
        Table the fitted model is evaluated on; row 0 is ignored.
    degree : int, default 1
        Polynomial degree passed to ``polynomial_feat``.
    inp : int, default 5
        Column index of the input feature (same default as
        ``aux_prepare_data``).
    out : int, default 2
        Column index of the output (same default as ``aux_prepare_data``).

    Returns
    -------
    The value of ``compute_rss`` for the model's predictions on ``d_valid``
    against the output column of ``d_valid``.
    """
    # The feature matrix is already 2-D (n, degree), so it can be passed to
    # scikit-learn directly for every degree, including 1.
    x_train, y_train = _poly_design(d, degree, inp, out)
    regr1 = linear_model.LinearRegression()
    regr1.fit(x_train, y_train)

    # The evaluation table goes through exactly the same feature expansion.
    x_valid, y_valid = _poly_design(d_valid, degree, inp, out)
    yp = regr1.predict(x_valid)

    return compute_rss(yp, y_valid)

In [None]:
# Try degrees 1..15 and keep the one with the smallest validation RSS.
# Starting from +inf means the first finite error always becomes the current
# best, so neither a hand-picked "large" sentinel nor a special case for the
# first iteration is needed.
min1 = float('inf')
min_elem = 0
for i in range(1, 16):
    err = train_valid(tr, val, i)
    print('Loop {} with error {}.'.format(i, err))
    # Strict '<' keeps the lowest degree when two degrees tie.
    if err < min1:
        min1 = err
        min_elem = i

In [None]:
# Take the degree that scored best on the validation set, refit on the
# training data and report the RSS on the held-out test set.
degree = min_elem
print(
    'Degree {} with error {}.'.format(
        degree,
        train_valid(tr, test, degree=degree),
    )
)