In [None]:
import numpy as np

In [None]:
# np.genfromtxt parses fields as floats by default, so the textual header
# line (row 0) may come back as NaN's. The arrays are kept whole here; the
# cells below skip that row by slicing with [1:, ...].
data = np.genfromtxt(
    fname='../data/kc_house_train_data.csv',
    delimiter=',',
)
d_test = np.genfromtxt(
    fname='../data/kc_house_test_data.csv',
    delimiter=',',
)

In [None]:
# Derived features on the test set. The mean of each one is printed as a
# quick sanity check. Row 0 is skipped everywhere via the [1:, ...] slices.

# bedrooms squared
ftt1 = np.square(d_test[1:, 3])
print(np.mean(ftt1))

# bedroom * bathroom
ftt2 = d_test[1:, 3] * d_test[1:, 4]
print(np.mean(ftt2))

# log_sqft_living
ftt3 = np.log(d_test[1:, 5])
print(np.mean(ftt3))

# lat plus long
ftt4 = np.add(d_test[1:, 17], d_test[1:, 18])
print(np.mean(ftt4))


# Use sklearn for the linear regression

In [None]:
from sklearn import linear_model
# Regression target for the training set: column 2 of the training array,
# with row 0 dropped so it lines up with the [1:, ...] feature slices.
# NOTE(review): column 2 is presumably the sale price - confirm against the CSV header.
y = data[1:, 2]

In [None]:
# Derived features on the training set (row 0 is skipped by every slice).

# bedrooms squared
ft1 = np.square(data[1:, 3])

# bedroom * bathroom
ft2 = data[1:, 3] * data[1:, 4]

# log_sqft_living
ft3 = np.log(data[1:, 5])

# lat plus long
ft4 = np.add(data[1:, 17], data[1:, 18])

In [None]:
# model1: (Model 1: ‘sqft_living’, ‘bedrooms’, ‘bathrooms’, ‘lat’, and ‘long’)
# Design matrix: one column per feature, one row per house; row 0 is skipped.
m1 = np.array([data[1:, 5], data[1:, 3], data[1:, 4], data[1:, 17], data[1:, 18]]).T
regr1 = linear_model.LinearRegression()
regr1.fit(m1, y)
print('Coefficients: {}'.format(regr1.coef_))
# LinearRegression's `residues_` attribute was deprecated and then removed from
# scikit-learn, so the training RSS is computed directly from the predictions.
print('RSS on training data: {}'.format(np.sum(np.square(regr1.predict(m1) - y))))

In [None]:
# model2: (Model 2: ‘sqft_living’, ‘bedrooms’, ‘bathrooms’, ‘lat’,‘long’, and ‘bed_bath_rooms’)
# Same columns as model 1, in the same order, plus the bedrooms*bathrooms feature.
m2 = np.array([data[1:, 5], data[1:, 3], data[1:, 4], data[1:, 17], data[1:, 18], ft2]).T
regr2 = linear_model.LinearRegression()
regr2.fit(m2, y)
print('Coefficients: {}'.format(regr2.coef_))
# LinearRegression's `residues_` attribute was deprecated and then removed from
# scikit-learn, so the training RSS is computed directly from the predictions.
print('RSS on training data: {}'.format(np.sum(np.square(regr2.predict(m2) - y))))
# Sanity check that the extra feature changed the fit. regr2 has one more
# coefficient than regr1, and `==` on arrays of different lengths either
# collapses to a single False or raises (depending on the NumPy version), so
# only the coefficients of the features the two models share are compared.
assert not np.array_equal(regr2.coef_[:regr1.coef_.size], regr1.coef_)

In [None]:
# model3: (Model 3: ‘sqft_living’, ‘bedrooms’, ‘bathrooms’, ‘lat’,‘long’, ‘bed_bath_rooms’,
#          ‘bedrooms_squared’, ‘log_sqft_living’, and ‘lat_plus_long’)
# The first six columns match model 2's design matrix, in the same order.
m3 = np.array([data[1:, 5], data[1:, 3], data[1:, 4], data[1:, 17], data[1:, 18], ft2, ft1, ft3, ft4]).T
regr3 = linear_model.LinearRegression()
regr3.fit(m3, y)
print('Coefficients: {}'.format(regr3.coef_))
# LinearRegression's `residues_` attribute was deprecated and then removed from
# scikit-learn, so the training RSS is computed directly from the predictions.
print('RSS on training data: {}'.format(np.sum(np.square(regr3.predict(m3) - y))))
# Sanity check that the extra features changed the fit. regr3 has more
# coefficients than regr2, and `==` on arrays of different lengths either
# collapses to a single False or raises (depending on the NumPy version), so
# only the coefficients of the features the two models share are compared.
assert not np.array_equal(regr3.coef_[:regr2.coef_.size], regr2.coef_)

def compute_rss(y_pred, y_true):
    """Return the residual sum of squares (RSS) between two sets of values.

    Parameters
    ----------
    y_pred : array_like
        Predicted values.
    y_true : array_like
        Observed values; must be broadcast-compatible with ``y_pred``.

    Returns
    -------
    numpy scalar
        Sum over all elements of ``(y_pred - y_true) ** 2``.
    """
    # asarray lets plain lists/tuples through; ndarray inputs are used as-is.
    residuals = np.asarray(y_pred) - np.asarray(y_true)
    return np.sum(np.square(residuals))

# Cross-check: recompute the training RSS from regr3's own predictions
y3 = regr3.predict(m3)
print('RSS on training data: {}'.format(compute_rss(y_pred=y3, y_true=y)))

# Use test data to compute RSS

In [None]:
# Targets for the test set: column 2 of the test array, with row 0 dropped so
# it lines up with the [1:, ...] feature slices used for the test matrices.
# NOTE(review): column 2 is presumably the sale price - confirm against the CSV header.
yt = d_test[1:, 2]  # groundtruth data

In [None]:
# NOTE: compute_rss is also defined in the model-3 cell earlier in this
# notebook; this later definition is the one in effect for the test-set cells.
def compute_rss(y_pred, y_true):
    """Return the residual sum of squares (RSS) between two sets of values.

    Parameters
    ----------
    y_pred : array_like
        Predicted values.
    y_true : array_like
        Observed values; must be broadcast-compatible with ``y_pred``.

    Returns
    -------
    numpy scalar
        Sum over all elements of ``(y_pred - y_true) ** 2``.
    """
    # asarray lets plain lists/tuples through; ndarray inputs are used as-is.
    residuals = np.asarray(y_pred) - np.asarray(y_true)
    return np.sum(np.square(residuals))

In [None]:
# Test-set design matrix for model 1, columns in the order regr1 was fit on.
mt1 = np.vstack((
    d_test[1:, 5],
    d_test[1:, 3],
    d_test[1:, 4],
    d_test[1:, 17],
    d_test[1:, 18],
)).T
yp1 = regr1.predict(mt1)
print('RSS on test data: {}'.format(compute_rss(y_pred=yp1, y_true=yt)))

In [None]:
# Test-set design matrix for model 2, columns in the order regr2 was fit on.
mt2 = np.vstack((
    d_test[1:, 5],
    d_test[1:, 3],
    d_test[1:, 4],
    d_test[1:, 17],
    d_test[1:, 18],
    ftt2,
)).T
yp2 = regr2.predict(mt2)
print('RSS on test data: {}'.format(compute_rss(y_pred=yp2, y_true=yt)))

In [None]:
# Test-set design matrix for model 3, columns in the order regr3 was fit on.
mt3 = np.vstack((
    d_test[1:, 5],
    d_test[1:, 3],
    d_test[1:, 4],
    d_test[1:, 17],
    d_test[1:, 18],
    ftt2,
    ftt1,
    ftt3,
    ftt4,
)).T
yp3 = regr3.predict(mt3)
print('RSS on test data: {}'.format(compute_rss(y_pred=yp3, y_true=yt)))