In [1]:
from sklearn import linear_model
import numpy as np
import pandas as pd

dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 
              'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 
              'floors':float, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 
              'sqft_lot':int, 'view':int}

In [2]:
sales = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)
sales = sales.sort(['sqft_living','price'])

  from ipykernel import kernelapp as app


In [3]:
def polynomial_sframe(feature, degree):
    poly_dataframe = pd.DataFrame()
    poly_dataframe['power_1'] = feature
    if degree > 1:
        for power in range(2, degree+1):
            name = 'power_' + str(power)
            poly_dataframe[name] = feature.apply(lambda x: x ** power)
    return poly_dataframe

### Ridge Regression with L2 Penalty

In [4]:
l2_small_penalty = 1.5e-5

poly15_data = polynomial_sframe(sales['sqft_living'], 15) # use equivalent of `polynomial_sframe`
model = linear_model.Ridge(alpha=l2_small_penalty, normalize=True)
model.fit(poly15_data, sales['price'])

Ridge(alpha=1.5e-05, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=True, random_state=None, solver='auto', tol=0.001)

In [5]:
# help(linear_model.ridge)

In [6]:
pd.DataFrame(model.coef_, poly15_data.columns)

Unnamed: 0,0
power_1,124.8733
power_2,-0.0477376
power_3,3.014462e-05
power_4,-2.444199e-09
power_5,-1.941537e-13
power_6,8.540857e-18
power_7,1.511421e-21
power_8,8.279791e-26
power_9,6.5260310000000005e-31
power_10,-3.2789499999999997e-34


In [7]:
# dtype_dict same as above
set_1 = pd.read_csv('wk3_kc_house_set_1_data.csv', dtype=dtype_dict)
set_2 = pd.read_csv('wk3_kc_house_set_2_data.csv', dtype=dtype_dict)
set_3 = pd.read_csv('wk3_kc_house_set_3_data.csv', dtype=dtype_dict)
set_4 = pd.read_csv('wk3_kc_house_set_4_data.csv', dtype=dtype_dict)

In [8]:
def ridge_regression_on_poly15(data, l2_penalty):
    poly15_data = polynomial_sframe(data['sqft_living'], 15)
    model = linear_model.Ridge(alpha=l2_penalty, normalize=True)
    model.fit(poly15_data, data['price'])
    return model.coef_

In [9]:
l2_small_penalty=1e-9

coeffs = ridge_regression_on_poly15(set_1, l2_small_penalty).reshape(15,1)
coeffs = np.hstack((coeffs, ridge_regression_on_poly15(set_2, l2_small_penalty).reshape(15,1))) 
coeffs = np.hstack((coeffs, ridge_regression_on_poly15(set_3, l2_small_penalty).reshape(15,1))) 
coeffs = np.hstack((coeffs, ridge_regression_on_poly15(set_4, l2_small_penalty).reshape(15,1))) 

In [10]:
pd.DataFrame(coeffs, poly15_data.columns)

Unnamed: 0,0,1,2,3
power_1,544.6694,859.3626,-755.396,1119.446
power_2,-0.3554476,-0.8181183,0.9755796,-0.9837602
power_3,0.0001224464,0.00042888,-0.000458946,0.0003387709
power_4,-1.171753e-08,-9.127706e-08,7.779582e-08,3.603772e-08
power_5,-3.905125e-13,-2.696045e-12,7.15013e-12,-4.37814e-11
power_6,-1.3907610000000003e-17,3.739803e-15,-2.886019e-15,5.771917e-15
power_7,1.478603e-20,-1.4271179999999998e-19,-2.13678e-20,7.667953e-19
power_8,6.8749200000000005e-25,-6.307948e-23,3.3808520000000005e-23,-9.492978000000001e-23
power_9,-7.572042e-29,-1.445596e-27,2.191781e-27,-1.9603079999999998e-26
power_10,-1.040973e-32,7.443213e-31,-1.9706770000000002e-31,-2.1088040000000001e-32


In [11]:
l2_small_penalty=1.23e2

coeffs = ridge_regression_on_poly15(set_1, l2_small_penalty).reshape(15,1)
coeffs = np.hstack((coeffs, ridge_regression_on_poly15(set_2, l2_small_penalty).reshape(15,1))) 
coeffs = np.hstack((coeffs, ridge_regression_on_poly15(set_3, l2_small_penalty).reshape(15,1))) 
coeffs = np.hstack((coeffs, ridge_regression_on_poly15(set_4, l2_small_penalty).reshape(15,1))) 

In [12]:
pd.DataFrame(coeffs, poly15_data.columns)

Unnamed: 0,0,1,2,3
power_1,2.328068,2.097569,2.289063,2.085962
power_2,0.0003536216,0.0003908175,0.0004124722,0.0004050358
power_3,3.319697e-08,6.671899e-08,6.088353e-08,7.468646e-08
power_4,2.000825e-12,8.90003e-12,6.585722e-12,1.130966e-11
power_5,1.114926e-16,9.726399e-16,6.152782e-16,1.458644e-15
power_6,6.5778609999999996e-21,9.697337e-20,5.644466e-20,1.735613e-19
power_7,4.129395e-25,9.505645e-24,5.2883439999999996e-24,2.016096e-23
power_8,2.703938e-29,9.44491e-28,5.070914e-28,2.346053e-27
power_9,1.8161480000000002e-33,9.571913e-32,4.946573e-32,2.756361e-31
power_10,1.2382429999999999e-37,9.869452e-36,4.880438e-36,3.270431e-35


### k-fold Cross Validation

In [13]:
train_valid_shuffled = pd.read_csv('wk3_kc_house_train_valid_shuffled.csv', dtype=dtype_dict)
test = pd.read_csv('wk3_kc_house_test_data.csv', dtype=dtype_dict)

In [14]:
def get_segment_indexes(num_observation, num_segment, i):
        start = i * num_observation // num_segment
        end = (i+1) * num_observation // num_segment - 1
#         print(i, (start, end))
        return(start, end)

In [15]:
def ridge_regression_on_poly15_2(data, l2_penalty):
    poly15_data = polynomial_sframe(data['sqft_living'], 15)
    model = linear_model.Ridge(alpha=l2_penalty, normalize=True)
    model.fit(poly15_data, data['price'])
    return model

In [16]:
def k_fold_cross_validation(k, l2_penalty, data, feature_data):
    n = data.shape[0]
    avg_rss = []
    for i in range(1,k):
        start, end = get_segment_indexes(n, k, i)
        set_valid = data[start:end+1]
        set_train = data[0:start].append(data[end+1:n])
        model = ridge_regression_on_poly15_2(set_train, l2_penalty)
        poly15_set_valid = feature_data[start:end+1]
        rss = ((model.predict(poly15_set_valid) - set_valid['price']) ** 2).sum()
        avg_rss.append(rss)
        return np.array(avg_rss).mean()

In [17]:
poly15_valid_shuffled = polynomial_sframe(train_valid_shuffled['sqft_living'], 15) # precompute 15 polynomials
l2s = np.logspace(3, 9, num=13)

min_rss = 1e99
min_idx = -1

for l in range(0, l2s.size):
    curr_rss = k_fold_cross_validation(10, l2s[l], train_valid_shuffled, poly15_valid_shuffled)
    if curr_rss < min_rss:
        min_rss = curr_rss
        min_idx = l

print('Lowest average validation error (RSS = %E) was achieved when L2 penalty is %.E' % (min_rss, l2s[min_idx]))

Lowest average validation error (RSS = 2.533624E+14) was achieved when L2 penalty is 1E+03


In [18]:
model = linear_model.Ridge(alpha=l2s[min_idx], normalize=True)
model.fit(poly15_valid_shuffled, train_valid_shuffled['price'])
poly15_test = polynomial_sframe(test['sqft_living'], 15)
rss = ((model.predict(poly15_test) - test['price']) ** 2).sum()
print('Test data RSS is %E' % rss)

Test data RSS is 2.838569E+14
