In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Directory prefix for every CSV read in this notebook. The trailing slash is
# required: file names are appended to it directly inside f-strings.
PATH = "data/"

In [3]:
# Explicit dtype for every CSV column so pandas does not have to infer them.
# `id`, `zipcode` and `date` are deliberately read as strings.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': float,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

# Full data set, used below to fit a single Lasso model on all rows.
sales = pd.read_csv(
    filepath_or_buffer=f'{PATH}kc_house_data.csv',
    dtype=dtype_dict,
)

In [4]:
from math import log, sqrt

In [5]:
# Derived predictors added to `sales` in place:
#   * square roots of the two area columns,
#   * squares of the bedroom and floor counts.
# np.sqrt is applied to the whole column at once instead of calling
# math.sqrt once per row through Series.apply; the resulting float values are
# the same. Note: a negative area would now yield NaN (with a RuntimeWarning)
# rather than raising ValueError.
sales['sqft_living_sqrt'] = np.sqrt(sales['sqft_living'])
sales['sqft_lot_sqrt'] = np.sqrt(sales['sqft_lot'])
sales['bedrooms_square'] = sales['bedrooms'] * sales['bedrooms']
sales['floors_square'] = sales['floors'] * sales['floors']

In [8]:
from sklearn import linear_model  # using scikit-learn

# Candidate predictors: raw columns together with the derived
# `*_sqrt` / `*_square` columns.
all_features = [
    'bedrooms',
    'bedrooms_square',
    'bathrooms',
    'sqft_living',
    'sqft_living_sqrt',
    'sqft_lot',
    'sqft_lot_sqrt',
    'floors',
    'floors_square',
    'waterfront',
    'view',
    'condition',
    'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
]

# Lasso (L1-penalised linear regression) on the full `sales` frame.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0
# and removed in 1.2 -- confirm the environment pins an older release.
model_all = linear_model.Lasso(alpha=500.0, normalize=True)
# Keep the fit call as the last expression so the cell displays the estimator.
model_all.fit(X=sales[all_features], y=sales['price'])

Lasso(alpha=500.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=True, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [33]:
# List the features whose fitted weight is non-zero, i.e. the ones the L1
# penalty did not eliminate.
for feature_name, weight in zip(all_features, model_all.coef_):
    if weight != 0:
        print(feature_name)

sqft_living
view
grade


In [34]:
# Pre-split test / train / validation files, parsed with the same column
# dtypes as the full data set.
testing = pd.read_csv(
    filepath_or_buffer=f'{PATH}wk3_kc_house_test_data.csv',
    dtype=dtype_dict,
)
training = pd.read_csv(
    filepath_or_buffer=f'{PATH}wk3_kc_house_train_data.csv',
    dtype=dtype_dict,
)
validation = pd.read_csv(
    filepath_or_buffer=f'{PATH}wk3_kc_house_valid_data.csv',
    dtype=dtype_dict,
)

In [35]:
def _add_engineered_features(frame):
    """Add the four derived predictor columns to `frame` in place.

    Columns added (in this order):
      * ``sqft_living_sqrt`` / ``sqft_lot_sqrt`` -- square roots of the areas
      * ``bedrooms_square`` / ``floors_square``  -- squares of the counts

    np.sqrt works on the whole column at once instead of calling math.sqrt
    per row through Series.apply. A negative area yields NaN (with a
    RuntimeWarning) rather than raising ValueError.

    Returns None; the caller's DataFrame is mutated.
    """
    frame['sqft_living_sqrt'] = np.sqrt(frame['sqft_living'])
    frame['sqft_lot_sqrt'] = np.sqrt(frame['sqft_lot'])
    frame['bedrooms_square'] = frame['bedrooms'] * frame['bedrooms']
    frame['floors_square'] = frame['floors'] * frame['floors']


# Apply the identical transformation to every split so that all three frames
# expose the same feature columns.
for split_frame in (testing, training, validation):
    _add_engineered_features(split_frame)

In [36]:
def rss(x, y):
    """Return the residual sum of squares between `x` and `y`.

    Both arguments must support elementwise subtraction, and the squared
    difference must expose ``.sum()`` (e.g. NumPy arrays or pandas Series).
    """
    residuals = x - y
    squared_residuals = residuals ** 2
    return squared_residuals.sum()

In [42]:
# Coarse sweep: 13 log-spaced L1 penalties between 10**1 and 10**7. The
# penalty with the lowest RSS on the validation split is remembered.
best_validation_rss, best_l1_penalty = None, None
for l1_penalty in np.logspace(1, 7, num=13):
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model.fit(X=training[all_features], y=training['price'])

    validation_rss = rss(model.predict(validation[all_features]), validation['price'])
    # Test RSS is only reported; it plays no part in choosing the penalty.
    test_rss = rss(model.predict(testing[all_features]), testing['price'])

    print("Validation error", "{:e}".format(validation_rss))
    print("Test error", "{:e}".format(test_rss))
    print("Non zero coefficients", np.count_nonzero(model.coef_), np.count_nonzero(model.intercept_))
    print('-------------------')

    is_new_best = best_validation_rss is None or validation_rss < best_validation_rss
    if is_new_best:
        best_validation_rss, best_l1_penalty = validation_rss, l1_penalty
print(best_l1_penalty, "{:e}".format(best_validation_rss))
    

Validation error 3.982133e+14
Test error 9.846740e+13
Non zero coefficients 14 1
-------------------
Validation error 3.990419e+14
Test error 9.977633e+13
Non zero coefficients 14 1
-------------------
Validation error 4.297916e+14
Test error 1.070207e+14
Non zero coefficients 10 1
-------------------
Validation error 4.637398e+14
Test error 1.142137e+14
Non zero coefficients 5 1
-------------------
Validation error 6.458987e+14
Test error 1.516941e+14
Non zero coefficients 3 1
-------------------
Validation error 1.222507e+15
Test error 2.847189e+14
Non zero coefficients 0 1
-------------------
Validation error 1.222507e+15
Test error 2.847189e+14
Non zero coefficients 0 1
-------------------
Validation error 1.222507e+15
Test error 2.847189e+14
Non zero coefficients 0 1
-------------------
Validation error 1.222507e+15
Test error 2.847189e+14
Non zero coefficients 0 1
-------------------
Validation error 1.222507e+15
Test error 2.847189e+14
Non zero coefficients 0 1
-----------------

In [44]:
# Bracket the L1 penalties around a target sparsity of `max_nonzeros`:
#   l1_penalty_min -- largest penalty that leaves MORE than max_nonzeros weights
#   l1_penalty_max -- smallest penalty that leaves FEWER than max_nonzeros weights
max_nonzeros = 7
l1_penalty_min, l1_penalty_max = None, None

for l1_penalty in np.logspace(1, 4, num=20):
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model.fit(X=training[all_features], y=training['price'])

    # The intercept is counted together with the feature weights.
    non_zero_weights = np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)
    print(l1_penalty, non_zero_weights)
    print('---------------')

    too_dense = non_zero_weights > max_nonzeros
    too_sparse = non_zero_weights < max_nonzeros
    # The two cases are mutually exclusive; a model with exactly
    # `max_nonzeros` weights moves neither bound.
    if too_dense and (l1_penalty_min is None or l1_penalty > l1_penalty_min):
        l1_penalty_min = l1_penalty
    elif too_sparse and (l1_penalty_max is None or l1_penalty < l1_penalty_max):
        l1_penalty_max = l1_penalty
print(l1_penalty_min, l1_penalty_max)

10.0 15
---------------
14.38449888287663 15
---------------
20.6913808111479 15
---------------
29.76351441631318 15
---------------
42.81332398719393 13
---------------
61.58482110660264 12
---------------
88.58667904100822 11
---------------
127.42749857031335 10
---------------
183.29807108324357 7
---------------
263.6650898730358 6
---------------
379.26901907322497 6
---------------
545.5594781168514 6
---------------
784.7599703514607 5
---------------
1128.8378916846884 3
---------------
1623.776739188721 3
---------------
2335.7214690901214 2
---------------
3359.818286283781 1
---------------
4832.930238571752 1
---------------
6951.927961775606 1
---------------
10000.0 1
---------------
127.42749857031335 263.6650898730358


In [49]:
# Fine sweep: 20 evenly spaced penalties inside the bracket found above. For
# each fit, report both errors, the number of non-zero weights (intercept
# included) and the names of the surviving features.
for l1_penalty in np.linspace(l1_penalty_min, l1_penalty_max, num=20):
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model.fit(X=training[all_features], y=training['price'])

    validation_rss = rss(model.predict(validation[all_features]), validation['price'])
    test_rss = rss(model.predict(testing[all_features]), testing['price'])
    non_zero_weights = np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)

    print("l1_penalty", l1_penalty)
    print("Validation error", "{:e}".format(validation_rss))
    print("Test error", "{:e}".format(test_rss))
    print("Non zero coefficients", non_zero_weights)
    for feature_name, weight in zip(all_features, model.coef_):
        if weight != 0:
            print(feature_name)
    print('-------------------')

l1_penalty 127.42749857031335
Validation error 4.353747e+14
Test error 1.083210e+14
Non zero coefficients 10
bedrooms
bathrooms
sqft_living
sqft_lot_sqrt
floors_square
waterfront
view
grade
yr_built
-------------------
l1_penalty 134.5978981125619
Validation error 4.370092e+14
Test error 1.086736e+14
Non zero coefficients 10
bedrooms
bathrooms
sqft_living
sqft_lot_sqrt
floors_square
waterfront
view
grade
yr_built
-------------------
l1_penalty 141.76829765481045
Validation error 4.382361e+14
Test error 1.089401e+14
Non zero coefficients 8
bathrooms
sqft_living
sqft_lot_sqrt
waterfront
view
grade
yr_built
-------------------
l1_penalty 148.938697197059
Validation error 4.391589e+14
Test error 1.091507e+14
Non zero coefficients 8
bathrooms
sqft_living
sqft_lot_sqrt
waterfront
view
grade
yr_built
-------------------
l1_penalty 156.10909673930755
Validation error 4.400374e+14
Test error 1.093468e+14
Non zero coefficients 7
bathrooms
sqft_living
waterfront
view
grade
yr_built
--------------