In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from math import log, sqrt

In [3]:
# Explicit per-column dtypes handed to pd.read_csv so nothing is left to
# inference. The same mapping is reused when the wk3 split files are loaded.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': float,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

# Full data set; the 'date' column is additionally parsed as a datetime.
sales = pd.read_csv(
    'kc_house_data.csv',
    dtype=dtype_dict,
    parse_dates=['date'],
)

# Derived predictors: square roots of the two area columns ...
sales['sqft_living_sqrt'] = sales['sqft_living'].apply(sqrt)
sales['sqft_lot_sqrt'] = sales['sqft_lot'].apply(sqrt)
# ... and squares of the two count-like columns.
sales['bedrooms_square'] = sales['bedrooms'] ** 2
sales['floors_square'] = sales['floors'] ** 2

In [4]:
from sklearn import linear_model  # using scikit-learn

# Every predictor offered to the lasso: the raw columns plus the four derived
# ones (two square roots, two squares) added to `sales` earlier.
all_features = [
    'bedrooms', 'bedrooms_square',
    'bathrooms',
    'sqft_living', 'sqft_living_sqrt',
    'sqft_lot', 'sqft_lot_sqrt',
    'floors', 'floors_square',
    'waterfront', 'view', 'condition', 'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built', 'yr_renovated',
]

# Lasso on the full data set with a fixed penalty of 500.
# NOTE(review): `normalize=` was deprecated in scikit-learn 1.0 and removed in
# 1.2 -- confirm the installed version before re-running this cell.
model_all = linear_model.Lasso(normalize=True, alpha=5e2)
# Kept as the last expression so the fitted estimator's repr is displayed.
model_all.fit(sales[all_features], sales['price'])

Lasso(alpha=500.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [9]:
## quiz
# List the features that the full-data lasso kept, i.e. those whose weight is
# not exactly zero, printed as "<weight> <feature name>".
for weight, feature in zip(model_all.coef_, all_features):
    if weight != 0:
        print(weight, feature)

134.439313955 sqft_living
24750.0045856 view
61749.1030907 grade


In [10]:
# Load the three pre-made splits with the shared dtype mapping.
# Read order is test, train, validation, matching the unpacking targets.
testing, training, validation = (
    pd.read_csv(csv_path, dtype=dtype_dict)
    for csv_path in (
        'wk3_kc_house_test_data.csv',
        'wk3_kc_house_train_data.csv',
        'wk3_kc_house_valid_data.csv',
    )
)

In [11]:
def _add_engineered_features(frame):
    """Add the four derived predictor columns to `frame` in place.

    The columns are the same ones built for `sales`:
    `sqft_living_sqrt`, `sqft_lot_sqrt`, `bedrooms_square`, `floors_square`.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must already contain `sqft_living`, `sqft_lot`, `bedrooms` and
        `floors`; it is modified in place.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If one of the four source columns is missing.
    ValueError
        If an area value is negative (`math.sqrt` rejects it).
    """
    frame['sqft_living_sqrt'] = frame['sqft_living'].apply(sqrt)
    frame['sqft_lot_sqrt'] = frame['sqft_lot'].apply(sqrt)
    frame['bedrooms_square'] = frame['bedrooms'] * frame['bedrooms']
    frame['floors_square'] = frame['floors'] * frame['floors']


# One definition applied to each split replaces three copy-pasted stanzas, so
# the splits cannot drift apart if the feature set is edited later.
_add_engineered_features(testing)
_add_engineered_features(training)
_add_engineered_features(validation)

In [18]:
# Coarse sweep: 13 log-spaced penalties from 1e1 to 1e7. For each one, fit on
# the training split and score by residual sum of squares on the validation
# split. `models[k]` and `rsses[k]` both correspond to `l1_penalties[k]`.
# NOTE(review): `normalize=` was deprecated in scikit-learn 1.0 and removed in
# 1.2 -- confirm the installed version before re-running this cell.
l1_penalties = np.logspace(1, 7, num=13)
models, rsses = [], []
for l1 in l1_penalties:
    model = linear_model.Lasso(normalize=True, alpha=l1)
    model.fit(training[all_features], training['price'])
    models.append(model)

    residuals = model.predict(validation[all_features]) - validation['price']
    rss = np.sum(np.square(residuals))
    rsses.append(rss)

In [19]:
# Display the penalty grid used by the sweep (13 log-spaced values, 1e1 .. 1e7).
l1_penalties

array([  1.00000000e+01,   3.16227766e+01,   1.00000000e+02,
         3.16227766e+02,   1.00000000e+03,   3.16227766e+03,
         1.00000000e+04,   3.16227766e+04,   1.00000000e+05,
         3.16227766e+05,   1.00000000e+06,   3.16227766e+06,
         1.00000000e+07])

In [21]:
# Non-zero weights of the first model in the sweep (smallest penalty, l1 = 10),
# printed as "<weight> <feature name>".
# Author's tally: 15 non-zero terms in total once the intercept is included.
for weight, feature in zip(models[0].coef_, all_features):
    if weight != 0:
        print(weight, feature)

-16144.5627571 bedrooms
373.245384349 bedrooms_square
50841.2433399 bathrooms
617.853559504 sqft_living
-44411.3548667 sqft_living_sqrt
0.785623064832 sqft_lot
-701.194765368 sqft_lot_sqrt
5014.20045697 floors_square
619488.752486 waterfront
38041.8556525 view
24998.7718382 condition
128716.234621 grade
-3293.83117995 yr_built
10.0573208643 yr_renovated


In [60]:
# Fine sweep: 200 log-spaced penalties between 10**1 and 10**2.5. Only models
# with exactly `max_nonzeros` non-zero terms (weights plus intercept) are kept,
# together with their penalty and validation RSS.
# NOTE(review): `normalize=` was deprecated in scikit-learn 1.0 and removed in
# 1.2 -- confirm the installed version before re-running this cell.
max_nonzeros = 7
l1_penalties = np.logspace(1, 2.5, num=200)
models, rsses, l1s = [], [], []
nonzeros = []  # initialised here, but nothing in this cell appends to it
for l1 in l1_penalties:
    model = linear_model.Lasso(normalize=True, alpha=l1)
    model.fit(training[all_features], training['price'])

    # The intercept counts towards the sparsity budget as well.
    nnz_count = np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)
    if nnz_count != max_nonzeros:
        continue

    l1s.append(l1)
    models.append(model)
    residuals = model.predict(validation[all_features]) - validation['price']
    rss = np.sum(np.square(residuals))
    rsses.append(rss)

In [61]:
# Penalties whose fitted model had exactly `max_nonzeros` non-zero terms
# (weights plus intercept), in increasing order of penalty.
l1s

[155.22253574270479,
 157.94011951465393,
 160.70528182616385,
 163.51885566624858,
 166.38168860761274,
 169.29464306197789,
 172.25859653987874,
 175.27444191500823,
 178.34308769319094,
 181.46545828606469,
 184.64249428955426,
 187.87515276722067,
 191.16440753857017,
 194.51124947241311,
 197.91668678535575,
 201.38174534552101,
 204.9074689815846]

In [62]:
# Validation RSS of each retained model; rsses[k] corresponds to l1s[k].
rsses

[439949715448732.8,
 440222783151840.94,
 440506486911241.9,
 440803020662235.1,
 441112744351832.06,
 441436134533178.06,
 441773747328224.3,
 442126134273697.4,
 442493856428185.8,
 442877546662565.94,
 443277803676887.2,
 443695253036967.56,
 444132781093838.44,
 444586470957058.44,
 445059353307090.5,
 445552021778652.2,
 446067056318656.25]

In [63]:
# Non-zero weights of the first retained model (the smallest penalty that met
# the `max_nonzeros` budget), printed as "<weight> <feature name>".
for weight, feature in zip(models[0].coef_, all_features):
    if weight != 0:
        print(weight, feature)

10797.2964181 bathrooms
163.321518489 sqft_living
507158.094785 waterfront
41975.1488356 view
116346.210565 grade
-2618.69036357 yr_built
