In [1]:
import numpy as np
import pandas as pd
from math import log, sqrt
from sklearn import linear_model  # using scikit-learn

In [2]:
# Explicit per-column dtypes handed to read_csv; id, zipcode and date are kept as
# strings instead of being parsed as numbers.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': float,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

# Full data set, used for the first Lasso fit.
sales = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)

In [3]:
# Engineered features: square roots of the two area columns and squares of the
# bedroom/floor counts.
# np.sqrt works on the whole column at once instead of calling math.sqrt row by row
# through .apply. Note: a negative area would become NaN here rather than raising.
sales['sqft_living_sqrt'] = np.sqrt(sales['sqft_living'])
sales['sqft_lot_sqrt'] = np.sqrt(sales['sqft_lot'])
sales['bedrooms_square'] = sales['bedrooms'] * sales['bedrooms']
sales['floors_square'] = sales['floors'] * sales['floors']

In [4]:
# Columns used as model inputs: raw attributes plus the engineered sqrt/square columns.
all_features = [
    'bedrooms',
    'bedrooms_square',
    'bathrooms',
    'sqft_living',
    'sqft_living_sqrt',
    'sqft_lot',
    'sqft_lot_sqrt',
    'floors',
    'floors_square',
    'waterfront',
    'view',
    'condition',
    'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
]

# Lasso with a fixed L1 penalty (alpha = 500) fitted on the full `sales` frame.
# NOTE(review): the `normalize` keyword is not accepted by recent scikit-learn
# releases -- confirm the installed version before re-running this notebook.
model_all = linear_model.Lasso(alpha=5e2, normalize=True)
model_all.fit(sales[all_features], sales['price'])

Lasso(alpha=500.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [5]:
# One row per feature, holding the coefficient learned by model_all.
pd.DataFrame(data=model_all.coef_, index=all_features)

Unnamed: 0,0
bedrooms,0.0
bedrooms_square,0.0
bathrooms,0.0
sqft_living,134.439314
sqft_living_sqrt,0.0
sqft_lot,0.0
sqft_lot_sqrt,0.0
floors,0.0
floors_square,0.0
waterfront,0.0


In [6]:
# The intercept is counted as a weight too, so it is added to the coefficient tally.
nonzero_weights_all = np.count_nonzero(model_all.coef_) + np.count_nonzero(model_all.intercept_)
print('Number of non-zero weights is %d' % nonzero_weights_all)

Number of non-zero weights is 4


In [7]:
# Load the pre-made train / validation / test splits with the same dtypes as `sales`.
training, validation, testing = (
    pd.read_csv(csv_path, dtype=dtype_dict)
    for csv_path in (
        'wk3_kc_house_train_data.csv',
        'wk3_kc_house_valid_data.csv',
        'wk3_kc_house_test_data.csv',
    )
)

In [8]:
def _add_engineered_features(frame):
    """Add the sqrt/square feature columns to ``frame`` in place.

    Creates ``sqft_living_sqrt``, ``sqft_lot_sqrt``, ``bedrooms_square`` and
    ``floors_square`` from the matching raw columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain ``sqft_living``, ``sqft_lot``, ``bedrooms`` and ``floors``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, for convenience.
    """
    # Vectorised sqrt over the whole column; negative inputs would become NaN.
    frame['sqft_living_sqrt'] = np.sqrt(frame['sqft_living'])
    frame['sqft_lot_sqrt'] = np.sqrt(frame['sqft_lot'])
    frame['bedrooms_square'] = frame['bedrooms'] * frame['bedrooms']
    frame['floors_square'] = frame['floors'] * frame['floors']
    return frame


# Every split needs exactly the same derived columns, so apply one helper to each
# instead of repeating the four assignments three times.
for _split in (testing, training, validation):
    _add_engineered_features(_split)

In [9]:
# 13 log-spaced candidate penalties from 10^1 to 10^7 (half-decade steps).
l1_penalties = np.logspace(start=1, stop=7, num=13)

Learn a model on TRAINING data using the specified l1_penalty. Make sure to specify normalize=True in the constructor:

In [10]:
# Grid search over l1_penalties: fit each model on TRAINING, score it by RSS on
# VALIDATION, and remember the index of the penalty with the smallest RSS.
min_rss = 1e99
min_idx = -1
for idx, penalty in enumerate(l1_penalties):
    model_train = linear_model.Lasso(alpha=penalty, normalize=True)
    model_train.fit(training[all_features], training['price'])

    # Residual sum of squares on the validation split for this penalty.
    residuals = model_train.predict(validation[all_features]) - validation['price']
    curr_rss = sum(residuals ** 2)
    print('current RSS is %.3e' % curr_rss)

    if not curr_rss < min_rss:
        continue
    min_rss, min_idx = curr_rss, idx
print('The L1 penalty that gives the minimum RSS is %.1f with index %d' % (l1_penalties[min_idx], min_idx))

current RSS is 3.982e+14
current RSS is 3.990e+14
current RSS is 4.298e+14
current RSS is 4.637e+14
current RSS is 6.459e+14
current RSS is 1.223e+15
current RSS is 1.223e+15
current RSS is 1.223e+15
current RSS is 1.223e+15
current RSS is 1.223e+15
current RSS is 1.223e+15
current RSS is 1.223e+15
current RSS is 1.223e+15
The L1 penalty that gives the minimum RSS is 10.0 with index 0


In [11]:
# Refit on TRAINING with the penalty at min_idx and show one coefficient per feature.
best_penalty = l1_penalties[min_idx]
model_best = linear_model.Lasso(alpha=best_penalty, normalize=True)
model_best.fit(training[all_features], training['price'])
pd.DataFrame(data=model_best.coef_, index=all_features)

Unnamed: 0,0
bedrooms,-16144.562757
bedrooms_square,373.245384
bathrooms,50841.24334
sqft_living,617.85356
sqft_living_sqrt,-44411.354867
sqft_lot,0.785623
sqft_lot_sqrt,-701.194765
floors,-0.0
floors_square,5014.200457
waterfront,619488.752486


In [12]:
# The intercept is counted as a weight too, so it is added to the coefficient tally.
nonzero_weights_best = np.count_nonzero(model_best.coef_) + np.count_nonzero(model_best.intercept_)
print('Number of non-zero weights is %d' % nonzero_weights_best)

Number of non-zero weights is 15


What if we absolutely wanted to limit ourselves to, say, 7 features? This may be important if we want to derive "a rule of thumb" --- an interpretable model that has only a few features in it.

You are going to implement a simple, two phase procedure to achieve this goal:

Explore a large range of ‘l1_penalty’ values to find a narrow region of ‘l1_penalty’ values where models are likely to have the desired number of non-zero weights.
Further explore the narrow region you found to find a good value for ‘l1_penalty’ that achieves the desired sparsity. Here, we will again use a validation set to choose the best value for ‘l1_penalty’.
10. Assign 7 to the variable ‘max_nonzeros’.

11. Exploring large range of l1_penalty

For l1_penalty in np.logspace(1, 4, num=20):

Fit a regression model with a given l1_penalty on TRAIN data. Add "alpha=l1_penalty" and "normalize=True" to the parameter list.

In [13]:
# Coarse scan: for 20 log-spaced penalties between 10^1 and 10^4, record how many
# weights (coefficients plus intercept) each fitted model leaves non-zero.
max_nonzeros = 7
l1_penalties = np.logspace(start=1, stop=4, num=20)
list_nonzeros = []

for penalty in l1_penalties:
    model_train = linear_model.Lasso(alpha=penalty, normalize=True)
    model_train.fit(training[all_features], training['price'])
    num_nonzeros = np.count_nonzero(model_train.coef_) + np.count_nonzero(model_train.intercept_)
    list_nonzeros.append(num_nonzeros)

In [14]:
# pd.DataFrame(list_nonzeros, l1_penalties)

In [15]:
# Index of the largest penalty in the coarse grid whose model still has MORE than
# max_nonzeros non-zero weights; this penalty is the lower end of the region that is
# searched more finely afterwards. l1_penalties is ascending, so the largest such
# penalty sits at the highest matching index.
# Searching for "> max_nonzeros" directly (instead of stepping one left from the
# "== max_nonzeros" entry) still works when no grid point, or several grid points,
# give exactly max_nonzeros, and always yields a single integer index.
_too_dense = np.flatnonzero(np.array(list_nonzeros) > max_nonzeros)
if _too_dense.size == 0:
    raise ValueError('no L1 penalty in the grid gives more than %d non-zeros' % max_nonzeros)
more_nz_idx = int(_too_dense.max())
print('The largest L1 penalty that gives more than %d non-zeros is %d' % (max_nonzeros, l1_penalties[more_nz_idx]))

L1 penalty greater than 127 will give non-zeros more than 7


In [16]:
# Index of the smallest penalty in the coarse grid whose model has FEWER than
# max_nonzeros non-zero weights; this penalty is the upper end of the region that is
# searched more finely afterwards. l1_penalties is ascending, so the smallest such
# penalty sits at the lowest matching index.
# Searching for "< max_nonzeros" directly (instead of stepping one right from the
# "== max_nonzeros" entry) still works when no grid point, or several grid points,
# give exactly max_nonzeros, and cannot run past the end of the grid.
_too_sparse = np.flatnonzero(np.array(list_nonzeros) < max_nonzeros)
if _too_sparse.size == 0:
    raise ValueError('no L1 penalty in the grid gives fewer than %d non-zeros' % max_nonzeros)
less_nz_idx = int(_too_sparse.min())
print('The smallest L1 penalty that gives fewer than %d non-zeros is %d' % (max_nonzeros, l1_penalties[less_nz_idx]))

L1 penalty less than 263 will give non-zeros more than 7


We now explore the region of l1_penalty we found: between ‘l1_penalty_min’ and ‘l1_penalty_max’. We look for the L1 penalty in this range that produces exactly the right number of nonzeros and also minimizes RSS on the VALIDATION set.

For l1_penalty in np.linspace(l1_penalty_min,l1_penalty_max,20):

Fit a regression model with a given l1_penalty on TRAIN data. As before, use "alpha=l1_penalty" and "normalize=True".
Measure the RSS of the learned model on the VALIDATION set
Find the model that has the lowest RSS on the VALIDATION set and has sparsity equal to ‘max_nonzeros’. (Again, take account of the intercept when counting the number of nonzeros.)

In [17]:
# Bounds of the narrow region found by the coarse scan. They are reduced to plain
# floats so that np.linspace returns a 1-D grid whether the index variables are
# scalars or arrays.
l1_penalty_min = float(np.min(l1_penalties[more_nz_idx]))
l1_penalty_max = float(np.max(l1_penalties[less_nz_idx]))

# 20 evenly spaced penalties across the region (replaces the coarse grid).
l1_penalties = np.linspace(l1_penalty_min, l1_penalty_max, 20)
min_rss = float('inf')
min_idx = None

for i, penalty in enumerate(l1_penalties):
    model_train = linear_model.Lasso(alpha=penalty, normalize=True)
    model_train.fit(training[all_features], training['price'])
    # RSS on VALIDATION and the weight count (coefficients plus intercept).
    curr_rss = sum((model_train.predict(validation[all_features]) - validation['price']) ** 2)
    num_nonzeros = np.count_nonzero(model_train.coef_) + np.count_nonzero(model_train.intercept_)
    print('current RSS is %.7e' % curr_rss)
    # Only models with exactly the requested sparsity are eligible.
    if curr_rss < min_rss and num_nonzeros == max_nonzeros:
        min_rss = curr_rss
        min_idx = i

# Without this guard an unmatched search would leave a sentinel index behind and
# the last penalty in the grid would be reported as the winner.
if min_idx is None:
    raise ValueError('no L1 penalty between %g and %g gives exactly %d non-zeros'
                     % (l1_penalty_min, l1_penalty_max, max_nonzeros))
print('The L1 penalty that gives the minimum RSS is %d with index %d' % (l1_penalties[min_idx], min_idx))

current RSS is 4.3537468e+14
current RSS is 4.3700923e+14
current RSS is 4.3823613e+14
current RSS is 4.3915894e+14
current RSS is 4.4003737e+14
current RSS is 4.4077749e+14
current RSS is 4.4156670e+14
current RSS is 4.4240641e+14
current RSS is 4.4329672e+14
current RSS is 4.4423978e+14
current RSS is 4.4523074e+14
current RSS is 4.4626890e+14
current RSS is 4.4711292e+14
current RSS is 4.4799819e+14
current RSS is 4.4892471e+14
current RSS is 4.4989248e+14
current RSS is 4.5090150e+14
current RSS is 4.5195243e+14
current RSS is 4.5304392e+14
current RSS is 4.5417667e+14
The L1 penalty that gives the minimum RSS is 156 with index 4


In [18]:
# Refit on TRAINING with the penalty at min_idx and show one coefficient per feature.
sparse_penalty = l1_penalties[min_idx]
model_best = linear_model.Lasso(alpha=sparse_penalty, normalize=True)
model_best.fit(training[all_features], training['price'])
pd.DataFrame(data=model_best.coef_, index=all_features)

Unnamed: 0,0
bedrooms,-0.0
bedrooms_square,-0.0
bathrooms,10610.890284
sqft_living,163.380252
sqft_living_sqrt,0.0
sqft_lot,-0.0
sqft_lot_sqrt,-0.0
floors,0.0
floors_square,0.0
waterfront,506451.687115
