In [1]:
#Import libraries
import pandas as pd

dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 
              'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 
              'sqft_living':float, 'floors':float, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 
              'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

sales = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)

In [2]:
# Create new features
from math import log, sqrt
sales['sqft_living_sqrt'] = sales['sqft_living'].apply(sqrt)
sales['sqft_lot_sqrt'] = sales['sqft_lot'].apply(sqrt)
sales['bedrooms_square'] = sales['bedrooms']*sales['bedrooms']
sales['floors_square'] = sales['floors']*sales['floors']

In [3]:
from sklearn import linear_model  # using scikit-learn

all_features = ['bedrooms',         'bedrooms_square',  'bathrooms',     'sqft_living', 
                'sqft_living_sqrt', 'sqft_lot',         'sqft_lot_sqrt', 'floors', 
                'floors_square',    'waterfront',       'view',          'condition', 
                'grade',            'sqft_above',       'sqft_basement', 'yr_built', 
                'yr_renovated']

model_all = linear_model.Lasso(alpha=5e2, normalize=True) # set parameters
model_all.fit(sales[all_features], sales['price']) # learn weights

Lasso(alpha=500.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [6]:
#Creating testing, training and validation data
testing = pd.read_csv('wk3_kc_house_test_data.csv', dtype=dtype_dict)
training = pd.read_csv('wk3_kc_house_train_data.csv', dtype=dtype_dict)
validation = pd.read_csv('wk3_kc_house_valid_data.csv', dtype=dtype_dict)

In [7]:
# Creating new features in the the testing, training and validation data set
testing['sqft_living_sqrt'] = testing['sqft_living'].apply(sqrt)
testing['sqft_lot_sqrt'] = testing['sqft_lot'].apply(sqrt)
testing['bedrooms_square'] = testing['bedrooms']*testing['bedrooms']
testing['floors_square'] = testing['floors']*testing['floors']

training['sqft_living_sqrt'] = training['sqft_living'].apply(sqrt)
training['sqft_lot_sqrt'] = training['sqft_lot'].apply(sqrt)
training['bedrooms_square'] = training['bedrooms']*training['bedrooms']
training['floors_square'] = training['floors']*training['floors']

validation['sqft_living_sqrt'] = validation['sqft_living'].apply(sqrt)
validation['sqft_lot_sqrt'] = validation['sqft_lot'].apply(sqrt)
validation['bedrooms_square'] = validation['bedrooms']*validation['bedrooms']
validation['floors_square'] = validation['floors']*validation['floors']

In [9]:
import numpy as np
col=training.columns.tolist()

In [14]:
a=validation['price']
for l1 in np.logspace(1,7,num=13):
    model = linear_model.Lasso(alpha=l1, normalize=True)
    model.fit(training[all_features],training['price'])
    model_non_zero = np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)
    print "The RSS for l1_penalty of ",np.log10(l1),"is", '%3e' %np.sum((model.predict(validation[all_features])-a)**2)," and the number of non zero elements is", model_non_zero


The RSS for l1_penalty of  1.0 is 3.982133e+14  and the number of non zero elements is 15
The RSS for l1_penalty of  1.5 is 3.990419e+14  and the number of non zero elements is 15
The RSS for l1_penalty of  2.0 is 4.297916e+14  and the number of non zero elements is 11
The RSS for l1_penalty of  2.5 is 4.637398e+14  and the number of non zero elements is 6
The RSS for l1_penalty of  3.0 is 6.458987e+14  and the number of non zero elements is 4
The RSS for l1_penalty of  3.5 is 1.222507e+15  and the number of non zero elements is 1
The RSS for l1_penalty of  4.0 is 1.222507e+15  and the number of non zero elements is 1
The RSS for l1_penalty of  4.5 is 1.222507e+15  and the number of non zero elements is 1
The RSS for l1_penalty of  5.0 is 1.222507e+15  and the number of non zero elements is 1
The RSS for l1_penalty of  5.5 is 1.222507e+15  and the number of non zero elements is 1
The RSS for l1_penalty of  6.0 is 1.222507e+15  and the number of non zero elements is 1
The RSS for l1_pen

In [21]:
# Lets say that max non zero elements that are allowed are 7, then find the max and min l1_penalty
max_non_zeors=7
for l1 in np.logspace(1,4,num=20):
    model = linear_model.Lasso(alpha=l1, normalize=True)
    model.fit(training[all_features],training['price'])
    model_non_zero=np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)
    print("The l1 penalty is",'%.2f' %np.log10(l1)," and the number of nonzero features is",model_non_zero)

('The l1 penalty is', '1.00', ' and the number of nonzero features is', 15)
('The l1 penalty is', '1.16', ' and the number of nonzero features is', 15)
('The l1 penalty is', '1.32', ' and the number of nonzero features is', 15)
('The l1 penalty is', '1.47', ' and the number of nonzero features is', 15)
('The l1 penalty is', '1.63', ' and the number of nonzero features is', 13)
('The l1 penalty is', '1.79', ' and the number of nonzero features is', 12)
('The l1 penalty is', '1.95', ' and the number of nonzero features is', 11)
('The l1 penalty is', '2.11', ' and the number of nonzero features is', 10)
('The l1 penalty is', '2.26', ' and the number of nonzero features is', 7)
('The l1 penalty is', '2.42', ' and the number of nonzero features is', 6)
('The l1 penalty is', '2.58', ' and the number of nonzero features is', 6)
('The l1 penalty is', '2.74', ' and the number of nonzero features is', 6)
('The l1 penalty is', '2.89', ' and the number of nonzero features is', 5)
('The l1 penalty 

In [22]:
l1_penalty_max=263.665
l1_penalty_min=127.43

In [78]:
a=validation['price']
for l1 in np.linspace(l1_penalty_min,l1_penalty_max,num=20):
    model = linear_model.Lasso(alpha=l1, normalize=True)
    model.fit(training[all_features],training['price'])
    model_non_zero=np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)
    if model_non_zero==7:
        print('%8f' %(l1),'%12f' %np.sum(np.square(model.predict(validation[all_features])-a)))
        print(model.coef_)
        

('156.111053', '440037559545219.937500')
[ -0.00000000e+00  -0.00000000e+00   1.06104786e+04   1.63380381e+02
   0.00000000e+00  -0.00000000e+00  -0.00000000e+00   0.00000000e+00
   0.00000000e+00   5.06450129e+05   4.19600103e+04   0.00000000e+00
   1.16253349e+05   0.00000000e+00   0.00000000e+00  -2.61222063e+03
   0.00000000e+00]
('163.281316', '440777683505084.625000')
[ -0.00000000e+00  -0.00000000e+00   9.08361305e+03   1.63867800e+02
   0.00000000e+00  -0.00000000e+00  -0.00000000e+00   0.00000000e+00
   0.00000000e+00   5.00733892e+05   4.18391373e+04   0.00000000e+00
   1.15501757e+05   0.00000000e+00   0.00000000e+00  -2.55984090e+03
   0.00000000e+00]
('170.451579', '441566889269266.937500')
[ -0.00000000e+00  -0.00000000e+00   7.56688736e+03   1.64348679e+02
   0.00000000e+00  -0.00000000e+00  -0.00000000e+00   0.00000000e+00
   0.00000000e+00   4.95019291e+05   4.17175748e+04   0.00000000e+00
   1.14751342e+05   0.00000000e+00   0.00000000e+00  -2.50755056e+03
   0.000000

## Thank You!