In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from math import log, sqrt
import functools

In [13]:
# Column -> dtype mapping handed to pd.read_csv, grouped by target type.
dtype_dict = {}
dtype_dict.update(dict.fromkeys(
    ['bathrooms', 'bedrooms', 'floors', 'lat', 'long', 'price',
     'sqft_living', 'sqft_living15', 'sqft_lot15'],
    float))
dtype_dict.update(dict.fromkeys(
    ['condition', 'grade', 'sqft_above', 'sqft_basement', 'sqft_lot',
     'view', 'waterfront', 'yr_built', 'yr_renovated'],
    int))
dtype_dict.update(dict.fromkeys(['date', 'id', 'zipcode'], str))

In [19]:
# Directory holding the house-sales CSV files. Change this one constant to
# point the notebook at a different copy of the data.
DATA_DIR = "/Users/wxu/workspace/regression/week3"


def _read_house_csv(file_name):
    """Read one CSV from DATA_DIR with the shared column dtypes.

    file_name -- base name of the CSV file inside DATA_DIR.
    Returns the DataFrame produced by pd.read_csv (comma separated, first
    row used as the header, dtypes taken from dtype_dict).
    """
    return pd.read_csv(DATA_DIR + "/" + file_name, sep=",", header=0, dtype=dtype_dict)


training = _read_house_csv("wk3_kc_house_train_data.csv")
validation = _read_house_csv("wk3_kc_house_valid_data.csv")
testing = _read_house_csv("wk3_kc_house_test_data.csv")
house = _read_house_csv("kc_house_data.csv")


In [15]:
# Derived columns on the full data set: square roots of the two area
# columns first, then squares of the bedroom and floor counts.
for _source, _derived in (("sqft_living", "sqft_living_sqrt"),
                          ("sqft_lot", "sqft_lot_sqrt")):
    house[_derived] = house[_source].apply(sqrt)
for _source, _derived in (("bedrooms", "bedrooms_square"),
                          ("floors", "floors_square")):
    house[_derived] = house[_source] * house[_source]

In [16]:
# Candidate predictors for the lasso fits. Order matters: fitted weights are
# matched back to these names by position.
all_features = (
    ['bedrooms', 'bedrooms_square', 'bathrooms']
    + ['sqft_living', 'sqft_living_sqrt']
    + ['sqft_lot', 'sqft_lot_sqrt']
    + ['floors', 'floors_square']
    + ['waterfront', 'view', 'condition', 'grade']
    + ['sqft_above', 'sqft_basement']
    + ['yr_built', 'yr_renovated']
)

In [17]:
# Lasso fit on the full data set with a fixed L1 penalty of 500.
model_all = linear_model.Lasso(normalize=True, alpha=500.0)
model_all_fit = model_all.fit(X=house[all_features], y=house["price"])

In [87]:
# Show the fitted weights, then the feature names they line up with.
print(model_all_fit.coef_)
print(all_features)

[     0.              0.              0.            134.43931396      0.
      0.              0.              0.              0.              0.
  24750.00458561      0.          61749.10309071      0.              0.
     -0.              0.        ]
['bedrooms', 'bedrooms_square', 'bathrooms', 'sqft_living', 'sqft_living_sqrt', 'sqft_lot', 'sqft_lot_sqrt', 'floors', 'floors_square', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated']


In [20]:
def _add_engineered_features(frame):
    """Add the four derived feature columns to ``frame`` in place.

    The columns added are the square roots of ``sqft_living`` and
    ``sqft_lot`` and the squares of ``bedrooms`` and ``floors``.

    frame -- DataFrame containing those four source columns.
    Returns the same DataFrame object, for convenience.
    """
    frame['sqft_living_sqrt'] = frame['sqft_living'].apply(sqrt)
    frame['sqft_lot_sqrt'] = frame['sqft_lot'].apply(sqrt)
    frame['bedrooms_square'] = frame['bedrooms'] * frame['bedrooms']
    frame['floors_square'] = frame['floors'] * frame['floors']
    return frame


# Apply the identical transformation to every split so the three frames
# cannot drift apart through copy-paste edits.
for _split in (testing, training, validation):
    _add_engineered_features(_split)

In [31]:
def select_lambda_rss(lambdaList):
    """Return the validation RSS of a lasso fit for each penalty in ``lambdaList``.

    For every penalty a ``Lasso(normalize=True)`` model is fitted on
    ``training[all_features]`` against ``training["price"]``. The squared
    differences between its predictions on ``validation[all_features]`` and
    ``validation["price"]`` are then summed.

    lambdaList -- indexable sequence of L1 penalties.
    Returns a list of RSS values in the same order as ``lambdaList``.
    """
    def _validation_rss(penalty):
        fitted = linear_model.Lasso(alpha=penalty, normalize=True).fit(
            training[all_features], training["price"])
        residuals = fitted.predict(validation[all_features]) - validation["price"]
        return sum(residuals ** 2)

    return [_validation_rss(lambdaList[idx]) for idx in range(len(lambdaList))]
    

In [35]:
# Coarse penalty grid: 13 log-spaced values from 1e1 to 1e7.
myLambda = np.logspace(start=1, stop=7, num=13)
print(myLambda)

[  1.00000000e+01   3.16227766e+01   1.00000000e+02   3.16227766e+02
   1.00000000e+03   3.16227766e+03   1.00000000e+04   3.16227766e+04
   1.00000000e+05   3.16227766e+05   1.00000000e+06   3.16227766e+06
   1.00000000e+07]


In [33]:
# Validation RSS for every penalty on the coarse grid, then the smallest one.
myRss = select_lambda_rss(myLambda)
print(myRss)
print(min(myRss))

[398213327300134.19, 399041900253348.19, 429791604072558.12, 463739831045119.62, 645898733633810.38, 1222506859427156.8, 1222506859427156.8, 1222506859427156.8, 1222506859427156.8, 1222506859427156.8, 1222506859427156.8, 1222506859427156.8, 1222506859427156.8]
3.982133273e+14


In [38]:
# Take the penalty whose validation RSS was lowest instead of a hand-copied
# value, so this cell stays correct if the grid or the data changes.
best_l1_penalty = myLambda[np.argmin(myRss)]
select_model = linear_model.Lasso(alpha=best_l1_penalty, normalize=True)
select_model_fit = select_model.fit(training[all_features], training["price"])
# Residual sum of squares of the selected model on the test split.
test_rss = sum((select_model_fit.predict(testing[all_features]) - testing["price"]) ** 2)
print(test_rss)

9.84674025527e+13


In [39]:
# Number of nonzero weights in the selected model, intercept included.
sum(np.count_nonzero(part) for part in (select_model_fit.coef_, select_model_fit.intercept_))

15

In [41]:
def select_lambda_fixed(lambdaList):
    """Return the number of nonzero weights of a lasso fit for each penalty.

    For every penalty a ``Lasso(normalize=True)`` model is fitted on
    ``training[all_features]`` against ``training["price"]``. The count is
    the number of nonzero coefficients plus one if the intercept is nonzero.

    lambdaList -- indexable sequence of L1 penalties.
    Returns a list of counts in the same order as ``lambdaList``.
    """
    def _count_nonzero_weights(penalty):
        fitted = linear_model.Lasso(alpha=penalty, normalize=True).fit(
            training[all_features], training["price"])
        return np.count_nonzero(fitted.coef_) + np.count_nonzero(fitted.intercept_)

    return [_count_nonzero_weights(lambdaList[idx]) for idx in range(len(lambdaList))]
    

In [42]:
# Finer penalty grid (20 log-spaced values from 1e1 to 1e4) and the
# nonzero-weight count obtained at each grid point.
myLambda2 = np.logspace(start=1, stop=4, num=20)
myNonzero = select_lambda_fixed(myLambda2)

In [43]:
# Nonzero-weight counts (coefficients plus intercept), one per penalty in myLambda2.
myNonzero

[15, 15, 15, 15, 13, 12, 11, 10, 7, 6, 6, 6, 5, 3, 3, 2, 1, 1, 1, 1]

In [48]:

# Penalties on the fine grid whose model has exactly 7 nonzero weights.
a = [penalty for penalty, count in zip(myLambda2, myNonzero) if count == 7]
print(a)
print(myLambda2[8])

[183.29807108324357]
183.298071083


In [52]:
# Target sparsity: the number of nonzero weights (intercept included) wanted.
max_nonzeros = 7
# Bracket the target: the largest penalty that still leaves MORE nonzeros than
# wanted, and the smallest penalty that already leaves FEWER. Comparing with
# max_nonzeros, rather than the exact neighbouring counts seen in one run,
# keeps the bracket valid when the grid or the data changes.
l1_penalty_min = max([penalty for penalty, count in zip(myLambda2, myNonzero)
                      if count > max_nonzeros])
l1_penalty_max = min([penalty for penalty, count in zip(myLambda2, myNonzero)
                      if count < max_nonzeros])
print(l1_penalty_min)
print(l1_penalty_max)

127.42749857
263.665089873


In [54]:
# Narrow grid: 20 evenly spaced penalties between the two bracket values.
myLambda3 = np.linspace(l1_penalty_min, l1_penalty_max, num=20)

In [58]:
def select_lambda_rss_fixed(lambdaList):
    """Fit a lasso per penalty and report its validation RSS and sparsity.

    For every penalty a ``Lasso(normalize=True)`` model is fitted on
    ``training[all_features]`` against ``training["price"]``. Two numbers
    are recorded for each fit: the sum of squared differences between its
    predictions on ``validation[all_features]`` and ``validation["price"]``,
    and the number of nonzero weights (coefficients plus intercept).

    lambdaList -- iterable of L1 penalties to evaluate.
    Returns a list of ``(rss, nonzero_count, penalty)`` tuples in the order
    of ``lambdaList``; an empty input gives an empty list.
    """
    results = []
    # Pair each result with the penalty actually passed in, not with a
    # module-level grid that may differ from the argument.
    for penalty in lambdaList:
        fitted = linear_model.Lasso(alpha=penalty, normalize=True).fit(
            training[all_features], training["price"])
        nonzero_count = np.count_nonzero(fitted.coef_) + np.count_nonzero(fitted.intercept_)
        residuals = fitted.predict(validation[all_features]) - validation["price"]
        results.append((sum(residuals ** 2), nonzero_count, penalty))
    return results

In [62]:
# Evaluate the narrow grid, then keep only the fits with exactly 7 nonzero weights.
mytuple = select_lambda_rss_fixed(myLambda3)
filterMytuple = [entry for entry in mytuple if entry[1] == 7]
print(filterMytuple)

[(440037365263316.81, 7, 156.10909673930755), (440777489641605.69, 7, 163.27949628155611), (441566698090139.19, 7, 170.44989582380464), (442406413188665.12, 7, 177.6202953660532), (443296716874313.19, 7, 184.79069490830176), (444239780526141.06, 7, 191.96109445055032), (445230739842614.25, 7, 199.13149399279888)]


In [69]:
# Lowest validation RSS among the candidates, computed once instead of once
# per element. The guard keeps the empty-candidate case returning [] rather
# than raising from min().
lowest_rss = min(entry[0] for entry in filterMytuple) if filterMytuple else None
# Keep every candidate that attains that lowest RSS.
selectTuple = [entry for entry in filterMytuple if entry[0] == lowest_rss]

In [70]:
print(selectTuple)

[(440037365263316.81, 7, 156.10909673930755)]


In [71]:
# Refit with the selected penalty. Each selectTuple entry is
# (rss, nonzero_count, penalty), so index 2 is the penalty. Reading it from
# the selection avoids a pasted literal that goes stale if the search is rerun.
final_model = linear_model.Lasso(alpha=selectTuple[0][2], normalize=True)
final_model_fit = final_model.fit(training[all_features], training["price"])
final_model_fit.coef_

array([ -0.00000000e+00,  -0.00000000e+00,   1.06108903e+04,
         1.63380252e+02,   0.00000000e+00,  -0.00000000e+00,
        -0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         5.06451687e+05,   4.19600436e+04,   0.00000000e+00,
         1.16253554e+05,   0.00000000e+00,   0.00000000e+00,
        -2.61223488e+03,   0.00000000e+00])

In [86]:
# Names of the features whose fitted weight is nonzero. Weights are paired
# with names by position, so each weight is compared directly; no index
# list has to be rebuilt and searched for every feature.
a = final_model_fit.coef_
final_feature = [feature for feature, weight in zip(all_features, a) if weight != 0]
print(final_feature)

['bathrooms', 'sqft_living', 'waterfront', 'view', 'grade', 'yr_built']


In [84]:
# Sanity check: there is one fitted weight per feature name.
print(len(a))
print(len(all_features))

17
17
