In [1]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18 only ships the legacy module
    from sklearn.cross_validation import train_test_split

In [2]:
# Load the CSV into a DataFrame; the first row of the file holds the column names.
input_file = "home_data.csv"
sales = pd.read_csv(input_file, header=0)

In [3]:
# Mean of the price column over the rows whose zipcode equals 98039.
sales.loc[sales['zipcode'] == 98039, 'price'].mean()

2160606.6

In [4]:
# Boolean mask: True where sqft_living is strictly between 2000 and 4000.
# Vectorized comparisons replace the per-row Python lambda used with .apply().
select_sales = (sales['sqft_living'] > 2000) & (sales['sqft_living'] < 4000)

In [5]:
# (rows, columns) of the subset picked out by the boolean mask.
sales.loc[select_sales].shape

(9111, 21)

In [6]:
# (rows, columns) of the full dataset, for comparison with the filtered subset.
sales.shape

(21613, 21)

## My feature model

In [7]:
# Columns used as predictors by the basic model.
my_features = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'zipcode',
]

In [8]:
# Unfitted linear regression estimator for the my_features predictors.
my_features_model = linear_model.LinearRegression()

In [9]:
# Predictor matrix (the my_features columns) and regression target (price).
X1 = sales[my_features]
y1 = sales['price']

In [46]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is reproducible. test_size is given (rather than train_size alone) so
# this call matches the advanced-feature split and avoids the FutureWarning
# that scikit-learn 0.19/0.20 emits when only train_size is specified.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1, y1, test_size=0.2, random_state=0
)

In [48]:
# Sanity check: (rows, columns) of the training predictors.
X1_train.shape

(17290, 6)

In [11]:
# Fit the basic model on the training split; the cell displays the fitted estimator.
my_features_model.fit(X1_train, y1_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [12]:
# Fitted coefficients, one per column of X1_train (same order as my_features).
my_features_model.coef_

array([ -5.66006330e+04,   1.10870936e+04,   3.20408369e+02,
        -2.89499140e-01,  -2.41800491e+03,   6.17971071e+02])

In [13]:
# Fitted intercept term of the basic model.
my_features_model.intercept_

-60558259.018138468

### $R^2$ value (training set)

In [22]:
# R^2 of the basic model, evaluated on the same rows it was fitted on.
# NOTE(review): the RMSE cell uses the held-out test split; consider also
# scoring X1_test / y1_test so both metrics describe the same data.
my_features_model.score(X1_train, y1_train)

0.51814231791782528

### RMSE (test set)

In [31]:
# Root-mean-squared error of the basic model's predictions on the held-out
# test split. mean_squared_error comes from the notebook's import cell.
np.sqrt(mean_squared_error(y1_test, my_features_model.predict(X1_test)))

244004.77443104106

## Advanced feature model

In [36]:
# The six basic predictors followed by twelve additional columns.
advanced_features = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode',
    'condition',      # condition of the house
    'grade',          # measure of construction quality
    'waterfront',     # waterfront property
    'view',           # type of view
    'sqft_above',     # square feet above ground
    'sqft_basement',  # square feet in the basement
    'yr_built',       # year built
    'yr_renovated',   # year renovated
    'lat', 'long',    # lat-long of the parcel
    'sqft_living15',  # average sq. ft. of the 15 nearest neighbors
    'sqft_lot15',     # average lot size of the 15 nearest neighbors
]

In [37]:
# Unfitted linear regression estimator for the advanced_features predictors.
advanced_features_model = linear_model.LinearRegression()

In [38]:
# Predictor matrix (the advanced_features columns) and regression target (price).
X2 = sales[advanced_features]
y2 = sales['price']

In [39]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is reproducible.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2, test_size=0.2, random_state=0
)

In [49]:
# Sanity check: (rows, columns) of the training predictors.
X2_train.shape

(17290, 18)

In [40]:
# Fit the advanced model on the training split; the cell displays the fitted estimator.
advanced_features_model.fit(X2_train,y2_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [41]:
# Fitted coefficients, one per column of X2_train (same order as advanced_features).
advanced_features_model.coef_

array([ -3.46742831e+04,   3.94310007e+04,   1.12610341e+02,
         1.75888737e-01,   7.42456845e+03,  -5.67468805e+02,
         2.61695183e+04,   9.35901502e+04,   6.05600529e+05,
         5.27368893e+04,   7.27875900e+01,   3.98227500e+01,
        -2.60713698e+03,   2.01053038e+01,   6.04865961e+05,
        -2.22392029e+05,   2.42333853e+01,  -4.67450072e-01])

In [42]:
# Fitted intercept term of the advanced model.
advanced_features_model.intercept_

4166133.0203825179

### $R^2$ value (training set)

In [43]:
# R^2 of the advanced model, evaluated on the same rows it was fitted on.
# NOTE(review): the RMSE cell uses the held-out test split; consider also
# scoring X2_test / y2_test so both metrics describe the same data.
advanced_features_model.score(X2_train, y2_train)

0.70053493133500289

### RMSE (test set)

In [44]:
# Root-mean-squared error of the advanced model's predictions on the held-out
# test split. mean_squared_error is already imported earlier in the notebook,
# so the duplicate import is dropped.
np.sqrt(mean_squared_error(y2_test, advanced_features_model.predict(X2_test)))

190473.37570967819