# Regression Week 2: Multiple Regression (Interpretation)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
%matplotlib inline

# Load in house sales data

Dataset is from house sales in King County, the region where the city of Seattle, WA is located.

In [3]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}


In [4]:
data = pd.read_csv('week1/kc_house_data.csv', dtype=dtype_dict)

# Split data into training and testing.

In [5]:
# to split data into train and test set 
#from sklearn.cross_validation import train_test_split
#train, test = train_test_split(data, test_size = 0.2)

# for this exercise, we load train and test set
train = pd.read_csv('week1/kc_house_train_data.csv', dtype=dtype_dict)
test = pd.read_csv('week1/kc_house_test_data.csv', dtype=dtype_dict)

In [6]:
len(train), len(test)

(17384, 4229)

In [7]:
train.head(2)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900,3,1.0,1180,5650,1,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000,3,2.25,2570,7242,2,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639


In [8]:
# add some columns to train data
train['bedrooms_squared'] = train['bedrooms'].apply(lambda x: x**2)
train['bed_bath_rooms'] = train['bedrooms']*train['bathrooms']
train['log_sqft_living'] = np.log(train['sqft_living'])
train['lat_plus_long'] = train['lat']+train['long']

In [9]:
train.head(2)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,bedrooms_squared,bed_bath_rooms,log_sqft_living,lat_plus_long
0,7129300520,20141013T000000,221900,3,1.0,1180,5650,1,0,0,...,0,98178,47.5112,-122.257,1340,5650,9,3.0,7.07327,-74.7458
1,6414100192,20141209T000000,538000,3,2.25,2570,7242,2,0,0,...,1991,98125,47.721,-122.319,1690,7639,9,6.75,7.851661,-74.598


In [10]:
# add the same columns to test data
test['bedrooms_squared'] = test['bedrooms'].apply(lambda x: x**2)
test['bed_bath_rooms'] = test['bedrooms']*test['bathrooms']
test['log_sqft_living'] = np.log(test['sqft_living'])
test['lat_plus_long'] = test['lat']+test['long']

In [11]:
# what are the means of the following columns:
test.loc[:,['bedrooms_squared','bed_bath_rooms','log_sqft_living','lat_plus_long']].mean()

bedrooms_squared    12.446678
bed_bath_rooms       7.503902
log_sqft_living      7.550275
lat_plus_long      -74.653334
dtype: float64

# Multiple regression model

Use train data to generate the following models:
* Model 1: ‘sqft_living’, ‘bedrooms’, ‘bathrooms’, ‘lat’, and ‘long’
* Model 2: ‘sqft_living’, ‘bedrooms’, ‘bathrooms’, ‘lat’,‘long’, and ‘bed_bath_rooms’
* Model 3: ‘sqft_living’, ‘bedrooms’, ‘bathrooms’, ‘lat’,‘long’, ‘bed_bath_rooms’, ‘bedrooms_squared’, ‘log_sqft_living’, and ‘lat_plus_long’

In [12]:
from sklearn.linear_model import LinearRegression

In [13]:
x1 = train.loc[:,['sqft_living', 'bedrooms', 'bathrooms', 'lat','long']].values
x2 = train.loc[:,['sqft_living', 'bedrooms', 'bathrooms', 'lat','long', 'bed_bath_rooms']].values
x3 = train.loc[:,['sqft_living', 'bedrooms', 'bathrooms', 'lat','long', 'bed_bath_rooms', 'bedrooms_squared','log_sqft_living', 'lat_plus_long']].values
y = train[['price']].values

In [14]:
def linear_regression(input, output):
    """linear regression
    input: x, 
    ouput: y,
    return: (intercept, slope)
    """
    slr = LinearRegression()
    slr.fit(input,output)
    print('coefficient: '+str(slr.coef_))
    print('Intercept: %.3f'% slr.intercept_)
    print('r^2: %.3f'% slr.score(input, output, sample_weight=None))
    return (slr, slr.intercept_, slr.coef_)

In [15]:
linear_regression(x1, y)

coefficient: [[  3.12258646e+02  -5.95865332e+04   1.57067421e+04   6.58619264e+05
   -3.09374351e+05]]
Intercept: -69075726.793
r^2: 0.593


(LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
 array([-69075726.79256983]),
 array([[  3.12258646e+02,  -5.95865332e+04,   1.57067421e+04,
           6.58619264e+05,  -3.09374351e+05]]))

In [16]:
linear_regression(x2, y)

coefficient: [[  3.06610053e+02  -1.13446368e+05  -7.14613083e+04   6.54844630e+05
   -2.94298969e+05   2.55796520e+04]]
Intercept: -66867968.871
r^2: 0.597


(LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
 array([-66867968.87107886]),
 array([[  3.06610053e+02,  -1.13446368e+05,  -7.14613083e+04,
           6.54844630e+05,  -2.94298969e+05,   2.55796520e+04]]))

In [17]:
linear_regression(x3, y)

coefficient: [[  5.29422820e+02   3.45142296e+04   6.70607813e+04   5.34085611e+05
   -4.06750711e+05  -8.57050439e+03  -6.78858667e+03  -5.61831484e+05
    1.27334900e+05]]
Intercept: -62036084.986
r^2: 0.620


(LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
 array([-62036084.98609828]),
 array([[  5.29422820e+02,   3.45142296e+04,   6.70607813e+04,
           5.34085611e+05,  -4.06750711e+05,  -8.57050439e+03,
          -6.78858667e+03,  -5.61831484e+05,   1.27334900e+05]]))

In [18]:
# calculating RSS
m1_price_pred = linear_regression(x1, y)[0].predict(x1)
m2_price_pred = linear_regression(x2, y)[0].predict(x2)
m3_price_pred = linear_regression(x3, y)[0].predict(x3)

coefficient: [[  3.12258646e+02  -5.95865332e+04   1.57067421e+04   6.58619264e+05
   -3.09374351e+05]]
Intercept: -69075726.793
r^2: 0.593
coefficient: [[  3.06610053e+02  -1.13446368e+05  -7.14613083e+04   6.54844630e+05
   -2.94298969e+05   2.55796520e+04]]
Intercept: -66867968.871
r^2: 0.597
coefficient: [[  5.29422820e+02   3.45142296e+04   6.70607813e+04   5.34085611e+05
   -4.06750711e+05  -8.57050439e+03  -6.78858667e+03  -5.61831484e+05
    1.27334900e+05]]
Intercept: -62036084.986
r^2: 0.620


In [19]:
def rss(predict, actual):
    return np.sum((predict-actual)**2)

In [20]:
# rss of Train data
rss(m1_price_pred,train[['price']].values), rss(m2_price_pred,train[['price']].values),rss(m3_price_pred,train[['price']].values)

(967879963049545.75, 958419635074070.0, 903436455050479.0)

In [21]:
# work on Test data
tx1 = test.loc[:,['sqft_living', 'bedrooms', 'bathrooms', 'lat','long']].values
tx2 = test.loc[:,['sqft_living', 'bedrooms', 'bathrooms', 'lat','long', 'bed_bath_rooms']].values
tx3 = test.loc[:,['sqft_living', 'bedrooms', 'bathrooms', 'lat','long', 'bed_bath_rooms', 'bedrooms_squared','log_sqft_living', 'lat_plus_long']].values
ty = test[['price']].values

In [22]:
t1_price_pred = linear_regression(tx1, ty)[0].predict(tx1)
t2_price_pred = linear_regression(tx2, ty)[0].predict(tx2)
t3_price_pred = linear_regression(tx3, ty)[0].predict(tx3)

coefficient: [[  2.91523046e+02  -3.63442805e+04   2.24784634e+04   6.49781148e+05
   -3.22967664e+05]]
Intercept: -70365461.083
r^2: 0.584
coefficient: [[  2.84109954e+02  -9.38571172e+04  -7.73844701e+04   6.46266392e+05
   -3.05020252e+05   2.91947505e+04]]
Intercept: -67803197.056
r^2: 0.589
coefficient: [[  4.69716691e+02  -3.28857029e+04   5.04637585e+04   5.25504326e+05
   -4.08891578e+05  -2.28592380e+03   1.08100171e+03  -4.33486888e+05
    1.16612748e+05]]
Intercept: -63414968.747
r^2: 0.606


In [23]:
# rss of Test data
rss(t1_price_pred,test[['price']].values), rss(t2_price_pred,test[['price']].values),rss(t3_price_pred,test[['price']].values)

(223588470558143.28, 220888287427809.19, 211457519132479.25)