# Regression Week 2: Multiple Linear Regression Assignment 1


In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from math import log

from sklearn.linear_model import LinearRegression

In [8]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 
              'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 
              'sqft_lot15':float, 'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 'date':str, 
              'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

In [9]:
df = pd.read_csv("kc_house_data.csv",dtype=dtype_dict)
X_train = pd.read_csv("kc_house_train_data.csv",dtype=dtype_dict)
X_test = pd.read_csv("kc_house_test_data.csv",dtype=dtype_dict)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

#### 3. Although we often think of multiple regression as including multiple different features (e.g. # of bedrooms, square feet, and # of bathrooms) but we can also consider transformations of existing variables e.g. the log of the square feet or even "interaction" variables such as the product of bedrooms and bathrooms. Add 4 new variables in both your train_data and test_data.

- ‘bedrooms_squared’ = ‘bedrooms’*‘bedrooms’
- ‘bed_bath_rooms’ = ‘bedrooms’*‘bathrooms’
- ‘log_sqft_living’ = log(‘sqft_living’)
- ‘lat_plus_long’ = ‘lat’ + ‘long’


In [40]:
X_train['bedrooms_squared'] = X_train['bedrooms']*X_train['bedrooms']
X_train['bed_bath_rooms'] = X_train['bedrooms']*X_train['bathrooms']
X_train['log_sqft_living'] = X_train['sqft_living'].apply(lambda x: log(x))
X_train['lat_plus_long'] = X_train['lat']*X_train['long']
y_train = X_train['price']
X_test['bedrooms_squared'] = X_test['bedrooms']*X_test['bedrooms']
X_test['bed_bath_rooms'] = X_test['bedrooms']*X_test['bathrooms']
X_test['log_sqft_living'] = X_test['sqft_living'].apply(lambda x: log(x))
X_test['lat_plus_long'] = X_test['lat']+X_test['long']
y_test = X_test['price']

#### 4. Quiz Question: what are the mean (arithmetic average) values of your 4 new variables on TEST data? (round to 2 digits)

In [41]:
new_variable = ['bedrooms_squared','bed_bath_rooms','log_sqft_living','lat_plus_long']
X_test[new_variable].mean()

bedrooms_squared    12.446678
bed_bath_rooms       7.503902
log_sqft_living      7.550275
lat_plus_long      -74.653334
dtype: float64

#### 5. Use graphlab.linear_regression.create (or any other regression library/function) to estimate the regression coefficients/weights for predicting ‘price’ for the following three models:(In all 3 models include an intercept -- most software does this by default).

- Model 1: ‘sqft_living’, ‘bedrooms’, ‘bathrooms’, ‘lat’, and ‘long’
- Model 2: ‘sqft_living’, ‘bedrooms’, ‘bathrooms’, ‘lat’,‘long’, and ‘bed_bath_rooms’
- Model 3: ‘sqft_living’, ‘bedrooms’, ‘bathrooms’, ‘lat’,‘long’, ‘bed_bath_rooms’, ‘bedrooms_squared’, ‘log_sqft_living’, and ‘lat_plus_long’

You’ll note that the three models here are “nested” in that all of the features of the Model 1 are in Model 2 and all of the features of Model 2 are in Model 3.

If you use graphlab.linear_regression.create() to estimate these models please ensure that you set validation_set = None. This way you will get the same answer every time you run the code.

Learn all three models on the TRAINING data set. Save your model results for quiz questions later.

#### 6. Quiz Question: What is the sign (positive or negative) for the coefficient/weight for ‘bathrooms’ in Model 1?




In [42]:
feature = ['sqft_living','bedrooms','bathrooms','lat','long']
feature2 = feature + ['bed_bath_rooms']
feature3 = feature2 + ['bedrooms_squared','log_sqft_living','lat_plus_long']

model1 = LinearRegression().fit(X_train[feature],X_train['price'])

In [43]:
model1.coef_[2]

15706.742082734634

In [44]:
model1.predict(X_train[feature])
# print(X_train[feature].shape, ' X')
# print(y_train.shape, " y")

array([ 244657.18811044,  855689.66538487,  318101.67899466, ...,
        528928.42823838,  356549.38348044,  317948.91207276])

#### 7. Quiz Question: What is the sign (positive or negative) for the coefficient/weight for ‘bathrooms’ in Model 2?


In [45]:
model2 = LinearRegression().fit(X_train[feature2],y_train)
model3 = LinearRegression().fit(X_train[feature3],y_train)

In [46]:
model2.coef_[2]


-71461.308292759655

#### 8. Is the sign for the coefficient the same in both models? Think about why this might be the case.



In [47]:
def get_RSS(model, X_data, y_data):
    prediction = model.predict(X_data)
    residual = y_data - prediction
    RSS = (residual **2).sum()
    return RSS



#### 9. Now using your three estimated models compute the RSS (Residual Sum of Squares) on the Training data.

#### 10. Quiz Question: Which model (1, 2 or 3) had the lowest RSS on TRAINING data?




In [48]:
RSS_train_1 = get_RSS(model1,X_train[feature],y_train)
RSS_train_2 = get_RSS(model2,X_train[feature2],y_train)
RSS_train_3 = get_RSS(model3,X_train[feature3],y_train)

print(RSS_train_1, " ", RSS_train_2, " ",RSS_train_3)
min(RSS_train_1,RSS_train_2,RSS_train_3)

967879963049549.5   958419635074067.6   895927073789701.9


895927073789701.9

#### 11. Now using your three estimated models compute the RSS on the Testing data

#### 12. Quiz Question: Which model (1, 2, or 3) had the lowest RSS on TESTING data?

#### 13. Did you get the same answer for 9 and 11? Think about why this might be the case.

In [49]:
RSS_test_1 = get_RSS(model1,X_test[feature],y_test)
RSS_test_2 = get_RSS(model2,X_test[feature2],y_test)
RSS_test_3 = get_RSS(model3,X_test[feature3],y_test)

print(RSS_test_1, " ", RSS_test_2, " ",RSS_test_3)
min(RSS_test_1,RSS_test_2,RSS_test_3)

225500469795490.4   223377462976466.88   1.8359814427114747e+23


223377462976466.88