In [111]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline

In [112]:
# Explicit dtype for every CSV column, built up one target type at a time
# so it is easy to see which columns are read as text, integers or floats.
dtype_dict = {}
dtype_dict.update(dict.fromkeys(
    ['id', 'date', 'zipcode', 'floors'], str))
dtype_dict.update(dict.fromkeys(
    ['waterfront', 'view', 'condition', 'grade',
     'sqft_above', 'sqft_basement', 'sqft_lot',
     'yr_built', 'yr_renovated'], int))
dtype_dict.update(dict.fromkeys(
    ['price', 'bedrooms', 'bathrooms',
     'sqft_living', 'sqft_living15', 'sqft_lot15',
     'lat', 'long'], float))

In [113]:
# Load the train and test splits with the same explicit column dtypes.
# A generator expression is unpacked so no loop variable is left behind.
data_train, data_test = (
    pd.read_csv(csv_path, dtype=dtype_dict)
    for csv_path in ("data/kc_house_train_data.csv",
                     "data/kc_house_test_data.csv")
)

In [114]:
# Check that the dtype mapping was applied on load: columns requested as
# str show up as `object`, the rest as int/float. Looks ok.
data_train.dtypes

id                object
date              object
price            float64
bedrooms         float64
bathrooms        float64
sqft_living      float64
sqft_lot           int32
floors            object
waterfront         int32
view               int32
condition          int32
grade              int32
sqft_above         int32
sqft_basement      int32
yr_built           int32
yr_renovated       int32
zipcode           object
lat              float64
long             float64
sqft_living15    float64
sqft_lot15       float64
dtype: object

In [115]:
# Preview the first rows of the training frame to eyeball the parsed values
data_train.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3.0,1.0,1180.0,5650,1,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340.0,5650.0
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570.0,7242,2,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690.0,7639.0
2,5631500400,20150225T000000,180000.0,2.0,1.0,770.0,10000,1,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720.0,8062.0
3,2487200875,20141209T000000,604000.0,4.0,3.0,1960.0,5000,1,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360.0,5000.0
4,1954400510,20150218T000000,510000.0,3.0,2.0,1680.0,8080,1,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800.0,7503.0


## Feature engineering

Add 4 new variables in both your train_data and test_data.

1. bedrooms_squared = ‘bedrooms’*‘bedrooms’
2. bed_bath_rooms = ‘bedrooms’*‘bathrooms’
3. log_sqft_living = log(‘sqft_living’)
4. lat_plus_long = ‘lat’ + ‘long’

In [116]:
# Peek at the 'bedrooms' column before deriving new features from it
data_train['bedrooms'].head(n=3)

0    3.0
1    3.0
2    2.0
Name: bedrooms, dtype: float64

In [117]:
def add_engineered_features(frame):
    """
    Add the four engineered columns to ``frame`` in place and return it.

    New columns:
      - 'bedrooms_squared' : bedrooms ** 2
      - 'bed_bath_rooms'   : bedrooms * bathrooms
      - 'log_sqft_living'  : natural log of sqft_living
      - 'lat_plus_long'    : lat + long

    The frame must already contain 'bedrooms', 'bathrooms', 'sqft_living',
    'lat' and 'long'.
    """
    frame['bedrooms_squared'] = frame['bedrooms'] ** 2
    frame['bed_bath_rooms'] = frame['bedrooms'] * frame['bathrooms']
    # np.log is a ufunc, so it is applied to the whole column at once
    frame['log_sqft_living'] = np.log(frame['sqft_living'])
    frame['lat_plus_long'] = frame['lat'] + frame['long']
    return frame


# Run the same transformation on both splits so their columns always match
add_engineered_features(data_train)
add_engineered_features(data_test)

# Stack the train and test rows into one full data frame
data = pd.concat([data_train, data_test])

In [118]:
# Sanity check on the sizes of the two splits and the combined frame.
# print(...) with a single argument gives the same output on Python 2 and 3.
print(data_train.shape)
print(data_test.shape)
print(data.shape)  # Seems that the dataframes are simply added

(17384, 25)
(4229, 25)
(21613, 25)


### Let’s explain these new variables:

- Squaring bedrooms will increase the separation between not many bedrooms (e.g. 1) and lots of bedrooms (e.g. 4) since 1^2 = 1 but 4^2 = 16. Consequently this variable will mostly affect houses with many bedrooms.

- Bedrooms times bathrooms is what's called an "interaction" variable. It is large when both of them are large.

- Taking the log of square feet has the effect of bringing large values closer together and spreading out small values.

- Adding latitude to longitude is nonsensical, but we will do it anyway (you'll see why).

# Q4 Quiz Question 

What are the mean (arithmetic average) values of your 4 new variables on TEST data? (round to 2 digits)

In [119]:
# The four engineered columns whose means are reported below
features = [
    'bedrooms_squared',
    'bed_bath_rooms',
    'log_sqft_living',
    'lat_plus_long',
]
# Column-wise arithmetic mean on the TEST split only
data_test.loc[:, features].mean()

bedrooms_squared    12.446678
bed_bath_rooms       7.503902
log_sqft_living      7.550275
lat_plus_long      -74.653334
dtype: float64

# Linear regression estimation

- Model 1: ‘sqft_living’, ‘bedrooms’, ‘bathrooms’, ‘lat’, and ‘long’
- Model 2: ‘sqft_living’, ‘bedrooms’, ‘bathrooms’, ‘lat’,‘long’, and ‘bed_bath_rooms’
- Model 3: ‘sqft_living’, ‘bedrooms’, ‘bathrooms’, ‘lat’,‘long’, ‘bed_bath_rooms’, ‘bedrooms_squared’, ‘log_sqft_living’, and ‘lat_plus_long’

## Setting up the features

In [120]:
# Model 1: the base feature set
features1 = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']

# Model 2: base set plus the interaction term. `+` builds a new list,
# so features1 is left untouched.
features2 = features1 + ['bed_bath_rooms']

# Model 3: model 2's set plus the remaining engineered columns
features3 = features2 + ['bedrooms_squared', 'log_sqft_living', 'lat_plus_long']

In [121]:
def fitModel(features, target, model):
    """
    Train ``model`` on the supplied data and hand the same object back.

    features : inputs passed as the first argument to ``model.fit``
    target   : outputs passed as the second argument to ``model.fit``
    model    : any object exposing a ``fit(features, target)`` method

    Returns the ``model`` object that was passed in, after ``fit`` has run.
    """
    estimator = model
    estimator.fit(features, target)
    return estimator

In [122]:
from sklearn.linear_model import LinearRegression

## Fit all three models

In [123]:
# Response variable shared by all three regressions
target = data_train['price']

# Fit a fresh LinearRegression for each feature set, in order; the generator
# is unpacked so every fitted model keeps its own name.
model1, model2, model3 = (
    fitModel(data_train[columns], target, model=LinearRegression())
    for columns in (features1, features2, features3)
)

In [124]:
def present_coeff(featureNames, model):
    """
    Print a fitted model's intercept followed by one coefficient per feature.

    featureNames : sequence of names; the name at position i is printed next
                   to ``model.coef_[i]``
    model        : object exposing ``intercept_`` and ``coef_``

    Values are printed in scientific notation. Nothing is returned.
    """
    # print(...) with a single argument behaves the same on Python 2 and 3,
    # whereas the bare `print x` statement is a SyntaxError on Python 3.
    print("Intercept: %e" % model.intercept_)
    for position, name in enumerate(featureNames):
        print("%s: %e" % (name, model.coef_[position]))

In [125]:
# Model 1: intercept and coefficients for the features1 columns
present_coeff(features1, model1)

Intercept: -6.907573e+07
sqft_living: 3.122586e+02
bedrooms: -5.958653e+04
bathrooms: 1.570674e+04
lat: 6.586193e+05
long: -3.093744e+05


In [126]:
# Model 2: intercept and coefficients for the features2 columns
present_coeff(features2, model2)

Intercept: -6.686797e+07
sqft_living: 3.066101e+02
bedrooms: -1.134464e+05
bathrooms: -7.146131e+04
lat: 6.548446e+05
long: -2.942990e+05
bed_bath_rooms: 2.557965e+04


In [127]:
# Model 3: intercept and coefficients for the features3 columns
present_coeff(features3, model3)

Intercept: -6.203608e+07
sqft_living: 5.294228e+02
bedrooms: 3.451423e+04
bathrooms: 6.706078e+04
lat: 5.340856e+05
long: -4.067507e+05
bed_bath_rooms: -8.570504e+03
bedrooms_squared: -6.788587e+03
log_sqft_living: -5.618315e+05
lat_plus_long: 1.273349e+05


# Calculate the RSS

In [129]:
def rss(predicted, actual):
    """
    Residual sum of squares: the squared difference between each predicted
    value and the corresponding actual value, summed over all of them.
    """
    residuals = predicted - actual
    return np.sum(residuals ** 2)

### Training data

In [130]:
# Predict the training-set prices with each fitted model
pred1 = model1.predict(data_train[features1])
pred2 = model2.predict(data_train[features2])
pred3 = model3.predict(data_train[features3])

# Report each model's RSS on the training data.
# print(...) with a single argument gives the same output on Python 2 and 3.
print("Model 1 rss: %e" % rss(pred1, data_train.price))
print("Model 2 rss: %e" % rss(pred2, data_train.price))
print("Model 3 rss: %e" % rss(pred3, data_train.price))

Model 1 rss: 9.678800e+14
Model 2 rss: 9.584196e+14
Model 3 rss: 9.034365e+14


Model 3 has the lowest RSS on the training data; it is also the model with the most features.

### Testing data

In [131]:
# Predict the test-set prices with each model (fitted on the training data)
pred1 = model1.predict(data_test[features1])
pred2 = model2.predict(data_test[features2])
pred3 = model3.predict(data_test[features3])

# Report each model's RSS on the held-out test data.
# print(...) with a single argument gives the same output on Python 2 and 3.
print("Model 1 rss: %e" % rss(pred1, data_test.price))
print("Model 2 rss: %e" % rss(pred2, data_test.price))
print("Model 3 rss: %e" % rss(pred3, data_test.price))

Model 1 rss: 2.255005e+14
Model 2 rss: 2.233775e+14
Model 3 rss: 2.592363e+14


However, on the testing data, it is Model 2 that has the lowest RSS.