In [31]:
import pandas as pd
import numpy as np
from math import log
# import regression tools
import sys
sys.path.append(r'../')
import RegressionTools as reg

In [32]:
# Column name -> dtype mapping handed to pd.read_csv so every column is parsed
# with an explicit type instead of relying on inference.
# NOTE(review): 'floors' is loaded as str rather than a numeric type -- confirm
# this is intentional before using it as a numeric feature.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': str,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

In [33]:
# Read the training and test CSV files (paths are relative to the notebook's
# working directory), parsing both with the same explicit dtype mapping.
kc_house_train_data, kc_house_test_data = (
    pd.read_csv(csv_path, dtype=dtype_dict)
    for csv_path in ('kc_house_train_data.csv', 'kc_house_test_data.csv')
)

In [34]:
# Example fit: regress 'price' on three features of the training data using the
# project-local RegressionTools helper. The fitted coefficients are left as the
# cell's last expression so the notebook displays them.
example_features = ['sqft_living', 'bedrooms', 'bathrooms']
example_coeffs = reg.multiple_linear_regression(
    kc_house_train_data,
    example_features,
    'price',
)
example_coeffs

array([   308.93552416, -44362.93724549,  19283.82752088])

In [35]:
# Predict the price of the row labelled 0 in the training data with the example
# coefficients. The selected feature values are arranged as a column of shape
# (n_features, 1), so the dot product yields a one-element array.
Ht = np.asarray(
    kc_house_train_data.loc[0, example_features], dtype=float
).reshape(-1, 1)
example_coeffs.dot(Ht)

array([ 250738.93429136])

### Create new features

In [36]:
# Derive four extra features on the training data from existing columns.
# Whole-column (vectorized) operations are used rather than per-row lambdas.
bedrooms_train = kc_house_train_data['bedrooms']
kc_house_train_data['bedrooms_squared'] = bedrooms_train ** 2
kc_house_train_data['bed_bath_rooms']   = bedrooms_train * kc_house_train_data['bathrooms']
# Natural log, as with math.log. Unlike math.log, np.log does not raise on
# non-positive input; it yields -inf/nan with a RuntimeWarning instead.
kc_house_train_data['log_sqft_living']  = np.log(kc_house_train_data['sqft_living'])
kc_house_train_data['lat_plus_long']    = kc_house_train_data['lat'] + kc_house_train_data['long']

In [37]:
# Derive four extra features on the test data from existing columns.
# Whole-column (vectorized) operations are used rather than per-row lambdas.
bedrooms_test = kc_house_test_data['bedrooms']
kc_house_test_data['bedrooms_squared'] = bedrooms_test ** 2
kc_house_test_data['bed_bath_rooms']   = bedrooms_test * kc_house_test_data['bathrooms']
# Natural log, as with math.log. Unlike math.log, np.log does not raise on
# non-positive input; it yields -inf/nan with a RuntimeWarning instead.
kc_house_test_data['log_sqft_living']  = np.log(kc_house_test_data['sqft_living'])
kc_house_test_data['lat_plus_long']    = kc_house_test_data['lat'] + kc_house_test_data['long']

In [38]:
# Report the test-set mean of each engineered column, rounded to two decimals.
# Each format string carries its own padding so the printed values line up.
for line_format, column in [
    ('bedrooms_squared mean: %.2f', 'bedrooms_squared'),
    ('bed_bath_rooms mean:   %.2f', 'bed_bath_rooms'),
    ('log_sqft_living mean:  %.2f', 'log_sqft_living'),
    ('lat_plus_long mean:    %.2f', 'lat_plus_long'),
]:
    print(line_format % kc_house_test_data[column].mean())

bedrooms_squared mean: 12.45
bed_bath_rooms mean:   7.50
log_sqft_living mean:  7.55
lat_plus_long mean:    -74.65


### Learning multiple models

In [39]:
# Three nested feature sets: each model keeps every feature of the previous one
# and appends more. Each list is a fresh copy, so extending one never alters
# the list it was built from.
model_1_features = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']

model_2_features = list(model_1_features)
model_2_features.append('bed_bath_rooms')

model_3_features = list(model_2_features)
model_3_features.extend(['bedrooms_squared', 'log_sqft_living', 'lat_plus_long'])

In [40]:
# Fit model 1 on the training data with 'price' as the target, then display the
# coefficients as a one-column table indexed by feature name.
model_1_coeffs = reg.multiple_linear_regression(
    kc_house_train_data,
    model_1_features,
    'price',
)
pd.DataFrame(data=model_1_coeffs, index=model_1_features)

Unnamed: 0,0
sqft_living,300.963659
bedrooms,-59554.887988
bathrooms,5321.077302
lat,532596.984132
long,206418.679603


In [41]:
# Fit model 2 on the training data with 'price' as the target, then display the
# coefficients as a one-column table indexed by feature name.
model_2_coeffs = reg.multiple_linear_regression(
    kc_house_train_data,
    model_2_features,
    'price',
)
pd.DataFrame(data=model_2_coeffs, index=model_2_features)

Unnamed: 0,0
sqft_living,293.999642
bedrooms,-130492.300057
bathrooms,-109046.39602
lat,532930.482043
long,204561.517539
bed_bath_rooms,33689.673259


In [42]:
# Fit model 3 on the training data with 'price' as the target, then display the
# coefficients as a one-column table indexed by feature name.
model_3_coeffs = reg.multiple_linear_regression(
    kc_house_train_data,
    model_3_features,
    'price',
)
pd.DataFrame(data=model_3_coeffs, index=model_3_features)

Unnamed: 0,0
sqft_living,646.075415
bedrooms,-30922.613674
bathrooms,140385.970983
lat,455709.54081
long,95594.4595
bed_bath_rooms,-11454.291836
bedrooms_squared,-2309.203673
log_sqft_living,-633873.612071
lat_plus_long,81126.689341


### Comparing multiple models

In [43]:
# Residual sum of squares of each model, evaluated on the training data.
# The models are evaluated in order (1, 2, 3) via the RegressionTools helper.
model_1_RSS, model_2_RSS, model_3_RSS = (
    reg.get_residual_sum_of_squares_multiple_models(
        kc_house_train_data, features, 'price', coeffs
    )
    for features, coeffs in (
        (model_1_features, model_1_coeffs),
        (model_2_features, model_2_coeffs),
        (model_3_features, model_3_coeffs),
    )
)
print('RSS results: \nmodel 1 = %.4E, model 2 = %.4E, model 3 = %.4E' % (model_1_RSS, model_2_RSS, model_3_RSS))

RSS results: 
model 1 = 1.0658E+15, model 2 = 1.0493E+15, model 3 = 1.2315E+15


In [44]:
# Residual sum of squares of each model, evaluated on the TEST data.
# NOTE: this rebinds model_1_RSS / model_2_RSS / model_3_RSS, replacing the
# training-set values computed in the previous cell.
model_1_RSS, model_2_RSS, model_3_RSS = (
    reg.get_residual_sum_of_squares_multiple_models(
        kc_house_test_data, features, 'price', coeffs
    )
    for features, coeffs in (
        (model_1_features, model_1_coeffs),
        (model_2_features, model_2_coeffs),
        (model_3_features, model_3_coeffs),
    )
)
print('RSS results: \nmodel 1 = %.4E, model 2 = %.4E, model 3 = %.4E' % (model_1_RSS, model_2_RSS, model_3_RSS))

RSS results: 
model 1 = 2.5046E+14, model 2 = 2.4657E+14, model 3 = 3.0765E+14
