In [1]:
import pandas as pd
import numpy as np
from math import log
from sklearn.linear_model import LinearRegression

dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 
              'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 
              'sqft_lot15':float, 'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 'date':str, 
              'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

In [40]:
X_train = pd.read_csv("kc_house_train_data.csv",dtype=dtype_dict)
X_test = pd.read_csv("kc_house_test_data.csv",dtype=dtype_dict)

In [41]:
X_train['bedrooms_squared'] = X_train['bedrooms']*X_train['bedrooms']
X_train['bed_bath_rooms'] = X_train['bedrooms']*X_train['bathrooms']
X_train['log_sqft_living'] = X_train['sqft_living'].apply(lambda x: log(x))
X_train['lat_plus_long'] = X_train['lat']*X_train['long']
y_train = X_train['price']
X_test['bedrooms_squared'] = X_test['bedrooms']*X_test['bedrooms']
X_test['bed_bath_rooms'] = X_test['bedrooms']*X_test['bathrooms']
X_test['log_sqft_living'] = X_test['sqft_living'].apply(lambda x: log(x))
X_test['lat_plus_long'] = X_test['lat']+X_test['long']
y_test = X_test['price']

new_variable = ['bedrooms_squared','bed_bath_rooms','log_sqft_living','lat_plus_long']


In [42]:
feature = ['sqft_living','bedrooms','bathrooms','lat','long']
feature2 = feature + ['bed_bath_rooms']
feature3 = feature2 + ['bedrooms_squared','log_sqft_living','lat_plus_long']

model1 = LinearRegression().fit(X_train[feature],X_train['price'])
model2 = LinearRegression().fit(X_train[feature2],X_train['price'])
model3 = LinearRegression().fit(X_train[feature3],X_train['price'])

In [55]:
print("Model1 coefficient (w):", model1.coef_)
print("")
print("Model2 coefficient (w):", model2.coef_)
print("")
print("Model3 coefficient (w):", model3.coef_)


Model1 coefficient (w): [  3.12258646e+02  -5.95865332e+04   1.57067421e+04   6.58619264e+05
  -3.09374351e+05]

Model2 coefficient (w): [  3.06610053e+02  -1.13446368e+05  -7.14613083e+04   6.54844630e+05
  -2.94298969e+05   2.55796520e+04]

Model3 coefficient (w): [  5.31964492e+02   3.66338204e+04   6.75006294e+04  -1.39665060e+08
   5.43198511e+07  -9.02007090e+03  -6.96138493e+03  -5.61309405e+05
  -1.14822353e+06]


In [43]:
def get_RSS(model, X_data, y_data):
    prediction = model.predict(X_data)
    residual = y_data - prediction
    RSS = (residual **2).sum()
    return RSS

In [49]:
RSS_train_1 = get_RSS(model1,X_train[feature],y_train)
RSS_train_2 = get_RSS(model2,X_train[feature2],y_train)
RSS_train_3 = get_RSS(model3,X_train[feature3],y_train)

print("RSS_1:",RSS_train_1)
print("RSS_2:",RSS_train_2)
print("RSS_3:",RSS_train_3)
print("")
print("-------------------------")
print("")
prediction1 = model1.predict(X_test[feature].iloc[10].values.reshape(1,-1))
prediction2 = model2.predict(X_test[feature2].iloc[10].values.reshape(1,-1))
prediction3 = model3.predict(X_test[feature3].iloc[10].values.reshape(1,-1))

print("Prediction_1:",prediction1)
print("Prediction_2:",prediction2)
print("Prediction_3:",prediction3)

RSS_1: 967879963049549.5
RSS_2: 958419635074071.5
RSS_3: 895927073789704.5

-------------------------

Prediction_1: [ 284545.19411053]
Prediction_2: [ 283331.49956516]
Prediction_3: [ -6.55099336e+09]
