In [1]:
#Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
%matplotlib inline

In [2]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 
              'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 
              'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 
              'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}
kc_house=pd.read_csv('kc_house_data.csv',dtype=dtype_dict)
kc_house_train=pd.read_csv('kc_house_train_data.csv',dtype=dtype_dict)
kc_house_test=pd.read_csv('kc_house_test_data.csv',dtype=dtype_dict)
kc_house_train.head(1)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3.0,1.0,1180.0,5650,1,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340.0,5650.0


In [5]:
# Creating new features
# Square is used to increase the increase of a value
# Log is used to decrease the scale of values
# Interactive features have a greater effect together 
kc_house_test['bed_bath_rooms']=kc_house_test['bedrooms']*kc_house_test['bathrooms']
kc_house_test['lat_plus_long']=kc_house_test['lat']+kc_house_test['long']
kc_house_test['bedrooms_squared']=kc_house_test['bedrooms']*kc_house_test['bedrooms']
kc_house_test['log_sqft_living']=np.log(kc_house_test['sqft_living'])

In [6]:
kc_house_train['bed_bath_rooms']=kc_house_train['bedrooms']*kc_house_train['bathrooms']
kc_house_train['lat_plus_long']=kc_house_train['lat']+kc_house_train['long']
kc_house_train['bedrooms_squared']=kc_house_train['bedrooms']*kc_house_train['bedrooms']
kc_house_train['log_sqft_living']=np.log(kc_house_train['sqft_living'])

In [74]:
#Mean of bed_bath_rooms
np.mean(kc_house_test['bed_bath_rooms'])

7.5039016315913925

In [7]:
#Mean of lat_plus_long
np.mean(kc_house_test['lat_plus_long'])

-74.65333355403185

In [8]:
#Mean of bedrooms squared
np.mean(kc_house_test['bedrooms_squared'])

12.4466777015843

In [9]:
#Mean of log sqft_living
np.mean(kc_house_test['log_sqft_living'])

7.550274679645921

In [10]:
#Training a linear model on the multiple features
y=kc_house_train['price']
model1_X=kc_house_train[['sqft_living','bedrooms','bathrooms','lat','long']]
model1=LinearRegression()
model1_values=model1.fit(model1_X,y) #Fitting the data
model1_pred=model1.predict(model1_X) #Predicting the values
model1_coeff=pd.DataFrame(model1.coef_,model1_X.columns,columns=['Co-efficients'])#
# Creating a new dataframe for the features and thier coefficients

In [11]:
model1_coeff

Unnamed: 0,Co-efficients
sqft_living,312.258646
bedrooms,-59586.533154
bathrooms,15706.742083
lat,658619.263931
long,-309374.351268


In [12]:
#Training a linear model on the different set of features
y=kc_house_train['price']
model2_X=kc_house_train[['sqft_living','bedrooms','bathrooms','lat','long','bed_bath_rooms']]
model2=LinearRegression()
model2_values=model2.fit(model2_X,y)
model2_pred=model2.predict(model2_X)
model2_coeff=pd.DataFrame(model2.coef_,model2_X.columns,columns=['Co-efficients'])

In [13]:
model2_coeff

Unnamed: 0,Co-efficients
sqft_living,306.610053
bedrooms,-113446.36807
bathrooms,-71461.308293
lat,654844.629503
long,-294298.969138
bed_bath_rooms,25579.652001


In [14]:
#Training a linear model on the different set of features
y=kc_house_train['price']
model3_X=kc_house_train[['sqft_living','bedrooms','bathrooms','lat','long','bed_bath_rooms','bedrooms_squared', 'log_sqft_living','lat_plus_long']]
model3=LinearRegression()
model3_values=model3.fit(model3_X,y)
model3_pred=model3.predict(model3_X)
model3_coeff=pd.DataFrame(model3.coef_,model3_X.columns,columns=['Co-efficients'])

In [15]:
model3_coeff

Unnamed: 0,Co-efficients
sqft_living,529.42282
bedrooms,34514.229578
bathrooms,67060.781319
lat,534085.610867
long,-406750.710861
bed_bath_rooms,-8570.504395
bedrooms_squared,-6788.58667
log_sqft_living,-561831.484076
lat_plus_long,127334.900006


In [23]:
#Question 7
model1_RSS=np.sum(np.square(y-model1_pred))
model2_RSS=np.sum(np.square(y-model2_pred))
model3_RSS=np.sum(np.square(y-model3_pred))

print "For Training Data"
print "RSS of Model 1 is:",'%.3e' % model1_RSS,"\nRSS of Model 2 is:",'%.3e' % model2_RSS,"\nRsS of Model 3 is:", '%.3e' % model3_RSS

For Training Data
RSS of Model 1 is: 2.255e+14 
RSS of Model 2 is: 2.234e+14 
RsS of Model 3 is: 2.592e+14


In [22]:
#Similarly on test data
model1_X=kc_house_test[['sqft_living','bedrooms','bathrooms','lat','long']]
model1_pred=model1.predict(model1_X)

model2_X=kc_house_test[['sqft_living','bedrooms','bathrooms','lat','long','bed_bath_rooms']]
model2_pred=model2.predict(model2_X)

model3_X=kc_house_test[['sqft_living','bedrooms','bathrooms','lat','long','bed_bath_rooms','bedrooms_squared', 
                        'log_sqft_living','lat_plus_long']]
model3_pred=model3.predict(model3_X)

y=kc_house_test['price']

model1_RSS=np.sum(np.square(y-model1_pred))
model2_RSS=np.sum(np.square(y-model2_pred))
model3_RSS=np.sum(np.square(y-model3_pred))

print " For Test Data"
print "RSS of Model 1 is:",'%.3e' % model1_RSS,"\nRSS of Model 2 is:",'%.3e' % model2_RSS,"\nRsS of Model 3 is:", '%.3e' % model3_RSS

 For Test Data
RSS of Model 1 is: 2.255e+14 
RSS of Model 2 is: 2.234e+14 
RsS of Model 3 is: 2.592e+14


## Thank You!