### What relates to the prices of the houses?

In [77]:
import numpy as np
import pandas as pd
import sys
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import seaborn as sns
import statsmodels.api as sm

%matplotlib inline

df = pd.read_csv('kc_house_data.csv')
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [78]:
df.view.value_counts()

0    19489
2      963
3      510
1      332
4      319
Name: view, dtype: int64

In [79]:
df.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')

Let's add some more columns to the data

In [80]:
df['has_basement'] = (df['sqft_basement'] != 0)*1

In [81]:
df['more_than_1_floor'] = (df['floors'] > 1)*1

In [82]:
df['has_been_renovated'] = (df['yr_renovated'] !=0)*1
df.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15', 'has_basement',
       'more_than_1_floor', 'has_been_renovated'],
      dtype='object')

In [91]:
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,has_basement,more_than_1_floor,has_been_renovated
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,1955,0,98178,47.5112,-122.257,1340,5650,0,0,0
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,1951,1991,98125,47.721,-122.319,1690,7639,1,1,1
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,1933,0,98028,47.7379,-122.233,2720,8062,0,0,0
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,1965,0,98136,47.5208,-122.393,1360,5000,1,0,0
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,1987,0,98074,47.6168,-122.045,1800,7503,0,0,0


In [83]:
# define
X = df[['bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view',
       'sqft_above', 'sqft_basement', 'yr_built',
       'lat', 'long', 'has_basement',
       'more_than_1_floor', 'has_been_renovated']]
y = df.price

# split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state=42)

# instantiate
lm = LinearRegression(normalize=True)

# fit
lm.fit(X_train, y_train)

# predict
y_test_preds = lm.predict(X_test) 

# score
r2_score(y_test, y_test_preds)

0.653870740918853

Let's try using statesmodel

In [90]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
knn_preds = knn.predict(X_test)
r2_score(y_test, knn_preds)
# i just realized knn is a bad choice for this analisis since knn is used for classification rather than regression

0.21385408012266494

In [59]:
df['intercept'] = 1
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,has_basement,more_than_1_floor,has_been_renovated,intercept
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,0,98178,47.5112,-122.257,1340,5650,0,0,0,1
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,1991,98125,47.721,-122.319,1690,7639,1,1,1,1
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,0,98028,47.7379,-122.233,2720,8062,0,0,0,1
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,0,98136,47.5208,-122.393,1360,5000,1,0,0,1
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,0,98074,47.6168,-122.045,1800,7503,0,0,0,1


In [60]:
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,has_basement,more_than_1_floor,has_been_renovated,intercept
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,0,98178,47.5112,-122.257,1340,5650,0,0,0,1
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,1991,98125,47.721,-122.319,1690,7639,1,1,1,1
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,0,98028,47.7379,-122.233,2720,8062,0,0,0,1
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,0,98136,47.5208,-122.393,1360,5000,1,0,0,1
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,0,98074,47.6168,-122.045,1800,7503,0,0,0,1


In [61]:
lm2 = sm.OLS(df.price, df[['bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view',
       'sqft_above', 'sqft_basement', 'yr_built',
       'lat', 'long', 'has_basement',
       'more_than_1_floor', 'has_been_renovated']])

In [62]:
results = lm2.fit()
results.summary()

0,1,2,3
Dep. Variable:,price,R-squared (uncentered):,0.889
Model:,OLS,Adj. R-squared (uncentered):,0.888
Method:,Least Squares,F-statistic:,12300.0
Date:,"Tue, 21 Jan 2020",Prob (F-statistic):,0.0
Time:,02:26:48,Log-Likelihood:,-296340.0
No. Observations:,21613,AIC:,592700.0
Df Residuals:,21599,BIC:,592800.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
bedrooms,-4.598e+04,2032.723,-22.619,0.000,-5e+04,-4.2e+04
bathrooms,6.157e+04,3529.154,17.445,0.000,5.46e+04,6.85e+04
sqft_living,157.2316,2.433,64.614,0.000,152.462,162.001
sqft_lot,-0.2798,0.037,-7.584,0.000,-0.352,-0.207
floors,1.036e+05,7233.989,14.318,0.000,8.94e+04,1.18e+05
waterfront,5.656e+05,1.88e+04,30.089,0.000,5.29e+05,6.02e+05
view,6.788e+04,2263.281,29.991,0.000,6.34e+04,7.23e+04
sqft_above,124.1913,2.990,41.531,0.000,118.330,130.053
sqft_basement,33.0404,4.512,7.322,0.000,24.196,41.885

0,1,2,3
Omnibus:,15437.685,Durbin-Watson:,1.993
Prob(Omnibus):,0.0,Jarque-Bera (JB):,809588.945
Skew:,2.877,Prob(JB):,0.0
Kurtosis:,32.426,Cond. No.,1.84e+17


In [None]:
#The p-values are very low for most features, should I conclude that the variables are significant for predicting the price? 
#the r-squared also seems pretty accurate for me, aproximately 0.9
