In [55]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
import sklearn.metrics as metrics
from random import gauss
from mpl_toolkits.mplot3d import Axes3D
from scipy import stats as stats

%matplotlib inline

# Read in Data

In [56]:
df = pd.read_csv('data/kc_house_data.csv')

In [57]:
# Why reduce features?
# I narrowed my list of features to avoid over-fit of the training dataset
# Reducing redundancy in features increases the accuracy of the model

df = df.drop(['date','view', 'sqft_above', 'sqft_basement', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15'], axis=1)

In [58]:
df['waterfront'] = df['waterfront'].fillna(0)
df['waterfront'] = df["waterfront"].astype(int)
# Note: waterfront is our only categorical value.
# We don't need to use dummy coding or any other coding system because it's already dichotomous (1 or 0) 
df['waterfront'].unique()

array([0, 1])

In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           21597 non-null  int64  
 1   price        21597 non-null  float64
 2   bedrooms     21597 non-null  int64  
 3   bathrooms    21597 non-null  float64
 4   sqft_living  21597 non-null  int64  
 5   sqft_lot     21597 non-null  int64  
 6   floors       21597 non-null  float64
 7   waterfront   21597 non-null  int64  
 8   condition    21597 non-null  int64  
 9   grade        21597 non-null  int64  
 10  yr_built     21597 non-null  int64  
dtypes: float64(3), int64(8)
memory usage: 1.8 MB


# Price Millions

In [60]:
# Create price column in millions:
# df['price_millions'] = df['price'] / 1000000

In [61]:
format_dict = {'bedrooms': '{:.2f}', 'bathrooms': '{:.2f}', 'floors': '{:.2f}', 'sqft_living': '{:20,.2f}', 'sqft_lot': '{:20,.2f}',
               'sqft_lot': '{:20,.2f}',  'price': '${:20,.0f}', 'yr_built': '{:.0f}', 'condition': '{:.2f}', 'grade': '{:.2f}', 'waterfront': '{:.5f}'}
df[['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'condition', 'grade', 'yr_built']].describe().style.format(format_dict)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,condition,grade,yr_built
count,"$ 21,597",21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597
mean,"$ 540,297",3.37,2.12,2080.32,15099.41,1.49,0.00676,3.41,7.66,1971
std,"$ 367,368",0.93,0.77,918.11,41412.64,0.54,0.08194,0.65,1.17,29
min,"$ 78,000",1.0,0.5,370.0,520.0,1.0,0.0,1.0,3.0,1900
25%,"$ 322,000",3.0,1.75,1430.0,5040.0,1.0,0.0,3.0,7.0,1951
50%,"$ 450,000",3.0,2.25,1910.0,7618.0,1.5,0.0,3.0,7.0,1975
75%,"$ 645,000",4.0,2.5,2550.0,10685.0,2.0,0.0,4.0,8.0,1997
max,"$ 7,700,000",33.0,8.0,13540.0,1651359.0,3.5,1.0,5.0,13.0,2015


In [62]:
# Remove id because it has no predicting power, waterfront because it's all one value, and sqft_lot & year since they can't be altered.
df_model = df.drop(['waterfront','id'], axis=1)

In [63]:
df_model.columns

Index(['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'condition', 'grade', 'yr_built'],
      dtype='object')

In [65]:
# make copy of df_model for normalization purposes: 
df_norm = df_model.copy()

# Apply z-score normalization:
def z_score_norm(my_column):
    return (my_column - my_column.mean())/my_column.std()

for predictor in ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'condition', 'grade', 'yr_built']:
    df_norm[predictor] = z_score_norm(df_norm[predictor])
# print normalized data: 
print(df_norm)

          price  bedrooms  bathrooms  sqft_living  sqft_lot    floors  \
0      221900.0 -0.402894  -1.451039    -0.980629 -0.228177 -0.915531   
1      538000.0 -0.402894   0.174482     0.533357 -0.189735  0.937409   
2      180000.0 -1.482459  -1.451039    -1.427201 -0.123137 -0.915531   
3      604000.0  0.676671   1.149794    -0.131054 -0.243873 -0.915531   
4      510000.0 -0.402894  -0.150622    -0.436030 -0.169499 -0.915531   
...         ...       ...        ...          ...       ...       ...   
21592  360000.0 -0.402894   0.499586    -0.599410 -0.337298  2.790349   
21593  400000.0  0.676671   0.499586     0.250165 -0.224241  0.937409   
21594  402101.0 -1.482459  -1.776143    -1.154901 -0.332010  0.937409   
21595  400000.0 -0.402894   0.499586    -0.523166 -0.306945  0.937409   
21596  325000.0 -1.482459  -1.776143    -1.154901 -0.338626  0.937409   

       condition     grade  yr_built  
0      -0.629972 -0.560787 -0.544665  
1      -0.629972 -0.560787 -0.680835  
2     

In [66]:
# Predictors
house_pred = df_norm.drop('price', axis = 1)


### TARGET
# Price Millions
house_target = df_norm['price'] / 1000000
# Price
# house_target = df_norm['price']
house_pred.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,condition,grade,yr_built
0,-0.402894,-1.451039,-0.980629,-0.228177,-0.915531,-0.629972,-0.560787,-0.544665
1,-0.402894,0.174482,0.533357,-0.189735,0.937409,-0.629972,-0.560787,-0.680835
2,-1.482459,-1.451039,-1.427201,-0.123137,-0.915531,-0.629972,-1.413157,-1.293596
3,0.676671,1.149794,-0.131054,-0.243873,-0.915531,2.444371,-0.560787,-0.204243
4,-0.402894,-0.150622,-0.43603,-0.169499,-0.915531,-0.629972,0.291583,0.544688


In [67]:
# Add constant (AKA y-intercept):
# The constant is set to 1 as default - with means that our constant coefficient will be equal to 1*Beta(0)
# For our other variables (other Betas) will be multiplied by a particular coefficient to predict price
predictors = sm.add_constant(house_pred)
predictors.head()

Unnamed: 0,const,bedrooms,bathrooms,sqft_living,sqft_lot,floors,condition,grade,yr_built
0,1.0,-0.402894,-1.451039,-0.980629,-0.228177,-0.915531,-0.629972,-0.560787,-0.544665
1,1.0,-0.402894,0.174482,0.533357,-0.189735,0.937409,-0.629972,-0.560787,-0.680835
2,1.0,-1.482459,-1.451039,-1.427201,-0.123137,-0.915531,-0.629972,-1.413157,-1.293596
3,1.0,0.676671,1.149794,-0.131054,-0.243873,-0.915531,2.444371,-0.560787,-0.204243
4,1.0,-0.402894,-0.150622,-0.43603,-0.169499,-0.915531,-0.629972,0.291583,0.544688


In [68]:
# The order of the parameters is: endogenous response variable(dependent variable), exogenous variables(independent variables)
model = sm.OLS(house_target, predictors).fit()

In [69]:
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.618
Model:,OLS,Adj. R-squared:,0.618
Method:,Least Squares,F-statistic:,4363.0
Date:,"Sat, 07 Aug 2021",Prob (F-statistic):,0.0
Time:,16:26:00,Log-Likelihood:,1371.2
No. Observations:,21597,AIC:,-2724.0
Df Residuals:,21588,BIC:,-2653.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.5403,0.002,349.582,0.000,0.537,0.543
bedrooms,-0.0455,0.002,-23.151,0.000,-0.049,-0.042
bathrooms,0.0406,0.003,14.734,0.000,0.035,0.046
sqft_living,0.1721,0.003,54.784,0.000,0.166,0.178
sqft_lot,-0.0102,0.002,-6.439,0.000,-0.013,-0.007
floors,0.0115,0.002,5.922,0.000,0.008,0.015
condition,0.0128,0.002,7.593,0.000,0.009,0.016
grade,0.1539,0.003,58.577,0.000,0.149,0.159
yr_built,-0.1178,0.002,-57.983,0.000,-0.122,-0.114

0,1,2,3
Omnibus:,17302.265,Durbin-Watson:,1.984
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1207162.645
Skew:,3.353,Prob(JB):,0.0
Kurtosis:,39.007,Cond. No.,4.75


# Interpretation of Model Summary:
- The R-squared value explains almost 80% of the variation in the data
- The only p-value that's good is sqft_living