In [15]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
import sklearn.metrics as metrics
from random import gauss
from mpl_toolkits.mplot3d import Axes3D
from scipy import stats as stats

%matplotlib inline

# Read in Data

In [16]:
# Load the raw housing data; the path is relative to the notebook's working directory.
df = pd.read_csv('data/kc_house_data.csv')

In [17]:
# Why reduce features?
# The feature list is trimmed to reduce the risk of over-fitting the training data;
# the intent is to remove redundant features before modelling.

df = df.drop(
    columns=[
        'date',
        'view',
        'sqft_above',
        'sqft_basement',
        'yr_renovated',
        'zipcode',
        'lat',
        'long',
        'sqft_living15',
        'sqft_lot15',
    ]
)

In [18]:
# Missing waterfront flags are filled with 0 and the column is cast to int in one step.
df['waterfront'] = df['waterfront'].fillna(0).astype(int)
# Note: waterfront is treated as the only categorical predictor here.
# It is already a 0/1 indicator (see the unique values below), so no dummy coding is needed.
df['waterfront'].unique()

array([0, 1])

In [19]:
# Keep only the waterfront properties.
# `df` is rebound to this subset because every later cell (df.info(), describe(),
# df_model) reads `df`, and their recorded outputs show only waterfront == 1 rows.
# Without the rebind, a fresh Restart & Run All would analyse the full dataset.
df_water = df.loc[df['waterfront'] == 1]
df = df_water

In [20]:
# Inspect column dtypes and non-null counts of the working frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 146 entries, 49 to 21560
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           146 non-null    int64  
 1   price        146 non-null    float64
 2   bedrooms     146 non-null    int64  
 3   bathrooms    146 non-null    float64
 4   sqft_living  146 non-null    int64  
 5   sqft_lot     146 non-null    int64  
 6   floors       146 non-null    float64
 7   waterfront   146 non-null    int64  
 8   condition    146 non-null    int64  
 9   grade        146 non-null    int64  
 10  yr_built     146 non-null    int64  
dtypes: float64(3), int64(8)
memory usage: 13.7 KB


# Price Millions

In [8]:
# Create price column in millions (left disabled: the conversion to millions is applied when the regression target is built instead):
# df['price_millions'] = df['price'] / 1000000

In [23]:
# Display format for each column of the summary table (one entry per column;
# the original listed 'sqft_lot' twice).
format_dict = {
    'price': '${:20,.0f}',
    'bedrooms': '{:.2f}',
    'bathrooms': '{:.2f}',
    'sqft_living': '{:20,.2f}',
    'sqft_lot': '{:20,.2f}',
    'floors': '{:.2f}',
    'waterfront': '{:.5f}',
    'condition': '{:.2f}',
    'grade': '{:.2f}',
    'yr_built': '{:.0f}',
}
# Summary statistics for every column except id, rendered with the formats above.
df[['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'condition', 'grade', 'yr_built']].describe().style.format(format_dict)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,condition,grade,yr_built
count,$ 146,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146
mean,"$ 1,717,215",3.35,2.71,3244.75,25870.88,1.63,1.0,3.54,8.84,1962
std,"$ 1,145,385",1.1,1.12,1652.13,44629.08,0.55,0.0,0.74,1.78,27
min,"$ 285,000",1.0,0.75,440.0,1989.0,1.0,1.0,1.0,5.0,1905
25%,"$ 827,500",3.0,1.81,2082.5,11692.25,1.0,1.0,3.0,8.0,1941
50%,"$ 1,510,000",3.0,2.5,2900.0,17730.5,2.0,1.0,3.0,9.0,1960
75%,"$ 2,282,500",4.0,3.25,4117.5,26692.5,2.0,1.0,4.0,10.0,1985
max,"$ 7,060,000",6.0,6.75,10040.0,505166.0,3.0,1.0,5.0,12.0,2014


In [26]:
# Remove id because it has no predictive power, and waterfront because it is constant (all 1) in the data summarized above.
# NOTE(review): sqft_lot and yr_built are NOT dropped here; they stay in df_model and are used as predictors below.
df_model = df.drop(['waterfront','id'], axis=1)

In [27]:
# List the columns that remain available for modelling.
df_model.columns

Index(['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'condition', 'grade', 'yr_built'],
      dtype='object')

In [28]:
# Work on a copy so df_model keeps its original (un-normalized) values:
df_norm = df_model.copy()

# Apply z-score normalization:
def z_score_norm(my_column):
    """Standardize a column to z-scores.

    Subtracts the column's mean from every value and divides the result by
    the column's standard deviation, as reported by its own ``.mean()`` and
    ``.std()`` methods (for a pandas Series, ``.std()`` defaults to the
    sample standard deviation, ddof=1).

    Parameters
    ----------
    my_column : object supporting ``.mean()``, ``.std()`` and arithmetic
        In this notebook, a pandas Series holding one predictor.

    Returns
    -------
    The centered and scaled values, with the same index as the input.
    """
    centered = my_column - my_column.mean()
    spread = my_column.std()
    return centered / spread

# Standardize every predictor column in one pass; price (the target) is left untouched.
predictor_columns = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
                     'condition', 'grade', 'yr_built']
df_norm[predictor_columns] = df_norm[predictor_columns].apply(z_score_norm)
# print normalized data:
print(df_norm)

           price  bedrooms  bathrooms  sqft_living  sqft_lot    floors  \
49     1350000.0 -0.317799  -0.186271    -0.297647  0.876875 -1.150105   
230     655000.0 -1.227578  -0.855012    -1.086324 -0.225702  0.675062   
246    2400000.0  0.591979  -0.186271     0.245287 -0.392499 -1.150105   
264     369900.0 -2.137356  -1.746668    -1.503965 -0.353847 -1.150105   
300    3080000.0  0.591979   2.042869     0.790036 -0.161999 -1.150105   
...          ...       ...        ...          ...       ...       ...   
19968  1900000.0 -0.317799  -0.186271    -0.251041 -0.482575  2.500229   
20309  3000000.0 -0.317799   0.705385     0.705297 -0.338678  0.675062   
20751  2300000.0  0.591979   1.151213     0.675034 -0.396510  1.587645   
21185  2230000.0 -0.317799   0.705385     0.311867 -0.453446  0.675062   
21560  3570000.0  1.501758   1.597041     0.971620 -0.342532  0.675062   

       condition     grade  yr_built  
49      1.961006  0.092286 -0.342231  
230    -0.727322 -1.030531 -1.743

In [51]:
# Predictors: every column of the normalized frame except the target
house_pred = df_norm.drop(columns='price')


### TARGET
# Price in millions of dollars (price was not z-score normalized above)
house_target = df_norm['price'] / 1e6
# Alternative target in raw dollars:
# house_target = df_norm['price']
house_pred.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,condition,grade,yr_built
49,-0.317799,-0.186271,-0.297647,0.876875,-1.150105,1.961006,0.092286,-0.342231
230,-1.227578,-0.855012,-1.086324,-0.225702,0.675062,-0.727322,-1.030531,-1.743485
246,0.591979,-0.186271,0.245287,-0.392499,-1.150105,-0.727322,0.092286,1.390898
264,-2.137356,-1.746668,-1.503965,-0.353847,-1.150105,1.961006,-2.153349,-0.969108
300,0.591979,2.042869,0.790036,-0.161999,-1.150105,-0.727322,0.653695,1.464648


In [52]:
# Add constant (AKA y-intercept):
# sm.add_constant adds a `const` column equal to 1.0 for every row (see the output below),
# so the coefficient fitted on `const` is the intercept, Beta(0) (i.e. 1 * Beta(0)).
# Each of the other Betas is the coefficient that multiplies its predictor when predicting price.
predictors = sm.add_constant(house_pred)
predictors.head()

Unnamed: 0,const,bedrooms,bathrooms,sqft_living,sqft_lot,floors,condition,grade,yr_built
49,1.0,-0.317799,-0.186271,-0.297647,0.876875,-1.150105,1.961006,0.092286,-0.342231
230,1.0,-1.227578,-0.855012,-1.086324,-0.225702,0.675062,-0.727322,-1.030531,-1.743485
246,1.0,0.591979,-0.186271,0.245287,-0.392499,-1.150105,-0.727322,0.092286,1.390898
264,1.0,-2.137356,-1.746668,-1.503965,-0.353847,-1.150105,1.961006,-2.153349,-0.969108
300,1.0,0.591979,2.042869,0.790036,-0.161999,-1.150105,-0.727322,0.653695,1.464648


In [53]:
# Fit ordinary least squares. statsmodels takes the response first (endog, the
# dependent variable), followed by the explanatory variables (exog, the independent variables).
model = sm.OLS(endog=house_target, exog=predictors).fit()

In [54]:
# Full regression report: fit statistics, coefficient table, and residual diagnostics.
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.799
Model:,OLS,Adj. R-squared:,0.788
Method:,Least Squares,F-statistic:,68.28
Date:,"Sat, 07 Aug 2021",Prob (F-statistic):,4.74e-44
Time:,16:02:15,Log-Likelihood:,-109.18
No. Observations:,146,AIC:,236.4
Df Residuals:,137,BIC:,263.2
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.7172,0.044,39.324,0.000,1.631,1.804
bedrooms,0.0112,0.057,0.195,0.846,-0.102,0.125
bathrooms,0.0876,0.081,1.081,0.281,-0.073,0.248
sqft_living,0.8593,0.087,9.847,0.000,0.687,1.032
sqft_lot,-0.0546,0.045,-1.226,0.222,-0.143,0.033
floors,-0.0118,0.052,-0.229,0.819,-0.114,0.090
condition,0.0663,0.046,1.443,0.151,-0.025,0.157
grade,0.1579,0.082,1.915,0.058,-0.005,0.321
yr_built,-0.0680,0.059,-1.155,0.250,-0.184,0.048

0,1,2,3
Omnibus:,1.096,Durbin-Watson:,2.056
Prob(Omnibus):,0.578,Jarque-Bera (JB):,0.878
Skew:,-0.188,Prob(JB):,0.645
Kurtosis:,3.052,Cond. No.,4.86


# Interpretation of Model Summary:
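- R-squared is 0.799 (adjusted 0.788): the model explains almost 80% of the variation in price for these homes
- sqft_living is the only predictor that is statistically significant at the 0.05 level (p < 0.001); grade is borderline (p = 0.058), and every other predictor has p > 0.15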