# Final Model Construction

Below is the construction for a multiple linear regression model predicting housing prices in King County. Data is imported from the preprocessed data that was cleaned in preceding notebooks and saved as separate csv files. The data is imported to this notebook and combined for the first model. Further iterations are contained in additional subsections of this notebook.

### Import Packages

In [19]:
import scipy.stats as stats
import pandas as pd
import statsmodels.api as sm
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import median_absolute_error, mean_squared_error 
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, RobustScaler
from sklearn.linear_model import LinearRegression



### Import and Combine Cleaned Data

In [6]:
categorical_ohe = pd.read_csv("./data/cat_hot_dataframe")
categorical_ordinal = pd.read_csv("./data/cat_ordinal_dataframe")
numeric = pd.read_csv("./data/initial_numeric_inputs")
target = pd.read_csv("./data/house_price_target_natlog")

### Construct Model

In [7]:
all_predictors = pd.concat([numeric, categorical_ohe], axis=1)

In [8]:
#defining constructor function so it can be used for each iteration

def construct_model(exog_df):
    exog = sm.add_constant(exog_df)
    endog = target

    first_model = sm.OLS(endog, exog).fit().summary()
    return first_model

In [9]:
#run 1st model
construct_model(all_predictors)



0,1,2,3
Dep. Variable:,ln_price,R-squared:,0.648
Model:,OLS,Adj. R-squared:,0.648
Method:,Least Squares,F-statistic:,1655.0
Date:,"Wed, 26 Oct 2022",Prob (F-statistic):,0.0
Time:,10:46:24,Log-Likelihood:,-5517.1
No. Observations:,21597,AIC:,11080.0
Df Residuals:,21572,BIC:,11280.0
Df Model:,24,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,9.3377,0.090,103.931,0.000,9.162,9.514
bedrooms,-0.0374,0.003,-12.215,0.000,-0.043,-0.031
bathrooms,0.0871,0.005,17.387,0.000,0.077,0.097
ln_sqft_living,0.4146,0.014,29.751,0.000,0.387,0.442
sqft_lot,2.654e-07,7.41e-08,3.581,0.000,1.2e-07,4.11e-07
floors,0.1163,0.006,20.768,0.000,0.105,0.127
ln_sqft_above,-0.1776,0.012,-14.554,0.000,-0.202,-0.154
age,0.0062,9.9e-05,62.310,0.000,0.006,0.006
renovated,0.0115,0.005,2.192,0.028,0.001,0.022

0,1,2,3
Omnibus:,58.161,Durbin-Watson:,1.964
Prob(Omnibus):,0.0,Jarque-Bera (JB):,75.08
Skew:,-0.008,Prob(JB):,4.97e-17
Kurtosis:,3.288,Cond. No.,7440000.0


### Evaluate Model Performance

 (JB):score is fairly low, some of our data skew may be impacting our results
 also Cond. No.	7.44e+06 is very large, indicating significant multicollinearity  
 our Rsqr at .648 is decent but will continue to improve it.


### Assess Potential Model Improvements

(1)We're going to scale our features, so they have equal impact on their outcome
(2)Assess feature for collinearity and remove as needed


# 2nd Iteration

In [4]:
ss = StandardScaler()
ss.fit(numeric)
num_scaled = ss.transform(numeric)

In [11]:
#drop Ln_sqrft_liv15, sqft_lot15

all_predictors_2 = all_predictors.drop(labels=['ln_sqft_living15','sqft_lot15'], axis=1)

#run 2nd model
construct_model(all_predictors_2)

0,1,2,3
Dep. Variable:,ln_price,R-squared:,0.639
Model:,OLS,Adj. R-squared:,0.638
Method:,Least Squares,F-statistic:,1734.0
Date:,"Wed, 26 Oct 2022",Prob (F-statistic):,0.0
Time:,10:53:30,Log-Likelihood:,-5796.6
No. Observations:,21597,AIC:,11640.0
Df Residuals:,21574,BIC:,11820.0
Df Model:,22,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,10.4212,0.078,132.862,0.000,10.267,10.575
bedrooms,-0.0399,0.003,-12.894,0.000,-0.046,-0.034
bathrooms,0.0849,0.005,16.735,0.000,0.075,0.095
ln_sqft_living,0.4922,0.014,35.924,0.000,0.465,0.519
sqft_lot,7.881e-08,5.35e-08,1.473,0.141,-2.61e-08,1.84e-07
floors,0.0951,0.006,17.045,0.000,0.084,0.106
ln_sqft_above,-0.1335,0.012,-10.952,0.000,-0.157,-0.110
age,0.0061,0.000,60.761,0.000,0.006,0.006
renovated,0.0086,0.005,1.612,0.107,-0.002,0.019

0,1,2,3
Omnibus:,75.136,Durbin-Watson:,1.965
Prob(Omnibus):,0.0,Jarque-Bera (JB):,100.897
Skew:,0.006,Prob(JB):,1.23e-22
Kurtosis:,3.335,Cond. No.,6490000.0


In [13]:
#remove one hot encoded condition
all_predictors_3 = all_predictors_2.drop(labels=['x0_Poor','x0_Good','x0_Fair','x0_Very Good'], axis=1)

#run 3rd model

construct_model(all_predictors_3)


0,1,2,3
Dep. Variable:,ln_price,R-squared:,0.636
Model:,OLS,Adj. R-squared:,0.636
Method:,Least Squares,F-statistic:,2098.0
Date:,"Wed, 26 Oct 2022",Prob (F-statistic):,0.0
Time:,11:07:49,Log-Likelihood:,-5868.0
No. Observations:,21597,AIC:,11770.0
Df Residuals:,21578,BIC:,11930.0
Df Model:,18,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,10.3988,0.079,132.227,0.000,10.245,10.553
bedrooms,-0.0394,0.003,-12.694,0.000,-0.046,-0.033
bathrooms,0.0881,0.005,17.363,0.000,0.078,0.098
ln_sqft_living,0.5010,0.014,36.522,0.000,0.474,0.528
sqft_lot,6.487e-08,5.36e-08,1.210,0.226,-4.02e-08,1.7e-07
floors,0.0924,0.006,16.669,0.000,0.082,0.103
ln_sqft_above,-0.1404,0.012,-11.499,0.000,-0.164,-0.116
age,0.0063,9.56e-05,66.099,0.000,0.006,0.007
renovated,0.0057,0.005,1.075,0.283,-0.005,0.016

0,1,2,3
Omnibus:,76.856,Durbin-Watson:,1.965
Prob(Omnibus):,0.0,Jarque-Bera (JB):,103.31
Skew:,-0.013,Prob(JB):,3.6899999999999996e-23
Kurtosis:,3.338,Cond. No.,6490000.0


In [14]:
all_predictors_4 = all_predictors_3.drop(labels=['sqft_lot','renovated'], axis=1)

#run 4th model

construct_model(all_predictors_4)


0,1,2,3
Dep. Variable:,ln_price,R-squared:,0.636
Model:,OLS,Adj. R-squared:,0.636
Method:,Least Squares,F-statistic:,2360.0
Date:,"Wed, 26 Oct 2022",Prob (F-statistic):,0.0
Time:,11:11:37,Log-Likelihood:,-5869.3
No. Observations:,21597,AIC:,11770.0
Df Residuals:,21580,BIC:,11910.0
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,10.3870,0.078,133.067,0.000,10.234,10.540
bedrooms,-0.0398,0.003,-12.852,0.000,-0.046,-0.034
bathrooms,0.0883,0.005,17.418,0.000,0.078,0.098
ln_sqft_living,0.5015,0.014,36.574,0.000,0.475,0.528
floors,0.0918,0.006,16.652,0.000,0.081,0.103
ln_sqft_above,-0.1388,0.012,-11.425,0.000,-0.163,-0.115
age,0.0063,9.49e-05,66.740,0.000,0.006,0.007
x0_11 Excellent,0.2239,0.019,12.018,0.000,0.187,0.260
x0_12 Luxury,0.4858,0.035,13.766,0.000,0.417,0.555

0,1,2,3
Omnibus:,77.059,Durbin-Watson:,1.965
Prob(Omnibus):,0.0,Jarque-Bera (JB):,103.628
Skew:,-0.013,Prob(JB):,3.1399999999999995e-23
Kurtosis:,3.338,Cond. No.,7910.0


In [15]:
#numeric
construct_model(numeric)



0,1,2,3
Dep. Variable:,ln_price,R-squared:,0.562
Model:,OLS,Adj. R-squared:,0.562
Method:,Least Squares,F-statistic:,2771.0
Date:,"Wed, 26 Oct 2022",Prob (F-statistic):,0.0
Time:,11:15:34,Log-Likelihood:,-7873.7
No. Observations:,21597,AIC:,15770.0
Df Residuals:,21586,BIC:,15860.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.9822,0.072,69.180,0.000,4.841,5.123
bedrooms,-0.0761,0.003,-22.734,0.000,-0.083,-0.070
bathrooms,0.1375,0.005,25.186,0.000,0.127,0.148
ln_sqft_living,0.6262,0.015,41.799,0.000,0.597,0.656
sqft_lot,2.934e-07,8.25e-08,3.555,0.000,1.32e-07,4.55e-07
floors,0.1568,0.006,25.854,0.000,0.145,0.169
ln_sqft_above,-0.0765,0.013,-5.730,0.000,-0.103,-0.050
age,0.0057,0.000,55.933,0.000,0.006,0.006
renovated,0.0127,0.006,2.170,0.030,0.001,0.024

0,1,2,3
Omnibus:,56.15,Durbin-Watson:,1.97
Prob(Omnibus):,0.0,Jarque-Bera (JB):,70.297
Skew:,0.034,Prob(JB):,5.44e-16
Kurtosis:,3.271,Cond. No.,1540000.0


In [16]:
#ohe
construct_model(categorical_ohe)

0,1,2,3
Dep. Variable:,ln_price,R-squared:,0.519
Model:,OLS,Adj. R-squared:,0.519
Method:,Least Squares,F-statistic:,1663.0
Date:,"Wed, 26 Oct 2022",Prob (F-statistic):,0.0
Time:,11:16:48,Log-Likelihood:,-8889.0
No. Observations:,21597,AIC:,17810.0
Df Residuals:,21582,BIC:,17930.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,13.7766,0.011,1264.544,0.000,13.755,13.798
x0_Fair,-0.1204,0.028,-4.234,0.000,-0.176,-0.065
x0_Good,0.1003,0.006,17.197,0.000,0.089,0.112
x0_Poor,0.0080,0.068,0.117,0.907,-0.126,0.142
x0_Very Good,0.2654,0.009,28.080,0.000,0.247,0.284
x0_11 Excellent,0.3339,0.021,15.702,0.000,0.292,0.376
x0_12 Luxury,0.7145,0.040,17.767,0.000,0.636,0.793
x0_13 Mansion,1.2362,0.102,12.132,0.000,1.036,1.436
x0_3 Poor,-1.5659,0.366,-4.283,0.000,-2.282,-0.849

0,1,2,3
Omnibus:,90.585,Durbin-Watson:,1.974
Prob(Omnibus):,0.0,Jarque-Bera (JB):,96.054
Skew:,0.133,Prob(JB):,1.3900000000000001e-21
Kurtosis:,3.189,Cond. No.,173.0


In [17]:
#ordinal
construct_model(categorical_ordinal)

0,1,2,3
Dep. Variable:,ln_price,R-squared:,0.516
Model:,OLS,Adj. R-squared:,0.516
Method:,Least Squares,F-statistic:,11510.0
Date:,"Wed, 26 Oct 2022",Prob (F-statistic):,0.0
Time:,11:18:04,Log-Likelihood:,-8957.7
No. Observations:,21597,AIC:,17920.0
Df Residuals:,21594,BIC:,17950.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,11.2487,0.015,756.724,0.000,11.220,11.278
cond_ord,0.1177,0.004,30.382,0.000,0.110,0.125
grade_ord,0.3254,0.002,151.481,0.000,0.321,0.330

0,1,2,3
Omnibus:,112.405,Durbin-Watson:,1.974
Prob(Omnibus):,0.0,Jarque-Bera (JB):,121.049
Skew:,0.146,Prob(JB):,5.1800000000000005e-27
Kurtosis:,3.221,Cond. No.,33.2


In [18]:
categorical_cond_ord = categorical_ordinal.drop(labels=['cond_ord'], axis=1)

numeric_revised = numeric.drop(labels=['ln_sqft_living15','sqft_lot15','sqft_lot','renovated'], axis=1)

ord_model1 = pd.concat([numeric_revised, categorical_cond_ord], axis=1)

construct_model(ord_model1)




0,1,2,3
Dep. Variable:,ln_price,R-squared:,0.635
Model:,OLS,Adj. R-squared:,0.635
Method:,Least Squares,F-statistic:,5375.0
Date:,"Wed, 26 Oct 2022",Prob (F-statistic):,0.0
Time:,11:51:17,Log-Likelihood:,-5896.3
No. Observations:,21597,AIC:,11810.0
Df Residuals:,21589,BIC:,11870.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,8.6657,0.065,133.069,0.000,8.538,8.793
bedrooms,-0.0408,0.003,-13.211,0.000,-0.047,-0.035
bathrooms,0.0883,0.005,17.596,0.000,0.078,0.098
ln_sqft_living,0.4996,0.014,36.798,0.000,0.473,0.526
floors,0.0932,0.005,17.081,0.000,0.082,0.104
ln_sqft_above,-0.1332,0.012,-11.091,0.000,-0.157,-0.110
age,0.0063,9.28e-05,68.351,0.000,0.006,0.007
grade_ord,0.2420,0.003,79.003,0.000,0.236,0.248

0,1,2,3
Omnibus:,81.858,Durbin-Watson:,1.965
Prob(Omnibus):,0.0,Jarque-Bera (JB):,111.574
Skew:,0.0,Prob(JB):,5.91e-25
Kurtosis:,3.352,Cond. No.,1630.0


In [21]:
rscale = RobustScaler()

In [29]:
rs = rscale.fit_transform(ord_model1.drop(labels='grade_ord', axis=1),target)
rs_df = pd.DataFrame(rs, columns=ord_model1.drop(labels='grade_ord', axis=1).columns)
rs_df

Unnamed: 0,bedrooms,bathrooms,ln_sqft_living,floors,ln_sqft_above,age
0,0.0,-1.666667,-0.832595,-0.5,-0.450975,0.434783
1,0.0,0.000000,0.513128,0.5,0.533151,0.521739
2,-1.0,-1.666667,-1.570606,-0.5,-1.140559,0.913043
3,1.0,1.000000,0.044676,-0.5,-0.639532,0.217391
4,0.0,-0.333333,-0.221828,-0.5,0.119715,-0.260870
...,...,...,...,...,...,...
21592,0.0,0.333333,-0.383520,1.5,-0.031368,-0.739130
21593,1.0,0.333333,0.328731,0.5,0.634147,-0.847826
21594,-1.0,-2.000000,-1.084509,0.5,-0.686359,-0.739130
21595,0.0,0.333333,-0.306179,0.5,0.040899,-0.630435


In [31]:
ord_model_scaled = pd.concat([rs_df, pd.DataFrame(ord_model1['grade_ord']) ], axis=1)
construct_model(ord_model_scaled)


0,1,2,3
Dep. Variable:,ln_price,R-squared:,0.635
Model:,OLS,Adj. R-squared:,0.635
Method:,Least Squares,F-statistic:,5375.0
Date:,"Wed, 26 Oct 2022",Prob (F-statistic):,0.0
Time:,12:28:37,Log-Likelihood:,-5896.3
No. Observations:,21597,AIC:,11810.0
Df Residuals:,21589,BIC:,11870.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,11.9308,0.015,806.388,0.000,11.902,11.960
bedrooms,-0.0408,0.003,-13.211,0.000,-0.047,-0.035
bathrooms,0.0662,0.004,17.596,0.000,0.059,0.074
ln_sqft_living,0.2890,0.008,36.798,0.000,0.274,0.304
floors,0.0932,0.005,17.081,0.000,0.082,0.104
ln_sqft_above,-0.0824,0.007,-11.091,0.000,-0.097,-0.068
age,0.2918,0.004,68.351,0.000,0.283,0.300
grade_ord,0.2420,0.003,79.003,0.000,0.236,0.248

0,1,2,3
Omnibus:,81.858,Durbin-Watson:,1.965
Prob(Omnibus):,0.0,Jarque-Bera (JB):,111.574
Skew:,0.0,Prob(JB):,5.91e-25
Kurtosis:,3.352,Cond. No.,35.1


# Additional feature cleaning

In [37]:
z = ord_model_scaled.columns
for v in ord_model_scaled.columns:
    plt.figure(v)
    sns.scatterplot(y=target,x=ord_model_scaled[v])
  

TypeError: 'Index' object is not callable