# Housing Prices

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import boxcox
from scipy.stats import normaltest
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine
import statsmodels.api as sm

import warnings
warnings.filterwarnings(action="ignore")

from config import usr, pwd, url, port, db
import helpers as hp

%matplotlib inline

## Load Data
Load dataset used to create previously developed housing prices model.

In [2]:
df_model = pd.read_csv('housing_prices_pvalue_cut.csv')

## Model 1

In [3]:
features = df_model.columns.tolist()
features.remove('saleprice_sd_bc_tk')
X = df_model.loc[:, features]
y = df_model['saleprice_sd_bc_tk']

X = sm.add_constant(X)

results_1 = sm.OLS(y, X).fit()

In [4]:
results_1.summary()

0,1,2,3
Dep. Variable:,saleprice_sd_bc_tk,R-squared:,0.884
Model:,OLS,Adj. R-squared:,0.881
Method:,Least Squares,F-statistic:,263.7
Date:,"Wed, 14 Aug 2019",Prob (F-statistic):,0.0
Time:,17:05:33,Log-Likelihood:,1843.7
No. Observations:,1460,AIC:,-3603.0
Df Residuals:,1418,BIC:,-3381.0
Df Model:,41,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.2446,0.024,-51.221,0.000,-1.292,-1.197
Unnamed: 0,-5.439e-06,4.38e-06,-1.243,0.214,-1.4e-05,3.15e-06
lotshape_IR2_oh,0.0309,0.011,2.690,0.007,0.008,0.053
lotconfig_CulDSac_oh,0.0258,0.008,3.312,0.001,0.011,0.041
lotconfig_FR2_oh,-0.0182,0.011,-1.729,0.084,-0.039,0.002
neighborhood_Blueste_oh,-0.1251,0.050,-2.510,0.012,-0.223,-0.027
neighborhood_BrDale_oh,-0.1361,0.019,-7.078,0.000,-0.174,-0.098
neighborhood_BrkSide_oh,-0.0397,0.012,-3.302,0.001,-0.063,-0.016
neighborhood_Crawfor_oh,0.0437,0.011,3.807,0.000,0.021,0.066

0,1,2,3
Omnibus:,264.3,Durbin-Watson:,1.908
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1209.665
Skew:,-0.782,Prob(JB):,2.1099999999999998e-263
Kurtosis:,7.176,Cond. No.,32800.0


In [5]:
df_model_stats = hp.append_model_stats(results_1, 1)

In [6]:
df_model_stats

Unnamed: 0,model,fvalue,f_pvalue,rsquared,rsquared_adj,aic,bic
0,model_1,263.666513,0.0,0.88404,0.880687,-3603.437106,-3381.417054


**Observations:**

The small p-value for the F-statistic indicates that the model is useful in explaining housing prices. The large R<sup>2</sup> values indicate a good model fit, that is boarding on overfitting. AIC and BIC are both large negative values.

## Model 2
Remove features with coefficient values < 0.05.

In [7]:
df_results_1 = hp.store_results(results_1)

In [8]:
df_results_1.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
coeff,41.0,-0.015179,0.075976,-0.1819269,-0.04938701,-0.019537,0.030897,0.267217
pvalue,41.0,0.009281,0.035577,1.270452e-116,8.21288e-09,7.6e-05,0.000983,0.2142
abs_coeff,41.0,0.057811,0.050822,5.438746e-06,0.02771376,0.043334,0.071107,0.267217


In [9]:
df_results_1[df_results_1['abs_coeff'] > 0.05].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
coeff,16.0,-0.02782,0.115731,-0.1819269,-0.1082469,-0.06675563,0.056984,0.267217
pvalue,16.0,0.001614,0.003844,1.270452e-116,6.482525e-16,1.037188e-11,7.7e-05,0.01218
abs_coeff,16.0,0.100911,0.057943,0.05265207,0.06130769,0.07555761,0.127864,0.267217


In [10]:
features_2 = df_results_1.loc[df_results_1['abs_coeff'] > 0.05, 'feature'].tolist()

In [11]:
df_results_1.loc[df_results_1['abs_coeff'] > 0.05, :].sort_values(by='abs_coeff')

Unnamed: 0,feature,coeff,pvalue,abs_coeff
22,extercond_Fa_oh,-0.052652,0.0002298782,0.052652
38,totalbsmtsf_sd_bc_tk,0.055706,8.422738e-28,0.055706
30,bsmtcond_TA_oh,0.056487,1.348805e-08,0.056487
33,centralair_Y_oh,0.058477,1.845619e-11,0.058477
26,bsmtqual_Fa_oh,-0.062251,2.588347e-05,0.062251
40,overallqual_sd_bc_tk,0.071107,2.925197e-44,0.071107
15,neighborhood_NPkVill_oh,-0.07126,0.003130153,0.07126
9,neighborhood_Edwards_oh,-0.075381,8.250877e-16,0.075381
29,bsmtcond_Gd_oh,0.075734,8.854797e-09,0.075734
19,neighborhood_OldTown_oh,-0.084784,1.177469e-16,0.084784


In [12]:
X = df_model.loc[:, features_2]
y = df_model['saleprice_sd_bc_tk']

X = sm.add_constant(X)

results_2 = sm.OLS(y, X).fit()

In [13]:
results_2.summary()

0,1,2,3
Dep. Variable:,saleprice_sd_bc_tk,R-squared:,0.842
Model:,OLS,Adj. R-squared:,0.84
Method:,Least Squares,F-statistic:,481.3
Date:,"Wed, 14 Aug 2019",Prob (F-statistic):,0.0
Time:,17:05:33,Log-Likelihood:,1618.8
No. Observations:,1460,AIC:,-3204.0
Df Residuals:,1443,BIC:,-3114.0
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.1155,0.024,-46.896,0.000,-1.162,-1.069
neighborhood_Blueste_oh,-0.0733,0.057,-1.288,0.198,-0.185,0.038
neighborhood_BrDale_oh,-0.1533,0.021,-7.468,0.000,-0.194,-0.113
neighborhood_Edwards_oh,-0.0569,0.009,-6.505,0.000,-0.074,-0.040
neighborhood_IDOTRR_oh,-0.0894,0.014,-6.307,0.000,-0.117,-0.062
neighborhood_MeadowV_oh,-0.0944,0.020,-4.722,0.000,-0.134,-0.055
neighborhood_NPkVill_oh,-0.0566,0.027,-2.103,0.036,-0.109,-0.004
neighborhood_OldTown_oh,-0.0948,0.009,-11.017,0.000,-0.112,-0.078
extercond_Fa_oh,-0.0702,0.016,-4.326,0.000,-0.102,-0.038

0,1,2,3
Omnibus:,248.368,Durbin-Watson:,1.939
Prob(Omnibus):,0.0,Jarque-Bera (JB):,799.203
Skew:,-0.838,Prob(JB):,2.85e-174
Kurtosis:,6.214,Cond. No.,144.0


In [14]:
df_model_stats = hp.append_model_stats(results_2, 2)

In [15]:
df_model_stats

Unnamed: 0,model,fvalue,f_pvalue,rsquared,rsquared_adj,aic,bic
0,model_1,263.666513,0.0,0.88404,0.880687,-3603.437106,-3381.417054
1,model_2,481.298375,0.0,0.842188,0.840438,-3203.525273,-3113.660014


**Observations:**

Removing features actually decreased the model performance. Both R<sup>2</sup> statistics decreased and the AIC and BIC statistics increased.