In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm 
import matplotlib.pyplot as plt

In [2]:
# Location of the Housing workbook; this is an absolute path on the author's machine.
housing_xlsx_path = 'C:/Users/USER/Documents/Udemy Python for Finance/97 Running a Multivariate Regression in Python/Python 3/Housing.xlsx'
# Load the workbook into `data`, which every later cell reads from.
data = pd.read_excel(housing_xlsx_path)

In [3]:
# Leave the loaded DataFrame as the last expression so the notebook renders it.
data

Unnamed: 0,House Price,House Size (sq.ft.),State,Number of Rooms,Year of Construction
0,1116000,1940,IN,8,2002
1,860000,1300,IN,5,1992
2,818400,1420,IN,6,1987
3,1000000,1680,IN,7,2000
4,640000,1270,IN,5,1995
5,1010000,1850,IN,7,1998
6,600000,1000,IN,4,2015
7,700000,1100,LA,4,2014
8,1100000,1600,LA,7,2017
9,570000,1000,NY,5,1997


### Multivariate Regression:

Independent Variables: *"House Size (sq.ft.)", "Number of Rooms", "Year of Construction"*

In [4]:
# Regressors for the full model. Indexing with a list of column names returns a
# DataFrame (2-D), which is what a multi-variable design matrix needs.
predictor_columns = ['House Size (sq.ft.)', 'Number of Rooms', 'Year of Construction']
X = data[predictor_columns]
# Dependent variable: the single 'House Price' column.
Y = data['House Price']

In [5]:
# Add the intercept column to the regressors before fitting.
X1 = sm.add_constant(X)
# Ordinary least squares of Y on the augmented design matrix.
reg = sm.OLS(
    endog=Y,
    exog=X1,
).fit()

# Leave the summary as the last expression so the notebook renders the tables.
reg.summary()

0,1,2,3
Dep. Variable:,House Price,R-squared:,0.736
Model:,OLS,Adj. R-squared:,0.687
Method:,Least Squares,F-statistic:,14.9
Date:,"Wed, 13 May 2020",Prob (F-statistic):,6.82e-05
Time:,02:21:01,Log-Likelihood:,-258.43
No. Observations:,20,AIC:,524.9
Df Residuals:,16,BIC:,528.9
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-9.452e+06,5.4e+06,-1.752,0.099,-2.09e+07,1.99e+06
House Size (sq.ft.),341.8271,179.666,1.903,0.075,-39.049,722.703
Number of Rooms,1.16e+04,3.74e+04,0.310,0.760,-6.77e+04,9.08e+04
Year of Construction,4863.5761,2697.969,1.803,0.090,-855.862,1.06e+04

0,1,2,3
Omnibus:,2.14,Durbin-Watson:,1.938
Prob(Omnibus):,0.343,Jarque-Bera (JB):,1.747
Skew:,-0.676,Prob(JB):,0.418
Kurtosis:,2.484,Cond. No.,540000.0


In [None]:
# The coefficient of the constant: -9.452e+06 ~ -9,452,000
# So the coefficient values of the constant and of the house size changed drastically.
# The constant moved from 260,800 to about -9,452,000 while the house size beta dropped from 402 to about 342.
# It may look as if, by adding the other two independent variables, we destroyed the nice results of our univariate regression and that house size
# can no longer explain the house price. Please do not rush to such a conclusion.
# Let's check the R-squared value of this regression. It changed from 0.678 to 0.736.
# Therefore according to this statistic the second model is slightly better in terms of explanatory power.
# So at least some of the independent variables influence the price of a house.
# The coefficient p-values are an important indicator we should consider. They are not small enough.
# -->> All of them are greater than 5 percent, which means the 3 coefficients are "not statistically significant".
# So what should we infer from our analysis? Is house size a good predictor?
# !!! An experienced researcher runs hundreds of regressions before making a sound inference. !!!
# ===>>> Run three other regressions, each with two independent variables, starting with size and number of rooms.

1/ Independent Variables: *"House Size (sq.ft.)", "Number of Rooms"*

In [6]:
# Model 1 regressors: house size and number of rooms.
predictor_columns = ['House Size (sq.ft.)', 'Number of Rooms']
X = data[predictor_columns]
# The dependent variable stays the same across all models.
Y = data['House Price']

In [7]:
# Add the intercept column to the two-regressor design matrix.
X1 = sm.add_constant(X)
# Fit ordinary least squares of house price on size and number of rooms.
reg = sm.OLS(
    endog=Y,
    exog=X1,
).fit()

# Last expression of the cell: render the regression summary.
reg.summary()

0,1,2,3
Dep. Variable:,House Price,R-squared:,0.683
Model:,OLS,Adj. R-squared:,0.645
Method:,Least Squares,F-statistic:,18.3
Date:,"Wed, 13 May 2020",Prob (F-statistic):,5.77e-05
Time:,02:29:36,Log-Likelihood:,-260.28
No. Observations:,20,AIC:,526.6
Df Residuals:,17,BIC:,529.6
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.737e+05,1.03e+05,2.655,0.017,5.62e+04,4.91e+05
House Size (sq.ft.),314.1363,190.485,1.649,0.117,-87.752,716.025
Number of Rooms,1.944e+04,3.95e+04,0.492,0.629,-6.39e+04,1.03e+05

0,1,2,3
Omnibus:,1.326,Durbin-Watson:,1.852
Prob(Omnibus):,0.515,Jarque-Bera (JB):,0.81
Skew:,-0.487,Prob(JB):,0.667
Kurtosis:,2.853,Cond. No.,5890.0


2/ Independent Variables: *"House Size (sq.ft.)", "Year of Construction"*

In [8]:
# Model 2 regressors: house size and year of construction.
predictor_columns = ['House Size (sq.ft.)', 'Year of Construction']
X = data[predictor_columns]
# The dependent variable stays the same across all models.
Y = data['House Price']

In [9]:
# Add the intercept column to the two-regressor design matrix.
X1 = sm.add_constant(X)
# Fit ordinary least squares of house price on size and year of construction.
reg = sm.OLS(
    endog=Y,
    exog=X1,
).fit()

# Last expression of the cell: render the regression summary.
reg.summary()

0,1,2,3
Dep. Variable:,House Price,R-squared:,0.735
Model:,OLS,Adj. R-squared:,0.704
Method:,Least Squares,F-statistic:,23.55
Date:,"Wed, 13 May 2020",Prob (F-statistic):,1.26e-05
Time:,02:29:47,Log-Likelihood:,-258.49
No. Observations:,20,AIC:,523.0
Df Residuals:,17,BIC:,526.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-9.654e+06,5.21e+06,-1.852,0.081,-2.07e+07,1.34e+06
House Size (sq.ft.),394.0417,61.098,6.449,0.000,265.137,522.947
Year of Construction,4960.9407,2607.443,1.903,0.074,-540.283,1.05e+04

0,1,2,3
Omnibus:,2.064,Durbin-Watson:,1.926
Prob(Omnibus):,0.356,Jarque-Bera (JB):,1.689
Skew:,-0.663,Prob(JB):,0.43
Kurtosis:,2.48,Cond. No.,536000.0


3/ Independent Variables: *"Number of Rooms", "Year of Construction"*

In [10]:
# Model 3 regressors: number of rooms and year of construction.
predictor_columns = ['Number of Rooms', 'Year of Construction']
X = data[predictor_columns]
# The dependent variable stays the same across all models.
Y = data['House Price']

In [11]:
# Add the intercept column to the two-regressor design matrix.
X1 = sm.add_constant(X)
# Fit ordinary least squares of house price on number of rooms and year of construction.
reg = sm.OLS(
    endog=Y,
    exog=X1,
).fit()

# Last expression of the cell: render the regression summary.
reg.summary()

0,1,2,3
Dep. Variable:,House Price,R-squared:,0.677
Model:,OLS,Adj. R-squared:,0.639
Method:,Least Squares,F-statistic:,17.79
Date:,"Wed, 13 May 2020",Prob (F-statistic):,6.79e-05
Time:,02:29:51,Log-Likelihood:,-260.47
No. Observations:,20,AIC:,526.9
Df Residuals:,17,BIC:,529.9
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-8.471e+06,5.77e+06,-1.468,0.160,-2.06e+07,3.7e+06
Number of Rooms,7.824e+04,1.4e+04,5.574,0.000,4.86e+04,1.08e+05
Year of Construction,4424.7160,2887.793,1.532,0.144,-1667.996,1.05e+04

0,1,2,3
Omnibus:,2.115,Durbin-Watson:,1.959
Prob(Omnibus):,0.347,Jarque-Bera (JB):,1.4
Skew:,-0.407,Prob(JB):,0.497
Kurtosis:,1.991,Cond. No.,434000.0


In [None]:
# When we skim through the values in the new output we can see that year of construction does not get a low p-value in either of the two
# regressions it has been involved in (0.074 and 0.144).
# This means it is not a statistically significant predictor of house prices in these regressions at the 5 percent level.
# In those same regressions the p-values for house size and number of rooms are practically zero (0.000), which is amazing.
# But when we run a regression with only these two variables, size and number of rooms, their p-values increase significantly (0.117 and 0.629).
# So at this stage we cannot confirm that they influence the price of a house, and we cannot draw a firm conclusion.
# These results can give us good guidance for future research and this is a valuable asset.
# What could we do next? 
# 1/ First, our output suggests that if we gather more observations, house size or number of rooms might prove to be good
# indicators of house prices, and their p-values will probably decrease in the regressions we run.
# Well in our calculations the value of the R-squared was high (0.683).
# This gives us confidence that in general we have gathered a good set of explanatory variables.
# --->>> So it might not be necessary to change the set.
# 2/ In addition, there could be a problem among these three explanatory variables: most probably house size and number of rooms are
# related to each other, so they act as a single explanatory variable.
# This means they are "the same factor".
# 3/ We should gather data about other types of explanatory variables.
# For example: what type of people are the neighbors? What is the distance between the house and downtown? Are there any supermarkets close by?
# ===>>> All of these might affect the house price, and they might change the coefficients of the variables we have used so far.
