# 13장 이분산

## 예제 13.3 이분산에 견고한 표준오차

In [1]:
import pandas as pd
from numpy import log
import statsmodels.formula.api as smf

# Load the housing data and specify the log-price regression.
Housing = pd.read_csv('csv/Ecdat/Housing.csv')
model = smf.ols(formula='log(price)~log(lotsize)+bedrooms+bathrms', data=Housing)

# Passing cov_type at fit time makes the summary report HC0 standard errors; see
# https://stackoverflow.com/questions/30553838/getting-statsmodels-to-use-heteroskedasticity-corrected-standard-errors-in-coeff
ols = model.fit(cov_type='HC0')
ols.summary(slim=True)

0,1,2,3
Dep. Variable:,log(price),R-squared:,0.504
Model:,OLS,Adj. R-squared:,0.501
No. Observations:,546,F-statistic:,181.4
Covariance Type:,HC0,Prob (F-statistic):,2e-81

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,6.6222,0.248,26.708,0.000,6.136,7.108
log(lotsize),0.4568,0.030,15.172,0.000,0.398,0.516
bedrooms,0.0892,0.018,4.999,0.000,0.054,0.124
bathrms,0.2368,0.026,8.971,0.000,0.185,0.288


## 예제 13.4 통상적인 표준오차와 여러 견고한 표준오차들의 비교

In [2]:
# Refit without cov_type so that `bse` holds the ordinary standard errors.
ols = model.fit()
# The HC0-HC3 standard errors are read from attributes of the same result.
robust_se = {kind.lower(): getattr(ols, f'{kind}_se') for kind in ('HC0', 'HC1', 'HC2', 'HC3')}
pd.DataFrame({'ord': ols.bse, **robust_se})

Unnamed: 0,ord,hc0,hc1,hc2,hc3
Intercept,0.241234,0.247946,0.248859,0.249356,0.25078
log(lotsize),0.028983,0.03011,0.030221,0.030286,0.030464
bedrooms,0.016513,0.01785,0.017916,0.017985,0.018122
bathrms,0.024479,0.026392,0.02649,0.026611,0.026834


## 예제 13.5 지역별 사망률

In [3]:
import pandas as pd
import statsmodels.formula.api as smf

Death = pd.read_csv('csv/loedata/Death.csv')
# C(year) enters year as a categorical regressor (one dummy per non-base year).
model = smf.ols(formula='deathrate~drink+smoke+aged+vehipc+C(year)', data=Death)
ordinary_fit = model.fit()
ordinary_fit.summary(slim=True)

0,1,2,3
Dep. Variable:,deathrate,R-squared:,0.921
Model:,OLS,Adj. R-squared:,0.919
No. Observations:,258,F-statistic:,487.3
Covariance Type:,nonrobust,Prob (F-statistic):,3.42e-135

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.2241,0.769,-0.291,0.771,-1.739,1.291
C(year)[T.2009],-0.3788,0.098,-3.867,0.000,-0.572,-0.186
C(year)[T.2010],-0.3510,0.102,-3.457,0.001,-0.551,-0.151
drink,0.0064,0.011,0.594,0.553,-0.015,0.028
smoke,0.0333,0.018,1.873,0.062,-0.002,0.068
aged,0.4027,0.010,38.401,0.000,0.382,0.423
vehipc,1.4079,1.163,1.211,0.227,-0.882,3.698


위 추정 결과 아래에 출력되는 주석(Notes) \[2\]는 [condition number](https://en.wikipedia.org/wiki/Condition_number)(조건수)에 관한 것이다. 이 값은 $X'X$의 가장 큰 고유값(eigenvalue)을 가장 작은 고유값으로 나눈 비율에 제곱근을 취한 것과 동일하다. 근사적인 다중공선성의 지표가 되기도 하는데, 여기서는 크게 신경 쓰지 않아도 된다.

In [4]:
# Same specification as before, refit with the HC3 covariance estimator.
hc3_fit = model.fit(cov_type='HC3')
hc3_fit.summary(slim=True)

0,1,2,3
Dep. Variable:,deathrate,R-squared:,0.921
Model:,OLS,Adj. R-squared:,0.919
No. Observations:,258,F-statistic:,650.1
Covariance Type:,HC3,Prob (F-statistic):,8.45e-150

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.2241,0.785,-0.285,0.775,-1.763,1.315
C(year)[T.2009],-0.3788,0.095,-3.996,0.000,-0.565,-0.193
C(year)[T.2010],-0.3510,0.104,-3.371,0.001,-0.555,-0.147
drink,0.0064,0.011,0.565,0.572,-0.016,0.029
smoke,0.0333,0.019,1.770,0.077,-0.004,0.070
aged,0.4027,0.010,39.371,0.000,0.383,0.423
vehipc,1.4079,1.295,1.087,0.277,-1.130,3.946


## 예제 13.6 사망률 분석에서 F검정

In [5]:
# Continue: `model` is the regression specified in an earlier cell.
# Joint null hypothesis: the drink and smoke coefficients are both zero.
hypo = 'drink=0, smoke=0'
rego = model.fit()                # ordinary covariance
regh = model.fit(cov_type='HC3')  # HC3 robust covariance
for fitted in (rego, regh):
    print(fitted.f_test(hypo))

<F test: F=3.2488401914796645, p=0.04045761180359891, df_denom=251, df_num=2>
<F test: F=2.9862362091563925, p=0.052273714967270596, df_denom=251, df_num=2>


## 예제 13.8 사망률 모형의 WLS 추정

In [6]:
import pandas as pd
import statsmodels.formula.api as smf

fm = 'deathrate~drink+smoke+aged+vehipc+C(year)'
Death = pd.read_csv('csv/loedata/Death.csv')
# Weighted least squares using the `regpop` column as observation weights.
wls = smf.wls(formula=fm, data=Death, weights=Death['regpop']).fit()
wls.summary(slim=True)

0,1,2,3
Dep. Variable:,deathrate,R-squared:,0.941
Model:,WLS,Adj. R-squared:,0.939
No. Observations:,258,F-statistic:,663.9
Covariance Type:,nonrobust,Prob (F-statistic):,7.08e-151

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.5815,0.764,-0.761,0.448,-2.087,0.924
C(year)[T.2009],-0.2963,0.096,-3.093,0.002,-0.485,-0.108
C(year)[T.2010],-0.2977,0.099,-3.008,0.003,-0.493,-0.103
drink,0.0166,0.011,1.575,0.117,-0.004,0.037
smoke,0.0327,0.018,1.830,0.068,-0.002,0.068
aged,0.4105,0.010,42.003,0.000,0.391,0.430
vehipc,0.5491,1.185,0.464,0.643,-1.784,2.882


## 예제 13.9 FGLS의 예

In [7]:
import statsmodels.formula.api as smf
from numpy import log, exp

# Continue: relies on the `Death` data frame loaded in an earlier cell.
fm = 'deathrate~drink+smoke+aged+vehipc+C(year)'
# Step 1: fit OLS and store its residuals as column `u` of `Death`
# (this mutates the data frame) so the auxiliary formula can refer to them.
ols = smf.ols(fm, data=Death).fit()
Death['u'] = ols.resid
# Step 2: auxiliary regression of log(u**2) on the same right-hand side;
# only the dependent-variable name in the formula string is swapped.
fm_aux = fm.replace('deathrate', 'log(u**2)')
aux = smf.ols(fm_aux, data=Death).fit()
# Step 3: exponentiate the fitted values to undo the log, giving h,
# the fitted value of u**2 for each observation.
h = exp(aux.fittedvalues)
# Step 4: weighted least squares on the original formula with weights 1/h.
fgls = smf.wls(fm, data=Death, weights=1/h).fit()
fgls.summary(slim=True)

0,1,2,3
Dep. Variable:,deathrate,R-squared:,0.936
Model:,WLS,Adj. R-squared:,0.934
No. Observations:,258,F-statistic:,608.2
Covariance Type:,nonrobust,Prob (F-statistic):,2.13e-146

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.2457,0.773,-0.318,0.751,-1.769,1.278
C(year)[T.2009],-0.3521,0.090,-3.914,0.000,-0.529,-0.175
C(year)[T.2010],-0.2974,0.097,-3.056,0.002,-0.489,-0.106
drink,0.0087,0.011,0.817,0.414,-0.012,0.030
smoke,0.0363,0.017,2.187,0.030,0.004,0.069
aged,0.4050,0.010,40.215,0.000,0.385,0.425
vehipc,0.7946,1.113,0.714,0.476,-1.397,2.987


## 이분산 존재 검정

In [8]:
import pandas as pd
import numpy as np

# Generate data: the error term u is a standard normal draw scaled by x1,
# so its spread depends on x1 (heteroskedastic by construction).
np.random.seed(101)  # fixed seed keeps the simulated sample reproducible
n = 50
x1 = np.random.normal(size=n)
x2 = np.random.normal(size=n)
# Vectorized product: uses the same n normal draws, in the same order, as an
# element-by-element loop would, but keeps u as a NumPy array.
u = x1 * np.random.normal(size=n)
y = 1 + x1 - x2 + u
DF = pd.DataFrame({'y': y, 'x1': x1, 'x2': x2})

In [9]:
import statsmodels.api as sm
from statsmodels.stats.diagnostic import het_breuschpagan as bptest

# Residuals from the main regression of y on x1 and x2.
main_fit = sm.OLS.from_formula('y~x1+x2', data=DF).fit()
uhat = main_fit.resid
# Regressors for the test: a constant, x1 and its square.
aux_exog = sm.add_constant(pd.DataFrame({"x1": x1, "x1sq": x1**2}))
bp = bptest(uhat, aux_exog)
bp  # (lm, lm_pvalue, fvalue, f_pvalue)

(25.43047687088538,
 3.00498363642047e-06,
 24.32347601234627,
 5.604385363360125e-08)

맨 앞 숫자가 LM 검정통계, 그 다음이 p값이다. 세 번째 숫자는 F 검정통계, 네 번째 숫자는 이에 해당하는 p값이다.

In [10]:
# Regress the squared residuals on x1 and x1**2, then form the statistic
# as (number of observations) * (R-squared) of that auxiliary regression.
DF['u2'] = uhat**2
aux = sm.OLS.from_formula(formula='u2~x1+I(x1**2)', data=DF).fit()
lm_stat = aux.nobs * aux.rsquared
lm_stat

25.43047687088538

LM 검정통계는 앞에서 구한 값과 동일하다.