# 6장 최소제곱법을 이용한 가설검정

In [1]:
## https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.t.html#scipy.stats.t
from scipy.stats import t

In [2]:
# 97.5th percentile of the t distribution with 23 degrees of freedom,
# i.e. the cutoff for a two-sided test at the 5% level.
t.ppf(0.975, df=23)

2.0686576104190406

In [3]:
# 95th percentile of the t distribution with 23 degrees of freedom,
# i.e. the cutoff for a one-sided test at the 5% level.
t.ppf(0.95, df=23)

1.7138715277470473

## Housing

In [4]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

# Load the Housing data and fit a log-log regression of price on lot size;
# the last expression renders the statsmodels summary table.
Housing = pd.read_csv('csv/Ecdat/Housing.csv')
ols = smf.ols(
    formula='np.log(price)~np.log(lotsize)',
    data=Housing,
).fit()
ols.summary()

0,1,2,3
Dep. Variable:,np.log(price),R-squared:,0.336
Model:,OLS,Adj. R-squared:,0.335
Method:,Least Squares,F-statistic:,275.8
Date:,"Wed, 11 Feb 2026",Prob (F-statistic):,2.14e-50
Time:,02:23:54,Log-Likelihood:,-122.36
No. Observations:,546,AIC:,248.7
Df Residuals:,544,BIC:,257.3
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,6.4685,0.277,23.374,0.000,5.925,7.012
np.log(lotsize),0.5422,0.033,16.606,0.000,0.478,0.606

0,1,2,3
Omnibus:,0.255,Durbin-Watson:,1.086
Prob(Omnibus):,0.88,Jarque-Bera (JB):,0.333
Skew:,-0.045,Prob(JB):,0.847
Kurtosis:,2.92,Cond. No.,183.0


In [5]:
from scipy.stats import t

# 99.5th percentile of t with 544 degrees of freedom (the residual df of the
# Housing regression): the cutoff for a two-sided test at the 1% level.
t.ppf(0.995, df=544)

2.5848970040670145

## 예제 6.1 가구주 나이와 통신비 지출 비중

In [6]:
import pandas as pd
from statsmodels.api import OLS

# Read the Hcons data set and display its descriptive statistics.
Hcons = pd.read_csv(filepath_or_buffer='csv/loedata/Hcons.csv')
Hcons.describe()

Unnamed: 0,age,comm,rec
count,6723.0,6723.0,6723.0
mean,45.860033,6.841078,5.16253
std,8.23718,3.925046,4.836962
min,30.0,0.0,0.0
25%,39.0,4.261053,2.253281
50%,46.0,6.031846,3.856771
75%,53.0,8.44065,6.514518
max,60.0,37.129649,72.807483


In [7]:
# Simple regression of comm on age using the Hcons data
# (relies on `smf` imported in an earlier cell).
ols = smf.ols(formula='comm~age', data=Hcons).fit()
ols.summary()

0,1,2,3
Dep. Variable:,comm,R-squared:,0.001
Model:,OLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,4.522
Date:,"Wed, 11 Feb 2026",Prob (F-statistic):,0.0335
Time:,02:23:55,Log-Likelihood:,-18730.0
No. Observations:,6723,AIC:,37460.0
Df Residuals:,6721,BIC:,37480.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,6.2744,0.271,23.176,0.000,5.744,6.805
age,0.0124,0.006,2.127,0.033,0.001,0.024

0,1,2,3
Omnibus:,2757.039,Durbin-Watson:,1.844
Prob(Omnibus):,0.0,Jarque-Bera (JB):,15710.71
Skew:,1.887,Prob(JB):,0.0
Kurtosis:,9.468,Cond. No.,264.0


## p값

In [8]:
from scipy.stats import t

# Two-sided p-value for a t statistic of -1.54 with 48 degrees of freedom:
# twice the lower-tail probability.
2 * t.cdf(-1.54, df=48)

0.13012747345659167

In [9]:
# Same two-sided p-value obtained from the upper tail, using the symmetry
# of the t distribution: P(|T| > 1.54) = 2 * (1 - F(1.54)).
# NOTE: t.sf(1.54, 48) evaluates the upper tail directly and avoids the
# round-off from subtracting from 1; the explicit form is kept here on purpose.
2 * (1 - t.cdf(1.54, df=48))

0.1301274734565918

## t(544) 분포의 5% 임계값

In [10]:
# Two-sided 5% critical value of the t(544) distribution
# (its 97.5th percentile).
t.ppf(0.975, df=544)

1.9643343306673329

## 예제 6.2 고령자의 배우자 존재 여부와 삶의 만족도

In [11]:
import pandas as pd
# Read the Klosa data set from the local CSV copy.
Klosa = pd.read_csv(filepath_or_buffer='csv/loedata/Klosa.csv')

In [12]:
# Restrict the sample to rows with working == 0 and age of at least 65.
Klosa1 = Klosa.loc[Klosa['working'].eq(0) & Klosa['age'].ge(65)]

In [13]:
import statsmodels.formula.api as smf
# Formula reused by the following cells: satisfy5 regressed on married.
fm = 'satisfy5~married'
# Fit on the full Klosa1 subsample and render the summary table.
smf.ols(formula=fm, data=Klosa1).fit().summary()

0,1,2,3
Dep. Variable:,satisfy5,R-squared:,0.026
Model:,OLS,Adj. R-squared:,0.025
Method:,Least Squares,F-statistic:,28.49
Date:,"Wed, 11 Feb 2026",Prob (F-statistic):,1.15e-07
Time:,02:23:55,Log-Likelihood:,-4641.8
No. Observations:,1060,AIC:,9288.0
Df Residuals:,1058,BIC:,9297.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,51.5534,0.851,60.562,0.000,49.883,53.224
married,6.3365,1.187,5.337,0.000,4.007,8.666

0,1,2,3
Omnibus:,27.276,Durbin-Watson:,1.739
Prob(Omnibus):,0.0,Jarque-Bera (JB):,29.01
Skew:,-0.402,Prob(JB):,5.02e-07
Kurtosis:,2.904,Cond. No.,2.65


In [14]:
import numpy as np
# Standard deviation of satisfy5, for judging the size of the estimated
# coefficient relative to the spread of the outcome.
# NOTE(review): this uses the full Klosa frame, not the Klosa1 subsample the
# regressions are fitted on, and np.std defaults to ddof=0 (population
# formula) - confirm both are intended.
np.std(Klosa['satisfy5'])

18.522274892616505

In [15]:
# Same regression restricted to the rows of Klosa1 where hlth3 >= 0.
smf.ols(formula=fm, data=Klosa1.loc[Klosa1['hlth3'] >= 0]).fit().summary()

0,1,2,3
Dep. Variable:,satisfy5,R-squared:,0.002
Model:,OLS,Adj. R-squared:,-0.001
Method:,Least Squares,F-statistic:,0.567
Date:,"Wed, 11 Feb 2026",Prob (F-statistic):,0.452
Time:,02:23:55,Log-Likelihood:,-1250.9
No. Observations:,300,AIC:,2506.0
Df Residuals:,298,BIC:,2513.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,62.1186,1.446,42.960,0.000,59.273,64.964
married,1.3978,1.856,0.753,0.452,-2.256,5.051

0,1,2,3
Omnibus:,11.384,Durbin-Watson:,1.656
Prob(Omnibus):,0.003,Jarque-Bera (JB):,12.106
Skew:,-0.484,Prob(JB):,0.00235
Kurtosis:,2.824,Cond. No.,2.95


In [16]:
# Same regression restricted to the rows of Klosa1 where hlth3 < 0.
smf.ols(formula=fm, data=Klosa1.loc[Klosa1['hlth3'] < 0]).fit().summary()

0,1,2,3
Dep. Variable:,satisfy5,R-squared:,0.028
Model:,OLS,Adj. R-squared:,0.026
Method:,Least Squares,F-statistic:,21.54
Date:,"Wed, 11 Feb 2026",Prob (F-statistic):,4.08e-06
Time:,02:23:55,Log-Likelihood:,-3344.5
No. Observations:,760,AIC:,6693.0
Df Residuals:,758,BIC:,6702.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,48.4131,0.991,48.849,0.000,46.468,50.359
married,6.6558,1.434,4.641,0.000,3.841,9.471

0,1,2,3
Omnibus:,11.637,Durbin-Watson:,1.738
Prob(Omnibus):,0.003,Jarque-Bera (JB):,11.918
Skew:,-0.294,Prob(JB):,0.00258
Kurtosis:,2.828,Cond. No.,2.57


## 신뢰구간

In [17]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf

# Refit the log-log Housing regression so this section runs on its own.
Housing = pd.read_csv('csv/Ecdat/Housing.csv')
ols = smf.ols(
    formula='np.log(price)~np.log(lotsize)',
    data=Housing,
).fit()

# References for conf_int:
#   https://stackoverflow.com/questions/44302099/python-statsmodels-ols-confidence-interval
#   https://www.statsmodels.org/dev/generated/statsmodels.regression.linear_model.OLSResults.conf_int.html
# alpha=0.05 requests 95% confidence intervals for each coefficient.
ols.conf_int(alpha=0.05)

Unnamed: 0,0,1
Intercept,5.92492,7.012143
np.log(lotsize),0.478043,0.606315


In [18]:
# 99% confidence intervals (alpha = 0.01) for the same fitted model.
ols.conf_int(alpha=0.01)

Unnamed: 0,0,1
Intercept,5.753185,7.183879
np.log(lotsize),0.457782,0.626576


## 예제 6.3 2012년 한국 상장기업 급여

In [19]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf

# Load the Ksalary data and keep rows with kospi == 1 in the
# 'ElecElectron' sector.
Ksalary = pd.read_csv('csv/loedata/Ksalary.csv')
Ksalary1 = Ksalary.loc[
    Ksalary['kospi'].eq(1) & Ksalary['sector'].eq('ElecElectron')
]
# Regress log average salary on log sales per employee.
ols = smf.ols(
    formula='np.log(avgsal)~np.log(sales/emp)',
    data=Ksalary1,
).fit()
ols.summary()

0,1,2,3
Dep. Variable:,np.log(avgsal),R-squared:,0.101
Model:,OLS,Adj. R-squared:,0.084
Method:,Least Squares,F-statistic:,6.072
Date:,"Wed, 11 Feb 2026",Prob (F-statistic):,0.0169
Time:,02:23:55,Log-Likelihood:,-0.95089
No. Observations:,56,AIC:,5.902
Df Residuals:,54,BIC:,9.952
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3.8047,0.046,82.889,0.000,3.713,3.897
np.log(sales / emp),0.1225,0.050,2.464,0.017,0.023,0.222

0,1,2,3
Omnibus:,0.191,Durbin-Watson:,0.216
Prob(Omnibus):,0.909,Jarque-Bera (JB):,0.337
Skew:,0.119,Prob(JB):,0.845
Kurtosis:,2.704,Cond. No.,2.32


In [20]:
# 99% confidence intervals for the Ksalary regression coefficients.
ols.conf_int(alpha=0.01)

Unnamed: 0,0,1
Intercept,3.682129,3.927238
np.log(sales / emp),-0.010231,0.255249


In [21]:
# 95% confidence intervals for the Ksalary regression coefficients.
ols.conf_int(alpha=0.05)

Unnamed: 0,0,1
Intercept,3.712658,3.896709
np.log(sales / emp),0.022836,0.222183


## 예제 6.4 주택가격

In [22]:
# Price per unit of lot size, computed as a vectorized column division
# instead of a Python-level loop over zipped rows.
Housing['unitprice'] = Housing['price'] / Housing['lotsize']
# log(unitprice) = log(price) - log(lotsize), so the slope on log(lotsize)
# equals the log(price) slope minus one; its t-test therefore checks whether
# the original elasticity equals one.
ols = smf.ols('np.log(unitprice)~np.log(lotsize)', data=Housing).fit()
ols.summary()

0,1,2,3
Dep. Variable:,np.log(unitprice),R-squared:,0.265
Model:,OLS,Adj. R-squared:,0.264
Method:,Least Squares,F-statistic:,196.6
Date:,"Wed, 11 Feb 2026",Prob (F-statistic):,2.36e-38
Time:,02:23:55,Log-Likelihood:,-122.36
No. Observations:,546,AIC:,248.7
Df Residuals:,544,BIC:,257.3
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,6.4685,0.277,23.374,0.000,5.925,7.012
np.log(lotsize),-0.4578,0.033,-14.022,0.000,-0.522,-0.394

0,1,2,3
Omnibus:,0.255,Durbin-Watson:,1.086
Prob(Omnibus):,0.88,Jarque-Bera (JB):,0.333
Skew:,-0.045,Prob(JB):,0.847
Kurtosis:,2.92,Cond. No.,183.0


## 예제 6.5 담배소비의 가격탄력성

In [23]:
import pandas as pd
import statsmodels.formula.api as smf

# Load the CigarettesB data and regress packs on price.
# NOTE(review): the section title treats the slope as a price elasticity,
# which presumes both columns are already in logs - confirm against the
# data set documentation.
data = pd.read_csv('csv/AER/CigarettesB.csv')
smf.ols(formula='packs~price', data=data).fit().summary()

0,1,2,3
Dep. Variable:,packs,R-squared:,0.291
Model:,OLS,Adj. R-squared:,0.275
Method:,Least Squares,F-statistic:,18.08
Date:,"Wed, 11 Feb 2026",Prob (F-statistic):,0.000108
Time:,02:23:55,Log-Likelihood:,19.195
No. Observations:,46,AIC:,-34.39
Df Residuals:,44,BIC:,-30.73
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,5.0941,0.063,81.247,0.000,4.968,5.220
price,-1.1983,0.282,-4.253,0.000,-1.766,-0.630

0,1,2,3
Omnibus:,1.86,Durbin-Watson:,2.307
Prob(Omnibus):,0.395,Jarque-Bera (JB):,1.209
Skew:,-0.389,Prob(JB):,0.546
Kurtosis:,3.164,Cond. No.,12.2


In [24]:
# Adding price to the dependent variable shifts the slope by +1, so the
# reported t statistic on price tests the hypothesis that the original
# slope equals -1.
smf.ols(formula='I(packs+price)~price', data=data).fit().summary()

0,1,2,3
Dep. Variable:,I(packs + price),R-squared:,0.011
Model:,OLS,Adj. R-squared:,-0.011
Method:,Least Squares,F-statistic:,0.4953
Date:,"Wed, 11 Feb 2026",Prob (F-statistic):,0.485
Time:,02:23:55,Log-Likelihood:,19.195
No. Observations:,46,AIC:,-34.39
Df Residuals:,44,BIC:,-30.73
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,5.0941,0.063,81.247,0.000,4.968,5.220
price,-0.1983,0.282,-0.704,0.485,-0.766,0.370

0,1,2,3
Omnibus:,1.86,Durbin-Watson:,2.307
Prob(Omnibus):,0.395,Jarque-Bera (JB):,1.209
Skew:,-0.389,Prob(JB):,0.546
Kurtosis:,3.164,Cond. No.,12.2
