# OLS와 다중공선성

In [164]:
import pandas as pd 
import numpy as np
import statsmodels.api as sm

# Load the Boston housing data set from the working directory.
boston = pd.read_csv("Boston_house.csv")
# Copy of the frame with the response column ("Target") removed.
boston_data = boston.drop(columns=['Target'])

# Multiple linear regression set-up: nine predictors, Target as the response.
X = boston[[
    'CRIM', 'RM', 'LSTAT', 'B', 'TAX', 'AGE', 'ZN', 'NOX', 'INDUS',
]]
# Double brackets keep Y as a one-column DataFrame rather than a Series.
Y = boston[["Target"]]

In [165]:
# Preview the first three rows of the predictors next to the response column.
pd.concat([X, Y], axis="columns").head(n=3)

Unnamed: 0,CRIM,RM,LSTAT,B,TAX,AGE,ZN,NOX,INDUS,Target
0,0.00632,6.575,4.98,396.9,296,65.2,18.0,0.538,2.31,24.0
1,0.02731,6.421,9.14,396.9,242,78.9,0.0,0.469,7.07,21.6
2,0.02729,7.185,4.03,392.83,242,61.1,0.0,0.469,7.07,34.7


In [166]:
# Fit ordinary least squares of Target on all selected predictors.
# add_constant supplies the intercept column reported as `const` in the summary.
multi_model = sm.OLS(endog=Y, exog=sm.add_constant(X))
fitted_multi_model = multi_model.fit()
fitted_multi_model.summary()

0,1,2,3
Dep. Variable:,Target,R-squared:,0.662
Model:,OLS,Adj. R-squared:,0.656
Method:,Least Squares,F-statistic:,108.1
Date:,"Sat, 10 Feb 2024",Prob (F-statistic):,5.76e-111
Time:,19:08:57,Log-Likelihood:,-1565.5
No. Observations:,506,AIC:,3151.0
Df Residuals:,496,BIC:,3193.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-7.1088,3.828,-1.857,0.064,-14.631,0.413
CRIM,-0.0453,0.036,-1.269,0.205,-0.115,0.025
RM,5.0922,0.458,11.109,0.000,4.192,5.993
LSTAT,-0.5651,0.057,-9.854,0.000,-0.678,-0.452
B,0.0090,0.003,2.952,0.003,0.003,0.015
TAX,-0.0060,0.002,-2.480,0.013,-0.011,-0.001
AGE,0.0236,0.014,1.653,0.099,-0.004,0.052
ZN,0.0294,0.013,2.198,0.028,0.003,0.056
NOX,3.4838,3.833,0.909,0.364,-4.047,11.014

0,1,2,3
Omnibus:,195.49,Durbin-Watson:,0.848
Prob(Omnibus):,0.0,Jarque-Bera (JB):,872.873
Skew:,1.686,Prob(JB):,2.87e-190
Kurtosis:,8.479,Cond. No.,10400.0


## 다중공선성 제거

In [167]:
from statsmodels.stats.outliers_influence import variance_inflation_factor 

# Variance inflation factor of every predictor, one row per feature.
# NOTE(review): the VIFs are computed on X as-is, i.e. without the intercept
# column that the OLS fit adds via add_constant -- confirm this is intended.
vif = pd.DataFrame({
    "VIF Factor": [
        variance_inflation_factor(X.values, pos)
        for pos in range(len(X.columns))
    ],
    "feature": X.columns,
})
vif

Unnamed: 0,VIF Factor,feature
0,1.917332,CRIM
1,46.535369,RM
2,8.844137,LSTAT
3,16.856737,B
4,19.923044,TAX
5,18.457503,AGE
6,2.086502,ZN
7,72.439753,NOX
8,12.642137,INDUS


In [168]:
# Repeatedly drop the predictor with the largest VIF, stopping as soon as the
# largest remaining VIF is no longer above 10.

cols = X.columns.tolist()

while True:
    # Materialise the surviving columns once per pass instead of once per
    # column inside the comprehension.
    remaining = X[cols].values

    # The VIFs are recomputed on every pass because dropping one predictor
    # changes the VIF of the others.
    vif = pd.DataFrame()
    vif["VIF Factor"] = [
        variance_inflation_factor(remaining, i) for i in range(len(cols))
    ]
    vif['feature'] = cols

    # idxmax() returns an index *label*, so it is paired with the label-based
    # .loc indexer rather than the position-based .iloc.
    worst = vif.loc[vif['VIF Factor'].idxmax()]
    factor, feat = worst['VIF Factor'], worst['feature']

    if factor > 10:
        # Log the dropped predictor together with its VIF.
        print(factor, feat)
        cols.remove(feat)
    else:
        break

72.4397532624526 NOX
30.80630113779095 RM
16.233237134468638 TAX
13.259584818500517 AGE


In [169]:
# VIF table restricted to the predictors currently listed in `cols`.
vif = pd.DataFrame({
    "VIF Factor": [
        variance_inflation_factor(X[cols].values, pos)
        for pos, _ in enumerate(cols)
    ],
    "feature": cols,
})
vif

Unnamed: 0,VIF Factor,feature
0,1.620427,CRIM
1,6.515147,LSTAT
2,4.811292,B
3,1.577093,ZN
4,6.224923,INDUS


In [170]:
# Refit OLS using only the predictors listed in `cols`, plus an intercept.
multi_model = sm.OLS(endog=Y, exog=sm.add_constant(X[cols]))
fitted_multi_model = multi_model.fit()
fitted_multi_model.summary()

0,1,2,3
Dep. Variable:,Target,R-squared:,0.554
Model:,OLS,Adj. R-squared:,0.55
Method:,Least Squares,F-statistic:,124.3
Date:,"Sat, 10 Feb 2024",Prob (F-statistic):,2.52e-85
Time:,19:09:00,Log-Likelihood:,-1635.9
No. Observations:,506,AIC:,3284.0
Df Residuals:,500,BIC:,3309.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,31.3280,1.625,19.277,0.000,28.135,34.521
CRIM,-0.0505,0.038,-1.341,0.181,-0.124,0.023
LSTAT,-0.8541,0.051,-16.612,0.000,-0.955,-0.753
B,0.0058,0.003,1.723,0.086,-0.001,0.012
ZN,0.0241,0.014,1.712,0.088,-0.004,0.052
INDUS,-0.0144,0.056,-0.257,0.797,-0.125,0.096

0,1,2,3
Omnibus:,154.534,Durbin-Watson:,0.863
Prob(Omnibus):,0.0,Jarque-Bera (JB):,374.445
Skew:,1.564,Prob(JB):,4.90e-82
Kurtosis:,5.825,Cond. No.,2180.0
