# VIF(Variance Inflation Factor)

* 次元削除の方法
* 多重共線性の回避など
* 参考  
https://gochikika.ntt.com/Preprocessing/reduction.html#vif

In [1]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

import matplotlib.pyplot as plt
from sklearn.datasets import load_wine
import pandas as pd

## データの用意

In [2]:
dataset = load_wine()

In [6]:
df = pd.DataFrame(dataset.data, columns = dataset.feature_names)
df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


## VIF
* 一般的に使用されるVIFの閾値は10以上で，10より大きい場合削除される
* これは相関係数が0.95で次元削除を行なっているのと概ね一致する
* VIFで特徴量を削除する場合は一つずつ削除を行う

In [9]:
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
vif.index = df.columns
vif

Unnamed: 0,VIF Factor
alcohol,206.189057
malic_acid,8.925541
ash,165.64037
alcalinity_of_ash,73.141564
magnesium,67.364868
total_phenols,62.786935
flavanoids,35.535602
nonflavanoid_phenols,16.636708
proanthocyanins,17.115665
color_intensity,17.022272


## 再起的にVIFを実行

In [10]:
while True:
    df_vif = df[vif.index].copy()
    vif["VIF Factor"] = [variance_inflation_factor(df_vif.values, i) for i in range(df_vif.shape[1])]
    if(vif["VIF Factor"].max(axis = 0) > 10):
        vif = vif.drop(vif["VIF Factor"].idxmax(axis = 0), axis = 0)
    else:
        break
vif

Unnamed: 0,VIF Factor
malic_acid,5.815558
flavanoids,2.824526
nonflavanoid_phenols,6.758139
color_intensity,5.69387
