In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor


In [2]:
df = pd.read_csv("car-mpg.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   mpg       398 non-null    float64
 1   cyl       398 non-null    int64  
 2   disp      398 non-null    float64
 3   hp        398 non-null    object 
 4   wt        398 non-null    int64  
 5   acc       398 non-null    float64
 6   yr        398 non-null    int64  
 7   origin    398 non-null    int64  
 8   car_type  398 non-null    int64  
 9   car_name  398 non-null    object 
dtypes: float64(3), int64(5), object(2)
memory usage: 31.2+ KB


In [3]:
###############################
df = df.drop("car_name", axis=1)
df['hp'] = df.iloc[:,[2]].replace("?",np.nan).astype('float')
df.fillna(df.mean(),inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   mpg       398 non-null    float64
 1   cyl       398 non-null    int64  
 2   disp      398 non-null    float64
 3   hp        398 non-null    float64
 4   wt        398 non-null    int64  
 5   acc       398 non-null    float64
 6   yr        398 non-null    int64  
 7   origin    398 non-null    int64  
 8   car_type  398 non-null    int64  
dtypes: float64(4), int64(5)
memory usage: 28.1 KB


In [4]:
###############################
df_x = df.iloc[:,1:]
df_y = df.iloc[:,[0]]
df_x.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
cyl,398.0,5.454774,1.701004,3.0,4.0,4.0,8.0,8.0
disp,398.0,193.425879,104.269838,68.0,104.25,148.5,262.0,455.0
hp,398.0,193.425879,104.269838,68.0,104.25,148.5,262.0,455.0
wt,398.0,2970.424623,846.841774,1613.0,2223.75,2803.5,3608.0,5140.0
acc,398.0,15.56809,2.757689,8.0,13.825,15.5,17.175,24.8
yr,398.0,76.01005,3.697627,70.0,73.0,76.0,79.0,82.0
origin,398.0,1.572864,0.802055,1.0,1.0,1.0,2.0,3.0
car_type,398.0,0.530151,0.499718,0.0,0.0,1.0,1.0,1.0


In [5]:
###############################
vif = pd.DataFrame()
vif["Variable"] = df_x.columns
vif["VIF"] = [variance_inflation_factor(df_x.values, i) for i in range(df_x.shape[1])]
# Print the VIF results
print(vif)

   Variable         VIF
0       cyl  173.168442
1      disp         inf
2        hp         inf
3        wt  114.047172
4       acc   50.511790
5        yr  167.203601
6    origin    8.173920
7  car_type   11.557821


  vif = 1. / (1. - r_squared_i)


In [6]:

###############################
## Alternate method:
# Add a constant term (intercept) to the independent variables
X = sm.add_constant(df_x)

# Fit an OLS (Ordinary Least Squares) model to estimate the coefficients
model = sm.OLS(df_y.to_numpy(), X.to_numpy()).fit()

# Calculate VIF for each independent variable
vif = pd.DataFrame()
vif["Variable"] = X.columns
vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

# Print the VIF results
print(vif)

   Variable         VIF
0     const  734.395349
1       cyl   17.618785
2      disp         inf
3        hp         inf
4        wt    8.554710
5       acc    1.663006
6        yr    1.184529
7    origin    1.720472
8  car_type    6.506512


  vif = 1. / (1. - r_squared_i)
