<a href="https://colab.research.google.com/github/dajebbar/Diabetes-Classification/blob/main/Multicollinearity_and_VIF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Multicollinearity & VIF (Variance Inflation Factor)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import warnings
# Suppress every warning for the rest of the session; note that this also
# hides deprecation notices from pandas / seaborn.
warnings.filterwarnings('ignore')
# Apply matplotlib's built-in 'fivethirtyeight' style sheet to all later figures.
plt.style.use('fivethirtyeight')
# Render matplotlib figures inline in the notebook output area.
%matplotlib inline

In [2]:
# Load the dataset; the path is resolved relative to the current working directory.
diabetes_df = pd.read_csv('./diabetes.csv')
# Preview the first five rows to confirm the columns were parsed as expected.
diabetes_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## Prepare Data

In [3]:
def zero_check(df):
  """Print, for each column except the last, how many rows hold a value <= 0.

  The last column of ``df`` is skipped. Despite the wording of the printed
  message, the comparison is ``<= 0``, so negative entries are counted as well.

  Args:
    df: pandas DataFrame to inspect; it is not modified.

  Returns:
    None. One line per inspected column is written to stdout.
  """
  for column in df.columns[:-1]:
    # Keep only the rows whose value in this column is non-positive.
    non_positive_rows = df[df[column] <= 0]
    print(f'No of zeros in {column} : {len(non_positive_rows)}')

In [4]:
# Count the non-positive entries per feature on the freshly loaded data.
zero_check(diabetes_df)

No of zeros in Pregnancies : 111
No of zeros in Glucose : 5
No of zeros in BloodPressure : 35
No of zeros in SkinThickness : 227
No of zeros in Insulin : 374
No of zeros in BMI : 11
No of zeros in DiabetesPedigreeFunction : 0
No of zeros in Age : 0


In [5]:
def zero_to_mean(df, columns=None):
  """Replace zero (and negative) entries of selected columns with the column mean.

  For each selected column, negative values are first clamped to 0, the mean of
  the resulting column is printed, and every 0 is then replaced by that mean.
  The mean is taken over the whole column with the zero placeholders still
  included, so it is lower than the mean of the strictly positive values.

  Args:
    df: pandas DataFrame to impute. It is modified in place.
    columns: optional iterable of column names to impute. Defaults to
      ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI'].

  Returns:
    None. One "<column> mean: <value>" line per column is written to stdout.
  """
  if columns is None:
    columns = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
  for column in columns:
    # Treat negative readings the same way as zeros.
    df[column] = np.where(df[column] < 0, 0, df[column])
    # Compute the mean once and reuse it for both the report and the fill value.
    column_mean = df[column].mean()
    print(f'{column} mean: {column_mean}')
    df[column] = df[column].replace(0, column_mean)

In [6]:
# Impute diabetes_df in place (prints the mean used for each imputed column).
zero_to_mean(diabetes_df)
# Blank line to separate the printed means from the zero counts below.
print()
# Re-run the zero count to see the effect of the imputation.
zero_check(diabetes_df)

Glucose mean: 120.89453125
BloodPressure mean: 69.10546875
SkinThickness mean: 20.536458333333332
Insulin mean: 79.79947916666667
BMI mean: 31.992578124999977

No of zeros in Pregnancies : 111
No of zeros in Glucose : 0
No of zeros in BloodPressure : 0
No of zeros in SkinThickness : 0
No of zeros in Insulin : 0
No of zeros in BMI : 0
No of zeros in DiabetesPedigreeFunction : 0
No of zeros in Age : 0


In [7]:
# Summary statistics, transposed so that each column of diabetes_df becomes a row.
diabetes_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Pregnancies,768.0,3.845052,3.369578,0.0,1.0,3.0,6.0,17.0
Glucose,768.0,121.681605,30.436016,44.0,99.75,117.0,140.25,199.0
BloodPressure,768.0,72.254807,12.115932,24.0,64.0,72.0,80.0,122.0
SkinThickness,768.0,26.606479,9.631241,7.0,20.536458,23.0,32.0,99.0
Insulin,768.0,118.660163,93.080358,14.0,79.799479,79.799479,127.25,846.0
BMI,768.0,32.450805,6.875374,18.2,27.5,32.0,36.6,67.1
DiabetesPedigreeFunction,768.0,0.471876,0.331329,0.078,0.24375,0.3725,0.62625,2.42
Age,768.0,33.240885,11.760232,21.0,24.0,29.0,41.0,81.0
Outcome,768.0,0.348958,0.476951,0.0,0.0,0.0,1.0,1.0


## Separate Data

In [8]:
# Feature matrix: every column except the 'Outcome' label.
data = diabetes_df.drop(columns=['Outcome'])
# Label vector: the 'Outcome' column on its own.
target = diabetes_df['Outcome']

# Preview the first rows of the feature matrix.
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148.0,72.0,35.0,79.799479,33.6,0.627,50
1,1,85.0,66.0,29.0,79.799479,26.6,0.351,31
2,8,183.0,64.0,20.536458,79.799479,23.3,0.672,32
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33
