In [50]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import missingno as msno
from datetime import date
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, RobustScaler


In [51]:
def load(path="diabetes.csv"):
    """Read the diabetes dataset from a CSV file.

    Parameters
    ----------
    path : str or path-like, default "diabetes.csv"
        Location of the CSV file. The default keeps the original behaviour
        of reading ``diabetes.csv`` from the current working directory.

    Returns
    -------
    pandas.DataFrame
        A freshly parsed frame; every call re-reads the file.
    """
    return pd.read_csv(path)

# Read the raw data into the notebook-wide frame and summarise its schema
# (dtypes, non-null counts, memory footprint).
df = load()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [52]:
# Peek at the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


Getting to know the numerical and categorical variables

In [53]:
def grab_col_names(dataframe, car_th=20, cat_th=10):
    """Split the columns of ``dataframe`` into numerical and categorical names.

    A column counts as categorical when its dtype is not numeric (object,
    string, category, ...) or when it is numeric but holds fewer than
    ``cat_th`` distinct values. Every other numeric column is numerical.

    The dtype test uses ``pd.api.types.is_numeric_dtype`` rather than a
    comparison against ``'O'``, so ``category`` columns and pandas string
    dtypes are no longer mistaken for numeric ones.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are classified.
    car_th : int, default 20
        Accepted for backward compatibility; this function does not use it.
    cat_th : int, default 10
        A numeric column with fewer distinct values than this is reported
        as categorical.

    Returns
    -------
    num_cols : list
        Names of the numerical columns, in frame order.
    cat_cols : list
        Names of the non-numeric columns (frame order) followed by the
        low-cardinality numeric columns (frame order).

    Notes
    -----
    Prints the number of rows, the number of columns and the size of both
    lists as a side effect.
    """
    numeric, non_numeric = [], []
    for col in dataframe.columns:
        if pd.api.types.is_numeric_dtype(dataframe[col].dtype):
            numeric.append(col)
        else:
            non_numeric.append(col)

    # Numeric columns with only a handful of distinct values (e.g. a 0/1
    # target) behave like categories.
    num_but_cat = [col for col in numeric if dataframe[col].nunique() < cat_th]
    num_cols = [col for col in numeric if col not in num_but_cat]
    cat_cols = non_numeric + num_but_cat

    print(f"Observation: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f"num_cols: {len(num_cols)}")
    print(f"cat_cols: {len(cat_cols)}")
    return num_cols, cat_cols

# Classify the columns once; later cells reuse both lists.
num_cols, cat_cols = grab_col_names(df)


Observation: 768
Variables: 9
num_cols: 8
cat_cols: 1


In [54]:
# Mean of every numerical feature per Outcome class, printed as one small
# table per feature. The grouping is built once and reused.
by_outcome = df.groupby("Outcome")
for col in num_cols:
    print(by_outcome[[col]].mean())


         Pregnancies
Outcome             
0           3.298000
1           4.865672
            Glucose
Outcome            
0        109.980000
1        141.257463
         BloodPressure
Outcome               
0            68.184000
1            70.824627
         SkinThickness
Outcome               
0            19.664000
1            22.164179
            Insulin
Outcome            
0         68.792000
1        100.335821
               BMI
Outcome           
0        30.304200
1        35.142537
         DiabetesPedigreeFunction
Outcome                          
0                        0.429734
1                        0.550500
               Age
Outcome           
0        31.190000
1        37.067164


In [55]:
def outlier_threshold(dataframe, col_name, q1=0.25, q3=0.75):
    """Return the inter-quantile-range fences used to flag outliers.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the column.
    col_name : str
        Column whose quantiles are evaluated.
    q1, q3 : float, default 0.25 and 0.75
        Lower and upper quantile levels.

    Returns
    -------
    tuple
        ``(low_limit, up_limit)``: the lower quantile minus, and the upper
        quantile plus, 1.5 times the distance between the two quantiles.
    """
    column = dataframe[col_name]
    lower_q = column.quantile(q1)
    upper_q = column.quantile(q3)
    whisker = 1.5 * (upper_q - lower_q)
    return lower_q - whisker, upper_q + whisker

# Try the helper on a single feature and show both fences.
low_limit, up_limit = outlier_threshold(df, "DiabetesPedigreeFunction")
print(low_limit, up_limit)

-0.32999999999999996 1.2


In [56]:
def check_outlier(dataframe, col_name):
    """Report whether ``col_name`` has any value outside its outlier fences.

    The fences come from ``outlier_threshold`` with its default quantiles.
    The decision is made on the boolean outlier mask itself. Testing the
    truthiness of the selected rows instead would report False whenever the
    outlier rows hold only zeros.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the column.
    col_name : str
        Column to inspect.

    Returns
    -------
    bool
        True when at least one value is below the lower or above the upper
        fence, otherwise False.
    """
    low_limit, up_limit = outlier_threshold(dataframe, col_name)
    values = dataframe[col_name]
    is_outlier = (values < low_limit) | (values > up_limit)
    return bool(is_outlier.any())
    
# Report, for every column of the frame (target included), whether it has
# at least one flagged value.
for col in df:
    print(f'columns:{col}', check_outlier(df, col))

columns:Pregnancies True
columns:Glucose True
columns:BloodPressure True
columns:SkinThickness True
columns:Insulin True
columns:BMI True
columns:DiabetesPedigreeFunction True
columns:Age True
columns:Outcome False


In [57]:
def missing_values(dataframe):
    """Print a missing-value summary and return the affected column names.

    The printed table has one row per column that contains at least one
    null, with the null count (``n_miss``) and the share of rows it
    represents in percent, rounded to two decimals (``ratio``). Both
    columns are sorted in descending order.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    list
        Names of the columns holding at least one null, in frame order.
    """
    na_cols = [
        name for name in dataframe.columns
        if dataframe[name].isnull().sum() > 0
    ]
    null_counts = dataframe[na_cols].isnull().sum()
    n_miss = null_counts.sort_values(ascending=False)
    share = null_counts / dataframe.shape[0] * 100
    ratio = share.sort_values(ascending=False)
    missing_df = pd.concat(
        [n_miss, np.round(ratio, 2)],
        axis=1,
        keys=["n_miss", "ratio"],
    )
    print(missing_df)
    return na_cols
# Summarise nulls in the raw frame; the returned list of affected columns
# is displayed as the cell result.
missing_values(df)

Empty DataFrame
Columns: [n_miss, ratio]
Index: []


[]

In [58]:
# The notebook treats a 0 in these columns as a placeholder for a missing
# measurement and turns it into NaN so the missing-value tooling can see it.
zero_as_missing = [
    "Glucose", "BloodPressure", "SkinThickness", "Insulin",
    "DiabetesPedigreeFunction", "BMI", "Age",
]
# Cast to float before inserting NaN: writing NaN into an int64 column through
# ``.loc`` depends on silent upcasting, which pandas >= 2.1 deprecates.
# The result is the same float columns, and the cell can be re-run safely.
df[zero_as_missing] = df[zero_as_missing].astype(float).replace(0, np.nan)

df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,,33.6,0.627,50.0,1
1,1,85.0,66.0,29.0,,26.6,0.351,31.0,0
2,8,183.0,64.0,,,23.3,0.672,32.0,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1


In [59]:
# How many rows there are per distinct age (the mean column simply repeats
# the group key).
df.groupby("Age").agg({"Age": ["count", "mean"]})

Unnamed: 0_level_0,Age,Age
Unnamed: 0_level_1,count,mean
Age,Unnamed: 1_level_2,Unnamed: 2_level_2
21.0,63,21.0
22.0,72,22.0
23.0,38,23.0
24.0,46,24.0
25.0,48,25.0
26.0,33,26.0
27.0,32,27.0
28.0,35,28.0
29.0,29,29.0
30.0,21,30.0


In [64]:
# Bucket rows by age: [21, 51) -> "Mature", 51 and above -> "Senior".
# Rows matching neither rule are left without a label.
is_mature = df["Age"].ge(21) & df["Age"].lt(51)
is_senior = df["Age"].ge(51)
df.loc[is_mature, "NEW_AGE"] = "Mature"
df.loc[is_senior, "NEW_AGE"] = "Senior"
df.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,NEW_AGE
0,6,148.0,72.0,35.0,,33.6,0.627,50.0,1,Mature
1,1,85.0,66.0,29.0,,26.6,0.351,31.0,0,Mature
2,8,183.0,64.0,,,23.3,0.672,32.0,1,Mature
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0,Mature
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1,Mature
5,5,116.0,74.0,,,25.6,0.201,30.0,0,Mature
6,3,78.0,50.0,32.0,88.0,31.0,0.248,26.0,1,Mature
7,10,115.0,,,,35.3,0.134,29.0,0,Mature
8,2,197.0,70.0,45.0,543.0,30.5,0.158,53.0,1,Senior
9,8,125.0,96.0,,,,0.232,54.0,1,Senior


In [69]:
# Learn the label <-> integer mapping for the age buckets. The original cell
# also computed the first ten encoded values, but that result was neither the
# cell's last expression nor stored, so it was silently thrown away; fitting
# alone leaves the encoder in the same state.
le = LabelEncoder()
le.fit(df["NEW_AGE"])
# Show which label each integer code stands for.
le.inverse_transform([0, 1])

array(['Mature', 'Senior'], dtype=object)

In [71]:
# Standardise Age on its own and keep the result next to the raw column.
ss = StandardScaler()
age_scaled = ss.fit_transform(df[["Age"]])  # 2-D array with one column
df["Age_standard_Scaler"] = age_scaled.ravel()
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,NEW_AGE,Age_standard_Scaler
0,6,148.0,72.0,35.0,,33.6,0.627,50.0,1,Mature,1.425995
1,1,85.0,66.0,29.0,,26.6,0.351,31.0,0,Mature,-0.190672
2,8,183.0,64.0,,,23.3,0.672,32.0,1,Mature,-0.105584
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0,Mature,-1.041549
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1,Mature,-0.020496


In [73]:
# Standardise every numerical feature in place. This re-fits the shared `ss`
# scaler, replacing the Age-only fit from the previous cell, and overwrites
# the raw values in `df`.
scaled_values = ss.fit_transform(df[num_cols])
df[num_cols] = scaled_values
df[num_cols].head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.639947,0.862287,-0.032746,0.558557,,0.165097,0.468492,1.425995
1,-0.844885,-1.202229,-0.517645,-0.014657,,-0.846404,-0.365061,-0.190672
2,1.23388,2.009241,-0.679278,,,-1.323254,0.604397,-0.105584
3,-0.844885,-1.071148,-0.517645,-0.587871,-0.518847,-0.629654,-0.920763,-1.041549
4,-1.141852,0.501816,-2.618874,0.558557,0.104968,1.537847,5.484909,-0.020496
