In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
!pip install missingno
import missingno as msno
from datetime import date
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, RobustScaler

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.set_option('display.width', 500)



In [50]:
df_ = pd.read_csv("diabetes.csv")
df = df_

In [51]:
#Part 1

In [52]:
#Step 1
df.head(20)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [53]:
df.shape

(768, 9)

In [54]:
# Genel olarak bir yorum yapılmak istenirse, yaş ilerledikçe hastalığın pozitif olması artış göstermekte, glukoz oranı düşük
# olsa bile vücut kitle indexinin fazla olması hastalığın pozitif olmasını etkilemekte gibi duruyor. İstatiksel olarak olmasa
# da bütün verilerin belli bir değer üzerinde olması diğer parametrelerle birlikte iken hastalığı etkilemiş olabilir.

In [55]:
#Step 2

In [56]:
def grab_col_names(dataframe, cat_th=10,car_th=15):
    cat_cols = [col for col in dataframe.columns if dataframe[col].dtypes=="O"]
    num_but_cat = [col for col in dataframe.columns if dataframe[col].nunique()<cat_th and dataframe[col].dtypes!="O"]
    cat_but_car = [col for col in dataframe.columns if dataframe[col].nunique()>cat_th and dataframe[col].dtypes=="O"]
    
    cat_cols = cat_cols + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_car]
    
    #num cols
    num_cols = [col for col in dataframe.columns if dataframe[col].dtypes!="O"]
    num_cols = [col for col in num_cols if col not in num_but_cat]
    
    
    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')
    return cat_cols, num_cols, cat_but_car

In [57]:
#Step 3
cat_cols, num_cols, cat_but_car = grab_col_names(df)

Observations: 768
Variables: 9
cat_cols: 1
num_cols: 8
cat_but_car: 0
num_but_cat: 1


In [58]:
# Değişkenleri inceledğimizde kategorik değişken sayısı 1, numerik değişken sayısı ise 8 tane olduğu gözlenmiştir.

In [59]:
#Step 4

In [60]:
df.groupby("Outcome")[num_cols].mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.304,0.43,31.19
1,4.866,141.257,70.825,22.164,100.336,35.143,0.55,37.067


In [61]:
df.groupby(cat_cols).agg({"Outcome":"mean"})

Unnamed: 0_level_0,Outcome
Outcome,Unnamed: 1_level_1
0,0.0
1,1.0


In [21]:
#Step 5

def outlier_thresholds(dataframe, col_name, q1=0.05, q3=0.99):
    quartile1 = dataframe[col_name].quantile(q1)
    quartile3 = dataframe[col_name].quantile(q3)
    interquantile_range = quartile3 - quartile1
    up_limit = quartile3 + 1.5 * interquantile_range
    low_limit = quartile1 - 1.5 * interquantile_range
    return low_limit, up_limit


def check_outlier(dataframe, col_name):
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    if dataframe[(dataframe[col_name] > up_limit) | (dataframe[col_name] < low_limit)].any(axis=None):
        return True
    else:
        return False

In [22]:
for col in num_cols:    
    print(col, check_outlier(df,col))

Pregnancies False
Glucose False
BloodPressure False
SkinThickness False
Insulin False
BMI False
DiabetesPedigreeFunction False
Age False


In [23]:
#Verinin yapısına fazla karışmamak adına değerleri yukarıda yazan şekilde belirlediğimizde aykırı değer gözükmemektedir.

In [24]:
#Step 6

In [25]:
df.isnull().values.any()

False

In [26]:
#Eksik gözlem bulunmamaktadır.

In [27]:
df.corr().unstack().sort_values()["Outcome"]

BloodPressure              0.065
SkinThickness              0.075
Insulin                    0.131
DiabetesPedigreeFunction   0.174
Pregnancies                0.222
Age                        0.238
BMI                        0.293
Glucose                    0.467
Outcome                    1.000
dtype: float64

In [28]:
#Korelasyon sonucu hastalıkla ilgisi en fazla olan değişken Glucose olarak gözlenmiştir. En az olanı ise BloodPressure.

In [29]:
#Part 2

In [30]:
#Step 1

In [62]:
for col in df.columns:
    df.replace(0,np.nan,inplace=True)
    if col in cat_cols:
        df[col].replace(np.nan,0,inplace=True)       

In [63]:
df.isnull().sum()

Pregnancies                 111
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [33]:
df=pd.get_dummies(df[cat_cols + num_cols],drop_first=True)

In [64]:
#knn ile eksik değerleri dolduralım
from sklearn.impute import KNNImputer

In [65]:
imputer = KNNImputer(n_neighbors=5)
df = pd.DataFrame(imputer.fit_transform(df),columns=df.columns)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.0,169.0,33.6,0.627,50.0,1.0
1,1.0,85.0,66.0,29.0,58.6,26.6,0.351,31.0,0.0
2,8.0,183.0,64.0,25.8,164.6,23.3,0.672,32.0,1.0
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0.0
4,6.2,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1.0


In [66]:
#Step2
df.loc[(df["Age"]>=20) & (df["Age"]<=30),"New_Age_Cat"]="Young"
df.loc[(df["Age"]>30) & (df["Age"]<=50),"New_Age_Cat"]="Mature"
df.loc[(df["Age"]>50),"New_Age_Cat"]="Senior"

In [67]:
df.drop("Age",inplace=True,axis=1)

In [68]:
df.loc[(df["SkinThickness"]>=5) & (df["SkinThickness"]<=25),"New_Skin_Cat"]="Thin"
df.loc[(df["SkinThickness"]>25) & (df["SkinThickness"]<=50),"New_Skin_Cat"]="MediumThick"
df.loc[(df["SkinThickness"]>50),"New_Skin_Cat"]="Thick"

In [69]:
df.drop("SkinThickness",inplace=True,axis=1)

In [70]:
#Step 3
cat_cols, num_cols, cat_but_car = grab_col_names(df)

Observations: 768
Variables: 9
cat_cols: 3
num_cols: 6
cat_but_car: 0
num_but_cat: 1


In [71]:
def one_hot_encoding(dataframe,cat_cols,drop_first=False):
    dataframe = pd.get_dummies(dataframe,columns=cat_cols,drop_first=drop_first)
    return dataframe

In [72]:
ohe_cols = [col for col in df.columns if 10 >= df[col].nunique() > 2]
df = one_hot_encoding(df,ohe_cols,drop_first=True)

In [73]:
#Step 4
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

In [74]:
#Step 5
y = df["Outcome"]
X = df.drop(["Outcome"], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=17)

from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(random_state=46).fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
accuracy_score(y_pred, y_test)

0.7748917748917749