In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, \
    classification_report
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import warnings

warnings.simplefilter(action="ignore")

In [2]:
df = pd.read_csv("diabetes.csv")

In [3]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
df.shape

(768, 9)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
Pregnancies                 768 non-null int64
Glucose                     768 non-null int64
BloodPressure               768 non-null int64
SkinThickness               768 non-null int64
Insulin                     768 non-null int64
BMI                         768 non-null float64
DiabetesPedigreeFunction    768 non-null float64
Age                         768 non-null int64
Outcome                     768 non-null int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [6]:
df.describe([.01,.05,.25,.50,.75,.99]).T

Unnamed: 0,count,mean,std,min,1%,5%,25%,50%,75%,99%,max
Pregnancies,768.0,3.845052,3.369578,0.0,0.0,0.0,1.0,3.0,6.0,13.0,17.0
Glucose,768.0,120.894531,31.972618,0.0,57.0,79.0,99.0,117.0,140.25,196.0,199.0
BloodPressure,768.0,69.105469,19.355807,0.0,0.0,38.7,62.0,72.0,80.0,106.0,122.0
SkinThickness,768.0,20.536458,15.952218,0.0,0.0,0.0,0.0,23.0,32.0,51.33,99.0
Insulin,768.0,79.799479,115.244002,0.0,0.0,0.0,0.0,30.5,127.25,519.9,846.0
BMI,768.0,31.992578,7.88416,0.0,0.0,21.8,27.3,32.0,36.6,50.759,67.1
DiabetesPedigreeFunction,768.0,0.471876,0.331329,0.078,0.09468,0.14035,0.24375,0.3725,0.62625,1.69833,2.42
Age,768.0,33.240885,11.760232,21.0,21.0,21.0,24.0,29.0,41.0,67.0,81.0
Outcome,768.0,0.348958,0.476951,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0


In [7]:
df["Outcome"].value_counts() / len(df) * 100

0    65.104167
1    34.895833
Name: Outcome, dtype: float64

In [8]:
# Aşağıda yer alan kolonlarımızda 0 olan değerlerin 0 olması bilimsel olarak mümkün olmamaktadır bu yüzden 0 olan değerleri NaN'a çeviriyoruz
df[["Glucose","BloodPressure","SkinThickness","Insulin","BMI"]] = df[["Glucose","BloodPressure","SkinThickness","Insulin","BMI"]].replace(0,np.NaN)

In [9]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,,33.6,0.627,50,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [10]:
df.isnull().sum()

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [11]:
mis_value = ["Glucose","BloodPressure","SkinThickness","Insulin","BMI"]

In [12]:
# eksik değerleri hedef değişken kırılımında ortalama ile dolduruyoruz.
for i in mis_value:
    df[i][(df[i].isnull()) & (df["Outcome"] == 0)] = df[i][(df[i].isnull()) & (df["Outcome"] == 0)].fillna(df[i][df["Outcome"] == 0].mean())
    df[i][(df[i].isnull()) & (df["Outcome"] == 1)] = df[i][(df[i].isnull()) & (df["Outcome"] == 1)].fillna(df[i][df["Outcome"] == 1].mean())

In [13]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,206.846154,33.6,0.627,50,1
1,1,85.0,66.0,29.0,130.287879,26.6,0.351,31,0
2,8,183.0,64.0,33.0,206.846154,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [14]:
def outlier_thresholds(dataframe,variable):
    q1 = dataframe[variable].quantile(0.25)
    q3 = dataframe[variable].quantile(0.75)
    IQR = q3 - q1
    low_limit = q1 - 1.5 * IQR
    up_limit = q3 + 1.5 * IQR
    return low_limit,up_limit

In [15]:
def has_outliers(dataframe, variable):
    low_limit, up_limit = outlier_thresholds(dataframe, variable)
    if dataframe[(dataframe[variable] < low_limit) | (dataframe[variable] > up_limit)].any(axis=None):
        print(variable, "yes")

In [16]:
# Aykırı gözlemler var mı?
for col in df.columns:
    has_outliers(df,col)

Pregnancies yes
BloodPressure yes
SkinThickness yes
Insulin yes
BMI yes
DiabetesPedigreeFunction yes
Age yes


In [17]:
outlier_columns = [col for col in df.columns if col not in "Outcome"]

In [18]:
outlier_columns

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age']

In [19]:
for col in df.columns:
    outlier_thresholds(df,col)

In [19]:
df.describe([0.01,0.25,0.50,0.75,0.99]).T

Unnamed: 0,count,mean,std,min,1%,25%,50%,75%,99%,max
Pregnancies,768.0,3.845052,3.369578,0.0,0.0,1.0,3.0,6.0,13.0,17.0
Glucose,768.0,121.697358,30.462008,44.0,67.67,99.75,117.0,141.0,196.0,199.0
BloodPressure,768.0,72.428141,12.106044,24.0,44.0,64.0,72.0,80.0,106.0,122.0
SkinThickness,768.0,29.247042,8.923908,7.0,10.0,25.0,28.0,33.0,51.33,99.0
Insulin,768.0,157.003527,88.860914,14.0,24.34,121.5,130.287879,206.846154,519.9,846.0
BMI,768.0,32.44642,6.87897,18.2,19.5,27.5,32.05,36.6,50.759,67.1
DiabetesPedigreeFunction,768.0,0.471876,0.331329,0.078,0.09468,0.24375,0.3725,0.62625,1.69833,2.42
Age,768.0,33.240885,11.760232,21.0,21.0,24.0,29.0,41.0,67.0,81.0
Outcome,768.0,0.348958,0.476951,0.0,0.0,0.0,0.0,1.0,1.0,1.0


In [20]:
def replace_with_thresholds(dataframe,variable):
    low_limit,up_limit = outlier_thresholds(dataframe,variable)
    dataframe.loc[(dataframe[variable] < low_limit),variable] = low_limit
    dataframe.loc[(dataframe[variable] > up_limit),variable] = up_limit

In [21]:
# Aykırı değerleri baskılıyoruz.
for col in outlier_columns:
    replace_with_thresholds(df,col)

## Model

In [22]:
y = df["Outcome"]
X = df.drop("Outcome",axis = 1)

In [23]:
models = [('LR', LogisticRegression()),
          ('KNN', KNeighborsClassifier()),
          ('CART', DecisionTreeClassifier()),
          ('RF', RandomForestClassifier()),
          ('SVM', SVC(gamma='auto')),
          ('XGB', GradientBoostingClassifier()),
          ("LightGBM", LGBMClassifier())]

In [24]:
results = []
names = []

In [25]:
for name, model in models:
    kfold = KFold(n_splits=10, random_state=123456)
    cv_results = cross_val_score(model, X, y, cv=10, scoring="accuracy")
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

LR: 0.806015 (0.043905)
KNN: 0.856733 (0.032526)
CART: 0.854238 (0.031433)
RF: 0.886722 (0.033536)
SVM: 0.651059 (0.003418)
XGB: 0.898496 (0.028776)
LightGBM: 0.889388 (0.029024)


- Kfold cross validation kullanarak farklı sınıflandırma modellerimizin başarılarını ölçtük. Gradient Boosting Classifier en iyi tahmin başarısını elde eden modelimiz oldu.
- En kötü modelimiz SVC olmuştur.