In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer


In [2]:
# https://www.kaggle.com/datasets/uciml/pima-indians-diabetes-database
df=pd.read_csv("diabetes.csv")

In [3]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [5]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [6]:
df["Insulin"].value_counts()

Insulin
0      374
105     11
130      9
140      9
120      8
      ... 
73       1
171      1
255      1
52       1
112      1
Name: count, Length: 186, dtype: int64

In [7]:
df["BloodPressure"].value_counts()

BloodPressure
70     57
74     52
78     45
68     45
72     44
64     43
80     40
76     39
60     37
0      35
62     34
66     30
82     30
88     25
84     23
90     22
86     21
58     21
50     13
56     12
52     11
54     11
75      8
92      8
65      7
85      6
94      6
48      5
96      4
44      4
100     3
106     3
98      3
110     3
55      2
108     2
104     2
46      2
30      2
122     1
95      1
102     1
61      1
24      1
38      1
40      1
114     1
Name: count, dtype: int64

In [8]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [9]:
columns_to_check=['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin','BMI']
for col in columns_to_check:
    zero_count=(df[col]==0).sum()
    zero_percentage=100*zero_count/len(df)
    print(f"{col}: {zero_count} (%{zero_percentage:.2f})")

Glucose: 5 (%0.65)
BloodPressure: 35 (%4.56)
SkinThickness: 227 (%29.56)
Insulin: 374 (%48.70)
BMI: 11 (%1.43)


In [10]:
X=df.drop(["Outcome"], axis=1) 
y=df["Outcome"]

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=15)

In [12]:
X_train.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
count,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0
mean,3.907166,120.607492,69.446254,20.114007,79.625407,31.92215,0.469948,33.285016
std,3.385438,31.904793,19.240036,15.789295,111.225898,7.999904,0.328516,11.678337
min,0.0,0.0,0.0,0.0,0.0,0.0,0.084,21.0
25%,1.0,99.25,64.0,0.0,0.0,27.4,0.23825,24.0
50%,3.0,117.0,72.0,23.0,34.0,32.0,0.3705,29.0
75%,6.0,139.75,80.0,32.0,130.0,36.5,0.63075,40.0
max,17.0,199.0,122.0,63.0,680.0,67.1,2.42,81.0


In [13]:

all_features = X.columns.tolist()
preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')), 
    ('scaler', StandardScaler())                    
])
full_preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', preprocessor, all_features)
    ],
    remainder='passthrough'
)

In [14]:
dt_base = DecisionTreeClassifier(random_state=42) # AdaBoost'un temel tahmincisi

models_to_run = {
    "AdaBoost": {
        "estimator": AdaBoostClassifier(estimator=dt_base, algorithm='SAMME', random_state=42), # algorithm='SAMME' eklendi
        "param_grid": {
            "classifier__n_estimators": [50, 100, 200], 
            "classifier__learning_rate": [0.01, 0.1, 1], 
        }
    },
    
    "DecisionTreeClassifier": {
        "estimator": dt_base,
        "param_grid": {
            "classifier__criterion": ["gini", "entropy"], 
            "classifier__max_depth": [5, 10, None], 
            "classifier__max_features": ["sqrt", None]
        }
    },
    
    "GradientBoosting": {
        "estimator": GradientBoostingClassifier(random_state=42),
        "param_grid": {
            "classifier__loss": ['log_loss', 'exponential'],
            "classifier__learning_rate": [0.05, 0.1], 
            "classifier__n_estimators": [100, 200], 
            "classifier__max_depth": [3, 5], 
            "classifier__subsample": [0.8, 1.0]
        }
    },
    
    "KNeighborsClassifier": {
        "estimator": KNeighborsClassifier(),
        "param_grid": {
            "classifier__n_neighbors": [3, 7, 11, 15], 
            "classifier__weights": ['uniform', 'distance'], 
            "classifier__p": [1, 2]
        }
    },
    
    "GaussianNB": {
        "estimator": GaussianNB(),
        "param_grid": {
            "classifier__var_smoothing": np.logspace(-11, -7, num=5) # Hassas tuning için logspace kullanıldı
        }
    }
}

In [15]:
class_names = ['No Diabetes (0)', 'Diabetes (1)']

for name, model_data in models_to_run.items():
    
    pipeline = Pipeline(steps=[('preprocessor', full_preprocessor), ('classifier', model_data["estimator"])])
    
    grid_search = GridSearchCV(
        estimator=pipeline, 
        param_grid=model_data["param_grid"], 
        cv=5, 
        scoring='accuracy',
        n_jobs=-1 
    )

    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    
    print(f"\n\n{'='*20} {name} RESULT {'='*20}")
    print(f"Best Parametters: {grid_search.best_params_}")
    print(f"Best CV Scores: {grid_search.best_score_:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=class_names, zero_division=0))
    print("Accuracy Score:")
    print(accuracy_score(y_test, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))



En İyi Parametreler: {'classifier__learning_rate': 0.01, 'classifier__n_estimators': 50}
En İyi CV Skoru: 0.7247

Classification Report:
                 precision    recall  f1-score   support

No Diabetes (0)       0.80      0.73      0.76       108
   Diabetes (1)       0.47      0.57      0.51        46

       accuracy                           0.68       154
      macro avg       0.64      0.65      0.64       154
   weighted avg       0.70      0.68      0.69       154

Accuracy Score:
0.6818181818181818

Confusion Matrix:
[[79 29]
 [20 26]]


En İyi Parametreler: {'classifier__criterion': 'gini', 'classifier__max_depth': 5, 'classifier__max_features': None}
En İyi CV Skoru: 0.7476

Classification Report:
                 precision    recall  f1-score   support

No Diabetes (0)       0.81      0.78      0.79       108
   Diabetes (1)       0.52      0.57      0.54        46

       accuracy                           0.71       154
      macro avg       0.66      0.67      0.67