<a href="https://colab.research.google.com/github/eysaritas/ILK-REPOSITORY/blob/main/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE #Dengesiz veri setini düzenler.

In [None]:
# Read the diabetes health-indicators CSV (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="diabetes_binary_health_indicators_BRFSS2015.csv")
# Print (rows, columns) as a quick sanity check on the load.
print(f"Size of dataset : {df.shape}")

Size of dataset : (265389, 22)


In [None]:
# Plain-text preview of the first five rows.
print(df.head(n=5))

   Diabetes_binary  HighBP  HighChol  CholCheck   BMI  Smoker  Stroke  \
0              0.0     1.0       1.0        1.0  40.0     1.0     0.0   
1              0.0     0.0       0.0        0.0  25.0     1.0     0.0   
2              0.0     1.0       1.0        1.0  28.0     0.0     0.0   
3              0.0     1.0       0.0        1.0  27.0     0.0     0.0   
4              0.0     1.0       1.0        1.0  24.0     0.0     0.0   

   HeartDiseaseorAttack  PhysActivity  Fruits  ...  AnyHealthcare  \
0                   0.0           0.0     0.0  ...            1.0   
1                   0.0           1.0     0.0  ...            0.0   
2                   0.0           0.0     1.0  ...            1.0   
3                   0.0           1.0     1.0  ...            1.0   
4                   0.0           1.0     1.0  ...            1.0   

   NoDocbcCost  GenHlth  MentHlth  PhysHlth  DiffWalk  Sex   Age  Education  \
0          0.0      5.0      18.0      15.0       1.0  0.0   9.0   

## **2 - Data Preprocessing**

### 2.1 - Delete duplicate rows



In [None]:
# Remember the row count before de-duplication so the difference can be reported.
initial_rows = len(df)
# Keep the first occurrence of every fully identical row.
df = df.drop_duplicates()
print(f"Number of deleted lines: {initial_rows - df.shape[0]}")

Number of deleted lines: 35914


### 2.2 - Handling Missing Data

In [None]:
# Fill missing *feature* values with the column mean.
# The target column is kept out of the imputer on purpose: mean-imputing a
# missing label would produce a fractional value that is not a valid class.
target_col = 'Diabetes_binary'
feature_cols = [col for col in df.columns if col != target_col]

# A row without a label cannot be used for supervised training, so it is
# dropped instead of being given an invented label.
df_labeled = df.dropna(subset=[target_col])

imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(
    imputer.fit_transform(df_labeled[feature_cols]),
    columns=feature_cols,
)

# Re-attach the untouched labels (as float, matching the imputer's output dtype)
# and restore the original column order.
df_imputed[target_col] = df_labeled[target_col].to_numpy(dtype=float)
df_imputed = df_imputed[list(df.columns)]

### 2.3 - Checking for Data Imbalance

In [None]:
# Relative frequency of each class in the target column.
# A model that always predicts 0 would already reach about 85% accuracy,
# i.e. the dataset is imbalanced. This is corrected with SMOTE later on.
target_share = df_imputed['Diabetes_binary'].value_counts(normalize=True)
print(target_share)

Diabetes_binary
0.0    0.847051
1.0    0.152949
Name: proportion, dtype: float64


## **3 - Preparation of Training and Test Data**

In [None]:
# Separate the target label (y) from the predictor columns (X).
y = df_imputed['Diabetes_binary']
X = df_imputed.drop(columns='Diabetes_binary')

print(f"Size of X: {X.shape}")
print(f"Size of y: {y.shape}")

Size of X: (229475, 21)
Size of y: (229475,)


### 3.1 - Data Standardization

In [None]:
# Standardize every feature column with StandardScaler.
# NOTE(review): the scaler is fitted on the full feature matrix here, i.e.
# before any train/test split has been made.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

### 3.2 - Train-Test Split (Train 80%, Test 20%)

In [None]:
# Split the *unscaled* features so that the held-out 20% plays no part in
# estimating the scaling statistics. `stratify=y` keeps the class ratio equal
# in both parts; the fixed seed makes the split reproducible.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training portion only, then apply the same
# transformation to both parts. Fitting on all rows would leak test-set
# means/variances into training and make the test score optimistic.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

### 3.3 - Data Balancing with SMOTE (class 0: 50%, class 1: 50%)

In [None]:
# Oversample the training split with SMOTE (seeded for reproducibility);
# the test split is not passed to the sampler.
smote = SMOTE(random_state=42)
resampled_pair = smote.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled_pair
# Report the new training-set size and the per-class counts.
print(f"After SMOTE training data: {X_train_resampled.shape}")
print(f"Class distribution after SMOTE: \n{pd.Series(y_train_resampled).value_counts()}")

After SMOTE training data: (311004, 21)
Class distribution after SMOTE: 
Diabetes_binary
0.0    155502
1.0    155502
Name: count, dtype: int64


## **3 - MLP Model Architecture and Optimization**

###### MLP Architecture:
###### Input Layer: 21 Features
###### Hidden Layers: selected by the grid search below from two candidates — two layers of 50 neurons each, `(50, 50)`, or a single layer of 100 neurons, `(100,)`
###### Output Layer: 1 Neuron (Sigmoid/Logistic)

In [None]:
# Base MLP estimator: fixed seed for reproducible weight initialisation,
# at most 200 training iterations.
mlp = MLPClassifier(max_iter=200, random_state=42)

## 3.1 - Hyperparameter Optimization (Grid Search)

In [None]:
# Hyperparameter search space; each key maps to the list of values to try.
param_grid = {}
# Two candidate architectures: two hidden layers of 50 units, or one of 100.
param_grid['hidden_layer_sizes'] = [(50, 50), (100,)]
# Only a single activation function (ReLU) is searched.
param_grid['activation'] = ['relu']
param_grid['solver'] = ['adam']
# L2 regularization strength (guards against overfitting).
param_grid['alpha'] = [0.0001]
param_grid['learning_rate'] = ['constant']

In [None]:
# Grid search over `param_grid`: 2-fold cross-validation, scored by F1,
# using all available CPU cores, with verbose progress logging.
# NOTE(review): the folds are cut from the SMOTE-resampled training data, so
# validation folds also contain synthetic samples; the CV scores may therefore
# be optimistic compared with the untouched test split.
grid_search = GridSearchCV(
    estimator=mlp,
    param_grid=param_grid,
    scoring='f1',
    cv=2,
    n_jobs=-1,
    verbose=3,
)
grid_search.fit(X_train_resampled, y_train_resampled)

Fitting 2 folds for each of 2 candidates, totalling 4 fits


In [None]:
# Keep the best estimator found by the search and report its hyperparameters.
best_model = grid_search.best_estimator_
print(f"\nEn İyi Parametreler: {grid_search.best_params_}")