In [64]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [65]:
# Memuat dataset
data = pd.read_csv('diabetes_prediction_dataset.csv')

In [66]:
# Melihat beberapa baris pertama dari dataset
data.head(5).T

Unnamed: 0,0,1,2,3,4
gender,Female,Female,Male,Female,Male
age,80.0,54.0,28.0,36.0,76.0
hypertension,0,0,0,0,1
heart_disease,1,0,0,0,1
smoking_history,never,No Info,never,current,current
bmi,25.19,27.32,27.32,23.45,20.14
HbA1c_level,6.6,6.6,5.7,5.0,4.8
blood_glucose_level,140,80,158,155,155
diabetes,0,0,0,0,0


In [67]:
# Menampilkan bentuk (shape) dari dataset
data.shape

(100000, 9)

In [68]:
# Melihat info dari dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [69]:
# Menghitung jumlah data yang duplikat
data.duplicated().sum()

np.int64(3854)

In [70]:
# Menghapus data duplikat dan mereset indek
# Create by IBNU RICHO
data = data.drop_duplicates(keep="last").reset_index()

In [71]:
# Jumlah duplikat setelah dihapus
data.duplicated().sum()

np.int64(0)

In [72]:
# Menampilkan bentuk (shape) dari dataset setelah penghapusan duplikat
data.shape

(96146, 10)

In [73]:
# Menampilkan nama kolom
data.columns

Index(['index', 'gender', 'age', 'hypertension', 'heart_disease',
       'smoking_history', 'bmi', 'HbA1c_level', 'blood_glucose_level',
       'diabetes'],
      dtype='object')

In [74]:
# Menghapus kolom 'index' yang tidak diperlukan
data = data.drop(['index'],axis = 1)

In [75]:
data.columns

Index(['gender', 'age', 'hypertension', 'heart_disease', 'smoking_history',
       'bmi', 'HbA1c_level', 'blood_glucose_level', 'diabetes'],
      dtype='object')

In [76]:
# Melihat beberapa baris pertama setelah pra-pemrosesan
data.head(5).T

Unnamed: 0,0,1,2,3,4
gender,Female,Male,Female,Male,Female
age,80.0,28.0,36.0,76.0,20.0
hypertension,0,0,0,1,0
heart_disease,1,0,0,1,0
smoking_history,never,never,current,current,never
bmi,25.19,27.32,23.45,20.14,27.32
HbA1c_level,6.6,5.7,5.0,4.8,6.6
blood_glucose_level,140,158,155,155,85
diabetes,0,0,0,0,0


In [77]:
# Pra-pemrosesan data
data['gender'] = data['gender'].replace({'Female': 0, 'Male': 1, 'Other': 2})
data['smoking_history'] = data['smoking_history'].replace({'No Info': 0, 'never': 1, 'former': 2, 'current': 3, 'not current': 4, 'ever':5})

  data['gender'] = data['gender'].replace({'Female': 0, 'Male': 1, 'Other': 2})
  data['smoking_history'] = data['smoking_history'].replace({'No Info': 0, 'never': 1, 'former': 2, 'current': 3, 'not current': 4, 'ever':5})


In [84]:
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80.0,0,1,1,25.19,6.6,140,0
1,1,28.0,0,0,1,27.32,5.7,158,0
2,0,36.0,0,0,3,23.45,5.0,155,0
3,1,76.0,1,1,3,20.14,4.8,155,0
4,0,20.0,0,0,1,27.32,6.6,85,0


In [85]:
data

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80.0,0,1,1,25.19,6.6,140,0
1,1,28.0,0,0,1,27.32,5.7,158,0
2,0,36.0,0,0,3,23.45,5.0,155,0
3,1,76.0,1,1,3,20.14,4.8,155,0
4,0,20.0,0,0,1,27.32,6.6,85,0
...,...,...,...,...,...,...,...,...,...
96141,0,80.0,0,0,0,27.32,6.2,90,0
96142,0,2.0,0,0,0,17.37,6.5,100,0
96143,1,66.0,0,0,2,27.83,5.7,155,0
96144,0,24.0,0,0,1,35.42,4.0,100,0


In [78]:
# Memisahkan variabel prediktor (X) dan target (y)
X = np.array(data.drop(['diabetes'], axis = 1))
y = data['diabetes']

In [79]:
# Menerapkan validasi silang bertingkat (StratifiedKFold)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

# Fungsi untuk mengevaluasi model
def evaluasi_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    akurasi = accuracy_score(y_test, y_pred)
    presisi = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    return akurasi, presisi, recall, f1, cm

# Menyimpan metrik untuk model
jst_metrics_list = []

# Iterasi melalui setiap lipatan dalam validasi silang
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

In [80]:
 # Menerapkan model JST (MLPClassifier)
jst = MLPClassifier(random_state=42, max_iter=1000, verbose=True)

# Pelatihan model
jst.fit(X_train, y_train)

# Melakukan prediksi dan menyimpan metrik
jst_metrics_list.append(evaluasi_model(jst, X_test, y_test))

Iteration 1, loss = 0.48809399
Iteration 2, loss = 0.25644697
Iteration 3, loss = 0.23045669
Iteration 4, loss = 0.20507410
Iteration 5, loss = 0.18492293
Iteration 6, loss = 0.16407295
Iteration 7, loss = 0.15266543
Iteration 8, loss = 0.14235836
Iteration 9, loss = 0.13577128
Iteration 10, loss = 0.13621577
Iteration 11, loss = 0.12926059
Iteration 12, loss = 0.12603184
Iteration 13, loss = 0.12817387
Iteration 14, loss = 0.12139313
Iteration 15, loss = 0.12463631
Iteration 16, loss = 0.12490605
Iteration 17, loss = 0.12097107
Iteration 18, loss = 0.11877357
Iteration 19, loss = 0.12136975
Iteration 20, loss = 0.11870861
Iteration 21, loss = 0.11811821
Iteration 22, loss = 0.11737087
Iteration 23, loss = 0.12019056
Iteration 24, loss = 0.11724562
Iteration 25, loss = 0.11690847
Iteration 26, loss = 0.12035001
Iteration 27, loss = 0.11595858
Iteration 28, loss = 0.11628511
Iteration 29, loss = 0.11656139
Iteration 30, loss = 0.11437752
Iteration 31, loss = 0.11514290
Iteration 32, los

In [81]:
# Fungsi untuk menghitung rata-rata metrik untuk setiap model
def rata_rata_metrik(metrik_list):
    akurasi = np.mean([m[0] for m in metrik_list])
    presisi = np.mean([m[1] for m in metrik_list])
    recall = np.mean([m[2] for m in metrik_list])
    f1 = np.mean([m[3] for m in metrik_list])
    cm = np.mean([m[4] for m in metrik_list], axis=0)
    return akurasi, presisi, recall, f1, cm

# Menghitung rata-rata metrik JST
jst_metrics = rata_rata_metrik(jst_metrics_list)

In [82]:
# Cetak Metrik JST
print("Neural Network (JST):")
print(f"1. Akurasi: {jst_metrics[0]}")
print(f"2. Presisi: {jst_metrics[1]}")
print(f"3. Recall: {jst_metrics[2]}")
print(f"4. F1 Score: {jst_metrics[3]}")

Neural Network (JST):
1. Akurasi: 0.9573538589556896
2. Presisi: 0.9802631578947368
3. Recall: 0.527122641509434
4. F1 Score: 0.6855828220858896


In [83]:
# Menyimpan model yang telah dilatih ke dalam file
filename = 'diabetes-model.sav'
pickle.dump(jst, open(filename, 'wb'))