# Model 2

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_recall_curve, average_precision_score, roc_auc_score, roc_curve
import numpy as np

In [2]:
model_df = pd.read_csv("status.csv")
model_df.head()

Unnamed: 0,hypertension,heart_disease,diabetes,problematic,gender,smoking_history_No Info,smoking_history_current,smoking_history_not current,age,bmi,HbA1c_level,blood_glucose_level
0,0,1,0,0,0,0.0,0.0,1.0,1.0,-0.645658,0.127273,-0.454545
1,0,0,0,0,0,1.0,0.0,0.0,0.349349,-0.595938,0.127273,-1.0
2,0,0,0,0,1,0.0,0.0,1.0,-0.301301,-0.595938,-0.2,-0.290909
3,0,0,0,0,0,0.0,1.0,0.0,-0.101101,-0.686275,-0.454545,-0.318182
4,1,1,0,0,1,0.0,1.0,0.0,0.8999,-0.763539,-0.527273,-0.318182


In [3]:
# Define features (X) and target (y)
X = model_df.drop("diabetes",axis=1)
y = model_df["diabetes"]

In [4]:
# Split data into training and test sets
X_train_, X_test, y_train_, y_test = train_test_split(
    X, y, test_size=0.10, random_state=42, stratify=y
)
X_train_.shape,X_test.shape

((89983, 11), (9999, 11))

In [5]:
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train_, y_train_)
X_train.shape, X_test.shape



((164666, 11), (9999, 11))

In [6]:
# Encode target variable using One-Hot Encoding
encoder = OneHotEncoder(sparse_output=False, drop=None)
y_train_encoded = encoder.fit_transform(y_train.to_frame())
y_test_encoded = encoder.transform(y_test.to_frame())
y_train_encoded, y_test_encoded

(array([[1., 0.],
        [1., 0.],
        [1., 0.],
        ...,
        [0., 1.],
        [0., 1.],
        [0., 1.]]),
 array([[1., 0.],
        [1., 0.],
        [1., 0.],
        ...,
        [1., 0.],
        [1., 0.],
        [1., 0.]]))

In [7]:
len(y_train_encoded), len(y_test_encoded),len(X_train), len(X_test)

(164666, 9999, 164666, 9999)

In [None]:
# Build a neural network model 
model = Sequential()
model.add(Dense(256, input_shape=(X_train.shape[1],), activation='tanh'))
model.add(Dense(128, activation='tanh'))
model.add(Dense(128, activation='tanh'))
model.add(Dense(2, activation='softmax'))

# Compile the model with categorical crossentropy loss and Adam optimizer
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.005), metrics=['accuracy',"recall"])
model.summary()

# Train the model with early stopping to prevent overfitting
h = model.fit(
    X_train, y_train_encoded,
    epochs=200,
    batch_size=128,
    callbacks = EarlyStopping(
    monitor='val_loss',   
    patience=15),
    verbose=1,
    validation_data=(X_test, y_test_encoded) 
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2025-08-24 23:06:28.559812: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2025-08-24 23:06:28.559840: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2025-08-24 23:06:28.559846: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2025-08-24 23:06:28.559863: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-08-24 23:06:28.559876: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Epoch 1/200


2025-08-24 23:06:29.000158: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m 433/1287[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m10s[0m 12ms/step - accuracy: 0.8758 - loss: 0.2651 - recall: 0.8758

In [None]:
# Plot training vs validation accuracy and loss across epochs
def LossHistory (history):
    # Accuracy 
    plt.figure(figsize=(8,4))
    plt.plot(history.history['accuracy'], label='Train Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.title('Accuracy per Epoch')
    plt.legend()
    plt.grid(True)
    plt.show()

    # Loss 
    plt.figure(figsize=(8,4))
    plt.plot(history.history['loss'], label='Train Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Loss per Epoch')
    plt.legend()
    plt.grid(True)
    plt.show()

In [None]:
LossHistory(h)

In [None]:
# Evaluate model performance on train set
y_pred_prob_train = model.predict(X_train)
y_pred_train = np.argmax(y_pred_prob_train, axis=1)
y_true_train = np.argmax(y_train_encoded, axis=1)

print(classification_report(y_true_train, y_pred_train))

In [None]:
# Evaluate model performance on test set
y_pred_prob_test = model.predict(X_test)
y_pred_test = np.argmax(y_pred_prob_test, axis=1)
y_true = np.argmax(y_test_encoded, axis=1)

print(classification_report(y_true, y_pred_test))

In [None]:
# Generate confusion matrix to evaluate classification results
cm = confusion_matrix(y_true, y_pred_test)
cm

In [None]:
# AUC calculation
# y_pred_prob[:,1] = predicted probability for class 1(diabetes)
auc = roc_auc_score(y_true, y_pred_prob_test[:, 1])
print("AUC score:", auc)

In [None]:
#Precision Recall Curve
precision, recall, thresholds = precision_recall_curve(y_true, y_pred_prob_test[:, 1])

ap = average_precision_score(y_true, y_pred_prob_test[:, 1])

plt.figure(figsize=(7,5))
plt.plot(recall, precision, label=f'PR curve (AP = {ap:.2f})')

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.grid()
plt.show()

In [None]:
valid_idx = np.where(recall > 0.85)

precision_valid = precision[valid_idx]
recall_valid = recall[valid_idx]
thresholds_valid = thresholds[valid_idx[0]-1]  

best_idx = np.argmax(precision_valid)
best_precision = precision_valid[best_idx]
best_recall = recall_valid[best_idx]
best_threshold = thresholds_valid[best_idx]

print("Best Threshold:", best_threshold)

y_pred_opt = (y_pred_prob_test[:, 1] >= best_threshold).astype(int)


print("\nClassification Report (Optimized Threshold):")
print(classification_report(y_true, y_pred_opt))

cm_opt = confusion_matrix(y_true, y_pred_opt)
print("Confusion Matrix (Optimized Threshold):\n", cm_opt)