In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Read the raw data and turn the '?' placeholder into real missing values
df = pd.read_csv("data/diabetic_data.csv").replace('?', np.nan)

# Keep rows whose readmission label is one of the three expected classes
# (all three are retained; this only removes rows with any other label)
expected_labels = ['<30', '>30', "NO"]
df = df[df['readmitted'].isin(expected_labels)]

# Keep only the columns whose fraction of missing values is below the threshold
missing_threshold = 0.4
missing_fraction = df.isnull().mean()
df = df.loc[:, missing_fraction < missing_threshold]

# Drop every row that still has a missing value in the remaining columns,
# then remove the identifier columns (ignored if they are absent)
df = df.dropna().drop(['encounter_id', 'patient_nbr'], axis=1, errors='ignore')

# One-hot encode every object-typed column except the target
categorical_cols = df.select_dtypes(include='object').columns.drop('readmitted')
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Three-class integer target: 0 = "NO", 1 = "<30", 2 = ">30"
df['readmitted'] = df['readmitted'].map({"NO": 0, "<30": 1, ">30": 2})

print(f"Dataset shape after cleaning: {df.shape}")

Dataset shape after cleaning: (59129, 2163)


In [3]:
# Separate the target column from the feature matrix
y = df["readmitted"]
X = df.drop(columns=["readmitted"])

In [4]:
# Hold out 20% of the rows as the test set
X_temp, X_test, y_temp, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=13,
)

# Split the remaining 80% into training and validation sets.
# 25% of the remainder is 20% of all rows, giving a 60/20/20 split overall.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.25,
    random_state=13,
)

In [7]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to the training, validation and test features.
scaler = StandardScaler().fit(X_train)
X_tr_scaled, X_val_scaled, X_te_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

In [None]:
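# Notes from earlier manual trials: each commented-out configuration below is
# followed by the accuracies recorded for it.
# best_hyperparams = {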
#     'hidden_layer_sizes': (32,),
#     'activation': 'relu',
#     'solver': 'sgd',
#     'alpha': 0.1,  # stronger L2 regularization
#     'learning_rate': 'constant',
#     'learning_rate_init': 0.01,
#     'max_iter': 200,
#     'n_iter_no_change': 20
# }
# Training accuracy:   0.9678
# Validation accuracy: 0.6764
# Test accuracy:       0.6838

# best_hyperparams = {
#     'hidden_layer_sizes': (64, 32),
#     'activation': 'relu',
#     'solver': 'sgd',
#     'alpha': 0.01,
#     'learning_rate': 'invscaling',
#     'learning_rate_init': 0.01,
#     'max_iter': 200,
#     'n_iter_no_change': 20
# }
# Training accuracy:   0.7704
# Validation accuracy: 0.7559
# Test accuracy:       0.7660

# best_hyperparams = {
#     'hidden_layer_sizes': (64,),
#     'activation': 'relu',
#     'solver': 'adam',
#     'alpha': 0.001,
#     'learning_rate_init': 0.001,
#     'max_iter': 200,
#     'n_iter_no_change': 20
# }
# Training accuracy:   0.9959
# Validation accuracy: 0.6799
# Test accuracy:       0.6889

# {
#     'hidden_layer_sizes': (128, 64),
#     'activation': 'tanh',
#     'solver': 'sgd',
#     'alpha': 0.1,
#     'learning_rate': 'constant',
#     'learning_rate_init': 0.01,
#     'max_iter': 300,
#     'n_iter_no_change': 20
# }
# Training accuracy:   0.9955
# Validation accuracy: 0.6835
# Test accuracy:       0.6988

# {
#     'hidden_layer_sizes': (64,),
#     'activation': 'relu',
#     'solver': 'sgd',
#     'alpha': 0.01,
#     'learning_rate': 'constant',
#     'learning_rate_init': 0.001,
#     'max_iter': 500,
#     'n_iter_no_change': 30
# }
# Training accuracy:   0.9459
# Validation accuracy: 0.4837
# Test accuracy:       0.4836




In [11]:
# Keyword arguments that are unpacked into MLPClassifier when the model is built
best_hyperparams = dict(
    hidden_layer_sizes=(64,),
    activation='relu',
    solver='sgd',
    alpha=0.01,
    learning_rate='constant',
    learning_rate_init=0.001,
    max_iter=200,
    n_iter_no_change=30,
)




In [12]:
# Train the MLP on the standardized training features.
# random_state uses the literal 13 that the other cells use; the previous
# `seed` name is not defined in any visible cell, so the cell raised a
# NameError on a fresh kernel.
mlp = MLPClassifier(**best_hyperparams, random_state=13, verbose=False)
mlp.fit(X_tr_scaled, y_train)

# Step 4: Evaluate on train, val, and test sets (mean accuracy on each split,
# using the features scaled with the scaler fitted on the training split)
train_acc = mlp.score(X_tr_scaled, y_train)
val_acc = mlp.score(X_val_scaled, y_val)
test_acc = mlp.score(X_te_scaled, y_test)

print(f"Training accuracy:   {train_acc:.4f}")
print(f"Validation accuracy: {val_acc:.4f}")
print(f"Test accuracy:       {test_acc:.4f}")

# Step 5: Classification report and confusion matrix on the test split
y_pred = mlp.predict(X_te_scaled)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))




Training accuracy:   0.8873
Validation accuracy: 0.5013
Test accuracy:       0.4968

Classification Report:
              precision    recall  f1-score   support

           0       0.58      0.64      0.61      6259
           1       0.18      0.11      0.13      1282
           2       0.42      0.41      0.41      4285

    accuracy                           0.50     11826
   macro avg       0.39      0.38      0.38     11826
weighted avg       0.48      0.50      0.49     11826

Confusion Matrix:
[[3990  323 1946]
 [ 639  137  506]
 [2221  316 1748]]


In [5]:
# MLPClassifier and accuracy_score are already imported in the first cell;
# only `product` is new here.
from itertools import product

# Parameter grid
hidden_layer_sizes = [(64,), (64, 32), (128, 64, 32)]
alphas = [0.01, 0.1]
learning_rates_init = [0.001, 0.01]
# NOTE(review): scikit-learn documents `learning_rate` as only used when
# solver='sgd', so with solver='adam' below this setting has no effect.
learning_rate = 'adaptive'

# Best validation accuracy seen so far and the parameter tuple that produced it
best_val_acc = 0
best_params = None

# Every (hidden_layer_sizes, alpha, learning_rate_init) combination
combs = list(product(hidden_layer_sizes, alphas, learning_rates_init))

print(f"Starting manual grid search... in {len(combs)}\n")

# Loop through all combinations. enumerate supplies the 1-based iteration
# number; the previous manual counter was never incremented, so every
# iteration was reported as "Iteration 1/N".
for iteration, (hls, alpha, lr_init) in enumerate(combs, start=1):
    print(f"Iteration {iteration}/{len(combs)}")
    clf = MLPClassifier(
        hidden_layer_sizes=hls,
        activation='relu',
        solver='adam',
        alpha=alpha,
        learning_rate=learning_rate,
        learning_rate_init=lr_init,
        max_iter=500,
        n_iter_no_change=30,
        random_state=13
    )

    # NOTE(review): the search fits on the unscaled features, while another
    # cell trains on the standardized ones — confirm which is intended.
    clf.fit(X_train, y_train)
    val_preds = clf.predict(X_val)
    val_acc = accuracy_score(y_val, val_preds)

    print(f"Params: hls={hls}, alpha={alpha}, lr_init={lr_init} --> Val Acc: {val_acc:.4f}")

    # Keep the combination with the strictly highest validation accuracy
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_params = (hls, alpha, lr_init)

print("\nBest parameters:")
print(f"hidden_layer_sizes={best_params[0]}, alpha={best_params[1]}, learning_rate_init={best_params[2]}")
print(f"Validation Accuracy: {best_val_acc:.4f}")

Starting manual grid search... in 12

Iteration 1/12




Params: hls=(64,), alpha=0.01, lr_init=0.001 --> Val Acc: 0.4696
Iteration 1/12
Params: hls=(64,), alpha=0.01, lr_init=0.01 --> Val Acc: 0.5772
Iteration 1/12
Params: hls=(64,), alpha=0.1, lr_init=0.001 --> Val Acc: 0.5538
Iteration 1/12
Params: hls=(64,), alpha=0.1, lr_init=0.01 --> Val Acc: 0.5785
Iteration 1/12
Params: hls=(64, 32), alpha=0.01, lr_init=0.001 --> Val Acc: 0.4747
Iteration 1/12
Params: hls=(64, 32), alpha=0.01, lr_init=0.01 --> Val Acc: 0.5830
Iteration 1/12


MemoryError: Unable to allocate 585. MiB for an array with shape (2162, 35477) and data type float64

In [22]:
# Accuracies recorded for the configuration below:
# Training accuracy:   0.6042
# Validation accuracy: 0.5830
# Test accuracy:       0.5740

# Keyword arguments that are unpacked into MLPClassifier when the model is built
best_hyperparams = dict(
    hidden_layer_sizes=(64, 32),
    activation='relu',
    solver='adam',
    alpha=0.01,
    learning_rate='adaptive',
    learning_rate_init=0.01,
    max_iter=500,
    n_iter_no_change=30,
)




In [23]:
# Build the classifier from the chosen hyperparameters with a fixed seed and
# fit it on the unscaled training features. The fit call is left as the
# cell's final expression so the fitted estimator is still displayed.
mlp = MLPClassifier(random_state=13, verbose=False, **best_hyperparams)
mlp.fit(X_train, y_train)



In [24]:
# Step 4: mean accuracy of the fitted model on the train, validation and
# test splits (unscaled features, matching how the model was fitted)
train_acc, val_acc, test_acc = (
    mlp.score(features, labels)
    for features, labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

print(f"Training accuracy:   {train_acc:.4f}")
print(f"Validation accuracy: {val_acc:.4f}")
print(f"Test accuracy:       {test_acc:.4f}")

# Step 5: per-class metrics and confusion matrix for the test split
y_pred = mlp.predict(X_test)
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

Training accuracy:   0.6042
Validation accuracy: 0.5830
Test accuracy:       0.5740

Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.77      0.69      6259
           1       0.43      0.02      0.04      1282
           2       0.49      0.46      0.47      4285

    accuracy                           0.57     11826
   macro avg       0.51      0.42      0.40     11826
weighted avg       0.55      0.57      0.54     11826

Confusion Matrix:
[[4793    5 1461]
 [ 633   25  624]
 [2287   28 1970]]
