In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

In [4]:
# Load the raw encounter records.
df = pd.read_csv("data/diabetic_data.csv")

# Treat the '?' placeholder as a missing value so pandas' NaN tooling can see it.
df = df.replace('?', np.nan)

# Single source of truth for the target: three classes, not a binary label.
# The filter and the integer encoding below both derive from this mapping, so
# the two can never drift apart (a label that is kept by the filter but absent
# from the mapping would silently become NaN in `Series.map`).
READMITTED_CODES = {
    "NO": 0,
    "<30": 1,
    ">30": 2
}

# Keep only rows whose 'readmitted' label is one of the three known classes
# (this also removes rows where the label itself is missing).
df = df[df['readmitted'].isin(list(READMITTED_CODES))]

# Drop columns in which 40% or more of the values are missing.
missing_threshold = 0.4
df = df.loc[:, df.isnull().mean() < missing_threshold]

# Drop every row that still has a missing value in any of the remaining columns.
df = df.dropna()

# Drop the identifier columns; errors='ignore' tolerates them being absent already.
df = df.drop(['encounter_id', 'patient_nbr'], axis=1, errors='ignore')

# One-hot encode every object-typed feature column (the target is excluded);
# drop_first=True omits one indicator column per encoded feature.
categorical_cols = df.select_dtypes(include='object').columns.drop('readmitted')
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Encode the 3-class target as integers: NO -> 0, <30 -> 1, >30 -> 2.
df['readmitted'] = df['readmitted'].map(READMITTED_CODES)

print(f"Dataset shape after cleaning: {df.shape}")

Dataset shape after cleaning: (59129, 2163)


In [5]:
# Separate the target column from the feature matrix.
y = df["readmitted"]
X = df.drop(columns="readmitted")

In [6]:
# Stage 1: hold out 20% of the rows as the final test set.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=13,
)

# Stage 2: split the remaining 80% into training and validation sets.
# 0.25 of the remainder is 20% of the full data, giving a 60/20/20 partition.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp,
    test_size=0.25,
    random_state=13,
)

In [7]:
# Fit the scaler on the training split only, so the scaling statistics are not
# influenced by validation or test rows.
scaler = StandardScaler().fit(X_train)

# Apply the training-set scaling to each split.
X_tr_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_te_scaled = scaler.transform(X_test)

In [None]:
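# Experiment log (not executed): hyperparameter settings tried so far, each followed by the accuracies recorded for it.
# best_hyperparams = {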
#     'hidden_layer_sizes': (32,),
#     'activation': 'relu',
#     'solver': 'sgd',
#     'alpha': 0.1,  # stronger L2 regularization
#     'learning_rate': 'constant',
#     'learning_rate_init': 0.01,
#     'max_iter': 200,
#     'n_iter_no_change': 20
# }
# Training accuracy:   0.9678
# Validation accuracy: 0.6764
# Test accuracy:       0.6838

# best_hyperparams = {
#     'hidden_layer_sizes': (64, 32),
#     'activation': 'relu',
#     'solver': 'sgd',
#     'alpha': 0.01,
#     'learning_rate': 'invscaling',
#     'learning_rate_init': 0.01,
#     'max_iter': 200,
#     'n_iter_no_change': 20
# }
# Training accuracy:   0.7704
# Validation accuracy: 0.7559
# Test accuracy:       0.7660

# best_hyperparams = {
#     'hidden_layer_sizes': (64,),
#     'activation': 'relu',
#     'solver': 'adam',
#     'alpha': 0.001,
#     'learning_rate_init': 0.001,
#     'max_iter': 200,
#     'n_iter_no_change': 20
# }
# Training accuracy:   0.9959
# Validation accuracy: 0.6799
# Test accuracy:       0.6889

# {
#     'hidden_layer_sizes': (128, 64),
#     'activation': 'tanh',
#     'solver': 'sgd',
#     'alpha': 0.1,
#     'learning_rate': 'constant',
#     'learning_rate_init': 0.01,
#     'max_iter': 300,
#     'n_iter_no_change': 20
# }
# Training accuracy:   0.9955
# Validation accuracy: 0.6835
# Test accuracy:       0.6988

# {
#     'hidden_layer_sizes': (64,),
#     'activation': 'relu',
#     'solver': 'sgd',
#     'alpha': 0.01,
#     'learning_rate': 'constant',
#     'learning_rate_init': 0.001,
#     'max_iter': 500,
#     'n_iter_no_change': 30
# }
# Training accuracy:   0.9459
# Validation accuracy: 0.4837
# Test accuracy:       0.4836




In [11]:
# Keyword arguments unpacked into MLPClassifier in the training cell.
best_hyperparams = dict(
    hidden_layer_sizes=(64,),   # a single hidden layer of 64 units
    activation='relu',
    solver='sgd',
    alpha=0.01,
    learning_rate='constant',
    learning_rate_init=0.001,
    max_iter=200,
    n_iter_no_change=30,
)




In [12]:
# `seed` is not assigned in any earlier cell shown in this notebook, so this
# cell would raise NameError on Restart & Run All. Define it here, reusing the
# value the split cell passes as random_state.
seed = 13

# Train the network on the standardized training split.
mlp = MLPClassifier(**best_hyperparams, random_state=seed, verbose=False)
mlp.fit(X_tr_scaled, y_train)

# Accuracy on each split; a large gap between training and validation
# accuracy indicates that the model is overfitting.
train_acc = mlp.score(X_tr_scaled, y_train)
val_acc = mlp.score(X_val_scaled, y_val)
test_acc = mlp.score(X_te_scaled, y_test)

print(f"Training accuracy:   {train_acc:.4f}")
print(f"Validation accuracy: {val_acc:.4f}")
print(f"Test accuracy:       {test_acc:.4f}")

# Per-class metrics and confusion matrix on the held-out test split.
y_pred = mlp.predict(X_te_scaled)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))




Training accuracy:   0.8873
Validation accuracy: 0.5013
Test accuracy:       0.4968

Classification Report:
              precision    recall  f1-score   support

           0       0.58      0.64      0.61      6259
           1       0.18      0.11      0.13      1282
           2       0.42      0.41      0.41      4285

    accuracy                           0.50     11826
   macro avg       0.39      0.38      0.38     11826
weighted avg       0.48      0.50      0.49     11826

Confusion Matrix:
[[3990  323 1946]
 [ 639  137  506]
 [2221  316 1748]]
