In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, roc_auc_score
import joblib
import json
import os


In [3]:
# Load the T2D workbook (adjust the path if your file is somewhere else),
# then separate the label column from the remaining feature columns.
target_col = "Complication"
df = pd.read_excel("T2D dataset.xlsx")

# The label is cast to int; every other column is kept as a feature.
y = df[target_col].astype(int)
X = df.drop(columns=[target_col])

# Quick sanity check: frame dimensions and label counts.
print("Shape:", df.shape)
print("Target distribution:\n", y.value_counts())


Shape: (7000, 14)
Target distribution:
 Complication
0    3784
1    3216
Name: count, dtype: int64


In [4]:
# Preview the first five rows of the loaded frame (rendered as the cell output).
df.head(n=5)

Unnamed: 0,Age,Sex,BMI,HbA1c,Fasting Glucose,LDL,HDL,Triglycerides,Systolic BP,Diastolic BP,Family History,Diet Quality,Duration Since Diagnosis,Complication
0,68,Female,36.1,8.4,121.2,44.1,25.0,241.6,125,84,1,Poor,18,1
1,81,Male,32.1,7.7,90.2,106.0,35.4,203.2,155,80,1,Average,21,1
2,58,Male,41.8,4.6,85.2,100.7,52.2,179.8,134,76,0,Average,13,0
3,44,Female,20.2,10.6,131.4,147.1,55.0,203.0,117,93,0,Poor,14,0
4,72,Male,25.5,7.9,71.5,96.2,55.3,208.4,173,99,0,Average,15,1


In [5]:
# Partition the feature columns by dtype: anything numpy considers numeric
# goes to the numeric list, everything else to the categorical list.
categorical_features = list(X.select_dtypes(exclude=[np.number]).columns)
numeric_features = list(X.select_dtypes(include=[np.number]).columns)

print("Numeric:", numeric_features)
print("Categorical:", categorical_features)


Numeric: ['Age', 'BMI', 'HbA1c', 'Fasting  Glucose', 'LDL', 'HDL', 'Triglycerides', 'Systolic BP', 'Diastolic BP', 'Family History', 'Duration Since Diagnosis']
Categorical: ['Sex', 'Diet Quality']


In [6]:
# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fitting are ignored at transform time.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Numeric columns: fill gaps with the median, then standardise.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Full model: preprocessing followed by a seeded MLP classifier.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("clf", MLPClassifier(max_iter=1000, random_state=42)),
    ]
)


In [7]:
# Hold out 20% of the rows for evaluation, stratified on the label and seeded
# so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit preprocessing and classifier together on the training split only.
pipeline.fit(X_train, y_train)

# Score the held-out split: column 1 of predict_proba is used as the ranking
# score for ROC AUC, and predict() supplies the labels for the report.
y_proba = pipeline.predict_proba(X_test)[:, 1]
y_pred = pipeline.predict(X_test)

print("ROC AUC:", roc_auc_score(y_true=y_test, y_score=y_proba))
print("Classification report:\n", classification_report(y_true=y_test, y_pred=y_pred))


ROC AUC: 0.9970107919655019
Classification report:
               precision    recall  f1-score   support

           0       0.97      0.97      0.97       757
           1       0.97      0.97      0.97       643

    accuracy                           0.97      1400
   macro avg       0.97      0.97      0.97      1400
weighted avg       0.97      0.97      0.97      1400



In [8]:
# Persist the fitted pipeline (preprocessing + classifier) as a single artifact.
# The output directory is created first; exist_ok makes re-running the cell safe.
os.makedirs(name="models", exist_ok=True)
joblib.dump(value=pipeline, filename="models/mlp_pipeline.joblib")
print("✅ Model saved at models/mlp_pipeline.joblib")


✅ Model saved at models/mlp_pipeline.joblib


In [9]:
# Build a per-column summary of the dataset and write it next to the model:
#   - numeric columns     -> median / min / max as plain floats
#   - categorical columns -> the distinct non-null values, as strings
features_info = {
    name: {
        "median": float(df[name].median()),
        "min": float(df[name].min()),
        "max": float(df[name].max()),
    }
    for name in numeric_features
}

# Categorical entries are appended after the numeric ones.
features_info.update(
    (name, {"values": [str(level) for level in df[name].dropna().unique().tolist()]})
    for name in categorical_features
)

# Written into the "models" directory, which must already exist at this point.
with open("models/features_info.json", "w") as fh:
    fh.write(json.dumps(features_info))

print("✅ Features info saved at models/features_info.json")


✅ Features info saved at models/features_info.json
