In [2]:
# Libraries
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_auc_score, average_precision_score, brier_score_loss
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [3]:
# -----------------------------
# Step 1: Load Dataset
# -----------------------------
data_path = "https://raw.githubusercontent.com/Kamaleswaran-Lab/The-2024-Pediatric-Sepsis-Challenge/refs/heads/main/SyntheticData_Training.csv"  # update this if needed
df = pd.read_csv(data_path, index_col=0)


In [4]:
# -----------------------------
# Step 2: Target and Feature Setup
# -----------------------------
target = "inhospital_mortality"

# Drop intervention and removed variables
intervention_vars = [f"admitabx_adm___{i}" for i in range(1, 22)]
removed_vars = ["cookfuel_adm___8", "symptoms_adm___17"]
drop_vars = intervention_vars + removed_vars

df = df.drop(columns=[var for var in drop_vars if var in df.columns])

In [5]:
# -----------------------------
# Step 3: Split Features
# -----------------------------
y = df[target]
X = df.drop(columns=[target, "studyid_adm"] if "studyid_adm" in df.columns else [target])

# Identify numeric and categorical columns
num_cols = X.select_dtypes(include=["float64", "int64"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object", "category", "bool", "int32"]).columns.difference(num_cols).tolist()


In [7]:
# -----------------------------
# Step 4: Build Preprocessing Pipeline
# -----------------------------
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols)
    ]
)

In [8]:


# -----------------------------
# Step 5: Define Model Pipeline
# -----------------------------
base_model = GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, max_depth=3, random_state=42)
calibrated_model = CalibratedClassifierCV(base_model, method='isotonic', cv=5)

model_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", calibrated_model)
])

In [9]:
# -----------------------------
# Step 6: Train-Test Split and Train
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
model_pipeline.fit(X_train, y_train)

In [10]:
# -----------------------------
# Step 7: Evaluation Metrics
# -----------------------------
y_probs = model_pipeline.predict_proba(X_test)[:, 1]
y_preds = model_pipeline.predict(X_test)

roc_auc = roc_auc_score(y_test, y_probs)
auprc = average_precision_score(y_test, y_probs)
ece = brier_score_loss(y_test, y_probs)  # approximation of ECE

print(f"AUC-ROC: {roc_auc:.4f}")
print(f"AUPRC: {auprc:.4f}")
print(f"Estimated Calibration Error (ECE): {ece:.4f}")

AUC-ROC: 0.7331
AUPRC: 0.2005
Estimated Calibration Error (ECE): 0.0404


In [None]:
# -----------------------------
# Step 8: Save the Model
# -----------------------------
joblib.dump(model_pipeline, "pediatric_sepsis_model.joblib")

In [11]:
# -----------------------------
# Optional: Calibration Plot
# -----------------------------
from sklearn.calibration import calibration_curve

prob_true, prob_pred = calibration_curve(y_test, y_probs, n_bins=10)
plt.figure(figsize=(6, 6))
plt.plot(prob_pred, prob_true, marker='o', label='Model')
plt.plot([0, 1], [0, 1], linestyle='--', label='Perfectly Calibrated')
plt.xlabel('Predicted Probability')
plt.ylabel('True Probability')
plt.title('Calibration Plot')
plt.legend()
plt.grid()
plt.tight_layout()
plt.savefig("calibration_plot.png")
plt.close()