# In [None]:
# Standard library imports
import os
import warnings

# Third-party imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score, roc_curve, precision_recall_curve
from sklearn.calibration import calibration_curve
import joblib

# Keep console output readable: ignore UserWarning and FutureWarning raised
# from modules whose name starts with "sklearn". Filters are registered in the
# same order as two separate calls would register them.
for _quiet_category in (UserWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=_quiet_category, module="sklearn")
del _quiet_category

# --- Configuration ---
# Input CSV that main() loads.
DATA_FILE_PATH = "SyntheticData_Training.csv"
# Destination file for the fitted pipeline (written with joblib).
MODEL_OUTPUT_PATH = "pediatric_sepsis_mortality_model.joblib"
# Column in the CSV that holds the label to predict.
TARGET_VARIABLE = "inhospital_mortality"

# --- Evaluation Metric Functions ---
def calculate_ece(y_true, y_prob, n_bins=10):
    """
    Calculates the Expected Calibration Error (ECE).

    Predictions are grouped into ``n_bins`` equal-width bins over [0, 1]. Every
    bin is half-open ``[start, end)`` except the last one, which is closed so
    that a probability of exactly 1.0 is counted. For each non-empty bin the
    absolute gap between the mean predicted probability and the observed
    fraction of positives is weighted by the share of samples in that bin, and
    the weighted gaps are summed.

    Args:
        y_true (array-like): True binary labels.
        y_prob (array-like): Predicted probabilities for the positive class.
        n_bins (int): Number of bins to use for calibration (must be >= 1).
    Returns:
        float: The Expected Calibration Error. Empty input yields 0.0.
    Raises:
        ValueError: If ``n_bins`` is smaller than 1, or if ``y_true`` and
            ``y_prob`` do not contain the same number of samples.
    """
    if n_bins < 1:
        # With zero bins the loop below never runs and 0.0 would be returned,
        # which reads as "perfectly calibrated".
        raise ValueError(f"n_bins must be at least 1, got {n_bins}")

    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)

    n_samples = len(y_true)
    if n_samples != len(y_prob):
        # A silent mismatch would pair labels with the wrong predictions.
        raise ValueError(
            f"y_true and y_prob must have the same length, "
            f"got {n_samples} and {len(y_prob)}"
        )
    if n_samples == 0:
        return 0.0

    # No sorting is needed: bin membership is decided by boolean masks, so the
    # order of the samples does not influence the result.
    bin_boundaries = np.linspace(0, 1, n_bins + 1)
    ece = 0.0

    for i in range(n_bins):
        bin_start = bin_boundaries[i]
        bin_end = bin_boundaries[i + 1]

        # Only the last bin includes its upper edge, so 1.0 is not lost.
        if i < n_bins - 1:
            in_bin = (y_prob >= bin_start) & (y_prob < bin_end)
        else:
            in_bin = (y_prob >= bin_start) & (y_prob <= bin_end)

        n_in_bin = np.count_nonzero(in_bin)
        if n_in_bin == 0:
            continue

        avg_confidence_in_bin = np.mean(y_prob[in_bin])  # mean predicted probability
        accuracy_in_bin = np.mean(y_true[in_bin])  # observed fraction of positives

        # Weight the calibration gap by the proportion of samples in the bin.
        ece += np.abs(avg_confidence_in_bin - accuracy_in_bin) * (n_in_bin / n_samples)

    return float(ece)

def calculate_net_benefit(y_true, y_pred_proba, thresholds):
    """
    Computes the Net Benefit of the model at each given probability threshold.

    For a threshold ``pt`` a sample is treated when its predicted probability
    is at least ``pt``, and the value reported is
    ``TP/N - FP/N * pt / (1 - pt)``. This is a simplified, illustrative
    calculation rather than a complete decision curve analysis.

    Args:
        y_true (array-like): True binary labels.
        y_pred_proba (array-like): Predicted probabilities for the positive class.
        thresholds (array-like): Array of probability thresholds to evaluate.
    Returns:
        dict: Maps every threshold to its Net Benefit. When ``y_true`` is empty
        every threshold maps to 0.
    """
    labels = np.asarray(y_true)
    scores = np.asarray(y_pred_proba)

    total = len(labels)
    if total == 0:
        return {t: 0 for t in thresholds}

    results = {}
    for pt in thresholds:
        # 1 where the sample would be treated at this threshold, else 0.
        treated = (scores >= pt).astype(int)
        true_pos = np.sum((treated == 1) & (labels == 1))
        false_pos = np.sum((treated == 1) & (labels == 0))

        if pt == 0.0:
            # Everyone is treated and the false-positive weight pt/(1-pt) is 0,
            # so the result is simply the share of actual positives.
            value = np.sum(labels == 1) / total
        elif pt == 1.0:
            # The weight pt/(1-pt) is unbounded here: any false positive makes
            # the benefit -inf, otherwise only the TP/N term remains.
            value = -np.inf if false_pos > 0 else true_pos / total
        else:
            odds = pt / (1 - pt)
            value = (true_pos / total) - (false_pos / total) * odds

        results[pt] = value

    return results

# --- Main Script ---
def main():
    """
    Main function to run the ML pipeline.

    Steps, in order: load the CSV at DATA_FILE_PATH, separate the target and
    drop the excluded columns, assign the remaining columns to numerical or
    categorical preprocessing, split into stratified train/test sets, fit a
    logistic regression pipeline, print AUC-ROC, AUPRC, ECE and Net Benefit on
    the test set, and save the fitted pipeline to MODEL_OUTPUT_PATH.

    Failures while loading, training or predicting are reported with a printed
    message and end the run early; a failure while saving is only reported.

    Returns:
        None
    """
    print("Starting Pediatric Sepsis Mortality Prediction Model Training...")

    # 1. Load Data
    print(f"\n[1] Loading data from {DATA_FILE_PATH}...")
    if not os.path.exists(DATA_FILE_PATH):
        print(f"ERROR: Data file not found at {DATA_FILE_PATH}. Please ensure it's in the correct location.")
        return

    try:
        data = pd.read_csv(DATA_FILE_PATH)
        print(f"Data loaded successfully. Shape: {data.shape}")
    except Exception as e:
        print(f"Error loading CSV: {e}")
        return

    # 2. Identify Target and Features, and Exclude Variables
    print("\n[2] Identifying target, features, and excluding specified variables...")
    if TARGET_VARIABLE not in data.columns:
        print(f"ERROR: Target variable '{TARGET_VARIABLE}' not found in the dataset.")
        return

    y = data[TARGET_VARIABLE]

    # Variables to exclude based on the data dictionary and challenge instructions
    intervention_vars = [f'admitabx_adm___{i}' for i in range(1, 22)]
    # Both spellings are listed because the data dictionary had a typo ('symtoms_adm___17')
    removed_vars_update = ['cookfuel_adm___8', 'symptoms_adm___17', 'symtoms_adm___17']
    other_excluded = ['studyid_adm', 'lengthadm']  # 'lengthadm' is an outcome, potential leakage

    columns_to_drop = intervention_vars + removed_vars_update + other_excluded

    # Drop only existing columns to avoid KeyErrors
    actual_columns_to_drop = [col for col in columns_to_drop if col in data.columns]
    X = data.drop(columns=[TARGET_VARIABLE] + actual_columns_to_drop, errors='ignore')

    print(f"Target variable: '{TARGET_VARIABLE}'")
    print(f"Number of features before selection: {X.shape[1]}")
    print(f"Dropped columns: {actual_columns_to_drop}")

    # 3. Define Numerical and Categorical Features
    # CRITICAL STEP: These lists are based on the provided data dictionary snippet and common interpretations.
    # They MUST be thoroughly reviewed and validated against the complete data dictionary and dataset.
    print("\n[3] Defining numerical and categorical features...")

    numerical_features = [
        'agecalc_adm', 'height_cm_adm', 'weight_kg_adm', 'muac_mm_adm', 'hr_bpm_adm',
        'rr_brpm_app_adm', 'sysbp_mmhg_adm', 'diasbp_mmhg_adm', 'temp_c_adm',
        'spo2site1_pc_oxi_adm', 'spo2site2_pc_oxi_adm', 'spo2other_adm', 'momage_adm',
        'momagefirstpreg_adm', 'householdsize_adm', 'alivechildren_adm', 'deadchildren_adm',
        'hematocrit_gpdl_adm', 'lactate_mmolpl_adm', 'lactate2_mmolpl_adm',
        'glucose_mmolpl_adm', 'sqi1_perc_oxi_adm', 'sqi2_perc_oxi_adm'
    ]

    categorical_features = [
        'sex_adm', 'spo2onoxy_adm', 'oxygenavail_adm', 'respdistress_adm', 'caprefill_adm',
        'bcseye_adm', 'bcsmotor_adm', 'bcsverbal_adm', 'bcgscar_adm', 'vaccmeasles_adm',
        'vaccmeaslessource_adm', 'vaccpneumoc_adm', 'vaccpneumocsource_adm', 'vaccdpt_adm',
        'vaccdptsource_adm', 'priorweekabx_adm', 'priorweekantimal_adm',
        'symptoms_adm___1', 'symptoms_adm___2', 'symptoms_adm___3', 'symptoms_adm___4',
        'symptoms_adm___5', 'symptoms_adm___6', 'symptoms_adm___7', 'symptoms_adm___8',
        'symptoms_adm___9', 'symptoms_adm___10', 'symptoms_adm___11', 'symptoms_adm___12',
        'symptoms_adm___13', 'symptoms_adm___14', 'symptoms_adm___15', 'symptoms_adm___16',
        'symptoms_adm___18',  # symptoms_adm___17 is excluded
        'comorbidity_adm___1', 'comorbidity_adm___2', 'comorbidity_adm___3', 'comorbidity_adm___4',
        'comorbidity_adm___5', 'comorbidity_adm___6', 'comorbidity_adm___7', 'comorbidity_adm___8',
        'comorbidity_adm___9', 'comorbidity_adm___10', 'comorbidity_adm___11', 'comorbidity_adm___12',
        'priorhosp_adm', 'prioryearwheeze_adm', 'prioryearcough_adm', 'diarrheaoften_adm',
        'tbcontact_adm', 'feedingstatus_adm', 'exclbreastfed_adm', 'nonexclbreastfed_adm',
        'totalbreastfed_adm', 'deliveryloc_adm', 'birthattend_adm', 'duedateknown_adm',
        'birthdetail_adm___1', 'birthdetail_adm___2', 'birthdetail_adm___3',
        'birthdetail_adm___4', 'birthdetail_adm___5', 'birthdetail_adm___6',
        'travelmethod_adm', 'traveldist_adm', 'badhealthduration_adm', 'caregiver_adm_new',
        'caregiverage_adm', 'caregivermarried_adm', 'momalive_adm', 'momageknown_adm',
        'momagefirstpregknown_adm', 'momedu_adm', 'momhiv_adm', 'watersource_adm', 'waterpure_adm',
        'cookfuel_adm___1', 'cookfuel_adm___2', 'cookfuel_adm___3', 'cookfuel_adm___4',
        'cookfuel_adm___5', 'cookfuel_adm___6', 'cookfuel_adm___7',  # cookfuel_adm___8 is excluded
        'cookloc_adm', 'lightfuel_adm', 'tobacco_adm', 'bednet_adm',
        'hctpretransfusion_adm', 'hivstatus_adm', 'malariastatuspos_adm'
    ]

    # Record the problems BEFORE filtering: once the lists are restricted to
    # X's columns they can no longer contain a feature that X lacks, so a
    # check made afterwards would never report a misnamed or absent feature.
    x_columns = set(X.columns)
    absent_from_data = [
        f for f in numerical_features + categorical_features if f not in x_columns
    ]
    # A feature in both lists would be transformed twice by the ColumnTransformer.
    defined_in_both = sorted(set(numerical_features) & set(categorical_features))

    # Filter features to only those present in X
    numerical_features = [f for f in numerical_features if f in x_columns]
    categorical_features = [f for f in categorical_features if f in x_columns]

    all_defined_features = set(numerical_features + categorical_features)
    missing_in_defined = x_columns - all_defined_features

    if missing_in_defined or absent_from_data or defined_in_both:
        print("\nWARNING: Feature set mismatch!")
        if missing_in_defined:
            print(f"  Features in X but not in defined lists (will be dropped by ColumnTransformer if remainder='drop'): {missing_in_defined}")
        if absent_from_data:
            print(f"  Features in defined lists but not in X (were likely dropped or misnamed): {absent_from_data}")
        if defined_in_both:
            print(f"  Features defined as both numerical and categorical: {defined_in_both}")

    print(f"Selected {len(numerical_features)} numerical features.")
    print(f"Selected {len(categorical_features)} categorical features.")
    print(f"Total features for model: {len(numerical_features) + len(categorical_features)}")

    # 4. Create Preprocessing Pipelines
    print("\n[4] Creating preprocessing pipelines...")
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),  # Can also use a constant fill_value
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ],
        remainder='drop'  # Drop any columns not explicitly handled
    )

    # 5. Split Data
    print("\n[5] Splitting data into training and testing sets...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        random_state=42,
        stratify=y  # Important for imbalanced datasets
    )
    print(f"Training set shape: X_train={X_train.shape}, y_train={y_train.shape}")
    print(f"Test set shape: X_test={X_test.shape}, y_test={y_test.shape}")

    # 6. Define and Train Model
    # Using Logistic Regression as a starting point.
    # Consider RandomForestClassifier or GradientBoostingClassifier for potentially better performance.
    print("\n[6] Defining and training Logistic Regression model...")
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(solver='liblinear', random_state=42, class_weight='balanced'))
    ])

    try:
        model_pipeline.fit(X_train, y_train)
        print("Model training complete.")
    except Exception as e:
        # Debugging hint: fitting `preprocessor` alone on X_train helps tell a
        # preprocessing problem (e.g. an all-NaN numerical column or an
        # unexpected dtype in a categorical column) from a classifier problem.
        print(f"Error during model training: {e}")
        return

    # 7. Evaluate Model
    print("\n[7] Evaluating model...")
    try:
        # Every metric below is probability-based, so only the positive-class
        # probabilities are needed.
        y_pred_proba = model_pipeline.predict_proba(X_test)[:, 1]
    except Exception as e:
        print(f"Error during model prediction: {e}")
        return

    # AUC-ROC
    auc_roc = roc_auc_score(y_test, y_pred_proba)
    print(f"  Area Under the ROC Curve (AUC-ROC): {auc_roc:.4f}")

    # AUPRC
    auprc = average_precision_score(y_test, y_pred_proba)
    print(f"  Area Under the Precision-Recall Curve (AUPRC): {auprc:.4f}")

    # Calibration error (ECE)
    ece = calculate_ece(y_test, y_pred_proba)
    print(f"  Estimated Calibration Error (ECE): {ece:.4f}")

    # Net Benefit (Conceptual - requires careful threshold selection and interpretation)
    # Example thresholds; replace with clinically relevant ones for decision making.
    nb_thresholds = np.linspace(0.05, 0.95, 10)
    net_benefits = calculate_net_benefit(y_test, y_pred_proba, nb_thresholds)
    print("  Net Benefit (conceptual):")
    for thresh, nb_val in net_benefits.items():
        print(f"    Threshold {thresh:.2f}: NB = {nb_val:.4f}")
    print("    Note: Net Benefit calculation here is illustrative. Full Decision Curve Analysis is recommended.")

    # 8. Save Trained Model
    print(f"\n[8] Saving trained model to {MODEL_OUTPUT_PATH}...")
    try:
        joblib.dump(model_pipeline, MODEL_OUTPUT_PATH)
        print(f"Model saved successfully to {MODEL_OUTPUT_PATH}")
    except Exception as e:
        print(f"Error saving model: {e}")

    print("\n--- Script Finished ---")

# Start the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()