This notebook creates the final datasets from the preprocessed dataset. It splits the data into train, val and test sets and applies various sampling methods for the train set: no_sampling, upsampling and downsampling

Config

In [12]:
INPUT_FILE_NAME = "after-vectorization"
OUTPUT_FOLDER_NAME = "final_datasets"
TEST_SIZE = 0.2
RANDOM_STATE = 42
VAL_SIZE = 0.125
SAMPLING_MINORITY_RATIO = 0.3

## Setup

Imports

In [13]:
import sys
import pandas as pd
from sklearn.model_selection import train_test_split
import os
from sklearn.utils import resample
from pathlib import Path
import numpy as np
import random
import json
from sklearn.preprocessing import StandardScaler

Load the dataset

In [14]:
def find_project_root(marker=".gitignore"):
    """
    walk up from the current working directory until a directory containing the
    specified marker (e.g., .gitignore) is found.
    """
    current = Path.cwd()
    for parent in [current] + list(current.parents):
        if (parent / marker).exists():
            return parent.resolve()
    raise FileNotFoundError(
        f"Project root marker '{marker}' not found starting from {current}"
    )

In [15]:
project_root = find_project_root()
print("Project root:", project_root)
if project_root not in sys.path:
    sys.path.append(project_root)

df_path = f"{project_root}/dataset/{INPUT_FILE_NAME}.parquet"
try:
    df = pd.read_parquet(df_path)
except Exception as e:
    sys.exit(f"Error loading dataset from {df_path}: {e}")

Project root: C:\Users\Administrator\Desktop\ds\dl-sepsis-prediction


In [16]:
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

## Helper Functions

In [17]:
def over_under_sample(df, method="oversample", minority_ratio=0.3):
    """
    Balances the dataset at the patient level.

    Each patient's overall sepsis label is taken as the maximum value
    (if any record shows sepsis, the patient is marked as septic).

    We then either oversample the septic (minority) patients or undersample
    the non-septic (majority) patients to change the ratio.

    In the final dataset, each copy of a patient gets a unique ID so that
    oversampled patients appear as separate instances.
    """
    # Create a patient-level summary with one record per patient.
    patient_df = df.groupby("patient_id")["SepsisLabel"].max().reset_index()

    # Count patients in each group.
    counts = patient_df["SepsisLabel"].value_counts()
    majority_class = counts.idxmax()
    minority_class = counts.idxmin()

    # Split the patients into majority and minority groups.
    majority_patients = patient_df[patient_df["SepsisLabel"] == majority_class]
    minority_patients = patient_df[patient_df["SepsisLabel"] == minority_class]

    # Resample based on the chosen method.
    if method == "oversample":
        # Duplicate minority patients to reach desired ratio.
        n_desired_minority = int(
            (minority_ratio * len(majority_patients)) / (1 - minority_ratio)
        )
        minority_upsampled = resample(
            minority_patients,
            replace=True,
            n_samples=n_desired_minority,
            random_state=RANDOM_STATE,
        )
        balanced_patient_df = pd.concat([majority_patients, minority_upsampled])
    elif method == "undersample":
        # Remove some majority patients to reach desired ratio.
        n_desired_majority = int(
            ((1 - minority_ratio) / minority_ratio) * len(minority_patients)
        )
        majority_downsampled = resample(
            majority_patients,
            replace=False,
            n_samples=n_desired_majority,
            random_state=RANDOM_STATE,
        )
        balanced_patient_df = pd.concat([majority_downsampled, minority_patients])
    else:
        raise ValueError("Method must be 'oversample' or 'undersample'")

    # Rebuild the full dataset with patient records.
    # If a patient appears more than once due to resampling,
    # assign a new unique patient ID to each duplicate.
    final_dfs = []
    patient_occurrences = {}

    for pid in balanced_patient_df["patient_id"]:
        # Get all records for this patient.
        patient_records = df[df["patient_id"] == pid].copy()
        # Count how many times this patient has been added.
        if pid in patient_occurrences:
            patient_occurrences[pid] += 1
            # Create a new unique ID by appending a suffix.
            new_pid = pid * 1000 + patient_occurrences[pid]
            patient_records["patient_id"] = new_pid
        else:
            # first occurrence, keep original ID
            patient_occurrences[pid] = 0
        final_dfs.append(patient_records)

    balanced_df = pd.concat(final_dfs, ignore_index=True)

    return balanced_df

## Datasets and Processing

Split dataset into train, val, and test



In [18]:
patient_labels = df.groupby("patient_id")["SepsisLabel"].max()

train_val_patients, test_patients = train_test_split(
    patient_labels.index,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=patient_labels,  # Stratify by patient-level labels
)

train_val_df = df[df["patient_id"].isin(train_val_patients)]
test_df = df[df["patient_id"].isin(test_patients)]

train_patients, val_patients = train_test_split(
        train_val_patients,
        test_size=VAL_SIZE,
        random_state=RANDOM_STATE,
        stratify=patient_labels[train_val_patients],
)

train_df = train_val_df[train_val_df["patient_id"].isin(train_patients)]
val_df = train_val_df[train_val_df["patient_id"].isin(val_patients)]

Scale the dataset

In [19]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

def scale_dfs(train_df, val_df, test_df, label="SepsisLabel", eps=1e-6):
    # Identify feature columns
    feature_cols = [c for c in train_df.columns if c != label]

    # Fit scaler on train features
    scaler = StandardScaler().fit(train_df[feature_cols])

    # Apply scaling
    def apply_scaling(df):
        df_scaled = df.copy()
        df_scaled[feature_cols] = scaler.transform(df[feature_cols])
        return df_scaled

    train_scaled = apply_scaling(train_df)
    val_scaled   = apply_scaling(val_df)
    test_scaled  = apply_scaling(test_df)

    # --- TEST: mean ≈ 0 and std ≈ 1 on train_scaled ---
    means = train_scaled[feature_cols].mean()
    stds  = train_scaled[feature_cols].std(ddof=0)  # population std to match StandardScaler

    # Check each column
    mean_pass = (means.abs() < eps).all()
    std_pass  = ((stds - 1).abs() < eps).all()

    # Print detailed results
    print("Scaling test results on train set:")
    df_test = pd.DataFrame({
        'mean': means.round(6),
        'std':  stds.round(6),
        'mean_ok': (means.abs() < eps),
        'std_ok':  ((stds - 1).abs() < eps)
    })
    print(df_test)
    print(f"\nAll means ≈ 0? {'✅' if mean_pass else '❌'}")
    print(f"All stds ≈ 1?  {'✅' if std_pass else '❌'}\n")

    return train_scaled, val_scaled, test_scaled



In [None]:
train_df, val_df, test_df = scale_dfs(train_df, val_df, test_df)

In [27]:
train_df.head()

Unnamed: 0,patient_id,SepsisLabel,ICULOS_last,SOFA_score_mean_global_1,SOFA_score_mean_global_2,SOFA_score_mean_global_3,SOFA_score_mean_global_4,SOFA_score_mean_global_5,SOFA_score_mean_global_7,SOFA_score_mean_global_8,...,MAP_mean_6h_6,MAP_median_6h_6,MAP_std_6h_6,MAP_diff_std_6h_6,Resp_max_6h_6,Resp_min_6h_6,Resp_mean_6h_6,Resp_median_6h_6,Resp_std_6h_6,Resp_diff_std_6h_6
0,1,0,54,1.87037,1.87037,1.87037,1.87037,1.87037,1.87037,1.87037,...,85.614953,87.099858,5.597793,9.243436,30.0,19.0,23.702044,23.25,3.720878,5.929981
2,3,0,48,2.0625,2.0625,2.0625,2.0625,2.0625,2.0625,2.0625,...,72.284326,69.83,7.299862,11.137502,40.0,19.0,27.088652,25.765957,7.156928,10.856961
3,4,0,29,4.965517,4.965517,4.965517,4.965517,4.965517,4.965517,4.965517,...,74.789673,74.75,5.958097,7.605339,26.0,17.0,20.886905,19.160714,4.047701,4.468455
6,7,0,45,4.377778,4.377778,4.377778,4.377778,4.377778,4.377778,4.377778,...,82.6875,82.0,12.515428,22.518673,33.0,15.0,22.984848,21.454545,6.486455,8.238039
7,8,0,40,3.95,3.95,3.95,3.95,3.95,3.95,3.95,...,67.333333,66.5,4.179314,8.408329,22.0,16.0,18.352564,17.807692,2.384171,3.167256


In [21]:
# check the scaler is working ie the mean and std of the train set are 0 and 1
print(train_df.mean())
print(train_df.std())

patient_id                  20175.458773
SepsisLabel                     0.072714
ICULOS_last                    38.989410
SOFA_score_mean_global_1        3.988410
SOFA_score_mean_global_2        3.988410
                                ...     
Resp_diff_std_6h_332            2.162803
Resp_diff_std_6h_333            3.024195
Resp_diff_std_6h_334            3.364495
Resp_diff_std_6h_335            4.324151
Resp_diff_std_6h_336            3.509959
Length: 15795, dtype: float64
patient_id                  11646.655705
SepsisLabel                     0.259671
ICULOS_last                    22.766570
SOFA_score_mean_global_1        1.108869
SOFA_score_mean_global_2        1.108869
                                ...     
Resp_diff_std_6h_332            1.467350
Resp_diff_std_6h_333            1.809208
Resp_diff_std_6h_334            1.722178
Resp_diff_std_6h_335            2.787051
Resp_diff_std_6h_336            2.023596
Length: 15795, dtype: float64


Save the train (no sample), test and val sets

In [29]:
#XGBoost only
def filter_by_relative_steps(df, allowed_steps=[6, 12, 18, 24, 30, 36]):
    base_cols = []
    dynamic_cols = []

    for col in df.columns:
        for step in allowed_steps:
            if col.endswith(f"_{step}"):
                dynamic_cols.append(col)
                break
        else:
            base_cols.append(col)

    df_dynamic = df[dynamic_cols]
    non_nan_dynamic = df_dynamic.columns[df_dynamic.notna().all()].tolist()

    keep_cols = base_cols + non_nan_dynamic
    return df[keep_cols]


train_df = filter_by_relative_steps(train_df)
val_df = filter_by_relative_steps(val_df)
test_df = filter_by_relative_steps(test_df)

#XGBoost only

output_folder = f"{project_root}/dataset/{OUTPUT_FOLDER_NAME}"
if not os.path.exists(output_folder):
    os.makedirs(output_folder)
train_df.to_parquet(f"{output_folder}/no_sampling_train.parquet")
val_df.to_parquet(f"{output_folder}/val.parquet")
test_df.to_parquet(f"{output_folder}/test.parquet")

train_df.head()

Unnamed: 0,patient_id,SepsisLabel,ICULOS_last,SOFA_score_mean_global_1,SOFA_score_mean_global_2,SOFA_score_mean_global_3,SOFA_score_mean_global_4,SOFA_score_mean_global_5,SOFA_score_mean_global_7,SOFA_score_mean_global_8,...,MAP_mean_6h_6,MAP_median_6h_6,MAP_std_6h_6,MAP_diff_std_6h_6,Resp_max_6h_6,Resp_min_6h_6,Resp_mean_6h_6,Resp_median_6h_6,Resp_std_6h_6,Resp_diff_std_6h_6
0,1,0,54,1.87037,1.87037,1.87037,1.87037,1.87037,1.87037,1.87037,...,85.614953,87.099858,5.597793,9.243436,30.0,19.0,23.702044,23.25,3.720878,5.929981
2,3,0,48,2.0625,2.0625,2.0625,2.0625,2.0625,2.0625,2.0625,...,72.284326,69.83,7.299862,11.137502,40.0,19.0,27.088652,25.765957,7.156928,10.856961
3,4,0,29,4.965517,4.965517,4.965517,4.965517,4.965517,4.965517,4.965517,...,74.789673,74.75,5.958097,7.605339,26.0,17.0,20.886905,19.160714,4.047701,4.468455
6,7,0,45,4.377778,4.377778,4.377778,4.377778,4.377778,4.377778,4.377778,...,82.6875,82.0,12.515428,22.518673,33.0,15.0,22.984848,21.454545,6.486455,8.238039
7,8,0,40,3.95,3.95,3.95,3.95,3.95,3.95,3.95,...,67.333333,66.5,4.179314,8.408329,22.0,16.0,18.352564,17.807692,2.384171,3.167256


Create the sampled datasets: one for undersampling and one for oversampling

In [30]:
upsampled_train_df = over_under_sample(
    df=train_df.copy(), method="oversample", minority_ratio=SAMPLING_MINORITY_RATIO
)
undersampled_train_df = over_under_sample(
    df=train_df.copy(), method="undersample", minority_ratio=SAMPLING_MINORITY_RATIO
)

#Save the sampled datasets
upsampled_train_df.to_parquet(f"{output_folder}/oversampled_train.parquet")
undersampled_train_df.to_parquet(f"{output_folder}/undersampled_train.parquet")


## Analysis

Checking the balance ratio for the datasets

In [31]:
def get_dataset_stats(df, name):
    total_patients = df["patient_id"].nunique()
    patient_labels = df.groupby("patient_id")["SepsisLabel"].max()

    non_sepsis = (patient_labels == 0).sum()
    sepsis = (patient_labels == 1).sum()

    non_sepsis_pct = non_sepsis / total_patients * 100
    sepsis_pct = sepsis / total_patients * 100

    imbalance_ratio = non_sepsis / sepsis if sepsis > 0 else float("inf")

    return {
        "Dataset": name,
        "Total Patients": total_patients,
        "Non-Sepsis Patients": non_sepsis,
        "Non-Sepsis %": f"{non_sepsis_pct:.2f}%",
        "Sepsis Patients": sepsis,
        "Sepsis %": f"{sepsis_pct:.2f}%",
        "Imbalance Ratio": f"{imbalance_ratio:.2f}",
    }

datasets = [
    (train_df, "No Sampling (Train)"),
    (val_df, "Validation"),
    (test_df, "Test"),
    (upsampled_train_df, "Oversampled (Train)"),
    (undersampled_train_df, "Undersampled (Train)"),
]

results = []
for df, name in datasets:
    results.append(get_dataset_stats(df, name))

# Create and display the results table
results_df = pd.DataFrame(results)
display(results_df)

Unnamed: 0,Dataset,Total Patients,Non-Sepsis Patients,Non-Sepsis %,Sepsis Patients,Sepsis %,Imbalance Ratio
0,No Sampling (Train),28234,26181,92.73%,2053,7.27%,12.75
1,Validation,4034,3741,92.74%,293,7.26%,12.77
2,Test,8068,7482,92.74%,586,7.26%,12.77
3,Oversampled (Train),37395,26175,70.00%,11220,30.00%,2.33
4,Undersampled (Train),6843,4790,70.00%,2053,30.00%,2.33


Features of the datasets

In [32]:
from pprint import pprint
pprint(train_df.columns.tolist())

['patient_id',
 'SepsisLabel',
 'ICULOS_last',
 'SOFA_score_mean_global_1',
 'SOFA_score_mean_global_2',
 'SOFA_score_mean_global_3',
 'SOFA_score_mean_global_4',
 'SOFA_score_mean_global_5',
 'SOFA_score_mean_global_7',
 'SOFA_score_mean_global_8',
 'SOFA_score_median_global_1',
 'SOFA_score_median_global_2',
 'SOFA_score_median_global_3',
 'SOFA_score_median_global_4',
 'SOFA_score_median_global_5',
 'SOFA_score_median_global_7',
 'SOFA_score_median_global_8',
 'SOFA_score_max_global_1',
 'SOFA_score_max_global_2',
 'SOFA_score_max_global_3',
 'SOFA_score_max_global_4',
 'SOFA_score_max_global_5',
 'SOFA_score_max_global_7',
 'SOFA_score_max_global_8',
 'SOFA_score_last_global_1',
 'SOFA_score_last_global_2',
 'SOFA_score_last_global_3',
 'SOFA_score_last_global_4',
 'SOFA_score_last_global_5',
 'SOFA_score_last_global_7',
 'SOFA_score_last_global_8',
 'NEWS_score_mean_global_1',
 'NEWS_score_mean_global_2',
 'NEWS_score_mean_global_3',
 'NEWS_score_mean_global_4',
 'NEWS_score_mean_

Ensure there is no data leakage

In [33]:
# load the patient ids from the file
patient_ids = json.load(open(f"{output_folder}/patient_ids.json"))

# check that the patient ids in the datasets are the same as the patient ids in the files
assert train_df["patient_id"].isin(patient_ids["train"]).all()
assert val_df["patient_id"].isin(patient_ids["val"]).all()
assert test_df["patient_id"].isin(patient_ids["test"]).all()


In [34]:
assert not train_df["patient_id"].isin(val_df["patient_id"]).any()
assert not train_df["patient_id"].isin(test_df["patient_id"]).any()
assert not val_df["patient_id"].isin(test_df["patient_id"]).any()

In [35]:
import json
# save the patient ids from the datasets to 1 json file with different keys
patient_ids = {
    "train": train_df["patient_id"].unique().tolist(),
    "val": val_df["patient_id"].unique().tolist(),
    "test": test_df["patient_id"].unique().tolist()
}
with open(f"{output_folder}/patient_ids.json", "w") as f:
    json.dump(patient_ids, f)