This notebook creates the final datasets from the preprocessed dataset. It splits the data into train, validation and test sets at the patient level, then writes three versions of the train set: no sampling, oversampled and undersampled.

Config

In [1]:
# --- Paths (both resolved under <project_root>/dataset/) ---
INPUT_FILE_NAME = "preprocessed_data"  # parquet file name, without extension
OUTPUT_FOLDER_NAME = "final_datasets"  # folder that receives the generated splits

# --- Reproducibility ---
RANDOM_STATE = 42  # seed shared by the splits, the resampling and the global RNGs

# --- Split sizes ---
TEST_SIZE = 0.2  # share of all patients held out as the test set
# Share of the remaining train+val patients used for validation:
# 0.125 of the remaining 80% is 10% of all patients.
VAL_SIZE = 0.125

# --- Resampling ---
SAMPLING_MINORITY_RATIO = 0.3  # target minority share of patients after resampling

## Setup

Imports

In [2]:
import json
import os
import random
import sys
from pathlib import Path
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

Load the dataset

In [3]:
def find_project_root(marker=".gitignore"):
    """
    Locate the project root by searching upwards from the working directory.

    The current working directory is checked first, then each of its ancestors
    in turn. The first directory that contains an entry named `marker` is
    returned as a resolved path.

    Raises:
        FileNotFoundError: if no directory on the way up contains `marker`.
    """
    current = Path.cwd()
    candidates = (current, *current.parents)
    root = next(
        (folder for folder in candidates if (folder / marker).exists()),
        None,
    )
    if root is None:
        raise FileNotFoundError(
            f"Project root marker '{marker}' not found starting from {current}"
        )
    return root.resolve()

In [4]:
# Resolve the project root and make it importable.
project_root = find_project_root()
print("Project root:", project_root)
# sys.path holds plain strings, and a Path never compares equal to a str, so both
# the membership check and the append must use the string form. Otherwise the
# check never matches and a Path object is appended again on every run.
project_root_str = str(project_root)
if project_root_str not in sys.path:
    sys.path.append(project_root_str)

# Load the preprocessed dataset; stop with a readable message if it cannot be read.
df_path = f"{project_root}/dataset/{INPUT_FILE_NAME}.parquet"
try:
    df = pd.read_parquet(df_path)
except Exception as e:
    sys.exit(f"Error loading dataset from {df_path}: {e}")

Project root: /Users/damianstone/Documents/Code/machine-learning/dl-sepsis-prediction


In [5]:
# Seed both global RNGs (Python's `random` and NumPy's legacy generator) with the
# same value so that any code relying on them is repeatable.
for seed_rng in (random.seed, np.random.seed):
    seed_rng(RANDOM_STATE)

## Helper Functions

In [6]:
def over_under_sample(df, method="oversample", minority_ratio=0.3):
    """
    Rebalance a dataset at the patient level.

    Each patient's label is the maximum `SepsisLabel` over their rows, so a
    patient with any septic row counts as septic. Patients are then resampled
    (with the module-level RANDOM_STATE as seed) and every row of each selected
    patient is copied into the result.

    Args:
        df: DataFrame with at least the columns `patient_id` and `SepsisLabel`.
        method: "oversample" draws minority patients with replacement until the
            target ratio is reached; because the draw is with replacement, not
            every minority patient is guaranteed to appear. "undersample" keeps a
            random subset of the majority patients (without replacement); if the
            majority is already small enough, all majority patients are kept.
        minority_ratio: target share of minority patients, strictly between 0
            and 1.

    Returns:
        A new DataFrame with a fresh 0..n-1 index. The first copy of a patient
        keeps its original `patient_id`; every further copy gets a new ID
        allocated sequentially above the largest `patient_id` in `df`. The new
        IDs are therefore unique within the returned frame, but they carry no
        link to the source patient and are not checked against any other
        dataset.

    Raises:
        ValueError: if `method` is unknown, `minority_ratio` is outside (0, 1),
            or `df` does not contain patients from at least two classes.
    """
    if method not in ("oversample", "undersample"):
        raise ValueError("Method must be 'oversample' or 'undersample'")
    if not 0 < minority_ratio < 1:
        raise ValueError("minority_ratio must be strictly between 0 and 1")

    # Create a patient-level summary with one record per patient.
    patient_df = df.groupby("patient_id")["SepsisLabel"].max().reset_index()

    # value_counts() sorts by frequency (descending), so the first and last
    # entries are the majority and minority classes. Unlike idxmax()/idxmin(),
    # they are always two different classes, even when the counts are tied.
    counts = patient_df["SepsisLabel"].value_counts()
    if len(counts) < 2:
        raise ValueError("Need patients from at least two classes to rebalance")
    majority_class = counts.index[0]
    minority_class = counts.index[-1]

    # Split the patients into majority and minority groups.
    majority_patients = patient_df[patient_df["SepsisLabel"] == majority_class]
    minority_patients = patient_df[patient_df["SepsisLabel"] == minority_class]

    if method == "oversample":
        # Duplicate minority patients to reach the desired ratio.
        n_desired_minority = int(
            (minority_ratio * len(majority_patients)) / (1 - minority_ratio)
        )
        minority_upsampled = resample(
            minority_patients,
            replace=True,
            n_samples=n_desired_minority,
            random_state=RANDOM_STATE,
        )
        balanced_patient_df = pd.concat([majority_patients, minority_upsampled])
    else:
        # Remove some majority patients to reach the desired ratio.
        n_desired_majority = int(
            ((1 - minority_ratio) / minority_ratio) * len(minority_patients)
        )
        # Sampling without replacement cannot return more patients than exist;
        # in that case the data already meets the ratio, so keep them all.
        n_desired_majority = min(n_desired_majority, len(majority_patients))
        majority_downsampled = resample(
            majority_patients,
            replace=False,
            n_samples=n_desired_majority,
            random_state=RANDOM_STATE,
        )
        balanced_patient_df = pd.concat([majority_downsampled, minority_patients])

    # Group the rows once so each patient lookup is a dict access instead of a
    # scan over the whole frame.
    records_by_patient = {
        pid: records for pid, records in df.groupby("patient_id", sort=False)
    }

    final_dfs = []
    seen_patients = set()
    next_free_id = None  # computed lazily: only needed when a patient repeats

    for pid in balanced_patient_df["patient_id"]:
        patient_records = records_by_patient[pid]
        if pid in seen_patients:
            # Repeated patient: give this copy an ID that no patient in `df` and
            # no earlier copy uses.
            if next_free_id is None:
                next_free_id = df["patient_id"].max() + 1
            patient_records = patient_records.assign(patient_id=next_free_id)
            next_free_id += 1
        else:
            # First occurrence, keep the original ID.
            seen_patients.add(pid)
        final_dfs.append(patient_records)

    balanced_df = pd.concat(final_dfs, ignore_index=True)

    return balanced_df

## Datasets and Processing

Split dataset into train, val, and test



In [7]:
# One label per patient: the maximum SepsisLabel across that patient's rows.
patient_labels = df.groupby("patient_id")["SepsisLabel"].max()

# First cut: hold out the test patients, stratified on the patient-level label.
train_val_patients, test_patients = train_test_split(
    patient_labels.index,
    stratify=patient_labels,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

# Second cut: carve the validation patients out of the remaining pool,
# again stratified on the patient-level label.
train_patients, val_patients = train_test_split(
    train_val_patients,
    stratify=patient_labels[train_val_patients],
    test_size=VAL_SIZE,
    random_state=RANDOM_STATE,
)

# Materialise the row-level frames from the patient ID sets.
in_train_val = df["patient_id"].isin(train_val_patients)
in_test = df["patient_id"].isin(test_patients)
train_val_df = df[in_train_val]
test_df = df[in_test]

in_train = train_val_df["patient_id"].isin(train_patients)
in_val = train_val_df["patient_id"].isin(val_patients)
train_df = train_val_df[in_train]
val_df = train_val_df[in_val]

Save the train (no sampling), validation and test sets

In [8]:
output_folder = f"{project_root}/dataset/{OUTPUT_FOLDER_NAME}"
# exist_ok=True creates the folder (and any missing parents) only when needed,
# without the race between a separate existence check and the creation.
os.makedirs(output_folder, exist_ok=True)

# Persist the three unsampled splits.
val_df.to_parquet(f"{output_folder}/val.parquet")
test_df.to_parquet(f"{output_folder}/test.parquet")
train_df.to_parquet(f"{output_folder}/no_sampling_train.parquet")

Create the sampled datasets: one for undersampling and one for oversampling

In [9]:
# Build both resampled variants of the train set with the same target ratio.
# Only the train set is resampled; val and test keep their original balance.
sampled_train = {
    sampling_method: over_under_sample(
        df=train_df.copy(),
        method=sampling_method,
        minority_ratio=SAMPLING_MINORITY_RATIO,
    )
    for sampling_method in ("oversample", "undersample")
}
upsampled_train_df = sampled_train["oversample"]
undersampled_train_df = sampled_train["undersample"]

# Save the sampled datasets
upsampled_train_df.to_parquet(f"{output_folder}/oversampled_train.parquet")
undersampled_train_df.to_parquet(f"{output_folder}/undersampled_train.parquet")


## Analysis

Checking the balance ratio for the datasets

In [10]:
def get_dataset_stats(df, name):
    """
    Summarise the patient-level class balance of one dataset.

    A patient's label is the maximum `SepsisLabel` over their rows. The result
    is a dict (one table row) holding the dataset name, the number of distinct
    patients, the count and percentage of patients labelled 0 and 1, and the
    non-sepsis to sepsis ratio (infinite when there are no sepsis patients).
    Percentages and the ratio are returned as strings with two decimals.
    """
    per_patient_label = df.groupby("patient_id")["SepsisLabel"].max()
    n_patients = df["patient_id"].nunique()

    n_negative = per_patient_label.eq(0).sum()
    n_positive = per_patient_label.eq(1).sum()

    non_sepsis_pct = n_negative / n_patients * 100
    sepsis_pct = n_positive / n_patients * 100

    # Guard the ratio against a dataset without any sepsis patient.
    if n_positive > 0:
        imbalance_ratio = n_negative / n_positive
    else:
        imbalance_ratio = float("inf")

    summary = {"Dataset": name, "Total Patients": n_patients}
    summary["Non-Sepsis Patients"] = n_negative
    summary["Non-Sepsis %"] = f"{non_sepsis_pct:.2f}%"
    summary["Sepsis Patients"] = n_positive
    summary["Sepsis %"] = f"{sepsis_pct:.2f}%"
    summary["Imbalance Ratio"] = f"{imbalance_ratio:.2f}"
    return summary

datasets = [
    (train_df, "No Sampling (Train)"),
    (val_df, "Validation"),
    (test_df, "Test"),
    (upsampled_train_df, "Oversampled (Train)"),
    (undersampled_train_df, "Undersampled (Train)"),
]

# The loop variable must not be called `df`: that would overwrite the full
# dataset loaded at the top of the notebook, and re-running earlier cells would
# then operate on the last frame in this list.
results = [get_dataset_stats(split_df, split_name) for split_df, split_name in datasets]

# Create and display the results table
results_df = pd.DataFrame(results)
display(results_df)

Unnamed: 0,Dataset,Total Patients,Non-Sepsis Patients,Non-Sepsis %,Sepsis Patients,Sepsis %,Imbalance Ratio
0,No Sampling (Train),28234,26181,92.73%,2053,7.27%,12.75
1,Validation,4034,3741,92.74%,293,7.26%,12.77
2,Test,8068,7482,92.74%,586,7.26%,12.77
3,Oversampled (Train),37401,26181,70.00%,11220,30.00%,2.33
4,Undersampled (Train),6843,4790,70.00%,2053,30.00%,2.33


Features of the datasets

In [11]:
# List every column of the train set, one per line (pprint is imported with the
# other imports at the top of the notebook).
pprint(train_df.columns.tolist())

['HR',
 'O2Sat',
 'Temp',
 'SBP',
 'MAP',
 'DBP',
 'Resp',
 'EtCO2',
 'BaseExcess',
 'HCO3',
 'FiO2',
 'pH',
 'PaCO2',
 'SaO2',
 'AST',
 'BUN',
 'Alkalinephos',
 'Calcium',
 'Chloride',
 'Creatinine',
 'Bilirubin_direct',
 'Glucose',
 'Lactate',
 'Magnesium',
 'Phosphate',
 'Potassium',
 'Bilirubin_total',
 'TroponinI',
 'Hct',
 'Hgb',
 'PTT',
 'WBC',
 'Fibrinogen',
 'Platelets',
 'Age',
 'Gender',
 'ICULOS',
 'SepsisLabel',
 'patient_id',
 'SOFA',
 'NEWS',
 'qSOFA',
 'MAP_MA_3h',
 'MAP_SD_3h',
 'MAP_Delta',
 'MAP_MA_6h',
 'MAP_SD_6h',
 'MAP_MA_12h',
 'MAP_SD_12h',
 'Creatinine_MA_3h',
 'Creatinine_SD_3h',
 'Creatinine_Delta',
 'Creatinine_MA_6h',
 'Creatinine_SD_6h',
 'Creatinine_MA_12h',
 'Creatinine_SD_12h',
 'Platelets_MA_3h',
 'Platelets_SD_3h',
 'Platelets_Delta',
 'Platelets_MA_6h',
 'Platelets_SD_6h',
 'Platelets_MA_12h',
 'Platelets_SD_12h']


Ensure there is no data leakage

In [12]:
# load the patient ids from the file
patient_ids = json.load(open(f"{output_folder}/patient_ids.json"))

# check that the patient ids in the datasets are the same as the patient ids in the files
assert train_df["patient_id"].isin(patient_ids["train"]).all()
assert val_df["patient_id"].isin(patient_ids["val"]).all()
assert test_df["patient_id"].isin(patient_ids["test"]).all()


In [13]:
# No patient may appear in more than one split: check every pair of splits.
split_pairs = (
    (train_df, val_df),
    (train_df, test_df),
    (val_df, test_df),
)
for left_split, right_split in split_pairs:
    assert not left_split["patient_id"].isin(right_split["patient_id"]).any()

In [14]:
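# One-off helper, intentionally left commented out: uncomment and run it once to
# (re)generate patient_ids.json, the reference file that the leakage check above
# loads. Re-running it overwrites the reference with the current split.
# import json
# # save the patient IDs of each split to a single JSON file, one key per split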
# patient_ids = {
#     "train": train_df["patient_id"].unique().tolist(),
#     "val": val_df["patient_id"].unique().tolist(),
#     "test": test_df["patient_id"].unique().tolist()
# }
# with open(f"{output_folder}/patient_ids.json", "w") as f:
#     json.dump(patient_ids, f)