# Q1.1 - Data Transform

In [1]:
# Import necessary libraries
import os
import pandas as pd
from datetime import datetime, timedelta
from tqdm import tqdm
from pathlib import Path

from project_1.config import PROJ_ROOT, DATA_DIRECTORY, PROCESSED_DATA_DIR, LOGS_DIR

tqdm.pandas()

[32m2025-03-28 20:40:44.863[0m | [1mINFO    [0m | [36mproject_1.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /Users/francescobondi/Desktop/stuff/ETH/FS25/ML for Healthcare/project-1-ml4hc[0m


# First, visualize data

### Here we extract the three outcomes (A, B, C)

In [2]:
# Load the data (for the first time)
sets = ["a", "b", "c"]
sets_dict = {}
for set_name in sets:
    set_folder = DATA_DIRECTORY / f"set-{set_name}"
    print(f"Reading data from {set_folder}")

    patient_data_list = []

    files = [f for f in os.listdir(set_folder) if f.endswith(".txt")]
    for filename in tqdm(
        files, desc=f"Processing files in set-{set_name}", unit="file"
    ):
        file_path = os.path.join(set_folder, filename)

        # Read patient file
        patient_df = pd.read_csv(file_path)

        # Extract RecordID from the 'Parameter' column where value is 'RecordID'
        record_id = patient_df.loc[
            patient_df["Parameter"] == "RecordID", "Value"
        ].values[0]

        # Pivot to transform measurements, using 'first' to resolve duplicates
        patient_df = patient_df.pivot_table(
            index="Time", columns="Parameter", values="Value", aggfunc="first"
        )
        patient_df.reset_index(inplace=True)

        # Remove any existing RecordID column and insert our extracted one as the first column
        if "RecordID" in patient_df.columns:
            patient_df.drop(columns=["RecordID"], inplace=True)
        patient_df.insert(0, "RecordID", record_id)

        # Append the processed DataFrame to the list
        patient_data_list.append(patient_df)

    # Combine all patient data into a single DataFrame
    patients_df = pd.concat(patient_data_list, ignore_index=True)

    # Store in dictionary
    sets_dict["set_" + set_name] = patients_df

# Output the first 5 rows of the Set A DataFrame
print(sets_dict["set_a"].head())
print(sets_dict["set_a"].shape)

Reading data from /Users/francescobondi/Desktop/stuff/ETH/FS25/ML for Healthcare/project-1-ml4hc/data/data_1/predicting-mortality-of-icu-patients-the-physionet-computing-in-cardiology-challenge-2012-1.0.0/set-a


Processing files in set-a: 100%|██████████| 4000/4000 [00:05<00:00, 676.12file/s]


Reading data from /Users/francescobondi/Desktop/stuff/ETH/FS25/ML for Healthcare/project-1-ml4hc/data/data_1/predicting-mortality-of-icu-patients-the-physionet-computing-in-cardiology-challenge-2012-1.0.0/set-b


Processing files in set-b: 100%|██████████| 4000/4000 [00:05<00:00, 704.80file/s]


Reading data from /Users/francescobondi/Desktop/stuff/ETH/FS25/ML for Healthcare/project-1-ml4hc/data/data_1/predicting-mortality-of-icu-patients-the-physionet-computing-in-cardiology-challenge-2012-1.0.0/set-c


Processing files in set-c: 100%|██████████| 4000/4000 [00:07<00:00, 568.09file/s]


Parameter  RecordID   Time   Age   BUN  Creatinine   GCS  Gender  Glucose  \
0          132592.0  00:00  35.0   NaN         NaN   NaN     0.0      NaN   
1          132592.0  01:20   NaN   NaN         NaN  15.0     NaN      NaN   
2          132592.0  02:20   NaN   NaN         NaN   NaN     NaN      NaN   
3          132592.0  02:36   NaN  68.0         2.3   NaN     NaN    603.0   
4          132592.0  03:20   NaN   NaN         NaN   NaN     NaN      NaN   

Parameter  HCO3   HCT  ...  PaCO2  PaO2  pH  DiasABP  MAP  SaO2  SysABP  \
0           NaN   NaN  ...    NaN   NaN NaN      NaN  NaN   NaN     NaN   
1           NaN   NaN  ...    NaN   NaN NaN      NaN  NaN   NaN     NaN   
2           NaN   NaN  ...    NaN   NaN NaN      NaN  NaN   NaN     NaN   
3          11.0  25.5  ...    NaN   NaN NaN      NaN  NaN   NaN     NaN   
4           NaN   NaN  ...    NaN   NaN NaN      NaN  NaN   NaN     NaN   

Parameter  Lactate  Cholesterol  TroponinI  
0              NaN          NaN        Na

In [3]:
# Create the directory if it doesn't exist
set_a_path = Path(PROCESSED_DATA_DIR / "set_a")
set_b_path = Path(PROCESSED_DATA_DIR / "set_b")
set_c_path = Path(PROCESSED_DATA_DIR / "set_c")

set_a_path.mkdir(parents=True, exist_ok=True)
set_b_path.mkdir(parents=True, exist_ok=True)
set_c_path.mkdir(parents=True, exist_ok=True)

# Discretize Time

In [4]:
from datetime import datetime, timedelta

base_date = "2025-03-10"  # Format is YYYY-MM-DD (date is chosen randomly)


# Function to fix invalid times
def adjust_time(time_str, base_date):
    # Split hours and minutes
    hours, minutes = map(int, time_str.split(":"))

    # Calculate valid hour & days overflow
    day_offset = hours // 24  # Number of days to add
    new_hour = hours % 24  # Wrapped hour (0-23)

    # Create the corrected datetime
    corrected_datetime = datetime.strptime(base_date, "%Y-%m-%d") + timedelta(
        days=day_offset, hours=new_hour, minutes=minutes
    )

    return corrected_datetime


def round_up_next_hour(ts):
    # If timestamp is exactly on the hour, return it unchanged.
    if ts.minute == 0 and ts.second == 0 and ts.microsecond == 0:
        return ts
    # Otherwise, round up to the next hour.
    return ts.ceil("H")


# Apply the functions to the 'Time' column (for each sets)
for set_name, set_df in tqdm(sets_dict.items(), desc="Discretizing time", unit="set"):
    # Convert 'Time' column from string to datetime using the adjust_time function.
    set_df["Time"] = set_df["Time"].progress_apply(lambda x: adjust_time(x, base_date))
    # Round up each timestamp to the next hour, except if it is exactly on the hour.
    set_df["Time"] = set_df["Time"].progress_apply(round_up_next_hour)
    # Group by RecordID and discretized Time, taking the mean in case of multiple measurements.
    sets_dict[set_name] = set_df.groupby(["RecordID", "Time"], as_index=False).mean()

# Output the first 5 rows of the Set A DataFrame
print(sets_dict["set_a"].head())



100%|██████████| 299264/299264 [00:01<00:00, 261194.19it/s]
  return ts.ceil("H")
100%|██████████| 299264/299264 [00:06<00:00, 45192.46it/s]
100%|██████████| 299068/299068 [00:01<00:00, 270458.60it/s]s/set]
  return ts.ceil("H")
100%|██████████| 299068/299068 [00:06<00:00, 48121.36it/s]
100%|██████████| 300020/300020 [00:01<00:00, 270617.43it/s]s/set]
  return ts.ceil("H")
100%|██████████| 300020/300020 [00:06<00:00, 48990.77it/s]
Discretizing time: 100%|██████████| 3/3 [00:22<00:00,  7.55s/set]

Parameter  RecordID                Time   Age  BUN  Creatinine   GCS  Gender  \
0          132539.0 2025-03-10 00:00:00  54.0  NaN         NaN   NaN     0.0   
1          132539.0 2025-03-10 01:00:00   NaN  NaN         NaN  15.0     NaN   
2          132539.0 2025-03-10 02:00:00   NaN  NaN         NaN   NaN     NaN   
3          132539.0 2025-03-10 03:00:00   NaN  NaN         NaN   NaN     NaN   
4          132539.0 2025-03-10 04:00:00   NaN  NaN         NaN  15.0     NaN   

Parameter  Glucose  HCO3   HCT  ...  PaCO2  PaO2  pH  DiasABP  MAP  SaO2  \
0              NaN   NaN   NaN  ...    NaN   NaN NaN      NaN  NaN   NaN   
1              NaN   NaN   NaN  ...    NaN   NaN NaN      NaN  NaN   NaN   
2              NaN   NaN   NaN  ...    NaN   NaN NaN      NaN  NaN   NaN   
3              NaN   NaN   NaN  ...    NaN   NaN NaN      NaN  NaN   NaN   
4              NaN   NaN  33.7  ...    NaN   NaN NaN      NaN  NaN   NaN   

Parameter  SysABP  Lactate  Cholesterol  TroponinI  
0        




# Save the results into Parquet Format (efficient for large datasets)

In [5]:
for set_name, set_df in tqdm(sets_dict.items(), desc="Storing DataFrames", unit="set"):
    output_path = PROCESSED_DATA_DIR / f"{set_name}" / f"{set_name}.parquet"
    set_df.to_parquet(output_path, index=False, engine="pyarrow")
    print(f"Saved {output_path}")

print("\nAll DataFrames have been saved to Parquet format.")

Storing DataFrames: 100%|██████████| 3/3 [00:00<00:00, 16.02set/s]

Saved /Users/francescobondi/Desktop/stuff/ETH/FS25/ML for Healthcare/project-1-ml4hc/data/processed/set_a/set_a.parquet
Saved /Users/francescobondi/Desktop/stuff/ETH/FS25/ML for Healthcare/project-1-ml4hc/data/processed/set_b/set_b.parquet
Saved /Users/francescobondi/Desktop/stuff/ETH/FS25/ML for Healthcare/project-1-ml4hc/data/processed/set_c/set_c.parquet

All DataFrames have been saved to Parquet format.





***

# Q1.3 - Data Preprocessing

In [6]:
import numpy as np
SEED = 42
np.random.seed(SEED)

Check that -1 values in the static variables only appear in the first row for each patient

In [7]:

def check_neg_vals(df):
    # Group by RecordID and check for -1
    def check_first_row(group):
        return pd.Series(
            {
                "Age_First_Only": (group["Age"].iloc[1:] != -1).all(),
                "Gender_First_Only": (group["Gender"].iloc[1:] != -1).all(),
                "Height_First_Only": (group["Height"].iloc[1:] != -1).all(),
                "Weight_First_Only": (group["Weight"].iloc[1:] != -1).all(),
            }
        )

    # Apply the function to each group
    result = df.groupby("RecordID").apply(check_first_row).reset_index()

    # Replace -1 with NA in the specified columns
    df.replace(
        {
            "Height": {-1: None},
            "Age": {-1: None},
            "Weight": {-1: None},
            "ICUType": {-1: None},
            "Gender": {-1: None},
        },
        inplace=True,
    )

    # Check if any RecordID has -1 in non-first rows
    violations = result[
        (~result["Age_First_Only"])
        | (~result["Gender_First_Only"])
        | (~result["Height_First_Only"])
        | (~result["Weight_First_Only"])
    ]

    # Print violations
    # print(violations)
    if violations.empty:
        print("No violations found.")


for set_key, df in sets_dict.items():
    check_neg_vals(df)

  result = df.groupby("RecordID").apply(check_first_row).reset_index()


No violations found.


  result = df.groupby("RecordID").apply(check_first_row).reset_index()


No violations found.
No violations found.


  result = df.groupby("RecordID").apply(check_first_row).reset_index()


## Outlier Detection

In [8]:
def clean_df(df):
    """
    Clean the input DataFrame according to the following rules:
      - Missing value handling: For Age, Gender, Height, ICUType, Weight, set -1 to NA.
      - Height outlier removal: Set Height to NA if < 100 cm or >= 300 cm.
      - Weight outlier removal: Set Weight to NA if < 20 kg or >= 300 kg.
      - PaO2 corrections: Set PaO2 equal to 0 to NA; if PaO2 equals 7.47, correct it to 74.7 because it's the only value out of range
      - pH unit correction: If pH is between 65 and 80, divide by 10; if between 650 and 800, divide by 100.
      - Temperature corrections: Set Temp to NA if Temp is less than 20.
    """
    # 1. Missing value handling: Replace -1 with np.nan for selected columns.
    missing_cols = ["Age", "Gender", "Height", "ICUType", "Weight"]
    for col in missing_cols:
        df.loc[df[col] == -1, col] = np.nan

    # 2. Height outlier removal: Set Height to NA if < 100 or >= 300.
    df.loc[(df["Height"] < 100) | (df["Height"] >= 300), "Height"] = np.nan

    # 3. Weight outlier removal: Set Weight to NA if < 20 or >= 300.
    df.loc[(df["Weight"] < 20) | (df["Weight"] >= 300), "Weight"] = np.nan

    # 4. PaO2 corrections:
    #    Set PaO2 equal to 0 to NA, and if PaO2 is 7.47, correct it to 74.7.
    df.loc[df["PaO2"] == 0, "PaO2"] = np.nan
    df.loc[df["PaO2"] == 7.47, "PaO2"] = 74.7

    # 5. pH unit correction:
    #    If pH is between 65 and 80, divide by 10; if between 650 and 800, divide by 100.
    def correct_ph(ph):
        if pd.isna(ph):
            return ph
        if 65 <= ph <= 80:
            return ph / 10.0
        elif 650 <= ph <= 800:
            return ph / 100.0
        else:
            return ph

    df["pH"] = df["pH"].apply(correct_ph)

    # 6. Temperature corrections: Set Temp to NA if Temp is < 20.
    df.loc[df["Temp"] < 20, "Temp"] = np.nan

    return df


for set_key, df in sets_dict.items():
    # Clean the DataFrame
    cleaned_df = clean_df(df)
    sets_dict[set_key] = cleaned_df  # Update dictionary (optional)

    # Export to Parquet file (e.g., "set_a_cleaned.parquet")
    output_filename = (
        PROCESSED_DATA_DIR / f"{set_key}" / f"{set_key}_before_imputation.parquet"
    )
    cleaned_df.to_parquet(output_filename, index=False)
    print(f"Cleaned data for {set_key} saved as {output_filename}")

Cleaned data for set_a saved as /Users/francescobondi/Desktop/stuff/ETH/FS25/ML for Healthcare/project-1-ml4hc/data/processed/set_a/set_a_before_imputation.parquet
Cleaned data for set_b saved as /Users/francescobondi/Desktop/stuff/ETH/FS25/ML for Healthcare/project-1-ml4hc/data/processed/set_b/set_b_before_imputation.parquet
Cleaned data for set_c saved as /Users/francescobondi/Desktop/stuff/ETH/FS25/ML for Healthcare/project-1-ml4hc/data/processed/set_c/set_c_before_imputation.parquet


## Imputation for Missing Static Features

In [9]:
from sklearn.impute import KNNImputer


def knn_impute_static_features(
    df, static_features=["Age", "Weight", "Height", "Gender"], n_neighbors=10
):
    """
    Impute missing static values (currently indicated by -1) using KNN imputation with n_neighbors.

    Parameters:
      df (pd.DataFrame): DataFrame with one row per patient.
      static_features (list): List of static feature column names to impute.
      n_neighbors (int): Number of neighbors to use for KNN imputation.

    Returns:
      pd.DataFrame: The DataFrame with missing static feature values imputed.
    """
    # Work on a copy to avoid modifying the original DataFrame.
    df_impute = df.copy()

    # Replace missing values (-1) with np.nan in the static columns.
    df_impute[static_features] = df_impute[static_features].replace(-1, np.nan)

    # Initialize the KNN imputer.
    imputer = KNNImputer(n_neighbors=n_neighbors)

    # Fit and transform the static features.
    imputed_array = imputer.fit_transform(df_impute[static_features])

    # Create a new DataFrame with the imputed static features.
    df_imputed_static = pd.DataFrame(
        imputed_array, columns=static_features, index=df_impute.index
    )

    # Update the original DataFrame with the imputed values.
    df_impute.update(df_imputed_static)

    return df_impute


for set_key, df in sets_dict.items():
    # Impute missing static features
    # Get the static features in df
    static_df = (
        df.groupby("RecordID", as_index=False)
        .first()[["RecordID", "Age", "Weight", "Height", "Gender"]]
        .copy()
    )
    imputed_df = knn_impute_static_features(static_df)

    if imputed_df.index.name == "RecordID":
        imputed_df = imputed_df.reset_index()

    # Check if there are any NaN values
    if imputed_df.isnull().values.any():
        print(f"NaN values found in imputed static features for {set_key}.")

    # Update the original DataFrame with the imputed values
    static_cols = ["Age", "Weight", "Height", "Gender"]
    # Create a mapping for each column and update df_full.
    for col in static_cols:
        mapping = imputed_df.set_index("RecordID")[col]
        df[col] = df["RecordID"].map(mapping)

    # sets_dict[set_key] = imputed_df  # Update dictionary (optional)

    # Reorder columns

    # Assume that the first two columns should remain in place.
    # For example, we assume these are the first two columns of the DataFrame.
    first_two = list(df.columns[:2])

    # The rest of the columns, excluding the static columns.
    remaining = [
        col for col in df.columns if col not in static_cols and col not in first_two
    ]

    # Create the new order: first two columns, then the static columns, then the remaining columns.
    new_order = first_two + static_cols + remaining

    # Reorder the DataFrame and return
    df = df[new_order]

    # Check if new dataframe has NaN values on the static columns
    if df[static_cols].isnull().values.any():
        print(f"NaN values found in imputed static features for {set_key}.")
    print(f"{df.head(10)}")

    # Export to Parquet file (e.g., "set_a....parquet")
    output_filename = (
        PROCESSED_DATA_DIR / f"{set_key}" / f"{set_key}_before_ffill.parquet"
    )
    df.to_parquet(output_filename, index=False)
    print(f"Cleaned data for {set_key} saved as {output_filename}")
    

Parameter  RecordID                Time   Age Weight  Height Gender  BUN  \
0          132539.0 2025-03-10 00:00:00  54.0  76.06  160.79    0.0  NaN   
1          132539.0 2025-03-10 01:00:00  54.0  76.06  160.79    0.0  NaN   
2          132539.0 2025-03-10 02:00:00  54.0  76.06  160.79    0.0  NaN   
3          132539.0 2025-03-10 03:00:00  54.0  76.06  160.79    0.0  NaN   
4          132539.0 2025-03-10 04:00:00  54.0  76.06  160.79    0.0  NaN   
5          132539.0 2025-03-10 05:00:00  54.0  76.06  160.79    0.0  NaN   
6          132539.0 2025-03-10 06:00:00  54.0  76.06  160.79    0.0  NaN   
7          132539.0 2025-03-10 08:00:00  54.0  76.06  160.79    0.0  NaN   
8          132539.0 2025-03-10 09:00:00  54.0  76.06  160.79    0.0  NaN   
9          132539.0 2025-03-10 10:00:00  54.0  76.06  160.79    0.0  NaN   

Parameter  Creatinine   GCS  Glucose  ...  PaCO2  PaO2  pH  DiasABP  MAP  \
0                 NaN   NaN      NaN  ...    NaN   NaN NaN      NaN  NaN   
1          

# MechVent Imputation

MechVent is a feature with a peculiar behaviour, so we adjust it manually.
The website assures the values are 0/1. This is not true, there are only 1 values. So we assume that those patients that have the 1 somewhere has had the MechVentilation for the whole time. While those patients that have only NA didn't have any MechVent. In that way it becomes a static variable, but this choice seems more reasonable to us. 

In [11]:
# Check MechVent column in Set A
set_a_df = sets_dict["set_a"]
print(set_a_df["MechVent"])
print(set_a_df["MechVent"].unique())

0        NaN
1        NaN
2        NaN
3        NaN
4        NaN
          ..
183411   NaN
183412   NaN
183413   NaN
183414   NaN
183415   NaN
Name: MechVent, Length: 183416, dtype: float64
[nan  1.]


In [12]:
# DO the imputation
for set_key, df in sets_dict.items():
    df["MechVent"] = df.groupby("RecordID")["MechVent"].transform(
        lambda x: 1 if x.eq(1).any() else 0
    )

In [13]:
# Check that each patient has only one value for MechVent and it is 0 or 1
for set_key, df in sets_dict.items():
    unique_values = df.groupby("RecordID")["MechVent"].nunique()
    if not all(unique_values <= 1):
        print(f"More than one unique value for MechVent in {set_key}.")

    if not all(df["MechVent"].isin([0, 1])):
        print(f"Invalid values for MechVent in {set_key}.")

## Forward Filling

In [11]:
def forward_fill(df):
    # Ensure the DataFrame is sorted by RecordID and Time
    df.sort_values(by=["RecordID", "Time"], inplace=True)

    # Get a list of all columns except "RecordID" and "Time"
    other_cols = [col for col in df.columns if col != "RecordID" and col != "Time"]

    # Group by RecordID and apply forward fill for each group.
    df[other_cols] = df.groupby("RecordID")[other_cols].ffill()

    return df


for set_key, df in sets_dict.items():
    # Forward fill the DataFrame
    filled_df = forward_fill(df)
    sets_dict[set_key] = filled_df  # Update dictionary (optional)

    # Export to Parquet file (e.g., "set_a....parquet")
    output_filename = (
        PROCESSED_DATA_DIR / f"{set_key}" / f"{set_key}_before_backward.parquet"
    )
    filled_df.to_parquet(output_filename, index=False)
    print(f"Forward-filled data for {set_key} saved as {output_filename}")

Forward-filled data for set_a saved as /Users/francescobondi/Desktop/stuff/ETH/FS25/ML for Healthcare/project-1-ml4hc/data/processed/set_a/set_a_before_backward.parquet
Forward-filled data for set_b saved as /Users/francescobondi/Desktop/stuff/ETH/FS25/ML for Healthcare/project-1-ml4hc/data/processed/set_b/set_b_before_backward.parquet
Forward-filled data for set_c saved as /Users/francescobondi/Desktop/stuff/ETH/FS25/ML for Healthcare/project-1-ml4hc/data/processed/set_c/set_c_before_backward.parquet


# Backward Filling using Interpolation

In [12]:
def time_based_interpolation(df):
    """
    Perform time-based interpolation on the DataFrame.

    This function:
      - Converts the "Time" column to datetime,
      - Sets "Time" as the index,
      - Interpolates numeric columns (excluding "RecordID") using method='time'
        with limit_direction='both',
      - Resets the index to restore "Time" as a regular column.

    Parameters:
      df (pd.DataFrame): Input DataFrame with at least "Time" and "RecordID" columns.

    Returns:
      pd.DataFrame: The DataFrame with interpolated values.
    """
    # Ensure the "Time" column is in datetime format.
    df["Time"] = pd.to_datetime(df["Time"])

    # Set "Time" as the DataFrame index for time-based interpolation.
    df = df.set_index("Time")

    # Identify the columns to interpolate (exclude non-numeric columns like "RecordID").
    cols_to_interp = [col for col in df.columns if col not in ["RecordID", "Time"]]

    # Apply time-based interpolation; limit_direction='both' fills NaNs at the start and end too.
    df[cols_to_interp] = df[cols_to_interp].interpolate(
        method="time", limit_direction="both"
    )

    # Restore "Time" as a regular column by resetting the index.
    df = df.reset_index()

    return df


for key, df in sets_dict.items():
    sets_dict[key] = time_based_interpolation(df)
    print(f"After interpolation, {key} has shape: {sets_dict[key].shape}")

# Saving temporary data for plotting
for set_name, set_df in tqdm(sets_dict.items(), desc="Storing DataFrames", unit="set"):
    output_path = (
        PROCESSED_DATA_DIR / f"{set_name}" / f"{set_name}_before_scaling.parquet"
    )
    set_df.to_parquet(output_path, index=False, engine="pyarrow")
    print(f"Saved {output_path}")

  df[cols_to_interp] = df[cols_to_interp].interpolate(


After interpolation, set_a has shape: (183416, 43)


  df[cols_to_interp] = df[cols_to_interp].interpolate(


After interpolation, set_b has shape: (183495, 43)


  df[cols_to_interp] = df[cols_to_interp].interpolate(


After interpolation, set_c has shape: (183711, 43)


Storing DataFrames:  33%|███▎      | 1/3 [00:00<00:00,  9.91set/s]

Saved /Users/francescobondi/Desktop/stuff/ETH/FS25/ML for Healthcare/project-1-ml4hc/data/processed/set_a/set_a_before_scaling.parquet
Saved /Users/francescobondi/Desktop/stuff/ETH/FS25/ML for Healthcare/project-1-ml4hc/data/processed/set_b/set_b_before_scaling.parquet


Storing DataFrames: 100%|██████████| 3/3 [00:00<00:00, 10.69set/s]

Saved /Users/francescobondi/Desktop/stuff/ETH/FS25/ML for Healthcare/project-1-ml4hc/data/processed/set_c/set_c_before_scaling.parquet





## Scale the data

In [13]:
from sklearn.preprocessing import StandardScaler, RobustScaler

cols_to_scale = [col for col in df.columns if col not in ["RecordID", "Time", "Gender"]]

### For normally-distributed columns, we use the StandardScaler
### For non-normally-distributed columns, we use the RobustScaler

nd_cols = [
    "Height",
    "Weight",
    "Age",
    "Albumin",
    "Cholesterol",
    "DiasABP",
    "HCO3",
    "HCT",
    "HR",
    "Mg",
    "MAP",
    "Na",
    "NIDiasABP",
    "NIMAP",
    "NISysABP",
    "SysABP",
    "PaCO2",
    "PaO2",
    "Platelets",
    "RespRate",
    "Temp",
]
nnd_cols = [col for col in cols_to_scale if col not in nd_cols]

scaler_nd = StandardScaler()
scaler_nnd = RobustScaler()

# Process each set: fit on set_a, then only transform on the others.
for set_key in ["set_a", "set_b", "set_c"]:
    df = sets_dict[set_key]
    if set_key == "set_a":
        # Fit on the first set
        scaled_values_nd = scaler_nd.fit_transform(df[nd_cols])
        scaled_values_nnd = scaler_nnd.fit_transform(df[nnd_cols])
    else:
        # Transform the other sets using the fitted scalers
        scaled_values_nd = scaler_nd.transform(df[nd_cols])
        scaled_values_nnd = scaler_nnd.transform(df[nnd_cols])

    # Convert the scaled numpy arrays to DataFrames while preserving the index
    df_scaled_nd = pd.DataFrame(scaled_values_nd, columns=nd_cols, index=df.index)
    df_scaled_nnd = pd.DataFrame(scaled_values_nnd, columns=nnd_cols, index=df.index)

    # Combine the scaled DataFrames along the columns axis
    df_scaled = pd.concat([df_scaled_nd, df_scaled_nnd], axis=1)

    # Combine the unmodified columns with the scaled columns.
    df_final = pd.concat(
        [
            df[["RecordID", "Time", "Gender"]].reset_index(drop=True),
            df_scaled.reset_index(drop=True),
        ],
        axis=1,
    )

    # Update the dictionary with the final DataFrame
    sets_dict[set_key] = df_final

# Optionally, print the first 10 rows of set_a to check the result.
print(sets_dict["set_a"].head(10))

   RecordID                Time Gender    Height   Weight       Age   Albumin  \
0  132539.0 2025-03-10 00:00:00    0.0 -0.950526 -0.23008 -0.596332  1.671639   
1  132539.0 2025-03-10 01:00:00    0.0 -0.950526 -0.23008 -0.596332  1.967793   
2  132539.0 2025-03-10 02:00:00    0.0 -0.950526 -0.23008 -0.596332 -1.734132   
3  132539.0 2025-03-10 03:00:00    0.0 -0.950526 -0.23008 -0.596332  1.523562   
4  132539.0 2025-03-10 04:00:00    0.0 -0.950526 -0.23008 -0.596332  0.487023   
5  132539.0 2025-03-10 05:00:00    0.0 -0.950526 -0.23008 -0.596332  0.042792   
6  132539.0 2025-03-10 06:00:00    0.0 -0.950526 -0.23008 -0.596332  0.635100   
7  132539.0 2025-03-10 08:00:00    0.0 -0.950526 -0.23008 -0.596332  0.635100   
8  132539.0 2025-03-10 09:00:00    0.0 -0.950526 -0.23008 -0.596332 -0.401439   
9  132539.0 2025-03-10 10:00:00    0.0 -0.950526 -0.23008 -0.596332 -0.105285   

   Cholesterol   DiasABP      HCO3  ...      Urine       WBC     pH  MechVent  \
0    -0.013487 -0.832594 -0

# Save the data

In [14]:
for set_name, set_df in tqdm(sets_dict.items(), desc="Storing DataFrames", unit="set"):
    output_path = PROCESSED_DATA_DIR / f"{set_name}" / f"{set_name}_final.parquet"
    set_df.to_parquet(output_path, index=False, engine="pyarrow")
    print(f"Saved {output_path}")

print("\nAll DataFrames have been saved to Parquet format.")

Storing DataFrames: 100%|██████████| 3/3 [00:00<00:00, 11.66set/s]

Saved /Users/francescobondi/Desktop/stuff/ETH/FS25/ML for Healthcare/project-1-ml4hc/data/processed/set_a/set_a_final.parquet
Saved /Users/francescobondi/Desktop/stuff/ETH/FS25/ML for Healthcare/project-1-ml4hc/data/processed/set_b/set_b_final.parquet
Saved /Users/francescobondi/Desktop/stuff/ETH/FS25/ML for Healthcare/project-1-ml4hc/data/processed/set_c/set_c_final.parquet

All DataFrames have been saved to Parquet format.



