# This Jupyter Notebook serves to the following:
- Import three data sets (A => Training Set, B => Validation Set, C => Test Set)
- Preprocess all three using Forward Fill, then adjusting the Time format
- Saving the new DataFrames to local files ([train, validation, test]_set.h5) using HDF5 method (fast and efficient for large datasets)

# Setup the absolute / relative path

In [1]:
from pathlib import Path
import os

PROJECT_ROOT = Path(os.getcwd()).resolve().parent
DATA_DIR = PROJECT_ROOT / "data"

print(f"Project root: {PROJECT_ROOT}")

Project root: /Users/francescobondi/Desktop/stuff/ETH/FS25/ML for Healthcare/project-1-ml4hc


# First, visualize data

### Here we extract the three outcomes (A, B, C)

In [2]:
import pandas as pd

# Load structured data
outcomes_a = PROJECT_ROOT / "data/data_1/predicting-mortality-of-icu-patients-the-physionet-computing-in-cardiology-challenge-2012-1.0.0/Outcomes-a.txt"
outcomes_b = PROJECT_ROOT / "data/data_1/predicting-mortality-of-icu-patients-the-physionet-computing-in-cardiology-challenge-2012-1.0.0/Outcomes-b.txt"
outcomes_c = PROJECT_ROOT / "data/data_1/predicting-mortality-of-icu-patients-the-physionet-computing-in-cardiology-challenge-2012-1.0.0/Outcomes-c.txt"
outcomes_a_df = pd.read_csv(outcomes_a)
outcomes_b_df = pd.read_csv(outcomes_b)
outcomes_c_df = pd.read_csv(outcomes_c)
print(outcomes_a_df.head())
print(outcomes_a_df.info())

   RecordID  SAPS-I  SOFA  Length_of_stay  Survival  In-hospital_death
0    132539       6     1               5        -1                  0
1    132540      16     8               8        -1                  0
2    132541      21    11              19        -1                  0
3    132543       7     1               9       575                  0
4    132545      17     2               4       918                  0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   RecordID           4000 non-null   int64
 1   SAPS-I             4000 non-null   int64
 2   SOFA               4000 non-null   int64
 3   Length_of_stay     4000 non-null   int64
 4   Survival           4000 non-null   int64
 5   In-hospital_death  4000 non-null   int64
dtypes: int64(6)
memory usage: 187.6 KB
None


In [3]:
import os

# Define folder path
set_a_folder = PROJECT_ROOT / "data/data_1/predicting-mortality-of-icu-patients-the-physionet-computing-in-cardiology-challenge-2012-1.0.0/set-a"

# Create an empty list to store patient data
patient_data_list = []

# Loop through all patient files
for filename in os.listdir(set_a_folder):
    if filename.endswith(".txt"):  # Ensure processing only text files
        file_path = os.path.join(set_a_folder, filename)
        
        # Read patient file
        patient_df = pd.read_csv(file_path)
        
        # Extract RecordID
        record_id = patient_df.loc[patient_df['Parameter'] == 'RecordID', 'Value'].values[0]
        
        # Pivot with aggregation to avoid duplicates
        patient_df = patient_df.pivot_table(index='Time', columns='Parameter', values='Value', aggfunc='first')
        
        # Reset index
        patient_df.reset_index(inplace=True)

        # Ensure 'RecordID' is not already in the DataFrame
        if 'RecordID' in patient_df.columns:
            patient_df.drop(columns=['RecordID'], inplace=True)
        
        # Add RecordID as the first column
        patient_df.insert(0, "RecordID", record_id)
        
        # Store in list
        patient_data_list.append(patient_df)

# Combine all patient data into a single DataFrame
patients_df = pd.concat(patient_data_list, ignore_index=True)

# Merge patient data with outcome data
merged_df = patients_df.merge(outcomes_a_df, on="RecordID", how="left")
print(f"Num of Records: {patients_df.shape[0]}")
print(f"Num of Patients: {len(patients_df["RecordID"].unique())}")
patients_df.head(10)

Num of Records: 299264
Num of Patients: 4000


Parameter,RecordID,Time,Age,BUN,Creatinine,GCS,Gender,Glucose,HCO3,HCT,...,PaCO2,PaO2,pH,DiasABP,MAP,SaO2,SysABP,Lactate,Cholesterol,TroponinI
0,132592.0,00:00,35.0,,,,0.0,,,,...,,,,,,,,,,
1,132592.0,01:20,,,,15.0,,,,,...,,,,,,,,,,
2,132592.0,02:20,,,,,,,,,...,,,,,,,,,,
3,132592.0,02:36,,68.0,2.3,,,603.0,11.0,25.5,...,,,,,,,,,,
4,132592.0,03:20,,,,,,,,,...,,,,,,,,,,
5,132592.0,04:20,,,,,,,,,...,,,,,,,,,,
6,132592.0,05:20,,61.0,2.0,,,362.0,15.0,23.7,...,,,,,,,,,,
7,132592.0,06:20,,,,15.0,,,,,...,,,,,,,,,,
8,132592.0,07:20,,59.0,1.9,,,254.0,15.0,23.2,...,,,,,,,,,,
9,132592.0,08:20,,,,,,,,,...,,,,,,,,,,


# We can count the number of non-NaN rows for the dataframe

In [7]:
merged_df.head(10)

Unnamed: 0,RecordID,Time,Age,BUN,Creatinine,GCS,Gender,Glucose,HCO3,HCT,...,SaO2,SysABP,Lactate,Cholesterol,TroponinI,SAPS-I,SOFA,Length_of_stay,Survival,In-hospital_death
0,132592.0,00:00,35.0,,,,0.0,,,,...,,,,,,15,3,3,-1,0
1,132592.0,01:20,,,,15.0,,,,,...,,,,,,15,3,3,-1,0
2,132592.0,02:20,,,,,,,,,...,,,,,,15,3,3,-1,0
3,132592.0,02:36,,68.0,2.3,,,603.0,11.0,25.5,...,,,,,,15,3,3,-1,0
4,132592.0,03:20,,,,,,,,,...,,,,,,15,3,3,-1,0
5,132592.0,04:20,,,,,,,,,...,,,,,,15,3,3,-1,0
6,132592.0,05:20,,61.0,2.0,,,362.0,15.0,23.7,...,,,,,,15,3,3,-1,0
7,132592.0,06:20,,,,15.0,,,,,...,,,,,,15,3,3,-1,0
8,132592.0,07:20,,59.0,1.9,,,254.0,15.0,23.2,...,,,,,,15,3,3,-1,0
9,132592.0,08:20,,,,,,,,,...,,,,,,15,3,3,-1,0


In [11]:
patients_df["non_nan_count"] = patients_df.notna().sum(axis=1)
print(patients_df["non_nan_count"].describe())

# Check the first few rows
print(patients_df[["Time", "RecordID", "non_nan_count"]].head())

count    299264.000000
mean          8.848468
std           2.580872
min           4.000000
25%           8.000000
50%           9.000000
75%          10.000000
max          31.000000
Name: non_nan_count, dtype: float64
Parameter   Time  RecordID  non_nan_count
0          00:00  132592.0              8
1          01:20  132592.0             11
2          02:20  132592.0             10
3          02:36  132592.0             14
4          03:20  132592.0             10


### There are on average almost 9 non NaN values per row. Use forward fill to fill the missing values.

In [4]:
# Sort the DataFrame by RecordID and Time
'''print("Shape before sorting:", patients_df.shape)
number_rows_record = patients_df["RecordID"].value_counts()
print(number_rows_record.describe()) # Here it's count = 4000!!!'''

t_patients_df = merged_df.sort_values(by=["RecordID", "Time"])

# Identify columns that should be forward filled (i.e. all except RecordID)
cols_to_ffill = [col for col in t_patients_df.columns if col != "RecordID" and col != "Parameter" and col != "Time"]

# Apply forward fill to those columns within each RecordID group using transform
t_patients_df[cols_to_ffill] = t_patients_df.groupby("RecordID")[cols_to_ffill].transform("ffill")
#t_patients_df["RecordID"] = t_patients_df["RecordID"].astype(float)
'''print("Shape after sorting:", t_patients_df.shape)

# Check number of rows for RecordID
number_rows_record = t_patients_df["RecordID"].value_counts()
print(number_rows_record.describe())'''

# Now reset Parameter column
#t_patients_df["Parameter"] = t_patients_df["Parameter"] - t_patients_df["Parameter"].min()
#print(t_patients_df.columns)
t_patients_df.reset_index(drop=True, inplace=True)

# Finally copy data
merged_df = t_patients_df.copy()
t_patients_df.head(10)

Unnamed: 0,RecordID,Time,Age,BUN,Creatinine,GCS,Gender,Glucose,HCO3,HCT,...,SaO2,SysABP,Lactate,Cholesterol,TroponinI,SAPS-I,SOFA,Length_of_stay,Survival,In-hospital_death
0,132539.0,00:00,54.0,,,,0.0,,,,...,,,,,,6,1,5,-1,0
1,132539.0,00:07,54.0,,,15.0,0.0,,,,...,,,,,,6,1,5,-1,0
2,132539.0,00:37,54.0,,,15.0,0.0,,,,...,,,,,,6,1,5,-1,0
3,132539.0,01:37,54.0,,,15.0,0.0,,,,...,,,,,,6,1,5,-1,0
4,132539.0,02:37,54.0,,,15.0,0.0,,,,...,,,,,,6,1,5,-1,0
5,132539.0,03:08,54.0,,,15.0,0.0,,,33.7,...,,,,,,6,1,5,-1,0
6,132539.0,03:37,54.0,,,15.0,0.0,,,33.7,...,,,,,,6,1,5,-1,0
7,132539.0,04:37,54.0,,,15.0,0.0,,,33.7,...,,,,,,6,1,5,-1,0
8,132539.0,05:37,54.0,,,15.0,0.0,,,33.7,...,,,,,,6,1,5,-1,0
9,132539.0,07:37,54.0,,,15.0,0.0,,,33.7,...,,,,,,6,1,5,-1,0


# Insert a base standard date for each measurement
## And then correct the hours exceeding 24-hour format

In [4]:
from datetime import datetime, timedelta

base_date = "2025-03-10" # Format is YYYY-MM-DD
# Function to fix invalid times
def adjust_time(time_str, base_date):
    # Split hours and minutes
    hours, minutes = map(int, time_str.split(":"))
    
    # Calculate valid hour & days overflow
    day_offset = hours // 24  # Number of days to add
    new_hour = hours % 24  # Wrapped hour (0-23)
    
    # Create the corrected datetime
    corrected_datetime = datetime.strptime(base_date, "%Y-%m-%d") + timedelta(days=day_offset, hours=new_hour, minutes=minutes)
    
    return corrected_datetime

# Apply the function to the 'Time' column
merged_df['Time'] = merged_df['Time'].apply(lambda x: adjust_time(x, base_date))
merged_df.head(10)



Unnamed: 0,RecordID,Time,Age,BUN,Creatinine,GCS,Gender,Glucose,HCO3,HCT,...,SaO2,SysABP,Lactate,Cholesterol,TroponinI,SAPS-I,SOFA,Length_of_stay,Survival,In-hospital_death
0,132592.0,2025-03-10 00:00:00,35.0,,,,0.0,,,,...,,,,,,15,3,3,-1,0
1,132592.0,2025-03-10 01:20:00,,,,15.0,,,,,...,,,,,,15,3,3,-1,0
2,132592.0,2025-03-10 02:20:00,,,,,,,,,...,,,,,,15,3,3,-1,0
3,132592.0,2025-03-10 02:36:00,,68.0,2.3,,,603.0,11.0,25.5,...,,,,,,15,3,3,-1,0
4,132592.0,2025-03-10 03:20:00,,,,,,,,,...,,,,,,15,3,3,-1,0
5,132592.0,2025-03-10 04:20:00,,,,,,,,,...,,,,,,15,3,3,-1,0
6,132592.0,2025-03-10 05:20:00,,61.0,2.0,,,362.0,15.0,23.7,...,,,,,,15,3,3,-1,0
7,132592.0,2025-03-10 06:20:00,,,,15.0,,,,,...,,,,,,15,3,3,-1,0
8,132592.0,2025-03-10 07:20:00,,59.0,1.9,,,254.0,15.0,23.2,...,,,,,,15,3,3,-1,0
9,132592.0,2025-03-10 08:20:00,,,,,,,,,...,,,,,,15,3,3,-1,0


In [7]:
# Prova
# Print the shape of the array of column Weight for Record ID = 132539
print(merged_df[merged_df["RecordID"] == 132539]["Weight"].shape)

(51,)


## Round time

In [8]:
def round_up_next_hour(ts):
    # If already on an hour, add one hour to always go to the next hour
    if ts.minute == 0 and ts.second == 0 and ts.microsecond == 0:
        return ts + pd.Timedelta(hours=1)
    # Otherwise, use ceil to round up to the nearest hour
    return ts.ceil("H")


# Apply the function to create a new column with discretized times
merged_df["Time"] = merged_df["Time"].apply(round_up_next_hour)
merged_df.head(20)

  return ts.ceil("H")


Unnamed: 0,RecordID,Time,Age,BUN,Creatinine,GCS,Gender,Glucose,HCO3,HCT,...,SaO2,SysABP,Lactate,Cholesterol,TroponinI,SAPS-I,SOFA,Length_of_stay,Survival,In-hospital_death
0,132592.0,2025-03-10 01:00:00,35.0,,,,0.0,,,,...,,,,,,15,3,3,-1,0
1,132592.0,2025-03-10 02:00:00,,,,15.0,,,,,...,,,,,,15,3,3,-1,0
2,132592.0,2025-03-10 03:00:00,,,,,,,,,...,,,,,,15,3,3,-1,0
3,132592.0,2025-03-10 03:00:00,,68.0,2.3,,,603.0,11.0,25.5,...,,,,,,15,3,3,-1,0
4,132592.0,2025-03-10 04:00:00,,,,,,,,,...,,,,,,15,3,3,-1,0
5,132592.0,2025-03-10 05:00:00,,,,,,,,,...,,,,,,15,3,3,-1,0
6,132592.0,2025-03-10 06:00:00,,61.0,2.0,,,362.0,15.0,23.7,...,,,,,,15,3,3,-1,0
7,132592.0,2025-03-10 07:00:00,,,,15.0,,,,,...,,,,,,15,3,3,-1,0
8,132592.0,2025-03-10 08:00:00,,59.0,1.9,,,254.0,15.0,23.2,...,,,,,,15,3,3,-1,0
9,132592.0,2025-03-10 09:00:00,,,,,,,,,...,,,,,,15,3,3,-1,0


# Group by Time Step

In [9]:
final_df = merged_df.copy()
final_df = final_df.groupby(["RecordID", "Time"], as_index=False).mean()
print(final_df.shape)
final_df.head(10)

(180555, 48)


Unnamed: 0,RecordID,Time,Age,BUN,Creatinine,GCS,Gender,Glucose,HCO3,HCT,...,SaO2,SysABP,Lactate,Cholesterol,TroponinI,SAPS-I,SOFA,Length_of_stay,Survival,In-hospital_death
0,132539.0,2025-03-10 01:00:00,54.0,,,15.0,0.0,,,,...,,,,,,6.0,1.0,5.0,-1.0,0.0
1,132539.0,2025-03-10 02:00:00,,,,,,,,,...,,,,,,6.0,1.0,5.0,-1.0,0.0
2,132539.0,2025-03-10 03:00:00,,,,,,,,,...,,,,,,6.0,1.0,5.0,-1.0,0.0
3,132539.0,2025-03-10 04:00:00,,,,15.0,,,,33.7,...,,,,,,6.0,1.0,5.0,-1.0,0.0
4,132539.0,2025-03-10 05:00:00,,,,,,,,,...,,,,,,6.0,1.0,5.0,-1.0,0.0
5,132539.0,2025-03-10 06:00:00,,,,,,,,,...,,,,,,6.0,1.0,5.0,-1.0,0.0
6,132539.0,2025-03-10 08:00:00,,,,15.0,,,,,...,,,,,,6.0,1.0,5.0,-1.0,0.0
7,132539.0,2025-03-10 09:00:00,,,,,,,,,...,,,,,,6.0,1.0,5.0,-1.0,0.0
8,132539.0,2025-03-10 10:00:00,,,,,,,,,...,,,,,,6.0,1.0,5.0,-1.0,0.0
9,132539.0,2025-03-10 11:00:00,,13.0,0.8,,,205.0,26.0,33.5,...,,,,,,6.0,1.0,5.0,-1.0,0.0


In [13]:
print(final_df[final_df["RecordID"] == 132539.0]["Weight"])

0    -1.0
1     NaN
2     NaN
3     NaN
4     NaN
5     NaN
6     NaN
7     NaN
8     NaN
9     NaN
10    NaN
11    NaN
12    NaN
13    NaN
14    NaN
15    NaN
16    NaN
17    NaN
18    NaN
19    NaN
20    NaN
21    NaN
22    NaN
23    NaN
24    NaN
25    NaN
26    NaN
27    NaN
28    NaN
29    NaN
30    NaN
31    NaN
32    NaN
33    NaN
34    NaN
35    NaN
36    NaN
37    NaN
38    NaN
39    NaN
40    NaN
41    NaN
42    NaN
43    NaN
44    NaN
45    NaN
46    NaN
Name: Weight, dtype: float64


# Now let's save the updated DataFrame in HDF5 format (very nice for Time Series)

In [8]:
OUTPUT_FOLDER = PROJECT_ROOT / "data/processed"

# Remove the final 5 features (do not save them to file)
final_df = final_df.iloc[:, :-5]
print(final_df.shape)

# Save
#final_df.to_hdf(OUTPUT_FOLDER / "train_set.h5", key="patients", mode="w")
final_df.to_parquet(OUTPUT_FOLDER / "train_set.parquet", index=False)

(180555, 43)


# Step 2. Process Set B (Validation Set)

In [68]:
import os

# Define folder path
set_b_folder = PROJECT_ROOT / "data/data_1/predicting-mortality-of-icu-patients-the-physionet-computing-in-cardiology-challenge-2012-1.0.0/set-b"

# Create an empty list to store patient data
patient_data_list = []

# Loop through all patient files
for filename in os.listdir(set_b_folder):
    if filename.endswith(".txt"):  # Ensure processing only text files
        file_path = os.path.join(set_b_folder, filename)
        
        # Read patient file
        patient_df = pd.read_csv(file_path)
        
        # Extract RecordID
        record_id = patient_df.loc[patient_df['Parameter'] == 'RecordID', 'Value'].values[0]
        
        # Pivot with aggregation to avoid duplicates
        patient_df = patient_df.pivot_table(index='Time', columns='Parameter', values='Value', aggfunc='first')
        
        # Reset index
        patient_df.reset_index(inplace=True)

        # Ensure 'RecordID' is not already in the DataFrame
        if 'RecordID' in patient_df.columns:
            patient_df.drop(columns=['RecordID'], inplace=True)
        
        # Add RecordID as the first column
        patient_df.insert(0, "RecordID", record_id)
        
        # Store in list
        patient_data_list.append(patient_df)

# Combine all patient data into a single DataFrame
patients_df = pd.concat(patient_data_list, ignore_index=True)

# Merge patient data with outcome data
merged_df = patients_df.merge(outcomes_b_df, on="RecordID", how="left")
print(f"Num of Records: {patients_df.shape[0]}")
print(f"Num of Patients: {len(patients_df["RecordID"].unique())}")
patients_df.head(10)

Num of Records: 299068
Num of Patients: 4000


Parameter,RecordID,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,Creatinine,...,Temp,Urine,WBC,Weight,pH,Lactate,Cholesterol,RespRate,TroponinT,TroponinI
0,144404.0,00:00,,,,71.0,,,,,...,,,,74.2,,,,,,
1,144404.0,00:04,,,,,,,,,...,37.4,140.0,,74.2,,,,,,
2,144404.0,00:19,,,,,,,,,...,,,,,,,,,,
3,144404.0,01:04,,,,,,,,,...,,80.0,,74.2,,,,,,
4,144404.0,02:04,,,,,,,,,...,,85.0,,74.2,,,,,,
5,144404.0,02:20,,,,,,,,,...,,,,,7.45,,,,,
6,144404.0,03:03,177.0,18.0,25.0,,1.8,24.0,0.7,0.6,...,,,17.9,,,,,,,
7,144404.0,03:04,,,,,,,,,...,,60.0,,74.2,,,,,,
8,144404.0,04:04,,,,,,,,,...,,100.0,,74.2,,,,,,
9,144404.0,04:34,,,,,,,,,...,,,,,,,,,,


In [69]:
# Sort the DataFrame by RecordID and Time
t_patients_df = merged_df.sort_values(by=["RecordID", "Time"])

# Identify columns that should be forward filled (i.e. all except RecordID)
cols_to_ffill = [col for col in t_patients_df.columns if col != "RecordID" and col != "Parameter" and col != "Time"]

# Apply forward fill to those columns within each RecordID group using transform
t_patients_df[cols_to_ffill] = t_patients_df.groupby("RecordID")[cols_to_ffill].transform("ffill")

# Now reset Parameter column
#t_patients_df["Parameter"] = t_patients_df["Parameter"] - t_patients_df["Parameter"].min()
#print(t_patients_df.columns)
t_patients_df.reset_index(drop=True, inplace=True)

# Finally copy data
merged_df = t_patients_df.copy()
t_patients_df.head(10)

Unnamed: 0,RecordID,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,Creatinine,...,Lactate,Cholesterol,RespRate,TroponinT,TroponinI,SAPS-I,SOFA,Length_of_stay,Survival,In-hospital_death
0,142675.0,00:00,,,,70.0,,,,,...,,,,,,27,14,9,7,1
1,142675.0,00:44,,,,70.0,,,,,...,,,,,,27,14,9,7,1
2,142675.0,01:18,,,,70.0,,,,,...,,,,,,27,14,9,7,1
3,142675.0,01:33,,,,70.0,,,,,...,,,,,,27,14,9,7,1
4,142675.0,01:39,,,,70.0,,,,,...,,,,,,27,14,9,7,1
5,142675.0,01:48,,,,70.0,,,,,...,,,,,,27,14,9,7,1
6,142675.0,01:53,,,,70.0,,,,,...,,,,,,27,14,9,7,1
7,142675.0,01:58,,,,70.0,,,,,...,,,,,,27,14,9,7,1
8,142675.0,01:59,,,,70.0,,,,,...,,,,,,27,14,9,7,1
9,142675.0,02:03,,,,70.0,,,,,...,,,,,,27,14,9,7,1


In [70]:
from datetime import datetime, timedelta

base_date = "2025-03-10" # Format is YYYY-MM-DD
# Function to fix invalid times
def adjust_time(time_str, base_date):
    # Split hours and minutes
    hours, minutes = map(int, time_str.split(":"))
    
    # Calculate valid hour & days overflow
    day_offset = hours // 24  # Number of days to add
    new_hour = hours % 24  # Wrapped hour (0-23)
    
    # Create the corrected datetime
    corrected_datetime = datetime.strptime(base_date, "%Y-%m-%d") + timedelta(days=day_offset, hours=new_hour, minutes=minutes)
    
    return corrected_datetime

# Apply the function to the 'Time' column
merged_df['Time'] = merged_df['Time'].apply(lambda x: adjust_time(x, base_date))
merged_df.head(10)

Unnamed: 0,RecordID,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,Creatinine,...,Lactate,Cholesterol,RespRate,TroponinT,TroponinI,SAPS-I,SOFA,Length_of_stay,Survival,In-hospital_death
0,142675.0,2025-03-10 00:00:00,,,,70.0,,,,,...,,,,,,27,14,9,7,1
1,142675.0,2025-03-10 00:44:00,,,,70.0,,,,,...,,,,,,27,14,9,7,1
2,142675.0,2025-03-10 01:18:00,,,,70.0,,,,,...,,,,,,27,14,9,7,1
3,142675.0,2025-03-10 01:33:00,,,,70.0,,,,,...,,,,,,27,14,9,7,1
4,142675.0,2025-03-10 01:39:00,,,,70.0,,,,,...,,,,,,27,14,9,7,1
5,142675.0,2025-03-10 01:48:00,,,,70.0,,,,,...,,,,,,27,14,9,7,1
6,142675.0,2025-03-10 01:53:00,,,,70.0,,,,,...,,,,,,27,14,9,7,1
7,142675.0,2025-03-10 01:58:00,,,,70.0,,,,,...,,,,,,27,14,9,7,1
8,142675.0,2025-03-10 01:59:00,,,,70.0,,,,,...,,,,,,27,14,9,7,1
9,142675.0,2025-03-10 02:03:00,,,,70.0,,,,,...,,,,,,27,14,9,7,1


## Round up the timestamps

In [None]:
def round_up_next_hour(ts):
    # If already on an hour, add one hour to always go to the next hour
    if ts.minute == 0 and ts.second == 0 and ts.microsecond == 0:
        return ts + pd.Timedelta(hours=1)
    # Otherwise, use ceil to round up to the nearest hour
    return ts.ceil("H")


# Apply the function to create a new column with discretized times
merged_df["Time"] = merged_df["Time"].apply(round_up_next_hour)
merged_df.head(20)

In [71]:
OUTPUT_FOLDER = PROJECT_ROOT / "data/processed"

# Save
merged_df.to_hdf(OUTPUT_FOLDER / "validation_set.h5", key="patients", mode="w")

# Step 3. Process Test Set (Set C)

In [72]:
import os

# Define folder path
set_c_folder = PROJECT_ROOT / "data/data_1/predicting-mortality-of-icu-patients-the-physionet-computing-in-cardiology-challenge-2012-1.0.0/set-c"

# Create an empty list to store patient data
patient_data_list = []

# Loop through all patient files
for filename in os.listdir(set_c_folder):
    if filename.endswith(".txt"):  # Ensure processing only text files
        file_path = os.path.join(set_c_folder, filename)
        
        # Read patient file
        patient_df = pd.read_csv(file_path)
        
        # Extract RecordID
        record_id = patient_df.loc[patient_df['Parameter'] == 'RecordID', 'Value'].values[0]
        
        # Pivot with aggregation to avoid duplicates
        patient_df = patient_df.pivot_table(index='Time', columns='Parameter', values='Value', aggfunc='first')
        
        # Reset index
        patient_df.reset_index(inplace=True)

        # Ensure 'RecordID' is not already in the DataFrame
        if 'RecordID' in patient_df.columns:
            patient_df.drop(columns=['RecordID'], inplace=True)
        
        # Add RecordID as the first column
        patient_df.insert(0, "RecordID", record_id)
        
        # Store in list
        patient_data_list.append(patient_df)

# Combine all patient data into a single DataFrame
patients_df = pd.concat(patient_data_list, ignore_index=True)

# Merge patient data with outcome data
merged_df = patients_df.merge(outcomes_c_df, on="RecordID", how="left")
print(f"Num of Records: {patients_df.shape[0]}")
print(f"Num of Patients: {len(patients_df["RecordID"].unique())}")
patients_df.head(10)

Num of Records: 300020
Num of Patients: 4000


Parameter,RecordID,Time,Age,BUN,Creatinine,DiasABP,FiO2,GCS,Gender,Glucose,...,pH,MechVent,TroponinT,ALP,ALT,AST,Albumin,Bilirubin,Cholesterol,TroponinI
0,154617.0,00:00,58.0,,,,,,1.0,,...,,,,,,,,,,
1,154617.0,00:12,,,,,,15.0,,,...,,,,,,,,,,
2,154617.0,00:27,,,,,,,,,...,,,,,,,,,,
3,154617.0,00:42,,,,31.0,,,,,...,,,,,,,,,,
4,154617.0,00:45,,,,,,,,,...,7.46,,,,,,,,,
5,154617.0,00:57,,,,75.0,,,,,...,,,,,,,,,,
6,154617.0,01:12,,,,58.0,,,,,...,,,,,,,,,,
7,154617.0,01:27,,,,57.0,,,,,...,,,,,,,,,,
8,154617.0,01:42,,11.0,0.5,55.0,,15.0,,190.0,...,,,,,,,,,,
9,154617.0,01:45,,,,,,,,,...,,,,,,,,,,


In [73]:
# Sort the DataFrame by RecordID and Time
t_patients_df = merged_df.sort_values(by=["RecordID", "Time"])

# Identify columns that should be forward filled (i.e. all except RecordID)
cols_to_ffill = [col for col in t_patients_df.columns if col != "RecordID" and col != "Parameter" and col != "Time"]

# Apply forward fill to those columns within each RecordID group using transform
t_patients_df[cols_to_ffill] = t_patients_df.groupby("RecordID")[cols_to_ffill].transform("ffill")

# Now reset Parameter column
#t_patients_df["Parameter"] = t_patients_df["Parameter"] - t_patients_df["Parameter"].min()
#print(t_patients_df.columns)
t_patients_df.reset_index(drop=True, inplace=True)

# Finally copy data
merged_df = t_patients_df.copy()
t_patients_df.head(10)

Unnamed: 0,RecordID,Time,Age,BUN,Creatinine,DiasABP,FiO2,GCS,Gender,Glucose,...,AST,Albumin,Bilirubin,Cholesterol,TroponinI,SAPS-I,SOFA,Length_of_stay,Survival,In-hospital_death
0,152871.0,00:00,71.0,,,,,,1.0,,...,,,,,,19,10,23,-1,0
1,152871.0,00:28,71.0,,,,0.4,,1.0,,...,,,,,,19,10,23,-1,0
2,152871.0,00:38,71.0,,,59.0,0.4,,1.0,,...,,,,,,19,10,23,-1,0
3,152871.0,00:48,71.0,,,59.0,0.4,,1.0,,...,,,,,,19,10,23,-1,0
4,152871.0,01:18,71.0,,,36.0,0.4,9.0,1.0,,...,,,,,,19,10,23,-1,0
5,152871.0,01:44,71.0,36.0,3.7,36.0,0.4,9.0,1.0,84.0,...,,,,,,19,10,23,-1,0
6,152871.0,01:48,71.0,36.0,3.7,54.0,0.4,9.0,1.0,84.0,...,,,,,,19,10,23,-1,0
7,152871.0,01:58,71.0,36.0,3.7,58.0,0.4,9.0,1.0,84.0,...,,,,,,19,10,23,-1,0
8,152871.0,02:03,71.0,36.0,3.7,58.0,0.4,9.0,1.0,84.0,...,,,,,,19,10,23,-1,0
9,152871.0,02:18,71.0,36.0,3.7,59.0,0.4,9.0,1.0,84.0,...,,,,,,19,10,23,-1,0


In [74]:
from datetime import datetime, timedelta

base_date = "2025-03-10" # Format is YYYY-MM-DD
# Function to fix invalid times
def adjust_time(time_str, base_date):
    # Split hours and minutes
    hours, minutes = map(int, time_str.split(":"))
    
    # Calculate valid hour & days overflow
    day_offset = hours // 24  # Number of days to add
    new_hour = hours % 24  # Wrapped hour (0-23)
    
    # Create the corrected datetime
    corrected_datetime = datetime.strptime(base_date, "%Y-%m-%d") + timedelta(days=day_offset, hours=new_hour, minutes=minutes)
    
    return corrected_datetime

# Apply the function to the 'Time' column
merged_df['Time'] = merged_df['Time'].apply(lambda x: adjust_time(x, base_date))
merged_df.head(10)

Unnamed: 0,RecordID,Time,Age,BUN,Creatinine,DiasABP,FiO2,GCS,Gender,Glucose,...,AST,Albumin,Bilirubin,Cholesterol,TroponinI,SAPS-I,SOFA,Length_of_stay,Survival,In-hospital_death
0,152871.0,2025-03-10 00:00:00,71.0,,,,,,1.0,,...,,,,,,19,10,23,-1,0
1,152871.0,2025-03-10 00:28:00,71.0,,,,0.4,,1.0,,...,,,,,,19,10,23,-1,0
2,152871.0,2025-03-10 00:38:00,71.0,,,59.0,0.4,,1.0,,...,,,,,,19,10,23,-1,0
3,152871.0,2025-03-10 00:48:00,71.0,,,59.0,0.4,,1.0,,...,,,,,,19,10,23,-1,0
4,152871.0,2025-03-10 01:18:00,71.0,,,36.0,0.4,9.0,1.0,,...,,,,,,19,10,23,-1,0
5,152871.0,2025-03-10 01:44:00,71.0,36.0,3.7,36.0,0.4,9.0,1.0,84.0,...,,,,,,19,10,23,-1,0
6,152871.0,2025-03-10 01:48:00,71.0,36.0,3.7,54.0,0.4,9.0,1.0,84.0,...,,,,,,19,10,23,-1,0
7,152871.0,2025-03-10 01:58:00,71.0,36.0,3.7,58.0,0.4,9.0,1.0,84.0,...,,,,,,19,10,23,-1,0
8,152871.0,2025-03-10 02:03:00,71.0,36.0,3.7,58.0,0.4,9.0,1.0,84.0,...,,,,,,19,10,23,-1,0
9,152871.0,2025-03-10 02:18:00,71.0,36.0,3.7,59.0,0.4,9.0,1.0,84.0,...,,,,,,19,10,23,-1,0


In [75]:
OUTPUT_FOLDER = PROJECT_ROOT / "data/processed"

# Save
patients_df.to_hdf(OUTPUT_FOLDER / "test_set.h5", key="patients", mode="w")