# Setup the absolute / relative path

In [2]:
from pathlib import Path
import os

PROJECT_ROOT = Path(os.getcwd()).resolve().parent
DATA_DIR = PROJECT_ROOT / "data"

print(f"Project root: {PROJECT_ROOT}")

Project root: /Users/francescobondi/Desktop/stuff/ETH/FS25/ML for Healthcare/project-1-ml4hc


# First, visualize data

In [3]:
import pandas as pd

# Load structured data
outcomes_file = PROJECT_ROOT / "data/data_1/predicting-mortality-of-icu-patients-the-physionet-computing-in-cardiology-challenge-2012-1.0.0/Outcomes-a.txt"
outcomes_df = pd.read_csv(outcomes_file)
print(outcomes_df.head())
print(outcomes_df.info())

   RecordID  SAPS-I  SOFA  Length_of_stay  Survival  In-hospital_death
0    132539       6     1               5        -1                  0
1    132540      16     8               8        -1                  0
2    132541      21    11              19        -1                  0
3    132543       7     1               9       575                  0
4    132545      17     2               4       918                  0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   RecordID           4000 non-null   int64
 1   SAPS-I             4000 non-null   int64
 2   SOFA               4000 non-null   int64
 3   Length_of_stay     4000 non-null   int64
 4   Survival           4000 non-null   int64
 5   In-hospital_death  4000 non-null   int64
dtypes: int64(6)
memory usage: 187.6 KB
None


In [20]:
import os

# Define folder path
set_a_folder = PROJECT_ROOT / "data/data_1/predicting-mortality-of-icu-patients-the-physionet-computing-in-cardiology-challenge-2012-1.0.0/set-a"

# Create an empty list to store patient data
patient_data_list = []

# Loop through all patient files
for filename in os.listdir(set_a_folder):
    if filename.endswith(".txt"):  # Ensure processing only text files
        file_path = os.path.join(set_a_folder, filename)
        
        # Read patient file
        patient_df = pd.read_csv(file_path)
        
        # Extract RecordID
        record_id = patient_df.loc[patient_df['Parameter'] == 'RecordID', 'Value'].values[0]
        
        # Pivot with aggregation to avoid duplicates
        patient_df = patient_df.pivot_table(index='Time', columns='Parameter', values='Value', aggfunc='first')
        
        # Reset index
        patient_df.reset_index(inplace=True)

        # Ensure 'RecordID' is not already in the DataFrame
        if 'RecordID' in patient_df.columns:
            patient_df.drop(columns=['RecordID'], inplace=True)
        
        # Add RecordID as the first column
        patient_df.insert(0, "RecordID", record_id)
        
        # Store in list
        patient_data_list.append(patient_df)

# Combine all patient data into a single DataFrame
patients_df = pd.concat(patient_data_list, ignore_index=True)

# Merge patient data with outcome data
merged_df = patients_df.merge(outcomes_df, on="RecordID", how="left")
print(f"Num of Records: {patients_df.shape[0]}")
print(f"Num of Patients: {len(patients_df["RecordID"].unique())}")
patients_df.head(10)

Num of Records: 299264
Num of Patients: 4000


Parameter,RecordID,Time,Age,BUN,Creatinine,GCS,Gender,Glucose,HCO3,HCT,...,PaCO2,PaO2,pH,DiasABP,MAP,SaO2,SysABP,Lactate,Cholesterol,TroponinI
0,132592.0,00:00,35.0,,,,0.0,,,,...,,,,,,,,,,
1,132592.0,01:20,,,,15.0,,,,,...,,,,,,,,,,
2,132592.0,02:20,,,,,,,,,...,,,,,,,,,,
3,132592.0,02:36,,68.0,2.3,,,603.0,11.0,25.5,...,,,,,,,,,,
4,132592.0,03:20,,,,,,,,,...,,,,,,,,,,
5,132592.0,04:20,,,,,,,,,...,,,,,,,,,,
6,132592.0,05:20,,61.0,2.0,,,362.0,15.0,23.7,...,,,,,,,,,,
7,132592.0,06:20,,,,15.0,,,,,...,,,,,,,,,,
8,132592.0,07:20,,59.0,1.9,,,254.0,15.0,23.2,...,,,,,,,,,,
9,132592.0,08:20,,,,,,,,,...,,,,,,,,,,


# We can count the number of non-NaN rows for the dataframe

In [7]:
merged_df.head(10)

Unnamed: 0,RecordID,Time,Age,BUN,Creatinine,GCS,Gender,Glucose,HCO3,HCT,...,SaO2,SysABP,Lactate,Cholesterol,TroponinI,SAPS-I,SOFA,Length_of_stay,Survival,In-hospital_death
0,132592.0,00:00,35.0,,,,0.0,,,,...,,,,,,15,3,3,-1,0
1,132592.0,01:20,,,,15.0,,,,,...,,,,,,15,3,3,-1,0
2,132592.0,02:20,,,,,,,,,...,,,,,,15,3,3,-1,0
3,132592.0,02:36,,68.0,2.3,,,603.0,11.0,25.5,...,,,,,,15,3,3,-1,0
4,132592.0,03:20,,,,,,,,,...,,,,,,15,3,3,-1,0
5,132592.0,04:20,,,,,,,,,...,,,,,,15,3,3,-1,0
6,132592.0,05:20,,61.0,2.0,,,362.0,15.0,23.7,...,,,,,,15,3,3,-1,0
7,132592.0,06:20,,,,15.0,,,,,...,,,,,,15,3,3,-1,0
8,132592.0,07:20,,59.0,1.9,,,254.0,15.0,23.2,...,,,,,,15,3,3,-1,0
9,132592.0,08:20,,,,,,,,,...,,,,,,15,3,3,-1,0


In [11]:
patients_df["non_nan_count"] = patients_df.notna().sum(axis=1)
print(patients_df["non_nan_count"].describe())

# Check the first few rows
print(patients_df[["Time", "RecordID", "non_nan_count"]].head())

count    299264.000000
mean          8.848468
std           2.580872
min           4.000000
25%           8.000000
50%           9.000000
75%          10.000000
max          31.000000
Name: non_nan_count, dtype: float64
Parameter   Time  RecordID  non_nan_count
0          00:00  132592.0              8
1          01:20  132592.0             11
2          02:20  132592.0             10
3          02:36  132592.0             14
4          03:20  132592.0             10


### There are on average almost 9 non NaN values per row. Use forward fill to fill the missing values.

In [43]:
# Sort the DataFrame by RecordID and Time
'''print("Shape before sorting:", patients_df.shape)
number_rows_record = patients_df["RecordID"].value_counts()
print(number_rows_record.describe()) # Here it's count = 4000!!!'''

t_patients_df = patients_df.sort_values(by=["RecordID", "Time"])

# Identify columns that should be forward filled (i.e. all except RecordID)
cols_to_ffill = [col for col in t_patients_df.columns if col != "RecordID" and col != "Parameter" and col != "Time"]

# Apply forward fill to those columns within each RecordID group using transform
t_patients_df[cols_to_ffill] = t_patients_df.groupby("RecordID")[cols_to_ffill].transform("ffill")
#t_patients_df["RecordID"] = t_patients_df["RecordID"].astype(float)
'''print("Shape after sorting:", t_patients_df.shape)

# Check number of rows for RecordID
number_rows_record = t_patients_df["RecordID"].value_counts()
print(number_rows_record.describe())'''

# Now reset Parameter column
#t_patients_df["Parameter"] = t_patients_df["Parameter"] - t_patients_df["Parameter"].min()
#print(t_patients_df.columns)
t_patients_df.reset_index(drop=True, inplace=True)

# Finally copy data
patients_df = t_patients_df.copy()
t_patients_df.head(10)

Parameter,RecordID,Time,Age,BUN,Creatinine,GCS,Gender,Glucose,HCO3,HCT,...,PaCO2,PaO2,pH,DiasABP,MAP,SaO2,SysABP,Lactate,Cholesterol,TroponinI
0,132539.0,00:00,54.0,,,,0.0,,,,...,,,,,,,,,,
1,132539.0,00:07,54.0,,,15.0,0.0,,,,...,,,,,,,,,,
2,132539.0,00:37,54.0,,,15.0,0.0,,,,...,,,,,,,,,,
3,132539.0,01:37,54.0,,,15.0,0.0,,,,...,,,,,,,,,,
4,132539.0,02:37,54.0,,,15.0,0.0,,,,...,,,,,,,,,,
5,132539.0,03:08,54.0,,,15.0,0.0,,,33.7,...,,,,,,,,,,
6,132539.0,03:37,54.0,,,15.0,0.0,,,33.7,...,,,,,,,,,,
7,132539.0,04:37,54.0,,,15.0,0.0,,,33.7,...,,,,,,,,,,
8,132539.0,05:37,54.0,,,15.0,0.0,,,33.7,...,,,,,,,,,,
9,132539.0,07:37,54.0,,,15.0,0.0,,,33.7,...,,,,,,,,,,


# Insert a base standard date for each measurement
## And then correct the hours exceeding 24-hour format

In [44]:
from datetime import datetime, timedelta

base_date = "2025-03-10" # Format is YYYY-MM-DD
# Function to fix invalid times
def adjust_time(time_str, base_date):
    # Split hours and minutes
    hours, minutes = map(int, time_str.split(":"))
    
    # Calculate valid hour & days overflow
    day_offset = hours // 24  # Number of days to add
    new_hour = hours % 24  # Wrapped hour (0-23)
    
    # Create the corrected datetime
    corrected_datetime = datetime.strptime(base_date, "%Y-%m-%d") + timedelta(days=day_offset, hours=new_hour, minutes=minutes)
    
    return corrected_datetime

# Apply the function to the 'Time' column
patients_df['Time'] = patients_df['Time'].apply(lambda x: adjust_time(x, base_date))
patients_df.head(10)



Parameter,RecordID,Time,Age,BUN,Creatinine,GCS,Gender,Glucose,HCO3,HCT,...,PaCO2,PaO2,pH,DiasABP,MAP,SaO2,SysABP,Lactate,Cholesterol,TroponinI
0,132539.0,2025-03-10 00:00:00,54.0,,,,0.0,,,,...,,,,,,,,,,
1,132539.0,2025-03-10 00:07:00,54.0,,,15.0,0.0,,,,...,,,,,,,,,,
2,132539.0,2025-03-10 00:37:00,54.0,,,15.0,0.0,,,,...,,,,,,,,,,
3,132539.0,2025-03-10 01:37:00,54.0,,,15.0,0.0,,,,...,,,,,,,,,,
4,132539.0,2025-03-10 02:37:00,54.0,,,15.0,0.0,,,,...,,,,,,,,,,
5,132539.0,2025-03-10 03:08:00,54.0,,,15.0,0.0,,,33.7,...,,,,,,,,,,
6,132539.0,2025-03-10 03:37:00,54.0,,,15.0,0.0,,,33.7,...,,,,,,,,,,
7,132539.0,2025-03-10 04:37:00,54.0,,,15.0,0.0,,,33.7,...,,,,,,,,,,
8,132539.0,2025-03-10 05:37:00,54.0,,,15.0,0.0,,,33.7,...,,,,,,,,,,
9,132539.0,2025-03-10 07:37:00,54.0,,,15.0,0.0,,,33.7,...,,,,,,,,,,


# Now let's save the updated DataFrame in HDF5 format (very nice for Time Series)

In [45]:
OUTPUT_FOLDER = PROJECT_ROOT / "data/processed"

# Save
patients_df.to_hdf(OUTPUT_FOLDER / "patients.h5", key="patients", mode="w")