In [1]:
import os
import pandas as pd

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
def round_timestamp(ts_str):
    """Round an "HH:MM" timestamp string up to the next full hour.

    The string is split on ":" and only the first two fields are used.
    When the minute field is exactly "00" the hour field is returned
    unchanged (no re-padding); otherwise the hour is incremented by one
    and left-padded with zeros to at least two characters.

    Parameters
    ----------
    ts_str : str
        Timestamp of the form "HH:MM".

    Returns
    -------
    str
        Timestamp of the form "<hour>:00".
    """
    fields = ts_str.split(":")
    hour_field = fields[0]
    minute_field = fields[1]

    # Already on the hour: pass the hour field through untouched.
    if minute_field == "00":
        return hour_field + ":00"

    # Any other minute value bumps the timestamp to the following hour.
    next_hour = int(hour_field) + 1
    return str(next_hour).zfill(2) + ":00"

In [3]:
path = "../../data"

# Columns every per-patient frame must expose; any that a record never
# reports are added as all-missing columns so all frames line up.
expected_columns = [
    'Age', 'Gender', 'Height', 'Weight', 'ICUType', 'Albumin', 'ALP', 'ALT', 'AST',
    'Bilirubin', 'BUN', 'Cholesterol', 'Creatinine', 'FiO2', 'DiasABP',
    'GCS', 'Glucose', 'HCO3', 'HCT', 'HR', 'K', 'Lactate', 'Mg', 'MAP',
    'MechVent', 'Na', 'NIDiasABP', 'NIMAP', 'NISysABP', 'PaCO2', 'PaO2',
    'pH', 'Platelets', 'RespRate', 'SaO2', 'SysABP', 'Temp', 'TroponinI',
    'TroponinT', 'Urine', 'WBC', 'RecordID'
]

# Parameters read from a record's 00:00 rows and broadcast to every hourly row.
# 'Weight' is stored as 'Weight(static)' so it does not overwrite the
# time-varying 'Weight' column.
static_params = ['Age','Gender','Height', 'Weight', 'RecordID']

# Hourly grid from 00:00 to 48:00 (inclusive); loop-invariant, so built once.
full_time_range = [f"{str(h).zfill(2)}:00" for h in range(49)]

# Each sub-directory of `path` is one patient set; it is written to
# "<directory>.parquet" next to the directory itself.
for patient_set in os.listdir(path):
    directory = os.path.join(path, patient_set)
    if not os.path.isdir(directory):
        continue

    # Collect per-patient frames and concatenate once at the end; calling
    # pd.concat inside the loop would re-copy the accumulated data per file.
    patient_frames = []
    set_df = None

    with os.scandir(directory) as entries:
        for file in entries:
            if not file.is_file():
                continue

            df = pd.read_csv(file.path, delimiter=",")

            # Negative values mark missing measurements. Blank them BEFORE the
            # hourly aggregation so a sentinel is never averaged together with
            # a real reading that falls into the same hourly bin.
            df['Value'] = df['Value'].mask(df['Value'] < 0)

            # Static variables: first 00:00 row of each static parameter
            # (taken before the timestamps are rounded).
            static_vars = df[
                (df['Time'] == '00:00') &
                (df['Parameter'].isin(static_params))
            ].drop_duplicates(subset=['Parameter'], keep='first').set_index('Parameter')['Value']

            # Fail with the file name instead of an anonymous KeyError.
            missing_static = [p for p in static_params if p not in static_vars.index]
            if missing_static:
                raise KeyError(
                    f"{file.path}: no 00:00 row for static parameter(s) {missing_static}"
                )

            # Round timestamps up to the full hour, then build one row per hour
            # and one column per parameter (pivot_table averages each bin).
            df['Time'] = df['Time'].apply(round_timestamp)
            df = df.pivot_table(index="Time", columns="Parameter", values="Value")

            # Add expected columns this record never reported. NaN (float)
            # keeps the column numeric, unlike None which yields object dtype.
            for col in expected_columns:
                if col not in df.columns:
                    df[col] = float("nan")

            # Expand to the full hourly grid; hours without data become NaN rows.
            df = df.reindex(full_time_range).reset_index()

            # Broadcast the static variables to every row.
            for param in static_params:
                target_column = "Weight(static)" if param == "Weight" else param
                df[target_column] = static_vars[param]

            # Convert ID to int
            df['RecordID'] = df['RecordID'].astype(int)

            patient_frames.append(df)

    if patient_frames:
        set_df = pd.concat(patient_frames, ignore_index=True)
        set_df.to_parquet(f"{directory}.parquet", index=False)
    else:
        # Nothing to write for a directory without patient files.
        print(f"Skipping {directory}: no patient files found")