# Q1.3 - Data Preprocessing

### First we need to load the data

In [2]:
# Load data from parquet file
import pandas as pd
import numpy as np
from project_1.config import PROJ_ROOT, PROCESSED_DATA_DIR

# Suffixes of the dataset splits; each split is stored as set_<suffix>.parquet
# inside PROCESSED_DATA_DIR.
sets = ["a", "b", "c"]

# Maps "set_<suffix>" -> DataFrame read from the matching parquet file.
sets_dict = {}

for set_name in sets:
    parquet_path = PROCESSED_DATA_DIR / f"set_{set_name}.parquet"
    sets_dict[f"set_{set_name}"] = pd.read_parquet(parquet_path)

# Sanity check: print the dimensions of Set A, then preview its first rows
# (the trailing expression is what the notebook renders as the cell output).
print(sets_dict["set_a"].shape)
sets_dict["set_a"].head(10)


(183416, 43)


Unnamed: 0,RecordID,Time,Age,BUN,Creatinine,GCS,Gender,Glucose,HCO3,HCT,...,PaCO2,PaO2,pH,DiasABP,MAP,SaO2,SysABP,Lactate,Cholesterol,TroponinI
0,132539.0,2025-03-10 00:00:00,54.0,,,,0.0,,,,...,,,,,,,,,,
1,132539.0,2025-03-10 01:00:00,,,,15.0,,,,,...,,,,,,,,,,
2,132539.0,2025-03-10 02:00:00,,,,,,,,,...,,,,,,,,,,
3,132539.0,2025-03-10 03:00:00,,,,,,,,,...,,,,,,,,,,
4,132539.0,2025-03-10 04:00:00,,,,15.0,,,,33.7,...,,,,,,,,,,
5,132539.0,2025-03-10 05:00:00,,,,,,,,,...,,,,,,,,,,
6,132539.0,2025-03-10 06:00:00,,,,,,,,,...,,,,,,,,,,
7,132539.0,2025-03-10 08:00:00,,,,15.0,,,,,...,,,,,,,,,,
8,132539.0,2025-03-10 09:00:00,,,,,,,,,...,,,,,,,,,,
9,132539.0,2025-03-10 10:00:00,,,,,,,,,...,,,,,,,,,,


## Step 1 - Apply forward-fill imputation (Set A only for now)

In [5]:
# NOTE: `df` is an alias of the frame stored in sets_dict["set_a"], not a copy,
# so the in-place sort and the column assignment below also change that entry.
df = sets_dict["set_a"]

# Forward filling depends on row order, so sort by RecordID first and by Time
# within each RecordID before imputing.
df.sort_values(by=["RecordID", "Time"], inplace=True)

# Columns to impute: everything except the two key columns.
other_cols = [col for col in df.columns if col not in ("RecordID", "Time")]

# Propagate the last observed value forward within each RecordID group only,
# so a value is never carried over from one RecordID into the next.
# Cells that precede the first observation of a column in a group stay NaN.
grouped_by_record = df.groupby("RecordID")
df[other_cols] = grouped_by_record[other_cols].ffill()

# Preview the first 10 rows to check the result
df.head(10)

Unnamed: 0,RecordID,Time,Age,BUN,Creatinine,GCS,Gender,Glucose,HCO3,HCT,...,PaCO2,PaO2,pH,DiasABP,MAP,SaO2,SysABP,Lactate,Cholesterol,TroponinI
0,132539.0,2025-03-10 00:00:00,54.0,,,,0.0,,,,...,,,,,,,,,,
1,132539.0,2025-03-10 01:00:00,54.0,,,15.0,0.0,,,,...,,,,,,,,,,
2,132539.0,2025-03-10 02:00:00,54.0,,,15.0,0.0,,,,...,,,,,,,,,,
3,132539.0,2025-03-10 03:00:00,54.0,,,15.0,0.0,,,,...,,,,,,,,,,
4,132539.0,2025-03-10 04:00:00,54.0,,,15.0,0.0,,,33.7,...,,,,,,,,,,
5,132539.0,2025-03-10 05:00:00,54.0,,,15.0,0.0,,,33.7,...,,,,,,,,,,
6,132539.0,2025-03-10 06:00:00,54.0,,,15.0,0.0,,,33.7,...,,,,,,,,,,
7,132539.0,2025-03-10 08:00:00,54.0,,,15.0,0.0,,,33.7,...,,,,,,,,,,
8,132539.0,2025-03-10 09:00:00,54.0,,,15.0,0.0,,,33.7,...,,,,,,,,,,
9,132539.0,2025-03-10 10:00:00,54.0,,,15.0,0.0,,,33.7,...,,,,,,,,,,


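### Success: the values are being carried forward within each record

Note that values before a variable's first measurement remain missing (NaN), since forward filling has nothing to propagate there. Keep in mind that this imputation strategy may be changed in the future.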

## Step 2 - Scale the data