In [3]:
import pandas as pd
import numpy as np
from project_1.config import PROCESSED_DATA_DIR, PROJ_ROOT

In [13]:
# Read every processed split from its Parquet file, keyed as "set_<name>".
sets = ["a", "b", "c"]
sets_dict = {
    f"set_{set_name}": pd.read_parquet(PROCESSED_DATA_DIR / f"set_{set_name}.parquet")
    for set_name in sets
}

# Sanity-check the load: print the shape of set A and preview its first rows.
print(sets_dict["set_a"].shape)
sets_dict["set_a"].head(10)


(183416, 43)


Unnamed: 0,RecordID,Time,Age,BUN,Creatinine,GCS,Gender,Glucose,HCO3,HCT,...,PaCO2,PaO2,pH,DiasABP,MAP,SaO2,SysABP,Lactate,Cholesterol,TroponinI
0,132539.0,2025-03-10 00:00:00,54.0,,,,0.0,,,,...,,,,,,,,,,
1,132539.0,2025-03-10 01:00:00,,,,15.0,,,,,...,,,,,,,,,,
2,132539.0,2025-03-10 02:00:00,,,,,,,,,...,,,,,,,,,,
3,132539.0,2025-03-10 03:00:00,,,,,,,,,...,,,,,,,,,,
4,132539.0,2025-03-10 04:00:00,,,,15.0,,,,33.7,...,,,,,,,,,,
5,132539.0,2025-03-10 05:00:00,,,,,,,,,...,,,,,,,,,,
6,132539.0,2025-03-10 06:00:00,,,,,,,,,...,,,,,,,,,,
7,132539.0,2025-03-10 08:00:00,,,,15.0,,,,,...,,,,,,,,,,
8,132539.0,2025-03-10 09:00:00,,,,,,,,,...,,,,,,,,,,
9,132539.0,2025-03-10 10:00:00,,,,,,,,,...,,,,,,,,,,


In [16]:
df = sets_dict["set_a"]
# Build one row per patient holding the static features.
# GroupBy.first() returns the first non-null value of each column within a patient
# (not literally the patient's first row), so a static value that only appears at a
# later timestamp is still picked up. The -1 placeholder is a regular value, not NaN,
# and is therefore carried through unchanged.
# Selecting the static columns before aggregating avoids computing first() for all
# the time-series columns that would be thrown away immediately afterwards.
static_columns = ["Age", "Weight", "Height", "Gender"]
static_df = df.groupby("RecordID", as_index=False)[static_columns].first()
static_df


Unnamed: 0,RecordID,Age,Weight,Height,Gender
0,132539.0,54.0,-1.0,-1.0,0.0
1,132540.0,76.0,76.0,175.3,1.0
2,132541.0,44.0,56.7,-1.0,0.0
3,132543.0,68.0,84.6,180.3,1.0
4,132545.0,88.0,-1.0,-1.0,0.0
...,...,...,...,...,...
3995,142665.0,70.0,87.0,-1.0,0.0
3996,142667.0,25.0,166.4,-1.0,1.0
3997,142670.0,44.0,109.0,-1.0,1.0
3998,142671.0,37.0,87.4,-1.0,1.0


In [22]:
# List the patients whose Height is -1 (the placeholder that the imputation step treats as missing)
static_df[static_df["Height"] == -1]

Unnamed: 0,RecordID,Age,Weight,Height,Gender
0,132539.0,54.0,-1.0,-1.0,0.0
2,132541.0,44.0,56.7,-1.0,0.0
4,132545.0,88.0,-1.0,-1.0,0.0
8,132554.0,64.0,60.7,-1.0,0.0
10,132556.0,64.0,65.0,-1.0,0.0
...,...,...,...,...,...
3994,142664.0,51.0,75.0,-1.0,0.0
3995,142665.0,70.0,87.0,-1.0,0.0
3996,142667.0,25.0,166.4,-1.0,1.0
3997,142670.0,44.0,109.0,-1.0,1.0


In [24]:

from sklearn.impute import KNNImputer
def knn_impute_static_features(df, static_features=["Age", "Weight", "Height", "Gender"], n_neighbors=10):
    """
    Impute missing static values (encoded as -1, or already NaN) using KNN imputation.

    Only the columns listed in ``static_features`` are touched; every other column is
    returned unchanged. Columns that contain no observed value at all cannot be
    imputed and are returned as NaN instead of raising.

    Parameters:
      df (pd.DataFrame): DataFrame with one row per patient.
      static_features (list): List of static feature column names to impute.
      n_neighbors (int): Number of neighbors to use for KNN imputation.

    Returns:
      pd.DataFrame: A copy of ``df`` with missing static feature values imputed.
        The input DataFrame is not modified.

    Raises:
      KeyError: If a name in ``static_features`` is not a column of ``df``.

    Notes:
      KNNImputer fills a gap with the average of the neighbors' values, so a
      categorical column such as Gender can receive a fractional value if it has
      missing entries. Distances are computed on the raw, unscaled columns.
    """
    # Copy the column list so the shared default list object is never handed around.
    features = list(static_features)

    # Work on a copy to avoid modifying the caller's DataFrame.
    df_impute = df.copy()

    # Replace the -1 placeholder with NaN so the imputer recognises it as missing.
    df_impute[features] = df_impute[features].replace(-1, np.nan)

    # By default KNNImputer drops columns without a single observed value, which would
    # make the imputed array narrower than the column list used to rebuild the frame.
    # Restrict imputation to columns with at least one observed value.
    missing_mask = df_impute[features].isna()
    imputable = [col for col in features if not missing_mask[col].all()]

    # Nothing to impute: empty frame, only all-missing columns, or no gaps at all.
    if not imputable or not missing_mask[imputable].to_numpy().any():
        return df_impute

    # Fit the KNN imputer on the imputable static columns and fill their gaps.
    imputer = KNNImputer(n_neighbors=n_neighbors)
    imputed_array = imputer.fit_transform(df_impute[imputable])

    # Re-attach column names and the original index so the values align on update.
    df_imputed_static = pd.DataFrame(imputed_array, columns=imputable, index=df_impute.index)

    # Write the imputed values back into the copy.
    df_impute.update(df_imputed_static)

    return df_impute

# Impute the -1 placeholders in the per-patient static table using the function's
# defaults (Age, Weight, Height, Gender; 10 neighbors), then display the result.
final_df = knn_impute_static_features(static_df)
final_df

Unnamed: 0,RecordID,Age,Weight,Height,Gender
0,132539.0,54.0,91.00,160.79,0.0
1,132540.0,76.0,76.00,175.30,1.0
2,132541.0,44.0,56.70,189.74,0.0
3,132543.0,68.0,84.60,180.30,1.0
4,132545.0,88.0,64.34,158.76,0.0
...,...,...,...,...,...
3995,142665.0,70.0,87.00,174.12,0.0
3996,142667.0,25.0,166.40,176.51,1.0
3997,142670.0,44.0,109.00,175.00,1.0
3998,142671.0,37.0,87.40,172.08,1.0
