In [24]:
import pandas as pd


In [25]:
from pathlib import Path

def find_project_root(marker=".gitignore", start=None):
    """
    Walk up the directory tree until a directory containing ``marker`` is found.

    Parameters
    ----------
    marker : str
        Name of a file or directory (e.g. ``.gitignore``) whose presence
        identifies the project root.
    start : str or os.PathLike, optional
        Directory to begin the search from. When omitted, the current working
        directory is used.

    Returns
    -------
    pathlib.Path
        Resolved path of the nearest directory (the starting directory itself
        or one of its ancestors) that contains ``marker``.

    Raises
    ------
    FileNotFoundError
        If neither the starting directory nor any of its ancestors contains
        ``marker``.
    """
    # A relative ``start`` must be made absolute first; otherwise ``.parents``
    # would stop at "." instead of walking all the way up to the filesystem root.
    current = Path.cwd() if start is None else Path(start).resolve()
    for candidate in (current, *current.parents):
        if (candidate / marker).exists():
            return candidate.resolve()
    raise FileNotFoundError(f"Project root marker '{marker}' not found starting from {current}")
  

In [26]:
# Locate the project root by walking up from the current working directory
# until a directory containing the default marker (".gitignore") is found.
root = find_project_root()
# Path (as a plain string) of the parquet file this notebook reads as input;
# per its file name this is the variant that still contains NaN values.
INPUT_DATASET = f"{root}/dataset/V2_preprocessed.parquet_with_nans.parquet"

In [27]:
# Load the input dataset and report how many missing values each column holds.
df = pd.read_parquet(INPUT_DATASET)
nan_counts_per_column = df.isna().sum()
nan_counts_per_column

HR                      0
O2Sat                   0
Temp                    0
SBP                     0
MAP                     0
                    ...  
Resp_min_6h             0
Resp_mean_6h            0
Resp_median_6h          0
Resp_std_6h         40336
Resp_diff_std_6h    80672
Length: 97, dtype: int64

In [28]:
# Count NaNs per column and keep only the columns that actually contain any.
# The index of the resulting Series is the list of affected column names, so a
# separate `df.columns[df.isna().any()]` lookup is unnecessary: as a non-final
# expression in the cell its value was never displayed anyway, and it cost an
# extra full pass over the frame.
nan_counts = df.isna().sum()
nan_counts.loc[lambda counts: counts > 0]

HospAdmTime              8
HR_std_6h            40336
HR_diff_std_6h       80672
O2Sat_std_6h         40336
O2Sat_diff_std_6h    80672
SBP_std_6h           40336
SBP_diff_std_6h      80672
MAP_std_6h           40336
MAP_diff_std_6h      80672
Resp_std_6h          40336
Resp_diff_std_6h     80672
dtype: int64

In [29]:
# Fill missing values: propagate the last valid observation forward first, then
# back-fill whatever is still missing (i.e. NaNs at the very top of a column).
# NOTE(review): both fills run over the whole frame in row order. If the frame
# stacks several independent series (e.g. one per patient/record — not
# verifiable from this cell), values are carried across their boundaries;
# confirm whether a grouped fill is intended.
df = df.ffill().bfill()
# Verify that no NaNs remain in any column.
df.isna().sum()






HR                  0
O2Sat               0
Temp                0
SBP                 0
MAP                 0
                   ..
Resp_min_6h         0
Resp_mean_6h        0
Resp_median_6h      0
Resp_std_6h         0
Resp_diff_std_6h    0
Length: 97, dtype: int64

In [33]:
# Preview the first five rows of the filled dataset.
df.head(n=5)

Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,HCO3,...,MAP_mean_6h,MAP_median_6h,MAP_std_6h,MAP_diff_std_6h,Resp_max_6h,Resp_min_6h,Resp_mean_6h,Resp_median_6h,Resp_std_6h,Resp_diff_std_6h
0,102.108491,91.419811,36.919203,128.165094,88.199717,67.007325,24.712264,29.6875,0.091837,22.811236,...,88.199717,88.199717,9.100264,16.645094,24.712264,24.712264,24.712264,24.712264,4.039181,6.160501
1,97.0,95.0,36.919203,98.0,75.33,67.007325,19.0,29.6875,0.091837,22.811236,...,81.764858,81.764858,9.100264,16.645094,24.712264,19.0,21.856132,21.856132,4.039181,6.160501
2,89.0,99.0,36.919203,122.0,86.0,67.007325,22.0,29.6875,0.091837,22.811236,...,83.176572,86.0,6.883764,16.645094,24.712264,19.0,21.904088,22.0,2.85734,6.160501
3,90.0,95.0,36.919203,122.0,88.665,67.007325,30.0,29.6875,24.0,22.811236,...,84.548679,87.099858,6.25472,11.968888,30.0,19.0,23.928066,23.356132,4.672138,6.939377
4,103.0,88.5,36.919203,122.0,91.33,67.007325,24.5,29.6875,0.091837,22.811236,...,85.904943,88.199717,6.20793,9.852805,30.0,19.0,24.042453,24.5,4.054267,6.729752


In [32]:
# Persist the NaN-free dataset next to the input file.
OUTPUT_DATASET = f"{root}/dataset/V2_preprocessed.parquet"
df.to_parquet(OUTPUT_DATASET)