In [24]:
import pandas as pd


In [25]:
from pathlib import Path

def find_project_root(marker=".gitignore", start=None):
    """
    Locate the project root by walking up the directory tree.

    Starting at ``start`` (the current working directory when omitted), the
    directory itself and then each of its ancestors is checked for an entry
    named ``marker``; the first directory that contains it is returned.

    Parameters
    ----------
    marker : str
        Name of the file or directory that identifies the project root
        (e.g., ``.gitignore``).
    start : str or os.PathLike, optional
        Directory to begin the search from. Defaults to ``Path.cwd()``.

    Returns
    -------
    pathlib.Path
        Resolved path of the nearest directory containing ``marker``.

    Raises
    ------
    FileNotFoundError
        If neither the starting directory nor any ancestor contains ``marker``.
    """
    # An explicit start is resolved first so that a relative path still exposes
    # its complete ancestor chain through `.parents`.
    current = Path.cwd() if start is None else Path(start).resolve()
    for parent in [current, *current.parents]:
        if (parent / marker).exists():
            return parent.resolve()
    raise FileNotFoundError(f"Project root marker '{marker}' not found starting from {current}")
  

In [26]:
# Locate the project root (with the default marker: the nearest directory at or
# above the current working directory that contains a .gitignore), so the
# dataset path below is built relative to the project rather than hard-coded.
root = find_project_root()
# Location of the input parquet file under <root>/dataset, kept as a plain string.
INPUT_DATASET = f"{root}/dataset/V2_preprocessed.parquet_with_nans.parquet"

In [27]:
# Read the parquet dataset into memory, then tally the missing values of
# every column; the tally is the cell's displayed result.
df = pd.read_parquet(INPUT_DATASET)
missing_per_column = df.isna().sum()
missing_per_column

HR                      0
O2Sat                   0
Temp                    0
SBP                     0
MAP                     0
                    ...  
Resp_min_6h             0
Resp_mean_6h            0
Resp_median_6h          0
Resp_std_6h         40336
Resp_diff_std_6h    80672
Length: 97, dtype: int64

In [28]:
# Count the missing values per column and keep only the columns that have any.
# The index of the resulting Series already lists the affected column names, so
# a separate column-name lookup is not needed: a bare expression that is not the
# last statement of a cell is evaluated but (by default) never displayed.
nan_counts = df.isna().sum()
nan_counts[nan_counts > 0]

HospAdmTime              8
HR_std_6h            40336
HR_diff_std_6h       80672
O2Sat_std_6h         40336
O2Sat_diff_std_6h    80672
SBP_std_6h           40336
SBP_diff_std_6h      80672
MAP_std_6h           40336
MAP_diff_std_6h      80672
Resp_std_6h          40336
Resp_diff_std_6h     80672
dtype: int64

In [29]:
# Impute the gaps: first carry the last valid value of each column forward,
# then pull the next valid value backward for anything still empty.
# NOTE(review): both fills run over the whole frame in row order; if the rows
# of several independent records are stacked in this frame, values would carry
# across their boundaries -- confirm that this is intended.
df = df.ffill().bfill()
# Recount the missing values per column to verify that the fills worked.
df.isna().sum()





HR                  0
O2Sat               0
Temp                0
SBP                 0
MAP                 0
                   ..
Resp_min_6h         0
Resp_mean_6h        0
Resp_median_6h      0
Resp_std_6h         0
Resp_diff_std_6h    0
Length: 97, dtype: int64