## Exploring the pre-processed dataset

In [1]:
import os
import sys
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

In [2]:
def find_project_root(marker=".gitignore"):
    """
    Locate the project root by searching upward from the working directory.

    The current working directory is checked first, then each of its
    ancestors in turn; the first directory that contains ``marker`` wins.

    Args:
        marker: Name of a file or directory that identifies the project root
            (defaults to ``.gitignore``).

    Returns:
        The resolved ``Path`` of the first directory containing ``marker``.

    Raises:
        FileNotFoundError: If neither the working directory nor any ancestor
            contains ``marker``.
    """
    current = Path.cwd()
    # Nearest directory first, filesystem root last.
    candidates = (current, *current.parents)
    found = next((folder for folder in candidates if (folder / marker).exists()), None)
    if found is None:
        raise FileNotFoundError(f"Project root marker '{marker}' not found starting from {current}")
    return found.resolve()
  
# Resolve the repository root once; later cells build dataset paths from `root`.
root = find_project_root()

# sys.path must hold plain strings. A Path object never compares equal to the
# existing string entries, so the old membership guard always passed and the
# cell appended another entry on every run; non-string entries are also
# ignored by the import system.
root_str = str(root)
if root_str not in sys.path:
    sys.path.append(root_str)

# Make the modules inside feature_engineering/ importable by bare name.
utils_path = os.path.join(root_str, "feature_engineering")
if utils_path not in sys.path:
    sys.path.append(utils_path)


## Dataset before pre-processing

In [3]:
# Load the imputed dataset, i.e. the data before the pre-processing step.
imputed_path = f"{root}/dataset/Fully_imputed_dataset.parquet"
imputed_df = pd.read_parquet(imputed_path)

In [4]:
# Keep the imputed dataset's column names; a later cell compares them with the
# columns of the pre-processed dataset.
imputed_columns_index = imputed_df.columns
column_names = imputed_columns_index.to_list()
print(f"total columns: {len(column_names)}")
print(column_names)

total columns: 44
['HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 'EtCO2', 'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2', 'AST', 'BUN', 'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine', 'Bilirubin_direct', 'Glucose', 'Lactate', 'Magnesium', 'Phosphate', 'Potassium', 'Bilirubin_total', 'TroponinI', 'Hct', 'Hgb', 'PTT', 'WBC', 'Fibrinogen', 'Platelets', 'Age', 'Gender', 'Unit1', 'Unit2', 'HospAdmTime', 'ICULOS', 'SepsisLabel', 'patient_id', 'dataset', 'cluster_id']


In [5]:
# Count missing values per column of the imputed dataset. In the saved output
# only HospAdmTime still reports NaNs (8 rows).
imputed_df.isna().sum()

HR                  0
O2Sat               0
Temp                0
SBP                 0
MAP                 0
DBP                 0
Resp                0
EtCO2               0
BaseExcess          0
HCO3                0
FiO2                0
pH                  0
PaCO2               0
SaO2                0
AST                 0
BUN                 0
Alkalinephos        0
Calcium             0
Chloride            0
Creatinine          0
Bilirubin_direct    0
Glucose             0
Lactate             0
Magnesium           0
Phosphate           0
Potassium           0
Bilirubin_total     0
TroponinI           0
Hct                 0
Hgb                 0
PTT                 0
WBC                 0
Fibrinogen          0
Platelets           0
Age                 0
Gender              0
Unit1               0
Unit2               0
HospAdmTime         8
ICULOS              0
SepsisLabel         0
patient_id          0
dataset             0
cluster_id          0
dtype: int64

## Pre-processed dataset

In [8]:

# Load the pre-processed (V2) dataset and preview its first five rows.
preprocessed_path = f"{root}/dataset/V2_preprocessed.parquet"
df = pd.read_parquet(preprocessed_path)
df.head()

Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,HCO3,...,HR_missing_count_global,HR_missing_interval_mean_global,O2Sat_missing_count_global,O2Sat_missing_interval_mean_global,SBP_missing_count_global,SBP_missing_interval_mean_global,MAP_missing_count_global,MAP_missing_interval_mean_global,Resp_missing_count_global,Resp_missing_interval_mean_global
0,102.108491,91.419811,36.919203,128.165094,88.199717,67.007325,24.712264,29.6875,0.091837,22.811236,...,5,1.0,10,1.111111,12,1.0,12,1.0,4,1.0
1,97.0,95.0,36.919203,98.0,75.33,67.007325,19.0,29.6875,0.091837,22.811236,...,5,1.0,10,1.111111,12,1.0,12,1.0,4,1.0
2,89.0,99.0,36.919203,122.0,86.0,67.007325,22.0,29.6875,0.091837,22.811236,...,5,1.0,10,1.111111,12,1.0,12,1.0,4,1.0
3,90.0,95.0,36.919203,122.0,88.665,67.007325,30.0,29.6875,24.0,22.811236,...,5,1.0,10,1.111111,12,1.0,12,1.0,4,1.0
4,103.0,88.5,36.919203,122.0,91.33,67.007325,24.5,29.6875,0.091837,22.811236,...,5,1.0,10,1.111111,12,1.0,12,1.0,4,1.0


In [13]:
# Sanity check: confirm the loaded object is a pandas DataFrame.
type(df)

pandas.core.frame.DataFrame

In [9]:
# Check that the pre-processed frame has a patient_id column (the lookup
# raises KeyError otherwise) and count the distinct patients.
df['patient_id'].nunique()

40336

In [10]:
from pprint import pprint

# Column inventory of the pre-processed frame.
pre_processed_column_names = df.columns.tolist()
print(f"total columns: {len(pre_processed_column_names)}")

# Collect the columns added by pre-processing: names present in the
# pre-processed frame but absent from the imputed one. Looping over the list
# keeps the frame's column order; the set only speeds up the membership test.
imputed_column_set = set(column_names)
extra_columns = []
for name in pre_processed_column_names:
    if name not in imputed_column_set:
        extra_columns.append(name)
pprint(extra_columns)


total columns: 97
['SOFA_Creatinine',
 'SOFA_Platelets',
 'SOFA_Bilirubin_total',
 'SOFA_SaO2_FiO2',
 'SOFA_score',
 'NEWS_HR_score',
 'NEWS_Resp_score',
 'NEWS_Temp_score',
 'NEWS_SBP_score',
 'NEWS_O2Sat_score',
 'NEWS_FiO2_score',
 'NEWS_score',
 'qSOFA_Resp_score',
 'qSOFA_SBP_score',
 'qSOFA_score',
 'Shock_Index',
 'Bilirubin_Ratio',
 'HR_max_6h',
 'HR_min_6h',
 'HR_mean_6h',
 'HR_median_6h',
 'HR_std_6h',
 'HR_diff_std_6h',
 'O2Sat_max_6h',
 'O2Sat_min_6h',
 'O2Sat_mean_6h',
 'O2Sat_median_6h',
 'O2Sat_std_6h',
 'O2Sat_diff_std_6h',
 'SBP_max_6h',
 'SBP_min_6h',
 'SBP_mean_6h',
 'SBP_median_6h',
 'SBP_std_6h',
 'SBP_diff_std_6h',
 'MAP_max_6h',
 'MAP_min_6h',
 'MAP_mean_6h',
 'MAP_median_6h',
 'MAP_std_6h',
 'MAP_diff_std_6h',
 'Resp_max_6h',
 'Resp_min_6h',
 'Resp_mean_6h',
 'Resp_median_6h',
 'Resp_std_6h',
 'Resp_diff_std_6h',
 'HR_missing_count_global',
 'HR_missing_interval_mean_global',
 'O2Sat_missing_count_global',
 'O2Sat_missing_interval_mean_global',
 'SBP_missing_cou

In [11]:
# Tally NaNs per column of the pre-processed frame, then keep only the columns
# that still contain at least one; an empty Series means nothing is missing.
per_column_nans = df.isna().sum()
has_missing = per_column_nans > 0
nan_counts = per_column_nans[has_missing]
print(nan_counts)

Series([], dtype: int64)


## Imputed vs Pre-processed

In [14]:
# os, sys and pandas are already imported in the first cell, so they are not
# re-imported here.
notebook_dir = os.getcwd()  # kept so any cell that refers to it still works

# Reuse the repository root located by find_project_root() earlier in the
# notebook instead of assuming the kernel was started exactly one directory
# below it. NOTE(review): this assumes utils/ sits directly under that root,
# as dataset/ and feature_engineering/ do; confirm against the repo layout.
project_root = str(root)
if project_root not in sys.path:
    sys.path.append(project_root)

# The utils folder must be on sys.path before analyse_data can be imported by
# bare name, which is why this import cannot move to the first cell.
utils_path = os.path.join(project_root, "utils")
if utils_path not in sys.path:
    sys.path.append(utils_path)
import analyse_data

# Print the same summary for both frames so they can be compared side by side
# (patient counts, sepsis prevalence, rows, entries, density).
analyse_data.summariseSeperateDatasets(imputed_df, "imputed")
analyse_data.summariseSeperateDatasets(df, "pre-processed")


Hospital system: imputed
  Number of patients: 40336
  Number of septic patients: 2932
  Sepsis prevalence: 7.3%
  Number of rows: 1552210
  Number of entries: 68297232
  Density of entries: 100.0%


Hospital system: pre-processed
  Number of patients: 40336
  Number of septic patients: 2932
  Sepsis prevalence: 7.3%
  Number of rows: 1552210
  Number of entries: 150564370
  Density of entries: 100.0%


