## Exploring the pre-processed dataset

In [1]:
import os
import sys
from pathlib import Path
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch

In [18]:
def find_project_root(marker=".gitignore"):
    """
    locate the project root by searching upward from the current working
    directory.

    the working directory itself is checked first, then each of its ancestors
    in order; the first directory that contains an entry named `marker` is
    returned as a resolved path.

    raises FileNotFoundError when no directory on that chain contains `marker`.
    """
    current = Path.cwd()
    search_chain = (current, *current.parents)
    match = next(
        (folder for folder in search_chain if (folder / marker).exists()),
        None,
    )
    if match is None:
        raise FileNotFoundError(f"Project root marker '{marker}' not found starting from {current}")
    return match.resolve()
  
# Resolve the repository root once; `root` stays a Path so the dataset paths
# built from it elsewhere in the notebook keep working.
root = find_project_root()

# sys.path entries must be plain strings. A Path object never compares equal to
# the existing str entries, so a guard written against the Path would always
# pass and every re-run of this cell would append another duplicate; non-str
# entries are also ignored by the import machinery.
root_dir = str(root)
if root_dir not in sys.path:
    sys.path.append(root_dir)

# Make the modules inside the feature_engineering directory importable by name.
utils_path = os.path.join(root_dir, "feature_engineering")
if utils_path not in sys.path:
    sys.path.append(utils_path)


/Users/damianstone/Documents/Code/machine-learning/dl-sepsis-prediction


## Dataset before pre-processing

In [3]:
# Load the dataset as it looks before pre-processing.
imputed_path = f"{root}/dataset/Fully_imputed_dataset.parquet"
imputed_df = pd.read_parquet(imputed_path)

In [4]:
# Collect the imputed frame's column labels, then report how many there are
# followed by the labels themselves (one print call, two output lines).
column_names = imputed_df.columns.to_list()
print(f"total columns: {len(column_names)}", column_names, sep="\n")

total columns: 44
['HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 'EtCO2', 'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2', 'AST', 'BUN', 'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine', 'Bilirubin_direct', 'Glucose', 'Lactate', 'Magnesium', 'Phosphate', 'Potassium', 'Bilirubin_total', 'TroponinI', 'Hct', 'Hgb', 'PTT', 'WBC', 'Fibrinogen', 'Platelets', 'Age', 'Gender', 'Unit1', 'Unit2', 'HospAdmTime', 'ICULOS', 'SepsisLabel', 'patient_id', 'dataset', 'cluster_id']


In [5]:
# Count the missing values in each column of the imputed dataset.
imputed_df.isnull().sum()

HR                  0
O2Sat               0
Temp                0
SBP                 0
MAP                 0
DBP                 0
Resp                0
EtCO2               0
BaseExcess          0
HCO3                0
FiO2                0
pH                  0
PaCO2               0
SaO2                0
AST                 0
BUN                 0
Alkalinephos        0
Calcium             0
Chloride            0
Creatinine          0
Bilirubin_direct    0
Glucose             0
Lactate             0
Magnesium           0
Phosphate           0
Potassium           0
Bilirubin_total     0
TroponinI           0
Hct                 0
Hgb                 0
PTT                 0
WBC                 0
Fibrinogen          0
Platelets           0
Age                 0
Gender              0
Unit1               0
Unit2               0
HospAdmTime         8
ICULOS              0
SepsisLabel         0
patient_id          0
dataset             0
cluster_id          0
dtype: int64

## Pre-processed dataset

In [6]:

# Load the pre-processed dataset and preview its first five rows.
preprocessed_path = f"{root}/dataset/preprocessed_data.parquet"
df = pd.read_parquet(preprocessed_path)
df.head()

Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,HCO3,...,Creatinine_SD_6h,Creatinine_MA_12h,Creatinine_SD_12h,Platelets_MA_3h,Platelets_SD_3h,Platelets_Delta,Platelets_MA_6h,Platelets_SD_6h,Platelets_MA_12h,Platelets_SD_12h
0,0.315802,0.892748,0.550488,0.386304,0.24357,0.167883,0.239518,0.21875,0.24312,0.41475,...,0.0,0.035605,0.0,0.221139,0.0,0.5,0.221139,0.0,0.221139,0.0
1,0.296154,0.9375,0.550488,0.278571,0.197607,0.167883,0.181818,0.21875,0.24312,0.41475,...,0.0,0.035605,0.0,0.221139,0.0,0.5,0.221139,0.0,0.221139,0.0
2,0.265385,0.9875,0.550488,0.364286,0.235714,0.167883,0.212121,0.21875,0.24312,0.41475,...,0.0,0.035605,0.0,0.221139,0.0,0.5,0.221139,0.0,0.221139,0.0
3,0.269231,0.9375,0.550488,0.364286,0.245232,0.167883,0.292929,0.21875,0.424242,0.41475,...,0.0,0.035605,0.0,0.221139,0.0,0.5,0.221139,0.0,0.221139,0.0
4,0.319231,0.85625,0.550488,0.364286,0.25475,0.167883,0.237374,0.21875,0.24312,0.41475,...,0.0,0.035605,0.0,0.221139,0.0,0.5,0.221139,0.0,0.221139,0.0


In [9]:
# Confirm the pre-processed frame still carries `patient_id` (a missing column
# raises KeyError) and count how many distinct patients it holds.
df["patient_id"].nunique()

40336

In [10]:
pre_processed_column_names = df.columns.tolist()
print(f"total columns: {len(pre_processed_column_names)}")

# Columns present in the pre-processed frame but absent from the imputed one.
# Membership is tested against a set (constant-time lookups); iterating over
# the list keeps the columns in their original order in the output.
imputed_columns = set(column_names)
extra_columns = [col for col in pre_processed_column_names if col not in imputed_columns]
pprint(extra_columns)


total columns: 63
['SOFA',
 'NEWS',
 'qSOFA',
 'MAP_MA_3h',
 'MAP_SD_3h',
 'MAP_Delta',
 'MAP_MA_6h',
 'MAP_SD_6h',
 'MAP_MA_12h',
 'MAP_SD_12h',
 'Creatinine_MA_3h',
 'Creatinine_SD_3h',
 'Creatinine_Delta',
 'Creatinine_MA_6h',
 'Creatinine_SD_6h',
 'Creatinine_MA_12h',
 'Creatinine_SD_12h',
 'Platelets_MA_3h',
 'Platelets_SD_3h',
 'Platelets_Delta',
 'Platelets_MA_6h',
 'Platelets_SD_6h',
 'Platelets_MA_12h',
 'Platelets_SD_12h']


In [11]:
# Per-column missing-value totals for the pre-processed frame, keeping only the
# columns that actually contain at least one NaN.
nan_counts = df.isna().sum().loc[lambda totals: totals > 0]
print(nan_counts)

Series([], dtype: int64)


## PCA dataset

In [23]:
# Load the PCA-transformed dataset and preview its first five rows.
pca_path = f"{root}/dataset/pca_preprocessed_data.parquet"
pca_df = pd.read_parquet(pca_path)
pca_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,SepsisLabel,patient_id
0,0.976547,-3.370716,0.273932,0.623827,2.142885,-1.138575,-0.613078,0.012894,1.106863,1.414957,...,1.101274,-0.163108,-0.146007,0.683788,0.299371,-0.076576,-0.19668,0.040171,0.0,0.0
1,-0.027446,-3.453726,0.502514,0.996124,1.481648,-1.072824,-0.210635,0.74865,0.671149,1.110384,...,1.16517,-0.095383,-0.280268,0.917044,0.243541,0.053703,-0.224501,0.526588,0.0,0.0
2,0.544852,-3.405329,0.265758,0.942998,1.303394,-1.0139,-1.300991,0.285504,0.368328,0.661971,...,1.403815,-0.206788,-0.048163,0.323014,0.272577,-0.051309,-0.312216,0.20478,0.0,0.0
3,2.284222,-3.772123,-2.037691,2.790614,-2.353786,1.795905,3.522639,-7.326447,10.309693,-4.360751,...,7.737744,-3.005374,3.059335,1.904083,-1.080303,1.120541,-6.146302,3.463664,0.0,0.0
4,0.693992,-3.380663,0.326373,0.47085,1.409846,-1.276322,-1.642734,-0.510131,1.261638,1.521896,...,1.114201,-0.167355,-0.153837,0.727323,-0.113446,-0.201083,-0.055265,0.270699,0.0,0.0


## Imputed vs Pre-processed

In [21]:
# Project-local module; it has to be imported after the sys.path setup cell has
# run. NOTE(review): presumably it lives in the feature_engineering directory
# that cell appends to sys.path - confirm.
import analyse_data

# Print the same summary for both frames so the two reports can be compared.
for frame, label in ((imputed_df, "imputed"), (df, "pre-processed")):
    analyse_data.summariseSeperateDatasets(frame, label)


Hospital system: imputed
  Number of patients: 40336
  Number of septic patients: 2932
  Sepsis prevalence: 7.3%
  Number of rows: 1552210
  Number of entries: 68297232
  Density of entries: 100.0%


Hospital system: pre-processed
  Number of patients: 40336
  Number of septic patients: 2932
  Sepsis prevalence: 7.3%
  Number of rows: 1552210
  Number of entries: 97789230
  Density of entries: 100.0%


