In [12]:
# Import libraries and packages
import pandas as pd
import numpy as np
import os
import math
from datetime import datetime, timedelta
from scipy import stats
from scipy.stats import skew, kurtosis
import matplotlib.pyplot as plt
import warnings
import random
warnings.filterwarnings("ignore")

In [13]:
# Notebook-wide settings referenced by the cells below.
# `path`: directory of the screening-stage files (complete_cohort.csv is read from it).
path = "/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_screening"
# `campus`: short tag appended to the name of the saved feature file.
campus = "eg"
# `department`: exact department label kept when the feature table is filtered.
department = "EG PEDIATRIC ICU"

In [14]:
# ---- Encounter / department table ------------------------------------------
print('Loading encounters...')
dept_path = '/labs/kamaleswaranlab/ECMO/new_data/TAB2_Encounter_Departments.parquet.gzip'
dept = pd.read_parquet(dept_path)

# Parse every date-like column before any comparison on dates is made.
raw_date_cols = ['BIRTH_DATE', 'Hosp_Admission', 'Hosp_Discharge', 'Entered_Dept', 'Exited_Dept']
dept[raw_date_cols] = dept[raw_date_cols].apply(pd.to_datetime)

# Keep only the columns used downstream and give them short snake_case names
# (raw name -> new name, in the order the columns are kept).
dept_column_names = {
    'Pat ID': 'patid',
    'Encounter CSN': 'csn',
    'Name': 'name',
    'BIRTH_DATE': 'dob',
    'Department': 'department',
    'Entered_Dept': 'entered_dept',
    'Exited_Dept': 'exited_dept',
    'Hosp_Admission': 'hosp_adm',
    'Hosp_Discharge': 'hosp_disch',
}
dept = dept[list(dept_column_names.keys())]
dept.columns = list(dept_column_names.values())

# Restrict to pediatric ICU departments with a hospital admission from 2010 onwards.
in_picu = dept['department'].str.contains('PEDIATRIC ICU')
admitted_from_2010 = dept['hosp_adm'] >= '2010-01-01'
dept = dept[in_picu & admitted_from_2010].copy()
dept['csn'] = dept['csn'].astype(int)
# Rows with any missing field are discarded, then exact duplicates.
dept = dept.dropna().drop_duplicates()

# One row per encounter: the first row after ordering by admission time and
# department entry time.
dept_first = (
    dept.sort_values(by=['csn', 'hosp_adm', 'entered_dept'])
        .groupby('csn', as_index=False)
        .first()
)

# ---- Complete cohort ---------------------------------------------------------
print('Loading complete cohort...')
cohort = pd.read_csv(os.path.join(path, 'complete_cohort.csv'))[['patid', 'mrn', 'csn', 'dob']]
cohort['csn'] = cohort['csn'].astype(int)

# The inner join keeps only cohort encounters that have a row in dept_first.
encounter_cols = ['csn', 'department', 'entered_dept', 'exited_dept', 'hosp_adm', 'hosp_disch']
cohort = cohort.merge(dept_first[encounter_cols], how='inner', on='csn').drop_duplicates()

print('Unique CSN total:', len(cohort['csn'].unique()))

Loading encounters...
Loading complete cohort...
Unique CSN total: 63877


### Filter data

In [15]:
# ---- Raw features (long format: one row per csn / variable / timestamp) ------
print('Loading data...')
variables = pd.read_parquet('/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_models/raw_features.parquet.gzip')
feature_time_cols = ['dob', 'recorded_time']
feature_int_cols = ['csn', 'variable_id']
variables[feature_time_cols] = variables[feature_time_cols].apply(pd.to_datetime)
variables[feature_int_cols] = variables[feature_int_cols].astype(int)
variables = variables.dropna(subset=['value'])

# Drop 'BP' rows whose value has no "/" separator; they cannot be split into
# two components further down.
is_bp = variables['variable_name'] == 'BP'
has_slash = variables['value'].str.contains("/", case=False)
variables = variables[~(is_bp & ~has_slash)]

# ---- Medications ---------------------------------------------------------------
meds = pd.read_parquet('/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_screening/filtered_meds.parquet.gzip')
meds[['dob', 'mar_time']] = meds[['dob', 'mar_time']].apply(pd.to_datetime)
meds['csn'] = meds['csn'].astype(int)

# Keep administrations for encounters present in `variables`, dosed in
# mcg/kg/min, whose name contains "epinephrine" (this also matches
# "norepinephrine") or "dopamine".
known_csn = meds['csn'].isin(variables['csn'].unique().tolist())
dosed_per_kg_min = meds['dose_unit'] == 'mcg/kg/min'
med_name_match = meds['med'].str.contains('epinephrine|dopamine', case=False)
meds = meds[known_csn & dosed_per_kg_min & med_name_match]

# Reshape to the same schema as `variables`.
meds = meds[['patid', 'csn', 'dob', 'med_id', 'med', 'mar_time', 'dose']]
meds.columns = ['patid', 'csn', 'dob', 'variable_id', 'variable_name', 'recorded_time', 'value']

# Collapse the free-text medication names into three canonical labels.
# The dopamine mask is evaluated after the first two relabels, as in the
# sequential assignments it replaces.
mentions_epi = meds['variable_name'].str.contains("epinephrine", case=False)
mentions_norepi = meds['variable_name'].str.contains("norepinephrine", case=False)
meds.loc[mentions_epi & ~mentions_norepi, 'variable_name'] = 'epinephrine'
meds.loc[mentions_norepi, 'variable_name'] = 'norepinephrine'
meds.loc[meds['variable_name'].str.contains("dopamine", case=False), 'variable_name'] = 'dopamine'
meds = meds.reset_index(drop=True)
variables = pd.concat([variables, meds])

# ---- Respiratory support ---------------------------------------------------------
resp = pd.read_parquet('/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_screening/resp_data.parquet.gzip')
resp['csn'] = resp['csn'].astype(int)
resp = resp[resp['csn'].isin(variables['csn'].unique().tolist())]
resp[['dob', 'recorded_time']] = resp[['dob', 'recorded_time']].apply(pd.to_datetime)
variables = pd.concat([variables, resp])

# ---- Attach hospital admission time and department ------------------------------
print('Adding hospital admission and department...')
variables = variables.merge(cohort[['csn', 'hosp_adm', 'department']], on='csn', how='inner')
variables = variables[['patid', 'csn', 'dob', 'hosp_adm', 'department', 'variable_id', 'variable_name', 'recorded_time', 'value']]
variables = variables[variables['department'] == department].reset_index(drop=True)

# ---- Keep observations from the first day after hospital admission ---------------
# ceil(elapsed days) == 1 selects 0 < elapsed <= 24h; a reading stamped exactly
# at admission gives 0 and is excluded.
days_from_admission = np.ceil((variables['recorded_time'] - variables['hosp_adm']) / pd.Timedelta('1 day'))
variables = variables[days_from_admission == 1]

# ---- Split "a/b" blood pressure readings into two numeric variables --------------
print('Fixing blood pressure...')
# The part before "/" goes to new 'bp_sys' rows (variable_id 1) ...
sysbp = variables[variables['variable_name'] == 'BP'].assign(variable_id=1, variable_name='bp_sys')
sysbp['value'] = sysbp['value'].apply(lambda reading: float(reading.split('/')[0]))
# ... while the original rows are relabelled 'bp_dias' and keep the part after "/".
variables.loc[variables['variable_name'] == 'BP', 'variable_name'] = 'bp_dias'
is_dias = variables['variable_name'] == 'bp_dias'
variables.loc[is_dias, 'value'] = variables.loc[is_dias, 'value'].apply(lambda reading: float(reading.split('/')[1]))
variables = pd.concat([variables, sysbp])
variables = variables.dropna(subset=['value']).reset_index(drop=True)

print('Unique CSN total:', len(variables['csn'].unique()))

Loading data...


Adding hospital admission and department...
Fixing blood pressure...
Unique CSN total: 28290


### Clean and preprocess data

In [16]:
# Harmonise variable names: raw source labels that describe the same
# measurement are collapsed onto one snake_case name.
# A single mapping applied in one pass replaces the previous per-label `.loc`
# assignments. No raw label is also a target name, so the one-pass lookup
# gives the same result as the sequential assignments did.
print('Changing variables names...')
variable_name_map = {
    # Anthropometrics and intake/output
    # NOTE(review): 'Weight' and 'Code Sheet Weight (kg)' are merged under one
    # name here, and a later cell divides every 'weight' value by 35.274 --
    # confirm that both sources are recorded in the same unit.
    'Weight': 'weight',
    'Code Sheet Weight (kg)': 'weight',
    'Volume Infused (mL)': 'vol_infused',
    'Urine (mL)': 'urine',
    # Vital signs
    'Pulse': 'pulse',
    'MAP': 'map',
    'ABP MAP': 'map',
    'ART MAP': 'map',
    'Resp': 'resp',
    'SpO2': 'spo2',
    'Perfused Pulse (SpO2)': 'spo2',
    'Temp': 'temp',
    # Oxygenation / respiratory support
    'FiO2 (%)': 'fio2',
    'PaO2/FiO2 (Calculated)': 'pao2_fio2',
    'Oxygen Flow (lpm)': 'o2_flow',
    # Neurological assessment
    'Pupil Left Reaction': 'pupil_left_reaction',
    'Pupil Left Size': 'pupil_left_size',
    'Pupil Right Reaction': 'pupil_right_reaction',
    'Pupil Right Size': 'pupil_right_size',
    'Coma Scale Total': 'coma_scale_total',
    # Blood gases
    'POC pH': 'ph',
    'ARTERIAL POC PH': 'ph',
    'CAPILLARY POC PH': 'ph',
    'VENOUS POC PH': 'ph',
    'POC PO2': 'po2',
    'ARTERIAL POC PO2': 'po2',
    'CAPILLARY POC PO2': 'po2',
    'VENOUS POC PO2': 'po2',
    'POC PCO2': 'pco2',
    'ARTERIAL POC PCO2': 'pco2',
    'CAPILLARY POC PCO2': 'pco2',
    'VENOUS POC PCO2': 'pco2',
    'ARTERIAL BASE EXCESS': 'base_excess',
    'VENOUS BASE EXCESS': 'base_excess',
    'CAP BASE EXCESS': 'base_excess',
    'ART BASE DEFICIT': 'base_deficit',
    'VENOUS BASE DEFICIT': 'base_deficit',
    'CAP BASE DEFICIT': 'base_deficit',
    'HCO3': 'bicarbonate',
    'LACTIC ACID': 'lactic_acid',
    'POC LACTIC ACID': 'lactic_acid',
    'LACTIC ACID WHOLE BLOOD': 'lactic_acid',
    # Chemistry
    'POTASSIUM': 'potassium',
    'SODIUM': 'sodium',
    'CHLORIDE': 'chloride',
    'POC GLUCOSE': 'glucose',
    'GLUCOSE': 'glucose',
    'BUN': 'bun',
    'CREATININE': 'creatinine',
    'CALCIUM': 'calcium',
    'POC CALCIUM IONIZED': 'calcium_ionized',
    'CO2': 'co2',
    'BILIRUBIN TOTAL': 'bilirubin_total',
    'ALBUMIN': 'albumin',
    'ALT (SGPT)': 'alt',
    'AST (SGOT)': 'ast',
    # Hematology / coagulation
    'HEMOGLOBIN': 'hemoglobin',
    'WBC': 'wbc',
    'PLATELETS': 'platelets',
    'BAND NEUTROPHILS % (MANUAL)': 'band_neutrophils',
    'PTT': 'ptt',
    'PTT.': 'ptt',
    'INT NORM RATIO': 'inr',
    'PROTIME': 'pt',
}

# Labels missing from the mapping (including ones that are already canonical,
# such as 'bp_sys' or 'epinephrine') are passed through unchanged.
variables['variable_name'] = variables['variable_name'].map(
    lambda raw_name: variable_name_map.get(raw_name, raw_name)
)

# Variables are identified by name from here on.
variables.drop(['variable_id'], axis=1, inplace=True)

Changing variables names...


In [17]:
# Fix pupillary reaction: collapse the charted descriptions into three classes.
print('Fixing pupillary reaction...')
is_pupil_reaction = variables['variable_name'].isin(['pupil_left_reaction', 'pupil_right_reaction'])
variables.loc[is_pupil_reaction & variables['value'].isin(['Brisk', 'Sluggish', 'Hippus']), 'value'] = 'Reactive'
variables.loc[is_pupil_reaction & variables['value'].isin(['Non-reactive']), 'value'] = 'Non-reactive'
variables.loc[is_pupil_reaction & variables['value'].isin(['Unable to assess', 'Pinpoint', 'No eye', 'Pharmacologically dilated', 'Keyhole', 'Ovoid', 'Ovid']), 'value'] = 'Unable to Assess'

# Fix pupil size: strip the last two characters of each value
# (presumably a unit suffix such as "mm" -- TODO confirm against the raw data).
is_pupil_size = variables['variable_name'].isin(['pupil_left_size', 'pupil_right_size'])
variables.loc[is_pupil_size, 'value'] = variables.loc[is_pupil_size, 'value'].apply(lambda x: x[:-2])

# Numerical variables
print('Dropping invalid observations...')
num_vars = ['weight', 'pulse', 'map', 'bp_sys', 'bp_dias', 'resp', 'spo2', 'temp', 'fio2', 'pao2_fio2', 'pupil_left_size',
            'pupil_right_size', 'coma_scale_total', 'o2_flow', 'ph', 'po2', 'pco2', 'potassium',
            'sodium', 'chloride', 'glucose', 'bun', 'creatinine', 'calcium', 'calcium_ionized', 'co2', 'hemoglobin',
            'bilirubin_total', 'albumin', 'wbc', 'platelets', 'ptt', 'base_excess', 'bicarbonate', 'lactic_acid',
            'base_deficit', 'vol_infused', 'urine']
num_vars_extra = num_vars.copy()
num_vars_extra.extend(['band_neutrophils', 'alt', 'ast', 'pt', 'inr', 'epinephrine', 'norepinephrine', 'dopamine', 'resp_indicator'])

# Categorical variables
cat_vars = ['pupil_left_reaction', 'pupil_right_reaction']

# Check that all values are numbers for numerical variables.
# The previous test, str(x).replace(".", "", 1).isdigit(), rejected every value
# carrying a sign, so all negative measurements (e.g. a negative base excess)
# were silently dropped. The pattern below accepts the same plain decimals
# ("12", "12.5", "12.", ".5") plus an optional leading sign; anything else,
# including "nan" and free text, is still removed unless it is categorical.
numeric_text = r'[+-]?(?:\d+\.?\d*|\.\d+)\Z'
looks_numeric = variables['value'].astype(str).str.match(numeric_text)
variables = variables[looks_numeric | variables['variable_name'].isin(cat_vars)]

is_numeric_var = variables['variable_name'].isin(num_vars_extra)
variables.loc[is_numeric_var, 'value'] = variables.loc[is_numeric_var, 'value'].astype(float)
variables.dropna(subset=['value'], inplace=True)
variables.reset_index(drop=True, inplace=True)

print('Unique CSN total:', len(variables['csn'].unique().tolist()))

Fixing pupillary reaction...
Dropping invalid observations...
Unique CSN total: 28290


### Reshape to hourly features (pivot, resample, fill gaps, fix units)

In [18]:
# Long -> wide: one row per encounter and timestamp, one column per variable.
print('Data wrangling and generation of flags...')

# When several values share the same key and variable, the first one is kept.
pivot_index = ['patid', 'csn', 'dob', 'hosp_adm', 'department', 'recorded_time']
variables = variables.pivot_table(
    values='value',
    index=pivot_index,
    columns=['variable_name'],
    aggfunc=(lambda x: x.iloc[0]),
    fill_value=np.nan,
).reset_index()
variables[['dob', 'hosp_adm', 'recorded_time']] = variables[['dob', 'hosp_adm', 'recorded_time']].apply(pd.to_datetime)

# Relative time = hours since hospital admission, rounded up, placed right after
# the six index columns. Rows at or before admission (rel_time <= 0) are dropped.
hours_from_admission = np.ceil((variables['recorded_time'] - variables['hosp_adm']) / pd.Timedelta('1 hour'))
variables.insert(6, 'rel_time', hours_from_admission)
variables = variables[variables['rel_time'] > 0].sort_values(by=['csn', 'rel_time'])
print('Unique CSN total:', len(variables['csn'].unique()))

Data wrangling and generation of flags...
Unique CSN total: 28290


In [19]:
# Collapse to one row per encounter and relative hour.
# Columns listed in num_vars_extra are summarised by their median within the
# hour; every other column keeps its last value.
agg_dict = {
    col: pd.NamedAgg(column=col, aggfunc='median' if col in num_vars_extra else 'last')
    for col in variables.columns
}

hourly_keys = ['patid', 'csn', 'dob', 'hosp_adm', 'department', 'rel_time']
variables = (
    variables.groupby(hourly_keys, as_index=False)
             .agg(**agg_dict)
             .sort_values(by=['csn', 'rel_time'])
             .reset_index(drop=True)
)
print('Unique CSN total:', len(variables['csn'].unique()))

Unique CSN total: 28290


In [20]:
# Create rows for missing hours.
# For every encounter, each hour between its first and last observed rel_time
# that has no row yet gets an empty row, so the hourly sequence is gap-free.
hours_list = []
csn_list = []
variables['rel_time'] = variables['rel_time'].astype(int)

# One pass over a groupby with set membership replaces filtering the whole
# frame once per encounter and testing membership against a list, which was
# accidentally quadratic. The resulting rows are the same.
for csn, observed_hours in variables.groupby('csn')['rel_time']:
    seen = set(observed_hours)
    hours = [hour for hour in range(observed_hours.min(), observed_hours.max()) if hour not in seen]
    csn_list.extend([csn] * len(hours))
    hours_list.extend(hours)
missing = pd.DataFrame(list(zip(csn_list, hours_list)), columns=['csn', 'rel_time'])

# Give the filler rows the same columns as `variables`, all NaN apart from
# csn and rel_time.
cols = [col for col in variables.columns if col not in ('csn', 'rel_time')]
for col in cols:
    missing[col] = np.nan

missing = missing[list(variables.columns)]
variables = pd.concat([variables, missing])
variables.sort_values(by=['csn', 'rel_time'], inplace=True)
variables.reset_index(inplace=True, drop=True)

# Filler rows always sit between two observed rows of the same encounter, so a
# forward fill within the encounter restores their identifying columns.
variables[['patid', 'dob', 'hosp_adm', 'department']] = variables.groupby('csn')[['patid', 'dob', 'hosp_adm', 'department']].ffill()

# Filler rows have no timestamp; use the midpoint of their hour
# (hosp_adm + rel_time - 0.5 h).
hour_midpoint = variables['hosp_adm'] + pd.to_timedelta(variables['rel_time'] - 0.5, unit='h')
variables.loc[variables['recorded_time'].isna(), 'recorded_time'] = hour_midpoint
print('Unique CSN total:', len(variables['csn'].unique().tolist()))

Unique CSN total: 28290


In [21]:
# Fix temperature (to C): (x - 32) * 5/9, i.e. the Fahrenheit -> Celsius formula.
variables['temp'] = (variables['temp'] - 32) * (5 / 9)

# Fix weight (to Kg): divide by 35.274 (ounces per kilogram).
# NOTE(review): this assumes every 'weight' value is charted in ounces -- confirm
# for all raw sources that were merged into 'weight'.
variables['weight'] = variables['weight'] / 35.274

# Normalize o2 flow (lpm/kg).
# Each row is one encounter-hour and weight is not necessarily charted in every
# hour. Dividing by the same-row weight therefore turned o2_flow into NaN in
# every hour without a weight, and into inf when the weight was 0.
# The flow is now divided by the encounter's nearest known weight: the last
# value carried forward, and the first value carried backward for the hours
# before it. Hours that do have their own weight give the same result as before.
# Non-positive weights are treated as unknown.
ordered_by_hour = variables.sort_values(['csn', 'rel_time'])
weight_ref = ordered_by_hour.groupby('csn')['weight'].transform(lambda w: w.ffill().bfill())
weight_ref = weight_ref.where(weight_ref > 0)
# Division aligns on the row index, so the temporary ordering does not matter.
variables['o2_flow'] = variables['o2_flow'] / weight_ref

# Fix outliers: for each numerical variable, values below the 1st or above the
# 99th percentile (computed over all rows, ignoring NaN) are set to NaN.
for col in num_vars:
    p1 = np.nanpercentile(variables[col], 1.0)
    p99 = np.nanpercentile(variables[col], 99.0)
    variables.loc[variables[col] < p1, col] = np.nan
    variables.loc[variables[col] > p99, col] = np.nan

# Any remaining SpO2 above 100 is capped at 100.
variables.loc[variables['spo2'] > 100, 'spo2'] = 100
print('Unique CSN total:', len(variables['csn'].unique().tolist()))

Unique CSN total: 28290


In [22]:
# Write the hourly, pre-imputation feature table to disk.
# The campus tag is part of the file name.
output_dir = '/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_analysis'
output_name = 'features_preimp_pivot_24_' + campus + '.parquet.gzip'
variables.to_parquet(os.path.join(output_dir, output_name), compression='gzip')