In [5]:
# Import libraries and packages
import pandas as pd
import numpy as np
import os
import math
from datetime import datetime, timedelta
from scipy import stats
from scipy.stats import skew, kurtosis
import matplotlib.pyplot as plt
import warnings
import random
# Suppress every warning category for the remainder of the session.
# NOTE(review): this is a blanket filter, so pandas/numpy warnings emitted by any
# later cell are hidden as well - consider narrowing it while debugging.
warnings.filterwarnings("ignore")

In [6]:
# Define variables
# Directory joined with 'complete_cohort.csv' when the cohort is loaded.
path = '/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_screening'
# Exact department label; the feature table is later restricted to rows whose
# 'department' equals this string.
department = 'SR PEDIATRIC ICU'
# Campus tag; not referenced by the cells visible here (presumably used further down - TODO confirm).
campus = 'sr'

In [7]:
# Load encounters file
print('Loading encounters...')
dept_path = '/labs/kamaleswaranlab/ECMO/new_data/TAB2_Encounter_Departments.parquet.gzip'
dept_date_cols = ['BIRTH_DATE', 'Hosp_Admission', 'Hosp_Discharge', 'Entered_Dept', 'Exited_Dept']
# Source column -> short name; the key order is also the column order that is kept.
dept_renames = {
    'Pat ID': 'patid',
    'Encounter CSN': 'csn',
    'Name': 'name',
    'BIRTH_DATE': 'dob',
    'Department': 'department',
    'Entered_Dept': 'entered_dept',
    'Exited_Dept': 'exited_dept',
    'Hosp_Admission': 'hosp_adm',
    'Hosp_Discharge': 'hosp_disch',
}

dept = pd.read_parquet(dept_path)
dept[dept_date_cols] = dept[dept_date_cols].apply(pd.to_datetime)
dept = dept[list(dept_renames)].rename(columns=dept_renames)

# Keep pediatric ICU stays admitted on or after 2010-01-01
in_picu = dept['department'].str.contains('PEDIATRIC ICU')
admitted_since_2010 = dept['hosp_adm'] >= '2010-01-01'
dept = dept[in_picu & admitted_since_2010]
dept['csn'] = dept['csn'].astype(int)
dept = dept.dropna().drop_duplicates()

# One row per encounter: the earliest department entry of each CSN
dept_first = (
    dept.sort_values(by=['csn', 'hosp_adm', 'entered_dept'])
    .groupby('csn', as_index=False)
    .first()
)

# Load complete cohort
print('Loading complete cohort...')
cohort = pd.read_csv(os.path.join(path, 'complete_cohort.csv'))[['patid', 'mrn', 'csn', 'dob']]
cohort['dob'] = cohort['dob'].apply(pd.to_datetime)
cohort['csn'] = cohort['csn'].astype(int)

# Attach the stay information; encounters without a PICU row are dropped by the inner join
stay_cols = ['csn', 'department', 'entered_dept', 'exited_dept', 'hosp_adm', 'hosp_disch']
cohort = pd.merge(cohort, dept_first[stay_cols], how='inner', on='csn').drop_duplicates()

print('Unique CSN total:', len(cohort['csn'].unique()))

Loading encounters...
Loading complete cohort...
Unique CSN total: 63877


### Filter data

In [8]:
# Load data
print('Loading data...')
variables = pd.read_parquet('/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_models/raw_features.parquet.gzip')
for time_col in ['dob', 'recorded_time']:
    variables[time_col] = pd.to_datetime(variables[time_col])
variables = variables.astype({'csn': int, 'variable_id': int})
variables = variables.dropna(subset=['value'])

# Discard 'BP' readings that are not written as "<first>/<second>"
is_bp_row = variables['variable_name'] == 'BP'
has_slash = variables['value'].str.contains("/", case=False)
variables = variables[~(is_bp_row & ~has_slash)]

# Load meds
meds = pd.read_parquet('/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_screening/filtered_meds.parquet.gzip')
for time_col in ['dob', 'mar_time']:
    meds[time_col] = pd.to_datetime(meds[time_col])
meds['csn'] = meds['csn'].astype(int)

# Keep mcg/kg/min administrations of epinephrine-like drugs or dopamine for encounters already in `variables`
known_csn = meds['csn'].isin(variables['csn'].unique())
rate_dosed = meds['dose_unit'] == 'mcg/kg/min'
vasoactive = meds['med'].str.contains('epinephrine|dopamine', case=False)
meds = meds[known_csn & rate_dosed & vasoactive]

# Reshape to the long layout used by `variables`
meds = meds[['patid', 'csn', 'dob', 'med_id', 'med', 'mar_time', 'dose']]
meds.columns = ['patid', 'csn', 'dob', 'variable_id', 'variable_name', 'recorded_time', 'value']

# Collapse product names to the drug name. "norepinephrine" contains "epinephrine",
# so the plain-epinephrine rule has to exclude the norepinephrine matches.
mentions_norepi = meds['variable_name'].str.contains("norepinephrine", case=False)
mentions_epi = meds['variable_name'].str.contains("epinephrine", case=False)
meds.loc[mentions_epi & ~mentions_norepi, 'variable_name'] = 'epinephrine'
meds.loc[mentions_norepi, 'variable_name'] = 'norepinephrine'
# Evaluated after the two assignments above, exactly as in a sequential rename
mentions_dopamine = meds['variable_name'].str.contains("dopamine", case=False)
meds.loc[mentions_dopamine, 'variable_name'] = 'dopamine'

meds = meds.reset_index(drop=True)
variables = pd.concat([variables, meds])

# Add resp support data
resp = pd.read_parquet('/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_screening/resp_data.parquet.gzip')
resp['csn'] = resp['csn'].astype(int)
# Only encounters that already have at least one row in `variables`
resp = resp[resp['csn'].isin(variables['csn'].unique())]
for time_col in ['dob', 'recorded_time']:
    resp[time_col] = pd.to_datetime(resp[time_col])
variables = pd.concat([variables, resp])

# Add hospital admission and department
print('Adding hospital admission and department...')
variables = variables.merge(cohort[['csn', 'hosp_adm', 'department']], on='csn', how='inner')
long_cols = ['patid', 'csn', 'dob', 'hosp_adm', 'department', 'variable_id', 'variable_name', 'recorded_time', 'value']
variables = variables[long_cols]
variables = variables[variables['department'] == department].reset_index(drop=True)

# Gather data within first 7 days of the stay
# Day number counted from admission: (0, 1] day -> 1, (1, 2] -> 2, ...; kept as a
# local Series instead of a temporary column.
days_since_adm = np.ceil((variables['recorded_time'] - variables['hosp_adm']) / pd.Timedelta('1 day'))
variables = variables[(days_since_adm > 0) & (days_since_adm < 8)]

# Fix blood pressure
print('Fixing blood pressure...')


def _bp_component(reading, position):
    """Return the `position`-th '/'-separated part of a BP reading as a float.

    Readings whose requested part is missing or not a number (e.g. '120/')
    yield NaN instead of raising, so the dropna below can discard them.
    """
    try:
        return float(str(reading).split('/')[position])
    except (ValueError, IndexError):
        return np.nan


# Rows still labelled 'BP' hold "<first>/<second>" strings.
is_bp = variables['variable_name'] == 'BP'

# First component -> new 'bp_sys' rows. Work on an explicit copy so the column
# assignments below do not target a slice of `variables`.
sysbp = variables.loc[is_bp].copy()
sysbp['variable_id'] = 1
sysbp['variable_name'] = 'bp_sys'
sysbp['value'] = sysbp['value'].apply(lambda reading: _bp_component(reading, 0))

# Second component stays on the original rows, which are relabelled 'bp_dias'.
variables.loc[is_bp, 'value'] = variables.loc[is_bp, 'value'].apply(lambda reading: _bp_component(reading, 1))
variables.loc[is_bp, 'variable_name'] = 'bp_dias'

variables = pd.concat([variables, sysbp])
# Removes readings whose component could not be parsed
variables = variables.dropna(subset=['value'])
variables = variables.reset_index(drop=True)

print('Unique CSN total:', len(variables['csn'].unique().tolist()))

Loading data...
Adding hospital admission and department...
Fixing blood pressure...
Unique CSN total: 35557


### Clean and preprocess data

In [9]:
# Change some variables names. Similar variables will have the same name
print('Changing variables names...')
# Canonical name -> every source label that is folded into it
canonical_names = {
    'weight': ['Weight', 'Code Sheet Weight (kg)'],
    'vol_infused': ['Volume Infused (mL)'],
    'urine': ['Urine (mL)'],
    'pulse': ['Pulse'],
    'map': ['MAP', 'ABP MAP', 'ART MAP'],
    'resp': ['Resp'],
    'spo2': ['SpO2', 'Perfused Pulse (SpO2)'],
    'temp': ['Temp'],
    'fio2': ['FiO2 (%)'],
    'pao2_fio2': ['PaO2/FiO2 (Calculated)'],
    'pupil_left_reaction': ['Pupil Left Reaction'],
    'pupil_left_size': ['Pupil Left Size'],
    'pupil_right_reaction': ['Pupil Right Reaction'],
    'pupil_right_size': ['Pupil Right Size'],
    'coma_scale_total': ['Coma Scale Total'],
    'o2_flow': ['Oxygen Flow (lpm)'],
    'ph': ['POC pH', 'ARTERIAL POC PH', 'CAPILLARY POC PH', 'VENOUS POC PH'],
    'po2': ['POC PO2', 'ARTERIAL POC PO2', 'CAPILLARY POC PO2', 'VENOUS POC PO2'],
    'pco2': ['POC PCO2', 'ARTERIAL POC PCO2', 'CAPILLARY POC PCO2', 'VENOUS POC PCO2'],
    'potassium': ['POTASSIUM'],
    'sodium': ['SODIUM'],
    'chloride': ['CHLORIDE'],
    'glucose': ['POC GLUCOSE', 'GLUCOSE'],
    'bun': ['BUN'],
    'creatinine': ['CREATININE'],
    'calcium': ['CALCIUM'],
    'calcium_ionized': ['POC CALCIUM IONIZED'],
    'co2': ['CO2'],
    'hemoglobin': ['HEMOGLOBIN'],
    'bilirubin_total': ['BILIRUBIN TOTAL'],
    'albumin': ['ALBUMIN'],
    'wbc': ['WBC'],
    'platelets': ['PLATELETS'],
    'ptt': ['PTT', 'PTT.'],
    'base_excess': ['ARTERIAL BASE EXCESS', 'VENOUS BASE EXCESS', 'CAP BASE EXCESS'],
    'base_deficit': ['ART BASE DEFICIT', 'VENOUS BASE DEFICIT', 'CAP BASE DEFICIT'],
    'bicarbonate': ['HCO3'],
    'lactic_acid': ['LACTIC ACID', 'POC LACTIC ACID', 'LACTIC ACID WHOLE BLOOD'],
    'band_neutrophils': ['BAND NEUTROPHILS % (MANUAL)'],
    'alt': ['ALT (SGPT)'],
    'ast': ['AST (SGOT)'],
    'inr': ['INT NORM RATIO'],
    'pt': ['PROTIME'],
}
# Flatten to source label -> canonical name; labels not listed keep their current name
name_lookup = {
    source_label: canonical
    for canonical, source_labels in canonical_names.items()
    for source_label in source_labels
}
variables['variable_name'] = variables['variable_name'].map(lambda label: name_lookup.get(label, label))
variables = variables.drop(columns=['variable_id'])

Changing variables names...


In [10]:
# Fix pupillary reaction
print('Fixing pupillary reaction...')
# Target category -> recorded values that are folded into it
reaction_groups = {
    'Reactive': ['Brisk', 'Sluggish', 'Hippus'],
    'Non-reactive': ['Non-reactive'],
    'Unable to Assess': ['Unable to assess', 'Pinpoint', 'No eye', 'Pharmacologically dilated', 'Keyhole', 'Ovoid', 'Ovid'],
}
is_reaction_row = variables['variable_name'].isin(['pupil_left_reaction', 'pupil_right_reaction'])
for category, recorded_values in reaction_groups.items():
    variables.loc[is_reaction_row & variables['value'].isin(recorded_values), 'value'] = category

# Fix pupil size
# Drop the last two characters of every pupil-size value (presumably a unit suffix such as "mm" - TODO confirm)
is_size_row = variables['variable_name'].isin(['pupil_left_size', 'pupil_right_size'])
variables.loc[is_size_row, 'value'] = variables.loc[is_size_row, 'value'].map(lambda raw: raw[:-2])

# Numerical variables
print('Dropping invalid observations...')
num_vars = ['weight', 'pulse', 'map', 'bp_sys', 'bp_dias', 'resp', 'spo2', 'temp', 'fio2', 'pao2_fio2', 'pupil_left_size',
            'pupil_right_size', 'coma_scale_total', 'o2_flow', 'ph', 'po2', 'pco2', 'potassium',
            'sodium', 'chloride', 'glucose', 'bun', 'creatinine', 'calcium', 'calcium_ionized', 'co2', 'hemoglobin',
            'bilirubin_total', 'albumin', 'wbc', 'platelets', 'ptt', 'base_excess', 'bicarbonate', 'lactic_acid',
            'base_deficit', 'vol_infused', 'urine']
# Numeric variables that are only used for scores/flags (they are not outlier-filtered later)
num_vars_extra = num_vars.copy()
num_vars_extra.extend(['band_neutrophils', 'alt', 'ast', 'pt', 'inr', 'epinephrine', 'norepinephrine', 'dopamine', 'resp_indicator'])

# Categorical variables
cat_vars = ['pupil_left_reaction', 'pupil_right_reaction']

# Check that all values are numbers for numerical variables
# Parse with pd.to_numeric rather than a digits-only string test: the string test
# rejected negative numbers (e.g. a base excess of -3.2) and floats whose repr uses
# scientific notation (e.g. 1e-05), silently dropping valid observations.
numeric_value = pd.to_numeric(variables['value'], errors='coerce')
# Treat +/-inf like any other unparseable value
numeric_value = numeric_value.where(np.isfinite(numeric_value))

# Keep rows that hold a finite number, plus every row of a categorical variable
keep_row = numeric_value.notna() | variables['variable_name'].isin(cat_vars)
variables = variables[keep_row].copy()
numeric_value = numeric_value[keep_row]

# Store the parsed floats for the numeric variables
is_numeric_var = variables['variable_name'].isin(num_vars_extra)
variables.loc[is_numeric_var, 'value'] = numeric_value[is_numeric_var]
variables = variables.dropna(subset=['value'])
variables = variables.reset_index(drop=True)

print('Unique CSN total:', len(variables['csn'].unique().tolist()))

Fixing pupillary reaction...
Dropping invalid observations...
Unique CSN total: 35557


### Preprocess

In [11]:
# Data wrangling and imputation
print('Data wrangling and generation of flags...')

# Pivot data
# Long -> wide: one row per (encounter, timestamp), one column per variable. When a
# variable has several values at the same timestamp the first one is kept.
pivot_index = ['patid', 'csn', 'dob', 'hosp_adm', 'department', 'recorded_time']
variables = variables.pivot_table(
    values='value',
    index=pivot_index,
    columns=['variable_name'],
    aggfunc=(lambda x: x.iloc[0]),
    fill_value=np.nan,
).reset_index()
for time_col in ['dob', 'hosp_adm', 'recorded_time']:
    variables[time_col] = pd.to_datetime(variables[time_col])

# Create column with relative time
# Hour number counted from admission ((0, 1] h -> 1, ...), placed right after the index columns
hours_since_adm = np.ceil((variables['recorded_time'] - variables['hosp_adm']) / pd.Timedelta('1 hour'))
variables.insert(6, 'rel_time', hours_since_adm)
variables = variables[variables['rel_time'] > 0].sort_values(by=['csn', 'rel_time'])
print('Unique CSN total:', len(variables['csn'].unique()))

Data wrangling and generation of flags...
Unique CSN total: 35557


In [12]:
# Resample data
# One row per encounter-hour: numeric variables take the median of the hour, every
# other column takes its last value within the hour.
agg_dict = {
    col: pd.NamedAgg(column=col, aggfunc='median' if col in num_vars_extra else 'last')
    for col in variables.columns
}

hour_keys = ['patid', 'csn', 'dob', 'hosp_adm', 'department', 'rel_time']
variables = (
    variables.groupby(hour_keys, as_index=False)
    .agg(**agg_dict)
    .sort_values(by=['csn', 'rel_time'])
    .reset_index(drop=True)
)
print('Unique CSN total:', len(variables['csn'].unique()))

Unique CSN total: 35557


In [13]:
# Create rows for missing hours
hours_list = []
csn_list = []
variables['rel_time'] = variables['rel_time'].astype(int)

# Walk the encounters once via groupby instead of re-filtering the whole frame per
# CSN, and test membership against a set instead of rebuilding a list per hour.
for csn, rel_hours in variables.groupby('csn', sort=False)['rel_time']:
    observed = set(rel_hours.tolist())
    # Hours strictly between the first and last observed hour that have no row
    hours = [hour for hour in range(min(observed), max(observed)) if hour not in observed]
    csn_list.extend([csn] * len(hours))
    hours_list.extend(hours)

# Only the two key columns are materialised (with the dtypes of `variables`, also
# when there is no gap at all); concat adds the remaining columns as missing values
# without changing the dtypes of the existing ones.
missing = pd.DataFrame({
    'csn': pd.Series(csn_list, dtype=variables['csn'].dtype),
    'rel_time': pd.Series(hours_list, dtype=variables['rel_time'].dtype),
})

# Kept for backward compatibility: every column except the two keys
cols = [col for col in variables.columns if col not in ('csn', 'rel_time')]

variables = pd.concat([variables, missing], sort=False)
variables = variables.sort_values(by=['csn', 'rel_time']).reset_index(drop=True)

# A gap hour always follows an observed hour of the same encounter, so a forward
# fill within the encounter restores the identifying columns.
variables[['patid', 'dob', 'hosp_adm', 'department']] = variables.groupby('csn')[['patid', 'dob', 'hosp_adm', 'department']].ffill()

# Synthetic rows get the midpoint of their hour as timestamp
lacks_time = variables['recorded_time'].isna()
variables.loc[lacks_time, 'recorded_time'] = variables['hosp_adm'] + pd.to_timedelta(variables['rel_time'] - 0.5, unit='h')
print('Unique CSN total:', len(variables['csn'].unique().tolist()))

Unique CSN total: 35557


In [14]:
# Fix temperature (to C)
variables['temp'] = (variables['temp'] - 32) * (5 / 9)

# Fix weight (to Kg)
# Divides by 35.274, i.e. the stored weight is presumably in ounces - TODO confirm
variables['weight'] = variables['weight'] / 35.274

# Normalize o2 flow (lpm/kg)
variables['o2_flow'] = variables['o2_flow'] / variables['weight']

# Fix outliers
# Blank out everything outside the 1st-99th percentile range of each numeric variable
for col in num_vars:
    lower_cut, upper_cut = np.nanpercentile(variables[col], [1.0, 99.0])
    out_of_range = (variables[col] < lower_cut) | (variables[col] > upper_cut)
    variables.loc[out_of_range, col] = np.nan

# Saturation cannot exceed 100 %
variables['spo2'] = variables['spo2'].clip(upper=100)
print('Unique CSN total:', len(variables['csn'].unique()))

Unique CSN total: 35557


In [15]:
# Calculate pSOFA components

# Add age
# NOTE(review): months are approximated as 31 days, so a 365-day-old child gets
# 11.77 months - confirm this is the intended convention for the age bands below.
variables['age_days'] = ((variables['hosp_adm'] - variables['dob']) / pd.Timedelta('1 day')).round(0)
variables['age_months'] = (variables['age_days'] / 31).round(2)
variables['age_years'] = (variables['age_days'] / 365.25).round(2)

# Calculate spo2_fio2
# FiO2 is carried forward for at most 24 rows within an encounter; the respiratory
# support indicator is carried forward without limit.
fio2_imputed = variables.groupby(['csn'])['fio2'].ffill(limit=24)
resp_imputed = variables.groupby(['csn'])['resp_indicator'].ffill()
# SpO2/FiO2 is only defined for SpO2 <= 97
spo2_fio2 = (variables['spo2'] / (fio2_imputed / 100)).where(variables['spo2'] <= 97)

# Calculate pao2_fio2
variables['pao2_fio2'] = variables['po2'] / (fio2_imputed / 100)

# Calculate respiratory component
pf_ratio = variables['pao2_fio2']
on_support = resp_imputed == 1
# Conditions are tested in order and the first match wins; rows where neither ratio
# is available match nothing and score 0.
resp_conditions = [
    (pf_ratio >= 400) | (spo2_fio2 >= 292),
    ((pf_ratio >= 300) & (pf_ratio < 400)) | ((spo2_fio2 >= 264) & (spo2_fio2 < 292)),
    ((pf_ratio >= 200) & (pf_ratio < 300)) | ((spo2_fio2 >= 221) & (spo2_fio2 < 264)),
    (((pf_ratio >= 100) & (pf_ratio < 200)) | ((spo2_fio2 >= 148) & (spo2_fio2 < 221))) & on_support,
    ((pf_ratio < 100) | (spo2_fio2 < 148)) & on_support,
    # Scores 3 and 4 require respiratory support. Without documented support a ratio
    # in those ranges is capped at 2 instead of falling through to 0, which would
    # rank it below milder hypoxemia.
    (pf_ratio < 200) | (spo2_fio2 < 221),
]
resp_scores = [0, 1, 2, 3, 4, 2]
variables['resp_psofa'] = np.select(resp_conditions, resp_scores, default=0)

# Calculate coagulation component
# First matching rule wins, so each rule only needs the bound that separates it
# from the previous one. Missing platelets match nothing and score 0.
platelet_count = variables['platelets']
variables['coag_psofa'] = np.select(
    [platelet_count >= 150, platelet_count >= 100, platelet_count >= 50, platelet_count >= 20, platelet_count < 20],
    [0, 1, 2, 3, 4],
    default=0,
)

# Calculate hepatic component
# Same scheme on total bilirubin; missing values score 0.
bilirubin = variables['bilirubin_total']
variables['hep_psofa'] = np.select(
    [bilirubin < 1.2, bilirubin < 2, bilirubin < 6, bilirubin < 12, bilirubin >= 12],
    [0, 1, 2, 3, 4],
    default=0,
)

# Calculate cardiovascular component (No dobutamine)
age_mo = variables['age_months']
map_obs = variables['map']
dopamine_rate = variables['dopamine']
epi_rate = variables['epinephrine']
norepi_rate = variables['norepinephrine']

# Age-specific lower limit of normal MAP. The first matching band wins; rows with
# unknown age get NaN and therefore never count as hypotensive.
map_floor = np.select(
    [age_mo < 1, age_mo < 12, age_mo < 24, age_mo < 60, age_mo < 144, age_mo <= 216, age_mo > 216],
    [46, 55, 60, 62, 65, 67, 70],
    default=np.nan,
)
low_map = map_obs < map_floor

# Vasoactive tiers. A recorded rate of 0 is not treated as an infusion
# (NOTE(review): confirm how stopped infusions are charted).
high_dose = (dopamine_rate > 15) | (epi_rate > 0.1) | (norepi_rate > 0.1)
mid_dose = (dopamine_rate > 5) | ((epi_rate > 0) & (epi_rate <= 0.1)) | ((norepi_rate > 0) & (norepi_rate <= 0.1))
low_dose = (dopamine_rate > 0) & (dopamine_rate <= 5)

# Test the most severe tier first. Previously the MAP rules were checked before
# the drug rules, so any row with a MAP value could never score above 1, and the
# drug rules were ordered mildest-first, which made the score-4 dopamine rule
# unreachable and let a low dopamine rate hide a high epinephrine/norepinephrine rate.
variables['card_psofa'] = np.select(
    [high_dose, mid_dose, low_dose, low_map],
    [4, 3, 2, 1],
    default=0,
)

# Calculate neurologic component 
# Coma scale total: 15+ -> 0, 13-14 -> 1, 10-12 -> 2, 6-9 -> 3, below 6 -> 4.
# First matching rule wins; a missing score matches nothing and yields 0.
coma_total = variables['coma_scale_total']
variables['neuro_psofa'] = np.select(
    [coma_total >= 15, coma_total >= 13, coma_total >= 10, coma_total >= 6, coma_total < 6],
    [0, 1, 2, 3, 4],
    default=0,
)

# Calculate renal component 
renal_age = variables['age_months']
creat = variables['creatinine']

# (age band in months, creatinine values at which the score steps up to 1, 2, 3, 4).
# Within a band the score is the number of cut-offs the creatinine has reached.
renal_bands = [
    (renal_age < 1, (0.8, 1.0, 1.2, 1.6)),
    ((renal_age >= 1) & (renal_age < 12), (0.3, 0.5, 0.8, 1.2)),
    ((renal_age >= 12) & (renal_age < 24), (0.4, 0.6, 1.1, 1.5)),
    ((renal_age >= 24) & (renal_age < 60), (0.6, 0.9, 1.6, 2.3)),
    ((renal_age >= 60) & (renal_age < 144), (0.7, 1.1, 1.8, 2.6)),
    ((renal_age >= 144) & (renal_age <= 216), (1.0, 1.7, 2.9, 4.2)),
    (renal_age > 216, (1.2, 2.0, 3.5, 5)),
]

# Rows with unknown age or creatinine match no band / no cut-off and stay at 0
renal_score = np.zeros(len(variables), dtype=int)
for in_band, cutoffs in renal_bands:
    cutoffs_reached = sum((creat >= cutoff).values.astype(int) for cutoff in cutoffs)
    renal_score = np.where(in_band.values, cutoffs_reached, renal_score)
variables['renal_psofa'] = renal_score

# Create psofa components list
psofa_comps = [
    'resp_psofa',
    'coag_psofa',
    'hep_psofa',
    'card_psofa',
    'neuro_psofa',
    'renal_psofa',
]

# Drop unnecessary columns
# The drug rates and the support indicator were only needed for the scores above
variables = variables.drop(columns=['epinephrine', 'norepinephrine', 'dopamine', 'resp_indicator'])

In [16]:
# Create flags
# Each flag is 1 where its rule holds and 0 otherwise; a missing measurement or a
# missing age makes every comparison False, i.e. the flag stays 0.
age_d = variables['age_days']
age_y = variables['age_years']

# Age bands shared by several rules
is_neonate = age_d <= 31
is_infant = (age_d > 31) & (age_y < 2)
is_preschool = (age_y >= 2) & (age_y < 6)
is_school_age = (age_y >= 6) & (age_y < 13)
is_teen = (age_y >= 13) & (age_y < 18)

pulse = variables['pulse']
resp_rate = variables['resp']
temp_c = variables['temp']
wbc = variables['wbc']
sbp = variables['bp_sys']

# Flag name -> boolean rule, in the order the columns are added
early_flags = {
    # Flag heart rate abnormalities
    'abnormal_heart_rate': (
        (is_neonate & ((pulse < 100) | (pulse > 190)))
        | (is_infant & ((pulse < 90) | (pulse > 180)))
        | (is_preschool & (pulse > 160))
        | (is_school_age & (pulse > 140))
        | (is_teen & (pulse > 130))
    ),
    # Flag respiratory rate abnormalities
    'abnormal_resp_rate': (
        (is_neonate & (resp_rate > 68))
        | (is_infant & (resp_rate > 58))
        | (is_preschool & (resp_rate > 44))
        | (is_school_age & (resp_rate > 38))
        | (is_teen & (resp_rate > 35))
    ),
    # Flag temperature abnormalities
    'abnormal_temp': (
        (is_neonate & ((temp_c < 36) | (temp_c > 38.3)))
        | ((age_d > 31) & ((temp_c < 36) | (temp_c > 37.9)))
    ),
    # Flag WBC abnormalities
    'abnormal_wbc': (
        ((age_d <= 7) & (wbc > 34))
        | ((age_d > 7) & is_neonate & ((wbc < 5) | (wbc > 19.5)))
        | (is_infant & ((wbc < 5) | (wbc > 17.5)))
        | (is_preschool & ((wbc < 6) | (wbc > 15.5)))
        | (is_school_age & ((wbc < 4.5) | (wbc > 13.5)))
        | (is_teen & ((wbc < 4.5) | (wbc > 11)))
    ),
    # Flag neutrophil band abnormalities
    'abnormal_neut_bands': variables['band_neutrophils'] > 10,
    # Flag blood pressure abnormalities
    'abnormal_bp_sys': (
        (is_neonate & (sbp < 60))
        | ((age_d > 31) & (age_y < 1) & (sbp < 70))
        | ((age_y >= 1) & (age_y <= 10) & (sbp < (70 + (2 * age_y))))
        | ((age_y > 10) & (sbp < 90))
    ),
    # Flag base deficit abnormalities
    'abnormal_base_deficit': variables['base_deficit'] > 5,
    # Flag lactate abnormalities
    'abnormal_lactate': variables['lactic_acid'] > 4,
}
for flag_name, rule_holds in early_flags.items():
    variables[flag_name] = rule_holds.astype('int64')

# Flag two consecutive SpO2 <= 90
# Outcome: 'cons_spo2_below90' is 1 on every row that belongs to a run of at least
# two low readings, where "consecutive" is judged among rows that have an SpO2 value
# (rows without one are removed before the runs are formed, so they do not break a run).
# Step 1: provisional marker, 1 = low reading.
variables['cons_spo2_below90'] = 0
variables.loc[variables['spo2'] <= 90, 'cons_spo2_below90'] = 1
# Step 2: work on the encounters that have at least one low reading.
variables_temp = variables.loc[variables['csn'].isin(variables.loc[variables['cons_spo2_below90'] == 1, 'csn']), ['csn', 'recorded_time', 'spo2', 'cons_spo2_below90']]
variables_temp.dropna(subset=['spo2'], inplace=True)
# Step 3: recode so that low readings become 0 and normal readings become 2. The
# running sum then stays constant across a run of low readings and increases at
# every normal reading.
variables_temp.loc[variables_temp['cons_spo2_below90'] == 0, 'cons_spo2_below90'] = 2
variables_temp.loc[variables_temp['cons_spo2_below90'] == 1, 'cons_spo2_below90'] = 0
# The cumulative sum is computed in time order and assigned back by index label.
variables_temp['cumsum'] = variables_temp.sort_values(['csn', 'recorded_time']).groupby('csn', as_index=False)['cons_spo2_below90'].cumsum()
# Step 4: rows sharing (csn, marker, cumsum) form one run. Each normal reading has
# its own cumsum value and so forms a group of size 1; only runs of two or more low
# readings survive the filter.
variables_temp = variables_temp.sort_values(['csn', 'recorded_time']).groupby(['csn', 'cons_spo2_below90', 'cumsum'], as_index=False).filter(lambda x: len(x) > 1)
# Step 5: reset the flag and set it on the surviving rows, matched by index label
# (this relies on the index of `variables` being unique).
variables['cons_spo2_below90'] = 0
variables.loc[variables.index.isin(variables_temp.index.tolist()), 'cons_spo2_below90'] = 1

# Conditions for the remaining flags, computed from raw columns only.
# Dict order determines the order in which the flag columns are appended.
years_old = variables['age_years']
days_old = variables['age_days']

lab_flag_masks = {
    # FiO2 above 50
    'fio2_above50': variables['fio2'] > 50,
    # Platelets below 80
    'low_platelets': variables['platelets'] < 80,
    # PT above 18.5
    'abnormal_pt': variables['pt'] > 18.5,
    # INR above 2.0
    'abnormal_inr': variables['inr'] > 2.0,
    # Creatinine: >= 1.2 under one year of age, >= 3.0 from one year on
    'elevated_creat': (
        ((years_old < 1) & (variables['creatinine'] >= 1.2))
        | ((years_old >= 1) & (variables['creatinine'] >= 3.0))
    ),
    # ALT: > 156 up to 62 days of age, > 72 afterwards
    'abnormal_alt': (
        ((days_old <= 62) & (variables['alt'] > 156))
        | ((days_old > 62) & (variables['alt'] > 72))
    ),
    # AST: > 148 under one year of age, > 92 from one year up to (not including) 18 years
    'abnormal_ast': (
        ((years_old < 1) & (variables['ast'] > 148))
        | ((years_old >= 1) & (years_old < 18) & (variables['ast'] > 92))
    ),
}

# Rows with missing inputs compare as False and keep the default of 0
for flag_name, flag_mask in lab_flag_masks.items():
    variables[flag_name] = 0
    variables.loc[flag_mask, flag_name] = 1

# Raw columns that are only needed to derive the flags above
variables.drop(columns=['age_days', 'band_neutrophils', 'alt', 'ast', 'pt', 'inr'], inplace=True)

# Names of every binary flag column created in this cell
flags_list = [
    'abnormal_heart_rate', 'abnormal_resp_rate', 'abnormal_temp', 'abnormal_wbc',
    'abnormal_neut_bands', 'abnormal_bp_sys', 'abnormal_base_deficit', 'abnormal_lactate',
    'cons_spo2_below90', 'fio2_above50', 'low_platelets', 'abnormal_pt',
    'abnormal_inr', 'elevated_creat', 'abnormal_alt', 'abnormal_ast',
]
print('Unique CSN total:', variables['csn'].nunique(dropna=False))

Unique CSN total: 35557


In [17]:
# Impute missing data
# Order rows chronologically within each encounter so that forward-fill carries
# the most recent earlier observation; rel_time is only needed for this sort.
variables.sort_values(by=['csn', 'rel_time'], inplace=True)
variables.drop(['rel_time'], axis=1, inplace=True)
vital_signs = ['pulse', 'map', 'bp_sys', 'bp_dias', 'resp', 'spo2', 'temp']
# Everything that is neither a vital sign, a flag nor a pSOFA component
other_vars = [x for x in variables.columns if x not in vital_signs and x not in flags_list and x not in psofa_comps]
# Forward-fill within each encounter. `limit` counts consecutive rows, not hours:
# at most 12 rows for vital signs and 36 rows for the remaining variables.
variables[vital_signs] = variables.groupby(['patid', 'csn', 'dob', 'hosp_adm', 'department'])[vital_signs].ffill(limit=12)
variables[other_vars] = variables.groupby(['patid', 'csn', 'dob', 'hosp_adm', 'department'])[other_vars].ffill(limit=36)
# Snapshot of the data before median imputation, written to the current working directory.
# NOTE(review): this looks like a debugging artifact containing patient-level data - confirm it is still needed.
variables.to_csv('pre_imp.csv', index=False)
# Categorical pupil reactions get an explicit "Not assessed" level instead of a numeric fill
variables.fillna(value={'pupil_left_reaction': 'Not assessed', 'pupil_right_reaction': 'Not assessed'}, inplace=True)
# Fill remaining gaps with the column median over the whole cohort.
# numeric_only=True is required because the frame also holds string and datetime
# columns: on pandas >= 2.0 a bare median() raises TypeError on them, and older
# versions only worked by silently dropping those columns (deprecated behavior).
variables.fillna(variables.median(numeric_only=True), inplace=True)
# Rows that still have a missing value (e.g. in non-numeric columns) are discarded
variables.dropna(inplace=True)
print('Unique CSN total:', len(variables['csn'].unique().tolist()))

Unique CSN total: 35557


### Aggregate data

In [18]:
# Collapse the time series into one row per (patient, encounter, relative day)
print('Aggregating data...')

# Relative day: days elapsed since hospital admission, rounded up
time_since_adm = variables['recorded_time'] - variables['hosp_adm']
variables['rel_day'] = np.ceil(time_since_adm / pd.Timedelta('1 day'))

# Flags and pSOFA components are summarized by their daily maximum
max_cols = [*flags_list, *psofa_comps]

# Numeric variables get the full set of summary statistics, flag/pSOFA columns
# their maximum, and every other column the last value of the day
summary_stats = ['mean', 'median', 'min', 'max', 'std', skew, kurtosis]
agg_dict = {
    col: (list(summary_stats) if col in num_vars else 'max' if col in max_cols else 'last')
    for col in variables.columns
}
variables = (
    variables
    .groupby(['patid', 'csn', 'rel_day'], as_index=False)
    .agg(agg_dict)
    .reset_index(drop=True)
)

# Flatten the two-level column labels: columns aggregated with 'last' and the
# max-only columns keep their plain name, all others become "<variable>_<statistic>"
col_names = [
    name[0] if (name[0] in max_cols or name[1] == 'last') else name[0] + '_' + name[1]
    for name in variables.columns
]
variables.columns = col_names

# Total pSOFA is the sum of the six component scores (added left to right)
psofa_parts = ['resp_psofa', 'coag_psofa', 'hep_psofa', 'card_psofa', 'neuro_psofa', 'renal_psofa']
psofa_total = variables[psofa_parts[0]]
for part in psofa_parts[1:]:
    psofa_total = psofa_total + variables[part]
variables['psofa'] = psofa_total

# The individual components are not kept in the final dataset
variables.drop(columns=psofa_parts, inplace=True)

print('Unique CSN total:', variables['csn'].nunique(dropna=False))

Aggregating data...
Unique CSN total: 35557


### Add demographics

In [19]:
# Load demographics file
demo = pd.read_parquet("/labs/kamaleswaranlab/ECMO/new_data/TAB1_Patients.parquet.gzip")
demo = demo[['Pat ID', 'Gender', 'Race']]
demo.columns = ['patid', 'gender', 'race']
# Keep a single demographics row per patient so the left merge below can never
# multiply the (csn, rel_day) rows of `variables`. No effect if patid is already unique.
demo = demo.drop_duplicates(subset='patid')

# Add race and gender
variables = pd.merge(variables, demo, on='patid', how="left")

# Fix gender
variables['gender'] = variables['gender'].fillna('Unknown')

# Fix race
variables['race'] = variables['race'].fillna('Unknown')

# 1) Harmonize single-valued raw categories first. None of these literals contains ';',
#    so multi-valued entries are untouched by this step.
variables.loc[variables['race'].isin(['Black/African-Amer']), 'race'] = 'Black or African American'
# The original literal ' White,Non-Hipanic' is kept in case the raw data really contains it;
# the correctly spelled variant is added so that such rows are also mapped to 'White'.
variables.loc[variables['race'].isin([' White,Non-Hipanic', 'White,Non-Hispanic', 'White,Hispanic']), 'race'] = 'White'
variables.loc[variables['race'].isin(['Declined', 'Non-White Hispanic', 'Other', 'Other/Declined', 'Patient Not Present', 'Parent Not Present']), 'race'] = 'Unknown'
variables.loc[variables['race'].isin(['American Ind/Alaskan', 'Multi-Racial', 'American Indian or Alaska Native', 'Native Hawaiian or Other Pacific Islander']), 'race'] = 'Other'

# 2) Resolve multi-valued (';'-separated) entries last. Doing this before step 1 would let the
#    raw-'Other' -> 'Unknown' mapping overwrite the 'Other' assigned here, so multi-race patients
#    would end up as 'Unknown' while 'Multi-Racial' patients are 'Other'.
variables.loc[(variables['race'].str.contains(";", case=False)) & (variables['race'].str.contains("declined|unknown", case=False)), 'race'] = 'Unknown'
variables.loc[variables['race'].str.contains(";", case=False), 'race'] = 'Other'
print('Unique CSN total:', len(variables['csn'].unique().tolist()))

Unique CSN total: 35557


### Add medications flags

In [20]:
# Read the filtered medication administrations
meds = pd.read_parquet('/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_models/filtered_meds.parquet.gzip')
med_time_cols = ['dob', 'mar_time']
meds[med_time_cols] = meds[med_time_cols].apply(pd.to_datetime)
meds['csn'] = meds['csn'].astype(int)

# Attach the hospital admission time of each encounter (vars_temp is reused by the cultures cell)
vars_temp = variables[['csn', 'hosp_adm']].drop_duplicates()
meds = pd.merge(meds, vars_temp, how='inner', on='csn')

# Relative day of each administration: days since admission, rounded up
meds['rel_day'] = np.ceil((meds['mar_time'] - meds['hosp_adm']) / pd.Timedelta('1 day'))

# Output flag column -> indicator column in the medications table
med_flag_sources = {
    'on_asthma_meds': 'asthma_meds',
    'on_seizure_meds': 'seizure_meds',
    'on_vasopressors': 'vasopressors',
    'on_antiinf_meds': 'antiinfective_meds',
    'on_insulin': 'insulin',
}
for flag_name in med_flag_sources:
    variables[flag_name] = 0

# For every relative day, mark the encounters that received each medication class on that day
for rel_day in variables['rel_day'].unique().tolist():
    meds_that_day = meds[meds['rel_day'] == rel_day]
    rows_that_day = variables['rel_day'] == rel_day
    for flag_name, source_col in med_flag_sources.items():
        treated_csns = meds_that_day.loc[meds_that_day[source_col] == 1, 'csn'].unique().tolist()
        variables.loc[rows_that_day & (variables['csn'].isin(treated_csns)), flag_name] = 1
print('Unique CSN total:', variables['csn'].nunique(dropna=False))

Unique CSN total: 35557


### Add cultures flags

In [21]:
# Load labs - cultures
cultures = pd.read_parquet('/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_models/filtered_labs.parquet.gzip')
cultures[['order_time', 'result_time']] = cultures[['order_time', 'result_time']].apply(pd.to_datetime)
cultures['csn'] = cultures['csn'].astype(int)
# Only cultures that have a result are considered (list form of `subset` also works on older pandas)
cultures.dropna(subset=['result'], inplace=True)
# Keep procedures whose name mentions "culture". na=False treats a missing procedure name as
# "no match"; without it the mask would contain NaN and the boolean filter would raise.
cultures = cultures[cultures['procedure'].str.contains('culture', case=False, na=False)]

# Add hosp_adm (vars_temp holds the unique csn/hosp_adm pairs built in the medications cell)
cultures = cultures.merge(vars_temp, how='inner', on='csn')

# Calculate relative day of the culture order: days since admission, rounded up
cultures['rel_day'] = np.ceil((cultures['order_time'] - cultures['hosp_adm']) / pd.Timedelta('1 day'))

# Add flag: 1 on the encounter-day rows on which at least one culture was ordered
variables['had_cultures_ordered'] = 0
for rel_day in variables['rel_day'].unique().tolist():
    df = cultures[cultures['rel_day'] == rel_day]
    variables.loc[(variables['rel_day'] == rel_day) & (variables['csn'].isin(df['csn'].unique().tolist())), 'had_cultures_ordered'] = 1
print('Unique CSN total:', len(variables['csn'].unique().tolist()))

Unique CSN total: 35557


### Add diagnoses flags

In [22]:
# Read the flagged admission diagnoses
adm_diag = pd.read_parquet('/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_models/flagged_adm_diag.parquet.gzip')
adm_diag['csn'] = adm_diag['csn'].astype(int)

# Output flag column -> indicator column in the admission diagnoses table.
# Flags are encounter-level: every row of a matching csn is set to 1.
diag_flag_sources = {
    'sepsis_septicemia_diag': 'sepsis_septicemia',
    'septic_shock_diag': 'septic_shock',
    'sickle_cell_diag': 'sickle_cell',
    'dka_diag': 'dka',
    'asthmaticus_diag': 'asthmaticus',
}
for flag_name, source_col in diag_flag_sources.items():
    diagnosed_csns = adm_diag.loc[adm_diag[source_col] == 1, 'csn'].unique().tolist()
    variables[flag_name] = variables['csn'].isin(diagnosed_csns).astype('int64')
print('Unique CSN total:', variables['csn'].nunique(dropna=False))

Unique CSN total: 35557


### Add previous hospitalizations flags

In [23]:
# Read the previous hospitalizations table
prev_hosp = pd.read_parquet('/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_models/previous_hosp.parquet.gzip')
prev_hosp['csn'] = prev_hosp['csn'].astype(int)

# Encounters that appear in the table at all
csns_with_prev_hosp = prev_hosp['csn'].unique().tolist()
variables['prev_hosp'] = variables['csn'].isin(csns_with_prev_hosp).astype('int64')

# Encounters whose table entry has prev_year == 1
csns_with_recent_hosp = prev_hosp.loc[prev_hosp['prev_year'] == 1, 'csn'].unique().tolist()
variables['prev_hosp_prev_year'] = variables['csn'].isin(csns_with_recent_hosp).astype('int64')
print('Unique CSN total:', variables['csn'].nunique(dropna=False))

Unique CSN total: 35557


### Add labels

In [24]:
# Label: 1 for every row of an encounter listed in the infection/Phoenix cohort file, else 0
inf_phoenix = pd.read_csv('/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_screening/cohort_inf_phoenix.csv')
inf_phoenix['csn'] = inf_phoenix['csn'].astype(int)
positive_csns = inf_phoenix['csn'].unique().tolist()
variables['label'] = variables['csn'].isin(positive_csns).astype('int64')

# These two columns are not part of the saved dataset
variables.drop(columns=['department', 'hosp_adm'], inplace=True)

# Write the final dataset
variables.to_parquet('/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_models/dataset_agg_alerts.parquet.gzip', compression='gzip')

In [26]:
# Cohort summary: campus, total encounters, and encounter counts per label
print(campus)
csn_count_by_label = {
    label_value: variables.loc[variables['label'] == label_value, 'csn'].nunique(dropna=False)
    for label_value in (1, 0)
}
print('Unique CSN total:', variables['csn'].nunique(dropna=False))
print('Positive CSNs', csn_count_by_label[1])
print('Controls CSNs', csn_count_by_label[0])

sr
Unique CSN total: 35557
Positive CSNs 2802
Controls CSNs 32755
