In [1]:
# Import libraries and packages
import pandas as pd
import numpy as np
import os
import math
from datetime import datetime, timedelta
from scipy import stats
import matplotlib.pyplot as plt
import warnings
from functools import reduce
warnings.filterwarnings("ignore")

### Infection subgroup

In [2]:
# Read the lab results table and keep only culture procedures that have a result
cultures = pd.read_parquet('/labs/kamaleswaranlab/ECMO/new_data/DR15269_LABsAndPFTs.parquet.gzip')
culture_columns = {
    'Pat ID': 'patid',
    'C MRN': 'mrn',
    'Encounter CSN': 'csn',
    'Order Date': 'order_time',
    'Result Date': 'result_time',
    'Procedure': 'procedure',
    'Component': 'component',
    'Result': 'result',
}
cultures = cultures[list(culture_columns)].rename(columns=culture_columns)
for time_col in ('order_time', 'result_time'):
    cultures[time_col] = pd.to_datetime(cultures[time_col])
# Rows without a result are discarded before the encounter id is cast to int
cultures = cultures.dropna(subset='result')
cultures['csn'] = cultures['csn'].astype(int)
# Case-insensitive match on the procedure name; missing names count as "not a culture"
is_culture = cultures['procedure'].str.contains('culture', case=False, na=False)
cultures = cultures[is_culture]

# Anti-infective medication administrations
antiinf = pd.read_parquet('/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_screening/antiinf_meds.parquet.gzip')
antiinf = antiinf.assign(
    mar_time=antiinf['mar_time'].apply(pd.to_datetime),
    csn=antiinf['csn'].astype(int),
)

# Attach the hospital admission time of each encounter
dept = pd.read_parquet('/labs/kamaleswaranlab/ECMO/new_data/TAB2_Encounter_Departments.parquet.gzip')
dept = dept[['Encounter CSN', 'Hosp_Admission']].rename(
    columns={'Encounter CSN': 'csn', 'Hosp_Admission': 'hosp_adm'}
)
dept = dept.assign(
    csn=dept['csn'].astype(int),
    hosp_adm=dept['hosp_adm'].apply(pd.to_datetime),
).drop_duplicates()
# Left merge: administrations without a matching department row keep a missing hosp_adm
antiinf = antiinf.merge(dept, how='left', on='csn')

# Keep administrations whose (ceiled) hospital day is 1..7
one_day = pd.Timedelta('1 day')
stay_day = np.ceil((antiinf['mar_time'] - antiinf['hosp_adm']) / one_day)
antiinf = antiinf[(stay_day > 0) & (stay_day < 8)]

# Pair every administration with every culture order time of the same encounter
antiinf = antiinf.merge(cultures[['csn', 'order_time']], how='left', on='csn')

# Day offsets (rounded up) relative to admission and to each culture order
antiinf = antiinf.assign(
    days_from_hospadm=lambda d: np.ceil((d['mar_time'] - d['hosp_adm']) / one_day),
    days_from_culture=lambda d: np.ceil((d['mar_time'] - d['order_time']) / one_day),
)

# Encounters with an administration on each of hospital days 1, 2 and 3
in_first_three_days = (antiinf['days_from_hospadm'] > 0) & (antiinf['days_from_hospadm'] < 4)
days_hosp = (
    antiinf[in_first_three_days]
    .groupby('csn', as_index=False)['days_from_hospadm']
    .nunique()
)
days_hosp = days_hosp[days_hosp['days_from_hospadm'] == 3]

# Encounters with an administration on each of culture-relative days 0, 1 and 2,
# i.e. from less than one day before the culture order up to two days after it
around_culture = (antiinf['days_from_culture'] > -1) & (antiinf['days_from_culture'] < 3)
days_cult = (
    antiinf[around_culture]
    .groupby(['csn', 'order_time'], as_index=False)['days_from_culture']
    .nunique()
)
days_cult = days_cult[days_cult['days_from_culture'] == 3]

# Infection time: earliest of administration / culture order among qualifying encounters
qualifies = antiinf['csn'].isin(days_hosp['csn'].unique()) | antiinf['csn'].isin(days_cult['csn'].unique())
antiinf = antiinf[qualifies]
antiinf['inf_time'] = antiinf[['mar_time', 'order_time']].min(axis=1)
antiinf = (
    antiinf.sort_values(by=['csn', 'inf_time'])
    .groupby('csn', as_index=False)
    .first()
)

In [3]:
# Long-format clinical variables
# NOTE(review): read_pickle executes code embedded in the file; only load from a trusted location
data = pd.read_pickle('/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_screening/variables.pkl')
for time_col in ('dob', 'recorded_time'):
    data[time_col] = pd.to_datetime(data[time_col])
data['csn'] = data['csn'].astype(int)
# Variables that are not used by the screening rules below
excluded_variables = ['activity', 'map', 'coma_scale', 'base_excess', 'art_ph', 'cap_ph', 'venous_ph', 'bun_creat', 'bilirubin', 'bun', 'periph_vasc', 'tidal_vol', 'pao2']
data = data[~data['variable_name'].isin(excluded_variables)]

# Medication administrations, restricted to encounters present in `data`
meds = pd.read_parquet('/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_screening/filtered_meds.parquet.gzip')
meds = meds[['patid', 'csn', 'dob', 'med_id', 'med', 'mar_action', 'mar_time']]
for time_col in ('dob', 'mar_time'):
    meds[time_col] = pd.to_datetime(meds[time_col])
meds['csn'] = meds['csn'].astype(int)
meds = meds[meds['csn'].isin(data['csn'].unique())].reset_index(drop=True)

def _is_unsigned_decimal(raw):
    """Return True when str(raw), with its first '.' removed, consists only of digits."""
    return str(raw).replace(".", "", 1).isdigit()


def _fahrenheit_to_celsius(deg_f):
    """Convert one Fahrenheit reading to Celsius, rounded to 2 decimals."""
    return round((deg_f - 32) * (5 / 9), 2)


def _ounces_to_pounds(ounces):
    """Convert one weight in ounces to pounds, rounded to 2 decimals."""
    return round(ounces / 16, 2)


# Drop missing values, then anything that is not a plain non-negative number
data = data.dropna(subset='value')
data = data[data['value'].apply(_is_unsigned_decimal)]
data['value'] = data['value'].astype(float)
data = data.reset_index(drop=True)

# Temperature: F -> C
is_temp = data['variable_name'] == 'temp'
data.loc[is_temp, 'value'] = data.loc[is_temp, 'value'].apply(_fahrenheit_to_celsius)

# Weight: oz -> lb
is_weight = data['variable_name'] == 'weight'
data.loc[is_weight, 'value'] = data.loc[is_weight, 'value'].apply(_ounces_to_pounds)

# Encounter-level table: one (csn, mrn, hospital admission) row per encounter after de-duplication
dept = pd.read_parquet('/labs/kamaleswaranlab/ECMO/new_data/TAB2_Encounter_Departments.parquet.gzip')
dept = dept[['Encounter CSN', 'MRN', 'Hosp_Admission']].rename(
    columns={'Encounter CSN': 'csn', 'MRN': 'mrn', 'Hosp_Admission': 'hosp_admission'}
)
dept['csn'] = dept['csn'].astype(int)
# Duplicates are removed on the raw admission values, before they are parsed as datetimes
dept = dept.drop_duplicates()
dept['hosp_admission'] = pd.to_datetime(dept['hosp_admission'])

# Bring hospital admission and MRN onto every measurement (encounters missing from dept are dropped)
data = (
    data.merge(dept, how='inner', on='csn')
    .drop_duplicates()
    .reset_index(drop=True)
)
data = data[['patid', 'mrn', 'csn', 'dob', 'hosp_admission', 'variable_id', 'variable_name', 'recorded_time', 'value']]

# Age at hospital admission, in whole days and in years (2 decimals)
one_day = pd.Timedelta('1 day')
data['age_days'] = ((data['hosp_admission'] - data['dob']) / one_day).round(0)
data['age_years'] = (data['age_days'] / 365.25).round(2)
data = data[['patid', 'mrn', 'csn', 'dob', 'hosp_admission', 'variable_id', 'variable_name', 'recorded_time', 'value', 'age_days', 'age_years']]

# Keep measurements whose (ceiled) hospital day is 1..7
stay_day = np.ceil((data['recorded_time'] - data['hosp_admission']) / one_day)
data = data[(stay_day > 0) & (stay_day < 8)]

# Patient-level demographics: only gender is needed
demo = pd.read_parquet('/labs/kamaleswaranlab/ECMO/new_data/TAB1_Patients.parquet.gzip')
demo = (
    demo[['Pat ID', 'Gender']]
    .rename(columns={'Pat ID': 'patid', 'Gender': 'gender'})
    .drop_duplicates()
)

# Attach gender (patients missing from demo are dropped by the inner merge)
data = (
    data.merge(demo, how='inner', on='patid')
    .drop_duplicates()
    .reset_index(drop=True)
)
data = data[['patid', 'mrn', 'csn', 'dob', 'gender', 'hosp_admission', 'variable_id', 'variable_name', 'recorded_time', 'value', 'age_days', 'age_years']]

# Long -> wide: one row per (encounter, timestamp), one column per variable (median of duplicates)
variables = data['variable_name'].unique().tolist()
pivot_keys = ['patid', 'mrn', 'csn', 'dob', 'gender', 'hosp_admission', 'recorded_time', 'age_days', 'age_years']
data = (
    data.drop(columns=['variable_id'])
    .pivot_table(values='value', index=pivot_keys, columns='variable_name', aggfunc='median', fill_value=np.nan)
    .reset_index()
)

# Blank out values below the 1st or above the 99th percentile of each variable
# (cap_refill is exempt from trimming)
variables.remove('cap_refill')
for column in variables:
    low_cut, high_cut = np.nanpercentile(data[column], [1.0, 99.0])
    out_of_range = (data[column] < low_cut) | (data[column] > high_cut)
    data.loc[out_of_range, column] = np.nan

# Temperature carried forward within each encounter; remaining gaps take the overall median
temp_filled = data.groupby(['csn'])['temp'].ffill()
temp_filled = temp_filled.fillna(temp_filled.median())
degrees_above_37 = temp_filled - 37

# Subtract the temperature effect from heart rate (10 per degree) ...
data['pulse'] = data['pulse'] - 10 * degrees_above_37
# ... and from respiratory rate (7 per degree under 2 years of age, 5 per degree otherwise)
under_two = data['age_years'] < 2
data.loc[under_two, 'resp'] = data['resp'] - 7 * degrees_above_37
two_or_older = data['age_years'] >= 2
data.loc[two_or_older, 'resp'] = data['resp'] - 5 * degrees_above_37

# Calculate FiO2
# Estimate an FiO2 value from oxygen flow, respiratory rate and weight.
# Inputs are forward-filled within each encounter; values still missing afterwards
# are replaced by the median of the forward-filled column.
data['resp_imputed'] = data.groupby(['csn'])['resp'].ffill()
data['resp_imputed'] = data['resp_imputed'].fillna(data['resp_imputed'].median())
data['weight_imputed'] = data.groupby(['csn'])['weight'].ffill()
data['weight_imputed'] = data['weight_imputed'].fillna(data['weight_imputed'].median())
data['o2_flow_imputed'] = data.groupby(['csn'])['o2_flow'].ffill()
data['o2_flow_imputed'] = data['o2_flow_imputed'].fillna(data['o2_flow_imputed'].median())
# vol_calculated is a linear function of weight and respiratory rate, defined only inside
# the weight / respiratory-rate bands below; rows outside every band keep NaN.
# NOTE(review): presumably a tidal-volume estimate in mL (it is scaled by 0.001 below) and
# the weight bands presumably assume a specific unit — confirm against the source of these coefficients.
data['vol_calculated'] = np.nan
# Band 1: weight in [6, 10), respiratory rate in [30, 50]
data.loc[(data['weight_imputed'] >= 6) & (data['weight_imputed'] < 10) & (data['resp_imputed'] >= 30) & (data['resp_imputed'] <= 50), 'vol_calculated'] = 7.160 - (0.265 * data['resp_imputed']) + (2.820 * data['weight_imputed'])
# Band 2: weight in [10, 80), respiratory rate in [8, 25]
data.loc[(data['weight_imputed'] >= 10) & (data['weight_imputed'] < 80) & (data['resp_imputed'] >= 8) & (data['resp_imputed'] <= 25), 'vol_calculated'] = 154.137 + (3.470 * data['weight_imputed']) - (6.861 * data['resp_imputed'])
# Band 3: weight in [80, 250], respiratory rate in [8, 18]; separate coefficients for 'Male' and 'Female'
data.loc[(data['weight_imputed'] >= 80) & (data['weight_imputed'] <= 250) & (data['resp_imputed'] >= 8) & (data['resp_imputed'] <= 18)  & (data['gender'] == 'Male'), 'vol_calculated'] = 466.969 + (2.4 * data['weight_imputed']) - (26.342 * data['resp_imputed'])
data.loc[(data['weight_imputed'] >= 80) & (data['weight_imputed'] <= 250) & (data['resp_imputed'] >= 8) & (data['resp_imputed'] <= 18) & (data['gender'] == 'Female'), 'vol_calculated'] = 456.408 + (1.794 * data['weight_imputed']) - (22.716 * data['resp_imputed'])
# Blend of delivered oxygen (flow * fio2/100) and 21% for the remainder of the estimated
# minute volume (vol * 0.001 * resp), expressed as a percentage rounded to 2 decimals.
# The result is NaN wherever vol_calculated or fio2 is NaN.
# NOTE(review): the final division by o2_flow_imputed is kept exactly as written — confirm it is intended.
data['fio2_calculated'] = round(((((data['o2_flow_imputed'] * (data['fio2'] / 100)) + (0.21 * (((data['vol_calculated'] * 0.001) * data['resp_imputed']) - (data['o2_flow_imputed'] * (data['fio2'] / 100))))) / ((data['vol_calculated'] * 0.001) * data['resp_imputed'])) / data['o2_flow_imputed']) * 100, 2)
# The imputed helper columns are removed; vol_calculated and fio2_calculated stay in `data`
data.drop(['resp_imputed', 'weight_imputed', 'o2_flow_imputed'], axis=1, inplace=True)
# Floor the estimate at 21
data.loc[data['fio2_calculated'] < 21, 'fio2_calculated'] = 21

# Choose which FiO2 to trust per encounter, based on the recorded apparatus types
mv_fs = pd.read_parquet('/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_screening/mv_indicators_raw.parquet.gzip')
mv_fs = mv_fs[mv_fs['variable_name'] == 'Apparatus Type']
mv_fs['csn'] = mv_fs['csn'].astype(int)
app_type = pd.read_csv('../files/apparatus_type_ann.csv')

# Apparatus values annotated as 'Nasal' versus every other annotation
nasal_values = app_type.loc[app_type['nasal'] == 'Nasal', 'value']
other_values = app_type.loc[~(app_type['nasal'] == 'Nasal'), 'value']

# Keep encounters that have at least one nasal apparatus record ...
csns_with_nasal = mv_fs.loc[mv_fs['value'].isin(nasal_values), 'csn'].unique()
mv_fs = mv_fs[mv_fs['csn'].isin(csns_with_nasal)]
# ... and no record of any other apparatus
csns_with_other = mv_fs.loc[mv_fs['value'].isin(other_values), 'csn'].unique()
mv_fs = mv_fs[~mv_fs['csn'].isin(csns_with_other)]

# Nasal-only encounters use the calculated FiO2; all others keep the charted FiO2
nasal_only = data['csn'].isin(mv_fs['csn'].unique())
data['fio2_corrected'] = data['fio2'].where(~nasal_only, data['fio2_calculated'])
# The charted value is kept as fio2_provided; the chosen value becomes fio2
data.rename(columns={'fio2': 'fio2_provided', 'fio2_corrected': 'fio2'}, inplace=True)

# Restrict to encounters that met the suspected-infection definition
print('Number of CSNs:', data['csn'].nunique())
infected_csns = antiinf['csn'].unique()
data = data[data['csn'].isin(infected_csns)]
print('Number of CSNs with infection:', data['csn'].nunique())

# Chronological order within each encounter, with a clean 0..n-1 index
data = data.sort_values(by=['csn', 'recorded_time']).reset_index(drop=True)

Number of CSNs: 63877
Number of CSNs with infection: 11809


In [5]:
# 0/1 flag: administration of a medication from the bradycardia list (case-insensitive name match)
brad_meds = ['clonidine', 'precedex', 'dexmedetomidine', 'esmolol', 'labetalol']
matches_brad = meds['med'].str.contains('|'.join(brad_meds), case=False, na=False)
meds['bradycardia'] = matches_brad.astype('int64')

# 0/1 flag: administration of an asthma / seizure medication
asth_seiz_meds = ['albuterol', 'dexamethasone', 'epinephrine', 'methylprednisolone', 'magnesium', 'terbutaline',
                  'levalbuterol', 'xopenex', 'lorazepam', 'levetiracetam', 'fosphenytoin', 'phenobarbital']
matches_asth_seiz = meds['med'].str.contains('|'.join(asth_seiz_meds), case=False, na=False)
# 'epinephrine' also matches 'norepinephrine'; those and phenylephrine are excluded explicitly
is_excluded_med = meds['med'].str.contains('norepinephrine|phenylephrine', case=False, na=False)
# MAR actions mentioning 'downtime' are not counted as administrations
is_downtime = meds['mar_action'].str.contains('downtime', case=False, na=False)
meds['asthma_seizure_adm'] = (matches_asth_seiz & ~is_excluded_med & ~is_downtime).astype('int64')

In [6]:
# Define function to ignore 2 hours after asthma and seizure meds:
def meds_2h_ignore(data, meds, var_list):
    """Clear abnormality flags recorded within 2 hours after an asthma/seizure medication.

    A row of `data` is affected when at least one column in `var_list` equals 1 and its
    `recorded_time` falls strictly between 0 and 2 hours after the `mar_time` of any row of
    `meds` with `asthma_seizure_adm == 1` for the same `csn`. Every column in `var_list` is
    set to 0 on affected rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'csn', 'recorded_time' and every column named in `var_list`.
        Modified in place (a temporary 'idx' column is added and removed again).
    meds : pandas.DataFrame
        Must contain 'csn', 'mar_time' and 'asthma_seizure_adm'. A 0/1 column
        'asthma_seizure_ignore' is written to it as a side effect.
    var_list : list of str
        Names of the 0/1 flag columns to clear. Any number of columns is supported;
        an empty list leaves `data` untouched.

    Returns
    -------
    pandas.DataFrame
        The same `data` object that was passed in.
    """
    var_list = list(var_list)
    if not var_list:
        return data

    # Row identity is tracked through the merge via a copy of the index
    data['idx'] = data.index
    req_cols = ['csn', 'recorded_time', 'idx'] + var_list

    # Candidate rows: any of the requested flags is raised
    any_flag_set = data[var_list].eq(1).any(axis=1)
    flagged = data.loc[any_flag_set, req_cols]

    # Mark the asthma/seizure administrations that belong to encounters with candidate rows
    meds['asthma_seizure_ignore'] = 0
    meds.loc[(meds['asthma_seizure_adm'] == 1) & (meds['csn'].isin(flagged['csn'].unique())), 'asthma_seizure_ignore'] = 1
    relevant_meds = meds.loc[meds['asthma_seizure_ignore'] == 1, ['csn', 'mar_time']]

    # Pair each candidate row with every relevant administration of its encounter
    flagged = flagged.merge(relevant_meds, on='csn', how='inner')
    hours_after_med = (flagged['recorded_time'] - flagged['mar_time']) / pd.Timedelta('1 hour')
    suppressed_idx = flagged.loc[(hours_after_med > 0) & (hours_after_med < 2), 'idx'].unique()

    data.loc[data['idx'].isin(suppressed_idx), var_list] = 0
    data.drop(['idx'], axis=1, inplace=True)
    return data

### SIRS

In [7]:
# Shared pieces of the vital-sign rules: time since admission and age bands
hours_since_admission = (data['recorded_time'] - data['hosp_admission']) / pd.Timedelta('1 hour')
past_first_hour = hours_since_admission > 1
age_d = data['age_days']
age_y = data['age_years']
band_0_31d = age_d <= 31
band_31d_2y = (age_d > 31) & (age_y < 2)
band_2_6y = (age_y >= 2) & (age_y < 6)
band_6_13y = (age_y >= 6) & (age_y < 13)
band_13_18y = (age_y >= 13) & (age_y < 18)

# Heart rate: a low pulse only counts for encounters without any bradycardia-list medication
bradycardia_list = meds.loc[meds['bradycardia'] == 1, 'csn'].unique().tolist()
no_brady_meds = ~(data['csn'].isin(bradycardia_list))
pulse = data['pulse']
abnormal_pulse = (
    (band_0_31d & (((pulse < 100) & no_brady_meds) | (pulse > 190)))
    | (band_31d_2y & (((pulse < 90) & no_brady_meds) | (pulse > 180)))
    | (band_2_6y & (pulse > 160))
    | (band_6_13y & (pulse > 140))
    | (band_13_18y & (pulse > 130))
)
data['sirs_hr'] = 0
data.loc[past_first_hour & abnormal_pulse, 'sirs_hr'] = 1

# Respiratory rate: upper limits per age band
resp = data['resp']
abnormal_resp = (
    (band_0_31d & (resp > 68))
    | (band_31d_2y & (resp > 58))
    | (band_2_6y & (resp > 44))
    | (band_6_13y & (resp > 38))
    | (band_13_18y & (resp > 35))
)
data['sirs_resp'] = 0
data.loc[past_first_hour & abnormal_resp, 'sirs_resp'] = 1

# Clear both flags in the two hours following an asthma / seizure medication
data = meds_2h_ignore(data, meds, ['sirs_hr', 'sirs_resp'])

In [8]:
def _with_sickle_flag(frame, text_col):
    """Drop rows where `text_col` is missing and add a 0/1 'sickle' column.

    The flag is 1 when `text_col` contains 'sickle' (case-insensitive).
    """
    frame = frame.dropna(subset=[text_col])
    frame['sickle'] = frame[text_col].str.contains('sickle', case=False, na=False).astype('int64')
    return frame


# Problem list, hospital diagnoses and admitting diagnoses, each flagged for sickle cell disease
problem_list = _with_sickle_flag(
    pd.read_parquet('/labs/kamaleswaranlab/ECMO/new_data/TAB3_Problem_List.parquet.gzip'), 'Problem'
)
hosp_diag = _with_sickle_flag(
    pd.read_parquet('/labs/kamaleswaranlab/ECMO/new_data/TAB4_Hospital_Diagnoses.parquet.gzip'), 'Diagnosis'
)
adm_diag = _with_sickle_flag(
    pd.read_parquet('/labs/kamaleswaranlab/ECMO/new_data/TAB5_Admitting_Diagnoses.parquet.gzip'), 'Diagnosis'
)

In [9]:
age_d = data['age_days']
age_y = data['age_years']

# Temperature: below 36 at any age; upper limit 38.3 up to 31 days of age, 37.9 afterwards
temp = data['temp']
too_cold = temp < 36
abnormal_temp = ((age_d <= 31) & (too_cold | (temp > 38.3))) | ((age_d > 31) & (too_cold | (temp > 37.9)))
data['sirs_temp'] = 0
data.loc[abnormal_temp, 'sirs_temp'] = 1

# WBC: patients flagged for sickle cell disease in any diagnosis source are never flagged
has_sickle = pd.Series(False, index=data.index)
for source in (problem_list, hosp_diag, adm_diag):
    has_sickle = has_sickle | data['patid'].isin(source.loc[source['sickle'] == 1, 'Pat ID'].unique())

wbc = data['wbc']
abnormal_wbc = (
    ((age_d < 7) & (wbc > 34))
    | ((age_d >= 7) & (age_d <= 31) & ((wbc < 5) | (wbc > 19.5)))
    | ((age_d > 31) & (age_y < 2) & ((wbc < 5) | (wbc > 17.5)))
    | ((age_y >= 2) & (age_y < 6) & ((wbc < 6) | (wbc > 15.5)))
    | ((age_y >= 6) & (age_y < 13) & ((wbc < 4.5) | (wbc > 13.5)))
    | ((age_y >= 13) & (age_y < 18) & ((wbc < 4.5) | (wbc > 11)))
)
data['sirs_wbc'] = 0
data.loc[~has_sickle & abnormal_wbc, 'sirs_wbc'] = 1

# Band neutrophils above 10
data['sirs_neutrophil'] = 0
data.loc[data['band_neutrophils'] > 10, 'sirs_neutrophil'] = 1

In [10]:
# Timestamps of abnormal heart rate / respiratory rate
df_pr = (
    data.loc[(data['sirs_hr'] == 1) | (data['sirs_resp'] == 1), ['csn', 'recorded_time']]
    .rename(columns={'recorded_time': 'recorded_time_pr'})
    .sort_values(by=['recorded_time_pr'])
    .reset_index(drop=True)
)

# Timestamps of abnormal temperature / WBC / band neutrophils
df_others = (
    data.loc[(data['sirs_temp'] == 1) | (data['sirs_wbc'] == 1) | (data['sirs_neutrophil'] == 1), ['csn', 'recorded_time']]
    .rename(columns={'recorded_time': 'recorded_time_others'})
    .sort_values(by=['recorded_time_others'])
    .reset_index(drop=True)
)

# Every (HR/RR, other) pair of the same encounter that lies within 24 hours of each other;
# SIRS time is the earlier timestamp of the earliest such pair
df_sirs = df_pr.merge(df_others, on='csn', how='inner')
df_sirs['int'] = ((df_sirs['recorded_time_pr'] - df_sirs['recorded_time_others']) / pd.Timedelta('1 hour')).abs()
df_sirs = df_sirs[df_sirs['int'] <= 24]
df_sirs['sirs_time'] = df_sirs[['recorded_time_pr', 'recorded_time_others']].min(axis=1)
df_sirs = (
    df_sirs.sort_values(by=['csn', 'sirs_time'])
    .groupby('csn', as_index=False)
    .first()
)

# Attach the SIRS time and flag abnormal rows recorded within 24 hours of it
data = data.merge(df_sirs[['csn', 'sirs_time']], how='left', on='csn')
any_criterion = (
    (data['sirs_hr'] == 1)
    | (data['sirs_resp'] == 1)
    | (data['sirs_temp'] == 1)
    | (data['sirs_wbc'] == 1)
    | (data['sirs_neutrophil'] == 1)
)
near_sirs = ((data['recorded_time'] - data['sirs_time']) / pd.Timedelta('1 hour')).abs() <= 24
data['sirs'] = 0
data.loc[any_criterion & near_sirs, 'sirs'] = 1

### Organ Dysfunction

In [12]:
# Respiratory compromise: mechanical ventilation recorded more than 1 hour after admission
# and within 24 hours of the SIRS time
one_hour = pd.Timedelta('1 hour')
on_ventilator = data['mv_indicator'] > 0
past_first_hour = ((data['recorded_time'] - data['hosp_admission']) / one_hour) > 1
near_sirs = ((data['recorded_time'] - data['sirs_time']) / one_hour).abs() <= 24
data['resp_od'] = 0
data.loc[on_ventilator & past_first_hour & near_sirs, 'resp_od'] = 1

# Clear the flag in the two hours following an asthma / seizure medication
data = meds_2h_ignore(data, meds, ['resp_od'])

# History of ventilator dependence: for each patient, every ventilated encounter after the
# first one (ordered by hospital admission) loses the flag
data_temp = data.loc[data['mv_indicator'] > 0, ['patid', 'csn', 'hosp_admission']].drop_duplicates()
data_temp['count'] = (
    data_temp.sort_values(['patid', 'hosp_admission'])
    .groupby('patid', as_index=False)
    .cumcount()
)
repeat_vent_csns = data_temp.loc[data_temp['count'] > 0, 'csn'].unique()
data.loc[data['csn'].isin(repeat_vent_csns), 'resp_od'] = 0

In [13]:
# Find cardiovascular compromise
# Low systolic blood pressure for age, or any vasoactive medication, within 24 hours of SIRS time
one_hour = pd.Timedelta('1 hour')
near_sirs = abs((data['recorded_time'] - data['sirs_time']) / one_hour) <= 24
low_sys_bp = (
    ((data['age_days'] <= 31) & (data['sys_bp'] < 60))
    | ((data['age_days'] > 31) & (data['age_years'] < 1) & (data['sys_bp'] < 70))
    | ((data['age_years'] >= 1) & (data['age_years'] <= 10) & (data['sys_bp'] < (70 + (2 * data['age_years']))))
    | ((data['age_years'] > 10) & (data['sys_bp'] < 90))
)
data['card_od'] = 0
data.loc[(low_sys_bp | (data['vasoactive'] > 0)) & near_sirs, 'card_od'] = 1

# Flag patients with IDDM
problem_list['IDDM'] = 0
problem_list.loc[problem_list['Problem'].str.contains('type 1 diabetes', case=False, na=False), 'IDDM'] = 1
hosp_diag['IDDM'] = 0
hosp_diag.loc[hosp_diag['Diagnosis'].str.contains('type 1 diabetes', case=False, na=False), 'IDDM'] = 1
adm_diag['IDDM'] = 0
adm_diag.loc[adm_diag['Diagnosis'].str.contains('type 1 diabetes', case=False, na=False), 'IDDM'] = 1

# Flag patients administered insulin
meds['insulin'] = 0
meds.loc[meds['med'].str.contains("insulin", case=False, na=False), 'insulin'] = 1

# Base deficit requirement
# The qualifying-row flag is stored in its own column ('base_deficit_flag'). Writing the
# flag into 'base_deficit' itself would replace the measured values with 0 before they are
# compared with the threshold, so the criterion could never be met.
# NOTE(review): assumes the pivoted frame carries a measured 'base_deficit' column; when it
# is absent the criterion is simply never met — confirm the variable name.
if 'base_deficit' in data.columns:
    base_deficit_value = data['base_deficit']
else:
    base_deficit_value = pd.Series(np.nan, index=data.index)

has_iddm = (
    data['patid'].isin(problem_list.loc[problem_list['IDDM'] == 1, 'Pat ID'].unique().tolist())
    | data['patid'].isin(hosp_diag.loc[hosp_diag['IDDM'] == 1, 'Pat ID'].unique().tolist())
    | data['patid'].isin(adm_diag.loc[adm_diag['IDDM'] == 1, 'Pat ID'].unique().tolist())
)
got_insulin = data['csn'].isin(meds.loc[meds['insulin'] == 1, 'csn'].unique().tolist())
past_two_hours = (data['recorded_time'] - data['hosp_admission']) / one_hour > 2

data['base_deficit_flag'] = 0
data.loc[~has_iddm & ~got_insulin & (base_deficit_value > 5) & near_sirs & past_two_hours, 'base_deficit_flag'] = 1

# The base deficit only counts when the same encounter also has elevated lactate or
# prolonged capillary refill within 24 hours of SIRS time
data['lact_cap'] = 0
data.loc[(data['csn'].isin(data.loc[data['base_deficit_flag'] == 1, 'csn'].unique().tolist())) &
         near_sirs &
         ((data['lactic_acid'] > 4) | (data['cap_refill'] > 3)), 'lact_cap'] = 1
data_temp = data.groupby('csn', as_index=False)['lact_cap'].sum()
data_temp = data_temp[data_temp['lact_cap'] == 0]
data.loc[data['csn'].isin(data_temp['csn'].unique().tolist()), 'base_deficit_flag'] = 0

# Cardiovascular compromise is met by either pathway
data['card_od'] = data[['card_od', 'base_deficit_flag']].max(axis=1)
# 'base_deficit' is dropped as well so the resulting columns match what this cell produced before
data.drop(['base_deficit', 'base_deficit_flag', 'lact_cap'], axis=1, inplace=True, errors='ignore')

In [14]:
# Find respiratory not mechanically ventilated compromise

# Consecutive SpO2 requirement
# Step 1: provisional flag for SpO2 <= 90 recorded more than 1 hour after admission and
# within 24 hours of SIRS time.
data['respnomv_od'] = 0
data.loc[(data['spo2'] <= 90) & 
        ((data['recorded_time'] - data['hosp_admission']) / pd.Timedelta('1 hour') > 1) &
        (abs((data['recorded_time'] - data['sirs_time']) / pd.Timedelta('1 hour')) <= 24), 'respnomv_od'] = 1
# Step 2: take every SpO2 reading of the encounters that have at least one provisional flag.
data_temp = data.loc[data['csn'].isin(data.loc[data['respnomv_od'] == 1, 'csn']), ['csn', 'recorded_time', 'spo2', 'respnomv_od']]
data_temp.dropna(subset=['spo2'], inplace=True)
# Step 3: recode so that flagged readings add 0 and unflagged readings add 2 to a running sum.
# The running sum therefore stays constant across a run of consecutive flagged readings and
# changes at every unflagged reading.
data_temp.loc[data_temp['respnomv_od'] == 0, 'respnomv_od'] = 2
data_temp.loc[data_temp['respnomv_od'] == 1, 'respnomv_od'] = 0
data_temp['cumsum'] = data_temp.sort_values(['csn', 'recorded_time']).groupby('csn', as_index=False)['respnomv_od'].cumsum()
# Step 4: keep only groups with more than one row. Unflagged readings always form
# single-row groups, so what survives are runs of at least two consecutive flagged readings.
data_temp = data_temp.sort_values(['csn', 'recorded_time']).groupby(['csn', 'respnomv_od', 'cumsum'], as_index=False).filter(lambda x: len(x) > 1)
# Step 5: rebuild the flag from the surviving rows; this relies on data_temp having kept
# the original index labels of `data`.
data['respnomv_od'] = 0
data.loc[data.index.isin(data_temp.index.tolist()), 'respnomv_od'] = 1

# FiO2 above 50, recorded more than 1 hour after admission and within 24 hours of SIRS time
one_hour = pd.Timedelta('1 hour')
past_first_hour = (data['recorded_time'] - data['hosp_admission']) / one_hour > 1
near_sirs = ((data['recorded_time'] - data['sirs_time']) / one_hour).abs() <= 24
data['fio2_req'] = 0
data.loc[(data['fio2'] > 50) & past_first_hour & near_sirs, 'fio2_req'] = 1

# Clear the FiO2 flag in the two hours following an asthma / seizure medication
data = meds_2h_ignore(data, meds, ['fio2_req'])

# Either the SpO2 rule or the FiO2 rule is enough
data['respnomv_od'] = data[['respnomv_od', 'fio2_req']].max(axis=1)
data.drop(['fio2_req'], axis=1, inplace=True)

# Patients flagged for sickle cell disease in any diagnosis source never meet this criterion
has_sickle = pd.Series(False, index=data.index)
for source in (problem_list, hosp_diag, adm_diag):
    has_sickle = has_sickle | data['patid'].isin(source.loc[source['sickle'] == 1, 'Pat ID'].unique())
data.loc[has_sickle, 'respnomv_od'] = 0

In [15]:
# Find hematologic compromise

# Platelet count decline requirement
# A platelet reading qualifies when it is below half of the highest platelet value of the
# same encounter recorded in the preceding 72 hours (exclusive on both ends) and the
# reading itself lies within 24 hours of SIRS time.
data['hem_od'] = 0
# Column order matters below: positions 1, 2, 3 are recorded_time, platelets, sirs_time.
data_temp = data.loc[~(data['platelets'].isna()), ['csn', 'recorded_time', 'platelets', 'sirs_time']]
data_temp['platelets_req'] = 0
for csn in data_temp['csn'].unique().tolist():
    # All platelet readings of this encounter
    df = data_temp[data_temp['csn'] == csn]
    for i in range(df.shape[0]):
        # Earlier readings taken less than 72 hours before reading i
        df_i = df[(df.iloc[i, 1] - df['recorded_time'] > '0 hours') & (df.iloc[i, 1] - df['recorded_time'] < '72 hours')]
        if df_i.shape[0] > 0:
            # Reading i is under 50% of the prior maximum and within 24 hours of SIRS time
            if (df.iloc[i, 2] < (0.5 * (max(df_i['platelets'])))) and (abs((df.iloc[i, 1] - df.iloc[i, 3]) / pd.Timedelta('1 hour')) <= 24):
                # Mark the row through its original index label
                data_temp.loc[data_temp.index == df.index.tolist()[i], 'platelets_req'] = 1
data_temp = data_temp[data_temp['platelets_req'] == 1]
# Transfer the qualifying rows to `data` by index label
data.loc[data.index.isin(data_temp.index.tolist()), 'hem_od'] = 1
# Repeats the 24-hour check that the loop already applied to each qualifying row
data.loc[(data['hem_od'] == 1) & (abs((data['recorded_time'] - data['sirs_time']) / pd.Timedelta('1 hour')) > 24), 'hem_od'] = 0

near_sirs = ((data['recorded_time'] - data['sirs_time']) / pd.Timedelta('1 hour')).abs() <= 24

# Platelets below 80 within 24 hours of SIRS time
low_platelets = data['platelets'] < 80
data.loc[low_platelets & near_sirs, 'hem_od'] = 1

# Patients listed in the ECMO database lose any flag set so far
# (the coagulation rule below is applied afterwards and can set it again)
ecmo = pd.read_csv('../files/ECMO_database_2010_2022.csv')
ecmo_mrns = ecmo['Medical Record Number (MRN)'].unique()
data.loc[data['mrn'].isin(ecmo_mrns), 'hem_od'] = 0

# PT above 18.5 or INR above 2.0 within 24 hours of SIRS time
abnormal_coagulation = (data['pt'] > 18.5) | (data['inr'] > 2.0)
data.loc[abnormal_coagulation & near_sirs, 'hem_od'] = 1

In [16]:
# Renal compromise, part 1: creatinine at or above the age-specific limit
# (1.2 under one year of age, 3.0 otherwise) within 24 hours of SIRS time
near_sirs = ((data['recorded_time'] - data['sirs_time']) / pd.Timedelta('1 hour')).abs() <= 24
creatinine = data['creatinine']
high_creatinine = ((data['age_years'] < 1) & (creatinine >= 1.2)) | ((data['age_years'] >= 1) & (creatinine >= 3.0))
data['renal_od'] = 0
data.loc[high_creatinine & near_sirs, 'renal_od'] = 1

# Creatinine increase requirement
# Build one baseline creatinine per (patient, encounter): the last numeric CREATININE
# result of that encounter, for patients present in `data`.
labs = pd.read_parquet('/labs/kamaleswaranlab/ECMO/new_data/DR15269_LABsAndPFTs.parquet.gzip')
labs = labs[['Pat ID', 'Encounter CSN', 'Component', 'Result Date', 'Result']]
labs.columns = ['patid', 'csn_baseline', 'variable', 'recorded_time_baseline', 'value']
labs['csn_baseline'] = labs['csn_baseline'].astype(int)
labs = labs[(labs['patid'].isin(data['patid'].unique().tolist())) & (labs['variable'] == 'CREATININE') & ~(labs['value'].isna())]
labs[['recorded_time_baseline']] = labs[['recorded_time_baseline']].apply(pd.to_datetime)
# Keep results that are plain non-negative numbers (digits with at most one '.')
labs = labs[labs['value'].apply(lambda x: str(x).replace(".", "", 1).isdigit())]
labs['value'] = labs['value'].astype(float)
labs.reset_index(inplace=True, drop=True)
# Columns are selected with a list: selecting several GroupBy columns with a bare tuple
# (['a', 'b'] written without the inner brackets) is rejected by pandas >= 2.0.
labs = (
    labs.sort_values(['csn_baseline', 'recorded_time_baseline'])
    .groupby(['patid', 'csn_baseline'], as_index=False)[['recorded_time_baseline', 'value']]
    .last()
)

# Creatinine readings within 24 hours of SIRS time, for patients that have a baseline
one_hour = pd.Timedelta('1 hour')
near_sirs = ((data['recorded_time'] - data['sirs_time']) / one_hour).abs() <= 24
has_baseline = data['patid'].isin(labs['patid'].unique())
candidate_rows = data['creatinine'].notna() & near_sirs & has_baseline
data_temp = data.loc[candidate_rows, ['patid', 'csn', 'recorded_time', 'creatinine']]
# Remember the original row label so the result can be written back to `data`
data_temp = data_temp.assign(idx=data_temp.index)

# Pair each reading with the baselines of the same patient taken in other encounters ...
data_temp = data_temp.merge(labs, on='patid', how='inner')
data_temp = data_temp[data_temp['csn_baseline'] != data_temp['csn']]
# ... that were recorded before the reading
data_temp['interval'] = (data_temp['recorded_time'] - data_temp['recorded_time_baseline']) / one_hour
data_temp = data_temp[data_temp['interval'] > 0]

# Most recent prior baseline per reading (smallest positive interval)
data_temp = (
    data_temp.sort_values(['csn', 'idx', 'interval'])
    .groupby(['idx'], as_index=False)
    .first()
)

# Flag readings that are more than twice their baseline
doubled = data_temp[data_temp['creatinine'] > (2 * data_temp['value'])]
data_temp = doubled
data.loc[data.index.isin(data_temp['idx']), 'renal_od'] = 1

In [17]:
# Hepatic compromise: ALT or AST above the age-specific limit within 24 hours of SIRS time
near_sirs = ((data['recorded_time'] - data['sirs_time']) / pd.Timedelta('1 hour')).abs() <= 24
high_alt = ((data['age_days'] <= 62) & (data['alt'] > 156)) | ((data['age_days'] > 62) & (data['alt'] > 72))
high_ast = (
    ((data['age_years'] < 1) & (data['ast'] > 148))
    | ((data['age_years'] >= 1) & (data['age_years'] < 18) & (data['ast'] > 92))
)
data['hep_od'] = 0
data.loc[(high_alt | high_ast) & near_sirs, 'hep_od'] = 1

In [18]:
# Find OD
# An encounter has organ dysfunction when it ever shows respiratory or cardiovascular
# compromise, or at least two of the four remaining organ flags.
od_columns = ['resp_od', 'card_od', 'respnomv_od', 'hem_od', 'renal_od', 'hep_od']
# Columns are selected with a list: selecting several GroupBy columns with a bare tuple
# is rejected by pandas >= 2.0.
data_temp = data.groupby('csn', as_index=False)[od_columns].max()
data_temp['od'] = 0
data_temp.loc[((data_temp['resp_od'] == 1) | (data_temp['card_od'] == 1)) |
              ((data_temp['respnomv_od'] + data_temp['hem_od'] + data_temp['renal_od'] + data_temp['hep_od']) >= 2), 'od'] = 1
# 'od' is created by this assignment: rows of encounters without organ dysfunction hold NaN, not 0
data.loc[data['csn'].isin(data_temp.loc[data_temp['od'] == 1, 'csn'].unique().tolist()), 'od'] = 1

# Find OD time
# Earliest timestamp, within an encounter with organ dysfunction, at which any single organ flag is raised
data_temp = data[(data['od'] == 1) & data[od_columns].eq(1).any(axis=1)]
data_temp = data_temp.rename(columns={'recorded_time': 'od_time'})
data_temp = data_temp.groupby('csn', as_index=False)['od_time'].min()
data = data.merge(data_temp, how='left', on='csn')

# Sepsis time: the earlier of SIRS time and OD time, defined only when both exist
data['sepsis_time'] = pd.to_datetime(pd.Series(np.nan, index=data.index))
has_sirs_and_od = data['sirs_time'].notna() & data['od_time'].notna()
data.loc[has_sirs_and_od, 'sepsis_time'] = data[['sirs_time', 'od_time']].min(axis=1)

# Keep sepsis times that fall inside the suspected-infection window, expressed in hospital days:
# from two days before the infection day (but not before day 1) to one day after it
one_day = pd.Timedelta('1 day')
data = data.merge(antiinf[['csn', 'inf_time']], on='csn', how='inner')
data['sepsis_day'] = np.ceil((data['sepsis_time'] - data['hosp_admission']) / one_day)
data['inf_day'] = np.ceil((data['inf_time'] - data['hosp_admission']) / one_day)
data = data[(data['inf_day'] > 0) & (data['inf_day'] <= 7)]
window_start = np.maximum(data['inf_day'] - 2, 1.0)
window_end = data['inf_day'] + 1
inside_window = (data['sepsis_day'] >= window_start) & (data['sepsis_day'] <= window_end)
data.loc[~inside_window, 'sepsis_time'] = np.nan
data['sepsis_time'] = pd.to_datetime(data['sepsis_time'])
data.drop(['sepsis_day', 'inf_day'], axis=1, inplace=True)

# Persist the full screening table
data.to_parquet('/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_screening/screening_inf_sirs_od.parquet.gzip', compression='gzip')

In [20]:
# Collapse to one row per distinct combination of identifiers and event times
summary_columns = ['patid', 'mrn', 'csn', 'dob', 'sirs_time', 'od_time', 'sepsis_time']
data = data[summary_columns].drop_duplicates().reset_index(drop=True)

# Count encounters with and without a sepsis time
has_sepsis = data['sepsis_time'].notna()
print('Number of suspected infection CSNs with sepsis: {}.'.format(data.loc[has_sepsis, 'csn'].nunique()))
print('Number of suspected infection CSNs without sepsis: {}.'.format(data.loc[~has_sepsis, 'csn'].nunique()))

Number of suspected infection CSNs with sepsis: 3105.
Number of suspected infection CSNs without sepsis: 8422.


In [22]:
# Write the encounters that have a sepsis time to CSV (without the index column)
data_pos = data[data['sepsis_time'].notna()]
data_pos.to_csv('/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_screening/cohort_inf_sirs_od.csv', index=False)