In [1]:
# Import libraries and packages
import pandas as pd
import numpy as np
import os
import math
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import warnings
from functools import reduce
warnings.filterwarnings("ignore")

### Infection subgroup and infection time

In [2]:
# Load labs and keep only culture procedures that carry a result
lab_columns = {
    'Pat ID': 'patid',
    'C MRN': 'mrn',
    'Encounter CSN': 'csn',
    'Order Date': 'order_time',
    'Result Date': 'result_time',
    'Procedure': 'procedure',
    'Component': 'component',
    'Result': 'result',
}
cultures = pd.read_parquet('/labs/kamaleswaranlab/ECMO/new_data/DR15269_LABsAndPFTs.parquet.gzip')
cultures = cultures[list(lab_columns)].rename(columns=lab_columns)
for time_col in ('order_time', 'result_time'):
    cultures[time_col] = pd.to_datetime(cultures[time_col])
# Rows without a result are dropped before the CSN is cast to int
cultures = cultures.dropna(subset='result')
cultures['csn'] = cultures['csn'].astype(int)
# Case-insensitive match on the procedure name; missing procedure names count as "not a culture"
is_culture = cultures['procedure'].str.contains('culture', case=False, na=False)
cultures = cultures[is_culture]

# Load antiinfective medication administrations
antiinf = pd.read_parquet('/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_screening/antiinf_meds.parquet.gzip')
antiinf = antiinf.assign(
    mar_time=antiinf['mar_time'].apply(pd.to_datetime),
    csn=antiinf['csn'].astype(int),
)

# Attach the hospital admission time of each encounter (one row per csn/admission pair)
dept = pd.read_parquet('/labs/kamaleswaranlab/ECMO/new_data/TAB2_Encounter_Departments.parquet.gzip')
dept = dept[['Encounter CSN', 'Hosp_Admission']].rename(
    columns={'Encounter CSN': 'csn', 'Hosp_Admission': 'hosp_adm'}
)
dept['csn'] = dept['csn'].astype(int)
dept['hosp_adm'] = dept['hosp_adm'].apply(pd.to_datetime)
dept = dept.drop_duplicates()
# Left join: administrations without a matching encounter keep a missing hosp_adm
antiinf = pd.merge(antiinf, dept, how='left', on='csn')

# Keep administrations that fall on stay days 1..7 (day = ceil of elapsed days since admission)
one_day = pd.Timedelta('1 day')
stay_day = np.ceil((antiinf['mar_time'] - antiinf['hosp_adm']) / one_day)
antiinf = antiinf[(stay_day > 0) & (stay_day < 8)]

# Pair every administration with every culture order of the same encounter
antiinf = antiinf.merge(cultures[['csn', 'order_time']], how='left', on='csn')

# Whole-day offsets of each administration from admission and from the culture order
for interval_col, start_col in (('days_from_hospadm', 'hosp_adm'), ('days_from_culture', 'order_time')):
    antiinf[interval_col] = np.ceil((antiinf['mar_time'] - antiinf[start_col]) / one_day)

# Criterion 1: an administration on each of hospital days 1, 2 and 3
in_first_three_days = (antiinf['days_from_hospadm'] > 0) & (antiinf['days_from_hospadm'] < 4)
days_hosp = antiinf[in_first_three_days].groupby('csn', as_index=False)['days_from_hospadm'].nunique()
days_hosp = days_hosp[days_hosp['days_from_hospadm'] == 3]

# Criterion 2: for a given culture order, administrations on each of the
# culture-relative days 0, 1 and 2 (i.e. -1 < days_from_culture < 3)
around_culture = (antiinf['days_from_culture'] > -1) & (antiinf['days_from_culture'] < 3)
days_cult = antiinf[around_culture].groupby(['csn', 'order_time'], as_index=False)['days_from_culture'].nunique()
days_cult = days_cult[days_cult['days_from_culture'] == 3]

# Keep encounters meeting either criterion; infection time is the earliest of
# administration time and culture order time across the encounter's rows
qualifying_csns = pd.concat([days_hosp['csn'], days_cult['csn']]).unique()
antiinf = antiinf[antiinf['csn'].isin(qualifying_csns)]
antiinf['inf_time'] = antiinf[['mar_time', 'order_time']].min(axis=1)
# One row per encounter after ordering by inf_time. GroupBy.first() takes the first
# non-null value column by column, so columns other than inf_time may come from different rows.
antiinf = antiinf.sort_values(by=['csn', 'inf_time']).groupby('csn', as_index=False).first()

### PSOFA: data preparation

In [3]:
# Load the long-format measurements and keep only the variables used downstream
psofa_inputs = ['pao2_fio2', 'pao2', 'fio2', 'spo2', 'resp_indicator', 'platelets', 'bilirubin', 'map', 'coma_scale', 'creatinine', 'resp', 'o2_flow', 'weight']
data = pd.read_pickle('/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_screening/variables.pkl')
data = data[data['variable_name'].isin(psofa_inputs)]
for time_col in ('dob', 'recorded_time'):
    data[time_col] = pd.to_datetime(data[time_col])
data['csn'] = data['csn'].astype(int)
# Variable names actually present after filtering
variables = data['variable_name'].unique().tolist()

# Load medication administrations
meds = pd.read_parquet('/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_screening/filtered_meds.parquet.gzip')
for time_col in ('dob', 'mar_time'):
    meds[time_col] = pd.to_datetime(meds[time_col])
meds['csn'] = meds['csn'].astype(int)

# Keep weight-based infusions of the vasoactive drugs for encounters present in `data`.
# The 'epinephrine' pattern also matches norepinephrine.
known_csn = meds['csn'].isin(data['csn'].unique().tolist())
rate_dosed = meds['dose_unit'] == 'mcg/kg/min'
vasoactive = meds['med'].str.contains('epinephrine|dopamine|dobutamine', case=False)
meds = meds[known_csn & rate_dosed & vasoactive]

# Reshape to the same long layout as `data`
meds = meds[['patid', 'csn', 'dob', 'med_id', 'med', 'mar_time', 'dose']].rename(
    columns={'med_id': 'variable_id', 'med': 'variable_name', 'mar_time': 'recorded_time', 'dose': 'value'}
)

# Collapse product names to the drug name. Epinephrine is handled first and must
# exclude norepinephrine; the remaining drugs are then matched in order on the updated names.
mentions_epi = meds['variable_name'].str.contains("epinephrine", case=False)
mentions_norepi = meds['variable_name'].str.contains("norepinephrine", case=False)
meds.loc[mentions_epi & ~mentions_norepi, 'variable_name'] = 'epinephrine'
for drug in ('norepinephrine', 'dopamine', 'dobutamine'):
    meds.loc[meds['variable_name'].str.contains(drug, case=False), 'variable_name'] = drug
meds = meds.reset_index(drop=True)

# Append the infusions to the measurements
data = pd.concat([data, meds])

# Drop rows without a value
data = data.dropna(subset='value')

# Keep values that are plain non-negative decimals: digits with at most one '.'
# (anything with a sign, exponent or text is discarded)
looks_numeric = data['value'].apply(lambda raw: str(raw).replace(".", "", 1).isdigit())
data = data[looks_numeric]
data['value'] = data['value'].astype(float)
data = data.reset_index(drop=True)

# Discard SpO2 readings above 97
spo2_above_97 = (data['variable_name'] == 'spo2') & (data['value'] > 97)
data = data[~spo2_above_97]

# Convert weight from oz to lb (divide by 16, keep 2 decimals)
is_weight = data['variable_name'] == 'weight'
data.loc[is_weight, 'value'] = data.loc[is_weight, 'value'].apply(lambda ounces: round(ounces / 16, 2))

# Load encounter -> (MRN, hospital admission) lookup; duplicates are removed on the raw columns
dept = (
    pd.read_parquet('/labs/kamaleswaranlab/ECMO/new_data/TAB2_Encounter_Departments.parquet.gzip')
    [['Encounter CSN', 'MRN', 'Hosp_Admission']]
    .drop_duplicates()
    .rename(columns={'Encounter CSN': 'csn', 'MRN': 'mrn', 'Hosp_Admission': 'hosp_admission'})
)
dept['csn'] = dept['csn'].astype(int)
dept['hosp_admission'] = pd.to_datetime(dept['hosp_admission'])

# Add hospital admission and MRN; encounters missing from the lookup are dropped (inner join)
data = (
    data.merge(dept, how='inner', on='csn')
    .drop_duplicates()
    .reset_index(drop=True)
    [['patid', 'mrn', 'csn', 'dob', 'hosp_admission', 'variable_id', 'variable_name', 'recorded_time', 'value']]
)

# Age at admission in months: whole days since birth divided by 31, 2 decimals
# NOTE(review): 31 days/month slightly understates age versus a mean month length - confirm intended
age_days = ((data['hosp_admission'] - data['dob']) / pd.Timedelta('1 day')).round(0)
data['age_months'] = (age_days / 31).round(2)
data = data[['patid', 'mrn', 'csn', 'dob', 'age_months', 'hosp_admission', 'variable_id', 'variable_name', 'recorded_time', 'value']]

# Keep measurements recorded on stay days 1..7 (day = ceil of elapsed days since admission)
one_day = pd.Timedelta('1 day')
stay_day = np.ceil((data['recorded_time'] - data['hosp_admission']) / one_day)
data = data[(stay_day > 0) & (stay_day < 8)]

# Load demographics (patient id -> gender)
demo = (
    pd.read_parquet('/labs/kamaleswaranlab/ECMO/new_data/TAB1_Patients.parquet.gzip')
    [['Pat ID', 'Gender']]
    .rename(columns={'Pat ID': 'patid', 'Gender': 'gender'})
    .drop_duplicates()
)

# Add gender; patients without a demographics row are dropped (inner join)
data = (
    data.merge(demo, how='inner', on='patid')
    .drop_duplicates()
    .reset_index(drop=True)
    [['patid', 'mrn', 'csn', 'dob', 'gender', 'age_months', 'hosp_admission', 'variable_id', 'variable_name', 'recorded_time', 'value']]
)

# Add the stay day right after hosp_admission and keep days 1..28
day_position = data.columns.get_loc('hosp_admission') + 1
data.insert(day_position, 'day', np.ceil((data['recorded_time'] - data['hosp_admission']) / one_day))
data = data[(data['day'] > 0) & (data['day'] <= 28.0)]

# Pivot to wide format: one row per encounter/timestamp, one column per variable
# (several values of the same variable at the same timestamp are reduced to their median)
variables = data['variable_name'].unique().tolist()
data = (
    data.drop(columns='variable_id')
    .pivot_table(
        values='value',
        index=['patid', 'mrn', 'csn', 'dob', 'gender', 'age_months', 'hosp_admission', 'day', 'recorded_time'],
        columns='variable_name',
        aggfunc='median',
        fill_value=np.nan,
    )
    .reset_index()
)

# Remove outliers: blank values outside the 1st-99th percentile of each variable.
# The respiratory-support indicator is excluded from trimming.
variables.remove('resp_indicator')
for var in variables:
    lower = np.nanpercentile(data[var], 1.0)
    upper = np.nanpercentile(data[var], 99.0)
    data[var] = data[var].mask((data[var] < lower) | (data[var] > upper))

# Calculate FiO2
# Fill gaps in respiratory rate, weight and O2 flow: carry forward, then backward,
# within each encounter; anything still missing gets the overall median.
for source_col in ('resp', 'weight', 'o2_flow'):
    imputed_col = source_col + '_imputed'
    data[imputed_col] = data.groupby(['csn'])[source_col].ffill()
    data[imputed_col] = data.groupby(['csn'])[imputed_col].bfill()
    data[imputed_col] = data[imputed_col].fillna(data[imputed_col].median())

weight = data['weight_imputed']
resp_rate = data['resp_imputed']
o2_flow = data['o2_flow_imputed']

# Estimated volume per breath from weight and respiratory rate; each rule only
# applies inside its own weight / rate range (and gender for the heaviest group).
# Rows matching no rule keep a missing volume.
heaviest_group = (weight >= 80) & (weight <= 250) & (resp_rate >= 8) & (resp_rate <= 18)
volume_rules = [
    ((weight >= 6) & (weight < 10) & (resp_rate >= 30) & (resp_rate <= 50),
     7.160 - (0.265 * resp_rate) + (2.820 * weight)),
    ((weight >= 10) & (weight < 80) & (resp_rate >= 8) & (resp_rate <= 25),
     154.137 + (3.470 * weight) - (6.861 * resp_rate)),
    (heaviest_group & (data['gender'] == 'Male'),
     466.969 + (2.4 * weight) - (26.342 * resp_rate)),
    (heaviest_group & (data['gender'] == 'Female'),
     456.408 + (1.794 * weight) - (22.716 * resp_rate)),
]
data['vol_calculated'] = np.nan
for applies, volume in volume_rules:
    data.loc[applies, 'vol_calculated'] = volume

# Blend delivered oxygen with room air (21%) over the minute volume.
# NOTE(review): the blended fraction is additionally divided by the O2 flow before
# being scaled to percent - confirm this matches the intended formula.
minute_volume = (data['vol_calculated'] * 0.001) * resp_rate
delivered_o2 = o2_flow * (data['fio2'] / 100)
data['fio2_calculated'] = ((((delivered_o2 + (0.21 * (minute_volume - delivered_o2))) / minute_volume) / o2_flow) * 100).round(2)
data.drop(['resp_imputed', 'weight_imputed', 'o2_flow_imputed'], axis=1, inplace=True)
# FiO2 cannot go below room air
data['fio2_calculated'] = data['fio2_calculated'].clip(lower=21)

# Use correct FiO2
# Find encounters whose apparatus-type records are exclusively annotated as 'Nasal'
mv_fs = pd.read_parquet('/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_screening/mv_indicators_raw.parquet.gzip')
mv_fs = mv_fs[mv_fs['variable_name'] == 'Apparatus Type']
app_type = pd.read_csv('../files/apparatus_type_ann.csv')
annotated_nasal = app_type['nasal'] == 'Nasal'
nasal_values = app_type.loc[annotated_nasal, 'value']
other_values = app_type.loc[~annotated_nasal, 'value']
# NOTE(review): mv_fs['csn'] is not cast to int like the other tables - confirm dtypes match data['csn']
csn_with_nasal = mv_fs.loc[mv_fs['value'].isin(nasal_values), 'csn']
mv_fs = mv_fs[mv_fs['csn'].isin(csn_with_nasal)]
csn_with_other = mv_fs.loc[mv_fs['value'].isin(other_values), 'csn']
mv_fs = mv_fs[~mv_fs['csn'].isin(csn_with_other)]

# Nasal-only encounters use the calculated FiO2; every other encounter keeps the charted one
nasal_only = data['csn'].isin(mv_fs['csn'])
data['fio2_corrected'] = data['fio2_calculated'].where(nasal_only, data['fio2'])
# Keep the charted value as 'fio2_provided' and expose the corrected one as 'fio2'
data = data.rename(columns={'fio2': 'fio2_provided', 'fio2_corrected': 'fio2'})

# Calculate PaO2/FiO2 and SpO2/FiO2 using the last known FiO2 of the encounter (as a fraction)
fio2_fraction = data.groupby(['csn'])['fio2'].ffill() / 100
data['pao2_fio2_calculated'] = data['pao2'] / fio2_fraction
data['spo2_fio2_calculated'] = data['spo2'] / fio2_fraction

# Forward fill resp support within each encounter
data['resp_indicator'] = data.groupby(['csn'])['resp_indicator'].ffill()

# Gather patients with infection
print('Number of CSNs:', data['csn'].nunique())
has_infection = data['csn'].isin(antiinf['csn'])
data = data[has_infection]
print('Number of CSNs with infection:', data['csn'].nunique())
data = data.sort_values(by=['csn', 'recorded_time']).reset_index(drop=True)

Number of CSNs: 63876
Number of CSNs with infection: 11809


### PSOFA: component scores and sepsis labeling

In [5]:
# Calculate component scores
# Each score is picked by the first matching condition; rows matching none
# (including rows where the input is missing) score 0.
score_levels = [0, 1, 2, 3, 4]

# Calculate respiratory component
# Either ratio can qualify a row for a level; levels 3 and 4 additionally require
# the respiratory-support indicator to be 1.
pf_ratio = data['pao2_fio2_calculated']
sf_ratio = data['spo2_fio2_calculated']
on_support = data['resp_indicator'] == 1
data['resp_score'] = np.select(
    [
        (pf_ratio >= 400) | (sf_ratio >= 292),
        ((pf_ratio >= 300) & (pf_ratio < 400)) | ((sf_ratio >= 264) & (sf_ratio < 292)),
        ((pf_ratio >= 200) & (pf_ratio < 300)) | ((sf_ratio >= 221) & (sf_ratio < 264)),
        (((pf_ratio >= 100) & (pf_ratio < 200)) | ((sf_ratio >= 148) & (sf_ratio < 221))) & on_support,
        ((pf_ratio < 100) | (sf_ratio < 148)) & on_support,
    ],
    score_levels,
    default=0,
)

# Calculate coagulation component
platelets = data['platelets']
data['coag_score'] = np.select(
    [
        platelets >= 150,
        (platelets >= 100) & (platelets < 150),
        (platelets >= 50) & (platelets < 100),
        (platelets >= 20) & (platelets < 50),
        platelets < 20,
    ],
    score_levels,
    default=0,
)

# Calculate hepatic component
bilirubin = data['bilirubin']
data['hep_score'] = np.select(
    [
        bilirubin < 1.2,
        (bilirubin >= 1.2) & (bilirubin < 2),
        (bilirubin >= 2) & (bilirubin < 6),
        (bilirubin >= 6) & (bilirubin < 12),
        bilirubin >= 12,
    ],
    score_levels,
    default=0,
)

# Calculate cardiovascular component (No dobutamine)
#
# Score per row, highest matching tier wins:
#   4 - dopamine > 15, or epinephrine > 0.1, or norepinephrine > 0.1
#   3 - dopamine > 5, or epinephrine <= 0.1, or norepinephrine <= 0.1
#   2 - dopamine <= 5
#   1 - MAP below the age-specific floor
#   0 - otherwise (including rows with no MAP and no vasoactive dose)
#
# The tiers are evaluated from 4 down to 1 on purpose. The previous nested
# np.where resolved every row with a MAP value to 0/1 before looking at the
# vasoactive doses, and tested "dopamine > 5" before "dopamine > 15", so a
# dopamine dose above 15 (or a high dose of one drug next to a low dose of
# another) could never reach score 4.
# NOTE(review): a charted dose of 0 still satisfies the "<=" tiers - confirm
# whether zero-rate rows should count as being on the drug.
age = data['age_months']

# reindex() returns an all-NaN column for any vasoactive that never occurs in
# the cohort, so a missing drug scores nothing instead of raising KeyError.
vasoactives = data.reindex(columns=['dopamine', 'epinephrine', 'norepinephrine'])
dopamine = vasoactives['dopamine']
epinephrine = vasoactives['epinephrine']
norepinephrine = vasoactives['norepinephrine']

# Age-specific MAP floor (age in months); first matching band wins.
# A missing age yields a NaN floor, which never flags a low MAP.
map_floor = np.select(
    [age < 1, age < 12, age < 24, age < 60, age < 144, age <= 216, age > 216],
    [46, 55, 60, 62, 65, 67, 70],
    default=np.nan,
)
low_map = data['map'] < map_floor

data['card_score'] = np.select(
    [
        (dopamine > 15) | (epinephrine > 0.1) | (norepinephrine > 0.1),
        (dopamine > 5) | (epinephrine <= 0.1) | (norepinephrine <= 0.1),
        dopamine <= 5,
        low_map,
    ],
    [4, 3, 2, 1],
    default=0,
)

# Calculate neurologic component
# First matching band wins; a missing coma scale matches none and scores 0.
coma_scale = data['coma_scale']
data['neuro_score'] = np.select(
    [
        coma_scale >= 15,
        (coma_scale >= 13) & (coma_scale < 15),
        (coma_scale >= 10) & (coma_scale < 13),
        (coma_scale >= 6) & (coma_scale < 10),
        coma_scale < 6,
    ],
    [0, 1, 2, 3, 4],
    default=0,
)

# Calculate renal component
# The score is the number of age-specific creatinine cutoffs that the value
# reaches (0 when below the first cutoff, 4 when at or above the last).
# Rows with a missing age or creatinine reach no cutoff and score 0.
age = data['age_months']
creatinine = data['creatinine']

# Mutually exclusive age bands, in months
age_bands = [
    age < 1,
    (age >= 1) & (age < 12),
    (age >= 12) & (age < 24),
    (age >= 24) & (age < 60),
    (age >= 60) & (age < 144),
    (age >= 144) & (age <= 216),
    age > 216,
]
# Lower creatinine bound for scores 1, 2, 3 and 4 - one row per age band above
creatinine_cutoffs = [
    (0.8, 1.0, 1.2, 1.6),
    (0.3, 0.5, 0.8, 1.2),
    (0.4, 0.6, 1.1, 1.5),
    (0.6, 0.9, 1.6, 2.3),
    (0.7, 1.1, 1.8, 2.6),
    (1.0, 1.7, 2.9, 4.2),
    (1.2, 2.0, 3.5, 5),
]

renal_score = np.zeros(len(data), dtype=int)
for level in range(4):
    # Cutoff that applies to each row for this score level (NaN when the age is missing)
    row_cutoff = np.select(age_bands, [band[level] for band in creatinine_cutoffs], default=np.nan)
    renal_score += (creatinine >= row_cutoff).to_numpy()
data['renal_score'] = renal_score

In [7]:
# Calculate PSOFA per day
# Daily component score = worst (max) row-level score of that encounter/day;
# PSOFA = sum of the six daily component scores.
# The score columns are selected with a list: selecting several columns from a
# groupby with a bare tuple (['a', 'b', ...] without the inner brackets) raises
# in pandas >= 2.0.
score_cols = ['resp_score', 'coag_score', 'hep_score', 'card_score', 'neuro_score', 'renal_score']
data = data.groupby(['patid', 'mrn', 'csn', 'dob', 'hosp_admission', 'day'], as_index=False)[score_cols].max()
data['psofa'] = data[score_cols].sum(axis=1)

# Add infection time and the stay day it falls on; infection days outside 1..30 are blanked
data = data.merge(antiinf[['csn', 'inf_time']], how='inner', on='csn')
data['inf_day'] = np.ceil((data['inf_time'] - data['hosp_admission']) / pd.Timedelta('1 day'))
data.loc[(data['inf_day'] <= 0) | (data['inf_day'] > 30), 'inf_day'] = np.nan

# Find patients with sepsis: PSOFA >= 2 on any day from two days before the
# infection day (but not before day 1) through one day after it.
# Rows with a blanked inf_day never fall inside the window.
window_start = np.maximum(data['inf_day'] - 2, 1.0)
window_end = data['inf_day'] + 1
in_window = (data['day'] >= window_start) & (data['day'] <= window_end)
data_temp = data[in_window & (data['psofa'] >= 2)]
# Earliest qualifying day per encounter
data_temp = data_temp.sort_values(by=['csn', 'day'], ascending=True).groupby('csn', as_index=False)[['hosp_admission', 'day', 'psofa']].first()
# Sepsis time is admission plus the whole number of stay days, i.e. the end of the qualifying day
data_temp['sepsis_time'] = data_temp['hosp_admission'] + pd.to_timedelta(data_temp['day'], unit='d')
data['sepsis'] = data['csn'].isin(data_temp['csn']).astype(int)

# Add sepsis time (missing for encounters without sepsis)
data = data.merge(data_temp[['csn', 'sepsis_time']], how='left', on='csn')

# Save file
# NOTE(review): this writes gzip-compressed parquet to a path ending in '.csv';
# the name is kept unchanged because other code may read this exact path.
data.to_parquet('/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_screening/screening_inf_psofa.csv', compression='gzip')

In [9]:
# Reduce to one row per encounter-level outcome (daily scores are dropped)
data = (
    data[['patid', 'mrn', 'csn', 'dob', 'inf_time', 'inf_day', 'sepsis', 'sepsis_time']]
    .drop_duplicates()
    .reset_index(drop=True)
)
n_with_sepsis = data.loc[data['sepsis'] == 1, 'csn'].nunique()
n_without_sepsis = data.loc[data['sepsis'] == 0, 'csn'].nunique()
print('Number of suspected infection CSNs with sepsis: {}.'.format(n_with_sepsis))
print('Number of suspected infection CSNs without sepsis: {}.'.format(n_without_sepsis))

Number of suspected infection CSNs with sepsis: 9125.
Number of suspected infection CSNs without sepsis: 2684.


In [11]:
# Save positive cohort: encounters that have a sepsis time
has_sepsis_time = data['sepsis_time'].notna()
data_pos = data[has_sepsis_time]
data_pos.to_csv('/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_screening/cohort_inf_psofa.csv', index=False)