In [12]:
# Import libraries and packages
import pandas as pd
import numpy as np
import os
import math
import tableone
from datetime import datetime, timedelta
from scipy import stats
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [13]:
screening_method = 'inf_phoenix'

In [14]:
path = '/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_screening'
comp_cohort = pd.read_csv('/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_screening/complete_cohort.csv')
inf_phoenix_eg = pd.read_parquet(os.path.join('/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_models', 'dataset_agg_eg_inf_phoenix.parquet.gzip'))
inf_phoenix_sr = pd.read_parquet(os.path.join('/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_models', 'dataset_agg_sr_inf_phoenix.parquet.gzip'))
inf_phoenix = pd.concat([inf_phoenix_eg, inf_phoenix_sr])
comp_cohort = comp_cohort[comp_cohort['csn'].isin(inf_phoenix['csn'].unique().tolist())]
data = comp_cohort[['patid', 'csn', 'dob']]
data['dob'] = data['dob'].apply(pd.to_datetime)

In [15]:
# Load data
cohort_phoenix = pd.read_csv(os.path.join('/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_screening', 'cohort_inf_phoenix.csv'))
cohort_psofa = pd.read_csv(os.path.join('/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_screening', 'cohort_inf_psofa.csv'))

# Add sepsis label
data['inf_phoenix'] = 0
data['inf_psofa'] = 0
data.loc[data['csn'].isin(cohort_phoenix['csn'].unique().tolist()), 'inf_phoenix'] = 1
data.loc[data['csn'].isin(cohort_psofa['csn'].unique().tolist()), 'inf_psofa'] = 1

# Load demographics file
demo = pd.read_parquet("/labs/kamaleswaranlab/ECMO/new_data/TAB1_Patients.parquet.gzip")
demo = demo[['Pat ID', 'Gender', 'Race']]
demo.columns = ['patid', 'Gender', 'Race']

# Add race and gender
data = pd.merge(data, demo, on='patid', how="left")

# Fix gender
data['Gender'] = data['Gender'].fillna('Unknown')

# Fix race
data['Race'] = data['Race'].fillna('Unknown')
data.loc[(data['Race'].str.contains(";", case=False)) & (data['Race'].str.contains("declined|unknown", case=False)), 'Race'] = 'Unknown'
data.loc[data['Race'].str.contains(";", case=False), 'Race'] = 'Other'
data.loc[data['Race'].isin(['Black/African-Amer']), 'Race'] = 'Black or African American'
data.loc[data['Race'].isin([' White,Non-Hipanic', 'White,Hispanic']), 'Race'] = 'White'
data.loc[data['Race'].isin(['Declined', 'Non-White Hispanic', 'Other', 'Other/Declined', 'Patient Not Present', 'Parent Not Present']), 'Race'] = 'Unknown'
data.loc[data['Race'].isin(['American Ind/Alaskan', 'Multi-Racial', 'American Indian or Alaska Native', 'Native Hawaiian or Other Pacific Islander']), 'Race'] = 'Other'

In [17]:
# Load encounters file
print('Loading encounters...')
dept_path = '/labs/kamaleswaranlab/ECMO/new_data/TAB2_Encounter_Departments.parquet.gzip'
dept = pd.read_parquet(dept_path)
dept[['BIRTH_DATE', 'Hosp_Admission', 'Hosp_Discharge', 'Entered_Dept', 'Exited_Dept']] = dept[['BIRTH_DATE', 'Hosp_Admission', 'Hosp_Discharge', 'Entered_Dept', 'Exited_Dept']].apply(pd.to_datetime)
dept = dept[['Pat ID', 'Encounter CSN', 'Name', 'BIRTH_DATE', 'Department', 'Entered_Dept', 'Exited_Dept', 'Hosp_Admission', 'Hosp_Discharge']]
dept.columns = ['patid', 'csn', 'name', 'dob', 'department', 'entered_dept', 'exited_dept', 'hosp_adm', 'hosp_disch']
dept = dept[(dept['department'].str.contains('PEDIATRIC ICU')) & (dept['hosp_adm'] >= '2010-01-01')]
dept.dropna(inplace=True)
dept.drop_duplicates(inplace=True)

dept_first = dept.sort_values(by=['csn', 'hosp_adm', 'entered_dept'])
dept_first = dept_first.groupby('csn', as_index=False).first()

# Add hospital and PICU admission and discharge
data = data.merge(dept_first[['csn', 'department', 'entered_dept', 'exited_dept', 'hosp_adm', 'hosp_disch']], how='inner', on='csn')

Loading encounters...


In [18]:
# Add age
data['Age in years'] = round(round((data['hosp_adm'] - data['dob']) / pd.Timedelta('1 day'), 0) / 365.25, 2)

# Compute Age group
data['Age Group'] = np.where(data['Age in years'] <= 0.083, '≤ 28 days', np.where(data['Age in years'] <= 3.0, '29 days - 2 years', 
                    np.where(data['Age in years'] <= 6.0, '3 - 5 years', '6 - 17 years')))

In [19]:
# Compute hospital LOS
data['Hospital Length of Stay in days'] = (data['hosp_disch'] - data['hosp_adm']) / pd.Timedelta('1 day')

# Compute PICU LOS
data['PICU Length of Stay in days'] = (data['exited_dept'] - data['entered_dept']) / pd.Timedelta('1 day')
data.drop(['hosp_disch', 'hosp_adm', 'exited_dept'], axis=1, inplace=True)

# Add mortality
depts = pd.read_parquet(dept_path)
depts[['Entered_Dept']] = depts[['Entered_Dept']].apply(pd.to_datetime)
depts['Hospital mortality'] = np.where(depts['Hospital_Discharge_Disposition'] == 'Expired', 'Yes', np.where(depts['Hospital_Discharge_Disposition'] == 'Expired Place Unknown', 'Yes', 'No'))
depts = depts[['Encounter CSN', 'Hospital mortality', 'Department', 'Entered_Dept']]
depts.columns = ['csn', 'Hospital Mortality', 'department', 'entered_dept']

# Merge dataframes
data = data.merge(depts, how='inner', on=['csn', 'department', 'entered_dept'])
data.rename(columns={'department':'PICU Campus'}, inplace=True)

# Add ethnicity
pats = pd.read_parquet("/labs/kamaleswaranlab/ECMO/new_data/TAB1_Patients.parquet.gzip")
pats = pats[['Pat ID', 'Ethnicity']]
pats.columns = ['patid', 'Ethnicity']

# Merge dataframes
data = data.merge(pats, how='inner', on='patid')
data.drop(['entered_dept'], axis=1, inplace=True)

In [20]:
# Add mortality scores
scores = pd.read_csv('/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_analysis/scores_24.csv')
scores = scores[['csn', 'phoenix', 'psofa', 'pelod2', 'prism3']]
scores.columns = ['csn', 'Phoenix', 'pSOFA', 'PELOD II', 'PRISM III']
data = data.merge(scores, how='left', on='csn')

In [21]:
# Organize data
data.loc[(data['PICU Campus'] == 'SR PEDIATRIC ICU') & (data['inf_phoenix'] == 0) & (data['inf_psofa'] == 0), 'Cohort'] = 'Validation No Sepsis'
data.loc[(data['PICU Campus'] == 'EG PEDIATRIC ICU') & (data['inf_phoenix'] == 0) & (data['inf_psofa'] == 0), 'Cohort'] = 'Derivation No Sepsis'
data.loc[(data['PICU Campus'] == 'SR PEDIATRIC ICU') & (data['inf_phoenix'] == 1) & (data['inf_psofa'] == 0), 'Cohort'] = 'Validation inf_phoenix Only'
data.loc[(data['PICU Campus'] == 'EG PEDIATRIC ICU') & (data['inf_phoenix'] == 1) & (data['inf_psofa'] == 0), 'Cohort'] = 'Derivation inf_phoenix Only'
data.loc[(data['PICU Campus'] == 'SR PEDIATRIC ICU') & (data['inf_phoenix'] == 0) & (data['inf_psofa'] == 1), 'Cohort'] = 'Validation inf_psofa Only'
data.loc[(data['PICU Campus'] == 'EG PEDIATRIC ICU') & (data['inf_phoenix'] == 0) & (data['inf_psofa'] == 1), 'Cohort'] = 'Derivation inf_psofa Only'
data.loc[(data['PICU Campus'] == 'SR PEDIATRIC ICU') & (data['inf_phoenix'] == 1) & (data['inf_psofa'] == 1), 'Cohort'] = 'Validation inf_phoenix and inf_psofa'
data.loc[(data['PICU Campus'] == 'EG PEDIATRIC ICU') & (data['inf_phoenix'] == 1) & (data['inf_psofa'] == 1), 'Cohort'] = 'Derivation inf_phoenix and inf_psofa'

data.drop(['inf_phoenix', 'inf_psofa', 'PICU Campus', 'patid', 'csn', 'dob'], axis=1, inplace=True)

# Fix Ethnicity
data.loc[data['Ethnicity'].isin(['Declined', 'Patient Not Present', 'Parent Not Present']), 'Ethnicity'] = 'Unknown'

In [23]:
# Create tableOne
columns = list(data.columns)
columns.remove('Cohort')

categorical = ['Gender', 'Race', 'Age Group', 'Hospital Mortality', 'Ethnicity']
    
nonnormal = [x for x in columns if x not in categorical]

groupby = ['Cohort']

data.reset_index(inplace=True, drop=True)

myTable = tableone.TableOne(data, columns=columns, categorical=categorical, groupby=groupby, nonnormal=nonnormal, pval=False, missing=False, htest_name=False)
myTable

Unnamed: 0_level_0,Unnamed: 1_level_0,Grouped by Cohort,Grouped by Cohort,Grouped by Cohort,Grouped by Cohort,Grouped by Cohort,Grouped by Cohort,Grouped by Cohort,Grouped by Cohort,Grouped by Cohort
Unnamed: 0_level_1,Unnamed: 1_level_1,Overall,Derivation No Sepsis,Derivation inf_phoenix Only,Derivation inf_phoenix and inf_psofa,Derivation inf_psofa Only,Validation No Sepsis,Validation inf_phoenix Only,Validation inf_phoenix and inf_psofa,Validation inf_psofa Only
n,,63875,23750,704,1883,1981,29572,724,2078,3183
"Gender, n (%)",Female,28774 (45.0),10776 (45.4),310 (44.0),851 (45.2),861 (43.5),13227 (44.7),323 (44.6),948 (45.6),1478 (46.4)
"Gender, n (%)",Male,35098 (54.9),12972 (54.6),394 (56.0),1032 (54.8),1120 (56.5),16344 (55.3),401 (55.4),1130 (54.4),1705 (53.6)
"Gender, n (%)",Unknown,3 (0.0),2 (0.0),,,,1 (0.0),,,
"Race, n (%)",Asian,1914 (3.0),568 (2.4),20 (2.8),52 (2.8),50 (2.5),960 (3.2),38 (5.2),82 (3.9),144 (4.5)
"Race, n (%)",Black or African American,25828 (40.4),12525 (52.7),338 (48.0),950 (50.5),1025 (51.7),9200 (31.1),243 (33.6),674 (32.4),873 (27.4)
"Race, n (%)",Other,279 (0.4),107 (0.5),2 (0.3),8 (0.4),2 (0.1),139 (0.5),1 (0.1),7 (0.3),13 (0.4)
"Race, n (%)",Unknown,9056 (14.2),2340 (9.9),81 (11.5),180 (9.6),174 (8.8),5183 (17.5),114 (15.7),367 (17.7),617 (19.4)
"Race, n (%)",White,26798 (42.0),8210 (34.6),263 (37.4),693 (36.8),730 (36.9),14090 (47.6),328 (45.3),948 (45.6),1536 (48.3)
"Age in years, median [Q1,Q3]",,"4.4 [1.1,11.6]","4.9 [1.2,12.1]","6.3 [1.5,12.5]","4.5 [0.9,11.8]","3.7 [1.0,10.3]","4.0 [1.1,11.4]","6.0 [1.8,11.9]","5.8 [1.3,11.7]","3.1 [0.8,9.4]"


In [24]:
myTable.to_csv(os.path.join('/home/dchanci/research/pediatric_sepsis/prediction_ml/models/results_updated', 'tableone_comparison.csv'))