In [1]:
# Import libraries and packages
import pandas as pd
import numpy as np
import os
import math
import tableone
from datetime import datetime, timedelta
from scipy import stats
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [14]:
# Screening definition to tabulate. The value is used to build the name of the
# positive-cohort CSV that is loaded and the results sub-folder that is written.
screening_method = "inf_phoenix"

In [15]:
# Directory holding the screening outputs (complete cohort and per-method cohorts).
path = '/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_screening'
# Directory holding the aggregated model datasets (presumably one file per campus: eg / sr).
models_path = '/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_models'

# Complete cohort of encounters
comp_cohort = pd.read_csv(os.path.join(path, 'complete_cohort.csv'))

# Encounters present in the modelling datasets
inf_phoenix_eg = pd.read_parquet(os.path.join(models_path, 'dataset_agg_eg_inf_phoenix.parquet.gzip'))
inf_phoenix_sr = pd.read_parquet(os.path.join(models_path, 'dataset_agg_sr_inf_phoenix.parquet.gzip'))
inf_phoenix = pd.concat([inf_phoenix_eg, inf_phoenix_sr])

# Keep only the encounters that also appear in the modelling datasets
comp_cohort = comp_cohort[comp_cohort['csn'].isin(inf_phoenix['csn'].unique())]

# Take an explicit copy: `data` is mutated from here on, and assigning into a
# column slice of a filtered frame is a chained assignment (the resulting
# SettingWithCopyWarning is hidden by the global warnings filter).
data = comp_cohort[['patid', 'csn', 'dob']].copy()
# Parsed element by element so each value is interpreted independently of the others.
data['dob'] = data['dob'].apply(pd.to_datetime)

In [16]:
# Load the encounters flagged positive by the selected screening method
cohort_pos = pd.read_csv(os.path.join(path, 'cohort_' + screening_method + '.csv'))

# Add sepsis label: 1 when the encounter appears in the positive cohort, else 0
data['Sepsis'] = data['csn'].isin(cohort_pos['csn'].unique()).astype(int)

# Load demographics file
demo = pd.read_parquet("/labs/collab/ECMO/new_data/TAB1_Patients.parquet.gzip")
demo = demo[['Pat ID', 'Gender', 'Race']]
demo.columns = ['patid', 'Gender', 'Race']
# One row per patient: a repeated patid would multiply encounters in the merge below.
demo = demo.drop_duplicates(subset='patid')

# Add race and gender
data = pd.merge(data, demo, on='patid', how="left")

# Fix gender
data['Gender'] = data['Gender'].fillna('Unknown')

# Fix race
race = data['Race'].fillna('Unknown')

# Multi-valued entries (several races separated by ';') are identified on the raw
# values. They are applied last so that the 'Other' label they receive is not
# folded into 'Unknown' by the rule that maps the raw value 'Other' to 'Unknown'.
is_multi = race.str.contains(";", case=False)
is_multi_unknown = is_multi & race.str.contains("declined|unknown", case=False)

# Exact-match relabelling of single-valued raw entries.
race_labels = {
    'Black/African-Amer': 'Black or African American',
    # The first spelling (leading space, 'Hipanic') is kept as-is in case the raw
    # data really contains it; the correctly spelled variant is accepted as well.
    ' White,Non-Hipanic': 'White',
    'White,Non-Hispanic': 'White',
    'White,Hispanic': 'White',
    'Declined': 'Unknown',
    'Non-White Hispanic': 'Unknown',
    'Other': 'Unknown',
    'Other/Declined': 'Unknown',
    'Patient Not Present': 'Unknown',
    'Parent Not Present': 'Unknown',
    'American Ind/Alaskan': 'Other',
    'Multi-Racial': 'Other',
    'American Indian or Alaska Native': 'Other',
    'Native Hawaiian or Other Pacific Islander': 'Other',
}
race = race.map(lambda value: race_labels.get(value, value))

# Multi-valued entries: 'Unknown' when any component is declined/unknown, else 'Other'
race = race.mask(is_multi, 'Other').mask(is_multi_unknown, 'Unknown')
data['Race'] = race

In [18]:
# Read the encounter/department table and keep, per encounter, its first PICU stay
print('Loading encounters...')
dept_path = '/labs/collab/ECMO/new_data/TAB2_Encounter_Departments.parquet.gzip'
dept = pd.read_parquet(dept_path)

# Parse every timestamp column, one column at a time
for ts_col in ['BIRTH_DATE', 'Hosp_Admission', 'Hosp_Discharge', 'Entered_Dept', 'Exited_Dept']:
    dept[ts_col] = pd.to_datetime(dept[ts_col])

# Select the needed columns (in this order) and give them short names
short_names = {
    'Pat ID': 'patid',
    'Encounter CSN': 'csn',
    'Name': 'name',
    'BIRTH_DATE': 'dob',
    'Department': 'department',
    'Entered_Dept': 'entered_dept',
    'Exited_Dept': 'exited_dept',
    'Hosp_Admission': 'hosp_adm',
    'Hosp_Discharge': 'hosp_disch',
}
dept = dept[list(short_names)].rename(columns=short_names)

# PICU department rows of hospitalisations admitted on/after 2010-01-01,
# without missing values or duplicated rows
in_picu = dept['department'].str.contains('PEDIATRIC ICU')
admitted_since_2010 = dept['hosp_adm'] >= '2010-01-01'
dept = dept[in_picu & admitted_since_2010].dropna().drop_duplicates()

# Earliest row of each encounter (ordered by hospital admission, then department entry)
dept_first = (
    dept
    .sort_values(by=['csn', 'hosp_adm', 'entered_dept'])
    .groupby('csn', as_index=False)
    .first()
)

# Add hospital and PICU admission and discharge
stay_columns = ['csn', 'department', 'entered_dept', 'exited_dept', 'hosp_adm', 'hosp_disch']
data = pd.merge(data, dept_first[stay_columns], on='csn', how='inner')

Loading encounters...


In [19]:
# Add age: whole days between date of birth and hospital admission, then years
age_days = ((data['hosp_adm'] - data['dob']) / pd.Timedelta('1 day')).round(0)
age_years = age_days / 365.25
data['Age in years'] = age_years.round(2)

# Compute Age group
# The cut-offs are evaluated on the day count / unrounded years so that each bin
# matches its label: the neonatal bin ends at 28 days (0.083 years would be ~30 days),
# and a child who has reached 3 (or 6) years falls in the older bin, which a
# `<=` comparison on the 2-decimal rounded age does not guarantee.
# Rows matching no condition (including a missing age) get the last label.
age_conditions = [age_days <= 28, age_years < 3, age_years < 6]
age_labels = ['≤ 28 days', '29 days - 2 years', '3 - 5 years']
data['Age Group'] = np.select(age_conditions, age_labels, default='6 - 17 years')

In [20]:
# Compute hospital LOS
hospital_stay = data['hosp_disch'] - data['hosp_adm']
data['Hospital Length of Stay in days'] = hospital_stay / pd.Timedelta('1 day')

# Compute PICU LOS (the hours version is only needed for the composite outcome below)
picu_stay = data['exited_dept'] - data['entered_dept']
data['PICU Length of Stay in days'] = picu_stay / pd.Timedelta('1 day')
data['PICU Length of Stay in hours'] = picu_stay / pd.Timedelta('1 hour')
data.drop(['hosp_disch', 'hosp_adm', 'exited_dept'], axis=1, inplace=True)

# Add mortality: 'Yes' when the hospital discharge disposition is one of the expired codes
depts = pd.read_parquet(dept_path)
depts['Entered_Dept'] = pd.to_datetime(depts['Entered_Dept'])
expired = depts['Hospital_Discharge_Disposition'].isin(['Expired', 'Expired Place Unknown'])
depts['Hospital mortality'] = np.where(expired, 'Yes', 'No')
depts = depts[['Encounter CSN', 'Hospital mortality', 'Department', 'Entered_Dept']]
depts.columns = ['csn', 'Hospital Mortality', 'department', 'entered_dept']
# Identical rows on the selected columns would each match in the merge below and
# multiply encounters, so they are removed first.
depts = depts.drop_duplicates()

# Merge dataframes
data = data.merge(depts, how='inner', on=['csn', 'department', 'entered_dept'])
data.rename(columns={'department':'PICU Campus'}, inplace=True)

# Add composite outcome: in-hospital death or a PICU stay of at least 72 hours
long_picu_stay = data['PICU Length of Stay in hours'] >= 72
died_in_hospital = data['Hospital Mortality'] == 'Yes'
data['Hospital Mortality or PICU Stay >= 72 hours'] = np.where(long_picu_stay | died_in_hospital, 'Yes', 'No')
data.drop('PICU Length of Stay in hours', axis=1, inplace=True)

# Add ethnicity
pats = pd.read_parquet("/labs/collab/ECMO/new_data/TAB1_Patients.parquet.gzip")
pats = pats[['Pat ID', 'Ethnicity']]
pats.columns = ['patid', 'Ethnicity']
# Same guard as above: identical patient rows must not multiply encounters.
pats = pats.drop_duplicates()

# Merge dataframes
# NOTE(review): this is an inner join, so encounters whose patient is absent from
# the patients file are dropped here, whereas race/gender were added with a left join.
data = data.merge(pats, how='inner', on='patid')
data.drop(['entered_dept'], axis=1, inplace=True)

In [21]:
# Attach the severity scores to each encounter (encounters without scores keep NaN)
scores_file = '/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_analysis/scores_24.csv'
score_labels = {
    'phoenix': 'Phoenix',
    'psofa': 'pSOFA',
    'pelod2': 'PELOD II',
    'prism3': 'PRISM III',
}
scores = pd.read_csv(scores_file)
scores = scores[['csn', *score_labels]].rename(columns=score_labels)
data = pd.merge(data, scores, on='csn', how='left')

In [22]:
# Build the grouping column: "<cohort> <sepsis status>" from PICU campus and sepsis label
cohort_by_campus = {'SR PEDIATRIC ICU': 'Validation', 'EG PEDIATRIC ICU': 'Derivation'}
status_by_label = {0: 'No Sepsis', 1: 'Sepsis'}
for sepsis_label, status in status_by_label.items():
    for campus, cohort in cohort_by_campus.items():
        in_group = (data['PICU Campus'] == campus) & (data['Sepsis'] == sepsis_label)
        data.loc[in_group, 'Campus'] = cohort + ' ' + status

# Identifiers and helper columns are not part of the table
data = data.drop(columns=['Sepsis', 'PICU Campus', 'patid', 'csn', 'dob'])

# Fix Ethnicity: placeholder answers count as unknown
placeholder_answers = ['Declined', 'Patient Not Present', 'Parent Not Present']
data['Ethnicity'] = data['Ethnicity'].mask(data['Ethnicity'].isin(placeholder_answers), 'Unknown')

In [None]:
# Build the summary table, stratified by the 'Campus' grouping column
groupby = ['Campus']

# Variables summarised as counts; every other variable is declared non-normal
categorical = [
    'Gender',
    'Race',
    'Age Group',
    'Hospital Mortality',
    'Ethnicity',
    'Hospital Mortality or PICU Stay >= 72 hours',
]

# Every column except the grouping one goes into the table
columns = data.columns.tolist()
columns.remove('Campus')

categorical_names = set(categorical)
nonnormal = [name for name in columns if name not in categorical_names]

data.reset_index(drop=True, inplace=True)

table_options = dict(
    columns=columns,
    categorical=categorical,
    groupby=groupby,
    nonnormal=nonnormal,
    pval=False,
    missing=False,
    htest_name=False,
)
myTable = tableone.TableOne(data, **table_options)
myTable

In [12]:
# Write the table to the results folder of the selected screening method.
# The folder is created first so that a screening method without an existing
# results directory does not make the export fail.
output_dir = os.path.join('/home/dchanci/projects/ped_sepsis_prediction_ml/models/results_updated', screening_method)
os.makedirs(output_dir, exist_ok=True)
myTable.to_csv(os.path.join(output_dir, 'tableone.csv'))