In [1]:
# Import libraries and packages
import pandas as pd
import numpy as np
import os
import math
import tableone
from datetime import datetime, timedelta
from scipy import stats
import matplotlib.pyplot as plt
import warnings

In [2]:
# Output directory for result tables; joined with `name` when the summary CSV is written.
path = '/home/dchanci/research/pediatric_sepsis/prediction_ml/models/results_updated'
# File-name prefix for saved outputs (the distribution table is saved as name + '_24_h_dist.csv').
name = 'inf_phoenix_inf_psofa'

In [3]:
# Screening cohorts (Phoenix- and pSOFA-based files). Their 'csn' columns are
# used further down to restrict the feature tables to encounters in both cohorts.
cohort1, cohort2 = (
    pd.read_csv(cohort_csv)
    for cohort_csv in (
        '/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_screening/cohort_inf_phoenix.csv',
        '/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_screening/cohort_inf_psofa.csv',
    )
)

In [4]:
# Raw feature column -> display label (with units) used in the report tables.
# Keeping both in one ordered mapping guarantees the two lists stay aligned.
feature_labels = {
    'albumin': 'Albumin (g/dL)',
    'base_excess': 'Base Excess (mEq/L)',
    'base_deficit': 'Base Deficit (mEq/L)',
    'pco2': 'Arterial PaCO2 (mm Hg)',
    'po2': 'Arterial PaO2 (mm Hg)',
    'bicarbonate': 'Bicarbonate (mEq/L)',
    'bilirubin_total': 'Bilirubin (mg/dL)',
    'bp_dias': 'Diastolic Blood Pressure (mm Hg)',
    'bp_sys': 'Systolic Blood Pressure (mm Hg)',
    'bun': 'BUN (mg/dL)',
    'calcium': 'Calcium (mg/dL)',
    'calcium_ionized': 'Ionized Calcium (mg/dL)',
    'chloride': 'Chloride (mEq/L)',
    'co2': 'CO2 (mEq/L)',
    'creatinine': 'Creatinine (mg/dL)',
    'fio2': 'FiO2 (%)',
    'glucose': 'Glucose (mg/dL)',
    'hemoglobin': 'Hemoglobin (g/dL)',
    'lactic_acid': 'Lactic Acid (mEq/L)',
    'map': 'Mean Arterial Pressure (mm Hg)',
    'pao2_fio2': 'PaO2/FiO2 Ratio (mmHg)',
    'ph': 'pH',
    'platelets': 'Platelets (x10\u2079/L)',
    'potassium': 'Potassium (mEq/L)',
    'ptt': 'PTT (seconds)',
    'pulse': 'Heart Rate (beats per minute)',
    'pupil_left_size': 'Pupil Left Size (mm)',
    'resp': 'Respiratory Rate (breaths per minute)',
    'sodium': 'Sodium (mEq/L)',
    'spo2': 'SpO2 (%)',
    'temp': 'Temperature (°C)',
    'wbc': 'WBC (x10\u2079/L)',
}

# Column names as stored in the feature tables, in report order.
col_names = list(feature_labels)

# Human-readable labels, positionally matched to `col_names`.
col_names_fixed = list(feature_labels.values())

In [5]:
# Load the pre-imputation feature table, keep only encounters whose csn is in
# BOTH screening cohorts, select the report features and apply display labels.
dist_eg = (
    pd.read_parquet('/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_analysis/features_preimp_pivot_24_eg.parquet.gzip')
    .loc[
        lambda features: features['csn'].isin(cohort1['csn'].unique().tolist())
        & features['csn'].isin(cohort2['csn'].unique().tolist()),
        col_names,
    ]
    .set_axis(col_names_fixed, axis=1)
)

In [6]:
# Compute NaN-aware summary statistics for every feature column of `dist_eg`.
#
# The result has one row per feature with the columns
# Feature, Mean, Median, Q1, Q3, Min, Max, Std (all rounded to 2 decimals).
# It is built directly instead of appending label rows to the data, and it no
# longer rebinds the builtins `min` / `max`, which broke them kernel-wide.
#
# NOTE: `dist_eg` is rebound from the raw feature matrix to the summary table,
# so the loading cell must be re-run before this cell is run again.
feature_values = dist_eg.to_numpy(dtype=float)  # rows = observations, columns = features

dist_eg = (
    pd.DataFrame(
        {
            'Mean': np.nanmean(feature_values, axis=0),
            'Median': np.nanmedian(feature_values, axis=0),
            'Q1': np.nanquantile(feature_values, 0.25, axis=0),
            'Q3': np.nanquantile(feature_values, 0.75, axis=0),
            'Min': np.nanmin(feature_values, axis=0),
            'Max': np.nanmax(feature_values, axis=0),
            # NOTE(review): np.nanstd defaults to ddof=0 (population SD); kept
            # as in the original analysis - confirm this is the intended SD.
            'Std': np.nanstd(feature_values, axis=0),
        },
        index=dist_eg.columns,  # feature labels become the row labels
    )
    .round(2)
    .rename_axis('Feature')
    .reset_index()
)
dist_eg

Feature,Feature.1,Mean,Median,Q1,Q3,Min,Max,Std
0,Albumin (g/dL),2.69,2.7,2.3,3.1,1.3,4.9,0.62
1,Base Excess (mEq/L),2.9,2.0,0.0,4.0,0.0,20.0,3.47
2,Base Deficit (mEq/L),5.69,5.0,3.0,7.0,1.0,26.0,4.04
3,Arterial PaCO2 (mm Hg),44.89,42.95,36.4,51.5,17.0,85.8,12.07
4,Arterial PaO2 (mm Hg),77.61,58.0,41.0,90.0,21.0,383.0,57.05
5,Bicarbonate (mEq/L),22.41,22.3,19.6,25.1,4.8,39.2,4.89
6,Bilirubin (mg/dL),1.29,0.5,0.3,1.0,0.1,16.5,2.3
7,Diastolic Blood Pressure (mm Hg),57.39,56.0,47.0,66.0,31.0,100.0,14.22
8,Systolic Blood Pressure (mm Hg),100.28,99.0,88.0,112.0,64.0,150.0,17.35
9,BUN (mg/dL),16.99,12.0,8.0,19.0,2.0,97.0,15.45


In [7]:
# Load the pre-imputation feature table, keep only encounters whose csn is in
# BOTH screening cohorts, select the report features and apply display labels.
dist_sr = (
    pd.read_parquet('/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_analysis/features_preimp_pivot_24_sr.parquet.gzip')
    .loc[
        lambda features: features['csn'].isin(cohort1['csn'].unique().tolist())
        & features['csn'].isin(cohort2['csn'].unique().tolist()),
        col_names,
    ]
    .set_axis(col_names_fixed, axis=1)
)

In [8]:
# Compute NaN-aware summary statistics for every feature column of `dist_sr`.
#
# The result has one row per feature with the columns
# Feature, Mean, Median, Q1, Q3, Min, Max, Std (all rounded to 2 decimals).
# It is built directly instead of appending label rows to the data, and it no
# longer rebinds the builtins `min` / `max`, which broke them kernel-wide.
#
# NOTE: `dist_sr` is rebound from the raw feature matrix to the summary table,
# so the loading cell must be re-run before this cell is run again.
sr_feature_values = dist_sr.to_numpy(dtype=float)  # rows = observations, columns = features

dist_sr = (
    pd.DataFrame(
        {
            'Mean': np.nanmean(sr_feature_values, axis=0),
            'Median': np.nanmedian(sr_feature_values, axis=0),
            'Q1': np.nanquantile(sr_feature_values, 0.25, axis=0),
            'Q3': np.nanquantile(sr_feature_values, 0.75, axis=0),
            'Min': np.nanmin(sr_feature_values, axis=0),
            'Max': np.nanmax(sr_feature_values, axis=0),
            # NOTE(review): np.nanstd defaults to ddof=0 (population SD); kept
            # as in the original analysis - confirm this is the intended SD.
            'Std': np.nanstd(sr_feature_values, axis=0),
        },
        index=dist_sr.columns,  # feature labels become the row labels
    )
    .round(2)
    .rename_axis('Feature')
    .reset_index()
)
dist_sr

Feature,Feature.1,Mean,Median,Q1,Q3,Min,Max,Std
0,Albumin (g/dL),2.76,2.7,2.3,3.2,1.6,4.7,0.6
1,Base Excess (mEq/L),2.71,2.0,0.0,4.0,0.0,16.0,3.08
2,Base Deficit (mEq/L),5.6,5.0,3.0,7.0,1.0,25.5,4.06
3,Arterial PaCO2 (mm Hg),43.93,41.8,35.5,50.6,17.4,85.9,12.21
4,Arterial PaO2 (mm Hg),72.64,62.0,46.0,84.0,22.0,313.0,41.42
5,Bicarbonate (mEq/L),22.37,22.38,19.6,25.2,4.9,37.4,4.81
6,Bilirubin (mg/dL),0.85,0.4,0.2,0.7,0.1,14.7,1.61
7,Diastolic Blood Pressure (mm Hg),55.92,55.0,45.0,66.0,29.0,97.0,14.4
8,Systolic Blood Pressure (mm Hg),99.79,99.0,89.0,110.0,66.0,144.0,15.58
9,BUN (mg/dL),13.94,11.0,7.0,16.0,2.0,66.0,10.98


In [9]:
# Organize tables
def _format_summary(stats):
    """Return `stats` with its numeric summary columns collapsed into display strings.

    `stats` must provide the columns 'Mean', 'Std', 'Median', 'Q1', 'Q3', 'Min'
    and 'Max'. They are replaced by the text columns 'Mean (SD)',
    'Median (Q1, Q3)' and 'Min, Max'; every other column (e.g. 'Feature') is
    kept. The input frame is not modified.
    """
    stat_cols = ['Mean', 'Median', 'Q1', 'Q3', 'Min', 'Max', 'Std']
    # str() per value keeps the plain float text (2.7 -> '2.7'), exactly as the
    # previous row-by-row loop produced it, but for whole columns at once.
    text = {col: stats[col].map(str) for col in stat_cols}
    formatted = stats.drop(columns=stat_cols)
    formatted['Mean (SD)'] = text['Mean'] + ' (' + text['Std'] + ')'
    formatted['Median (Q1, Q3)'] = text['Median'] + ' (' + text['Q1'] + ', ' + text['Q3'] + ')'
    formatted['Min, Max'] = text['Min'] + ', ' + text['Max']
    return formatted


dist_eg = _format_summary(dist_eg)
dist_sr = _format_summary(dist_sr)

# Side-by-side table: one row per feature, `dist_eg` columns first (labelled
# Derivation), then `dist_sr` columns (labelled Validation).
dist_eg = dist_eg.merge(dist_sr, on='Feature')
# The merge yields Feature + 3 left + 3 right columns; relabel them positionally
# with a two-level header.
dist_eg.columns = pd.MultiIndex.from_arrays([
    ['Feature', 'Derivation', 'Derivation', 'Derivation', 'Validation', 'Validation', 'Validation'],
    [' ', 'Mean (SD)', 'Median (Q1, Q3)', 'Min, Max', 'Mean (SD)', 'Median (Q1, Q3)', 'Min, Max'],
])
dist_eg.to_csv(os.path.join(path, name + '_24_h_dist.csv'), index=False)
dist_eg

Unnamed: 0_level_0,Feature,Derivation,Derivation,Derivation,Validation,Validation,Validation
Unnamed: 0_level_1,Unnamed: 1_level_1,Mean (SD),"Median (Q1, Q3)","Min, Max",Mean (SD),"Median (Q1, Q3)","Min, Max"
0,Albumin (g/dL),2.69 (0.62),"2.7 (2.3, 3.1)","1.3, 4.9",2.76 (0.6),"2.7 (2.3, 3.2)","1.6, 4.7"
1,Base Excess (mEq/L),2.9 (3.47),"2.0 (0.0, 4.0)","0.0, 20.0",2.71 (3.08),"2.0 (0.0, 4.0)","0.0, 16.0"
2,Base Deficit (mEq/L),5.69 (4.04),"5.0 (3.0, 7.0)","1.0, 26.0",5.6 (4.06),"5.0 (3.0, 7.0)","1.0, 25.5"
3,Arterial PaCO2 (mm Hg),44.89 (12.07),"42.95 (36.4, 51.5)","17.0, 85.8",43.93 (12.21),"41.8 (35.5, 50.6)","17.4, 85.9"
4,Arterial PaO2 (mm Hg),77.61 (57.05),"58.0 (41.0, 90.0)","21.0, 383.0",72.64 (41.42),"62.0 (46.0, 84.0)","22.0, 313.0"
5,Bicarbonate (mEq/L),22.41 (4.89),"22.3 (19.6, 25.1)","4.8, 39.2",22.37 (4.81),"22.38 (19.6, 25.2)","4.9, 37.4"
6,Bilirubin (mg/dL),1.29 (2.3),"0.5 (0.3, 1.0)","0.1, 16.5",0.85 (1.61),"0.4 (0.2, 0.7)","0.1, 14.7"
7,Diastolic Blood Pressure (mm Hg),57.39 (14.22),"56.0 (47.0, 66.0)","31.0, 100.0",55.92 (14.4),"55.0 (45.0, 66.0)","29.0, 97.0"
8,Systolic Blood Pressure (mm Hg),100.28 (17.35),"99.0 (88.0, 112.0)","64.0, 150.0",99.79 (15.58),"99.0 (89.0, 110.0)","66.0, 144.0"
9,BUN (mg/dL),16.99 (15.45),"12.0 (8.0, 19.0)","2.0, 97.0",13.94 (10.98),"11.0 (7.0, 16.0)","2.0, 66.0"
