In [1]:
# Import libraries and packages
import pandas as pd
import numpy as np
import os
import math
import tableone
from datetime import datetime, timedelta
from scipy import stats
import matplotlib.pyplot as plt
import warnings

In [2]:
# Root directory for result files; the cells below read and write their CSVs
# under os.path.join(path, screening_method).
path = '/home/dchanci/projects/ped_sepsis_prediction_ml/models/results_updated'
# Screening-method tag: names the results sub-directory and is embedded in the
# names of the input parquet files.
screening_method = 'inf_phoenix'

In [3]:
# Raw feature column -> display label (with units) used in the summary tables.
# Keeping each pair on one line prevents the two derived lists from drifting
# out of alignment; dict insertion order defines the row order of the tables.
feature_labels = {
    'albumin': 'Albumin (g/dL)',
    'base_excess': 'Base Excess (mEq/L)',
    'base_deficit': 'Base Deficit (mEq/L)',
    'pco2': 'Arterial PaCO2 (mm Hg)',
    'po2': 'Arterial PaO2 (mm Hg)',
    'bicarbonate': 'Bicarbonate (mEq/L)',
    'bilirubin_total': 'Bilirubin (mg/dL)',
    'bp_dias': 'Diastolic Blood Pressure (mm Hg)',
    'bp_sys': 'Systolic Blood Pressure (mm Hg)',
    'bun': 'BUN (mg/dL)',
    'calcium': 'Calcium (mg/dL)',
    'calcium_ionized': 'Ionized Calcium (mg/dL)',
    'chloride': 'Chloride (mEq/L)',
    'co2': 'CO2 (mEq/L)',
    'creatinine': 'Creatinine (mg/dL)',
    'fio2': 'FiO2 (%)',
    'glucose': 'Glucose (mg/dL)',
    'hemoglobin': 'Hemoglobin (g/dL)',
    'lactic_acid': 'Lactic Acid (mEq/L)',
    'map': 'Mean Arterial Pressure (mm Hg)',
    'pao2_fio2': 'PaO2/FiO2 Ratio (mmHg)',
    'ph': 'pH',
    'platelets': 'Platelets (x10\u2079/L)',
    'potassium': 'Potassium (mEq/L)',
    'ptt': 'PTT (seconds)',
    'pulse': 'Heart Rate (beats per minute)',
    'pupil_left_size': 'Pupil Left Size (mm)',
    'resp': 'Respiratory Rate (breaths per minute)',
    'sodium': 'Sodium (mEq/L)',
    'spo2': 'SpO2 (%)',
    'temp': 'Temperature (°C)',
    'wbc': 'WBC (x10\u2079/L)',
}

# Column names as stored in the feature files.
col_names = list(feature_labels)

# Matching display labels, in the same order as col_names.
col_names_fixed = list(feature_labels.values())

In [4]:
# Load data
# Read the 'eg' feature pivot for the configured screening method.
features_dir = '/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_models'
features_file = 'features_preimp_pivot_eg_' + screening_method + '.parquet.gzip'
data = pd.read_parquet(os.path.join(features_dir, features_file))

# Keep only the summarised features (in col_names order), then swap the raw
# column names for their display labels.
data = data.loc[:, col_names]
data.columns = col_names_fixed

In [None]:
# Compute statistics
# Per-feature summary of the 'eg' feature table, ignoring missing values.
# The summary is built as a new frame (one row per feature, one column per
# statistic) instead of appending rows to `data`, so that:
#   * `data` is left untouched and the cell can be re-run safely,
#   * the builtins `min` / `max` are not rebound to result lists,
#   * the result does not depend on how the index of `data` is named.
values = data.to_numpy()
summary_eg = pd.DataFrame(
    {
        'Mean': np.nanmean(values, axis=0),
        'Median': np.nanmedian(values, axis=0),
        'Q1': np.nanquantile(values, 0.25, axis=0),
        'Q3': np.nanquantile(values, 0.75, axis=0),
        'Min': np.nanmin(values, axis=0),
        'Max': np.nanmax(values, axis=0),
        # np.nanstd is called with its default ddof (0), as before.
        'Std': np.nanstd(values, axis=0),
    },
    index=data.columns,
).round(2)

# Name the column axis so the displayed header matches the previous layout;
# this name is not written to the CSV.
summary_eg.columns.name = 'Feature'

# The feature labels are written as the (unnamed) first CSV column.
summary_eg.to_csv(os.path.join(path, screening_method, 'features_dist_eg.csv'))
summary_eg

In [6]:
# Load data
# Read the 'sr' feature pivot for the configured screening method.
features_dir = '/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_models'
features_file = 'features_preimp_pivot_sr_' + screening_method + '.parquet.gzip'
data = pd.read_parquet(os.path.join(features_dir, features_file))

# Keep only the summarised features (in col_names order), then swap the raw
# column names for their display labels.
data = data.loc[:, col_names]
data.columns = col_names_fixed

In [None]:
# Compute statistics
# Per-feature summary of the 'sr' feature table, ignoring missing values.
# The summary is built as a new frame (one row per feature, one column per
# statistic) instead of appending rows to `data`, so that:
#   * `data` is left untouched and the cell can be re-run safely,
#   * the builtins `min` / `max` are not rebound to result lists,
#   * the result does not depend on how the index of `data` is named.
values = data.to_numpy()
summary_sr = pd.DataFrame(
    {
        'Mean': np.nanmean(values, axis=0),
        'Median': np.nanmedian(values, axis=0),
        'Q1': np.nanquantile(values, 0.25, axis=0),
        'Q3': np.nanquantile(values, 0.75, axis=0),
        'Min': np.nanmin(values, axis=0),
        'Max': np.nanmax(values, axis=0),
        # np.nanstd is called with its default ddof (0), as before.
        'Std': np.nanstd(values, axis=0),
    },
    index=data.columns,
).round(2)

# Name the column axis so the displayed header matches the previous layout;
# this name is not written to the CSV.
summary_sr.columns.name = 'Feature'

# The feature labels are written as the (unnamed) first CSV column.
summary_sr.to_csv(os.path.join(path, screening_method, 'features_dist_sr.csv'))
summary_sr

In [None]:
# Organize tables
def format_dist_table(csv_path):
    """Load a per-feature statistics CSV and collapse it into display strings.

    The CSV is expected to hold the feature labels in an unnamed first column
    and the numeric columns 'Mean', 'Median', 'Q1', 'Q3', 'Min', 'Max', 'Std'.

    Returns a DataFrame with the columns 'Feature', 'Mean (SD)',
    'Median (Q1, Q3)' and 'Min, Max', one row per feature.
    """
    stats = pd.read_csv(csv_path).rename(columns={'Unnamed: 0': 'Feature'})

    def as_text(column):
        # str() per value, so numbers are rendered exactly as a plain
        # str(value) would render them.
        return stats[column].map(str)

    # Whole-column string concatenation replaces the per-row masked writes.
    return pd.DataFrame({
        'Feature': stats['Feature'],
        'Mean (SD)': as_text('Mean') + ' (' + as_text('Std') + ')',
        'Median (Q1, Q3)': as_text('Median') + ' (' + as_text('Q1') + ', ' + as_text('Q3') + ')',
        'Min, Max': as_text('Min') + ', ' + as_text('Max'),
    })


results_dir = os.path.join(path, screening_method)
dist_eg = format_dist_table(os.path.join(results_dir, 'features_dist_eg.csv'))
dist_sr = format_dist_table(os.path.join(results_dir, 'features_dist_sr.csv'))

# Put both cohorts side by side, one row per feature. `validate` makes the
# merge fail loudly if a feature label is duplicated in either table.
# `dist_eg` is rebound to the combined table, as before.
dist_eg = dist_eg.merge(dist_sr, on='Feature', validate='one_to_one')

# Two-level header: cohort on top ('eg' columns -> Derivation, 'sr' columns ->
# Validation), statistic underneath.
dist_eg.columns = pd.MultiIndex.from_arrays([
    ['Feature', 'Derivation', 'Derivation', 'Derivation', 'Validation', 'Validation', 'Validation'],
    [' ', 'Mean (SD)', 'Median (Q1, Q3)', 'Min, Max', 'Mean (SD)', 'Median (Q1, Q3)', 'Min, Max'],
])
dist_eg.to_csv(os.path.join(results_dir, 'features_dist.csv'), index=False)
dist_eg