In [1]:
# Import libraries and packages
import pandas as pd
import numpy as np
import os
import math
from datetime import datetime, timedelta
from scipy import stats
from scipy.stats import skew, kurtosis
import matplotlib.pyplot as plt
import warnings
import random
warnings.filterwarnings("ignore")

In [2]:
def _psofa_age_lookup(age_months, values):
    """Pick, for every row, the entry of ``values`` that belongs to its pSOFA age band.

    ``values`` holds seven numbers ordered by band: <1, 1-11, 12-23, 24-59,
    60-143, 144-216 and >216 months.  Rows whose age is missing get NaN, so any
    later comparison against the result is False for them.
    """
    bands = [
        age_months < 1,
        age_months < 12,
        age_months < 24,
        age_months < 60,
        age_months < 144,
        age_months <= 216,
        age_months > 216,
    ]
    # np.select returns the choice of the FIRST true condition, so the
    # cumulative "<" tests above behave like mutually exclusive bands.
    return np.select(bands, [float(v) for v in values], default=np.nan)


def get_psofa (variables):
    """Compute the pSOFA score of every encounter.

    Each row of ``variables`` is scored on six organ components (respiratory,
    coagulation, hepatic, cardiovascular, neurologic, renal).  For each
    encounter the worst (maximum) value of every component over all of its rows
    is kept, and the six maxima are summed.

    Parameters
    ----------
    variables : pandas.DataFrame
        Must contain the columns ``csn``, ``fio2``, ``pao2``, ``spo2``,
        ``resp_indicator``, ``platelets``, ``bilirubin``, ``age_months``,
        ``map``, ``dopamine``, ``epinephrine``, ``norepinephrine``,
        ``coma_scale`` and ``creatinine``.  The input is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per ``csn`` with the columns ``csn`` and ``psofa``.

    Notes
    -----
    * Missing measurements contribute 0 points (every comparison with NaN is
      False).
    * Conditions are evaluated from the most severe score downwards.  The
      previous implementation evaluated them from 0 upwards, which meant that a
      row with any MAP value could never receive the vasoactive scores 2-4,
      that dopamine > 15 was scored 3 instead of 4, and that a low
      oxygenation ratio without respiratory support fell through to 0.
    * ``fio2`` is forward-filled per ``csn`` in the row order of the input, so
      the rows of an encounter are expected to be in chronological order.
    * NOTE(review): ``fio2`` is divided by 100, i.e. it is assumed to be
      recorded in percent - confirm against the extraction.
    """
    df = variables.copy()

    # Oxygenation ratios, using the last known FiO2 of the same encounter.
    fio2_fraction = df.groupby(['csn'])['fio2'].ffill() / 100
    pf = df['pao2'] / fio2_fraction
    sf = df['spo2'] / fio2_fraction

    # Respiratory: scores 3 and 4 require respiratory support; without it the
    # component is capped at 2.
    supported = df['resp_indicator'] == 1
    resp_score = np.select(
        [
            ((pf < 100) | (sf < 148)) & supported,
            ((pf < 200) | (sf < 221)) & supported,
            (pf < 300) | (sf < 264),
            (pf < 400) | (sf < 292),
        ],
        [4, 3, 2, 1],
        default=0,
    )

    # Coagulation (platelet count).
    platelets = df['platelets']
    coag_score = np.select(
        [platelets < 20, platelets < 50, platelets < 100, platelets < 150],
        [4, 3, 2, 1],
        default=0,
    )

    # Hepatic (bilirubin).
    bilirubin = df['bilirubin']
    hep_score = np.select(
        [bilirubin >= 12, bilirubin >= 6, bilirubin >= 2, bilirubin >= 1.2],
        [4, 3, 2, 1],
        default=0,
    )

    # Cardiovascular (no dobutamine available): vasoactive criteria outrank
    # the age-specific MAP criterion.
    # NOTE(review): a recorded dose of exactly 0 still satisfies "<= 0.1" /
    # "<= 5", as it did before - confirm whether zero doses can occur.
    dopamine = df['dopamine']
    epinephrine = df['epinephrine']
    norepinephrine = df['norepinephrine']
    map_floor = _psofa_age_lookup(df['age_months'], (46, 55, 60, 62, 65, 67, 70))
    card_score = np.select(
        [
            (dopamine > 15) | (epinephrine > 0.1) | (norepinephrine > 0.1),
            (dopamine > 5) | (epinephrine <= 0.1) | (norepinephrine <= 0.1),
            dopamine <= 5,
            df['map'] < map_floor,
        ],
        [4, 3, 2, 1],
        default=0,
    )

    # Neurologic (Glasgow Coma Scale).
    gcs = df['coma_scale']
    neuro_score = np.select(
        [gcs < 6, gcs < 10, gcs < 13, gcs < 15],
        [4, 3, 2, 1],
        default=0,
    )

    # Renal: age-specific creatinine cut-offs.  Row k holds, per age band, the
    # lowest creatinine that earns a score of k + 1, so the score is simply the
    # number of cut-offs reached.
    creatinine_cutoffs = (
        (0.8, 0.3, 0.4, 0.6, 0.7, 1.0, 1.2),
        (1.0, 0.5, 0.6, 0.9, 1.1, 1.7, 2.0),
        (1.2, 0.8, 1.1, 1.6, 1.8, 2.9, 3.5),
        (1.6, 1.2, 1.5, 2.3, 2.6, 4.2, 5.0),
    )
    creatinine = df['creatinine']
    renal_score = sum(
        (creatinine >= _psofa_age_lookup(df['age_months'], cutoffs)).astype(int)
        for cutoffs in creatinine_cutoffs
    )

    components = pd.DataFrame({
        'csn': df['csn'].to_numpy(),
        'resp_score': resp_score,
        'coag_score': coag_score,
        'hep_score': hep_score,
        'card_score': card_score,
        'neuro_score': neuro_score,
        'renal_score': renal_score.to_numpy(),
    })
    score_cols = [col for col in components.columns if col != 'csn']

    # Worst value of every component per encounter, then the total.
    worst = components.groupby('csn', as_index=False).max()
    worst['psofa'] = worst[score_cols].sum(axis=1)

    return worst[['csn', 'psofa']]

In [3]:
def _pelod2_age_lookup(age_months, values):
    """Pick, for every row, the entry of ``values`` that belongs to its PELOD-2 age band.

    ``values`` holds six numbers ordered by band: <1, 1-11, 12-23, 24-59,
    60-143 and >=144 months.  Rows whose age is missing get NaN, so any later
    comparison against the result is False for them.
    """
    bands = [
        age_months < 1,
        age_months < 12,
        age_months < 24,
        age_months < 60,
        age_months < 144,
        age_months >= 144,
    ]
    # First true condition wins, which turns the cumulative tests into bands.
    return np.select(bands, [float(v) for v in values], default=np.nan)


def get_pelod2 (variables):
    """Compute the PELOD-2 score of every encounter.

    PELOD-2 is additive over ten variables: Glasgow coma score, pupillary
    reaction, lactate, mean arterial pressure, creatinine, PaO2, PaCO2,
    invasive ventilation, white blood cell count and platelets.  Every row is
    scored per variable, the worst (maximum) points of each variable are kept
    per encounter, and the ten maxima are summed.

    Parameters
    ----------
    variables : pandas.DataFrame
        Must contain the columns ``csn``, ``age_months``, ``coma_scale``,
        ``pupil_right_reaction``, ``pupil_left_reaction``, ``lactic_acid``,
        ``map``, ``creatinine`` (multiplied by 88.42 here to obtain the unit
        the cut-offs are expressed in), ``pao2``, ``paco2``, ``mv_indicator``,
        ``wbc`` and ``platelets``.  The input is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per ``csn`` with the columns ``csn`` and ``pelod2``.

    Notes
    -----
    * Missing measurements contribute 0 points.
    * The previous implementation collapsed the variables of an organ into one
      nested ``np.where`` whose first branch was "any variable normal -> 0".
      A single normal value therefore hid an abnormal one (e.g. lactate 12
      with a normal MAP scored 0), ``paco2 >= 58`` was treated as normal, and
      the pupil test was disabled by operator precedence
      (``a | b >= 11`` parses as ``(a | b) >= 11``).  Scoring each variable on
      its own removes all three problems.
    * NOTE(review): the published PELOD-2 respiratory item is the
      PaO2/FiO2 ratio; this function keeps using the raw ``pao2`` column as
      before - confirm which is intended.
    """
    df = variables.copy().reset_index(drop=True)
    age = df['age_months']

    # --- Neurologic -------------------------------------------------------
    gcs = df['coma_scale']
    gcs_points = np.select(
        [(gcs >= 3) & (gcs < 5), (gcs >= 5) & (gcs < 11)],
        [4, 1],
        default=0,
    )
    both_pupils_fixed = (df['pupil_right_reaction'] == 0) & (df['pupil_left_reaction'] == 0)
    pupil_points = np.where(both_pupils_fixed, 5, 0)

    # --- Cardiovascular ---------------------------------------------------
    lactate = df['lactic_acid']
    lactate_points = np.select([lactate >= 11, lactate >= 5], [4, 1], default=0)

    # Age-specific MAP limits: below `map_normal` scores 2, below `map_mid`
    # scores 3, at or below `map_low` scores 6.
    map_normal = _pelod2_age_lookup(age, (46, 55, 60, 62, 65, 67))
    map_mid = _pelod2_age_lookup(age, (31, 39, 44, 46, 49, 52))
    map_low = _pelod2_age_lookup(age, (16, 24, 30, 31, 35, 37))
    mean_pressure = df['map']
    map_points = np.select(
        [mean_pressure <= map_low, mean_pressure < map_mid, mean_pressure < map_normal],
        [6, 3, 2],
        default=0,
    )

    # --- Renal ------------------------------------------------------------
    creatinine_micro = df['creatinine'] * 88.42
    creatinine_limit = _pelod2_age_lookup(age, (70, 23, 35, 51, 59, 93))
    creatinine_points = np.where(creatinine_micro >= creatinine_limit, 2, 0)

    # --- Respiratory ------------------------------------------------------
    pao2_points = np.where(df['pao2'] <= 60, 2, 0)
    paco2 = df['paco2']
    paco2_points = np.select([paco2 >= 95, paco2 > 58], [3, 1], default=0)
    ventilation_points = np.where(df['mv_indicator'] > 0, 3, 0)

    # --- Hematologic ------------------------------------------------------
    wbc_points = np.where(df['wbc'] <= 2, 2, 0)
    platelets = df['platelets']
    platelet_points = np.select([platelets <= 76, platelets < 142], [2, 1], default=0)

    points = pd.DataFrame({
        'csn': df['csn'].to_numpy(),
        'gcs': gcs_points,
        'pupils': pupil_points,
        'lactate': lactate_points,
        'map': map_points,
        'creatinine': creatinine_points,
        'pao2': pao2_points,
        'paco2': paco2_points,
        'ventilation': ventilation_points,
        'wbc': wbc_points,
        'platelets': platelet_points,
    })
    point_cols = [col for col in points.columns if col != 'csn']

    # Worst value of every variable per encounter, then the total.
    worst = points.groupby('csn', as_index=False).max()
    worst['pelod2'] = worst[point_cols].sum(axis=1)

    return worst[['csn', 'pelod2']]

In [4]:
def get_prism3 (variables):
    """Compute the PRISM-style score of every encounter.

    Thirteen items are scored per row (systolic and diastolic blood pressure,
    heart rate, respiratory rate, PaO2/FiO2, PaCO2, coma scale, pupils,
    bilirubin, potassium, calcium, glucose, bicarbonate).  The worst (maximum)
    points of every item are kept per encounter and summed.

    Parameters
    ----------
    variables : pandas.DataFrame
        Must contain the columns ``csn``, ``age_months``, ``age_years``,
        ``bp_sys``, ``bp_dias``, ``pulse``, ``resp``, ``pao2_fio2``, ``paco2``,
        ``coma_scale``, ``pupil_right_reaction``, ``pupil_left_reaction``,
        ``bilirubin``, ``potassium``, ``calcium``, ``glucose`` and
        ``bicarbonate``.  The input is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per ``csn`` with the columns ``csn`` and ``prism3``.

    Notes
    -----
    * Age-dependent items are only scored for "infants"
      (``age_months > 1`` and ``age_years < 2``) and "children"
      (``2 <= age_years < 12``); other ages get 0 for those items.
    * Missing measurements contribute 0 points.
    * Fixes relative to the previous implementation:
      - the upper 2-point systolic band for children was typed as 50-200
        instead of 150-200, giving nearly every child 2 points and hiding the
        50-64 -> 6 band;
      - the heart-rate test lacked a pair of parentheses, so ``pulse < 80``
        scored 4 points at every age;
      - systolic pressure of exactly 160 in infants belonged to both the
        2-point and the 6-point band; the 6-point band now starts above 160.
    * NOTE(review): the point weights look like the original PRISM table
      rather than PRISM III - confirm which score is intended.
    """
    df = variables.copy()

    infant = (df['age_months'] > 1) & (df['age_years'] < 2)
    child = (df['age_years'] >= 2) & (df['age_years'] < 12)

    # Systolic blood pressure (bands are mutually exclusive).
    sbp = df['bp_sys']
    sbp_prism = np.select(
        [
            (infant & (sbp < 40)) | (child & (sbp < 50)),
            (infant & (sbp.between(40, 54) | (sbp > 160)))
            | (child & (sbp.between(50, 64) | (sbp > 200))),
            (infant & (sbp.between(130, 160) | sbp.between(55, 65)))
            | (child & (sbp.between(150, 200) | sbp.between(65, 75))),
        ],
        [7, 6, 2],
        default=0,
    )

    # Diastolic blood pressure.
    dbp_prism = np.where(df['bp_dias'] > 110, 6, 0)

    # Heart rate: each age group has its own limits.
    pulse = df['pulse']
    hr_prism = np.where(
        (infant & ((pulse > 160) | (pulse < 90)))
        | (child & ((pulse > 150) | (pulse < 80))),
        4, 0)

    # Respiratory rate; a rate below 1 is scored like apnea.
    rate = df['resp']
    resp_prism = np.select(
        [
            (infant & ((rate > 90) | (rate < 1))) | (child & ((rate > 70) | (rate < 1))),
            (infant & rate.between(61, 90)) | (child & rate.between(51, 70)),
        ],
        [5, 1],
        default=0,
    )

    # Blood gases.
    ratio = df['pao2_fio2']
    pao2_prism = np.select([ratio < 200, ratio.between(200, 300)], [3, 2], default=0)
    paco2 = df['paco2']
    paco2_prism = np.select([paco2 > 65, paco2.between(51, 65)], [5, 1], default=0)

    # Neurologic items.
    coma_prism = np.where(df['coma_scale'] < 8, 6, 0)
    pupil_prism = np.where(
        (df['pupil_right_reaction'] == 0) | (df['pupil_left_reaction'] == 0), 10, 0)

    # Chemistry.
    bilir_prism = np.where((df['bilirubin'] > 3.5) & (df['age_months'] > 1), 6, 0)

    potassium = df['potassium']
    pot_prism = np.select(
        [
            (potassium < 3.0) | (potassium > 7.5),
            potassium.between(3.0, 3.5) | potassium.between(6.5, 7.5),
        ],
        [5, 1],
        default=0,
    )

    calcium = df['calcium']
    calcium_prism = np.select(
        [
            (calcium < 7.0) | (calcium > 15.0),
            calcium.between(7.0, 8.0) | calcium.between(12.0, 15.0),
        ],
        [6, 2],
        default=0,
    )

    glucose = df['glucose']
    gluc_prism = np.select(
        [
            (glucose < 40) | (glucose > 400),
            glucose.between(40, 60) | glucose.between(250, 400),
        ],
        [8, 4],
        default=0,
    )

    bicarbonate = df['bicarbonate']
    bicarb_prism = np.where((bicarbonate < 16) | (bicarbonate > 32), 3, 0)

    items = pd.DataFrame({
        'csn': df['csn'].to_numpy(),
        'sbp_prism': sbp_prism,
        'dbp_prism': dbp_prism,
        'hr_prism': hr_prism,
        'resp_prism': resp_prism,
        'pao2_prism': pao2_prism,
        'paco2_prism': paco2_prism,
        'coma_prism': coma_prism,
        'pupil_prism': pupil_prism,
        'bilir_prism': bilir_prism,
        'pot_prism': pot_prism,
        'calcium_prism': calcium_prism,
        'gluc_prism': gluc_prism,
        'bicarb_prism': bicarb_prism,
    })
    item_cols = [col for col in items.columns if col != 'csn']

    # Worst value of every item per encounter, then the total.
    worst = items.groupby('csn', as_index=False).max()
    worst['prism3'] = worst[item_cols].sum(axis=1)

    return worst[['csn', 'prism3']]

In [5]:
def _phoenix_map_cutoff(age_months, age_years, values):
    """Pick, for every row, the entry of ``values`` that belongs to its Phoenix age band.

    ``values`` holds six numbers ordered by band: <1 month, 1-11 months,
    1 to <2 years, 2 to <5 years, 5 to <12 years and 12 to <18 years.  Rows
    outside these bands (or with a missing age) get NaN, so any later
    comparison against the result is False for them.
    """
    bands = [
        age_months < 1,
        (age_months >= 1) & (age_years < 1),
        (age_years >= 1) & (age_years < 2),
        (age_years >= 2) & (age_years < 5),
        (age_years >= 5) & (age_years < 12),
        (age_years >= 12) & (age_years < 18),
    ]
    return np.select(bands, [float(v) for v in values], default=np.nan)


def get_phoenix (variables):
    """Compute the maximum Phoenix sepsis score of every encounter.

    The measurements are binned into 30-minute intervals counted from hospital
    admission, empty intervals between the first and last observed one are
    inserted, selected variables are carried forward for a limited number of
    intervals, and the four Phoenix components (respiratory, cardiovascular,
    coagulation, neurologic) are scored and summed per interval.  The maximum
    interval score of each encounter is returned.

    Parameters
    ----------
    variables : pandas.DataFrame
        Must contain the columns ``patid``, ``csn``, ``dob``, ``age_months``,
        ``age_years``, ``hosp_admission``, ``recorded_time``, ``fio2``,
        ``pao2``, ``spo2``, ``map``, ``platelets``, ``fibrinogen``, ``inr``,
        ``ddimer``, ``lactic_acid``, ``coma_scale``, ``pupil_left_reaction``,
        ``pupil_right_reaction``, ``epinephrine``, ``norepinephrine``,
        ``dopamine``, ``resp_indicator`` and ``mv_indicator``.  The input is
        not modified.

    Returns
    -------
    pandas.DataFrame
        One row per ``csn`` with the columns ``csn`` and ``phoenix``.

    Notes
    -----
    Fixes relative to the previous implementation:

    * a missing comma fused ``'platelets'`` and ``'fibrinogen'`` into one
      string, so both were aggregated with ``last`` instead of ``min``;
    * the 2-point MAP cut-offs were shifted by one age band (for example
      neonates scored 2 points for MAP <= 30 instead of < 17);
    * the 12-17 year band stopped at ``age_years < 17``; it now covers ages
      up to but excluding 18;
    * respiratory scores 2 and 3 were shadowed by the score-1 branch, and
      bilaterally fixed pupils were shadowed by ``coma_scale <= 10``;
      conditions are now evaluated from the most severe score downwards;
    * invasive ventilation now also counts as "any respiratory support";
    * rows are sorted by ``recorded_time`` before forward-filling FiO2 and
      before taking the ``last`` value of an interval;
    * grouping keys are no longer passed to the named aggregation, and the
      gap rows are created by a merge so that datetime columns keep their
      dtype.

    Other remarks:

    * The forward-fill limits (48 / 24 / 12) are counted in 30-minute
      intervals, i.e. 24 / 12 / 6 hours.
      NOTE(review): confirm that this is the intended look-back.
    * Any non-missing dose (including 0) counts as "on the drug", as before.
    * NOTE(review): the D-dimer threshold of 1000 assumes a particular unit -
      confirm against the source data.
    """
    id_cols = ['patid', 'csn', 'dob', 'age_months', 'age_years', 'hosp_admission']
    group_keys = id_cols + ['rel_time']

    # Chronological order per encounter; sort_values returns a new frame, so
    # the caller's data is left untouched.
    df = variables.sort_values(by=['csn', 'recorded_time'])

    # Oxygenation ratios, using the last known FiO2 of the same encounter.
    fio2_fraction = df.groupby(['csn'])['fio2'].ffill() / 100
    df['pao2_fio2_calculated'] = df['pao2'] / fio2_fraction
    df['spo2_fio2_calculated'] = df['spo2'] / fio2_fraction

    # Index of the 30-minute interval after admission (1 = first half hour).
    df['rel_time'] = np.ceil((df['recorded_time'] - df['hosp_admission']) / pd.Timedelta('30 minutes'))
    df = df[df['rel_time'] > 0]
    print('Unique CSN total:', len(df['csn'].unique().tolist()))

    # Collapse every interval to one row: worst value for the scored
    # measurements, most recent value for everything else.
    take_min = {'pao2_fio2_calculated', 'spo2_fio2_calculated', 'map', 'platelets', 'fibrinogen',
                'coma_scale', 'pupil_right_reaction', 'pupil_left_reaction'}
    take_max = {'lactic_acid', 'inr', 'ddimer'}
    agg_spec = {}
    for col in df.columns:
        if col in group_keys:
            continue
        if col in take_min:
            how = 'min'
        elif col in take_max:
            how = 'max'
        else:
            how = 'last'
        agg_spec[col] = pd.NamedAgg(column=col, aggfunc=how)

    df = df.groupby(group_keys, as_index=False).agg(**agg_spec)
    print('Unique CSN total:', len(df['csn'].unique().tolist()))

    # Insert the intervals that have no observation, between the first and
    # the last observed interval of every encounter.
    df['rel_time'] = df['rel_time'].astype(int)
    span = df.groupby('csn')['rel_time'].agg(['min', 'max'])
    grid = pd.DataFrame(
        [(csn, step)
         for csn, first, last in zip(span.index, span['min'], span['max'])
         for step in range(first, last + 1)],
        columns=['csn', 'rel_time'],
    ).astype({'csn': df['csn'].dtype, 'rel_time': int})
    df = grid.merge(df, on=['csn', 'rel_time'], how='left')

    # Encounter-level attributes are copied onto the inserted rows.
    carried = [col for col in id_cols if col != 'csn']
    df[carried] = df.groupby('csn')[carried].ffill()

    # Carry measurements forward for a limited number of intervals.
    ff_48 = ['platelets', 'inr', 'fibrinogen', 'ddimer']
    ff_24 = ['coma_scale', 'epinephrine', 'norepinephrine', 'dopamine']
    ff_12 = ['pao2_fio2_calculated', 'spo2_fio2_calculated', 'resp_indicator', 'mv_indicator',
             'map', 'lactic_acid', 'pupil_left_reaction', 'pupil_right_reaction']
    for cols, limit in ((ff_48, 48), (ff_24, 24), (ff_12, 12)):
        df[cols] = df.groupby(id_cols)[cols].ffill(limit=limit)

    # --- Respiratory component (0-3) ---------------------------------------
    pf = df['pao2_fio2_calculated']
    sf = df['spo2_fio2_calculated']
    on_imv = df['mv_indicator'] == 1
    on_support = (df['resp_indicator'] == 1) | on_imv
    resp_score = np.select(
        [
            ((pf < 100) | (sf < 148)) & on_imv,
            ((pf <= 200) | (sf <= 220)) & on_imv,
            ((pf < 400) | (sf < 292)) & on_support,
        ],
        [3, 2, 1],
        default=0,
    )

    # --- Cardiovascular component (0-6) ------------------------------------
    # Vasoactive drugs: 1 point for one drug, 2 points for two or more.
    vasoactive_points = (
        df[['epinephrine', 'norepinephrine', 'dopamine']].notna().sum(axis=1).clip(upper=2)
    )

    lactate = df['lactic_acid']
    lactate_points = np.select([lactate >= 11, lactate >= 5], [2, 1], default=0)

    # Age-specific MAP: below `map_floor` scores 2, from `map_floor` up to
    # and including `map_ceiling` scores 1.
    map_floor = _phoenix_map_cutoff(df['age_months'], df['age_years'], (17, 25, 31, 32, 36, 38))
    map_ceiling = _phoenix_map_cutoff(df['age_months'], df['age_years'], (30, 38, 43, 44, 48, 51))
    mean_pressure = df['map']
    map_points = np.select(
        [mean_pressure < map_floor, mean_pressure <= map_ceiling],
        [2, 1],
        default=0,
    )

    card_score = vasoactive_points + lactate_points + map_points

    # --- Coagulation component (one point per criterion, capped at 2) ------
    coag_score = (
        (df['platelets'] < 100).astype(int)
        + (df['inr'] > 1.3).astype(int)
        + (df['ddimer'] > 1000).astype(int)
        + (df['fibrinogen'] < 100).astype(int)
    ).clip(upper=2)

    # --- Neurologic component (0-2) ----------------------------------------
    both_pupils_fixed = (df['pupil_left_reaction'] == 0) & (df['pupil_right_reaction'] == 0)
    neuro_score = np.select([both_pupils_fixed, df['coma_scale'] <= 10], [2, 1], default=0)

    # Phoenix score per interval, then the maximum per encounter.
    scores = pd.DataFrame({
        'csn': df['csn'].to_numpy(),
        'phoenix': (resp_score + card_score + coag_score + neuro_score).to_numpy(),
    })

    return scores.groupby('csn', as_index=False).max()

    

In [6]:
# Load encounters file
# Reads the encounter/department table, keeps pediatric ICU stays admitted on
# or after 2010-01-01 and reduces it to unique (csn, hosp_admission) pairs.
print('Loading encounters...')
dept_path = '/labs/kamaleswaranlab/ECMO/new_data/TAB2_Encounter_Departments.parquet.gzip'
dept = pd.read_parquet(dept_path)
# Parse every date/time column in one pass.
dept[['BIRTH_DATE', 'Hosp_Admission', 'Hosp_Discharge', 'Entered_Dept', 'Exited_Dept']] = dept[['BIRTH_DATE', 'Hosp_Admission', 'Hosp_Discharge', 'Entered_Dept', 'Exited_Dept']].apply(pd.to_datetime)
# Select the columns of interest and give them the snake_case names used in
# the rest of the notebook (the rename is positional, so both lists must stay
# in the same order).
dept = dept[['Pat ID', 'Encounter CSN', 'Name', 'BIRTH_DATE', 'Department', 'Entered_Dept', 'Exited_Dept', 'Hosp_Admission', 'Hosp_Discharge']]
dept.columns = ['patid', 'csn', 'name', 'dob', 'department', 'entered_dept', 'exited_dept', 'hosp_admission', 'hosp_disch']
dept['csn'] = dept['csn'].astype(int)
# Keep rows whose department name contains 'PEDIATRIC ICU' and whose hospital
# admission is not earlier than 2010-01-01.
dept = dept[(dept['department'].str.contains('PEDIATRIC ICU')) & (dept['hosp_admission'] >= '2010-01-01')]
# Only the admission time per encounter is needed downstream.
dept = dept[['csn', 'hosp_admission']]
dept.dropna(inplace=True)
dept.drop_duplicates(inplace=True)

# Load complete cohort
print('Loading complete cohort...')
cohort = pd.read_csv('/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_screening/complete_cohort.csv')
cohort = cohort[['patid', 'mrn', 'csn', 'dob']]
# Inner join: cohort encounters without a matching pediatric ICU row are dropped.
cohort = cohort.merge(dept, how='inner', on='csn')

Loading encounters...
Loading complete cohort...


In [7]:
# Load data
# Builds the long-format `variables` table (one row per recorded observation):
#   1. raw observations, 2. vasoactive infusions, 3. blood pressure split into
#   systolic/diastolic rows, 4. harmonised pupillary reaction labels,
#   5. mechanical ventilation rows, 6. respiratory support rows,
#   7. `hosp_admission` attached from `cohort` (created in an earlier cell).
print('Loading data...')
data_screening_dir = '/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_screening'

variables = pd.read_parquet(os.path.join(data_screening_dir, 'raw_variables.parquet.gzip'))
variables[['dob', 'recorded_time']] = variables[['dob', 'recorded_time']].apply(pd.to_datetime)
variables[['csn', 'variable_id']] = variables[['csn', 'variable_id']].astype(int)
variables = variables.dropna(subset=['value'])
# Drop BP rows that are not written as "<first>/<second>"; na=False makes any
# non-string BP value count as "no slash" instead of breaking the `~` negation.
is_bp = variables['variable_name'] == 'BP'
has_slash = variables['value'].str.contains("/", regex=False, na=False)
variables = variables[~(is_bp & ~has_slash)]

# Load meds
meds = pd.read_parquet(os.path.join(data_screening_dir, 'filtered_meds.parquet.gzip'))
meds[['dob', 'mar_time']] = meds[['dob', 'mar_time']].apply(pd.to_datetime)
meds['csn'] = meds['csn'].astype(int)
# The pattern lists every agent that is renamed below. The previous pattern
# ('epinephrine|dopamine') could never match dobutamine, milrinone or vasopressin,
# so those renames were unreachable and the drugs were silently dropped.
# Rows must still be dosed in mcg/kg/min to be kept.
vasoactive_pattern = 'epinephrine|dopamine|dobutamine|milrinone|vasopressin'
meds = meds[meds['csn'].isin(variables['csn'].unique())
            & (meds['dose_unit'] == 'mcg/kg/min')
            & meds['med'].str.contains(vasoactive_pattern, case=False, na=False)]
# .copy() so the column renames/assignments below act on an independent frame
meds = meds[['patid', 'csn', 'dob', 'med_id', 'med', 'mar_time', 'dose']].copy()
meds.columns = ['patid', 'csn', 'dob', 'variable_id', 'variable_name', 'recorded_time', 'value']
# "norepinephrine" contains "epinephrine", so plain epinephrine must exclude it explicitly
is_norepinephrine = meds['variable_name'].str.contains("norepinephrine", case=False)
is_epinephrine = meds['variable_name'].str.contains("epinephrine", case=False) & ~is_norepinephrine
meds.loc[is_epinephrine, 'variable_name'] = 'epinephrine'
for drug in ('norepinephrine', 'dopamine', 'dobutamine', 'milrinone', 'vasopressin'):
    meds.loc[meds['variable_name'].str.contains(drug, case=False), 'variable_name'] = drug
meds.reset_index(inplace=True, drop=True)
variables = pd.concat([variables, meds])

# Fix blood pressure
# Each "a/b" BP row becomes two rows: 'bp_sys' (part before the slash, a new row)
# and 'bp_dias' (part after the slash, the original row renamed in place).
sysbp = variables[variables['variable_name'] == 'BP'].copy()
sysbp['variable_id'] = 1
sysbp['variable_name'] = 'bp_sys'
sysbp['value'] = sysbp['value'].apply(lambda x: float(x.split('/')[0]))
variables.loc[variables['variable_name'] == 'BP', 'variable_name'] = 'bp_dias'
is_dias = variables['variable_name'] == 'bp_dias'
variables.loc[is_dias, 'value'] = variables.loc[is_dias, 'value'].apply(lambda x: float(x.split('/')[1]))
variables = pd.concat([variables, sysbp])
variables.dropna(subset=['value'], inplace=True)
variables.reset_index(drop=True, inplace=True)

# Fix pupillary reaction
# Collapse the free-text assessments into three labels
is_pupil_reaction = variables['variable_name'].isin(['Pupil Right Reaction', 'Pupil Left Reaction'])
variables.loc[is_pupil_reaction & (variables['value'].isin(['Brisk', 'Sluggish', 'Hippus'])), 'value'] = 'Reactive'
variables.loc[is_pupil_reaction & (variables['value'].isin(['Non-reactive'])), 'value'] = 'Non-reactive'
variables.loc[is_pupil_reaction & (variables['value'].isin(['Unable to assess', 'Pinpoint', 'No eye', 'Pharmacologically dilated', 'Keyhole', 'Ovoid', 'Ovid'])), 'value'] = 'Unable to Assess'

# Add mechanical ventilation data
mv = pd.read_parquet(os.path.join(data_screening_dir, 'mv_data.parquet.gzip'))
mv['csn'] = mv['csn'].astype(int)
mv = mv[mv['csn'].isin(variables['csn'].unique())].copy()
mv[['dob', 'recorded_time']] = mv[['dob', 'recorded_time']].apply(pd.to_datetime)
variables = pd.concat([variables, mv])

# Add resp support data
resp = pd.read_parquet(os.path.join(data_screening_dir, 'resp_data.parquet.gzip'))
resp['csn'] = resp['csn'].astype(int)
resp = resp[resp['csn'].isin(variables['csn'].unique())].copy()
resp[['dob', 'recorded_time']] = resp[['dob', 'recorded_time']].apply(pd.to_datetime)
variables = pd.concat([variables, resp])

# Add hospital admission
# The inner join also discards observations whose csn is not in `cohort`
print('Adding hospital admission and department...')
variables = pd.merge(variables, cohort[['csn', 'hosp_admission']], on='csn', how='inner')
variables = variables[['patid', 'csn', 'dob', 'hosp_admission', 'variable_id', 'variable_name', 'recorded_time', 'value']]

Loading data...
Adding hospital admission and department...


In [8]:
# Fix names
# Canonical variable name -> every raw label that is folded into it.
# Labels not listed here are left untouched.
# NOTE(review): venous/capillary PO2 and PCO2 labels are pooled into 'pao2'/'paco2'
# together with the arterial ones — confirm this is intended.
canonical_names = {
    'potassium': ['POTASSIUM'],
    'calcium': ['CALCIUM'],
    'glucose': ['GLUCOSE'],
    'bicarbonate': ['HCO3'],
    'weight': ['Weight', 'Code Sheet Weight (kg)'],
    'pulse': ['Pulse'],
    'resp': ['Resp'],
    'temp': ['Temp'],
    'spo2': ['SpO2'],
    'map': ['MAP', 'ABP MAP'],
    'fio2': ['FiO2 (%)'],
    'coma_scale': ['Coma Scale Total'],
    'wbc': ['WBC'],
    'band_neutrophils': ['BAND NEUTROPHILS % (MANUAL)'],
    'base_excess': ['ARTERIAL BASE EXCESS', 'VENOUS BASE EXCESS', 'CAP BASE EXCESS'],
    'base_deficit': ['ART BASE DEFICIT', 'VENOUS BASE DEFICIT', 'CAP BASE DEFICIT'],
    'lactic_acid': ['LACTIC ACID', 'POC LACTIC ACID', 'LACTIC ACID WHOLE BLOOD'],
    'art_ph': ['ARTERIAL POC PH'],
    'cap_ph': ['CAPILLARY POC PH'],
    'venous_ph': ['VENOUS POC PH'],
    'bun_creat': ['BUN/CREATININE RATIO QUEST', 'BUN/CREATININE RATIO OSF', 'BUN/CREATININE RATIO LABCORP'],
    'bilirubin': ['BILIRUBIN TOTAL'],
    'alt': ['ALT (SGPT)'],
    'ast': ['AST (SGOT)'],
    'platelets': ['PLATELETS'],
    'inr': ['INT NORM RATIO'],
    'pt': ['PROTIME'],
    'bun': ['BUN'],
    'creatinine': ['CREATININE'],
    'paco2': ['POC PCO2', 'ARTERIAL POC PCO2', 'CAPILLARY POC PCO2', 'VENOUS POC PCO2'],
    'pao2': ['POC PO2', 'ARTERIAL POC PO2', 'CAPILLARY POC PO2', 'VENOUS POC PO2'],
    'pao2_fio2': ['PaO2/FiO2 (Calculated)'],
    'cap_refill': ['RLE Capillary Refill (sec)', 'LLE Capillary Refill (sec)',
                   'RUE Capillary Refill (sec)', 'LUE Capillary Refill (sec)'],
    'periph_vasc': ['PERIPHERAL VASCULAR WDL'],
    'activity': ['Activity'],
    'tidal_vol': ['Tidal Volume Set'],
    'o2_flow': ['Oxygen Flow (lpm)'],
    'ddimer': ['DDIMER UNITS'],
    'fibrinogen': ['FIBRINOGEN'],
    'pupil_right_reaction': ['Pupil Right Reaction'],
    'pupil_left_reaction': ['Pupil Left Reaction'],
}

# Invert to raw label -> canonical name and apply all exact-match renames in one pass.
# No canonical name is itself a raw label, so a single pass gives the same result
# as applying the renames one after another.
raw_to_canonical = {raw: canonical for canonical, raws in canonical_names.items() for raw in raws}
variables['variable_name'] = variables['variable_name'].replace(raw_to_canonical)

In [9]:
# Gather first 24 hours of data
# Elapsed time since admission is rounded up to whole hours; rows are kept when
# that hour bucket is 1..24 (recorded after admission and at most 24 h later).
hours_from_admission = np.ceil((variables['recorded_time'] - variables['hosp_admission']) / pd.Timedelta('1 hour'))
variables = variables[(hours_from_admission > 0) & (hours_from_admission <= 24)].reset_index(drop=True)

# Fix pupillary reaction
# Encode the three harmonised labels as numbers so they survive the numeric filter below
is_pupil_reaction = variables['variable_name'].isin(['pupil_right_reaction', 'pupil_left_reaction'])
for label, code in (('Non-reactive', 0), ('Reactive', 1), ('Unable to Assess', 2)):
    variables.loc[is_pupil_reaction & (variables['value'] == label), 'value'] = code

# Fix pupil size
# Strip the last two characters of the recorded size
# (presumably a two-character unit suffix such as "mm" — confirm against the raw data)
is_pupil_size = variables['variable_name'].isin(['pupil_left_size', 'pupil_right_size'])
variables.loc[is_pupil_size, 'value'] = variables.loc[is_pupil_size, 'value'].str[:-2]

# Discard NaN
variables = variables.dropna(subset='value')

# Remove invalid values
# Anything that cannot be parsed as a finite number is discarded. The previous
# check, str(x).replace(".", "", 1).isdigit(), also threw away every negative value
# (e.g. a negative base excess) and floats printed in scientific notation.
variables = variables.assign(value=pd.to_numeric(variables['value'], errors='coerce').astype(float))
variables = variables[np.isfinite(variables['value'])].reset_index(drop=True)
# SpO2 readings above 97 are excluded
# (presumably because SpO2/FiO2 ratios are only meaningful at SpO2 <= 97 — confirm)
variables = variables[~((variables['variable_name'] == 'spo2') & (variables['value'] > 97))]

# Convert weight from oz to lb
# NOTE(review): rows that came from 'Code Sheet Weight (kg)' were renamed to 'weight'
# in the previous cell and are divided by 16 here as well — confirm their unit.
variables.loc[variables['variable_name'] == 'weight', 'value'] = variables.loc[variables['variable_name'] == 'weight', 'value'].apply(lambda x: round(x/16 ,2))

# Data wrangling and imputation
print('Data wrangling and generation of flags...')

# Pivot data
# Long -> wide: one row per (patient, encounter, timestamp), one column per variable.
# When a variable has several values at the same timestamp the first one is kept;
# 'first' equals the former `lambda x: x.iloc[0]` here because no NaN values remain.
cols = variables['variable_name'].unique().tolist()
variables = pd.pivot_table(variables, values='value', index=['patid', 'csn', 'dob', 'hosp_admission', 'recorded_time'], columns=['variable_name'], aggfunc='first', fill_value=np.nan)
variables.reset_index(inplace=True)
variables[['dob', 'hosp_admission', 'recorded_time']] = variables[['dob', 'hosp_admission', 'recorded_time']].apply(pd.to_datetime)

# Add age
# Age at hospital admission; months use a fixed 31-day month, years a 365.25-day year
variables['age_days'] = round((variables['hosp_admission'] - variables['dob']) / pd.Timedelta('1 day'), 0)
variables['age_months'] = round(variables['age_days'] / 31, 2)
variables['age_years'] = round(variables['age_days'] / 365.25, 2)

Data wrangling and generation of flags...


In [10]:
# Compute scores
# Each get_* function (defined in earlier cells) receives the wide `variables`
# table; its result is left-joined onto `cohort` by 'csn', so encounters without
# a computed score are kept.
phoenix = get_phoenix(variables)
psofa = get_psofa(variables)
pelod2 = get_pelod2(variables)
prism3 = get_prism3(variables)

for score_table in (phoenix, psofa, pelod2, prism3):
    cohort = cohort.merge(score_table, on='csn', how='left')

Unique CSN total: 63823
Unique CSN total: 63823


In [11]:
# Save file
# Write the cohort with all merged score columns to CSV, without the index column
scores_csv_path = '/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_analysis/scores_24.csv'
cohort.to_csv(scores_csv_path, index=False)