In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys, os, pickle, utils
from tqdm import tqdm
from datetime import timedelta
#from utils import baseline_SCr

if os.getcwd()[-4:] == "code":
    os.chdir('../')

icu  = './data/mimic-iv-2.2-parquet/icu/'
hosp = './data/mimic-iv-2.2-parquet/hosp/'
ed   = './data/mimic-iv-2.2-parquet/ed/'

pd.set_option('mode.chained_assignment',  None)

In [2]:
hosp_list = ['labevents', 'd_labitems', 'patients', 'admissions', 
             'diagnoses_icd', 'microbiologyevents', 'prescriptions', 'omr']
for i in hosp_list:
    globals()['{}'.format(i)] = pd.read_parquet(hosp+i+'.parquet')

In [3]:
icu_list = ['chartevents', 'd_items', 'icustays', 'inputevents', 'outputevents', 'procedureevents']
for i in icu_list:
    globals()['{}'.format(i)] = pd.read_parquet(icu+i+'.parquet')

In [4]:
vitalsign = pd.read_parquet(ed+'vitalsign.parquet')

In [5]:
labvalues = pd.read_csv('./data/origin/labvalues/labvalues.csv')
vitals = pd.read_csv('./data/origin/vitals/vitals.csv')
comorbidities = pd.read_csv('./data/origin/demographic/comorbidities.csv')

# Demographic

## Gender

In [6]:
if not os.path.isfile('./data/origin/demographic/patients_gender.parquet'):
    patients_gender = utils.cal_gender(patients)
    patients_gender.to_parquet('./data/origin/demographic/patients_gender.parquet')
else : patients_gender = pd.read_parquet('./data/origin/demographic/patients_gender.parquet')

## Age

In [7]:
if not os.path.isfile('./data/origin/demographic/icustays_age.parquet'):
    icustays_age = utils.cal_age(icustays,patients)
    icustays_age.to_parquet('./data/origin/demographic/icustays_age.parquet')
else : icustays_age = pd.read_parquet('./data/origin/demographic/icustays_age.parquet')

## Race

In [8]:
if not os.path.isfile('./data/origin/demographic/admissions_race.parquet'):
    admissions_race = utils.cal_race(admissions)
    admissions_race.to_parquet('./data/origin/demographic/admissions_race.parquet')
else : admissions_race = pd.read_parquet('./data/origin/demographic/admissions_race.parquet')

## Height

In [None]:
#admission_height = cal_height(icustays,chartevents,omr)

## Weight

In [9]:
if not os.path.isfile('./data/origin/demographic/admission_weight.parquet'):
    admission_weight = utils.cal_weight(icustays,chartevents,inputevents,omr)
    admission_weight.to_parquet('./data/origin/demographic/admission_weight.parquet')
else : admission_weight = pd.read_parquet('./data/origin/demographic/admission_weight.parquet')

## Comorbidites

In [10]:
if not os.path.isfile('./data/origin/demographic/comorbidities.parquet'):
    comorbidities = utils.cal_comorbidities(icustays, comorbidities,diagnoses_icd)
    comorbidities.to_parquet('./data/origin/demographic/comorbidities.parquet')
else : comorbidities = pd.read_parquet('./data/origin/demographic/comorbidities.parquet')

## Baseline SCr

In [15]:
baseline_SCr = utils.cal_baseline_SCr(labevents_SCr,icustays,patients_gender,icustays_age,admissions_race)

  0%|          | 0/50920 [00:00<?, ?it/s]

100%|██████████| 50920/50920 [05:31<00:00, 153.62it/s]


# Lab values

In [11]:
for i,idx in enumerate(tqdm(labvalues.abbreviation)):
    if not os.path.isfile('./data/origin/labvalues/chartevents_%s.parquet'%idx):
        tmp = labvalues.iloc[i]
        globals()['chartevents_{}'.format(idx)] = utils.extract_labvalues(chartevents,labevents,tmp,is_in_icu=True)
        globals()['chartevents_{}'.format(idx)].to_parquet('./data/origin/labvalues/chartevents_%s.parquet'%idx)
    else : globals()['chartevents_{}'.format(idx)] = pd.read_parquet('./data/origin/labvalues/chartevents_%s.parquet'%idx)

100%|██████████| 30/30 [00:00<00:00, 44.31it/s]


In [12]:
for i,idx in enumerate(tqdm(labvalues.abbreviation)):
    if not os.path.isfile('./data/origin/labvalues/labevents_%s.parquet'%idx):
        tmp = labvalues.iloc[i]
        globals()['labevents_{}'.format(idx)] = utils.extract_labvalues(chartevents,labevents,tmp,is_in_icu=False)
        globals()['labevents_{}'.format(idx)].to_parquet('./data/origin/labvalues/labevents_%s.parquet'%idx)
    else : globals()['labevents_{}'.format(idx)] = pd.read_parquet('./data/origin/labvalues/labevents_%s.parquet'%idx)

100%|██████████| 30/30 [00:02<00:00, 12.57it/s]


In [13]:
for i,idx in enumerate(tqdm(labvalues.abbreviation)):
    if not os.path.isfile('./data/resample/labvalues/ii/resample_%s.parquet'%idx):
        globals()['resample_{}'.format(idx)] = utils.resample_labvalues(globals()['chartevents_{}'.format(idx)],globals()['labevents_{}'.format(idx)],icustays,idx)
        globals()['resample_{}'.format(idx)].to_parquet('./data/resample/labvalues/ii/resample_%s.parquet'%idx)
    else : globals()['resample_{}'.format(idx)] = pd.read_parquet('./data/resample/labvalues/ii/resample_%s.parquet'%idx)

100%|██████████| 30/30 [00:08<00:00,  3.35it/s]


# Vitals

In [16]:
for i,idx in enumerate(tqdm(vitals.abbreviation)):
    if not os.path.isfile('./data/origin/vitals/chartevents_%s.parquet'%idx):
        tmp = vitals.iloc[i]
        globals()['chartevents_{}'.format(idx)] = utils.extract_labvalues(chartevents,labevents,tmp,is_in_icu=True)
        globals()['chartevents_{}'.format(idx)].to_parquet('./data/origin/vitals/chartevents_%s.parquet'%idx)
    else : globals()['chartevents_{}'.format(idx)] = pd.read_parquet('./data/origin/vitals/chartevents_%s.parquet'%idx)

# Temperature
chartevents_tempF['valuenum'] = (chartevents_tempF['valuenum']-32)*5/9
chartevents_tempF.rename(columns={'valuenum2':'valuenum'},inplace=True)
chartevents_tempF['valuenum'] = chartevents_tempF['valuenum'].round(1)

chartevents_temp = pd.concat([chartevents_tempC,chartevents_tempF])
chartevents_temp.sort_values(by=['subject_id','hadm_id','stay_id','charttime'],inplace=True)
chartevents_temp.to_parquet('./data/origin/vitals/chartevents_temp.parquet')

os.remove('./data/origin/vitals/chartevents_tempC.parquet')
os.remove('./data/origin/vitals/chartevents_tempF.parquet')

100%|██████████| 11/11 [00:05<00:00,  1.85it/s]


In [None]:
'''for idx in tqdm([x[12:-8] for x in os.listdir('./data/origin/vitals') if x[-8:] == '.parquet']):
    print('Processing %s'%idx)
    globals()['resample_{}'.format(idx)] = resample_vitals(globals()['chartevents_{}'.format(idx)],icustays,idx)
    globals()['resample_{}'.format(idx)].to_parquet('./data/resample/vitals/resample_%s.parquet'%idx)'''

In [17]:
for idx in tqdm([x[12:-8] for x in os.listdir('./data/origin/vitals') if x[-8:] == '.parquet']):
    print('Processing %s'%idx)
    if not os.path.isfile('./data/resample/vitals/resample_%s.parquet'%idx):
        globals()['resample_{}'.format(idx)] = utils.resample_vitals(globals()['chartevents_{}'.format(idx)],icustays,idx)
        globals()['resample_{}'.format(idx)].to_parquet('./data/resample/vitals/resample_%s.parquet'%idx)
    else : globals()['resample_{}'.format(idx)] = pd.read_parquet('./data/resample/vitals/resample_%s.parquet'%idx)

  0%|          | 0/10 [00:00<?, ?it/s]

Processing ABPd


 10%|█         | 1/10 [00:00<00:02,  4.08it/s]

Processing ABPs


 20%|██        | 2/10 [00:00<00:01,  4.17it/s]

Processing CVP


 30%|███       | 3/10 [00:00<00:01,  4.14it/s]

Processing FiO2


 40%|████      | 4/10 [00:00<00:01,  4.21it/s]

Processing HR


 50%|█████     | 5/10 [00:01<00:01,  4.16it/s]

Processing NBPd


 60%|██████    | 6/10 [00:01<00:00,  4.17it/s]

Processing NBPs


 70%|███████   | 7/10 [00:01<00:00,  4.11it/s]

Processing RR


 80%|████████  | 8/10 [00:01<00:00,  3.99it/s]

Processing SaO2


 90%|█████████ | 9/10 [00:02<00:00,  3.98it/s]

Processing temp


100%|██████████| 10/10 [00:02<00:00,  4.09it/s]


In [18]:
resample_SBP = pd.merge(resample_ABPs,resample_NBPs, on=['subject_id','hadm_id','stay_id', 'charttime'], how='left')
resample_SBP['SBP'] = np.where((resample_SBP['presense_ABPs']==0), resample_SBP['NBPs'], resample_SBP['ABPs'])
resample_SBP['presense_SBP'] = 0
resample_SBP.loc[(resample_SBP['presense_ABPs']==1) | (resample_SBP['presense_NBPs']==1), 'presense_SBP'] = 1

resample_SBP = resample_SBP[['subject_id','hadm_id','stay_id','charttime','SBP','presense_SBP']]

In [19]:
resample_DBP = pd.merge(resample_ABPd,resample_NBPd, on=['subject_id','hadm_id','stay_id', 'charttime'], how='left')
resample_DBP['DBP'] = np.where((resample_DBP['presense_ABPd']==0), resample_DBP['NBPd'], resample_DBP['ABPd'])
resample_DBP['presense_DBP'] = 0
resample_DBP.loc[(resample_DBP['presense_ABPd']==1) | (resample_DBP['presense_NBPd']==1), 'presense_DBP'] = 1
resample_DBP = resample_DBP[['subject_id','hadm_id','stay_id','charttime','DBP','presense_DBP']]

In [20]:
resample_MAP = pd.merge(resample_SBP,resample_DBP, on=['subject_id','hadm_id','stay_id', 'charttime'], how='left')
resample_MAP['MAP'] = 1/3*resample_MAP['SBP'] + 2/3*resample_MAP['DBP']
resample_MAP.loc[(resample_MAP['presense_SBP']==1)&(resample_MAP['presense_DBP']==1),'presense_MAP']=1
resample_MAP.loc[resample_MAP['presense_MAP'].isna(),'MAP']=0
resample_MAP.loc[resample_MAP['presense_MAP'].isna(),'presense_MAP']=0
resample_MAP = resample_MAP[['subject_id','hadm_id','stay_id','charttime','MAP','presense_MAP']]

In [21]:
resample_SBP.to_parquet('./data/resample/vitals/resample_SBP.parquet')
resample_DBP.to_parquet('./data/resample/vitals/resample_DBP.parquet')
resample_MAP.to_parquet('./data/resample/vitals/resample_MAP.parquet')

# Urine Output

In [22]:
if not os.path.isfile('./data/resample/measures/resample_uo.parquet'):
    outputevents_uo = utils.cal_uo(outputevents)
    resample_uo = utils.resample_urine(outputevents_uo, icustays, 'uo')
    resample_uo.to_parquet('./data/resample/measures/resample_uo.parquet')
else : resample_uo = pd.read_parquet('./data/resample/measures/resample_uo.parquet')

# Fluid/Vasopressor intake

## Fluid

In [23]:
if not os.path.isfile('./data/resample/inputs/resample_fluid.parquet'):
    fluid = [220949, 220950, 220952, 225158, 225159, 225161, 225828, 225797, 225799, 225823, 225825, 225827, 225830, 226089, 225941, 225943, 225944, 226361, 226363, 226364, 226375, 226377, 226452, 226453, 227533, 228140, 228141, 228142, 228341, 220955, 220967, 220968, 220953]
    inputevents_fluid = inputevents[inputevents['itemid'].isin(fluid)]

    inputevents_fluid.loc[inputevents_fluid['rateuom']=='mL/min', 'rate'] = inputevents_fluid['rate']/60
    inputevents_fluid.loc[inputevents_fluid['rateuom']=='mL/kg/hour', 'rate'] = inputevents_fluid['rate'] * inputevents_fluid['patientweight']

    resample_fluids = utils.resample_inputrates(icustays,inputevents_fluid,'fluid')
    resample_fluids.to_parquet('./data/resample/inputs/resample_fluid.parquet')
else : resample_fluids = pd.read_parquet('./data/resample/inputs/resample_fluid.parquet')

## Vasopressor

In [24]:
inputevents_angiotensin_ii = inputevents[inputevents['itemid'].isin([229709,229764])]

In [25]:
inputevents_epinephrine = inputevents[inputevents['itemid'].isin([221289])]
inputevents_dopamine = inputevents[inputevents['itemid'].isin([221662])]
inputevents_dobutamine = inputevents[inputevents['itemid'].isin([221653])]
inputevents_norephinephrine = inputevents[inputevents['itemid'].isin([221906])]
inputevents_phenylephrine = inputevents[inputevents['itemid'].isin([221749, 229630, 229632])]
inputevents_vasopressin = inputevents[inputevents['itemid'].isin([222315])]

In [26]:
# Norephinephrine
inputevents_norephinephrine.loc[inputevents_norephinephrine['rateuom']=='mg/kg/min','rate']= inputevents_norephinephrine['rate']*1e3
inputevents_norephinephrine = inputevents_norephinephrine[['subject_id','hadm_id','stay_id','starttime','endtime','rate']]

# Epinephrine
inputevents_epinephrine = inputevents_epinephrine[['subject_id','hadm_id','stay_id','starttime','endtime','rate']]

# Phenylephrine
inputevents_phenylephrine.loc[inputevents_phenylephrine['rateuom']=='mcg/min','rate'] = inputevents_phenylephrine['rate']/inputevents_phenylephrine['patientweight']
inputevents_phenylephrine = inputevents_phenylephrine[['subject_id','hadm_id','stay_id','starttime','endtime','rate']]

# Vasopressin
inputevents_vasopressin.loc[inputevents_vasopressin['rateuom']=='units/hour', 'rate'] = inputevents_vasopressin['rate']/60
inputevents_vasopressin = inputevents_vasopressin[['subject_id','hadm_id','stay_id','starttime','endtime','rate']]

# Dopamine
inputevents_dopamine = inputevents_dopamine[['subject_id','hadm_id','stay_id','starttime','endtime','rate']]

# Dobupamine
inputevents_dobutamine = inputevents_dobutamine[['subject_id','hadm_id','stay_id','starttime','endtime','rate']]

# angiotensin_ii
inputevents_angiotensin_ii.loc[inputevents_angiotensin_ii['rateuom']=='mcg/kg/min', 'rate'] = inputevents_angiotensin_ii['rate']*1e3
inputevents_angiotensin_ii = inputevents_angiotensin_ii[['subject_id','hadm_id','stay_id','starttime','endtime','rate']]

In [27]:
Vaso = ['epinephrine','dopamine','dobutamine','norephinephrine','phenylephrine','vasopressin','angiotensin_ii']
for i,idx in enumerate(tqdm(Vaso)):
    if not os.path.isfile('./data/resample/inputs/resample_%s.parquet'%idx):
        globals()['resample_{}'.format(idx)] = utils.resample_inputrates(icustays, globals()['inputevents_{}'.format(idx)],idx)
        globals()['resample_{}'.format(idx)].to_parquet('./data/resample/inputs/resample_%s.parquet'%idx)
    else : globals()['resample_{}'.format(idx)] = pd.read_parquet('./data/resample/inputs/resample_%s.parquet'%idx)

100%|██████████| 7/7 [00:01<00:00,  4.07it/s]


In [28]:
for i,idx in enumerate(tqdm(Vaso)):
    if i == 0 : resample_vasopressor = globals()['resample_{}'.format(idx)].copy()
    else : resample_vasopressor = pd.merge(resample_vasopressor,globals()['resample_{}'.format(idx)].copy(),on=['subject_id','hadm_id','stay_id','charttime'],how='left')

100%|██████████| 7/7 [00:19<00:00,  2.73s/it]


In [29]:
resample_vasopressor['vaso_equ'] = resample_vasopressor['norephinephrine'] + resample_vasopressor['epinephrine'] + 1/100*resample_vasopressor['dopamine'] + 0.06*resample_vasopressor['phenylephrine'] + 2.5*resample_vasopressor['vasopressin'] + 0.0025*resample_vasopressor['angiotensin_ii']
resample_vasopressor = resample_vasopressor[['subject_id','hadm_id','stay_id','charttime','vaso_equ']]

# Ventilator

In [30]:
if not os.path.isfile('./data/resample/procedures/resample_ventilation.parquet'):
    procedureevents_ventilation = procedureevents[procedureevents['itemid'].isin([225792,225794])]
    resample_ventilation = utils.resample_inputrates(icustays,procedureevents_ventilation,'ventilation')
    resample_ventilation.to_parquet('./data/resample_ventilation.parquet')
else : resample_ventilation = pd.read_parquet('./data/resample/procedures/resample_ventilation.parquet')

# RRT

In [31]:
if not os.path.isfile('./data/resample/procedures/resample_rrt.parquet'):
    procedureevents_rrt = procedureevents[procedureevents['itemid'].isin([
    225441, #Hemodialysis
    225802, #Dialysis - CRRT
    225803, #Dialysis - CVVHD
    225805, #Peritoneal Dialysis
    225809, #Dialysis - CVVHDF
    225955, #Dialysis - SCUF
    ])]
    resample_rrt = utils.resample_inputrates(icustays,procedureevents_rrt,'RRT')
    resample_rrt.to_parquet('./data/resample/procedures/resample_rrt.parquet')
else : resample_rrt = pd.read_parquet('./data/resample/procedures/resample_rrt.parquet')

# SOFA

In [32]:
# GCS
chartevents_GCS_eye = chartevents[chartevents['itemid'].isin([220739])]
chartevents_GCS_verval = chartevents[chartevents['itemid'].isin([223900])]
chartevents_GCS_motor = chartevents[chartevents['itemid'].isin([223901])]

GCS = ['GCS_eye','GCS_verval','GCS_motor']
for i,idx in enumerate(tqdm(GCS)):
    if not os.path.isfile('./data/resample/measures/resample_%s.parquet'%idx):
        print("[%i/%i] Resampling %s..."%(i+1,len(vitals.abbreviation.unique()),idx))
        globals()['resample_{}'.format(idx)] = resample_vitals(globals()['chartevents_{}'.format(idx)],icustays,idx)
        globals()['resample_{}'.format(idx)].to_parquet('./data/resample/resample_%s.parquet'%idx)
    else : globals()['resample_{}'.format(idx)] = pd.read_parquet('./data/resample/measures/resample_%s.parquet'%idx)

100%|██████████| 3/3 [00:01<00:00,  2.22it/s]


In [33]:
resample_GCS_eye = utils.resample_fill(resample_GCS_eye)
resample_GCS_verval = utils.resample_fill(resample_GCS_verval)
resample_GCS_motor = utils.resample_fill(resample_GCS_motor)

resample_GCS_eye.rename(columns={'valuenum':'GCS_eye'},inplace=True)
resample_GCS_verval.rename(columns={'valuenum':'GCS_verval'},inplace=True)
resample_GCS_motor.rename(columns={'valuenum':'GCS_motor'},inplace=True)

resample_GCS_eye.drop('itemid',axis=1,inplace=True)
resample_GCS_verval.drop('itemid',axis=1,inplace=True)
resample_GCS_motor.drop('itemid',axis=1,inplace=True)

resample_GCS = pd.merge(resample_GCS_eye,resample_GCS_verval,on=['subject_id','hadm_id','stay_id','charttime'],how='left')
resample_GCS = pd.merge(resample_GCS,resample_GCS_motor,on=['subject_id','hadm_id','stay_id','charttime'],how='left')

resample_GCS = resample_GCS[['subject_id','hadm_id','stay_id','charttime','GCS_eye','GCS_verval','GCS_motor']]
resample_GCS['GCS'] = resample_GCS['GCS_eye'] + resample_GCS['GCS_verval'] + resample_GCS['GCS_motor']

In [34]:
def resample_PF(resample_PaO2,resample_FiO2):
    resample_PaO2.rename(columns={'valuenum':'PaO2'},inplace=True)
    resample_FiO2.rename(columns={'valuenum':'FiO2'},inplace=True)

    df = pd.merge(resample_PaO2,resample_FiO2,on=['subject_id','hadm_id','stay_id','charttime'],how='left')
    df['PF'] = df['PaO2']/(df['FiO2']/100)
    df = df[['subject_id','hadm_id','stay_id','charttime','PF']]
    return df

resample_PF = resample_PF(resample_PaO2,resample_FiO2)

In [35]:
resample_uo_24hrs = resample_uo.copy()
rolling_avg=resample_uo_24hrs.groupby('stay_id').rolling(window='24H', on='charttime',min_periods=24)['uo'].sum().reset_index(drop=True)
resample_uo_24hrs['uo_day'] = rolling_avg

In [36]:
resample_SOFA = resample_GCS.copy()
for i in ['MAP','dopamine','dobutamine','epinephrine','norephinephrine','PF','Platelet','T_Bil','SCr','ventilation','uo_24hrs']:
    resample_SOFA = pd.merge(resample_SOFA,globals()['resample_{}'.format(i)],on=['subject_id','hadm_id','stay_id','charttime'],how='left')

In [37]:
def cal_SOFA(df):
    # Central nervous system
    #df.loc[df['GCS'].isna(),'SOFA_CNS'] = 0
    df.loc[df['GCS']==15,'SOFA_CNS'] = 0
    df.loc[(df['GCS']>=13)&(df['GCS']<=14),'SOFA_CNS'] = 1
    df.loc[(df['GCS']>=10)&(df['GCS']<=12),'SOFA_CNS'] = 2
    df.loc[(df['GCS']>=6)&(df['GCS']<=9),'SOFA_CNS'] = 3
    df.loc[(df['GCS']<6),'SOFA_CNS'] = 4

    # Cardiovascular system
    #df.loc[df['MAP'].isna(),'SOFA_CVS'] = 0
    df.loc[df['MAP']>=70,'SOFA_CVS'] = 0
    df.loc[df['MAP']<70,'SOFA_CVS'] = 1
    df.loc[((df['dopamine']<=5)&(df['dopamine']>0))|(df['dobutamine']>0),'SOFA_CVS'] = 2
    df.loc[(df['dopamine']>5)|((df['epinephrine']<=0.1)&(df['epinephrine']>0))|((df['norephinephrine']<=0.1)&(df['norephinephrine']>0)),'SOFA_CVS'] = 3
    df.loc[(df['dopamine']>15)|(df['epinephrine']>0.1)|(df['norephinephrine']>0.1),'SOFA_CVS'] = 4

    # Respiratory system
    #df.loc[df['PF'].isna(), 'SOFA_RS'] = 0
    df.loc[df['PF']>=400, 'SOFA_RS'] = 0
    df.loc[df['PF']<400, 'SOFA_RS'] = 1
    df.loc[df['PF']<300, 'SOFA_RS'] = 2
    df.loc[(df['PF']<200)&(df['ventilation']>0), 'SOFA_RS'] = 3
    df.loc[(df['PF']<100)&(df['ventilation']>0), 'SOFA_RS'] = 4

    # Coagulation
    #df.loc[df['Platelet'].isna(), 'SOFA_C'] = 0
    df.loc[df['Platelet']>=150, 'SOFA_C'] = 0
    df.loc[df['Platelet']<150, 'SOFA_C'] = 1
    df.loc[df['Platelet']<100, 'SOFA_C'] = 2
    df.loc[df['Platelet']<50, 'SOFA_C'] = 3
    df.loc[df['Platelet']<20, 'SOFA_C'] = 4

    # Liver
    #df.loc[df['T_Bil'].isna(), 'SOFA_L'] = 0
    df.loc[df['T_Bil']<1.2, 'SOFA_L'] = 0
    df.loc[(df['T_Bil']>=1.2)&(df['T_Bil']<2.0), 'SOFA_L'] = 1
    df.loc[(df['T_Bil']>=2.0)&(df['T_Bil']<6.0), 'SOFA_L'] = 2
    df.loc[(df['T_Bil']>=6.0)&(df['T_Bil']<12.0), 'SOFA_L'] = 3
    df.loc[(df['T_Bil']>=12.0), 'SOFA_L'] = 4

    # Renal function
    #df.loc[df['SCr'].isna(), 'SOFA_R'] = 0
    df.loc[df['SCr']<1.2, 'SOFA_R'] = 0
    df.loc[(df['SCr']>=1.2)&(df['SCr']<2.0), 'SOFA_R'] = 1
    df.loc[(df['SCr']>=2.0)&(df['SCr']<3.5), 'SOFA_R'] = 2
    df.loc[((df['SCr']>=3.5)&(df['SCr']<5.0))|(df['uo_day']<500), 'SOFA_R'] = 3
    df.loc[(df['SCr']>=5.0)|(df['uo_day']<200), 'SOFA_R'] = 4

    df['SOFA'] = df['SOFA_CNS'] + df['SOFA_CVS'] + df['SOFA_RS'] + df['SOFA_C'] + df['SOFA_L'] + df['SOFA_R']
    df = df[['subject_id','hadm_id','stay_id','charttime','SOFA']]
    return df

resample_SOFA = cal_SOFA(resample_SOFA)

In [38]:
resample_SOFA

Unnamed: 0,subject_id,hadm_id,stay_id,charttime,SOFA
0,10000032,29079034,39553978,2180-07-23 14:00:00,
1,10000032,29079034,39553978,2180-07-23 15:00:00,
2,10000032,29079034,39553978,2180-07-23 16:00:00,
3,10000032,29079034,39553978,2180-07-23 17:00:00,
4,10000032,29079034,39553978,2180-07-23 18:00:00,
...,...,...,...,...,...
6099429,19999987,23865745,36195440,2145-11-04 16:59:00,6.0
6099430,19999987,23865745,36195440,2145-11-04 17:59:00,6.0
6099431,19999987,23865745,36195440,2145-11-04 18:59:00,6.0
6099432,19999987,23865745,36195440,2145-11-04 19:59:00,6.0


# AKI annotation

## AKI_UO

In [49]:
resample_uo

Unnamed: 0,subject_id,hadm_id,stay_id,charttime,uo
0,10000032,29079034,39553978,2180-07-23 14:00:00,0.0
1,10000032,29079034,39553978,2180-07-23 15:00:00,175.0
2,10000032,29079034,39553978,2180-07-23 16:00:00,0.0
3,10000032,29079034,39553978,2180-07-23 17:00:00,0.0
4,10000032,29079034,39553978,2180-07-23 18:00:00,0.0
...,...,...,...,...,...
42,19999987,23865745,36195440,2145-11-04 16:59:00,0.0
43,19999987,23865745,36195440,2145-11-04 17:59:00,0.0
44,19999987,23865745,36195440,2145-11-04 18:59:00,0.0
45,19999987,23865745,36195440,2145-11-04 19:59:00,325.0


In [54]:
def AKI_UO_annotation(resample_uo,admission_weight):
    for i in tqdm(range(6,49,1)):
        rolling_avg=resample_uo.groupby('stay_id').rolling(window=str(i)+'H', on='charttime',min_periods=i)['uo'].mean().reset_index()
        rolling_avg.rename(columns={'uo':'roll_%iH' % i},inplace=True)
        resample_uo = pd.merge(resample_uo,rolling_avg,on=['stay_id','charttime'],how='left')
        df = pd.merge(resample_uo,admission_weight,on=['subject_id','stay_id'],how='left')
    df['6-12H_min'] = df[df.columns[6:13]].min(axis=1)/df['valuenum']
    df['12H-_min'] = df[df.columns[12:49]].min(axis=1)/df['valuenum']
    df['24H-_min'] = df[df.columns[24:49]].min(axis=1)/df['valuenum']
    df['AKI_UO']=0
    df.loc[df['6-12H_min']<0.5, 'AKI_UO'] = 1
    df.loc[df['12H-_min']<0.5, 'AKI_UO'] = 2
    df.loc[df['12H-_min']==0, 'AKI_UO'] = 3
    df.loc[df['24H-_min']<0.3, 'AKI_UO'] = 3
    df['presense_AKI_UO'] = 1
    df.loc[df['roll_6H'].isna(),'presense_AKI_UO']=0
    df = df[['subject_id','hadm_id','stay_id','charttime','AKI_UO','presense_AKI_UO']]
    return df

In [55]:
resample_AKI_UO = AKI_UO_annotation(resample_uo,admission_weight)
resample_AKI_UO.to_parquet('./data/resample/measures/resample_AKI_UO.parquet')
resample_AKI_UO.AKI_UO.value_counts()

  0%|          | 0/43 [00:00<?, ?it/s]

100%|██████████| 43/43 [07:11<00:00, 10.03s/it]


AKI_UO
0    3695199
2     972590
3     967805
1     463840
Name: count, dtype: int64

In [90]:
resample_AKI_UO

Unnamed: 0,subject_id,hadm_id,stay_id,charttime,AKI_UO,presense_AKI_UO
0,10000032,29079034,39553978,2180-07-23 14:00:00,0,0
1,10000032,29079034,39553978,2180-07-23 15:00:00,0,0
2,10000032,29079034,39553978,2180-07-23 16:00:00,0,0
3,10000032,29079034,39553978,2180-07-23 17:00:00,0,0
4,10000032,29079034,39553978,2180-07-23 18:00:00,0,0
...,...,...,...,...,...,...
6099429,19999987,23865745,36195440,2145-11-04 16:59:00,2,1
6099430,19999987,23865745,36195440,2145-11-04 17:59:00,2,1
6099431,19999987,23865745,36195440,2145-11-04 18:59:00,2,1
6099432,19999987,23865745,36195440,2145-11-04 19:59:00,1,1


In [None]:
if not os.path.isfile('./data/resample/measures/resample_AKI_UO.parquet'):
    resample_AKI_UO = utils.AKI_UO_annotation(resample_uo,admission_weight)
    resample_AKI_UO.to_parquet('./data/resample/measures/resample_AKI_UO.parquet')
else: resample_AKI_UO = pd.read_parquet('./data/resample/measures/resample_AKI_UO.parquet')
resample_AKI_UO.AKI_UO.value_counts()

## AKI_SCr

In [56]:
def AKI_SCr_annotation(resample_SCr,baseline_SCr,resample_rrt):
    df = resample_SCr.copy()
    df = pd.merge(df,baseline_SCr,on=['subject_id','hadm_id','stay_id'],how='left')
    df = pd.merge(df,resample_rrt,on=['subject_id','hadm_id','stay_id','charttime'],how='left')
    df.loc[df['presense_SCr']==0, 'SCr'] = np.nan
    SCr_48hrs_min = df.groupby('stay_id').rolling(window='48H', on='charttime')['SCr'].min().reset_index()
    SCr_48hrs_min.rename(columns={'SCr':'SCr_48hrs_min'},inplace=True)
    df = pd.merge(df,SCr_48hrs_min,on=['stay_id','charttime'],how='left')
    
    df['AKI_SCr'] = 0
    df.loc[((df['SCr']>=1.5*df['baseline_SCr'])&(df['SCr']<2.0*df['baseline_SCr']))|(df['SCr']>=0.3+df['SCr_48hrs_min']), 'AKI_SCr'] = 1
    df.loc[(df['SCr']>=2.0*df['baseline_SCr'])&(df['SCr']<3.0*df['baseline_SCr']), 'AKI_SCr'] = 2
    df.loc[(df['SCr']>=3.0*df['baseline_SCr'])|((df['SCr']>=0.3+df['SCr_48hrs_min'])&(df['SCr']>=4.0))|(df['RRT']>0), 'AKI_SCr'] = 3
    df['presense_AKI_SCr'] = df['presense_SCr']
    df = df[['subject_id','hadm_id','stay_id','charttime','AKI_SCr','presense_AKI_SCr']]    
    return df

In [92]:
resample_AKI_SCr.rename(columns={'presense_SCr':'presense_AKI_SCr'},inplace=True)

In [57]:
resample_AKI_SCr = AKI_SCr_annotation(resample_SCr,baseline_SCr,resample_rrt)
resample_AKI_SCr.AKI_SCr.value_counts()
resample_AKI_SCr.to_parquet('./data/resample/measures/resample_AKI_UO.parquet')

In [None]:
if not os.path.isfile('./data/resample/measures/resample_AKI_SCr.parquet'):
    resample_AKI_SCr = utils.AKI_SCr_annotation(resample_SCr,baseline_SCr,resample_rrt)
    resample_AKI_SCr.to_parquet('./data/resample/measures/resample_AKI_UO.parquet')
else: resample_AKI_SCr = pd.read_parquet('./data/resample/measures/resample_AKI_UO.parquet')
resample_AKI_SCr.AKI_SCr.value_counts()

# Merge

In [93]:
# merge vital
vitallist = ['HR','SBP','DBP','MAP','temp','RR','CVP','SaO2','FiO2']
for i,idx in enumerate(tqdm(vitallist)):
    print("Merge %s..."%idx)
    if i == 0 : resample_merge = globals()['resample_{}'.format(idx)].copy()
    else : resample_merge = pd.merge(resample_merge,globals()['resample_{}'.format(idx)],on=['subject_id','hadm_id','stay_id','charttime'],how='left')

#merge labvalues
for i,idx in enumerate(tqdm(labvalues.abbreviation)):
    print("Merge %s..."%idx)
    resample_merge = pd.merge(resample_merge,globals()['resample_{}'.format(idx)],on=['subject_id','hadm_id','stay_id','charttime'],how='left')


  0%|          | 0/9 [00:00<?, ?it/s]

Merge HR...


 11%|█         | 1/9 [00:00<00:02,  3.90it/s]

Merge SBP...


 22%|██▏       | 2/9 [00:03<00:14,  2.01s/it]

Merge DBP...


 33%|███▎      | 3/9 [00:06<00:14,  2.45s/it]

Merge MAP...


 44%|████▍     | 4/9 [00:09<00:13,  2.69s/it]

Merge temp...


 56%|█████▌    | 5/9 [00:12<00:11,  2.84s/it]

Merge RR...


 67%|██████▋   | 6/9 [00:15<00:08,  2.95s/it]

Merge CVP...


 78%|███████▊  | 7/9 [00:18<00:05,  2.95s/it]

Merge SaO2...


 89%|████████▉ | 8/9 [00:21<00:02,  2.97s/it]

Merge FiO2...


100%|██████████| 9/9 [00:24<00:00,  2.77s/it]
  0%|          | 0/30 [00:00<?, ?it/s]

Merge Alb...


  3%|▎         | 1/30 [00:03<01:49,  3.78s/it]

Merge Alk_Phos...


  7%|▋         | 2/30 [00:07<01:48,  3.89s/it]

Merge AG...


 10%|█         | 3/30 [00:11<01:45,  3.90s/it]

Merge BUN...


 13%|█▎        | 4/30 [00:15<01:42,  3.96s/it]

Merge Ca...


 17%|█▋        | 5/30 [00:19<01:39,  3.99s/it]

Merge CK...


 20%|██        | 6/30 [00:23<01:36,  4.01s/it]

Merge D_Bil...


 23%|██▎       | 7/30 [00:28<01:33,  4.08s/it]

Merge Glu...


 27%|██▋       | 8/30 [00:32<01:30,  4.10s/it]

Merge HCT...


 30%|███       | 9/30 [00:36<01:26,  4.10s/it]

Merge INR...


 33%|███▎      | 10/30 [00:40<01:22,  4.14s/it]

Merge PH...


 37%|███▋      | 11/30 [00:44<01:18,  4.14s/it]

Merge PHOS...


 40%|████      | 12/30 [00:48<01:14,  4.13s/it]

Merge Platelet...


 43%|████▎     | 13/30 [00:52<01:10,  4.15s/it]

Merge Cl...


 47%|████▋     | 14/30 [00:57<01:06,  4.14s/it]

Merge SCr...


 50%|█████     | 15/30 [01:01<01:02,  4.16s/it]

Merge Na...


 53%|█████▎    | 16/30 [01:05<00:58,  4.18s/it]

Merge Potassium...


 57%|█████▋    | 17/30 [01:09<00:54,  4.19s/it]

Merge T_Bil...


 60%|██████    | 18/30 [01:14<00:50,  4.25s/it]

Merge WBC...


 63%|██████▎   | 19/30 [01:18<00:46,  4.24s/it]

Merge Gl...


 67%|██████▋   | 20/30 [01:22<00:42,  4.26s/it]

Merge Mg...


 70%|███████   | 21/30 [01:26<00:38,  4.27s/it]

Merge Ca_ion...


 73%|███████▎  | 22/30 [01:30<00:33,  4.20s/it]

Merge HCO3...


 77%|███████▋  | 23/30 [01:34<00:29,  4.15s/it]

Merge AST...


 80%|████████  | 24/30 [01:39<00:24,  4.13s/it]

Merge ALT...


 83%|████████▎ | 25/30 [01:43<00:20,  4.16s/it]

Merge PTT...


 87%|████████▋ | 26/30 [01:47<00:16,  4.16s/it]

Merge baseexcess...


 90%|█████████ | 27/30 [01:51<00:12,  4.20s/it]

Merge lactate...


 93%|█████████▎| 28/30 [01:55<00:08,  4.20s/it]

Merge PaO2...


 97%|█████████▋| 29/30 [02:00<00:04,  4.22s/it]

Merge PaCO2...


100%|██████████| 30/30 [02:04<00:00,  4.15s/it]


In [94]:
# merge demographic
resample_merge = pd.merge(resample_merge,patients_gender,on=['subject_id'],how='left')
resample_merge = pd.merge(resample_merge,admissions_race,on=['subject_id'],how='left')
resample_merge = pd.merge(resample_merge,admission_weight,on=['subject_id','stay_id'],how='left')
resample_merge = pd.merge(resample_merge,icustays_age,on=['subject_id','hadm_id','stay_id'],how='left')
resample_merge = pd.merge(resample_merge,comorbidities,on=['subject_id','hadm_id','stay_id'],how='left')
resample_merge = pd.merge(resample_merge,baseline_SCr,on=['subject_id','hadm_id','stay_id'],how='left')


In [95]:
# merge measures
resample_merge = pd.merge(resample_merge,resample_uo,on=['subject_id','hadm_id','stay_id','charttime'],how='left')
resample_merge = pd.merge(resample_merge,resample_SOFA,on=['subject_id','hadm_id','stay_id','charttime'],how='left')
resample_merge = pd.merge(resample_merge,resample_AKI_UO,on=['subject_id','hadm_id','stay_id','charttime'],how='left')
resample_merge = pd.merge(resample_merge,resample_AKI_SCr,on=['subject_id','hadm_id','stay_id','charttime'],how='left')
resample_merge['AKI'] = resample_merge[['AKI_UO','AKI_SCr']].max(axis=1)
resample_merge['presense_AKI'] = 0
resample_merge.loc[(resample_merge['presense_AKI_UO']==1)|(resample_merge['presense_AKI_SCr']==1),'presense_AKI'] = 1

In [96]:
# merge procedures & inputs
resample_merge = pd.merge(resample_merge,resample_ventilation,on=['subject_id','hadm_id','stay_id','charttime'],how='left')
resample_merge = pd.merge(resample_merge,resample_fluids,on=['subject_id','hadm_id','stay_id','charttime'],how='left')
resample_merge = pd.merge(resample_merge,resample_vasopressor,on=['subject_id','hadm_id','stay_id','charttime'],how='left')
resample_antibiotics.rename(columns={'starttime':'charttime'},inplace=True)
resample_merge = pd.merge(resample_merge,resample_antibiotics,on=['stay_id','charttime'],how='left')

In [97]:
resample_merge[['Cephalosporins', 'Vancomycin', 'Betalactam_comb', 'Metronidazole',
       'Carbapenems', 'Penicillins', 'Fluoroquinolones', 'Others',
       'category_count']] = resample_merge[['Cephalosporins', 'Vancomycin', 'Betalactam_comb', 'Metronidazole',
       'Carbapenems', 'Penicillins', 'Fluoroquinolones', 'Others',
       'category_count']].fillna(0)

In [98]:
# Derived variables
resample_merge['SCr/baseline_SCr'] = resample_merge['SCr'] / resample_merge['baseline_SCr']
resample_merge['delta_SCr'] = resample_merge['SCr'] - resample_merge['baseline_SCr']
resample_merge['BUN/SCr'] = resample_merge['BUN'] / resample_merge['SCr']

resample_merge.rename(columns={'valuenum':'weight'},inplace=True)

In [99]:
len(resample_merge.columns)

126

In [89]:
resample_merge.to_parquet('./data/resample/resample_merge.parquet')

In [None]:
if not os.path.isfile('./data/resample/resample_merge.parquet'):
    resample_merge.to_parquet('./data/resample/resample_merge.parquet')
else : resample_merge = pd.read_parquet('./data/resample/resample_merge.parquet')

# Dead

In [63]:
for i in [x for x in admissions.columns if 'time' in x]:
    admissions[i] = pd.to_datetime(admissions[i])

In [64]:
patients['dod'] = pd.to_datetime(patients['dod'])

In [65]:
admissions_disch_dead = admissions[admissions['discharge_location']=='DIED'][['subject_id','hadm_id','dischtime']]
admissions_deathtime_dead = admissions[~admissions['deathtime'].isna()][['subject_id','hadm_id','deathtime']]
patients_dead = patients[~patients['dod'].isna()][['subject_id','dod']]

In [66]:
def cal_dead(admissions, patients, icustays):
    for i in [x for x in admissions.columns if 'time' in x]:
        admissions[i] = pd.to_datetime(admissions[i])
    patients['dod'] = pd.to_datetime(patients['dod'])

    admissions_disch_dead = admissions[admissions['discharge_location']=='DIED'][['subject_id','hadm_id','dischtime']]
    admissions_deathtime_dead = admissions[~admissions['deathtime'].isna()][['subject_id','hadm_id','deathtime']]
    patients_dead = patients[~patients['dod'].isna()][['subject_id','dod']]

    tmp = pd.merge(patients,admissions_disch_dead,on=['subject_id'],how='left')
    tmp = pd.merge(tmp,admissions_deathtime_dead,on='subject_id',how='left')

    tmp = tmp[['subject_id','dod','dischtime','deathtime']]
    tmp = pd.merge(tmp,icustays[['subject_id','stay_id','outtime']],on='subject_id',how='left')
    tmp = tmp[~tmp['stay_id'].isna()][['subject_id','stay_id','dod','dischtime','deathtime','outtime']]

    tmp['deadtime'] = np.where(
    pd.notnull(tmp['dischtime']) & pd.notnull(tmp['deathtime']),
    np.minimum(tmp['dischtime'], tmp['deathtime']),
    np.where(pd.notnull(tmp['dischtime']), tmp['dischtime'],
             np.where(pd.notnull(tmp['deathtime']), tmp['deathtime'], tmp['dod']))
    )

    # dead 플래그 계산
    tmp['dead'] = np.where(tmp['deadtime'] <= tmp['outtime'], 1, 0)
    return tmp[['subject_id','stay_id','deadtime','dead']]

In [67]:
dead = cal_dead(admissions, patients, icustays)

In [68]:
dead

Unnamed: 0,subject_id,stay_id,deadtime,dead
0,10000032,39553978.0,2180-09-09 00:00:00,0
33,10000980,39765666.0,2193-08-26 00:00:00,0
40,10001217,37067082.0,NaT,0
41,10001217,34592300.0,NaT,0
54,10001725,31205490.0,NaT,0
...,...,...,...,...
321976,19999442,32336619.0,NaT,0
321980,19999625,31070865.0,NaT,0
321987,19999828,36075953.0,NaT,0
321989,19999840,38978960.0,2164-09-17 13:42:00,1


In [69]:
def resample_dead(icustays, inputevents):
    resampled_data = []
    name = 'dead'
    # inputevents에 있는 모든 stay_id를 미리 확인
    inputevents_stay_ids = set(inputevents[~inputevents['deadtime'].isna()].subject_id.unique())

    for stay_id in tqdm(icustays.stay_id.unique()):
        intime = icustays[icustays['stay_id']==stay_id].intime.values[0]
        outtime = icustays[icustays['stay_id']==stay_id].outtime.values[0]
        subject_id = icustays[icustays['stay_id']==stay_id].subject_id.values[0]
        hadm_id = icustays[icustays['stay_id']==stay_id].hadm_id.values[0]

        # 1시간 단위로 시간대 생성
        time_range = pd.date_range(start=intime, end=outtime, freq='H')

        # inputevents에 stay_id가 있는지 확인
        if subject_id in inputevents_stay_ids:
            # 해당 stay_id의 inputevents 데이터 필터링
            stay_inputevents = inputevents[inputevents['subject_id'] == subject_id]

            for timestamp in time_range:
                end_time = timestamp + pd.Timedelta(hours=1)

                # 해당 시간대에 해당하는 inputevents의 rate 합계 계산
                rates = stay_inputevents[(stay_inputevents['deadtime'] <= timestamp)]['dead']
                total_rate = rates.sum() if not rates.empty else 0

                resampled_data.append({'subject_id':subject_id,'hadm_id':hadm_id,'stay_id': stay_id, 'charttime': timestamp, name: total_rate})
        else:
            # inputevents에 stay_id가 없는 경우, 모든 rate를 0으로 설정
            for timestamp in time_range:
                resampled_data.append({'subject_id':subject_id,'hadm_id':hadm_id,'stay_id': stay_id, 'charttime': timestamp, name: 0})
    resampled_data = pd.DataFrame(resampled_data)
    return pd.DataFrame(resampled_data)

In [70]:
resample_dead = resample_dead(icustays,dead)

100%|██████████| 73181/73181 [13:09<00:00, 92.65it/s] 


In [100]:
resample_merge = pd.merge(resample_merge,resample_dead,on=['subject_id','hadm_id','stay_id','charttime'],how='left')
resample_merge['AKI_stage3'] = 0
resample_merge.loc[resample_merge['AKI'] == 3, 'AKI_stage3'] = 1

In [101]:
len(resample_merge.columns)

128

In [None]:
resample_merge.to_parquet('./data/resample/resample_label.parquet')

In [None]:
resample_merge.columns

# Antibiotics

In [73]:
d_antibiotics = d_items[(d_items['linksto']=='inputevents')&(d_items['category']=='Antibiotics')]
d_antibiotics = d_antibiotics[['itemid','label']]
d_antibiotics.reset_index(inplace=True)

# non-Antibiotics 제거
d_antibiotics = d_antibiotics[~d_antibiotics['itemid'].isin([
    225898,225877,225895,225868,225869,225885,225905,225838,
    225848,225844,225896,225837,225873,228003,225871,225903])]
d_antibiotics.dropna(subset='itemid',inplace=True)
d_antibiotics.reset_index(drop=True,inplace=True)

In [74]:
inputevents_antibiotics = inputevents[inputevents['itemid'].isin(d_antibiotics.itemid.unique())]
inputevents_antibiotics['count'] = 1

inputevents_antibiotics = inputevents_antibiotics[['stay_id','starttime','itemid','count']]
inputevents_antibiotics['starttime'] = pd.to_datetime(inputevents_antibiotics['starttime'])

inputevents_antibiotics.sort_values(by=['stay_id','starttime'],ascending=True,inplace=True)
inputevents_antibiotics.reset_index(drop=True,inplace=True)

d_antibiotics = pd.merge(d_antibiotics,inputevents_antibiotics.groupby('itemid')['count'].sum(), on='itemid', how='left')
d_antibiotics.dropna(subset='count',inplace=True)
d_antibiotics.sort_values(by='label',inplace=True)

In [75]:
len(d_antibiotics)

33

In [76]:
def resample_multitherapy(multi,icustays_intime,icustays_outtime,resample_time):
    multi_resample = []
    for i in tqdm(multi.stay_id.unique()):
        tmp = multi[multi['stay_id']==i]
        tmp_intime = icustays_intime[icustays_intime['stay_id']==i][['stay_id','starttime']]
        tmp_outtime = icustays_outtime[icustays_outtime['stay_id']==i][['stay_id','starttime']]
        tmp = pd.concat([tmp, tmp_intime, tmp_outtime])
        tmp.fillna(0, inplace=True)
        tmp = tmp[(tmp['starttime'].values >= tmp_intime.starttime.values)&(tmp['starttime'].values <= tmp_outtime.starttime.values)]
        tmp.index = pd.DatetimeIndex(tmp['starttime'])
        tmp = tmp[tmp.columns[2:]].resample(resample_time,origin='start').sum()
        tmp.reset_index(drop=False,inplace=True)
        tmp['stay_id'] = i
        multi_resample.append(tmp)
    multi_resample = pd.concat(multi_resample)
    return multi_resample

In [77]:
all = [225798,225840,225842,225843,225845,225850,225851,225853,
       225855,225859,225860,225862,225863,225865,225866,225875,
       225876,225879,225881,225883,225884,225886,225888,225890,
       225892,225893,225899,225902,227691,229061,229064,229587]

Cephalosporins = [225851,225850,225855,225853,229587,227691]
Vancomycin = [225798]
Betalactam_comb = [225893,225843]
Metronidazole = [225884]
Carbapenems = [225883,225876,229061]
Penicillins = [225888,225892,225842,225890]
Fluoroquinolones = [225859,225879,225886]
Others = [225875,225902,225840,225847,225860,225845,
          225866,225881,225863,225862,225899,225865,229064]

category = ['Cephalosporins','Vancomycin','Betalactam_comb','Metronidazole',
              'Carbapenems','Penicillins','Fluoroquinolones','Others']

In [78]:
icustays_intime = icustays[['subject_id','hadm_id','stay_id','intime']]
icustays_intime['intime'] = pd.to_datetime(icustays_intime['intime'])
icustays_intime = icustays_intime.rename(columns={'intime':'starttime'})

icustays_outtime = icustays[['subject_id','hadm_id','stay_id','outtime']]
icustays_outtime['outtime'] = pd.to_datetime(icustays_outtime['outtime'])
icustays_outtime = icustays_outtime.rename(columns={'outtime':'starttime'})

In [79]:
def multitherapy(inputevents_antibiotics, time=5):
    AB_multitherapy = pd.concat([inputevents_antibiotics,inputevents_antibiotics.shift(1)],axis=1)
    AB_multitherapy.columns = ['stay_id_x','starttime_x','itemid_x','count_x','stay_id_y','starttime_y','itemid_y','count_y']
    AB_multitherapy['timediff'] = AB_multitherapy['starttime_x'] - AB_multitherapy['starttime_y']
    AB_multitherapy['timediff'] = AB_multitherapy.timediff.dt.total_seconds()/60

    AB_multitherapy['multi'] = 0
    AB_multitherapy.loc[(AB_multitherapy['timediff']<time)&(AB_multitherapy['stay_id_x']==AB_multitherapy['stay_id_y']), 'multi'] = 1
    AB_multitherapy.loc[AB_multitherapy['multi']==1, 'starttime_x'] = np.nan

    AB_multitherapy = AB_multitherapy.ffill()
    AB_multitherapy = AB_multitherapy[['stay_id_x','starttime_x','itemid_x','count_x']]

    AB_multitherapy.drop_duplicates(inplace=True)
    AB_multitherapy_pivot = AB_multitherapy.pivot(index=['stay_id_x','starttime_x'],columns='itemid_x',values='count_x')
    AB_multitherapy_pivot.reset_index(drop=False,inplace=True)
    AB_multitherapy_pivot.fillna(0,inplace=True)
    AB_multitherapy_pivot = AB_multitherapy_pivot.rename(columns={'stay_id_x':'stay_id', 'starttime_x':'starttime'})
    AB_multitherapy_pivot.columns.name = None

    AB_multitherapy_pivot['count'] = AB_multitherapy_pivot[AB_multitherapy_pivot.columns[2:]].sum(axis=1)
    return AB_multitherapy_pivot

antibiotics_multi = multitherapy(inputevents_antibiotics)

In [80]:
resample_antibiotics = resample_multitherapy(antibiotics_multi,icustays_intime,icustays_outtime,resample_time='H')

100%|██████████| 44426/44426 [02:58<00:00, 249.40it/s]


In [81]:
for i in category:
    resample_antibiotics[i] = resample_antibiotics[globals()['{}'.format(i)]].sum(axis=1)
    resample_antibiotics.loc[resample_antibiotics[i] > 0, i] = 1
    resample_antibiotics.reset_index(inplace=True,drop=True)

In [82]:
resample_antibiotics['category_count'] = resample_antibiotics[category].sum(axis=1)

In [83]:
resample_antibiotics = resample_antibiotics[['stay_id','starttime','Cephalosporins','Vancomycin','Betalactam_comb',
                                     'Metronidazole','Carbapenems','Penicillins','Fluoroquinolones','Others','category_count']]