# Tabularization Procedure
In this notebook, we tabularize our data so that it is suitable for applying ML techniques, including DL.

In [1]:
import pandas as pd
from warnings import simplefilter
from tqdm import tqdm
import datetime
import numpy as np
simplefilter(action="ignore", category=pd.errors.DtypeWarning)
pd.set_option('display.max_columns', None)

import multiprocessing as mp
from functools import partial
from sepsis_preprocessing import tabularize
import pickle

## Tabularization

### pilot tabularization

In [5]:
# Pilot run: tabularize one admission's data and preview the first rows.
with open('sepsis_condata_new_0.pkl', 'rb') as pilot_file:
    concat_data = pickle.load(pilot_file)

# Width of one tabularized time step.
step_length = datetime.timedelta(hours=1)

# Item names grouped by kind.
demo_list = ['height', 'weight']
vital_list = [
    'heart_rate', 'resp_rate', 'temperature', 'sbp', 'dbp', 'map',
    'cvp',  # cvp added
    'paco2', 'pao2', 'fio2',
]
lab_list = [
    'gcs', 'bilirubin', 'platelets', 'creatinine', 'lactate', 'bun',
    'arterial_ph', 'wbc', 'hemoglobin', 'hematocrit', 'potassium', 'sodium',
]
vaso_list = ['epinephrine', 'dopamine', 'dobutamine', 'norepinephrine', 'phenylephrine', 'vasopressin']
fluid_list = ['fluid']
urine_list = ['urine_output']
vent_list = ['ventilator']
SOFA_list = ['CNS_SOFA', 'CARDIO_SOFA', 'RESP_SOFA', 'COAG_SOFA', 'LIVER_SOFA', 'RENAL_SOFA']
label_list = ['SEPSIS', 'SHOCK']

# One entry per item group, in the order tabularize receives them after 'ct' and 'stet'.
feature_groups = {
    'demo': demo_list,
    'vital': vital_list,
    'lab': lab_list,
    'urine': urine_list,
    'vaso': vaso_list,
    'fluid': fluid_list,
    'vent': vent_list,
}
# 'ct' and 'stet' are combined groups; how tabularize treats them is defined in
# sepsis_preprocessing (not visible here).
item_name_dict = {
    'ct': demo_list + vital_list + lab_list + urine_list,
    'stet': fluid_list + vaso_list + vent_list,
    **feature_groups,
}

# Wrap the single admission in a one-entry dict keyed by its own 'hospadm_id',
# which is the shape tabularize is called with for the full data set.
pilot_hadmid = concat_data['hospadm_id']
tmp_dict = {pilot_hadmid: concat_data}
tabularize(
    hadmid=pilot_hadmid,
    total_dict=tmp_dict,
    step_length=step_length,
    item_dict=item_name_dict,
).head()

Unnamed: 0,hadm_id,seq_num,seq_ST,seq_ET,age,gender,race,height,height_value,height_median,height_max,height_min,height_presence,weight,weight_value,weight_median,weight_max,weight_min,weight_presence,heart_rate,heart_rate_value,heart_rate_median,heart_rate_max,heart_rate_min,heart_rate_presence,resp_rate,resp_rate_value,resp_rate_median,resp_rate_max,resp_rate_min,resp_rate_presence,temperature,temperature_value,temperature_median,temperature_max,temperature_min,temperature_presence,sbp,sbp_value,sbp_median,sbp_max,sbp_min,sbp_presence,dbp,dbp_value,dbp_median,dbp_max,dbp_min,dbp_presence,map,map_value,map_median,map_max,map_min,map_presence,cvp,cvp_value,cvp_median,cvp_max,cvp_min,cvp_presence,paco2,paco2_value,paco2_median,paco2_max,paco2_min,paco2_presence,pao2,pao2_value,pao2_median,pao2_max,pao2_min,pao2_presence,fio2,fio2_value,fio2_median,fio2_max,fio2_min,fio2_presence,gcs,gcs_value,gcs_median,gcs_max,gcs_min,gcs_presence,bilirubin,bilirubin_value,bilirubin_median,bilirubin_max,bilirubin_min,bilirubin_presence,platelets,platelets_value,platelets_median,platelets_max,platelets_min,platelets_presence,creatinine,creatinine_value,creatinine_median,creatinine_max,creatinine_min,creatinine_presence,lactate,lactate_value,lactate_median,lactate_max,lactate_min,lactate_presence,bun,bun_value,bun_median,bun_max,bun_min,bun_presence,arterial_ph,arterial_ph_value,arterial_ph_median,arterial_ph_max,arterial_ph_min,arterial_ph_presence,wbc,wbc_value,wbc_median,wbc_max,wbc_min,wbc_presence,hemoglobin,hemoglobin_value,hemoglobin_median,hemoglobin_max,hemoglobin_min,hemoglobin_presence,hematocrit,hematocrit_value,hematocrit_median,hematocrit_max,hematocrit_min,hematocrit_presence,potassium,potassium_value,potassium_median,potassium_max,potassium_min,potassium_presence,sodium,sodium_value,sodium_median,sodium_max,sodium_min,sodium_presence,epinephrine_value,epinephrine_rate,dopamine_value,dopamine_rate,dobutamine_value,dobutamine_rate,norepinephrine_value,norepinephrine_rate
,phenylephrine_value,phenylephrine_rate,vasopressin_value,vasopressin_rate,fluid_value,urine_output,urine_output_value,ventilator_value,bmi,CNS_SOFA,CARDIO_SOFA,vaso_presence,RESP_SOFA,COAG_SOFA,LIVER_SOFA,RENAL_SOFA,SOFA,SEPSIS,SHOCK
0,26184834,0,2131-01-07 23:59:59,2131-01-08 00:59:59,77.024658,F,BLACK/AFRICAN AMERICAN,[157.48],157.48,157.48,157.48,157.48,1,[68.68934240362812],68.689342,68.689342,68.689342,68.689342,1,[38.0],38.0,38.0,38.0,38.0,0,"[20.0, 16.0]",18.0,18.0,20.0,16.0,0,"[36.1, 35.4]",35.75,35.75,36.1,35.4,0,"[180.0, 180.0]",180.0,180.0,180.0,180.0,0,"[12.0, 12.0]",12.0,12.0,12.0,12.0,0,"[68.0, 68.0]",68.0,68.0,68.0,68.0,0,0.0,0.0,0.0,0.0,0.0,0,[60.0],60.0,60.0,60.0,60.0,0,[65.0],65.0,65.0,65.0,65.0,0,[100.0],100.0,100.0,100.0,100.0,1,[3.0],3.0,3.0,3.0,3.0,0,[0.2],0.2,0.2,0.2,0.2,0,[195.0],195.0,195.0,195.0,195.0,0,[1.1],1.1,1.1,1.1,1.1,0,[4.0],4.0,4.0,4.0,4.0,0,[16.0],16.0,16.0,16.0,16.0,0,[7.42],7.42,7.42,7.42,7.42,0,[6.6],6.6,6.6,6.6,6.6,0,[11.4],11.4,11.4,11.4,11.4,0,[35.6],35.6,35.6,35.6,35.6,0,[3.6],3.6,3.6,3.6,3.6,0,[135.0],135.0,135.0,135.0,135.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,27.697371,4,1,0,0,0,0,0,5,1,0
1,26184834,1,2131-01-08 00:59:59,2131-01-08 01:59:59,77.024772,F,BLACK/AFRICAN AMERICAN,"[157.48, 157.0, 157.0, 157.0, 157.0, 157.0, 15...",157.474386,157.473684,157.48,157.473684,0,"[65.00803775382872, 65.00803775382872, 65.0, 6...",68.640822,68.640798,68.640904,68.640798,0,[38.0],38.0,38.0,38.0,38.0,0,"[20.0, 16.0]",18.0,18.0,20.0,16.0,0,"[36.1, 35.4]",35.75,35.75,36.1,35.4,0,"[180.0, 180.0]",180.0,180.0,180.0,180.0,0,"[12.0, 12.0]",12.0,12.0,12.0,12.0,0,"[68.0, 68.0]",68.0,68.0,68.0,68.0,0,0.0,0.0,0.0,0.0,0.0,0,[60.0],60.0,60.0,60.0,60.0,0,[65.0],65.0,65.0,65.0,65.0,0,[100.0],100.0,100.0,100.0,100.0,0,[3.0],3.0,3.0,3.0,3.0,0,[0.2],0.2,0.2,0.2,0.2,0,[195.0],195.0,195.0,195.0,195.0,0,[1.1],1.1,1.1,1.1,1.1,0,[4.0],4.0,4.0,4.0,4.0,0,[16.0],16.0,16.0,16.0,16.0,0,[7.42],7.42,7.42,7.42,7.42,0,[6.6],6.6,6.6,6.6,6.6,0,[11.4],11.4,11.4,11.4,11.4,0,[35.6],35.6,35.6,35.6,35.6,0,[3.6],3.6,3.6,3.6,3.6,0,[135.0],135.0,135.0,135.0,135.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,27.67978,4,1,0,0,0,0,0,5,1,0
2,26184834,2,2131-01-08 01:59:59,2131-01-08 02:59:59,77.024886,F,BLACK/AFRICAN AMERICAN,"[157.48, 157.0, 157.0, 157.0, 157.0, 157.0, 15...",157.468772,157.467368,157.48,157.467368,0,"[65.00803775382872, 65.00803775382872, 65.0, 6...",68.592301,68.592254,68.592466,68.592254,0,[38.0],38.0,38.0,38.0,38.0,0,"[20.0, 16.0]",18.0,18.0,20.0,16.0,0,"[36.1, 35.4]",35.75,35.75,36.1,35.4,0,"[180.0, 180.0]",180.0,180.0,180.0,180.0,0,"[12.0, 12.0]",12.0,12.0,12.0,12.0,0,"[68.0, 68.0]",68.0,68.0,68.0,68.0,0,0.0,0.0,0.0,0.0,0.0,0,[60.0],60.0,60.0,60.0,60.0,0,[65.0],65.0,65.0,65.0,65.0,0,[100.0],100.0,100.0,100.0,100.0,0,[3.0],3.0,3.0,3.0,3.0,0,[0.2],0.2,0.2,0.2,0.2,0,[195.0],195.0,195.0,195.0,195.0,0,[1.1],1.1,1.1,1.1,1.1,0,[4.0],4.0,4.0,4.0,4.0,0,[16.0],16.0,16.0,16.0,16.0,0,[7.42],7.42,7.42,7.42,7.42,0,[6.6],6.6,6.6,6.6,6.6,0,[11.4],11.4,11.4,11.4,11.4,0,[35.6],35.6,35.6,35.6,35.6,0,[3.6],3.6,3.6,3.6,3.6,0,[135.0],135.0,135.0,135.0,135.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,27.662186,4,1,0,0,0,0,0,5,1,0
3,26184834,3,2131-01-08 02:59:59,2131-01-08 03:59:59,77.025,F,BLACK/AFRICAN AMERICAN,"[157.48, 157.0, 157.0, 157.0, 157.0, 157.0, 15...",157.463158,157.461053,157.48,157.461053,0,"[65.00803775382872, 65.00803775382872, 65.0, 6...",68.543781,68.54371,68.544028,68.54371,0,[38.0],38.0,38.0,38.0,38.0,0,"[20.0, 16.0]",18.0,18.0,20.0,16.0,0,"[36.1, 35.4]",35.75,35.75,36.1,35.4,0,"[180.0, 180.0]",180.0,180.0,180.0,180.0,0,"[12.0, 12.0]",12.0,12.0,12.0,12.0,0,"[68.0, 68.0]",68.0,68.0,68.0,68.0,0,0.0,0.0,0.0,0.0,0.0,0,[60.0],60.0,60.0,60.0,60.0,0,[65.0],65.0,65.0,65.0,65.0,0,[100.0],100.0,100.0,100.0,100.0,0,[3.0],3.0,3.0,3.0,3.0,0,[0.2],0.2,0.2,0.2,0.2,0,[195.0],195.0,195.0,195.0,195.0,0,[1.1],1.1,1.1,1.1,1.1,0,[4.0],4.0,4.0,4.0,4.0,0,[16.0],16.0,16.0,16.0,16.0,0,[7.42],7.42,7.42,7.42,7.42,0,[6.6],6.6,6.6,6.6,6.6,0,[11.4],11.4,11.4,11.4,11.4,0,[35.6],35.6,35.6,35.6,35.6,0,[3.6],3.6,3.6,3.6,3.6,0,[135.0],135.0,135.0,135.0,135.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,27.644589,4,1,0,0,0,0,0,5,1,0
4,26184834,4,2131-01-08 03:59:59,2131-01-08 04:59:59,77.025114,F,BLACK/AFRICAN AMERICAN,"[157.48, 157.0, 157.0, 157.0, 157.0, 157.0, 15...",157.457544,157.454737,157.48,157.454737,0,"[65.00803775382872, 65.00803775382872, 65.0, 6...",68.49526,68.495166,68.49559,68.495166,0,[38.0],38.0,38.0,38.0,38.0,0,"[20.0, 16.0]",18.0,18.0,20.0,16.0,0,"[36.1, 35.4]",35.75,35.75,36.1,35.4,0,"[180.0, 180.0]",180.0,180.0,180.0,180.0,0,"[12.0, 12.0]",12.0,12.0,12.0,12.0,0,"[68.0, 68.0]",68.0,68.0,68.0,68.0,0,0.0,0.0,0.0,0.0,0.0,0,[60.0],60.0,60.0,60.0,60.0,0,[65.0],65.0,65.0,65.0,65.0,0,[100.0],100.0,100.0,100.0,100.0,0,[3.0],3.0,3.0,3.0,3.0,0,[0.2],0.2,0.2,0.2,0.2,0,[195.0],195.0,195.0,195.0,195.0,0,[1.1],1.1,1.1,1.1,1.1,0,[4.0],4.0,4.0,4.0,4.0,0,[16.0],16.0,16.0,16.0,16.0,0,[7.42],7.42,7.42,7.42,7.42,0,[6.6],6.6,6.6,6.6,6.6,0,[11.4],11.4,11.4,11.4,11.4,0,[35.6],35.6,35.6,35.6,35.6,0,[3.6],3.6,3.6,3.6,3.6,0,[135.0],135.0,135.0,135.0,135.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,27.62699,4,1,0,0,0,0,0,5,1,0


### full tabularization with or without multi-processing

In [6]:
# Load the ICU-stay table, keep only rows flagged cohort_stays == 1, and collect
# the distinct admission ids to tabularize.
# NOTE(review): the saved output of this cell is a FileNotFoundError for this path;
# the version-2 section reads 'icustays_sepshock.csv' from the working directory instead.
icustays = (
    pd.read_csv('processed_data/sepsis/icustays_sepshock.csv')
    .loc[lambda stays: stays['cohort_stays'] == 1]
    .reset_index(drop=True)
)
hadmids = icustays['hadm_id'].unique()

FileNotFoundError: [Errno 2] No such file or directory: 'processed_data/sepsis/icustays_sepshock.csv'

In [None]:
# Load the concatenated data that is passed to tabularize as `total_dict`.
# pickle.load executes arbitrary code from the file; only load pickles produced by
# this project's own preprocessing.
with open('sepsis_condata_new.pkl', 'rb') as f:
    concat_data = pickle.load(f)

# Disabled snippet kept for reference: list the keys of the first admission's entry,
# skipping the first key. It used to live in a bare triple-quoted string, which is
# evaluated and echoed as the cell's output; plain comments avoid that.
# item_list = list(concat_data[hadmids[0]].keys())
# item_list = item_list[1:len(item_list)]

In [None]:
# Sanity check for the first chunk of admissions: print the total number of ids,
# the chunk size, and any ids the chunk shares with the remainder (hadmids[4999:]).
tmp = []
s_idx, e_idx = 0, 4999
tmp_id_list = hadmids[s_idx:e_idx]
overlap_with_rest = set(tmp_id_list).intersection(set(hadmids[4999:]))
print(f'{len(hadmids)}, {len(tmp_id_list)}, {overlap_with_rest}')

In [None]:
# without multiprocessing
# Tabularize the first chunk of admissions one by one; results accumulate in `tmp`
# in the same order as `tmp_id_list`.
tmp = []
s_idx, e_idx = 0, 4999
tmp_id_list = hadmids[s_idx:e_idx]
# Bind the arguments that are identical for every admission once.
tabularize_adm = partial(
    tabularize,
    total_dict=concat_data,
    step_length=step_length,
    item_dict=item_name_dict,
)
for adm in tqdm(tmp_id_list):
    tmp.append(tabularize_adm(hadmid=adm))

In [None]:
# Key each admission's table by the hadm_id found at index label 0 of its 'hadm_id' column.
tabularized_data = {}
for adm_table in tqdm(tmp):
    tabularized_data[adm_table.hadm_id[0]] = adm_table

# The file name uses a 1-based start (s_idx + 1) and the chunk's end index.
chunk_path = 'sepsis_tabdata_raw_%d_%d.pkl' % (s_idx + 1, e_idx)
with open(chunk_path, 'wb') as chunk_file:
    pickle.dump(tabularized_data, chunk_file)

In [None]:
# Merge the two per-chunk pickles into a single dict and store it as one file.
tabularized_data = {}
# Same (start, end) pairs that were used when each chunk was written.
for s_idx, e_idx in ((0, 4999), (4999, len(hadmids))):
    with open('sepsis_tabdata_raw_%d_%d.pkl' % (s_idx + 1, e_idx), 'rb') as chunk_file:
        tabularized_data.update(pickle.load(chunk_file))

with open('sepsis_tabdata_raw.pkl', 'wb') as merged_file:
    pickle.dump(tabularized_data, merged_file)

In [None]:
# with multiprocessing
# Fixes relative to the earlier version of this cell:
#   * results are collected while the pool is still alive (leaving the `with` block
#     terminates the pool, so a lazy imap iterator read afterwards cannot be served);
#   * `id_list` was not defined anywhere in this notebook; the first 200 entries of
#     `hadmids` are used instead (presumably the intent - confirm);
#   * the admission id is passed as the `hadmid` keyword, like every other call to
#     tabularize in this notebook, instead of positionally;
#   * per-admission tables are stacked row-wise (axis=0); axis=1 would place them
#     side by side with repeated column names.
mp_id_list = hadmids[0:200]
shared_kwargs = dict(total_dict=concat_data, step_length=step_length, item_dict=item_name_dict)

with mp.Pool(10) as pool:
    pending = [
        pool.apply_async(tabularize, kwds=dict(shared_kwargs, hadmid=adm))
        for adm in mp_id_list
    ]
    pool.close()  # no further tasks will be submitted
    # .get() re-raises any exception raised inside a worker.
    pooled_result = [job.get() for job in tqdm(pending)]
    pool.join()

pd.concat(objs=pooled_result, axis=0).to_csv('sepsis_tabdata_raw.csv', index=False)

## Tabularization version 2 - with cvp

### pilot tabularization

In [2]:
# Pilot run (version 2, with cvp): tabularize one admission's data and preview the first rows.
with open('sepsis_condata_new_0.pkl', 'rb') as pilot_file:
    concat_data = pickle.load(pilot_file)

# Width of one tabularized time step.
step_length = datetime.timedelta(hours=1)

# Item names grouped by kind.
demo_list = ['height', 'weight']
vital_list = [
    'heart_rate', 'resp_rate', 'temperature', 'sbp', 'dbp', 'map',
    'cvp',  # cvp added
    'paco2', 'pao2', 'fio2',
]
lab_list = [
    'gcs', 'bilirubin', 'platelets', 'creatinine', 'lactate', 'bun',
    'arterial_ph', 'wbc', 'hemoglobin', 'hematocrit', 'potassium', 'sodium',
]
vaso_list = ['epinephrine', 'dopamine', 'dobutamine', 'norepinephrine', 'phenylephrine', 'vasopressin']
fluid_list = ['fluid']
urine_list = ['urine_output']
vent_list = ['ventilator']
SOFA_list = ['CNS_SOFA', 'CARDIO_SOFA', 'RESP_SOFA', 'COAG_SOFA', 'LIVER_SOFA', 'RENAL_SOFA']
label_list = ['SEPSIS', 'SHOCK']

# One entry per item group, in the order tabularize receives them after 'ct' and 'stet'.
feature_groups = {
    'demo': demo_list,
    'vital': vital_list,
    'lab': lab_list,
    'urine': urine_list,
    'vaso': vaso_list,
    'fluid': fluid_list,
    'vent': vent_list,
}
# 'ct' and 'stet' are combined groups; how tabularize treats them is defined in
# sepsis_preprocessing (not visible here).
item_name_dict = {
    'ct': demo_list + vital_list + lab_list + urine_list,
    'stet': fluid_list + vaso_list + vent_list,
    **feature_groups,
}

# Wrap the single admission in a one-entry dict keyed by its own 'hospadm_id',
# which is the shape tabularize is called with for the full data set.
pilot_hadmid = concat_data['hospadm_id']
tmp_dict = {pilot_hadmid: concat_data}
tabularize(
    hadmid=pilot_hadmid,
    total_dict=tmp_dict,
    step_length=step_length,
    item_dict=item_name_dict,
).head()

Unnamed: 0,hadm_id,seq_num,seq_ST,seq_ET,age,gender,race,height,height_value,height_median,height_max,height_min,height_presence,weight,weight_value,weight_median,weight_max,weight_min,weight_presence,heart_rate,heart_rate_value,heart_rate_median,heart_rate_max,heart_rate_min,heart_rate_presence,resp_rate,resp_rate_value,resp_rate_median,resp_rate_max,resp_rate_min,resp_rate_presence,temperature,temperature_value,temperature_median,temperature_max,temperature_min,temperature_presence,sbp,sbp_value,sbp_median,sbp_max,sbp_min,sbp_presence,dbp,dbp_value,dbp_median,dbp_max,dbp_min,dbp_presence,map,map_value,map_median,map_max,map_min,map_presence,cvp,cvp_value,cvp_median,cvp_max,cvp_min,cvp_presence,paco2,paco2_value,paco2_median,paco2_max,paco2_min,paco2_presence,pao2,pao2_value,pao2_median,pao2_max,pao2_min,pao2_presence,fio2,fio2_value,fio2_median,fio2_max,fio2_min,fio2_presence,gcs,gcs_value,gcs_median,gcs_max,gcs_min,gcs_presence,bilirubin,bilirubin_value,bilirubin_median,bilirubin_max,bilirubin_min,bilirubin_presence,platelets,platelets_value,platelets_median,platelets_max,platelets_min,platelets_presence,creatinine,creatinine_value,creatinine_median,creatinine_max,creatinine_min,creatinine_presence,lactate,lactate_value,lactate_median,lactate_max,lactate_min,lactate_presence,bun,bun_value,bun_median,bun_max,bun_min,bun_presence,arterial_ph,arterial_ph_value,arterial_ph_median,arterial_ph_max,arterial_ph_min,arterial_ph_presence,wbc,wbc_value,wbc_median,wbc_max,wbc_min,wbc_presence,hemoglobin,hemoglobin_value,hemoglobin_median,hemoglobin_max,hemoglobin_min,hemoglobin_presence,hematocrit,hematocrit_value,hematocrit_median,hematocrit_max,hematocrit_min,hematocrit_presence,potassium,potassium_value,potassium_median,potassium_max,potassium_min,potassium_presence,sodium,sodium_value,sodium_median,sodium_max,sodium_min,sodium_presence,epinephrine_value,epinephrine_rate,dopamine_value,dopamine_rate,dobutamine_value,dobutamine_rate,norepinephrine_value,norepinephrine_rate
,phenylephrine_value,phenylephrine_rate,vasopressin_value,vasopressin_rate,fluid_value,urine_output,urine_output_value,ventilator_value,bmi,CNS_SOFA,CARDIO_SOFA,vaso_presence,RESP_SOFA,COAG_SOFA,LIVER_SOFA,RENAL_SOFA,SOFA,SEPSIS,SHOCK
0,26184834,0,2131-01-07 23:59:59,2131-01-08 00:59:59,77.024658,F,BLACK/AFRICAN AMERICAN,[157.48],157.48,157.48,157.48,157.48,1,[68.68934240362812],68.689342,68.689342,68.689342,68.689342,1,[38.0],38.0,38.0,38.0,38.0,0,"[20.0, 16.0]",18.0,18.0,20.0,16.0,0,"[36.1, 35.4]",35.75,35.75,36.1,35.4,0,"[180.0, 180.0]",180.0,180.0,180.0,180.0,0,"[12.0, 12.0]",12.0,12.0,12.0,12.0,0,"[68.0, 68.0]",68.0,68.0,68.0,68.0,0,0.0,0.0,0.0,0.0,0.0,0,[60.0],60.0,60.0,60.0,60.0,0,[65.0],65.0,65.0,65.0,65.0,0,[100.0],100.0,100.0,100.0,100.0,1,[3.0],3.0,3.0,3.0,3.0,0,[0.2],0.2,0.2,0.2,0.2,0,[195.0],195.0,195.0,195.0,195.0,0,[1.1],1.1,1.1,1.1,1.1,0,[4.0],4.0,4.0,4.0,4.0,0,[16.0],16.0,16.0,16.0,16.0,0,[7.42],7.42,7.42,7.42,7.42,0,[6.6],6.6,6.6,6.6,6.6,0,[11.4],11.4,11.4,11.4,11.4,0,[35.6],35.6,35.6,35.6,35.6,0,[3.6],3.6,3.6,3.6,3.6,0,[135.0],135.0,135.0,135.0,135.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,27.697371,4,1,0,0,0,0,0,5,1,0
1,26184834,1,2131-01-08 00:59:59,2131-01-08 01:59:59,77.024772,F,BLACK/AFRICAN AMERICAN,"[157.48, 157.0, 157.0, 157.0, 157.0, 157.0, 15...",157.474386,157.473684,157.48,157.473684,0,"[65.00803775382872, 65.00803775382872, 65.0, 6...",68.640822,68.640798,68.640904,68.640798,0,[38.0],38.0,38.0,38.0,38.0,0,"[20.0, 16.0]",18.0,18.0,20.0,16.0,0,"[36.1, 35.4]",35.75,35.75,36.1,35.4,0,"[180.0, 180.0]",180.0,180.0,180.0,180.0,0,"[12.0, 12.0]",12.0,12.0,12.0,12.0,0,"[68.0, 68.0]",68.0,68.0,68.0,68.0,0,0.0,0.0,0.0,0.0,0.0,0,[60.0],60.0,60.0,60.0,60.0,0,[65.0],65.0,65.0,65.0,65.0,0,[100.0],100.0,100.0,100.0,100.0,0,[3.0],3.0,3.0,3.0,3.0,0,[0.2],0.2,0.2,0.2,0.2,0,[195.0],195.0,195.0,195.0,195.0,0,[1.1],1.1,1.1,1.1,1.1,0,[4.0],4.0,4.0,4.0,4.0,0,[16.0],16.0,16.0,16.0,16.0,0,[7.42],7.42,7.42,7.42,7.42,0,[6.6],6.6,6.6,6.6,6.6,0,[11.4],11.4,11.4,11.4,11.4,0,[35.6],35.6,35.6,35.6,35.6,0,[3.6],3.6,3.6,3.6,3.6,0,[135.0],135.0,135.0,135.0,135.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,27.67978,4,1,0,0,0,0,0,5,1,0
2,26184834,2,2131-01-08 01:59:59,2131-01-08 02:59:59,77.024886,F,BLACK/AFRICAN AMERICAN,"[157.48, 157.0, 157.0, 157.0, 157.0, 157.0, 15...",157.468772,157.467368,157.48,157.467368,0,"[65.00803775382872, 65.00803775382872, 65.0, 6...",68.592301,68.592254,68.592466,68.592254,0,[38.0],38.0,38.0,38.0,38.0,0,"[20.0, 16.0]",18.0,18.0,20.0,16.0,0,"[36.1, 35.4]",35.75,35.75,36.1,35.4,0,"[180.0, 180.0]",180.0,180.0,180.0,180.0,0,"[12.0, 12.0]",12.0,12.0,12.0,12.0,0,"[68.0, 68.0]",68.0,68.0,68.0,68.0,0,0.0,0.0,0.0,0.0,0.0,0,[60.0],60.0,60.0,60.0,60.0,0,[65.0],65.0,65.0,65.0,65.0,0,[100.0],100.0,100.0,100.0,100.0,0,[3.0],3.0,3.0,3.0,3.0,0,[0.2],0.2,0.2,0.2,0.2,0,[195.0],195.0,195.0,195.0,195.0,0,[1.1],1.1,1.1,1.1,1.1,0,[4.0],4.0,4.0,4.0,4.0,0,[16.0],16.0,16.0,16.0,16.0,0,[7.42],7.42,7.42,7.42,7.42,0,[6.6],6.6,6.6,6.6,6.6,0,[11.4],11.4,11.4,11.4,11.4,0,[35.6],35.6,35.6,35.6,35.6,0,[3.6],3.6,3.6,3.6,3.6,0,[135.0],135.0,135.0,135.0,135.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,27.662186,4,1,0,0,0,0,0,5,1,0
3,26184834,3,2131-01-08 02:59:59,2131-01-08 03:59:59,77.025,F,BLACK/AFRICAN AMERICAN,"[157.48, 157.0, 157.0, 157.0, 157.0, 157.0, 15...",157.463158,157.461053,157.48,157.461053,0,"[65.00803775382872, 65.00803775382872, 65.0, 6...",68.543781,68.54371,68.544028,68.54371,0,[38.0],38.0,38.0,38.0,38.0,0,"[20.0, 16.0]",18.0,18.0,20.0,16.0,0,"[36.1, 35.4]",35.75,35.75,36.1,35.4,0,"[180.0, 180.0]",180.0,180.0,180.0,180.0,0,"[12.0, 12.0]",12.0,12.0,12.0,12.0,0,"[68.0, 68.0]",68.0,68.0,68.0,68.0,0,0.0,0.0,0.0,0.0,0.0,0,[60.0],60.0,60.0,60.0,60.0,0,[65.0],65.0,65.0,65.0,65.0,0,[100.0],100.0,100.0,100.0,100.0,0,[3.0],3.0,3.0,3.0,3.0,0,[0.2],0.2,0.2,0.2,0.2,0,[195.0],195.0,195.0,195.0,195.0,0,[1.1],1.1,1.1,1.1,1.1,0,[4.0],4.0,4.0,4.0,4.0,0,[16.0],16.0,16.0,16.0,16.0,0,[7.42],7.42,7.42,7.42,7.42,0,[6.6],6.6,6.6,6.6,6.6,0,[11.4],11.4,11.4,11.4,11.4,0,[35.6],35.6,35.6,35.6,35.6,0,[3.6],3.6,3.6,3.6,3.6,0,[135.0],135.0,135.0,135.0,135.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,27.644589,4,1,0,0,0,0,0,5,1,0
4,26184834,4,2131-01-08 03:59:59,2131-01-08 04:59:59,77.025114,F,BLACK/AFRICAN AMERICAN,"[157.48, 157.0, 157.0, 157.0, 157.0, 157.0, 15...",157.457544,157.454737,157.48,157.454737,0,"[65.00803775382872, 65.00803775382872, 65.0, 6...",68.49526,68.495166,68.49559,68.495166,0,[38.0],38.0,38.0,38.0,38.0,0,"[20.0, 16.0]",18.0,18.0,20.0,16.0,0,"[36.1, 35.4]",35.75,35.75,36.1,35.4,0,"[180.0, 180.0]",180.0,180.0,180.0,180.0,0,"[12.0, 12.0]",12.0,12.0,12.0,12.0,0,"[68.0, 68.0]",68.0,68.0,68.0,68.0,0,0.0,0.0,0.0,0.0,0.0,0,[60.0],60.0,60.0,60.0,60.0,0,[65.0],65.0,65.0,65.0,65.0,0,[100.0],100.0,100.0,100.0,100.0,0,[3.0],3.0,3.0,3.0,3.0,0,[0.2],0.2,0.2,0.2,0.2,0,[195.0],195.0,195.0,195.0,195.0,0,[1.1],1.1,1.1,1.1,1.1,0,[4.0],4.0,4.0,4.0,4.0,0,[16.0],16.0,16.0,16.0,16.0,0,[7.42],7.42,7.42,7.42,7.42,0,[6.6],6.6,6.6,6.6,6.6,0,[11.4],11.4,11.4,11.4,11.4,0,[35.6],35.6,35.6,35.6,35.6,0,[3.6],3.6,3.6,3.6,3.6,0,[135.0],135.0,135.0,135.0,135.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,27.62699,4,1,0,0,0,0,0,5,1,0


### full tabularization without multi-processing

In [3]:
# Load the ICU-stay table (file sits in the working directory on the 2nd computer),
# keep only rows flagged cohort_stays == 1, and collect the distinct admission ids.
icustays = (
    pd.read_csv('icustays_sepshock.csv')  # at 2nd computer
    .loc[lambda stays: stays['cohort_stays'] == 1]
    .reset_index(drop=True)
)
hadmids = icustays['hadm_id'].unique()

In [4]:
# Load the concatenated data that is passed to tabularize as `total_dict`.
# pickle.load executes arbitrary code from the file; only load pickles produced by
# this project's own preprocessing.
with open('sepsis_condata_new.pkl', 'rb') as f:
    concat_data = pickle.load(f)

# Disabled snippet kept for reference: list the keys of the first admission's entry,
# skipping the first key. It used to live in a bare triple-quoted string, which was
# evaluated and echoed as this cell's output; plain comments avoid that.
# item_list = list(concat_data[hadmids[0]].keys())
# item_list = item_list[1:len(item_list)]

In [5]:
# without multiprocessing
# Tabularize every cohort admission one by one; results accumulate in `tmp`
# in the same order as `tmp_id_list`.
tmp = []
tmp_id_list = hadmids
# Bind the arguments that are identical for every admission once.
tabularize_adm = partial(
    tabularize,
    total_dict=concat_data,
    step_length=step_length,
    item_dict=item_name_dict,
)
for adm in tqdm(tmp_id_list):
    tmp.append(tabularize_adm(hadmid=adm))  # 41hr 43min for the recorded run of 9998 admissions

100%|██████████| 9998/9998 [41:43:11<00:00, 15.02s/it]     


In [6]:
# Key each admission's table by the hadm_id found at index label 0 of its 'hadm_id' column.
tabularized_data = {}
for adm_table in tqdm(tmp):
    tabularized_data[adm_table.hadm_id[0]] = adm_table

# this is version 2
with open('sepsis_tabdata_raw_v2.pkl', 'wb') as v2_file:
    pickle.dump(tabularized_data, v2_file)

100%|██████████| 9998/9998 [00:00<00:00, 26676.16it/s]


## After tabularization

In [None]:
# Load the tabularized tables (dict keyed by hadm_id, as written by the merge cell above).
# NOTE(review): the version-2 run saves to 'sepsis_tabdata_raw_v2.pkl'; confirm which
# file this analysis is meant to read.
with open('sepsis_tabdata_raw.pkl', 'rb') as tab_file:
    tab_data = pickle.load(tab_file)

In [None]:
# Reload the ICU-stay table restricted to cohort stays, collect the distinct stay ids,
# and parse the ICU in/out times so they can be compared with the tabularized windows.
icustays = (
    pd.read_csv('processed_data/sepsis/icustays_sepshock.csv')
    .loc[lambda stays: stays['cohort_stays'] == 1]
    .reset_index(drop=True)
)
stayids = icustays['stay_id'].unique()
for time_col in ('intime', 'outtime'):
    icustays[time_col] = pd.to_datetime(icustays[time_col])

cohort 길이 관련 이슈 보고
1. 그냥 it, ot 기준: 너무 짧을 수도?
2. icu module 모두 다쓰기: tabularize 다시 해야함. 부분적으로. / 그리고 해당 시기는 엄연히 icu가 아니라 hosp 모듈 시간으로 봐야하기 때문에 그렇다면 hosp 모듈의 정보도 활용해야 하는데 그렇지는 못하다는 모순이 존재 / 대신 더 길테니까 좀 더 여러모로 분석이 용이

=> 그냥 진행하되 다른 컴퓨터로 추가 전처리 해보는 걸로?

In [None]:
# Summarise sepsis / shock onset per ICU stay from the hourly tabularized data.
#
# `sepshock` gets one row per stay_id with:
#   sepsis, shock   1 if any tabularized row inside the stay has SEPSIS / SHOCK == 1, else 0
#   0tosep, 0tosho  hours from the first in-stay row's seq_ST to the first flagged row
#                   (NaN when the stay is never flagged)
#   septhensho      1 if shock onset is strictly later than sepsis onset, else 0
#   shothensep      1 if shock onset is at or before sepsis onset, else 0
#   td_sepsho       absolute gap in hours between the two onsets
# The last three are only filled when the stay has both sepsis and shock (NaN otherwise).
#
# Rows are collected in a list and the frame is built once, instead of writing into
# the frame through boolean masks on every iteration. This also removes the
# int(<one-element Series>) calls (deprecated in recent pandas) and the `[i]` label
# lookup that only worked while the frame had a default RangeIndex.
# Requires icustays.intime / icustays.outtime to be datetimes already (previous cell).
sepshock_cols = ['stay_id', 'sepsis', 'shock', '0tosep', '0tosho', 'septhensho', 'shothensep', 'td_sepsho']
one_hour = datetime.timedelta(hours=1)

# First icustays row per stay_id, indexed for direct lookup (the earlier code also
# used the first matching row for each stay).
stay_info = icustays.drop_duplicates(subset='stay_id').set_index('stay_id')

stay_rows = []
stays_without_rows = []
for tmp_stay in tqdm(stayids):
    tmp_adm = stay_info.at[tmp_stay, 'hadm_id']
    it = stay_info.at[tmp_stay, 'intime']
    ot = stay_info.at[tmp_stay, 'outtime']

    # Keep the time steps of this admission that overlap [intime, outtime].
    tmp_adm_data = tab_data[tmp_adm]
    in_stay = (tmp_adm_data.seq_ET >= it) & (tmp_adm_data.seq_ST <= ot)
    tmp_stay_df = tmp_adm_data.loc[in_stay].reset_index(drop=True)

    # Columns left out of `row` become NaN when the frame is built.
    row = {'stay_id': tmp_stay}

    if tmp_stay_df.empty:
        # No tabularized step overlaps the stay (this used to fail with a KeyError on
        # seq_ST[0]). Record the stay as unflagged and report it after the loop.
        row['sepsis'] = 0
        row['shock'] = 0
        stay_rows.append(row)
        stays_without_rows.append(tmp_stay)
        continue

    # Time zero of the stay: start of its first overlapping time step.
    tmp_0t = pd.to_datetime(tmp_stay_df.seq_ST[0])

    # 1. sepsis / 0tosep    2. shock / 0tosho
    for label_col, flag_col, hours_col in (('SEPSIS', 'sepsis', '0tosep'),
                                           ('SHOCK', 'shock', '0tosho')):
        flagged = (tmp_stay_df[label_col] == 1).to_numpy()
        if flagged.any():
            first_pos = flagged.argmax()  # position of the first flagged time step
            onset = pd.to_datetime(tmp_stay_df['seq_ST'].iloc[first_pos])
            row[flag_col] = 1
            row[hours_col] = (onset - tmp_0t) / one_hour
        else:
            row[flag_col] = 0

    # 3. septhensho / shothensep / td_sepsho - only when both conditions occur
    if row['sepsis'] == 1 and row['shock'] == 1:
        row['septhensho'] = int(row['0tosho'] > row['0tosep'])
        row['shothensep'] = int(row['0tosho'] <= row['0tosep'])
        row['td_sepsho'] = abs(row['0tosho'] - row['0tosep'])

    stay_rows.append(row)

sepshock = pd.DataFrame(stay_rows, columns=sepshock_cols)

if stays_without_rows:
    print(f'{len(stays_without_rows)} stays had no tabularized rows inside [intime, outtime]; '
          f'first few: {stays_without_rows[:10]}')

In [None]:
# Cohort-level summary of the per-stay onset table.
def _ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')

n_stays = sepshock.shape[0]
n_sepsis = sum(sepshock.sepsis)
n_shock = sum(sepshock.shock)
# Stays that have both sepsis and shock: the only ones for which an ordering exists.
n_both = sum((sepshock.sepsis == 1) & (sepshock.shock == 1))
n_septhensho = np.nansum(sepshock.septhensho)
n_shothensep = np.nansum(sepshock.shothensep)

print(f'sepsis num, ratio: {n_sepsis}, {_ratio(n_sepsis, n_stays)}')
print(f'shock num, ratio: {n_shock}, {_ratio(n_shock, n_stays)}')
# These two ratios are relative to stays with both sepsis and shock. They were
# previously labelled "(/total)", the same label as the all-stays ratios printed below.
print(f'septhensho num, ratio(/sepsis&shock): {n_septhensho}, {_ratio(n_septhensho, n_both)}')
print(f'shothensep num, ratio(/sepsis&shock): {n_shothensep}, {_ratio(n_shothensep, n_both)}')
print(f'septhensho ratio(/total): {_ratio(n_septhensho, n_stays)}')
print(f'shothensep ratio(/total): {_ratio(n_shothensep, n_stays)}')
print(f"\n==========0 to sep========== \n{sepshock['0tosep'].astype('float').describe()}\n")
print(f"\n==========0 to shock========== \n{sepshock['0tosho'].astype('float').describe()}\n")
print(f"\n==========td sepsho========== \n{sepshock['td_sepsho'].astype('float').describe()}\n")

In [None]:
# Write the per-stay flags back onto icustays as versioned columns, then mark the
# cohort: stays with sepsis whose shock did not start at or before the sepsis onset.
ver_name = 'tab_1hr_v1'

sepsis_stays = sepshock.loc[sepshock['sepsis'] == 1, 'stay_id'].tolist()
shock_stays = sepshock.loc[sepshock['shock'] == 1, 'stay_id'].tolist()
septhensho_stays = sepshock.loc[sepshock['septhensho'] == 1, 'stay_id'].tolist()
shothensep_stays = sepshock.loc[sepshock['shothensep'] == 1, 'stay_id'].tolist()

# Each flag column holds 1 for the listed stays and stays NaN for every other row.
flag_sources = (
    ('sepsis_%s', sepsis_stays),
    ('shock_%s', shock_stays),
    ('septhensho_%s', septhensho_stays),
    ('shothensep_%s', shothensep_stays),
)
for col_template, flagged_stays in flag_sources:
    icustays.loc[icustays['stay_id'].isin(flagged_stays), col_template % ver_name] = 1

has_sepsis = icustays['sepsis_%s' % ver_name] == 1
not_shock_first = icustays['shothensep_%s' % ver_name] != 1  # NaN (unflagged) counts as "not shock first"
icustays.loc[has_sepsis & not_shock_first, 'cohort_stays_' + ver_name] = 1

In [None]:
# Persist the flagged stay table; v1 means itot ver (window bounded by ICU intime/outtime).
flagged_csv_path = 'processed_data/sepsis/icustays_%s.csv' % ver_name
icustays.to_csv(flagged_csv_path, index=False)

In [None]:
# Number of icustays rows matching the cohort condition (sepsis flagged, shock-first not flagged).
sepsis_flagged = icustays['sepsis_%s' % ver_name] == 1
shock_first_unflagged = icustays['shothensep_%s' % ver_name] != 1
sum(sepsis_flagged & shock_first_unflagged)

In [None]:
# Number of distinct stay ids marked as cohort stays for this version.
in_cohort = icustays['cohort_stays_' + ver_name] == 1
cohort_stay_ids = icustays.loc[in_cohort, 'stay_id'].unique()
len(cohort_stay_ids)

# EON