In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys, os, pickle

from tqdm.notebook import tqdm
from datetime import timedelta
#from utils import baseline_SCr

if os.getcwd()[-4:] == "code":
    os.chdir('../')

icu = './data/mimic-iv-2.2-parquet/icu/'
hosp = './data/mimic-iv-2.2-parquet/hosp/'

In [2]:
labevents     = pd.read_parquet(hosp+'labevents.parquet')
d_labitems    = pd.read_parquet(hosp+'d_labitems.parquet')

patients      = pd.read_parquet(hosp+'patients.parquet')
admissions    = pd.read_parquet(hosp+'admissions.parquet')

diagnoses_icd = pd.read_parquet(hosp+'diagnoses_icd.parquet')
microbiology  = pd.read_parquet(hosp+'microbiologyevents.parquet')
prescriptions = pd.read_parquet(hosp+'prescriptions.parquet')

In [3]:
chartevents     = pd.read_parquet(icu+'chartevents.parquet')
d_items         = pd.read_parquet(icu+'d_items.parquet')
inputevents     = pd.read_parquet(icu+'inputevents.parquet')
outputevents    = pd.read_parquet(icu+'outputevents.parquet')
procedureevents = pd.read_parquet(icu+'procedureevents.parquet')
icustays        = pd.read_parquet(icu+'icustays.parquet')

In [38]:
outputevents    = pd.read_parquet(icu+'outputevents.parquet')

In [4]:
# to_datetime
chartevents['charttime'] = pd.to_datetime(chartevents['charttime'])

In [17]:
labvalues = pd.read_csv('./data/labvalues/labvalues.csv')
vitals = pd.read_csv('./data/vitals/vitals.csv')

# Demographic

### Gender

In [114]:
patients_gender = patients[['subject_id','gender']]
patients_gender.loc[patients_gender.gender == 'F' , 'gender'] = 1
patients_gender.loc[patients_gender.gender == 'M' , 'gender'] = 0

In [115]:
patients_gender

Unnamed: 0,subject_id,gender
0,10000032,1
1,10000048,1
2,10000068,1
3,10000084,0
4,10000102,1
...,...,...
299707,19999828,1
299708,19999829,1
299709,19999840,0
299710,19999914,1


### Age

In [116]:
icustays['intime'] = pd.to_datetime(icustays['intime'])
icustays['outtime'] = pd.to_datetime(icustays['outtime'])

In [117]:
icustays_intime = icustays[['subject_id','hadm_id','stay_id','intime']]
patients_age = patients[['subject_id','anchor_age','anchor_year']]

icustays_age = pd.merge(icustays_intime, patients_age, on = 'subject_id', how = 'left')

In [118]:
icustays_age['anchor_age_delta'] = pd.to_timedelta(icustays_age['anchor_age']*365.25, unit='D')
icustays_age['anchor_year'] = pd.to_datetime(icustays_age['anchor_year'],format="%Y")

icustays_age['delta'] = icustays_age['intime'] - icustays_age['anchor_year']
icustays_age['age'] = ((icustays_age['anchor_age_delta'] + icustays_age['delta'])/365.25).dt.days

icustays_age = icustays_age[['subject_id','hadm_id','stay_id','age']]

In [119]:
icustays_age

Unnamed: 0,subject_id,hadm_id,stay_id,age
0,10000032,29079034,39553978,52
1,10000980,26913865,39765666,76
2,10001217,24597018,37067082,55
3,10001217,27703517,34592300,55
4,10001725,25563031,31205490,46
...,...,...,...,...
73176,19999442,26785317,32336619,43
73177,19999625,25304202,31070865,82
73178,19999828,25744818,36075953,48
73179,19999840,21033226,38978960,58


### Race

In [57]:
admissions_race = admissions[['subject_id','race']].copy()

In [58]:
admissions_race['race'].replace(['ASIAN - ASIAN INDIAN', 'ASIAN - CHINESE','ASIAN - KOREAN', 'ASIAN - SOUTH EAST ASIAN'],'ASIAN',inplace=True)
admissions_race['race'].replace(['BLACK/AFRICAN AMERICAN','BLACK/AFRICAN','BLACK/CAPE VERDEAN','BLACK/CARIBBEAN ISLAND'],'BLACK',inplace=True)
admissions_race['race'].replace(['HISPANIC/LATINO - CENTRAL AMERICAN','HISPANIC/LATINO - COLUMBIAN','HISPANIC/LATINO - CUBAN','HISPANIC/LATINO - DOMINICAN','HISPANIC/LATINO - GUATEMALAN','HISPANIC/LATINO - HONDURAN','HISPANIC/LATINO - MEXICAN',
                             'HISPANIC/LATINO - PUERTO RICAN','HISPANIC/LATINO - SALVADORAN','PORTUGUESE','SOUTH AMERICAN'],'HISPANIC OR LATINO',inplace=True)
admissions_race['race'].replace(['NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER'],'PACIFIC ISLANDER',inplace=True)
admissions_race['race'].replace(['UNABLE TO OBTAIN','PATIENT DECLINED TO ANSWER'],'UNKNOWN',inplace=True)
admissions_race['race'].replace(['WHITE - BRAZILIAN','WHITE - EASTERN EUROPEAN','WHITE - OTHER EUROPEAN','WHITE - RUSSIAN'],'WHITE',inplace=True)

admissions_race = admissions_race.drop_duplicates()

In [61]:
multiple = admissions_race.subject_id.value_counts().loc[lambda x : x > 1].to_frame()
multiple.reset_index(inplace = True)
multiple = multiple.subject_id.unique()

hosp_race_multiple = admissions_race[admissions_race['subject_id'].isin(multiple)]

sol = []
multi = []
for i in hosp_race_multiple.subject_id.unique() :
    tmp = hosp_race_multiple[hosp_race_multiple['subject_id'] == i]
    if (tmp['race'] == 'UNKNOWN').any() :
        tmp = tmp[tmp['race'] != 'UNKNOWN']
    if len(tmp) <2 :
        sol.append(tmp)
    else : 
        multi.append(tmp)
sol = pd.concat(sol)
multi = pd.concat(multi)

multi['race'] = 'MULTIPLE RACE/ETHNICITY'
multi.drop_duplicates(inplace=True)

admissions_race = admissions_race[~admissions_race['subject_id'].isin(sol.subject_id.unique())]
admissions_race = admissions_race[~admissions_race['subject_id'].isin(multi.subject_id.unique())]

admissions_race = pd.concat([admissions_race,sol,multi])

In [62]:
admissions_race.race.unique()

array(['WHITE', 'OTHER', 'BLACK', 'UNKNOWN', 'HISPANIC OR LATINO',
       'ASIAN', 'PACIFIC ISLANDER', 'AMERICAN INDIAN/ALASKA NATIVE',
       'MULTIPLE RACE/ETHNICITY'], dtype=object)

## Height

In [125]:
# Height
chartevents[chartevents['itemid'].isin([226707])]

Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,charttime,storetime,itemid,value,valuenum,valueuom,warning
157,10000032,29079034,39553978,88981.0,2180-07-23 12:36:00,2180-07-23 14:45:00,226707,60,60.0,Inch,0.0
3718,10001725,25563031,31205490,22280.0,2110-04-11 15:52:00,2110-04-11 16:01:00,226707,62,62.0,Inch,0.0
4644,10001884,26184834,37510196,3512.0,2131-01-11 04:20:00,2131-01-13 14:43:00,226707,62,62.0,Inch,0.0
20745,10002013,23581541,39060235,96730.0,2160-05-18 10:00:00,2160-05-18 12:05:00,226707,62,62.0,Inch,0.0
40604,10002428,20321825,34807493,62692.0,2156-04-30 20:36:00,2156-05-02 16:37:00,226707,59,59.0,Inch,0.0
...,...,...,...,...,...,...,...,...,...,...,...
313613045,19999287,20175828,35165301,34445.0,2197-08-03 20:58:00,2197-08-06 15:12:00,226707,65,65.0,Inch,0.0
313616169,19999287,22997012,37692584,11933.0,2197-07-26 03:31:00,2197-07-27 14:42:00,226707,64,64.0,Inch,0.0
313621571,19999297,21439025,37364566,30382.0,2162-08-16 05:48:00,2162-08-16 11:48:00,226707,65,65.0,Inch,0.0
313626294,19999442,26785317,32336619,29154.0,2148-11-19 14:23:00,2148-11-20 16:34:00,226707,76,76.0,Inch,0.0


In [126]:
# Height(cm)
chartevents[chartevents['itemid'].isin([226730])]

Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,charttime,storetime,itemid,value,valuenum,valueuom,warning
158,10000032,29079034,39553978,88981.0,2180-07-23 12:36:00,2180-07-23 14:45:00,226730,152,152.0,cm,0.0
3719,10001725,25563031,31205490,22280.0,2110-04-11 15:52:00,2110-04-11 16:01:00,226730,157,157.0,cm,0.0
4645,10001884,26184834,37510196,3512.0,2131-01-11 04:20:00,2131-01-13 14:43:00,226730,157,157.0,cm,0.0
20746,10002013,23581541,39060235,96730.0,2160-05-18 10:00:00,2160-05-18 12:05:00,226730,157,157.0,cm,0.0
40605,10002428,20321825,34807493,62692.0,2156-04-30 20:36:00,2156-05-02 16:37:00,226730,150,150.0,cm,0.0
...,...,...,...,...,...,...,...,...,...,...,...
313613046,19999287,20175828,35165301,34445.0,2197-08-03 20:58:00,2197-08-06 15:12:00,226730,165,165.0,cm,0.0
313616170,19999287,22997012,37692584,11933.0,2197-07-26 03:31:00,2197-07-27 14:42:00,226730,163,163.0,cm,0.0
313621572,19999297,21439025,37364566,30382.0,2162-08-16 05:48:00,2162-08-16 11:48:00,226730,165,165.0,cm,0.0
313626295,19999442,26785317,32336619,29154.0,2148-11-19 14:23:00,2148-11-20 16:34:00,226730,193,193.0,cm,0.0


## Weight

In [124]:
#Admission Weight(Kg)
chartevents[chartevents['itemid'].isin([226512])]

Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,charttime,storetime,itemid,value,valuenum,valueuom,warning
156,10000032,29079034,39553978,88981.0,2180-07-23 12:36:00,2180-07-23 14:45:00,226512,39.4,39.4,kg,0.0
488,10000980,26913865,39765666,26402.0,2189-06-27 07:40:00,2189-06-27 09:02:00,226512,76.2,76.2,kg,0.0
1663,10001217,24597018,37067082,84976.0,2157-11-20 19:17:00,2157-11-20 19:32:00,226512,71.2,71.2,kg,0.0
2881,10001217,27703517,34592300,51676.0,2157-12-19 15:42:00,2157-12-19 17:26:00,226512,74.8,74.8,kg,0.0
3717,10001725,25563031,31205490,22280.0,2110-04-11 15:52:00,2110-04-11 16:01:00,226512,72.2,72.2,kg,0.0
...,...,...,...,...,...,...,...,...,...,...,...
313626293,19999442,26785317,32336619,29154.0,2148-11-19 14:23:00,2148-11-20 16:34:00,226512,107.5,107.5,kg,0.0
313633288,19999625,25304202,31070865,84925.0,2139-10-10 18:06:00,2139-10-10 19:58:00,226512,50.5,50.5,kg,0.0
313633978,19999828,25744818,36075953,44654.0,2149-01-08 16:45:00,2149-01-08 18:15:00,226512,67.9,67.9,kg,0.0
313641208,19999840,21033226,38978960,82641.0,2164-09-12 09:26:00,2164-09-12 09:47:00,226512,77.5,77.5,kg,0.0


In [123]:
#Admission Weight(lbs.)
chartevents[chartevents['itemid'].isin([226531])]

Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,charttime,storetime,itemid,value,valuenum,valueuom,warning
242,10000032,29079034,39553978,88981.0,2180-07-23 14:22:00,2180-07-23 14:23:00,226531,86.7,86.7,,0.0
264,10000032,29079034,39553978,88981.0,2180-07-23 14:44:00,2180-07-23 14:45:00,226531,86.7,86.7,,0.0
482,10000980,26913865,39765666,11933.0,2189-06-27 18:02:00,2189-06-27 18:02:00,226531,167.6,167.6,,0.0
485,10000980,26913865,39765666,11933.0,2189-06-27 20:13:00,2189-06-27 20:13:00,226531,167.6,167.6,,0.0
525,10000980,26913865,39765666,36518.0,2189-06-27 08:27:00,2189-06-27 08:27:00,226531,0,0.0,,0.0
...,...,...,...,...,...,...,...,...,...,...,...
313632793,19999442,26785317,32336619,96670.0,2148-11-24 07:59:00,2148-11-24 07:59:00,226531,236.5,236.5,,0.0
313633287,19999625,25304202,31070865,44373.0,2139-10-11 10:02:00,2139-10-11 10:02:00,226531,111.1,111.1,,0.0
313643257,19999987,23865745,36195440,49819.0,2145-11-03 09:01:00,2145-11-03 09:01:00,226531,132,132.0,,0.0
313643342,19999987,23865745,36195440,49819.0,2145-11-03 11:59:00,2145-11-03 11:59:00,226531,132,132.0,,0.0


# Lab values

In [72]:
for i,idx in tqdm(enumerate(labvalues.abbreviation)):
    print("[%i/%i] Processing %s..."%(i+1,len(labvalues),labvalues.iloc[i].labvalue))
    globals()['chartevents_{}'.format(idx)] = chartevents[chartevents['itemid'].isin([labvalues.iloc[i].itemid_icu])][['subject_id','hadm_id','stay_id','charttime','itemid','valuenum']]

    if labvalues.iloc[i].lb_cond == 'ge' : 
        globals()['chartevents_{}'.format(idx)] = globals()['chartevents_{}'.format(idx)][globals()['chartevents_{}'.format(idx)]['valuenum'] >= labvalues.iloc[i].lb]
    elif labvalues.iloc[i].lb_cond == 'gt' : 
        globals()['chartevents_{}'.format(idx)] = globals()['chartevents_{}'.format(idx)][globals()['chartevents_{}'.format(idx)]['valuenum'] > labvalues.iloc[i].lb]

    if labvalues.iloc[i].ub_cond == 'le' : 
        globals()['chartevents_{}'.format(idx)] = globals()['chartevents_{}'.format(idx)][globals()['chartevents_{}'.format(idx)]['valuenum'] <= labvalues.iloc[i].ub]
    elif labvalues.iloc[i].ub_cond == 'lt' : 
        globals()['chartevents_{}'.format(idx)] = globals()['chartevents_{}'.format(idx)][globals()['chartevents_{}'.format(idx)]['valuenum'] < labvalues.iloc[i].ub]

    globals()['chartevents_{}'.format(idx)].sort_values(by=['subject_id','charttime'],ascending=True,inplace=True)
    globals()['chartevents_{}'.format(idx)].reset_index(inplace=True, drop=True)

    if not os.path.isfile('./data/labvalues/chartevents_%s.parquet'%idx):
        globals()['chartevents_{}'.format(idx)].to_parquet('./data/labvalues/chartevents_%s.parquet'%idx)

0it [00:00, ?it/s]

[1/28] Processing Albumin...
[2/28] Processing Alkaline Phosphate...
[3/28] Processing Anion Gap...
[4/28] Processing Blood urea nitrogen...
[5/28] Processing Calcium non-ionized...
[6/28] Processing Creatine Kinase...
[7/28] Processing Direct Bilirubin...
[8/28] Processing Glucose...
[9/28] Processing Hematocrit...
[10/28] Processing International Normalized Ratio...
[11/28] Processing PH...
[12/28] Processing Phosphorous...
[13/28] Processing Platelet Count...
[14/28] Processing Serum chloride...
[15/28] Processing Serum creatinine...
[16/28] Processing Serum Sodium...
[17/28] Processing Serum Potassium...
[18/28] Processing Total Bilirubin...
[19/28] Processing White blood cell count...
[20/28] Processing Serum glucose...
[21/28] Processing Magnesium...
[22/28] Processing Ionized calcium...
[23/28] Processing Serum HCO3...
[24/28] Processing AST...
[25/28] Processing ALT...
[26/28] Processing PTT...
[27/28] Processing Arterial Base Excess...
[28/28] Processing Lactic Acid...


In [6]:
for i,idx in tqdm(enumerate(labvalues.abbreviation)):
    try :
        print("[%i/%i] Processing %s..."%(i+1,len(labvalues),labvalues.iloc[i].labvalue))
        itemid = labvalues.iloc[i].itemid_hosp
        if isinstance(itemid, str):
            itemid = itemid.replace(" ","")
            itemid = itemid.split(",")
        if isinstance(itemid, list):
            itemid = [int(x) for x in itemid]
        else : itemid = [int(itemid)]
        globals()['labevents_{}'.format(idx)] = labevents[labevents['itemid'].isin(itemid)][['subject_id','hadm_id','charttime','itemid','valuenum']]

        if labvalues.iloc[i].lb_cond == 'ge' : 
            globals()['labevents_{}'.format(idx)] = globals()['labevents_{}'.format(idx)][globals()['labevents_{}'.format(idx)]['valuenum'] >= labvalues.iloc[i].lb]
        elif labvalues.iloc[i].lb_cond == 'gt' : 
            globals()['labevents_{}'.format(idx)] = globals()['labevents_{}'.format(idx)][globals()['labevents_{}'.format(idx)]['valuenum'] > labvalues.iloc[i].lb]

        if labvalues.iloc[i].ub_cond == 'le' : 
            globals()['labevents_{}'.format(idx)] = globals()['labevents_{}'.format(idx)][globals()['labevents_{}'.format(idx)]['valuenum'] <= labvalues.iloc[i].ub]
        elif labvalues.iloc[i].ub_cond == 'lt' : 
            globals()['labevents_{}'.format(idx)] = globals()['labevents_{}'.format(idx)][globals()['labevents_{}'.format(idx)]['valuenum'] < labvalues.iloc[i].ub]

        globals()['labevents_{}'.format(idx)].sort_values(by=['subject_id','charttime'],ascending=True,inplace=True)
        globals()['labevents_{}'.format(idx)].reset_index(inplace=True, drop=True)

        if not os.path.isfile('./data/labvalues/labevents_%s.parquet'%idx):
            globals()['labevents_{}'.format(idx)].to_parquet('./data/labvalues/labevents_%s.parquet'%idx)
    except: ValueError

0it [00:00, ?it/s]

[1/28] Processing Albumin...
[2/28] Processing Alkaline Phosphate...
[3/28] Processing Anion Gap...
[4/28] Processing Blood urea nitrogen...
[5/28] Processing Calcium non-ionized...
[6/28] Processing Creatine Kinase...
[7/28] Processing Direct Bilirubin...
[8/28] Processing Glucose...
[9/28] Processing Hematocrit...
[10/28] Processing International Normalized Ratio...
[11/28] Processing PH...
[12/28] Processing Phosphorous...
[13/28] Processing Platelet Count...
[14/28] Processing Serum chloride...
[15/28] Processing Serum creatinine...
[16/28] Processing Serum Sodium...
[17/28] Processing Serum Potassium...
[18/28] Processing Total Bilirubin...
[19/28] Processing White blood cell count...
[20/28] Processing Serum glucose...
[21/28] Processing Magnesium...
[22/28] Processing Ionized calcium...
[23/28] Processing Serum HCO3...
[24/28] Processing AST...
[25/28] Processing ALT...
[26/28] Processing PTT...
[27/28] Processing Arterial Base Excess...
[28/28] Processing Lactic Acid...


# Vitals

In [18]:
for i,idx in tqdm(enumerate(vitals.abbreviation)):
    print("[%i/%i] Processing %s..."%(i+1,len(vitals),vitals.iloc[i].labvalue))
    itemid = vitals.iloc[i].itemid_icu
    if isinstance(itemid, str):
        itemid = itemid.replace(" ","")
        itemid = itemid.split(",")
    if isinstance(itemid, list):
        itemid = [int(x) for x in itemid]
    else : itemid = [int(itemid)]
    globals()['chartevents_{}'.format(idx)] = chartevents[chartevents['itemid'].isin(itemid)][['subject_id','hadm_id','stay_id','charttime','itemid','valuenum']]

    if vitals.iloc[i].lb_cond == 'ge' : 
        globals()['chartevents_{}'.format(idx)] = globals()['chartevents_{}'.format(idx)][globals()['chartevents_{}'.format(idx)]['valuenum'] >= vitals.iloc[i].lb]
    elif vitals.iloc[i].lb_cond == 'gt' : 
        globals()['chartevents_{}'.format(idx)] = globals()['chartevents_{}'.format(idx)][globals()['chartevents_{}'.format(idx)]['valuenum'] > vitals.iloc[i].lb]

    if vitals.iloc[i].ub_cond == 'le' : 
        globals()['chartevents_{}'.format(idx)] = globals()['chartevents_{}'.format(idx)][globals()['chartevents_{}'.format(idx)]['valuenum'] <= vitals.iloc[i].ub]
    elif vitals.iloc[i].ub_cond == 'lt' : 
        globals()['chartevents_{}'.format(idx)] = globals()['chartevents_{}'.format(idx)][globals()['chartevents_{}'.format(idx)]['valuenum'] < vitals.iloc[i].ub]

    globals()['chartevents_{}'.format(idx)].sort_values(by=['subject_id','charttime'],ascending=True,inplace=True)
    globals()['chartevents_{}'.format(idx)].reset_index(inplace=True, drop=True)

    if not os.path.isfile('./data/vitals/chartevents_%s.parquet'%idx):
        globals()['chartevents_{}'.format(idx)].to_parquet('./data/vitals/chartevents_%s.parquet'%idx)

0it [00:00, ?it/s]

[1/13] Processing Arterial O2 pressure...
[2/13] Processing Arterial CO2 Pressure...
[3/13] Processing Inspired O2 Fraction...
[4/13] Processing Arterial O2 Saturation...
[5/13] Processing Respiratory Rate...
[6/13] Processing Arterial Blood Pressure systolic...
[7/13] Processing Arterial Blood Pressure diastolic...
[8/13] Processing Non Invasive Blood Pressure systolic...
[9/13] Processing Non Invasive Blood Pressure diastolic...
[10/13] Processing Temperature Celsius...
[11/13] Processing Temperature Fahrenheit...
[12/13] Processing Heart Rate...
[13/13] Processing Central Venous Pressure...


## Temp

In [21]:
chartevents_tempF['valuenum2'] = (chartevents_tempF['valuenum']-32)*5/9
chartevents_tempF.drop('valuenum',axis=1,inplace=True)
chartevents_tempF.rename(columns={'valuenum2':'valuenum'},inplace=True)
chartevents_tempF['valuenum'] = chartevents_tempF['valuenum'].round(1)

chartevents_tempC = pd.concat([chartevents_tempC,chartevents_tempF])
chartevents_tempC.sort_values(by=['subject_id','hadm_id','stay_id','charttime'],inplace=True)
chartevents_tempC.to_parquet('./data/vitals/chartevents_tempC.parquet')

# Urine Output

In [58]:
outputevents_uo = outputevents[outputevents['itemid'].isin([
    226557, 226558, #Ureteral Stent, 요관스텐트
    226559, #Foley, 도뇨관(소변줄)
    226560, 226561, 226563, 226564, 226565, 226566, 226567, 226584, 226627, 226631, 226632
])]

guirrigant_input = outputevents[outputevents['itemid'].isin([227488])]
guirrigant_output = outputevents[outputevents['itemid'].isin([227489])]

guirrigant = pd.merge(guirrigant_input,guirrigant_output[['subject_id','hadm_id','stay_id','charttime','value']],on=['subject_id','hadm_id','stay_id','charttime'])
guirrigant['value'] =  guirrigant['value_y'] - guirrigant['value_x']
guirrigant = guirrigant[guirrigant['value'] >= 0]
guirrigant = guirrigant[['subject_id','hadm_id','stay_id','charttime','itemid','value']]

outputevents_uo = outputevents_uo[['subject_id','hadm_id','stay_id','charttime','itemid','value']]
outputevents_uo = pd.concat([outputevents_uo,guirrigant])

outputevents_uo.sort_values(['subject_id','hadm_id','stay_id','charttime'],inplace=True)
outputevents_uo.reset_index(inplace=True,drop=True)

outputevents_uo.to_parquet('./data/others/outputevents_uo.parquet')

# Baseline SCr

In [70]:
def baseline_sCr(chartevents_SCr,icustays,labevents) :
    icu_sCr_subject_id = chartevents_SCr['subject_id'].unique()
    icu_sCr_stay_id = chartevents_SCr['stay_id'].unique()

    icu_icustays_sCr = icustays[icustays['stay_id'].isin(icu_sCr_stay_id)]
    icu_icustays_sCr['intime'] = pd.to_datetime(icu_icustays_sCr['intime'])

    hosp_labevents_sCr = labevents[labevents['itemid'].isin([
    50912, # Creatinine, Blood, Chemistry
    52024, # Creatinine, Whole Blood, Blood, Chemistry
    52546  # Creatinine, Blood, Chemistry
    ])]

    hosp_labevents_sCr = hosp_labevents_sCr[['subject_id','hadm_id','charttime','valuenum']]
    hosp_labevents_sCr['charttime'] = pd.to_datetime(hosp_labevents_sCr['charttime'])

    baseline_sCr_list = []

    for i in tqdm(icu_sCr_subject_id) : 
        tmp_hosp_labevents_sCr = hosp_labevents_sCr[hosp_labevents_sCr['subject_id'] == i]
        for j in icu_icustays_sCr[icu_icustays_sCr['subject_id'] == i].stay_id : 
            tmp_icu_icustays_intime = icu_icustays_sCr[icu_icustays_sCr['stay_id'] == j].iloc[0]
            tmp_icu_icustays_intime_7days = tmp_icu_icustays_intime['intime'] - timedelta(days=7)
            tmp_icu_icustays_intime_1yr = tmp_icu_icustays_intime['intime'] - timedelta(days=365)

            baseline_sCr = 0
            
            tmp_hosp_labevents_sCr_list = tmp_hosp_labevents_sCr[tmp_hosp_labevents_sCr['charttime'] < tmp_icu_icustays_intime['intime']]
            tmp_hosp_labevents_sCr_list = tmp_hosp_labevents_sCr_list[tmp_hosp_labevents_sCr_list['charttime'] > tmp_icu_icustays_intime_7days]

            if not tmp_hosp_labevents_sCr_list.empty :
                baseline_sCr = tmp_hosp_labevents_sCr_list['valuenum'].min()

            else : 
                tmp_hosp_labevents_sCr_list = tmp_hosp_labevents_sCr_list[tmp_hosp_labevents_sCr_list['charttime'] < tmp_icu_icustays_intime_7days]
                tmp_hosp_labevents_sCr_list = tmp_hosp_labevents_sCr_list[tmp_hosp_labevents_sCr_list['charttime'] > tmp_icu_icustays_intime_1yr]
                baseline_sCr = tmp_hosp_labevents_sCr_list['valuenum'].median()

            baseline_sCr_list.append([i, j, baseline_sCr])

    df_baseline_sCr_list = pd.DataFrame(baseline_sCr_list, columns=['subject_id','stay_id','valuenum'])
    return df_baseline_sCr_list

In [73]:
baseline_sCr = baseline_sCr(chartevents_SCr,icustays,labevents)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  icu_icustays_sCr['intime'] = pd.to_datetime(icu_icustays_sCr['intime'])


  0%|          | 0/49635 [00:00<?, ?it/s]

# Glasgow Coma Scale

In [74]:
chartevents_GCS_eye = chartevents[chartevents['itemid'].isin([220739])]
chartevents_GCS_verval = chartevents[chartevents['itemid'].isin([223900])]
chartevents_GCS_motor = chartevents[chartevents['itemid'].isin([223901])]

In [128]:
chartevents_GCS_eye.to_parquet('./data/others/chartevents_GCS_eye.parquet')
chartevents_GCS_verval.to_parquet('./data/others/chartevents_GCS_verval.parquet')
chartevents_GCS_motor.to_parquet('./data/others/chartevents_GCS_motor.parquet')

# Fluid/Vasopressor intake

## Fluid

In [15]:
fluid = [220949, 220950, 220952, 225158, 225159, 225161, 225828, 225797, 225799, 225823, 225825, 225827, 225830, 226089, 225941, 225943, 225944, 226361, 226363, 226364, 226375, 226377, 226452, 226453, 227533, 228140, 228141, 228142, 228341, 220955, 220967, 220968, 220953]

In [10]:
inputevents_fluid = inputevents[inputevents['itemid'].isin(fluid)]

In [83]:
icustays[icustays['stay_id'].isin(tmp.stay_id.unique())][['stay_id','intime','outtime']]

Unnamed: 0,stay_id,intime,outtime
0,39553978,2180-07-23 14:00:00,2180-07-23 23:50:47
1,39765666,2189-06-27 08:42:00,2189-06-27 20:38:27
2,37067082,2157-11-20 19:18:02,2157-11-21 22:08:00


In [81]:
tmp = inputevents_fluid[['stay_id','starttime','endtime','amount']].head(10)

In [84]:
tmp = pd.merge(tmp,icustays[icustays['stay_id'].isin(tmp.stay_id.unique())][['stay_id','intime','outtime']],on='stay_id',how='left')

In [86]:
tmp.to_csv('./data/fluid.csv')

In [87]:
data = tmp.copy()

In [88]:
from pandas import to_datetime

# Convert the time columns to datetime objects for easier manipulation
data['starttime'] = to_datetime(data['starttime'])
data['endtime'] = to_datetime(data['endtime'])
data['intime'] = to_datetime(data['intime'])
data['outtime'] = to_datetime(data['outtime'])

# Resampling and calculating medication amount for each stay_id
resampled_data = []

# Process each stay_id separately
for stay_id in data['stay_id'].unique():
    # Filter data for the current stay_id
    stay_data = data[data['stay_id'] == stay_id]

    # Creating a time range from intime to outtime, at one-hour intervals
    time_range = pd.date_range(start=stay_data['intime'].min(), end=stay_data['outtime'].max(), freq='H')

    # Initialize a DataFrame to store resampled data for this stay_id
    stay_resampled = pd.DataFrame({'time': time_range})
    stay_resampled['amount'] = 0  # Initialize amount column with 0

    # Loop through each medication administration record
    for index, row in stay_data.iterrows():
        # Find the time range when the medication was administered
        mask = (stay_resampled['time'] >= row['starttime']) & (stay_resampled['time'] <= row['endtime'])
        # Proportionally distribute the amount of medication over the time steps
        time_steps = stay_resampled[mask].shape[0]
        if time_steps > 0:
            stay_resampled.loc[mask, 'amount'] += row['amount'] / time_steps

    # Add stay_id column to the resampled data
    stay_resampled['stay_id'] = stay_id
    resampled_data.append(stay_resampled)

# Combine all the resampled data into a single DataFrame
final_resampled_data = pd.concat(resampled_data)

# Display the first few rows of the final resampled data
final_resampled_data.head()


Unnamed: 0,time,amount,stay_id
0,2180-07-23 14:00:00,0.0,39553978
1,2180-07-23 15:00:00,0.0,39553978
2,2180-07-23 16:00:00,0.0,39553978
3,2180-07-23 17:00:00,200.0,39553978
4,2180-07-23 18:00:00,0.0,39553978


In [94]:
final_resampled_data[final_resampled_data['stay_id']==39553978]

Unnamed: 0,time,amount,stay_id
0,2180-07-23 14:00:00,0.0,39553978
1,2180-07-23 15:00:00,0.0,39553978
2,2180-07-23 16:00:00,0.0,39553978
3,2180-07-23 17:00:00,200.0,39553978
4,2180-07-23 18:00:00,0.0,39553978
5,2180-07-23 19:00:00,0.0,39553978
6,2180-07-23 20:00:00,0.0,39553978
7,2180-07-23 21:00:00,0.0,39553978
8,2180-07-23 22:00:00,0.0,39553978
9,2180-07-23 23:00:00,0.0,39553978


In [93]:
data[data['stay_id']==39553978]

Unnamed: 0,stay_id,starttime,endtime,amount,intime,outtime
0,39553978,2180-07-23 21:10:00,2180-07-23 21:11:00,100.0,2180-07-23 14:00:00,2180-07-23 23:50:47
1,39553978,2180-07-23 17:00:00,2180-07-23 17:01:00,200.0,2180-07-23 14:00:00,2180-07-23 23:50:47
2,39553978,2180-07-23 18:56:00,2180-07-23 18:57:00,100.0,2180-07-23 14:00:00,2180-07-23 23:50:47


In [95]:
# Correcting the calculation of the medication amount distribution

def calculate_medication_distribution(row, start_time, end_time, amount):
    """Calculate the amount of medication distributed across the time range."""
    # Time range overlap with medication administration time
    overlap_start = max(row['time'], start_time)
    overlap_end = min(row['time'] + pd.Timedelta(hours=1), end_time)
    # Proportion of the hour during which medication was administered
    overlap = (overlap_end - overlap_start) / pd.Timedelta(hours=1)
    return overlap * amount

# Re-calculate the amount distribution
for stay_resampled in resampled_data:
    for index, med_row in data[data['stay_id'] == stay_resampled['stay_id'].iloc[0]].iterrows():
        # Apply the calculation function to each row
        stay_resampled['amount'] += stay_resampled.apply(calculate_medication_distribution, 
                                                         args=(med_row['starttime'], med_row['endtime'], med_row['amount']), 
                                                         axis=1)

# Combine all the resampled data into a single DataFrame again
final_resampled_data_corrected = pd.concat(resampled_data)

# Display the first few rows of the corrected final resampled data
final_resampled_data_corrected.head()


Unnamed: 0,time,amount,stay_id
0,2180-07-23 14:00:00,-1410.0,39553978
1,2180-07-23 15:00:00,-1010.0,39553978
2,2180-07-23 16:00:00,-610.0,39553978
3,2180-07-23 17:00:00,-206.666667,39553978
4,2180-07-23 18:00:00,-411.666667,39553978


In [96]:
# Resetting the amount calculation
for stay_resampled in resampled_data:
    stay_resampled['amount'] = 0

# Correct approach to calculate medication amount
for stay_resampled in resampled_data:
    stay_id = stay_resampled['stay_id'].iloc[0]
    for index, med_row in data[data['stay_id'] == stay_id].iterrows():
        # Calculate the overlap between each time step and the medication administration period
        for index, time_row in stay_resampled.iterrows():
            # Calculate overlap
            overlap_start = max(time_row['time'], med_row['starttime'])
            overlap_end = min(time_row['time'] + pd.Timedelta(hours=1), med_row['endtime'])
            # Calculate overlap duration in hours
            overlap_hours = (overlap_end - overlap_start) / pd.Timedelta(hours=1)
            # Distribute medication amount based on the proportion of the hour during which medication was administered
            if overlap_hours > 0:
                stay_resampled.at[index, 'amount'] += (overlap_hours / ((med_row['endtime'] - med_row['starttime']) / pd.Timedelta(hours=1))) * med_row['amount']

# Combine all the resampled data into a single DataFrame again
final_resampled_data_corrected = pd.concat(resampled_data)

# Display the first few rows of the corrected final resampled data
final_resampled_data_corrected.head()


Unnamed: 0,time,amount,stay_id
0,2180-07-23 14:00:00,0.0,39553978
1,2180-07-23 15:00:00,0.0,39553978
2,2180-07-23 16:00:00,0.0,39553978
3,2180-07-23 17:00:00,200.0,39553978
4,2180-07-23 18:00:00,100.0,39553978


In [97]:
final_resampled_data_corrected

Unnamed: 0,time,amount,stay_id
0,2180-07-23 14:00:00,0.0,39553978
1,2180-07-23 15:00:00,0.0,39553978
2,2180-07-23 16:00:00,0.0,39553978
3,2180-07-23 17:00:00,200.0,39553978
4,2180-07-23 18:00:00,100.0,39553978
5,2180-07-23 19:00:00,0.0,39553978
6,2180-07-23 20:00:00,0.0,39553978
7,2180-07-23 21:00:00,100.0,39553978
8,2180-07-23 22:00:00,0.0,39553978
9,2180-07-23 23:00:00,0.0,39553978


## Vasopressor

In [100]:
inputevents_epinephrine = inputevents[inputevents['itemid'].isin([221289])]
inputevents_dopamine = inputevents[inputevents['itemid'].isin([221662])]
inputevents_dobutamine = inputevents[inputevents['itemid'].isin([221653])]
inputevents_norephinephrine = inputevents[inputevents['itemid'].isin([221906])]
inputevents_phenylephrine = inputevents[inputevents['itemid'].isin([221749, 229630, 229632])]
inputevents_vasopressin = inputevents[inputevents['itemid'].isin([222315])]

In [101]:
inputevents_norephinephrine.rateuom.unique()

array(['mcg/kg/min', 'mg/kg/min'], dtype=object)

In [102]:
inputevents_epinephrine.rateuom.unique()

array(['mcg/kg/min'], dtype=object)

In [103]:
inputevents_phenylephrine.rateuom.unique()

array(['mcg/kg/min', 'mcg/min'], dtype=object)

In [105]:
inputevents_vasopressin.rateuom.unique()

array(['units/hour', 'units/min'], dtype=object)

In [106]:
inputevents_dopamine

Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,starttime,endtime,storetime,itemid,amount,amountuom,...,ordercomponenttypedescription,ordercategorydescription,patientweight,totalamount,totalamountuom,isopenbag,continueinnextdept,statusdescription,originalamount,originalrate
198,10001884,26184834,37510196,47220,2131-01-11 07:40:00,2131-01-11 08:17:00,2131-01-11 08:30:00,221662,19.259530,mg,...,Main order parameter,Continuous Med,65.0,250.0,ml,0,0,ChangeDose/Rate,334.178864,8.0
210,10001884,26184834,37510196,47220,2131-01-11 08:17:00,2131-01-11 09:14:00,2131-01-11 08:30:00,221662,22.243373,mg,...,Main order parameter,Continuous Med,65.0,250.0,ml,0,0,ChangeDose/Rate,314.919342,6.0
212,10001884,26184834,37510196,47220,2131-01-11 09:14:00,2131-01-11 10:01:00,2131-01-11 09:35:00,221662,6.113676,mg,...,Main order parameter,Continuous Med,65.0,250.0,ml,0,0,Paused,292.675964,2.0
249,10001884,26184834,37510196,64480,2131-01-11 04:50:00,2131-01-11 05:04:00,2131-01-11 04:50:00,221662,9.105691,mg,...,Main order parameter,Continuous Med,65.0,250.0,ml,0,0,ChangeDose/Rate,400.000000,10.0
254,10001884,26184834,37510196,64480,2131-01-11 05:04:00,2131-01-11 06:54:00,2131-01-11 05:04:00,221662,35.772359,mg,...,Main order parameter,Continuous Med,65.0,250.0,ml,0,0,ChangeDose/Rate,390.894318,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8974078,19997293,20208898,31629173,72391,2123-10-15 01:45:00,2123-10-15 01:55:00,2123-10-15 02:16:00,221662,3.364172,mg,...,Main order parameter,Continuous Med,134.5,250.0,ml,0,0,ChangeDose/Rate,400.000000,2.5
8974083,19997293,20208898,31629173,72391,2123-10-15 01:55:00,2123-10-15 02:00:00,2123-10-15 02:17:00,221662,2.355320,mg,...,Main order parameter,Continuous Med,134.5,250.0,ml,0,0,Paused,396.635834,3.5
8976232,19997473,27787494,32134105,24834,2173-09-18 23:00:00,2173-09-18 23:10:00,2173-09-18 23:09:00,221662,9.478673,mg,...,Main order parameter,Continuous Med,63.1,250.0,ml,0,0,ChangeDose/Rate,400.000000,15.0
8976234,19997473,27787494,32134105,24834,2173-09-18 23:10:00,2173-09-18 23:15:00,2173-09-18 23:10:00,221662,3.159558,mg,...,Main order parameter,Continuous Med,63.1,250.0,ml,0,0,ChangeDose/Rate,390.521332,10.0


In [107]:
inputevents_dobutamine

Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,starttime,endtime,storetime,itemid,amount,amountuom,...,ordercomponenttypedescription,ordercategorydescription,patientweight,totalamount,totalamountuom,isopenbag,continueinnextdept,statusdescription,originalamount,originalrate
24575,10022620,27180902,31953583,16723,2174-01-04 20:50:00,2174-01-05 14:08:00,2174-01-05 14:09:00,221653,145.498015,mg,...,Main order parameter,Continuous Med,56.0,250.0,ml,0,0,FinishedRunning,145.498001,2.500659
24603,10022620,27180902,31953583,25610,2174-01-04 14:38:00,2174-01-04 20:50:00,2174-01-04 14:50:00,221653,104.201685,mg,...,Main order parameter,Continuous Med,56.0,250.0,ml,0,0,ChangeDose/Rate,250.000000,5.000000
24613,10022620,27180902,31953583,25610,2174-01-05 14:08:00,2174-01-05 14:50:00,2174-01-05 14:09:00,221653,5.885651,mg,...,Main order parameter,Continuous Med,56.0,250.0,ml,0,0,Stopped,250.000000,2.500000
24694,10022620,27180902,31953583,55302,2174-01-03 23:51:00,2174-01-04 14:38:00,2174-01-04 14:50:00,221653,248.500002,mg,...,Main order parameter,Continuous Med,56.0,250.0,ml,0,0,FinishedRunning,248.500000,5.003593
24734,10022620,27180902,31953583,58657,2174-01-03 12:27:00,2174-01-03 14:00:00,2174-01-03 12:27:00,221653,13.025211,mg,...,Main order parameter,Continuous Med,56.0,250.0,ml,0,0,ChangeDose/Rate,250.000000,2.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8947554,19968075,28592225,31756531,24834,2153-04-22 23:44:00,2153-04-23 07:31:00,2153-04-23 07:29:00,221653,191.709037,mg,...,Main order parameter,Continuous Med,102.6,250.0,ml,0,0,FinishedRunning,191.709000,4.000431
8947592,19968075,28592225,31756531,24834,2153-04-23 07:31:00,2153-04-23 13:25:00,2153-04-23 07:29:00,221653,145.305547,mg,...,Main order parameter,Continuous Med,102.6,250.0,ml,0,0,ChangeDose/Rate,250.000000,4.000000
8969293,19990821,27777812,38906628,94919,2143-03-07 02:11:00,2143-03-07 02:34:00,2143-03-07 02:11:00,221653,7.133995,mg,...,Main order parameter,Continuous Med,62.0,250.0,ml,0,0,ChangeDose/Rate,250.000000,5.000000
8969297,19990821,27777812,38906628,94919,2143-03-07 02:34:00,2143-03-07 02:35:00,2143-03-07 02:34:00,221653,0.496658,mg,...,Main order parameter,Continuous Med,62.0,250.0,ml,0,0,ChangeDose/Rate,242.866013,8.000001


In [109]:
procedureevents[procedureevents['itemid'].isin([225802])]

Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,starttime,endtime,storetime,itemid,value,valueuom,...,orderid,linkorderid,ordercategoryname,ordercategorydescription,patientweight,isopenbag,continueinnextdept,statusdescription,originalamount,originalrate
220,10004235,24181354,34100191,96601.0,2196-02-26 01:00:00,2196-02-27 16:27:00,2196-02-27 16:42:00,225802,2367.000000,min,...,3699739,3699739,Dialysis,ContinuousProcess,127.0,1,0,FinishedRunning,2367.0,1
621,10007818,22987108,32359580,29855.0,2146-06-26 01:00:00,2146-06-26 14:21:00,2146-06-26 01:00:00,225802,801.000000,min,...,5520577,5520577,Dialysis,ContinuousProcess,86.2,0,0,Stopped,2208.0,1
638,10007818,22987108,32359580,65154.0,2146-06-22 21:10:00,2146-06-24 09:58:00,2146-06-24 10:06:00,225802,2208.000000,min,...,8439366,8439366,Dialysis,ContinuousProcess,86.2,1,0,FinishedRunning,2208.0,1
641,10007818,22987108,32359580,65154.0,2146-06-26 16:30:00,2146-06-28 08:00:00,2146-06-28 06:39:00,225802,1.645833,day,...,5677703,5677703,Dialysis,ContinuousProcess,86.2,0,0,Stopped,4.0,1
653,10007818,22987108,32359580,82842.0,2146-06-24 12:23:00,2146-06-26 01:11:00,2146-06-24 12:23:00,225802,2208.000000,min,...,3377209,3377209,Dialysis,ContinuousProcess,86.2,0,0,FinishedRunning,2208.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
694893,19986880,28386154,32959861,53913.0,2185-08-06 11:20:00,2185-08-08 02:28:00,2185-08-06 11:41:00,225802,2348.000000,min,...,2552802,2552802,Dialysis,ContinuousProcess,57.0,0,0,FinishedRunning,2348.0,1
694896,19986880,28386154,32959861,76255.0,2185-08-08 11:40:00,2185-08-10 02:48:00,2185-08-08 11:40:00,225802,2348.000000,min,...,8042255,8042255,Dialysis,ContinuousProcess,57.0,0,0,FinishedRunning,2348.0,1
694899,19986880,28386154,32959861,86518.0,2185-08-04 16:10:00,2185-08-04 17:00:00,2185-08-04 18:26:00,225802,50.000000,min,...,4163989,4163989,Dialysis,ContinuousProcess,57.0,1,0,FinishedRunning,50.0,1
694900,19986880,28386154,32959861,86518.0,2185-08-04 17:30:00,2185-08-06 08:38:00,2185-08-06 09:52:00,225802,2348.000000,min,...,8782702,8782702,Dialysis,ContinuousProcess,57.0,1,0,FinishedRunning,2348.0,1


In [110]:
resample_AG = pd.read_parquet('./data/resample/resample_AG.parquet')
resample_Alb = pd.read_parquet('./data/resample/resample_Alb.parquet')
