In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys, os, pickle

from tqdm.notebook import tqdm
from datetime import timedelta
#from utils import baseline_SCr

def in_code_dir(path):
    """Return True when the last component of `path` is exactly ``code``.

    A plain suffix test (``path[-4:] == "code"``) would also match directories
    such as ``barcode`` or ``mycode``; comparing the basename avoids that.
    """
    return os.path.basename(path) == "code"


# The data paths below are relative to the parent of the `code` directory, so
# step up one level when the notebook is started from inside `code`.
if in_code_dir(os.getcwd()):
    os.chdir('../')

# Directories holding the parquet exports of the MIMIC-IV 2.2 icu/ and hosp/ tables.
icu = './data/mimic-iv-2.2-parquet/icu/'
hosp = './data/mimic-iv-2.2-parquet/hosp/'

In [2]:
# Load the hosp/ parquet tables; every path is `hosp` + the table's file name.
labevents = pd.read_parquet(f"{hosp}labevents.parquet")
d_labitems = pd.read_parquet(f"{hosp}d_labitems.parquet")

patients = pd.read_parquet(f"{hosp}patients.parquet")
admissions = pd.read_parquet(f"{hosp}admissions.parquet")

diagnoses_icd = pd.read_parquet(f"{hosp}diagnoses_icd.parquet")
microbiology = pd.read_parquet(f"{hosp}microbiologyevents.parquet")
prescriptions = pd.read_parquet(f"{hosp}prescriptions.parquet")

In [3]:
# Load the icu/ parquet tables; every path is `icu` + the table's file name.
chartevents = pd.read_parquet(f"{icu}chartevents.parquet")
d_items = pd.read_parquet(f"{icu}d_items.parquet")
inputevents = pd.read_parquet(f"{icu}inputevents.parquet")
procedureevents = pd.read_parquet(f"{icu}procedureevents.parquet")
icustays = pd.read_parquet(f"{icu}icustays.parquet")

In [4]:
# Parse `charttime` with pd.to_datetime so later cells can sort and compare it
# as timestamps. The column is overwritten in place on `chartevents`.
chartevents['charttime'] = pd.to_datetime(chartevents['charttime'])

# Demographic

### Gender

In [106]:
# Binary gender encoding: 'F' -> 1, 'M' -> 0.
# Work on an explicit copy so the write never targets a slice of `patients`
# (the original `.loc` assignments triggered SettingWithCopyWarning), and build
# the column with `map` instead of writing ints into a string column.
# Note: any value other than 'F'/'M' becomes NaN, and the column is numeric.
GENDER_CODES = {'F': 1, 'M': 0}

patients_gender = patients[['subject_id','gender']].copy()
patients_gender['gender'] = patients_gender['gender'].map(GENDER_CODES)

In [107]:
# Display the recoded table (gender: 1 = 'F', 0 = 'M').
patients_gender

Unnamed: 0,subject_id,gender
0,10000032,1
1,10000048,1
2,10000068,1
3,10000084,0
4,10000102,1
...,...,...
299707,19999828,1
299708,19999829,1
299709,19999840,0
299710,19999914,1


### Age

In [108]:
# Convert the ICU admission and discharge columns to pandas datetimes.
for time_col in ('intime', 'outtime'):
    icustays[time_col] = pd.to_datetime(icustays[time_col])

In [109]:
# One row per ICU stay with its admission time ...
icustays_intime = icustays[['subject_id', 'hadm_id', 'stay_id', 'intime']]
# ... and one row per patient with the anchor age/year pair.
patients_age = patients[['subject_id', 'anchor_age', 'anchor_year']]

# Left join keeps every ICU stay and attaches the patient's anchor columns.
icustays_age = icustays_intime.merge(patients_age, how='left', on='subject_id')

In [110]:
# Age at ICU admission, in whole years:
#   (anchor_age expressed in days + time elapsed from 1 Jan of anchor_year
#    to `intime`) / 365.25, floored via `.dt.days`.
anchor_age_span = pd.to_timedelta(icustays_age['anchor_age']*365.25, unit='D')
anchor_year_start = pd.to_datetime(icustays_age['anchor_year'],format="%Y")

elapsed_since_anchor = icustays_age['intime'] - anchor_year_start
age_in_years = ((anchor_age_span + elapsed_since_anchor)/365.25).dt.days

# Keep only the stay identifiers plus the derived age.
icustays_age = icustays_age[['subject_id','hadm_id','stay_id']].assign(age=age_in_years)

In [111]:
# Display the per-stay table of identifiers and computed age.
icustays_age

Unnamed: 0,subject_id,hadm_id,stay_id,age
0,10000032,29079034,39553978,52
1,10000980,26913865,39765666,76
2,10001217,24597018,37067082,55
3,10001217,27703517,34592300,55
4,10001725,25563031,31205490,46
...,...,...,...,...
73176,19999442,26785317,32336619,43
73177,19999625,25304202,31070865,82
73178,19999828,25744818,36075953,48
73179,19999840,21033226,38978960,58


### Race

In [57]:
# Independent copy of the two columns, so later edits never touch `admissions`.
admissions_race = admissions.loc[:, ['subject_id', 'race']].copy()

In [58]:
# Collapse the fine-grained race labels into coarse groups.
# Each key is the coarse label; its list holds the raw labels folded into it.
RACE_GROUPS = {
    'ASIAN': ['ASIAN - ASIAN INDIAN', 'ASIAN - CHINESE','ASIAN - KOREAN', 'ASIAN - SOUTH EAST ASIAN'],
    'BLACK': ['BLACK/AFRICAN AMERICAN','BLACK/AFRICAN','BLACK/CAPE VERDEAN','BLACK/CARIBBEAN ISLAND'],
    'HISPANIC OR LATINO': ['HISPANIC/LATINO - CENTRAL AMERICAN','HISPANIC/LATINO - COLUMBIAN','HISPANIC/LATINO - CUBAN',
                           'HISPANIC/LATINO - DOMINICAN','HISPANIC/LATINO - GUATEMALAN','HISPANIC/LATINO - HONDURAN',
                           'HISPANIC/LATINO - MEXICAN','HISPANIC/LATINO - PUERTO RICAN','HISPANIC/LATINO - SALVADORAN',
                           'PORTUGUESE','SOUTH AMERICAN'],
    'PACIFIC ISLANDER': ['NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER'],
    'UNKNOWN': ['UNABLE TO OBTAIN','PATIENT DECLINED TO ANSWER'],
    'WHITE': ['WHITE - BRAZILIAN','WHITE - EASTERN EUROPEAN','WHITE - OTHER EUROPEAN','WHITE - RUSSIAN'],
}

# Invert to a flat {raw label: coarse label} lookup. No coarse label is also a
# raw label in any list, so one pass equals the original sequence of replaces.
race_lookup = {raw: coarse for coarse, raws in RACE_GROUPS.items() for raw in raws}

# Assign the result back instead of `df['race'].replace(..., inplace=True)`:
# an inplace method on a column selection is chained assignment, which pandas
# warns about and which has no effect on the frame under Copy-on-Write.
admissions_race['race'] = admissions_race['race'].replace(race_lookup)

# After grouping, several admissions of one subject share a label; keep one row each.
admissions_race = admissions_race.drop_duplicates()

In [61]:
# Resolve subjects that still have more than one (subject_id, race) row so that
# every subject ends up with exactly one label.
#
# Vectorized replacement for the original per-subject loop, which rescanned the
# whole frame for every subject and crashed in pd.concat when a bucket was empty.
# Row order inside `sol`/`multi` now follows the original row order rather than
# per-subject grouping; the set of rows is the same.

# Subject ids occurring more than once. Taken from the value_counts index so the
# result does not depend on how a pandas version names the reset_index columns.
subject_counts = admissions_race['subject_id'].value_counts()
multiple = subject_counts.index[subject_counts > 1].to_numpy()

hosp_race_multiple = admissions_race[admissions_race['subject_id'].isin(multiple)]

# 'UNKNOWN' never competes with a recorded label: drop it for these subjects.
known = hosp_race_multiple[hosp_race_multiple['race'] != 'UNKNOWN']

# A subject with two or more remaining rows has conflicting labels.
has_conflict = known['subject_id'].duplicated(keep=False)

# Exactly one remaining label -> that label is the answer.
sol = known[~has_conflict]
# Conflicting labels -> a single 'MULTIPLE RACE/ETHNICITY' row per subject.
multi = known[has_conflict].assign(race='MULTIPLE RACE/ETHNICITY').drop_duplicates()

# Swap the resolved subjects' original rows for their resolved rows.
resolved_ids = pd.concat([sol['subject_id'], multi['subject_id']]).unique()
admissions_race = admissions_race[~admissions_race['subject_id'].isin(resolved_ids)]

admissions_race = pd.concat([admissions_race,sol,multi])

In [62]:
# List the distinct race labels present in admissions_race.
admissions_race.race.unique()

array(['WHITE', 'OTHER', 'BLACK', 'UNKNOWN', 'HISPANIC OR LATINO',
       'ASIAN', 'PACIFIC ISLANDER', 'AMERICAN INDIAN/ALASKA NATIVE',
       'MULTIPLE RACE/ETHNICITY'], dtype=object)

# Lab values

In [5]:
# Lab-value specification table: one row per lab with its name (`labvalue`),
# `abbreviation`, chartevents `itemid`, numeric bounds (`lb`, `ub`) and the
# comparison used for each bound (`lb_cond`: ge/gt, `ub_cond`: le/lt).
labvalues = pd.read_csv('./data/labvalues/labvalues.csv')
labvalues

Unnamed: 0,labvalue,abbreviation,itemid,lb,ub,lb_cond,ub_cond
0,Albumin,Alb,227456,0.6,6.0,ge,le
1,Alkaline Phosphate,Alk_Phos,225612,20.0,3625.0,ge,le
2,Anion Gap,AG,227073,5.0,50.0,ge,le
3,Blood urea nitrogen,BUN,225624,0.0,300.0,gt,le
4,Calcium non-ionized,Ca,225625,0.2,20.0,ge,le
5,Creatine Kinase,CK,225634,0.0,30000.0,ge,le
6,Direct Bilirubin,D_Bil,225651,0.0,100.0,gt,le
7,Glucose,Glu,220621,20.0,2000.0,ge,le
8,Hematocrit,HCT,220545,0.0,100.0,gt,lt
9,International Normalized Ratio,INR,227467,0.3,10.0,ge,le


In [8]:
# For every lab in `labvalues`: pull its rows from chartevents, keep values
# inside the configured bounds, sort by subject and time, publish the result,
# and write it to parquet if no file exists yet.
LAB_COLUMNS = ['subject_id','hadm_id','stay_id','charttime','itemid','valuenum']
# Bound conditions are applied through the pandas Series method of the same
# name (Series.ge / gt / le / lt); any other value means "no filter".
LOWER_BOUND_CONDS = {'ge', 'gt'}
UPPER_BOUND_CONDS = {'le', 'lt'}

# Results keyed by abbreviation, so later code need not rely on dynamically
# created global names.
lab_chartevents = {}

# enumerate(tqdm(...), ...) with an explicit total lets the bar show progress;
# tqdm(enumerate(...)) cannot know the length and only displays "0it".
for i, lab in enumerate(tqdm(labvalues.itertuples(index=False), total=len(labvalues)), start=1):
    print("[%i/%i] Processing %s..."%(i,len(labvalues),lab.labvalue))

    events = chartevents.loc[chartevents['itemid'] == lab.itemid, LAB_COLUMNS]

    # Lower bound, then upper bound (rows with missing valuenum fail both).
    if lab.lb_cond in LOWER_BOUND_CONDS:
        events = events[getattr(events['valuenum'], lab.lb_cond)(lab.lb)]
    if lab.ub_cond in UPPER_BOUND_CONDS:
        events = events[getattr(events['valuenum'], lab.ub_cond)(lab.ub)]

    # Non-inplace sort/reset: operates on a new frame rather than mutating a
    # filtered slice of chartevents.
    events = (
        events
        .sort_values(by=['subject_id','charttime'], ascending=True)
        .reset_index(drop=True)
    )

    lab_chartevents[lab.abbreviation] = events
    # Kept for backward compatibility: cells that refer to `chartevents_<abbr>`.
    globals()['chartevents_{}'.format(lab.abbreviation)] = events

    # Existing files are never overwritten.
    out_path = './data/labvalues/chartevents_%s.parquet'%lab.abbreviation
    if not os.path.isfile(out_path):
        events.to_parquet(out_path)

0it [00:00, ?it/s]

[1/43] Processing Albumin...
[2/43] Processing Alkaline Phosphate...
[3/43] Processing Anion Gap...
[4/43] Processing Blood urea nitrogen...
[5/43] Processing Calcium non-ionized...
[6/43] Processing Creatine Kinase...
[7/43] Processing Direct Bilirubin...
[8/43] Processing Glucose...
[9/43] Processing Hematocrit...
[10/43] Processing International Normalized Ratio...
[11/43] Processing PH...
[12/43] Processing Phosphorous...
[13/43] Processing Platelet Count...
[14/43] Processing Serum chloride...
[15/43] Processing Serum creatinine...
[16/43] Processing Serum Sodium...
[17/43] Processing Serum Potassium...
[18/43] Processing Total Bilirubin...
[19/43] Processing White blood cell count...
[20/43] Processing Serum glucose...
[21/43] Processing Magnesium...
[22/43] Processing Ionized calcium...
[23/43] Processing Serum HCO3...
[24/43] Processing AST...
[25/43] Processing ALT...
[26/43] Processing PTT...
[27/43] Processing Arterial O2 pressure...
[28/43] Processing Arterial CO2 Pressure.

# Vital sign

In [95]:
# chartevents itemids of the vital signs (labels as listed in d_items).
vitals = [
    220045,  # Heart Rate
    220179,  # Non Invasive Blood Pressure systolic
    220050,  # Arterial Blood Pressure systolic
    220180,  # Non Invasive Blood Pressure diastolic
    220051,  # Arterial Blood Pressure diastolic
    223762,  # Temperature Celsius
    223761,  # Temperature Fahrenheit
    220210,  # Respiratory Rate
    220277,  # O2 saturation pulseoxymetry
    220227,  # Arterial O2 Saturation
]
len(vitals)

10

In [96]:
# Look up the d_items rows for the itemids in `vitals`.
d_items[d_items['itemid'].isin(vitals)]

Unnamed: 0,itemid,label,abbreviation,linksto,category,unitname,param_type,lownormalvalue,highnormalvalue
2,220045,Heart Rate,HR,chartevents,Routine Vital Signs,bpm,Numeric,,
6,220050,Arterial Blood Pressure systolic,ABPs,chartevents,Routine Vital Signs,mmHg,Numeric,90.0,140.0
7,220051,Arterial Blood Pressure diastolic,ABPd,chartevents,Routine Vital Signs,mmHg,Numeric,60.0,90.0
24,220179,Non Invasive Blood Pressure systolic,NBPs,chartevents,Routine Vital Signs,mmHg,Numeric,,
25,220180,Non Invasive Blood Pressure diastolic,NBPd,chartevents,Routine Vital Signs,mmHg,Numeric,,
28,220210,Respiratory Rate,RR,chartevents,Respiratory,insp/min,Numeric,,
31,220227,Arterial O2 Saturation,SaO2,chartevents,Labs,%,Numeric,,
36,220277,O2 saturation pulseoxymetry,SpO2,chartevents,Respiratory,%,Numeric,,
337,223761,Temperature Fahrenheit,Temperature F,chartevents,Routine Vital Signs,°F,Numeric,,
338,223762,Temperature Celsius,Temperature C,chartevents,Routine Vital Signs,°C,Numeric,,


In [101]:
# List the column names of chartevents.
chartevents.columns

Index(['subject_id', 'hadm_id', 'stay_id', 'caregiver_id', 'charttime',
      dtype='object')

In [100]:
# Boolean mask marking the chartevents columns whose name contains 'time'.
chartevents.columns.str.contains('time')

array([False, False, False, False,  True,  True, False, False, False,
       False, False])

In [113]:
# Inspect the chartevents rows for itemid 220228, which is not one of `vitals`.
# NOTE(review): presumably hemoglobin — confirm the label in d_items.
chartevents[chartevents['itemid'].isin([220228])]

Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,charttime,storetime,itemid,value,valuenum,valueuom,warning
2295,10001217,24597018,37067082,,2157-11-21 03:16:00,2157-11-21 04:30:00,220228,11.2,11.2,g/dl,1.0
3541,10001217,27703517,34592300,,2157-12-20 01:45:00,2157-12-20 02:05:00,220228,12.3,12.3,g/dl,0.0
4603,10001725,25563031,31205490,,2110-04-11 18:02:00,2110-04-11 18:26:00,220228,13.9,13.9,g/dl,0.0
4626,10001725,25563031,31205490,,2110-04-12 02:59:00,2110-04-12 03:25:00,220228,12.6,12.6,g/dl,0.0
18757,10001884,26184834,37510196,,2131-01-11 06:31:00,2131-01-11 07:04:00,220228,11.4,11.4,g/dl,0.0
...,...,...,...,...,...,...,...,...,...,...,...
313642206,19999840,21033226,38978960,,2164-09-16 04:57:00,2164-09-16 05:10:00,220228,12.4,12.4,g/dl,1.0
313642227,19999840,21033226,38978960,,2164-09-17 02:49:00,2164-09-17 03:22:00,220228,12.3,12.3,g/dl,1.0
313642275,19999840,21033226,38978960,,2164-09-17 13:13:00,2164-09-17 13:31:00,220228,11.4,11.4,g/dl,1.0
313644995,19999987,23865745,36195440,,2145-11-03 01:35:00,2145-11-03 01:44:00,220228,13.2,13.2,g/dl,0.0
