In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys, os, pickle

from tqdm.notebook import tqdm
from datetime import timedelta
#from utils import baseline_SCr

# When the kernel starts inside a directory named exactly "code", move up one
# level so the relative data paths below resolve from its parent directory.
# The final path component is compared as a whole: a raw suffix test
# (cwd[-4:] == "code") would also fire for unrelated directories such as
# ".../barcode" or ".../vscode".
if os.path.basename(os.getcwd()) == "code":
    os.chdir('../')

# Directories holding the parquet tables. The trailing slash matters because
# file names are appended to these strings by plain concatenation.
icu = './data/mimic-iv-2.2-parquet/icu/'
hosp = './data/mimic-iv-2.2-parquet/hosp/'

In [2]:
def _read_hosp(filename):
    """Load one parquet file from the `hosp` directory into a DataFrame."""
    return pd.read_parquet(hosp + filename)


# Laboratory events and the dictionary describing their item ids.
labevents     = _read_hosp('labevents.parquet')
d_labitems    = _read_hosp('d_labitems.parquet')

# Patient-level and admission-level tables (used for the demographics below).
patients      = _read_hosp('patients.parquet')
admissions    = _read_hosp('admissions.parquet')

# Diagnoses, microbiology results and prescriptions.
diagnoses_icd = _read_hosp('diagnoses_icd.parquet')
microbiology  = _read_hosp('microbiologyevents.parquet')
prescriptions = _read_hosp('prescriptions.parquet')

In [3]:
def _read_icu(filename):
    """Load one parquet file from the `icu` directory into a DataFrame."""
    return pd.read_parquet(icu + filename)


# Charted ICU observations and the dictionary describing their item ids.
chartevents     = _read_icu('chartevents.parquet')
d_items         = _read_icu('d_items.parquet')

# Inputs, procedures and the ICU stay table.
inputevents     = _read_icu('inputevents.parquet')
procedureevents = _read_icu('procedureevents.parquet')
icustays        = _read_icu('icustays.parquet')

In [4]:
# Parse chartevents.charttime into pandas datetimes (in place on the shared
# frame). The extraction loops later in this notebook sort on this column.
chartevents['charttime'] = pd.to_datetime(chartevents['charttime'])

In [5]:
# Lookup tables that drive the extraction loops further down. Columns read by
# those loops: abbreviation, labvalue, itemid_icu, lb, lb_cond, ub, ub_cond
# (and, for labvalues only, itemid_hosp).
labvalues = pd.read_csv('./data/labvalues/labvalues.csv')
vitals = pd.read_csv('./data/vitals/vitals.csv')

# Demographic

### Gender

In [106]:
# Encode gender as a binary flag: 'F' -> 1, 'M' -> 0 (other values are left as-is).
# The explicit .copy() makes patients_gender an independent frame; without it
# the .loc assignments below operate on a selection derived from `patients`
# and pandas emits SettingWithCopyWarning.
patients_gender = patients[['subject_id','gender']].copy()
patients_gender.loc[patients_gender.gender == 'F' , 'gender'] = 1
patients_gender.loc[patients_gender.gender == 'M' , 'gender'] = 0

In [107]:
# Display the encoded gender table (1 = 'F', 0 = 'M').
patients_gender

Unnamed: 0,subject_id,gender
0,10000032,1
1,10000048,1
2,10000068,1
3,10000084,0
4,10000102,1
...,...,...
299707,19999828,1
299708,19999829,1
299709,19999840,0
299710,19999914,1


### Age

In [108]:
# Parse the ICU admission/discharge timestamps into pandas datetimes (in place);
# `intime` is subtracted from a datetime in the age computation below.
icustays['intime'] = pd.to_datetime(icustays['intime'])
icustays['outtime'] = pd.to_datetime(icustays['outtime'])

In [109]:
# One row per ICU stay with its identifiers and admission time.
icustays_intime = icustays.loc[:, ['subject_id','hadm_id','stay_id','intime']]

# Per-patient anchor information needed to derive the age at admission.
patients_age = patients.loc[:, ['subject_id','anchor_age','anchor_year']]

# Left join: every ICU stay is kept, enriched with its patient's anchor data.
icustays_age = icustays_intime.merge(patients_age, how = 'left', on = 'subject_id')

In [110]:
# Age at ICU admission, in whole years:
#   anchor_age + (intime - 1 January of anchor_year)
# Both terms are expressed as timedeltas using a 365.25-day year.
age_at_anchor = pd.to_timedelta(icustays_age['anchor_age'] * 365.25, unit='D')
anchor_start = pd.to_datetime(icustays_age['anchor_year'], format="%Y")
time_since_anchor = icustays_age['intime'] - anchor_start

# Dividing by 365.25 rescales days to years; .dt.days then keeps the floor.
age_years = ((age_at_anchor + time_since_anchor) / 365.25).dt.days

# Keep only the stay identifiers and the derived age.
icustays_age = icustays_age.assign(age=age_years)[['subject_id','hadm_id','stay_id','age']]

In [111]:
# Display the per-stay age table (subject_id, hadm_id, stay_id, age).
icustays_age

Unnamed: 0,subject_id,hadm_id,stay_id,age
0,10000032,29079034,39553978,52
1,10000980,26913865,39765666,76
2,10001217,24597018,37067082,55
3,10001217,27703517,34592300,55
4,10001725,25563031,31205490,46
...,...,...,...,...
73176,19999442,26785317,32336619,43
73177,19999625,25304202,31070865,82
73178,19999828,25744818,36075953,48
73179,19999840,21033226,38978960,58


### Race

In [57]:
# Work on an independent copy so the relabelling below never touches `admissions`.
admissions_race = admissions[['subject_id','race']].copy()

In [58]:
# Collapse the detailed race labels into coarse groups.
# Key = target label, value = raw labels folded into it. Labels not listed
# here (e.g. values that already equal a target label) are left unchanged.
race_groups = {
    'ASIAN': ['ASIAN - ASIAN INDIAN', 'ASIAN - CHINESE','ASIAN - KOREAN', 'ASIAN - SOUTH EAST ASIAN'],
    'BLACK': ['BLACK/AFRICAN AMERICAN','BLACK/AFRICAN','BLACK/CAPE VERDEAN','BLACK/CARIBBEAN ISLAND'],
    'HISPANIC OR LATINO': ['HISPANIC/LATINO - CENTRAL AMERICAN','HISPANIC/LATINO - COLUMBIAN','HISPANIC/LATINO - CUBAN','HISPANIC/LATINO - DOMINICAN','HISPANIC/LATINO - GUATEMALAN','HISPANIC/LATINO - HONDURAN','HISPANIC/LATINO - MEXICAN',
                           'HISPANIC/LATINO - PUERTO RICAN','HISPANIC/LATINO - SALVADORAN','PORTUGUESE','SOUTH AMERICAN'],
    'PACIFIC ISLANDER': ['NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER'],
    'UNKNOWN': ['UNABLE TO OBTAIN','PATIENT DECLINED TO ANSWER'],
    'WHITE': ['WHITE - BRAZILIAN','WHITE - EASTERN EUROPEAN','WHITE - OTHER EUROPEAN','WHITE - RUSSIAN'],
}

# Flatten to {raw label: group label}. The groups are disjoint and no target
# label appears among the raw labels, so one pass is equivalent to applying
# the six replacements one after another.
race_mapping = {raw: group for group, raws in race_groups.items() for raw in raws}

# Assign the result back rather than calling
# admissions_race['race'].replace(..., inplace=True): an in-place call on a
# selected column is chained assignment, which pandas 2.x warns about and
# which no longer updates the parent frame once Copy-on-Write is enabled.
admissions_race['race'] = admissions_race['race'].replace(race_mapping)

# Keep one row per distinct (subject_id, race) pair.
admissions_race = admissions_race.drop_duplicates()

In [61]:
# Resolve subjects that still carry more than one row (i.e. several labels).
#   * 'UNKNOWN' rows are ignored for these subjects.
#   * exactly one row left   -> that label is kept                     (`sol`)
#   * two or more rows left  -> relabelled 'MULTIPLE RACE/ETHNICITY'   (`multi`)
#
# Implemented with boolean masks instead of a per-subject Python loop. The
# loop rescanned the frame once per subject, depended on the pandas>=2 column
# naming of value_counts().to_frame().reset_index(), and pd.concat raised
# ValueError whenever `sol` or `multi` ended up empty.
# NOTE: rows in `sol`/`multi` follow the frame's row order, which can differ
# from the loop's per-subject grouping; the set of rows is the same.
has_several_rows = admissions_race['subject_id'].duplicated(keep=False)
hosp_race_multiple = admissions_race[has_several_rows]
multiple = hosp_race_multiple['subject_id'].unique()

# Drop the uninformative label, then see which subjects are still ambiguous.
informative = hosp_race_multiple[hosp_race_multiple['race'] != 'UNKNOWN']
still_ambiguous = informative['subject_id'].duplicated(keep=False)

sol = informative[~still_ambiguous]
multi = (
    informative[still_ambiguous]
    .assign(race='MULTIPLE RACE/ETHNICITY')
    .drop_duplicates()  # one row per multi-label subject
)

# Replace the rows of every resolved subject with its resolved row. Subjects
# that appear in neither `sol` nor `multi` keep their rows untouched.
resolved_ids = pd.concat([sol['subject_id'], multi['subject_id']])
unresolved = admissions_race[~admissions_race['subject_id'].isin(resolved_ids)]

admissions_race = pd.concat([unresolved, sol, multi])

In [62]:
# Sanity check: list the race labels that remain after grouping and resolution.
admissions_race.race.unique()

array(['WHITE', 'OTHER', 'BLACK', 'UNKNOWN', 'HISPANIC OR LATINO',
       'ASIAN', 'PACIFIC ISLANDER', 'AMERICAN INDIAN/ALASKA NATIVE',
       'MULTIPLE RACE/ETHNICITY'], dtype=object)

# Lab values

In [8]:
# Extract every configured lab value from the ICU chartevents table.
# For each row of `labvalues`: select its ICU itemid, drop measurements outside
# the configured bounds, sort per subject by time, expose the result as the
# global `chartevents_<abbreviation>` and write it to parquet if not yet cached.
# total= is passed explicitly because enumerate() has no length, which left
# tqdm without a progress bar ("0it").
for i,idx in tqdm(enumerate(labvalues.abbreviation), total=len(labvalues)):
    spec = labvalues.iloc[i]
    print("[%i/%i] Processing %s..."%(i+1,len(labvalues),spec.labvalue))

    events = chartevents[chartevents['itemid'].isin([spec.itemid_icu])][['subject_id','hadm_id','stay_id','charttime','itemid','valuenum']]

    # Lower bound: inclusive ('ge') or exclusive ('gt'); anything else = no bound.
    if spec.lb_cond == 'ge' :
        events = events[events['valuenum'] >= spec.lb]
    elif spec.lb_cond == 'gt' :
        events = events[events['valuenum'] > spec.lb]

    # Upper bound: inclusive ('le') or exclusive ('lt'); anything else = no bound.
    if spec.ub_cond == 'le' :
        events = events[events['valuenum'] <= spec.ub]
    elif spec.ub_cond == 'lt' :
        events = events[events['valuenum'] < spec.ub]

    # Re-bind instead of sorting in place: `events` is a filtered selection and
    # in-place mutation of such a frame can trigger SettingWithCopyWarning.
    events = events.sort_values(by=['subject_id','charttime'],ascending=True).reset_index(drop=True)

    # Publish under the dynamically built global name, as before.
    globals()['chartevents_{}'.format(idx)] = events

    # An existing file is never overwritten; delete it to refresh the cache.
    out_path = './data/labvalues/chartevents_%s.parquet'%idx
    if not os.path.isfile(out_path):
        events.to_parquet(out_path)

0it [00:00, ?it/s]

[1/28] Processing Albumin...
[2/28] Processing Alkaline Phosphate...
[3/28] Processing Anion Gap...
[4/28] Processing Blood urea nitrogen...
[5/28] Processing Calcium non-ionized...
[6/28] Processing Creatine Kinase...
[7/28] Processing Direct Bilirubin...
[8/28] Processing Glucose...
[9/28] Processing Hematocrit...
[10/28] Processing International Normalized Ratio...
[11/28] Processing PH...
[12/28] Processing Phosphorous...
[13/28] Processing Platelet Count...
[14/28] Processing Serum chloride...
[15/28] Processing Serum creatinine...
[16/28] Processing Serum Sodium...
[17/28] Processing Serum Potassium...
[18/28] Processing Total Bilirubin...
[19/28] Processing White blood cell count...
[20/28] Processing Serum glucose...
[21/28] Processing Magnesium...
[22/28] Processing Ionized calcium...
[23/28] Processing Serum HCO3...
[24/28] Processing AST...
[25/28] Processing ALT...
[26/28] Processing PTT...
[27/28] Processing Arterial Base Excess...
[28/28] Processing Lactic Acid...


In [6]:
# Extract every configured lab value from the hospital labevents table.
# For each row of `labvalues`: parse its hospital itemid(s), select the matching
# events, drop measurements outside the configured bounds, sort per subject by
# time, expose the result as the global `labevents_<abbreviation>` and write it
# to parquet if not yet cached.
# total= is passed explicitly because enumerate() has no length, which left
# tqdm without a progress bar ("0it").
for i,idx in tqdm(enumerate(labvalues.abbreviation), total=len(labvalues)):
    spec = labvalues.iloc[i]
    print("[%i/%i] Processing %s..."%(i+1,len(labvalues),spec.labvalue))
    try :
        # itemid_hosp is either a single number or a comma-separated string of ids.
        itemid = spec.itemid_hosp
        if isinstance(itemid, str):
            itemid = [int(x) for x in itemid.replace(" ","").split(",")]
        else :
            itemid = [int(itemid)]

        events = labevents[labevents['itemid'].isin(itemid)][['subject_id','hadm_id','charttime','itemid','valuenum']]

        # Lower bound: inclusive ('ge') or exclusive ('gt'); anything else = no bound.
        if spec.lb_cond == 'ge' :
            events = events[events['valuenum'] >= spec.lb]
        elif spec.lb_cond == 'gt' :
            events = events[events['valuenum'] > spec.lb]

        # Upper bound: inclusive ('le') or exclusive ('lt'); anything else = no bound.
        if spec.ub_cond == 'le' :
            events = events[events['valuenum'] <= spec.ub]
        elif spec.ub_cond == 'lt' :
            events = events[events['valuenum'] < spec.ub]

        # Re-bind instead of sorting in place: `events` is a filtered selection and
        # in-place mutation of such a frame can trigger SettingWithCopyWarning.
        events = events.sort_values(by=['subject_id','charttime'],ascending=True).reset_index(drop=True)

        # Publish under the dynamically built global name, as before.
        globals()['labevents_{}'.format(idx)] = events

        # An existing file is never overwritten; delete it to refresh the cache.
        out_path = './data/labvalues/labevents_%s.parquet'%idx
        if not os.path.isfile(out_path):
            events.to_parquet(out_path)
    except ValueError as err:
        # The original `except: ValueError` was a bare except whose body merely
        # evaluated the name ValueError, so every exception (KeyboardInterrupt
        # included) was discarded silently. Only ValueError is skippable now,
        # e.g. an itemid_hosp cell that cannot be parsed as integers
        # (presumably labs without a hospital item id - TODO confirm), and the
        # skip is reported instead of hidden.
        print("Skipping %s: %s"%(spec.labvalue, err))

0it [00:00, ?it/s]

[1/28] Processing Albumin...
[2/28] Processing Alkaline Phosphate...
[3/28] Processing Anion Gap...
[4/28] Processing Blood urea nitrogen...
[5/28] Processing Calcium non-ionized...
[6/28] Processing Creatine Kinase...
[7/28] Processing Direct Bilirubin...
[8/28] Processing Glucose...
[9/28] Processing Hematocrit...
[10/28] Processing International Normalized Ratio...
[11/28] Processing PH...
[12/28] Processing Phosphorous...
[13/28] Processing Platelet Count...
[14/28] Processing Serum chloride...
[15/28] Processing Serum creatinine...
[16/28] Processing Serum Sodium...
[17/28] Processing Serum Potassium...
[18/28] Processing Total Bilirubin...
[19/28] Processing White blood cell count...
[20/28] Processing Serum glucose...
[21/28] Processing Magnesium...
[22/28] Processing Ionized calcium...
[23/28] Processing Serum HCO3...
[24/28] Processing AST...
[25/28] Processing ALT...
[26/28] Processing PTT...
[27/28] Processing Arterial Base Excess...
[28/28] Processing Lactic Acid...


# Vitals

In [7]:
# Extract every configured vital sign from the ICU chartevents table.
# For each row of `vitals`: select its ICU itemid, drop measurements outside
# the configured bounds, sort per subject by time, expose the result as the
# global `chartevents_<abbreviation>` and write it to parquet if not yet cached.
# NOTE(review): these globals share the `chartevents_` prefix with the lab
# value extraction; an abbreviation present in both tables would be overwritten.
# total= is passed explicitly because enumerate() has no length, which left
# tqdm without a progress bar ("0it").
for i,idx in tqdm(enumerate(vitals.abbreviation), total=len(vitals)):
    spec = vitals.iloc[i]
    print("[%i/%i] Processing %s..."%(i+1,len(vitals),spec.labvalue))

    events = chartevents[chartevents['itemid'].isin([spec.itemid_icu])][['subject_id','hadm_id','stay_id','charttime','itemid','valuenum']]

    # Lower bound: inclusive ('ge') or exclusive ('gt'); anything else = no bound.
    if spec.lb_cond == 'ge' :
        events = events[events['valuenum'] >= spec.lb]
    elif spec.lb_cond == 'gt' :
        events = events[events['valuenum'] > spec.lb]

    # Upper bound: inclusive ('le') or exclusive ('lt'); anything else = no bound.
    if spec.ub_cond == 'le' :
        events = events[events['valuenum'] <= spec.ub]
    elif spec.ub_cond == 'lt' :
        events = events[events['valuenum'] < spec.ub]

    # Re-bind instead of sorting in place: `events` is a filtered selection and
    # in-place mutation of such a frame can trigger SettingWithCopyWarning.
    events = events.sort_values(by=['subject_id','charttime'],ascending=True).reset_index(drop=True)

    # Publish under the dynamically built global name, as before.
    globals()['chartevents_{}'.format(idx)] = events

    # An existing file is never overwritten; delete it to refresh the cache.
    out_path = './data/vitals/chartevents_%s.parquet'%idx
    if not os.path.isfile(out_path):
        events.to_parquet(out_path)

0it [00:00, ?it/s]

[1/15] Processing Arterial O2 pressure...
[2/15] Processing Arterial CO2 Pressure...
[3/15] Processing Inspired O2 Fraction...
[4/15] Processing Arterial O2 Saturation...
[5/15] Processing Respiratory Rate...
[6/15] Processing Arterial Blood Pressure systolic...
[7/15] Processing Arterial Blood Pressure diastolic...
[8/15] Processing Non Invasive Blood Pressure systolic...
[9/15] Processing Non Invasive Blood Pressure diastolic...
[10/15] Processing Temperature Celsius...
[11/15] Processing Temperature Fahrenheit...
[12/15] Processing Heart Rate...
[13/15] Processing ART BP Systolic...
[14/15] Processing ART BP Diastolic...
[15/15] Processing Central Venous Pressure...


In [8]:
# inputevents item ids treated as fluids (presumably IV fluid / blood products;
# verify the labels against d_items).
# NOTE(review): in the recorded value_counts output below, the last four ids
# (220955, 220967, 220968, 220953) match no inputevents rows.
fluid = [
    220949, 220950, 220952,
    225158, 225159, 225161,
    225828, 225797, 225799,
    225823, 225825, 225827, 225830,
    226089,
    225941, 225943, 225944,
    226361, 226363, 226364,
    226375, 226377,
    226452, 226453,
    227533,
    228140, 228141, 228142,
    228341,
    220955, 220967, 220968, 220953,
]

In [10]:
# Keep only the input events whose item id is listed in `fluid`.
is_fluid_item = inputevents['itemid'].isin(fluid)
inputevents_fluid = inputevents[is_fluid_item]

In [12]:
# Number of fluid input events per item id, most frequent first.
inputevents_fluid['itemid'].value_counts()

itemid
225158    1258660
220949    1034251
225943     561934
226452     428008
225799     304801
226453     224140
225828     108282
226089      94194
225944      71293
225797      70662
225823      27616
226364      20332
226361      15355
220952      13411
225159      10227
225161       8504
225825       5844
227533       5373
220950       4630
225827       3125
226363       2458
226375       2252
225830       1594
228140        576
225941        416
228341        312
226377        132
228142         51
228141         14
Name: count, dtype: int64