# Export raw data from the MIMIC-IV database

Only patients from the following care units are exported:
- Coronary Care Unit (CCU)
- Cardiac Vascular Intensive Care Unit (CVICU)

In [1]:
from projects.utils import *
from projects.common import *
from typing import Tuple
from tqdm import tqdm
# from tqdm.notebook import tqdm
from multiprocessing import Pool, RLock
from configobj import ConfigObj
import numpy as np
import getpass
import json
import math
import os
import psycopg2
import pandas as pd
import time

import matplotlib.pyplot as plt
%matplotlib inline


In [2]:
def connect_db():
    """Connect to the database described by the `db` directory.

    The directory is addressed relative to the current working directory
    (three levels up, then `db`) and handed to `connect_to_database`,
    whose result is returned unchanged.
    """
    working_dir = os.path.abspath('')
    db_dir = f"{working_dir}/../../../db"
    return connect_to_database(db_dir)


# Connect once up front; the returned handles are discarded here.
# NOTE(review): presumably this is only done so the connection details are
# prompted for / printed before the helpers below reconnect - confirm.
connect_db()

# def merge_lab_df(df1: pd.DataFrame, df2: pd.DataFrame):
#     target_cols = ['subject_id', 'hadm_id', 'charttime', 'specimen_id']
#     df1 = df1.sort_values(target_cols)
#     df2 = df2.sort_values(target_cols)
#     df2 = df2.loc[:, ~df2.columns.isin(target_cols)]
#     return pd.concat([df1, df2], axis=1)


def get_db_table_as_df_ext(_schema_type: str, _table: str,
                           _chunk: int = 10000):
    """Open a DB connection and fetch `_table` as an iterator of dataframes.

    Args:
        _schema_type: Which schema to query: 'core', 'hosp', 'icu' or
            'derived'. Any other value results in a schema of `None`.
        _table: Name of the table to read.
        _chunk: Rows per worker process; the chunk size requested from the
            database helper is `_chunk * MP_NUM_PROCESSES`.

    Returns:
        A tuple `(df_iter, num_chunks)`: the iterator returned by
        `get_database_table_as_dataframe` and the entry count it reported,
        divided by the chunk size and rounded up.
    """
    (query_schema_core,
     query_schema_hosp,
     query_schema_icu,
     query_schema_derived,
     conn) = connect_db()

    # The original used independent `if` statements with a single trailing
    # `else`, which only belonged to the 'derived' test and therefore reset
    # the schema to None for 'core', 'hosp' and 'icu'. A lookup table makes
    # the four cases mutually exclusive; unknown types still yield None.
    schema_by_type = {
        'core': query_schema_core,
        'hosp': query_schema_hosp,
        'icu': query_schema_icu,
        'derived': query_schema_derived,
    }
    _query_schema = schema_by_type.get(_schema_type)

    chunk_size = _chunk * MP_NUM_PROCESSES
    df_iter, num_entries = get_database_table_as_dataframe(
        conn, _query_schema, _table, _chunk_size=chunk_size)
    # Convert the reported entry count into the number of chunks to expect.
    num_chunks = math.ceil(num_entries / chunk_size)
    return df_iter, num_chunks


def get_info_save_path(data_dir: str, stay_id: int):
    """Return the path of the `info_<stay_id>.dsv` file inside `data_dir`."""
    filename = f"info_{stay_id!s}.dsv"
    return os.path.join(data_dir, filename)


def get_data_save_path(data_dir: str, stay_id: int):
    """Return the path of the `data_<stay_id>.dsv` file inside `data_dir`."""
    filename = f"data_{stay_id!s}.dsv"
    return os.path.join(data_dir, filename)


def save_data_dsv_ext(save_path: str, data: dict) -> None:
    """Wrap `data` in a pandas DataFrame and write it with `save_dsv`."""
    frame = pd.DataFrame(data)
    save_dsv(save_path, frame)


def create_dummy_files_func(export_dir, _custom_icustays_list, pid):
    """Write an empty data .dsv file for every stay id in the given list.

    Each target path must not exist yet (checked with an assertion). The
    file content is the data of a freshly constructed `DataTable`.
    `pid` is accepted but not used in the body.
    """
    progress = tqdm(_custom_icustays_list)
    for stay in progress:
        target = get_data_save_path(export_dir, stay)
        assert not os.path.exists(target)
        empty_table = pd.DataFrame(DataTable().data)
        save_data_dsv_ext(target, empty_table)


def create_dummy_files(export_dir: str, _custom_icustays_list: list):
    """Create empty placeholder .dsv files for all listed ICU stays.

    The work is handed to `parallel_processing`, which runs
    `create_dummy_files_func` with `MP_NUM_PROCESSES` processes.
    """
    worker_args = (export_dir, _custom_icustays_list)
    parallel_processing(
        create_dummy_files_func, MP_NUM_PROCESSES, *worker_args)
    print("Created dummy .dsv files.")


def split_df(df: pd.DataFrame, num_processes: int = 8):
    """Cut `df` row-wise into `num_processes` consecutive slices.

    Every slice except the last holds `ceil(len(df) / num_processes)` rows;
    the last slice takes whatever remains and may be shorter or empty.
    Returns the slices as a list, in row order.
    """
    step = math.ceil(len(df) / num_processes)
    last = num_processes - 1

    parts = []
    for idx in range(last):
        begin = step * idx
        parts.append(df.iloc[begin:begin + step])
    # The final slice is open-ended so no trailing rows are dropped.
    parts.append(df.iloc[step * last:])
    return parts


def parallel_processing_ext(_func,
                            _df_iter,
                            _num_entries: int,
                            _custom_icustays_list: list):
    """Run `_func` in parallel over every dataframe chunk of `_df_iter`.

    For each chunk: rows are restricted to the stays in
    `_custom_icustays_list` when a `stay_id` column exists, the chunk is
    sorted by whichever of the sort keys it contains, split into
    `MP_NUM_PROCESSES` slices and passed to `parallel_processing`.
    `_num_entries` is only used as the progress-bar total.

    TODO: the chunk should be split on `stay_id` boundaries so that all rows
    of one `stay_id` go to the same process. Otherwise two processes may
    touch the same dsv file, since the id determines which file is read.
    The current workaround is to use chunks large enough that this
    situation does not occur.
    """
    sort_list = ['subject_id', 'hadm_id', 'stay_id',
                 'charttime', 'starttime', 'endtime', ]

    for chunk in tqdm(_df_iter, total=_num_entries):
        # Row filtering below does not change the columns, so list them once.
        columns = chunk.columns.tolist()

        if 'stay_id' in columns:
            wanted = chunk.stay_id.isin(_custom_icustays_list)
            chunk = chunk[wanted]

        sort_keys = [key for key in sort_list if key in columns]
        chunk = chunk.sort_values(by=sort_keys)

        pieces = split_df(chunk, MP_NUM_PROCESSES)
        parallel_processing(_func, MP_NUM_PROCESSES, pieces)


Database: mimiciv
Username: mimiciv
>>>>> Connected to DB <<<<<


In [10]:
def export_func_factory(append_func, export_dir: str):
    """Build the worker function that exports one table into per-stay files.

    Args:
        append_func: Callable `(dt, df_row, columns) -> dt` that appends the
            values of one dataframe row to a DataTable and returns it.
        export_dir: Directory with the `data_<stay_id>.dsv` files that the
            returned function loads, extends and saves back.

    Returns:
        The nested function `export_func(dfs, pid)`.
    """

    # Publish the nested function under a module-level name.
    # NOTE(review): presumably required so the function can be pickled for
    # the worker processes - confirm.
    global export_func

    def export_func(dfs, pid):
        """Append every row of `dfs[0]` to the matching per-stay data file.

        Only the first element of `dfs` is read and `pid` is unused.
        NOTE(review): presumably `parallel_processing` hands each worker a
        one-element slice of the list of dataframes - confirm.

        Two modes, chosen by the dataframe's columns:
        * `stay_id` column present: the row goes to that stay's file. Files
          are loaded when the id changes and saved before switching, which
          relies on rows being grouped by `stay_id`.
        * otherwise: the row is matched through `subject_id` in the global
          `custom_icustays_dict` and appended to every stay of that subject
          whose ICU in/out time window contains the row's `charttime`.
        """

        # -1 marks "nothing loaded yet" for the id trackers below.
        stay_id_mem = -1
        subject_id_mem = -1
        hadm_id_mem = -1  # only referenced by the commented-out branch

        stay_ids_dt = {}  # dict of DataTable
        stay_ids_path = {}  # dict of paths

        save_path = None
        # -1: no row processed, 0: stay_id mode, 1: subject_id mode.
        save_flag = -1

        df = dfs[0]
        it = InfoTable()
        dt = DataTable()
        for df_i in df.iterrows():
            df_row = df_i[1]  # iterrows yields (index, row)

            if 'stay_id' in df.columns.tolist():
                save_flag = 0
                stay_id = df_row['stay_id']

                if stay_id_mem == -1:
                    # Initial loading of data
                    stay_id_mem = df_row['stay_id']
                    save_path = get_data_save_path(export_dir, stay_id)
                    dt.data = load_data_dsv(save_path)

                elif df_row['stay_id'] != stay_id_mem:
                    # Only load data when the id changes.
                    # Saves the previous data first before loading.
                    save_path = get_data_save_path(export_dir, stay_id_mem)
                    save_data_dsv_ext(save_path, dt.data)
                    stay_id_mem = df_row['stay_id']
                    save_path = get_data_save_path(export_dir, stay_id)
                    dt.data = load_data_dsv(save_path)

                dt = append_func(dt, df_row, df.columns.tolist())

            else:
                # elif 'hadm_id' not in df.columns.tolist():
                save_flag = 1
                # Keys of custom_icustays_dict are compared as strings (the
                # dict is loaded from JSON elsewhere in this notebook).
                sub_id = str(df_row['subject_id'])

                # Rows of subjects outside the custom selection are skipped.
                if sub_id not in custom_icustays_dict:
                    continue

                if subject_id_mem == -1:
                    # Initial loading of data
                    subject_id_mem = sub_id
                    # Load the data file of every stay of this subject.
                    for _, stay_ids in custom_icustays_dict[sub_id].items():
                        for stay_id in stay_ids:
                            save_path = get_data_save_path(export_dir, stay_id)
                            _dt = DataTable()
                            _dt.data = load_data_dsv(save_path)
                            stay_ids_dt[stay_id] = _dt
                            stay_ids_path[stay_id] = save_path

                elif subject_id_mem != sub_id:
                    # Only load data when the id changes.
                    # Saves the previous data first before loading.
                    subject_id_mem = sub_id
                    for stay_id, stay_id_dt in stay_ids_dt.items():
                        save_data_dsv_ext(
                            stay_ids_path[stay_id], stay_id_dt.data)
                    stay_ids_dt, stay_ids_path = {}, {}
                    for _, stay_ids in custom_icustays_dict[sub_id].items():
                        for stay_id in stay_ids:
                            save_path = get_data_save_path(export_dir, stay_id)
                            _dt = DataTable()
                            _dt.data = load_data_dsv(save_path)
                            stay_ids_dt[stay_id] = _dt
                            stay_ids_path[stay_id] = save_path

                # Append the row to each stay whose time window contains it.
                # The info file is always read from STRUCTURED_EXPORT_DIR,
                # not from `export_dir`.
                for _, stay_ids in custom_icustays_dict[sub_id].items():
                    for stay_id in stay_ids:
                        info_save_path = get_info_save_path(
                            STRUCTURED_EXPORT_DIR, stay_id)
                        it.data = load_info_dsv(info_save_path)
                        # NOTE(review): positions 13 and 14 of the 'value'
                        # column are taken as ICU in/out time, going by the
                        # variable names - confirm against the info layout.
                        icu_intime = it.data['value'][13]
                        icu_outtime = it.data['value'][14]
                        if pd.Timestamp(str(icu_intime)) <= df_row['charttime'] <= pd.Timestamp(str(icu_outtime)):
                            stay_ids_dt[stay_id] = append_func(
                                stay_ids_dt[stay_id], df_row, df.columns.tolist())

            # else:
            #     save_flag = 2
            #     sub_id = str(df_row['subject_id'])
            #     hadm_id = str(df_row['hadm_id'])

            #     if sub_id not in custom_icustays_dict:
            #         continue
            #     if hadm_id not in custom_icustays_dict[sub_id]:
            #         continue

            #     stay_ids = custom_icustays_dict[sub_id][hadm_id]

            #     if subject_id_mem == -1 and hadm_id_mem == -1:
            #         # Initial loading of data
            #         subject_id_mem = sub_id
            #         hadm_id_mem = hadm_id
            #         for stay_id in stay_ids:
            #             save_path = get_data_save_path(export_dir, stay_id)
            #             _dt = DataTable()
            #             _dt.data = load_data_dsv(save_path)
            #             stay_ids_dt.append(_dt)
            #             stay_ids_path.append(save_path)

            #     elif subject_id_mem != sub_id or hadm_id_mem != hadm_id:
            #         # Only load data when the id changes.
            #         # Saves the previous data first before loading.
            #         subject_id_mem = sub_id
            #         hadm_id_mem = hadm_id
            #         for stay_id_dt, stay_id_path in zip(stay_ids_dt,
            #                                             stay_ids_path):
            #             save_data_dsv_ext(stay_id_path, stay_id_dt.data)
            #         stay_ids_dt, stay_ids_path = [], []
            #         for stay_id in stay_ids:
            #             save_path = get_data_save_path(export_dir, stay_id)
            #             _dt = DataTable()
            #             _dt.data = load_data_dsv(save_path)
            #             stay_ids_dt.append(_dt)
            #             stay_ids_path.append(save_path)

            #     for idx, stay_id_dt in enumerate(stay_ids_dt):
            #         info_save_path = get_info_save_path(
            #             STRUCTURED_EXPORT_DIR, stay_id)
            #         it.data = load_info_dsv(info_save_path)
            #         icu_intime = it.data['value'][13]
            #         icu_outtime = it.data['value'][14]
            #         if icu_intime <= df_row['charttime'] <= icu_outtime:
            #             stay_ids_dt[idx] = append_func(
            #                 stay_id_dt, df_row, df.columns.tolist())

        # Flush whatever is still held in memory after the last row. With an
        # empty dataframe save_flag stays -1 and nothing is written.
        if save_flag == 0:
            # Saves the final data.
            save_data_dsv_ext(save_path, dt.data)

        elif save_flag == 1 or save_flag == 2:
            # Saves the final data.
            for stay_id, stay_id_dt in stay_ids_dt.items():
                save_data_dsv_ext(stay_ids_path[stay_id], stay_id_dt.data)
            stay_ids_dt, stay_ids_path = {}, {}

    return export_func


In [4]:
# # Save labevents_info table from db.
# (_, _, _, query_schema_derived, conn) = connect_db()
# df = get_database_table_as_dataframe(
#     conn, query_schema_derived, 'labevents_info')
# df = df.sort_values('itemid')
# df.to_csv("../../../"+LAB_INFO_PATH, na_rep='', sep='\t', index=False)


# Prepare the required mappings.
The custom dict and list are created by `05a_export_raw_info.ipynb`.

In [5]:
# labitems = pd.read_csv("../../../"+LAB_ITEM_PATH, sep='\t', header=0)
# labitems.fillna('None')

# Item dictionaries: tab-separated files with a header row. Missing cells
# are replaced by the string 'None' right after loading.
d_derived = pd.read_csv(
    "../../../"+DERIVED_ITEM_PATH, sep='\t', header=0).fillna('None')
print('d_derived', d_derived.columns.to_list())

d_items = pd.read_csv(
    "../../../"+CHART_ITEM_PATH, sep='\t', header=0).fillna('None')
print('d_items', d_items.columns.to_list())

d_labinfos = pd.read_csv(
    "../../../"+LAB_INFO_PATH, sep='\t', header=0).fillna('None')
print('d_labinfos', d_labinfos.columns.to_list())

d_labitems = pd.read_csv(
    "../../../"+LAB_ITEM_PATH, sep='\t', header=0).fillna('None')
print('d_labitems', d_labitems.columns.to_list())

# Custom ICU stay selection, stored as JSON: a list and a dict.
with open("../../../" + TMP_CUSTOM_LIST, 'r') as f:
    custom_icustays_list = json.load(f)

with open("../../../" + TMP_CUSTOM_DICT, 'r') as f:
    custom_icustays_dict = json.load(f)


def create_mappings(_id_mapping: dict):
    """Invert a `uid -> name` mapping and look up per-name metadata.

    The uid range selects the lookup table:
    * `uid // 100000 == 1`: unit and category from `d_derived`; no ranges.
    * `uid // 200000 == 1`: unit, low/high normal value and category from
      `d_items`.
    * `uid // 500000 == 1`: category from `d_labitems`; unit and ranges stay
      None because they come from the db table instead.
    * anything else: every metadata value is None.

    Returns:
        Five dicts keyed by name:
        `(id_mapping, unit_mapping, low_mapping, high_mapping, cat_mapping)`.
    """
    names, units, lows, highs, cats = {}, {}, {}, {}, {}

    for uid, name in _id_mapping.items():
        names[name] = uid
        unit = low = high = cat = None

        if uid // 100000 == 1:
            # TODO: add low/high ranges to the derived table?
            unit = d_derived[d_derived['uid'] == uid]['units'].values[0]
            cat = d_derived[d_derived['uid'] == uid]['category'].values[0]

        elif uid // 200000 == 1:
            unit = d_items[d_items['uid'] == uid]['unitname'].values[0]
            low = d_items[d_items['uid'] == uid]['lownormalvalue'].values[0]
            high = d_items[d_items['uid'] == uid]['highnormalvalue'].values[0]
            cat = d_items[d_items['uid'] == uid]['category'].values[0]

        elif uid // 500000 == 1:
            cat = d_labitems[d_labitems['uid'] == uid]['category'].values[0]

        units[name] = unit
        lows[name] = low
        highs[name] = high
        cats[name] = cat

    return names, units, lows, highs, cats


d_derived ['uid', 'label', 'units', 'category', 'notes']
d_items ['uid', 'itemid', 'label', 'abbreviation', 'linksto', 'category', 'unitname', 'param_type', 'lownormalvalue', 'highnormalvalue']
d_labinfos ['itemid', 'valueuom', 'valueuom_count', 'ref_range_lower', 'ref_range_lower_count', 'ref_range_upper', 'ref_range_upper_count']
d_labitems ['uid', 'itemid', 'label', 'fluid', 'category', 'loinc_code']


# Export data

Currently the units are taken from the original tables. A better solution would be to include them in the concepts.


In [5]:
# Running counter; the export cells below append it to
# STRUCTURED_EXPORT_DIR to form each table's output directory.
count = 0

## Height

In [6]:
def append_func(dt, df_row, cols):
    """Append the height of `df_row` (uid 100001, unit 'cm') to `dt`.

    `cols` is accepted for signature compatibility and not used.
    Returns `dt`.
    """
    record = {
        'uid': 100001,
        'value': df_row['height'],
        'unit': 'cm',
        'category': 'General',
        'starttime': df_row['charttime'],
    }
    dt.append(**record)
    return dt


# Export the derived `height` table into its own directory
# (STRUCTURED_EXPORT_DIR suffixed with the running counter).
count += 1
# The empty per-stay files must exist before the export loads them.
create_dummy_files(STRUCTURED_EXPORT_DIR+str(count), custom_icustays_list)
# [subject_id, stay_id, charttime, height]
df_iter, num_entries = get_db_table_as_df_ext('derived', 'height')
func = export_func_factory(append_func, STRUCTURED_EXPORT_DIR+str(count))
parallel_processing_ext(func, df_iter, num_entries, custom_icustays_list)
print("Added height entries.")


100%|██████████| 2694/2694 [00:04<00:00, 663.22it/s]
100%|██████████| 2694/2694 [00:04<00:00, 663.30it/s]
100%|██████████| 2688/2688 [00:04<00:00, 665.86it/s]
100%|██████████| 2694/2694 [00:04<00:00, 660.80it/s]
100%|██████████| 2694/2694 [00:04<00:00, 663.02it/s]
100%|██████████| 2694/2694 [00:04<00:00, 659.59it/s]
100%|██████████| 2694/2694 [00:04<00:00, 657.87it/s]
100%|██████████| 2694/2694 [00:04<00:00, 658.87it/s]


Created dummy .dsv files.
Database: mimiciv
Username: mimiciv
>>>>> Connected to DB <<<<<
Getting height data


1it [00:00,  9.04it/s]


Number of entries for height : 35170
Column names : ['subject_id', 'stay_id', 'charttime', 'height']



100%|██████████| 1/1 [00:14<00:00, 14.44s/it]

Added height entries.





## Weight

In [7]:
def append_func(dt, df_row, col):
    """Append the weight of `df_row` (uid 100002, unit 'kg') to `dt`.

    The category is 'General, ' followed by the row's `weight_type`; the
    row's `starttime` and `endtime` are passed through. `col` is not used.
    Returns `dt`.
    """
    category = 'General, ' + df_row['weight_type']
    record = {
        'uid': 100002,
        'value': df_row['weight'],
        'unit': 'kg',
        'category': category,
        'starttime': df_row['starttime'],
        'endtime': df_row['endtime'],
    }
    dt.append(**record)
    return dt


# Export the derived `weight_durations` table into its own directory
# (STRUCTURED_EXPORT_DIR suffixed with the running counter).
count += 1
# The empty per-stay files must exist before the export loads them.
create_dummy_files(STRUCTURED_EXPORT_DIR+str(count), custom_icustays_list)
# ['stay_id', 'starttime', 'endtime', 'weight', 'weight_type']
df_iter, num_entries = get_db_table_as_df_ext('derived', 'weight_durations')
func = export_func_factory(append_func, STRUCTURED_EXPORT_DIR+str(count))
parallel_processing_ext(func, df_iter, num_entries, custom_icustays_list)
print("Added weight entries.")


100%|██████████| 2694/2694 [00:03<00:00, 769.12it/s]
100%|██████████| 2688/2688 [00:03<00:00, 767.91it/s]
100%|██████████| 2694/2694 [00:03<00:00, 762.93it/s]
100%|██████████| 2694/2694 [00:03<00:00, 763.55it/s]
100%|██████████| 2694/2694 [00:03<00:00, 758.29it/s]
100%|██████████| 2694/2694 [00:03<00:00, 762.92it/s]
100%|██████████| 2694/2694 [00:03<00:00, 755.93it/s]
100%|██████████| 2694/2694 [00:03<00:00, 753.58it/s]


Created dummy .dsv files.
Database: mimiciv
Username: mimiciv
>>>>> Connected to DB <<<<<
Getting weight_durations data


4it [00:01,  3.93it/s]


Number of entries for weight_durations : 287155
Column names : ['stay_id', 'starttime', 'endtime', 'weight', 'weight_type']



100%|██████████| 4/4 [00:27<00:00,  6.89s/it]

Added weight entries.





## Chemistry

In [12]:
# uid -> column name of the derived `chemistry` table.
id_mapping = {
    550862: 'albumin',
    550930: 'globulin',
    550976: 'total_protein',
    550868: 'aniongap',
    550882: 'bicarbonate',
    551006: 'bun',
    550893: 'calcium',
    550902: 'chloride',
    550912: 'creatinine',
    550931: 'glucose',
    550983: 'sodium',
    550971: 'potassium',
}
# create_mappings inverts the dict, so from here on `id_mapping` is keyed by
# column name (name -> uid), as are the four metadata mappings.
(id_mapping,
 unit_mapping,
 low_mapping,
 high_mapping,
 cat_mapping) = create_mappings(id_mapping)


def append_func(dt, df_row, df_cols):
    """Append one chemistry entry to `dt` per mapped column of `df_row`.

    Only columns present in the global `id_mapping` are exported. Unit and
    reference range are read from the row's `<col>_unit`, `<col>_lower` and
    `<col>_upper` fields; the category comes from the global `cat_mapping`.
    Returns `dt`.
    """
    for name in df_cols:
        if name not in id_mapping:
            continue
        record = {
            'uid': id_mapping[name],
            'value': df_row[name],
            'unit': df_row[name + '_unit'],
            'lower_range': df_row[name + '_lower'],
            'upper_range': df_row[name + '_upper'],
            'category': cat_mapping[name],
            'specimen_id': df_row['specimen_id'],
            'starttime': df_row['charttime'],
        }
        dt.append(**record)
    return dt


# Export the derived `chemistry` table into its own directory
# (STRUCTURED_EXPORT_DIR suffixed with the running counter).
count += 1
# The empty per-stay files must exist before the export loads them.
create_dummy_files(STRUCTURED_EXPORT_DIR+str(count), custom_icustays_list)
# ['subject_id', 'hadm_id', 'charttime', 'specimen_id', 'albumin', 'globulin', 'total_protein', 'aniongap', 'bicarbonate', 'bun', 'calcium', 'chloride', 'creatinine', 'glucose', 'sodium', 'potassium']
df_iter, num_entries = get_db_table_as_df_ext('derived', 'chemistry')
func = export_func_factory(append_func, STRUCTURED_EXPORT_DIR+str(count))
parallel_processing_ext(func, df_iter, num_entries, custom_icustays_list)
print("Added chemistry (lab) entries.")


100%|██████████| 2694/2694 [00:03<00:00, 879.77it/s]
100%|██████████| 2688/2688 [00:03<00:00, 879.77it/s]
100%|██████████| 2694/2694 [00:03<00:00, 871.83it/s]
100%|██████████| 2694/2694 [00:03<00:00, 868.25it/s]
100%|██████████| 2694/2694 [00:03<00:00, 870.16it/s]
100%|██████████| 2694/2694 [00:03<00:00, 865.58it/s]
100%|██████████| 2694/2694 [00:03<00:00, 859.84it/s]
100%|██████████| 2694/2694 [00:03<00:00, 857.58it/s]


Created dummy .dsv files.
Database: mimiciv
Username: mimiciv
>>>>> Connected to DB <<<<<
Getting chemistry data


50it [00:52,  1.05s/it]


Number of entries for chemistry : 3956323
Column names : ['subject_id', 'hadm_id', 'charttime', 'specimen_id', 'albumin', 'globulin', 'total_protein', 'aniongap', 'bicarbonate', 'bun', 'calcium', 'chloride', 'creatinine', 'glucose', 'sodium', 'potassium', 'albumin_unit', 'globulin_unit', 'total_protein_unit', 'aniongap_unit', 'bicarbonate_unit', 'bun_unit', 'calcium_unit', 'chloride_unit', 'creatinine_unit', 'glucose_unit', 'sodium_unit', 'potassium_unit', 'albumin_lower', 'globulin_lower', 'total_protein_lower', 'aniongap_lower', 'bicarbonate_lower', 'bun_lower', 'calcium_lower', 'chloride_lower', 'creatinine_lower', 'glucose_lower', 'sodium_lower', 'potassium_lower', 'albumin_upper', 'globulin_upper', 'total_protein_upper', 'aniongap_upper', 'bicarbonate_upper', 'bun_upper', 'calcium_upper', 'chloride_upper', 'creatinine_upper', 'glucose_upper', 'sodium_upper', 'potassium_upper']



100%|██████████| 50/50 [14:09<00:00, 16.98s/it]

Added chemistry (lab) entries.





## Blood Gas

In [14]:
# uid -> column name of the derived `bg` (blood gas) table. The uids span
# three ranges that create_mappings treats differently: 55xxxx (lab items),
# 223835 (chart item) and 1000xx (derived concepts).
id_mapping = {
    552028: 'specimen',
    550801: 'aado2',
    550802: 'baseexcess',
    550803: 'bicarbonate',
    550804: 'totalco2',
    550805: 'carboxyhemoglobin',
    550806: 'chloride',
    550808: 'calcium',
    550809: 'glucose',
    550810: 'hematocrit',
    550811: 'hemoglobin',
    550813: 'lactate',
    550814: 'methemoglobin',
    550816: 'fio2',
    550817: 'so2',
    550818: 'pco2',
    550820: 'ph',
    550821: 'po2',
    550822: 'potassium',
    550824: 'sodium',
    550825: 'temperature',
    223835: 'fio2_chartevents',
    100038: 'pao2fio2ratio',  # nounit
    100039: 'aado2_calc',
    100040: 'specimen_pred',  # nounit
    100041: 'specimen_prob',
}
# create_mappings inverts the dict, so from here on `id_mapping` is keyed by
# column name (name -> uid), as are the four metadata mappings.
(id_mapping,
 unit_mapping,
 low_mapping,
 high_mapping,
 cat_mapping) = create_mappings(id_mapping)


def append_func(dt, df_row, df_cols):
    """Append one blood-gas entry to `dt` per mapped column of `df_row`.

    Only columns present in the global `id_mapping` are exported. Where the
    unit and reference range come from depends on the uid:
    * 1xxxxx and 2xxxxx uids: the global `unit_mapping`, `low_mapping` and
      `high_mapping`, except uid 100039, which takes the row's
      `aado2_unit` / `aado2_lower` / `aado2_upper` fields.
    * all other uids: the row's `<col>_unit`, `<col>_lower` and
      `<col>_upper` fields.

    Returns `dt`.
    """
    for col in df_cols:
        if col not in id_mapping:
            continue

        uid = id_mapping[col]

        # NOTE: the row-based assignments previously ended in a trailing
        # comma, which stored 1-tuples such as ('mm Hg',) instead of the
        # plain values.
        if uid // 100000 in (1, 2):
            unit = unit_mapping[col]
            lower_range = low_mapping[col]
            upper_range = high_mapping[col]

            if uid == 100039:
                unit = df_row['aado2_unit']
                lower_range = df_row['aado2_lower']
                upper_range = df_row['aado2_upper']

        else:
            unit = df_row[col+'_unit']
            lower_range = df_row[col+'_lower']
            upper_range = df_row[col+'_upper']

        dt.append(
            uid=uid,
            value=df_row[col],
            unit=unit,
            lower_range=lower_range,
            upper_range=upper_range,
            category=cat_mapping[col],
            starttime=df_row['charttime'],
        )
    return dt


# NOTE(review): the counter bump and dummy-file creation are commented out
# in this cell. With `count` unchanged, the bg rows are appended to the files
# in the directory of whichever table last incremented `count`. Confirm this
# is intended and not a leftover from re-running the cell.
# count += 1
# create_dummy_files(STRUCTURED_EXPORT_DIR+str(count), custom_icustays_list)
# ['subject_id', 'hadm_id', 'charttime', 'specimen', 'specimen_pred', 'specimen_prob', 'so2', 'po2', 'pco2', 'fio2_chartevents', 'fio2', 'aado2', 'aado2_calc', 'pao2fio2ratio', 'ph', 'baseexcess', 'bicarbonate', 'totalco2', 'hematocrit', 'hemoglobin', 'carboxyhemoglobin', 'methemoglobin', 'chloride', 'calcium', 'temperature', 'potassium', 'sodium', 'lactate', 'glucose']
df_iter, num_entries = get_db_table_as_df_ext('derived', 'bg')
func = export_func_factory(append_func, STRUCTURED_EXPORT_DIR+str(count))
parallel_processing_ext(func, df_iter, num_entries, custom_icustays_list)
print("Added bg (lab) entries.")


Database: mimiciv
Username: mimiciv
>>>>> Connected to DB <<<<<
Getting bg data


8it [00:52,  6.57s/it]


Number of entries for bg : 561212
Column names : ['subject_id', 'hadm_id', 'charttime', 'specimen', 'specimen_unit', 'specimen_lower', 'specimen_upper', 'specimen_pred', 'specimen_prob', 'so2', 'so2_unit', 'so2_lower', 'so2_upper', 'po2', 'po2_unit', 'po2_lower', 'po2_upper', 'pco2', 'pco2_unit', 'pco2_lower', 'pco2_upper', 'fio2_chartevents', 'fio2', 'fio2_unit', 'fio2_lower', 'fio2_upper', 'aado2', 'aado2_unit', 'aado2_lower', 'aado2_upper', 'aado2_calc', 'pao2fio2ratio', 'ph', 'ph_unit', 'ph_lower', 'ph_upper', 'baseexcess', 'baseexcess_unit', 'baseexcess_lower', 'baseexcess_upper', 'bicarbonate', 'bicarbonate_unit', 'bicarbonate_lower', 'bicarbonate_upper', 'totalco2', 'totalco2_unit', 'totalco2_lower', 'totalco2_upper', 'hematocrit', 'hematocrit_unit', 'hematocrit_lower', 'hematocrit_upper', 'hemoglobin', 'hemoglobin_unit', 'hemoglobin_lower', 'hemoglobin_upper', 'carboxyhemoglobin', 'carboxyhemoglobin_unit', 'carboxyhemoglobin_lower', 'carboxyhemoglobin_upper', 'methemoglobin', '

100%|██████████| 8/8 [04:45<00:00, 35.69s/it]

Added bg (lab) entries.





## Blood Differential

In [15]:
# impute absolute count if percentage & WBC is available
# uid -> column name of the derived `blood_differential` table. 55xxxx uids
# are lab items; 1000xx uids are derived concepts.
id_mapping = {
    551146: 'basophils',
    552069: 'basophils_abs',
    551200: 'eosinophils',
    551254: 'monocytes',
    551256: 'neutrophils',
    552075: 'neutrophils_abs',
    551143: 'atypical_lymphocytes',
    551144: 'bands',
    552135: 'immature_granulocytes',
    551251: 'metamyelocytes',
    551257: 'nrbc',

    100003: 'wbc',  # TODO: May need to split due to category.
    100004: 'lymphocytes',
    100005: 'eosinophils_abs',
    100006: 'lymphocytes_abs',
    100007: 'monocytes_abs',

    # 51300: 'wbc',
    # 51301: 'wbc',
    # 51755: 'wbc',
    # [51244, 51245]: lymphocytes
    # [52073, 51199]: eosinophils_abs
    # [51133, 52769]: lymphocytes_abs
    # [52074, 51253]: monocytes_abs
}
# create_mappings inverts the dict, so from here on `id_mapping` is keyed by
# column name (name -> uid), as are the four metadata mappings.
(id_mapping,
 unit_mapping,
 low_mapping,
 high_mapping,
 cat_mapping) = create_mappings(id_mapping)


def append_func(dt, df_row, df_cols):
    """Append one blood-differential entry to `dt` per mapped column.

    Only columns present in the global `id_mapping` are exported. Where the
    unit and reference range come from depends on the uid:
    * 1xxxxx uids: the global `unit_mapping`, `low_mapping` and
      `high_mapping`. For uids 100003-100007 the range is then replaced by
      the row's `<name>_lower` / `<name>_upper` fields.
    * all other uids: the row's `<col>_unit`, `<col>_lower` and
      `<col>_upper` fields.

    Returns `dt`.
    """
    # Derived concepts whose reference range is read from the row: uid ->
    # prefix of the `<prefix>_lower` / `<prefix>_upper` columns.
    row_range_prefix = {
        100003: 'wbc',
        100004: 'lymphocytes',
        100005: 'eosinophils_abs',
        100006: 'lymphocytes_abs',
        100007: 'monocytes_abs',
    }

    for col in df_cols:
        if col not in id_mapping:
            continue

        uid = id_mapping[col]

        # NOTE: the row-based assignments previously ended in a trailing
        # comma, which stored 1-tuples such as (4.0,) instead of the plain
        # values.
        if uid // 100000 == 1:
            unit = unit_mapping[col]
            lower_range = low_mapping[col]
            upper_range = high_mapping[col]

            prefix = row_range_prefix.get(uid)
            if prefix is not None:
                lower_range = df_row[prefix + '_lower']
                upper_range = df_row[prefix + '_upper']

        else:
            unit = df_row[col+'_unit']
            lower_range = df_row[col+'_lower']
            upper_range = df_row[col+'_upper']

        dt.append(
            uid=uid,
            value=df_row[col],
            unit=unit,
            lower_range=lower_range,
            upper_range=upper_range,
            category=cat_mapping[col],
            specimen_id=df_row['specimen_id'],
            starttime=df_row['charttime'],
        )
    return dt


# Export the derived `blood_differential` table into its own directory
# (STRUCTURED_EXPORT_DIR suffixed with the running counter).
count += 1
# The empty per-stay files must exist before the export loads them.
create_dummy_files(STRUCTURED_EXPORT_DIR+str(count), custom_icustays_list)
# ['subject_id', 'hadm_id', 'charttime', 'specimen_id', 'wbc', 'basophils_abs', 'eosinophils_abs', 'lymphocytes_abs', 'monocytes_abs', 'neutrophils_abs', 'basophils', 'eosinophils', 'lymphocytes', 'monocytes', 'neutrophils', 'atypical_lymphocytes', 'bands', 'immature_granulocytes', 'metamyelocytes', 'nrbc']
df_iter, num_entries = get_db_table_as_df_ext('derived', 'blood_differential')
func = export_func_factory(append_func, STRUCTURED_EXPORT_DIR+str(count))
parallel_processing_ext(func, df_iter, num_entries, custom_icustays_list)
print("Added blood_differential (lab) entries.")


100%|██████████| 2694/2694 [00:03<00:00, 836.01it/s]
100%|██████████| 2694/2694 [00:03<00:00, 833.60it/s]
100%|██████████| 2694/2694 [00:03<00:00, 831.48it/s]
100%|██████████| 2694/2694 [00:03<00:00, 833.92it/s]
100%|██████████| 2694/2694 [00:03<00:00, 832.28it/s]
100%|██████████| 2694/2694 [00:03<00:00, 834.78it/s]
100%|██████████| 2688/2688 [00:03<00:00, 833.34it/s]
100%|██████████| 2694/2694 [00:03<00:00, 828.49it/s]


Created dummy .dsv files.
Database: mimiciv
Username: mimiciv
>>>>> Connected to DB <<<<<
Getting blood_differential data


42it [01:08,  1.63s/it]


Number of entries for blood_differential : 3283493
Column names : ['subject_id', 'hadm_id', 'charttime', 'specimen_id', 'wbc', 'wbc_unit', 'wbc_lower', 'wbc_upper', 'basophils_abs', 'basophils_abs_unit', 'basophils_abs_lower', 'basophils_abs_upper', 'eosinophils_abs', 'eosinophils_abs_unit', 'eosinophils_abs_lower', 'eosinophils_abs_upper', 'lymphocytes_abs', 'lymphocytes_abs_unit', 'lymphocytes_abs_lower', 'lymphocytes_abs_upper', 'monocytes_abs', 'monocytes_abs_unit', 'monocytes_abs_lower', 'monocytes_abs_upper', 'neutrophils_abs', 'neutrophils_abs_unit', 'neutrophils_abs_lower', 'neutrophils_abs_upper', 'basophils', 'basophils_unit', 'basophils_lower', 'basophils_upper', 'eosinophils', 'eosinophils_unit', 'eosinophils_lower', 'eosinophils_upper', 'lymphocytes', 'lymphocytes_unit', 'lymphocytes_lower', 'lymphocytes_upper', 'monocytes', 'monocytes_unit', 'monocytes_lower', 'monocytes_upper', 'neutrophils', 'neutrophils_unit', 'neutrophils_lower', 'neutrophils_upper', 'atypical_lymphoc

100%|██████████| 42/42 [11:54<00:00, 17.01s/it]

Added blood_differential (lab) entries.





## Cardiac Marker

In [17]:
# uid -> column name of the derived `cardiac_marker` table.
id_mapping = {
    551002: 'troponin_i',
    551003: 'troponin_t',
    550911: 'ck_mb',
}
# create_mappings inverts the dict, so from here on `id_mapping` is keyed by
# column name (name -> uid), as are the four metadata mappings.
(id_mapping,
 unit_mapping,
 low_mapping,
 high_mapping,
 cat_mapping) = create_mappings(id_mapping)


def append_func(dt, df_row, df_cols):
    """Append one cardiac-marker entry to `dt` per mapped column.

    Only columns present in the global `id_mapping` are exported. Unit and
    reference range are read from the row's `<col>_unit`, `<col>_lower` and
    `<col>_upper` fields; the category comes from the global `cat_mapping`.
    Returns `dt`.
    """
    for marker in df_cols:
        if marker not in id_mapping:
            continue
        entry = {
            'uid': id_mapping[marker],
            'value': df_row[marker],
            'unit': df_row[marker + '_unit'],
            'lower_range': df_row[marker + '_lower'],
            'upper_range': df_row[marker + '_upper'],
            'category': cat_mapping[marker],
            'specimen_id': df_row['specimen_id'],
            'starttime': df_row['charttime'],
        }
        dt.append(**entry)
    return dt


# Export the derived `cardiac_marker` table into its own directory
# (STRUCTURED_EXPORT_DIR suffixed with the running counter).
count += 1
# The empty per-stay files must exist before the export loads them.
create_dummy_files(STRUCTURED_EXPORT_DIR+str(count), custom_icustays_list)
# ['subject_id', 'hadm_id', 'charttime', 'specimen_id', 'troponin_i', 'troponin_t', 'ck_mb']
df_iter, num_entries = get_db_table_as_df_ext('derived', 'cardiac_marker')
func = export_func_factory(append_func, STRUCTURED_EXPORT_DIR+str(count))
parallel_processing_ext(func, df_iter, num_entries, custom_icustays_list)
print("Added cardiac_marker (lab) entries.")

# hadm_list, sub_list = [], []
# for chunk in tqdm(df_iter):
#     num_entries += len(chunk)
#     hadm_list += chunk['hadm_id'].tolist()
#     sub_list += chunk['subject_id'].tolist()

100%|██████████| 2694/2694 [00:02<00:00, 911.99it/s]
100%|██████████| 2694/2694 [00:03<00:00, 897.87it/s]
100%|██████████| 2688/2688 [00:03<00:00, 891.70it/s]
100%|██████████| 2694/2694 [00:03<00:00, 889.30it/s]
100%|██████████| 2694/2694 [00:03<00:00, 880.63it/s]
100%|██████████| 2694/2694 [00:03<00:00, 875.25it/s]
100%|██████████| 2694/2694 [00:03<00:00, 884.70it/s]
100%|██████████| 2694/2694 [00:03<00:00, 863.45it/s]


Created dummy .dsv files.
Database: mimiciv
Username: mimiciv
>>>>> Connected to DB <<<<<
Getting cardiac_marker data


6it [00:02,  2.81it/s]


Number of entries for cardiac_marker : 430049
Column names : ['subject_id', 'hadm_id', 'charttime', 'specimen_id', 'troponin_i', 'troponin_t', 'ck_mb', 'troponin_i_unit', 'troponin_t_unit', 'ck_mb_unit', 'troponin_i_lower', 'troponin_t_lower', 'ck_mb_lower', 'troponin_i_upper', 'troponin_t_upper', 'ck_mb_upper']



100%|██████████| 6/6 [02:03<00:00, 20.63s/it]

Added cardiac_marker (lab) entries.





## Coagulation

In [18]:
# uid -> column name of the derived `coagulation` table.
id_mapping = {
    551196: 'd_dimer',
    551214: 'fibrinogen',
    551297: 'thrombin',
    551237: 'inr',
    551274: 'pt',
    551275: 'ptt',
}
# create_mappings inverts the dict, so from here on `id_mapping` is keyed by
# column name (name -> uid), as are the four metadata mappings.
(id_mapping,
 unit_mapping,
 low_mapping,
 high_mapping,
 cat_mapping) = create_mappings(id_mapping)


def append_func(dt, df_row, df_cols):
    """Add one entry to ``dt`` for every mapped coagulation column of a row.

    Columns in ``df_cols`` that are not keys of the module-level
    ``id_mapping`` are skipped. Unit and reference range are read from the
    row's ``<column>_unit``, ``<column>_lower`` and ``<column>_upper``
    fields. ``dt`` is returned after the appends.
    """
    for name in df_cols:
        if name not in id_mapping:
            continue
        entry = dict(
            uid=id_mapping[name],
            value=df_row[name],
            unit=df_row[name+'_unit'],
            lower_range=df_row[name+'_lower'],
            upper_range=df_row[name+'_upper'],
            category=cat_mapping[name],
            specimen_id=df_row['specimen_id'],
            starttime=df_row['charttime'],
        )
        dt.append(**entry)
    return dt


# Next numbered export folder for this table.
count += 1
create_dummy_files(STRUCTURED_EXPORT_DIR+str(count), custom_icustays_list)
# ['subject_id', 'hadm_id', 'charttime', 'specimen_id', 'd_dimer', 'fibrinogen', 'thrombin', 'inr', 'pt', 'ptt']
# get_db_table_as_df_ext returns a chunk iterator and the number of chunks.
df_iter, num_entries = get_db_table_as_df_ext('derived', 'coagulation')
# Presumably builds a per-chunk worker around append_func -- confirm.
func = export_func_factory(append_func, STRUCTURED_EXPORT_DIR+str(count))
parallel_processing_ext(func, df_iter, num_entries, custom_icustays_list)
print("Added coagulation (lab) entries.")


100%|██████████| 2688/2688 [00:04<00:00, 648.06it/s]
100%|██████████| 2694/2694 [00:04<00:00, 643.84it/s]
100%|██████████| 2694/2694 [00:04<00:00, 641.45it/s]
100%|██████████| 2694/2694 [00:04<00:00, 638.88it/s]
100%|██████████| 2694/2694 [00:04<00:00, 638.27it/s]
100%|██████████| 2694/2694 [00:04<00:00, 636.93it/s]
100%|██████████| 2694/2694 [00:04<00:00, 635.02it/s]
100%|██████████| 2694/2694 [00:04<00:00, 622.59it/s]


Created dummy .dsv files.
Database: mimiciv
Username: mimiciv
>>>>> Connected to DB <<<<<
Getting coagulation data


20it [00:10,  1.82it/s]


Number of entries for coagulation : 1594879
Column names : ['subject_id', 'hadm_id', 'charttime', 'specimen_id', 'd_dimer', 'fibrinogen', 'thrombin', 'inr', 'pt', 'ptt', 'd_dimer_unit', 'fibrinogen_unit', 'thrombin_unit', 'inr_unit', 'pt_unit', 'ptt_unit', 'd_dimer_lower', 'fibrinogen_lower', 'thrombin_lower', 'inr_lower', 'pt_lower', 'ptt_lower', 'd_dimer_upper', 'fibrinogen_upper', 'thrombin_upper', 'inr_upper', 'pt_upper', 'ptt_upper']



100%|██████████| 20/20 [08:52<00:00, 26.60s/it]

Added coagulation (lab) entries.





## Complete blood count

In [19]:
# Item id -> column name of the derived `complete_blood_count` table.
id_mapping = {
    551221: 'hematocrit',
    551222: 'hemoglobin',
    551248: 'mch',
    551249: 'mchc',
    551250: 'mcv',
    551265: 'platelet',
    551279: 'rbc',
    551277: 'rdw',
    552159: 'rdwsd',
    # 551301: 'wbc', # present in blood_differential
}
# create_mappings rebinds id_mapping and returns four companion mappings.
# append_func below indexes them by column name, so they are presumably
# keyed by column name -- confirm against create_mappings.
(id_mapping,
 unit_mapping,
 low_mapping,
 high_mapping,
 cat_mapping) = create_mappings(id_mapping)


def append_func(dt, df_row, df_cols):
    """Append the mapped complete-blood-count values of one row to ``dt``.

    Only columns that are keys of the module-level ``id_mapping`` produce
    an entry. The unit and lower/upper reference range come from the row's
    ``<column>_unit``, ``<column>_lower`` and ``<column>_upper`` fields.
    Returns ``dt``.
    """
    mapped = (column for column in df_cols if column in id_mapping)
    for column in mapped:
        dt.append(
            uid=id_mapping[column],
            value=df_row[column],
            unit=df_row[column+'_unit'],
            lower_range=df_row[column+'_lower'],
            upper_range=df_row[column+'_upper'],
            category=cat_mapping[column],
            specimen_id=df_row['specimen_id'],
            starttime=df_row['charttime'],
        )
    return dt


# Next numbered export folder for this table.
count += 1
create_dummy_files(STRUCTURED_EXPORT_DIR+str(count), custom_icustays_list)
# ['subject_id', 'hadm_id', 'charttime', 'specimen_id', 'hematocrit', 'hemoglobin', 'mch', 'mchc', 'mcv', 'platelet', 'rbc', 'rdw', 'rdwsd', 'wbc']
# get_db_table_as_df_ext returns a chunk iterator and the number of chunks.
df_iter, num_entries = get_db_table_as_df_ext(
    'derived', 'complete_blood_count')
# Presumably builds a per-chunk worker around append_func -- confirm.
func = export_func_factory(append_func, STRUCTURED_EXPORT_DIR+str(count))
parallel_processing_ext(func, df_iter, num_entries, custom_icustays_list)
print("Added complete_blood_count (lab) entries.")


100%|██████████| 2694/2694 [00:03<00:00, 810.41it/s]
100%|██████████| 2694/2694 [00:03<00:00, 790.47it/s]
100%|██████████| 2688/2688 [00:03<00:00, 792.33it/s]
100%|██████████| 2694/2694 [00:03<00:00, 788.04it/s]
100%|██████████| 2694/2694 [00:03<00:00, 788.10it/s]
100%|██████████| 2694/2694 [00:03<00:00, 781.35it/s]
100%|██████████| 2694/2694 [00:03<00:00, 785.62it/s]
100%|██████████| 2694/2694 [00:03<00:00, 778.55it/s]


Created dummy .dsv files.
Database: mimiciv
Username: mimiciv
>>>>> Connected to DB <<<<<
Getting complete_blood_count data


44it [01:25,  1.95s/it]


Number of entries for complete_blood_count : 3492512
Column names : ['subject_id', 'hadm_id', 'charttime', 'specimen_id', 'hematocrit', 'hemoglobin', 'mch', 'mchc', 'mcv', 'platelet', 'rbc', 'rdw', 'rdwsd', 'wbc', 'hematocrit_unit', 'hemoglobin_unit', 'mch_unit', 'mchc_unit', 'mcv_unit', 'platelet_unit', 'rbc_unit', 'rdw_unit', 'rdwsd_unit', 'wbc_unit', 'hematocrit_lower', 'hemoglobin_lower', 'mch_lower', 'mchc_lower', 'mcv_lower', 'platelet_lower', 'rbc_lower', 'rdw_lower', 'rdwsd_lower', 'wbc_lower', 'hematocrit_upper', 'hemoglobin_upper', 'mch_upper', 'mchc_upper', 'mcv_upper', 'platelet_upper', 'rbc_upper', 'rdw_upper', 'rdwsd_upper', 'wbc_upper']



100%|██████████| 44/44 [13:15<00:00, 18.09s/it]

Added complete_blood_count (lab) entries.





## Enzyme

In [20]:
# Item id -> column name of the derived `enzyme` table.
id_mapping = {
    550861: 'alt',
    550863: 'alp',
    550878: 'ast',
    550867: 'amylase',
    550885: 'bilirubin_total',
    550884: 'bilirubin_indirect',
    550883: 'bilirubin_direct',
    550910: 'ck_cpk',
    550911: 'ck_mb',
    550927: 'ggt',
    550954: 'ld_ldh',
}
# create_mappings rebinds id_mapping and returns four companion mappings.
# append_func below indexes them by column name, so they are presumably
# keyed by column name -- confirm against create_mappings.
(id_mapping,
 unit_mapping,
 low_mapping,
 high_mapping,
 cat_mapping) = create_mappings(id_mapping)


def append_func(dt, df_row, df_cols):
    """Add one entry to ``dt`` for every mapped enzyme column of a row.

    Columns in ``df_cols`` that are not keys of the module-level
    ``id_mapping`` are skipped. Unit and reference range are read from the
    row's ``<column>_unit``, ``<column>_lower`` and ``<column>_upper``
    fields. ``dt`` is returned after the appends.
    """
    for name in df_cols:
        if name not in id_mapping:
            continue
        entry = dict(
            uid=id_mapping[name],
            value=df_row[name],
            unit=df_row[name+'_unit'],
            lower_range=df_row[name+'_lower'],
            upper_range=df_row[name+'_upper'],
            category=cat_mapping[name],
            specimen_id=df_row['specimen_id'],
            starttime=df_row['charttime'],
        )
        dt.append(**entry)
    return dt


# Next numbered export folder for this table.
count += 1
create_dummy_files(STRUCTURED_EXPORT_DIR+str(count), custom_icustays_list)
# ['subject_id', 'hadm_id', 'charttime', 'specimen_id', 'alt', 'alp', 'ast', 'amylase', 'bilirubin_total', 'bilirubin_direct', 'bilirubin_indirect', 'ck_cpk', 'ck_mb', 'ggt', 'ld_ldh']
# get_db_table_as_df_ext returns a chunk iterator and the number of chunks.
df_iter, num_entries = get_db_table_as_df_ext('derived', 'enzyme')
# Presumably builds a per-chunk worker around append_func -- confirm.
func = export_func_factory(append_func, STRUCTURED_EXPORT_DIR+str(count))
parallel_processing_ext(func, df_iter, num_entries, custom_icustays_list)
print("Added enzyme (lab) entries.")


100%|██████████| 2694/2694 [00:02<00:00, 1009.40it/s]
100%|██████████| 2694/2694 [00:02<00:00, 993.42it/s]
100%|██████████| 2694/2694 [00:02<00:00, 987.48it/s] 
100%|██████████| 2694/2694 [00:02<00:00, 989.73it/s] 
100%|██████████| 2688/2688 [00:02<00:00, 986.48it/s]
100%|██████████| 2694/2694 [00:02<00:00, 980.74it/s]
100%|██████████| 2694/2694 [00:02<00:00, 970.26it/s]
100%|██████████| 2694/2694 [00:02<00:00, 966.74it/s]


Created dummy .dsv files.
Database: mimiciv
Username: mimiciv
>>>>> Connected to DB <<<<<
Getting enzyme data


23it [00:24,  1.06s/it]


Number of entries for enzyme : 1787236
Column names : ['subject_id', 'hadm_id', 'charttime', 'specimen_id', 'alt', 'alp', 'ast', 'amylase', 'bilirubin_total', 'bilirubin_direct', 'bilirubin_indirect', 'ck_cpk', 'ck_mb', 'ggt', 'ld_ldh', 'alt_unit', 'alp_unit', 'ast_unit', 'amylase_unit', 'bilirubin_total_unit', 'bilirubin_direct_unit', 'bilirubin_indirect_unit', 'ck_cpk_unit', 'ck_mb_unit', 'ggt_unit', 'ld_ldh_unit', 'alt_lower', 'alp_lower', 'ast_lower', 'amylase_lower', 'bilirubin_total_lower', 'bilirubin_direct_lower', 'bilirubin_indirect_lower', 'ck_cpk_lower', 'ck_mb_lower', 'ggt_lower', 'ld_ldh_lower', 'alt_upper', 'alp_upper', 'ast_upper', 'amylase_upper', 'bilirubin_total_upper', 'bilirubin_direct_upper', 'bilirubin_indirect_upper', 'ck_cpk_upper', 'ck_mb_upper', 'ggt_upper', 'ld_ldh_upper']



100%|██████████| 23/23 [04:49<00:00, 12.58s/it]

Added enzyme (lab) entries.





## Inflammation

In [21]:
# Item id -> column name of the derived `inflammation` table.
id_mapping = {
    550889: 'crp',
}
# create_mappings rebinds id_mapping and returns four companion mappings.
# append_func below indexes them by column name, so they are presumably
# keyed by column name -- confirm against create_mappings.
(id_mapping,
 unit_mapping,
 low_mapping,
 high_mapping,
 cat_mapping) = create_mappings(id_mapping)


def append_func(dt, df_row, df_cols):
    """Append the mapped inflammation values of one row to ``dt``.

    Only columns that are keys of the module-level ``id_mapping`` produce
    an entry. The unit and lower/upper reference range come from the row's
    ``<column>_unit``, ``<column>_lower`` and ``<column>_upper`` fields.
    Returns ``dt``.
    """
    mapped = (column for column in df_cols if column in id_mapping)
    for column in mapped:
        dt.append(
            uid=id_mapping[column],
            value=df_row[column],
            unit=df_row[column+'_unit'],
            lower_range=df_row[column+'_lower'],
            upper_range=df_row[column+'_upper'],
            category=cat_mapping[column],
            specimen_id=df_row['specimen_id'],
            starttime=df_row['charttime'],
        )
    return dt


# Next numbered export folder for this table.
count += 1
create_dummy_files(STRUCTURED_EXPORT_DIR+str(count), custom_icustays_list)
# ['subject_id', 'hadm_id', 'charttime', 'specimen_id', 'crp']
# get_db_table_as_df_ext returns a chunk iterator and the number of chunks.
df_iter, num_entries = get_db_table_as_df_ext('derived', 'inflammation')
# Presumably builds a per-chunk worker around append_func -- confirm.
func = export_func_factory(append_func, STRUCTURED_EXPORT_DIR+str(count))
parallel_processing_ext(func, df_iter, num_entries, custom_icustays_list)
print("Added inflammation (lab) entries.")


100%|██████████| 2694/2694 [00:02<00:00, 1012.64it/s]
100%|██████████| 2694/2694 [00:02<00:00, 988.76it/s]
100%|██████████| 2694/2694 [00:02<00:00, 987.17it/s] 
100%|██████████| 2694/2694 [00:02<00:00, 983.03it/s]
100%|██████████| 2694/2694 [00:02<00:00, 986.62it/s]
100%|██████████| 2688/2688 [00:02<00:00, 981.79it/s]
100%|██████████| 2694/2694 [00:02<00:00, 973.35it/s] 
100%|██████████| 2694/2694 [00:02<00:00, 966.15it/s]


Created dummy .dsv files.
Database: mimiciv
Username: mimiciv
>>>>> Connected to DB <<<<<
Getting inflammation data


2it [00:00,  6.99it/s]


Number of entries for inflammation : 118290
Column names : ['subject_id', 'hadm_id', 'charttime', 'specimen_id', 'crp', 'crp_unit', 'crp_lower', 'crp_upper']



100%|██████████| 2/2 [00:13<00:00,  6.72s/it]

Added inflammation (lab) entries.





## O2 delivery

In [22]:
# Item id -> column name of the derived `oxygen_delivery` table.
id_mapping = {
    227287: 'o2_flow_additional',

    100012: 'o2_flow',
    100008: 'o2_delivery_device_1',
    100009: 'o2_delivery_device_2',
    100010: 'o2_delivery_device_3',
    100011: 'o2_delivery_device_4',
}
# create_mappings rebinds id_mapping and returns four companion mappings.
# append_func below indexes them by column name, so they are presumably
# keyed by column name -- confirm against create_mappings.
(id_mapping,
 unit_mapping,
 low_mapping,
 high_mapping,
 cat_mapping) = create_mappings(id_mapping)


def append_func(dt, df_row, df_cols):
    """Add one entry to ``dt`` for every mapped oxygen-delivery column.

    Columns in ``df_cols`` that are not keys of the module-level
    ``id_mapping`` are skipped. Unit, reference range and category are
    looked up per column in the module-level mappings; the value and the
    chart time come from ``df_row``. Returns ``dt``.
    """
    for name in df_cols:
        if name not in id_mapping:
            continue
        entry = dict(
            uid=id_mapping[name],
            value=df_row[name],
            unit=unit_mapping[name],
            lower_range=low_mapping[name],
            upper_range=high_mapping[name],
            category=cat_mapping[name],
            starttime=df_row['charttime'],
        )
        dt.append(**entry)
    return dt


# Next numbered export folder for this table.
count += 1
create_dummy_files(STRUCTURED_EXPORT_DIR+str(count), custom_icustays_list)
# ['subject_id', 'stay_id', 'charttime', 'o2_flow', 'o2_flow_additional', 'o2_delivery_device_1', 'o2_delivery_device_2', 'o2_delivery_device_3', 'o2_delivery_device_4']
# get_db_table_as_df_ext returns a chunk iterator and the number of chunks.
df_iter, num_entries = get_db_table_as_df_ext('derived', 'oxygen_delivery')
# Presumably builds a per-chunk worker around append_func -- confirm.
func = export_func_factory(append_func, STRUCTURED_EXPORT_DIR+str(count))
parallel_processing_ext(func, df_iter, num_entries, custom_icustays_list)
print("Added oxygen_delivery (chart) entries.")


100%|██████████| 2694/2694 [00:02<00:00, 937.72it/s]
100%|██████████| 2694/2694 [00:02<00:00, 931.84it/s]
100%|██████████| 2694/2694 [00:02<00:00, 932.92it/s]
100%|██████████| 2694/2694 [00:02<00:00, 930.77it/s]
100%|██████████| 2694/2694 [00:02<00:00, 926.89it/s]
100%|██████████| 2688/2688 [00:02<00:00, 921.38it/s]
100%|██████████| 2694/2694 [00:02<00:00, 918.96it/s]
100%|██████████| 2694/2694 [00:02<00:00, 908.19it/s]


Created dummy .dsv files.
Database: mimiciv
Username: mimiciv
>>>>> Connected to DB <<<<<
Getting oxygen_delivery data


11it [00:01,  7.55it/s]


Number of entries for oxygen_delivery : 829534
Column names : ['subject_id', 'stay_id', 'charttime', 'o2_flow', 'o2_flow_additional', 'o2_delivery_device_1', 'o2_delivery_device_2', 'o2_delivery_device_3', 'o2_delivery_device_4']



100%|██████████| 11/11 [00:58<00:00,  5.28s/it]

Added oxygen_delivery (chart) entries.





## Rhythm

In [23]:
# Item id -> column name of the derived `rhythm` table.
id_mapping = {
    220048: 'heart_rhythm',
    224650: 'ectopy_type',
    224651: 'ectopy_frequency',
    226479: 'ectopy_type_secondary',
    226480: 'ectopy_frequency_secondary',
}
# create_mappings rebinds id_mapping and returns four companion mappings.
# append_func below indexes them by column name, so they are presumably
# keyed by column name -- confirm against create_mappings.
(id_mapping,
 unit_mapping,
 low_mapping,
 high_mapping,
 cat_mapping) = create_mappings(id_mapping)


def append_func(dt, df_row, df_cols):
    """Append the mapped rhythm observations of one row to ``dt``.

    Only columns that are keys of the module-level ``id_mapping`` produce
    an entry. Unit, reference range and category are taken from the
    module-level mappings, the value and chart time from ``df_row``.
    Returns ``dt``.
    """
    mapped = (column for column in df_cols if column in id_mapping)
    for column in mapped:
        dt.append(
            uid=id_mapping[column],
            value=df_row[column],
            unit=unit_mapping[column],
            lower_range=low_mapping[column],
            upper_range=high_mapping[column],
            category=cat_mapping[column],
            starttime=df_row['charttime'],
        )
    return dt


# Next numbered export folder for this table.
count += 1
create_dummy_files(STRUCTURED_EXPORT_DIR+str(count), custom_icustays_list)
# ['subject_id', 'charttime', 'heart_rhythm', 'ectopy_type', 'ectopy_frequency', 'ectopy_type_secondary', 'ectopy_frequency_secondary']
# NOTE(review): this table has no stay_id/hadm_id column (see the list
# above); confirm how parallel_processing_ext assigns rows to ICU stays.
# get_db_table_as_df_ext returns a chunk iterator and the number of chunks.
df_iter, num_entries = get_db_table_as_df_ext('derived', 'rhythm')
# Presumably builds a per-chunk worker around append_func -- confirm.
func = export_func_factory(append_func, STRUCTURED_EXPORT_DIR+str(count))
parallel_processing_ext(func, df_iter, num_entries, custom_icustays_list)
print("Added rhythm (chart) entries.")


100%|██████████| 2688/2688 [00:02<00:00, 1003.54it/s]
100%|██████████| 2694/2694 [00:02<00:00, 970.44it/s]
100%|██████████| 2694/2694 [00:02<00:00, 976.15it/s]
100%|██████████| 2694/2694 [00:02<00:00, 970.84it/s]
100%|██████████| 2694/2694 [00:02<00:00, 975.84it/s] 
100%|██████████| 2694/2694 [00:02<00:00, 966.39it/s] 
100%|██████████| 2694/2694 [00:02<00:00, 968.59it/s]
100%|██████████| 2694/2694 [00:02<00:00, 964.01it/s]


Created dummy .dsv files.
Database: mimiciv
Username: mimiciv
>>>>> Connected to DB <<<<<
Getting rhythm data


78it [00:08,  9.21it/s]


Number of entries for rhythm : 6184785
Column names : ['subject_id', 'charttime', 'heart_rhythm', 'ectopy_type', 'ectopy_frequency', 'ectopy_type_secondary', 'ectopy_frequency_secondary']



100%|██████████| 78/78 [28:19<00:00, 21.79s/it]

Added rhythm (chart) entries.





## Urine Output

In [24]:
def append_func(dt, df_row, cols):
    """Append the urine output of one row to ``dt`` under uid 100013.

    The entry carries the row's ``urineoutput`` as value (unit ``mL``,
    category ``Output``) and its ``charttime`` as start time. ``cols`` is
    accepted for signature compatibility and is not used. Returns ``dt``.
    """
    entry = {
        'uid': 100013,
        'value': df_row['urineoutput'],
        'unit': 'mL',
        'category': 'Output',
        'starttime': df_row['charttime'],
    }
    dt.append(**entry)
    return dt


# Next numbered export folder for this table.
count += 1
create_dummy_files(STRUCTURED_EXPORT_DIR+str(count), custom_icustays_list)
# ['stay_id', 'charttime', 'urineoutput']
# get_db_table_as_df_ext returns a chunk iterator and the number of chunks.
df_iter, num_entries = get_db_table_as_df_ext('derived', 'urine_output')
# Presumably builds a per-chunk worker around append_func -- confirm.
func = export_func_factory(append_func, STRUCTURED_EXPORT_DIR+str(count))
parallel_processing_ext(func, df_iter, num_entries, custom_icustays_list)
print("Added urine_output (chart) entries.")


100%|██████████| 2694/2694 [00:02<00:00, 1000.78it/s]
100%|██████████| 2688/2688 [00:02<00:00, 986.44it/s] 
100%|██████████| 2694/2694 [00:02<00:00, 985.12it/s]
100%|██████████| 2694/2694 [00:02<00:00, 984.18it/s]
100%|██████████| 2694/2694 [00:02<00:00, 970.00it/s]
100%|██████████| 2694/2694 [00:02<00:00, 978.69it/s]
100%|██████████| 2694/2694 [00:02<00:00, 963.08it/s]
100%|██████████| 2694/2694 [00:02<00:00, 959.63it/s]


Created dummy .dsv files.
Database: mimiciv
Username: mimiciv
>>>>> Connected to DB <<<<<
Getting urine_output data


44it [00:04, 10.89it/s]


Number of entries for urine_output : 3497267
Column names : ['stay_id', 'charttime', 'urineoutput']



100%|██████████| 44/44 [06:10<00:00,  8.43s/it]

Added urine_output (chart) entries.





## Urine Output Rate

In [25]:
# Attempt to calculate urine output per hour.
# Rate/hour is the interpretable measure of kidney function, though it is
# difficult to estimate from aperiodic point measures.
# NOTE(review): the original note "first we get the earliest heart rate
# documented for the stay" does not describe anything in this cell; it
# presumably refers to how the derived table is built upstream -- confirm.
# Item id -> column name of the derived `urine_output_rate` table.
id_mapping = {
    # 100013: 'uo', present in previous table.
    100014: 'urineoutput_6hr',  # output within 6hr (floor)
    100015: 'urineoutput_12hr',
    100016: 'urineoutput_24hr',
    100017: 'uo_mlkghr_6hr',  # (urineoutput_6hr/weight/uo_tm_6hr)
    100018: 'uo_mlkghr_12hr',
    100019: 'uo_mlkghr_24hr',
    100020: 'uo_tm_6hr',  # time from last uo measurement within 6hr (floor)
    100021: 'uo_tm_12hr',
    100022: 'uo_tm_24hr',
}
# create_mappings rebinds id_mapping and returns four companion mappings.
# append_func below indexes them by column name, so they are presumably
# keyed by column name -- confirm against create_mappings.
(id_mapping,
 unit_mapping,
 low_mapping,
 high_mapping,
 cat_mapping) = create_mappings(id_mapping)


def append_func(dt, df_row, df_cols):
    """Add one entry to ``dt`` for every mapped urine-output-rate column.

    Columns in ``df_cols`` that are not keys of the module-level
    ``id_mapping`` are skipped. Unit, reference range and category are
    looked up per column in the module-level mappings; the value and the
    chart time come from ``df_row``. Returns ``dt``.
    """
    for name in df_cols:
        if name not in id_mapping:
            continue
        entry = dict(
            uid=id_mapping[name],
            value=df_row[name],
            unit=unit_mapping[name],
            lower_range=low_mapping[name],
            upper_range=high_mapping[name],
            category=cat_mapping[name],
            starttime=df_row['charttime'],
        )
        dt.append(**entry)
    return dt


# Next numbered export folder for this table.
count += 1
create_dummy_files(STRUCTURED_EXPORT_DIR+str(count), custom_icustays_list)
# ['stay_id', 'charttime', 'weight', 'uo', 'urineoutput_6hr', 'urineoutput_12hr', 'urineoutput_24hr', 'uo_mlkghr_6hr', 'uo_mlkghr_12hr', 'uo_mlkghr_24hr', 'uo_tm_6hr', 'uo_tm_12hr', 'uo_tm_24hr']
# Uses a per-process chunk of 30000 rows instead of the default 10000.
df_iter, num_entries = get_db_table_as_df_ext(
    'derived', 'urine_output_rate', _chunk=30000)
# Presumably builds a per-chunk worker around append_func -- confirm.
func = export_func_factory(append_func, STRUCTURED_EXPORT_DIR+str(count))
parallel_processing_ext(func, df_iter, num_entries, custom_icustays_list)
print("Added urine_output_rate (derived) entries.")


100%|██████████| 2694/2694 [00:02<00:00, 987.26it/s]
100%|██████████| 2694/2694 [00:02<00:00, 966.47it/s]
100%|██████████| 2694/2694 [00:02<00:00, 966.99it/s]
100%|██████████| 2694/2694 [00:02<00:00, 961.49it/s]
100%|██████████| 2694/2694 [00:02<00:00, 960.83it/s]
100%|██████████| 2694/2694 [00:02<00:00, 965.67it/s]
100%|██████████| 2688/2688 [00:02<00:00, 950.25it/s]
100%|██████████| 2694/2694 [00:02<00:00, 941.93it/s]


Created dummy .dsv files.
Database: mimiciv
Username: mimiciv
>>>>> Connected to DB <<<<<
Getting urine_output_rate data


15it [00:22,  1.48s/it]


Number of entries for urine_output_rate : 3497266
Column names : ['stay_id', 'charttime', 'weight', 'uo', 'urineoutput_6hr', 'urineoutput_12hr', 'urineoutput_24hr', 'uo_mlkghr_6hr', 'uo_mlkghr_12hr', 'uo_mlkghr_24hr', 'uo_tm_6hr', 'uo_tm_12hr', 'uo_tm_24hr']



100%|██████████| 15/15 [04:32<00:00, 18.20s/it]

Added urine_output_rate (derived) entries.





## Vent settings

In [26]:
# Item id -> column name of the derived `ventilator_setting` table.
id_mapping = {
    224688: 'respiratory_rate_set',
    224690: 'respiratory_rate_total',
    224689: 'respiratory_rate_spontaneous',
    224687: 'minute_volume',
    224684: 'tidal_volume_set',
    224685: 'tidal_volume_observed',
    224686: 'tidal_volume_spontaneous',
    224696: 'plateau_pressure',
    100023: 'peep',
    # 223835: 'fio2',  # same as fio2_chartevents
    223849: 'ventilator_mode',
    229314: 'ventilator_mode_hamilton',
    223848: 'ventilator_type',
}
# create_mappings rebinds id_mapping and returns four companion mappings.
# append_func below indexes them by column name, so they are presumably
# keyed by column name -- confirm against create_mappings.
(id_mapping,
 unit_mapping,
 low_mapping,
 high_mapping,
 cat_mapping) = create_mappings(id_mapping)


def append_func(dt, df_row, df_cols):
    """Append the mapped ventilator settings of one row to ``dt``.

    Only columns that are keys of the module-level ``id_mapping`` produce
    an entry. Unit, reference range and category are taken from the
    module-level mappings, the value and chart time from ``df_row``.
    Returns ``dt``.
    """
    mapped = (column for column in df_cols if column in id_mapping)
    for column in mapped:
        dt.append(
            uid=id_mapping[column],
            value=df_row[column],
            unit=unit_mapping[column],
            lower_range=low_mapping[column],
            upper_range=high_mapping[column],
            category=cat_mapping[column],
            starttime=df_row['charttime'],
        )
    return dt


# Next numbered export folder for this table.
count += 1
create_dummy_files(STRUCTURED_EXPORT_DIR+str(count), custom_icustays_list)
# ['subject_id', 'stay_id', 'charttime', 'respiratory_rate_set', 'respiratory_rate_total', 'respiratory_rate_spontaneous', 'minute_volume', 'tidal_volume_set', 'tidal_volume_observed', 'tidal_volume_spontaneous', 'plateau_pressure', 'peep', 'fio2', 'ventilator_mode', 'ventilator_mode_hamilton', 'ventilator_type']
# get_db_table_as_df_ext returns a chunk iterator and the number of chunks.
df_iter, num_entries = get_db_table_as_df_ext('derived', 'ventilator_setting')
# Presumably builds a per-chunk worker around append_func -- confirm.
func = export_func_factory(append_func, STRUCTURED_EXPORT_DIR+str(count))
parallel_processing_ext(func, df_iter, num_entries, custom_icustays_list)
print("Added ventilator_setting (chart) entries.")


100%|██████████| 2694/2694 [00:02<00:00, 1022.10it/s]
100%|██████████| 2688/2688 [00:02<00:00, 1013.40it/s]
100%|██████████| 2694/2694 [00:02<00:00, 1008.24it/s]
100%|██████████| 2694/2694 [00:02<00:00, 1005.54it/s]
100%|██████████| 2694/2694 [00:02<00:00, 994.89it/s]
100%|██████████| 2694/2694 [00:02<00:00, 998.71it/s]
100%|██████████| 2694/2694 [00:02<00:00, 987.40it/s]
100%|██████████| 2694/2694 [00:02<00:00, 987.34it/s]


Created dummy .dsv files.
Database: mimiciv
Username: mimiciv
>>>>> Connected to DB <<<<<
Getting ventilator_setting data


14it [00:03,  4.19it/s]


Number of entries for ventilator_setting : 1067028
Column names : ['subject_id', 'stay_id', 'charttime', 'respiratory_rate_set', 'respiratory_rate_total', 'respiratory_rate_spontaneous', 'minute_volume', 'tidal_volume_set', 'tidal_volume_observed', 'tidal_volume_spontaneous', 'plateau_pressure', 'peep', 'fio2', 'ventilator_mode', 'ventilator_mode_hamilton', 'ventilator_type']



100%|██████████| 14/14 [01:47<00:00,  7.71s/it]

Added ventilator_setting (chart) entries.





## Vital Signs

In [27]:
# Item id -> column name of the derived `vitalsign` table.
# NOTE(review): the table's column is named 'glucose' (see the column list
# below), while this mapping uses 'glucose_chartevents'. If create_mappings
# keys its result by these names, func's `col in id_mapping` check will
# never match the glucose column -- confirm.
id_mapping = {
    220045: 'heart_rate',
    100024: 'sbp',
    100025: 'dbp',
    100026: 'mbp',
    220179: 'sbp_ni',
    220180: 'dbp_ni',
    220181: 'mbp_ni',
    100027: 'resp_rate',
    100028: 'temperature',
    224642: 'temperature_site',
    220277: 'spo2',
    100029: 'glucose_chartevents',
}
# create_mappings rebinds id_mapping and returns four companion mappings.
(id_mapping,
 unit_mapping,
 low_mapping,
 high_mapping,
 cat_mapping) = create_mappings(id_mapping)


# Next numbered export folder for this table.
count += 1
create_dummy_files(STRUCTURED_EXPORT_DIR+str(count), custom_icustays_list)

# ['subject_id', 'stay_id', 'charttime', 'heart_rate', 'sbp', 'dbp', 'mbp', 'sbp_ni', 'dbp_ni', 'mbp_ni', 'resp_rate', 'temperature', 'temperature_site', 'spo2', 'glucose']
# Uses a per-process chunk of 100000 rows instead of the default 10000.
df_iter, num_entries = get_db_table_as_df_ext(
    'derived', 'vitalsign', _chunk=100000)


def func(dfs, pid):
    """Append the vital signs of one dataframe partition to per-stay files.

    Rows are processed in order. Whenever the ``stay_id`` changes, the
    entries collected for the previous stay are written back to its data
    file and the next stay's file is loaded. The caller is therefore
    expected to pass rows sorted so that each stay is contiguous.

    Args:
        dfs: Sequence whose first element is the dataframe partition to
            process; it must contain ``stay_id`` and ``charttime`` columns.
        pid: Worker index supplied by the parallel runner; not used here.

    Returns:
        None. Results are written to the export folder
        ``STRUCTURED_EXPORT_DIR + str(count)``.
    """
    df = dfs[0]
    if df.empty:
        # An empty partition has no stay to load or save; without this
        # guard the final save would be called with save_path=None.
        return

    # Loop invariants: export folder and the columns that have a mapping.
    export_dir = STRUCTURED_EXPORT_DIR+str(count)
    mapped_cols = [col for col in df.columns.tolist() if col in id_mapping]

    dt = DataTable()
    stay_id_mem = None
    save_path = None
    for _, df_row in df.iterrows():
        stay_id = df_row['stay_id']

        if save_path is None or stay_id != stay_id_mem:
            if save_path is not None:
                # Flush the previous stay before switching files.
                save_data_dsv_ext(save_path, dt.data)
            stay_id_mem = stay_id
            save_path = get_data_save_path(export_dir, stay_id)
            dt.data = load_data_dsv(save_path)

        for col in mapped_cols:
            dt.append(
                uid=id_mapping[col],
                value=df_row[col],
                unit=unit_mapping[col],
                lower_range=low_mapping[col],
                upper_range=high_mapping[col],
                category=cat_mapping[col],
                starttime=df_row['charttime'],
            )

    # Flush the last stay of the partition.
    save_data_dsv_ext(save_path, dt.data)

# parallel_processing_ext(func, df_iter, num_entries, custom_icustays_list)


# Key columns used to order each chunk; only those present in the chunk
# are applied, in this priority.
sort_list = [
    'subject_id', 'hadm_id', 'stay_id',
    'charttime', 'starttime', 'endtime',
]
for df in tqdm(df_iter, total=num_entries):

    # Restrict the chunk to the selected ICU stays when it can be filtered.
    if 'stay_id' in df.columns:
        df = df[df['stay_id'].isin(custom_icustays_list)]

    # Sort so that rows of one stay are contiguous, then fan the chunk out
    # to the worker processes.
    df = df.sort_values(by=[key for key in sort_list if key in df.columns])
    dfs = split_df(df, MP_NUM_PROCESSES)
    parallel_processing(func, MP_NUM_PROCESSES, dfs)


print("Added vitalsign (chart) entries.")


100%|██████████| 2694/2694 [00:02<00:00, 1052.50it/s]
100%|██████████| 2694/2694 [00:02<00:00, 1045.58it/s]
100%|██████████| 2694/2694 [00:02<00:00, 1034.73it/s]
100%|██████████| 2688/2688 [00:02<00:00, 1041.94it/s]
100%|██████████| 2694/2694 [00:02<00:00, 1033.84it/s]
100%|██████████| 2694/2694 [00:02<00:00, 1036.97it/s]
100%|██████████| 2694/2694 [00:02<00:00, 1037.08it/s]
100%|██████████| 2694/2694 [00:02<00:00, 1026.84it/s]


Created dummy .dsv files.
Database: mimiciv
Username: mimiciv
>>>>> Connected to DB <<<<<
Getting vitalsign data


13it [00:44,  3.42s/it]


Number of entries for vitalsign : 10249430
Column names : ['subject_id', 'stay_id', 'charttime', 'heart_rate', 'sbp', 'dbp', 'mbp', 'sbp_ni', 'dbp_ni', 'mbp_ni', 'resp_rate', 'temperature', 'temperature_site', 'spo2', 'glucose']



100%|██████████| 13/13 [57:27<00:00, 265.20s/it]

Added vitalsign (chart) entries.





## Antibiotics

In [None]:
# (query_schema_core,
#  query_schema_hosp,
#  query_schema_icu,
#  query_schema_derived,
#  conn) = connect_db()

# # ['subject_id', 'hadm_id', 'stay_id', 'antibiotic', 'route', 'starttime', 'stoptime']
# df = get_database_table_as_dataframe(conn, query_schema_derived, 'antibiotic')
# df = df[df.stay_id.isin(custom_icustays_list)]


# def func(dfs, pid):

#
#

#     df = dfs[0]
#     it = InfoTable()
#     dt = DataTable()
#     for df_i in df.iterrows():
#         df_row = df_i[1]
#         dt.data = load_data_dsv(STRUCTURED_EXPORT_DIR, df_row['stay_id'])
#         dt.append(
#             uid=100030,
#             value=df_row['antibiotic'],
#             category=df_row['route'],
#             starttime=df_row['starttime'],
#             endtime=df_row['stoptime'],
#         )
#         save_data_dsv(STRUCTURED_EXPORT_DIR,
#                       df_row['stay_id'], pd.DataFrame(dt.data))


# dfs = split_df(df, MP_NUM_PROCESSES)
# parallel_processing(func, MP_NUM_PROCESSES, dfs)

# print("Added antibiotic (hosp.prescriptions) entries.")


## Medications

In [28]:
# Item ids used below to filter icu.inputevents rows (column `itemid`).
med_ids = [
    220995,  # Sodium Bicarbonate 8.4%
    221794,  # Furosemide (Lasix) **
    228340,  # Furosemide (Lasix) 250/50 **
    # 100037,  # Furosemide (Lasix)
    221986,  # Milrinone
    229068,  # Protamine sulfate
    229639,  # Bumetanide (Bumex)

    221653,  # Dobutamine
    221662,  # Dopamine
    221289,  # Epinephrine
    229617,  # Epinephrine. ~145 entries only
    # 100036,  # Epinephrine
    221906,  # Norepinephrine
    221749,  # Phenylephrine
    222315,  # Vasopressin
]
# Collapses the two furosemide item ids onto uid 100037 and the two
# epinephrine item ids onto uid 100036; other item ids keep their own id
# (see append_func below).
id_mapping = {
    221794: 100037,
    228340: 100037,
    221289: 100036,
    229617: 100036,
}


def append_func(dt, df_row, df_cols):
    """Append one medication input event to ``dt`` and return ``dt``.

    The row's ``itemid`` is replaced by its entry in the module-level
    ``id_mapping`` when one exists; otherwise the item id itself is used
    as uid. Amount, rate, their units and the start/end times are copied
    from ``df_row``. ``df_cols`` is not used.
    """
    item_id = df_row['itemid']
    entry = dict(
        uid=id_mapping.get(item_id, item_id),
        value=df_row['amount'],
        unit=df_row['amountuom'],
        rate=df_row['rate'],
        rate_unit=df_row['rateuom'],
        category='Medication',
        starttime=df_row['starttime'],
        endtime=df_row['endtime'],
    )
    dt.append(**entry)
    return dt


# Next numbered export folder for this table.
count += 1
create_dummy_files(STRUCTURED_EXPORT_DIR+str(count), custom_icustays_list)

# Query the icu schema directly, because only rows whose itemid is in
# med_ids are wanted.
(query_schema_core,
 query_schema_hosp,
 query_schema_icu,
 query_schema_derived,
 conn) = connect_db()

df_iter, num_entries = get_database_table_as_dataframe(
    conn, query_schema_icu, 'inputevents',
    _filter_col='itemid',
    _filter_col_val=tuple(med_ids),
    _chunk_size=10000*MP_NUM_PROCESSES)
# Convert the row count into the number of chunks the iterator will yield.
num_entries = math.ceil(num_entries / (10000*MP_NUM_PROCESSES))

# Presumably builds a per-chunk worker around append_func -- confirm.
func = export_func_factory(append_func, STRUCTURED_EXPORT_DIR+str(count))
parallel_processing_ext(func, df_iter, num_entries, custom_icustays_list)
print("Added (medication) entries.")


100%|██████████| 2694/2694 [00:02<00:00, 1079.15it/s]
100%|██████████| 2694/2694 [00:02<00:00, 1074.99it/s]
100%|██████████| 2694/2694 [00:02<00:00, 1064.84it/s]
100%|██████████| 2694/2694 [00:02<00:00, 1062.09it/s]
100%|██████████| 2694/2694 [00:02<00:00, 1065.17it/s]
100%|██████████| 2694/2694 [00:02<00:00, 1058.13it/s]
100%|██████████| 2688/2688 [00:02<00:00, 1051.37it/s]
100%|██████████| 2694/2694 [00:02<00:00, 1049.97it/s]


Created dummy .dsv files.
Database: mimiciv
Username: mimiciv
>>>>> Connected to DB <<<<<
Getting inputevents data


10it [00:09,  1.10it/s]


Number of entries for inputevents : 782373
Column names : ['subject_id', 'hadm_id', 'stay_id', 'starttime', 'endtime', 'storetime', 'itemid', 'amount', 'amountuom', 'rate', 'rateuom', 'orderid', 'linkorderid', 'ordercategoryname', 'secondaryordercategoryname', 'ordercomponenttypedescription', 'ordercategorydescription', 'patientweight', 'totalamount', 'totalamountuom', 'isopenbag', 'continueinnextdept', 'cancelreason', 'statusdescription', 'originalamount', 'originalrate']



100%|██████████| 10/10 [00:36<00:00,  3.61s/it]

Added (medication) entries.





## KDIGO

In [29]:
# Item id -> column name of the derived `kdigo_stages` table.
id_mapping = {
    100031: 'creat_low_past_48hr',
    100032: 'creat_low_past_7day',
    100033: 'aki_stage_creat',
    100034: 'aki_stage_uo',
    100035: 'aki_stage',
}
# create_mappings rebinds id_mapping and returns four companion mappings.
# append_func below indexes them by column name, so they are presumably
# keyed by column name -- confirm against create_mappings.
(id_mapping,
 unit_mapping,
 low_mapping,
 high_mapping,
 cat_mapping) = create_mappings(id_mapping)


def append_func(dt, df_row, df_cols):
    """Add one entry to ``dt`` for every mapped KDIGO column of a row.

    Columns in ``df_cols`` that are not keys of the module-level
    ``id_mapping`` are skipped. Unit, reference range and category are
    looked up per column in the module-level mappings; the value and the
    chart time come from ``df_row``. Returns ``dt``.
    """
    for name in df_cols:
        if name not in id_mapping:
            continue
        entry = dict(
            uid=id_mapping[name],
            value=df_row[name],
            unit=unit_mapping[name],
            lower_range=low_mapping[name],
            upper_range=high_mapping[name],
            category=cat_mapping[name],
            starttime=df_row['charttime'],
        )
        dt.append(**entry)
    return dt


# Next numbered export folder for this table.
count += 1
create_dummy_files(STRUCTURED_EXPORT_DIR+str(count), custom_icustays_list)
# ['subject_id', 'hadm_id', 'stay_id', 'charttime', 'creat_low_past_7day', 'creat_low_past_48hr', 'creat', 'aki_stage_creat', 'uo_rt_6hr', 'uo_rt_12hr', 'uo_rt_24hr', 'aki_stage_uo', 'aki_stage']
# get_db_table_as_df_ext returns a chunk iterator and the number of chunks.
df_iter, num_entries = get_db_table_as_df_ext('derived', 'kdigo_stages')
# Presumably builds a per-chunk worker around append_func -- confirm.
func = export_func_factory(append_func, STRUCTURED_EXPORT_DIR+str(count))
parallel_processing_ext(func, df_iter, num_entries, custom_icustays_list)
print("Added kdigo_stages (derived) entries.")


100%|██████████| 2694/2694 [00:03<00:00, 680.95it/s]
100%|██████████| 2694/2694 [00:03<00:00, 679.40it/s]
100%|██████████| 2694/2694 [00:03<00:00, 675.48it/s]
100%|██████████| 2694/2694 [00:03<00:00, 678.52it/s]
100%|██████████| 2688/2688 [00:03<00:00, 676.85it/s]
100%|██████████| 2694/2694 [00:03<00:00, 675.83it/s]
100%|██████████| 2694/2694 [00:03<00:00, 677.34it/s]
100%|██████████| 2694/2694 [00:04<00:00, 672.69it/s]


Created dummy .dsv files.
Database: mimiciv
Username: mimiciv
>>>>> Connected to DB <<<<<
Getting kdigo_stages data


52it [00:18,  2.76it/s]


Number of entries for kdigo_stages : 4111003
Column names : ['subject_id', 'hadm_id', 'stay_id', 'charttime', 'creat_low_past_7day', 'creat_low_past_48hr', 'creat', 'aki_stage_creat', 'uo_rt_6hr', 'uo_rt_12hr', 'uo_rt_24hr', 'aki_stage_uo', 'aki_stage']



100%|██████████| 52/52 [11:19<00:00, 13.06s/it]

Added kdigo_stages (derived) entries.





# Merge folders

In [30]:
def func(files, pid):
    """Merge per-table export files into the main export folder.

    For every file name in ``files``, the file of that name in
    ``STRUCTURED_EXPORT_DIR`` is concatenated with the same-named file in
    each sibling directory of ``STRUCTURED_EXPORT_DIR``. The result is
    sorted by ``starttime`` then ``uid`` and written back over the main
    file ('$'-separated, no index).

    Args:
        files: File names (not paths) to merge.
        pid: Worker index supplied by the parallel runner; not used here.

    Returns:
        None.

    Raises:
        Whatever ``pd.read_csv`` raises when a sibling directory lacks one
        of the files; a missing file is not skipped silently.
    """
    parent_dir, main_folder = os.path.split(STRUCTURED_EXPORT_DIR)
    # Only directories can hold export files; stray files next to the
    # export folders are ignored. Sorting makes the concatenation order
    # (and so the order of rows that tie on the sort keys) reproducible.
    folders = sorted(
        path
        for path in (os.path.join(parent_dir, name)
                     for name in os.listdir(parent_dir)
                     if name != main_folder)
        if os.path.isdir(path))

    for f in tqdm(files):
        main_path = os.path.join(STRUCTURED_EXPORT_DIR, f)

        # Read everything first and concatenate once, instead of growing
        # the dataframe folder by folder.
        frames = [pd.read_csv(main_path, sep='$')]
        frames.extend(pd.read_csv(os.path.join(folder, f), sep='$')
                      for folder in folders)
        df = pd.concat(frames)

        df = df.sort_values(by=['starttime', 'uid'])

        df.to_csv(main_path, na_rep='', sep='$', index=False)


# Prepare the main export folder, then merge every file whose name contains
# 'data' from the sibling export folders into it, in parallel.
create_dummy_files(STRUCTURED_EXPORT_DIR, custom_icustays_list)
data_files = [i for i in os.listdir(STRUCTURED_EXPORT_DIR) if 'data' in i]
parallel_processing(func, MP_NUM_PROCESSES, data_files)


100%|██████████| 2688/2688 [00:04<00:00, 585.70it/s]
100%|██████████| 2694/2694 [00:04<00:00, 580.19it/s]
100%|██████████| 2694/2694 [00:04<00:00, 580.80it/s]
100%|██████████| 2694/2694 [00:04<00:00, 580.82it/s]
100%|██████████| 2694/2694 [00:04<00:00, 579.57it/s]
100%|██████████| 2694/2694 [00:04<00:00, 574.86it/s]
100%|██████████| 2694/2694 [00:04<00:00, 578.59it/s]
100%|██████████| 2694/2694 [00:04<00:00, 572.77it/s]


Created dummy .dsv files.


100%|██████████| 2688/2688 [08:17<00:00,  5.40it/s]
100%|██████████| 2694/2694 [08:18<00:00,  5.40it/s]
100%|██████████| 2694/2694 [08:18<00:00,  5.40it/s]
100%|██████████| 2694/2694 [08:20<00:00,  5.39it/s]
100%|██████████| 2694/2694 [08:20<00:00,  5.38it/s]
100%|██████████| 2694/2694 [08:21<00:00,  5.37it/s]
100%|██████████| 2694/2694 [08:22<00:00,  5.36it/s]
100%|██████████| 2694/2694 [08:23<00:00,  5.35it/s]


[None, None, None, None, None, None, None, None]