# Export raw data from the MIMIC-IV database

Only ICU stays whose first or last care unit is one of the following are exported:
- Coronary Care Unit (CCU)
- Cardiac Vascular Intensive Care Unit (CVICU)

In [1]:
from typing import Tuple
# from tqdm import tqdm
from tqdm.notebook import tqdm
from multiprocessing import Pool, RLock
from configobj import ConfigObj
import numpy as np
import getpass
import json
import math
import os
import psycopg2
import pandas as pd
import time

import matplotlib.pyplot as plt
%matplotlib inline

from projects.common import *
from projects.utils import *


In [2]:
def connect_db():
    """Connect to the database described under the project's ``db`` directory.

    The directory is resolved relative to the current working directory
    (three levels up, then ``db``).

    Returns:
        Whatever ``connect_to_database`` returns. Callers in this notebook
        unpack it as the 5-tuple
        ``(query_schema_core, query_schema_hosp, query_schema_icu,
        query_schema_derived, conn)``.
    """
    working_dir = os.path.abspath('')
    db_dir = working_dir + "/../../../db"
    connection_bundle = connect_to_database(db_dir)
    return connection_bundle


def split_df(df: pd.DataFrame, num_processes: int = 8):
    """Cut ``df`` row-wise into ``num_processes`` consecutive slices.

    Every slice except the last spans ``ceil(len(df) / num_processes)`` rows;
    the last slice takes whatever rows remain (possibly none).

    Args:
        df: frame to split; rows keep their original order.
        num_processes: number of slices to produce.

    Returns:
        A list of ``num_processes`` DataFrame slices.
    """
    chunk_size = math.ceil(len(df) / num_processes)
    last_index = num_processes - 1

    chunks = []
    for chunk_index in range(last_index):
        start = chunk_size * chunk_index
        chunks.append(df.iloc[start:start + chunk_size])

    # The final slice is open-ended so no trailing rows are lost.
    chunks.append(df.iloc[chunk_size * last_index:])
    return chunks


# Sanity check

Table for icustays:  
['subject_id', 'hadm_id', 'stay_id', 'first_careunit', 'last_careunit', 'intime', 'outtime', 'los'] 

Table for transfers:  
['subject_id', 'hadm_id', 'transfer_id', 'eventtype', 'careunit', 'intime', 'outtime'] 

Table for patients:  
['subject_id', 'gender', 'anchor_age', 'anchor_year', 'anchor_year_group', 'dod']

Table for admissions:  
['subject_id', 'hadm_id', 'admittime', 'dischtime', 'deathtime', 'admission_type', 'admission_location', 'discharge_location', 'insurance', 'language', 'marital_status', 'ethnicity', 'edregtime', 'edouttime', 'hospital_expire_flag']

# Select valid entries

Only entries from certain care units are included.

In [6]:
(query_schema_core,
 query_schema_hosp,
 query_schema_icu,
 query_schema_derived,
 conn) = connect_db()


def _load_table(schema, table):
    """Fetch ``table`` from ``schema`` and return it as a DataFrame.

    NOTE(review): the "Export patient info" cell unpacks the result of
    ``get_database_table_as_dataframe`` as a 2-tuple (``df, _ = ...``) while
    this cell used the result directly as a DataFrame. Both shapes are
    accepted here so the cell runs with either helper version -- confirm
    which one is current and simplify.
    """
    result = get_database_table_as_dataframe(conn, schema, table)
    return result[0] if isinstance(result, tuple) else result


patients_df = _load_table(query_schema_core, 'patients')
admissions_df = _load_table(query_schema_core, 'admissions')
transfers_df = _load_table(
    query_schema_core, 'transfers').sort_values(by=['intime', 'outtime'])
icustays_df = _load_table(
    query_schema_icu, 'icustays').sort_values(by=['intime', 'outtime'])

# Each table must be keyed uniquely by its id column. Checking the named
# column avoids converting the whole frame to an object array and does not
# depend on column position.
assert patients_df['subject_id'].is_unique
assert admissions_df['hadm_id'].is_unique
assert icustays_df['stay_id'].is_unique
print("\nAssertions are fine.\n")

_CAREUNITS = ['Coronary Care Unit (CCU)',
              'Cardiac Vascular Intensive Care Unit (CVICU)']

# Stay ids whose first OR last care unit is one of the selected units.
cu1 = get_id_list(conn, query_schema_icu, 'icustays',
                  'stay_id', 'first_careunit', tuple(_CAREUNITS))
cu2 = get_id_list(conn, query_schema_icu, 'icustays',
                  'stay_id', 'last_careunit', tuple(_CAREUNITS))
custom_icustays_list = list(set(cu1 + cu2))

print("Number of custom icustays :", len(custom_icustays_list))

# Nested mapping {subject_id: {hadm_id: [stay_id, ...]}} for the selected
# stays, in the (intime, outtime) order of ``icustays_df``.
in_selected_unit = (icustays_df['first_careunit'].isin(_CAREUNITS)
                    | icustays_df['last_careunit'].isin(_CAREUNITS))
selected_stays = icustays_df.loc[in_selected_unit]

custom_icustays_dict = dict()
for subject_id, hadm_id, stay_id in zip(selected_stays['subject_id'],
                                        selected_stays['hadm_id'],
                                        selected_stays['stay_id']):
    custom_icustays_dict \
        .setdefault(subject_id, {}) \
        .setdefault(hadm_id, []) \
        .append(stay_id)


# Persist both selections so later cells can reload them from disk.
with open("../../../" + TMP_CUSTOM_LIST, 'w+') as f:
    json.dump(custom_icustays_list, f)

with open("../../../" + TMP_CUSTOM_DICT, 'w+') as f:
    json.dump(custom_icustays_dict, f)


Database: mimiciv
Username: mimiciv
>>>>> Connected to DB <<<<<
Getting patients data
Number of entries for patients : 382278
Column names : ['subject_id', 'gender', 'anchor_age', 'anchor_year', 'anchor_year_group', 'dod']

Getting admissions data
Number of entries for admissions : 523740
Column names : ['subject_id', 'hadm_id', 'admittime', 'dischtime', 'deathtime', 'admission_type', 'admission_location', 'discharge_location', 'insurance', 'language', 'marital_status', 'ethnicity', 'edregtime', 'edouttime', 'hospital_expire_flag']

Getting transfers data
Number of entries for transfers : 2189535
Column names : ['subject_id', 'hadm_id', 'transfer_id', 'eventtype', 'careunit', 'intime', 'outtime']

Getting icustays data
Number of entries for icustays : 76540
Column names : ['subject_id', 'hadm_id', 'stay_id', 'first_careunit', 'last_careunit', 'intime', 'outtime', 'los']


Assertions are fine.

Getting icustays data
Number of entries for icustays : 20915
Getting icustays data
Number of 

# Export patient info

Create a UID dictionary that maps each integer UID to the name of the patient-info field it identifies, and save it as JSON.


In [3]:
(query_schema_core, query_schema_hosp, query_schema_icu,
 query_schema_derived, conn) = connect_db()

# Source tables for the patient-info export. The helper returns a pair here;
# only its first element (the DataFrame) is kept.
df, _ = get_database_table_as_dataframe(
    conn, query_schema_derived, 'icustay_detail')
icustays_df, _ = get_database_table_as_dataframe(
    conn, query_schema_icu, 'icustays')
charlson_df, _ = get_database_table_as_dataframe(
    conn, query_schema_derived, 'charlson')
diag_icd_df, _ = get_database_table_as_dataframe(
    conn, query_schema_hosp, 'diagnoses_icd')
cr_base_df, _ = get_database_table_as_dataframe(
    conn, query_schema_derived, 'creatinine_baseline')

# Field names in the order the values are written to each info file:
# all icustay_detail columns, the two care-unit columns, the charlson columns
# after the first two, the ICD field, and the creatinine_baseline columns
# after the first three.
data_entry_list = list(get_database_table_column_name(conn, 'icustay_detail'))
data_entry_list.extend(['first_careunit', 'last_careunit'])
data_entry_list.extend(get_database_table_column_name(conn, 'charlson')[2:])
data_entry_list.append('icd')
data_entry_list.extend(
    get_database_table_column_name(conn, 'creatinine_baseline')[3:])

print("Entries that are saved as patient info:", data_entry_list)

# UID -> field name; the UID is simply the position in data_entry_list.
uid_info = dict(enumerate(data_entry_list))

# Replace any previous mapping file with the freshly built one.
uid_info_path = os.path.abspath('') + "/../../../" + UID_INFO_PATH
if os.path.exists(uid_info_path):
    os.remove(uid_info_path)
with open(uid_info_path, 'w+') as f:
    f.write(json.dumps(uid_info))


Database: mimiciv
Username: mimiciv
>>>>> Connected to DB <<<<<
Getting icustay_detail data
Number of entries for icustay_detail : 76540
Column names : ['subject_id', 'hadm_id', 'stay_id', 'gender', 'dod', 'admittime', 'dischtime', 'los_hospital', 'admission_age', 'ethnicity', 'hospital_expire_flag', 'hospstay_seq', 'first_hosp_stay', 'icu_intime', 'icu_outtime', 'los_icu', 'icustay_seq', 'first_icu_stay']

Getting icustays data
Number of entries for icustays : 76540
Column names : ['subject_id', 'hadm_id', 'stay_id', 'first_careunit', 'last_careunit', 'intime', 'outtime', 'los']

Getting charlson data
Number of entries for charlson : 523740
Column names : ['subject_id', 'hadm_id', 'age_score', 'myocardial_infarct', 'congestive_heart_failure', 'peripheral_vascular_disease', 'cerebrovascular_disease', 'dementia', 'chronic_pulmonary_disease', 'rheumatic_disease', 'peptic_ulcer_disease', 'mild_liver_disease', 'diabetes_without_cc', 'diabetes_with_cc', 'paraplegia', 'renal_disease', 'malig

Save the patient info of each selected ICU stay into its own `.dsv` file (`info_<stay_id>.dsv`).

In [4]:
def _format_icd_code(icd_code, icd_version):
    """Return ``icd_code`` (spaces removed) with its decimal point inserted.

    ICD-9 codes starting with ``E`` keep four characters before the point;
    every other ICD-9 code and every ICD-10 code keeps three. A code with
    nothing after that position is returned without a trailing point.

    Raises:
        ValueError: if ``icd_version`` is neither 9 nor 10.
    """
    icd_code = icd_code.replace(' ', '')
    if icd_version == 9:
        split_at = 4 if icd_code.startswith('E') else 3
    elif icd_version == 10:
        split_at = 3
    else:
        raise ValueError("Unknown ICD code")
    category, detail = icd_code[:split_at], icd_code[split_at:]
    return f"{category}.{detail}" if detail else category


def func(dfs, pid):
    """Write one ``info_<stay_id>.dsv`` file per selected ICU stay.

    Worker body handed to ``parallel_processing``. Besides its arguments it
    reads the notebook globals ``custom_icustays_list``, ``uid_info``,
    ``icustays_df``, ``charlson_df``, ``diag_icd_df`` and ``cr_base_df``.

    For every row whose ``stay_id`` is in ``custom_icustays_list`` the values
    are appended to an ``InfoTable`` with consecutive UIDs in this order:
    the row's own columns, first/last care unit, the charlson columns (from
    the third on), the comma-separated ICD codes of the admission
    (``<version>-<code>``, ordered by ``seq_num``; ``None`` when there are
    none), and the creatinine_baseline columns (from the fourth on).

    Args:
        dfs: sequence whose first element is the DataFrame chunk (rows of
            ``icustay_detail``) this worker processes.
        pid: worker index, used as the position of the progress bar.

    Raises:
        AssertionError: if the output file of a stay already exists.
        ValueError: if an ICD version is neither 9 nor 10, or if a
            ``.item()`` lookup does not match exactly one row.
    """

    # This line is the strange hack
    # (presumably needed so progress bars from worker processes render in the
    # notebook -- TODO confirm).
    print(' ', end='', flush=True)

    df = dfs[0]

    # Loop-invariant: set membership instead of scanning the list per row.
    selected_stay_ids = set(custom_icustays_list)
    charlson_columns = charlson_df.columns.tolist()[2:]
    cr_base_columns = cr_base_df.columns.tolist()[3:]

    for _, df_row in tqdm(df.iterrows(), total=len(df), position=pid):
        stay_id = df_row['stay_id']
        if stay_id not in selected_stay_ids:
            continue

        export_path = os.path.join(STRUCTURED_EXPORT_DIR,
                                   'info_'+str(stay_id)+'.dsv')
        # Never overwrite an existing export.
        assert not os.path.exists(export_path), \
            f"{export_path} already exists"

        it = InfoTable()
        subject_id = df_row['subject_id']
        hadm_id = int(df_row['hadm_id'])

        # 1. icustay_detail
        c = 0
        for i, j in zip(uid_info, df_row):
            c = i
            it.append(uid=i, value=j)

        # 2. icustay -- look the stay up once for both care-unit columns.
        icustay_row = icustays_df.loc[icustays_df['stay_id'] == stay_id]
        for column in ('first_careunit', 'last_careunit'):
            c += 1
            it.append(uid=c, value=icustay_row[column].item())

        # 3. charlson - Calculated (one table scan per stay, not per column)
        charlson_row = charlson_df.loc[charlson_df['hadm_id'] == hadm_id]
        for column in charlson_columns:
            c += 1
            it.append(uid=c, value=charlson_row[column].item())

        # 4. diagnoses_icd
        # Match on this row's ids. The previous code compared against
        # uid_info[0] / uid_info[1], which are the column *names*, so the
        # mask was always empty and no ICD code was ever exported.
        c += 1
        cond = (diag_icd_df['subject_id'] == subject_id) & \
            (diag_icd_df['hadm_id'] == hadm_id)
        icd_df = diag_icd_df.loc[cond, ['seq_num', 'icd_code', 'icd_version']]
        icd_df = icd_df.sort_values('seq_num')
        icd_entries = [
            f"{icd_version:02d}-{_format_icd_code(icd_code, icd_version)}"
            for icd_code, icd_version in zip(icd_df['icd_code'],
                                             icd_df['icd_version'])
        ]
        it.append(uid=c, value=','.join(icd_entries) if icd_entries else None)

        # 5. creatinine_baseline - Calculated
        cr_base_row = cr_base_df.loc[cr_base_df['hadm_id'] == hadm_id]
        for column in cr_base_columns:
            c += 1
            it.append(uid=c, value=cr_base_row[column].item())

        save_dsv(export_path, pd.DataFrame(it.data))


# Reload the stay selection written by the "Select valid entries" cell, so
# this cell can run without re-executing that cell in the same session.
# `func` reads `custom_icustays_list` as a module-level global.
with open("../../../" + TMP_CUSTOM_LIST, 'r') as f:
    custom_icustays_list = json.load(f)

# NOTE(review): `custom_icustays_dict` is loaded here but `func` does not
# reference it.
with open("../../../" + TMP_CUSTOM_DICT, 'r') as f:
    custom_icustays_dict = json.load(f)

# Split the icustay_detail frame into MP_NUM_PROCESSES chunks and hand them to
# the workers (presumably one chunk per process -- see `parallel_processing`).
dfs = split_df(df, MP_NUM_PROCESSES)
parallel_processing(func, MP_NUM_PROCESSES, dfs)


     

  0%|          | 0/9568 [00:00<?, ?it/s]

 

  0%|          | 0/9568 [00:00<?, ?it/s]

 

  0%|          | 0/9568 [00:00<?, ?it/s]

 

  0%|          | 0/9568 [00:00<?, ?it/s]

  0%|          | 0/9568 [00:00<?, ?it/s]

  0%|          | 0/9568 [00:00<?, ?it/s]

  0%|          | 0/9568 [00:00<?, ?it/s]

  0%|          | 0/9564 [00:00<?, ?it/s]

[None, None, None, None, None, None, None, None]

In [5]:
# Drop the references to the large source tables once the export is done.
del df, charlson_df, diag_icd_df, cr_base_df
