# Exports raw data from mimic-iv database

Only the following care unit patients are exported:
- Coronary Care unit (CCU)
- Cardiac Vascular Intensive Care unit (CVICU)

In [1]:
from projects.common import *
from projects.data_cleaning import *
from typing import Tuple
from tqdm import tqdm
from multiprocessing import Pool, RLock
from configobj import ConfigObj
import numpy as np
import getpass
import json
import math
import os
import psycopg2
import pandas as pd
import time

import matplotlib.pyplot as plt
%matplotlib inline


In [69]:
def _save_dsv(path: str, data: pd.DataFrame):
    save_dir, _ = os.path.split(path)
    os.makedirs(save_dir, exist_ok=True)
    data.to_csv(path, na_rep='', sep='$', index=False)


def _load_dsv(path: str):
    assert os.path.exists(path), path
    return pd.read_csv(path, sep='$')


def save_info_dsv(stay_id: int, data: pd.DataFrame) -> dict:
    save_path = os.path.join(STRUCTURED_EXPORT_DIR,
                             'info_'+str(stay_id)+'.dsv')
    _save_dsv(save_path, data)


def save_data_dsv(stay_id: int, data: pd.DataFrame) -> dict:
    save_path = os.path.join(STRUCTURED_EXPORT_DIR,
                             'data_'+str(stay_id)+'.dsv')
    _save_dsv(save_path, data)


def load_info_dsv(stay_id: int) -> dict:
    save_path = os.path.join(STRUCTURED_EXPORT_DIR,
                             'info_'+str(stay_id)+'.dsv')
    data = _load_dsv(save_path).to_dict()
    data = {k: v if len(v) > 0 else np.array([], dtype=int)
            for k, v in data.items()}
    return data


def load_data_dsv(stay_id: int) -> dict:
    save_path = os.path.join(STRUCTURED_EXPORT_DIR,
                             'data_'+str(stay_id)+'.dsv')
    data = _load_dsv(save_path).to_dict()
    data = {k: v if len(v) > 0 else np.array([], dtype=int)
            for k, v in data.items()}
    return data


class InfoTable(object):

    col_entries = ['uid', 'value']

    def __init__(self) -> None:
        self.data = {i: np.array([], dtype=int) for i in self.col_entries}

    def sort_uid(self) -> None:
        sorted_ids = np.argsort(self.data['uid'])
        for k in self.data.keys():
            self.data[k] = self.data[k][sorted_ids]

    def append(self, **kwargs) -> None:
        for i in kwargs:
            assert i in self.col_entries
        assert len(kwargs) == len(self.col_entries)

        self.data = {i: np.append(self.data[i], kwargs[i])
                     for i in self.col_entries}


class DataTable(object):

    col_entries = ['uid', 'value', 'unit', 'type', 'starttime', 'endtime']

    def __init__(self) -> None:
        self.data = {i: np.array([], dtype=int) for i in self.col_entries}

    def sort_uid(self) -> None:
        sorted_ids = np.argsort(self.data['uid'])
        for k in self.data.keys():
            self.data[k] = self.data[k][sorted_ids]

    def sort_starttime(self) -> None:
        sorted_ids = np.argsort(self.data['starttime'])
        for k in self.data.keys():
            self.data[k] = self.data[k][sorted_ids]

    def remove_duplicates(self) -> None:
        stacked_arr = np.stack([i for _, i in self.data.items()]).astype(str)
        stacked_arr = np.unique(stacked_arr, axis=1)
        self.data = {i: stacked_arr[idx].astype(self.data[i].dtype)
                     for idx, i in enumerate(self.col_entries)}

    def remove_null_value(self) -> None:
        non_null_mask = ~pd.isnull(self.data['value'])
        for k in self.data:
            self.data[k] = self.data[k][non_null_mask]

    def concatenate(self, x: dict) -> None:
        assert len(x) == len(self.col_entries)

        self.data = {i: np.concatenate([self.data[i], x[i]])
                     for i in self.col_entries}

    def append(self, **kwargs) -> None:
        for i in kwargs:
            assert i in self.col_entries
        assert len(kwargs) == len(self.col_entries)

        self.data = {i: np.append(self.data[i], kwargs[i])
                     for i in self.col_entries}


In [72]:
db_dir = os.path.abspath('') + "/../../../db"

(query_schema_core,
 query_schema_hosp,
 query_schema_icu,
 query_schema_derived,
 conn) = connect_to_database(db_dir)


Database: mimiciv
Username: mimiciv
>>>>> Connected to DB <<<<<


Table for icustays:  
['subject_id', 'hadm_id', 'stay_id', 'first_careunit', 'last_careunit', 'intime', 'outtime', 'los'] 

Table for transfers:  
['subject_id', 'hadm_id', 'transfer_id', 'eventtype', 'careunit', 'intime', 'outtime'] 

Table for patients:  
['subject_id', 'gender', 'anchor_age', 'anchor_year', 'anchor_year_group', 'dod']

Table for admissions:  
['subject_id', 'hadm_id', 'admittime', 'dischtime', 'deathtime', 'admission_type', 'admission_location', 'discharge_location', 'insurance', 'language', 'marital_status', 'ethnicity', 'edregtime', 'edouttime', 'hospital_expire_flag']

In [4]:
patients_df = get_table(conn, query_schema_core, 'patients')
admissions_df = get_table(conn, query_schema_core, 'admissions')
transfers_df = get_table(conn, query_schema_core,
                         'transfers').sort_values(by=['intime', 'outtime'])
icustays_df = get_table(conn, query_schema_icu, 'icustays').sort_values(
    by=['intime', 'outtime'])

assert len(patients_df.to_numpy()[:, 0]) == len(
    np.unique(patients_df.to_numpy()[:, 0]))
assert len(admissions_df.to_numpy()[:, 1]) == len(
    np.unique(admissions_df.to_numpy()[:, 1]))
assert len(icustays_df.to_numpy()[:, 2]) == len(
    np.unique(icustays_df.to_numpy()[:, 2]))

patients_list = patients_df['subject_id'].tolist()
admissions_list = admissions_df['hadm_id'].tolist()


Getting patients data
Number of entries for patients : 382278
Column names : ['subject_id', 'gender', 'anchor_age', 'anchor_year', 'anchor_year_group', 'dod']

Getting admissions data
Number of entries for admissions : 523740
Column names : ['subject_id', 'hadm_id', 'admittime', 'dischtime', 'deathtime', 'admission_type', 'admission_location', 'discharge_location', 'insurance', 'language', 'marital_status', 'ethnicity', 'edregtime', 'edouttime', 'hospital_expire_flag']

Getting transfers data
Number of entries for transfers : 2189535
Column names : ['subject_id', 'hadm_id', 'transfer_id', 'eventtype', 'careunit', 'intime', 'outtime']

Getting icustays data
Number of entries for icustays : 76540
Column names : ['subject_id', 'hadm_id', 'stay_id', 'first_careunit', 'last_careunit', 'intime', 'outtime', 'los']



In [26]:
_CAREUNITS = ['Coronary Care Unit (CCU)',
              'Cardiac Vascular Intensive Care Unit (CVICU)']
custom_icustays_list = [
    i[1]['stay_id']
    for i in icustays_df.iterrows()
    if i[1]['first_careunit'] in _CAREUNITS or i[1]['last_careunit'] in _CAREUNITS]

custom_icustays_dict = dict()
for i in icustays_df.iterrows():
    j = i[1]
    if j['first_careunit'] in _CAREUNITS or j['last_careunit'] in _CAREUNITS:
        if j['subject_id'] in custom_icustays_dict:
            if j['hadm_id'] in custom_icustays_dict[j['subject_id']]:
                custom_icustays_dict[j['subject_id']][j['hadm_id']] += [j['stay_id']]  # noqa
            else:
                custom_icustays_dict[j['subject_id']][j['hadm_id']] = [j['stay_id']]  # noqa
        else:
            custom_icustays_dict[j['subject_id']] = {j['hadm_id']: [j['stay_id']]}  # noqa


# Export patient info.


In [31]:
df = get_table(conn, query_schema_derived, 'icustay_detail')
charlson_df = get_table(conn, query_schema_derived, 'charlson')
diag_icd_df = get_table(conn, query_schema_hosp, 'diagnoses_icd')

data_entry_list = df.columns.to_list() + \
    ['first_careunit', 'last_careunit'] + \
    charlson_df.columns.tolist()[2:] + \
    ['icd']


Getting icustay_detail data
Number of entries for icustay_detail : 76540
Column names : ['subject_id', 'hadm_id', 'stay_id', 'gender', 'dod', 'admittime', 'dischtime', 'los_hospital', 'admission_age', 'ethnicity', 'hospital_expire_flag', 'hospstay_seq', 'first_hosp_stay', 'icu_intime', 'icu_outtime', 'los_icu', 'icustay_seq', 'first_icu_stay']



In [7]:
uid_info = {idx: name for idx, name in enumerate(data_entry_list)}
uid_info_path = os.path.abspath('') + "/../../../" + UID_INFO_PATH
os.remove(uid_info_path)
with open(uid_info_path, 'w+') as f:
    json.dump(uid_info, f)

it = InfoTable()
for df_i in tqdm(df.iterrows(), total=len(df)):
    df_row = df_i[1]

    if df_row['stay_id'] in custom_icustays_list:

        it.data = load_info_dsv(df_row['stay_id'])

        # 1. icustay_detail
        c = 0
        for i, j in zip(uid_info, df_row):
            c = i
            it.append(uid=i, value=j)

        # 2. icustay
        c += 1
        j = icustays_df.loc[icustays_df['stay_id'] ==
                            df_row['stay_id']]['first_careunit'].item()
        it.append(uid=c, value=j)

        c += 1
        j = icustays_df.loc[icustays_df['stay_id'] ==
                            df_row['stay_id']]['last_careunit'].item()
        it.append(uid=c, value=j)

        # 3. charlson
        for i in charlson_df.columns.tolist()[2:]:
            c += 1
            j = charlson_df.loc[charlson_df['hadm_id']
                                == it['hadm_id']][i].item()
            it.append(uid=c, value=j)

        # 4. diagnoses_icd
        c += 1
        cond = (diag_icd_df['subject_id'] == uid_info[0]) & \
            (diag_icd_df['hadm_id'] == uid_info[1])
        icd_df = diag_icd_df.loc[cond][['seq_num', 'icd_code', 'icd_version']]
        icd_df = icd_df.sort_values('seq_num')
        j = None
        for i in icd_df.iterrows():
            icd_version = i[1]['icd_version']
            icd_code = i[1]['icd_code'].replace(' ', '')
            if icd_version == 9:
                if icd_code[0] == 'E':
                    icd_code = icd_code[:4] + '.' + icd_code[4:]
                else:
                    icd_code = icd_code[:3] + '.' + icd_code[3:]
            elif icd_version == 10:
                icd_code = icd_code[:3] + '.' + icd_code[3:]
            else:
                raise ValueError("Unknown ICD code")
            j = j + ',' if j is not None else ''
            j = f"{j}{icd_version:02d}-{icd_code}"
        it.append(uid=c, value=j)

        save_info_dsv(df_row['stay_id'], pd.DataFrame(it.data))


# Export data info.


In [None]:
labitems = pd.read_csv("../../../"+LAB_ITEM_PATH, sep='\t', header=0)
labitems.fillna('None')


Create dummy .dsv files.

In [70]:
for icustay_id in tqdm(custom_icustays_list):
    save_data_dsv(icustay_id, pd.DataFrame(DataTable().data))


100%|██████████| 21546/21546 [00:11<00:00, 1835.04it/s]


In [73]:
df = get_table(conn, query_schema_derived, 'height')

dt = DataTable()
for df_i in tqdm(df.iterrows(), total=len(df)):
    df_row = df_i[1]

    if df_row['stay_id'] in custom_icustays_list:
        dt.data = load_data_dsv(df_row['stay_id'])
        dt.append(
            uid=100001,
            value=df_row['height'],
            unit='cm',
            type=None,
            starttime=df_row['charttime'],
            endtime=None,
        )
        save_data_dsv(df_row['stay_id'], pd.DataFrame(dt.data))


Getting height data
Number of entries for height : 35170
Column names : ['subject_id', 'stay_id', 'charttime', 'height']



100%|██████████| 35170/35170 [01:18<00:00, 446.74it/s]


In [None]:
df = get_table(conn, query_schema_derived, 'weight_durations')

dt = DataTable()
for df_i in tqdm(df.iterrows(), total=len(df)):
    df_row = df_i[1]

    if df_row['stay_id'] in custom_icustays_list:
        dt.data = load_data_dsv(df_row['stay_id'])
        dt.append(
            uid=100002,
            value=df_row['weight'],
            unit='kg',
            type=df_row['weight_type'],
            starttime=df_row['starttime'],
            endtime=df_row['endtime'],
        )
        save_data_dsv(df_row['stay_id'], pd.DataFrame(dt.data))


In [None]:
id_mapping = {
    50862: 'albumin',
    50930: 'globulin',
    50976: 'total_protein',
    50868: 'aniongap',
    50882: 'bicarbonate',
    51006: 'bun',
    50893: 'calcium',
    50902: 'chloride',
    50912: 'creatinine',
    50931: 'glucose',
    50983: 'sodium',
    50971: 'potassium',
}
id_mapping = {j: i for i, j in id_mapping.items()}

df = get_table(conn, query_schema_derived, 'chemistry')

it = InfoTable()
dt = DataTable()
for df_i in tqdm(df.iterrows(), total=len(df)):
    df_row = df_i[1]

    if df_row['subject_id'] not in custom_icustays_dict:
        continue
    if df_row['hadm_id'] not in custom_icustays_dict[df_row['subject_id']]:
        continue

    stay_ids = custom_icustays_dict[df_row['subject_id']][df_row['hadm_id']]

    for stay_id in stay_ids:
        it.data = load_info_dsv(stay_id)
        dt.data = load_data_dsv(stay_id)

        icu_intime = it.data[13]
        icu_outtime = it.data[14]

        if icu_intime <= df_row['charttime'] <= icu_outtime:
            for col in df.columns().tolist()[4:]:
                dt.append(
                    uid=id_mapping[col]+500000,
                    value=df_row[col],
                    unit=None,
                    type=df.columns()[3],
                    starttime=df_row['charttime'],
                    endtime=None,
                )
            save_data_dsv(stay_id, pd.DataFrame(dt.data))


In [None]:
id_mapping = {
    552028: 'specimen',
    550801: 'aado2',
    550802: 'baseexcess',
    550803: 'bicarbonate',
    50804: 'totalco2',
    550805: 'carboxyhemoglobin',
    550806: 'chloride',
    550808: 'calcium',
    550809: 'glucose',
    550810: 'hematocrit',
    550811: 'hemoglobin',
    550813: 'lactate',
    550814: 'methemoglobin',
    550816: 'fio2',
    550817: 'so2',
    550818: 'pco2',
    550820: 'ph',
    550821: 'po2',
    550822: 'potassium',
    550824: 'sodium',
    550825: 'temperature',
    223835: 'fio2_chartevents',
    500001: 'pao2fio2ratio',
    500002: 'aado2_calc',
    500003: 'specimen_pred',
    500004: 'specimen_prob',
}
id_mapping = {j: i for i, j in id_mapping.items()}

df = get_table(conn, query_schema_derived, 'bg')

it = InfoTable()
dt = DataTable()
for df_i in tqdm(df.iterrows(), total=len(df)):
    df_row = df_i[1]

    if df_row['subject_id'] not in custom_icustays_dict:
        continue
    if df_row['hadm_id'] not in custom_icustays_dict[df_row['subject_id']]:
        continue

    stay_ids = custom_icustays_dict[df_row['subject_id']][df_row['hadm_id']]

    for stay_id in stay_ids:
        it.data = load_info_dsv(stay_id)
        dt.data = load_data_dsv(stay_id)

        icu_intime = it.data[13]
        icu_outtime = it.data[14]

        if icu_intime <= df_row['charttime'] <= icu_outtime:
            for col in df.columns().tolist()[3:]:
                dt.append(
                    uid=id_mapping[col],
                    value=df_row[col],
                    unit=None,
                    type=None,
                    starttime=df_row['charttime'],
                    endtime=None,
                )
            save_data_dsv(stay_id, pd.DataFrame(dt.data))
