# Investigate the concepts available in the database (mimic-iv)

The concepts are available from the mimic-code github repo.

In [2]:
import getpass
import json
import math
import os
import psycopg2
import pandas as pd
import time

import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np

from configobj import ConfigObj
from multiprocessing import Pool, RLock
from tqdm import tqdm
from typing import Tuple

from projects.data_cleaning import *
from projects.common import *


In [2]:
def save_dsv(path: str, data: pd.DataFrame):
    save_dir, _ = os.path.split(path)
    os.makedirs(save_dir, exist_ok=True)
    data.to_csv(path, na_rep='', sep='$', index=False)


def create_patient_info(dtype=int):
    return {
        'UID': np.array([], dtype=int),
        'Value': np.array([], dtype=dtype),
    }


def sort_patient_table(x: dict):
    sorted_ids = np.argsort(x['UID'])
    for k in x.keys():
        x[k] = x[k][sorted_ids]


In [24]:
db_dir = os.path.abspath('') + "/../../../db"

(query_schema_core,
 query_schema_hosp,
 query_schema_icu,
 query_schema_derived,
 conn) = connect_to_database(db_dir)


Database: mimiciv
Username: mimiciv
>>>>> Connected to DB <<<<<


Table for icustays:  
['subject_id', 'hadm_id', 'stay_id', 'first_careunit', 'last_careunit', 'intime', 'outtime', 'los'] 

Table for transfers:  
['subject_id', 'hadm_id', 'transfer_id', 'eventtype', 'careunit', 'intime', 'outtime'] 

Table for patients:  
['subject_id', 'gender', 'anchor_age', 'anchor_year', 'anchor_year_group', 'dod']

Table for admissions:  
['subject_id', 'hadm_id', 'admittime', 'dischtime', 'deathtime', 'admission_type', 'admission_location', 'discharge_location', 'insurance', 'language', 'marital_status', 'ethnicity', 'edregtime', 'edouttime', 'hospital_expire_flag']

In [4]:
patients_df = get_table(conn, query_schema_core, 'patients')
admissions_df = get_table(conn, query_schema_core, 'admissions')
transfers_df = get_table(conn, query_schema_core, 'transfers').sort_values(by=['intime', 'outtime'])
icustays_df = get_table(conn, query_schema_icu, 'icustays').sort_values(by=['intime', 'outtime'])

assert len(patients_df.to_numpy()[:, 0]) == len(np.unique(patients_df.to_numpy()[:, 0])) 
assert len(admissions_df.to_numpy()[:, 1]) == len(np.unique(admissions_df.to_numpy()[:, 1])) 
assert len(icustays_df.to_numpy()[:, 2]) == len(np.unique(icustays_df.to_numpy()[:, 2])) 

patients_list = patients_df['subject_id'].tolist()
admissions_list = admissions_df['hadm_id'].tolist()

Getting patients data
Number of entries for patients : 382278
Column names : ['subject_id', 'gender', 'anchor_age', 'anchor_year', 'anchor_year_group', 'dod']
Getting admissions data
Number of entries for admissions : 523740
Column names : ['subject_id', 'hadm_id', 'admittime', 'dischtime', 'deathtime', 'admission_type', 'admission_location', 'discharge_location', 'insurance', 'language', 'marital_status', 'ethnicity', 'edregtime', 'edouttime', 'hospital_expire_flag']
Getting transfers data
Number of entries for transfers : 2189535
Column names : ['subject_id', 'hadm_id', 'transfer_id', 'eventtype', 'careunit', 'intime', 'outtime']
Getting icustays data
Number of entries for icustays : 76540
Column names : ['subject_id', 'hadm_id', 'stay_id', 'first_careunit', 'last_careunit', 'intime', 'outtime', 'los']


In [5]:
_CAREUNITS = ['Coronary Care Unit (CCU)',
              'Cardiac Vascular Intensive Care Unit (CVICU)']
custom_icustays_list = [i[1]['stay_id']
                        for i in icustays_df.iterrows()
                        if i[1]['first_careunit'] in _CAREUNITS or i[1]['last_careunit'] in _CAREUNITS]


# Concept 2 : cormobity - charlson

In [8]:
_table = 'charlson'

df = get_table(conn, query_schema_derived, _table)
df

Getting charlson data
Number of entries for charlson : 523740
Column names : ['subject_id', 'hadm_id', 'age_score', 'myocardial_infarct', 'congestive_heart_failure', 'peripheral_vascular_disease', 'cerebrovascular_disease', 'dementia', 'chronic_pulmonary_disease', 'rheumatic_disease', 'peptic_ulcer_disease', 'mild_liver_disease', 'diabetes_without_cc', 'diabetes_with_cc', 'paraplegia', 'renal_disease', 'malignant_cancer', 'severe_liver_disease', 'metastatic_solid_tumor', 'aids', 'charlson_comorbidity_index']


Unnamed: 0,subject_id,hadm_id,age_score,myocardial_infarct,congestive_heart_failure,peripheral_vascular_disease,cerebrovascular_disease,dementia,chronic_pulmonary_disease,rheumatic_disease,...,mild_liver_disease,diabetes_without_cc,diabetes_with_cc,paraplegia,renal_disease,malignant_cancer,severe_liver_disease,metastatic_solid_tumor,aids,charlson_comorbidity_index
0,19618591,20000200,3,1,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,6
1,11513052,20000626,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,13469890,20000750,3,0,0,0,1,0,1,0,...,0,1,0,0,0,0,0,0,0,6
3,12622652,20001121,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,18030855,20001633,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
523735,14351952,29999277,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
523736,17128602,29999326,3,0,1,0,0,0,1,0,...,1,1,0,0,0,1,0,0,0,9
523737,10053207,29999444,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
523738,10698563,29999501,4,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6


# Concept 3 : weights

From chartevents : 224639, 226512


In [9]:
_table = 'weight_durations'

df = get_table(conn, query_schema_derived, _table)
df

Getting weight_durations data
Number of entries for weight_durations : 287155
Column names : ['stay_id', 'starttime', 'endtime', 'weight', 'weight_type']


Unnamed: 0,stay_id,starttime,endtime,weight,weight_type
0,30000153,2174-09-29 10:09:00,2174-09-29 16:00:00,70.0,admit
1,30000153,2174-09-29 16:00:00,2174-10-01 05:26:10,73.0,daily
2,30000213,2162-06-21 03:38:00,2162-06-22 00:00:00,84.7,admit
3,30000213,2162-06-22 00:00:00,2162-06-22 22:52:48,73.7,daily
4,30000484,2136-01-14 15:23:32,2136-01-17 06:53:08,68.5,admit
...,...,...,...,...,...
287150,39906734,2174-08-30 10:28:40,2174-09-01 05:00:00,109.5,daily
287151,39927972,2183-11-15 15:44:26,2183-11-15 18:00:00,71.9,daily
287152,39928174,2136-03-10 22:14:02,2136-03-14 00:00:00,61.3,daily
287153,39939183,2129-07-16 05:37:25,2129-07-17 04:00:00,93.0,daily


# Concept 4 : sepsis

In [15]:
_table = 'suspicion_of_infection'

df = get_table(conn, query_schema_derived, _table)
df.dropna()

Getting suspicion_of_infection data
Number of entries for suspicion_of_infection : 819471
Column names : ['subject_id', 'stay_id', 'hadm_id', 'ab_id', 'antibiotic', 'antibiotic_time', 'suspected_infection', 'suspected_infection_time', 'culture_time', 'specimen', 'positive_culture']


Unnamed: 0,subject_id,stay_id,hadm_id,ab_id,antibiotic,antibiotic_time,suspected_infection,suspected_infection_time,culture_time,specimen,positive_culture
22,10000980,39765666.0,26913865,1,CeftriaXONE,2189-06-27 11:00:00,1,2189-06-27 07:40:00,2189-06-27 07:40:00,BLOOD CULTURE,0.0
23,10000980,39765666.0,26913865,2,Levofloxacin,2189-06-27 11:00:00,1,2189-06-27 07:40:00,2189-06-27 07:40:00,BLOOD CULTURE,0.0
114,10001884,37510196.0,26184834,21,Vancomycin,2131-01-11 08:00:00,1,2131-01-10 16:36:00,2131-01-10 16:36:00,URINE,0.0
115,10001884,37510196.0,26184834,22,Vancomycin,2131-01-11 08:00:00,1,2131-01-10 16:36:00,2131-01-10 16:36:00,URINE,0.0
116,10001884,37510196.0,26184834,23,Sulfameth/Trimethoprim SS,2131-01-11 08:00:00,1,2131-01-10 16:36:00,2131-01-10 16:36:00,URINE,0.0
...,...,...,...,...,...,...,...,...,...,...,...
819461,19999840,38978960.0,21033226,8,Vancomycin,2164-09-15 20:00:00,1,2164-09-14 12:13:00,2164-09-14 12:13:00,MRSA SCREEN,0.0
819462,19999840,38978960.0,21033226,9,Levofloxacin,2164-09-16 14:00:00,1,2164-09-14 12:13:00,2164-09-14 12:13:00,MRSA SCREEN,0.0
819463,19999840,38978960.0,21033226,10,Ampicillin Sodium,2164-09-16 15:00:00,1,2164-09-14 12:13:00,2164-09-14 12:13:00,MRSA SCREEN,0.0
819464,19999840,38978960.0,21033226,11,CeftriaXONE,2164-09-16 16:00:00,1,2164-09-14 12:13:00,2164-09-14 12:13:00,MRSA SCREEN,0.0


In [13]:
_table = 'sepsis3'

df = get_table(conn, query_schema_derived, _table)
df

Getting sepsis3 data
Number of entries for sepsis3 : 35010
Column names : ['subject_id', 'stay_id', 'antibiotic_time', 'culture_time', 'suspected_infection_time', 'sofa_time', 'sofa_score', 'respiration', 'coagulation', 'liver', 'cardiovascular', 'cns', 'renal', 'sepsis3']


Unnamed: 0,subject_id,stay_id,antibiotic_time,culture_time,suspected_infection_time,sofa_time,sofa_score,respiration,coagulation,liver,cardiovascular,cns,renal,sepsis3
0,18421337,30000484,2136-01-14 21:00:00,2136-01-14 18:10:00,2136-01-14 18:10:00,2136-01-14 19:00:00,3,0,0,0,0,3,0,True
1,12207593,30000646,2194-04-29 07:00:00,2194-04-29 01:00:00,2194-04-29 01:00:00,2194-04-29 11:00:00,3,2,0,0,1,0,0,True
2,16513856,30001446,2186-04-12 04:00:00,2186-04-11 08:20:00,2186-04-11 08:20:00,2186-04-12 04:00:00,8,0,3,3,0,0,2,True
3,10656173,30001555,2177-09-27 16:00:00,2177-09-27 07:21:00,2177-09-27 07:21:00,2177-09-27 12:00:00,8,0,3,4,0,1,0,True
4,17921898,30002415,2126-12-17 12:00:00,2126-12-16 15:05:00,2126-12-16 15:05:00,2126-12-17 12:00:00,4,2,2,0,0,0,0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35005,19046950,39998622,2135-02-17 10:00:00,2135-02-16 22:00:00,2135-02-16 22:00:00,2135-02-14 22:00:00,3,3,0,0,0,0,0,True
35006,15954569,39998871,2180-02-24 10:00:00,2180-02-24 04:05:00,2180-02-24 04:05:00,2180-02-24 08:00:00,3,0,0,0,3,0,0,True
35007,15669140,39999172,2185-02-17 20:00:00,2185-02-18 12:45:00,2185-02-17 20:00:00,2185-02-17 18:00:00,2,0,0,0,1,0,1,True
35008,13651601,39999230,2147-09-01 02:00:00,2147-08-31 20:59:00,2147-08-31 20:59:00,2147-08-31 23:00:00,2,0,0,0,1,0,1,True


# Concept 5 : score

In [16]:
_table = 'sofa'

df = get_table(conn, query_schema_derived, _table)
df

Getting sofa data
Number of entries for sofa : 6361478
Column names : ['stay_id', 'hr', 'starttime', 'endtime', 'pao2fio2ratio_novent', 'pao2fio2ratio_vent', 'rate_epinephrine', 'rate_norepinephrine', 'rate_dopamine', 'rate_dobutamine', 'meanbp_min', 'gcs_min', 'uo_24hr', 'bilirubin_max', 'creatinine_max', 'platelet_min', 'respiration', 'coagulation', 'liver', 'cardiovascular', 'cns', 'renal', 'respiration_24hours', 'coagulation_24hours', 'liver_24hours', 'cardiovascular_24hours', 'cns_24hours', 'renal_24hours', 'sofa_24hours']


Unnamed: 0,stay_id,hr,starttime,endtime,pao2fio2ratio_novent,pao2fio2ratio_vent,rate_epinephrine,rate_norepinephrine,rate_dopamine,rate_dobutamine,...,cardiovascular,cns,renal,respiration_24hours,coagulation_24hours,liver_24hours,cardiovascular_24hours,cns_24hours,renal_24hours,sofa_24hours
0,30000153,0,2174-09-29 12:00:00,2174-09-29 13:00:00,,,,,,,...,0.0,0.0,,0,0,0,0,0,0,0
1,30000153,1,2174-09-29 13:00:00,2174-09-29 14:00:00,,442.0,,,,,...,0.0,,,0,0,0,0,0,0,0
2,30000153,2,2174-09-29 14:00:00,2174-09-29 15:00:00,,526.0,,,,,...,,,,0,0,0,0,0,0,0
3,30000153,3,2174-09-29 15:00:00,2174-09-29 16:00:00,,,,,,,...,0.0,,0.0,0,0,0,0,0,0,0
4,30000153,4,2174-09-29 16:00:00,2174-09-29 17:00:00,,430.0,,,,,...,0.0,0.0,,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6361473,39999810,107,2115-12-05 12:00:00,2115-12-05 13:00:00,,,,,,,...,,,,0,0,0,0,1,0,1
6361474,39999810,108,2115-12-05 13:00:00,2115-12-05 14:00:00,,,,,,,...,,,,0,0,0,0,1,0,1
6361475,39999810,109,2115-12-05 14:00:00,2115-12-05 15:00:00,,,,,,,...,0.0,,,0,0,0,0,1,0,1
6361476,39999810,110,2115-12-05 15:00:00,2115-12-05 16:00:00,,,,,,,...,,0.0,,0,0,0,0,1,0,1


# Concept 6 : measurement

In [19]:
_table = 'bg'

df = get_table(conn, query_schema_derived, _table)
df

Getting bg data
Number of entries for bg : 561212
Column names : ['subject_id', 'hadm_id', 'charttime', 'specimen', 'specimen_pred', 'specimen_prob', 'so2', 'po2', 'pco2', 'fio2_chartevents', 'fio2', 'aado2', 'aado2_calc', 'pao2fio2ratio', 'ph', 'baseexcess', 'bicarbonate', 'totalco2', 'hematocrit', 'hemoglobin', 'carboxyhemoglobin', 'methemoglobin', 'chloride', 'calcium', 'temperature', 'potassium', 'sodium', 'lactate', 'glucose']


Unnamed: 0,subject_id,hadm_id,charttime,specimen,specimen_pred,specimen_prob,so2,po2,pco2,fio2_chartevents,...,hemoglobin,carboxyhemoglobin,methemoglobin,chloride,calcium,temperature,potassium,sodium,lactate,glucose
0,10000935,25849114.0,2187-10-22 15:40:00,,ART.,0.945735,,86.0,33.0,,...,,,,,,,,,2.8,
1,10000980,20897796.0,2193-08-14 21:41:00,,,0.562355,,30.0,40.0,,...,,,,,,,,,,
2,10001884,29678536.0,2130-10-10 09:31:00,,ART.,0.927595,,73.0,58.0,,...,,,,,,,,,,
3,10001884,,2130-10-19 13:58:00,,,0.000818,35.0,23.0,56.0,,...,,,,,,,,,1.6,
4,10001884,28664981.0,2130-11-29 00:15:00,,ART.,0.983265,,103.0,49.0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
561207,19999840,21033226.0,2164-09-17 13:34:00,,,0.000114,25.0,25.0,57.0,40.0,...,8.8,,,102.0,1.15,,4.2,134.0,4.0,275.0
561208,19999840,21033226.0,2164-09-17 13:39:00,,,0.000156,25.0,23.0,71.0,40.0,...,,,,102.0,1.10,,4.1,141.0,4.9,369.0
561209,19999987,,2145-11-02 20:27:00,,ART.,1.000000,,439.0,51.0,,...,,,,,,,,,,
561210,19999987,23865745.0,2145-11-03 05:28:00,,ART.,0.984502,,114.0,44.0,40.0,...,,,,,,,,,,


In [92]:
db_dir = os.path.abspath('') + "/../../../db"

(query_schema_core,
 query_schema_hosp,
 query_schema_icu,
 query_schema_derived,
 conn) = connect_to_database(db_dir)

_table = 'blood_differential'

_df = get_table(conn, query_schema_derived, _table)
_df.head(5)


Database: mimiciv
Username: mimiciv
>>>>> Connected to DB <<<<<
Getting bg data
Number of entries for bg : 561212
Column names : ['subject_id', 'hadm_id', 'charttime', 'specimen', 'specimen_pred', 'specimen_prob', 'so2', 'po2', 'pco2', 'fio2_chartevents', 'fio2', 'aado2', 'aado2_calc', 'pao2fio2ratio', 'ph', 'baseexcess', 'bicarbonate', 'totalco2', 'hematocrit', 'hemoglobin', 'carboxyhemoglobin', 'methemoglobin', 'chloride', 'calcium', 'temperature', 'potassium', 'sodium', 'lactate', 'glucose']



In [4]:
db_dir = os.path.abspath('') + "/../../../db"

(query_schema_core,
 query_schema_hosp,
 query_schema_icu,
 query_schema_derived,
 conn) = connect_to_database(db_dir)

query = query_schema_hosp + f"""
select itemid, valueuom
from labevents
"""

df = pd.read_sql_query(query, conn)
df


Database: mimiciv
Username: mimiciv
>>>>> Connected to DB <<<<<
