In [1]:
import numpy as np
import pandas as pd
import datetime
import copy
import time
import os
import re
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import operator

from tqdm.auto import tqdm, trange
from tqdm.notebook import tqdm
from datetime import timedelta

tqdm.pandas()

In [3]:
# Edit to point to your MIMIC directory.
dataDirStr = '/Users/gmessier/data/mimic-1.4/'

In [13]:
# Only load the first few rows since this is a big table.  Load the entire table for a full EDA.
chartevents_df = pd.read_csv(dataDirStr + "CHARTEVENTS.csv", nrows = 1000000) 
chartevents_df.columns = chartevents_df.columns.str.lower()
chartevents_df

Unnamed: 0,row_id,subject_id,hadm_id,icustay_id,itemid,charttime,storetime,cgid,value,valuenum,valueuom,warning,error,resultstatus,stopped
0,788,36,165660,241249.0,223834,2134-05-12 12:00:00,2134-05-12 13:56:00,17525.0,15.00,15.00,L/min,0,0,,
1,789,36,165660,241249.0,223835,2134-05-12 12:00:00,2134-05-12 13:56:00,17525.0,100.00,100.00,,0,0,,
2,790,36,165660,241249.0,224328,2134-05-12 12:00:00,2134-05-12 12:18:00,20823.0,0.37,0.37,,0,0,,
3,791,36,165660,241249.0,224329,2134-05-12 12:00:00,2134-05-12 12:19:00,20823.0,6.00,6.00,min,0,0,,
4,792,36,165660,241249.0,224330,2134-05-12 12:00:00,2134-05-12 12:19:00,20823.0,2.50,2.50,,0,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,1044187,10694,138159,294193.0,224697,2153-08-17 10:19:00,,,16.00,16.00,cmH2O,0,0,,
999996,1044188,10694,138159,294193.0,224738,2153-08-17 10:19:00,,,0.70,0.70,sec,0,0,,
999997,1044189,10694,138159,294193.0,224828,2153-08-17 10:19:00,2153-08-17 10:21:00,20889.0,-1.00,-1.00,mEq/L,0,0,,
999998,1044190,10694,138159,294193.0,225698,2153-08-17 10:19:00,2153-08-17 10:21:00,20889.0,22.00,22.00,mEq/L,0,0,,


`CHARTEVENTS` contains all the charted data available for a patient. During their ICU stay, the primary repository of a patient’s information is their electronic chart. The electronic chart displays patients’ routine vital signs and any additional information relevant to their care: ventilator settings, laboratory values, code status, mental status, and so on. 

Some data maybe frequently repeated within `CHARTEVENTS`, as some data may be desireable to display on a patient's electronic chart. (ex. `LABEVENTS`)

In [6]:
print(f"There are {chartevents_df.subject_id.nunique()} patients who have chart data")

There are 499 patients who have chart data


`itemid` is categorical data. It refers to the type of measurement taken. Each row is associated with one `itemid`, corresponds to an instantiation of the same measurement.

Refer to `D_ITEMS` table for exact definitions of each `itemid`.

In [7]:
c = chartevents_df.itemid.value_counts()[:5]
p = chartevents_df.itemid.value_counts(normalize=True).mul(100).round(2)[:5]
pd.concat([c,p], axis=1, keys=['counts', '%'])

Unnamed: 0,counts,%
220045,79685,7.97
220210,78880,7.89
220277,76870,7.69
220181,48096,4.81
220179,47954,4.8


`charttime` refers to when the observation was made, and `storetime` records the time when the observation was manually input or validated by clinical staff.

`cgid` is the identifier for the caregivers who validated the measurement, this is directly related to the `caregivers.parquet` table.

In [8]:
c = chartevents_df.cgid.value_counts().nlargest(5)
p = chartevents_df.cgid.value_counts(normalize=True).mul(100).round(2).nlargest(5)
pd.concat([c,p], axis=1, keys=['counts', '%'])

Unnamed: 0,counts,%
20889.0,127240,15.02
16586.0,6149,0.73
21108.0,5898,0.7
16302.0,4956,0.58
20622.0,4808,0.57


`value` and `valuenum` correspond to the value measured for `itemid`. If `value` is numeric, then `value` and `valuenum` are the exact same. If `value` is not numeric, then `valuenum` will be NULL. `valueuom` is the unit of measurement for `value`, if applicable. 

`warning` and `error` are binary valued columns, which specify if a warning or an error value occured for that measurement.

In [9]:
c = chartevents_df.warning.value_counts()
p = chartevents_df.warning.value_counts(normalize=True).mul(100).round(2)
pd.concat([c,p], axis=1, keys=['counts', '%'])

Unnamed: 0,counts,%
0,934366,93.44
1,65634,6.56


In [10]:
c = chartevents_df.error.value_counts()
p = chartevents_df.error.value_counts(normalize=True).mul(100).round(2)
pd.concat([c,p], axis=1, keys=['counts', '%'])

Unnamed: 0,counts,%
0,998079,99.81
1,1921,0.19


`resultstatus` and `stopped` specify the type of measurement (RESULTSTATUS is ‘Manual’ or ‘Automatic’) and whether the measurement was stopped.

In [11]:
c = chartevents_df.resultstatus.value_counts()
p = chartevents_df.resultstatus.value_counts(normalize=True).mul(100).round(2)
pd.concat([c,p], axis=1, keys=['counts', '%'])

Unnamed: 0,counts,%


In [12]:
c = chartevents_df.stopped.value_counts()
p = chartevents_df.stopped.value_counts(normalize=True).mul(100).round(2)
pd.concat([c,p], axis=1, keys=['counts', '%'])

Unnamed: 0,counts,%
