In [2]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd

import sys
# Make the project's local code directory importable from this notebook
# (the `mtl_patients` import below presumably resolves from here).
pathname = "../code/"
if pathname not in sys.path:
    # Append the same variable that the guard tested, so the membership check
    # and the inserted entry can never drift apart if the path is edited.
    sys.path.append(pathname)

from mtl_patients import get_summaries, run_mortality_prediction_task

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Let's work with the patients dataframe coming from the MIMIC-Extract pipeline.

In [18]:
# Input locations: the folder holding auxiliary files and the MIMIC-Extract HDF5 file.
mimic_data_folder = '../data/'
mimic_extract_filename = '../data/all_hourly_data.h5'

# Read the table stored under the 'patients' key of the HDF5 file.
patients_df = pd.read_hdf(mimic_extract_filename, key='patients')

Patients in total, with no exclusion criteria.

In [4]:
# Number of rows in the unfiltered patients table.
patients_df.shape[0]

34472

First criterion: how many patients have `NaN` values in `dischtime`, and how many in `deathtime`?

In [5]:
# Count the rows whose `dischtime` is missing.
patients_df['dischtime'].isna().sum()

0

In [7]:
# Count the rows whose `deathtime` is missing.
patients_df['deathtime'].isna().sum()

31122

So all patients have a `dischtime`, and 31122 patients have a null `deathtime` (meaning they were discharged alive). That leaves 34472 - 31122 = 3350 patients who died. Let's check that this is true.

In [10]:
# Keep only the two timestamp columns and drop every row in which either one is missing.
deathtimes_df = patients_df.loc[:, ['deathtime', 'dischtime']].dropna(axis=0, how='any')
# Remaining rows have both a death time and a discharge time.
deathtimes_df.shape[0]

3350

So, it is true. In MIMIC-Extract, 3350 patients died.

One of the criteria in the code is to discard those patients whose `dischtime` is earlier than `deathtime`. Let's see how many of those patients are in the data.

In [15]:
# Count the rows where the discharge time is strictly earlier than the death time.
len(deathtimes_df.loc[deathtimes_df['dischtime'].lt(deathtimes_df['deathtime'])])

26

The paper says that mortality is defined with a new *extended criterion*, which is met when any one of three conditions holds: death, CMO (Comfort Measures Only) or DNR (Do Not Resuscitate). Let's count how many patients have a CMO flag.

In [13]:
# Count the patients whose `cmo` value is positive.
len(patients_df.loc[patients_df['cmo'].gt(0)])

958

Let's see how many of these 958 patients with CMO also died.

In [14]:
# Count the patients with a positive `cmo` value that also have a recorded `deathtime`.
len(patients_df.loc[patients_df['cmo'].gt(0) & patients_df['deathtime'].notna()])

888

This means that 70 patients have a CMO condition but did not die (958 - 888), so the total mortality count with the extended definition of death and CMO is: 3350 - 26 + 70 = 3394.

In [16]:
# Keep the deaths whose discharge time is not earlier than the death time and
# flag every remaining row as a valid in-hospital death. `assign` returns a new
# frame, so `deathtimes_df` itself is left untouched.
deathtimes_valid_df = (
    deathtimes_df
    .loc[deathtimes_df['dischtime'].ge(deathtimes_df['deathtime'])]
    .assign(mort_hosp_valid=True)
)

In [17]:
# Number of deaths that pass the discharge-time consistency check.
deathtimes_valid_df.shape[0]

3324

3324 = 3350 - 26

In [19]:
# Load the code-status table and keep only the rows with a CMO note (cmo > 0).
cmo_df = (
    pd.read_csv(f'{mimic_data_folder}code_status.csv')
    .loc[lambda code_status: code_status['cmo'] > 0]
)

In [20]:
# Number of rows in the code-status table that carry a CMO note.
len(cmo_df.index)

1788

There are way more CMO patients in `code_status.csv` than in the MIMIC-Extract `patients` table (1788 vs 958). Let's see what happens when we merge them.

In [26]:
# Parse the DNR and CMO chart times so they can be compared as timestamps.
cmo_df['dnr_first_charttime'] = pd.to_datetime(cmo_df['dnr_first_charttime'])
cmo_df['timecmo_chart'] = pd.to_datetime(cmo_df['timecmo_chart'])
# Row-wise earliest of the two chart times.
cmo_df['cmo_df_min_time'] = cmo_df[['dnr_first_charttime', 'timecmo_chart']].min(axis=1)

# The outer join keeps stays that appear in only one of the two frames;
# afterwards only the columns needed for the mortality timing are retained.
all_mort_times_df = pd.merge(
    deathtimes_valid_df,
    cmo_df,
    on=['subject_id', 'hadm_id', 'icustay_id'],
    how='outer',
)[[
    'subject_id', 'hadm_id', 'icustay_id',
    'deathtime', 'dischtime', 'timecmo_chart', 'cmo', 'cmo_df_min_time',
]]
all_mort_times_df['deathtime'] = pd.to_datetime(all_mort_times_df['deathtime'])

In [27]:
# Number of rows produced by the outer merge.
all_mort_times_df.shape[0]

4227

In [28]:
# Show the merged frame; pandas renders only the first and last rows of a long frame.
all_mort_times_df

Unnamed: 0,subject_id,hadm_id,icustay_id,deathtime,dischtime,timecmo_chart,cmo,cmo_df_min_time
0,9,150750,220597,2149-11-14 10:15:00,2149-11-14 10:15:00,NaT,,NaT
1,12,112213,232669,2104-08-20 02:57:00,2104-08-20 02:57:00,NaT,,NaT
2,31,128652,254478,2108-08-30 15:00:00,2108-08-30 15:00:00,NaT,,NaT
3,56,181711,275642,2104-01-08 10:30:00,2104-01-08 10:30:00,NaT,,NaT
4,84,166401,239661,2196-04-17 13:42:00,2196-04-17 13:42:00,2196-04-17 11:00:00,1.0,2196-04-17 11:00:00
...,...,...,...,...,...,...,...,...
4222,28111,197728,287434,NaT,NaT,2136-04-29 09:30:00,1.0,2136-04-27 07:35:00
4223,7696,186225,278283,NaT,NaT,2179-12-05 16:35:00,1.0,2179-12-05 16:35:00
4224,31306,156146,289988,NaT,NaT,2132-07-07 13:50:00,1.0,2132-07-07 13:50:00
4225,17083,131212,265822,NaT,NaT,2103-06-06 19:00:00,1.0,2103-06-04 07:00:00


Let's see how many have CMO and death simultaneously.

In [30]:
# Count the rows that have both a recorded death time and a CMO flag.
# The `cmo > 0` comparison is parenthesised on purpose: `&` binds more tightly
# than `>`, so without the parentheses the mask would be parsed as
# `(deathtime.notnull() & cmo) > 0` instead of combining two boolean conditions.
len(all_mort_times_df[all_mort_times_df.deathtime.notnull() & (all_mort_times_df.cmo > 0)])

885

So:
* 885 have both conditions (death and CMO)
* 2439 have death (but not CMO)
* 903 have CMO (but not death)

In [32]:
# Make sure the DNR/CMO time is a timestamp, then take the row-wise earliest
# of it and the death time as the mortality time.
all_mort_times_df['cmo_df_min_time'] = pd.to_datetime(all_mort_times_df['cmo_df_min_time'])
all_mort_times_df['min_mort_time'] = all_mort_times_df[['deathtime', 'cmo_df_min_time']].min(axis=1)

# Attach the mortality time to every patient; the left join keeps all rows of
# patients_df, and patients without a match get NaT.
min_mort_time_df = all_mort_times_df.loc[:, ['subject_id', 'hadm_id', 'icustay_id', 'min_mort_time']]
static_df = patients_df.merge(
    min_mort_time_df,
    on=['subject_id', 'hadm_id', 'icustay_id'],
    how='left',
)
# A patient counts as a mortality case exactly when a mortality time exists.
static_df['mort_hosp_valid'] = ~np.isnat(static_df['min_mort_time'])

In [33]:
def _to_hours(delta):
    """Convert a single Timedelta into a float number of hours."""
    return delta.total_seconds() / 3600


# Hours elapsed from `intime` to the mortality time (NaN when there is none).
static_df['time_til_mort'] = (
    pd.to_datetime(static_df['min_mort_time']) - pd.to_datetime(static_df['intime'])
).apply(_to_hours)
# Hours elapsed from `intime` to `dischtime`.
static_df['time_in_icu'] = (
    pd.to_datetime(static_df['dischtime']) - pd.to_datetime(static_df['intime'])
).apply(_to_hours)

Total patients:

In [34]:
# The left merge should not have changed the number of patients.
static_df.shape[0]

34472

How many stayed at least 24 hours in the ICU:

In [35]:
# Count the patients whose `time_in_icu` is 24 hours or more.
len(static_df.loc[static_df['time_in_icu'].ge(24)])

33259

How many stayed at least 24 hours in the ICU and lived:

In [37]:
# Count the patients with `time_in_icu` of 24 hours or more that are not flagged as a mortality case.
len(static_df.loc[static_df['time_in_icu'].ge(24) & static_df['mort_hosp_valid'].eq(False)])

30392

How many died 36 hours or more after entering the ICU:

In [38]:
# Count the patients whose `time_til_mort` is 36 hours or more (NaN rows never match).
len(static_df.loc[static_df['time_til_mort'].ge(36)])

2321

Total number of selected patients, alive and dead, meeting the 24h + 12h criteria.

In [39]:
# Total cohort size, derived from the data instead of from numbers copied out of
# earlier cell outputs, so the sum stays correct if the input data changes.
# Survivors: `time_in_icu` of at least 24 hours and not flagged as a mortality case.
survived_24h_mask = (static_df.time_in_icu >= 24) & (static_df.mort_hosp_valid == False)
# Mortality cases: `time_til_mort` of at least 36 hours.
mort_36h_mask = static_df.time_til_mort >= 36
print(int(survived_24h_mask.sum()) + int(mort_36h_mask.sum()))

32713
