The data was preprocessed by the authors of the original paper using their `DescEmb/preprocessing/preprocessing_main.py` script. Let's explore the resulting .pkl files and find a sensible way to load them. Preferably, loading should be done with the `pyhealth` library.

## Imports

In [1]:
import numpy as np
import pandas as pd
import pickle
from pathlib import Path
import pyhealth

## Data

In [2]:
# NOTE(review): absolute, machine-specific location of the preprocessed DescEmb
# pickles; adjust this path when running on another machine.
storage_path = Path("/data/DescEmb")


def load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    The file is opened in binary mode and closed again before returning.

    NOTE: ``pickle.load`` can execute arbitrary code while deserialising, so
    only call this on files that come from a trusted source.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Load and unpickle the three preprocessed tables.
eicu = load_pickle(storage_path / "eicu_df.pkl")
mimic = load_pickle(storage_path / "mimic_df.pkl")
pooled = load_pickle(storage_path / "pooled_df.pkl")

In [3]:
# Peek at the first five rows of the eICU frame.
eicu.head(n=5)

Unnamed: 0,patienthealthsystemstayid,gender,age,ethnicity,hospitalid,wardid,apacheadmissiondx,admissionheight,hospitaladmittime24,hospitaladmitoffset,...,los_3day,los_7day,diagnosisstring,ID,code_name,code_offset,value,uom,code_order,seq_len
0,128919,Female,70,Caucasian,59,91,"Rhythm disturbance (atrial, supraventricular)",152.4,15:54:00,0,...,0,0,[cardiovascular|chest pain / ASHD|coronary art...,141168,"[PT, PT - INR, PTT, platelets x 1000, -eos, WB...","[231, 231, 231, 516, 516, 516, 516, 516, 516, ...","[17.1, 1.7, 29.0, 209.0, 1.0, 9.8, 3.1, 19.0, ...","[sec, ratio, sec, K/mcL, %, K/mcL, g/dL, %, ,...","[2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",137
1,128941,Male,68,Caucasian,73,92,"Sepsis, renal/UTI (including bladder)",180.3,18:18:40,-780,...,1,0,,141194,"[creatinine, PT, AST (SGOT), platelets x 1000,...","[-923, -923, -923, -923, -923, -923, -923, -92...","[2.94, 12.1, 15.0, 298.0, 9.8, 15.7, 9.9, 0.1,...","[mg/dL, sec, Units/L, K/mcL, g/dL, %, mg/dL, m...","[50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 5...",325
2,128948,Female,77,Caucasian,66,90,"Arrest, respiratory (without cardiac arrest)",160.0,22:23:00,-1336,...,0,0,"[infectious diseases|skin, bone and joint infe...",141203,"[urinary specific gravity, chloride, creatinin...","[-1637, -1580, -1580, -1580, -1580, -1580, -15...","[1.026, 102.0, 0.33, 20.0, 6.6, 30.0, 31.4, 4....","[ , mmol/L, mg/dL, Units/L, g/dL, mmol/L, g/dL...","[7, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...",137
3,128968,Male,82,Caucasian,60,83,"Sepsis, pulmonary",185.4,19:19:00,-1004,...,0,0,[cardiovascular|shock / hypotension|sepsis],141227,"[urinary specific gravity, -eos, Hgb, potassiu...","[-2188, -1566, -1566, -1566, -1566, -1566, -15...","[1.025, 1.0, 8.0, 4.2, 134.0, 130.0, 48.2, 31....","[ , %, g/dL, mmol/L, mmol/L, mg/dL, K/mcL, mg/...","[9, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...",141
4,128973,Female,81,Caucasian,73,92,Mitral valve replacement,165.1,17:27:13,-1,...,1,1,,141233,"[ONDANSETRON 4 MG PO TBDP, O2 Sat (%), HCO3, T...","[-435, -355, -355, -355, -355, -355, -355, -35...","[3.0, 98.0, 23.0, 24.0, 2.0, 7.4, 37.0, 127.0,...","[ , %, mmol/L, , mEq/L, , mm Hg, mm Hg, mm H...","[18, 41, 41, 41, 41, 41, 41, 41, 63, 63, 63, 6...",555


In [4]:
# Peek at the first five rows of the MIMIC frame.
mimic.iloc[:5]

Unnamed: 0,SUBJECT_ID,ICUSTAY_ID,DBSOURCE,FIRST_CAREUNIT,LAST_CAREUNIT,FIRST_WARDID,LAST_WARDID,INTIME,OUTTIME,LOS,...,ICD9_CODE,12h_obs,24h_obs,ID,code_name,code_offset,value,uom,code_order,seq_len
0,58526,275225,metavision,MICU,MICU,52,52,2117-09-11 11:47:35,2117-09-15 17:57:14,4.2567,...,"[25013, 3371, 5849, 5780, V5867, 25063, 5363, ...",2117-09-11 23:47:35,2117-09-12 11:47:35,100001,"[Sodium, Potassium, Phosphate, Calcium, Total,...","[182, 182, 182, 182, 182, 182, 403, 403, 403, ...","[143.0, 4.2, 3.4, 8.7, 20.0, 1.9, 16.0, 9.0, 2...","[mEq/L, mEq/L, mg/dL, mg/dL, mEq/L, mg/dL, mEq...","[3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",22
1,54610,209281,metavision,MICU,MICU,50,50,2150-04-17 15:35:42,2150-04-19 14:12:52,1.9425,...,"[53100, 2851, 07054, 5715, 45621, 53789, 4019,...",2150-04-18 03:35:42,2150-04-18 15:35:42,100003,"[Anion Gap, Blue Top Hold, CK-MB Index, Creati...","[125, 125, 125, 125, 125, 125, 125, 125, 125, ...","[10.0, , 4.9, 1.2, 1.9, 3.8, 5.0, 6.7, 60.5, ...","[mEq/L, , %, mg/dL, mg/dL, mg/dL, mEq/L, %, %...","[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...",48
2,9895,291788,carevue,MICU,MICU,15,15,2108-04-06 15:50:15,2108-04-11 15:18:03,4.9776,...,"[49320, 51881, 486, 20300, 2761, 7850, 3090, V...",2108-04-07 03:50:15,2108-04-07 15:50:15,100006,"[pO2, pH, pCO2, Calculated Total CO2, Base Exc...","[241, 241, 241, 241, 241, 241, 397, 397, 397, ...","[189.0, 7.34, 48.0, 27.0, 0.0, , , 0.0, 69.0...","[mm Hg, units, mm Hg, mEq/L, mEq/L, , , mEq/...","[0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 1, 1, 1, ...",16
3,533,253656,metavision,CSRU,CSRU,15,15,2162-05-17 10:18:31,2162-05-19 22:05:14,2.4908,...,"[41401, 99604, 4142, 25000, 27800, V8535, 4148...",2162-05-17 22:18:31,2162-05-18 10:18:31,100009,"[Sodium, Whole Blood, Potassium, Whole Blood, ...","[120, 120, 120, 120, 120, 120, 120, 120, 120, ...","[139.0, 4.2, 7.4, 44.0, 14.6, 44.0, 1.26, 103....","[mEq/L, mEq/L, units, mm Hg, g/dL, %, mmol/L, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 2, 2, 2, 2, ...",83
4,55853,271147,metavision,SICU,SICU,33,33,2109-12-10 21:58:01,2109-12-12 12:13:20,1.594,...,"[1890, 1961, 1987, 1976, 27652]",2109-12-11 09:58:01,2109-12-11 21:58:01,100010,"[Ketorolac, Heparin, Epidural Bag, Phenylephri...","[121, 121, 121, 121, 121, 121, 121, 121, 121, ...","[15, 5000, 1, 60, 25, 12.5, 0.25, 250, 250, 50...","[mg, UNIT, BAG, mg, mcg, mg, mg, mL, mL, mL, m...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, ...",23


How are the three dataframes related?

In [5]:
# List the eICU column names so they can be compared with the MIMIC and pooled frames.
eicu.columns

Index(['patienthealthsystemstayid', 'gender', 'age', 'ethnicity', 'hospitalid',
       'wardid', 'apacheadmissiondx', 'admissionheight', 'hospitaladmittime24',
       'hospitaladmitoffset', 'hospitaladmitsource', 'hospitaldischargeyear',
       'hospitaldischargetime24', 'hospitaldischargeoffset',
       'hospitaldischargelocation', 'hospitaldischargestatus', 'unittype',
       'unitadmittime24', 'unitadmitsource', 'unitvisitnumber', 'unitstaytype',
       'admissionweight', 'dischargeweight', 'unitdischargetime24',
       'unitdischargeoffset', 'unitdischargelocation', 'unitdischargestatus',
       'uniquepid', 'readmission', 'mortality', 'losday', 'los_3day',
       'los_7day', 'diagnosisstring', 'ID', 'code_name', 'code_offset',
       'value', 'uom', 'code_order', 'seq_len'],
      dtype='object')

In [6]:
# List the MIMIC column names so they can be compared with the eICU and pooled frames.
mimic.columns

Index(['SUBJECT_ID', 'ICUSTAY_ID', 'DBSOURCE', 'FIRST_CAREUNIT',
       'LAST_CAREUNIT', 'FIRST_WARDID', 'LAST_WARDID', 'INTIME', 'OUTTIME',
       'LOS', 'GENDER', 'DOB', 'DOD', 'DOD_HOSP', 'DOD_SSN', 'EXPIRE_FLAG',
       'age', 'readmission', 'mortality', 'los_3day', 'los_7day', 'ICD9_CODE',
       '12h_obs', '24h_obs', 'ID', 'code_name', 'code_offset', 'value', 'uom',
       'code_order', 'seq_len'],
      dtype='object')

In [7]:
# List the pooled column names so they can be compared with the eICU and MIMIC frames.
pooled.columns

Index(['SUBJECT_ID', 'ICUSTAY_ID', 'DBSOURCE', 'FIRST_CAREUNIT',
       'LAST_CAREUNIT', 'FIRST_WARDID', 'LAST_WARDID', 'INTIME', 'OUTTIME',
       'LOS', 'GENDER', 'DOB', 'DOD', 'DOD_HOSP', 'DOD_SSN', 'EXPIRE_FLAG',
       'age', 'readmission', 'mortality', 'los_3day', 'los_7day', 'ICD9_CODE',
       '12h_obs', '24h_obs', 'ID', 'code_name', 'code_offset', 'value', 'uom',
       'code_order', 'seq_len', 'patienthealthsystemstayid', 'gender',
       'ethnicity', 'hospitalid', 'wardid', 'apacheadmissiondx',
       'admissionheight', 'hospitaladmittime24', 'hospitaladmitoffset',
       'hospitaladmitsource', 'hospitaldischargeyear',
       'hospitaldischargetime24', 'hospitaldischargeoffset',
       'hospitaldischargelocation', 'hospitaldischargestatus', 'unittype',
       'unitadmittime24', 'unitadmitsource', 'unitvisitnumber', 'unitstaytype',
       'admissionweight', 'dischargeweight', 'unitdischargetime24',
       'unitdischargeoffset', 'unitdischargelocation', 'unitdischargestatus',


In [8]:
# Look at the first ten entries of the eICU diagnosis text column.
eicu["diagnosisstring"].iloc[:10]


0    [cardiovascular|chest pain / ASHD|coronary art...
1                                                  NaN
2    [infectious diseases|skin, bone and joint infe...
3          [cardiovascular|shock / hypotension|sepsis]
4                                                  NaN
5                                                  NaN
6                                                  NaN
7                                                  NaN
8    [gastrointestinal|GI bleeding / PUD|upper GI b...
9                                                  NaN
Name: diagnosisstring, dtype: object

In [9]:
# Full list of diagnosis strings recorded for the row labelled 0.
eicu.loc[0, "diagnosisstring"]

['cardiovascular|chest pain / ASHD|coronary artery disease|known',
 'cardiovascular|ventricular disorders|cardiomyopathy',
 'pulmonary|disorders of the airways|COPD',
 'pulmonary|disorders of the airways|COPD',
 'cardiovascular|ventricular disorders|congestive heart failure',
 'cardiovascular|arrhythmias|atrial fibrillation|with hemodynamic compromise',
 'renal|disorder of kidney|chronic kidney disease',
 'cardiovascular|chest pain / ASHD|coronary artery disease|known',
 'cardiovascular|ventricular disorders|congestive heart failure',
 'cardiovascular|ventricular disorders|cardiomyopathy',
 'renal|disorder of kidney|chronic kidney disease',
 'cardiovascular|arrhythmias|atrial fibrillation|with hemodynamic compromise']

In [10]:
# Fraction of eICU rows without a diagnosis string (mean of the boolean NaN mask).
eicu["diagnosisstring"].isna().mean()

0.04136526817767776

In [11]:
# Number of rows in each of the three datasets, in the order (eicu, mimic, pooled).
tuple(len(frame) for frame in (eicu, mimic, pooled))

(149304, 43193, 192497)

In [12]:
# Does the pooled frame have exactly as many rows as eICU and MIMIC combined?
len(pooled) == len(eicu) + len(mimic)

True

Let's explore the five prediction tasks and find the columns associated with each one. The tasks are:
Dx (diagnosis prediction), mortality, length of stay (LOS) > 3 days, LOS > 7 days, and readmission.

In [18]:
# Binary label columns in eICU for the mortality, length-of-stay and readmission tasks.
eicu.loc[:, ["mortality", "los_3day", "los_7day", "readmission"]]



Unnamed: 0,mortality,los_3day,los_7day,readmission
0,1,0,0,0
1,0,1,0,0
2,0,0,0,0
3,0,0,0,0
4,0,1,1,0
...,...,...,...,...
149299,1,1,1,0
149300,0,0,0,0
149301,0,0,0,0
149302,0,1,1,0


In [22]:
# Show the MIMIC columns again while looking for the prediction-task label columns.
mimic.columns

Index(['SUBJECT_ID', 'ICUSTAY_ID', 'DBSOURCE', 'FIRST_CAREUNIT',
       'LAST_CAREUNIT', 'FIRST_WARDID', 'LAST_WARDID', 'INTIME', 'OUTTIME',
       'LOS', 'GENDER', 'DOB', 'DOD', 'DOD_HOSP', 'DOD_SSN', 'EXPIRE_FLAG',
       'age', 'readmission', 'mortality', 'los_3day', 'los_7day', 'ICD9_CODE',
       '12h_obs', '24h_obs', 'ID', 'code_name', 'code_offset', 'value', 'uom',
       'code_order', 'seq_len'],
      dtype='object')

In [31]:
# Show the pooled columns again while looking for the prediction-task label columns.
pooled.columns

Index(['SUBJECT_ID', 'ICUSTAY_ID', 'DBSOURCE', 'FIRST_CAREUNIT',
       'LAST_CAREUNIT', 'FIRST_WARDID', 'LAST_WARDID', 'INTIME', 'OUTTIME',
       'LOS', 'GENDER', 'DOB', 'DOD', 'DOD_HOSP', 'DOD_SSN', 'EXPIRE_FLAG',
       'age', 'readmission', 'mortality', 'los_3day', 'los_7day', 'ICD9_CODE',
       '12h_obs', '24h_obs', 'ID', 'code_name', 'code_offset', 'value', 'uom',
       'code_order', 'seq_len', 'patienthealthsystemstayid', 'gender',
       'ethnicity', 'hospitalid', 'wardid', 'apacheadmissiondx',
       'admissionheight', 'hospitaladmittime24', 'hospitaladmitoffset',
       'hospitaladmitsource', 'hospitaldischargeyear',
       'hospitaldischargetime24', 'hospitaldischargeoffset',
       'hospitaldischargelocation', 'hospitaldischargestatus', 'unittype',
       'unitadmittime24', 'unitadmitsource', 'unitvisitnumber', 'unitstaytype',
       'admissionweight', 'dischargeweight', 'unitdischargetime24',
       'unitdischargeoffset', 'unitdischargelocation', 'unitdischargestatus',


In [21]:
# Binary label columns in MIMIC for the mortality, length-of-stay and readmission tasks.
mimic.loc[:, ["mortality", "los_3day", "los_7day", "readmission"]]

Unnamed: 0,mortality,los_3day,los_7day,readmission
0,0,1,0,0
1,0,0,0,0
2,0,1,0,0
3,0,0,0,0
4,0,0,0,0
...,...,...,...,...
43188,0,1,1,0
43189,0,1,0,0
43190,0,0,0,0
43191,0,0,0,0


In [37]:
# Per-row lists of ICD-9 codes in MIMIC.
mimic["ICD9_CODE"]

0        [25013, 3371, 5849, 5780, V5867, 25063, 5363, ...
1        [53100, 2851, 07054, 5715, 45621, 53789, 4019,...
2        [49320, 51881, 486, 20300, 2761, 7850, 3090, V...
3        [41401, 99604, 4142, 25000, 27800, V8535, 4148...
4                          [1890, 1961, 1987, 1976, 27652]
                               ...                        
43188    [41031, 42821, 42731, 4271, 5180, 4240, 2760, ...
43189    [486, 4280, 51881, 3970, 496, 4169, 585, 42732...
43190    [4210, 7464, 42971, 30401, 4412, 44284, V1259,...
43191    [41401, 9971, 9975, 42731, 78820, 4111, V4582,...
43192    [48284, 51881, 5119, 5849, 2761, 78606, 79902,...
Name: ICD9_CODE, Length: 43193, dtype: object

In [36]:
# Compare the two diagnosis-like eICU fields for the row labelled 0:
# the APACHE admission diagnosis and the list of diagnosis strings.
eicu.loc[0, "apacheadmissiondx"], eicu.loc[0, "diagnosisstring"]

('Rhythm disturbance (atrial, supraventricular)',
 ['cardiovascular|chest pain / ASHD|coronary artery disease|known',
  'cardiovascular|ventricular disorders|cardiomyopathy',
  'pulmonary|disorders of the airways|COPD',
  'pulmonary|disorders of the airways|COPD',
  'cardiovascular|ventricular disorders|congestive heart failure',
  'cardiovascular|arrhythmias|atrial fibrillation|with hemodynamic compromise',
  'renal|disorder of kidney|chronic kidney disease',
  'cardiovascular|chest pain / ASHD|coronary artery disease|known',
  'cardiovascular|ventricular disorders|congestive heart failure',
  'cardiovascular|ventricular disorders|cardiomyopathy',
  'renal|disorder of kidney|chronic kidney disease',
  'cardiovascular|arrhythmias|atrial fibrillation|with hemodynamic compromise'])

In [38]:
# Same two eICU diagnosis fields, read from the pooled frame's row labelled 0.
pooled.loc[0, "apacheadmissiondx"], pooled.loc[0, "diagnosisstring"]

(nan, nan)

In [39]:
# ICD-9 codes stored for the pooled frame's row labelled 0.
pooled.loc[0, "ICD9_CODE"]

['25013',
 '3371',
 '5849',
 '5780',
 'V5867',
 '25063',
 '5363',
 '4580',
 '25043',
 '40390',
 '5853',
 '25053',
 '36201',
 '25083',
 '7078',
 'V1351']

In [41]:
# ICD9_CODE is a MIMIC-only column, so if pooled is a plain concatenation of the
# two frames, its number of missing ICD9_CODE values should equal the eICU row count.
# Equal counts are consistent with that, but do not by themselves show which rows are missing.
pooled["ICD9_CODE"].isna().sum() == len(eicu)

True

In [43]:
# Number of pooled rows without a diagnosis string; diagnosisstring is an
# eICU-only column, so rows that came from MIMIC are expected to be missing it.
pooled["diagnosisstring"].isna().sum()

49369