# MIMIC Pre-Processing Example
An example of how to prepare MIMIC data for use in a machine learning algorithm.  It takes the raw table data and builds a single table that combines static features (which do not change with time) and dynamic features (which do change with time) for the MIMIC patient population.  The outcome label used is mortality.

In [63]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
import os
import pickle 
from tqdm.auto import tqdm, trange
from tqdm.notebook import tqdm
tqdm.pandas()

In [2]:
# MIMIC CSV file location.  Set the MIMIC_DATA_DIR environment variable to
# override the default; os.path.join(..., '') guarantees a trailing separator
# because later cells build file names by plain string concatenation.
dataDirStr = os.path.join(os.environ.get('MIMIC_DATA_DIR', '/Users/gmessier/data/mimic-1.4/'), '')
# Cache directory for intermediate files.  Defaults to a 'cache' folder inside
# the data directory; override with MIMIC_CACHE_DIR.
cacheDirStr = os.path.join(os.environ.get('MIMIC_CACHE_DIR', dataDirStr + 'cache/'), '')

## Timestamped Events
---
Start by gathering all of our events of interest that have a time stamp.

### Service Type

In [3]:
srvTbl = pd.read_csv(dataDirStr + 'SERVICES.csv')
srvTbl.columns = srvTbl.columns.str.lower()

In [4]:
def convert_date_type(tbl,dateCols):
    """Parse the named columns of `tbl` as datetimes, modifying `tbl` in place.

    tbl - The table whose columns are converted.
    dateCols - Iterable of column names to run through pd.to_datetime.

    Returns nothing; each listed column is overwritten with its parsed version.
    """
    for name in dateCols:
        parsed = pd.to_datetime(tbl[name])
        tbl[name] = parsed

In [5]:
dateCols = [ 'transfertime' ]
convert_date_type(srvTbl,dateCols)

In [6]:
columnMap = {
    'subject_id': 'SubjectId', 
    'transfertime': 'Date',
    'curr_service': 'ServiceType'
}

In [7]:
def select_mimic_columns(tbl,mapper):
    """Pick columns out of a MIMIC table and give them new names.

    tbl - The MIMIC table.
    mapper - A map from the MIMIC column name to the name it should have
             in the returned table.  Only the columns listed here are kept.

    Returns a new DataFrame whose columns are the mapped names, in the
    order they appear in `mapper`.
    """
    renamed = { newName: tbl[oldName] for oldName, newName in mapper.items() }
    return pd.DataFrame(renamed)

In [8]:
events = select_mimic_columns(srvTbl,columnMap)

### Admission Type

In [9]:
admitTbl = pd.read_csv(dataDirStr + 'ADMISSIONS.csv')
admitTbl.columns = admitTbl.columns.str.lower()

In [10]:
dateCols = [ 'admittime', 'dischtime', 'deathtime', 'edregtime', 'edouttime' ]
convert_date_type(admitTbl,dateCols)

In [11]:
columnMap = {
    'subject_id': 'SubjectId', 
    'admittime': 'Date',
    'admission_type': 'AdmitType'
}

In [12]:
eventsNew = select_mimic_columns(admitTbl,columnMap)

In [13]:
events = pd.concat([ events, eventsNew ], ignore_index=True, sort=False)

In [14]:
events

Unnamed: 0,SubjectId,Date,ServiceType,AdmitType
0,471,2122-07-22 14:07:27,MED,
1,471,2122-07-26 18:31:49,TSURG,
2,472,2172-09-28 19:22:15,CMED,
3,473,2201-01-09 20:16:45,NB,
4,474,2181-03-23 08:24:41,NB,
...,...,...,...,...
132314,98800,2131-03-30 21:13:00,,EMERGENCY
132315,98802,2151-03-05 20:00:00,,EMERGENCY
132316,98805,2200-09-12 07:15:00,,ELECTIVE
132317,98813,2128-11-11 02:29:00,,EMERGENCY


### Chart Events

`CHARTEVENTS` is a massive table so only extract the events we're interested in.  Use the dask library to implement parallelized filtering of the table as it's read in from its csv file.

In [15]:
# The item ID numbers for the different chart events we're interested in.
chartEvents = {
    'GscVerbal': [ 723, 223900 ],
    'GscMotor': [ 454, 223901 ],
    'GscEyes': [ 184, 220739 ],
    'SystolicBloodPressure': [ 51, 442, 455, 6701, 220050, 220179 ],
    'HeartRate': [ 211, 220045 ],
    'Temperature': [ 676, 678, 223762, 223761 ],
    'BloodO2': [ 190, 3420, 3422, 223835 ]
}
    

In [16]:
# Flatten the per-event item ID lists into one list used to filter CHARTEVENTS.
allEvents = [ itemId for itemIds in chartEvents.values() for itemId in itemIds ]

In [17]:
# The filtered chart table is cached on disk; rebuild it only when the cache is missing.
chartCachePath = cacheDirStr + 'FirstAdmissionChartTable.parquet'

if os.path.isfile(chartCachePath):
    # Load the cache file if present.
    chartTbl = pd.read_parquet(chartCachePath)

else:
    # Parallelize reading the large CHARTEVENTS table using dask.
    chartCols = [ 'SUBJECT_ID', 'HADM_ID', 'CHARTTIME', 'ITEMID', 'VALUE' ]
    chartTblFull = dd.read_csv(
        dataDirStr + 'CHARTEVENTS.csv',
        usecols = chartCols,
        blocksize = 100e6,
        dtype={'VALUE': 'object'},
    )

    # Keep only the entries for the item ID numbers we're interested in.
    wantedRows = chartTblFull.ITEMID.isin(allEvents)
    with ProgressBar():
        chartTbl = chartTblFull.loc[wantedRows].compute()

    # Lower-case the column names, parse the timestamps and store every value as a string.
    chartTbl.columns = chartTbl.columns.str.lower()
    chartTbl['charttime'] = pd.to_datetime(chartTbl.charttime)
    chartTbl['value'] = chartTbl.value.astype('str')

    # parquet files work a lot better than HDF or CSV files for large tables.
    chartTbl.to_parquet(chartCachePath)



In [18]:
renameMap = {
    'subject_id': 'SubjectId', 
    'charttime': 'Date'
}

In [19]:
def extract_multiple_events(tbl,renameMap,eventMap,eventIds,value):
    """Separate multiple events from the same MIMIC column into separate columns.

    tbl - The MIMIC table.
    renameMap - A map used to rename columns that we want to directly copy over.
    eventMap - A map that names the different events we're interested in and
               provides their ID numbers.
    eventIds - The MIMIC column that contains the event IDs (ie. ID numbers for
               blood pressure readings, temperature, etc.).
    value - The MIMIC column that contains the value for each event (ie. the
            value of temperature or blood pressure).

    Returns a DataFrame on the same index as `tbl` with the renamed columns
    followed by one column per eventMap key.  An event column holds the row's
    `value` entry when the row's `eventIds` entry is one of that event's IDs,
    and NaN otherwise.
    """
    # Directly copy over the columns in renameMap.
    columns = { newName: tbl[oldName] for oldName, newName in renameMap.items() }

    # Create new columns named after the eventMap keys.  Series.where keeps the
    # value on matching rows and fills the rest with NaN, so nothing has to be
    # assigned into a float placeholder column (assigning strings there relies
    # on silent dtype upcasting, which newer pandas versions deprecate).
    for eventName, ids in eventMap.items():
        isEvent = tbl[eventIds].isin(ids)
        columns[eventName] = tbl[value].where(isEvent)

    return pd.DataFrame(columns)

In [20]:
eventsNew = extract_multiple_events(chartTbl,renameMap=renameMap,eventMap=chartEvents,eventIds='itemid',value='value')

In [21]:
events = pd.concat([ events, eventsNew ], ignore_index=True, sort=False)

In [66]:
chartTbl

Unnamed: 0,subject_id,hadm_id,itemid,charttime,value
1,36,165660,223835,2134-05-12 12:00:00,100
15,36,165660,220045,2134-05-12 13:00:00,86
16,36,165660,220179,2134-05-12 13:00:00,137
28,36,165660,220045,2134-05-12 14:00:00,85
29,36,165660,220179,2134-05-12 14:00:00,118
...,...,...,...,...,...
958772,99781,167791,223900,2133-07-28 04:37:00,Oriented
958773,99781,167791,223901,2133-07-28 04:37:00,Obeys Commands
958819,99781,167791,220739,2133-07-28 08:00:00,Spontaneously
958830,99781,167791,223900,2133-07-28 08:00:00,Oriented


### Output Events

In [22]:
output = pd.read_csv(dataDirStr + 'OUTPUTEVENTS.csv')
output.columns = output.columns.str.lower()

In [23]:
dateCols = [ 'charttime' ]
convert_date_type(output,dateCols)

In [24]:
outputEvents = {
    'Urine': [ 40055, 43175, 40069, 40715, 40473, 40085, 40057, 40056, 40405, 40428, 40086, 40096, 
             40651, 226559, 226560, 226561, 226584, 226563, 226564, 226565, 226567, 226557, 226558, 
             227488]
}

In [25]:
renameMap = {
    'subject_id': 'SubjectId', 
    'charttime': 'Date'
}

In [26]:
output = output.loc[output.itemid.isin(outputEvents['Urine'])]

In [27]:
eventsNew = extract_multiple_events(output,renameMap=renameMap,eventIds='itemid',value='value',eventMap=outputEvents)

In [28]:
events = pd.concat([ events, eventsNew ], ignore_index=True, sort=False)

### Adjust Data Column Types
Cast numerical columns to float, and encode each categorical string column as floating-point category codes (one integer-valued code per distinct string).

In [29]:
catColumns = [ 'ServiceType', 'AdmitType', 'GscVerbal', 'GscMotor', 'GscEyes' ]
numColumns = [ 'SystolicBloodPressure', 'HeartRate', 'Temperature', 'BloodO2', 'Urine' ]

In [30]:
# For every categorical column, number its distinct non-missing values in order
# of first appearance, remember that numbering in catMap, and replace the
# strings by their numbers (missing entries stay NaN).
catMap = {}
for col in tqdm(catColumns):
    present = ~events[col].isna()
    distinct = events.loc[present, col].unique()
    codes = { label: code for code, label in enumerate(distinct) }

    catMap[col] = codes

    events.loc[present, col] = events.loc[present, col].map(codes)
    events[col] = events[col].astype(float)

  0%|          | 0/5 [00:00<?, ?it/s]

In [31]:
# Blank out the literal '<NA>' placeholder strings, then cast each numeric column to float.
for col in tqdm(numColumns):
    isPlaceholder = events[col] == '<NA>'
    events.loc[isPlaceholder, col] = np.nan
    events[col] = events[col].astype(float)

  0%|          | 0/5 [00:00<?, ?it/s]

In [62]:
# Save the categorical string to number mapping.
# The with-block closes the file even if pickling raises part-way through.
with open(cacheDirStr + 'CategoricalVariableMapping.pkl', 'wb') as fp:
    pickle.dump(catMap, fp)

### Normalize Time
Express time as the elapsed time (a pandas `Timedelta`) since each patient's first event in the data (usually an admission event).

In [33]:
def calc_elapsed_time(tbl):
    """Replace the 'Date' column of one patient's events with elapsed time.

    tbl - Event rows with a 'Date' datetime column, optionally a 'SubjectId'
          column, and any number of feature columns.

    Returns a DataFrame sorted by date whose first column, 'Time', is the
    Timedelta between each row's date and the earliest date in `tbl`,
    followed by the feature columns.  'SubjectId' and 'Date' are left out.
    """
    ordered = tbl.sort_values(by='Date')
    elapsed = (ordered['Date'] - ordered['Date'].min()).rename('Time')

    # Drop the identifier and date columns by name rather than by position so
    # no feature column is lost when 'SubjectId' is not part of the frame
    # (for example when the grouping column is excluded from each group).
    features = ordered.drop(columns=['SubjectId', 'Date'], errors='ignore')

    return pd.concat([ elapsed, features ],axis=1)

In [34]:
# Run calc_elapsed_time separately on each patient's rows, with a progress bar
# (progress_apply is provided by the earlier tqdm.pandas() call).
events = events.groupby('SubjectId').progress_apply(calc_elapsed_time)

  0%|          | 0/46520 [00:00<?, ?it/s]

## Static Features
---
The following features are static for the duration of the data time span.

### Age
Due to the random offsets applied to each patient's timeline, age is calculated relative to date of first admission.

In [35]:
patientTbl = pd.read_csv(dataDirStr + 'PATIENTS.csv')
patientTbl.columns = patientTbl.columns.str.lower()

In [36]:
dateCols = [ 'dob', 'dod', 'dod_hosp', 'dod_ssn' ]
convert_date_type(patientTbl,dateCols)

In [37]:
def find_first_admission(tbl):
    """Return the 'hadm_id' and 'admittime' of the earliest admission in `tbl`.

    tbl - Admission rows that include 'hadm_id' and 'admittime' columns.

    Returns a Series with the two entries taken from the row that sorts first
    by 'admittime'.
    """
    byAdmitTime = tbl.sort_values(by='admittime')
    earliest = byAdmitTime.iloc[0]
    return earliest[[ 'hadm_id', 'admittime' ]]

In [38]:
firstAdmit = admitTbl.groupby('subject_id').progress_apply(find_first_admission)

  0%|          | 0/46520 [00:00<?, ?it/s]

In [39]:
# Age in completed years at each patient's first admission, indexed by subject_id.
patientIds = patientTbl.subject_id
patientAdmitTime = firstAdmit.loc[patientIds].admittime
patientDob = pd.Series(patientTbl.dob.values,index=patientIds)

# A plain difference of calendar years over-counts by one whenever the
# admission falls before that year's birthday, so subtract one in that case.
# month*100 + day orders dates within a year; .lt() aligns on subject_id.
admitMonthDay = patientAdmitTime.dt.month * 100 + patientAdmitTime.dt.day
birthMonthDay = patientDob.dt.month * 100 + patientDob.dt.day
beforeBirthday = admitMonthDay.lt(birthMonthDay).astype(int)

# NOTE(review): the MIMIC-III documentation says dates of birth are shifted for
# patients older than 89, which would produce ages near 300 here -- confirm
# whether those values should be capped before modelling.
age = pd.DataFrame({ 'Age': patientAdmitTime.dt.year - patientDob.dt.year - beforeBirthday })

In [40]:
staticFeatures = age

### Disease Diagnoses

In [41]:
diagnoses = pd.read_csv(dataDirStr + 'DIAGNOSES_ICD.csv')
diagnoses.columns = diagnoses.columns.str.lower()

In [42]:
# Limit to diagnoses on first admission.
firstAdmitDiag = diagnoses.hadm_id.isin(firstAdmit.hadm_id)
diagnoses = diagnoses.loc[firstAdmitDiag]

In [43]:
icdDict = pd.read_csv(dataDirStr + 'D_ICD_DIAGNOSES.csv')
icdDict.columns = icdDict.columns.str.lower()
icdDict = icdDict.sort_values(by = 'icd9_code', ascending=True)

In [44]:
diseaseCodes = {
    'Aids': [ '042' ]
}  

**Musa:** I use a different approach to select the ICD codes corresponding to malignancy.  These are based on the [ICD9 wikipedia page](https://en.wikipedia.org/wiki/List_of_ICD-9_codes) and should encompass both malignant cancers and malignant blood disorders.

In [45]:
malignantPrefix = [ '14', '15', '16', '17', '18', '19', '20' ]

In [46]:
# Gather every ICD-9 code in the dictionary that starts with a malignancy
# prefix, prefix by prefix, and register the list under 'Cancer'.
cancerIcd = [
    code
    for pfx in malignantPrefix
    for code in icdDict.loc[icdDict.icd9_code.str.contains(f'^{pfx}'), 'icd9_code']
]
diseaseCodes['Cancer'] = cancerIcd

In [47]:
def check_diagnoses(tbl,diagCode,diseaseCodes):
    """Flag which diseases are present among a table's diagnosis codes.

    tbl - Diagnosis rows (in this notebook, the rows of one patient).
    diagCode - Name of the column in `tbl` that holds the diagnosis codes.
    diseaseCodes - A map from disease name to the list of codes that count
                   as that disease.

    Returns a Series indexed by disease name that is True when at least one
    row of `tbl` carries one of that disease's codes.
    """
    codes = tbl[diagCode]
    flags = { name: codes.isin(wanted).sum() > 0 for name, wanted in diseaseCodes.items() }
    return pd.Series(flags)

In [48]:
diagScan = diagnoses.groupby('subject_id').progress_apply(
    check_diagnoses,diagCode='icd9_code',diseaseCodes=diseaseCodes
)

  0%|          | 0/46520 [00:00<?, ?it/s]

In [49]:
staticFeatures = pd.concat([ staticFeatures, diagScan ], axis=1)

## Merge Static and Timestamped Features

In [50]:
idx = events.index.get_level_values(0)

In [51]:
# Look up the static feature row for every label in idx, giving one static row
# per event row, then take over the events index so the two tables share
# identical row labels.
staticRepeatedTbl = staticFeatures.loc[idx]
staticRepeatedTbl.index = events.index

In [52]:
events = pd.concat([ events, staticRepeatedTbl ],axis=1)

In [53]:
events

Unnamed: 0_level_0,Unnamed: 1_level_0,Time,ServiceType,AdmitType,GscVerbal,GscMotor,GscEyes,SystolicBloodPressure,HeartRate,Temperature,BloodO2,Urine,Age,Aids,Cancer
SubjectId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2,73554,0 days 00:00:00,,2.0,,,,,,,,,0,False,False
2,368,0 days 00:19:42,3.0,,,,,,,,,,0,False,False
2,6461661,0 days 01:16:00,,,,,,,148.0,,,,0,False,False
2,6461662,0 days 01:26:00,,,,,,,131.0,,,,0,False,False
2,6461663,0 days 01:56:00,,,,,,,144.0,,,,0,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99999,22559585,2 days 10:41:52,,,8.0,,,,,,,,63,False,False
99999,22559586,2 days 10:41:52,,,,8.0,,,,,,,63,False,False
99999,6440611,2 days 11:10:52,,,,,,,86.0,,,,63,False,False
99999,6440612,2 days 11:10:52,,,,,,120.0,,,,,63,False,False


## Label Data

In [54]:
# Boolean mortality label per patient: True wherever expire_flag is non-zero.
diedFlags = patientTbl.expire_flag.values != 0
deathTbl = pd.DataFrame({ 'Death': diedFlags },index=patientTbl.subject_id)

In [55]:
# Attach the per-patient label to every event row, using the first level of the
# events index to look up each row's patient.
idx = events.index.get_level_values(0)
labelPerRow = deathTbl.loc[idx].Death
events['Death'] = labelPerRow.values

In [56]:
events

Unnamed: 0_level_0,Unnamed: 1_level_0,Time,ServiceType,AdmitType,GscVerbal,GscMotor,GscEyes,SystolicBloodPressure,HeartRate,Temperature,BloodO2,Urine,Age,Aids,Cancer,Death
SubjectId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2,73554,0 days 00:00:00,,2.0,,,,,,,,,0,False,False,False
2,368,0 days 00:19:42,3.0,,,,,,,,,,0,False,False,False
2,6461661,0 days 01:16:00,,,,,,,148.0,,,,0,False,False,False
2,6461662,0 days 01:26:00,,,,,,,131.0,,,,0,False,False,False
2,6461663,0 days 01:56:00,,,,,,,144.0,,,,0,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99999,22559585,2 days 10:41:52,,,8.0,,,,,,,,63,False,False,False
99999,22559586,2 days 10:41:52,,,,8.0,,,,,,,63,False,False,False
99999,6440611,2 days 11:10:52,,,,,,,86.0,,,,63,False,False,False
99999,6440612,2 days 11:10:52,,,,,,120.0,,,,,63,False,False,False


In [57]:
events.to_parquet(cacheDirStr + 'ElapsedTimeEvents.parquet')