# MIMIC Time Windowing Example
The [MIMIC Pre-Processing Example](./MimicPreprocessingExample.ipynb) notebook generates a series of irregularly spaced, timestamped dynamic data features. This notebook demonstrates how to group those dynamic features into one or more regularly spaced time windows.

In [1]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
import os
from sklearn.preprocessing import LabelEncoder 

from tqdm.auto import tqdm, trange
from tqdm.notebook import tqdm
tqdm.pandas()

In [2]:
# Data locations.  Set the MIMIC_DATA_DIR environment variable to point at a
# different copy of the data; when it is unset the original default path is used.
# os.path.join(..., '') guarantees a trailing separator, which this notebook
# relies on when it builds file names by string concatenation.
dataDirStr = os.path.join(os.environ.get('MIMIC_DATA_DIR', '/Users/gmessier/data/mimic-1.4/'), '')   # MIMIC CSV file location.
cacheDirStr = os.path.join(dataDirStr, 'cache', '')  # Cache directory for intermediate files.

## Load Data
---

In [3]:
# Read the cached, irregularly sampled event table from the cache directory.
eventsPath = cacheDirStr + 'ElapsedTimeEvents.parquet'
events = pd.read_parquet(eventsPath)

In [4]:
# Display the loaded event table (the notebook renders a truncated view).
events

Unnamed: 0_level_0,Unnamed: 1_level_0,Time,ServiceType,AdmitType,GscVerbal,GscMotor,GscEyes,SystolicBloodPressure,HeartRate,Temperature,BloodO2,Urine,Age,Aids,Cancer,Death
SubjectId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2,73554,0 days 00:00:00,,2.0,,,,,,,,,0,False,False,False
2,368,0 days 00:19:42,3.0,,,,,,,,,,0,False,False,False
2,6461661,0 days 01:16:00,,,,,,,148.0,,,,0,False,False,False
2,6461662,0 days 01:26:00,,,,,,,131.0,,,,0,False,False,False
2,6461663,0 days 01:56:00,,,,,,,144.0,,,,0,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99999,22559585,2 days 10:41:52,,,8.0,,,,,,,,63,False,False,False
99999,22559586,2 days 10:41:52,,,,8.0,,,,,,,63,False,False,False
99999,6440611,2 days 11:10:52,,,,,,,86.0,,,,63,False,False,False
99999,6440612,2 days 11:10:52,,,,,,120.0,,,,,63,False,False,False


### Feature Types

In [5]:
# Categorical
catFtrs = [ 'ServiceType', 'AdmitType', 'GscVerbal', 'GscMotor', 'GscEyes' ] 

# Continous
contFtrs = [ 'SystolicBloodPressure', 'HeartRate', 'Temperature', 'BloodO2', 'Urine' ]

# Static
staticFtrs = [  'Age', 'Aids', 'Cancer', 'Death' ]

## Define Observation Window
---

In [6]:
# Observation window length, T_O, as a Timedelta of 24*60*60 seconds.
# Built with the `seconds=` keyword instead of the '86400S' string because the
# upper-case 'S' unit alias is deprecated in recent pandas releases.
tO = pd.Timedelta(seconds=24*60*60)

# Keep only the events whose elapsed time lies inside the observation window
# (upper boundary inclusive).
# NOTE(review): an event stamped exactly at tO is kept; when the data is later
# resampled it would presumably fall into an extra bin that starts at tO.
# Confirm that `<=` rather than `<` is intended.
events = events.loc[events.Time <= tO]

In [7]:
# Display the event table after restricting it to the observation window.
events

Unnamed: 0_level_0,Unnamed: 1_level_0,Time,ServiceType,AdmitType,GscVerbal,GscMotor,GscEyes,SystolicBloodPressure,HeartRate,Temperature,BloodO2,Urine,Age,Aids,Cancer,Death
SubjectId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2,73554,0 days 00:00:00,,2.0,,,,,,,,,0,False,False,False
2,368,0 days 00:19:42,3.0,,,,,,,,,,0,False,False,False
2,6461661,0 days 01:16:00,,,,,,,148.0,,,,0,False,False,False
2,6461662,0 days 01:26:00,,,,,,,131.0,,,,0,False,False,False
2,6461663,0 days 01:56:00,,,,,,,144.0,,,,0,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,6439849,0 days 23:30:39,,,,,,115.0,,,,,89,False,False,True
99995,6439302,0 days 23:30:39,,,,,,,72.0,,,,89,False,False,True
99995,25766653,0 days 23:30:39,,,,,,,,,,30.0,89,False,False,True
99999,66685,0 days 00:00:00,11.0,,,,,,,,,,63,False,False,False


## One Hot Encode Categorical Features
---

In [8]:
# Start from a frame with no columns that shares the index of `events`.
oneHot = pd.DataFrame(index=events.index)

In [9]:
# One-hot encode every categorical feature, using the feature name as the
# column prefix.  The dummy frames are collected first and concatenated in a
# single call, which avoids re-copying the growing frame on every iteration.
dummyFrames = [ pd.get_dummies(events[c], prefix=c) for c in tqdm(catFtrs) ]
oneHot = pd.concat([ oneHot ] + dummyFrames, axis=1)

  0%|          | 0/5 [00:00<?, ?it/s]

In [10]:
# Remember the names of the generated one-hot columns for the binning step.
oneHotCols = oneHot.columns.tolist()

In [11]:
# Swap the raw categorical columns for their one-hot encoded counterparts.
# Note: this cell cannot be re-run on its own, because the second drop() would
# no longer find the categorical columns.
eventsNoCat = events.drop(columns=catFtrs)
events = pd.concat([ eventsNoCat, oneHot ], axis=1)

## Resample Into Time Bins
---

In [12]:
# Bin size as a pandas frequency string: 60*60 seconds.
# The lower-case 's' alias is used because the upper-case 'S' frequency alias
# is deprecated in recent pandas releases.
tB = f'{60*60}s'

### One Hot Encoded Features
Sum up how many events fall into each time bin.

In [13]:
# For each level-0 index group, resample on the Time column with bin size tB
# and sum the one-hot indicator columns inside every bin.
oneHotEvents = events[['Time'] + oneHotCols]
eventsBin = (
    oneHotEvents
    .groupby(level=0)
    .resample(rule=tB, on='Time')
    .sum()
)

### Continuous Features
Take the average of the feature value for each time bin.

In [14]:
# For each level-0 index group, resample on the Time column with bin size tB
# and average the continuous features inside every bin.
contEvents = events[['Time'] + contFtrs]
eventsBinNew = (
    contEvents
    .groupby(level=0)
    .resample(rule=tB, on='Time')
    .mean()
)

In [15]:
# Place the columns of eventsBinNew next to the existing eventsBin columns.
eventsBin = pd.concat([eventsBin, eventsBinNew], axis=1)

### Static Features
Copy static features across all time bins since they don't change.

In [16]:
# One row of static features per level-0 index group: the first event row of each group.
staticEvents = events[staticFtrs]
staticTbl = staticEvents.groupby(level=0).apply(lambda grp: grp.iloc[0])

In [17]:
# Level-0 index value of every row in eventsBin (one entry per binned row).
idx = eventsBin.index.get_level_values(0)

In [18]:
# Look up each static feature by the level-0 key of every binned row and
# attach the resulting values to eventsBin as a new column.
for ftr in staticFtrs:
    ftrByKey = staticTbl[ftr]
    eventsBin[ftr] = ftrByKey.loc[idx].values

## Save Windowed Data

In [19]:
# Display the binned table before it is written to disk.
eventsBin

Unnamed: 0_level_0,Unnamed: 1_level_0,ServiceType_0.0,ServiceType_1.0,ServiceType_2.0,ServiceType_3.0,ServiceType_4.0,ServiceType_5.0,ServiceType_6.0,ServiceType_7.0,ServiceType_8.0,ServiceType_9.0,...,GscEyes_7.0,SystolicBloodPressure,HeartRate,Temperature,BloodO2,Urine,Age,Aids,Cancer,Death
SubjectId,Time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2,0 days 00:00:00,0,0,0,1,0,0,0,0,0,0,...,0,,,,,,0,False,False,False
2,0 days 01:00:00,0,0,0,0,0,0,0,0,0,0,...,0,,141.0,,,,0,False,False,False
2,0 days 02:00:00,0,0,0,0,0,0,0,0,0,0,...,0,,140.0,,,,0,False,False,False
3,0 days 00:00:00,0,0,0,0,0,0,0,0,0,1,...,0,86.250000,95.0,96.900002,,,76,False,False,True
3,0 days 01:00:00,0,0,0,0,0,0,0,0,0,0,...,0,86.333333,148.5,,1.0,,76,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99999,0 days 01:00:00,0,0,0,0,0,0,0,0,0,0,...,0,,,,,,63,False,False,False
99999,0 days 02:00:00,0,0,0,0,0,0,0,0,0,0,...,0,,,,,,63,False,False,False
99999,0 days 03:00:00,0,0,0,0,0,0,0,0,0,0,...,0,,,,,,63,False,False,False
99999,0 days 04:00:00,0,0,0,0,0,0,0,0,0,0,...,0,,,,,,63,False,False,False


In [20]:
# Write the binned table to the cache directory as a parquet file.
binnedPath = cacheDirStr + 'ElapsedTimeEventsBinned.parquet'
eventsBin.to_parquet(binnedPath)