In [None]:
import pandas as pd
import os
import numpy as np
from data_prep_helper import simple_imputer, getSentences
import nltk
import re
import warnings
import spacy
from nltk import sent_tokenize, word_tokenize

In [None]:
##Helper functions
def simple_imputer(df):
    """Impute the 'mean' columns of a time-series frame and add mask / recency features.

    Works on a copy of ``df``. For every variable (first column level) the
    result contains three columns on the aggregation level:

    * ``mean`` - forward-filled within each ``ID_COLS`` group, then filled with
      the group mean, then with 0;
    * ``mask`` - 1.0 where the original ``count`` was > 0, else 0.0;
    * ``time_since_measured`` - number of rows without a measurement since the
      last row that had one (100 where no measurement has been seen yet).

    NOTE(review): ``groupby(ID_COLS)`` presumably resolves against index levels
    named subject_id / hadm_id / icustay_id - confirm against the caller.
    NOTE(review): a function with this name is also imported from
    ``data_prep_helper`` at the top of the notebook; this definition shadows it.

    Returns the new frame with its columns sorted.
    """
    idx = pd.IndexSlice
    df = df.copy()
    ID_COLS = ['subject_id', 'hadm_id', 'icustay_id']
    # Frames with more than two column levels are reduced by dropping the
    # 'label', 'LEVEL1' and 'LEVEL2' levels.
    if len(df.columns.names) > 2: df.columns = df.columns.droplevel(('label', 'LEVEL1', 'LEVEL2'))

    # Keep only the 'mean' and 'count' aggregations of every variable.
    # NOTE(review): df_out is a selection of df and is assigned into below;
    # pandas may emit SettingWithCopyWarning here - verify.
    df_out = df.loc[:, idx[:, ['mean', 'count']]]
    icustay_means = df_out.loc[:, idx[:, 'mean']].groupby(ID_COLS).mean()

    # Fill order: last observed value within the group -> group mean -> 0.
    df_out.loc[:, idx[:, 'mean']] = df_out.loc[:, idx[:, 'mean']].groupby(ID_COLS).fillna(
        method='ffill'
    ).groupby(ID_COLS).fillna(icustay_means).fillna(0)

    # Turn the observation counts into a 0/1 indicator and rename it to 'mask'.
    df_out.loc[:, idx[:, 'count']] = (df.loc[:, idx[:, 'count']] > 0).astype(float)
    df_out.rename(columns={'count': 'mask'}, level='Aggregation Function', inplace=True)

    # Running count of unobserved rows minus its value at the most recent
    # observed row gives the number of rows since the last measurement.
    # NOTE(review): cumsum() is applied to the whole frame, not per ID_COLS
    # group, so the running count carries over from one stay to the next -
    # confirm this is intended.
    is_absent = (1 - df_out.loc[:, idx[:, 'mask']])
    hours_of_absence = is_absent.cumsum()
    time_since_measured = hours_of_absence - hours_of_absence[is_absent == 0].fillna(method='ffill')
    time_since_measured.rename(columns={'mask': 'time_since_measured'}, level='Aggregation Function', inplace=True)

    # Rows before any observation have no reference point (NaN); they get 100.
    df_out = pd.concat((df_out, time_since_measured), axis=1)
    df_out.loc[:, idx[:, 'time_since_measured']] = df_out.loc[:, idx[:, 'time_since_measured']].fillna(100)

    df_out.sort_index(axis=1, inplace=True)
    return df_out


# Section headings recognised in a report. Each of these must be followed by a
# colon to count as a heading; "FINAL REPORT" is matched without a colon.
# Order matters only for readability: the regex engine backtracks to the
# longer alternative (e.g. "COMPARISON STUDY DATE") when the shorter one is
# not followed by ':'.
_SECTION_TITLE_NAMES = (
    'ABDOMEN AND PELVIS',
    'CLINICAL HISTORY',
    'CLINICAL INDICATION',
    'COMPARISON',
    'COMPARISON STUDY DATE',
    'EXAM',
    'EXAMINATION',
    'FINDINGS',
    'HISTORY',
    'IMPRESSION',
    'INDICATION',
    'MEDICAL CONDITION',
    'PROCEDURE',
    'REASON FOR EXAM',
    'REASON FOR STUDY',
    'REASON FOR THIS EXAMINATION',
    'TECHNIQUE',
)

SECTION_TITLES = re.compile(
    '(' + '|'.join(_SECTION_TITLE_NAMES) + '):|FINAL REPORT',
    flags=re.IGNORECASE | re.MULTILINE,
)


def getSentences(t):
    """Return every sentence produced by ``preprocess_mimic(t)`` as a list."""
    return [sentence for sentence in preprocess_mimic(t)]


def pattern_repl(matchobj):
    """
    Build the replacement for a regex match: a run of spaces as long as the
    matched text, so that character offsets in the surrounding string are kept.
    """
    matched_length = len(matchobj.group(0))
    # The result is never shorter than a single space, even for an empty match.
    return ' ' * max(matched_length, 1)


def clean_text(text):
    """
    Blank out de-identification placeholders, underscores and the trailing
    signature of a report while keeping the text length unchanged.

    Steps:
    1. every ``[**...**]`` placeholder is replaced via ``pattern_repl``;
    2. every ``_`` becomes a space;
    3. everything from the position reported by ``find_end`` onwards is
       replaced by spaces.

    Returns a string with the same length as ``text`` after steps 1-2.
    """
    # Replace [**Patterns**] with spaces.
    text = re.sub(r'\[\*\*.*?\*\*\]', pattern_repl, text)
    # Replace `_` with spaces.
    text = text.replace('_', ' ')

    # Cut at the end-of-report marker and pad back to the original length so
    # that character offsets stay valid.
    end = find_end(text)
    return text[:end].ljust(len(text))


def preprocess_mimic(text):
    """
    Lazily yield the preprocessed sentences of a MIMIC-III report.

    The report is cleaned with ``clean_text``, cut into sections with
    ``split_heading``, and each section is sentence-tokenized. Every sentence
    is word-tokenized, re-joined with single spaces and lowercased before it
    is yielded.
    """
    sections = split_heading(clean_text(text))
    for section in sections:
        for sentence in sent_tokenize(section):
            tokens = word_tokenize(sentence)
            yield ' '.join(tokens).lower()


def split_heading(text):
    """Yield the pieces of a report in order: section bodies and section titles.

    Each match of ``SECTION_TITLES`` contributes the text that precedes it
    (since the previous title) followed by the title itself; whatever follows
    the last title is yielded at the end. Pieces are stripped and empty pieces
    are skipped.
    """
    cursor = 0
    for title_match in SECTION_TITLES.finditer(text):
        title_start, title_end = title_match.span()
        # First the body that ran up to this heading, then the heading text.
        for piece in (text[cursor:title_start], text[title_start:title_end]):
            piece = piece.strip()
            if piece:
                yield piece
        cursor = title_end

    # Remainder after the final heading (or the whole text if none matched).
    tail = text[cursor:].strip()
    if tail:
        yield tail


# Markers whose first occurrence is treated as the start of trailing
# signature / boilerplate text. Compiled once instead of on every call.
_REPORT_END_PATTERNS = (
    re.compile(r'BY ELECTRONICALLY SIGNING THIS REPORT', re.I),
    # The dot is escaped so only a literal "DR." matches; an unescaped dot
    # also matched indented lines starting with e.g. "drain" or "dressing".
    re.compile(r'\n {3,}DR\.', re.I),
    re.compile(r'[ ]{1,}RADLINE ', re.I),
    # Leading ".*" makes the match start at the beginning of that line.
    re.compile(r'.*electronically signed on', re.I),
    re.compile(r'M\[0KM\[0KM'),
)


def find_end(text):
    """Find the end of the report.

    Returns the smallest start offset among the first matches of the known
    end-of-report markers, or ``len(text)`` when none of them occurs.
    """
    marker_starts = (
        found.start()
        for found in (pattern.search(text) for pattern in _REPORT_END_PATTERNS)
        if found
    )
    return min(marker_starts, default=len(text))

# TIMESERIES DATA

In [None]:
SEED = 10
MIMIC_EXTRACT_DATA = "../data/all_hourly_data.h5"

# NOTE(review): both time-series frames are read from the same "vitals_labs"
# key of the same file - confirm that a separate raw source is not intended.
ts_data_lvl2 = pd.read_hdf(MIMIC_EXTRACT_DATA, key="vitals_labs")
ts_data_raw = pd.read_hdf(MIMIC_EXTRACT_DATA, key="vitals_labs")
static_data = pd.read_hdf(MIMIC_EXTRACT_DATA, key='patients')

In [None]:
## Inclusion-exclusion criteria applied to the time series data:
## keep stays with more than 24 + 6 (gap) hours of data.
GAP_TIME = 6  # In hours
WINDOW_SIZE = 24  # In hours

# Explicit copy so the label columns are added to an independent frame rather
# than to a selection of static_data.
Ys = static_data.loc[
    static_data.max_hours > WINDOW_SIZE + GAP_TIME,
    ['mort_hosp', 'mort_icu', 'los_icu'],
].copy()
Ys['los_3'] = Ys['los_icu'] > 3
Ys['los_7'] = Ys['los_icu'] > 7
# astype returns a new frame; the result has to be assigned for the cast to
# take effect.
Ys = Ys.drop(columns=['los_icu']).astype(float)

# Restrict both time-series frames to the selected stays and to the first
# WINDOW_SIZE hours.
eligible_stays = set(Ys.index.get_level_values('icustay_id'))
lvl2, raw = [
    df[
        df.index.get_level_values('icustay_id').isin(eligible_stays)
        & (df.index.get_level_values('hours_in') < WINDOW_SIZE)
    ]
    for df in (ts_data_lvl2, ts_data_raw)
]
raw.columns = raw.columns.droplevel(level=['LEVEL2'])

In [None]:
## Split time series data into train/dev/test by subject
train_frac, dev_frac, test_frac = 0.7, 0.1, 0.2

lvl2_subj_idx = lvl2.index.get_level_values('subject_id')
raw_subj_idx = raw.index.get_level_values('subject_id')
Ys_subj_idx = Ys.index.get_level_values('subject_id')

# All three frames must describe exactly the same set of subjects.
lvl2_subjects = set(lvl2_subj_idx)
assert lvl2_subjects == set(Ys_subj_idx), "Subject ID pools differ!"
assert lvl2_subjects == set(raw_subj_idx), "Subject ID pools differ!"

np.random.seed(SEED)
subjects = np.random.permutation(list(lvl2_subjects))
N = len(lvl2_subjects)
N_train = int(train_frac * N)
N_dev = int(dev_frac * N)
N_test = int(test_frac * N)

# The test split takes whatever remains after train and dev.
train_subj = subjects[:N_train]
dev_subj = subjects[N_train:N_train + N_dev]
test_subj = subjects[N_train + N_dev:]

(lvl2_train, lvl2_dev, lvl2_test), (raw_train, raw_dev, raw_test), (Ys_train, Ys_dev, Ys_test) = [
    [
        frame[frame.index.get_level_values('subject_id').isin(subject_ids)]
        for subject_ids in (train_subj, dev_subj, test_subj)
    ]
    for frame in (lvl2, raw, Ys)
]

In [None]:
## Normalize time series data
idx = pd.IndexSlice

# Statistics come from the training split only and are applied to all splits.
train_mean_values = lvl2_train.loc[:, idx[:, 'mean']]
lvl2_means = train_mean_values.mean(axis=0)
lvl2_stds = train_mean_values.std(axis=0)

for df in (lvl2_train, lvl2_dev, lvl2_test):
    df.loc[:, idx[:, 'mean']] = (df.loc[:, idx[:, 'mean']] - lvl2_means) / lvl2_stds

# Impute each split, then build a flat (one row per stay) view of it.
lvl2_train, lvl2_dev, lvl2_test = list(map(simple_imputer, (lvl2_train, lvl2_dev, lvl2_test)))
lvl2_flat_train, lvl2_flat_dev, lvl2_flat_test = [
    imputed.pivot_table(index=['subject_id', 'hadm_id', 'icustay_id'], columns=['hours_in'])
    for imputed in (lvl2_train, lvl2_dev, lvl2_test)
]

# After imputation no split may contain missing values.
for df in (lvl2_train, lvl2_dev, lvl2_test):
    assert df.notnull().all().all()

# Labels are split by the same subject partitions.
Ys_train, Ys_dev, Ys_test = [
    Ys[Ys.index.get_level_values('subject_id').isin(subject_ids)]
    for subject_ids in (train_subj, dev_subj, test_subj)
]

In [None]:
# Report the (rows, columns) shape of each imputed split.
print("Shape of train, dev, test {}, {}, {}.".format(
    *(split.shape for split in (lvl2_train, lvl2_dev, lvl2_test))
))

In [37]:
## Save datasets in pickle files
timeseries_outputs = {
    "../data/timeseries/lvl2_imputer_train.pkl": lvl2_train,
    "../data/timeseries/lvl2_imputer_dev.pkl": lvl2_dev,
    "../data/timeseries/lvl2_imputer_test.pkl": lvl2_test,
    "../data/timeseries/Ys.pkl": Ys,
    "../data/timeseries/Ys_train.pkl": Ys_train,
    "../data/timeseries/Ys_dev.pkl": Ys_dev,
    "../data/timeseries/Ys_test.pkl": Ys_test,
}
for out_path, frame in timeseries_outputs.items():
    # Create the target directory on first use so a fresh checkout can run
    # the notebook end to end.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    pd.to_pickle(frame, out_path)

# Clinical Notes Data

In [None]:
# Raw MIMIC tables used for the clinical-notes pipeline.
admission_df, noteevents_df, icustays_df = [
    pd.read_csv(csv_path)
    for csv_path in ("../data/ADMISSIONS.csv", "../data/NOTEEVENTS.csv", "../data/ICUSTAYS.csv")
]

In [None]:
# First element of every Ys index entry.
# presumably the subject id (index looks like subject_id, hadm_id, icustay_id) - verify
patient_ids = [index_entry[0] for index_entry in Ys.index]

# Distinct note categories, then every category except discharge summaries.
note_categories = noteevents_df.groupby(noteevents_df.CATEGORY).agg(['count']).index
selected_note_types = [
    category for category in list(note_categories) if category != 'Discharge summary'
]

In [None]:
## Keep only the selected note types (discharge summaries are excluded to
## avoid information leak)
is_selected_type = noteevents_df['CATEGORY'].isin(selected_note_types)
sub_notes = noteevents_df[is_selected_type]

In [None]:
## Drop clinical notes with no chart times
# Vectorized null check instead of a Python-level loop over every note.
missing_chardate_index = sub_notes.index[sub_notes.CHARTTIME.isna()].tolist()
print("{} of notes does not have charttime.".format(len(missing_chardate_index)))

# Reassign rather than drop in place: sub_notes is a filtered selection of
# noteevents_df, and re-running the cell stays safe this way.
sub_notes = sub_notes.drop(index=missing_chardate_index)
print("After dropping no notes, the note shape is {}".format(sub_notes.shape))

In [None]:
## Keep only notes of cohort patients charted less than TIMELIMIT days after
## ICU admission (patients without such notes drop out)
TIMELIMIT = 1  # 1day
sub_notes = sub_notes[sub_notes.SUBJECT_ID.isin(patient_ids)]

new_static = static_data.reset_index().rename(
    columns={"subject_id": "SUBJECT_ID", "hadm_id": "HADM_ID"}
)
print("New Stats shape is {}".format(new_static.shape))
print("Sub note shape is {}".format(sub_notes.shape))

note_columns = ['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'CHARTTIME', 'CATEGORY', 'TEXT']
static_columns = ['SUBJECT_ID', 'HADM_ID', 'icustay_id', 'age', 'admittime', 'dischtime', 'deathtime',
                  'intime', 'outtime', 'los_icu', 'mort_icu', 'mort_hosp', 'hospital_expire_flag',
                  'hospstay_seq', 'max_hours']
# Both inputs carry HADM_ID but the join key is SUBJECT_ID only, so the result
# holds HADM_ID_x (from the notes) and HADM_ID_y (from the static table).
df_adm_notes = sub_notes[note_columns].merge(
    new_static[static_columns],
    on=['SUBJECT_ID'],
    how='left',
)

df_adm_notes['CHARTTIME'] = pd.to_datetime(df_adm_notes['CHARTTIME'])
# NOTE(review): there is only an upper bound, so notes charted before `intime`
# also pass this filter - confirm this is intended.
days_after_intime = (
    (df_adm_notes['CHARTTIME'] - df_adm_notes['intime']).dt.total_seconds() / (24 * 60 * 60)
)
df_less_n = df_adm_notes[days_after_intime < TIMELIMIT]
pd.to_pickle(df_less_n, "../data/clinical_notes/sub_notes.p")

In [None]:
# Process clinical notes
# Keep rows that have a subject, a chart time and text, and take an explicit
# copy so the new column is added to an independent frame.
sub_notes = df_less_n.dropna(subset=['SUBJECT_ID', 'CHARTTIME', 'TEXT'])
sub_notes = sub_notes[['SUBJECT_ID', 'HADM_ID_y', 'CHARTTIME', 'TEXT']].copy()

# One list of preprocessed sentences per note.
sub_notes['preprocessed_text'] = sub_notes['TEXT'].map(getSentences)
pd.to_pickle(sub_notes, "../data/clinical_notes/preprocessed_notes.p")