In [None]:
import os
import numpy as np
import pandas as pd
import pickle
import re
import requests
import time

### Set the variables below to the input path of the MIMIC data and the output path for the processed data:

In [None]:
# Local path to the MIMIC-III dataset csv files. Export the MIMIC_DATA_PATH
# environment variable to point the notebook at your own copy without editing
# this cell; the original hard-coded location is kept as the fallback.
MIMIC_DATA_PATH = os.environ.get(
    "MIMIC_DATA_PATH",
    "/Users/ericahlgren/Documents/UIUC/CS598/Project/data/mimic-iii-clinical-database-1.4",
)
# Directory (relative to the working directory) that receives the pickled outputs.
OUT_PATH = "data/"

### Read the csvs with ICU visit and diagnosis information:

In [None]:
# ICD-9 codes are identifiers, not numbers: force them to be read as strings so
# that leading zeros survive and both merge keys share a single dtype.
icustays = pd.read_csv(os.path.join(MIMIC_DATA_PATH, "ICUSTAYS.csv"))
diagnoses = pd.read_csv(os.path.join(MIMIC_DATA_PATH, "DIAGNOSES_ICD.csv"),
                        dtype={"ICD9_CODE": str})
d_icd = pd.read_csv(os.path.join(MIMIC_DATA_PATH, "D_ICD_DIAGNOSES.csv"),
                    dtype={"ICD9_CODE": str})

### Tokenize the diagnosis descriptions and merge into one DataFrame:

In [None]:
# Normalise each long title into lowercase word tokens: hyphens become spaces,
# then punctuation and digits are removed. The patterns are raw strings so that
# \w, \s and \d reach the regex engine without invalid-escape warnings.
d_icd['LONG_TITLE_REPL'] = (d_icd.LONG_TITLE.str.replace('-', ' ', regex=False)
                            .str.replace(r'[^\w\s]', '', regex=True)
                            .str.replace(r'[\d]', '', regex=True)
                            .str.lower())
# Attach the code descriptions to every diagnosis row, then attach those rows to
# the ICU stays that share the same HADM_ID.
diag_desc = pd.merge(diagnoses, d_icd, on="ICD9_CODE")
icu_diag_merge = pd.merge(icustays, diag_desc, on="HADM_ID")
# SUBJECT_ID exists on both sides of the merge and is suffixed; keep the copy
# that came from the ICU stays table.
icu_diag_merge = icu_diag_merge.drop('SUBJECT_ID_y', axis=1)
icu_diag_merge = icu_diag_merge.rename({'SUBJECT_ID_x': 'SUBJECT_ID'}, axis=1)
icu_diag_merge = icu_diag_merge.dropna(subset=['ICD9_CODE'])
# Keep only patients that have more than one distinct ICU stay.
icu_diag_merge = icu_diag_merge.loc[
    icu_diag_merge.groupby('SUBJECT_ID')['ICUSTAY_ID'].transform('nunique') > 1
]

### Define function to format ICD-9 codes with correct decimal placement:

In [None]:
def insert_decimal(row):
    """Return the row's ICD-9 code with its decimal point inserted.

    The category part is the first four characters for codes starting with
    'E' and the first three characters otherwise. The decimal point is only
    added when characters remain after the category, so a bare category code
    such as '311' is returned unchanged instead of as '311.'.

    Parameters
    ----------
    row : object with an ``ICD9_CODE`` attribute (e.g. a DataFrame row).

    Returns
    -------
    str
        The dotted code, e.g. '4019' -> '401.9', 'E8798' -> 'E879.8'.
    """
    icd_str = str(row.ICD9_CODE)
    split_at = 4 if icd_str.startswith('E') else 3
    category, detail = icd_str[:split_at], icd_str[split_at:]
    if not detail:
        return category
    return category + '.' + detail

# Store the dotted form of every code, then keep the text before the first dot
# as the code's category.
icu_diag_merge['ICD9_CODE_D'] = icu_diag_merge.apply(insert_decimal, axis='columns')
icu_diag_merge['ICD_CAT'] = icu_diag_merge['ICD9_CODE_D'].map(
    lambda dotted_code: dotted_code.split('.')[0]
)

### Web scraper for extracting category-level binning and descriptions

In [None]:
# Download the icd9data.com 2015 Volume 1 index page. The timeout keeps a stalled
# connection from hanging the notebook, and raise_for_status() prevents an HTTP
# error page from being parsed as if it were the index.
page = requests.get("http://www.icd9data.com/2015/Volume1/default.htm", timeout=30)
page.raise_for_status()
# Top-level entries: (code range, description) pairs taken from the range links.
categories = re.findall(
    r'\<a href="/2015/Volume1/([0-9VE]{3,4}-[0-9VE]{3,4})/default.htm".*?alt="" /> ([a-zA-Z \-,]+)</li',
    page.text)
icd_category_df = pd.DataFrame(categories, columns=['ICD_RANGE', 'ICD_CATEGORY_DESC'])
# Sub-entries: (code range, description) pairs taken from the HTML-escaped
# "identifier" spans embedded in the page.
sub_categories = re.findall(
    r'class=&quot;identifier&quot;&gt;([0-9VE\-].*?)&lt;/span&gt; ([a-zA-Z \-,]+)&lt',
    page.text)
if not sub_categories:
    # Without this check every diagnosis row would later be left without a
    # subcategory and be dropped, with no hint about the cause.
    raise RuntimeError(
        "No ICD-9 subcategories were parsed from the downloaded page; "
        "its markup may have changed."
    )
icd_subcategory_df = pd.DataFrame(sub_categories, columns=['ICD_SUBRANGE', 'ICD_SUBCATEGORY_DESC'])
# Same text normalisation as the code descriptions: hyphens to spaces, strip
# punctuation, lowercase.
icd_subcategory_df['ICD_SUBCATEGORY_DESC_REPL'] = (
    icd_subcategory_df.ICD_SUBCATEGORY_DESC.str.replace('-', ' ', regex=False)
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.lower()
)
# Expose each subcategory's row position as an explicit ICD_SUBCATEGORY_INDEX column.
icd_subcategory_df = icd_subcategory_df.reset_index().rename({'index': 'ICD_SUBCATEGORY_INDEX'}, axis=1)

### Assign a subcategory index value to every ICD-9 code in every visit

In [None]:
def get_subcategory_index(row, subcategory_df=None):
    """Return the index label of the first subcategory range containing the row's category.

    A category only competes against ranges of its own kind: 'E' categories
    against ranges starting with 'E', 'V' categories against ranges starting
    with 'V', and numeric categories against all remaining ranges. A range is
    either 'low-high' or a single value, which is treated as low == high.

    Parameters
    ----------
    row : object with an ``ICD_CAT`` attribute (e.g. a DataFrame row).
    subcategory_df : DataFrame with an ``ICD_SUBRANGE`` column, optional
        Defaults to the module-level ``icd_subcategory_df``.

    Returns
    -------
    The index label of the matching row of ``subcategory_df``, or None when
    no range contains the category.

    Raises
    ------
    ValueError
        If the category or a candidate range bound is not numeric after its
        letter prefix is removed.
    """
    if subcategory_df is None:
        subcategory_df = icd_subcategory_df

    icd_cat = str(row.ICD_CAT)
    prefix = icd_cat[:1] if icd_cat[:1] in ('E', 'V') else ''
    try:
        cat_number = int(icd_cat)
    except ValueError:
        # Lettered categories: the number is everything after the first character.
        cat_number = int(icd_cat[1:])

    for idx, subrange in subcategory_df.ICD_SUBRANGE.items():
        range_prefix = subrange[:1] if subrange[:1] in ('E', 'V') else ''
        if range_prefix != prefix:
            continue
        bounds = [int(bound[len(prefix):]) for bound in subrange.split('-')]
        if bounds[0] <= cat_number <= bounds[-1]:
            return idx
    return None

# get_subcategory_index only reads ICD_CAT, so resolve each distinct category once
# instead of once per diagnosis row.
unique_cats = icu_diag_merge[['ICD_CAT']].drop_duplicates()
cat_to_index = dict(zip(unique_cats.ICD_CAT, unique_cats.apply(get_subcategory_index, axis=1)))
icu_diag_merge['ICD_SUBCATEGORY_INDEX'] = icu_diag_merge.ICD_CAT.map(cat_to_index)

# Rows whose category matched no range have a missing index. The inner merge below
# would discard them anyway; drop them explicitly and report how many, so the
# loss is visible and the column can be a true integer (it is later used to
# index Python lists).
unmatched = icu_diag_merge.ICD_SUBCATEGORY_INDEX.isna()
if unmatched.any():
    print(f"Dropping {int(unmatched.sum()):,} diagnosis rows whose category matched no subcategory range")
icu_diag_merge = icu_diag_merge.loc[~unmatched].copy()
icu_diag_merge['ICD_SUBCATEGORY_INDEX'] = icu_diag_merge.ICD_SUBCATEGORY_INDEX.astype('int64')
icu_diag_merge = pd.merge(icu_diag_merge, icd_subcategory_df, on='ICD_SUBCATEGORY_INDEX')

### Create DataFrame of ICD-9 codes and associated lists

In [None]:
# One row per distinct ICD-9 code, ordered by code; a code's position in that
# ordering becomes its ICD9_CODE_INDEX.
icd9_df = (
    pd.DataFrame(icu_diag_merge, columns=["ICD9_CODE", "LONG_TITLE_REPL"])
    .drop_duplicates(subset="ICD9_CODE")
    .sort_values("ICD9_CODE")
    .reset_index(drop=True)
    .reset_index()
    .rename(columns={'index': 'ICD9_CODE_INDEX'})
)

# Parallel lists: icd9[i] is the code at index i and icd9_text[i] its cleaned title.
icd9 = icd9_df["ICD9_CODE"].tolist()
icd9_text = icd9_df["LONG_TITLE_REPL"].tolist()

### Merge the ICD-9 code indices into the ICU diagnosis DataFrame and store it with the MIMIC data

In [None]:
# Give every diagnosis row the integer index of its ICD-9 code.
icu_diag_merge = pd.merge(icu_diag_merge, icd9_df.drop(columns=['LONG_TITLE_REPL']), on='ICD9_CODE')
# The construct_* helpers below take each patient's visit order from the row order
# of this frame, so order the stays by ICU admission time rather than by id.
# NOTE(review): assumes ICUSTAYS.csv supplies an INTIME column with sortable
# timestamps and that ICUSTAY_ID values are not chronological - confirm against
# the MIMIC-III schema. Without that column the previous id-based order is used.
if 'INTIME' in icu_diag_merge.columns:
    sort_keys = ['SUBJECT_ID', 'INTIME', 'ICUSTAY_ID']
else:
    sort_keys = ['SUBJECT_ID', 'ICUSTAY_ID']
icu_diag_merge = icu_diag_merge.sort_values(sort_keys)
icu_diag_merge.to_csv(os.path.join(MIMIC_DATA_PATH, "icu_diag_merge.csv"))

### Define functions to calculate summary statistics and review results

In [None]:
def get_description_vocab(input_df, col='LONG_TITLE_REPL'):
    """Collect the set of whitespace-separated words used in a text column.

    Parameters
    ----------
    input_df : DataFrame containing the column ``col``.
    col : name of the column holding the description strings.

    Returns
    -------
    set of str
        Every distinct word found in the distinct values of ``col``.
    """
    return {
        word
        for description in input_df[col].unique()
        for word in description.split()
    }

In [None]:
def calculate_summary_stats(input_df):
    """Print patient, visit, code, vocabulary and category statistics for a frame.

    ``input_df`` must provide the columns SUBJECT_ID, ICUSTAY_ID, ICD9_CODE,
    ICD_SUBCATEGORY_INDEX and LONG_TITLE_REPL. Averages are rounded to two
    decimals. Nothing is returned; the figures are written with ``print``.
    """
    # Distinct codes / categories per ICU stay, computed once and reused for the
    # mean and the max.
    per_visit = input_df.groupby('ICUSTAY_ID')
    codes_per_visit = per_visit.ICD9_CODE.nunique()
    cats_per_visit = per_visit.ICD_SUBCATEGORY_INDEX.nunique()

    patient_count = input_df.SUBJECT_ID.nunique()
    visit_count = input_df.ICUSTAY_ID.nunique()
    visits_per_patient = np.round(visit_count / patient_count, 2)
    code_count = input_df.ICD9_CODE.nunique()
    avg_codes = np.round(codes_per_visit.mean(), 2)
    max_codes = codes_per_visit.max()
    word_count = len(get_description_vocab(input_df))
    category_count = input_df.ICD_SUBCATEGORY_INDEX.nunique()
    avg_cats = np.round(cats_per_visit.mean(), 2)
    max_cats = cats_per_visit.max()

    print(f"# of patients: \t\t\t\t{patient_count:,}")
    print(f"# of visits: \t\t\t\t{visit_count:,}")
    print(f"Avg. visits per patient: \t\t{visits_per_patient}")
    print(f"# of unique ICD9 codes: \t\t{code_count:,}")
    print(f"Avg. # of diagnosis codes per visit: \t{avg_codes}")
    print(f"Max # of diagnosis codes per visit: \t{max_codes}")
    print(f"# of words in code descriptions: \t{word_count:,}")
    print(f"# of category codes: \t\t\t{category_count:,}")
    print(f"Avg. # of category codes per visit: \t{avg_cats}")
    print(f"Max # of category codes per visit: \t{max_cats}")

In [None]:
# Print the summary statistics for the merged ICU-stay / diagnosis table.
calculate_summary_stats(icu_diag_merge)

### Define functions to extract lists of seqs/targets for various training methods

In [None]:
def construct_targets(pids):
    """Build, per patient and per ICU stay, the list of distinct subcategory indices.

    Reads the module-level ``icu_diag_merge``. Stays are taken in the order in
    which their ICUSTAY_ID first appears among the patient's rows.

    Returns a list (one entry per id in ``pids``) of lists (one per stay) of
    the distinct ICD_SUBCATEGORY_INDEX values of that stay.
    """
    seqs = []
    for pid in pids:
        patient_rows = icu_diag_merge[icu_diag_merge.SUBJECT_ID == pid]
        seqs.append([
            patient_rows[patient_rows.ICUSTAY_ID == stay_id].ICD_SUBCATEGORY_INDEX.unique().tolist()
            for stay_id in patient_rows.ICUSTAY_ID.unique()
        ])
    return seqs

In [None]:
def construct_seqs(pids):
    """Build per-patient sequences of ICD-9 code indices plus visit positions.

    Reads the module-level ``icu_diag_merge``. Stays are taken in the order in
    which their ICUSTAY_ID first appears among the patient's rows.

    Returns
    -------
    (seqs, vids)
        ``seqs[p][v]`` is the list of distinct ICD9_CODE_INDEX values of
        patient p's v-th stay; ``vids[p]`` is ``[0, 1, ..., n_stays - 1]``.
    """
    seqs, vids = [], []
    for pid in pids:
        patient_rows = icu_diag_merge[icu_diag_merge.SUBJECT_ID == pid]
        stay_ids = patient_rows.ICUSTAY_ID.unique()
        vids.append(list(range(len(stay_ids))))
        seqs.append([
            patient_rows[patient_rows.ICUSTAY_ID == stay_id].ICD9_CODE_INDEX.unique().tolist()
            for stay_id in stay_ids
        ])
    return seqs, vids

In [None]:
def construct_text_seqs(pids):
    """Build, per patient and per ICU stay, the list of distinct cleaned code titles.

    Reads the module-level ``icu_diag_merge``. Stays are taken in the order in
    which their ICUSTAY_ID first appears among the patient's rows.

    Returns a list (one entry per id in ``pids``) of lists (one per stay) of
    the distinct LONG_TITLE_REPL strings of that stay.
    """
    seqs = []
    for pid in pids:
        patient_rows = icu_diag_merge[icu_diag_merge.SUBJECT_ID == pid]
        per_stay_titles = []
        for stay_id in patient_rows.ICUSTAY_ID.unique():
            stay_rows = patient_rows[patient_rows.ICUSTAY_ID == stay_id]
            per_stay_titles.append(stay_rows.LONG_TITLE_REPL.unique().tolist())
        seqs.append(per_stay_titles)
    return seqs

In [None]:
def construct_seqs_multihot(pids):
    """Build per-patient sequences of multi-hot ICD-9 code vectors plus visit positions.

    Reads the module-level ``icu_diag_merge`` and ``icd9_df``. Each stay becomes
    a list of ``len(icd9_df)`` integers holding 1 at every ICD9_CODE_INDEX
    recorded for that stay and 0 elsewhere.

    Returns
    -------
    (seqs, vids)
        ``seqs[p][v]`` is the multi-hot list of patient p's v-th stay;
        ``vids[p]`` is ``[0, 1, ..., n_stays - 1]``.
    """
    seqs, vids = [], []
    for pid in pids:
        patient_rows = icu_diag_merge[icu_diag_merge.SUBJECT_ID == pid]
        patient_seqs, patient_vids = [], []
        for position, stay_id in enumerate(patient_rows.ICUSTAY_ID.unique()):
            code_idxs = patient_rows[patient_rows.ICUSTAY_ID == stay_id].ICD9_CODE_INDEX.unique()
            multihot = np.zeros(len(icd9_df), dtype=int)
            multihot[code_idxs] = 1
            patient_vids.append(position)
            patient_seqs.append(multihot.tolist())
        vids.append(patient_vids)
        seqs.append(patient_seqs)
    return seqs, vids

### Extract relevant data from DataFrame into lists and save pickle for training

In [None]:
# Distinct identifiers and labels, each in order of first appearance.
pids = icu_diag_merge['SUBJECT_ID'].unique().tolist()
diags = icu_diag_merge['ICD_CAT'].unique().tolist()
codes = icu_diag_merge['ICD9_CODE_INDEX'].unique().tolist()
# Rebinds sub_categories (until here the scraped (range, description) tuples)
# to the plain list of distinct range strings.
sub_categories = icd_subcategory_df['ICD_SUBRANGE'].unique().tolist()

# Per-patient, per-visit training inputs and targets.
targets = construct_targets(pids)
text_seqs = construct_text_seqs(pids)
seqs, vids = construct_seqs(pids)

In [None]:
# Write every training artifact to OUT_PATH as "<name>.pkl".
# open(..., 'wb') does not create missing directories, so create OUT_PATH first.
os.makedirs(OUT_PATH, exist_ok=True)
artifacts = {
    'pids': pids,
    'seqs': seqs,
    'text_seqs': text_seqs,
    'targets': targets,
    'vids': vids,
    'subcategories': sub_categories,
    'categories': categories,
    'icd9': icd9,
    'icd9_text': icd9_text,
    'diags': diags,
}
for artifact_name, artifact in artifacts.items():
    with open(os.path.join(OUT_PATH, f'{artifact_name}.pkl'), 'wb') as f:
        pickle.dump(artifact, f)

### Define additional methods for constructing targets with category probabilities

In [None]:
def calculate_prob_row(row_indxs, num_categories):
    """Turn a list of category indices into a probability row.

    Every occurrence of an index contributes ``1 / len(row_indxs)`` to that
    position, so repeated indices receive proportionally more mass.

    Parameters
    ----------
    row_indxs : non-empty sequence of integer positions below ``num_categories``.
    num_categories : length of the returned row.

    Returns
    -------
    list of float of length ``num_categories`` whose entries sum to 1.
    """
    weight = 1 / len(row_indxs)
    probs = [0.0 for _ in range(num_categories)]
    for category_idx in row_indxs:
        probs[category_idx] = probs[category_idx] + weight
    total = sum(probs)
    assert np.isclose(total, 1.0), f"row={probs}, sum row={total}"
    return probs

In [None]:
def construct_prob_targets_last_visit(pids, categories, field='ICD_SUBCATEGORY_INDEX'):
    """Build one probability row per patient from that patient's last listed ICU stay.

    Reads the module-level ``icu_diag_merge``. The "last" stay is the final
    distinct ICUSTAY_ID among the patient's rows, in row order.

    Parameters
    ----------
    pids : iterable of SUBJECT_ID values.
    categories : sized collection; only its length is used, as the row width.
    field : column whose values are used as positions in the row.

    Returns
    -------
    list with one ``calculate_prob_row`` result per patient.
    """
    row_width = len(categories)
    targets = []
    for pid in pids:
        patient_rows = icu_diag_merge[icu_diag_merge.SUBJECT_ID == pid]
        final_stay_id = patient_rows.ICUSTAY_ID.unique()[-1]
        final_stay_rows = patient_rows[patient_rows.ICUSTAY_ID == final_stay_id]
        targets.append(calculate_prob_row(final_stay_rows[field].tolist(), row_width))
    return targets

In [None]:
def construct_prob_targets_all_visits(pids, categories, field='ICD_SUBCATEGORY_INDEX'):
    """Build one probability row per ICU stay for every patient.

    Reads the module-level ``icu_diag_merge``. Stays are taken in the order in
    which their ICUSTAY_ID first appears among the patient's rows.

    Parameters
    ----------
    pids : iterable of SUBJECT_ID values.
    categories : sized collection; only its length is used, as the row width.
    field : column whose values are used as positions in the row.

    Returns
    -------
    list (one entry per patient) of lists (one per stay) of
    ``calculate_prob_row`` results.
    """
    row_width = len(categories)
    targets = []
    for pid in pids:
        patient_rows = icu_diag_merge[icu_diag_merge.SUBJECT_ID == pid]
        targets.append([
            calculate_prob_row(
                patient_rows[patient_rows.ICUSTAY_ID == stay_id][field].tolist(),
                row_width,
            )
            for stay_id in patient_rows.ICUSTAY_ID.unique()
        ])
    return targets

### Construct probability target lists and save with pickle

In [None]:
# Probability targets over the subcategory ranges: one row per patient (last
# listed stay) and one row per stay (all stays).
prob_targets = construct_prob_targets_last_visit(pids=pids, categories=sub_categories)
prob_targets_allvisits = construct_prob_targets_all_visits(pids=pids, categories=sub_categories)

In [None]:
# Write both probability-target lists to OUT_PATH as "<name>.pkl".
# open(..., 'wb') does not create missing directories, so create OUT_PATH first.
os.makedirs(OUT_PATH, exist_ok=True)
for artifact_name, artifact in (
    ('prob_targets', prob_targets),
    ('prob_targets_allvisits', prob_targets_allvisits),
):
    with open(os.path.join(OUT_PATH, f'{artifact_name}.pkl'), 'wb') as f:
        pickle.dump(artifact, f)