* This file is based on https://github.com/jamesmullenbach/caml-mimic/blob/master/notebooks/dataproc_mimic_III.ipynb

* MIMIC-III dataset can be downloaded at https://physionet.org/content/mimiciii/1.4/
* The dataset split can be downloaded at https://github.com/jamesmullenbach/caml-mimic/tree/master/mimicdata/mimic3
~~~
wget https://raw.githubusercontent.com/jamesmullenbach/caml-mimic/master/mimicdata/mimic3/train_50_hadm_ids.csv
wget https://raw.githubusercontent.com/jamesmullenbach/caml-mimic/master/mimicdata/mimic3/dev_50_hadm_ids.csv
wget https://raw.githubusercontent.com/jamesmullenbach/caml-mimic/master/mimicdata/mimic3/test_50_hadm_ids.csv
~~~
* Extract MIMIC-III files and make sure there are DIAGNOSES_ICD.csv, D_ICD_DIAGNOSES.csv, D_ICD_PROCEDURES.csv, NOTEEVENTS.csv, PROCEDURES_ICD.csv under INPUT_DIR

In [1]:
INPUT_DIR = "/data/dai031/Corpora/MIMIC-III/v_1_4"
OUTPUT_DIR = "/data/dai031/ProcessedData/MIMIC-III/0"
SPLIT_DIR = "/data/dai031/Corpora/MIMIC-III/split"
!mkdir -p $OUTPUT_DIR

In [2]:
import csv, operator, os, re
from collections import Counter
from nltk.tokenize import RegexpTokenizer
from tqdm import tqdm
import pandas as pd

## Combine diagnosis and procedure codes and reformat them
The codes in MIMIC-III are given in separate files for procedures and diagnoses, and the codes are given without periods, which might lead to collisions if we naively combine them. So we have to add the periods back in the right place.

In [3]:
def reformat_code(code, is_diagnosis):
    """Generally, procedure codes have dots after the first two digits,
    while diagnosis codes have dots after the first three digits."""
    code = "".join(code.split("."))
    if is_diagnosis:
        if code.startswith("E"):
            if len(code) > 4:
                code = code[:4] + "." + code[4:]
        else:
            if len(code) > 3:
                code = code[:3] + "." + code[3:]
    else:
        code = code[:2] + "." + code[2:]
    return code

In [4]:
PROCEDURES_ICD = pd.read_csv(os.path.join(INPUT_DIR, "PROCEDURES_ICD.csv"))
DIAGNOSES_ICD = pd.read_csv(os.path.join(INPUT_DIR, "DIAGNOSES_ICD.csv"))

In [5]:
DIAGNOSES_ICD["absolute_code"] = DIAGNOSES_ICD.apply(lambda row: str(reformat_code(str(row[4]), True)), axis=1)
PROCEDURES_ICD["absolute_code"] = PROCEDURES_ICD.apply(lambda row: str(reformat_code(str(row[4]), False)), axis=1)

In [7]:
ALL_ICD = pd.concat([DIAGNOSES_ICD, PROCEDURES_ICD])
ALL_ICD.to_csv("ALL_ICD.csv", index=False,
               columns=["ROW_ID", "SUBJECT_ID", "HADM_ID", "SEQ_NUM", "absolute_code"],
               header=["ROW_ID", "SUBJECT_ID", "HADM_ID", "SEQ_NUM", "ICD9_CODE"])

In [8]:
ALL_ICD = pd.read_csv("ALL_ICD.csv", dtype={"ICD9_CODE": str})
len(ALL_ICD["ICD9_CODE"].unique())

8994

## Tokenize and preprocess raw text
Preprocessing time!
This will:
* Select only discharge summaries and their addenda
* change all numbers to 0s
* lowercase all tokens

In [9]:
tokenizer = RegexpTokenizer(r"\w+")

In [10]:
with open(os.path.join(INPUT_DIR, "NOTEEVENTS.csv"), "r") as in_f:
    with open("DISCHARGE_SUMMARIES.csv", "w") as out_f:
        out_f.write(",".join(["SUBJECT_ID", "HADM_ID", "CHARTTIME", "TEXT"]) + "\n")
        reader = csv.reader(in_f)
        next(reader) # skip the first line

        for line in tqdm(reader):
            if line[6] == "Discharge summary":
                text = line[10].strip()
                # tokenize, lowercase and normalize numerics
                text = re.sub("\d", "0", text.lower())
                tokens = tokenizer.tokenize(text)
                # Mullenbach et al delete numeric-only tokens
                text = '"' + ' '.join(tokens) + '"'
                out_f.write(",".join([line[1], line[2], line[4], text]) + "\n")

2083180it [01:04, 32208.17it/s] 


In [11]:
DISCHARGE_SUMMARIES = pd.read_csv("DISCHARGE_SUMMARIES.csv")
len(DISCHARGE_SUMMARIES["HADM_ID"].unique())

52726

In [12]:
DISCHARGE_SUMMARIES = DISCHARGE_SUMMARIES.sort_values(["SUBJECT_ID", "HADM_ID"])
DISCHARGE_SUMMARIES.to_csv("DISCHARGE_SUMMARIES_SORTED.csv", index=False)

In [13]:
! rm DISCHARGE_SUMMARIES.csv

In [14]:
ALL_ICD = pd.read_csv("ALL_ICD.csv")
ALL_ICD = ALL_ICD.sort_values(["SUBJECT_ID", "HADM_ID"])
len(DISCHARGE_SUMMARIES["HADM_ID"].unique()), len(ALL_ICD["HADM_ID"].unique())

  ALL_ICD = pd.read_csv("ALL_ICD.csv")


(52726, 58976)

## Consolidate labels with discharge summaries
Looks like there were some HADM_ID's that didn't have discharge summaries, so they weren't included with our notes

In [15]:
hadm_ids = set(DISCHARGE_SUMMARIES["HADM_ID"])

with open("ALL_ICD.csv", "r") as in_f:
    with open("ALL_ICD_FILTERED.csv", "w") as out_f:
        writer = csv.writer(out_f)
        writer.writerow(["SUBJECT_ID", "HADM_ID", "ICD9_CODE", "ADMITTIME", "DISCHTIME"])
        reader = csv.reader(in_f)
        next(reader)
        for row in reader:
            hadm_id = int(row[2])
            if hadm_id in hadm_ids:
                writer.writerow(row[1:3] + [row[-1], "", ""])

In [16]:
ALL_ICD_FILTERED = pd.read_csv("ALL_ICD_FILTERED.csv", index_col=None)
len(ALL_ICD_FILTERED["HADM_ID"].unique())

  ALL_ICD_FILTERED = pd.read_csv("ALL_ICD_FILTERED.csv", index_col=None)


52726

In [17]:
ALL_ICD_FILTERED = ALL_ICD_FILTERED.sort_values(["SUBJECT_ID", "HADM_ID"])
ALL_ICD_FILTERED.to_csv("ALL_ICD_FILTERED_SORTED.csv", index=False)

In [18]:
! rm ALL_ICD.csv ALL_ICD_FILTERED.csv

## Append labels to notes in a single file

In [19]:
def next_labels(label_filepath):
    reader = csv.reader(label_filepath)
    next(reader)

    first_line = next(reader)

    cur_subj = int(first_line[0])
    cur_hadm = int(first_line[1])
    cur_labels = [first_line[2]]

    for row in reader:
        subj_id = int(row[0])
        hadm_id = int(row[1])
        label = row[2]
        # keep reading until you hit a new hadm id
        if hadm_id != cur_hadm or subj_id != cur_subj:
            yield cur_subj, cur_hadm, cur_labels
            cur_subj = subj_id
            cur_hadm = hadm_id
            cur_labels = [label]
        else:
            # add to the labels and move on
            cur_labels.append(label)
    yield cur_subj, cur_hadm, cur_labels

In [20]:
def next_notes(note_filepath):
    reader = csv.reader(note_filepath)
    next(reader)

    first_line = next(reader)

    cur_subj = int(first_line[0])
    cur_hadm = int(first_line[1])
    cur_text = first_line[3]

    for row in reader:
        subj_id = int(row[0])
        hadm_id = int(row[1])
        text = row[3]
        # keep reading until you hit a new hadm id
        if hadm_id != cur_hadm or subj_id != cur_subj:
            yield cur_subj, cur_hadm, cur_text
            cur_subj = subj_id
            cur_hadm = hadm_id
            cur_text = text
        else:
            # concatenate to the discharge summary and move on
            cur_text += " " + text
    yield cur_subj, cur_hadm, cur_text

In [21]:
def concat_data(note_filepath, label_filepath, out_filepath):
    with open(label_filepath, "r") as label_f:
        with open(note_filepath, "r") as note_f:
            with open(out_filepath, "w") as out_f:
                writer = csv.writer(out_f)
                writer.writerow(["SUBJECT_ID", "HADM_ID", "TEXT", "LABELS"])

                labels_gen = next_labels(label_f)
                notes_gen = next_notes(note_f)

                for i, (subj_id, hadm_id, text) in enumerate(notes_gen):
                    cur_subj, cur_hadm , cur_labels= next(labels_gen)

                    assert cur_hadm == hadm_id
                    writer.writerow([subj_id, str(hadm_id), text, ";".join(cur_labels)])

In [22]:
concat_data(note_filepath="DISCHARGE_SUMMARIES_SORTED.csv",
            label_filepath="ALL_ICD_FILTERED_SORTED.csv",
            out_filepath="DISCHARGE_SUMMARIES_ICD.csv")

In [23]:
! rm DISCHARGE_SUMMARIES_SORTED.csv ALL_ICD_FILTERED_SORTED.csv

Sanity check

In [24]:
DISCHARGE_SUMMARIES_ICD = pd.read_csv("DISCHARGE_SUMMARIES_ICD.csv")
all_tokens = set()
num_tokens = 0
for row in DISCHARGE_SUMMARIES_ICD.itertuples():
    for t in row[3].split():
        all_tokens.add(t)
        num_tokens += 1

len(all_tokens), num_tokens, len(DISCHARGE_SUMMARIES_ICD["HADM_ID"].unique())

(138762, 93565687, 52726)

In [25]:
## Create train/dev/test splits
split_ids = {}
for split in ["train", "dev", "test"]:
    lines = [l.strip() for l in open(os.path.join(SPLIT_DIR, f"{split}_full_hadm_ids.csv")).readlines()]
    split_ids[split] = set(lines)
    print(f"{split} set has {len(split_ids[split])} examples")

train set has 47723 examples
dev set has 1631 examples
test set has 3372 examples


In [26]:
split_examples = {k: [] for k in split_ids}

with open("DISCHARGE_SUMMARIES_ICD.csv", "r") as f:
    reader = csv.reader(f)
    next(reader)

    for row in reader:
        hadm_id = row[1]
        text = row[2]
        labels = [l.strip() for l in row[3].split(";") if len(l.strip()) > 0]
        labels = list(set(labels))
        if len(labels) == 0:
            print(f"Ignore one record ({hadm_id}), because it has no labels")
            continue
        example = {"subject_id": int(row[0]), "hadm_id": hadm_id, "text": text ,"labels": labels}
        if hadm_id in split_ids["train"]:
            split_examples["train"].append(example)
        elif hadm_id in split_ids["dev"]:
            split_examples["dev"].append(example)
        else:
            assert hadm_id in split_ids["test"]
            split_examples["test"].append(example)

Ignore one record (110220), because it has no labels
Ignore one record (142890), because it has no labels
Ignore one record (109963), because it has no labels
Ignore one record (182252), because it has no labels


In [27]:
import json, numpy

class NumpyJsonEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, numpy.integer):
            return int(obj)
        elif isinstance(obj, numpy.floating):
            return float(obj)
        elif isinstance(obj, numpy.ndarray):
            return obj.tolist()
        else:
            return super(NumpyJsonEncoder, self).default(obj)

def write_list_to_json_file(data, filepath):
    with open(filepath, "w") as f:
        for i in data:
            f.write(f"{json.dumps(i, cls=NumpyJsonEncoder)}\n")

os.makedirs(os.path.join(OUTPUT_DIR, "full"), exist_ok=True)

for k, v in split_examples.items():
    sorted_v = sorted(v, key=lambda i: len(i["text"].split()))
    write_list_to_json_file(sorted_v, os.path.join(OUTPUT_DIR, "full", f"{k}.json"))

## Filter each split to the top 50 diagnosis/procedure codes

first calculate the top k

In [28]:
counts = Counter()
DISCHARGE_SUMMARIES_ICD = pd.read_csv("DISCHARGE_SUMMARIES_ICD.csv")
for row in DISCHARGE_SUMMARIES_ICD.itertuples():
    for label in str(row[4]).split(";"):
        counts[label] += 1

In [29]:
import json

os.makedirs(os.path.join(OUTPUT_DIR, "50"), exist_ok=True)
counts_sorted = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
label2idx = {code[0]: i for i, code in enumerate(counts_sorted)}
json.dump(label2idx, open(os.path.join(OUTPUT_DIR, "full", "label2idx.json"), "w"))
top50labels = [code[0] for code in counts_sorted[:50]]
label2idx = {l: i for i, l in enumerate(top50labels)}
json.dump(label2idx, open(os.path.join(OUTPUT_DIR, "50", "label2idx.json"), "w"))

In [30]:
split_ids = {}
for split in ["train", "dev", "test"]:
    lines = [l.strip() for l in open(os.path.join(SPLIT_DIR, f"{split}_50_hadm_ids.csv")).readlines()]
    split_ids[split] = set(lines)
    print(f"{split} set has {len(split_ids[split])} examples")

train set has 8066 examples
dev set has 1573 examples
test set has 1729 examples


In [31]:
split_examples = {k: [] for k in split_ids}

with open("DISCHARGE_SUMMARIES_ICD.csv", "r") as f:
    reader = csv.reader(f)
    next(reader)

    for row in reader:
        hadm_id = row[1]
        text = row[2]
        labels = set(row[3].split(";")).intersection(set(top50labels))
        example = {"subject_id": int(row[0]), "hadm_id": hadm_id, "text": text ,"labels": list(labels)}
        if hadm_id in split_ids["train"]:
            split_examples["train"].append(example)
        elif hadm_id in split_ids["dev"]:
            split_examples["dev"].append(example)
        elif hadm_id in split_ids["test"]:
            split_examples["test"].append(example)

In [32]:
for k, v in split_examples.items():
    sorted_v = sorted(v, key=lambda i: len(i["text"].split()))
    write_list_to_json_file(sorted_v, os.path.join(OUTPUT_DIR, "50", f"{k}.json"))

In [33]:
!rm DISCHARGE_SUMMARIES_ICD.csv