In [4]:
# The wildcard import stays first so that the explicit imports below keep
# taking precedence over any same-named symbol it may export.
from mimic3_preprocessing.segmentation import *

import json
import os
import re
from multiprocessing import Pool, cpu_count

import numpy as np
import pandas as pd
import spacy
from spacy.lang.en import English
from tqdm import tqdm

In [24]:
# ---------------------------------------------------------------------------
# Configuration: every path the notebook reads from or writes to.
# ---------------------------------------------------------------------------

# Input CSV tables.
NOTEEVENTS_path = "mimic3_data/NOTEEVENTS.csv"
DIAGNOSES_ICD_path = "mimic3_data/DIAGNOSES_ICD.csv"
PROCEDURES_ICD_path = "mimic3_data/PROCEDURES_ICD.csv"

# Files listing the HADM_IDs that belong to each split.
train_full_ids_path = "mimic3_data/train_full_hadm_ids.csv"
dev_full_ids_path = "mimic3_data/dev_full_hadm_ids.csv"
test_full_ids_path = "mimic3_data/test_full_hadm_ids.csv"

# Destination JSON files, one per split.
output_train_full_set_path = "preprocessed_mimic3/train_set.json"
output_dev_full_set_path = "preprocessed_mimic3/dev_set.json"
output_test_full_set_path = "preprocessed_mimic3/test_set.json"

# Size of the worker pool used for note preprocessing: one per reported CPU.
num_cpus = cpu_count()

In [6]:
# Load the NOTEEVENTS table into `df_notes`.
#
# Column dtypes are pinned explicitly instead of being inferred. HADM_ID and
# CGID are read as generic objects ("O") rather than integers.
# NOTE(review): presumably because these columns can be empty for some notes
# -- confirm against the data.
dtype = {
    "ROW_ID": int,
    "SUBJECT_ID": int,
    "HADM_ID": "O",
    "CATEGORY": str,
    "DESCRIPTION": str,
    "CGID": "O",
    "ISERROR": str,
    "TEXT": str,
}
# Columns parsed into datetimes while reading.
parse_dates = ["CHARTDATE", "CHARTTIME", "STORETIME"]
# `infer_datetime_format=True` is intentionally not passed: the argument is
# deprecated since pandas 2.0 and only emits a warning there. On older pandas
# it was purely a parsing-speed hint, so the parsed values are unaffected.
df_notes = pd.read_csv(
    NOTEEVENTS_path,
    parse_dates=parse_dates,
    dtype=dtype,
)

In [10]:
# Headings extraction.
#
# Collects every line that ends with ':' from the discharge summaries,
# lower-cases and strips it, counts how often each distinct heading occurs and
# keeps the frequent ones in `groups_dic` (heading -> occurrence count).
df_disc = df_notes.loc[df_notes["CATEGORY"] == 'Discharge summary']
all_text = df_disc['TEXT'].tolist()

# A candidate heading: any run of characters followed by ':' and a newline.
p = re.compile(".*:\n")

# Headings rejected by the filter below. `match` anchors at the start of the
# heading, so this rejects headings that begin with a digit, '|' or '#', or
# that begin with a parenthesised group.
# Written as a raw string: the previous non-raw literal relied on invalid
# escape sequences ('\d', '\('), which newer Python versions warn about.
# NOTE(review): the '|' inside the character class is a literal pipe, not an
# alternation -- kept as-is to preserve behaviour; confirm it is intended.
rejected_heading = re.compile(r'^[\d|#]|\(.*\)')

all_groups = [
    m.group().lower().strip()
    for text in tqdm(all_text)
    for m in p.finditer(text)
]

# np.unique returns the distinct headings in sorted order with their counts.
all_groups, counts = np.unique(all_groups, return_counts=True)

groups_dic = {}
for g, c in tqdm(zip(all_groups, counts), total=len(all_groups)):
    # Keep only headings seen more than 5 times.
    if c > 5:
        g = g.strip()
        if rejected_heading.match(g) is None:
            groups_dic[g] = int(c)

100%|██████████| 59652/59652 [00:37<00:00, 1597.24it/s]
89815it [00:00, 693691.80it/s]


In [11]:
# Build the note preprocessor from the heading counts collected earlier.
preprocess = PreprocessNotes(groups_dic)

# Both ICD tables are read with the same explicit dtypes. ICD9_CODE is read as
# text rather than as a number.
dtype = {
    "ROW_ID": int,
    "SUBJECT_ID": int,
    "HADM_ID": int,
    "SEQ_NUM": "O",
    "ICD9_CODE": str,
}
df_diag = pd.read_csv(DIAGNOSES_ICD_path, dtype=dtype)
df_proc = pd.read_csv(PROCEDURES_ICD_path, dtype=dtype)
df_diag_grouped = df_diag.groupby('HADM_ID')
df_proc_grouped = df_proc.groupby('HADM_ID')

# labels_dic maps an integer HADM_ID to {'diag': [...], 'proc': [...]}.
# A key is only present when the admission has rows in the matching table.
# Missing codes (NaN) are dropped so that every list holds only code strings;
# an admission whose codes are all missing keeps its key with an empty list.
labels_dic = {}
for idx, _df in df_diag_grouped:
    labels_dic[idx] = {'diag': _df['ICD9_CODE'].dropna().tolist()}
for idx, _df in df_proc_grouped:
    # setdefault: an admission with procedures but no diagnosis rows used to
    # raise KeyError here.
    labels_dic.setdefault(idx, {})['proc'] = _df['ICD9_CODE'].dropna().tolist()

# Discharge summaries grouped per admission, plus the list of admission ids
# that have at least one discharge summary.
df_disc_grouped = df_disc.groupby('HADM_ID')
all_hadm_id = list(df_disc_grouped.groups.keys())

In [14]:
# Preprocess every admission's discharge-summary text in parallel.
#
# data_dic maps HADM_ID -> {'labels': ICD codes of the admission,
#                           'text':   all its discharge summaries joined,
#                           'data':   whatever `preprocess` returns for that text}
data_dic = {}

# One text per admission: its discharge summaries joined by a blank line.
text_lis = [
    '\n\n'.join(df_disc_grouped.get_group(hadm_id)['TEXT'].tolist())
    for hadm_id in all_hadm_id
]

# The pool is used as a context manager so the worker processes are always
# released, even if preprocessing raises. `imap` yields results in input
# order, so they can be zipped back onto the ids, and it keeps all workers
# busy instead of waiting for a full batch of `num_cpus` items to finish
# before starting the next one.
with Pool(num_cpus) as pool:
    data_iter = pool.imap(preprocess, text_lis)
    for hadm_id, text, data in tqdm(zip(all_hadm_id, text_lis, data_iter),
                                    total=len(all_hadm_id)):
        data_dic[hadm_id] = {
            # labels_dic is keyed by integer HADM_ID, hence the cast.
            'labels': labels_dic[int(hadm_id)],
            'text': text,
            'data': data}

In [17]:
def _read_hadm_ids(path):
    """Read one admission id per line from `path`.

    Each line is stripped of surrounding whitespace and returned as a string.
    Blank lines are skipped so they cannot turn into an empty-string id.

    Args:
        path: Path of the text file to read.

    Returns:
        List of id strings, in file order.
    """
    with open(path, 'r') as f:
        stripped = (line.strip() for line in f)
        return [hadm_id for hadm_id in stripped if hadm_id]


# Admission ids of the three splits, kept as strings.
train_full_ids = _read_hadm_ids(train_full_ids_path)
dev_full_ids = _read_hadm_ids(dev_full_ids_path)
test_full_ids = _read_hadm_ids(test_full_ids_path)

In [22]:
def _select_labelled(hadm_ids, records):
    """Pick the records of `hadm_ids` that carry at least one ICD code.

    Args:
        hadm_ids: Iterable of admission ids to look up in `records`.
        records: Mapping id -> record, where record['labels'] is a dict that
            may hold a 'diag' and/or a 'proc' list of codes.

    Returns:
        Dict id -> record, restricted to records whose 'diag' or 'proc' list
        is non-empty.

    Raises:
        KeyError: if an id is not present in `records`.
    """
    selected = {}
    for hadm_id in hadm_ids:
        record = records[hadm_id]
        labels = record['labels']
        if any(len(labels.get(kind, [])) != 0 for kind in ('diag', 'proc')):
            selected[hadm_id] = record
    return selected


# NOTE(review): this cell previously tested `data_dic[k]['target_codes']` and
# copied from `data[k]`. Records in data_dic only have the keys 'labels',
# 'text' and 'data', and `data` is just a loop variable left over from the
# preprocessing cell, so the cell could not run on a fresh kernel.
# Assumed intent: keep the admissions that have at least one diagnosis or
# procedure code -- please confirm.
train_set = _select_labelled(train_full_ids, data_dic)
dev_set = _select_labelled(dev_full_ids, data_dic)
test_set = _select_labelled(test_full_ids, data_dic)

In [23]:
def _write_json(path, payload):
    """Serialise `payload` as UTF-8 JSON to `path`.

    The parent directory is created first if it does not exist, so a missing
    output folder cannot fail the run at the very last step. Non-ASCII
    characters are written as-is (ensure_ascii=False).

    Args:
        path: Destination file path; an existing file is overwritten.
        payload: JSON-serialisable object to write.
    """
    out_dir = os.path.dirname(path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(path, 'w', encoding='utf-8') as f:
        # json.dump streams to the file instead of building the whole
        # document as one string in memory first.
        json.dump(payload, f, ensure_ascii=False)


# Persist the three splits.
_write_json(output_train_full_set_path, train_set)
_write_json(output_dev_full_set_path, dev_set)
_write_json(output_test_full_set_path, test_set)