# Data for the "Discharge Me!": BioNLP ACL'24 Shared Task on Streamlining Discharge Documentation

The following notebook provides code and information on how to download the datasets and get the data splits for the "Discharge Me!" shared task. A successfully credentialed PhysioNet account is required. Please refer to https://physionet.org/about/citi-course/ for information on the data access requirements.

To download the data, enter your PhysioNet credentials in the cell below, select Runtime -> Run all, wait for the run to complete, and then check the `discharge-me-data` folder in the left-hand file pane. You can run the notebook either locally or on Colab; in both cases the output is a set of `.csv.gz` files in that folder, from which you can download the ones you need.

### Download necessary raw data files from PhysioNet:

In [None]:
from getpass import getpass

username = ''  # enter your PhysioNet username (leave blank to be prompted)
password = ''  # enter your PhysioNet password (leave blank to be prompted without echo)

# Blank credentials would otherwise be passed straight to wget in the next cell,
# so ask for anything that was not filled in above.
if not username:
    username = input('PhysioNet username: ')
if not password:
    password = getpass('PhysioNet password: ')

In [None]:
!wget -r -N -c -np --user $username --password=$password https://physionet.org/files/mimic-iv-note/2.2/
!wget -r -N -c -np --user $username --password=$password https://physionet.org/files/mimic-iv-ed/2.2/

### Filtering Datasets

Imports

In [None]:
import re
import os
import pandas as pd

Load Data

In [None]:
# Radiology notes from MIMIC-IV-Note v2.2.
# The path is relative to the working directory, which is where `wget -r` mirrors physionet.org/;
# the previous absolute /content/... prefix only resolved on Colab.
df_radiology = pd.read_csv('physionet.org/files/mimic-iv-note/2.2/note/radiology.csv.gz', compression='gzip', header=0, sep=',', quotechar='"')

In [None]:
# Discharge summaries from MIMIC-IV-Note v2.2.
# The path is relative to the working directory, which is where `wget -r` mirrors physionet.org/;
# the previous absolute /content/... prefix only resolved on Colab.
df_discharge = pd.read_csv('physionet.org/files/mimic-iv-note/2.2/note/discharge.csv.gz', compression='gzip', header=0, sep=',', quotechar='"')

In [None]:
# Emergency-department tables from MIMIC-IV-ED v2.2 (diagnosis, triage, edstays).
# Paths are relative to the working directory, which is where `wget -r` mirrors physionet.org/;
# the previous absolute /content/... prefix only resolved on Colab.
df_diagnoses_ed = pd.read_csv('physionet.org/files/mimic-iv-ed/2.2/ed/diagnosis.csv.gz', compression='gzip', header=0, sep=',', quotechar='"')
df_triage_ed = pd.read_csv('physionet.org/files/mimic-iv-ed/2.2/ed/triage.csv.gz', compression='gzip', header=0, sep=',', quotechar='"')
df_stays_ed = pd.read_csv('physionet.org/files/mimic-iv-ed/2.2/ed/edstays.csv.gz', compression='gzip', header=0, sep=',', quotechar='"')

Filter Emergency Department (ED) Admissions

In [None]:
# Keep only triage rows that record a chief complaint.
df_triage_ed = df_triage_ed[df_triage_ed['chiefcomplaint'].notnull()]

# A stay is usable only if it appears in both the (filtered) triage table and the diagnosis table.
valid_stay_ids = set(df_triage_ed['stay_id']) & set(df_diagnoses_ed['stay_id'])

# Restrict to those stays and drop the ones without an hadm_id.
# The filter and dropna are chained (no inplace=True) because an in-place dropna on a
# filtered slice triggers pandas' SettingWithCopyWarning.
df_stays_ed = df_stays_ed[df_stays_ed['stay_id'].isin(valid_stay_ids)].dropna(subset=['hadm_id'])

valid_ed_admission_ids = df_stays_ed['hadm_id'].unique()

Filter Radiology Reports

In [None]:
# Keep radiology notes that have an hadm_id and whose note_type is not 'AR'.
df_radiology = (
    df_radiology
    .dropna(subset=['hadm_id'])
    .loc[lambda notes: notes['note_type'] != 'AR']
)

# Distinct hadm_id values that still have at least one radiology note.
valid_radiology_admission_ids = df_radiology['hadm_id'].unique()

Filter Discharge Summaries

In [None]:
# Discard discharge summaries that have no hadm_id.
df_discharge.dropna(subset=['hadm_id'], inplace=True)

# Candidate targets: notes whose text mentions both section headers.
# .copy() makes df_target an independent frame, so the columns added to it in the
# following cells do not write into a slice of df_discharge (SettingWithCopyWarning).
df_target = df_discharge[
    df_discharge['text'].str.contains('Discharge Instruction')
    & df_discharge['text'].str.contains('Brief Hospital Course')
].copy()

Extracting Discharge Instructions Sections

In [None]:
# Capture the text between each "Discharge Instructions:" header line and the next
# "Followup Instruction" marker, stripping surrounding whitespace from every capture.
# assign() builds a new frame instead of writing a column into a possibly-sliced df_target.
df_target = df_target.assign(
    discharge_instructions=df_target['text'].apply(
        lambda text: [
            captured.strip()
            for captured in re.findall(r'Discharge Instructions:\n(.*?)Followup Instruction', text, re.DOTALL)
        ]
    )
)

# Keep only notes with exactly one such section, then unwrap the one-element lists into plain strings.
# (The previous value_counts() call here was a no-op: it was not the cell's last expression,
# so nothing was displayed, and after this filter every length is 1 anyway.)
df_target = df_target[df_target['discharge_instructions'].str.len() == 1]
df_target = df_target.explode('discharge_instructions')

Extracting Brief Hospital Course Sections

In [None]:
# Lazily capture the text after "Brief Hospital Course:" up to (not including) the next
# line that looks like a section header: an uppercase letter or underscore first, no colon
# before the closing ":" and a newline right after it.
_bhc_regex = re.compile(r'Brief Hospital Course:\s*\n{0,2}(.*?)(?=\n\s*\n{0,2}\s*[A-Z_]+[^\n:]+:\n)', re.DOTALL)
df_target['brief_hospital_course'] = df_target['text'].apply(_bhc_regex.findall)

# Distribution of matches per note (only shown when evaluated as the last expression of a cell).
df_target['brief_hospital_course'].str.len().value_counts()

# Keep notes with exactly one match and unwrap the one-element lists into plain strings.
_single_match = df_target['brief_hospital_course'].str.len() == 1
df_target = df_target.loc[_single_match].explode('brief_hospital_course')

Filtering Target Sections

In [None]:
# Drop notes whose extracted discharge instructions are an empty string.
_has_instructions = df_target['discharge_instructions'] != ''
df_target = df_target.loc[_has_instructions]

# Distinct hadm_id values that still have a usable target row.
valid_discharge_admission_ids = df_target['hadm_id'].unique()

Filtering Tables

In [None]:
# hadm_id values present in the ED, radiology and discharge selections alike.
# NOTE(review): the next cell recomputes this same intersection.
valid_admission_ids = set.intersection(
    set(valid_ed_admission_ids),
    set(valid_radiology_admission_ids),
    set(valid_discharge_admission_ids),
)

In [None]:
def _restrict(frame, column, allowed_ids):
    """Return the rows of `frame` whose `column` value is contained in `allowed_ids`."""
    return frame[frame[column].isin(allowed_ids)]


# hadm_id values present in the ED, radiology and discharge selections alike.
valid_admission_ids = (
    set(valid_ed_admission_ids)
    & set(valid_radiology_admission_ids)
    & set(valid_discharge_admission_ids)
)

# ED stays are narrowed first because the diagnosis and triage tables are keyed by stay_id.
df_stays_ed = _restrict(df_stays_ed, 'hadm_id', valid_admission_ids)
valid_stay_ids = set(df_stays_ed['stay_id'])

df_diagnoses_ed = _restrict(df_diagnoses_ed, 'stay_id', valid_stay_ids)
df_triage_ed = _restrict(df_triage_ed, 'stay_id', valid_stay_ids)
df_radiology = _restrict(df_radiology, 'hadm_id', valid_admission_ids)
df_discharge = _restrict(df_discharge, 'hadm_id', valid_admission_ids)
df_target = _restrict(df_target, 'hadm_id', valid_admission_ids)

Calculating Word Counts

In [None]:
def _space_token_count(value):
    """Count the pieces obtained by splitting str(value) on single space characters.

    Newlines do not separate pieces and consecutive spaces yield empty pieces. The
    >= 10 word-count filters and the asserted sample totals later in the notebook
    are computed from this exact rule.
    """
    return len(str(value).split(" "))


df_target['discharge_instructions_word_count'] = df_target['discharge_instructions'].apply(_space_token_count)
df_target['brief_hospital_course_word_count'] = df_target['brief_hospital_course'].apply(_space_token_count)

# Keep only the identifier, target-text and word-count columns.
_target_columns = [
    'note_id',
    'hadm_id',
    'discharge_instructions',
    'brief_hospital_course',
    'discharge_instructions_word_count',
    'brief_hospital_course_word_count',
]
df_target = df_target[_target_columns]

# Summary statistics for both word-count columns.
for _count_column in ('discharge_instructions_word_count', 'brief_hospital_course_word_count'):
    print(df_target[_count_column].describe())

### Creating Datasets (Note: Public has been renamed to Phase I, Hidden has been renamed to Phase II)

Processing

In [None]:
# Renumber every filtered table from 0 and discard the old index labels.
df_stays_ed = df_stays_ed.reset_index(drop=True)
df_diagnoses_ed = df_diagnoses_ed.reset_index(drop=True)
df_triage_ed = df_triage_ed.reset_index(drop=True)
df_radiology = df_radiology.reset_index(drop=True)
df_discharge = df_discharge.reset_index(drop=True)
df_target = df_target.reset_index(drop=True)

In [None]:
# Cast hadm_id to int in every table that carries it (rows with a missing hadm_id were dropped earlier).
df_stays_ed = df_stays_ed.astype({'hadm_id': int})
df_radiology = df_radiology.astype({'hadm_id': int})
df_discharge = df_discharge.astype({'hadm_id': int})
df_target = df_target.astype({'hadm_id': int})

Data Splits

In [None]:
# Phase 1 is a seeded 90% sample of the targets; phase 2 is the remaining 10%.
df_target_phase_1 = df_target.sample(frac=0.9, random_state=42)
df_target_phase_2 = df_target.drop(index=df_target_phase_1.index)

_phase_1_hadm_ids = df_target_phase_1['hadm_id']
_phase_2_hadm_ids = df_target_phase_2['hadm_id']

# Tables keyed by hadm_id follow the phase of their target row.
df_discharge_phase_1 = df_discharge.loc[df_discharge['hadm_id'].isin(_phase_1_hadm_ids)]
df_discharge_phase_2 = df_discharge.loc[df_discharge['hadm_id'].isin(_phase_2_hadm_ids)]

df_radiology_phase_1 = df_radiology.loc[df_radiology['hadm_id'].isin(_phase_1_hadm_ids)]
df_radiology_phase_2 = df_radiology.loc[df_radiology['hadm_id'].isin(_phase_2_hadm_ids)]

df_stays_ed_phase_1 = df_stays_ed.loc[df_stays_ed['hadm_id'].isin(_phase_1_hadm_ids)]
df_stays_ed_phase_2 = df_stays_ed.loc[df_stays_ed['hadm_id'].isin(_phase_2_hadm_ids)]

# Tables keyed by stay_id follow the phase of their ED stay.
_phase_1_stay_ids = df_stays_ed_phase_1['stay_id']
_phase_2_stay_ids = df_stays_ed_phase_2['stay_id']

df_diagnoses_ed_phase_1 = df_diagnoses_ed.loc[df_diagnoses_ed['stay_id'].isin(_phase_1_stay_ids)]
df_diagnoses_ed_phase_2 = df_diagnoses_ed.loc[df_diagnoses_ed['stay_id'].isin(_phase_2_stay_ids)]

df_triage_ed_phase_1 = df_triage_ed.loc[df_triage_ed['stay_id'].isin(_phase_1_stay_ids)]
df_triage_ed_phase_2 = df_triage_ed.loc[df_triage_ed['stay_id'].isin(_phase_2_stay_ids)]

In [None]:
# Phase 1 targets: a seeded 70% sample for train, then the remainder halved (seeded) into valid and test.
df_target_train = df_target_phase_1.sample(frac=0.7, random_state=42)
_phase_1_remainder = df_target_phase_1.drop(index=df_target_train.index)
df_target_valid = _phase_1_remainder.sample(frac=0.5, random_state=42)
df_target_phase_1_test = _phase_1_remainder.drop(index=df_target_valid.index)

# Tables keyed by hadm_id: one slice per target split, in (train, valid, test) order.
df_discharge_train, df_discharge_valid, df_discharge_phase_1_test = (
    df_discharge_phase_1[df_discharge_phase_1['hadm_id'].isin(targets['hadm_id'])]
    for targets in (df_target_train, df_target_valid, df_target_phase_1_test)
)

df_radiology_train, df_radiology_valid, df_radiology_phase_1_test = (
    df_radiology_phase_1[df_radiology_phase_1['hadm_id'].isin(targets['hadm_id'])]
    for targets in (df_target_train, df_target_valid, df_target_phase_1_test)
)

df_stays_ed_train, df_stays_ed_valid, df_stays_ed_phase_1_test = (
    df_stays_ed_phase_1[df_stays_ed_phase_1['hadm_id'].isin(targets['hadm_id'])]
    for targets in (df_target_train, df_target_valid, df_target_phase_1_test)
)

# Tables keyed by stay_id: one slice per ED-stay split, in the same order.
df_diagnoses_ed_train, df_diagnoses_ed_valid, df_diagnoses_ed_phase_1_test = (
    df_diagnoses_ed_phase_1[df_diagnoses_ed_phase_1['stay_id'].isin(stays['stay_id'])]
    for stays in (df_stays_ed_train, df_stays_ed_valid, df_stays_ed_phase_1_test)
)

df_triage_ed_train, df_triage_ed_valid, df_triage_ed_phase_1_test = (
    df_triage_ed_phase_1[df_triage_ed_phase_1['stay_id'].isin(stays['stay_id'])]
    for stays in (df_stays_ed_train, df_stays_ed_valid, df_stays_ed_phase_1_test)
)

### Amendment on 02/20/2024 (Thanks @mchizhik!)

In [None]:
# hadm_id values of the phase 1 test targets as they stand before the word-count filtering below.
df_target_phase_1_test_hadm_id = df_target_phase_1_test.loc[:, 'hadm_id'].tolist()

In [None]:
# In every target split, keep only rows whose discharge_instructions_word_count and
# brief_hospital_course_word_count are both at least 10.
df_target_phase_1_test, df_target_valid, df_target_train, df_target_phase_2 = (
    targets[
        (targets['discharge_instructions_word_count'] >= 10)
        & (targets['brief_hospital_course_word_count'] >= 10)
    ]
    for targets in (df_target_phase_1_test, df_target_valid, df_target_train, df_target_phase_2)
)

# hadm_id values that survive the filter, per split.
df_target_phase_1_test_hadm_id_new = df_target_phase_1_test.loc[:, 'hadm_id'].tolist()
df_target_valid_hadm_id_new = df_target_valid.loc[:, 'hadm_id'].tolist()
df_target_train_hadm_id_new = df_target_train.loc[:, 'hadm_id'].tolist()
df_target_phase_2_hadm_id_new = df_target_phase_2.loc[:, 'hadm_id'].tolist()

In [None]:
# Propagate the updated hadm_id lists to the other tables of each split.
# Within a split, the hadm_id-keyed tables are narrowed first; the stay_id-keyed tables
# are then narrowed against the already-updated ED stays of that split.

# Phase 1 test
df_discharge_phase_1_test, df_radiology_phase_1_test, df_stays_ed_phase_1_test = (
    table[table['hadm_id'].isin(df_target_phase_1_test_hadm_id_new)]
    for table in (df_discharge_phase_1_test, df_radiology_phase_1_test, df_stays_ed_phase_1_test)
)
df_diagnoses_ed_phase_1_test, df_triage_ed_phase_1_test = (
    table[table['stay_id'].isin(df_stays_ed_phase_1_test['stay_id'])]
    for table in (df_diagnoses_ed_phase_1_test, df_triage_ed_phase_1_test)
)

# Valid
df_discharge_valid, df_radiology_valid, df_stays_ed_valid = (
    table[table['hadm_id'].isin(df_target_valid_hadm_id_new)]
    for table in (df_discharge_valid, df_radiology_valid, df_stays_ed_valid)
)
df_diagnoses_ed_valid, df_triage_ed_valid = (
    table[table['stay_id'].isin(df_stays_ed_valid['stay_id'])]
    for table in (df_diagnoses_ed_valid, df_triage_ed_valid)
)

# Train
df_discharge_train, df_radiology_train, df_stays_ed_train = (
    table[table['hadm_id'].isin(df_target_train_hadm_id_new)]
    for table in (df_discharge_train, df_radiology_train, df_stays_ed_train)
)
df_diagnoses_ed_train, df_triage_ed_train = (
    table[table['stay_id'].isin(df_stays_ed_train['stay_id'])]
    for table in (df_diagnoses_ed_train, df_triage_ed_train)
)

# Phase 2
df_discharge_phase_2, df_radiology_phase_2, df_stays_ed_phase_2 = (
    table[table['hadm_id'].isin(df_target_phase_2_hadm_id_new)]
    for table in (df_discharge_phase_2, df_radiology_phase_2, df_stays_ed_phase_2)
)
df_diagnoses_ed_phase_2, df_triage_ed_phase_2 = (
    table[table['stay_id'].isin(df_stays_ed_phase_2['stay_id'])]
    for table in (df_diagnoses_ed_phase_2, df_triage_ed_phase_2)
)

Verifying sample counts (should match those on the website: https://stanford-aimi.github.io/discharge-me)

In [None]:
def _report_split_sizes(splits, expected_total=None):
    """Print the shape of each split, then check or print the combined row count.

    When `expected_total` is given, the summed row count is asserted to equal it;
    otherwise the sum is printed. A blank separator is printed at the end.
    """
    for split in splits:
        print(split.shape)
    total_rows = sum(split.shape[0] for split in splits)
    if expected_total is None:
        print(total_rows)
    else:
        assert(total_rows == expected_total)
    print("\n")


# Each group is given in (train, valid, phase 1 test, phase 2) order.
_report_split_sizes(
    (df_target_train, df_target_valid, df_target_phase_1_test, df_target_phase_2),
    expected_total=109168,
)
_report_split_sizes(
    (df_discharge_train, df_discharge_valid, df_discharge_phase_1_test, df_discharge_phase_2),
    expected_total=109168,
)
# Radiology has no asserted total; its combined row count is only printed.
_report_split_sizes(
    (df_radiology_train, df_radiology_valid, df_radiology_phase_1_test, df_radiology_phase_2),
)
_report_split_sizes(
    (df_stays_ed_train, df_stays_ed_valid, df_stays_ed_phase_1_test, df_stays_ed_phase_2),
    expected_total=109403,
)
_report_split_sizes(
    (df_diagnoses_ed_train, df_diagnoses_ed_valid, df_diagnoses_ed_phase_1_test, df_diagnoses_ed_phase_2),
    expected_total=218376,
)
_report_split_sizes(
    (df_triage_ed_train, df_triage_ed_valid, df_triage_ed_phase_1_test, df_triage_ed_phase_2),
    expected_total=109403,
)

### Amendment on 04/10/2024 (Thanks @anquangtang!)

In [None]:
# Improved extraction of Brief Hospital Course sections

from collections import OrderedDict

# Header patterns used to delimit the "Brief Hospital Course" section. The first entry is
# the section being extracted; the remaining values are regex fragments for headers that
# may close it, tried in this order.
input_sections = OrderedDict([
    ('Brief Hospital Course', 'Brief Hospital Course'),
    ('Medications on Admission', '[A-Za-z_]+ on Admission'),
    ('Discharge Medications', '[A-Za-z_]+ Medications'),
    ('Discharge Disposition', '[A-Za-z_]+ Disposition'),
    ('Discharge Diagnosis', '[A-Za-z_]+ Diagnosis'),
    ('Discharge Condition', '[A-Za-z_]+ Condition')
])


def parse_brief_hospital_course(row):
    """Extract the "Brief Hospital Course" section from one discharge summary.

    The closing header is chosen by trying each follow-on pattern in
    `input_sections` in order and taking the first one found at the start of a
    later line; if none is found, the last pattern is used as a fallback.

    Args:
        row: Mapping or pandas Series whose 'text' entry holds the discharge summary.

    Returns:
        The section body captured by the last match of the extraction regex, or
        None when 'text' is not a string (e.g. NaN) or no section is found.
    """
    discharge_summary = row['text']
    # A missing or non-string text previously raised TypeError inside `re`.
    if not isinstance(discharge_summary, str):
        return None

    section = input_sections.get('Brief Hospital Course')
    closing_candidates = list(input_sections.values())[1:]

    next_section = closing_candidates[-1]
    for candidate in closing_candidates:
        # Only existence matters here, so stop at the first hit instead of collecting every match.
        if re.search(section + ".+\n" + candidate, discharge_summary, re.DOTALL):
            next_section = candidate
            break

    rex = r'(%s?):\s*\n{0,2}(.+?)\s*(\n\s*){1,10}(%s):\n' % (section, next_section)

    section_ext = re.findall(rex, discharge_summary, re.DOTALL)
    if section_ext:
        # Second capture group of the last match is the section body.
        return section_ext[-1][1]
    return None

# hadm_id -> discharge text lookup, shared by all four splits.
_discharge_text_by_hadm_id = df_discharge[['hadm_id', 'text']].set_index('hadm_id')


def _reparse_brief_hospital_course(targets):
    """Return `targets` with 'brief_hospital_course' re-extracted from the full discharge text.

    The text is joined in on hadm_id, run through parse_brief_hospital_course row by
    row, and the temporary 'text' column is removed again.
    """
    with_text = targets.join(_discharge_text_by_hadm_id, on='hadm_id')
    with_text['brief_hospital_course'] = with_text.apply(parse_brief_hospital_course, axis=1)
    return with_text.drop(columns=['text'])


df_target_phase_1_test = _reparse_brief_hospital_course(df_target_phase_1_test)
df_target_valid = _reparse_brief_hospital_course(df_target_valid)
df_target_train = _reparse_brief_hospital_course(df_target_train)
df_target_phase_2 = _reparse_brief_hospital_course(df_target_phase_2)

In [None]:
# Recompute brief_hospital_course_word_count from the re-extracted sections, using the same
# single-space split rule as before (a missing section is counted via str(None)).
for _targets in (df_target_phase_1_test, df_target_valid, df_target_train, df_target_phase_2):
    _targets['brief_hospital_course_word_count'] = _targets['brief_hospital_course'].apply(
        lambda section: len(str(section).split(" "))
    )

### Output to csv.gz files

In [None]:
# Output layout: discharge-me-data/<split>/<table>.csv.gz
_phase_1_outputs = {
    'train': {
        'discharge_target': df_target_train,
        'discharge': df_discharge_train,
        'radiology': df_radiology_train,
        'edstays': df_stays_ed_train,
        'diagnosis': df_diagnoses_ed_train,
        'triage': df_triage_ed_train,
    },
    'valid': {
        'discharge_target': df_target_valid,
        'discharge': df_discharge_valid,
        'radiology': df_radiology_valid,
        'edstays': df_stays_ed_valid,
        'diagnosis': df_diagnoses_ed_valid,
        'triage': df_triage_ed_valid,
    },
    'phase_1_test': {
        'discharge_target': df_target_phase_1_test,
        'discharge': df_discharge_phase_1_test,
        'radiology': df_radiology_phase_1_test,
        'edstays': df_stays_ed_phase_1_test,
        'diagnosis': df_diagnoses_ed_phase_1_test,
        'triage': df_triage_ed_phase_1_test,
    },
}

for _split_name, _tables in _phase_1_outputs.items():
    _split_dir = os.path.join('discharge-me-data', _split_name)
    # makedirs(exist_ok=True) creates the parent folder as needed and lets the cell be
    # re-run; os.mkdir raised FileExistsError on the second run.
    os.makedirs(_split_dir, exist_ok=True)
    for _table_name, _table in _tables.items():
        _table.to_csv(os.path.join(_split_dir, _table_name + '.csv.gz'), index=False, compression='gzip')

In [None]:
# Output layout: discharge-me-data/phase_2_test/<table>.csv.gz
_phase_2_dir = os.path.join('discharge-me-data', 'phase_2_test')
# makedirs(exist_ok=True) creates the parent folder if it is missing and lets the cell be
# re-run; os.mkdir raised an error in both situations.
os.makedirs(_phase_2_dir, exist_ok=True)

_phase_2_tables = {
    'discharge_target': df_target_phase_2,
    'discharge': df_discharge_phase_2,
    'radiology': df_radiology_phase_2,
    'edstays': df_stays_ed_phase_2,
    'diagnosis': df_diagnoses_ed_phase_2,
    'triage': df_triage_ed_phase_2,
}

for _table_name, _table in _phase_2_tables.items():
    _table.to_csv(os.path.join(_phase_2_dir, _table_name + '.csv.gz'), index=False, compression='gzip')