In [2]:
import collections
import json
import math
import os
import shutil
import sys
from tqdm.notebook import tqdm

from transformers import AutoTokenizer, AutoModel
import numpy as np
import pandas as pd
from rouge_score import rouge_scorer

from utils import (
    get_rouge_score,
    get_levenshtein_score
)

In [2]:
input_dir = '../data/N2C2-Track3-May3/'
input_dir2 = '../data/N2C2-Track3-May3_rowid/'
output_dir = '../data/N2C2-Track3-May3_noteaug/'
test_fpath = '../data/n2c2_test_noLabel.csv'
mimic_note_fpath = '../data/mimic3/NOTEEVENTS.csv'
section_titles_fpath = '../data/mimic3/section_title_cands.json'

# Load data

### 1. Load original dataset

In [156]:
df_train = pd.read_csv(os.path.join(input_dir, 'train.csv'), low_memory=False)
df_dev = pd.read_csv(os.path.join(input_dir, 'dev.csv'), low_memory=False)
pd.concat([df_train, df_dev]).to_csv(os.path.join(input_dir, 'traindev.csv'))

In [4]:
# Fix anon fields
# def fix_wrong_anon_field(text):
#     if text.startswith('**') and text[2] != '*':
#         text = '[' + text
#     if text.endswith('**') and text[-3] != '*':
#         text = text + ']'
#     return text

# df_train['Assessment'] = df_train['Assessment'].map(fix_wrong_anon_field)
# df_train['Plan Subsection'] = df_train['Plan Subsection'].map(fix_wrong_anon_field)
# df_dev['Assessment'] = df_dev['Assessment'].map(fix_wrong_anon_field)
# df_dev['Plan Subsection'] = df_dev['Plan Subsection'].map(fix_wrong_anon_field)

In [4]:
df_train.head()

Unnamed: 0,ROW ID,HADM ID,Assessment,Plan Subsection,Relation
0,701359,186454,"51 yr old F with a history of 3V CAD, confirme...",# CORONARIES: Patient with 3 vessel disease on...,Direct
1,701359,186454,"51 yr old F with a history of 3V CAD, confirme...",# Acute on Chronic systolic CHF: Patient with ...,Indirect
2,701359,186454,"51 yr old F with a history of 3V CAD, confirme...",# Social stressors: Recently lost mother to ca...,Neither
3,701359,186454,"51 yr old F with a history of 3V CAD, confirme...","# FEN: Replete lytes PRN, NPO for CABG today\n...",Not Relevant
4,554144,196435,"CHRONIC OBSTRUCTIVE PULMONARY DISEASE (COPD, B...",# Dyspnea: Differential in this patient would ...,Direct


In [5]:
df_train.iloc[0]

ROW ID                                                        701359
HADM ID                                                       186454
Assessment         51 yr old F with a history of 3V CAD, confirme...
Plan Subsection    # CORONARIES: Patient with 3 vessel disease on...
Relation                                                      Direct
Name: 0, dtype: object

### 2. Update ROW_ID

In [6]:
# Updated ROW_ID (Jun 26th, 2022)
updated_rowid_dir = '../data/ROWID_Updated/'
df_rowid_train = pd.read_csv(os.path.join(updated_rowid_dir, 'ROWID_Updated_train.csv'))
df_rowid_dev = pd.read_csv(os.path.join(updated_rowid_dir, 'ROWID_Updated_dev.csv'))

In [7]:
df_rowid_train.head()

Unnamed: 0,original_RowID,updated_RowID,updated_HadmID,updated_SubjID
0,315721,315979,111458,31820
1,316897,316897,197423,12113
2,317744,317923,109679,27866
3,318258,318497,186721,4588
4,318860,318905,111039,30663


In [8]:
# Check whether all ROW_IDs are unique in the update files
assert len(df_rowid_train) == len(df_rowid_train['original_RowID'].unique())
assert len(df_rowid_train) == len(df_rowid_train['updated_RowID'].unique())
assert len(df_rowid_dev) == len(df_rowid_dev['original_RowID'].unique())
assert len(df_rowid_dev) == len(df_rowid_dev['updated_RowID'].unique())

# Check whether the ROW_IDs match with the dataset file
assert set(df_rowid_train['original_RowID']) == set(df_train['ROW ID'].unique())
assert set(df_rowid_dev['original_RowID']) == set(df_dev['ROW ID'].unique())

In [9]:
for df_data, df_update in [(df_train, df_rowid_train), (df_dev, df_rowid_dev)]:    
    update_map = {row.original_RowID: (row.updated_RowID, row.updated_HadmID, row.updated_SubjID)
                  for _, row in df_update.iterrows()}
    new_rowid = [update_map[row['ROW ID']][0] for _, row in df_data.iterrows()]
    new_hadmid = [update_map[row['ROW ID']][1] for _, row in df_data.iterrows()]
    new_subjid = [update_map[row['ROW ID']][2] for _, row in df_data.iterrows()]
    
    assert all(df_data['HADM ID'] == new_hadmid)
    df_data['ROW ID'] = new_rowid
    df_data.insert(2, 'SUBJECT ID', new_subjid)

In [10]:
df_train.head()

Unnamed: 0,ROW ID,HADM ID,SUBJECT ID,Assessment,Plan Subsection,Relation
0,701531,186454,40514,"51 yr old F with a history of 3V CAD, confirme...",# CORONARIES: Patient with 3 vessel disease on...,Direct
1,701531,186454,40514,"51 yr old F with a history of 3V CAD, confirme...",# Acute on Chronic systolic CHF: Patient with ...,Indirect
2,701531,186454,40514,"51 yr old F with a history of 3V CAD, confirme...",# Social stressors: Recently lost mother to ca...,Neither
3,701531,186454,40514,"51 yr old F with a history of 3V CAD, confirme...","# FEN: Replete lytes PRN, NPO for CABG today\n...",Not Relevant
4,554204,196435,85490,"CHRONIC OBSTRUCTIVE PULMONARY DISEASE (COPD, B...",# Dyspnea: Differential in this patient would ...,Direct


### 3. Load MIMIC-III notes

In [11]:
df_notes = pd.read_csv(mimic_note_fpath, low_memory=False)
assert len(df_notes) == len(df_notes.ROW_ID.unique())
len(df_notes)

2083180

# Check the ROW ID and the source of input texts

Let's see the `ROW ID` in the dataset actually refers to the note that contains "Assessment" and "Plan Subsection".  
$\rightarrow$ The notes where ROW ID are under `Physician` category. Sub-category (Description) differs by note, but mostly they are `Physician Resident Progress Note`.

In [14]:
# A train example
row_ex = df_train.iloc[0]
row_ex

ROW ID                                                        701531
HADM ID                                                       186454
SUBJECT ID                                                     40514
Assessment         51 yr old F with a history of 3V CAD, confirme...
Plan Subsection    # CORONARIES: Patient with 3 vessel disease on...
Relation                                                      Direct
Name: 0, dtype: object

In [15]:
# The MIMIC-III note that `ROW ID` refers to
row_note = df_notes[df_notes.ROW_ID == 701531].iloc[0]
row_note

ROW_ID                                                    701531
SUBJECT_ID                                                 40514
HADM_ID                                                 186454.0
CHARTDATE                                             2181-11-05
CHARTTIME                                    2181-11-05 06:53:00
STORETIME                                    2181-11-05 12:15:54
CATEGORY                                              Physician 
DESCRIPTION                     Physician Resident Progress Note
CGID                                                     19759.0
ISERROR                                                      NaN
TEXT           TITLE:\n   Chief Complaint:\n   None.\n   24 H...
Name: 697123, dtype: object

In [16]:
# Now check whether the corresponding notes contain the assessments and the plans
for df_data in [df_train, df_dev]:
    for _, row_ex in tqdm(df_data.iterrows(), total=len(df_data)):
        row_note = df_notes[df_notes.ROW_ID == row_ex['ROW ID']].iloc[0]
        assert int(row_note.SUBJECT_ID) == row_ex['SUBJECT ID'] # Additional sanity check of the SUBJECT ID
        check = (row_ex['Assessment'] in row_note.TEXT) and (row_ex['Plan Subsection'] in row_note.TEXT)
        if not check:
            print(f'{row_ex["ROW ID"]} - {row_ex["HADM ID"]}')

  0%|          | 0/4633 [00:00<?, ?it/s]

674484 - 185903


  0%|          | 0/597 [00:00<?, ?it/s]

- There is one note that does not contain the assessment and the plan

In [17]:
df_train_check = df_train[df_train['HADM ID'] == 185903]
df_train_check

Unnamed: 0,ROW ID,HADM ID,SUBJECT ID,Assessment,Plan Subsection,Relation
4318,674484,185903,65448,78 year old female with atrial fibrillation s/...,# Acute Anemia - HCT dropped from 32 to 26 but...,Direct
4319,674484,185903,65448,78 year old female with atrial fibrillation s/...,# Hypotension - resolved after dropping briefl...,Direct
4320,674484,185903,65448,78 year old female with atrial fibrillation s/...,# Atrial Fibrillation - current in NSR post pr...,Indirect
4321,674484,185903,65448,78 year old female with atrial fibrillation s/...,# Hypothyroidism\n -- continue levothyroxine,Neither
4322,674484,185903,65448,78 year old female with atrial fibrillation s/...,"# Osteoporosis\n -- continue Ca+D, bisphosph...",Neither
4323,674484,185903,65448,78 year old female with atrial fibrillation s/...,"# FEN:\n -- Cardiac HH diet, rplete lytes pr...",Not Relevant


In [18]:
df_notes_check = df_notes[df_notes.HADM_ID == 185903]
df_notes_check

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
40118,35558,65448,185903.0,2186-06-07,,,Discharge summary,Report,,,Admission Date: [**2186-6-6**] D...
88457,89204,65448,185903.0,2186-06-06,,,Echo,Report,,,PATIENT/TEST INFORMATION:\nIndication: Eval. F...
230045,240253,65448,185903.0,2186-06-07,,,ECG,Report,,,Sinus rhythm. Inferolateral T wave flattening...
230046,240254,65448,185903.0,2186-06-06,,,ECG,Report,,,Atrial flutter. Right bundle-branch block. M...
666051,674428,65448,185903.0,2186-06-07,2186-06-07 12:32:00,2186-06-07 12:33:09,Nursing,Nursing Progress Note,18424.0,,Atrial fibrillation (Afib)\n Assessment:\n ...
666415,674389,65448,185903.0,2186-06-07,2186-06-07 07:04:00,2186-06-07 09:26:11,Physician,Physician Resident Progress Note,17550.0,,Chief Complaint:\n 24 Hour Events:\n Aller...
666418,674392,65448,185903.0,2186-06-07,2186-06-07 07:04:00,2186-06-07 10:12:02,Physician,Physician Resident Progress Note,18900.0,,Chief Complaint:\n 24 Hour Events: ADMITTED\...
666464,674272,65448,185903.0,2186-06-06,2186-06-06 22:28:00,2186-06-06 22:28:12,Physician,Physician Resident Admission Note,21468.0,,TITLE:\n Chief Complaint: hypotension afer ...
666503,674383,65448,185903.0,2186-06-07,2186-06-07 08:55:00,2186-06-07 08:55:59,General,Generic Note,17923.0,,TITLE: EP note.\n 1 day post PVI for PAF a...
666551,674275,65448,185903.0,2186-06-07,2186-06-06 22:28:00,2186-06-07 00:41:57,Physician,Physician Resident Admission Note,19996.0,,TITLE:\n Chief Complaint: hypotension afer ...


In [20]:
for _, row_note in df_notes_check.iterrows():
    check = True
    for _, row_ex in df_train_check.iterrows():
        if not (row_ex['Assessment'] in row_note.TEXT) or not (row_ex['Plan Subsection'] in row_note.TEXT):
            check = False
            break
    print(f'{row_note.ROW_ID}: {check}')

35558: False
89204: False
240253: False
240254: False
674428: False
674389: True
674392: False
674272: False
674383: False
674275: False
674484: False
674339: False
674317: False
674318: False
1078133: False


- Okay we found the correct note, just update this. (`674484` $\rightarrow$ `674389`)

In [21]:
sum(df_train['ROW ID'] == 674484)

6

In [22]:
sum(df_train['ROW ID'] == 674389)

0

In [23]:
temp = df_train['ROW ID'].tolist()
temp = [row_id if (row_id != 674484) else 674389 for row_id in temp]
df_train['ROW ID'] = temp

In [24]:
sum(df_train['ROW ID'] == 674484)

0

In [25]:
sum(df_train['ROW ID'] == 674389)

6

- Okay now we can save the dataset with the row_ids updated

### Save the rowid-updated dataset

In [26]:
if not os.path.exists(input_dir2):
    os.makedirs(input_dir2)
    
df_train.to_csv(os.path.join(input_dir2, 'train.csv'), index=False)
df_dev.to_csv(os.path.join(input_dir2, 'dev.csv'), index=False)

In [155]:
df_traindev = pd.concat([df_train, df_dev])
df_traindev.to_csv(os.path.join(input_dir2, 'traindev.csv'), index=False)

### Note category statistics

- What are the categories/sub-categories of the notes referring assessments/plans

In [27]:
note_cat_counter = collections.Counter()
note_desc_counter = collections.Counter()
for df_data in [df_train, df_dev]:
    for _, row_ex in tqdm(df_data.iterrows(), total=len(df_data)):
        row_note = df_notes[df_notes.ROW_ID == row_ex['ROW ID']].iloc[0]
        note_cat_counter[row_note['CATEGORY']] += 1
        note_desc_counter[row_note['DESCRIPTION']] += 1

  0%|          | 0/4633 [00:00<?, ?it/s]

  0%|          | 0/597 [00:00<?, ?it/s]

In [28]:
note_cat_counter.most_common()

[('Physician ', 5230)]

In [29]:
note_desc_counter.most_common()

[('Physician Resident Progress Note', 4846),
 ('Intensivist Note', 197),
 ('Physician Resident/Attending Progress Note - MICU', 123),
 ('Physician Attending Progress Note', 44),
 ('Physician Resident Admission Note', 11),
 ('Resident / Attending Notes', 9)]

# Assessment and Plan Subsection text analysis

Can we specify the name of the problem/disease at the beginning of Plan text?  
$\rightarrow$ Not clear

In [30]:
df_train.iloc[0]

ROW ID                                                        701531
HADM ID                                                       186454
SUBJECT ID                                                     40514
Assessment         51 yr old F with a history of 3V CAD, confirme...
Plan Subsection    # CORONARIES: Patient with 3 vessel disease on...
Relation                                                      Direct
Name: 0, dtype: object

In [31]:
print(df_train.iloc[0]['Assessment'])

51 yr old F with a history of 3V CAD, confirmed on C. cath during this
   admission, EF of 40%, who is transferred to CCU for monitoring of
   recurrent chest pain. Plan for CABG today to revascularize due to
   3-vessel disease.


In [32]:
print(df_train.iloc[0]['Plan Subsection'])

# CORONARIES: Patient with 3 vessel disease on cath. Previously found
   to be the case in [**2174**], but had been treated medically. On the floor,
   patient with recurrent chest pain and EKG changes, concerning for
   further ischemia. Chest pain has been controlled on nitro gtt. Patient
   in CCU for monitoring; plan for CABG today.
   - continue nitro gtt
   - if has further CP, would uptitrate nitro, and contact [**Name2 (NI) 9339**] and
   attending for potential balloon pump vs. urgent CABG
   - appreciate CT [**Doctor First Name 91**] recs
   - continue [**Last Name (LF) **], [**First Name3 (LF) 119**], ACE-i, simvastatin


In [33]:
print(df_train.iloc[1]['Plan Subsection'])

# Acute on Chronic systolic CHF: Patient with EF of 25% at OSH, but
   repeat echo at [**Hospital1 5**] showed EF of 40%, consistent with prior echo. SOB
   resolved, but still with elevated JVP (no crackles on my exam this
   morning).
   -Goal even I
s & O
s as preparation for CABG today
   -Continue BB, ACEi, [**Hospital1 119**], statin


In [34]:
def check_plan_colon(text):
    return text.find(':') != -1

def check_plan_colon_firstline(text):
    return text.split('\n')[0].find(':') != -1

train_colon = df_train['Plan Subsection'].map(check_plan_colon)
train_colon_firstline = df_train['Plan Subsection'].map(check_plan_colon_firstline)
dev_colon = df_dev['Plan Subsection'].map(check_plan_colon)
dev_colon_firstline = df_dev['Plan Subsection'].map(check_plan_colon_firstline)
print(f'Train colon: {sum(train_colon)}/{len(train_colon)} '
      f'(first line {sum(train_colon_firstline)}/{len(train_colon_firstline)})')
print(f'Dev colon: {sum(dev_colon)}/{len(dev_colon)} '
      f'(first line {sum(dev_colon_firstline)}/{len(dev_colon_firstline)})')

Train colon: 3527/4633 (first line 3237/4633)
Dev colon: 451/597 (first line 410/597)


In [35]:
df_train_colon = df_train[train_colon_firstline]
df_train_nocolon = df_train[train_colon_firstline.map(lambda x: not x)]

Colon is not the unique delimiter of the problem... Let's see some more examples!

In [36]:
row_ex

ROW ID                                                        679140
HADM ID                                                       187377
SUBJECT ID                                                     98413
Assessment         This is a 67 year old male h/o HTN, systolic H...
Plan Subsection    ICU Care\n   Nutrition:\n   Glycemic Control:\...
Relation                                                Not Relevant
Name: 596, dtype: object

In [37]:
for i in range(10):
    print('='*40)
    row_ex = df_train_nocolon.iloc[(i * 101) % len(df_train_nocolon)]
    print(f"(ROW ID: {row_ex['ROW ID']} / {row_ex['Relation']})")
    print(row_ex['Plan Subsection'])

(ROW ID: 334065 / Direct)
# Pneumoperitoneum - Exam reassuring.  No leukoctyosis or fevers. CT
   confirms pneumoperitoneum without free fluid, fat stranding or contrast
   leakage.
   - Continue empiric Zosyn (day 2) and metronidazole (day 2)
   - IV fluids
   - Appreciate surgery recommendations. No plan for OR currently.
   - Serial abdominal exams
   - Holding PO anti-hypertensives, substitute with IV metoprolol q6H
(ROW ID: 329070 / Neither)
Depression. Appropriate mood and affect.
   - Continue home citalopram.
   .
(ROW ID: 705751 / Indirect)
Asthma-
   - Continue with atrovent nebs
(ROW ID: 501377 / Neither)
# Hypothyroidism.
   - continue levothyroxine IV for now..
(ROW ID: 379349 / Not Relevant)
ICU Care
   Nutrition: Restart diabetic diet this am
   Glycemic Control:
   Lines:
   20 Gauge - [**2181-5-22**] 03:30 PM
   18 Gauge - [**2181-5-22**] 03:42 PM
   Prophylaxis:
   DVT: heparin SC
   Stress ulcer: not indicated
   VAP:
(ROW ID: 430048 / Direct)
# Fever/chills/malaise/

$\rightarrow$ Still not sure what is the best way to split sections in those notes.



# MIMIC-III section augmentation

### Load discharge summary

In [38]:
# Discharge summaries
df_ds = df_notes[df_notes.CATEGORY == 'Discharge summary']
len(df_ds)

59652

In [39]:
# Find HADM IDs of the dataset without discharge summary
for data_name, df_data in [('Train', df_train), ('Dev', df_dev)]:
    hadm_ids_no_ds = []
    for hadm_id in tqdm(df_data['HADM ID'].unique()):
        if sum(df_ds.HADM_ID == float(hadm_id)) == 0:
            hadm_ids_no_ds.append(hadm_id)
    print(f'{data_name}: {len(hadm_ids_no_ds)} ({hadm_ids_no_ds})')

  0%|          | 0/598 [00:00<?, ?it/s]

Train: 7 ([193540, 106860, 111857, 101199, 186902, 193824, 195120])


  0%|          | 0/75 [00:00<?, ?it/s]

Dev: 4 ([197436, 197560, 107025, 187208])


In [40]:
# Get the longest discharge summary
def get_discharge_summary(hadm_id):
    rows = df_ds[df_ds.HADM_ID == float(hadm_id)]
    notes = [row.TEXT for _, row in rows.iterrows()]
    if notes:
        return notes[np.argmax(list(map(len, notes)))]
    else:
        return ""

### Extract specific types of section

In [41]:
# Predefined section titles
with open(section_titles_fpath, 'r') as fd:
    section_titles = json.load(fd)

print(f'{len(section_titles)} title lines')
section_titles_set = set(section_titles)

def split_ds_sections(note, verbose=False):
    # Split into lines
    lines = note.splitlines()
    
    # Detect section title lines
    title_idxs = []
    for i, line in enumerate(lines):
#         line = line.lstrip()
        if ':' in line:
            title_cand = line.lower()[:line.index(':')]
            if title_cand in section_titles_set:
                title_idxs.append(i)
                if verbose:
                    print(f'Found ({line[:line.index(":")]})')
    
    # Output sections
    sections = []
    last_idx = 0
    for idx in title_idxs:
        if last_idx == idx:
            continue
        sections.append('\n'.join(lines[last_idx:idx]))
        last_idx = idx
    sections.append('\n'.join(lines[last_idx:]))
            
    return sections

def get_section_title(section):
    if '\n' in section:
        first_line = section[:section.index('\n')]
    else:
        first_line = section
    assert ':' in first_line
    return first_line[:first_line.index(':')]

106 title lines


- Example discharge summary sections

In [42]:
# Test split
row_ex = df_train.iloc[0]
note = get_discharge_summary(row_ex['HADM ID'])
sections = split_ds_sections(note, verbose=True)
for section in sections:
    sec_title = get_section_title(section)
    print('='*20 + f'[{sec_title}]' + '='*20)
    print(section)

Found (Admission Date)
Found (Date of Birth)
Found (Service)
Found (Allergies)
Found (Attending)
Found (Chief Complaint)
Found (Major Surgical or Invasive Procedure)
Found (History of Present Illness)
Found (Past Medical History)
Found (Social History)
Found (Family History)
Found (Physical Exam)
Found (Pertinent Results)
Found (HISTORY)
Found (Brief Hospital Course)
Found (Medications on Admission)
Found (Discharge Medications)
Found (Discharge Disposition)
Found (Facility)
Found (Discharge Diagnosis)
Found (Discharge Condition)
Found (Discharge Instructions)
Found (Followup Instructions)
Found (Completed by)
Admission Date:  [**2181-10-30**]              Discharge Date:   [**2181-11-9**]

Date of Birth:  [**2129-12-24**]             Sex:   M

Service: CARDIOTHORACIC

Allergies:
Vancomycin

Attending:[**First Name3 (LF) 1406**]
Chief Complaint:
Chest pain

Major Surgical or Invasive Procedure:
Cardiac Catherization
s/p CABGx5(LIMA->LAD, SVG->Diag1, Diag2, OM, PDA) [**2181-11-5**]


Hi

In [43]:
present_history_titles = [
    "history of present illness",
    "history of the present illness",
]

chief_complaint_titles = [
    "chief complaint",
]

hospital_course_titles = [
    "brief hospital course",
    "hospital course",
    "review of systems",
    "summary of hospital course by systems",
    "hospital course by systems",
    "summary of hospital course",
    "hospital course by system",
    "brief summary of hospital course",
    "summary of hospital course by system",
    "history of hospital course by systems",
]

section_title_dict = {
    'present_history': set(present_history_titles),
    'chief_complaint': set(chief_complaint_titles),
    'hospital_course': set(hospital_course_titles)
}

all_section_names = list(section_title_dict.keys())

def extract_sections(note, section_names=all_section_names):
    assert all([name in section_title_dict for name in section_names])
    ret = {name: [] for name in section_names}
    
    sections = split_ds_sections(note)
    for section in sections:
        if not section: continue
        sec_title = get_section_title(section).lower()
        for k, v in section_title_dict.items():
            if sec_title in v:
                ret[k].append(section)
                
    return {name: '\n'.join(sections) for name, sections in ret.items()}

- Example augmentation

In [44]:
print(f'ROW ID: {row_ex["ROW ID"]}')
print(f'HADM ID: {row_ex["HADM ID"]}')
print(f'Assessment: {row_ex["Assessment"]}')
print(f'Plan Subsection: {row_ex["Plan Subsection"]}')
print(f'Relation: {row_ex["Relation"]}')

ROW ID: 701531
HADM ID: 186454
Assessment: 51 yr old F with a history of 3V CAD, confirmed on C. cath during this
   admission, EF of 40%, who is transferred to CCU for monitoring of
   recurrent chest pain. Plan for CABG today to revascularize due to
   3-vessel disease.
Plan Subsection: # CORONARIES: Patient with 3 vessel disease on cath. Previously found
   to be the case in [**2174**], but had been treated medically. On the floor,
   patient with recurrent chest pain and EKG changes, concerning for
   further ischemia. Chest pain has been controlled on nitro gtt. Patient
   in CCU for monitoring; plan for CABG today.
   - continue nitro gtt
   - if has further CP, would uptitrate nitro, and contact [**Name2 (NI) 9339**] and
   attending for potential balloon pump vs. urgent CABG
   - appreciate CT [**Doctor First Name 91**] recs
   - continue [**Last Name (LF) **], [**First Name3 (LF) 119**], ACE-i, simvastatin
Relation: Direct


In [45]:
for k, v in extract_sections(note).items():
    print('='*20 + f'[{k}]' + '='*20)
    print(v)

History of Present Illness:
51 year male that presented on [**10-30**] with chest pain, SOB that
had been occuring for over a month. Of note, the patient has
been non-compliant with medications because of lack of insurance
because he lost his job. The patient woke up at 3am on [**10-29**]
with acute SOB. His father drove him to emergency department.
He was gasping, saturating 78% on RA, [**Doctor Last Name 352**] in color with a
respiratory rate in the 40s.
In OSH ED, he was thought to be in pulmonary edema. He was given
Lasix 240 total, morphine, placed on a non-rebreather and then
eventually bipap briefly. Then weaned to non-rebreather and then
back to 6L NC, with sats 93-95%. A foley was placed- has since
diuresed 1920cc since admission. EKG showed new left bundle on
admission, with trop of 0.15. An echo was done on [**2181-10-29**]
showing new low EF of 25% with akinesis of apex, posterior and
inferior walls, severe hypokinesis of septum and anterior walls.
Given CP, elevated trop,

### Augmented section analysis

- Similarity with the assessment

In [46]:
# Compute the similarity between the input and additional sections
assessment = row_ex["Assessment"]
section_dict = extract_sections(note)

for name, section in section_dict.items():
    print(f'assessment - {name}')
    print(f'Levenshtein: {get_levenshtein_score(assessment.split(), section.split(), similarity=True)}')
    print(f'Rouge-1    : {get_rouge_score(assessment, section, metric="rouge1")}')
    print(f'Rouge-2    : {get_rouge_score(assessment, section, metric="rouge2")}')
    print(f'Rouge-L    : {get_rouge_score(assessment, section, metric="rougeL")}')
    print()

assessment - present_history
Levenshtein: 0.05319148936170213
Rouge-1    : 0.09032258064516129
Rouge-2    : 0.0130718954248366
Rouge-L    : 0.03870967741935484

assessment - chief_complaint
Levenshtein: 0.0
Rouge-1    : 0.12903225806451613
Rouge-2    : 0.06896551724137931
Rouge-L    : 0.12903225806451613

assessment - hospital_course
Levenshtein: 0.04054054054054054
Rouge-1    : 0.062015503875969
Rouge-2    : 0.0
Rouge-L    : 0.04651162790697674



### Dataset Augmentation

- Get train augmentation sections

In [47]:
# Get sections
train_no_ds_cnt = 0
train_additional_data = {name: [] for name in all_section_names}
for i, row in tqdm(df_train.iterrows(), total=len(df_train)):
    note = get_discharge_summary(row['HADM ID'])
    section_dict = extract_sections(note)
    for name, section in section_dict.items():
        train_additional_data[name].append(section)
    train_no_ds_cnt += 1 if not note else 0

  0%|          | 0/4633 [00:00<?, ?it/s]

In [48]:
train_no_ds_cnt

63

In [51]:
# Checkout the length statistics
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
scorer = rouge_scorer.RougeScorer(rouge_types=['rouge1'], use_stemmer=False)

In [53]:
for name, sections in train_additional_data.items():
    print(f'{name} ({sum([bool(x) for x in sections])})')
    
    word_cnt = [len(x.split()) for x in sections]
    print(f'- Word    : {np.mean(word_cnt):.4f}')
    
    token_cnt = [len(tokenizer.encode(x, add_special_tokens=False)) for x in sections]
    print(f'- Token   : {np.mean(token_cnt):.4f}')
    
    assessments = df_train2['Assessment']
    rouge_scores = [scorer.score(assessments[i], x)['rouge1'].recall for i, x in enumerate(sections)]
    print(f'- Rouge1 R: {np.mean(rouge_scores):.4f}')
    
    leven_scores = [get_levenshtein_score(assessments[i].split(), x.split(), similarity=True)
                    for i, x in enumerate(sections)]
    print(f'- Leven   : {np.mean(leven_scores):.4f}')

present_history (4570)
- Word    : 268.6223
- Token   : 458.3471
- Rouge1 R: 0.6064
- Leven   : 0.0534
chief_complaint (4558)
- Word    : 6.0246
- Token   : 11.2599
- Rouge1 R: 0.0462
- Leven   : 0.0211
hospital_course (4570)
- Word    : 519.8973
- Token   : 891.7118
- Rouge1 R: 0.6877
- Leven   : 0.0377


- Augment the train data

In [49]:
df_train2 = df_train.copy()
for i, (name, sections) in enumerate(train_additional_data.items()):
    df_train2.insert(loc=5 + i, column=name, value=sections)

In [50]:
print(len(df_train2))
df_train2.head()

4633


Unnamed: 0,ROW ID,HADM ID,SUBJECT ID,Assessment,Plan Subsection,present_history,chief_complaint,hospital_course,Relation
0,701531,186454,40514,"51 yr old F with a history of 3V CAD, confirme...",# CORONARIES: Patient with 3 vessel disease on...,History of Present Illness:\n51 year male that...,Chief Complaint:\nChest pain\n,Brief Hospital Course:\nTransferred from outsi...,Direct
1,701531,186454,40514,"51 yr old F with a history of 3V CAD, confirme...",# Acute on Chronic systolic CHF: Patient with ...,History of Present Illness:\n51 year male that...,Chief Complaint:\nChest pain\n,Brief Hospital Course:\nTransferred from outsi...,Indirect
2,701531,186454,40514,"51 yr old F with a history of 3V CAD, confirme...",# Social stressors: Recently lost mother to ca...,History of Present Illness:\n51 year male that...,Chief Complaint:\nChest pain\n,Brief Hospital Course:\nTransferred from outsi...,Neither
3,701531,186454,40514,"51 yr old F with a history of 3V CAD, confirme...","# FEN: Replete lytes PRN, NPO for CABG today\n...",History of Present Illness:\n51 year male that...,Chief Complaint:\nChest pain\n,Brief Hospital Course:\nTransferred from outsi...,Not Relevant
4,554204,196435,85490,"CHRONIC OBSTRUCTIVE PULMONARY DISEASE (COPD, B...",# Dyspnea: Differential in this patient would ...,History of Present Illness:\n57 y/o man with a...,Chief Complaint:\nDyspnea\n,Brief Hospital Course:\n57 y/o man with long s...,Direct


- Same for the dev set

In [54]:
dev_no_ds_cnt = 0
dev_additional_data = {name: [] for name in all_section_names}
for i, row in tqdm(df_dev.iterrows(), total=len(df_dev)):
    note = get_discharge_summary(row['HADM ID'])
    section_dict = extract_sections(note)
    for name, section in section_dict.items():
        dev_additional_data[name].append(section)
    dev_no_ds_cnt += 1 if not note else 0

  0%|          | 0/597 [00:00<?, ?it/s]

In [55]:
dev_no_ds_cnt

32

In [64]:
for name, sections in dev_additional_data.items():
    print(f'{name} ({sum([bool(x) for x in sections])})')
    
    word_cnt = [len(x.split()) for x in sections]
    print(f'- Word    : {np.mean(word_cnt):.4f}')
    
    token_cnt = [len(tokenizer.encode(x, add_special_tokens=False)) for x in sections]
    print(f'- Token   : {np.mean(token_cnt):.4f}')
    
    assessments = df_dev2['Assessment']
    rouge_scores = [scorer.score(assessments[i], x)['rouge1'].recall for i, x in enumerate(sections)]
    print(f'- Rouge1 R: {np.mean(rouge_scores):.4f}')
    
    leven_scores = [get_levenshtein_score(assessments[i].split(), x.split(), similarity=True)
                    for i, x in enumerate(sections)]
    print(f'- Leven   : {np.mean(leven_scores):.4f}')

present_history (565)
- Word    : 252.5126
- Token   : 426.6683
- Rouge1 R: 0.5464
- Leven   : 0.0508
chief_complaint (565)
- Word    : 5.0419
- Token   : 9.5812
- Rouge1 R: 0.0406
- Leven   : 0.0206
hospital_course (565)
- Word    : 559.2479
- Token   : 945.0352
- Rouge1 R: 0.6432
- Leven   : 0.0304


In [61]:
df_dev2 = df_dev.copy()
for i, (name, sections) in enumerate(dev_additional_data.items()):
    df_dev2.insert(loc=5 + i, column=name, value=sections)

In [62]:
print(len(df_dev2))
df_dev2.head()

597


Unnamed: 0,ROW ID,HADM ID,SUBJECT ID,Assessment,Plan Subsection,present_history,chief_complaint,hospital_course,Relation
0,575156,112508,91333,A 60 year old woman with recurrent ALL with CN...,"CNS VRE: S/P Omaya removal, on linezolid for ...",History of Present Illness:\n60-year-old woman...,Chief Complaint:\n10% blasts on routine differ...,Brief Hospital Course:\n60 YO F with recurrent...,Direct
1,575156,112508,91333,A 60 year old woman with recurrent ALL with CN...,LEUKOCYTOSIS: GCSF vs culture negative bacte...,History of Present Illness:\n60-year-old woman...,Chief Complaint:\n10% blasts on routine differ...,Brief Hospital Course:\n60 YO F with recurrent...,Direct
2,575156,112508,91333,A 60 year old woman with recurrent ALL with CN...,ALL: Currently day +43 s/p hyper-CVAD. She i...,History of Present Illness:\n60-year-old woman...,Chief Complaint:\n10% blasts on routine differ...,Brief Hospital Course:\n60 YO F with recurrent...,Indirect
3,575156,112508,91333,A 60 year old woman with recurrent ALL with CN...,DRUG RASH: Appears improved today. Evidence ...,History of Present Illness:\n60-year-old woman...,Chief Complaint:\n10% blasts on routine differ...,Brief Hospital Course:\n60 YO F with recurrent...,Indirect
4,575156,112508,91333,A 60 year old woman with recurrent ALL with CN...,ELEVATED LIVER ENZYMES: Consistent with shock...,History of Present Illness:\n60-year-old woman...,Chief Complaint:\n10% blasts on routine differ...,Brief Hospital Course:\n60 YO F with recurrent...,Neither


- Output to files

In [65]:
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
    
df_train2.to_csv(os.path.join(output_dir, 'train.csv'), index=False)
df_dev2.to_csv(os.path.join(output_dir, 'dev.csv'), index=False)

In [153]:
df_traindev2 = pd.concat([df_train2, df_dev2])
df_traindev2.to_csv(os.path.join(output_dir, 'traindev.csv'), index=False)

# Build the test dataset

- Provided file (no label)

In [146]:
df_test_raw = pd.read_csv(test_fpath)

In [147]:
df_test_raw.head(40)

Unnamed: 0,ROW ID,HADM ID,Assessment,Plan Subsection
0,560683,106899,Ms. [**Known lastname 3847**] is a 79F with a ...,Sepsis\n likely has sepsis physiology with hyp...
1,560683,106899,Ms. [**Known lastname 3847**] is a 79F with a ...,Anuria- Renal consulted for anuria- consideri...
2,560683,106899,Ms. [**Known lastname 3847**] is a 79F with a ...,R pleural effusion: Unilateral nature in the s...
3,560683,106899,Ms. [**Known lastname 3847**] is a 79F with a ...,DM/hypoglycemia: possibly from not clearing me...
4,560683,106899,Ms. [**Known lastname 3847**] is a 79F with a ...,S/p recent femur fracture: held SQH overnight
5,560683,106899,Ms. [**Known lastname 3847**] is a 79F with a ...,"Chronic anemia: stable at baseline, ?thalesemi..."
6,560683,106899,Ms. [**Known lastname 3847**] is a 79F with a ...,"# FEN: NG tube\n # PPx: SC hep, IV ppi, bowe..."
7,560683,194340,ssessment and Plan\n SEPSIS WITHOUT ORGAN DYSF...,#. Sepsis: Patient has re-developed fevers on ...
8,560683,194340,ssessment and Plan\n SEPSIS WITHOUT ORGAN DYSF...,#. Volume status/Chronic systolic HF- As evide...
9,560683,194340,ssessment and Plan\n SEPSIS WITHOUT ORGAN DYSF...,#. Peripheral arterial disease: will follow up...


- Append arbitrary labels so our code can deal with

In [75]:
# Append arbitrary labels
df_test_raw['Relation'] = ['Direct'] * len(df_test_raw)

In [76]:
df_test_raw.head()

Unnamed: 0,ROW ID,HADM ID,Assessment,Plan Subsection,Relation
0,560683,106899,Ms. [**Known lastname 3847**] is a 79F with a ...,Sepsis\n likely has sepsis physiology with hyp...,Direct
1,560683,106899,Ms. [**Known lastname 3847**] is a 79F with a ...,Anuria- Renal consulted for anuria- consideri...,Direct
2,560683,106899,Ms. [**Known lastname 3847**] is a 79F with a ...,R pleural effusion: Unilateral nature in the s...,Direct
3,560683,106899,Ms. [**Known lastname 3847**] is a 79F with a ...,DM/hypoglycemia: possibly from not clearing me...,Direct
4,560683,106899,Ms. [**Known lastname 3847**] is a 79F with a ...,S/p recent femur fracture: held SQH overnight,Direct


In [70]:
df_test_raw.to_csv(os.path.join(input_dir, 'test.csv'), index=False)

- Note augmentation

In [77]:
# Basic statistics
print(f'Total  : {len(df_test_raw)}')
print(f'ROW ID : {len(df_test_raw["ROW ID"].unique())}')
print(f'HADM ID: {len(df_test_raw["HADM ID"].unique())}')

Total  : 667
ROW ID : 1
HADM ID: 86


In [84]:
# Assert that HADM IDs are not leaked
assert len(set(df_test_raw["HADM ID"]) & set(df_train["HADM ID"])) == 0
assert len(set(df_test_raw["HADM ID"]) & set(df_dev["HADM ID"])) == 0

In [101]:
hadm_row_dict = {}
ds_hadm_id_set = set(df_ds.HADM_ID.unique())

for hadm_id in tqdm(df_test_raw["HADM ID"].unique()):
    df_notes2 = df_notes[df_notes.HADM_ID == hadm_id]
    df_ex = df_test_raw[df_test_raw["HADM ID"] == hadm_id]
    assert len(df_ex['Assessment'].unique()) == 1
    text_to_check = [row['Plan Subsection'] for _, row in df_ex.iterrows()] + [df_ex.iloc[0]['Assessment']]

    found_row_ids = []
    for i, row_note in df_notes2.iterrows():
        note = row_note.TEXT
        found = True
        for text in text_to_check:
            if text not in note:
                found = False
                break
        if found:
            found_row_ids.append(row_note.ROW_ID)
            
    if found_row_ids:
        hadm_row_dict[hadm_id] = found_row_ids
    else:
        raise ValueError()
        
    print(f'{hadm_id}: {"DS " + ("O" if hadm_id in ds_hadm_id_set else "X")} {found_row_ids}')

  0%|          | 0/86 [00:00<?, ?it/s]

106899: DS O [560683]
194340: DS O [388450]
192318: DS O [734294]
101316: DS O [590986]
187860: DS O [536764, 536812, 536813]
107891: DS O [496232]
107797: DS O [326076]
103490: DS O [575527]
196422: DS O [374147]
149247: DS O [372604]
194006: DS O [684855]
102155: DS O [325575]
102666: DS O [708981, 708945]
103060: DS O [485211, 485251]
101787: DS O [406946, 406936]
102942: DS O [456939]
195911: DS X [543730, 543669, 543674]
198668: DS O [716153]
101125: DS O [495225, 495430]
188033: DS O [532722]
186658: DS O [550561, 550559]
195702: DS O [390593, 390607, 390592, 390653]
107256: DS O [371368]
188969: DS O [338963]
187621: DS O [436236]
197588: DS O [585787]
196864: DS O [475581]
197368: DS O [674436, 674461]
190934: DS O [391789]
199574: DS O [393126]
110899: DS O [631611]
100469: DS O [412508]
112153: DS O [351946]
104726: DS O [541342]
106374: DS O [344441]
109415: DS O [586980, 586975]
103926: DS O [560387, 560388, 560321]
112342: DS O [516144, 516319]
104840: DS O [409149, 409154

In [137]:
test_no_ds_cnt = 0
test_additional_data = {name: [] for name in all_section_names}
for i, row in tqdm(df_test_raw.iterrows(), total=len(df_test_raw)):
    note = get_discharge_summary(row['HADM ID'])
    section_dict = extract_sections(note)
    for name, section in section_dict.items():
        test_additional_data[name].append(section)
    test_no_ds_cnt += 1 if not note else 0

  0%|          | 0/667 [00:00<?, ?it/s]

In [138]:
df_test2 = df_test_raw.copy()
row_ids = [hadm_sections_dict[row_ex['HADM ID']][0] for _, row_ex in df_test2.iterrows()]
subject_ids = [hadm_sections_dict[row_ex['HADM ID']][0] for _, row_ex in df_test2.iterrows()]
df_test2['ROW ID'] = row_ids
df_test2.insert(loc=2, column="SUBJECT ID", value=subject_ids)

for i, (name, sections) in enumerate(test_additional_data.items()):
    df_test2.insert(loc=5 + i, column=name, value=sections)

In [139]:
df_test2.head()

Unnamed: 0,ROW ID,HADM ID,SUBJECT ID,Assessment,Plan Subsection,present_history,chief_complaint,hospital_course,Relation
0,35704,106899,35704,Ms. [**Known lastname 3847**] is a 79F with a ...,Sepsis\n likely has sepsis physiology with hyp...,History of Present Illness:\nMs. [**Known last...,Chief Complaint:\nDehydration\n,Brief Hospital Course:\nMs. [**Known lastname ...,Direct
1,35704,106899,35704,Ms. [**Known lastname 3847**] is a 79F with a ...,Anuria- Renal consulted for anuria- consideri...,History of Present Illness:\nMs. [**Known last...,Chief Complaint:\nDehydration\n,Brief Hospital Course:\nMs. [**Known lastname ...,Direct
2,35704,106899,35704,Ms. [**Known lastname 3847**] is a 79F with a ...,R pleural effusion: Unilateral nature in the s...,History of Present Illness:\nMs. [**Known last...,Chief Complaint:\nDehydration\n,Brief Hospital Course:\nMs. [**Known lastname ...,Direct
3,35704,106899,35704,Ms. [**Known lastname 3847**] is a 79F with a ...,DM/hypoglycemia: possibly from not clearing me...,History of Present Illness:\nMs. [**Known last...,Chief Complaint:\nDehydration\n,Brief Hospital Course:\nMs. [**Known lastname ...,Direct
4,35704,106899,35704,Ms. [**Known lastname 3847**] is a 79F with a ...,S/p recent femur fracture: held SQH overnight,History of Present Illness:\nMs. [**Known last...,Chief Complaint:\nDehydration\n,Brief Hospital Course:\nMs. [**Known lastname ...,Direct


In [140]:
df_test2.to_csv(os.path.join(output_dir, 'test.csv'), index=False)

## Appendix: examples

In [174]:
df_train2

Unnamed: 0,ROW ID,HADM ID,SUBJECT ID,Assessment,Plan Subsection,present_history,chief_complaint,hospital_course,Relation
0,701531,186454,40514,"51 yr old F with a history of 3V CAD, confirme...",# CORONARIES: Patient with 3 vessel disease on...,History of Present Illness:\n51 year male that...,Chief Complaint:\nChest pain\n,Brief Hospital Course:\nTransferred from outsi...,Direct
1,701531,186454,40514,"51 yr old F with a history of 3V CAD, confirme...",# Acute on Chronic systolic CHF: Patient with ...,History of Present Illness:\n51 year male that...,Chief Complaint:\nChest pain\n,Brief Hospital Course:\nTransferred from outsi...,Indirect
2,701531,186454,40514,"51 yr old F with a history of 3V CAD, confirme...",# Social stressors: Recently lost mother to ca...,History of Present Illness:\n51 year male that...,Chief Complaint:\nChest pain\n,Brief Hospital Course:\nTransferred from outsi...,Neither
3,701531,186454,40514,"51 yr old F with a history of 3V CAD, confirme...","# FEN: Replete lytes PRN, NPO for CABG today\n...",History of Present Illness:\n51 year male that...,Chief Complaint:\nChest pain\n,Brief Hospital Course:\nTransferred from outsi...,Not Relevant
4,554204,196435,85490,"CHRONIC OBSTRUCTIVE PULMONARY DISEASE (COPD, B...",# Dyspnea: Differential in this patient would ...,History of Present Illness:\n57 y/o man with a...,Chief Complaint:\nDyspnea\n,Brief Hospital Course:\n57 y/o man with long s...,Direct
...,...,...,...,...,...,...,...,...,...
4628,548032,148906,55750,The patient is a 71 year old female with a his...,Polymyositis: On long term prednisone\n - co...,History of Present Illness:\n71 year old woman...,Chief Complaint:\nNSTEMI\n,Brief Hospital Course:\nPatient is a 71 yo F a...,Neither
4629,548032,148906,55750,The patient is a 71 year old female with a his...,Diabetes Type II: Cover with sliding scale\n .,History of Present Illness:\n71 year old woman...,Chief Complaint:\nNSTEMI\n,Brief Hospital Course:\nPatient is a 71 yo F a...,Indirect
4630,548032,148906,55750,The patient is a 71 year old female with a his...,COPD: Continue home medications\n .,History of Present Illness:\n71 year old woman...,Chief Complaint:\nNSTEMI\n,Brief Hospital Course:\nPatient is a 71 yo F a...,Indirect
4631,548032,148906,55750,The patient is a 71 year old female with a his...,"Anxiety: Continue home medications, but holdin...",History of Present Illness:\n71 year old woman...,Chief Complaint:\nNSTEMI\n,Brief Hospital Course:\nPatient is a 71 yo F a...,Neither


In [228]:
print(df_train2.iloc[123]['Assessment'])

Patient is a 58 year old male with history of atrial fibrillation on
   coumadin, hyperlipidemia, hypertension, peripheral vascular disease,
   s/p CVA who presented to [**Hospital3 847**] on [**2185-3-2**] with heart
   failure exacerbation and atrial fibrillation with RVR, transferred due
   to difficulty with diuresis [**2-7**] hypotension.


In [231]:
print(df_train2.iloc[123]['Plan Subsection'])

GERD
   - Continue [**Hospital1 **] PPI


In [232]:
print(df_train2.iloc[123]['chief_complaint'])

Chief Complaint:
Atrial fibrillation with RVR, CHF



In [229]:
print(df_train2.iloc[123]['present_history'])

History of Present Illness:
Mr. [**Known lastname 34850**] is a 58 year old male with history of atrial
fibrillation on coumadin, hyperlipidemia, hypertension,
peripheral vascular disease, s/p CVA who presented to [**Hospital 11373**] on [**2185-3-2**] with heart failure exacerbation and atrial
fibrillation with RVR. He presented with a one week history of
increased dyspnea on exertion, PND, 3 pillow orthopnea and
decreased exercise tolerance. Prior to presentation to the
hospital he was unable to transfer from his bed without marked
SOB. At [**Location (un) **], BNP was noted to be elevated and CXR showed
increased interstitial markings. He was seen by Cardiology
consult service who recommended rule out for MI, repeat echo,
beta blockers and calcium channel blockers. He was diuresed and
for his AF initially started on a diltiazem gtt and then
converted to Metoprolol 25mg q6H to treat his heart rate. At
some point during the hospitalization he was started on
amiodarone and cardioverted

In [233]:
print(df_train2.iloc[123]['hospital_course'])

Brief Hospital Course:
Patient is a 58 year old male with history of atrial
fibrillation on coumadin, hyperlipidemia, hypertension,
peripheral vascular disease, s/p CVA who presented to [**Hospital 11373**] on [**2185-3-2**] with heart failure exacerbation and atrial
fibrillation with RVR, transferred due to difficulty with
diuresis [**2-7**] hypotension.
.
# Pump: Appears euvolemic on admission creatinine normalized.
Weaned off oxygen. Denies SOB. Monitor sats on RA, with
ambulation.  Lasix held but will resume outpatient dose on d/c.
Cont lisinopril.
.
# Acute renal failure: Resolved.
.
# Atrial fibrillation: Currently in sinus rhythm s/p
cardioversion. INR 3.5 in setting of starting amiodarone.
Monitored on telemetry. Cont metoprolol and loading dose of
amiodarone 400mg PO BID with plans to wean in 10 days to 200 PO
daily. Held warfarin as INR supratherapeutic.
.
# Leukocytosis: Resolved. Afebrile, urine cx negative, no focal
evidence of infection on ROS, exam, CXR.
.
#. CAD: Report

## Appendix: token length for Clinical Longformer

In [176]:
tokenizer_lf = AutoTokenizer.from_pretrained("yikuan8/Clinical-Longformer")

In [180]:
def get_token_len(text):
    text = text.lower()
    text = re.sub(' +', ' ', text)
    return len(tokenizer_lf.encode(text, add_special_tokens=False))

In [186]:
df_train2_lens = {}
for col in ['Assessment', 'Plan Subsection', 'present_history', 'chief_complaint', 'hospital_course']:
    df_train2_lens[col] = df_train2[col].map(get_token_len)
df_train2_lens = pd.DataFrame(df_train2_lens)

In [188]:
df_train2_lens.describe()

Unnamed: 0,Assessment,Plan Subsection,present_history,chief_complaint,hospital_course
count,4633.0,4633.0,4633.0,4633.0,4633.0
mean,62.723721,80.531837,453.994172,12.774013,887.287719
std,32.771466,68.016243,207.14504,29.314713,550.330935
min,13.0,2.0,0.0,0.0,0.0
25%,41.0,30.0,329.0,8.0,527.0
50%,56.0,60.0,434.0,10.0,803.0
75%,76.0,113.0,566.0,13.0,1140.0
max,231.0,565.0,1562.0,855.0,3871.0


In [221]:
train_lens_trim = [
    df_train2_lens['Assessment'].map(lambda x: min(x, 150)),
    df_train2_lens['Plan Subsection'].map(lambda x: min(x, 250)),
    df_train2_lens['chief_complaint'],
    df_train2_lens['hospital_course'].map(lambda x: min(x, 1240)),
    df_train2_lens['present_history'].map(lambda x: min(x, 590)),
]
pd.Series(np.array(train_lens_trim).sum(axis=0)).describe()

count    4633.000000
mean     1366.709260
std       433.083707
min        31.000000
25%      1061.000000
50%      1405.000000
75%      1706.000000
max      2251.000000
dtype: float64

In [219]:
np.percentile(df_train2_lens['Assessment'], 95)

130.0

In [220]:
np.percentile(df_train2_lens['Plan Subsection'], 95)

216.0

In [215]:
np.percentile(df_train2_lens['present_history'], 80)

590.0

In [216]:
np.percentile(df_train2_lens['hospital_course'], 80)

1240.0

In [189]:
df_dev2_lens = {}
for col in ['Assessment', 'Plan Subsection', 'present_history', 'chief_complaint', 'hospital_course']:
    df_dev2_lens[col] = df_dev2[col].map(get_token_len)
df_dev2_lens = pd.DataFrame(df_dev2_lens)

In [190]:
df_dev2_lens.describe()

Unnamed: 0,Assessment,Plan Subsection,present_history,chief_complaint,hospital_course
count,597.0,597.0,597.0,597.0,597.0
mean,69.293132,75.122278,422.137353,10.832496,945.231156
std,42.793905,72.869679,222.83412,5.002893,618.727846
min,11.0,4.0,0.0,0.0,0.0
25%,40.0,26.0,321.0,8.0,507.0
50%,59.0,54.0,409.0,10.0,881.0
75%,80.0,101.0,522.0,14.0,1237.0
max,226.0,896.0,1379.0,26.0,3960.0
