In [1]:
import csv
import re
import sys
import os
from collections import Counter, defaultdict
import random
import shutil

In [2]:
# Flags
use_header_abbr = True
use_cui_less = True

In [3]:
# Paths
sc13t2_dir = '../data/sc13t2/'

train_note_dir = os.path.join(sc13t2_dir, 'Task1TrainSetCorpus199/ALLREPORTS')
train_anno_dir = os.path.join(sc13t2_dir, 'Task2TrainSetSILVER2pipe-cui_and_label')
train_output_fpath = os.path.join(sc13t2_dir, 'train.tsv')
train_output_fpath2 = os.path.join(sc13t2_dir, 'train_test.tsv')
train_label_fpath = os.path.join(sc13t2_dir, 'train_cui_label.tsv')

test_note_dir = os.path.join(sc13t2_dir, 'Task1TestSetCorpus100/ALLREPORTS')
test_anno_dir = os.path.join(sc13t2_dir, 'Task2ReferenceStd_CLEFShARe2013Test_StrictAndLenientpipe')
test_output_fpath = os.path.join(sc13t2_dir, 'test.tsv')
test_label_fpath = os.path.join(sc13t2_dir, 'test_cui_label.tsv')

cui_label_fpath = os.path.join(sc13t2_dir, 'cui_label.tsv')

lrabr_fpath = '../data/LRABR'

## Preparation

1. Please download the following files at https://physionet.org/content/shareclefehealth2013/1.0/
and place it under `data`:
    - Task1TrainSetCorpus199.zip
    - Task1TestSetCorpus100.zip
    - Task2TrainSetSILVER2pipe-cui_and_label.zip
    - Task2CLEFShARe2013Test_StrictAndLenientpipe.zip


2. Download `LRABR` of the SPECIALIST Lexicon from the [LSG website](https://lhncbc.nlm.nih.gov/LSG/Projects/lexicon/current/web/release/) (2018AB version) and put under `data`.


3. Install MetaMap (2018AB) version from [Installation guide](https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/documentation/Installation.html) and [Download page](https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/run-locally/MainDownload.html)

In [4]:
# Extract files
!mkdir -p ../data/sc13t2/Task1TrainSetCorpus199
!mkdir -p ../data/sc13t2/Task1TestSetCorpus100

!unzip -o ../data/Task1TrainSetCorpus199.zip -d ../data/sc13t2/Task1TrainSetCorpus199
!unzip -o ../data/Task1TestSetCorpus100.zip -d ../data/sc13t2/Task1TestSetCorpus100
!unzip -o ../data/Task2TrainSetSILVER2pipe-cui_and_label.zip -d ../data/sc13t2
!unzip -o ../data/Task2CLEFShARe2013Test_StrictAndLenientpipe.zip -d ../data/sc13t2

Archive:  ../data/Task1TrainSetCorpus199.zip
  inflating: ../data/sc13t2/Task1TrainSetCorpus199/ALLREPORTS/00098-016139-DISCHARGE_SUMMARY.txt  
  inflating: ../data/sc13t2/Task1TrainSetCorpus199/ALLREPORTS/00211-027889-DISCHARGE_SUMMARY.txt  
  inflating: ../data/sc13t2/Task1TrainSetCorpus199/ALLREPORTS/00414-104513-ECHO_REPORT.txt  
  inflating: ../data/sc13t2/Task1TrainSetCorpus199/ALLREPORTS/00500-097836-ECHO_REPORT.txt  
  inflating: ../data/sc13t2/Task1TrainSetCorpus199/ALLREPORTS/00587-400001-RADIOLOGY_REPORT.txt  
  inflating: ../data/sc13t2/Task1TrainSetCorpus199/ALLREPORTS/01114-083601-ECG_REPORT.txt  
  inflating: ../data/sc13t2/Task1TrainSetCorpus199/ALLREPORTS/01234-029456-DISCHARGE_SUMMARY.txt  
  inflating: ../data/sc13t2/Task1TrainSetCorpus199/ALLREPORTS/01314-028800-DISCHARGE_SUMMARY.txt  
  inflating: ../data/sc13t2/Task1TrainSetCorpus199/ALLREPORTS/01427-342648-RADIOLOGY_REPORT.txt  
  inflating: ../data/sc13t2/Task1TrainSetCorpus199/ALLREPORTS/01455-067052-

Archive:  ../data/Task1TestSetCorpus100.zip
  inflating: ../data/sc13t2/Task1TestSetCorpus100/ALLREPORTS/00176-102920-ECHO_REPORT.txt  
  inflating: ../data/sc13t2/Task1TestSetCorpus100/ALLREPORTS/00381-006281-DISCHARGE_SUMMARY.txt  
  inflating: ../data/sc13t2/Task1TestSetCorpus100/ALLREPORTS/00534-017453-DISCHARGE_SUMMARY.txt  
  inflating: ../data/sc13t2/Task1TestSetCorpus100/ALLREPORTS/00534-100076-ECHO_REPORT.txt  
  inflating: ../data/sc13t2/Task1TestSetCorpus100/ALLREPORTS/01160-000945-DISCHARGE_SUMMARY.txt  
  inflating: ../data/sc13t2/Task1TestSetCorpus100/ALLREPORTS/01163-001840-DISCHARGE_SUMMARY.txt  
  inflating: ../data/sc13t2/Task1TestSetCorpus100/ALLREPORTS/01222-104065-ECHO_REPORT.txt  
  inflating: ../data/sc13t2/Task1TestSetCorpus100/ALLREPORTS/02740-024700-DISCHARGE_SUMMARY.txt  
  inflating: ../data/sc13t2/Task1TestSetCorpus100/ALLREPORTS/03087-026480-DISCHARGE_SUMMARY.txt  
  inflating: ../data/sc13t2/Task1TestSetCorpus100/ALLREPORTS/03298-014440-DISCHARG

Archive:  ../data/Task2TrainSetSILVER2pipe-cui_and_label.zip
  inflating: ../data/sc13t2/Task2TrainSetSILVER2pipe-cui_and_label/00098-016139-DISCHARGE_SUMMARY.pipe.txt  
  inflating: ../data/sc13t2/Task2TrainSetSILVER2pipe-cui_and_label/00211-027889-DISCHARGE_SUMMARY.pipe.txt  
  inflating: ../data/sc13t2/Task2TrainSetSILVER2pipe-cui_and_label/00414-104513-ECHO_REPORT.pipe.txt  
  inflating: ../data/sc13t2/Task2TrainSetSILVER2pipe-cui_and_label/00500-097836-ECHO_REPORT.pipe.txt  
  inflating: ../data/sc13t2/Task2TrainSetSILVER2pipe-cui_and_label/00587-400001-RADIOLOGY_REPORT.pipe.txt  
  inflating: ../data/sc13t2/Task2TrainSetSILVER2pipe-cui_and_label/01114-083601-ECG_REPORT.pipe.txt  
  inflating: ../data/sc13t2/Task2TrainSetSILVER2pipe-cui_and_label/01234-029456-DISCHARGE_SUMMARY.pipe.txt  
  inflating: ../data/sc13t2/Task2TrainSetSILVER2pipe-cui_and_label/01314-028800-DISCHARGE_SUMMARY.pipe.txt  
  inflating: ../data/sc13t2/Task2TrainSetSILVER2pipe-cui_and_label/01427-34264

Archive:  ../data/Task2CLEFShARe2013Test_StrictAndLenientpipe.zip
  inflating: ../data/sc13t2/Task2ReferenceStd_CLEFShARe2013Test_StrictAndLenientpipe/00176-102920-ECHO_REPORT.pipe.txt  
  inflating: ../data/sc13t2/Task2ReferenceStd_CLEFShARe2013Test_StrictAndLenientpipe/00381-006281-DISCHARGE_SUMMARY.pipe.txt  
  inflating: ../data/sc13t2/Task2ReferenceStd_CLEFShARe2013Test_StrictAndLenientpipe/00534-017453-DISCHARGE_SUMMARY.pipe.txt  
  inflating: ../data/sc13t2/Task2ReferenceStd_CLEFShARe2013Test_StrictAndLenientpipe/00534-100076-ECHO_REPORT.pipe.txt  
  inflating: ../data/sc13t2/Task2ReferenceStd_CLEFShARe2013Test_StrictAndLenientpipe/01160-000945-DISCHARGE_SUMMARY.pipe.txt  
  inflating: ../data/sc13t2/Task2ReferenceStd_CLEFShARe2013Test_StrictAndLenientpipe/01163-001840-DISCHARGE_SUMMARY.pipe.txt  
  inflating: ../data/sc13t2/Task2ReferenceStd_CLEFShARe2013Test_StrictAndLenientpipe/01222-104065-ECHO_REPORT.pipe.txt  
  inflating: ../data/sc13t2/Task2ReferenceStd_CLEFShARe

## Read notes & annotations

In [5]:
# Parse single line of annotation
def parse_annotation_line(l, training=True):
    temp = l.split('||')
    assert temp[1] == 'Acronym_Abbreviation'
    if training:
        if temp[2] == 'CUI-less':
#             label = None
            label = ('CUI-less', 'CUI-less')
        else:
            # label = (temp[2][6:14], temp[2][26:b-1])
            cui = temp[2][temp[2].index('[')+1:temp[2].index(']')]
            name = temp[2][temp[2].rindex('[')+1:temp[2].rindex(']')]
            label = (cui, name)
    else:
#         label = (temp[2] if temp[2] != 'CUI-less' else None,
#                  temp[5] if temp[5] != 'CUI-less' else None)
        label = (temp[2], temp[5])
    result = ((int(temp[3]), int(temp[4])), label)
    return result

In [6]:
# Normalize a context of one side (left or right)
def trim_context(text, left=True, max_words=128):
    text = re.sub(r'\|{2,}', '|', text)  # Remove multiple '|' into one '|'
    text = re.sub(r'\t{1,}', ' ', text)  # Remove \t
    text = re.sub(r'_{2,}', '', text)    # Remove ____
    text = re.sub(r'\n{1,}', ' ', text)  # Remove multiple linebreak
    text = re.sub(r' {2,}', ' ', text)   # Remove multiple space
    text = text.strip()
    # Include up to max_words tokens
    tokens = text.split(' ')
    cnt, temp, anon_flag = 0, [], False
    while True:
        t = tokens[cnt] if (not left) else tokens[-(cnt+1)]
        if ('[**' if (not left) else '**]') in t:
            anon_flag = True
        if ('**]' if (not left) else '[**') in t:
            anon_flag = False

        if not left:
            temp.append(t)
        else:
            temp.insert(0, t)

        cnt += 1
        if ((not anon_flag) and (cnt >= max_words)) or (cnt == len(tokens)):
            break

    return ' '.join(temp)

In [7]:
# Parse annotations, read corresponding notes, and extract examples
def get_examples_and_cui_labels(note_dir, anno_dir, training=True):
    examples = []  # list of examples (abbr, left, right, cui)
    cui_labels_dict = {}  # dict of cui: list(label)
    
    anno_fnames = [fname for fname in os.listdir(anno_dir) if fname.endswith('.txt')]
    anno_fnames.sort()
    print(f'{anno_dir} -> {len(anno_fnames)} annotation files found')
    
    for fname in anno_fnames:
        with open(os.path.join(anno_dir, fname), 'r') as fd:
            anno_lines = [l.strip() for l in fd.readlines() if (l and l.strip())]
            if not anno_lines:
                continue
            annos = [parse_annotation_line(l, training=training) for l in anno_lines]

        note_fname = fname[:fname.index('.')] + '.txt'
        with open(os.path.join(note_dir, note_fname), 'r', encoding='iso8859_2') as fd:
            text = fd.read()

        anno_cnt_temp = 0
        first_lb = text.index('\n')  # the position of first linebreak. the first line will not be used.
        for (s, e), label in annos:
            if (not use_cui_less) and (label is None or label[0] is None or label[0] == 'CUI-less'):  # Skip if there's no CUI assigned for the abbr
                continue
            if (not use_header_abbr) and e <= first_lb:  # Skip if we don't want to use abbrs in the header
                continue

            if label[0] in cui_labels_dict:
                cui_labels_dict[label[0]].append(label[1])
            else:
                cui_labels_dict[label[0]] = [label[1]]

            abbr, left, right = text[s:e], text[:s], text[e:-1]
            abbr, left, right = abbr.strip(), left.strip(), right.strip()
            left = trim_context(left, True)
            right = trim_context(right, False)
            cui = label[0]
            examples.append((abbr, left, right, cui))
            anno_cnt_temp += 1
        print(f'\t{fname}: {anno_cnt_temp}')
    return examples, cui_labels_dict

In [8]:
train_examples, train_cui_labels_dict = get_examples_and_cui_labels(train_note_dir, train_anno_dir, training=True)
print(f'{len(train_examples)} training examples found')

../data/sc13t2/Task2TrainSetSILVER2pipe-cui_and_label -> 200 annotation files found
	00098-016139-DISCHARGE_SUMMARY.pipe.txt: 12
	00211-027889-DISCHARGE_SUMMARY.pipe.txt: 86
	00414-104513-ECHO_REPORT.pipe.txt: 13
	00500-097836-ECHO_REPORT.pipe.txt: 20
	00587-400001-RADIOLOGY_REPORT.pipe.txt: 6
	01114-083601-ECG_REPORT.pipe.txt: 1
	01234-029456-DISCHARGE_SUMMARY.pipe.txt: 103
	01314-028800-DISCHARGE_SUMMARY.pipe.txt: 82
	01427-342648-RADIOLOGY_REPORT.pipe.txt: 10
	01455-067052-ECG_REPORT.pipe.txt: 1
	01487-290421-RADIOLOGY_REPORT.pipe.txt: 24
	01982-060190-ECG_REPORT.pipe.txt: 1
	02034-037300-ECG_REPORT.pipe.txt: 1
	02115-010823-DISCHARGE_SUMMARY.pipe.txt: 77
	02136-017465-DISCHARGE_SUMMARY.pipe.txt: 1
	02405-069810-ECG_REPORT.pipe.txt: 1
	02410-026171-DISCHARGE_SUMMARY.pipe.txt: 22
	02652-006395-DISCHARGE_SUMMARY.pipe.txt: 27
	02916-100844-ECHO_REPORT.pipe.txt: 31
	03066-084521-ECG_REPORT.pipe.txt: 1
	03089-097913-ECHO_REPORT.pipe.txt: 26
	03273-009330-DISCHARGE_SUMMARY.pipe.txt: 5
	03

In [9]:
# Print some training data
print(train_examples[0], end='\n\n')
print(train_examples[10], end='\n\n')
print(train_examples[200], end='\n\n')
print(train_examples[3000], end='\n\n')
print(train_examples[-1])

('F', '16139 | 98 | 15836 | DISCHARGE_SUMMARY | 2015-03-24 00:00:00.0 | | | | Admission Date: [**2015-03-17**] Discharge Date: [**2015-03-24**] Date of Birth: [**1974-10-03**] Sex:', 'Service: Neurosurgery HISTORY OF PRESENT ILLNESS: The patient is a 40-year-old female with complaints of headache and dizziness. In [**2015-01-14**], the patient had headache with neck stiffness and was unable to walk for 45 minutes. The patient also had a similar episode a year and a half ago where she had inability to walk without pain. She had a headache at that time which was relieved with Tylenol. PAST MEDICAL HISTORY: Hypothyroidism. ALLERGIES: Penicillin and Bactrim which causes a rash. MEDICATIONS: Levoxyl 1.75 mg. PHYSICAL EXAMINATION: On physical examination, her blood pressure was 104/73, pulse 79. In general, she was a woman in no acute distress. HEENT: Nonicteric. Pupils are equal, round, and reactive to light. Extraocular movements are full. Pharynx is benign. Tongue midline. Neck is supple.

In [10]:
test_examples, test_cui_labels_dict = get_examples_and_cui_labels(test_note_dir, test_anno_dir, training=False)
print(f'{len(test_examples)} test examples found')

../data/sc13t2/Task2ReferenceStd_CLEFShARe2013Test_StrictAndLenientpipe -> 99 annotation files found
	00176-102920-ECHO_REPORT.pipe.txt: 9
	00381-006281-DISCHARGE_SUMMARY.pipe.txt: 31
	00534-017453-DISCHARGE_SUMMARY.pipe.txt: 1
	00534-100076-ECHO_REPORT.pipe.txt: 9
	01160-000945-DISCHARGE_SUMMARY.pipe.txt: 7
	01163-001840-DISCHARGE_SUMMARY.pipe.txt: 115
	01222-104065-ECHO_REPORT.pipe.txt: 7
	02740-024700-DISCHARGE_SUMMARY.pipe.txt: 8
	03087-026480-DISCHARGE_SUMMARY.pipe.txt: 26
	03298-014440-DISCHARGE_SUMMARY.pipe.txt: 9
	03628-023268-DISCHARGE_SUMMARY.pipe.txt: 10
	03835-028462-DISCHARGE_SUMMARY.pipe.txt: 9
	04082-167766-RADIOLOGY_REPORT.pipe.txt: 18
	04525-003099-DISCHARGE_SUMMARY.pipe.txt: 114
	04882-004677-DISCHARGE_SUMMARY.pipe.txt: 68
	04995-028156-DISCHARGE_SUMMARY.pipe.txt: 37
	05065-011493-DISCHARGE_SUMMARY.pipe.txt: 112
	05163-019624-DISCHARGE_SUMMARY.pipe.txt: 3
	05382-010331-DISCHARGE_SUMMARY.pipe.txt: 124
	05837-000274-DISCHARGE_SUMMARY.pipe.txt: 14
	06134-005003-DISCHARGE

In [11]:
# Print some test data
print(test_examples[0], end='\n\n')
print(test_examples[10], end='\n\n')
print(test_examples[200], end='\n\n')
print(test_examples[3000], end='\n\n')
print(test_examples[-1])

('ECHO', '102920 | 176 | 2167 |', '_REPORT | 2013-09-23 00:00:00.0 | | PATIENT/TEST INFORMATION: Indication: Pericardial effusion. Height: (in) 68 Weight (lb): 184 BSA (m2): 1.97 m2 BP (mm Hg): 140/80 HR (bpm): 70 Status: Inpatient Date/Time: [**2013-09-23**] at 10:20 Test: TTE (Focused views) Doppler: No doppler Contrast: None Technical Quality: Adequate | | INTERPRETATION: Findings: LEFT ATRIUM: The left atrium is moderately dilated. The left atrium is elongated. RIGHT ATRIUM/INTERATRIAL SEPTUM: The right atrium is moderately dilated. LEFT VENTRICLE: There is severe symmetric left ventricular hypertrophy. The left ventricular cavity size is normal. Overall left ventricular systolic function is low normal (LVEF 50-55%). LV WALL MOTION: The following resting regional left ventricular wall motion abnormalities are seen: basal inferoseptal - hypokinetic; mid inferoseptal - hypokinetic; basal inferior - hypokinetic; mid inferior - hypokinetic;', 'C0013516')

('GI', '6281 | 381 | 17672 | D

## Abbreviation Normalization

In [12]:
# Get some statistics of training abbrs
abbr_var_set = set(list(zip(*train_examples))[0])
print(f'{len(abbr_var_set)} training abbr variations\n')
print(sorted(abbr_var_set))

843 training abbr variations

['+EtOH', '02', '02 sat', '16Fr', '18Fr', '2D', '2u', '3VD', '3vd', '5', '5u', 'A', 'A&Ox3', 'A+O times 3', 'A-GRAM', 'A-V', 'A-comm', 'A. fib', 'A/O x3', 'A/P', 'AAA', 'AAO', 'AAO x3', 'AAOx3', 'ABD', 'ABD/PEL', 'ABG', 'AC', 'ACE', 'ACLS', 'ACOVE', 'ADAT', "ADD'L", 'ADLs', 'AFB', 'AFIB', 'AFib', 'AH', 'AHA', 'AI', 'AICD', 'AML', 'AMLs', 'ANGIO', 'AO', 'AOx3', 'AP', 'APC', 'AR', 'ARDS', 'AS', 'ASA', 'ASCTES', 'ASD', 'AT', 'ATN', 'AV', 'AVF', 'AVN', 'AVR', 'Abd', 'Abd U/S', 'Abxs', 'Ag', 'Amt', 'Angio', 'AoVA', 'B', 'B/L', 'B12', 'BAL', 'BCXR', 'BID', 'BIPAP', 'BL', 'BM', 'BP', 'BPH', 'BPs', 'BRAT', 'BRBPR', 'BS', 'BSA', 'BUN', 'BiPAP', 'Bili', 'Bp', 'Bs', 'C', 'C-SPINE', 'C-Spine', 'C-section', 'C-spine CT', 'C. Diff', 'C/D/I', 'C/o', 'C2', 'C3', 'CA', 'CABG', 'CAD', 'CAP', 'CARD', 'CAT', 'CATH', 'CCA', 'CCU', 'CCY', 'CD', 'CD4', 'CE', 'CEA', 'CECT', 'CFA', 'CFV', 'CHF', 'CHI', 'CIWA', 'CKs', 'CMO', 'CN', 'COPD', 'CP', 'CPAP', 'CPB', 'CPP', 'CPT', 'CRP', '

In [13]:
# Get some statistics of test abbrs
abbr_var_set = Counter(list(zip(*test_examples))[0])
print(f'{len(abbr_var_set)} test abbr variations\n')
print(sorted(abbr_var_set))

871 test abbr variations

['02', '02 sat', '2', '2D', '3U', '3VD', '5V', '5VCABG', '60Amp', '6MP', 'A', 'A and Ox3', 'A fib', 'A&O X 3', 'A&Ox2', 'A&Ox3', 'A+O x 3', 'A-Fib', 'A-V', 'A/O x3', 'A/P', 'AA', 'AAA', 'AAOx3', 'ABD', 'ABG', 'AC', 'ACE', 'ACEI', 'ACLS', 'ACS', 'AD', 'ADF', 'ADL', 'ADLs', 'AF', 'AFB', 'AFIB', 'AFib', 'AI', 'AICD', 'AIDS', 'ALT', 'AMA', 'ANC', 'AO', 'AO x 3', 'AOCD', 'AP', 'APD', 'APF', 'AR', 'ARDS', 'ARF', 'AS', 'ASA', 'ASD', 'AST', 'AT', 'ATNC', 'AV', 'AV-block', 'AVG', 'AVR', 'Ab', 'Abd', 'Abx', 'Afib', 'Agap', 'Alc', 'B', 'B-blockers', 'B/L', 'B12', 'BB', 'BG', 'BGs', 'BIBA', 'BID', 'BIPAP', 'BKA', 'BL', 'BLE', 'BNP', 'BODMC', 'BP', 'BPH', 'BRBPR', 'BS', 'BSA', 'BSO', 'BUN', 'Barb', 'Bicarb', 'Bilat', 'C', 'C-SPINE', 'C-spine', 'C2', 'C3-4', 'C4-5', 'C5-6', 'C6-7', 'CA', 'CABG', 'CAD', 'CATH', 'CBC', 'CC', 'CCE', 'CCU', 'CCY', 'CDDP', 'CE', 'CEA', 'CENT', 'CHB', 'CHF', 'CI', 'CICU', 'CK', 'CKD', 'CKMB', 'CKs', 'CME', 'CMO', 'CMV', 'CN', 'CNS', 'CO', 'CO2', 

In [14]:
# Get special symbols
special_symbols = set()
for abbr in list(zip(*train_examples))[0] + list(zip(*test_examples))[0]:
    for c in abbr:
        if not c.isalnum():
            special_symbols.add(c)
special_symbols

{' ', '#', '&', "'", '+', '-', '.', '/', '>'}

In [15]:
# regex_sym = re.compile('(\+/-)|[-_\.]')
# regex_sym = re.compile('(\+/-)|[-_\.\\+]')
regex_sym = re.compile('[-_\.\\+/&#> \']')
# regex_sym = re.compile('[-_\.\+/&#> \']')
# regex_sym = re.compile('[-_\.\\+/&#> \'\(\)\[\]\:\%\!\,\*]')
def normalize_abbr_variation(abbr):
    abbr = abbr.strip()
    abbr = regex_sym.sub('', abbr)
    abbr = abbr.lower()
    return abbr

abbr_set = set(list(zip(*train_examples))[0])
# abbr_set = set(list(zip(*test_examples))[0])
abbr_group_dict = {}
group_abbr_dict = {}
for abbr in abbr_set:
    abbr_norm = normalize_abbr_variation(abbr)
    if abbr_norm in abbr_group_dict:
        abbr_group_dict[abbr_norm].append(abbr)
    else:
        abbr_group_dict[abbr_norm] = [abbr]
    group_abbr_dict[abbr] = abbr_norm
        
print(f'{len(abbr_set)} abbrs -> {len(abbr_group_dict)} abbr groups\n')

for abbr_group in sorted(abbr_group_dict.keys()):
    print(f"{abbr_group:15}: {', '.join(['[%s]'%abbr for abbr in abbr_group_dict[abbr_group]])}")

843 abbrs -> 670 abbr groups

02             : [02]
02sat          : [02 sat]
16fr           : [16Fr]
18fr           : [18Fr]
2d             : [2D]
2u             : [2u]
3vd            : [3vd], [3VD]
5              : [5]
5u             : [5u]
a              : [A]
aaa            : [AAA]
aao            : [AAO]
aaox3          : [AAOx3], [AAO x3]
abd            : [Abd], [ABD], [abd]
abdpel         : [ABD/PEL]
abdus          : [Abd U/S]
abg            : [ABG]
abx            : [abx]
abxs           : [Abxs]
ac             : [AC]
ace            : [ACE]
acls           : [ACLS]
acomm          : [A-comm]
acove          : [ACOVE]
adat           : [ADAT]
addl           : [ADD'L]
adls           : [ADLs]
afb            : [AFB]
afib           : [AFib], [AFIB], [afib], [A. fib]
ag             : [Ag]
agram          : [A-GRAM]
ah             : [AH]
aha            : [AHA]
ai             : [AI]
aicd           : [aicd], [AICD]
alkphos        : [alk phos]
aml            : [AML]
amls           : [AMLs]
amt   

## Pseudonymization

In [16]:
texts = []
for d in train_examples + test_examples:
    texts.extend([d[1], d[2]])

In [17]:
temp_indir = os.path.join(sc13t2_dir, 'temp/')
temp_infpath = os.path.join(temp_indir, 'temp.txt')
temp_outdir = os.path.join(sc13t2_dir, 'temp_pseudonym/')
temp_outfpath = os.path.join(temp_outdir, 'temp.txt')

if not os.path.exists(temp_indir):
    os.makedirs(temp_indir)
with open(temp_infpath, 'w') as fd:
    fd.write('\n'.join(texts))
if os.path.exists(temp_outdir):
    shutil.rmtree(temp_outdir)

In [18]:
# pip install requests joblib sqlalchemy gensim
!python mimic-tools/main.py REPLACE \
        --input-dir {temp_indir} \
        --output-dir {temp_outdir} \
        --list-dir mimic-tools/lists

2022-04-19 06:12:07,251 Starting placeholder replacing
2022-04-19 06:12:07,251 Loading lists
2022-04-19 06:12:07,278 * Postal addresses: 20000 [656C Newport Court Coatesville, PA 19320 ...]
2022-04-19 06:12:07,537 * Last names: 88799 [SMITH, JOHNSON, WILLIAMS, JONES, BROWN ...]
2022-04-19 06:12:07,541 * Male first names: 1219 [JAMES, JOHN, ROBERT, MICHAEL, WILLIAM ...]
2022-04-19 06:12:07,553 * Female first names: 4275 [MARY, PATRICIA, LINDA, BARBARA, ELIZABETH ...]
2022-04-19 06:12:07,579 * Phone numbers: 20000 [(666) 372-7835, (923) 739-2644 ...]
2022-04-19 06:12:07,605 * Companies: 20000 [Ligula Aenean Gravida Ltd, Non Bibendum Sed LLC ...]
2022-04-19 06:12:07,606 * Countries: 264 [Afghanistan, Albania, Algeria, American Samoa ...]
2022-04-19 06:12:07,634 * Emails: 20000 [enim.Suspendisse.aliquet@Crasdictum.com, sapien.Cras.dolor@Curabitur.org ...]
2022-04-19 06:12:07,634 * Holiday names: 187 [Administrative Professionals Day, Air Force Birthday ...]
2022-04-19 06:12:07,641 * Hospit

In [19]:
with open(temp_outfpath, 'r') as fd:
    texts_pseudo = [l.strip() for l in fd.readlines()]
    
cnt = 0
for i, d in enumerate(train_examples):
    new_d = (d[0], texts_pseudo[cnt], texts_pseudo[cnt+1], d[3])
    train_examples[i] = new_d
    cnt += 2
    
for i, d in enumerate(test_examples):
    new_d = (d[0], texts_pseudo[cnt], texts_pseudo[cnt+1], d[3])
    test_examples[i] = new_d
    cnt += 2

In [20]:
# Print some test data again
print(test_examples[0], end='\n\n')
print(test_examples[10], end='\n\n')
print(test_examples[200], end='\n\n')
print(test_examples[3000], end='\n\n')
print(test_examples[-1])

('ECHO', '102920 | 176 | 2167 |', '_REPORT | 2013-09-23 00:00:00.0 | | PATIENT/TEST INFORMATION: Indication: Pericardial effusion. Height: (in) 68 Weight (lb): 184 BSA (m2): 1.97 m2 BP (mm Hg): 140/80 HR (bpm): 70 Status: Inpatient Date/Time: 2013-09-23 at 10:20 Test: TTE (Focused views) Doppler: No doppler Contrast: None Technical Quality: Adequate | | INTERPRETATION: Findings: LEFT ATRIUM: The left atrium is moderately dilated. The left atrium is elongated. RIGHT ATRIUM/INTERATRIAL SEPTUM: The right atrium is moderately dilated. LEFT VENTRICLE: There is severe symmetric left ventricular hypertrophy. The left ventricular cavity size is normal. Overall left ventricular systolic function is low normal (LVEF 50-55%). LV WALL MOTION: The following resting regional left ventricular wall motion abnormalities are seen: basal inferoseptal - hypokinetic; mid inferoseptal - hypokinetic; basal inferior - hypokinetic; mid inferior - hypokinetic;', 'C0013516')

('GI', '6281 | 381 | 17672 | DISCHAR

## Load UMLS specialist data

In [21]:
umls_eui_name_dict = defaultdict(list)  # EUI -> names (list)
umls_abbr_expansion_dict = defaultdict(list)  # Abbr EUI -> Expansion EUI (list)
umls_abbr_eui_dict = defaultdict(list)  # Abbr -> EUI (list)

with open(lrabr_fpath, 'r', encoding='utf-8') as fd:
    reader = csv.reader(fd, delimiter='|', quotechar=None)
    for i, l in enumerate(reader):
        abbr_eui, abbr, abbr_type, label_eui, label = l[:5]
        if label_eui and abbr_eui:
            if label_eui not in umls_abbr_expansion_dict[abbr_eui]:
                umls_abbr_expansion_dict[abbr_eui].append(label_eui)

        if abbr not in umls_eui_name_dict[abbr_eui]:
            umls_eui_name_dict[abbr_eui].append(abbr)
        if label not in umls_eui_name_dict[label_eui]:
            umls_eui_name_dict[label_eui].append(label)
            
        if abbr_eui not in umls_abbr_eui_dict[abbr]:
            umls_abbr_eui_dict[abbr].append(abbr_eui)
            
print(f'Read {i+1} LRABR records')
print(f'{len(umls_eui_name_dict)} EUIs')
print(f'{len(umls_abbr_eui_dict)} abbrs')
print(f'{len(umls_abbr_expansion_dict)} abbrs')

Read 289314 LRABR records
114770 EUIs
70336 abbrs
40539 abbrs


In [22]:
# Show some EUI-name mappings
list(umls_eui_name_dict.items())[:10]

[('E0000048', ['AA']),
 ('E0006859', ['achievement age']),
 ('E0000204', ['Alcoholics Anonymous']),
 ('E0356324', ['alcohol abuse', 'alcohol-abuse']),
 ('E0009858', ['aortic aneurysm']),
 ('E0009859', ['aortic arch']),
 ('E0010668', ['ascending aorta']),
 ('E0356325', ['attendance allowance']),
 ('E0008570', ['aminoacid', 'amino acid', 'amino-acid']),
 ('E0010231', ['arachidonic acid'])]

In [23]:
# Show some Abbr-EUI mappings
list(umls_abbr_eui_dict.items())[:10]

[('AA', ['E0000048', 'E0006420', 'E0356326']),
 ('AAA', ['E0000049', 'E0000546']),
 ('AAMD', ['E0000050']),
 ('A.A.M.D.', ['E0000050']),
 ('AAT', ['E0000051', 'E0722680']),
 ('AB', ['E0000052', 'E0565697', 'E0763870']),
 ('ABC', ['E0000053']),
 ('ACA', ['E0000055']),
 ('ACE', ['E0000056', 'E0695741']),
 ('ACEP', ['E0000057'])]

In [24]:
# Show some Abbr EUI-Expansion EUI mappings
list(umls_abbr_expansion_dict.items())[:5]

[('E0000048',
  ['E0006859',
   'E0000204',
   'E0356324',
   'E0009858',
   'E0009859',
   'E0010668',
   'E0356325',
   'E0008570',
   'E0010231',
   'E0010693',
   'E0009968',
   'E0007418',
   'E0430108',
   'E0008236',
   'E0420574',
   'E0071058',
   'E0071063',
   'E0448041',
   'E0007136',
   'E0006445',
   'E0598116',
   'E0598115',
   'E0598114',
   'E0598113',
   'E0598112',
   'E0598111',
   'E0598110',
   'E0598109',
   'E0598108',
   'E0759888',
   'E0006802',
   'E0066286',
   'E0006800',
   'E0598117']),
 ('E0000049',
  ['E0429482',
   'E0429483',
   'E0429484',
   'E0429485',
   'E0429486',
   'E0356310',
   'E0356309',
   'E0006446',
   'E0598118',
   'E0588272',
   'E0561250',
   'E0598119',
   'E0598120',
   'E0598123',
   'E0598121',
   'E0598122']),
 ('E0000050', ['E0000277']),
 ('E0000051',
  ['E0500264',
   'E0500260',
   'E0007918',
   'E0500262',
   'E0500261',
   'E0010744',
   'E0515593',
   'E0515594',
   'E0515595',
   'E0515596']),
 ('E0000052',
  ['E0010

In [25]:
# Normalized abbr (Group)-Orig abbrs mapping
umls_abbr_denorm_dict = defaultdict(list)
for abbr in umls_abbr_eui_dict:
    abbr_norm = normalize_abbr_variation(abbr)
    umls_abbr_denorm_dict[abbr_norm].append(abbr)
print(f'{len(umls_abbr_denorm_dict)} normalized abbrs in LRABR')

44517 normalized abbrs in LRABR


In [26]:
list(umls_abbr_denorm_dict.items())[:10]

[('aa', ['AA', 'aa']),
 ('aaa', ['AAA', 'A.A.A.']),
 ('aamd', ['AAMD', 'A.A.M.D.']),
 ('aat', ['AAT']),
 ('ab', ['AB', 'ab', 'A-B']),
 ('abc', ['ABC']),
 ('aca', ['ACA', 'A.C.A.']),
 ('ace', ['ACE', 'ace', 'Ace', 'ACe']),
 ('acep', ['ACEP']),
 ('acs', ['ACS', 'A.C.S.'])]

## Dataset Generation

Each row of the dataset consists of example index, abbreviation, left context, right context, and label CUIs (positive & negative). Here, the labels are written differently in the train and the test set. Specifically, the train dataset has the positive CUI followed by the negative CUIs (the negative CUIs can be empty). On the other hand, instead of the negative CUIs, the test dataset has the candidate CUIs from where an evaluated model should choose its output (the candidate CUIs may not include the positive CUI and also may be empty).

A train example looks like:
- index: `1996`
- abbreviation: `EOMs`
- Left context: `"... The patient was therefore taken back to the operating room for a clipping of the second MCA aneurysm without intraoperative complication. The patient was monitored in the Surgical Intensive Care Unit. She was alert, awake, oriented, complaining of severe headache, moving all extremities."`
- Right context: `"full, negative drift, a smile was symmetric, IP is 05-18. The patient was then discharged to the floor on 2015-03-20. She has been out of bed and ambulating with Physical Therapy, tolerating a regular diet, and voiding spontaneously. ..."`
- Positive (correct) CUI: `C0149566`
- **Negative CUIs**: `C2228439`, `C2228347`

A test example looks like:
- index: `2`
- abbreviation: `r/o`
- Left context: `"166217 | 19154 | 7888 | RADIOLOGY_REPORT | 2012-09-11 17:36:00.0 | C12 CHEST (PORTABLE AP) | | Clip # 348-5236 Actual report | DATE: 2012-09-11 5:36 PM CHEST (PORTABLE AP) 2935 Reason: new hypoxia, fevers, please"`
- Right context: `"pneumonia Admitting Diagnosis: SKULL FRACTURE;HEAD INJURY;SAH UNDERLYING MEDICAL CONDITION: 65 year old man with REASON FOR THIS EXAMINATION: new hypoxia, fevers, please r/o pneumonia FINAL REPORT CLINICAL HISTORY: A 65-year-old male with hypoxia and fevers. ..."`
- Positive (correct) CUI: `C0332196`
- **Candidate CUIs**: `C0332196`, `C2205978`


### 1. Training Set

For each annotation of the train set, we assign negative labels (CUI) as all other labels of the examples with same abbreviation group.

In [27]:
# Training Group-CUI dict
train_group_cuis_dict = {}
for d in train_examples:
    abbr, _, _, cui = d
    abbr_group = normalize_abbr_variation(abbr)
    if not abbr_group in train_group_cuis_dict:
        train_group_cuis_dict[abbr_group] = [cui]
    elif cui not in train_group_cuis_dict[abbr_group]:
        train_group_cuis_dict[abbr_group].append(cui)
        
print(f'{len(train_group_cuis_dict)} groups, {sum([len(v) for k, v in train_group_cuis_dict.items()])} senses')
train_group_cuis_dict

670 groups, 860 senses


{'f': ['C0015780', 'C0015671', 'C0553741'],
 'heent': ['C3494720'],
 's1': ['C0232223', 'C1261045'],
 's2': ['C0232230', 'C0018820'],
 'mca': ['C0149566', 'C1267298'],
 'acomm': ['C0149562'],
 'eoms': ['C2228347', 'C0149566', 'C2228439'],
 'ip': ['CUI-less'],
 'sob': ['C0013404'],
 'mibi': ['C0430473', 'C2205978'],
 'osh': ['CUI-less'],
 'nl': ['C0205307'],
 'lvef': ['C0428772'],
 'cath': ['C0018795', 'C0007430', 'C0879005'],
 'lmca': ['C0226031'],
 'lad': ['C0226032', 'C0497156'],
 'd1': ['C2067351'],
 'rca': ['C1261316'],
 'ho': ['C0332119', 'C1510665'],
 'lgib': ['C0024050'],
 'asa': ['C0004057'],
 'qod': ['C0558287'],
 'mi': ['C0027051'],
 'exam': ['C0582103', 'CUI-less'],
 'pe': ['C0031809', 'C0034065'],
 't': ['C0005903', 'C0578034', 'C0039476', 'C0581269'],
 'bp': ['C1271104', 'C0005823'],
 'r': ['C0205090', 'C1291003', 'C0232267', 'C0034642', 'C0035508', 'CUI-less'],
 'p': ['C0577836', 'C0577802', 'C0030987', 'C0030247'],
 'sat': ['C0523807', 'C3163851'],
 'ra': ['C2709070', 'C

In [28]:
group_snum_counter = Counter([len(v) for k, v in train_group_cuis_dict.items()])
print(f'Number of senses in training set: {group_snum_counter}')

Number of senses in training set: Counter({1: 537, 2: 97, 3: 24, 4: 7, 5: 3, 6: 1, 8: 1})


In [29]:
# Training data output
num_negs = 7
cnt = 0
example_snum_counter = Counter()
print(f'Write training data to {train_output_fpath}')
with open(train_output_fpath, 'w', encoding='utf-8') as f_out:
    writer = csv.writer(f_out, delimiter='\t', quotechar=None)
    writer.writerow(['index', 'group', 'left', 'right', 'label'] + [f'neg_{i}' for i in range(num_negs)])
    for d in train_examples:
        abbr, left, right, cui = d
        abbr_group = normalize_abbr_variation(abbr)
        cuis = train_group_cuis_dict[abbr_group]
#         if len(cuis) == 1:  # Skip if there's only one sense for this abbr(group)
#             continue
        cui_label = cui
        cui_negs = [c for c in cuis if c != cui]
        while len(cui_negs) < num_negs:
            cui_negs.append(' ')
        writer.writerow([cnt, abbr, left, right, cui_label] + cui_negs)
        cnt += 1
        example_snum_counter[len(cuis)] += 1

Write training data to ../data/sc13t2/train.tsv


In [30]:
example_snum_counter

Counter({3: 302, 1: 1920, 2: 1276, 4: 102, 6: 57, 8: 104, 5: 44})

In [31]:
# Training data output (for evaluation)
num_train = 4
cnt = 0
print(f'Write training data (in the format of the test data) to {train_output_fpath2}')
with open(train_output_fpath2, 'w', encoding='utf-8') as f_out:
    writer = csv.writer(f_out, delimiter='\t', quotechar=None)
    writer.writerow(['index', 'group', 'left', 'right', 'label'] + [f'train_{i}' for i in range(num_train)])
    for d in train_examples:
        abbr, left, right, cui = d
        abbr_group = normalize_abbr_variation(abbr)
        train_cuis = train_group_cuis_dict[abbr_group]
        writer.writerow([cnt, abbr, left, right, cui] + train_cuis + [] * (num_train - len(train_cuis)))
        cnt += 1

Write training data (in the format of the test data) to ../data/sc13t2/train_test.tsv


### 2. Test Set

First, we retrive the labels of unseen normalized abbrs (or unseen abbr groups) using LRABR and MetaMap. And then we choose the candidate CUIs for each test example as follows:  
- If the abbr group is seen (in the train set) -> Use the CUIs from train set
- If the abbr group is unseen & found in LRABR -> Use CUIs from LRABR
- If the abbr group is unseen & not found in LRABR -> No candidate CUIs (always incorrect on this example)

In [32]:
# Test Group-CUI dict
test_group_cuis_dict = {}
for d in test_examples:
    abbr, _, _, cui = d
    abbr_group = normalize_abbr_variation(abbr)
    if not abbr_group in test_group_cuis_dict:
        test_group_cuis_dict[abbr_group] = [cui]
    elif cui not in test_group_cuis_dict[abbr_group]:
        test_group_cuis_dict[abbr_group].append(cui)
        
print(f'{len(test_group_cuis_dict)} groups, {sum([len(v) for k, v in test_group_cuis_dict.items()])} senses')
test_group_cuis_dict

704 groups, 884 senses


{'echo': ['C0013516'],
 'bsa': ['C0005902'],
 'bp': ['C1271104', 'C0005823'],
 'hr': ['C0577802', 'C0018810'],
 'tte': ['C0430462'],
 'lvef': ['C0428772'],
 'lv': ['C0225897'],
 'mr': ['C0026266', 'C0024485'],
 'f': ['C0015780', 'CUI-less', 'C0015967'],
 'gi': ['C0521362', 'C0017187', 'C2609454'],
 'ct': ['C0040405', 'C0008034', 'C1274037', 'CUI-less'],
 'sp': ['C0231290'],
 'etoh': ['C0001962', 'CUI-less'],
 'exam': ['C0582103'],
 'vs': ['C0518766'],
 'gen': ['C0436117'],
 'nad': ['C2051415'],
 'cta': ['CUI-less', 'C1536105'],
 'bilat': ['C0238767'],
 'cv': ['C0436125', 'C0007222', 'C0007226'],
 'rrr': ['C0232185', 'C0232188', 'C0513693'],
 'm': ['C0018808', 'CUI-less', 'C0024554', 'C0026591'],
 'r': ['C0232267', 'C0205090', 'C0034642', 'C0035508', 'C0231832'],
 'g': ['C0232200', 'C1704242'],
 'abd': ['C0562238', 'C0000726'],
 'musc': ['C0475091'],
 'neuro': ['C0027853',
  'C0221571',
  'C0205494',
  'C0587475',
  'C0027855',
  'C0587591'],
 'aox3': ['C1961840'],
 'us': ['C0041618', '

In [33]:
test_unseen_groups = []
for abbr_group, cuis in test_group_cuis_dict.items():
    if abbr_group not in train_group_cuis_dict:
        test_unseen_groups.append(abbr_group)
print(f'{len(test_unseen_groups)} abbr groups are not seen in the training set')        

group_snum_counter = Counter([len(v) for k, v in test_group_cuis_dict.items()])
print(f'Number of senses in test set (all): {group_snum_counter}')

group_snum_counter = Counter([len(v) for k, v in test_group_cuis_dict.items() if k in train_group_cuis_dict])
print(f'Number of senses in test set (abbrs in train): {group_snum_counter}')

326 abbr groups are not seen in the training set
Number of senses in test set (all): Counter({1: 568, 2: 107, 3: 21, 4: 3, 5: 3, 6: 2})
Number of senses in test set (abbrs in train): Counter({1: 266, 2: 85, 3: 19, 4: 3, 5: 3, 6: 2})


In [34]:
# For all test abbreviation -> Find possible labels
test_unseen_umls_labels = []
for abbr_group in test_group_cuis_dict:
    for abbr in umls_abbr_denorm_dict[abbr_group]:
        for abbr_eui in umls_abbr_eui_dict[abbr]:
            for label_eui in umls_abbr_expansion_dict[abbr_eui]:
                test_unseen_umls_labels.extend(umls_eui_name_dict[label_eui])
test_unseen_umls_labels = list(set(test_unseen_umls_labels))  # Remove duplicates
test_unseen_umls_labels = [l for l in test_unseen_umls_labels if all([ord(c) < 128 for c in l])]  # Only alpha-numeric

print(f'{len(test_unseen_umls_labels)} labels found from LRABR and to be mapped by MetaMap')
print(test_unseen_umls_labels[:20])

9717 labels found from LRABR and to be mapped by MetaMap
['ptaquiloside', 'somatic cell hybrid', 'epichlorohydrin', 'watt', 'cyclic adenosine monophosphate-response element', 'supra-chiasmatic', 'metabolic index', 'retrograde pyelogram', 'Rhizomucor miehei lipase', 'osteopetrosis', 'wall thickening', 'endoperoxide', 'thoracolumbosacral orthosis', 'heat-stable', 'ectopic focus', 'nervous system', 'organising pneumonia', 'ammoniumsulphate', 'low-responder', 'catechol oestrogen']


In [35]:
# Map possible labels into CUIs using Metamap
mm_infpath = os.path.join(sc13t2_dir, 'umls_expansions.txt')
mm_outfpath = os.path.join(sc13t2_dir, 'mm_output.txt')

In [36]:
with open(mm_infpath, 'w', encoding='utf-8') as fd:
    fd.write('\n\n'.join(test_unseen_umls_labels)+'\n')  # 2 line spacing for metamap process

- Please run Metamap (2018AB version) with following command on `data/sc13t2`:  
```!metamap -z -I umls_expansions.txt mm_output.txt```

In [37]:
# Parsing Metamap output: CUI-label mapping
with open(mm_outfpath, 'r') as fd:
    lines = [l.strip() for l in fd.readlines()]

mm_label_cui_dict = {}
i = 0
while i < len(lines):
    l = lines[i]
    if l.startswith("Phrase: "):
        label = l[8:]  # After "Phrase: "
        label_lower = label.lower()
        while i < len(lines)-1:
            i += 1
            l = lines[i]
            if not l: break
            elif l.startswith("Meta Mapping"):
                i += 1
                l = lines[i]
                name = l[l.index(":")+1:l.rindex("[")].strip()
                name_lower = name.lower().replace(" ", "")
                if all([(w in name_lower) for w in label_lower.split(" ")]):
                    cui = l[l.index(":")-8:l.index(":")]
                    mm_label_cui_dict[label] = cui
#                     print(f'{label}: {name} ({cui})')
                    break
    i += 1

mm_cui_label_dict = {cui:label for label, cui in mm_label_cui_dict.items()}
print(f'{len(mm_cui_label_dict)} labels mapped to CUI')

4083 labels mapped to CUI


In [38]:
# Show some mappings
print('\n'.join([f'({cui}) {label}' for cui, label in list(mm_cui_label_dict.items())[:10]]))

(C0072576) ptaquiloside
(C0020200) somatic cell hybrid
(C0014486) epichlorohydrin
(C0439261) watt
(C0203110) retrograde pyelogram
(C0029454) osteopetrosis
(C0027763) nervous system
(C0264383) organising pneumonia
(C0002620) ammonium sulphate
(C0083727) neurogranin


In [39]:
# Test unseen abbr group -> LRABR CUIs
test_unseen_group_cuis_dict = defaultdict(set)
for abbr_group in test_unseen_groups:
    for abbr in umls_abbr_denorm_dict[abbr_group]:
        for abbr_eui in umls_abbr_eui_dict[abbr]:
            for label_eui in umls_abbr_expansion_dict[abbr_eui]:
                labels = umls_eui_name_dict[label_eui]
                cuis = [mm_label_cui_dict[label] for label in labels if label in mm_label_cui_dict]
                if cuis:
                    test_unseen_group_cuis_dict[abbr_group].update(cuis)
test_unseen_group_cuis_dict = {k:list(v) for k, v in test_unseen_group_cuis_dict.items()}
print(f'{len(test_unseen_group_cuis_dict)} unseen abbr groups mapped to {sum(map(len,test_unseen_group_cuis_dict.values()))} CUIs')
max_label_num = max(map(len, test_unseen_group_cuis_dict.values()))
print(f'Up to {max_label_num} candidates')

214 unseen abbr groups mapped to 1323 CUIs
Up to 50 candidates


In [40]:
list(test_unseen_group_cuis_dict.items())[:5]

[('gb', ['C0005304', 'C4321287', 'C1960832', 'C0016976']),
 ('uop', ['C0232856']),
 ('bnp', ['C0054015']),
 ('cr',
  ['C1547104',
   'C0009069',
   'C0007443',
   'C0234176',
   'C0868939',
   'C0391908',
   'C0555952',
   'C0677874',
   'C3842377',
   'C0039985',
   'C0419011',
   'C0010294',
   'C2981152',
   'C0232187',
   'C2936352',
   'C1135809',
   'C0034793',
   'C0008972',
   'C2930544',
   'C0007702',
   'C0009647',
   'C0436307',
   'C0008574',
   'C0009742',
   'C0034936']),
 ('bx', ['C0185298', 'C0005558'])]

In [41]:
# Test data output (All test abbr + CUIs augmented from LRABR)
num_train = max(8, max_label_num)
cnt = 0
cnt_unseen = 0
example_snum_counter = Counter()
with open(test_output_fpath, 'w', encoding='utf-8') as f_out:
    writer = csv.writer(f_out, delimiter='\t', quotechar=None)
    writer.writerow(['index', 'group', 'left', 'right', 'label'] + [f'train_{i}' for i in range(num_train)])
    for d in test_examples:
        abbr, left, right, cui = d
        abbr_group = normalize_abbr_variation(abbr)
        if (not abbr_group in train_group_cuis_dict):  #  We cannot get this if the abbr is not in training set
            if abbr_group in test_unseen_group_cuis_dict:
                train_cuis = test_unseen_group_cuis_dict[abbr_group]
#                 print((abbr_group, train_cuis))
            else:
                cnt_unseen += 1
                train_cuis = []
        else:
            train_cuis = train_group_cuis_dict[abbr_group]
        test_cuis = test_group_cuis_dict[abbr_group]
#         cuis = test_group_cuis_dict[abbr_group]
        example_snum_counter[len(test_cuis)] += 1
        writer.writerow([cnt, abbr, left, right, cui] + train_cuis + [''] * (num_train - len(train_cuis)))
        cnt += 1

In [42]:
print(f'{cnt_unseen} test examples are not in the training set')

166 test examples are not in the training set


### 3. CUI-label

In [43]:
print(f'Write test(training+LRABR) CUI-label list to {cui_label_fpath}')
with open(cui_label_fpath, 'w', encoding='utf-8') as f_out:
    writer = csv.writer(f_out, delimiter='\t', quotechar=None)
    writer.writerow(['cui', 'label'])
    for cui, labels in train_cui_labels_dict.items():
        writer.writerow([cui, labels[0]])
    for cui, label in mm_cui_label_dict.items():
        if cui not in train_cui_labels_dict:
            writer.writerow([cui, label])

Write test(training+LRABR) CUI-label list to ../data/sc13t2/cui_label.tsv


## Data statistics

- Abbreviation

In [44]:
abbr_var_set_train = set(list(zip(*train_examples))[0])
print(f'{len(abbr_var_set_train)} training abbr variations')

abbr_var_set_test = set(list(zip(*test_examples))[0])
print(f'{len(abbr_var_set_test)} test abbr variations')

print(f'{len(abbr_var_set_train | abbr_var_set_test)} total abbr variations')

843 training abbr variations
871 test abbr variations
1285 total abbr variations


- Abbreviation groups

In [45]:
abbr_group_set_train = set([normalize_abbr_variation(abbr) for abbr in list(zip(*train_examples))[0]])
print(f'{len(abbr_group_set_train)} training abbr groups')

abbr_group_set_test = set([normalize_abbr_variation(abbr) for abbr in list(zip(*test_examples))[0]])
print(f'{len(abbr_group_set_test)} test abbr groups')

print(f'{len(abbr_group_set_train | abbr_group_set_test)} total abbr groups')

670 training abbr groups
704 test abbr groups
996 total abbr groups


- Labels (CUIs)

In [46]:
cui_group_set_train = set(list(zip(*train_examples))[3])
print(f'{len(cui_group_set_train)} training cuis')

cui_group_set_test = set(list(zip(*test_examples))[3])
print(f'{len(cui_group_set_test)} test cuis')
    
print(f'{len(cui_group_set_train | cui_group_set_test)} total cuis')

697 training cuis
707 test cuis
1058 total cuis


## Random & Majority accuracies

In [47]:
train_abbr_cuis = defaultdict(list)

with open(train_output_fpath, 'r') as fd:
    reader = csv.reader(fd, delimiter='\t', quotechar=None)
    for i, line in enumerate(reader):
        if i == 0:
#             print(line)
            abbr_idx = line.index('group')
            label_idx = line.index('label')
            continue
        abbr = line[abbr_idx]
        label = line[label_idx]
        train_abbr_cuis[normalize_abbr_variation(abbr)].append(label)
        
train_majority_cuis = {}
for abbr, cuis in train_abbr_cuis.items():
    counter = Counter(cuis)
    train_majority_cuis[abbr] = counter.most_common()[0][0]

In [48]:
test_total_cnt = 0
test_majority_cnt = 0
test_random_cnt = 0

with open(test_output_fpath, 'r') as fd:
    reader = csv.reader(fd, delimiter='\t', quotechar=None)
    for i, line in enumerate(reader):
        if i == 0:
#             print(line)
            abbr_idx = line.index('group')
            label_idx = line.index('label')
            continue
        abbr = line[abbr_idx]
        label = line[label_idx]
        cands = []
        for j in range(label_idx+1, len(line)):
            if line[j]:
                cands.append(line[j])
        
        test_total_cnt += 1
        if label in cands:
            test_random_cnt += 1.0/len(cands)
        if normalize_abbr_variation(abbr) in train_majority_cuis and \
                    train_majority_cuis[normalize_abbr_variation(abbr)] == label:
            test_majority_cnt += 1

In [49]:
print(test_total_cnt)
print(test_majority_cnt)
print(test_random_cnt)

print(f'Random  : {test_random_cnt/test_total_cnt}')
print(f'Majority: {test_majority_cnt/test_total_cnt}')

3774
2614
2100.0549575423483
Random  : 0.5564533538797956
Majority: 0.6926338102808691


---- For debug ----

In [50]:
old_test_examples = list(csv.reader(open('/home/juyongk/workspace/temp/NCBI-BERT_aws_backup/abbr/sc13t2_lrabr/test.tsv'), delimiter='\t', quotechar=None))[1:]
len(old_test_examples)

3774

In [51]:
new_test_examples = list(csv.reader(open(test_output_fpath), delimiter='\t', quotechar=None))[1:]
len(new_test_examples)

3774

In [57]:
def sort_examples(examples):
    temp_list = [tuple(e[1:2] + sorted(e[4:]) + [i]) for i, e in enumerate(examples)]
    temp_list.sort()
    return [examples[t[-1]] for t in temp_list]

In [58]:
old_test_examples = sort_examples(old_test_examples)
new_test_examples = sort_examples(new_test_examples)

for i, (e1, e2) in enumerate(zip(old_test_examples, new_test_examples)):
    if i == 3140: continue
    if e1[1:2] + sorted(e1[4:]) != e2[1:2] + sorted(e2[4:-1]):
        break

In [59]:
i

156

In [60]:
list(zip(e1, e2))

[('3572', '206'),
 ('AV-block', 'AV-block'),
 ('seen on single view chest examination. Brief Hospital Course: Mr.  was admitted the morning of 4/27 and proceeded directly to the operating room. He underwent a mitral valve repair with resection of the posterior leaflet with a 28 mm DIETRICK band with Dr. Prof FRANKIN. Please see OP note for full details. He was successfully weened and extubated on his operative evening and was placed on a steroid taper with the help of endocrinology given his addisons disease. On postoperative day two he was transferred to the inpatient telemetry floor for ongoing management and rehabilitation. On postoperative day four he had a burst of atrial fibrillation -- converted spontaneously and was noted to have a first degree',
  'seen on single view chest examination. Brief Hospital Course: Mr.  was admitted the morning of 4/27 and proceeded directly to the operating room. He underwent a mitral valve repair with resection of the posterior leaflet with a 28 m

In [61]:
normalize_abbr_variation('AV-block')

'avblock'

In [62]:
umls_abbr_denorm_dict['avblock']

['AV block']

In [63]:
mm_cui_label_dict['C0004245']

'atrioventricular block'

??!?!