# Script for loading N2C2 Terminology file
## All records must be:
- From SNOMED-CT or RxNorm
- Have a semantic type used by at least one CUI in the N2C2 training data
- Be English language
- Be not suppressible (MRCONSO.SUPPRESS!='E')
- Have an unambiguous name

In [1]:
import collections
import os
import pandas as pd
import re

## Load MRSTY.RRF 
UMLS semantic type mapping file

In [2]:
# Load MRSTY from source
file = 'umls/MRSTY.RRF'
cols = ['CUI','TUI','STN','STY','ATUI','CVF','BLANK']
mrsty = pd.read_table(file,sep='|',header=None,names=cols)
mrsty = mrsty.drop(columns=['BLANK'])
mrsty.to_csv('umls/mrsty.txt',sep='|',index=False)
# mrsty.head()

## Load MRCONSO.RRF 
UMLS metathesaurus file

In [3]:
# Load MRCONSO from source
file = 'umls/MRCONSO.RRF'
cols = ['CUI','LAT','TS','LUI','STT','SUI','ISPREF','AUI','SAUI','SCUI','SDUI','SAB','TTY','CODE','STR','SRL','SUPPRESS','CVF','BLANK']
mrconso = pd.read_table(file,sep='|',header=None,names=cols)
mrconso = mrconso.drop(columns=['BLANK'])
umls = mrconso[mrconso.LAT=='ENG']
umls = umls[(umls.SAB=='RXNORM') | (umls.SAB=='SNOMEDCT_US')]
umls = umls[umls.SUPPRESS!='E'] # Seems like there are obsolete CUIs in the annotations...
umls.to_csv('umls/mrconso.txt',sep='|',index=False)
# umls.head()

  interactivity=interactivity, compiler=compiler, result=result)


## Curate list of semantic types appearing in training data
We will use these to filter UMLS data

In [4]:
#Load a dataset of all training annotations
train_path = '../n2c2-data/all'
files = os.listdir(train_path)
train = pd.DataFrame([])
for file in files:
    if '.concept' in file:
        df = pd.read_table(f'{train_path}/{file}',sep='\|\|',header=None, names=['file_id','ix','type','name','CUI'])
        df['file'] = [file for x in range(len(df))]
        train = pd.concat([train, df])
train = train[['CUI','name','file']]
train = pd.merge(train, mrsty, on='CUI')[['CUI','name','file','TUI','STY']]
train.to_csv('train.txt',sep='|',index=False)
train_cuis = list(set(train.CUI))
# train.head()

  import sys


In [5]:
# Code to derive N2C2_TUI
# Get distinct TUI list in train order by # of CUIs in UMLS
tuis = pd.merge(mrsty, pd.DataFrame(set(train.TUI), columns=['TUI']), on='TUI') \
    .groupby('TUI').count()['CUI'] \
    .reset_index(name='count') \
    .sort_values(['count'], ascending=False) \
    .TUI.tolist()

expected = len(set(train.CUI))
removed = []

for tui in tuis:
    # Try removing records with given TUI
    subset = train[(train.TUI.isin(tuis)) & (train.TUI != tui)]
    
    # If removing the TUI didn't drop unique CUIs, remove it permanently
    if expected == len(set(subset.CUI)):
        tuis.remove(tui)
        
n2c2_tui = train[(train.TUI.isin(tuis))] \
    .groupby('TUI').count()['CUI'] \
    .reset_index(name='count') \
    .sort_values(['count'], ascending=False)

with open('../scripts/n2c2_tui.txt','w+') as f:
    for tui in list(n2c2_tui.TUI):
        f.write(f'{tui}\n')

## Generate abbreviations.txt
Filter UMLS data to pick out abbreviations. We'll remove abbreviation records and any record that matches an abbreviation from the terminology file we generate.

In [9]:
# Find records with names of the form 'BW - Birth weight' and exclude records with parens.
acronyms = umls[[bool(re.search(r'^[A-z0-9]{2,} - .+ .+[^)]$',s)) if pd.notnull(s) else False for s in umls.STR]]

# Acronym contains at least one letter.
acronyms = acronyms[[bool(re.search(r'^.*[A-z]+.* - .+ .+',s)) for s in acronyms.STR]]

# Filter out records starting with full words 'adolescent - needs help'.
acronyms = acronyms[[not bool(re.search(r'^[A-z0-9]+[a-z]{3,} - .+ .+',s)) for s in acronyms.STR]]
print(len(set(acronyms.STR)), ' unique acronyms')

# Write acronyms file
abbreviations = []
with open('../resources/abbreviations.txt', 'w+') as f:
    for s in sorted(set(list(acronyms.STR))):
        parts = s.split('-')
        abbreviation = parts[0].strip().lower()
        expansion = parts[1].strip().lower()
        f.write(f'{abbreviation}||{expansion}\n')
        
        abbreviations.append(abbreviation)
abbreviations = set(abbreviations)

8159  unique acronyms


## Generate n2c2_terminology.txt

In [47]:
# Drop acronym records and any record with a string matching an acronym
umls_term = umls.drop(acronyms.index, axis=0)
umls_term = umls_term[umls_term.STR.notnull()]
umls_term.STR = [x.lower().strip() for x in umls_term.STR]
umls_term = umls_term[~umls_term.STR.isin(abbreviations)]

# Filter by TUIs in N2C2 train
umls_term = umls_term[['CUI','STR']]
umls_term = pd.merge(umls_term, mrsty[['CUI','TUI']], on='CUI')
umls_term = pd.merge(umls_term, n2c2_tui[['TUI']], on='TUI')
umls_term = umls_term[['CUI','STR']]

# Add "angina disorder" for names like "angina (disorder)"
disorder_paren = umls_term[umls_term.STR.str.contains(r'\(disorder\)')].reset_index(drop=True)
disorder_paren.STR = [x.replace('(disorder)',' disorder ') for x in disorder_paren.STR]
umls_term = pd.concat([umls_term,disorder_paren],ignore_index=True)

# Add "sedation" for names like "[d]sedation"
letter_bracket = umls_term[umls_term.STR.str.contains(r'\[[a-z]\]')].reset_index(drop=True)
letter_bracket.STR = [re.sub(r'\[[a-z]\]','',x).strip() for x in letter_bracket.STR]
umls_term = pd.concat([umls_term,letter_bracket],ignore_index=True)

# THIS IS CREATING AMBIGUITY IT SHOULDN'T. Just remove records with parens?
# # Remove parentheticals like "Electroretinograph (physical object)"
# umls_term.STR = [re.sub(r'\(.*\)',' ',x) for x in umls_term.STR]

# Remove extra whitespace
umls_term.STR = [re.sub(r'\s+',' ',x) for x in umls_term.STR]
umls_term.STR = [x.lower().strip() for x in umls_term.STR]
umls_term = umls_term.drop_duplicates()
umls_term = umls_term[umls_term.STR!='']

# Remove CUIs with ambiguous names
ambiguous = [item for item, count in collections.Counter(list(umls_term.STR)).items() if count > 1]
umls_term = umls_term[~umls_term.STR.isin(ambiguous)]

print('Unique CUIs:', len(set(umls_term.CUI)))
print('Unique names:', len(set(umls_term.STR)))

Unique CUIs: 548694
Unique names: 1606676


In [48]:
# Write n2c2_terminology.txt
n2c2_terminology = umls_term.groupby('CUI')['STR'].apply(list)
n2c2_terminology = pd.DataFrame(n2c2_terminology).reset_index()
n2c2_terminology['NAMES'] = ['|'.join(sorted(x)) for x in n2c2_terminology.STR]

with open('../resources/n2c2_terminology.txt', 'w+') as f:
    for x in n2c2_terminology.iterrows():
        try:
            f.write(f'{x[1].CUI}||{x[1].NAMES}\n')
        except:
            print(f'{x[1].CUI}||{x[1].NAMES}')

C0428654||carbon dioxide concentration - respired|respired carbon dioxide (co₂) concentration|respired carbon dioxide concentration|respired carbon dioxide concentration (observable entity)
C2584948||lipoprotein associated phospholipase a₂ measurement|lipoprotein associated phospholipase a₂ measurement (procedure)
C2585014||partial pressure arterial oxygen/fraction inspired oxygen ratio|ratio of arterial oxygen tension to inspired oxygen fraction|ratio of arterial oxygen tension to inspired oxygen fraction (pao₂/fio₂)|ratio of arterial oxygen tension to inspired oxygen fraction (procedure)
C2585389||thromboelastography (teg) alpha angle|thromboelastography (teg) α angle|thromboelastography alpha angle|thromboelastography alpha angle (observable entity)
C4316637||galad score|gender, age, afp-l3, α fetoprotein and des-carboxy-prothrombin score|gender, age, afp-l3, α fetoprotein and des-carboxy-prothrombin score (assessment scale)
