# Script for loading N2C2 Terminology file
## All records must be:
- From SNOMED-CT or RxNorm
- Have a semantic type used by at least one CUI in the N2C2 training data
- Be English language
- Be not suppressible (MRCONSO.SUPPRESS!='E')
- Have an unambiguous name

In [1]:
import collections
import os
import pandas as pd
import re

In [2]:
# umls = pd.read_csv('umls/mrconso.txt',sep='|')
# mrsty = pd.read_csv('umls/mrsty.txt',sep='|')
terminology = pd.read_table('../ncbi-data/TERMINOLOGY.txt',sep='\|\|',header=None, names=['cui','name'])
terminology.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,cui,name
0,C565588,Epidermolysis Bullosa With Diaphragmatic Hernia
1,C567755|613097,"Tooth Agenesis, Selective, 6|STHAG6"
2,C565584,"Epithelial Squamous Dysplasia, Keratinizing De..."
3,C565585,"Epiphyseal Dysplasia of Femoral Head, Myopia, ..."
4,C565587,"Epilepsy, Photogenic, with Spastic Diplegia an..."


In [3]:
# Load MRSTY from source
file = 'umls/MRSTY.RRF'
cols = ['CUI','TUI','STN','STY','ATUI','CVF','BLANK']
mrsty = pd.read_table(file,sep='|',header=None,names=cols)
mrsty = mrsty.drop(columns=['BLANK'])
mrsty.to_csv('umls/mrsty.txt',sep='|',index=False)
mrsty.head()

Unnamed: 0,CUI,TUI,STN,STY,ATUI,CVF
0,C0000005,T116,A1.4.1.2.1.7,"Amino Acid, Peptide, or Protein",AT17648347,256.0
1,C0000005,T121,A1.4.1.1.1,Pharmacologic Substance,AT17575038,256.0
2,C0000005,T130,A1.4.1.1.4,"Indicator, Reagent, or Diagnostic Aid",AT17634323,256.0
3,C0000039,T109,A1.4.1.2.1,Organic Chemical,AT45562015,256.0
4,C0000039,T121,A1.4.1.1.1,Pharmacologic Substance,AT17567371,256.0


In [4]:
# Load MRCONSO from source
file = 'umls/MRCONSO.RRF'
cols = ['CUI','LAT','TS','LUI','STT','SUI','ISPREF','AUI','SAUI','SCUI','SDUI','SAB','TTY','CODE','STR','SRL','SUPPRESS','CVF','BLANK']
mrconso = pd.read_table(file,sep='|',header=None,names=cols)
mrconso = mrconso.drop(columns=['BLANK'])
umls = mrconso[mrconso.LAT=='ENG']
umls = umls[(umls.SAB=='RXNORM') | (umls.SAB=='SNOMEDCT_US')]
umls = umls[umls.SUPPRESS!='E'] # Seems like there are obsolete CUIs in the annotations...
umls.to_csv('umls/mrconso.txt',sep='|',index=False)
umls.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,CUI,LAT,TS,LUI,STT,SUI,ISPREF,AUI,SAUI,SCUI,SDUI,SAB,TTY,CODE,STR,SRL,SUPPRESS,CVF
9,C0000039,ENG,P,L0000039,PF,S17175117,N,A28315139,9194921.0,1926948,,RXNORM,IN,1926948,"1,2-dipalmitoylphosphatidylcholine",0,N,256.0
19,C0000039,ENG,S,L0012507,PF,S0033298,N,A22817493,166113012.0,102735002,,SNOMEDCT_US,OAP,102735002,Dipalmitoylphosphatidylcholine,9,O,256.0
25,C0000039,ENG,S,L3000054,PF,S3260062,Y,A22880204,544223010.0,102735002,,SNOMEDCT_US,OAF,102735002,Dipalmitoylphosphatidylcholine (substance),9,O,
61,C0000052,ENG,P,L0000052,VC,S0575717,Y,A27769867,97197014.0,58488005,,SNOMEDCT_US,PT,58488005,"1,4-alpha-Glucan branching enzyme",9,N,
69,C0000052,ENG,S,L0006129,VC,S0604824,Y,A27781005,97198016.0,58488005,,SNOMEDCT_US,SY,58488005,Branching enzyme,9,N,256.0


In [5]:
#Load a dataset of all training annotations
train_path = '../n2c2-data/all'
files = os.listdir(train_path)
train = pd.DataFrame([])
for file in files:
    if '.concept' in file:
        df = pd.read_table(f'{train_path}/{file}',sep='\|\|',header=None, names=['file_id','ix','type','name','CUI'])
        df['file'] = [file for x in range(len(df))]
        train = pd.concat([train, df])
train = train[['CUI','name','file']]
train = pd.merge(train, mrsty, on='CUI')[['CUI','name','file','TUI','STY']]
train.to_csv('train.txt',sep='|',index=False)
train_cuis = list(set(train.CUI))
train.head()

  import sys


Unnamed: 0,CUI,name,file,TUI,STY
0,C0564823,Right LE pain,0034.concept,T184,Sign or Symptom
1,C0564823,R leg pain,0034.concept,T184,Sign or Symptom
2,C0564823,Right LE pain,0034.concept,T184,Sign or Symptom
3,C0030193,Pain,0034.concept,T184,Sign or Symptom
4,C0030193,Pain,0034.concept,T184,Sign or Symptom


In [6]:
# Code to derive N2C2_TUI
# Get distinct TUI list in train order by # of CUIs in UMLS
tuis = pd.merge(mrsty, pd.DataFrame(set(train.TUI), columns=['TUI']), on='TUI') \
    .groupby('TUI').count()['CUI'] \
    .reset_index(name='count') \
    .sort_values(['count'], ascending=False) \
    .TUI.tolist()

expected = len(set(train.CUI))
removed = []

for tui in tuis:
    # Try removing records with given TUI
    subset = train[(train.TUI.isin(tuis)) & (train.TUI != tui)]
    
    # If removing the TUI didn't drop unique CUIs, remove it permanently
    if expected == len(set(subset.CUI)):
        tuis.remove(tui)
        
n2c2_tui = train[(train.TUI.isin(tuis))] \
    .groupby('TUI').count()['CUI'] \
    .reset_index(name='count') \
    .sort_values(['count'], ascending=False)

with open('../scripts/n2c2_tui.txt','w+') as f:
    for tui in list(n2c2_tui.TUI):
        f.write(f'{tui}\n')

In [30]:
# Generate TERMINOLOGY file

# Filter by TUIs in N2C2 train
umls_term = umls[['CUI','STR']]
umls_term = pd.merge(umls_term, mrsty[['CUI','TUI']], on='CUI')
umls_term = pd.merge(umls_term, n2c2_tui[['TUI']], on='TUI')
umls_term = umls_term[['CUI','STR']]

# Clean up and filter STR field
umls_term = umls_term[umls_term.STR.notnull()]
umls_term.STR = [x.lower().strip() for x in umls_term.STR]
umls_term = umls_term.drop_duplicates()

# Add "angina disorder" for names like "angina (disorder)"
disorder_paren = umls_term[umls_term.STR.str.contains(r'\(disorder\)')].reset_index(drop=True)
disorder_paren.STR = [x.replace('(disorder)',' disorder ') for x in disorder_paren.STR]
umls_term = pd.concat([umls_term,disorder_paren],ignore_index=True)

# THIS IS CREATING AMBIGUITY IT SHOULDN'T. Just remove records with parens?
# # Remove parentheticals like "Electroretinograph (physical object)"
# umls_term.STR = [re.sub(r'\(.*\)',' ',x) for x in umls_term.STR]

# Remove extra whitespace
umls_term.STR = [re.sub(r'\s+',' ',x) for x in umls_term.STR]
umls_term.STR = [x.lower().strip() for x in umls_term.STR]
umls_term = umls_term.drop_duplicates()
umls_term = umls_term[umls_term.STR!='']

# Remove CUIs with ambiguous names
ambiguous = [item for item, count in collections.Counter(list(umls_term.STR)).items() if count > 1]
umls_term = umls_term[~umls_term.STR.isin(ambiguous)]

print('Unique CUIs:', len(set(umls_term.CUI)))
print('Unique names:', len(set(umls_term.STR)))

Unique CUIs: 548746
Unique names: 1586616


In [31]:
# Write n2c2_terminology.txt
n2c2_terminology = umls_term.groupby('CUI')['STR'].apply(list)
n2c2_terminology = pd.DataFrame(n2c2_terminology).reset_index()
n2c2_terminology['NAMES'] = ['|'.join(sorted(x)) for x in n2c2_terminology.STR]

with open('../resources/n2c2_terminology.txt', 'w+') as f:
    for x in n2c2_terminology.iterrows():
        try:
            f.write(f'{x[1].CUI}||{x[1].NAMES}\n')
        except:
            print(f'{x[1].CUI}||{x[1].NAMES}')

C0428654||carbon dioxide concentration - respired|respired carbon dioxide (co₂) concentration|respired carbon dioxide concentration|respired carbon dioxide concentration (observable entity)
C2584948||lipoprotein associated phospholipase a₂ measurement|lipoprotein associated phospholipase a₂ measurement (procedure)
C2585014||partial pressure arterial oxygen/fraction inspired oxygen ratio|ratio of arterial oxygen tension to inspired oxygen fraction|ratio of arterial oxygen tension to inspired oxygen fraction (pao₂/fio₂)|ratio of arterial oxygen tension to inspired oxygen fraction (procedure)
C2585389||thromboelastography (teg) alpha angle|thromboelastography (teg) α angle|thromboelastography alpha angle|thromboelastography alpha angle (observable entity)
C4316637||galad score|gender, age, afp-l3, α fetoprotein and des-carboxy-prothrombin score|gender, age, afp-l3, α fetoprotein and des-carboxy-prothrombin score (assessment scale)


In [33]:
umls_term[umls_term.CUI=='C0554740']

Unnamed: 0,CUI,STR
967275,C0554740,pelvis x-ray
967276,C0554740,pelvic x-ray
967277,C0554740,pelvis x-ray (procedure)
