# Script for loading N2C2 Terminology file
## All records must be:
- From SNOMED-CT or RxNorm
- Have a semantic type used by at least one CUI in the N2C2 training data
- Be English language
- Be not suppressible (MRCONSO.SUPPRESS!='E')

In [2]:
import collections
import os
import pandas as pd
import pyarrow.feather as feather
import re

raw_path = 'raw'
processed_path = 'processed'

## Load MRSTY.RRF 
UMLS semantic type mapping file

In [170]:
# Load MRSTY from source
file = f'{raw_path}/MRSTY.RRF'
cols = ['CUI','TUI','STN','STY','ATUI','CVF','BLANK']
mrsty = pd.read_table(f'{raw_path}/MRSTY.RRF',sep='|',header=None,names=cols)
mrsty = mrsty[['TUI','CUI','STY']]
mrsty.reset_index(drop=True).to_feather(f'{processed_path}/mrsty.feather')
mrsty.head(1)

Unnamed: 0,TUI,CUI,STY
0,T116,C0000005,"Amino Acid, Peptide, or Protein"


## Load MRCONSO.RRF 
UMLS metathesaurus file

In [3]:
# Load MRCONSO from source (Only needs to be done once)
cols = ['CUI','LAT','TS','LUI','STT','SUI','ISPREF','AUI','SAUI','SCUI','SDUI','SAB','TTY','CODE','STR','SRL','SUPPRESS','CVF','BLANK']
mrconso = pd.read_table(f'{raw_path}/MRCONSO.RRF',sep='|',header=None,names=cols)
mrconso = mrconso.drop(columns=['BLANK'])
umls = mrconso[mrconso.LAT=='ENG']
umls = umls[(umls.SAB=='RXNORM') | (umls.SAB=='SNOMEDCT_US')]
umls = umls[umls.SUPPRESS!='E'] # Seems like there are obsolete CUIs in the annotations...
umls = umls[['CUI','STR','ISPREF']]
umls.reset_index(drop=True).to_feather(f'{processed_path}/mrconso.feather')
umls.head(1)

  mrconso = pd.read_table(f'{raw_path}/MRCONSO.RRF',sep='|',header=None,names=cols)


Unnamed: 0,CUI,STR,ISPREF
9,C0000039,"1,2-dipalmitoylphosphatidylcholine",N


## Load MRREL.RRF 
One-time load UMLS CUI relationship mapping file

In [172]:
# Load raw MRREL from source, filter and save (Only needs to be done once)
names = ['CUI1','AUI1','STYPE1','REL','CUI2','AUI2','STYPE2','RELA','RUI','SRUI','SAB','SL','RG','DIR','SUPPRESS','CVF','BLANK']
usecols = ['CUI1','REL','CUI2','RELA','SAB','SL']
mrrel = pd.read_table(f'{raw_path}/MRREL.RRF',sep='|',header=None, names=names, usecols=usecols)
mrrel = mrrel[mrrel.SAB.isin(['SNOMEDCT_US','RXNORM'])]
mrrel = mrrel[mrrel.REL.isin(['PAR','CHD'])][['CUI1','CUI2','REL']]
mrrel = mrrel[mrrel.CUI1!=mrrel.CUI2]
mrrel.reset_index(drop=True).to_feather(f'{processed_path}/mrrel.feather')

mrrel_cnt = '{:,}'.format(len(mrrel))
print(f'{mrrel_cnt} raw relations')
mrrel.head(1)

  mrrel = pd.read_table(f'{raw_path}/MRREL.RRF',sep='|',header=None, names=names, usecols=usecols)


2,122,638 raw relations


Unnamed: 0,CUI1,CUI2,REL
80,C0000039,C0031610,PAR


## Curate list of semantic types appearing in training data
We will use these to filter UMLS data

In [173]:
#Load a dataset of all training annotations
train_path = '../datasets/n2c2/preprocessed/n2c2_traindev.concept'
train = pd.DataFrame([])
train = pd.read_table(train_path,sep='\|\|',header=None, names=['file_id','ix','type','name','CUI'])
train['file'] = '0'
train = train[['CUI','name','file']]
train = pd.merge(train, mrsty, on='CUI')[['CUI','name','file','TUI','STY']]
train.head(1)

  train = pd.read_table(train_path,sep='\|\|',header=None, names=['file_id','ix','type','name','CUI'])


Unnamed: 0,CUI,name,file,TUI,STY
0,C0011854,insulin dependent diabetes mellitus,0,T047,Disease or Syndrome


In [189]:
# Code to derive N2C2_TUI
# Get distinct TUI list in train order by # of CUIs in UMLS
tuis = pd.merge(mrsty, pd.DataFrame(train.TUI.unique(), columns=['TUI']), on='TUI')
tuis = tuis \
    .groupby('TUI').count()['CUI'] \
    .reset_index(name='count') \
    .sort_values(['count'], ascending=False) \
    .TUI.tolist()

# Distinct CUIs when filtering to TUI list
expected = len(set(train[train.TUI.isin(tuis)].CUI))
removed = []

for tui in tuis:
    # Try removing records with given TUI
    subset = train[(train.TUI.isin(tuis)) & (train.TUI != tui)]
    
    # If removing the TUI didn't drop unique CUIs, remove it permanently
    if expected == len(set(subset.CUI)):
        tuis.remove(tui)
        
n2c2_tui = train[(train.TUI.isin(tuis))] \
    .groupby('TUI').count()['CUI'] \
    .reset_index(name='count') \
    .sort_values(['count'], ascending=False)

## Generate n2c2_terminology.txt

In [164]:
umls_term = umls[umls.STR.notnull()]
umls_term.STR = [x.lower().strip() for x in umls_term.STR]

# Filter by TUIs in N2C2 train
umls_term = umls_term[['CUI','STR']]
umls_term = pd.merge(umls_term, mrsty[['CUI','TUI']], on='CUI')
umls_term = pd.merge(umls_term, n2c2_tui[['TUI']], on='TUI')
umls_term = umls_term[['CUI','STR']]

# Remove () qualifiers of the form "acetylcysteine (substance)" with >1000 instances
series = umls_term[umls_term.STR.str.contains('\(')].STR
qualifiers = series.str.extract('(\([^)]*\))').groupby(0) \
    .filter(lambda x : len(x)>1000)[0].unique().tolist()
qualifiers = [q.replace('(','\(').replace(')','\)') for q in qualifiers]
umls_term.STR = umls_term.STR.str.replace('|'.join(qualifiers), '')

# Remove [] qualifiers of the form "[d]spots"
series = umls_term[umls_term.STR.str.contains('\[[a-z]\]')].STR
qualifiers = series.str.extract('(\[[a-z]\])').groupby(0) \
    .filter(lambda x : len(x)>1)[0].unique().tolist()
qualifiers = [q.replace('[','\[').replace(']','\]') for q in qualifiers]
umls_term.STR = umls_term.STR.str.replace('|'.join(qualifiers), '')

# Remove extra whitespace
umls_term.STR = [re.sub(r'\s+',' ',x) for x in umls_term.STR]
umls_term.STR = [x.lower().strip() for x in umls_term.STR]
umls_term = umls_term.drop_duplicates()
umls_term = umls_term[umls_term.STR!='']

print('Unique CUIs:', len(set(umls_term.CUI)))
print('Unique names:', len(set(umls_term.STR)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  umls_term.STR = [x.lower().strip() for x in umls_term.STR]
  umls_term.STR = umls_term.STR.str.replace('|'.join(qualifiers), '')
  umls_term.STR = umls_term.STR.str.replace('|'.join(qualifiers), '')


Unique CUIs: 548578
Unique names: 984442


In [200]:
# Write mrconso_dictionary.txt
with open('../datasets/n2c2/mrconso_dictionary.txt', 'w+') as f:
    for x in umls_term.iterrows():
        try:
            f.write(f'{x[1].CUI}||{x[1].STR}\n')
        except:
            print(f'{x[1].CUI}||{x[1].STR}')

In [201]:
import numpy as np
n2c2_cuis = np.array([])
for s in ['dev','train','test']:
    cuis = pd.read_table(f'../datasets/n2c2/preprocessed/n2c2_{s}.concept', sep='\|\|',header=None)[4].unique()
    n2c2_cuis = np.append(n2c2_cuis,cuis)
n2c2_cuis = pd.DataFrame(set(n2c2_cuis), columns=['CUI'])
print(len(n2c2_cuis))
n2c2_cuis = pd.merge(n2c2_cuis, umls_term, on='CUI', how="left")
not_in_dict = n2c2_cuis[n2c2_cuis.STR.isnull()]
print('CUIs missing from dictionary: ', len(not_in_dict))

  cuis = pd.read_table(f'../datasets/n2c2/preprocessed/n2c2_{s}.concept', sep='\|\|',header=None)[4].unique()
  cuis = pd.read_table(f'../datasets/n2c2/preprocessed/n2c2_{s}.concept', sep='\|\|',header=None)[4].unique()
  cuis = pd.read_table(f'../datasets/n2c2/preprocessed/n2c2_{s}.concept', sep='\|\|',header=None)[4].unique()


2331
CUIs missing from dictionary:  17
