In [132]:
import pyarrow.feather as feather
import pandas as pd
import pickle
import random

processed_path = 'umls/processed'

## Load MRREL.RRF 
One-time load UMLS CUI relationship mapping file

In [133]:
# Load raw MRREL from source, filter and save (Only needs to be done once)
file = 'umls/raw/MRREL.RRF'
names = ['CUI1','AUI1','STYPE1','REL','CUI2','AUI2','STYPE2','RELA','RUI','SRUI','SAB','SL','RG','DIR','SUPPRESS','CVF','BLANK']
usecols = ['CUI1','REL','CUI2','RELA','SAB','SL']
mrrel = pd.read_table(file,sep='|',header=None, names=names, usecols=usecols)
snomed_rxnorm_rel = mrrel[mrrel.SAB.isin(['SNOMEDCT_US','RXNORM'])]
snomed_rxnorm_rel = snomed_rxnorm_rel[snomed_rxnorm_rel.REL.isin(['PAR','CHD'])][['CUI1','CUI2','REL']]
snomed_rxnorm_rel = snomed_rxnorm_rel[snomed_rxnorm_rel.CUI1!=snomed_rxnorm_rel.CUI2]
snomed_rxnorm_rel.reset_index(drop=True).to_feather(f'{processed_path}/snomed_rxnorm_rel.feather')

snomed_rxnorm_rel_cnt = '{:,}'.format(len(snomed_rxnorm_rel))
print(f'{snomed_rxnorm_rel_cnt} raw relations')
snomed_rxnorm_rel.head(1)

  mrrel = pd.read_table(file,sep='|',header=None, names=names, usecols=usecols)


2,122,638 raw relations


Unnamed: 0,CUI1,CUI2,REL
80,C0000039,C0031610,PAR


In [134]:
# Load filtered MRCONSO
concepts = pd.read_feather(f'{processed_path}/mrconso.feather')[['CUI','STR','ISPREF']]
concepts_cnt = '{:,}'.format(len(concepts))
print(f'{concepts_cnt} concepts')

# Load MRREL filtered to SNOMED and RxNorm
rel = pd.read_feather(f'{processed_path}/snomed_rxnorm_rel.feather')[['CUI1','CUI2','REL']]
rel_cnt = '{:,}'.format(len(rel))
print(f'{rel_cnt} relationships')

1,844,773 concepts
2,122,638 relationships


In [135]:
# Check that every parent-child record has a symmetric child-parent
par = (rel[rel.REL=='PAR'][['CUI2','CUI1']]
    .sort_values(['CUI2','CUI1'])
    .reset_index(drop=True)
)
par.columns = ['Parent','Child']
chd = (rel[rel.REL=='CHD'][['CUI1','CUI2']]
    .sort_values(['CUI1','CUI2'])
    .reset_index(drop=True)
)
chd.columns = ['Parent','Child']

assert par.equals(chd), "Non-symmetric parent/child relationship present"

# Drop child half of symmetric relationships
relations = par
relation_cnt = '{:,}'.format(len(relations))
print(f'{relation_cnt} unique relationships')

1,061,319 unique relationships


In [136]:
# Choosing a single preferred name per CUI
pref_concepts = concepts[concepts.ISPREF=='Y'].drop_duplicates('CUI')
assert len(pref_concepts.CUI)==len(set(pref_concepts.CUI)), "Non-unique CUI"

# Join CUIs with names
df = pd.merge(relations, pref_concepts, how="inner", left_on="Child", right_on="CUI")
length_after_merge = len(df)
drop_cnt = '{:,}'.format(len(relations)-length_after_merge)
print(f'Dropped {drop_cnt} relations merging child column')

df = pd.merge(df, pref_concepts, how="inner", left_on="Parent", right_on="CUI")
drop_cnt = '{:,}'.format(length_after_merge-len(df))
print(f'Dropped {drop_cnt} relations merging parent column')

df = df[['Child','Parent','STR_x','STR_y']]
df.columns = ['Child','Parent','Child_Name','Parent_Name']
relation_cnt = '{:,}'.format(len(df))
print(f'{relation_cnt} relationships remaining')
df.head(1)

Dropped 13,442 relations merging child column
Dropped 4,163 relations merging parent column
1,043,714 relationships remaining


Unnamed: 0,Child,Parent,Child_Name,Parent_Name
0,C0000300,C0000102,2-naphthylamine,1-naththylamine


In [137]:
# Build a dictionary of child names keyed on parent CUI
children = {}
for parent_cui, child_name in df.set_index('Parent')['Child_Name'].iteritems():
    if parent_cui not in children:
        children[parent_cui] = []
    children[parent_cui].append(child_name)

with open(f'{processed_path}/children.pickle', 'wb') as f:
    pickle.dump(children, f)

# Build a dictionary to parent name keyed on child CUI
parent_name = df.set_index('Child')['Parent_Name'].to_dict()

with open(f'{processed_path}/parent_name.pickle', 'wb') as f:
    pickle.dump(parent_name, f)

# Build a dictionary to parent name keyed on child CUI
parent_cui = df.set_index('Child')['Parent'].to_dict()

with open(f'{processed_path}/parent_cui.pickle', 'wb') as f:
    pickle.dump(parent_cui, f)

print(children['C0000102'])
print(parent_name['C0000300'])
print(parent_cui['C0000300'])

['2-naphthylamine', '4-nitroso dimethylamine', 'N-b-bis (2-chloroethyl)-2-naphthylamine']
1-naththylamine
C0000102


In [138]:
with open(f'{processed_path}/children.pickle', 'rb') as f:
    children = pickle.load(f)

with open(f'{processed_path}/parent_name.pickle', 'rb') as f:
    parent_name = pickle.load(f)

def get_candidates(cui:str, k:int):
    "Returns k ontological candidates for the specified CUI"
    candidates = []

    # Append parent
    if cui in parent_name:
        candidates.append(parent_name[cui])
        k = k-len(candidates)

    # Append children
    if cui in children:
        kids = children[cui]
        if len(kids) > k:
            kids = random.sample(kids, k)
        candidates += kids

    # TODO: What about siblings?
    # TODO: What if enough candidates cannot be found?

    return candidates

print('Candidates for child:',get_candidates(df.iloc[-3].Child,5))
print()
print('Candidates for parent:',get_candidates(df.iloc[-3].Parent,5))

Candidates for child: ['Evaluation of eligibility for food pantry program']

Candidates for parent: ['Evaluation (procedure)', 'Evaluation of eligibility for Food Distribution Program on Indian Reservations']


In [152]:
def umls_dist(cui1, cui2, dist_dict={}):
    "Finds the lowest common ancestor between two CUIs in the UMLS and calculates distance between them"

    # If CUIs are identical, distance is zero
    if cui1==cui2:
        return 0
    
    # Check for precomputed distance
    key = tuple(sorted([cui2, cui1]))
    if key in dist_dict:
        return dist_dict[key]

    lineage1 = [cui1]
    lineage2 = [cui2]
    lca = None

    while lineage1[-1] in parent_cui and lineage2[-1] in parent_cui:
        print(lineage1[-1],parent_name[lineage1[-1]], '|||||||||||||||||', lineage2[-1],parent_name[lineage2[-1]])
        # Extend lineage1 by one level
        if lineage1[-1] in parent_cui:
            cui = parent_cui[lineage1[-1]]
            lineage1.append(cui)
            if cui in lineage2:
                lca = cui
                break

        # Extend lineage2 by one level
        if lineage2[-1] in parent_cui:
            cui = parent_cui[lineage2[-1]]
            lineage2.append(cui)
            if cui in lineage1:
                lca = cui
                break
    
    dist = lineage1.index(lca) + lineage2.index(lca) if lca else float('inf')
    return dist

umls_dist('C0588734','C3668918')

C0588734 Royal Air Force officer (occupation) ||||||||||||||||| C3668918 Dapagliflozin only product in oral dose form
C0588725 Royal Air Force personnel (occupation) ||||||||||||||||| C4756044 Dapagliflozin only product
C0588724 Military services member ||||||||||||||||| C4752726 dapagliflozin
C0336524 Worker (occupation) ||||||||||||||||| C2353951 Sodium glucose cotransporter subtype 2 inhibitor (substance)
C1306056 Employee (person) ||||||||||||||||| C3838944 Antidiabetic agent
C0599987 Person in the work environment ||||||||||||||||| C0935929 Medicinal product categorized by therapeutic role
C0557511 Person in the community environment ||||||||||||||||| C4708067 General drug type
C0580211 Person ||||||||||||||||| C0013227 © 2002-2012 International Health Terminology Standards Development Organisation (IHTSDO). All rights reserved. SNOMED CT®, was originally created by The College of American Pathologists. "SNOMED" and "SNOMED CT" are registered trademarks of the IHTSDO.


inf

In [147]:
n = 100
cuis1 = list(relations.Parent.sample(n))
cuis2 = list(relations.Child.sample(n))

In [166]:
mrrel[(mrrel.CUI1=='C0360511')&(mrrel.CUI2=='C0019932')]

Unnamed: 0,CUI1,REL,CUI2,RELA,SAB,SL
17170932,C0360511,CHD,C0019932,isa,SNOMEDCT_US,SNOMEDCT_US
17170933,C0360511,CHD,C0019932,isa,SNOMEDCT_US,SNOMEDCT_US
17170964,C0360511,PAR,C0019932,inverse_isa,SNOMEDCT_US,SNOMEDCT_US
17170965,C0360511,RO,C0019932,active_ingredient_of,SNOMEDCT_US,SNOMEDCT_US
17171574,C0360511,CHD,C0019932,isa,SCTSPA,SCTSPA
17171575,C0360511,CHD,C0019932,isa,SCTSPA,SCTSPA
17171606,C0360511,PAR,C0019932,inverse_isa,SCTSPA,SCTSPA
17171607,C0360511,RO,C0019932,active_ingredient_of,SCTSPA,SCTSPA


In [164]:
cycles = pd.merge(df, df, how="inner", left_on=['Child','Parent'], right_on=['Parent','Child'])
cycles

Unnamed: 0,Child_x,Parent_x,Child_Name_x,Parent_Name_x,Child_y,Parent_y,Child_Name_y,Parent_Name_y
0,C0360511,C0019932,"Hormones, synthetic substitutes and antagonist...",Hormone agent,C0019932,C0360511,Hormone agent,"Hormones, synthetic substitutes and antagonist..."
1,C0360511,C0019932,"Hormones, synthetic substitutes and antagonist...",Hormone agent,C0019932,C0360511,Hormone agent,"Hormones, synthetic substitutes and antagonist..."
2,C0732611,C0521963,Selective estrogen receptor modulator,Synthetic antiestrogen,C0521963,C0732611,Synthetic antiestrogen,Selective estrogen receptor modulator
3,C0732611,C0521963,Selective estrogen receptor modulator,Synthetic antiestrogen,C0521963,C0732611,Synthetic antiestrogen,Selective estrogen receptor modulator
4,C0732611,C0521963,Selective estrogen receptor modulator,Synthetic antiestrogen,C0521963,C0732611,Synthetic antiestrogen,Selective estrogen receptor modulator
...,...,...,...,...,...,...,...,...
4855,C3872570,C3873378,Primary salivary gland type carcinoma of hypop...,Primary mucoepidermoid carcinoma of hypopharyn...,C3873378,C3872570,Primary mucoepidermoid carcinoma of hypopharyn...,Primary salivary gland type carcinoma of hypop...
4856,C3873334,C3854329,Primary salivary gland type carcinoma of lung,Primary mucoepidermoid carcinoma of lung (diso...,C3854329,C3873334,Primary mucoepidermoid carcinoma of lung (diso...,Primary salivary gland type carcinoma of lung
4857,C3873334,C3873335,Primary salivary gland type carcinoma of lung,Primary adenoid cystic carcinoma of lung (diso...,C3873335,C3873334,Primary adenoid cystic carcinoma of lung (diso...,Primary salivary gland type carcinoma of lung
4858,C3873798,C3873799,Thermography system,Computed tomography system thermography system,C3873799,C3873798,Computed tomography system thermography system,Thermography system


In [160]:
cui = 'C0506706'
print(cui,parent_cui[cui], parent_name[cui])
cui = parent_cui[cui]
print(cui,parent_cui[cui], parent_name[cui])
cui = parent_cui[cui]
print(cui,parent_cui[cui], parent_name[cui])

C0506706 C0700276 Anatomical structure
C0700276 C0506706 Physical anatomical entity
C0506706 C0700276 Anatomical structure


In [148]:
%time
for cui1, cui2 in zip(cuis1,cuis2):
    # print(cui1, cui2)
    umls_dist(cui1,cui2)

CPU times: total: 0 ns
Wall time: 0 ns
C1276746 C0408670
C0420206 C0408665
C0586513 C0025286
C0543467 C0025284
C0521126 C0474843
C0332191 C0474842
C1534707 C0027651
C1285164 C1302761
C0684881 C1290805
C1306609 C0038354
C0153664 C1290613
C1306609 C0266804
C0153664 C2700612
C1306609 C0012242
C0153664 C1285159
C1306609 C1290853
C0153664 C1290906
C1306609 C0243095
C3873653 C0397682
C0699733 C0456349
C0347997 C0457182
C0003209 C4516413
C4708067 C1827498
C0013227 C0226525
C0559473 C0068771
C0574102 C0059814
C0424892 C0050505
C0574102 C0014631
C0424892 C0035845
C0574102 C0440137
C0424892 C0011379
C0574102 C0586397
C0424892 C0439861
C0347810 C0426114
C0015806 C0426110
C0435805 C1286421
C3840178 C0566787
C0281865 C0566692
C3697286 C0425961
C0560636 C0037088
C0304008 C0472582
C0032088 C0194046
C0557858 C1292863
C0567360 C1285153
C0440265 C1285152
C0520510 C0184661
C4750866 C1045768
C0178916 C0321977
C0442711 C0997947
C0586393 C0562623
C1285164 C0562622
C1285640 C0205103
C1274016 C0441889
C128557

KeyboardInterrupt: 