In [1]:
import numpy as np
import pyarrow.feather as feather
import pandas as pd
import pickle
import random

# Directory holding the preprocessed artifacts: the feather files read by this
# notebook and the pickle files it writes.
processed_path = 'processed'

In [2]:
# Filtered MRCONSO: keep only the identifier, name and preferred-flag columns
concepts = pd.read_feather(
    f'{processed_path}/mrconso.feather'
)[['CUI', 'STR', 'ISPREF']]
concepts_cnt = format(len(concepts), ',')
print(f'{concepts_cnt} concepts')

# MRREL (described upstream as filtered to SNOMED and RxNorm): keep the two
# endpoints and the relationship label
rel = pd.read_feather(
    f'{processed_path}/mrrel.feather'
)[['CUI1', 'CUI2', 'REL']]
rel_cnt = format(len(rel), ',')
print(f'{rel_cnt} relationships')

1,844,773 concepts
2,122,638 relationships


In [25]:
# Show every name row recorded for a single CUI
concepts.loc[concepts['CUI'] == 'C0013528']

Unnamed: 0,CUI,STR,ISPREF
31445,C0013528,Echolalia,N
31446,C0013528,Echo speech,Y
31447,C0013528,Echolalia (finding),Y


In [3]:
# Every PAR row should be mirrored by a CHD row. Normalise both halves to
# (Parent, Child) columns, sorted identically, so they can be compared directly.
par = (
    rel.loc[rel['REL'] == 'PAR', ['CUI2', 'CUI1']]
    .sort_values(['CUI2', 'CUI1'])
    .reset_index(drop=True)
    .rename(columns={'CUI2': 'Parent', 'CUI1': 'Child'})
)
chd = (
    rel.loc[rel['REL'] == 'CHD', ['CUI1', 'CUI2']]
    .sort_values(['CUI1', 'CUI2'])
    .reset_index(drop=True)
    .rename(columns={'CUI1': 'Parent', 'CUI2': 'Child'})
)

assert par.equals(chd), "Non-symmetric parent/child relationship present"

# The two halves are identical, so keep only the PAR-derived one
relations = par
relation_cnt = format(len(relations), ',')
print(f'{relation_cnt} unique relationships')

1,061,319 unique relationships


In [4]:
# Keep one preferred name per CUI: the first ISPREF == 'Y' row wins
pref_concepts = concepts.loc[concepts['ISPREF'] == 'Y'].drop_duplicates(subset='CUI')
assert pref_concepts['CUI'].is_unique, "Non-unique CUI"

# Attach the child's name; the inner join drops relations whose child has no preferred name
df = relations.merge(pref_concepts, how="inner", left_on="Child", right_on="CUI")
length_after_merge = len(df)
drop_cnt = format(len(relations) - length_after_merge, ',')
print(f'Dropped {drop_cnt} relations merging child column')

# Attach the parent's name the same way
df = df.merge(pref_concepts, how="inner", left_on="Parent", right_on="CUI")
drop_cnt = format(length_after_merge - len(df), ',')
print(f'Dropped {drop_cnt} relations merging parent column')

# STR_x came from the child merge, STR_y from the parent merge
df = (
    df[['Child', 'Parent', 'STR_x', 'STR_y']]
    .rename(columns={'STR_x': 'Child_Name', 'STR_y': 'Parent_Name'})
)
relation_cnt = format(len(df), ',')
print(f'{relation_cnt} relationships remaining')
df.head(1)

Dropped 13,442 relations merging child column
Dropped 4,163 relations merging parent column
1,043,714 relationships remaining


Unnamed: 0,Child,Parent,Child_Name,Parent_Name
0,C0000300,C0000102,2-naphthylamine,1-naththylamine


In [5]:
def _group_pairs(keys, names, cuis):
    """Group (name, CUI) tuples under their key, preserving row order.

    Iterates the three columns in lockstep instead of using
    ``DataFrame.iterrows()``, which builds a Series object for every row and
    is very slow on frames of this size.

    Returns a plain ``dict`` mapping each key to a list of (name, CUI) tuples,
    so the pickled objects keep the same type and layout as before.
    """
    grouped = {}
    for key, name, cui in zip(keys, names, cuis):
        grouped.setdefault(key, []).append((name, cui))
    return grouped


# Dictionary of (child name, child CUI) tuples keyed on parent CUI
children = _group_pairs(df['Parent'], df['Child_Name'], df['Child'])

with open(f'{processed_path}/children.pickle', 'wb') as f:
    pickle.dump(children, f)

# Dictionary of (parent name, parent CUI) tuples keyed on child CUI
parents = _group_pairs(df['Child'], df['Parent_Name'], df['Parent'])

with open(f'{processed_path}/parents.pickle', 'wb') as f:
    pickle.dump(parents, f)

# Spot check one parent and one child
print(children['C0000102'])
print(parents['C0000300'])

[('2-naphthylamine', 'C0000300'), ('4-nitroso dimethylamine', 'C0301275'), ('N-b-bis (2-chloroethyl)-2-naphthylamine', 'C0303972')]
[('1-naththylamine', 'C0000102')]


In [None]:
# Build a dictionary mapping each lower-cased name to the set of CUIs using it.
# dropna() skips rows whose name or CUI is missing (None or NaN), so .lower()
# is only ever called on real strings. itertuples avoids the per-row Series
# that iterrows() would build, and add() mutates the existing set instead of
# copying it with union() on every repeated name.
name_cuis = {}
for name, cui in concepts[['STR', 'CUI']].dropna().itertuples(index=False, name=None):
    name_cuis.setdefault(name.lower(), set()).add(cui)

with open(f'{processed_path}/name_cuis.pickle', 'wb') as f:
    pickle.dump(name_cuis, f)

In [46]:
import pickle
import random

# Helper methods
def flatten(list):
    "Collapse an iterable of iterables into the set of their distinct items"
    # The parameter name shadows the builtin; it is kept so existing callers
    # (including keyword callers) continue to work.
    unique_items = set()
    for sublist in list:
        unique_items.update(sublist)
    return unique_items

def parent_bfs(cuis, parent_dict):
    "Return the set of parent CUIs of the given CUIs; CUIs missing from parent_dict contribute nothing"
    # Each parent_dict value is a sequence of (name, cui) pairs; only the CUI is kept
    return {
        parent_cui
        for cui in cuis
        if cui in parent_dict
        for _, parent_cui in parent_dict[cui]
    }

def extend_lineage(l_given, l_compare, parent_dict):
    """Extend a lineage up one level and check if the LCA is found.

    l_given and l_compare are lists of sets of CUIs, one set per level,
    starting from the level of the original CUI. l_given is extended in place
    when a new level is found.

    Returns (l_given, d, reached_root):
      * d is None unless the new level shares a CUI with l_compare; in that
        case it is the index of the new level in l_given plus the index of
        the first level of l_compare that shares a CUI with it.
      * reached_root is True when no unvisited parents exist, in which case
        l_given is returned unchanged.
    """
    # Parents of the newest level, minus every CUI already in this lineage
    next_level = parent_bfs(l_given[-1], parent_dict) - flatten(l_given)

    if not next_level:
        return l_given, None, True

    distance = None
    # Only the new level needs checking against the other lineage
    if next_level & flatten(l_compare):
        distance = next(
            (
                len(l_given) + depth
                for depth, level in enumerate(l_compare)
                if not level.isdisjoint(next_level)
            ),
            None,
        )

    l_given.append(next_level)
    return l_given, distance, False

class Umls():
    """Parent/child lookups over the pickled hierarchy found in a directory."""

    def __init__(self, path):
        """Load `children.pickle` and `parents.pickle` from the directory `path`."""
        # NOTE: pickle.load can execute arbitrary code while deserialising;
        # only point `path` at files produced by a trusted process.
        with open(f'{path}/children.pickle', 'rb') as f:
            # Used below as: parent CUI -> list of (child name, child CUI) tuples
            self.children = pickle.load(f)

        with open(f'{path}/parents.pickle', 'rb') as f:
            # Used below as: child CUI -> list of (parent name, parent CUI) tuples
            self.parents = pickle.load(f)

    def _add_sampled(self, pool, candidates, k):
        "Add up to k not-yet-chosen entries of pool to candidates (in place); return (added, remaining k)"
        if len(pool) > k:
            pool = random.sample(pool, k)
        added = set(pool).difference(candidates)
        candidates.update(added)
        return added, k - len(added)

    def get_candidates(self, cui:str, k:int):
        """Returns k ontological candidates for the specified CUI.

        Candidates are drawn, in order, from: the CUI's parents (at most half
        of k), its children, the children of the chosen parents, and the
        parents of the chosen children. Whenever a pool is larger than the
        remaining budget it is randomly sampled.

        Returns a list of exactly k `[name, cui]` lists when k >= 0; missing
        slots are filled with independent `["NAME", "CUI"]` placeholders.
        """
        candidates = set()

        # Append up to half parent candidates
        parents = []
        if cui in self.parents:
            parents = self.parents[cui]
            if len(parents) > k // 2:
                parents = random.sample(parents, k // 2)
            candidates.update(parents)
            k = k - len(candidates)

        # Append children
        kids = []
        if cui in self.children:
            kids, k = self._add_sampled(self.children[cui], candidates, k)

        # Append siblings (children of parents) until k is reached
        # NOTE(review): these pools contain `cui` itself, so the query concept
        # can come back as its own candidate - confirm whether that is intended.
        for _, parent_cui in parents:
            if k == 0:
                break
            if parent_cui in self.children:
                _, k = self._add_sampled(self.children[parent_cui], candidates, k)

        # Append siblings (parents of children) until k is reached
        for _, child_cui in kids:
            if k == 0:
                break
            if child_cui in self.parents:
                _, k = self._add_sampled(self.parents[child_cui], candidates, k)

        # Pad with placeholder entries if enough candidates can't be found.
        # Each placeholder is a separate list object: `[[...]] * k` would
        # repeat one shared list, so editing one slot would change them all.
        #TODO: Check that all candidates are in the dictionary?
        #TODO: Return non-random options?
        padding = [["NAME", "CUI"] for _ in range(k)]
        return [[n, c] for n, c in candidates] + padding

    def dist(self, cui1, cui2):
        """Finds the lowest common ancestor between two CUIs in the UMLS and calculates distance between them.

        Both lineages are grown one level at a time, alternating between the
        two CUIs, until a newly added level of one touches the other. The
        distance is then the sum of the two level indices. If both lineages
        run out of parents without meeting, the sum of the two lineage lengths
        is returned instead. Identical CUIs have distance 0.
        """

        # If CUIs are identical, distance is zero
        if cui1==cui2:
            return 0

        lineage1 = [{cui1}]
        lineage2 = [{cui2}]
        reached_root1 = reached_root2 = False
        while not reached_root1 or not reached_root2:
            if not reached_root1:
                lineage1, d, reached_root1 = extend_lineage(lineage1, lineage2, self.parents)
                if d is not None: break

            if not reached_root2:
                lineage2, d, reached_root2 = extend_lineage(lineage2, lineage1, self.parents)
                if d is not None: break

        # No common ancestor was found before both lineages were exhausted
        if reached_root1 and reached_root2:
            d = len(lineage1) + len(lineage2)

        if d is None:
            print("THIS SHOULD NOT HAPPEN")
        return d

    def similarity(self, cui1, cui2):
        "Calculates the ontological similarity between two CUIS as 1/(1+distance); 0 if no valid distance exists"
        d = self.dist(cui1, cui2)
        # Guard against a missing distance so the comparison cannot raise TypeError
        if d is None or d < 0:
            return 0
        return 1/(1+d)

In [47]:
# Load the pickled children/parents dictionaries from the 'processed' directory
umls = Umls('processed')
# Spot check: distance between two CUIs (the cell's displayed result)
umls.dist('C0013516','C0013528')

8

In [36]:
# Show the (parent name, parent CUI) tuples recorded for one child CUI;
# `parents` is the dictionary built earlier in this notebook.
parents['C0562492']

[('Clinical history and observation findings', 'C0427350'),
 ('Speech and language finding', 'C0564649')]