# Mapping UBERON to MeSH

In [30]:
import json
import re

import pandas
import requests

import obo

## MeSH tree numbers

In [None]:
# Read MeSH json (the URL is pinned to a specific commit of the source repository)
url = 'https://raw.githubusercontent.com/dhimmel/mesh/1a1ab7f1d30cb8a21618b5239370863ccb6de547/data/mesh.json'
# A timeout keeps the cell from hanging forever on a stalled connection
response = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of handing an error page to the JSON parser
response.raise_for_status()
mesh = json.loads(response.text)

# Extract (mesh_id, mesh_name, mesh_tree_number) rows;
# a term with several tree numbers contributes one row per tree number
rows = []
for term in mesh:
    mesh_id = term['mesh_id']
    mesh_name = term['mesh_name']
    for tree_number in term['tree_numbers']:
        rows.append([mesh_id, mesh_name, tree_number])

mesh_df = pandas.DataFrame(rows, columns=['mesh_id', 'mesh_name', 'mesh_tree_number'])

# Lookup from MeSH tree number to MeSH unique ID
tn_to_uid = dict(zip(mesh_df.mesh_tree_number, mesh_df.mesh_id))

## UBERON

In [None]:
# Download .obo file
# NOTE(review): presumably download.sh fetches the download/basic.obo file that
# is read further down -- confirm, then uncomment the line below on a fresh checkout.
#! sh download.sh

In [88]:
# Read the ontology into a networkx MultiDiGraph.
# The encoding is given explicitly so parsing does not depend on the platform's
# default locale encoding (OBO files are UTF-8).
with open('download/basic.obo', encoding='utf-8') as obo_file:
    ontology = obo.read_obo(obo_file)

In [89]:
# Collect the xref resources that indicate human terms: header entries of the
# form "<resource> <relationship> <genus>" whose genus is NCBITaxon:9606
header_entries = ontology.graph['treat-xrefs-as-reverse-genus-differentia']
human_xrefs = {
    resource
    for resource, relationship, genus in (entry.split(' ') for entry in header_entries)
    if genus == 'NCBITaxon:9606'
}
human_xrefs

{'DHBA', 'EHDAA2', 'FMA', 'HBA', 'HsapDv'}

In [90]:
# Parse xrefs and add as node attribute: every node gets an 'xrefs' list whose
# items are the raw 'xref' strings split once on ':' (nodes without 'xref' get []).
# nodes(data=True) is used instead of nodes_iter(data=True) because the latter
# was removed in networkx 2.0; nodes(data=True) yields (node, attribute dict)
# pairs on both 1.x and 2.x.
for node, data in ontology.nodes(data=True):
    data['xrefs'] = [xref.split(':', 1) for xref in data.get('xref', [])]

In [91]:
def get_children(graph, node, in_keys = {'is_a', 'part_of'}, out_keys = set()):
    """Return every node reachable from `node` through the selected edge keys.

    Incoming edges whose key is in `in_keys` are followed from target to
    source; outgoing edges whose key is in `out_keys` are followed from source
    to target. The same key sets are applied at every depth of the traversal.

    Parameters
    ----------
    graph : object exposing `in_edges(node, keys=True)` and
        `out_edges(node, keys=True)`, each yielding (u, v, key) triples.
    node : the starting node.
    in_keys : container of edge keys to follow along incoming edges.
    out_keys : container of edge keys to follow along outgoing edges.

    Returns
    -------
    set
        All nodes reached by the traversal, excluding `node` itself.
    """
    # The default containers are shared between calls; they are only read
    # here, never mutated, so sharing them is safe.
    children = set()
    # Seeding `visited` with the start node keeps it out of the result and
    # guarantees termination if the selected edges form a cycle.
    visited = {node}
    pending = [node]
    while pending:
        current = pending.pop()
        for u, v, key in graph.in_edges(current, keys=True):
            if key in in_keys and u not in visited:
                visited.add(u)
                children.add(u)
                pending.append(u)
        for u, v, key in graph.out_edges(current, keys=True):
            if key in out_keys and v not in visited:
                visited.add(v)
                children.add(v)
                pending.append(v)
    return children

In [92]:
# Spot-check get_children on a single term using the default edge keys
# (is_a and part_of, followed along incoming edges)
get_children(ontology, 'UBERON:0009870')

{'UBERON:0001160',
 'UBERON:0001161',
 'UBERON:0001162',
 'UBERON:0001163',
 'UBERON:0001164',
 'UBERON:0001165',
 'UBERON:0001166',
 'UBERON:0001202',
 'UBERON:0004550',
 'UBERON:0004933',
 'UBERON:0004934',
 'UBERON:0004935',
 'UBERON:0004936',
 'UBERON:0004937',
 'UBERON:0004994',
 'UBERON:0004995',
 'UBERON:0004996',
 'UBERON:0004997',
 'UBERON:0004998',
 'UBERON:0005477',
 'UBERON:0005637',
 'UBERON:0008858',
 'UBERON:0008859',
 'UBERON:0008860',
 'UBERON:0008861',
 'UBERON:0010038',
 'UBERON:0010238',
 'UBERON:0012503',
 'UBERON:0016501',
 'UBERON:0016511'}

In [93]:
# Extract MeSH cross-references
# Build a node -> attribute-dict lookup once. dict(nodes(data=True)) works on
# networkx 1.x and 2.x alike, unlike nodes_iter() and the .node attribute,
# which later networkx releases removed.
node_attributes = dict(ontology.nodes(data=True))

rows = []
for node, data in node_attributes.items():
    if not node.startswith('UBERON:'):
        continue
    xrefs = data['xrefs']
    mesh_refs = [ref[1] for ref in xrefs if ref[0] == 'MESH']
    # Keep only terms with exactly one MeSH xref; terms with none or with
    # several are skipped
    if len(mesh_refs) != 1:
        continue

    # Find whether term is in humans by seeing if node or its children
    # xref specific human terminologies
    children = get_children(ontology, node, in_keys = {'is_a', 'part_of', 'develops_from'})
    child_xrefs = set()
    for child in children | {node}:
        child_xrefs |= {xref[0] for xref in node_attributes[child]['xrefs']}
    # 1 if any xref resource of the node or its children is human-specific, else 0
    human = int(bool(child_xrefs & human_xrefs))

    rows.append([node, data['name'], mesh_refs[0], human])

uberon_mesh_df = pandas.DataFrame(rows, columns=['uberon_id', 'uberon_name', 'mesh_tree_number', 'in_human'])

In [94]:
# Display the UBERON terms that carry exactly one MeSH cross-reference
uberon_mesh_df

Unnamed: 0,uberon_id,uberon_name,mesh_tree_number,in_human
0,UBERON:0001577,facial muscle,A02.633.567.400,1
1,UBERON:0001092,vertebral bone 1,A02.835.232.834.151.213,1
2,UBERON:0001681,nasal bone,A02.835.232.781.324.665,1
3,UBERON:0001264,pancreas,A03.734,1
4,UBERON:0002356,perineum,A01.719,1
5,UBERON:0001688,incus bone,A09.246.397.247.362,1
6,UBERON:0010074,chromaffin system,A06.224,1
7,UBERON:0004019,baroreceptor,A08.800.050.800.900.700,1
8,UBERON:0001283,bile canaliculus,A03.159.183.158.125,1
9,UBERON:0001648,vestibulocochlear nerve,A08.800.800.120.910,1


In [95]:
# Left-join UBERON terms to MeSH terms on the shared mesh_tree_number column,
# so UBERON rows without a matching MeSH term are kept, then write the table as TSV
uberon_mesh_left_df = uberon_mesh_df.merge(mesh_df, on='mesh_tree_number', how='left')
uberon_mesh_left_df.to_csv('data/mesh-map-with-unmatched.tsv', sep='\t', index=False)

In [96]:
# Inner-join UBERON terms to MeSH terms on the shared mesh_tree_number column,
# so only UBERON rows with a matching MeSH term remain, then write the table as TSV
uberon_mesh_inner_df = uberon_mesh_df.merge(mesh_df, on='mesh_tree_number', how='inner')
uberon_mesh_inner_df.to_csv('data/mesh-map.tsv', sep='\t', index=False)

## Cell Ontology

In [34]:
# Matches a MeSH reference: 'MESH:' + a capital letter + digits, optionally
# followed by dot-separated digit groups. The pattern cannot end on a '.', so a
# sentence-final period after a reference is not captured as part of the
# identifier (which would make the tree-number lookup below fail).
mesh_pattern = re.compile(r'MESH:[A-Z][0-9]+(?:\.[0-9]+)*')

# Extract MeSH cross-references mentioned in the definition, synonyms or xrefs
# of each Cell Ontology term.
# NOTE(review): only nodes whose ID starts with 'CL:' are used, so `ontology`
# must have been loaded from a file that contains such terms -- confirm.
rows = []
# nodes(data=True) rather than nodes_iter(data=True): the latter was removed in
# networkx 2.0, the former works on both 1.x and 2.x
for node, data in ontology.nodes(data=True):
    if not node.startswith('CL:'):
        continue

    matched_mesh = list()
    for value in [data.get('def', '')] + data.get('synonym', []) + data.get('xref', []):
        matched_mesh += mesh_pattern.findall(value)
    # sorted() makes the row order reproducible; bare set iteration order for
    # strings varies between interpreter runs
    for matched_mesh_id in sorted(set(matched_mesh)):
        mesh_id = matched_mesh_id.split(':', 1)[1]
        # Anything that is not a 'D' + six-character unique ID is treated as a
        # tree number and translated; unknown tree numbers become None
        if not (mesh_id.startswith('D') and len(mesh_id) == 7 and '.' not in mesh_id):
            mesh_id = tn_to_uid.get(mesh_id)
        rows.append([node, data['name'], matched_mesh_id, mesh_id])

cl_mesh_df = pandas.DataFrame(rows, columns=['cl_id', 'cl_name', 'matched_string', 'mesh_id']).drop_duplicates()
cl_mesh_df.head()

Unnamed: 0,cl_id,cl_name,matched_string,mesh_id
0,CL:0000202,auditory hair cell,MESH:A08.663.650.250,
1,CL:0000513,cardiac muscle myoblast,MESH:A11.635.470,D032386
2,CL:0000094,granulocyte,MESH:D006098,D006098
3,CL:0000094,granulocyte,MESH:A11.118.637.415,D006098
4,CL:0000499,stromal cell,MESH:A11.329.830,D017154


In [44]:
# Write the Cell Ontology to MeSH mapping as TSV; rows whose tree number was not
# found in tn_to_uid are kept, with a missing mesh_id
cl_mesh_df.to_csv('data/CL-mesh-map-with-unmatched.tsv', index=False, sep='\t')