# Mapping UBERON to MeSH

In [1]:
import json
import re

import pandas
import requests

import obo

## MeSH tree numbers

In [2]:
# Read MeSH json
# The URL is pinned to a specific commit so the downloaded content does not
# change between runs.
url = 'https://raw.githubusercontent.com/dhimmel/mesh/1a1ab7f1d30cb8a21618b5239370863ccb6de547/data/mesh.json'
response = requests.get(url)
# Fail loudly on an HTTP error instead of handing an error page to the JSON parser
response.raise_for_status()
mesh = response.json()

# Flatten the MeSH terms into one [mesh_id, mesh_name, tree_number] row per
# tree number (a term contributes as many rows as it has tree numbers).
rows = []
for term in mesh:
    uid, label = term['mesh_id'], term['mesh_name']
    rows.extend([uid, label, tree_number] for tree_number in term['tree_numbers'])

mesh_columns = ['mesh_id', 'mesh_name', 'mesh_tree_number']
mesh_df = pandas.DataFrame(data=rows, columns=mesh_columns)

# Lookup from MeSH tree number to MeSH identifier
tn_to_uid = dict(zip(mesh_df['mesh_tree_number'], mesh_df['mesh_id']))

## UBERON

In [3]:
# Download .obo file
# The shell escape below is commented out, so this cell does nothing when run.
# Presumably download.sh produces download/basic.obo, which the next cell
# reads -- TODO confirm against the script.
#! sh download.sh

In [4]:
# Parse the local OBO file with obo.read_obo; the result is used below as a
# networkx MultiDiGraph whose nodes carry the term attributes.
obo_path = 'download/basic.obo'
with open(obo_path) as handle:
    ontology = obo.read_obo(handle)

In [5]:
# Parse xrefs and add as node attribute
# Each raw 'xref' string is split once on its first ':' into [prefix, identifier]
# and the resulting list is stored on the node under 'xrefs'. Nodes without an
# 'xref' attribute get an empty list.
# nodes(data=True) is used instead of nodes_iter(data=True): nodes_iter was
# removed in networkx 2.0, while nodes(data=True) exists in both 1.x and 2.x
# and yields the live attribute dicts, so the assignment below persists.
for _node, data in ontology.nodes(data=True):
    data['xrefs'] = [xref.split(':', 1) for xref in data.get('xref', [])]

In [6]:
# Extract MeSH cross-references
# Builds one [uberon_id, uberon_name, MeSH reference] row for every UBERON node
# that has exactly one MESH xref; nodes with zero or several are skipped.
# Relies on the 'xrefs' node attribute (lists produced by splitting each xref
# once on ':') having been added beforehand.
rows = []
# nodes(data=True) works on networkx 1.x and 2.x; nodes_iter was removed in 2.0
for node, data in ontology.nodes(data=True):
    if not node.startswith('UBERON:'):
        continue
    # len(ref) > 1 skips an xref that had no ':' (a one-element split result),
    # which would otherwise raise IndexError on ref[1]
    mesh_refs = [ref[1] for ref in data['xrefs'] if ref[0] == 'MESH' and len(ref) > 1]
    if len(mesh_refs) != 1:
        continue

    rows.append([node, data['name'], mesh_refs[0]])

uberon_mesh_df = pandas.DataFrame(rows, columns=['uberon_id', 'uberon_name', 'mesh_tree_number'])

In [8]:
# Preview the first rows of the UBERON / MeSH cross-reference table
uberon_mesh_df.head()

Unnamed: 0,uberon_id,uberon_name,mesh_tree_number
0,UBERON:0014892,skeletal muscle organ,A02.633.567
1,UBERON:0003133,reproductive organ,A05.360
2,UBERON:0002606,neuropil,A08.637.500
3,UBERON:0001310,umbilical artery,A07.231.114.929
4,UBERON:0000924,ectoderm,A16.254.425.273


In [16]:
# Load the human constraint table and keep only the UBERON identifier together
# with its two evidence columns.
constraint_columns = ['uberon_id', 'positive_evidence', 'no_negative_evidence']
human_df = pandas.read_table('data/human-constraint.tsv')[constraint_columns]
human_df.head()

Unnamed: 0,uberon_id,positive_evidence,no_negative_evidence
0,UBERON:0011624,1,1
1,UBERON:0001642,1,1
2,UBERON:0013695,0,1
3,UBERON:0001111,1,1
4,UBERON:2001813,0,0


In [13]:
# Left-join the UBERON terms to the MeSH table and write the result as TSV.
# No join key is given, so pandas joins on the columns the two frames share
# (mesh_tree_number). UBERON rows without a MeSH match are kept.
with_unmatched_df = uberon_mesh_df.merge(mesh_df, how='left')
with_unmatched_df.to_csv('data/mesh-map-with-unmatched.tsv', sep='\t', index=False)

In [15]:
# Keep only UBERON terms whose tree number is present in the MeSH table
# (inner join), then left-join the human constraint columns onto those terms
# and write the result as TSV. Both merges join on the columns the frames share.
inner_df = (
    uberon_mesh_df
    .merge(mesh_df, how='inner')
    .merge(human_df, how='left')
)
inner_df.to_csv('data/mesh-map.tsv', sep='\t', index=False)

## Cell Ontology

In [34]:
# MeSH references embedded in Cell Ontology text look like 'MESH:A11.118.637'
# (tree number) or 'MESH:D006098' (unique ID). Dots are only accepted between
# digit groups, so a sentence-ending period right after a reference is not
# captured as part of it.
mesh_pattern = re.compile(r'MESH:[A-Z][0-9]+(?:\.[0-9]+)*')
# MeSH descriptor unique IDs are 'D' followed by 6 digits, or 9 digits for
# newer descriptors; anything else is treated as a tree number.
mesh_uid_pattern = re.compile(r'D[0-9]{6}(?:[0-9]{3})?$')

# Extract MeSH cross-references
# One row per (Cell Ontology term, distinct matched MeSH reference). mesh_id is
# the matched unique ID itself, or the ID looked up from the tree number via
# tn_to_uid (None when the tree number is not in that mapping).
rows = []
# nodes(data=True) works on networkx 1.x and 2.x; nodes_iter was removed in 2.0
for node, data in ontology.nodes(data=True):
    if not node.startswith('CL:'):
        continue

    # Search the definition, the synonyms and the raw xrefs of the term
    matched_mesh = set()
    for value in [data.get('def', '')] + data.get('synonym', []) + data.get('xref', []):
        matched_mesh.update(mesh_pattern.findall(value))

    # Sorted so the row order does not depend on set iteration order
    for matched_mesh_id in sorted(matched_mesh):
        mesh_id = matched_mesh_id.split(':', 1)[1]
        if not mesh_uid_pattern.match(mesh_id):
            mesh_id = tn_to_uid.get(mesh_id)
        rows.append([node, data['name'], matched_mesh_id, mesh_id])

cl_mesh_df = pandas.DataFrame(rows, columns=['cl_id', 'cl_name', 'matched_string', 'mesh_id']).drop_duplicates()
cl_mesh_df.head()

Unnamed: 0,cl_id,cl_name,matched_string,mesh_id
0,CL:0000202,auditory hair cell,MESH:A08.663.650.250,
1,CL:0000513,cardiac muscle myoblast,MESH:A11.635.470,D032386
2,CL:0000094,granulocyte,MESH:D006098,D006098
3,CL:0000094,granulocyte,MESH:A11.118.637.415,D006098
4,CL:0000499,stromal cell,MESH:A11.329.830,D017154


In [44]:
# Write the Cell Ontology / MeSH table as TSV, including rows whose mesh_id is empty
cl_mesh_df.to_csv('data/CL-mesh-map-with-unmatched.tsv', sep='\t', index=False)