In [1]:
# Download the Uberon ext.obo file into the download/ directory.
# --timestamping makes wget skip the transfer when the server copy is no newer
# than the local file, so re-running this cell does not re-download it.
! wget --timestamping --directory-prefix download/ http://purl.obolibrary.org/obo/uberon/ext.obo

--2015-05-19 16:08:42--  http://purl.obolibrary.org/obo/uberon/ext.obo
Resolving purl.obolibrary.org (purl.obolibrary.org)... 132.174.1.35
Connecting to purl.obolibrary.org (purl.obolibrary.org)|132.174.1.35|:80... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: http://berkeleybop.org/ontologies/uberon/ext.obo [following]
--2015-05-19 16:08:42--  http://berkeleybop.org/ontologies/uberon/ext.obo
Resolving berkeleybop.org (berkeleybop.org)... 131.243.192.99
Connecting to berkeleybop.org (berkeleybop.org)|131.243.192.99|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12821492 (12M) [text/plain]
Server file no newer than local file ‘download/ext.obo’ -- not retrieving.



In [30]:
import json
import re

import pandas
import requests

import obo

In [3]:
# Parse the downloaded OBO file; `ontology` is the resulting
# networkx MultiDiGraph used by the extraction cells below.
with open('download/ext.obo') as obo_handle:
    ontology = obo.read_obo(obo_handle)

In [None]:
# Read MeSH json. The URL embeds a commit hash, so the same file is fetched
# on every run.
url = 'https://raw.githubusercontent.com/dhimmel/mesh/1a1ab7f1d30cb8a21618b5239370863ccb6de547/data/mesh.json'
response = requests.get(url)
# Fail here, with the HTTP status, if the download did not succeed; otherwise
# an error page would be handed to the JSON parser and surface as a confusing
# decode error.
response.raise_for_status()
mesh = response.json()

# Flatten to one row per (term, tree number): a term with several tree
# numbers yields several rows, a term with none yields no row.
rows = [
    [term['mesh_id'], term['mesh_name'], tree_number]
    for term in mesh
    for tree_number in term['tree_numbers']
]

mesh_df = pandas.DataFrame(rows, columns=['mesh_id', 'mesh_name', 'mesh_tree_number'])

# Lookup from MeSH tree number to mesh_id, used to resolve tree-number
# cross-references to their term.
tn_to_uid = dict(zip(mesh_df.mesh_tree_number, mesh_df.mesh_id))

# UBERON

In [45]:
# Extract MeSH cross-references for Uberon terms.
mesh_prefix = 'MESH:'
rows = []
# `nodes(data=True)` yields (node, attribute-dict) pairs on both networkx 1.x
# and 2.x; `nodes_iter` only exists on 1.x.
for node, data in ontology.nodes(data=True):
    if not node.startswith('UBERON:'):
        continue
    # Keep the part after the 'MESH:' prefix. Filtering on the full prefix
    # (colon included) means an xref without a colon cannot cause an
    # IndexError.
    mesh_refs = [
        xref[len(mesh_prefix):]
        for xref in data.get('xref', [])
        if xref.startswith(mesh_prefix)
    ]
    # Only terms with exactly one MeSH xref are kept; terms with none or with
    # several are skipped.
    if len(mesh_refs) != 1:
        continue
    rows.append([node, data['name'], mesh_refs[0]])

# NOTE(review): the xref value is stored as `mesh_tree_number` and later joined
# on that column; an xref that is a descriptor ID rather than a tree number
# would not match there -- confirm against the data.
uberon_mesh_df = pandas.DataFrame(rows, columns=['uberon_id', 'uberon_name', 'mesh_tree_number'])

In [48]:
# Left join on the MeSH tree number: every Uberon row is kept, and the MeSH
# columns are empty where the tree number has no match in mesh_df.
(
    uberon_mesh_df
    .merge(mesh_df, on='mesh_tree_number', how='left')
    .to_csv('data/mesh-map-with-unmatched.tsv', sep='\t', index=False)
)

In [50]:
# Inner join on the MeSH tree number: only Uberon rows whose tree number is
# present in mesh_df are written.
(
    uberon_mesh_df
    .merge(mesh_df, on='mesh_tree_number', how='inner')
    .to_csv('data/mesh-map.tsv', sep='\t', index=False)
)

# Cell Ontology

In [34]:
# A MeSH reference is 'MESH:' followed by a letter and dot-separated digit
# groups (a tree number such as A11.635.470) or a letter and digits only
# (a unique ID such as D006098). Every dot must be followed by digits, so a
# sentence-ending period after a reference in free text is not captured.
mesh_pattern = re.compile(r'MESH:[A-Z][0-9]+(?:\.[0-9]+)*')

# Extract MeSH cross-references for Cell Ontology terms.
rows = []
# `nodes(data=True)` works on both networkx 1.x and 2.x; `nodes_iter` only
# exists on 1.x.
for node, data in ontology.nodes(data=True):
    if not node.startswith('CL:'):
        continue

    # Search the definition, the synonyms and the xrefs of the term.
    matched_mesh = []
    for value in [data.get('def', '')] + data.get('synonym', []) + data.get('xref', []):
        matched_mesh.extend(mesh_pattern.findall(value))

    # De-duplicate per term, and sort so the row order (and the TSV written
    # from it) does not depend on set iteration order.
    for matched_mesh_id in sorted(set(matched_mesh)):
        mesh_id = matched_mesh_id.split(':', 1)[1]
        # 'D' plus six digits is treated as a MeSH unique ID and kept as is.
        # NOTE(review): longer unique IDs would fall through to the lookup
        # below -- confirm whether any occur in the data.
        is_unique_id = mesh_id.startswith('D') and len(mesh_id) == 7 and '.' not in mesh_id
        if not is_unique_id:
            # Otherwise treat it as a tree number; None when it is unknown.
            mesh_id = tn_to_uid.get(mesh_id)
        rows.append([node, data['name'], matched_mesh_id, mesh_id])

cl_mesh_df = pandas.DataFrame(
    rows, columns=['cl_id', 'cl_name', 'matched_string', 'mesh_id']
).drop_duplicates()
cl_mesh_df.head()

Unnamed: 0,cl_id,cl_name,matched_string,mesh_id
0,CL:0000202,auditory hair cell,MESH:A08.663.650.250,
1,CL:0000513,cardiac muscle myoblast,MESH:A11.635.470,D032386
2,CL:0000094,granulocyte,MESH:D006098,D006098
3,CL:0000094,granulocyte,MESH:A11.118.637.415,D006098
4,CL:0000499,stromal cell,MESH:A11.329.830,D017154


In [44]:
# Write every CL match as tab-separated text, including rows whose mesh_id
# could not be resolved.
cl_mesh_df.to_csv(
    'data/CL-mesh-map-with-unmatched.tsv',
    sep='\t',
    index=False,
)