## Download and process the Uberon ontology

In [1]:
import collections
import re

import pandas

import obo

In [2]:
# Download most recent uberon release
! wget --timestamping --directory-prefix download/ http://purl.obolibrary.org/obo/uberon/ext.obo
! wget --timestamping --directory-prefix download/ http://purl.obolibrary.org/obo/uberon/basic.obo

--2015-07-29 18:26:38--  http://purl.obolibrary.org/obo/uberon/ext.obo
Resolving purl.obolibrary.org (purl.obolibrary.org)... 132.174.1.35
Connecting to purl.obolibrary.org (purl.obolibrary.org)|132.174.1.35|:80... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: http://berkeleybop.org/ontologies/uberon/ext.obo [following]
--2015-07-29 18:26:38--  http://berkeleybop.org/ontologies/uberon/ext.obo
Resolving berkeleybop.org (berkeleybop.org)... 131.243.192.99
Connecting to berkeleybop.org (berkeleybop.org)|131.243.192.99|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12973724 (12M) [text/plain]
Server file no newer than local file ‘download/ext.obo’ -- not retrieving.

--2015-07-29 18:26:38--  http://purl.obolibrary.org/obo/uberon/basic.obo
Resolving purl.obolibrary.org (purl.obolibrary.org)... 132.174.1.35
Connecting to purl.obolibrary.org (purl.obolibrary.org)|132.174.1.35|:80... connected.
HTTP request sent, awaiting response.

## Read and process the ontology

In [3]:
# Read obo into graph
with open('download/basic.obo') as read_file:
    basic = obo.read_obo(read_file)
dict(collections.Counter(key for u, v, key in basic.edges(keys=True)))

{'develops_from': 1413,
 'immediate_transformation_of': 71,
 'is_a': 20592,
 'part_of': 10715,
 'transformation_of': 43}

In [4]:
# Extract information from the graph
term_rows = []
xref_rows = []
subset_rows = []

for node, data in basic.nodes(data=True):
    term_rows.append((node, data['name']))
    
    for xref in data.get('xref', []):
        xref_rows.append((node, xref))

    for subset in data.get('subset', []):
        subset_rows.append((node, subset))

term_df = pandas.DataFrame(term_rows, columns=['uberon_id', 'uberon_name'])
xref_df = pandas.DataFrame(xref_rows, columns=['uberon_id', 'xref'])
subset_df = pandas.DataFrame(subset_rows, columns=['uberon_id', 'subset'])

In [5]:
# Create a dataframe of Uberon terms
term_df.to_csv('data/terms.tsv', sep='\t', index=False)
term_df.head()

Unnamed: 0,uberon_id,uberon_name
0,UBERON:0035368,anterior surface of kidney
1,UBERON:4000003,permanent cartilage
2,UBERON:0004617,lumbar vertebra 1
3,UBERON:0014933,periventricular gray matter
4,UBERON:0011149,Marshall's gland


In [6]:
# Update MESH IDs that are tree numbers
url = 'https://raw.githubusercontent.com/dhimmel/mesh/b6893d6502deeaa0f702128d9c8bbddff6b4c755/data/tree-numbers.tsv'
tree_number_df = pandas.read_table(url)
tn_to_id = dict(zip(tree_number_df.mesh_tree_number, tree_number_df.mesh_id))

def update_xref(x):
    vocab, identifier = x.split(':', 1)
    if vocab == 'MESH':
        if re.search('D[0-9]{6}', identifier):
            return x
        return tn_to_id.get(identifier)
    return x

xref_df.xref = xref_df.xref.map(update_xref)

# Create a dataframe of cross-references
xref_df.to_csv('data/xref.tsv', sep='\t', index=False)
xref_df.head()

Unnamed: 0,uberon_id,xref
0,UBERON:0035368,FMA:15589
1,UBERON:0035368,http://linkedlifedata.com/resource/umls/id/C02...
2,UBERON:0035368,http://ncicb.nci.nih.gov/xml/owl/EVS/Anterior_...
3,UBERON:0035368,http://www.snomedbrowser.com/Codes/Details/279...
4,UBERON:0004617,EMAPA:19472


In [7]:
# Create a dataframe of term subsets
xref_df.to_csv('data/subset.tsv', sep='\t', index=False)
subset_dict = {subset: set(df.uberon_id) for subset, df in subset_df.groupby('subset')}
subset_df.head()

Unnamed: 0,uberon_id,subset
0,UBERON:0004617,defined_by_ordinal_series
1,UBERON:0005561,vertebrate_core
2,UBERON:0002895,uberon_slim
3,UBERON:0001716,pheno_slim
4,UBERON:0001716,uberon_slim


## Create `hetio-slim`

`hetio-slim` is a subset of terms created for our [specific project](https://dx.doi.org/10.15363/thinklab.4):

+ potentially human-relevant (definitively non-human terms are removed)
+ in `uberon_slim`
+ not in `non_informative`, `upper_level`, `grouping_class`
+ have a MeSH cross-reference

In [8]:
human_df = pandas.read_table('data/human-constraint.tsv')
human_ids = set(human_df.query('no_negative_evidence == 1').uberon_id)

In [9]:
merged_df = term_df[term_df.uberon_id.isin(human_ids)].merge(xref_df)
merged_df['mesh_id'] = merged_df.xref.map(lambda x: x.split(':', 1)[1] if x and x.startswith('MESH:') else '')
merged_df = merged_df[merged_df.mesh_id != ''].drop('xref', 1)
exclude = subset_dict['non_informative'] | subset_dict['upper_level'] | subset_dict['grouping_class']
merged_df = merged_df[-merged_df.uberon_id.isin(exclude)]
merged_df = merged_df[merged_df.uberon_id.isin(subset_dict['uberon_slim'])]
merged_df.head()

Unnamed: 0,uberon_id,uberon_name,mesh_id
43,UBERON:0001716,secondary palate,D010159
120,UBERON:0001908,optic tract,D014795
230,UBERON:0002286,third ventricle,D020542
312,UBERON:0002349,myocardium,D009206
449,UBERON:0000978,leg,D035002


In [10]:
len(merged_df)

402

In [11]:
# Add mesh_name column
url = 'https://raw.githubusercontent.com/dhimmel/mesh/b6893d6502deeaa0f702128d9c8bbddff6b4c755/data/terms.tsv'
mesh_df = pandas.read_table(url)
merged_df = merged_df.merge(mesh_df)
assert not any(merged_df.uberon_id.duplicated())

In [12]:
# Save hetio-slim as a tsv
merged_df.to_csv('data/hetio-slim.tsv', index=False, sep='\t')