In [5]:
import pandas

import obo

In [7]:
# Load the OBO file into a networkx MultiDiGraph
obo_path = 'download/basic.obo'
with open(obo_path) as handle:
    ontology = obo.read_obo(handle)

In [8]:
# Collect the xref resources that indicate human terms.
# Each header entry is expected to split into exactly three space-separated
# fields (resource, relationship, genus); anything else raises ValueError.
human_xrefs = {
    resource
    for resource, _relationship, genus in (
        entry.split(' ')
        for entry in ontology.graph['treat-xrefs-as-reverse-genus-differentia']
    )
    if genus == 'NCBITaxon:9606'
}
human_xrefs

{'DHBA', 'EHDAA2', 'FMA', 'HBA', 'HsapDv'}

In [9]:
# Split every raw xref on its first ':' and store the pieces on the node
# under 'xrefs'; nodes without an 'xref' attribute get an empty list.
for node_id, attrs in ontology.nodes_iter(data=True):
    raw_xrefs = attrs.get('xref', [])
    attrs['xrefs'] = [raw.split(':', 1) for raw in raw_xrefs]

In [11]:
def get_children(graph, node, in_keys = {'is_a', 'part_of'}, out_keys = set()):
    """
    Return the set of all terms reachable from `node` by repeatedly following
    incoming edges whose key is in `in_keys` and outgoing edges whose key is
    in `out_keys`.

    Parameters
    ----------
    graph : multigraph exposing `in_edges(node, keys=True)` and
        `out_edges(node, keys=True)`, each yielding `(u, v, key)` triples.
    node : the term whose descendants are collected.
    in_keys : edge keys followed against edge direction (source is the child).
    out_keys : edge keys followed along edge direction (target is the child).

    Returns
    -------
    set
        Every term reached, at any depth. `node` itself is never included.

    The same `in_keys` and `out_keys` are applied at every depth, and each
    term is expanded only once, so the traversal terminates on cyclic graphs.
    The default sets are only read, never mutated.
    """
    children = set()
    pending = [node]
    while pending:
        current = pending.pop()
        for u, v, key in graph.in_edges(current, keys=True):
            # Skip terms already collected (and the start node) to avoid re-expansion
            if key in in_keys and u != node and u not in children:
                children.add(u)
                pending.append(u)
        for u, v, key in graph.out_edges(current, keys=True):
            if key in out_keys and v != node and v not in children:
                children.add(v)
                pending.append(v)
    return children

In [12]:
# Flag each UBERON term with positive evidence of being a human term
# Relationship keys followed when collecting the terms below a node
descendant_keys = {'is_a', 'part_of', 'develops_from'}

rows = []
for node, data in ontology.nodes_iter(data=True):
    if not node.startswith('UBERON:'):
        continue

    # Find whether term is in humans by seeing if the node or any of its
    # children xref a human-specific terminology
    children = get_children(ontology, node, in_keys = descendant_keys)
    child_xrefs = set()
    for child in children | {node}:
        # First element of each parsed xref is the resource prefix;
        # relies on the 'xrefs' node attribute having been populated already
        child_xrefs |= {xref[0] for xref in ontology.node[child]['xrefs']}
    human = int(bool(child_xrefs & human_xrefs))

    rows.append([node, data['name'], human])

pos_df = pandas.DataFrame(rows, columns=['uberon_id', 'uberon_name', 'positive_evidence'])

In [26]:
# Read negative evidence df
# Build the two-column frame directly from the raw table instead of assigning
# into a column-sliced frame, which triggers pandas' SettingWithCopyWarning.
constraints_raw = pandas.read_table('download/ext_human_constraints.tsv')
neg_df = pandas.DataFrame(
    {
        'uberon_id': constraints_raw['Uberon ID'],
        # 1 where the '9606' column equals 'T', 0 otherwise
        'no_negative_evidence': (constraints_raw['9606'] == 'T').astype(int),
    },
    columns=['uberon_id', 'no_negative_evidence'],
)

In [30]:
# Inner-join positive and negative evidence on the shared term id,
# write the combined table as TSV, and preview the first rows
human_df = pandas.merge(pos_df, neg_df, on='uberon_id', how='inner')
human_df.to_csv('data/human-constraint.tsv', sep='\t', index=False)
human_df.head()

Unnamed: 0,uberon_id,uberon_name,positive_evidence,no_negative_evidence
0,UBERON:0011624,superior horn of thyroid cartilage,1,1
1,UBERON:0001642,superior sagittal sinus,1,1
2,UBERON:0013695,colon endothelium,0,1
3,UBERON:0001111,intercostal muscle,1,1
4,UBERON:2001813,preopercular sensory canal,0,0
