# Introduction

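Extract the ID, name, and description of every term in the Uberon ontology (`uberon.owl.bz2`) and write them to a bzip2-compressed TSV file (`obo.tsv.bz2`).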

In [1]:
import bz2
from lxml import etree
import os

In [2]:
# Decompress the ontology on the fly and let lxml parse directly from the stream.
with bz2.BZ2File("uberon.owl.bz2", "r") as compressed_owl:
    tree = etree.parse(compressed_owl)

In [3]:
# Top-level element of the parsed document; the XPath queries below start from it.
root = tree.getroot()

In [4]:
# Build the prefix -> URI mapping passed to every XPath call below.
# The document's default namespace is stored under the key None, which lxml's
# XPath prefix mapping does not accept, so re-register it as "uberon".
namespaces = dict(root.nsmap)
uberon = namespaces.pop(None)
namespaces["uberon"] = uberon
namespaces

{'cl': 'http://purl.obolibrary.org/obo/cl#',
 'dc': 'http://purl.org/dc/elements/1.1/',
 'go': 'http://purl.obolibrary.org/obo/go#',
 'pr': 'http://purl.obolibrary.org/obo/pr#',
 'cl1': 'http://purl.obolibrary.org/obo/cl#3',
 'cl2': 'http://purl.obolibrary.org/obo/cl#2',
 'cl3': 'http://purl.obolibrary.org/obo/cl#1',
 'obo': 'http://purl.obolibrary.org/obo/',
 'owl': 'http://www.w3.org/2002/07/owl#',
 'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
 'xsd': 'http://www.w3.org/2001/XMLSchema#',
 'bspo': 'http://purl.obolibrary.org/obo/bspo#',
 'cito': 'http://purl.org/spar/cito/',
 'core': 'http://purl.obolibrary.org/obo/uberon/core#',
 'doap': 'http://usefulinc.com/ns/doap#',
 'foaf': 'http://xmlns.com/foaf/0.1/',
 'pato': 'http://purl.obolibrary.org/obo/pato#',
 'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
 'skos': 'http://www.w3.org/2004/02/skos/core#',
 'swrl': 'http://www.w3.org/2003/11/swrl#',
 'chebi': 'http://purl.obolibrary.org/obo/chebi/',
 'swrla': 'http://swrl.stanf

In [5]:
# Select every owl:Class element anywhere in the document.
terms = root.xpath("//owl:Class", namespaces=namespaces)

In [6]:
# Number of owl:Class elements matched by the query.
len(terms)

28235

In [7]:
# Inspect the child elements of one sample class to see which properties it carries.
# list(element) replaces lxml's deprecated getchildren() and returns the same list.
list(terms[20000])

[<Element {http://www.w3.org/2002/07/owl#}equivalentClass at 0x7fe9497031c0>,
 <Element {http://www.w3.org/2000/01/rdf-schema#}subClassOf at 0x7fe949703240>,
 <Element {http://www.w3.org/2000/01/rdf-schema#}subClassOf at 0x7fe949703280>,
 <Element {http://www.w3.org/2000/01/rdf-schema#}subClassOf at 0x7fe9497032c0>,
 <Element {http://www.w3.org/2000/01/rdf-schema#}subClassOf at 0x7fe949703300>,
 <Element {http://purl.obolibrary.org/obo/}IAO_0000115 at 0x7fe949703380>,
 <Element {http://www.geneontology.org/formats/oboInOwl#}hasOBONamespace at 0x7fe9497033c0>,
 <Element {http://www.geneontology.org/formats/oboInOwl#}id at 0x7fe949703400>,
 <Element {http://www.w3.org/2000/01/rdf-schema#}label at 0x7fe949703440>]

In [8]:
# Relative XPath: matches only the rdfs:label children of this one class.
terms[20000].xpath("rdfs:label", namespaces=namespaces)

[<Element {http://www.w3.org/2000/01/rdf-schema#}label at 0x7fe949703440>]

In [9]:
# Text of the first obo:IAO_0000115 child of this class.
# Raises IndexError if the class has no such child.
terms[20000].xpath("obo:IAO_0000115", namespaces=namespaces)[0].text

'A distal tarsal bone 3 endochondral element that is composed primarily of a pre-cartilage condensation.'

In [10]:
def _first_text(element, path, prefix_map):
    """Return the text of the first node matching ``path`` below ``element``.

    ``path`` is evaluated as an XPath relative to ``element`` using
    ``prefix_map`` for namespace prefixes. Returns None when nothing matches
    or when the first match has no text content.
    """
    matches = element.xpath(path, namespaces=prefix_map)
    return matches[0].text if matches else None


# Collect one (term_id, term_name, description) tuple of strings per owl:Class.
# Classes lacking an oboInOwl:id or an rdfs:label (element missing, or present
# but without text) are skipped; a missing or empty description becomes "".
# Guarding against None here keeps every tuple joinable with "\t".join(...).
# NOTE: this rebinds `terms` from the list of elements to a list of tuples.
terms = []
for owl_class in root.xpath("//owl:Class", namespaces=namespaces):
    term_id = _first_text(owl_class, "oboInOwl:id", namespaces)
    term_name = _first_text(owl_class, "rdfs:label", namespaces)
    if term_id is None or term_name is None:
        continue
    description = _first_text(owl_class, "obo:IAO_0000115", namespaces) or ""
    terms.append((term_id, term_name, description))

In [12]:
# Characters that would break the one-record-per-line, tab-delimited layout.
_TSV_UNSAFE = str.maketrans({"\t": " ", "\n": " ", "\r": " "})


def _tsv_field(value):
    """Return ``value`` as text that is safe to place in a single TSV field.

    None becomes an empty string; embedded tabs, carriage returns and
    newlines are each replaced by a space so a record stays on one line with
    the expected number of columns.
    """
    return "" if value is None else value.translate(_TSV_UNSAFE)


# Write a header row followed by one tab-separated row per collected term.
# Text mode lets the stream do the UTF-8 encoding and translate "\n" into the
# platform line separator, so rows that were already well-formed produce the
# same bytes as writing os.linesep by hand.
with bz2.open("obo.tsv.bz2", "wt", encoding="utf8") as outstream:
    outstream.write("term_id\tterm_name\tdescription\n")
    for term in terms:
        outstream.write("\t".join(_tsv_field(field) for field in term) + "\n")