In [5]:
from owlready2 import *
import pandas as pd
import numpy as np
import csv
import xml.sax
from fuzzywuzzy import process



In [6]:
# Load the BTO ontology.
# Register the local ontology directory only if it is not already listed, so
# re-running this cell does not keep appending duplicate entries to onto_path.
ONTOLOGY_DIR = "/data/ontology"
if ONTOLOGY_DIR not in onto_path:
    onto_path.append(ONTOLOGY_DIR)
onto = get_ontology("http://purl.obolibrary.org/obo/bto.owl").load()
# print(onto.base_iri)

In [7]:
# Gather names and display labels for the ontology's classes and properties.

def _label_or(entity, fallback):
    """Return the first label of `entity`, or `fallback` if it has no labels."""
    labels = entity.label
    return labels[0] if len(labels) > 0 else fallback


# Classes: an unlabeled class gets an empty string as its label.
onto_classes = list(onto.classes())
class_names = [cls.name for cls in onto_classes]
class_labels = [_label_or(cls, '') for cls in onto_classes]

# Object properties: an unlabeled property falls back to its own name.
onto_object_properties = list(onto.object_properties())
object_properties_names = [prop.name for prop in onto_object_properties]
object_properties_labels = [_label_or(prop, prop.name) for prop in onto_object_properties]

# Annotation properties: same fallback rule as object properties.
onto_annotation_properties = list(onto.annotation_properties())
annotation_properties_names = [prop.name for prop in onto_annotation_properties]
annotation_properties_labels = [_label_or(prop, prop.name) for prop in onto_annotation_properties]

# Everything returned by onto.properties(); per the original note this covers
# all properties, including the object and annotation properties listed above.
onto_properties = list(onto.properties())
properties_names = [prop.name for prop in onto_properties]
properties_labels = [_label_or(prop, prop.name) for prop in onto_properties]


In [8]:
# Read the annotated body_site terms from a headerless CSV and keep the first
# column (label 0) as a plain Python list.
body_site_terms = (
    pd.read_csv("data/BioAnnotate_rh-body_site.csv", header=None)[0]
    .tolist()
)
# print(body_site_terms)

In [17]:
# Select a single BioSamples XML file to load.
# NOTE(review): file 699 was noted as chosen for its smaller size, but that
# path is commented out below; the active input is biosample_small.xml.
# A SAX (streaming) parser is used to avoid memory issues, so in theory a
# single file holding all BioSamples could be processed the same way.
# biosamples_path = "data/biosamples/biosample_set.699.xml"
biosamples_path = "data/biosamples/biosample_small.xml"

class BioSamplesHandler(xml.sax.ContentHandler):
    """SAX content handler that tracks the BioSample element being parsed.

    ``bioSampleId`` is set when a ``BioSample`` element opens, and both
    ``bioSampleId`` and ``tissue_value`` are cleared when it closes.
    Nothing populates ``tissue_value`` yet.
    """

    def __init__(self):
        # Initialise the base class so the state it maintains (such as the
        # document locator) exists on this instance.
        super().__init__()
        self.bioSampleId = ""
        self.tissue_value = ""

    def startElement(self, tag, attributes):
        """Record the id of a newly opened ``BioSample`` element.

        Args:
            tag: name of the element being opened.
            attributes: SAX attributes object for the element.
        """
        if tag == "BioSample":
            # A sample without an ``id`` attribute is recorded as "" rather
            # than aborting the whole parse with a KeyError.
            self.bioSampleId = attributes.get("id", "")
        # ``Attribute`` elements are not processed yet.

    def endElement(self, tag):
        """Clear the per-sample state when a ``BioSample`` element closes."""
        if tag == "BioSample":
            self.bioSampleId = ""
            self.tissue_value = ""


class BioSamplesExploreHandler(xml.sax.ContentHandler):
    """Exploratory SAX handler for a BioSamples XML file.

    Prints the XML attributes of every ``Attribute`` element as it is
    opened, counts ``BioSample`` elements, and prints that count when the
    document ends.
    """

    def __init__(self):
        # Initialise the base class so the state it maintains (such as the
        # document locator) exists on this instance.
        super().__init__()
        # True while the parser is inside an ``Attribute`` element.
        self.is_attribute = False
        # Number of ``BioSample`` elements opened so far.
        self.num_samples = 0

    def startElement(self, name, attrs):
        """Count ``BioSample`` elements and print ``Attribute`` metadata.

        Args:
            name: name of the element being opened.
            attrs: SAX attributes object for the element.
        """
        if name == "BioSample":
            self.num_samples += 1
        elif name == "Attribute":
            self.is_attribute = True
            print(attrs.items())
            # TODO: fuzzy-match the attribute's value against body_site_terms
            # (process.extractOne) and report matches scoring above 90.

    def endElement(self, name):
        """Leave attribute mode when an ``Attribute`` element closes."""
        if name == "Attribute":
            # Cleared here rather than in characters(): a parser may deliver
            # one text node in several chunks, and an empty element delivers
            # none at all.
            self.is_attribute = False

    def characters(self, content):
        """Hook for the text of the current ``Attribute`` element (unused).

        ``self.is_attribute`` is True for every chunk of that text.
        """
        if self.is_attribute:
            # print(content)
            pass

    def endDocument(self):
        """Print the number of ``BioSample`` elements seen."""
        print(self.num_samples)


# Create the exploration handler, attach it to a fresh SAX parser, and stream
# the selected BioSamples file through it.
# (Swap in BioSamplesHandler() here to use the per-sample handler instead.)
handler = BioSamplesExploreHandler()
parser = xml.sax.make_parser()
parser.setContentHandler(handler)
parser.parse(biosamples_path)

[('attribute_name', 'finishing strategy (depth of coverage)')]
[('attribute_name', 'collection date'), ('harmonized_name', 'collection_date'), ('display_name', 'collection date')]
[('attribute_name', 'estimated_size'), ('harmonized_name', 'estimated_size'), ('display_name', 'estimated size')]
[('attribute_name', 'sop')]
[('attribute_name', 'project_type')]
[('attribute_name', 'host'), ('harmonized_name', 'host'), ('display_name', 'host')]
[('attribute_name', 'lat_lon'), ('harmonized_name', 'lat_lon'), ('display_name', 'latitude and longitude')]
[('attribute_name', 'biome'), ('harmonized_name', 'env_broad_scale'), ('display_name', 'broad-scale environmental context')]
[('attribute_name', 'misc_param: HMP body site')]
[('attribute_name', 'nucleic acid extraction')]
[('attribute_name', 'feature'), ('harmonized_name', 'env_local_scale'), ('display_name', 'local-scale environmental context')]
[('attribute_name', 'investigation_type'), ('harmonized_name', 'investigation_type'), ('display_nam