In [12]:
from owlready2 import *
import pandas as pd
import numpy as np
import csv
import xml.sax
from rapidfuzz import process

In [13]:
# Load the BTO ontology.
# The local directory is added to owlready2's ontology search path first, then
# the ontology identified by the OBO PURL below is loaded into `onto`.
BTO_IRI = "http://purl.obolibrary.org/obo/bto.owl"
onto_path.append("/data/ontology")
onto = get_ontology(BTO_IRI).load()
# Debug aid (disabled): print(onto.base_iri)

In [33]:
# Collect the identifiers (names) and human-readable labels of the classes and
# properties declared in the ontology. Every *_names list is paired with a
# *_labels list built from the same iterator.
def _first_label(entity, fallback):
    """Return the first label of ``entity``, or ``fallback`` when it has none."""
    return entity.label[0] if len(entity.label) > 0 else fallback


# Classes: an unlabeled class gets an empty-string label.
class_names = [cls.name for cls in onto.classes()]
class_labels = [_first_label(cls, '') for cls in onto.classes()]

# Object properties: an unlabeled property falls back to its own name.
object_properties_names = [prop.name for prop in onto.object_properties()]
object_properties_labels = [_first_label(prop, prop.name) for prop in onto.object_properties()]

# Annotation properties: same fallback rule as object properties.
annotation_properties_names = [prop.name for prop in onto.annotation_properties()]
annotation_properties_labels = [_first_label(prop, prop.name) for prop in onto.annotation_properties()]

# Everything yielded by onto.properties(); same fallback rule as above.
properties_names = [prop.name for prop in onto.properties()]
properties_labels = [_first_label(prop, prop.name) for prop in onto.properties()]


In [37]:
# Read the annotated body_site terms: the CSV is parsed without a header row
# and its first column becomes a plain Python list.
# NOTE(review): the first entry printed below is 'biosample_tag', which looks
# like a column header rather than a term -- confirm whether header=None is intended.
body_site_frame = pd.read_csv("data/BioAnnotate_rh-body_site.csv", header=None)
body_site_terms = body_site_frame.iloc[:, 0].tolist()
print(body_site_terms)

['biosample_tag', '%_b_cells', '%_blood', '%_brain', '%_granulocytes', '%_monocytes', '%_nk_cells', '%_t_cells', '%_tumour_cell_content', '>_5%_tumor_cellularity', '3rd_leaf_length', '3rd_leaf_length_cm', '3rd_leaf_width', '3rd_leaf_width_cm', '3rd_weigh_date_lbs', 'absolute_lymphocytes', 'absolute_macrophages', 'absolute_neutrophils', 'advanced_adenoma', 'afected_area', 'affected_area', 'age_at_blood_collection', 'age_at_skin_biopsy', 'age_feces', 'anal_cytology', 'anatomic_location', 'anatomic_segment', 'anatomic_site', 'anatomic_site_', 'anatomical_localisation', 'anatomical_location', 'anatomical_material', 'anatomical_part', 'anatomical_position', 'anatomical_site', 'anatomical_sites', 'anatomy', 'animal_source', 'animal_source_specific', 'atcc_cell_line_source', 'atcc_culture_collection_identifier', 'atcc_id', 'atrial_tissue_region', 'b_cell_stage', 'b_cell_subset', 'b_cell_type', 'b_cells', 'background_cell_line', 'bacteria_tissue', 'bal_lymphocyte_percent', 'bal_neutrophil_perc

In [36]:
# Path of the single BioSamples XML file parsed below; file 699 was chosen
# because it is comparatively small.
# A SAX parser is used to avoid memory issues, so in theory all BioSamples
# could be processed from one file.
biosamples_path = "data/biosamples/biosample_set.699.xml"
# Alternative input (disabled):
# biosamples_path = "data/biosamples/biosample_small.xml"


class BioSamplesHandler(xml.sax.ContentHandler):
    """SAX content handler that tracks the id of the <BioSample> being parsed.

    Attributes:
        bioSampleId: value of the ``id`` attribute of the currently open
            <BioSample> element, or "" when no sample is open.
        tissue_value: placeholder string; this class only ever resets it to "".
    """

    def __init__(self):
        super().__init__()
        # Per-sample state; blanked again on every </BioSample>.
        self.bioSampleId = ""
        self.tissue_value = ""

    def startElement(self, tag, attributes):
        """Remember the ``id`` attribute when a <BioSample> element opens."""
        if tag != "BioSample":
            # <Attribute> and all other elements are ignored for now.
            return
        self.bioSampleId = attributes["id"]

    def endElement(self, tag):
        """Clear the per-sample state when a <BioSample> element closes."""
        if tag != "BioSample":
            return
        self.bioSampleId = self.tissue_value = ""


class BioSamplesExploreHandler(xml.sax.ContentHandler):
    """SAX handler that reports <Attribute> elements whose name resembles a body-site term.

    For every <Attribute> element, its ``attribute_name`` is fuzzy-matched
    (``rapidfuzz.process.extractOne``) against a list of body-site terms. When
    the best score is strictly greater than ``score_threshold`` the match and
    the element's text value are printed. Totals are printed at end of document.

    NOTE(review): the output recorded in this notebook shows short names such
    as 'sop' scoring 90 against 'basophil', so the default scorer produces
    false positives at this threshold -- consider a stricter scorer/threshold.

    Args:
        terms: candidate terms to match against. ``None`` (default) uses the
            notebook-level ``body_site_terms`` list, looked up at match time.
        score_threshold: a match is reported only when its score is strictly
            greater than this value (default 85, as in the original analysis).

    Attributes:
        num_samples: number of <BioSample> start tags seen.
        tissue_sample_count: number of matching <Attribute> elements seen
            (attributes are counted, not distinct samples).
        is_tissue_attribute: True while inside a matching <Attribute> element.
        attribute_name: name of the most recent matching attribute.
    """

    def __init__(self, terms=None, score_threshold=85):
        super().__init__()
        self.terms = terms
        self.score_threshold = score_threshold
        self.is_tissue_attribute = False
        self.attribute_name = ""
        self.num_samples = 0
        self.tissue_sample_count = 0
        # Text pieces of the currently open matching <Attribute>; SAX may
        # deliver one text node through several characters() calls.
        self._value_chunks = []
        # attribute_name -> extractOne result. Attribute names repeat for
        # nearly every sample, so each distinct name is matched only once.
        # Assumes the term list is not modified while a document is parsed.
        self._match_cache = {}

    def _best_match(self, attribute_name):
        """Return the best fuzzy match for ``attribute_name``, or None if there is none."""
        if attribute_name not in self._match_cache:
            choices = body_site_terms if self.terms is None else self.terms
            self._match_cache[attribute_name] = process.extractOne(attribute_name, choices)
        return self._match_cache[attribute_name]

    def startElement(self, tag_name, tag_attrs):
        """Count samples and flag <Attribute> elements whose name matches a term."""
        if tag_name == "BioSample":
            self.num_samples += 1
            return
        if tag_name != "Attribute":
            return

        # Skip (instead of raising KeyError on) attributes without a name.
        attribute_name = tag_attrs.get("attribute_name")
        if attribute_name is None:
            return

        search_result = self._best_match(attribute_name)
        # extractOne returns None when there is nothing to match against.
        if search_result is not None and search_result[1] > self.score_threshold:
            self.tissue_sample_count += 1
            self.is_tissue_attribute = True
            self.attribute_name = attribute_name
            self._value_chunks = []
            print("similar term: ", search_result)

    def endElement(self, name):
        """Print the complete value of a matching <Attribute> once it closes."""
        if name == "Attribute" and self.is_tissue_attribute:
            print(self.attribute_name + ": " + "".join(self._value_chunks))
            print("--------------------")
            # Reset here, not in characters(), so an empty element cannot leak
            # its flag onto the text that follows it.
            self.is_tissue_attribute = False
            self._value_chunks = []

    def characters(self, content):
        """Buffer text that belongs to the currently open matching <Attribute>."""
        if self.is_tissue_attribute:
            self._value_chunks.append(content)

    def endDocument(self):
        """Print the number of samples and the number of matching attributes."""
        print(self.num_samples)
        print(self.tissue_sample_count)


# Stream the BioSamples file through the exploration handler.
# (BioSamplesHandler() can be substituted here to use the other handler.)
handler = BioSamplesExploreHandler()
parser = xml.sax.make_parser()
parser.setContentHandler(handler)
parser.parse(biosamples_path)

similar term:  ('basophil', 90.0, 51)
sop: http://hmpdacc.org/doc/CommonGeneAnnotation_SOP.pdf
--------------------
similar term:  ('host_blood', 90.0, 377)
host: Homo sapiens
--------------------
similar term:  ('anatomical_material', 90.0, 31)
material: biological product [ENVO:02000043]
--------------------
similar term:  ('derived_strain', 90.0, 252)
strain: DSM 17216
--------------------
similar term:  ('basophil', 90.0, 51)
sop: http://hmpdacc.org/doc/CommonGeneAnnotation_SOP.pdf
--------------------
similar term:  ('host_blood', 90.0, 377)
host: Homo sapiens
--------------------
similar term:  ('anatomical_material', 90.0, 31)
material: biological product [ENVO:02000043]
--------------------
similar term:  ('derived_strain', 90.0, 252)
strain: DSM 17241
--------------------
similar term:  ('collection_site', 86.66666666666667, 207)
collection_date: 2020-09-16
--------------------
similar term:  ('sample_type_body_site', 90.0, 460)
sample_type: microbe
--------------------
simila