In [3]:
from owlready2 import *
import pandas as pd
import numpy as np
import csv
import xml.sax
from rapidfuzz import process
# from rapidfuzz import fuzz
import rapidfuzz

In [4]:
# Load the BTO ontology: register the local ontology directory on
# owlready2's search path first, then request the ontology by its IRI.
onto_path.append("/data/ontology")
onto = get_ontology(
    "http://purl.obolibrary.org/obo/bto.owl"
).load()

In [5]:
# Collect identifiers and display labels for every class in the ontology.
class_names = [cls.name for cls in onto.classes()]
# Some classes have no label; those get an empty-string placeholder.
class_labels = ['' if len(cls.label) == 0 else cls.label[0] for cls in onto.classes()]

# Names and labels of the ontology's properties, split by kind.
# A property without a label falls back to its name.
# `properties_*` holds all properties, including the object and annotation
# properties that are also listed separately.
properties_names = [prop.name for prop in onto.properties()]
properties_labels = [
    prop.name if len(prop.label) == 0 else prop.label[0]
    for prop in onto.properties()
]

object_properties_names = [prop.name for prop in onto.object_properties()]
object_properties_labels = [
    prop.name if len(prop.label) == 0 else prop.label[0]
    for prop in onto.object_properties()
]

annotation_properties_names = [prop.name for prop in onto.annotation_properties()]
annotation_properties_labels = [
    prop.name if len(prop.label) == 0 else prop.label[0]
    for prop in onto.annotation_properties()
]

print("Number of classes: ", len(class_names))


Number of classes:  6569


In [7]:
# Read the annotated body_site terms from the first column of a header-less
# CSV file and keep only the distinct values.
body_site_terms = set(
    pd.read_csv("data/BioAnnotate_rh-body_site.csv", header=None)[0].tolist()
)

In [10]:
# Path to a single BioSamples XML file; file 699 is used because it is smaller.
# The file is read with a SAX (streaming) parser to avoid memory issues, so in
# theory all BioSamples could be processed from one file.
biosamples_path = "data/biosamples/biosample_set.699.xml"
# Alternative input:
# biosamples_path = "data/biosamples/biosample_small.xml"


class BioSamplesHandler(xml.sax.ContentHandler):
    """SAX handler that tracks the id of the BioSample currently being read.

    ``bioSampleId`` holds the ``id`` attribute of the open ``BioSample``
    element and is cleared again when that element closes. ``tissue_value``
    is only ever reset to the empty string by this class.
    """

    def __init__(self):
        super().__init__()
        self._clear_sample_state()

    def _clear_sample_state(self):
        """Reset the per-sample fields to their empty defaults."""
        self.bioSampleId = ""
        self.tissue_value = ""

    def startElement(self, tag, attributes):
        """Remember the sample id when a ``BioSample`` element opens."""
        # Every other element (including "Attribute") is ignored.
        if tag != "BioSample":
            return
        self.bioSampleId = attributes["id"]

    def endElement(self, tag):
        """Forget the per-sample state when a ``BioSample`` element closes."""
        if tag != "BioSample":
            return
        self._clear_sample_state()


class BioSamplesExploreHandler(xml.sax.ContentHandler):
    """Exploratory SAX handler that counts samples and tissue-like attributes.

    For every ``Attribute`` element, its ``attribute_name`` is fuzzy-matched
    with ``process.extractOne`` against a collection of body-site terms. A best
    score strictly greater than ``score_threshold`` counts as a match.

    Attributes:
        num_samples: number of ``BioSample`` start tags seen.
        tissue_sample_count: number of matching ``Attribute`` elements. It is
            incremented per attribute, not per sample, so it can exceed
            ``num_samples``.
        is_tissue_attribute: whether the most recently opened ``Attribute``
            element matched.
        attribute_name: name of the most recent matching attribute.

    Args:
        terms: collection of terms to match against. When ``None`` (default),
            the module-level ``body_site_terms`` is used.
        score_threshold: a match requires a score strictly above this value.
    """

    def __init__(self, terms=None, score_threshold=85):
        super().__init__()
        self.is_tissue_attribute = False
        self.attribute_name = ""
        self.num_samples = 0
        self.tissue_sample_count = 0
        self._terms = terms
        self._score_threshold = score_threshold
        # attribute name -> bool, so each distinct name is scored only once
        # per handler. Assumes the term collection does not change while this
        # handler is in use.
        self._match_cache = {}

    def _is_tissue_name(self, attribute_name):
        """Return True if ``attribute_name`` fuzzy-matches one of the terms."""
        matched = self._match_cache.get(attribute_name)
        if matched is None:
            choices = body_site_terms if self._terms is None else self._terms
            search_result = process.extractOne(attribute_name, choices)
            # extractOne yields None when there is nothing to match against.
            matched = (
                search_result is not None
                and search_result[1] > self._score_threshold
            )
            self._match_cache[attribute_name] = matched
        return matched

    def startElement(self, tag_name, tag_attrs):
        """Count ``BioSample`` elements and matching ``Attribute`` elements."""
        if tag_name == "BioSample":
            self.num_samples += 1
        if tag_name == "Attribute":
            # An Attribute element without an attribute_name is treated as a
            # non-match instead of aborting the whole parse with a KeyError.
            attribute_name = tag_attrs.get("attribute_name")
            matched = (
                attribute_name is not None
                and self._is_tissue_name(attribute_name)
            )
            # The flag describes the current element only, so it is cleared
            # again for every non-matching attribute.
            self.is_tissue_attribute = matched
            if matched:
                self.tissue_sample_count += 1
                self.attribute_name = attribute_name

    def endElement(self, name):
        """No per-element clean-up is needed."""
        pass

    def characters(self, content):
        """Element text is ignored by this handler."""
        pass

    def endDocument(self):
        """Print the sample count, then the matching-attribute count."""
        print(self.num_samples)
        print(self.tissue_sample_count)


# Stream the BioSamples file through the exploratory handler; the handler
# prints its counters when the end of the document is reached.
handler = BioSamplesExploreHandler()
parser = xml.sax.make_parser()
parser.setContentHandler(handler)
parser.parse(biosamples_path)


# Timing note from a different run that used a direct (exact) lookup in
# body_site_terms instead of fuzzy matching: 45m 22.9s
# num_samples: 32526809
# num_tissue_samples: 6492612


17718
40839


In [9]:
# Compare several rapidfuzz similarity/distance measures on a long, noisy
# string and a short attribute name; one value is printed per line.
a = "collection site aid kajsh dkajsh dkajsh dkjahs kdh ajs das"
b = "collection date"

print(
    *(
        measure(a, b)
        for measure in (
            rapidfuzz.fuzz.ratio,
            rapidfuzz.fuzz.partial_ratio,
            rapidfuzz.fuzz.token_sort_ratio,
            rapidfuzz.fuzz.token_set_ratio,
            rapidfuzz.fuzz.WRatio,
            rapidfuzz.fuzz.QRatio,
            rapidfuzz.distance.DamerauLevenshtein.distance,
        )
    ),
    sep="\n",
)



35.61643835616438
86.66666666666667
41.0958904109589
80.0
85.5
35.61643835616438
45
