In [None]:
from owlready2 import *
import pandas as pd
import numpy as np
import csv
import xml.sax
from rapidfuzz import process
# from rapidfuzz import fuzz
import rapidfuzz

In [None]:
# load the BTO ontology
onto_path.append("/data/ontology")
onto = get_ontology("http://purl.obolibrary.org/obo/bto.owl").load()
# print(onto.base_iri)

In [None]:
# load class information and properties from the ontology
class_names = [c.name for c in onto.classes()]
# account for missing class labels in the ontology
class_labels = [c.label[0] if len(c.label) > 0 else '' for c in onto.classes()]

# properties_names contains all properties, including object and annotation properties
# names list contains ontology
object_properties_names = [o.name for o in onto.object_properties()]
object_properties_labels = [o.label[0] if len(o.label) > 0 else o.name for o in onto.object_properties()]

annotation_properties_names = [a.name for a in onto.annotation_properties()]
annotation_properties_labels = [a.label[0] if len(a.label) > 0 else a.name for a in onto.annotation_properties()]

properties_names = [p.name for p in onto.properties()]
properties_labels = [p.label[0] if len(p.label) > 0 else p.name for p in onto.properties()]

print("Number of classes: ", len(class_names))

In [121]:
# load the list of annotated body_site terms
body_site_terms = pd.read_csv("data/BioAnnotate_rh-body_site.csv", header=None)
body_site_terms = body_site_terms[0].tolist()
body_site_terms = set(body_site_terms)
body_site_string = " ".join(body_site_terms)
# print("sample_tissue_type" in body_site_terms)
print(body_site_string)

helper_t_cells prostate_cancer cell_tissue_type gut_chamber rhizocompartment gut_section histology_of_cancer tissue__type cell_of_origin_subtype presence_and_type_of_narrowings_right_colon glial_cluster host_blood tissue_samples cell_type_of_origin biopsy_site graft_type extent_of_ulcerated_surface_right_colon neutrophils collection_information cell cell_type_isolation derived_tissue tissue_rego cst_dendrogram_introitus_cervix tissue_subtype_brodmann_area breast_cancer_subtypes collection_site_and_cell_type bowel source_of_tissue tissue_cell_type cellular_fraction brain_structure lesion.size organ_of_origin cell_types glands tisssue_type histologic_type deep_ulcer %_blood cell_tissue_subtype gingivitis presence_and_type_of_narrowings_ileum bone_marrow_blasts tissus basophil tisue cell_type_or_stage characteristics:cancer_type_abbrevation batch_of_transduced_liver_cells animal_source fungi_head cell_typing brain_sub_region_tissue fruit_status organ_site vaginal_wall_ph blood_monocyte_pe

In [166]:
# load a single biosamples XML file. File 699 used due to smaller size
# SAX parser used to avoid memory issues - CAN IN THEORY LOAD ALL BIOSAMPLES IN ONE FILE
biosamples_path = "data/biosamples/biosample_set.699.xml"
# biosamples_path = "data/biosamples/biosample_small.xml"


# class BioSamplesHandler(xml.sax.ContentHandler):
#     def __init__(self):
#         super().__init__()
#         self.bioSampleId = ""
#         self.tissue_value = ""

#     def startElement(self, tag, attributes):
#         if tag == "BioSample":
#             self.bioSampleId = attributes["id"]
#             # print("startElement: ", attributes["id"])
#         elif tag == "Attribute":
#             pass

#     def endElement(self, tag):
#         if tag == "BioSample":
#             self.bioSampleId = ""
#             self.tissue_value = ""


# class BioSamplesExploreHandler(xml.sax.ContentHandler):
#     def __init__(self):
#         # self.tags = set()
#         # self.is_attribute = False
#         super().__init__()
#         self.is_tissue_attribute = False
#         self.attribute_name = ""
#         self.num_samples = 0
#         self.tissue_sample_count = 0

#     def startElement(self, tag_name, tag_attrs):
#         # self.tags.add(name)
#         if tag_name == "BioSample":
#             self.num_samples += 1
#         if tag_name == "Attribute":
#             # ASSUMES THAT ALL ATTRIBUTE TAGS HAVE AN ATTRIBUTE_NAME ATTRIBUTE 
#             # WORKS FOR BIOSAMPLES XML
#             attribute_name = tag_attrs["attribute_name"]
#             # search_result = process.extractOne(attribute_name, body_site_terms, scorer=rapidfuzz.fuzz.QRatio)
#             # if search_result[1] > 88:
#             #     self.tissue_sample_count += 1
#             #     self.is_tissue_attribute = True
#             #     self.attribute_name = attribute_name
#             #     print("similar term: ", search_result)
#             #     # print(tag_attrs.values()[0])

#             score = rapidfuzz.fuzz.token_set_ratio(attribute_name, body_site_string)
#             if score > 90:
#                 self.tissue_sample_count += 1
#                 self.is_tissue_attribute = True
#                 self.attribute_name = attribute_name
#                 # print("similar term: ", search_result)
#                 # print(tag_attrs.values()[0])
#             pass

#     def endElement(self, name):

#         pass

#     def characters(self, content):
#         if self.is_tissue_attribute:
#             print(self.attribute_name + ": " + content)
#             self.is_tissue_attribute = False
#             print("--------------------")
#         pass

#     def endDocument(self):
#         # print(self.tags)
#         print(self.num_samples)
#         print(self.tissue_sample_count)
#         pass


class BioSamplesDictHandler(xml.sax.ContentHandler):
    def __init__(self):
        super().__init__()
        # self.is_tissue_attribute = False
        self.attribute_name = ""
        self.num_samples = 0
        # self.num_attributes = 0
        # self.tissue_sample_count = 0
        self.attribute_dict = {}
        self.sample_dict = {}
        self.bioSampleId = ""

    def startElement(self, tag_name, tag_attrs):
        if tag_name == "BioSample":
            self.num_samples += 1
            self.bioSampleId = tag_attrs["id"]

        elif tag_name == "Attribute":
            self.attribute_name = tag_attrs["attribute_name"]
            # self.num_attributes += 1
  
            
    def characters(self, content):
        if self.attribute_name != "":
            self.attribute_dict[self.attribute_name] = content
            self.attribute_name = ""


    def endElement(self, name):
        if name == "Attributes":
            if self.attribute_dict != {}:
                for key in self.attribute_dict.keys():
                    result = process.extractOne(key, body_site_terms, scorer=rapidfuzz.fuzz.ratio)
                    if result[1] > 87:
                        self.sample_dict[self.bioSampleId] = (key, self.attribute_dict[key])
                        break
                    else:
                        self.sample_dict[self.bioSampleId] = None
            self.attribute_dict = {}

    def endDocument(self):
        print("num_samples: ", self.num_samples)
        print(self.sample_dict)
        # print(len(self.sample_dict))
        print(len([x for x in self.sample_dict.values() if x is not None]))
        # print the key value pairs in the sample dict that don't have None as the value in a list
        print([x for x in self.sample_dict.items() if x[1] is not None])

parser = xml.sax.make_parser()
handler = BioSamplesDictHandler()
parser.setContentHandler(handler)
parser.parse(biosamples_path)


# using a direct lookup from body_site_terms, 45m 22.9s
# num_samples: 32526809
# num_tissue_samples: 6492612

# 17718
# 40839

# token sort ratio
# 3121

num_samples:  17718
{'32338113': None, '32338114': None, '32338115': None, '32338116': None, '32338117': None, '32338118': None, '32338119': None, '32338120': None, '32338121': None, '32338122': None, '32338123': None, '32338229': None, '32338230': None, '32338231': None, '32338232': None, '32338233': None, '32338234': None, '32338235': None, '32338236': None, '32338237': None, '32338238': None, '32338239': None, '32338240': None, '32338241': None, '32338242': None, '32338243': None, '32338244': None, '32338245': None, '32338246': None, '32338247': None, '32338248': None, '32338249': None, '32338250': None, '32338251': None, '32338252': None, '32338253': None, '32338254': None, '32338255': None, '32338256': None, '32338257': None, '32338258': None, '32338259': None, '32338260': None, '32338261': None, '32338262': None, '32338263': None, '32338264': None, '32338265': None, '32338266': None, '32338267': None, '32338268': None, '32338269': None, '32338270': None, '32338271': None, '323382

In [122]:
import time

# a = "biomaterial provider"
a = body_site_string
b = 'sample_type'
c = 'biomaterial_provider'

print(rapidfuzz.fuzz.ratio(a, b))
# print(rapidfuzz.fuzz.partial_ratio(a, b))
# print(rapidfuzz.fuzz.token_sort_ratio(a, b))
# print(rapidfuzz.fuzz.token_set_ratio(a, b))
# print(rapidfuzz.fuzz.WRatio(a, b))
# print(rapidfuzz.fuzz.QRatio(a, b))
# print(rapidfuzz.distance.DamerauLevenshtein.distance(a, b))
# print("-----")
print(rapidfuzz.fuzz.ratio(a, c))
# print(rapidfuzz.fuzz.partial_ratio(a, c))
# print(rapidfuzz.fuzz.token_sort_ratio(a, c))
# print(rapidfuzz.fuzz.token_set_ratio(a, c))
# print(rapidfuzz.fuzz.WRatio(a, c))
# print(rapidfuzz.fuzz.QRatio(a, c))
# print(rapidfuzz.distance.DamerauLevenshtein.distance(a, c))

s = time.time()
print(rapidfuzz.process.extractOne(b, body_site_terms))
print(rapidfuzz.process.extractOne(c, body_site_terms))
e = time.time()
print("time: %s" % (e-s))

s = time.time()
print(rapidfuzz.fuzz.ratio(b, body_site_string))
print(rapidfuzz.fuzz.ratio(c, body_site_string))
e = time.time()
print("time: %s" % (e-s))



0.2330508474576276
0.42332521959995706
('sample_type_skin_type', 90.0, 68)
('biomaterial', 90.0, 469)
time: 0.0020172595977783203
0.2330508474576276
0.42332521959995706
time: 0.00014901161193847656
