In [3]:
from owlready2 import *
import pandas as pd
import xml.sax
from rapidfuzz import process
# from rapidfuzz import fuzz
import rapidfuzz

----
# Loading data...

- loads the BRENDA ontology OWL file
    - loads the properties, classes and synonyms from BRENDA
    - classes information split into names and labels
    - properties split into annotation/object and also names and labels
- loads the body_site_terms from a csv of manually collated terms

In [7]:
# load the BTO ontology
onto_path.append("/data/ontology")
onto = get_ontology("http://purl.obolibrary.org/obo/bto.owl").load()
# print(onto.base_iri)

In [8]:
# load class information and properties from the ontology
class_names = [c.name for c in onto.classes()]
# account for missing class labels in the ontology
class_labels = [c.label[0] if len(c.label) > 0 else '' for c in onto.classes()]
class_synonyms = {c.name: c.hasExactSynonym + c.hasRelatedSynonym for c in onto.classes()}

num_synonyms = 0

for class_label in onto.classes():
    num_synonyms += len(class_label.hasRelatedSynonym) + len(class_label.hasExactSynonym)

print("Num synonyms: ", num_synonyms)
# properties_names contains all properties, including object and annotation properties
# names list contains ontology
object_properties_names = [o.name for o in onto.object_properties()]
object_properties_labels = [o.label[0] if len(o.label) > 0 else o.name for o in onto.object_properties()]

annotation_properties_names = [a.name for a in onto.annotation_properties()]
annotation_properties_labels = [a.label[0] if len(a.label) > 0 else a.name for a in onto.annotation_properties()]

properties_names = [p.name for p in onto.properties()]
properties_labels = [p.label[0] if len(p.label) > 0 else p.name for p in onto.properties()]

print("Number of classes: ", len(class_names))
print(class_synonyms)

Num synonyms:  6210
Number of classes:  6569
{'BTO_0000000': [], 'BTO_0000001': [], 'BTO_0000002': [], 'BTO_0000003': [], 'BTO_0000004': [], 'BTO_0000005': [], 'BTO_0000006': [], 'BTO_0000007': ['293 cell', 'A-293 cell', 'A293 cell', 'HEK cell', 'HEK293 cell', 'human embryonal kidney cell', 'human embryonic kidney cell'], 'BTO_0000008': [], 'BTO_0000009': [], 'BTO_0000010': [], 'BTO_0000011': ['NIH3T3-L1 cell'], 'BTO_0000012': [], 'BTO_0000013': [], 'BTO_0000014': [], 'BTO_0000015': [], 'BTO_0000016': ['A172 cell'], 'BTO_0000017': ['A431 cell'], 'BTO_0000018': ['A549 cell', 'NCI-A549 cell'], 'BTO_0000019': [], 'BTO_0000020': ['abdominal cavity'], 'BTO_0000021': [], 'BTO_0000022': ['visceral ganglion'], 'BTO_0000023': ['breast muscle', 'pectoralis muscle'], 'BTO_0000024': ['fourth stomach'], 'BTO_0000025': [], 'BTO_0000026': [], 'BTO_0000027': [], 'BTO_0000028': ['exocrine pancreas cell', 'pancreatic acinus'], 'BTO_0000029': ['long adductor muscle', 'musculus adductor longus'], 'BTO_000

In [9]:
# load the list of annotated body_site terms
body_site_terms = pd.read_csv("data/BioAnnotate_rh-body_site.csv", header=None)
body_site_terms = body_site_terms[0].tolist()
body_site_terms = set(body_site_terms)
body_site_string = " ".join(body_site_terms)
# print("sample_tissue_type" in body_site_terms)
# print(body_site_string)

---- 
# Parsing the Biosamples files
- BioSamplesDictHandler looks at all the attributes for each biosample
    - uses a fuzzy search to find the attribute that has the highest similarity to the attribute names in the body_site_terms
- BioSamplesPackageHandler looks for 'useful' packages and finds the appropriate information

In [7]:
# load a single biosamples XML file. File 699 used due to smaller size
# SAX parser used to avoid memory issues - CAN IN THEORY LOAD ALL BIOSAMPLES IN ONE FILE
biosamples_path = "data/biosamples/biosample_set.699.xml"
# biosamples_path = "data/biosamples/biosample_small.xml"

class BioSamplesDictHandler(xml.sax.ContentHandler):
    def __init__(self):
        super().__init__()
        # self.is_tissue_attribute = False
        self.attribute_name = ""
        self.num_samples = 0
        # self.num_attributes = 0
        # self.tissue_sample_count = 0
        self.attribute_dict = {}
        self.sample_dict = {}
        self.biosample_id = ""

    def startElement(self, tag_name, tag_attrs):
        if tag_name == "BioSample":
            self.num_samples += 1
            self.biosample_id = tag_attrs["id"]

        elif tag_name == "Attribute":
            self.attribute_name = tag_attrs["attribute_name"]
            # self.num_attributes += 1
  
            
    def characters(self, content):
        if self.attribute_name != "":
            self.attribute_dict[self.attribute_name] = content
            self.attribute_name = ""


    def endElement(self, name):
        if name == "Attributes":
            if self.attribute_dict != {}:
                for key in self.attribute_dict.keys():
                    result = process.extractOne(key, body_site_terms, scorer=rapidfuzz.fuzz.ratio)
                    if result[1] > 87:
                        self.sample_dict[self.biosample_id] = (key, self.attribute_dict[key])
                        break
                    else:
                        self.sample_dict[self.biosample_id] = None
            self.attribute_dict = {}

    def endDocument(self):
        print("num_samples: ", self.num_samples)
        print(self.sample_dict)
        # print(len(self.sample_dict))
        print(len([x for x in self.sample_dict.values() if x is not None]))
        # print the key value pairs in the sample dict that don't have None as the value in a list
        print([x for x in self.sample_dict.items() if x[1] is not None])

parser = xml.sax.make_parser()
handler = BioSamplesDictHandler()
parser.setContentHandler(handler)
parser.parse(biosamples_path)


# using a direct lookup from body_site_terms, 45m 22.9s
# num_samples: 32526809
# num_tissue_samples: 6492612

# 17718
# 40839

# token sort ratio
# 3121

num_samples:  17718
{'32338113': None, '32338114': None, '32338115': None, '32338116': None, '32338117': None, '32338118': None, '32338119': None, '32338120': None, '32338121': None, '32338122': None, '32338123': None, '32338229': None, '32338230': None, '32338231': None, '32338232': None, '32338233': None, '32338234': None, '32338235': None, '32338236': None, '32338237': None, '32338238': None, '32338239': None, '32338240': None, '32338241': None, '32338242': None, '32338243': None, '32338244': None, '32338245': None, '32338246': None, '32338247': None, '32338248': None, '32338249': None, '32338250': None, '32338251': None, '32338252': None, '32338253': None, '32338254': None, '32338255': None, '32338256': None, '32338257': None, '32338258': None, '32338259': None, '32338260': None, '32338261': None, '32338262': None, '32338263': None, '32338264': None, '32338265': None, '32338266': None, '32338267': None, '32338268': None, '32338269': None, '32338270': None, '32338271': None, '323382

In [33]:
templates_path = "data/biosamples/templates_xml/"
biosamples_packages_xml_path = "data/biosamples/biosample_packages.xml"
packages_list = []

class TemplateNamesHander(xml.sax.ContentHandler):
    def __init__(self, templates_list) -> None:
        super().__init__()
        # self.templates = []
        self.is_name = False

    def startElement(self, name, attrs):
        if name == "Name":
            self.is_name = True

    def characters(self, content):
        if self.is_name:
            packages_list.append(content)
            self.is_name = False    
    
parser = xml.sax.make_parser()
parser.setContentHandler(TemplateNamesHander(templates_list=packages_list))
parser.parse(biosamples_packages_xml_path)


'''
- Reads the xml file for each package and extracts the mandatory attributes
- Mandatory attributes for a single package will be stored in a set
    - attributes where at least one in a group is required will be stored in a list

- All data will be stored in a dictionary where the key is the package name and the value is a list of mandatory attributes
'''
class MandatoryAttributeHandler(xml.sax.ContentHandler):
    def __init__(self, attributes_list) -> None:
        super().__init__()
        self.attributes_list = attributes_list
        self.is_mandatory_attribute = False
        self.group_name = None
        self.is_harmonized_name = False
        self.mandatory_groups = {}

    def startElement(self, name, attrs):
        if name == "Attribute":
            use = attrs["use"]
            if use == "optional":
                pass
            elif use == "mandatory":
                self.is_mandatory_attribute = True
            elif use == "either_one_mandatory":
                self.group_name = attrs["group_name"]
                self.is_mandatory_attribute = True
        elif name == "HarmonizedName":
            self.is_harmonized_name = True

    def characters(self, content):
        if self.is_mandatory_attribute and self.is_harmonized_name:
            if self.group_name is None:
                self.attributes_list.append(content)
            else:
                # group name is not None therefore the attribute is part of a group 
                # attribute name added to dictionary with group name as key
                if self.group_name in self.mandatory_groups:
                    self.mandatory_groups[self.group_name].append(content)
                else:
                    self.mandatory_groups[self.group_name] = [content]
                self.group_name = None
            
            self.is_mandatory_attribute = False
            self.is_harmonized_name = False

    def endDocument(self):
        for group in self.mandatory_groups.items():
            self.attributes_list.append(group)
        attribute_list = self.attributes_list


attributes_dict = {}
for package in packages_list:
    with open("data/biosamples/templates_xml/" + package + ".xml") as f:
        attribute_list = []
        parser = xml.sax.make_parser()
        parser.setContentHandler(MandatoryAttributeHandler(attribute_list))
        parser.parse(f)
        attributes_dict[package] = attribute_list

In [51]:
biosamples_path = "data/biosamples/biosample_random_samples.xml"
possible_tissue = {"cell_line", "cell_subtype", "cell_type", "host_tissue_sampled", "sample_type", "tissue", "host_anatomical_part", "isolation_source", "source_name", "source_type", "strain", "subclone", "subgroup"}
# possible_tissue = {"tissue"}
import pprint

class BioSamplesPackageHandler(xml.sax.ContentHandler):
    def __init__(self, useful_packages, attributes_dict):
        super().__init__()
        self.useful_packages = useful_packages
        self.attributes_dict = attributes_dict
        self.is_package = False
        self.package_name = ""
        self.target_attributes = set()
        self.num_samples = 0
        self.biosample_id = ""
        self.attribute_name = ""
        self.sample_dict = {}

    def startElement(self, name, attrs):
        if name == "BioSample":
            self.biosample_id = attrs["id"]
            self.num_samples += 1
        elif name == "Package":
            self.is_package = True
        elif name == "Attribute" and "harmonized_name" in attrs:
                self.attribute_name = attrs["harmonized_name"]

    def characters(self, content):
        if self.is_package:
            self.package_name = content
            if self.package_name in self.useful_packages:
                self.target_attributes = possible_tissue.intersection(set(a for a in self.attributes_dict[self.package_name] if type(a) == str))
                if len(self.target_attributes) > 1:
                    print("more than one matching package ", self.biosample_id)
            self.is_package = False

        if self.attribute_name != "":
            if self.attribute_name in self.target_attributes:
                self.sample_dict[self.biosample_id] = (self.attribute_name, content)
            self.attribute_name = ""

    def endDocument(self):
        print("num_samples: ", self.num_samples)
        # print(self.sample_dict)
        pprint.pprint(self.sample_dict)
        print(len(self.sample_dict))
        


In [52]:
useful_packages = set()
with open("data/biosamples/useful_packages.txt", "r") as f:
    for package in f.readlines():
        useful_packages.add(package.strip())


parser = xml.sax.make_parser()
handler = BioSamplesPackageHandler(useful_packages, attributes_dict)
parser.setContentHandler(handler)
parser.parse(biosamples_path)


more than one matching package  30414169
num_samples:  10000
{'10037706': ('tissue', 'Whole blood'),
 '10039586': ('isolation_source', 'missing'),
 '10058440': ('isolation_source', 'food'),
 '10070699': ('tissue', 'cormel'),
 '10072908': ('tissue', 'Proximal jejunum'),
 '10080731': ('isolation_source', 'Hospital patients'),
 '10088371': ('tissue', 'Sputum'),
 '10092280': ('isolation_source', 'missing'),
 '10097534': ('isolation_source', 'human'),
 '10104985': ('isolation_source', 'Water column'),
 '10119081': ('tissue', 'tumor'),
 '10128932': ('tissue', 'blood'),
 '10142456': ('tissue', 'lung cancer'),
 '10148089': ('tissue', 'SKIN FIBROBLASTS'),
 '10148996': ('tissue', 'leaf'),
 '10162994': ('tissue', 'cortex'),
 '10163223': ('isolation_source', 'Drainage from GT site'),
 '10178733': ('isolation_source', 'upper respiratory tract'),
 '10179895': ('isolation_source', 'upper respiratory tract'),
 '10182641': ('isolation_source', 'missing'),
 '10218328': ('isolation_source', 'not applicab

## Gettting search results from NCBI

- https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=biosamples&term=blood