In [52]:
from owlready2 import *
import xml.sax
import csv
import tissue_eval 

In [53]:
# Load the BTO ontology and classes.
# owlready2's onto_path is a list of *directories* that are searched for a
# local copy named after the last component of the IRI (here 'bto.owl').
# Appending the file path itself would never match, so register the folder.
onto_path.append('../../data/ontologies')
onto = get_ontology('http://purl.obolibrary.org/obo/bto.owl').load()

# Materialise the class generator once so the three views below are built
# from the same snapshot and stay aligned by position.
bto_classes = list(onto.classes())

# class_names stores the names in the form BTO_0000001
# class_labels stores the label (english name) of the class or '' if no label is present
# class_synonyms stores a dictionary of the form {class_name: [synonym1, synonym2, ...]}
class_names = [c.name for c in bto_classes]
class_labels = [c.label[0] if len(c.label) > 0 else '' for c in bto_classes]
class_synonyms = {c.name: c.hasExactSynonym + c.hasRelatedSynonym for c in bto_classes}

# A mismatch here would mean two classes share a name (dict keys collapsed).
assert len(class_names) == len(class_labels) == len(class_synonyms)
print('Number of classes: ', len(class_names))

Number of classes:  6569


In [54]:
class BioSamplesAttributesHandler(xml.sax.ContentHandler):
    '''
    SAX handler to read attributes from a BioSamples XML file and extract possible tissue locations
    - extracts attributes with harmonized names from the target_attributes set
    - for every <BioSample>, stores a list of (harmonized_name, value) tuples in
      sample_dict under the sample's 'accession'
    - values are whitespace-stripped; attributes without any text are skipped

    Parameters:
        target_attributes: container of harmonized attribute names to keep
        sample_dict: dict that is filled in place while parsing
    '''
    def __init__(self, target_attributes, sample_dict):
        super().__init__()
        self.biosample_id = ''
        # Name of the target attribute currently being read, '' when outside one
        self.attribute_name = ''
        self.target_attributes = target_attributes
        self.sample_dict = sample_dict
        self.attribute_list = []
        # Text pieces of the current attribute; SAX may deliver one text node
        # through several characters() calls
        self._value_chunks = []

    def startElement(self, name, attrs):
        if name == 'BioSample':
            self.biosample_id = attrs['accession']
        elif name == 'Attribute':
            # Start every attribute from a clean state so nothing leaks over
            # from a previous (possibly empty) attribute
            self.attribute_name = ''
            self._value_chunks = []
            if 'harmonized_name' in attrs:
                attribute = attrs['harmonized_name']
                if attribute in self.target_attributes:
                    self.attribute_name = attribute

    def characters(self, content):
        # Only buffer here; the value is complete once the element closes
        if self.attribute_name != '':
            self._value_chunks.append(content)

    def endElement(self, name):
        if name == 'Attribute':
            if self.attribute_name != '':
                value = ''.join(self._value_chunks).strip()
                if value:
                    self.attribute_list.append((self.attribute_name, value))
            self.attribute_name = ''
            self._value_chunks = []
        elif name == 'BioSample':
            self.sample_dict[self.biosample_id] = self.attribute_list
            self.biosample_id = ''
            self.attribute_list = []

    def endDocument(self):
        print('Finished parsing BioSamples XML file')


In [55]:
# Harmonized attribute names that may carry a tissue / cell location.
target_attributes = {
    "cell_line",
    "cell_subtype",
    "cell_type",
    "host_tissue_sampled",
    "sample_type",
    "tissue",
    "host_anatomical_part",
    "isolation_source",
    "source_name",
    "source_type",
    "strain",
    "subclone",
    "subgroup",
}

# Input file: a random subset of BioSamples.
# Alternative input kept for reference: biosamples_path = '/mnt/disk1/biosample_set.xml'
biosamples_path = '../../data/biosamples/biosample_random_samples.xml'

# Filled by the handler: biosample accession id -> list of
# (harmonized_name, value) tuples for the attributes listed above.
sample_dict = {}
handler = BioSamplesAttributesHandler(target_attributes, sample_dict)

# Stream the file through SAX so it never has to fit in memory as a tree.
parser = xml.sax.make_parser()
parser.setContentHandler(handler)
parser.parse(biosamples_path)

print('Number of samples: ', len(sample_dict))

Finished parsing BioSamples XML file
Number of samples:  10000


In [56]:
# result_dict maps each biosample accession id to the label of the first BTO
# class whose label equals one of the sample's attribute values (lower-cased).
# Samples without attributes, or without any matching label, map to None.

result_dict = {}
for biosample_id, attributes in sample_dict.items():
    matched_label = None
    for _harmonized_name, raw_value in attributes:
        hits = onto.search(label=raw_value.lower())
        if len(hits) > 0:
            # Attributes are tried in document order; the first hit wins.
            matched_label = hits[0].label[0]
            break
    result_dict[biosample_id] = matched_label

In [57]:
# positive_results is a set of biosample accession ids that have a matching tissue location in the BTO ontology
# negative_results is a set of biosample accession ids that have no matching tissue location in the BTO ontology
positive_results = {
    sample_id for sample_id, tissue in result_dict.items() if tissue is not None
}
negative_results = set(result_dict) - positive_results

# Earlier revisions spelled this name 'positive_resuults'; keep it as an alias
# so any code still using the old spelling continues to work.
positive_resuults = positive_results

print('Number of positive results: ', len(positive_results))
print('Number of negative results: ', len(negative_results))

Number of positive results:  2301
Number of negative results:  7699


In [58]:
# Write the results to a csv file: one row per biosample, with the matched
# BTO label or the literal string 'None' when nothing matched.
# newline='' lets the csv module control line endings itself; without it
# every row is followed by a blank line on Windows.
with open('../../data/biosamples/results/biosample_tissue_locations.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    # NOTE(review): the second header cell is '1.0', not a column name —
    # presumably a version/run tag expected downstream; confirm before changing.
    writer.writerow(['biosample_id', '1.0'])
    for biosample_id, tissue in result_dict.items():
        writer.writerow([biosample_id, tissue if tissue is not None else 'None'])

print('Finished writing results to csv file')  

Finished writing results to csv file


In [None]:
# Pass the {biosample_id: matched label or None} mapping to the project's
# tissue_eval module. NOTE(review): attribute_graph is defined outside this
# notebook — presumably it plots/evaluates the matches; confirm its contract.
tissue_eval.attribute_graph(result_dict)