# Version 1.1
- Creates a lookup dict using the BRENDA class labels and their synonyms
    - Lookup dict maps possible terms to the BRENDA class name
- Lookup dict also adds terms that remove the words "cell", "cells" if they are located at the end of the label
- Attribute values fetched from the biosamples are searched for in the dictionary

In [159]:
from owlready2 import *
import xml.sax
import csv
import importlib
import tissue_eval

In [160]:
# Load the BTO ontology and classes
onto_path.append('../../data/ontologies/bto.owl')
onto = get_ontology('http://purl.obolibrary.org/obo/bto.owl').load()

# classes stores a dictionary of the form {class_name: class_label}
# class_synonyms stores a dictionary of the form {class_name: [synonym1, synonym2, ...]}
classes = {c.name: c.label[0] if len(c.label) > 0 else '' for c in onto.classes()}
class_synonyms = {c.name: c.hasExactSynonym + c.hasRelatedSynonym for c in onto.classes()}

assert len(classes) == len(class_synonyms)
print('Number of classes: ', len(classes))

Number of classes:  6569


In [161]:
lookup_dict = {}
redundant_end_terms = {'cell', 'cells', 'tissue', 'tissues'}
redundant_start_terms = {'human'}

for class_name, synonyms in class_synonyms.items():
    label = classes[class_name].lower()
    lookup_dict[label] = class_name
    # removes 'cell' or 'cells' from the end of the label if applicable and adds to lookup dictionary NOT QUITE RIGHT
    tokens = label.split()
    if len(tokens) > 0:
        if tokens[-1] in redundant_end_terms:
            term = ' '.join(tokens[:-1])
            if term not in lookup_dict:
                lookup_dict[term] = class_name
    # removes 'human' from the start of the label if applicable and adds to lookup dictionary
        if tokens[0] in redundant_start_terms:
            term = ' '.join(tokens[1:])
            if term not in lookup_dict:
                lookup_dict[term] = class_name

    if len(synonyms) > 0:
        for synonym in synonyms:
            synonym = synonym.lower()
            # adds synonym to lookup dictionary
            lookup_dict[synonym] = class_name
            # removes 'cell' or 'cells' from the end of the synonym if applicable and adds to lookup dictionary
            tokens = synonym.split()
            if tokens[-1] in redundant_end_terms:
                term = ' '.join(tokens[:-1])
                if term not in lookup_dict:
                    lookup_dict[term] = class_name
            # removes 'human' from the start of the synonym if applicable and adds to lookup dictionary
            if tokens[0] in redundant_start_terms:
                term = ' '.join(tokens[1:])
                if term not in lookup_dict:
                    lookup_dict[term] = class_name
                
        
print('Number of entries in lookup dictionary: ', len(lookup_dict))

Number of entries in lookup dictionary:  19534


In [162]:
class BioSamplesAttributesHandler(xml.sax.ContentHandler):
    '''
    SAX handler to read attributes from a BioSamples XML file and extract possible tissue locations
    - extracts attributes with harmonized names from the target_attributes set
    '''
    def __init__(self, target_attributes, sample_dict):
        self.biosample_id = ''
        self.attribute_name = ''
        self.target_attributes = target_attributes 
        self.sample_dict = sample_dict
        self.attribute_list = []

    def startElement(self, name, attrs):
        if name == 'BioSample':
            self.biosample_id = attrs['accession']
        elif name == 'Attribute' and 'harmonized_name' in attrs:
            attribute = attrs['harmonized_name']    
            if attribute in self.target_attributes:
                self.attribute_name = attrs['harmonized_name']

    def characters(self, content):
        if self.attribute_name != '':
            self.attribute_list.append((self.attribute_name, content))
            self.attribute_name = ''

    def endElement(self, name):
        if name == 'BioSample':
            self.sample_dict[self.biosample_id] = self.attribute_list
            self.biosample_id = ''
            self.attribute_list = []

    def endDocument(self):
        print('Finished parsing BioSamples XML file')

In [163]:
# sample_dict stores the biosample accession id and its possible tissue attributes in a list of tuples
sample_dict = {}
# biosamples_path = '/mnt/disk1/biosample_set.xml'
biosamples_path = '../../data/biosamples/biosample_random_samples.xml'
target_attributes = {"cell_line", "cell_subtype", "cell_type", "host_tissue_sampled", "sample_type", "tissue", "host_anatomical_part", "isolation_source", "source_name", "source_type", "strain", "subclone", "subgroup"}

parser = xml.sax.make_parser()
handler = BioSamplesAttributesHandler(target_attributes, sample_dict)
parser.setContentHandler(handler)
parser.parse(biosamples_path)

print('Number of samples: ', len(sample_dict))

Finished parsing BioSamples XML file
Number of samples:  10000


In [164]:
# result_dict stores the biosample accession id and the found tissue location from the BTO ontology
# if no tissue location is found or no possible attributes were identified, the value is None

result_dict = {}
matched_attribute = {}
for biosample_id, attributes in sample_dict.items():
    for attribute in attributes:
        lookup_term = attribute[1].lower()
        tokens = lookup_term.split()
        if len(tokens) > 1 and tokens[-1] in redundant_end_terms:
            tokens = tokens[:-1]
        if len(tokens) > 1 and tokens[0] in redundant_start_terms:
            tokens = tokens[1:]
        lookup_term = ' '.join(tokens)

        if lookup_term in lookup_dict:
            result_dict[biosample_id] = lookup_dict[lookup_term]
            matched_attribute[biosample_id] = attribute[0]
            break
    
    if biosample_id not in result_dict:
        result_dict[biosample_id] = None

In [165]:
print('cell' in lookup_dict)


False


In [166]:
# positive_results is a set of biosample accession ids that have a matching tissue location in the BTO ontology
# negative_results is a set of biosample accession ids that have no matching tissue location in the BTO ontology
positive_results = set()
negative_results = set()

for biosample_id, tissue in result_dict.items():
    if tissue is not None:
        positive_results.add(biosample_id)
    else:
        negative_results.add(biosample_id)

print('Number of positive results: ', len(positive_results))
print('Number of negative results: ', len(negative_results))

Number of positive results:  2937
Number of negative results:  7063


In [167]:
# write the results to a csv file
with open('../../data/biosamples/results/biosample_tissue_locations_1.1.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['biosample_accession_id', 'biosample_url', 'attrs', 'matched_attr', 'bto_label', 'bto_name', 'bto_url'])

    for biosample_id in sample_dict:
        accession_id = biosample_id
        url = 'https://www.ncbi.nlm.nih.gov/biosample/' + biosample_id
        attrs = sample_dict[biosample_id] if len(sample_dict[biosample_id]) > 0 else ''
        matched_attr = matched_attribute[biosample_id] if biosample_id in matched_attribute else ''
        bto_name = result_dict[biosample_id] 
        bto_label = classes[bto_name] if bto_name is not None else ''
        bto_url = 'http://purl.obolibrary.org/obo/' + bto_name if bto_name is not None else ''

        writer.writerow([accession_id, url, attrs, matched_attr, bto_label, bto_name, bto_url])

print('Finished writing results to csv file')  

Finished writing results to csv file


In [168]:
importlib.reload(tissue_eval)
# attribute graph looks at found attributes before matching them to the ontology
tissue_eval.attribute_graph(sample_dict)

tissue_eval.matches_graph(result_dict)