# Version 1.2
- Adds more preprocessing to the data
    - removes non alphanumeric characters 
- Updates the list of attribtue names with more possible tissue types
    

In [34]:
from owlready2 import *
import xml.sax
import csv
import importlib
import tissue_eval
import pandas as pd
import numpy as np

In [35]:
# Load the BTO ontology and classes
onto_path.append('../../data/ontologies/bto.owl')
onto = get_ontology('http://purl.obolibrary.org/obo/bto.owl').load()

# classes stores a dictionary of the form {class_name: class_label}
# class_synonyms stores a dictionary of the form {class_name: [synonym1, synonym2, ...]}
classes = {c.name: c.label[0] if len(c.label) > 0 else '' for c in onto.classes()}
class_synonyms = {c.name: c.hasExactSynonym + c.hasRelatedSynonym for c in onto.classes()}

assert len(classes) == len(class_synonyms)
print('Number of classes: ', len(classes))

Number of classes:  6569


In [36]:
def preprocess(search_term) -> str:
    '''
    Helper function to preprocess a string to search for in the lookup dictionary
    - Converts the string to lower case and removes all punctuation
    - redundant terms defined in the set are removed from the front and back respectively
    '''
    redundant_end_terms = {'cell', 'cells', 'tissue', 'tissues'}
    redundant_start_terms = {'human'}

    search_term = search_term.lower()
    punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    search_term = search_term.translate(str.maketrans('', '', punctuation))
    if len(search_term.split()) > 1 and search_term.split()[0] in redundant_start_terms:
        search_term = ' '.join(search_term.split()[1:])
    if len(search_term.split()) > 1 and search_term.split()[-1] in redundant_end_terms:
        search_term = ' '.join(search_term.split()[:-1])
    return search_term


In [37]:
lookup_dict = {}

for class_name, synonyms in class_synonyms.items():
    label = classes[class_name].lower()
    lookup_dict[label] = class_name
    search_term = preprocess(label)
    if search_term not in lookup_dict:
        lookup_dict[search_term] = class_name

    if len(synonyms) > 0:
        for synonym in synonyms:
            synonym = preprocess(synonym)
            # adds synonym to lookup dictionary
            lookup_dict[synonym] = class_name
                
        
print('Number of entries in lookup dictionary: ', len(lookup_dict))

Number of entries in lookup dictionary:  14541


In [38]:
class BioSamplesAttributesHandler(xml.sax.ContentHandler):
    '''
    SAX handler to read attributes from a BioSamples XML file and extract possible tissue locations
    - extracts attributes with harmonized names from the target_attributes set
    '''
    def __init__(self, target_attributes, sample_dict):
        self.biosample_id = ''
        self.attribute_name = ''
        self.target_attributes = target_attributes 
        self.sample_dict = sample_dict
        self.attribute_list = []

    def startElement(self, name, attrs):
        if name == 'BioSample':
            self.biosample_id = attrs['accession']
        elif name == 'Attribute' and 'harmonized_name' in attrs:
            attribute = attrs['harmonized_name']    
            if attribute in self.target_attributes:
                self.attribute_name = attrs['harmonized_name']

    def characters(self, content):
        if self.attribute_name != '':
            self.attribute_list.append((self.attribute_name, content))
            self.attribute_name = ''

    def endElement(self, name):
        if name == 'BioSample':
            self.sample_dict[self.biosample_id] = self.attribute_list
            self.biosample_id = ''
            self.attribute_list = []

    def endDocument(self):
        print('Finished parsing BioSamples XML file')

In [39]:
# sample_dict stores the biosample accession id and its possible tissue attributes in a list of tuples
sample_dict = {}
# biosamples_path = '/mnt/disk1/biosample_set.xml'
biosamples_path = '../../data/biosamples/biosample_random_samples.xml'
target_attributes = {"cell_line", "cell_subtype", "cell_type", "host_tissue_sampled", "sample_type", "tissue", "host_anatomical_part", "isolation_source", "source_name", "source_type", "strain", "subclone", "subgroup", "body_habitat", "body_product", "cell_subtype", "host_body_product", "host_body_habitat", "plant_body_site", "plant structure"}

parser = xml.sax.make_parser()
handler = BioSamplesAttributesHandler(target_attributes, sample_dict)
parser.setContentHandler(handler)
parser.parse(biosamples_path)

print('Number of samples: ', len(sample_dict))

Finished parsing BioSamples XML file
Number of samples:  10000


In [40]:
# result_dict stores the biosample accession id and the found tissue location from the BTO ontology
# if no tissue location is found or no possible attributes were identified, the value is None

result_dict = {}
matched_attribute = {}
for biosample_id, attributes in sample_dict.items():
    for attribute in attributes:
        lookup_term = preprocess(attribute[1])

        if lookup_term in lookup_dict:
            result_dict[biosample_id] = lookup_dict[lookup_term]
            matched_attribute[biosample_id] = attribute[0]
            break
    
    if biosample_id not in result_dict:
        result_dict[biosample_id] = None

In [41]:
# positive_results is a set of biosample accession ids that have a matching tissue location in the BTO ontology
# negative_results is a set of biosample accession ids that have no matching tissue location in the BTO ontology
positive_results = set()
negative_results = set()

for biosample_id, tissue in result_dict.items():
    if tissue is not None:
        positive_results.add(biosample_id)
    else:
        negative_results.add(biosample_id)

print('Number of positive results: ', len(positive_results))
print('Number of negative results: ', len(negative_results))

Number of positive results:  2972
Number of negative results:  7028


In [42]:
# write the results to a csv file
with open('../../data/biosamples/results/biosample_tissue_locations_1.2.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['biosample_accession_id', 'biosample_url', 'attrs', 'matched_attr', 'bto_label', 'bto_name', 'bto_url'])

    for biosample_id in sample_dict:
        accession_id = biosample_id
        url = 'https://www.ncbi.nlm.nih.gov/biosample/' + biosample_id
        attrs = sample_dict[biosample_id] if len(sample_dict[biosample_id]) > 0 else ''
        matched_attr = matched_attribute[biosample_id] if biosample_id in matched_attribute else ''
        bto_name = result_dict[biosample_id] 
        bto_label = classes[bto_name] if bto_name is not None else ''
        bto_url = 'http://purl.obolibrary.org/obo/' + bto_name if bto_name is not None else ''

        writer.writerow([accession_id, url, attrs, matched_attr, bto_label, bto_name, bto_url])

print('Finished writing results to csv file')  

Finished writing results to csv file


In [43]:
importlib.reload(tissue_eval)
# attribute graph looks at found attributes before matching them to the ontology
tissue_eval.attribute_graph(sample_dict)

tissue_eval.matches_graph(result_dict)

In [44]:
tissue_map_filepath = '../../data/biosamples/results/'

# read the files
v1_1 = pd.read_csv(tissue_map_filepath + 'biosample_tissue_locations_1.1.csv')
v1_2 = pd.read_csv(tissue_map_filepath + 'biosample_tissue_locations_1.2.csv')

# creates a new column delta_bto that stores unchanged, gained, lost or changed
v1_2['delta_bto'] = None
v1_2['delta_attrs'] = None

for index, row in v1_2.iterrows():
    if pd.isnull(row['bto_label']) and pd.isnull(v1_1.loc[index]['bto_label']):
        delta_bto = 'unchanged'
    elif row['bto_label'] == v1_1.loc[index]['bto_label']:
        delta_bto = 'unchanged'
    elif pd.isnull(row['bto_label']):
        delta_bto = 'lost'
    elif pd.isnull(v1_1.loc[index]['bto_label']):
        delta_bto = 'gained'
    else:
        delta_bto = 'changed'

    if pd.isnull(row['attrs']) and pd.isnull(v1_1.loc[index]['attrs']):
        delta_attrs = 'unchanged'
    elif row['attrs'] == v1_1.loc[index]['attrs']:
        delta_attrs = 'unchanged'
    elif pd.isnull(row['attrs']):
        delta_attrs = 'lost'
    elif pd.isnull(v1_1.loc[index]['attrs']):
        delta_attrs = 'gained'
    else:
        delta_bto = 'changed'

    v1_2.loc[index, 'delta_bto'] = delta_bto
    v1_2.loc[index, 'delta_attrs'] = delta_attrs

# add the new columns to the original file
v1_2.to_csv(tissue_map_filepath + 'biosample_tissue_locations_1.2.csv', index=False)