# Version 2.1
- Takes the depth of terms in BRENDA into account
    - The depth of a term is the length of its shortest path to the root when more than one path exists
- Looks at the title of the study to find any relevant information

## Issues
- More information may be available in the BioProject title and description


In [2]:
from owlready2 import *
import pandas as pd
import numpy as np
import spacy
import xml.sax
import csv
import importlib
import tissue_eval





In [101]:
# Load the BTO ontology
onto_path.append('../../data/ontologies/')
onto = get_ontology('http://purl.obolibrary.org/obo/bto.owl').load()

# Enumerate the ontology classes once; every mapping below is built from this list.
bto_classes = list(onto.classes())

# classes dictionary: {class_name: class_label}
#   - class_label is None if no label is found
# class_synonyms dictionary: {class_name: [synonym1, synonym2, ...]}
#   - synonym list is empty if no synonym is found
classes = {c.name: c.label.first() for c in bto_classes}
class_synonyms = {c.name: c.hasExactSynonym + c.hasRelatedSynonym for c in bto_classes}

# Reverse mapping from lower-cased label / synonym to BTO ID.
# When a synonym and a label collide, the synonym entry wins (it is merged last).
classes_reverse = {label.lower(): name for name, label in classes.items() if label is not None}
class_synonyms_reverse = {s.lower(): c for c, syn in class_synonyms.items() for s in syn}
labels_reverse = {**classes_reverse, **class_synonyms_reverse}

assert len(classes) == len(class_synonyms)
print('Number of classes:', len(classes))

# flatten the synonyms and class labels into a single set
class_labels = {c for c in classes.values() if c is not None}
class_synonyms_flattend = {s for syn in class_synonyms.values() for s in syn}
bto_values = class_labels.union(class_synonyms_flattend)


def _shortest_depth(cls):
    """Return the number of is_a edges on the shortest path from cls to owl:Thing.

    The search walks upwards one level at a time (breadth-first), so the first
    level that contains owl:Thing is the shortest path even when a term has
    several parents. Only named superclasses are followed; restrictions listed
    in is_a (e.g. part_of relations) are ignored. If owl:Thing is never reached,
    the number of levels walked is returned, i.e. a parentless class is treated
    as if it sat directly below owl:Thing.
    """
    frontier = [cls]
    seen = {cls}
    depth = 0
    while frontier:
        if Thing in frontier:
            return depth
        parents = []
        for node in frontier:
            for parent in node.is_a:
                if isinstance(parent, ThingClass) and parent not in seen:
                    seen.add(parent)
                    parents.append(parent)
        frontier = parents
        depth += 1
    return depth


# depth_dict: {class_name: depth}
#   - depth is the shortest path to the root (see the notes at the top of the notebook).
#     The previous len(c.ancestors()) counted *all* ancestors, which over-estimates
#     the depth of terms that have more than one parent.
# NOTE(review): the absolute scale of the values changes with this fix; confirm that
# consumers of depth_dict only compare depths relative to each other.
depth_dict = {c.name: _shortest_depth(c) for c in bto_classes}

Number of classes: 6569


In [8]:
from spacy.matcher import Matcher
from spacy.tokenizer import Tokenizer

# spaCy pipeline; its vocabulary is shared by the matcher and the tokenizer below.
nlp = spacy.load('en_core_web_sm')
tokenizer = Tokenizer(nlp.vocab)
matcher = Matcher(nlp.vocab)

# One token pattern per BTO label/synonym: each token has to match on its
# lower-cased text.
patterns = [
    [{'LOWER': tok.lower_} for tok in tokenizer(term)]
    for term in bto_values
]

# Register all patterns under a single key; greedy='LONGEST' is passed so the
# matcher filters overlapping matches in favour of the longest one.
matcher.add('bto', patterns, greedy='LONGEST')

In [107]:
class BioSamplesMatcherHandler(xml.sax.ContentHandler):
    """SAX handler that registers every <BioSample> element it encounters.

    Both dictionaries are supplied by the caller and are filled in place,
    keyed by the sample's ``accession`` attribute:

    - ``sample_dict[accession]`` is set to None (no BTO term assigned).
    - ``attribute_dict[accession]`` is initialised to None unless the key is
      already present, so later code can index it for every registered sample.
    """

    def __init__(self, sample_dict, attribute_dict) -> None:
        super().__init__()
        self.sample_dict = sample_dict
        self.attribute_dict = attribute_dict
        # Accession of the <BioSample> element most recently opened.
        self.biosample_id = ''

    def startElement(self, name, attrs):
        """Remember the accession when a <BioSample> element opens.

        Raises KeyError if the element has no ``accession`` attribute.
        """
        if name == 'BioSample':
            self.biosample_id = attrs['accession']

    def endElement(self, name):
        """Register the current sample when its <BioSample> element closes."""
        # Only the closing </BioSample> tag finishes a sample. Reacting to every
        # closing tag would add an empty-string key for any element that closes
        # before the first sample has been opened.
        if name != 'BioSample':
            return
        self.sample_dict[self.biosample_id] = None
        self.attribute_dict.setdefault(self.biosample_id, None)

In [108]:
# Input file and the two dictionaries handed to the SAX handler;
# sample_dict is filled in place while the file is parsed.
biosamples_path = '../../data/biosamples/biosample_random_samples.xml'
sample_dict, attribute_dict = {}, {}

handler = BioSamplesMatcherHandler(sample_dict, attribute_dict)
parser = xml.sax.make_parser()
parser.setContentHandler(handler)
parser.parse(biosamples_path)

print('Number of samples:', len(sample_dict))

Number of samples: 10000


In [None]:
# result_dict: {biosample_id: bto_term}
#   - bto_term is None if no BTO term was found for the sample
# A shallow copy is all that is needed: the previous loop compared with `== None`
# and stored the same value in both branches.
result_dict = dict(sample_dict)

print('Number of samples:', len(result_dict))

In [None]:
# Write one row per sample, including the matched attribute text, so the
# results can be checked by hand.
#   - newline='' lets the csv module control line endings (avoids blank rows on Windows)
#   - encoding='utf-8' matches the default encoding pd.read_csv uses to read the file back
with open('../../data/biosamples/results/biosample_tissue_locations_2.1.csv', 'w',
          newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['biosample_accession_id', 'biosample_url', 'matched_attr', 'bto_label', 'bto_name', 'bto_url'])
    for biosample_id in sample_dict:
        accession_id = biosample_id
        url = 'https://www.ncbi.nlm.nih.gov/biosample/' + biosample_id

        # A sample without a recorded attribute (missing key or None) gets an empty cell.
        matched_attr = attribute_dict.get(biosample_id)
        if matched_attr is None:
            matched_attr = ''

        bto_label = result_dict[biosample_id]
        # labels_reverse is keyed by lower-cased labels/synonyms, so normalise before the lookup.
        bto_name = labels_reverse[bto_label.lower()] if bto_label is not None else None
        bto_url = 'http://purl.obolibrary.org/obo/' + bto_name if bto_name is not None else ''

        writer.writerow([accession_id, url, matched_attr, bto_label, bto_name, bto_url])

print('Finished writing results to CSV file')

In [None]:
# Reload tissue_eval so that edits made to the module after its first import
# are picked up without restarting the kernel.
importlib.reload(tissue_eval)

# NOTE(review): tissue_eval is a project module not visible here; going by the
# function names these two calls presumably plot the match results and the
# matched attribute names - confirm against tissue_eval.
tissue_eval.matches_graph(result_dict)
tissue_eval.attribute_name_graph(attribute_dict)

In [None]:
tissue_map_filepath = '../../data/biosamples/results'

v2_0 = pd.read_csv(tissue_map_filepath + '/biosample_tissue_locations_2.0.csv')
v2_1 = pd.read_csv(tissue_map_filepath + '/biosample_tissue_locations_2.1.csv')


def _classify_delta(new_value, old_value):
    """Describe how a cell changed from the old result file to the new one.

    Returns one of:
      'unchanged' - both values are missing, or both are equal
      'lost'      - the old file had a value, the new one does not
      'gained'    - the new file has a value, the old one did not
      'changed'   - both files have a value and the values differ
    """
    new_missing = pd.isnull(new_value)
    old_missing = pd.isnull(old_value)
    if new_missing and old_missing:
        return 'unchanged'
    if new_missing:
        return 'lost'
    if old_missing:
        return 'gained'
    return 'unchanged' if new_value == old_value else 'changed'


# Rows are compared by index label, i.e. both files are expected to list the same
# samples in the same order; a row missing from v2_0 raises KeyError.
# Both delta columns use the same classifier. Previously the final branch of the
# attribute comparison assigned delta_bto instead of delta_attr, which overwrote
# the BTO delta and left delta_attr stale (or undefined for the first row).
for source_col, delta_col in (('bto_label', 'delta_bto'), ('matched_attr', 'delta_attr')):
    v2_1[delta_col] = [
        _classify_delta(v2_1.at[index, source_col], v2_0.at[index, source_col])
        for index in v2_1.index
    ]

# add new columns to the original file
v2_1.to_csv(tissue_map_filepath + '/biosample_tissue_locations_2.1.csv', index=False)