# Version 2.1
- Looks at the title and abstract (paragraph) for studies to find more relevant information
- Takes the depth of terms in BRENDA into account
    - Depth of a term is the shortest path to the root when more than one path exists



## Issues
- More information may be available in the BioProject title and description
- Can add more patterns to the matcher that remove 'redundant' start and end terms
    - Either a direct search for the first and last terms 
    - Can add a custom extension attribute in spacy


In [70]:
from owlready2 import *
import pandas as pd
import re
import spacy
import xml.sax
import csv
import importlib
import tissue_eval
from spacy.matcher import Matcher
from spacy.tokenizer import Tokenizer

In [141]:
# Load the BTO ontology
onto_path.append('../../data/ontologies/')
onto = get_ontology('http://purl.obolibrary.org/obo/bto.owl').load()

# classes: {class_name: class_label}
#   - the label is None when the class has no label
# class_synonyms: {class_name: [synonym1, synonym2, ...]}
#   - exact synonyms followed by related synonyms; empty when there are none
# classes_reverse: {lower-cased class_label: class_name}
#   - classes without a label are left out
classes = {}
class_synonyms = {}
classes_reverse = {}

for bto_class in onto.classes():
    primary_label = bto_class.label.first()
    classes[bto_class.name] = primary_label
    class_synonyms[bto_class.name] = bto_class.hasExactSynonym + bto_class.hasRelatedSynonym
    if bto_class.label != []:
        classes_reverse[primary_label.lower()] = bto_class.name

# 'stool' is added by hand as an extra synonym of BTO_0000440
class_synonyms['BTO_0000440'] = class_synonyms['BTO_0000440'] + ['stool']

# reverse mapping of lower-cased synonyms to BTO IDs
class_synonyms_reverse = {}
for class_name, synonyms in class_synonyms.items():
    for synonym in synonyms:
        class_synonyms_reverse[synonym.lower()] = class_name

# labels and synonyms combined; a synonym replaces a label with the same text
labels_reverse = dict(classes_reverse)
labels_reverse.update(class_synonyms_reverse)

assert len(classes) == len(class_synonyms)
print('Number of classes:', len(classes))

# every label and synonym collected into a single set of strings
class_labels = set(filter(lambda label: label is not None, classes.values()))
class_synonyms_flattend = set().union(*class_synonyms.values())
bto_values = class_labels | class_synonyms_flattend

Number of classes: 6569


In [None]:
# calculate the depth for each class
# depth_dict: {class_name: depth}
#   - nothing is stored in depth_dict by this cell yet
depth_dict = {}

# print the label of every is_a entry of the term labelled
# 'tissues, cell types and enzyme sources' (presumably the ontology root - TODO confirm)
root_term = onto.search_one(label='tissues, cell types and enzyme sources')
for ancestor in root_term.is_a:
    print(ancestor.label)



In [72]:
# builds a spacy matcher holding one token pattern per BRENDA term / synonym
#  - every term is split with a bare Tokenizer built on the pipeline's vocab
#  - each token has to match on its lower-cased text, so only direct matches are found
#  - all patterns are registered under the single key 'bto' with greedy='LONGEST'

nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)
tokenizer = Tokenizer(nlp.vocab)

patterns = [
    [{'LOWER': term_token.lower_} for term_token in tokenizer(term)]
    for term in bto_values
]

matcher.add('bto', patterns, greedy='LONGEST')

In [73]:
class BioSamplesMatcherHandler(xml.sax.ContentHandler):
    def __init__(self, sample_dict) -> None:
        super().__init__()
        self.sample_dict = sample_dict
        self.attribute_dict = {}
        self.biosample_id = ''
        self.content_dict = {}
        self.is_title = False
        self.is_paragraph = False
        self.attribute_name = ''

    def startElement(self, name, attrs):
        if name == 'BioSample':
            self.biosample_id = attrs['accession']
        elif name == 'Title':
            self.is_title = True
        elif name == 'Paragraph':
            self.is_paragraph = True
        elif name == 'Attribute':
            try:
                self.attribute_name = attrs['harmonized_name']
            except KeyError:
                self.attribute_name = attrs['attribute_name']

    def characters(self, content):
        if self.is_title:
            self.content_dict['title'] = content
            self.is_title = False
        elif self.is_paragraph:
            self.content_dict['paragraph'] = content
            self.is_paragraph = False
        elif self.attribute_name != '':
            self.attribute_dict[self.attribute_name] = content
            self.attribute_name = ''
        

    def endElement(self, name):
        if name == 'BioSample':
            self.content_dict['attributes'] = self.attribute_dict
            self.sample_dict[self.biosample_id] = self.content_dict
            self.attribute_dict = {}
            self.content_dict = {}
    
    def endDocument(self):
        print('Finished parsing BioSamples XML file')

In [74]:
# parse the BioSamples XML file; the handler fills sample_dict in place
biosamples_path = '../../data/biosamples/biosample_random_samples.xml'
sample_dict = {}
attribute_dict = {}

handler = BioSamplesMatcherHandler(sample_dict)
parser = xml.sax.make_parser()
parser.setContentHandler(handler)
parser.parse(biosamples_path)

print(f'Number of samples: {len(sample_dict)}')

Finished parsing BioSamples XML file
Number of samples: 10000


In [101]:
# switches to the tokenizer of the en_core_web_lg pipeline and extends its
# infix rules with one extra pattern, so tokens are also split on '_' and '~'
nlp = spacy.load('en_core_web_lg')
tokenizer = nlp.tokenizer

infixes = [*nlp.Defaults.infixes, r'[_~]']
infix_re = spacy.util.compile_infix_regex(infixes)
tokenizer.infix_finditer = infix_re.finditer

In [119]:
matches_dict = {}


def _find_bto_spans(text):
    """Tokenize *text* and return the matcher's hits as a list of spans."""
    return matcher(tokenizer(text), as_spans=True)


# finds all returned matches from the matcher
# - matcher looks at the title, paragraph, and attributes
# - matches_dict: {biosample_id: {'title': [...], 'attributes': {name: [...]}, 'paragraph': [...]}}
# - a field is only recorded when it has at least one match, so a sample
#   without any match maps to an empty dictionary

for biosample_id, content_dict in sample_dict.items():
    cur_matches = {}

    # a sample that has no title is skipped here instead of raising KeyError
    title = content_dict.get('title')
    if title is not None:
        title_matches = _find_bto_spans(title)
        if title_matches:
            cur_matches['title'] = title_matches

    # every attribute value is matched exactly once and kept only when it has hits
    attribute_matches = {}
    for key, value in content_dict['attributes'].items():
        attribute_match = _find_bto_spans(value)
        if attribute_match:
            attribute_matches[key] = attribute_match
    if attribute_matches:
        cur_matches['attributes'] = attribute_matches

    if 'paragraph' in content_dict:
        paragraph_matches = _find_bto_spans(content_dict['paragraph'])
        if paragraph_matches:
            cur_matches['paragraph'] = paragraph_matches

    matches_dict[biosample_id] = cur_matches

In [120]:
import pprint
# pretty-print the matches of every sample ({biosample_id: matches})
pprint.pprint(matches_dict)

{'SAMD00000683': {},
 'SAMD00006380': {},
 'SAMD00014374': {'attributes': {'sample comment': [Root]}},
 'SAMD00021033': {'attributes': {'project_name': [medium]}},
 'SAMD00021226': {'title': [Smooth muscle]},
 'SAMD00036614': {},
 'SAMD00049287': {},
 'SAMD00050272': {'attributes': {'env_medium': [soil], 'project_name': [soil]},
                  'title': [soil]},
 'SAMD00056207': {},
 'SAMD00060149': {'attributes': {'env_local_scale': [oral cavity]}},
 'SAMD00060847': {},
 'SAMD00061605': {},
 'SAMD00078983': {},
 'SAMD00079210': {},
 'SAMD00091260': {},
 'SAMD00098583': {'attributes': {'env_broad_scale': [plant]}},
 'SAMD00100214': {'attributes': {'env_medium': [plant],
                                 'project_name': [root, fungi]}},
 'SAMD00107974': {'attributes': {'project_name': [gut]}},
 'SAMD00115040': {'attributes': {'env_medium': [feces], 'project_name': [gut]},
                  'title': [feces]},
 'SAMD00118585': {'attributes': {'propagation': [vegetative]}},
 'SAMD00119977

In [None]:
# take the whole attribute stuff to check results
#
# writes one CSV row per sample: accession, BioSample URL, matched attribute,
# BTO label, BTO class name and BTO URL
# NOTE(review): result_dict is not defined in the visible cells and
# attribute_dict is only initialised as an empty dictionary - both have to be
# filled ({biosample_id: value or None}, presumably) before this cell is useful

results_path = '../../data/biosamples/results/biosample_tissue_locations_2.1.csv'

# newline='' is required by the csv module so that it controls the line endings;
# utf-8 matches the default encoding pd.read_csv uses to read the file back
with open(results_path, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['biosample_accession_id', 'biosample_url', 'matched_attr', 'bto_label', 'bto_name', 'bto_url'])
    for biosample_id in sample_dict:
        accession_id = biosample_id
        url = 'https://www.ncbi.nlm.nih.gov/biosample/' + biosample_id

        # samples without an entry are written with empty cells instead of raising KeyError
        matched_attr = attribute_dict.get(biosample_id)
        if matched_attr is None:
            matched_attr = ''
        bto_label = result_dict.get(biosample_id)

        # the keys of labels_reverse are lower-cased, so the label is lower-cased for the lookup
        bto_name = labels_reverse.get(str(bto_label).lower()) if bto_label is not None else None
        bto_url = 'http://purl.obolibrary.org/obo/' + bto_name if bto_name is not None else ''

        writer.writerow([accession_id, url, matched_attr, bto_label, bto_name, bto_url])

print('Finished writing results to CSV file')

In [None]:
# re-import tissue_eval so that the current version of the module is used
importlib.reload(tissue_eval)

# NOTE(review): result_dict is not defined in the visible cells and
# attribute_dict is only initialised as an empty dictionary - confirm both
# are filled before these calls
tissue_eval.matches_graph(result_dict)
tissue_eval.attribute_name_graph(attribute_dict)

In [None]:
tissue_map_filepath = '../../data/biosamples/results'

v2_0 = pd.read_csv(tissue_map_filepath + '/biosample_tissue_locations_2.0.csv')
v2_1 = pd.read_csv(tissue_map_filepath + '/biosample_tissue_locations_2.1.csv')


def _classify_delta(new_value, old_value):
    """Describe how a cell changed between version 2.0 and version 2.1.

    Returns 'unchanged' when both values are missing or equal, 'lost' when only
    the new value is missing, 'gained' when only the old value is missing and
    'changed' when both are present but differ.
    """
    new_missing = pd.isnull(new_value)
    old_missing = pd.isnull(old_value)
    if new_missing and old_missing:
        return 'unchanged'
    if new_missing:
        return 'lost'
    if old_missing:
        return 'gained'
    return 'unchanged' if new_value == old_value else 'changed'


v2_1['delta_bto'] = None
v2_1['delta_attr'] = None

# rows of the two versions are paired by their index label
# (assumes both files list the samples in the same order - TODO confirm)
for index, row in v2_1.iterrows():
    old_row = v2_0.loc[index]

    # both columns go through the same classifier, so the attribute verdict
    # can no longer be written into the BTO column by mistake
    v2_1.loc[index, 'delta_bto'] = _classify_delta(row['bto_label'], old_row['bto_label'])
    v2_1.loc[index, 'delta_attr'] = _classify_delta(row['matched_attr'], old_row['matched_attr'])

# add new columns to the original file
v2_1.to_csv(tissue_map_filepath + '/biosample_tissue_locations_2.1.csv', index=False)