# Version 2.1
- Looks at the title and abstract (paragraph) for studies to find more relevant information
- Added the additional synonym 'stool' for 'feces' (BTO_0000440)

## Issues
- More information may be available in the BioProject title and description
- Can add more patterns to the matcher that remove 'redundant' start and end terms
    - Either a direct search for the first and last terms 
    - Can add a custom extension attribute in spacy
- Does not calculate the depth of specific terms in the BTO
- Matcher will ignore preceding terms like not-___
- Only looks at the first match in each value


In [29]:
from owlready2 import *
import pandas as pd
import re
import spacy
import xml.sax
import csv
import importlib
import tissue_eval
from spacy.matcher import Matcher
from spacy.tokenizer import Tokenizer

In [30]:
# Load the BTO ontology from the local ontology directory
onto_path.append('../../data/ontologies/')
onto = get_ontology('http://purl.obolibrary.org/obo/bto.owl').load()

# classes:        {class_name: class_label}, label is None when the class has no label
# class_synonyms: {class_name: [synonym1, synonym2, ...]}, exact synonyms followed
#                 by related synonyms; empty list when the class has neither
classes = {}
class_synonyms = {}
for c in onto.classes():
    classes[c.name] = c.label.first()
    class_synonyms[c.name] = c.hasExactSynonym + c.hasRelatedSynonym

# manually added synonym: 'stool' for BTO_0000440
class_synonyms['BTO_0000440'] = class_synonyms['BTO_0000440'] + ['stool']

# reverse lookups from lower-cased text to BTO ID
# - classes_reverse only covers classes that have a label
classes_reverse = {}
for c in onto.classes():
    if c.label != []:
        classes_reverse[c.label.first().lower()] = c.name

class_synonyms_reverse = {}
for class_name, synonyms in class_synonyms.items():
    for synonym in synonyms:
        class_synonyms_reverse[synonym.lower()] = class_name

# merged lookup: when a label and a synonym share the same lower-cased text,
# the synonym entry is the one that is kept
labels_reverse = dict(classes_reverse)
labels_reverse.update(class_synonyms_reverse)

assert len(classes) == len(class_synonyms)
print('Number of classes:', len(classes))

# every label and every synonym collected into one set of searchable strings
class_labels = {label for label in classes.values() if label is not None}
class_synonyms_flattend = set()
for synonyms in class_synonyms.values():
    class_synonyms_flattend.update(synonyms)
bto_values = class_labels | class_synonyms_flattend

Number of classes: 6569


In [31]:
# get the depth of a term in an is_a hierarchy
# - if there is more than one path to the root, the longest path is used
# bto_objects: {class_name: ontology class object}
bto_objects = {c.name: c for c in onto.classes()}

def get_depth(bto_id):
    """Return the depth of the BTO class *bto_id* below the ontology root.

    Every entry of the class's ``is_a`` list is followed recursively: a named
    parent class directly, a restriction through its ``value``. The result is
    one more than the deepest parent.

    Args:
        bto_id: class name as used for the keys of ``bto_objects``
            (e.g. 'BTO_0000671').

    Returns:
        0 for 'BTO_0000000', for a class that lists ``owl.Thing`` in ``is_a``,
        and for a class with no parent of a recognised kind; otherwise
        1 + the largest depth among its parents.

    Raises:
        KeyError: if *bto_id* is not a key of ``bto_objects``.
    """
    if bto_id == 'BTO_0000000':
        return 0

    depths = []
    for bto_class in bto_objects[bto_id].is_a:
        # NOTE(review): this returns as soon as owl.Thing is seen, even when
        # deeper parents were already collected or follow later in is_a --
        # confirm this is intended given the "longest path" rule above.
        if bto_class == owl.Thing:
            return 0
        if isinstance(bto_class, ThingClass):
            depths.append(1 + get_depth(bto_class.name))
        elif isinstance(bto_class, Restriction):
            depths.append(1 + get_depth(bto_class.value.name))

    # default=0: a class whose is_a holds no recognised parent is treated as a
    # root instead of raising ValueError from max() on an empty list
    return max(depths, default=0)
    
    

In [33]:
# container for per-class depths
# depth_dict: {class_name: depth}
# NOTE: nothing in this cell fills depth_dict; it is only created empty here
depth_dict = {}

# spot check of get_depth on a single BTO ID; as the last expression of the
# cell its value is what the notebook displays
get_depth('BTO_0000671')        

2

In [34]:
# Build a spaCy Matcher holding one token pattern per BRENDA term / synonym
#  - each term is split with a bare Tokenizer built on the pipeline vocab
#  - every token must match on its lower-cased text (direct matches only)
#  - greedy='LONGEST' is passed so the matcher filters matches by length

nlp = spacy.load('en_core_web_sm')
tokenizer = Tokenizer(nlp.vocab)

patterns = [
    [{'LOWER': token.lower_} for token in tokenizer(bto_value)]
    for bto_value in bto_values
]

matcher = Matcher(nlp.vocab)
matcher.add('bto', patterns, greedy='LONGEST')

In [36]:
class BioSamplesMatcherHandler(xml.sax.ContentHandler):
    """SAX handler that collects title, paragraph and attributes per BioSample.

    For every ``<BioSample>`` element one entry is written to ``sample_dict``::

        sample_dict[accession] = {
            'title': <text of <Title>>,          # only if a <Title> was seen
            'paragraph': <text of <Paragraph>>,  # only if a <Paragraph> was seen
            'attributes': {name: <text of <Attribute>>, ...},
        }

    The attribute name is the element's ``harmonized_name`` when present,
    otherwise its ``attribute_name``. If an element occurs more than once
    within a sample (or an attribute name repeats), the last one wins.

    Text is buffered between the start and end of an element because a SAX
    parser may deliver one text node through several ``characters()`` calls
    (for example around entities such as ``&amp;``). An element without any
    text is stored as an empty string.
    """

    def __init__(self, sample_dict) -> None:
        """Store *sample_dict*, the dict that parsed samples are written into."""
        super().__init__()
        self.sample_dict = sample_dict
        self.attribute_dict = {}
        self.biosample_id = ''
        self.content_dict = {}
        self.is_title = False
        self.is_paragraph = False
        self.attribute_name = ''
        # text chunks of the element that is currently being captured
        self._text_chunks = []

    def startElement(self, name, attrs):
        """Record the sample accession or start capturing text for *name*.

        Raises:
            KeyError: if a ``BioSample`` element has no ``accession`` or an
                ``Attribute`` element has neither ``harmonized_name`` nor
                ``attribute_name``.
        """
        if name == 'BioSample':
            self.biosample_id = attrs['accession']
        elif name == 'Title':
            self.is_title = True
            self._text_chunks = []
        elif name == 'Paragraph':
            self.is_paragraph = True
            self._text_chunks = []
        elif name == 'Attribute':
            try:
                self.attribute_name = attrs['harmonized_name']
            except KeyError:
                self.attribute_name = attrs['attribute_name']
            self._text_chunks = []

    def characters(self, content):
        """Buffer *content* while inside a Title, Paragraph or Attribute."""
        if self.is_title or self.is_paragraph or self.attribute_name != '':
            self._text_chunks.append(content)

    def endElement(self, name):
        """Commit the buffered text of *name*, or store the finished sample."""
        if name == 'Title' and self.is_title:
            self.content_dict['title'] = ''.join(self._text_chunks)
            self.is_title = False
            self._text_chunks = []
        elif name == 'Paragraph' and self.is_paragraph:
            self.content_dict['paragraph'] = ''.join(self._text_chunks)
            self.is_paragraph = False
            self._text_chunks = []
        elif name == 'Attribute' and self.attribute_name != '':
            self.attribute_dict[self.attribute_name] = ''.join(self._text_chunks)
            # disarm capture so whitespace after the element is not recorded
            self.attribute_name = ''
            self._text_chunks = []
        elif name == 'BioSample':
            self.content_dict['attributes'] = self.attribute_dict
            self.sample_dict[self.biosample_id] = self.content_dict
            # fresh containers so the next sample does not share state
            self.attribute_dict = {}
            self.content_dict = {}

    def endDocument(self):
        """Report that the whole document has been parsed."""
        print('Finished parsing BioSamples XML file')

In [38]:
# Parse the BioSamples XML file; the handler fills sample_dict as it goes
biosamples_path = '../../data/biosamples/biosample_random_samples.xml'
sample_dict = {}
attribute_dict = {}

handler = BioSamplesMatcherHandler(sample_dict)
parser = xml.sax.make_parser()
parser.setContentHandler(handler)
parser.parse(biosamples_path)

print('Number of samples:', len(sample_dict))

Finished parsing BioSamples XML file
Number of samples: 10000


In [40]:
# Reload the pipeline and take its default tokenizer, then extend the infix
# rules with the extra pattern [_~] (underscore and tilde)
# NOTE(review): `matcher` was created earlier on the vocab of
# 'en_core_web_sm', while this tokenizer belongs to 'en_core_web_lg' --
# confirm that mixing the two pipelines is intended.
nlp = spacy.load('en_core_web_lg')

infixes = nlp.Defaults.infixes + [r'[_~]']
infix_re = spacy.util.compile_infix_regex(infixes)

tokenizer = nlp.tokenizer
tokenizer.infix_finditer = infix_re.finditer

In [46]:
# matches_dict: {biosample_id: {'title': span,
#                               'attributes': {attribute_name: span, ...},
#                               'paragraph': span}}
# - a key is only present when the matcher found something for that part
matches_dict = {}


def _first_match(text):
    """Return the first span the matcher reports for *text*, or None if there is none."""
    found = matcher(tokenizer(text), as_spans=True)
    return found[0] if found else None


# finds the matches returned by the matcher for every sample
# - matcher looks at the title, paragraph, and attributes
# - only the first returned match of each text is kept
for biosample_id, content_dict in sample_dict.items():
    cur_matches = {}

    # a sample without a title is treated as having an empty title
    title_match = _first_match(content_dict.get('title', ''))
    if title_match is not None:
        cur_matches['title'] = title_match

    # the matcher runs once per attribute value and that result is reused
    attribute_matches = {}
    for key, value in content_dict['attributes'].items():
        attribute_match = _first_match(value)
        if attribute_match is not None:
            attribute_matches[key] = attribute_match
    if attribute_matches:
        cur_matches['attributes'] = attribute_matches

    if 'paragraph' in content_dict:
        paragraph_match = _first_match(content_dict['paragraph'])
        if paragraph_match is not None:
            cur_matches['paragraph'] = paragraph_match

    matches_dict[biosample_id] = cur_matches

In [63]:
# Write one CSV row per BioSample with the matched text from its title,
# paragraph and attributes (empty string where nothing was matched).
# - newline='' leaves line-ending handling to the csv module, as its
#   documentation requires for files passed to csv.writer
# - encoding='utf-8' makes the output independent of the platform default
with open('../../data/biosamples/results/biosample_tissue_locations_2.1.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['biosample_accession_id', 'biosample_url', 'title_match', 'paragraph_match', 'attribute_matches'])
    for biosample_id, matches in matches_dict.items():
        title_match = ''
        paragraph_match = ''
        attribute_matches = ''

        for match_type, match in matches.items():
            if match_type == 'title':
                title_match = match.text
            elif match_type == 'paragraph':
                paragraph_match = match.text
            elif match_type == 'attributes':
                # here `match` is the {attribute_name: span} dict; each entry
                # is written as 'name:text,' (trailing comma kept as before)
                attribute_matches = ''.join(
                    f'{attribute}:{span.text},' for attribute, span in match.items()
                )

        biosample_url = f'https://www.ncbi.nlm.nih.gov/biosample/{biosample_id}'
        writer.writerow([biosample_id, biosample_url, title_match, paragraph_match, attribute_matches])


print('Finished writing results to CSV file')

Finished writing results to CSV file
