# Version 2.3
- Removed the optional terms from the matcher
- Set all information taken from the XML file to lowercase
- Added option to check for attribute name if there are no matches
    - Will get fewer false positives from optional terms
    - Set of attribute names reduced to attributes with confidence
    - New check for attribute names uses a weaker matcher that allows optional terms


## Issues
- Can exclude proper nouns when looking for tissue terms to give fewer false positives
    - BTO matches should only be nouns or adjectives
- 'INSDC center name' gives some false positives
- Paragraph can have 'too much' information
- Can look at attribute names vs title vs paragraph for ranking relevance of terms



In [35]:
from owlready2 import *
import pandas as pd
import re
import spacy
import xml.sax
import csv
import importlib
import tissue_eval
from spacy.matcher import Matcher
from spacy.tokenizer import Tokenizer

In [36]:
# Load the BTO ontology
onto_path.append('../../data/ontologies/')
onto = get_ontology('http://purl.obolibrary.org/obo/bto.owl').load()

# classes dictionary: {class_name: class_label} 
#   - class_label is None if no label is found
# class_synonyms dictionary: {class_name: [synonym1, synonym2, ...]}
#   - synonym list is empty if no synonym is found
classes = {c.name: c.label.first() for c in onto.classes()}
class_synonyms = {c.name: c.hasExactSynonym + c.hasRelatedSynonym for c in onto.classes()}

# added possible missing synonyms
class_synonyms['BTO_0000440'] = class_synonyms['BTO_0000440'] + ['stool']

# create a reverse mapping of classes and synonyms to BTO IDs
classes_reverse = {c.label.first().lower(): c.name for c in onto.classes() if c.label != []}
class_synonyms_reverse = {s.lower(): c for c, syn in class_synonyms.items() for s in syn}
labels_reverse = {**classes_reverse, **class_synonyms_reverse}

assert len(classes) == len(class_synonyms)
print('Number of classes:', len(classes))

# flatten the synonyms and class labels into a single set
class_labels = {c for c in classes.values() if c is not None}
class_synonyms_flattend = {s for syn in class_synonyms.values() for s in syn}
bto_values = class_labels.union(class_synonyms_flattend)

Number of classes: 6569


In [37]:
# creates a spacy matcher to match patterns of BRENDA terms and synonyms
#  - patterns are created by tokenizing the BRENDA terms and synonyms
#  - matcher looks at only direct matches


nlp = spacy.load('en_core_web_lg')
matcher = Matcher(nlp.vocab)
tokenizer = Tokenizer(nlp.vocab)

patterns = []

for bto_value in bto_values:
    weak_pattern = [{'LOWER': token.lower_} for token in tokenizer(bto_value)]
    patterns.append(weak_pattern)

# takes the longest match if there are clashes
matcher.add('bto', patterns, greedy='LONGEST')

In [38]:
# attribute_matcher is a weaker matcher that allows optional terms at the end of the match
redundant_end_terms = ['tissue', 'tissues', 'cell', 'cells']
attribute_matcher = Matcher(nlp.vocab)


weak_patterns = []
for bto_value in bto_values:
    weak_pattern = [{'LOWER': token.lower_} for token in tokenizer(bto_value)]
    if weak_pattern[-1]['LOWER'] in redundant_end_terms:
        weak_pattern[-1]['OP'] = '?'
        weak_patterns.append(weak_pattern)


attribute_matcher.add('bto', weak_patterns, greedy='LONGEST')

In [39]:
class BioSamplesMatcherHandler(xml.sax.ContentHandler):
    '''
    SAX handler class to read in information from a BioSamples XML file
        - Reads the title, paragraph, and attributes of each BioSample
        - Information stored in the provided sample_dict with the biosample_id as the key
    '''
    def __init__(self, sample_dict) -> None:
        super().__init__()
        self.sample_dict = sample_dict
        self.attribute_dict = {}
        self.biosample_id = ''
        self.content_dict = {}
        self.is_title = False
        self.is_paragraph = False
        self.attribute_name = ''

    def startElement(self, name, attrs):
        if name == 'BioSample':
            self.biosample_id = attrs['accession']
        elif name == 'Title':
            self.is_title = True
        elif name == 'Paragraph':
            self.is_paragraph = True
        elif name == 'Attribute':
            try:
                self.attribute_name = attrs['harmonized_name']
            except KeyError:
                self.attribute_name = attrs['attribute_name']

    def characters(self, content):
        if self.is_title:
            self.content_dict['title'] = content.lower()
            self.is_title = False
        elif self.is_paragraph:
            self.content_dict['paragraph'] = content.lower()
            self.is_paragraph = False
        elif self.attribute_name != '':
            self.attribute_dict[self.attribute_name] = content.lower()
            self.attribute_name = ''
        

    def endElement(self, name):
        if name == 'BioSample':
            self.content_dict['attributes'] = self.attribute_dict
            self.sample_dict[self.biosample_id] = self.content_dict
            self.attribute_dict = {}
            self.content_dict = {}
    
    def endDocument(self):
        print('Finished parsing BioSamples XML file')

In [40]:
sample_dict = {}
biosamples_path = '../../data/biosamples/biosample_random_samples.xml'

parser = xml.sax.make_parser()
handler = BioSamplesMatcherHandler(sample_dict)
parser.setContentHandler(handler)

parser.parse(biosamples_path)
print('Number of samples:', len(sample_dict))

Finished parsing BioSamples XML file
Number of samples: 10000


In [102]:
# creates jsonl files for training in prodigy
# - patterns.jsonl contains the patterns for the matcher
# - biosamples_random_content.jsonl contains text from the title, paragraph, and attributes of each BioSample
import json

numbers_pattern = r'^[\d\. ]*$'
punct_pattern = r"\ *[_&<>:-]+\ *"

def not_tissue_term(content_str):
    not_tissue_terms = {'not provided', 'not applicable', 'not collected', 'not available', 'none', 'undetected', 'unknown', 'none detected', 'na', 'missing', 'no', 'yes', 'n/a', 'true', 'false', 'dna', 'male', 'female', 'animal', 'public', 'human'}
    if re.match(numbers_pattern, content_str):
        return True
    if content_str in not_tissue_terms:
        return True
    if len(content_str) < 3:
        return True
    return False


redundant_end_terms = {'tissue', 'tissues', 'cell', 'cells'}
with open('patterns.jsonl', 'w') as f:
    for bto_term in bto_values:
        pattern = [{'LOWER': token.lower_} for token in tokenizer(bto_term)]
        if pattern[-1]['LOWER'] in redundant_end_terms:
            pattern[-1]['OP'] = '?'
        pattern_dict = {'label': 'BTO', 'pattern': pattern}
        f.write(json.dumps(pattern_dict) + '\n')



with open('biosamples_random_content.jsonl', 'w') as f:
    for biosample_id, content in sample_dict.items():
        if 'title' in content:
            term = re.sub(punct_pattern, " ", content['title'])
            if not_tissue_term(term):
                continue
            content_dict = {'text': term, 'meta': {'source': 'title', 'biosample_id': biosample_id}}
            f.write(json.dumps(content_dict) + '\n')
        if 'paragraph' in content:
            term = re.sub(punct_pattern, " ", content['paragraph'])
            if not_tissue_term(term):
                continue
            content_dict = {'text': term, 'meta': {'source': 'paragraph', 'biosample_id': biosample_id}}
            f.write(json.dumps(content_dict) + '\n')
        if 'attributes' in content:
            for attribute in content['attributes']:
                term = re.sub(punct_pattern, " ", content['attributes'][attribute])
                if not_tissue_term(term):
                    continue
                content_dict = {'text': term, 'meta': {'source': 'attribute:' + attribute, 'biosample_id': biosample_id}}
                f.write(json.dumps(content_dict) + '\n')


In [42]:
# adds a regex pattern to the default tokenizer to split on underscores
nlp = spacy.load('en_core_web_lg')
tokenizer = nlp.tokenizer

infixes = nlp.Defaults.infixes + [r'[_~]']
infix_re = spacy.util.compile_infix_regex(infixes)
tokenizer.infix_finditer = infix_re.finditer

In [43]:
matches_dict = {}
confident_attributes = {'tissue', 'cell_type', 'cell_line', 'cell_subtype', 'source_name'}

# finds all returned matches from the matcher
# - matcher looks at the title, paragraph, and attributes
# - returns a dictionary of matches for each sample

for biosample_id, content_dict in sample_dict.items():
    cur_matches = {}
    title = content_dict['title']
    attributes = content_dict['attributes'] # dictionary of attributes

    title_tokens = tokenizer(title)
    attributes_tokens = {key: tokenizer(value) for key, value in attributes.items()}

    title_matches = matcher(title_tokens, as_spans=True)
    attribute_matches = {}
    for key, value in attributes_tokens.items():
        attribute_match = matcher(value, as_spans=True)
        if len(attribute_match) > 0:
            attribute_matches[key] = matcher(value, as_spans=True)


    if len(title_matches) > 0:
        cur_matches['title'] = title_matches
    if len(attribute_matches) > 0:
        cur_matches['attributes'] = attribute_matches

    if 'paragraph' in content_dict:
        paragraph = content_dict['paragraph']
        paragraph_tokens = tokenizer(paragraph)
        paragraph_matches = matcher(paragraph_tokens, as_spans=True)
        
        if len(paragraph_matches) > 0:
            cur_matches['paragraph'] = paragraph_matches

    if cur_matches == {}:
        for key, value in attributes_tokens.items():
            if key not in confident_attributes:
                continue
            attribute_match = attribute_matcher(value, as_spans=True)
            if len(attribute_match) > 0:
                cur_matches[key] = attribute_match

        if len(attribute_matches) > 0:
            cur_matches['attributes'] = attribute_matches
            
    matches_dict[biosample_id] = cur_matches

In [44]:
positive_samples = {key: value for key, value in matches_dict.items() if len(value) > 0}
negative_samples = {key: value for key, value in matches_dict.items() if len(value) == 0}
print('Number of positive results:', len(positive_samples))
print('Number of negative results:', len(negative_samples))

Number of positive results: 5309
Number of negative results: 4691


In [45]:
# take the whole attribute stuff to check results


with open('../../data/biosamples/results/biosample_tissue_locations_2.3.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['biosample_accession_id', 'biosample_url', 'title_match', 'paragraph_match', 'attribute_matches'])
    for biosample_id, matches in matches_dict.items():
        title_match = ''
        paragraph_match = ''
        attribute_matches = ''

        for match_type, match in matches.items():
            if match_type == 'title':
                for token in match:
                    title_match += token.text + ' '
            elif match_type == 'paragraph':
                for token in match:
                    paragraph_match += token.text + ' '
            elif match_type == 'attributes':
                for attribute, match in match.items():
                    attribute_matches += f'{attribute},'
                    for token in match:
                        attribute_matches += token.text + ' '
        
        biosample_url = f'https://www.ncbi.nlm.nih.gov/biosample/{biosample_id}'
        writer.writerow([biosample_id, biosample_url, title_match, paragraph_match, attribute_matches])


print('Finished writing results to CSV file')

Finished writing results to CSV file
