# Version 2.0
- Changed the method to use the spaCy library to identify terms from BRENDA
    - Uses rule-based matching to find matches in BRENDA

    - lemmatization with BTO?
    - passing the attribute tags as context?
- Look at information in the rest of the biosamples rather than just the attributes

In [3]:
from owlready2 import *
import pandas as pd
import numpy as np
import spacy
import xml.sax
import xml.etree.ElementTree as ET
import csv
import importlib
import tissue_eval

In [17]:
# Register the local ontology directory and load the BTO ontology by IRI.
# (onto_path / get_ontology come from the owlready2 star import.)
onto_path.append('../../data/ontologies/')
onto = get_ontology('http://purl.obolibrary.org/obo/bto.owl').load()

# classes: maps each class name to its first label
#   - the label is None when the class has no label
classes = {
    bto_class.name: bto_class.label.first()
    for bto_class in onto.classes()
}

# class_synonyms: maps each class name to its exact synonyms followed by
# its related synonyms
#   - the list is empty when the class has no synonyms
class_synonyms = {
    bto_class.name: bto_class.hasExactSynonym + bto_class.hasRelatedSynonym
    for bto_class in onto.classes()
}

# Both dictionaries are keyed by class name, so their sizes must agree.
assert len(classes) == len(class_synonyms)
print('Number of classes:', len(classes))

Number of classes: 6569


In [24]:
class BiosamplesLanguageHandler(xml.sax.ContentHandler):
    """SAX content handler that prints the accession of every BioSample.

    The accession of the most recently opened ``BioSample`` element is kept
    in ``biosample_id``.
    """

    def __init__(self) -> None:
        super().__init__()
        # Accession of the latest BioSample element seen ('' before any).
        self.biosample_id = ''

    def startElement(self, name, attrs):
        """Record and print the ``accession`` of each ``BioSample`` element.

        Raises:
            KeyError: if a ``BioSample`` element has no ``accession``.
        """
        if name != 'BioSample':
            return
        self.biosample_id = attrs['accession']
        print(self.biosample_id)

    def endDocument(self):
        """Announce that the whole document has been consumed."""
        print('Finished parsing BioSamples XML file')
        

In [14]:
class BiosamplesLanguageTissueHandler(xml.sax.ContentHandler):
    """SAX content handler that collects the ``tissue`` attribute per BioSample.

    While parsing, the text of every ``Attribute`` element whose
    ``harmonized_name`` is ``tissue`` is stored in ``tissue_dict`` under the
    accession of the enclosing ``BioSample``. When the document ends, the
    mapping is written to ``tissue_results.csv`` as ``accession,tissue`` rows.

    Attributes:
        biosample_id: accession of the most recently opened BioSample.
        tissue: text of the most recently closed tissue attribute.
        is_tissue: True while inside a tissue ``Attribute`` element.
        tissue_dict: mapping of BioSample accession to tissue text.
    """

    def __init__(self) -> None:
        super().__init__()
        self.biosample_id = ''
        self.tissue = ''
        self.is_tissue = False
        self.tissue_dict = {}
        # Text pieces of the tissue attribute currently being read.
        self._tissue_chunks = []

    def startElement(self, name, attrs):
        """Track the current BioSample and detect the start of a tissue attribute.

        Raises:
            KeyError: if a ``BioSample`` element has no ``accession``.
        """
        if name == 'BioSample':
            self.biosample_id = attrs['accession']
        elif name == 'Attribute' and attrs.get('harmonized_name') == 'tissue':
            self.is_tissue = True
            self._tissue_chunks = []

    def characters(self, content):
        """Buffer text that belongs to the tissue attribute being read.

        A SAX parser may deliver the text of one element in several calls,
        so the chunks are accumulated here and only joined in ``endElement``.
        """
        if self.is_tissue:
            self._tissue_chunks.append(content)

    def endElement(self, name):
        """Store the buffered tissue text when its ``Attribute`` element closes."""
        if name != 'Attribute' or not self.is_tissue:
            return
        # Reset the flag here (not in characters) so that an empty tissue
        # element cannot cause later, unrelated text to be captured.
        self.is_tissue = False
        self.tissue = ''.join(self._tissue_chunks)
        self._tissue_chunks = []
        if self.tissue:
            self.tissue_dict[self.biosample_id] = self.tissue

    def endDocument(self):
        """Write the collected mapping to ``tissue_results.csv`` and report completion."""
        # newline='' lets the csv module control line endings itself.
        with open('tissue_results.csv', 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerows(self.tissue_dict.items())

        print('Finished parsing BioSamples XML file')
    

# Location of the BioSamples XML file to scan.
biosamples_path = '../../data/biosamples/biosample_random_samples.xml'

# Build the tissue-collecting handler, attach it to a new SAX parser and
# stream the file through it. The handler writes 'tissue_results.csv' from
# its endDocument callback.
handler = BiosamplesLanguageTissueHandler()
parser = xml.sax.make_parser()
parser.setContentHandler(handler)
parser.parse(biosamples_path)

Finished parsing BioSamples XML file


In [40]:
# Print every item produced by parse_biosample for the sample file.
# NOTE(review): parse_biosample is not defined in the visible part of this
# notebook - presumably it lives in another cell; confirm it is defined
# before this cell runs.
for data in parse_biosample(biosamples_path):
    print(data)

SAMN00000478
SAMN00000982
SAMN00003190
SAMN00006038
SAMN00014824
SAMD00000683
SAMN00017717
SAMN00023185
SAMN00024345
SAMEA741979
SAMN00031572
SAMN00033172
SAMN00033422
SAMN00033710
SAMN00043887
SAMN00044397
SAMN00048526
SAMN00060356
SAMN00063659
SAMN00073199
SAMN00080674
SAMN00081058
SAMN00084312
SAMN00086177
SAMN00087679
SAMN00092642
SAMN00097718
SAMN00101044
SAMN00101996
SAMN00105115
SAMN00109398
SAMEA893050
SAMN00123946
SAMN00125136
SAMN00127364
SAMN00127639
SAMN00138427
SAMN00145857
SAMN00147445
SAMN00149946
SAMN00153632
SAMN00153759
SAMN00154697
SAMN00159839
SAMN00162559
SAMN00170098
SAMN00174384
SAMN00181193
SAMN00187896
SAMN00192860
SAMN00204112
SAMN00204890
SAMN00208112
SAMN00216021
SAMN00223984
SAMN00224660
SAMN00225593
SAMN00225936
SAMN00233701
SAMN00234475
SAMN00234655
SAMN00236044
SAMN00236131
SAMN00237012
SAMN00237394
SAMN00239916
SAMN00242239
SAMN00242263
SAMN00242861
SAMN00244953
SAMN00248087
SAMN00249325
SAMN00250096
SAMN00254506
SAMN00255812
SAMN00255931
SAMN00256179
S