In [1]:
import os
import csv
from owlready2 import *
import xml.sax





# NCBI biosamples templates
Templates for each package are taken from the NCBI BioSample template page (https://submit.ncbi.nlm.nih.gov/biosample/template/)

'*' indicates mandatory field

'**' indicates at least one attribute in the group required

all other attributes are optional

Current code has all the NCBI templates as well as the MIGS Cultured Bacterial/Archaeal packages


In [2]:
templates_path = "data/biosamples/templates/"

# Maps package name (template filename without its ".tsv" suffix) to the list
# of mandatory attribute names found in that template. Names keep their
# leading "*" marker.
attributes_dict = {}
# Not populated in this cell; kept so the name stays defined for other cells.
required_attributes = set()

for filename in os.listdir(templates_path):
    # str.strip(".tsv") would remove any of the characters ".", "t", "s", "v"
    # from BOTH ends of the name (e.g. "test.tsv" -> "e"), so cut off only
    # the literal suffix instead.
    if filename.endswith(".tsv"):
        package_name = filename[:-len(".tsv")]
    else:
        package_name = filename

    # newline='' lets the csv module do its own newline handling.
    with open(os.path.join(templates_path, filename), 'r', newline='') as f:
        reader = csv.reader(f, delimiter='\t', quotechar='"')
        for row in reader:
            # Skip blank lines and "#" comment lines.
            if not row or row[0].startswith("#"):
                continue
            # Keep cells marked with exactly one leading "*" (mandatory);
            # "**" cells (at-least-one-of-group) and unmarked cells are
            # dropped. startswith() is safe on empty / one-character cells,
            # where indexing [0] / [1] would raise IndexError.
            # NOTE(review): every non-comment row overwrites the entry, so
            # this assumes the header is the only (or last) such row in a
            # template file - confirm.
            attributes_dict[package_name] = [
                attribute
                for attribute in row
                if attribute.startswith("*") and not attribute.startswith("**")
            ]

# Mandatory attributes shared by every parsed template. set.intersection()
# needs at least one argument, so fall back to an empty set when no template
# was parsed.
required_sets = [set(attributes) for attributes in attributes_dict.values()]
common_attributes = set.intersection(*required_sets) if required_sets else set()
print("Attributes common to all packages: ", common_attributes)

Attributes common to all packages:  {'*sample_name', '*organism'}


## Confirming biosamples XML format
- every biosample identified by a \<BioSample> tag
- each biosample also has:
    - a \<Package> tag that identifies the package for that sample
        - the package of a sample specifies what attributes will be recorded for the data
    - an \<Organism> tag (data stored in the attributes of the tag) that identifies the taxonomy related to the sample
    - an \<Ids> tag that contains identifying information about the sample
        - each \<Id> tag has information about where the specific sample came from

In [5]:
import re

# Sanity check on the BioSample dump: count how often each of the four
# markers "</BioSample>", "<Organism ", "<Package " and "</Ids>" occurs.
# Equal counts would be consistent with every sample having exactly one of
# each.
organism_count = 0
biosample_count = 0
package_count = 0
id_count = 0

# The file is streamed line by line (it is far too large to load at once) and
# closed automatically by the context manager.
with open("data/biosamples/biosample_set.xml", "r", encoding="utf-8") as f:
    for line in f:
        # Each marker is counted independently and by number of occurrences,
        # so a line holding several tags is not undercounted. The markers
        # contain no regex metacharacters, so plain substring counting finds
        # the same text the regex search did.
        biosample_count += line.count("</BioSample>")
        organism_count += line.count("<Organism ")
        package_count += line.count("<Package ")
        id_count += line.count("</Ids>")

print("Number of biosample tags: ", biosample_count)
print("Number of organisms tags: ", organism_count)
print("Number of package tags: ", package_count)
print("Number of ids tags: ", id_count)


Number of organisms tags:  32526809
Number of biosample tags:  32526809
Number of package tags:  32526809
Number of ids tags:  32526809


In [4]:
# Location of the BioSample XML dump that is handed to the SAX parser below.
bio_path = "data/biosamples/biosample_set.xml"

class BioSampleIdsHandler(xml.sax.ContentHandler):
    """SAX handler that gathers statistics about ``<Id>`` elements.

    While the document is parsed it records:

    * ``sample_name_count``: number of ``<Id>`` elements whose ``db_label``
      attribute equals ``"Sample name"``;
    * ``id_attribute_set``: every attribute name seen on an ``<Id>`` element;
    * ``database_names``: every distinct value of the ``db`` attribute.

    The three results are printed when the end of the document is reached.
    """

    def __init__(self) -> None:
        super().__init__()
        self.sample_name_count = 0
        self.id_attribute_set = set()
        self.database_names = set()

    def startElement(self, name, attrs):
        """Update the statistics for an opening ``<Id>`` tag; ignore other tags."""
        if name != "Id":
            return

        attribute_names = attrs.getNames()
        self.id_attribute_set.update(attribute_names)

        has_label = "db_label" in attribute_names
        if has_label and attrs.getValue("db_label") == "Sample name":
            self.sample_name_count += 1

        if "db" in attribute_names:
            self.database_names.add(attrs.getValue("db"))

    def endDocument(self):
        """Print the collected statistics once the document has been read."""
        print("Number of sample name attributes: ", self.sample_name_count)
        print("Unique id attributes: ", self.id_attribute_set)
        print("Unique database names: ", self.database_names)
        return super().endDocument()
    
# Stream the BioSample dump through a SAX parser; the handler prints its
# statistics itself when the end of the document is reached.
ids_handler = BioSampleIdsHandler()
parser = xml.sax.make_parser()
parser.setContentHandler(ids_handler)
parser.parse(bio_path)

Number of sample name attributes:  16141032
Unique id attributes:  {'db', 'is_primary', 'is_hidden', 'db_label'}
Unique database names:  {'UIUC', 'University of California San Diego', 'Boston College', 'Chinese Academy of Chinese Academy of Inspection a', 'Istituto Nazionale di Oceanografia e di Geofisica Sperimentale', 'Boku University Vienna, Austria', 'INRES Molecular Biology of the Rhizosphere', 'M. Achtman', 'USC', 'Università degli studi di Udine', 'Serratus Project', 'Biodiversity Research Center, Academia Sinica', 'Agricultural Research Organization (Volcani Center', 'Centers for Disease Control and Prevention', 'Institute of Plant Protection, Chinese Academy of', 'Korean Institute of ', 'GenomeTRAKR', 'EDLB-CDC', 'Ghent University - Pharmaceutical Sciences', 'Istituto Italiano di Tecnologia', 'LSHTM', 'IAS-CSIC', 'wageningen university & researcher', 'Center for New Medicine Research, Changchun Univer', 'Universite Blaise-Pascal, Clermont-Ferrand', 'California Institute of Tec

# Synonyms from the BTO

In [6]:
# onto_path and get_ontology come from the owlready2 star import.
# NOTE(review): presumably onto_path is the list of local directories searched
# for an ontology file before the IRI is fetched - confirm.
onto_path.append("data/ontologies/")
onto = get_ontology("http://purl.obolibrary.org/obo/bto.owl").load()

# First label of every class, or '' for classes that carry no label.
class_labels = []
for bto_class in onto.classes():
    labels = bto_class.label
    class_labels.append(labels[0] if len(labels) > 0 else '')
print("Num classes: ", len(class_labels))

# Total number of related + exact synonyms over all classes.
num_synonyms = sum(
    len(bto_class.hasRelatedSynonym) + len(bto_class.hasExactSynonym)
    for bto_class in onto.classes()
)
print("Num synonyms: ", num_synonyms)

# Direct search for terms in the ontology, e.g.:
# print(onto.search(label="*lea*"))

Num classes:  6569
Num synonyms:  6210


Things to search for in biosamples XML:

    </BioSample>
    <Organism 
    <Package 
    <Title 
