In [1]:
import os
import csv
from owlready2 import *





# NCBI biosamples templates
Templates for each package are taken from the NCBI BioSample template page (https://submit.ncbi.nlm.nih.gov/biosample/template/).

`*` indicates a mandatory field.

`**` indicates that at least one attribute in the group is required.

All other attributes are optional.

Current code has all the NCBI templates as well as the MIGS Cultured Bacterial/Archaeal packages


In [2]:
templates_path = "data/biosamples/templates/"

# Maps package name (template filename without its ".tsv" extension) to the
# list of mandatory attribute names, exactly as written in the template
# header (the leading "*" is kept).
attributes_dict = {}
required_attributes = set()  # not populated in this cell


def _read_mandatory_attributes(template_file):
    """Return the mandatory attribute names declared in one template file.

    The first row that is neither empty nor a "#" comment is treated as the
    header row. An attribute is mandatory when it starts with a single "*";
    a "**" prefix marks an "at least one of the group" attribute and is
    therefore excluded, as are unprefixed (optional) attributes.

    Args:
        template_file: Path to a tab-separated template file.

    Returns:
        A list of mandatory attribute names (with their "*" prefix), or
        None when the file contains no non-comment row.
    """
    # newline='' is what the csv module expects for files it parses itself.
    with open(template_file, 'r', newline='') as f:
        reader = csv.reader(f, delimiter='\t', quotechar='"')
        header = next(
            (row for row in reader if row and not row[0].startswith("#")),
            None,
        )
    if header is None:
        return None
    # startswith() is safe on empty or one-character cells, unlike indexing.
    return [
        attribute
        for attribute in header
        if attribute.startswith("*") and not attribute.startswith("**")
    ]


# sorted() makes the processing order independent of the filesystem.
for filename in sorted(os.listdir(templates_path)):
    # splitext removes only the extension; str.strip(".tsv") would strip any
    # of the characters ".", "t", "s", "v" from both ends of the name.
    package_name, extension = os.path.splitext(filename)
    if extension.lower() != ".tsv":
        # Skip anything that is not a template (hidden files, directories...).
        continue
    mandatory = _read_mandatory_attributes(os.path.join(templates_path, filename))
    if mandatory is not None:
        attributes_dict[package_name] = mandatory

# print(attributes_dict)
# Prints the mandatory attributes shared by every template (NCBI packages).
mandatory_sets = [set(attributes) for attributes in attributes_dict.values()]
# set.intersection() needs at least one argument, so guard the empty case.
common_mandatory = set.intersection(*mandatory_sets) if mandatory_sets else set()
print("Attributes common to all packages: ", common_mandatory)

Attributes common to all packages:  {'*sample_name', '*organism'}


## Confirming biosamples XML format
- every biosample should have a single package?
- every biosample should have a sample name
    - sample name within the \<Ids\> tags?
- every biosample should have an organism tag

In [5]:
import re  # kept: cells outside this view may rely on `re` being imported here

# Count occurrences of a few marker strings in the BioSample XML dump to
# check that each appears once per biosample record.
organism_count = 0
biosample_count = 0
package_count = 0
id_count = 0

# The context manager guarantees the file handle is closed, even on error.
# The file is streamed line by line, so it is never loaded whole into memory.
with open("data/biosamples/biosample_set.xml", "r") as f:
    for line in f:
        # str.count() tallies every occurrence, so tags that share a line
        # (or repeat on one line) are all counted. The markers are plain
        # literals, so no regular expression is needed.
        biosample_count += line.count("</BioSample>")
        organism_count += line.count("<Organism ")
        package_count += line.count("<Package ")
        id_count += line.count("</Ids>")

print("Number of biosample tags: ", biosample_count)
print("Number of organisms tags: ", organism_count)
print("Number of package tags: ", package_count)
print("Number of ids tags: ", id_count)


Number of organisms tags:  32526809
Number of biosample tags:  32526809
Number of package tags:  32526809
Number of ids tags:  32526809


# Synonyms from the BTO

In [6]:
# Register the local ontology directory, then load the BTO by its IRI.
onto_path.append("data/ontologies/")
onto = get_ontology("http://purl.obolibrary.org/obo/bto.owl").load()

# First label of every class; classes without a label contribute ''.
class_labels = []
for bto_class in onto.classes():
    labels = bto_class.label
    class_labels.append(labels[0] if len(labels) > 0 else '')
print("Num classes: ", len(class_labels))

# Total number of related + exact synonym entries across all classes.
num_synonyms = sum(
    len(bto_class.hasRelatedSynonym) + len(bto_class.hasExactSynonym)
    for bto_class in onto.classes()
)

print("Num synonyms: ", num_synonyms)

# Direct search for terms in the ontology, e.g.:
# print(onto.search(label="*lea*"))

Num classes:  6569
Num synonyms:  6210


Things to search for in biosamples XML:

    </BioSample>
    <Organism 
    <Package 
    <Title 
