In [4]:
import requests
import rdflib
import warnings
warnings.filterwarnings("ignore")
from rdflib import Graph, Namespace, RDFS, RDF, OWL
from owlready2 import get_ontology
from owlready2 import *
from rdflib import Graph, Namespace, RDFS
from tqdm import tqdm
import json
import pickle
import ParsingModule
from ParsingModule import store_as_gzipped_json

In [5]:
def get_children(ontology, term_id, term_label, d, nodes_dict=None, data=None):
    """Collect every descendant of ``term_id`` into a nested dict keyed by label.

    The ontology file is read as JSON with the layout this function indexes:
    ``data["graphs"][0]["nodes"]`` (dicts with ``"id"`` and ``"lbl"``) and
    ``data["graphs"][0]["edges"]`` (dicts with ``"sub"``, ``"pred"``, ``"obj"``).
    An edge counts as "``sub`` is a child of ``obj``" when its predicate is
    ``'is_a'`` or ``http://purl.obolibrary.org/obo/BFO_0000050``.

    Parameters
    ----------
    ontology : path of the JSON file; only opened when ``nodes_dict`` or
        ``data`` is not supplied.
    term_id : id of the root term whose descendants are collected.
    term_label : label under which the root is stored in ``d``.
    d : dict that is filled in place and returned.
    nodes_dict : optional ``{node id: label}`` mapping; built from the file
        when omitted.
    data : optional already-parsed JSON document.

    Returns
    -------
    ``d`` with ``d[term_label]`` holding the nested ``{label: {...}}`` tree, or
    the string ``"<term_id> node not in ontology"`` when ``term_id`` has no
    labelled node (kept as a string for backward compatibility).
    """
    is_child_predicates = ("http://purl.obolibrary.org/obo/BFO_0000050", 'is_a')
    skipped_term = "http://purl.obolibrary.org/obo/MONDO_0011876"

    if nodes_dict is None or data is None:  # load the json file only once
        # JSON is UTF-8 by specification; do not depend on the platform default
        # encoding (e.g. cp1252 on Windows), which fails on non-ASCII labels.
        with open(ontology, encoding="utf-8") as f:
            data = json.load(f)
    graph = data["graphs"][0]
    if nodes_dict is None:
        nodes_dict = {
            node["id"]: node["lbl"]
            for node in graph["nodes"]
            if "id" in node and "lbl" in node
        }

    if term_id not in nodes_dict:
        return f"{term_id} node not in ontology" # node not found in ontology, return early

    # Index the edges once (parent id -> child ids, in file order) so the
    # recursion does not re-scan the whole edge list for every visited term.
    children_index = {}
    for edge in graph["edges"]:
        if edge.get("pred") not in is_child_predicates:
            continue
        child_id = edge.get("sub")
        if child_id == skipped_term:
            continue    # skip MONDO_0011876
        children_index.setdefault(edge.get("obj"), []).append(child_id)

    def _attach_children(parent_id, parent_label, target):
        """Store parent_label in target and recursively nest its children under it."""
        if parent_label not in target:
            target[parent_label] = {}  # add the parent to the dictionary
        for child_id in children_index.get(parent_id, ()):
            child_label = nodes_dict.get(child_id)
            if child_label is None:
                continue  # child has no label: leave it out
            if child_label in target:
                # The label already sits next to the parent: move that subtree
                # under the parent instead of starting from an empty one.
                target[parent_label][child_label] = target[child_label]
                del target[child_label]
            else:
                target[parent_label][child_label] = {}
            _attach_children(child_id, child_label, target[parent_label])

    _attach_children(term_id, term_label, d)
    return d

In [6]:
def flatten(d):
    """Return every key of a nested dict as one flat list, depth-first.

    Each key is emitted before the contents of its value. A dict value is
    flattened recursively; any other value is appended to the list as-is
    right after its key.
    """
    collected = []
    for name, content in d.items():
        collected.append(name)
        if not isinstance(content, dict):
            # non-dict leaf: keep the value itself in the output
            collected.append(content)
            continue
        collected += flatten(content)
    return collected

In [7]:
def transform_nested_dict_to_tree(d, parent_label=None, parent_value=None):
    """Convert a nested ``{label: {sub-label: ...}}`` dict into a list of tree nodes.

    Each node is a dict with ``"label"`` (the key itself) and ``"value"`` (the
    labels on the path from the top level joined with ``" , "``). A
    ``"children"`` list is added only when the key has a non-empty subtree.

    ``parent_label`` is the already-joined path of the enclosing node.
    ``parent_value`` is accepted but not read by this function.
    """
    def build_node(name, subtree):
        path = f"{parent_label} , {name}" if parent_label else name
        node = {"label": name, "value": path}
        nested = transform_nested_dict_to_tree(subtree, path, name) if subtree else []
        if nested:
            node["children"] = nested
        return node

    return [build_node(name, subtree) for name, subtree in d.items()]

# JSON file EFO

### organism part

In [9]:
# Build the nested hierarchy below EFO_0000635 ('organism part') from the local EFO JSON file.
# `d` is the empty accumulator that get_children fills in place and returns.
d=dict()
orgpart_dict = get_children('/home/compomics/git/Publication/lesSDRF/ontology/efo.json', "http://www.ebi.ac.uk/efo/EFO_0000635",'organism part', d)

In [10]:
# Drop the root level so the top-level keys are the direct children of 'organism part'.
# NOTE(review): the name is rebound to its own sub-dict, so re-running this cell alone
# will presumably raise KeyError — re-run the previous cell first.
orgpart_dict = orgpart_dict['organism part']

In [11]:
# Add the two placeholder choices as extra top-level entries without children.
orgpart_dict['Not available'] = {}
orgpart_dict['Not applicable'] = {}

In [18]:
# Inspect the top-level entries of the organism-part dictionary.
orgpart_dict.keys()

dict_keys(['anatomical entity', 'anatomy basic component', 'Not available', 'Not applicable'])

In [14]:
# Inspect the direct children of 'anatomical entity'.
orgpart_dict['anatomical entity'].keys()

dict_keys(['umbilical cord', 'seminal fluid', 'primordium', 'abdominal cavity', 'peritoneal cavity', 'sensory system', 'bodily fluid', 'nasal cavity', 'retroperitoneal space', 'mushroom body', 'saliva', 'decidua basalis', 'meningeal cluster', 'vasculature', 'upper urinary tract', 'embryonic structure', 'head capsule', 'endocrine system', 'renal pelvis/ureter', 'tegmentum', 'venom', 'excreta', 'anatomical structure', 'immune system', 'mediastinum', 'early telencephalic vesicle'])

In [15]:
# Save the organism-part dictionary as 'orgpart_dict.pickle' (relative path, i.e. the current working directory).
with open('orgpart_dict.pickle', 'wb') as handle:
    pickle.dump(orgpart_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [16]:
# Load the pickled organism-part dictionary back into orgpart_dict.
# NOTE(review): the recorded run of this cell failed with FileNotFoundError for this path;
# the save cell above writes 'orgpart_dict.pickle' to the working directory, not this file — confirm the intended path.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open('/home/compomics/git/Publication/lesSDRF/data/organismpart_dict.pickle', 'rb') as handle:
    orgpart_dict = pickle.load(handle)
# Build the list-of-nodes tree (label / value / children) from the 'organism part' subtree.
# NOTE(review): presumably the pickled dict still has the 'organism part' root key — verify.
organismpart_nodes = transform_nested_dict_to_tree(orgpart_dict['organism part'])
with open('/home/compomics/git/Publication/lesSDRF/data/organismpart_nodes.pickle', 'wb') as handle:
    pickle.dump(organismpart_nodes, handle, protocol=pickle.HIGHEST_PROTOCOL)
# Flatten the same subtree into a flat list of labels and pickle it.
all_organismpart_elements = flatten(orgpart_dict['organism part'])
with open('/home/compomics/git/Publication/lesSDRF/data/all_organismpart_elements.pickle', 'wb') as handle:
    pickle.dump(all_organismpart_elements, handle, protocol=pickle.HIGHEST_PROTOCOL)

FileNotFoundError: [Errno 2] No such file or directory: '/home/compomics/git/Publication/lesSDRF/data/organismpart_dict.pickle'

In [64]:
# Load a previously pickled organism-part element list into `x` for inspection.
# NOTE(review): absolute, user-specific Windows path; this cell only works on that machine.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open('C:\\Users\\tinec\\OneDrive - UGent\\git\\SDRF_GUI\\onto_dicts\\all_organismpart_elements.pickle', 'rb') as handle:
    x = pickle.load(handle)

In [17]:
# Build the node tree and the flat label list from the organism-part dictionary,
# then hand all three objects to store_as_gzipped_json (imported from ParsingModule)
# under the given names.
orgpart_nodes = transform_nested_dict_to_tree(orgpart_dict)
all_orgpart_elements = flatten(orgpart_dict)
store_as_gzipped_json(orgpart_dict, "organism_part_dict")
store_as_gzipped_json(orgpart_nodes, "organism_part_dict_nodes")
store_as_gzipped_json(all_orgpart_elements, "all_organism_part_dict_elements")

'Stored all_organism_part_dict_elements as gzipped json'

In [50]:
# Number of entries in the flattened organism-part list.
# NOTE(review): all_organismpart_elements is only assigned in the pickle-reload cell whose recorded
# run failed; the cell just above defines all_orgpart_elements instead — confirm which one is meant.
len(all_organismpart_elements)

26865

### for cell types

In [19]:
# Build the nested hierarchy below EFO_0000324 ('cell type') from the local EFO JSON file.
d=dict()
celltype_dict = get_children('/home/compomics/git/Publication/lesSDRF/ontology/efo.json', "http://www.ebi.ac.uk/efo/EFO_0000324",'cell type', d)

In [20]:
# Drop the root level so the top-level keys are the direct children of 'cell type'.
# NOTE(review): rebinding the same name makes this cell fail if it is re-run on its own.
celltype_dict = celltype_dict['cell type']

In [21]:
# Add the two placeholder choices as extra top-level entries without children.
celltype_dict['Not available'] = {}
celltype_dict['Not applicable'] = {}

In [22]:
# Inspect the top-level entries of the cell-type dictionary.
celltype_dict.keys()

dict_keys(['epithelial cell', 'embryonic cell (metazoa)', 'photosynthetic cell', 'mouse erythroleukemia cell', 'clear cell', 'somatic cell', 'fungal cell', 'prokaryotic cell', 'immortal cell line cell', 'neoplastic cell', 'integumental cell', 'diploid cell', 'lung cancer cell', 'follicular dendritic cell', 'mantle cell', 'glial brain cell', 'merkel cell', 'mouse neural progenitor cell', 'electrically active cell', 'ligament cell', 'bone marrow cell', 'disease cell type', 'inferred cell type', 'plant cell', 'secretory cell', 'stem cell', 'pancreatic cell', 'nervous system cell', 'infected cell', 'non-terminally differentiated cell', 'experimental cell', 'musculo-skeletal system cell', 'reproductive system cell', 'Not available', 'Not applicable'])

In [23]:
# Build the node tree and the flat label list for cell types, then pass the dictionary,
# the tree and the list to store_as_gzipped_json under the given names.
celltype_nodes = transform_nested_dict_to_tree(celltype_dict)
all_celltype_elements = flatten(celltype_dict)
store_as_gzipped_json(celltype_dict, "cell_type_dict")
store_as_gzipped_json(celltype_nodes, "cell_type_nodes")
store_as_gzipped_json(all_celltype_elements, "all_cell_type_elements")

'Stored all_cell_type_elements as gzipped json'

### For disease (`characteristics[disease]`): EFO ontology term EFO:0000408

In [25]:
# Build the nested hierarchy below EFO_0000408 ('disease') from the local EFO JSON file.
d=dict()
disease_dict = get_children('/home/compomics/git/Publication/lesSDRF/ontology/efo.json', "http://www.ebi.ac.uk/efo/EFO_0000408",'disease', d)

In [26]:
# Count the direct children of 'disease'.
len(disease_dict['disease'].keys())

43

In [None]:
import gzip

In [27]:
# Add 'normal' and the two placeholder choices next to the 'disease' root.
# Unlike the organism-part and cell-type sections, the root key is kept here,
# so these entries become siblings of 'disease' rather than of its children.
disease_dict['normal'] = {}
disease_dict['Not available'] = {}
disease_dict['Not applicable'] = {}

In [28]:
# Inspect the top-level entries of the disease dictionary.
disease_dict.keys()

dict_keys(['disease', 'normal', 'Not available', 'Not applicable'])

In [29]:
# Build the node tree (label / value / children) and the flat label list for diseases.
disease_nodes = transform_nested_dict_to_tree(disease_dict)
all_disease_elements = flatten(disease_dict)

In [30]:
# Pass the disease dictionary, node tree and flat list to store_as_gzipped_json under the given names.
store_as_gzipped_json(disease_dict, "disease_dict")
store_as_gzipped_json(disease_nodes, "disease_nodes")
store_as_gzipped_json(all_disease_elements, "all_disease_elements")

'Stored all_disease_elements as gzipped json'

### for developmental stage

In [31]:
# Build the nested hierarchy below EFO_0000399 ('developmental stage') from the local EFO JSON file.
d=dict()
develop_dict = get_children('/home/compomics/git/Publication/lesSDRF/ontology/efo.json', "http://www.ebi.ac.uk/efo/EFO_0000399",'developmental stage', d)

In [32]:
# Drop the root level so the top-level keys are the direct children of 'developmental stage'.
# NOTE(review): rebinding the same name makes this cell fail if it is re-run on its own.
develop_dict = develop_dict['developmental stage']

In [38]:
# Display the whole developmental-stage dictionary (large output).
develop_dict

{'pharyngula stage': {'pharyngula prim-15': {},
  'pharyngula high-pec': {},
  'pharyngula prim-5': {},
  'pharyngula prim-25': {}},
 'drosophila developmental stage': {'third instar larva stage': {}},
 'fertilized egg stage': {},
 'segmentation stage': {'segmentation 20-25 somites': {},
  'segmentation 26+ somites': {},
  'segmentation 10-13 somites': {},
  'segmentation 5-9 somites': {},
  'segmentation 1-4 somites': {},
  'segmentation 14-19 somites': {}},
 'infant': {'neonate': {}, 'Fibrous Hamartoma of Infancy': {}},
 'floral transition': {},
 'mouse prenatal': {'Theiler stage 3': {},
  'Theiler stage 2': {},
  'Theiler stage 1': {},
  'Theiler stage 7': {},
  'Theiler stage 5': {},
  'Theiler stage 6': {},
  'Theiler stage 4': {},
  'Theiler stage 11': {},
  'Theiler stage 17': {},
  'Theiler stage 24': {},
  'Theiler stage 26': {},
  'Theiler stage 21': {},
  'Theiler stage 22': {},
  'Theiler stage 20': {},
  'Theiler stage 25': {},
  'Theiler stage 23': {},
  'Theiler stage 14

In [33]:
# Add the two placeholder choices as extra top-level entries without children.
develop_dict['Not available'] = {}
develop_dict['Not applicable'] = {}

In [34]:
# Build the node tree and the flat label list for developmental stages, then pass the
# dictionary, the tree and the list to store_as_gzipped_json under the given names.
developmental_stage_nodes = transform_nested_dict_to_tree(develop_dict)
all_developmental_stage_elements = flatten(develop_dict)
store_as_gzipped_json(develop_dict, "developmental_stage_dict")
store_as_gzipped_json(developmental_stage_nodes, "developmental_stage_nodes")
store_as_gzipped_json(all_developmental_stage_elements, "all_developmental_stage_elements")

'Stored all_developmental_stage_elements as gzipped json'

## for organism

In [27]:
# Build the nested hierarchy below OBI_0100026 ('organism') from the local EFO JSON file,
# drop the root level, and add the two placeholder choices as top-level entries.
d=dict()
org_dict = get_children('/home/compomics/git/Publication/lesSDRF/ontology/efo.json', "http://purl.obolibrary.org/obo/OBI_0100026",'organism', d)
org_dict = org_dict['organism']
org_dict['Not available'] = {}
org_dict['Not applicable'] = {}


In [28]:
# Build the node tree and the flat label list for organisms, then pass the dictionary,
# the tree and the list to store_as_gzipped_json under the given names.
org_nodes = transform_nested_dict_to_tree(org_dict)
all_org_elements = flatten(org_dict)
store_as_gzipped_json(org_dict, "organism_dict")
store_as_gzipped_json(org_nodes, "organism_nodes")
store_as_gzipped_json(all_org_elements, "all_organism_elements")

'Stored all_organism_elements as gzipped json'

In [18]:
# Save the organism dictionary as 'org_dict.pickle' (relative path, i.e. the current working directory).

with open('org_dict.pickle', 'wb') as handle:
    pickle.dump(org_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## for ancestry

In [35]:
# Build the nested hierarchy below HANCESTRO_0004 ('ancestry category') from the local EFO JSON file
# and show its direct children.
d=dict()
ancestry_dict = get_children('/home/compomics/git/Publication/lesSDRF/ontology/efo.json', "http://purl.obolibrary.org/obo/HANCESTRO_0004",'ancestry category', d)
ancestry_dict['ancestry category'].keys()


dict_keys(['European', 'Asian', 'genetically isolated population', 'African', 'Oceanian', 'Greater Middle Eastern  (Middle Eastern, North African or Persian)', 'African American or Afro-Caribbean', 'Native American', 'Hispanic or Latin American', 'undefined ancestry population'])

In [36]:
# Add the two placeholder choices next to the 'ancestry category' root (the root key is kept here).
# NOTE(review): cells that later read only ancestry_dict['ancestry category'] will not see these
# two entries — confirm that is intended.
ancestry_dict['Not available'] = {}
ancestry_dict['Not applicable'] = {}

In [37]:
# Inspect the direct children of 'ancestry category' (unchanged by the placeholders added at the root).
ancestry_dict['ancestry category'].keys()

dict_keys(['European', 'Asian', 'genetically isolated population', 'African', 'Oceanian', 'Greater Middle Eastern  (Middle Eastern, North African or Persian)', 'African American or Afro-Caribbean', 'Native American', 'Hispanic or Latin American', 'undefined ancestry population'])

In [9]:
# Save the ancestry dictionary as 'ancestry_dict.pickle' (relative path, i.e. the current working directory).

with open('ancestry_dict.pickle', 'wb') as handle:
    pickle.dump(ancestry_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [21]:
# Build the node tree from the children of 'ancestry category' only (root-level entries are not included).
ancestry_nodes = transform_nested_dict_to_tree(ancestry_dict['ancestry category'])

In [22]:
# Save the ancestry node tree as 'ancestry_nodes.pickle' in the current working directory.
with open('ancestry_nodes.pickle', 'wb') as handle:
    pickle.dump(ancestry_nodes, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [20]:
# Flatten the children of 'ancestry category' into a flat label list and pickle it
# as 'all_ancestry_elements.pickle' in the current working directory.
all_ancestry_elements = flatten(ancestry_dict['ancestry category'])
with open('all_ancestry_elements.pickle', 'wb') as handle:
    pickle.dump(all_ancestry_elements, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [48]:
# Load the pickled ancestry dictionary.
# NOTE(review): absolute, user-specific Windows paths; this cell only works on that machine.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open('C:\\Users\\tinec\\OneDrive - UGent\\git\\SDRF_GUI\\onto_dicts\\ancestry_dict.pickle', 'rb') as handle:
    ancestry_dict = pickle.load(handle)
# Build the node tree (label / value / children) for ancestry and pickle it.
# The variable names say "develop" but hold ancestry data; they are kept so that
# the names this cell defines in the kernel stay the same.
develop_nodes = transform_nested_dict_to_tree(ancestry_dict)
with open('C:\\Users\\tinec\\OneDrive - UGent\\git\\SDRF_GUI\\onto_dicts\\ancestry_nodes.pickle', 'wb') as handle:
    pickle.dump(develop_nodes, handle, protocol=pickle.HIGHEST_PROTOCOL)
# Flatten the ancestry dictionary into a flat label list and pickle it.
all_develop_elements = flatten(ancestry_dict)
with open('C:\\Users\\tinec\\OneDrive - UGent\\git\\SDRF_GUI\\onto_dicts\\all_ancestry_elements.pickle', 'wb') as handle:
    # Fix: this used to dump all_celltype_elements (cell-type data) into the ancestry file.
    pickle.dump(all_develop_elements, handle, protocol=pickle.HIGHEST_PROTOCOL)

## for cell line

In [29]:
# Build the nested hierarchy below CL_0000000 ('cell') from the local EFO JSON file.
d=dict()
cell_dict = get_children('/home/compomics/git/Publication/lesSDRF/ontology/efo.json', "http://purl.obolibrary.org/obo/CL_0000000",'cell', d)

In [30]:
# Drop the root level and add the two placeholder choices as top-level entries.
# NOTE(review): rebinding the same name makes this cell fail if it is re-run on its own.
cell_dict = cell_dict['cell']
cell_dict['Not available'] = {}
cell_dict['Not applicable'] = {}

In [31]:
# Build the node tree and the flat label list for the 'cell' hierarchy.
# NOTE(review): the next cell computes the same two values again; one of the two is redundant.
cell_nodes = transform_nested_dict_to_tree(cell_dict)
all_cell_elements = flatten(cell_dict)

In [32]:
# Build the node tree and the flat label list for the 'cell' hierarchy, then pass the dictionary,
# the tree and the list to store_as_gzipped_json under the "cell_line" names.
cell_nodes = transform_nested_dict_to_tree(cell_dict)
all_cell_elements = flatten(cell_dict)
store_as_gzipped_json(cell_dict, "cell_line_dict")
store_as_gzipped_json(cell_nodes, "cell_line_nodes")
store_as_gzipped_json(all_cell_elements, "all_cell_line_elements")

'Stored all_cell_line_elements as gzipped json'

## for enrichment

In [33]:
# Build the nested hierarchy below EFO_0009090 ('enrichment process') from the local EFO JSON file.
# The root key is kept, so the two placeholder choices become siblings of 'enrichment process'.
d=dict()
enrichment_dict = get_children('/home/compomics/git/Publication/lesSDRF/ontology/efo.json', "http://www.ebi.ac.uk/efo/EFO_0009090",'enrichment process', d)
enrichment_dict['Not available'] = {}
enrichment_dict['Not applicable'] = {}

In [34]:
# Build the node tree and the flat label list for enrichment, then pass the dictionary,
# the tree and the list to store_as_gzipped_json under the given names.
enrichment_nodes = transform_nested_dict_to_tree(enrichment_dict)
all_enrichment_elements = flatten(enrichment_dict)
store_as_gzipped_json(enrichment_dict, "enrichment_dict")
store_as_gzipped_json(enrichment_nodes, "enrichment_nodes")
store_as_gzipped_json(all_enrichment_elements, "all_enrichment_elements")

'Stored all_enrichment_elements as gzipped json'

In [8]:
# Inspect the direct children of 'sample enrichment' inside 'enrichment process'.
enrichment_dict['enrichment process']['sample enrichment'].keys()

dict_keys(['isolation of cell population', 'cell size selection', 'density gradient centrifugation', 'magnetic affinity cell sorting'])

In [9]:
# Build the node tree and the flat label list for enrichment.
enrichment_nodes = transform_nested_dict_to_tree(enrichment_dict)
all_enrichment_elements = flatten(enrichment_dict)

In [10]:
# Save the enrichment dictionary as a pickle.
# NOTE(review): absolute, user-specific Windows paths; this cell only works on that machine.
with open('C:\\Users\\tinec\\OneDrive - UGent\\git\\SDRF_GUI\\onto_dicts\\enrichment_dict.pickle', 'wb') as handle:
    pickle.dump(enrichment_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
# Save the already-built node tree (nothing is computed here).
with open('C:\\Users\\tinec\\OneDrive - UGent\\git\\SDRF_GUI\\onto_dicts\\enrichment_nodes.pickle', 'wb') as handle:
    pickle.dump(enrichment_nodes, handle, protocol=pickle.HIGHEST_PROTOCOL)
# Save the already-built flat list of elements (nothing is computed here).
with open('C:\\Users\\tinec\\OneDrive - UGent\\git\\SDRF_GUI\\onto_dicts\\all_enrichment_elements.pickle', 'wb') as handle:
    pickle.dump(all_enrichment_elements, handle, protocol=pickle.HIGHEST_PROTOCOL)