In [1]:
cd ../

/Users/atarashansky/Desktop/czi/single-cell-data-portal


In [2]:
import cellxgene_census
from backend.wmg.data.rollup import rollup_across_cell_type_descendants
import owlready2
import json
import tiledb
from backend.wmg.data.ontology_labels import ontology_term_label, ontology_term_id_labels
import pandas as pd
import numpy as np

def traverse_with_counting(node):
    """
    Recursively turn `node` and everything below it into a nested dict tree.

    `node` must expose `.name` and `.subclasses()`. Each visit is given the id
    ``<node.name>__<k>`` where ``k`` is how many times that name had already
    been visited, so a term reachable through several routes gets distinct ids.

    Reads the module-level `id_to_name`, `cell_counts_df_rollup` and
    `cell_counts_df` lookups, and updates the module-level
    `traverse_node_counter` (visits per name) and `all_unique_nodes`
    (every id handed out).

    Returns a dict with keys "id", "name", "n_cells_rollup", "n_cells" and,
    only when the node has subclasses, "children".
    """
    global traverse_node_counter
    global all_unique_nodes

    # The number of earlier visits doubles as the suffix for this occurrence.
    seen_before = traverse_node_counter.get(node.name, 0)
    traverse_node_counter[node.name] = seen_before + 1
    unique_id = node.name + "__" + str(seen_before)
    all_unique_nodes.add(unique_id)

    direct_subclasses = list(node.subclasses())
    term_id = node.name.replace('_', ':')

    entry = {
        "id": unique_id,
        # Fall back to the raw term id when no label is known.
        "name": id_to_name[term_id] if term_id in id_to_name else term_id,
        "n_cells_rollup": int(cell_counts_df_rollup[term_id] if term_id in cell_counts_df_rollup else 0),
        "n_cells": int(cell_counts_df[term_id] if term_id in cell_counts_df else 0),
    }

    # Leaves carry no "children" key at all.
    if direct_subclasses:
        entry["children"] = [traverse_with_counting(sub) for sub in direct_subclasses]

    return entry

def _descendants(cell_type):
    cell_type_iri = cell_type.replace(":", "_")
    entity = ontology.search_one(iri=f"http://purl.obolibrary.org/obo/{cell_type_iri}")
    descendants = [i.name.replace("_", ":") for i in entity.descendants()] if entity else [cell_type]
    return descendants

def _ancestors(cell_type):
    cell_type_iri = cell_type.replace(":", "_")
    entity = ontology.search_one(iri=f"http://purl.obolibrary.org/obo/{cell_type_iri}")
    ancestors = [i.name.replace("_", ":") for i in entity.ancestors() if i.name!= "Thing"] if entity else [cell_type]
    return ancestors

def _children(cell_type):
    cell_type_iri = cell_type.replace(":", "_")
    entity = ontology.search_one(iri=f"http://purl.obolibrary.org/obo/{cell_type_iri}")
    children = [i.name.replace("_", ":") for i in entity.subclasses()] if entity else [cell_type]
    return children

def _parents(cell_type):
    cell_type_iri = cell_type.replace(":", "_")
    entity = ontology.search_one(iri=f"http://purl.obolibrary.org/obo/{cell_type_iri}")    
    parent_names = [parent.name.replace("_",":") for parent in entity.is_a if isinstance(parent, owlready2.ThingClass) and parent.name!= "Thing"]
    return parent_names

def dfs(parents, end, start, node=None, path=None, all_paths=None):
    """
    Walk upwards from `end` through the `parents` mapping and collect every
    path that reaches `start`.

    Parameters
    ----------
    parents : dict mapping a node id to a list of its parent ids.
    end : node id the walk begins at.
    start : node id at which a path is considered complete.
    node, path : recursion state; leave both as None on the initial call.
    all_paths : list that completed paths are appended to. Each path is
        ordered from `end` to `start`. Pass your own list to receive the
        results; when omitted a fresh list is used for this call only.

    Returns
    -------
    The current path when `node == start`, otherwise None. In particular the
    initial call returns `[end]` when `end == start` (without appending it to
    `all_paths`) and None in every other case.
    """
    # A list literal as the default would be shared by, and grow across, every
    # call that omits `all_paths`.
    if all_paths is None:
        all_paths = []

    if path is None and node is None:
        path = [end]
        node = end

    if node == start:
        return path

    for parent in parents.get(node, []):
        full_path = dfs(parents, end, start, node=parent, path=path + [parent], all_paths=all_paths)
        if full_path:
            all_paths.append(full_path)
            
def truncate_graph(graph,valid_nodes):
    """
    Prune `graph` in place so only nodes whose id is in `valid_nodes` remain.

    Returns False, leaving `graph` untouched, when the node itself is not
    valid; the caller is then responsible for dropping it. Otherwise returns
    True after rewriting `graph['children']`:

    * valid children are kept (and pruned recursively);
    * if at least one child with a non-empty id was dropped and at least one
      child was kept, a placeholder child with id "" is appended that records
      the dropped ids and the parent id;
    * if no child is kept, the "children" key is removed.
    """
    if graph['id'] not in valid_nodes:
        return False

    kept = []
    dropped_ids = []
    for child in graph.get("children", []):
        if truncate_graph(child, valid_nodes):
            kept.append(child)
        elif child['id'] != '':
            # Placeholder nodes (empty id) are dropped silently.
            dropped_ids.append(child['id'])

    if not kept:
        graph.pop('children', None)
        return True

    if dropped_ids:
        kept.append(
            {
                "id": "",
                "name": "",
                "n_cells_rollup": 0,
                "n_cells": 0,
                "invalid_children_ids": dropped_ids,
                "parent": graph['id'],
            }
        )
    graph['children'] = kept
    return True


def truncate_graph2(graph, visited_nodes_in_paths):
    """
    Collapse repeated subtrees in place so each term shows its full set of
    children only once.

    The term of a node is its id up to the first "__". The first node of a
    term that has children is recorded in the module-level
    `nodesWithChildrenFound` and left as is. For any later node of the same
    term that has children:

    * only children whose id is in `visited_nodes_in_paths` are kept;
    * if some children were removed and some kept, a placeholder child with
      id "" is appended, listing the removed non-empty ids and the parent id;
    * if none are kept, the "children" key is removed.

    The walk then continues into the remaining children, skipping placeholders.
    """
    global nodesWithChildrenFound

    term = graph['id'].split("__")[0]
    has_children = 'children' in graph

    if term not in nodesWithChildrenFound:
        if has_children:
            nodesWithChildrenFound.add(term)
    elif has_children:
        original_children = graph['children']
        on_path = []
        collapsed_ids = []
        for child in original_children:
            if child['id'] in visited_nodes_in_paths:
                on_path.append(child)
            elif child['id'] != '':
                collapsed_ids.append(child['id'])

        if on_path:
            if len(on_path) < len(original_children):
                # Stand-in for the children that were collapsed away.
                on_path.append(
                    {
                        "id": "",
                        "name": "",
                        "n_cells_rollup": 0,
                        "n_cells": 0,
                        "invalid_children_ids": collapsed_ids,
                        "parent": graph['id'],
                    }
                )
            graph['children'] = on_path
        else:
            del graph['children']

    for child in graph.get("children", []):
        if child['id'] != "":
            truncate_graph2(child, visited_nodes_in_paths)


def prune_node_distinguishers(graph):
    """
    Strip the "__<n>" suffix from the id of every node in `graph`, in place.

    Each id is replaced by the part before its first "__".
    """
    pending = [graph]
    while pending:
        current = pending.pop()
        current['id'] = current['id'].split('__')[0]
        pending.extend(current.get('children', []))

def delete_unknown_terms(graph):
    """
    Remove, in place, every descendant whose "name" starts with 'CL:'.

    A removed child takes its whole subtree with it. A node left without any
    children loses its "children" key. The node passed in is never checked
    itself, only its descendants.
    """
    named_children = [
        child
        for child in graph.get('children', [])
        if not child['name'].startswith('CL:')
    ]

    if named_children:
        graph['children'] = named_children
    elif 'children' in graph:
        del graph['children']

    for child in named_children:
        delete_unknown_terms(child)
        
def truncate_graph_one_target(graph, target):
    """
    Keep the subtree of exactly one occurrence of a term, in place.

    The walk is depth-first. When the node whose id equals `target` is met,
    the module-level flag `targetFound` is set. Every node of the same term
    (id up to the first "__") met after that has its "children" removed.
    The caller must reset `targetFound` to False before each walk.

    Nodes without a "children" key are handled; the previous `del` raised
    KeyError when a later occurrence of the term was a leaf.
    """
    global targetFound
    if targetFound and graph['id'].split("__")[0] == target.split("__")[0]:
        # pop() with a default tolerates leaf occurrences.
        graph.pop('children', None)
    elif graph['id'] == target:
        targetFound = True

    for child in graph.get("children", []):
        truncate_graph_one_target(child, target)

def build_children(graph):
    """
    Record, for every node in `graph`, the ids of its direct children in the
    module-level `all_children` dict (an empty list for leaves).
    """
    global all_children
    direct_children = graph.get('children', [])
    all_children[graph['id']] = [child['id'] for child in direct_children]

    for child in direct_children:
        build_children(child)

def build_parents(graph):
    """
    Record, for every non-root node in `graph`, a one-element list holding its
    parent's id in the module-level `all_parents` dict.
    """
    global all_parents
    for kid in graph.get('children', ()):
        all_parents[kid['id']] = [graph['id']]
        build_parents(kid)
        
def getExpandedData(graph):
    """
    Append the id of every node in `graph` that has a "children" key to the
    module-level `isExpandedNodes` list, parents before their descendants.
    """
    global isExpandedNodes
    if 'children' not in graph:
        return

    isExpandedNodes.append(graph['id'])
    for child in graph['children']:
        getExpandedData(child)
                
        
def getShownData(graph):
    """
    Collect what each placeholder child in `graph` stands for.

    A placeholder is a child with id "". For each one that lists at least one
    id in "invalid_children_ids", a single-entry dict
    ``{<placeholder's "parent">: <de-duplicated list of those ids>}`` is
    appended to the module-level `notShownWhenExpandedNodes`. Ordinary
    children are walked recursively.
    """
    global notShownWhenExpandedNodes

    for child in graph.get('children', []):
        if child['id'] != "":
            getShownData(child)
        elif len(child["invalid_children_ids"]) > 0:
            hidden_ids = list(set(child["invalid_children_ids"]))
            notShownWhenExpandedNodes.append({child['parent']: hidden_ids})
        
def _to_dict(a, b):
    """
    convert a flat key array (a) and a value array (b) into a dictionary with values grouped by keys
    """
    a = np.array(a)
    b = np.array(b)
    idx = np.argsort(a)
    a = a[idx]
    b = b[idx]
    bounds = np.where(a[:-1] != a[1:])[0] + 1
    bounds = np.append(np.append(0, bounds), a.size)
    bounds_left = bounds[:-1]
    bounds_right = bounds[1:]
    slists = [b[bounds_left[i] : bounds_right[i]] for i in range(bounds_left.size)]
    d = dict(zip(np.unique(a), [list(set(x)) for x in slists]))
    return d



# Build ontology tree JSON

In [3]:
# Pin the Census release instead of following the moving "stable" alias, so a
# re-run reads the same data. "2023-05-15" is the release the recorded output
# of this cell reports as "stable" and recommends passing explicitly.
census = cellxgene_census.open_soma(census_version="2023-05-15")
c = census['census_info']['summary_cell_counts'].read().concat().to_pandas()

# Keep only rows whose ontology term is a Cell Ontology id ('CL:' prefix) and
# total the unique cell count per term.
cell_counts_df = (
    c[[i.startswith('CL:') for i in c['ontology_term_id']]]
    .groupby('ontology_term_id')
    .sum(numeric_only=True)[['unique_cell_count']]
    .rename(columns={'unique_cell_count': 'n_cells'})
)

# Expose the term id as a regular column and drop it from the index.
cell_counts_df['cell_type_ontology_term_id'] = cell_counts_df.index.values
cell_counts_df = cell_counts_df.reset_index(drop=True)

The "stable" release is currently 2023-05-15. Specify 'census_version="2023-05-15"' in future calls to open_soma() to ensure data consistency.


In [4]:
# Only two obs columns are used below, so ask for just those instead of
# materialising every obs column in memory.
obs = (
    census['census_data']['homo_sapiens']['obs']
    .read(column_names=['cell_type_ontology_term_id', 'tissue_ontology_term_id'])
    .concat()
    .to_pandas()
)

# a2: cell type id per row, b2: tissue id per row.
a2,b2=obs[['cell_type_ontology_term_id','tissue_ontology_term_id']].values.T

# Keys are tissue ids, values the distinct cell type ids observed with each.
uberon_by_celltype = _to_dict(b2,a2)

In [5]:
# Load the Cell Ontology and index its non-deprecated CL terms.
ontology = owlready2.get_ontology("https://github.com/obophenotype/cell-ontology/releases/latest/download/cl-basic.owl")
ontology.load()

all_cell_types = []                  # [{"label": ..., "id": "CL:..."}, ...]
all_cell_type_owl_descriptions = {}  # term id -> IAO_0000115 text, '' when absent
id_to_name = {}                      # term id -> label

classes = ontology.classes()
for c in classes:
    # Skip classes that are not CL terms, and CL terms flagged as deprecated.
    if not c.name.startswith("CL_") or c.deprecated:
        continue

    term_id = c.name.replace("_",":")
    term_label = c.label.first()
    term_definition = str(c.IAO_0000115.first())

    all_cell_types.append({"label": term_label, "id": term_id})
    id_to_name[term_id] = term_label
    # A missing annotation stringifies to 'None'; store it as an empty string.
    all_cell_type_owl_descriptions[term_id] = '' if term_definition == 'None' else term_definition
        

In [6]:
# Same call on the same b2/a2 arrays as the earlier obs cell; re-running it
# only rebuilds the identical tissue id -> cell type ids mapping.
uberon_by_celltype = _to_dict(b2,a2)

In [7]:
all_cell_types_ids = [i["id"] for i in all_cell_types]

# Add a zero-count row for every ontology term that has no cells in the census.
# A set gives constant-time membership; testing against the `.values` array
# rescans every counted term for each ontology id.
counted_ids = set(cell_counts_df['cell_type_ontology_term_id'].values)
to_attach = pd.DataFrame()
to_attach['cell_type_ontology_term_id']=[i for i in all_cell_types_ids if i not in counted_ids]
to_attach['n_cells']=0

cell_counts_df = pd.concat([cell_counts_df,to_attach],axis=0)
cell_counts_df_rollup = rollup_across_cell_type_descendants(cell_counts_df).set_index('cell_type_ontology_term_id')['n_cells']
# NOTE: from here on `cell_counts_df` is a Series indexed by term id, not the
# DataFrame this cell started with, so this cell cannot be re-run without
# first re-running the cell that builds the DataFrame.
cell_counts_df = cell_counts_df.set_index('cell_type_ontology_term_id')['n_cells']

root_node = ontology.world["http://purl.obolibrary.org/obo/CL_0000548"]

# traverse_with_counting() fills these two module-level collections as it
# builds the nested tree `a`.
traverse_node_counter = {}
all_unique_nodes = set()
a = traverse_with_counting(root_node)
all_unique_nodes = list(all_unique_nodes)

In [8]:
# Lookup tables over the tree `a`, filled in by the two builders below:
#   all_children: node id -> ids of its direct children
#   all_parents:  node id -> one-element list with its parent's id
all_children, all_parents = {}, {}
build_children(a)
build_parents(a)

In [26]:
start_node = 'CL_0000548__0'

# The tree `a` is not modified inside the loop, so serialise it once and only
# parse a fresh copy per iteration.
tree_json = json.dumps(a)

all_states_per_cell_type = {}
for idx, end_node in enumerate(all_cell_types_ids):
    if idx%100==0:
        print(idx)
    end_node = end_node.replace(":","_")
    if end_node not in traverse_node_counter:
        continue

    # One id per occurrence of this term in the tree: "<term>__0", "<term>__1", ...
    occurrence_ids = [end_node+"__"+str(k) for k in range(traverse_node_counter[end_node])]

    # For each occurrence, the first root-to-occurrence path found by dfs()
    # (dfs returns paths occurrence-first, hence the reversal), or just the
    # occurrence itself when dfs found none.
    all_paths=[]
    for occurrence_id in occurrence_ids:
        paths = []
        dfs(all_parents,occurrence_id,start_node,all_paths=paths)
        all_paths.append(paths[0][::-1] if paths else [occurrence_id])

    ### RULES ###
    # 1. We only want to show terms that are CHILDREN, GRANDCHILDREN, SIBLINGS OF TARGET, or IN A PATH TO TARGET
    # Sets: the truncate_* helpers only test membership, once per tree node.
    visited_nodes_in_paths = {node_id for path in all_paths for node_id in path}

    children1 = all_children.get(end_node+"__0",[]) #children
    children2 = [grandchild for child in children1 for grandchild in all_children.get(child,[])] #grandchildren
    siblings = {
        sibling
        for occurrence_id in occurrence_ids
        for parent in all_parents.get(occurrence_id,[])
        for sibling in all_children.get(parent,[])
    } #siblings

    valid_nodes = visited_nodes_in_paths | set(children1) | set(children2) | siblings

    a_copy = json.loads(tree_json)
    truncate_graph(a_copy,valid_nodes)

    # truncate_graph2() reads and fills this module-level set; reset it per target.
    nodesWithChildrenFound=set()
    truncate_graph2(a_copy, visited_nodes_in_paths)
    delete_unknown_terms(a_copy)

    # now, given this graph, populate what you need - specifically, we need "notShownWhenExpanded" and "isExpanded"
    # (module-level lists appended to by getShownData / getExpandedData)
    notShownWhenExpandedNodes=[]
    isExpandedNodes=[]

    getExpandedData(a_copy)
    getShownData(a_copy)

    # Each parent may contribute at most one entry, otherwise the merge below
    # would silently overwrite one of them.
    parents_with_hidden = [next(iter(entry)) for entry in notShownWhenExpandedNodes]
    assert len(set(parents_with_hidden)) == len(parents_with_hidden)

    notShownWhenExpanded = {}
    for entry in notShownWhenExpandedNodes:
        notShownWhenExpanded.update(entry)

    all_states_per_cell_type[end_node] = {'isExpandedNodes': list(set(isExpandedNodes)), 'notShownWhenExpandedNodes': notShownWhenExpanded}

0
100
200


KeyboardInterrupt: 

In [23]:
start_node = 'CL_0000548__0'

# The tree `a` is not modified inside the loop, so serialise it once and only
# parse a fresh copy per iteration.
tree_json = json.dumps(a)

all_states_per_tissue = {}
for tissue in uberon_by_celltype:
    end_nodes = [e.replace(":","_") for e in uberon_by_celltype[tissue]]

    # For every occurrence of every cell type of this tissue, the first
    # root-to-occurrence path found by dfs() (dfs returns paths
    # occurrence-first, hence the reversal), or just the occurrence itself
    # when dfs found none. Terms absent from the tree contribute nothing.
    all_paths=[]
    for end_node in end_nodes:
        for k in range(traverse_node_counter.get(end_node, 0)):
            occurrence_id = end_node+"__"+str(k)
            paths = []
            dfs(all_parents,occurrence_id,start_node,all_paths=paths)
            all_paths.append(paths[0][::-1] if paths else [occurrence_id])

    ### RULES ###
    # 1. We only want to show terms that are CHILDREN of one of the tissue's cell types, or IN A PATH to one
    # Sets: the truncate_* helpers only test membership, once per tree node.
    visited_nodes_in_paths = {node_id for path in all_paths for node_id in path}

    children1 = {child for e in end_nodes for child in all_children.get(e+"__0",[])}

    valid_nodes = visited_nodes_in_paths | children1

    a_copy = json.loads(tree_json)
    truncate_graph(a_copy,valid_nodes)

    # truncate_graph2() reads and fills this module-level set; reset it per tissue.
    nodesWithChildrenFound=set()
    truncate_graph2(a_copy, visited_nodes_in_paths)
    delete_unknown_terms(a_copy)

    # now, given this graph, populate what you need - specifically, we need "notShownWhenExpanded" and "isExpanded"
    # (module-level lists appended to by getShownData / getExpandedData)
    notShownWhenExpandedNodes=[]
    isExpandedNodes=[]

    getExpandedData(a_copy)
    getShownData(a_copy)

    # Each parent may contribute at most one entry, otherwise the merge below
    # would silently overwrite one of them.
    parents_with_hidden = [next(iter(entry)) for entry in notShownWhenExpandedNodes]
    assert len(set(parents_with_hidden)) == len(parents_with_hidden)

    notShownWhenExpanded = {}
    for entry in notShownWhenExpandedNodes:
        notShownWhenExpanded.update(entry)

    all_states_per_tissue[tissue] = {'isExpandedNodes': list(set(isExpandedNodes)), 'notShownWhenExpandedNodes': notShownWhenExpanded}

In [25]:
# Write through a context manager so the file is flushed and closed before
# anything else touches it.
with open('ontologyTreeStatePerTissue.json','w') as f:
    json.dump(all_states_per_tissue, f)

In [8]:
# Drop descendants whose name is still a raw 'CL:' id, then write the tree.
# Note this modifies `a` in place.
delete_unknown_terms(a)
# Write through a context manager so the file is flushed and closed before
# anything else touches it.
with open('ontologyTree.json','w') as f:
    json.dump(a, f)

In [9]:
# Write through a context manager so the file is flushed and closed before
# anything else touches it.
with open('ontologyTreeStatePerCellType.json','w') as f:
    json.dump(all_states_per_cell_type, f)

In [10]:
# Move the three generated JSON files into the frontend fixtures directory.
# Both the file names and the destination are relative paths — presumably
# resolved against the directory selected by the `cd ../` in the first cell;
# confirm the working directory before running.
!mv ontologyTree.json frontend/src/views/CellCards/common/fixtures/.
!mv ontologyTreeStatePerCellType.json frontend/src/views/CellCards/common/fixtures/.
!mv ontologyTreeStatePerTissue.json frontend/src/views/CellCards/common/fixtures/.