In [2]:
# Move the kernel's working directory up one level (IPython `cd` automagic).
# NOTE(review): presumably this is what lets the project-local `backend.*` imports resolve — confirm.
cd ../

/Users/atarashansky/Desktop/czi/single-cell-data-portal


In [3]:
import cellxgene_census
from backend.wmg.data.rollup import rollup_across_cell_type_descendants
import owlready2
import json
import tiledb
from backend.wmg.data.ontology_labels import ontology_term_label, ontology_term_id_labels
import pandas as pd
import numpy as np

def traverse_with_counting(node):
    """Convert an ontology class and everything below it into a nested dict.

    Each visit of a class gets an id of the form ``<node.name>__<k>``, where
    ``k`` is the number of times that class had already been visited, so a
    class reachable through several parents appears once per route with a
    distinct id.

    Reads the module-level lookups ``id_to_name``, ``cell_counts_df``,
    ``cell_counts_df_norm``, ``cell_counts_df_rollup`` and
    ``cell_counts_df_rollup_norm`` and updates the module-level
    ``traverse_node_counter`` (visits per class name) and ``all_unique_nodes``
    (every generated id).

    Args:
        node: object exposing ``.name`` and ``.subclasses()``.

    Returns:
        dict with keys ``id``, ``name``, the four count fields, ``hasChildren``
        and, only when the node has subclasses, ``children``.
    """
    global traverse_node_counter
    global all_unique_nodes

    # Occurrence index = how many times this class was seen before this visit.
    occurrence = traverse_node_counter.get(node.name, 0)
    traverse_node_counter[node.name] = occurrence + 1
    unique_id = node.name + "__" + str(occurrence)
    all_unique_nodes.add(unique_id)

    # The lookup tables are keyed by the colon form of the identifier.
    term_id = node.name.replace('_', ':')

    def _lookup(table, default):
        # Missing terms fall back to `default` instead of raising.
        return table[term_id] if term_id in table else default

    record = {
        "id": unique_id,
        "name": _lookup(id_to_name, term_id),
        "n_cells_rollup": int(_lookup(cell_counts_df_rollup, 0)),
        "n_cells_rollup_normalized": float(_lookup(cell_counts_df_rollup_norm, 0)),
        "n_cells": int(_lookup(cell_counts_df, 0)),
        "n_cells_normalized": float(_lookup(cell_counts_df_norm, 0)),
    }

    sub_nodes = list(node.subclasses())
    if sub_nodes:
        # "children" is only present on nodes that actually have subclasses.
        record["children"] = [traverse_with_counting(sub) for sub in sub_nodes]
    record["hasChildren"] = bool(sub_nodes)
    return record

def _descendants(cell_type):
    """Return the ids produced by ``entity.descendants()`` for ``cell_type``.

    ``cell_type`` is a colon-form id (e.g. ``CL:0000000``); it is looked up in
    the module-level ``ontology`` by its OBO PURL. Result ids are converted
    back to colon form. When the lookup finds nothing, ``[cell_type]`` is
    returned.
    """
    iri_suffix = cell_type.replace(":", "_")
    entity = ontology.search_one(iri=f"http://purl.obolibrary.org/obo/{iri_suffix}")
    if not entity:
        return [cell_type]
    return [term.name.replace("_", ":") for term in entity.descendants()]

def _ancestors(cell_type):
    """Return the ids produced by ``entity.ancestors()`` for ``cell_type``.

    ``cell_type`` is a colon-form id; it is looked up in the module-level
    ``ontology`` by its OBO PURL. Entries named ``"Thing"`` are skipped and
    the remaining names are converted back to colon form. When the lookup
    finds nothing, ``[cell_type]`` is returned.
    """
    iri_suffix = cell_type.replace(":", "_")
    entity = ontology.search_one(iri=f"http://purl.obolibrary.org/obo/{iri_suffix}")
    if not entity:
        return [cell_type]
    return [
        term.name.replace("_", ":")
        for term in entity.ancestors()
        if term.name != "Thing"
    ]

def _children(cell_type):
    """Return the ids produced by ``entity.subclasses()`` for ``cell_type``.

    ``cell_type`` is a colon-form id; it is looked up in the module-level
    ``ontology`` by its OBO PURL and the subclass names are converted back to
    colon form. When the lookup finds nothing, ``[cell_type]`` is returned.
    """
    iri_suffix = cell_type.replace(":", "_")
    entity = ontology.search_one(iri=f"http://purl.obolibrary.org/obo/{iri_suffix}")
    if not entity:
        # NOTE(review): an unknown term is reported as its own child here — confirm this is intended.
        return [cell_type]
    return [term.name.replace("_", ":") for term in entity.subclasses()]

def _parents(cell_type):
    """Return the ids of the named parent classes listed in ``entity.is_a``.

    ``cell_type`` is a colon-form id; it is looked up in the module-level
    ``ontology`` by its OBO PURL. Only ``is_a`` entries that are
    ``owlready2.ThingClass`` instances and are not named ``"Thing"`` are kept,
    and their names are converted back to colon form.

    Returns:
        list of parent ids, or an empty list when the term is not found in the
        ontology. Previously an unknown term raised ``AttributeError`` because
        ``is_a`` was read from ``None``.
    """
    cell_type_iri = cell_type.replace(":", "_")
    entity = ontology.search_one(iri=f"http://purl.obolibrary.org/obo/{cell_type_iri}")
    if not entity:
        # Unknown term: there is nothing to read `is_a` from, so report no parents.
        return []
    return [
        parent.name.replace("_", ":")
        for parent in entity.is_a
        if isinstance(parent, owlready2.ThingClass) and parent.name != "Thing"
    ]

def dfs(parents, end, start, node=None, path=None, all_paths=None):
    """Collect every path from ``end`` up to ``start`` by following ``parents``.

    Args:
        parents: mapping of node id -> list of parent ids; nodes missing from
            the mapping are treated as having no parents.
        end: node the walk begins at.
        start: node the walk is trying to reach.
        node: current node (internal; leave as ``None`` on the initial call).
        path: nodes walked so far, ``end`` first (internal; leave as ``None``).
        all_paths: list that receives each complete path, ordered from ``end``
            to ``start``. Pass your own list to read the results. A fresh list
            is created when omitted, so calls never share state.

    Returns:
        ``path`` when the current node is ``start`` (this is how a recursive
        call signals success to its caller), otherwise ``None``. Note that an
        initial call with ``end == start`` returns ``[end]`` without appending
        anything to ``all_paths``.
    """
    # A literal `[]` default would be created once and shared by every call
    # that omits `all_paths`, silently accumulating paths across calls.
    if all_paths is None:
        all_paths = []

    if path is None and node is None:
        path = [end]
        node = end

    if node == start:
        return path

    for parent in parents.get(node, []):
        full_path = dfs(parents, end, start, node=parent, path=path + [parent], all_paths=all_paths)
        if full_path:
            all_paths.append(full_path)
            
def truncate_graph(graph, valid_nodes):
    """Prune ``graph`` in place so only nodes whose id is in ``valid_nodes`` remain.

    Children that are not valid are removed. If a node keeps at least one
    child but lost others, an empty placeholder node (``id`` and ``name`` set
    to ``""``) is appended to stand in for the removed ones. A node left with
    no children has its ``children`` key removed.

    Args:
        graph: nested node dict with an ``id`` and optional ``children`` list.
        valid_nodes: container of ids that may stay in the tree.

    Returns:
        ``False`` if ``graph`` itself is not valid (it is left untouched),
        ``True`` otherwise.
    """
    if graph['id'] not in valid_nodes:
        return False

    kept = []
    dropped_any = False
    for child in graph.get("children", []):
        if truncate_graph(child, valid_nodes):
            kept.append(child)
        else:
            dropped_any = True

    if not kept:
        # Nothing survived: the node becomes childless.
        if 'children' in graph:
            del graph['children']
        return True

    if dropped_any:
        # Placeholder marking that some siblings were collapsed away.
        kept.append(
            {
                "id": "",
                "name": "",
                "n_cells_rollup": 0,
                "n_cells_rollup_normalized": 0,
                "n_cells": 0,
                "n_cells_normalized": 0,
                "hasChildren": False,
            }
        )
    graph['children'] = kept
    return True


def truncate_graph2(graph, visited_nodes_in_paths):
    """Show each term's children only the first time the term is encountered.

    Terms are identified by the part of ``id`` before ``"__"``. The first
    occurrence of a term that has children is recorded in the module-level
    ``nodesWithChildrenFound`` and left intact. On later occurrences of the
    same term, only children whose id is in ``visited_nodes_in_paths`` are
    kept; if some children were kept and others dropped, an empty placeholder
    node is appended, and if none were kept the ``children`` key is removed.

    The tree is modified in place and nothing is returned.
    ``nodesWithChildrenFound`` must be initialised by the caller.
    """
    global nodesWithChildrenFound

    base_id = graph['id'].split("__")[0]
    has_children = 'children' in graph

    if base_id in nodesWithChildrenFound:
        if has_children:
            original_children = graph['children']
            kept = [
                child
                for child in original_children
                if child['id'] in visited_nodes_in_paths
            ]
            if not kept:
                del graph['children']
            else:
                if len(kept) < len(original_children):
                    # Placeholder standing in for the collapsed children.
                    kept.append(
                        {
                            "id": "",
                            "name": "",
                            "n_cells_rollup": 0,
                            "n_cells_rollup_normalized": 0,
                            "n_cells": 0,
                            "n_cells_normalized": 0,
                            "hasChildren": False,
                        }
                    )
                graph['children'] = kept
    elif has_children:
        nodesWithChildrenFound.add(base_id)

    # Descend into whatever children remain, skipping placeholder nodes.
    for child in graph.get("children", []):
        if child['id'] != "":
            truncate_graph2(child, visited_nodes_in_paths)


def prune_node_distinguishers(graph):
    """Strip the ``__<k>`` suffix from every ``id`` in the tree, in place.

    Each node's ``id`` is replaced by the part before the first ``"__"``.
    Nodes are visited parent first, children in list order.
    """
    pending = [graph]
    while pending:
        current = pending.pop()
        current['id'] = current['id'].split('__')[0]
        # Push in reverse so the first child is the next node popped.
        pending.extend(reversed(current.get('children', [])))

def delete_unknown_terms(graph):
    """Remove, in place, every descendant whose ``name`` starts with ``'CL:'``.

    Such a child is dropped together with its whole subtree. A node left
    without children loses its ``children`` key. The root node itself is
    never removed.
    """
    named_children = [
        child
        for child in graph.get('children', [])
        if not child['name'].startswith('CL:')
    ]

    if named_children:
        graph['children'] = named_children
    elif 'children' in graph:
        del graph['children']

    # Only the surviving children need to be cleaned further.
    for child in named_children:
        delete_unknown_terms(child)
        
def truncate_graph_one_target(graph, target):
    """Keep children only on the ``target`` occurrence of a term, in place.

    The tree is walked parent first. When the node whose ``id`` equals
    ``target`` is reached, the module-level flag ``targetFound`` is set. From
    then on, every node visited that shares the target's base id (the part
    before ``"__"``) has its ``children`` removed.

    ``targetFound`` must be initialised (to ``False``) by the caller before
    the first call.

    Args:
        graph: nested node dict with an ``id`` and optional ``children`` list.
        target: full id, including the ``__<k>`` suffix, of the occurrence
            whose children should be preserved.
    """
    global targetFound
    if targetFound and graph['id'].split("__")[0] == target.split("__")[0]:
        # Leaf occurrences carry no "children" key, so remove it only if present
        # (a plain `del` raised KeyError on them).
        graph.pop('children', None)
    elif graph['id'] == target:
        targetFound = True

    for child in graph.get("children", []):
        truncate_graph_one_target(child, target)

def build_children(graph):
    """Record, for every node in the tree, the ids of its direct children.

    Results are written to the module-level dict ``all_children`` as
    ``node id -> list of child ids``; nodes without children map to an empty
    list. ``all_children`` must be initialised by the caller.
    """
    global all_children
    child_nodes = graph.get('children', [])
    all_children[graph['id']] = [child['id'] for child in child_nodes]

    for child in child_nodes:
        build_children(child)

def build_parents(graph):
    """Record, for every non-root node in the tree, the id of its parent.

    Results are written to the module-level dict ``all_parents`` as
    ``child id -> [parent id]`` (a one-element list). The root node gets no
    entry. ``all_parents`` must be initialised by the caller.
    """
    global all_parents
    for child in graph.get('children', []):
        all_parents[child['id']] = [graph['id']]
        build_parents(child)
        





# Build ontology tree JSON

In [4]:
# Read the Census summary table of cell counts. The handle is opened in a
# `with` block so it is closed as soon as the table is in memory (it was
# previously left open for the life of the kernel).
# NOTE: no `census_version` is passed, so the release that is read depends on
# when the cell runs; pass `census_version=...` to pin it for reproducibility.
with cellxgene_census.open_soma() as census:
    c = census['census_info']['summary_cell_counts'].read().concat().to_pandas()

# Keep only Cell Ontology (CL:) rows and total the unique cell count per term.
is_cl_term = c['ontology_term_id'].str.startswith('CL:', na=False)
cell_counts_df = (
    c[is_cl_term]
    .groupby('ontology_term_id')
    .sum(numeric_only=True)[['unique_cell_count']]
    .rename(columns={'unique_cell_count': 'n_cells'})
)

# Move the term id out of the index into an ordinary column.
cell_counts_df['cell_type_ontology_term_id'] = cell_counts_df.index.values
cell_counts_df = cell_counts_df.reset_index(drop=True)

The "stable" Census version is not yet available. Using "latest" Census version instead.
The "latest" release is currently 2023-05-08. Specify 'census_version="2023-05-08"' in future calls to open_soma() to ensure data consistency.


In [5]:
# NOTE: this cell rebinds `cell_counts_df` from a DataFrame to a Series, so it
# cannot be re-run on its own; re-run the cell that creates `cell_counts_df` first.

# Every CL term known to the label table, as one-entry {id: label} dicts.
all_cell_types = [{k: ontology_term_label(k)} for k in ontology_term_id_labels if k.startswith('CL:')]
all_cell_types_ids = [term_id for entry in all_cell_types for term_id in entry]

# Terms with no counts yet get an explicit zero row. A set makes the
# membership test O(1) per term instead of a scan over the whole column.
counted_ids = set(cell_counts_df['cell_type_ontology_term_id'].values)
to_attach = pd.DataFrame(
    {'cell_type_ontology_term_id': [i for i in all_cell_types_ids if i not in counted_ids]}
)
to_attach['n_cells'] = 0

cell_counts_df = pd.concat([cell_counts_df, to_attach], axis=0)
# The rollup must be computed from the DataFrame form, before it is reduced to a Series.
cell_counts_df_rollup = rollup_across_cell_type_descendants(cell_counts_df).set_index('cell_type_ontology_term_id')['n_cells']
cell_counts_df = cell_counts_df.set_index('cell_type_ontology_term_id')['n_cells']

# Scale both count series by their own maximum.
cell_counts_df_rollup_norm = cell_counts_df_rollup / cell_counts_df_rollup.max()
cell_counts_df_norm = cell_counts_df / cell_counts_df.max()
id_to_name = pd.Series(index=cell_counts_df.index, data=[ontology_term_label(i) for i in cell_counts_df.index])

# NOTE: the URL resolves to the latest cl-basic.owl release, so the tree can
# differ between runs.
ontology = owlready2.get_ontology("https://github.com/obophenotype/cell-ontology/releases/latest/download/cl-basic.owl")
ontology.load()

root_node = ontology.world["http://purl.obolibrary.org/obo/CL_0000000"]

# Reset the traversal bookkeeping, then build the full tree from the root.
traverse_node_counter = {}
all_unique_nodes = set()
a = traverse_with_counting(root_node)
all_unique_nodes = list(all_unique_nodes)

In [6]:
# Start from empty lookup tables, then fill them from the full tree `a`:
# `all_children` maps node id -> child ids, `all_parents` maps child id -> [parent id].
all_children, all_parents = {}, {}
build_children(a)
build_parents(a)

In [7]:
# Build one pruned copy of the full tree `a` per cell type.
start_node = 'CL_0000000__0'

all_graphs_per_cell_type = {}
for idx, cell_type_id in enumerate(all_cell_types_ids):
    if idx % 100 == 0:
        print(idx)

    end_node = cell_type_id.replace(":", "_")
    if end_node not in traverse_node_counter:
        # Term is absent from the traversed ontology tree.
        continue

    # One id per occurrence of this term in the tree.
    occurrence_ids = [end_node + "__" + str(k) for k in range(traverse_node_counter[end_node])]

    # One root-to-occurrence path per occurrence; an occurrence with no path
    # found contributes just itself.
    all_paths = []
    for occurrence_id in occurrence_ids:
        paths = []
        dfs(all_parents, occurrence_id, start_node, all_paths=paths)
        if paths:
            all_paths.append(paths[0][::-1])
        else:
            all_paths.append([occurrence_id])

    ### RULES ###
    # 1. We only want to show terms that are CHILDREN, GRANDCHILDREN, SIBLINGS OF TARGET, or IN A PATH TO TARGET
    # Sets are used because the truncation helpers test membership once per
    # tree node; list scans made that quadratic.
    visited_nodes_in_paths = {node_id for path in all_paths for node_id in path}

    children1 = all_children.get(end_node + "__0", [])  # children
    children2 = [
        grandchild
        for child in children1
        for grandchild in all_children.get(child, [])
    ]  # grandchildren
    siblings = {
        sibling
        for occurrence_id in occurrence_ids
        for parent in all_parents.get(occurrence_id, [])
        for sibling in all_children.get(parent, [])
    }  # siblings

    valid_nodes = visited_nodes_in_paths | set(children1) | set(children2) | siblings

    # Work on a private copy so the full tree `a` is never modified.
    a_copy = json.loads(json.dumps(a))
    truncate_graph(a_copy, valid_nodes)

    # truncate_graph2 reads this module-level set; reset it for every cell type.
    nodesWithChildrenFound = set()
    truncate_graph2(a_copy, visited_nodes_in_paths)

    prune_node_distinguishers(a_copy)

    delete_unknown_terms(a_copy)

    all_graphs_per_cell_type[end_node] = a_copy

0 CL:0000000
1 CL:0000540
2 CL:0000101
3 CL:0000001
4 CL:0000010
5 CL:0000002
6 CL:0000003
7 CL:0000004
8 CL:0000005
9 CL:0000057
10 CL:0000333
11 CL:0000008
12 CL:0000006
13 CL:0000197
14 CL:0000007
15 CL:0002321
16 CL:0000009
17 CL:0000578
18 CL:0000011
19 CL:0000012
20 CL:0000013
21 CL:0000014
22 CL:0000039
23 CL:0000034
24 CL:0000015
25 CL:0000586
26 CL:0000016
27 CL:0000017
28 CL:0000020
29 CL:0000018
30 CL:0000413
31 CL:0000657
32 CL:0000019
33 CL:0000408
34 CL:0000021
35 CL:0000022
36 CL:0000023
37 CL:0000548
38 CL:0000024
39 CL:0000670
40 CL:0000025
41 CL:0000675
42 CL:0000026
43 CL:0000412
44 CL:4029002
45 CL:0000722
46 CL:0000027
47 CL:0000192
48 CL:0000028
49 CL:0000338
50 CL:0000029
51 CL:0002676
52 CL:0000030
53 CL:0000055
54 CL:0000133
55 CL:0000031
56 CL:0000032
57 CL:0000114
58 CL:0000033
59 CL:0000151
60 CL:0011115
61 CL:0000035
62 CL:0000723
63 CL:0000036
64 CL:0000037
65 CL:0000988
66 CL:0008001
67 CL:0011026
68 CL:0000566
69 CL:0000038
70 CL:0000764
71 CL:0000839
72

556 CL:0005002
557 CL:0000431
558 CL:0005001
559 CL:0000432
560 CL:0000433
561 CL:0000434
562 CL:0000435
563 CL:0000436
564 CL:0000437
565 CL:0000438
566 CL:0000639
567 CL:0000439
568 CL:0000440
569 CL:0000441
570 CL:0000442
571 CL:0000443
572 CL:0000444
573 CL:0000737
574 CL:0000445
575 CL:0000446
576 CL:0002260
577 CL:0000448
578 CL:0002334
579 CL:0000449
580 CL:0002335
581 CL:0000450
582 CL:0000451
583 CL:0000842
584 CL:0000452
585 CL:0000453
586 CL:0000990
587 CL:0000454
588 CL:0000455
589 CL:0000456
590 CL:0000459
591 CL:0000460
592 CL:0000461
593 CL:0000648
594 CL:0000462
595 CL:0000464
596 CL:0010021
597 CL:0000466
598 CL:0000467
599 CL:0000468
600 CL:0000469
601 CL:0011020
602 CL:0000470
603 CL:0000471
604 CL:0000472
605 CL:0000474
606 CL:0002520
607 CL:0000475
608 CL:0000476
609 CL:0000477
610 CL:0000500
611 CL:2000064
612 CL:0000478
613 CL:0000479
614 CL:0000480
615 CL:0000481
616 CL:0000482
617 CL:0000483
618 CL:0000484
619 CL:0000485
620 CL:0000486
621 CL:0000487
622 CL:000

1098 CL:0000944
1099 CL:0000947
1100 CL:0000974
1101 CL:0000948
1102 CL:0000972
1103 CL:0000949
1104 CL:0000950
1105 CL:0000951
1106 CL:0000975
1107 CL:0000952
1108 CL:0000957
1109 CL:0000953
1110 CL:0000955
1111 CL:0000956
1112 CL:0000958
1113 CL:0000959
1114 CL:0000960
1115 CL:0000961
1116 CL:0002056
1117 CL:0000962
1118 CL:0000963
1119 CL:0000965
1120 CL:0000964
1121 CL:0000966
1122 CL:0000967
1123 CL:0001053
1124 CL:0000969
1125 CL:0000970
1126 CL:0000971
1127 CL:0000973
1128 CL:0000976
1129 CL:0000977
1130 CL:0000978
1131 CL:0000979
1132 CL:0000981
1133 CL:0000982
1134 CL:0000983
1135 CL:0000984
1136 CL:0000985
1137 CL:0000986
1138 CL:0000987
1139 CL:0000989
1140 CL:0000991
1141 CL:0000992
1142 CL:0000993
1143 CL:0000994
1144 CL:0000995
1145 CL:0001021
1146 CL:0001026
1147 CL:0001060
1148 CL:0000996
1149 CL:0000997
1150 CL:0000999
1151 CL:0000998
1152 CL:0002010
1153 CL:0002465
1154 CL:0001000
1155 CL:0001001
1156 CL:0001002
1157 CL:0001003
1158 CL:0001004
1159 CL:0001005
1160 CL:

1617 CL:0002403
1618 CL:0002405
1619 CL:0002406
1620 CL:0002407
1621 CL:0002408
1622 CL:0002409
1623 CL:0002410
1624 CL:0002411
1625 CL:0002412
1626 CL:0002413
1627 CL:0002414
1628 CL:0002415
1629 CL:0002416
1630 CL:0002423
1631 CL:0002424
1632 CL:0002426
1633 CL:0002438
1634 CL:0002427
1635 CL:0002428
1636 CL:0002429
1637 CL:0002430
1638 CL:0002431
1639 CL:0002432
1640 CL:0002433
1641 CL:0002434
1642 CL:0002435
1643 CL:0002439
1644 CL:0002440
1645 CL:0002441
1646 CL:0002442
1647 CL:0002443
1648 CL:0002444
1649 CL:0002445
1650 CL:0002446
1651 CL:0002447
1652 CL:0002448
1653 CL:0002449
1654 CL:0002450
1655 CL:0002451
1656 CL:0002452
1657 CL:0002454
1658 CL:0002455
1659 CL:0002456
1660 CL:0002457
1661 CL:0002458
1662 CL:0002461
1663 CL:0002459
1664 CL:0002460
1665 CL:0002462
1666 CL:0002463
1667 CL:0002464
1668 CL:0002466
1669 CL:0002467
1670 CL:0002468
1671 CL:0002469
1672 CL:0002470
1673 CL:0002471
1674 CL:0002472
1675 CL:0002473
1676 CL:0002474
1677 CL:0002475
1678 CL:0002476
1679 CL:

2132 CL:0011110
2133 CL:0011111
2134 CL:0011112
2135 CL:0011113
2136 CL:0011114
2137 CL:0013000
2138 CL:0017000
2139 CL:0017001
2140 CL:0017002
2141 CL:0017003
2142 CL:1000296
2143 CL:0017004
2144 CL:0017501
2145 CL:0019001
2146 CL:0019002
2147 CL:0019003
2148 CL:0019015
2149 CL:0019017
2150 CL:0019020
2151 CL:0019021
2152 CL:1000398
2153 CL:0019022
2154 CL:0019026
2155 CL:0019028
2156 CL:0019029
2157 CL:0019031
2158 CL:0700009
2159 CL:4030005
2160 CL:1000001
2161 CL:1000022
2162 CL:1000507
2163 CL:1000036
2164 CL:1000042
2165 CL:1000050
2166 CL:1000073
2167 CL:1000083
2168 CL:1000085
2169 CL:1000090
2170 CL:1000123
2171 CL:1000143
2172 CL:1000147
2173 CL:1000155
2174 CL:1000182
2175 CL:1000217
2176 CL:1000222
2177 CL:1000223
2178 CL:1000236
2179 CL:1000239
2180 CL:1000245
2181 CL:1000271
2182 CL:1000274
2183 CL:1000275
2184 CL:1000276
2185 CL:1000277
2186 CL:1000278
2187 CL:1000280
2188 CL:1000281
2189 CL:1000282
2190 CL:1000283
2191 CL:1000284
2192 CL:1000285
2193 CL:1000286
2194 CL:

2649 CL:4023048
2650 CL:4023049
2651 CL:4023050
2652 CL:4023051
2653 CL:4023058
2654 CL:4023052
2655 CL:4023053
2656 CL:4023054
2657 CL:4023055
2658 CL:4023056
2659 CL:4023065
2660 CL:4023066
2661 CL:4023067
2662 CL:4023069
2663 CL:4023070
2664 CL:4023075
2665 CL:4023077
2666 CL:4023078
2667 CL:4023080
2668 CL:4023081
2669 CL:4023088
2670 CL:4023089
2671 CL:4023090
2672 CL:4023092
2673 CL:4023093
2674 CL:4023094
2675 CL:4023095
2676 CL:4023097
2677 CL:4023098
2678 CL:4023099
2679 CL:4023100
2680 CL:4023101
2681 CL:4023102
2682 CL:4023103
2683 CL:4023104
2684 CL:4023105
2685 CL:4023106
2686 CL:4023107
2687 CL:4023108
2688 CL:4023109
2689 CL:4023110
2690 CL:4023112
2691 CL:4023113
2692 CL:4023114
2693 CL:4023115
2694 CL:4023116
2695 CL:4023118
2696 CL:4023119
2697 CL:4023122
2698 CL:4023123
2699 CL:4023130
2700 CL:4023124
2701 CL:4023125
2702 CL:4023127
2703 CL:4023128
2704 CL:4023129
2705 CL:4023158
2706 CL:4023159
2707 CL:4023160
2708 CL:4023161
2709 CL:4023162
2710 CL:4023163
2711 CL:

In [8]:
# Write through a context manager so the file is flushed and closed as soon as
# the dump finishes, instead of leaving an open handle for the garbage collector.
with open('allOntologyViews.json', 'w') as f:
    json.dump(all_graphs_per_cell_type, f)