In [10]:
import tiledb
import pandas as pd
from pandas import DataFrame

from backend.wmg.data.ontology_labels import ontology_term_label, gene_term_label
from pronto import Ontology
import pygraphviz as pgv
from collections import defaultdict
from typing import Dict, List, Any, Iterable, Set
import json
import requests
import os
import yaml
from collections import OrderedDict

from backend.wmg.data.snapshot import (
    CELL_TYPE_ORDERINGS_FILENAME,
    EXPRESSION_SUMMARY_CUBE_NAME,
    PRIMARY_FILTER_DIMENSIONS_FILENAME,
)

from math import isnan



# TileDB configuration used when reading arrays from S3.
config = tiledb.Config()
config["vfs.s3.scheme"] = "https"  # S3 scheme setting of the TileDB VFS
config["vfs.s3.region"] = "us-west-2"  # S3 region setting of the TileDB VFS
# Optional overrides, currently disabled:
# config["vfs.s3.endpoint_override"] = ""
# config["vfs.s3.use_virtual_addressing"] = True
# Context built from the configuration above; it is handed to tiledb.open() when the cell counts are read.
tdb_ctx = tiledb.Ctx(config=config)

Emanuele's function to flatten cell types

In [2]:
# Load the Uberon "basic" ontology through pronto's OBO-library loader.
# NOTE(review): presumably this downloads the file, so the cell needs network access; confirm.
onto = Ontology.from_obo_library("uberon/basic.obo")

  process_clause_header(clause, metadata, self.ont)
  process_clause_header(clause, metadata, self.ont)
  process_clause_header(clause, metadata, self.ont)
  process_clause_header(clause, metadata, self.ont)
  process_clause_header(clause, metadata, self.ont)
  process_clause_header(clause, metadata, self.ont)
  process_clause_header(clause, metadata, self.ont)
  process_clause_header(clause, metadata, self.ont)
  process_clause_header(clause, metadata, self.ont)
  process_clause_header(clause, metadata, self.ont)
  process_clause_header(clause, metadata, self.ont)
  process_clause_header(clause, metadata, self.ont)
  process_clause_header(clause, metadata, self.ont)
  process_clause_header(clause, metadata, self.ont)
  process_clause_header(clause, metadata, self.ont)
  process_clause_header(clause, metadata, self.ont)
  process_clause_header(clause, metadata, self.ont)
  process_clause_header(clause, metadata, self.ont)
  process_clause_header(clause, metadata, self.ont)
  process_cl

In [3]:
# Explore the direct subclasses and the relationships of two Uberon terms.
# Only the last expression of a cell is displayed, so the results of the first
# four statements are computed and then discarded.
onto["UBERON:0002048"].subclasses(with_self=False, distance=1)
[c for c in onto["UBERON:0002048"].subclasses(with_self=False, distance=1)]
list(onto["UBERON:0002048"].subclasses(with_self=False, distance=1))
list(onto["UBERON:0002107"].relationships)

# NOTE(review): the recorded output of this cell ends with KeyError: 'NCBITaxon:9606',
# presumably raised while resolving the objects of "part_of"; confirm before relying on it.
list(onto['UBERON:0002107'].objects(onto.get_relationship("part_of")))


  list(onto['UBERON:0002107'].objects(onto.get_relationship("part_of")))


KeyError: 'NCBITaxon:9606'

In [4]:
def generate_cell_ordering(snapshot_path: str, cell_type_by_tissue: Dict) -> None:
    """
    Use graphviz to map all the cells associated with a tissue to the ontology tree and write their
    correct order, as one nested tree per tissue, to a JSON file.

    The cell ontology ("cl-basic.obo") is loaded through pronto. For each tissue, the ancestors of the
    tissue's cell types are laid out with graphviz ``dot``. Starting from "CL:0000003", the children of
    every term are visited by increasing x coordinate, and each visited term that belongs to the tissue
    is inserted into a nested ``OrderedDict``.

    :param snapshot_path: directory in which ``CELL_TYPE_ORDERINGS_FILENAME`` is written.
    :param cell_type_by_tissue: maps a tissue id to an iterable of cell type ontology term ids.
    :return: None. The mapping ``{tissue: nested tree}`` is written as JSON.
    """
    # Note: those dependencies are only needed by the WMG pipeline, so we should keep them local
    # so that this file can be imported by tests without breaking.
    from pronto import Ontology
    import pygraphviz as pgv

    onto = Ontology.from_obo_library("cl-basic.obo")

    def compute_ordering(cells, root):
        """Return the nested tree of `cells`, walking the ontology downwards from `root`."""
        # Ids that are not present in the ontology are ignored.
        ancestors = [list(onto[t].superclasses()) for t in cells if t in onto]
        ancestors = [i for s in ancestors for i in s]
        ancestors = set(ancestors)

        # Graph restricted to the ancestors: one edge per direct parent -> child link.
        G = pgv.AGraph()
        for a in ancestors:
            for s in a.subclasses(with_self=False, distance=1):
                if s in ancestors:
                    G.add_edge(a.id, s.id)

        G.layout(prog="dot")

        # (x, y) position assigned by dot to every node of the graph.
        positions = {}
        for n in G.iternodes():
            pos = n.attr["pos"].split(",")
            positions[n] = (float(pos[0]), float(pos[1]))

        # Sets give constant-time membership tests inside the recursion.
        ancestor_ids = {a.id for a in ancestors}
        cell_ids = set(cells)

        def recurse(node, current_tree, depth = 3):
            # Unless stated otherwise below, descendants are attached at the current level.
            tree_to_pass = current_tree
            if node in cell_ids:
                current_tree[node] = OrderedDict()
                # Nest the descendants under this node only while the depth budget lasts,
                # and never under "CL:0000003".
                if depth > 0 and node != "CL:0000003":
                    depth -= 1
                    tree_to_pass = current_tree[node]

            children = [
                (c, positions[c.id]) for c in onto[node].subclasses(with_self=False, distance=1) if c.id in ancestor_ids
            ]
            # Siblings are visited from left to right in the dot layout.
            sorted_children = sorted(children, key=lambda x: x[1][0])
            for child in sorted_children:
                recurse(child[0].id, tree_to_pass, depth=depth)

        tree = OrderedDict()
        recurse(root, tree)
        return tree

    # Fix: the accumulator was created as `xtrees` but used as `trees`, which raised NameError.
    trees = {}
    for tissue, cell_df in cell_type_by_tissue.items():
        cells = list(cell_df)
        trees[tissue] = compute_ordering(cells, "CL:0000003")

    with open(f"{snapshot_path}/{CELL_TYPE_ORDERINGS_FILENAME}", "w") as tree_json_file:
        json.dump(trees, tree_json_file)


In [5]:

def create_child(id_name, name = ""):
    """Return a new tree node: the given id and name plus an empty list of children."""
    node = dict(id=id_name, name=name)
    node["children"] = []
    return node

def generate_cell_ordering(snapshot_path: str, cell_type_by_tissue: Dict) -> None:
    """
    Use graphviz to map all the cells associated with a tissue to the ontology tree and write their
    correct order, as one ``{"id", "name", "children"}`` tree per tissue, to a JSON file.

    The cell ontology ("cl-basic.obo") is loaded through pronto. For each tissue, the ancestors of the
    tissue's cell types are laid out with graphviz ``dot``. Starting from "CL:0000003", the children of
    every term are visited by increasing x coordinate; each cell type of the tissue is added to the
    tree the first time it is reached.

    :param snapshot_path: directory in which ``CELL_TYPE_ORDERINGS_FILENAME`` is written.
    :param cell_type_by_tissue: maps a tissue id to an iterable of cell type ontology term ids.
    :return: None. The mapping ``{tissue: tree}`` is written as indented JSON.
    """
    # Note: those dependencies are only needed by the WMG pipeline, so we should keep them local
    # so that this file can be imported by tests without breaking.
    from pronto import Ontology
    import pygraphviz as pgv

    onto = Ontology.from_obo_library("cl-basic.obo")

    def compute_ordering(cells, root, tree_name):
        """Return the tree named `tree_name` for `cells`; `cells` is a set and is consumed."""
        # Ids that are not present in the ontology are ignored.
        ancestors = [list(onto[t].superclasses()) for t in cells if t in onto]
        ancestors = [i for s in ancestors for i in s]
        ancestors = set(ancestors)

        # Graph restricted to the ancestors: one edge per direct parent -> child link.
        G = pgv.AGraph()
        for a in ancestors:
            for s in a.subclasses(with_self=False, distance=1):
                if s in ancestors:
                    G.add_edge(a.id, s.id)

        G.layout(prog="dot")

        # (x, y) position assigned by dot to every node of the graph.
        positions = {}
        for n in G.iternodes():
            pos = n.attr["pos"].split(",")
            positions[n] = (float(pos[0]), float(pos[1]))

        # A set gives constant-time membership tests inside the recursion.
        ancestor_ids = {a.id for a in ancestors}

        # Fix: the annotation used `Union`, which is not imported, so defining `recurse`
        # raised NameError; `node` is a single term id, not a set.
        def recurse(node: str, current_tree: Dict[str, Any], depth = 4):

            # Unless stated otherwise below, descendants are attached at the current level.
            tree_to_pass = current_tree

            if node in cells:

                # Removing the id makes every cell type appear only once in the tree,
                # even when it can be reached through several parents.
                cells.remove(node)
                current_tree["children"].append(create_child(node, onto[node].name))

                # Skip indenting if we pass the threshold or if we find "CL:0000003":
                # cells annotated with it are unknown, but in the ontology it is the
                # root of most cell types.
                if depth > 1 and node != "CL:0000003":
                    depth -= 1
                    tree_to_pass = current_tree["children"][-1]

            children = [
                (c, positions[c.id]) for c in onto[node].subclasses(with_self=False, distance=1) if c.id in ancestor_ids
            ]
            # Siblings are visited from left to right in the dot layout.
            sorted_children = sorted(children, key=lambda x: x[1][0])
            for child in sorted_children:
                recurse(child[0].id, tree_to_pass, depth=depth)

        tree = create_child(tree_name)
        recurse(root, tree)
        return tree

    trees = {}
    for tissue, cell_df in cell_type_by_tissue.items():
        cells = set(list(cell_df))
        trees[tissue] = compute_ordering(cells, "CL:0000003", tissue)

    with open(f"{snapshot_path}/{CELL_TYPE_ORDERINGS_FILENAME}", "w") as tree_json_file:
        json.dump(trees, tree_json_file, indent=2)


In [11]:
def generate_cell_ordering(snapshot_path: str, cell_type_by_tissue: Dict) -> pd.DataFrame:
    """
    Use graphviz to map all the cells associated with a tissue to the ontology tree and return their correct order

    The cell ontology ("cl-basic.obo") is loaded through pronto. For each tissue, the ancestors of the
    tissue's cell types are laid out with graphviz ``dot``. Starting from "CL:0000003", the children of
    every term are visited by increasing x coordinate; each cell type of the tissue is emitted, with
    its nesting depth, the first time it is reached. Cell types that are never reached from
    "CL:0000003" (for instance ids that are not in the ontology) are left out.

    :param snapshot_path: directory in which ``CELL_TYPE_ORDERINGS_FILENAME`` is written (as JSON).
    :param cell_type_by_tissue: maps a tissue id to an iterable of cell type ontology term ids.
    :return: DataFrame with columns ``tissue_ontology_term_id``, ``cell_type_ontology_term_id``,
        ``depth`` and ``order`` (position of the cell type within its tissue).
    """
    # Note: those dependencies are only needed by the WMG pipeline, so we should keep them local
    # so that this file can be imported by tests without breaking.
    from pronto import Ontology
    import pygraphviz as pgv

    root_id = "CL:0000003"
    onto = Ontology.from_obo_library("cl-basic.obo")

    def compute_ordering(cells, root):
        """Return the ordered list of {"id", "depth"} records for `cells`."""
        # Work on a private set: a cell type listed twice is emitted only once, and the
        # caller's collection is left untouched.
        remaining = set(cells)

        # Ids that are not present in the ontology are ignored.
        ancestors = {a for t in remaining if t in onto for a in onto[t].superclasses()}
        ancestor_ids = {a.id for a in ancestors}

        # Insert the edges in sorted order so that the graph handed to dot does not depend
        # on the iteration order of the `ancestors` set.
        edges = sorted(
            (a.id, s.id) for a in ancestors for s in a.subclasses(with_self=False, distance=1) if s in ancestors
        )

        G = pgv.AGraph()
        for parent_id, child_id in edges:
            G.add_edge(parent_id, child_id)

        G.layout(prog="dot")

        # (x, y) position assigned by dot to every node of the graph.
        positions = {}
        for n in G.iternodes():
            pos = n.attr["pos"].split(",")
            positions[n] = (float(pos[0]), float(pos[1]))

        def recurse(node: str, depth=0):

            if node in remaining:

                # Emit every cell type once, even when it is reachable through several parents.
                remaining.remove(node)
                yield {"id": node, "depth": depth}

                # Descendants are indented below an emitted node, except below "CL:0000003".
                if node != root_id:
                    depth += 1

            children = [
                (c, positions[c.id]) for c in onto[node].subclasses(with_self=False, distance=1) if c.id in ancestor_ids
            ]
            # Siblings are visited from left to right in the dot layout.
            for child, _ in sorted(children, key=lambda x: x[1][0]):
                yield from recurse(child.id, depth=depth)

        return list(recurse(root))

    # One record per (tissue, cell type), numbered in visiting order within the tissue.
    data = [
        (tissue, cell["id"], cell["depth"], i)
        for tissue, cell_df in cell_type_by_tissue.items()
        for i, cell in enumerate(compute_ordering(list(cell_df), root_id))
    ]

    df = pd.DataFrame(data, columns=["tissue_ontology_term_id", "cell_type_ontology_term_id", "depth", "order"])
    df.to_json(f"{snapshot_path}/{CELL_TYPE_ORDERINGS_FILENAME}")

    return df

Download and load ontology

Test with lists of cell types for two tissues: lung (`UBERON:0002048`) and `UBERON:0000178`

In [12]:
# Hand-written test fixture: tissue ontology term id -> list of cell type ontology term ids.
# NOTE(review): "CL:000263" (present in both lists) has six digits while every other id has
# seven; this looks like a typo. Confirm the intended id.
cell_types = {"UBERON:0002048": 
              ["CL:0000003", "CL:0000115", "CL:0002139", "CL:0002138",
                "CL:0000071", "CL:0002144", "CL:0002543", "CL:1000413",
                "CL:2000016", "CL:0000077", "CL:0002503", "CL:0000669",
                "CL:0000186", "CL:0000192", "CL:0000359", "CL:0002598",
                "CL:0000646", "CL:000263", "CL:0002632", "CL:0000158",
                "CL:0000499", "CL:0000138", "CL:0000057", "CL:0002241",
                "CL:0000165", "CL:1000223", "CL:0000556", "CL:0000763",
                "CL:0000595", "CL:0000766", "CL:0000767", "CL:0000775",
                "CL:0000576", "CL:0002393", "CL:0002397", "CL:0002057",
                "CL:0000875", "CL:0000860", "CL:0000097", "CL:0000235",
                "CL:0000583", "CL:0002399", "CL:0001057", "CL:0000451",
                "CL:0001058", "CL:0000784", "CL:0000542", "CL:0000084",
                "CL:0000800", "CL:0000895", "CL:0000900", "CL:0001044",
                "CL:0001050", "CL:0000905", "CL:0000913", "CL:0000791",
                "CL:0000624", "CL:0000625", "CL:0000814", "CL:0000236",
                "CL:0000786", "CL:0001065", "CL:0000623", "CL:0005025",
                "CL:0000160", "CL:0002370", "CL:1000143", "CL:0000064",
                "CL:0000067", "CL:1000271", "CL:0000066", "CL:0000076",
                "CL:0002062", "CL:0002063", "CL:0000151", "CL:0000319",
                "CL:0019001", "CL:1000331"],
             "UBERON:0000178": 
              ["CL:0000003", "CL:0000115", "CL:0002139", "CL:0002138",
                "CL:0000071", "CL:0002144", "CL:0002543", "CL:1000413",
                "CL:2000016", "CL:0000077", "CL:0002503", "CL:0000669",
                "CL:0000186", "CL:0000192", "CL:0000359", "CL:0002598",
                "CL:0000646", "CL:000263", "CL:0002632", "CL:0000158",
                "CL:0000499", "CL:0000138", "CL:0000057", "CL:0002241",
                "CL:0000165", "CL:1000223", "CL:0000556", "CL:0000763",
                "CL:0000595", "CL:0000766", "CL:0000767", "CL:0000775",
                "CL:0000576", "CL:0002393", "CL:0002397", "CL:0002057",
                "CL:0000875", "CL:0000860", "CL:0000097", "CL:0000235"]}

In [13]:
# Compute the orderings for the fixture; "." is the directory that receives the orderings file.
all_trees = generate_cell_ordering(".", cell_types)

In [14]:
# Display the computed orderings.
all_trees

Unnamed: 0,tissue_ontology_term_id,cell_type_ontology_term_id,depth,order
0,UBERON:0002048,CL:0000003,0,0
1,UBERON:0002048,CL:0000542,0,1
2,UBERON:0002048,CL:0000084,1,2
3,UBERON:0002048,CL:0000800,2,3
4,UBERON:0002048,CL:0000905,2,4
...,...,...,...,...
111,UBERON:0000178,CL:1000413,3,34
112,UBERON:0000178,CL:0002138,2,35
113,UBERON:0000178,CL:0000192,0,36
114,UBERON:0000178,CL:0000359,1,37


In [42]:
# Sanity check: look up the name of "CL:0000003", the term the ordering starts from.
onto_test = Ontology.from_obo_library("cl-basic.obo")
onto_test["CL:0000003"].name

'native cell'

Let's recreate what happens in the API

In [43]:
# Coordinates used to slice the cell-counts array: one organism and one tissue.
organism = "NCBITaxon:9606"
tissue = "UBERON:0002048"

In [80]:
# Read the cell counts for the selected tissue and organism from the snapshot on S3.
# The array is opened in a `with` block so that it is closed as soon as the slice has
# been materialised as a DataFrame (it was previously left open).
# NOTE(review): the slice assumes the array dimensions are (tissue, organism), in that order; confirm.
with tiledb.open("s3://cellxgene-wmg-prod/1651599970/cell_counts/", ctx=tdb_ctx) as cell_counts_db:
    cell_counts = cell_counts_db.query(attrs=["cell_type_ontology_term_id", "n_cells"]).df[tissue, organism]

# One row per (tissue, cell type) pair, keeping the first n_cells value found for the pair.
distinct_tissues_cell_types = cell_counts.groupby(
    ["tissue_ontology_term_id", "cell_type_ontology_term_id"], as_index=False
).first()[["tissue_ontology_term_id", "cell_type_ontology_term_id", "n_cells"]]

distinct_tissues_cell_types

Unnamed: 0,tissue_ontology_term_id,cell_type_ontology_term_id,n_cells
0,UBERON:0002048,CL:0000003,4
1,UBERON:0002048,CL:0000057,473
2,UBERON:0002048,CL:0000064,88
3,UBERON:0002048,CL:0000066,669
4,UBERON:0002048,CL:0000067,36
...,...,...,...
70,UBERON:0002048,CL:1000271,747
71,UBERON:0002048,CL:1000331,6
72,UBERON:0002048,CL:1000413,11
73,UBERON:0002048,CL:1000491,29


In [62]:
cell_type_orderings = all_trees.copy()
cell_type_orderings_orig = all_trees.copy()

# Flag the ordering rows whose cell type has counts. `Series.isin` does the membership
# test in one vectorised call; the list of counted cell types used to be rebuilt for
# every row.
# NOTE: only the cell type id is compared, not the tissue.
cell_type_orderings["to_keep"] = cell_type_orderings["cell_type_ontology_term_id"].isin(
    distinct_tissues_cell_types["cell_type_ontology_term_id"]
)
cell_type_orderings

Unnamed: 0,tissue_ontology_term_id,cell_type_ontology_term_id,depth,order,to_keep
0,UBERON:0002048,CL:0000003,0,0,True
1,UBERON:0002048,CL:0000151,0,1,False
2,UBERON:0002048,CL:0000158,1,2,True
3,UBERON:0002048,CL:0002063,1,3,True
4,UBERON:0002048,CL:0019001,1,4,True
...,...,...,...,...,...
111,UBERON:0002,CL:0002057,3,34,False
112,UBERON:0002,CL:0000875,3,35,True
113,UBERON:0002,CL:0002393,3,36,True
114,UBERON:0002,CL:0000860,3,37,True


In [68]:
# Start again from untouched copies of the orderings, then attach the counts.
# The left join keeps every ordering row; rows without counts get a missing n_cells.
cell_type_orderings_orig = all_trees.copy()
cell_type_orderings = all_trees.copy()
joined = pd.merge(
    cell_type_orderings,
    distinct_tissues_cell_types,
    how="left",
    on=["tissue_ontology_term_id", "cell_type_ontology_term_id"],
)
joined

Unnamed: 0,tissue_ontology_term_id,cell_type_ontology_term_id,depth,order,n_cells
0,UBERON:0002048,CL:0000003,0,0,4.0
1,UBERON:0002048,CL:0000151,0,1,
2,UBERON:0002048,CL:0000158,1,2,641.0
3,UBERON:0002048,CL:0002063,1,3,3854.0
4,UBERON:0002048,CL:0019001,1,4,24.0
...,...,...,...,...,...
111,UBERON:0002,CL:0002057,3,34,
112,UBERON:0002,CL:0000875,3,35,
113,UBERON:0002,CL:0002393,3,36,
114,UBERON:0002,CL:0000860,3,37,


In [71]:
# Depth fix-up, done in place on `joined`: every row without counts is going to be
# removed, so the rows nested below it are moved one level up.
depth_col = joined.columns.get_loc("depth")
n_cells = joined.columns.get_loc("n_cells")
cell_col = joined.columns.get_loc("cell_type_ontology_term_id")

joined['depth'] = joined['depth'].astype('int')


for i in range(len(joined)):
    # Fix: this tested the undefined name `keep_col`; a row is dropped when its
    # n_cells value is missing.
    if isnan(joined.iloc[i, n_cells]):
        original_depth = joined.iloc[i, depth_col]
        # The rows nested below row i are the following rows that are deeper than it.
        for j in range(i + 1, len(joined)):
            if original_depth < joined.iloc[j, depth_col]:
                joined.iloc[j,depth_col] -= 1
            else:
                break

# Dump both versions for a side-by-side comparison.
joined.to_csv("cell_type_orderings_fixed.csv")
cell_type_orderings_orig.to_csv("cell_type_orderings_orig.csv")

In [76]:
# Show only the rows that have a cell count.
joined.loc[joined["n_cells"].notnull()]

Unnamed: 0,tissue_ontology_term_id,cell_type_ontology_term_id,depth,order,n_cells
0,UBERON:0002048,CL:0000003,0,0,4.0
2,UBERON:0002048,CL:0000158,0,2,641.0
3,UBERON:0002048,CL:0002063,0,3,3854.0
4,UBERON:0002048,CL:0019001,0,4,24.0
5,UBERON:0002048,CL:1000331,1,5,6.0
...,...,...,...,...,...
72,UBERON:0002048,CL:0002062,1,72,142.0
73,UBERON:0002048,CL:0000076,1,73,13.0
74,UBERON:0002048,CL:0000067,1,74,36.0
75,UBERON:0002048,CL:0000064,0,75,88.0


In [97]:
def fixed_depths(x):
    """
    Fix the depths of an ordered cell type table ahead of removing the rows without counts.

    For every row whose ``n_cells`` value is NaN, the rows that follow it and are deeper than
    it are moved one level up; the scan stops at the first row that is not deeper.

    :param x: DataFrame with at least the columns ``depth`` and ``n_cells``. It is modified
        in place: ``depth`` is cast to int and then adjusted.
    :return: the same DataFrame ``x``.
    """
    depth_col = x.columns.get_loc("depth")
    n_cells_col = x.columns.get_loc("n_cells")

    x['depth'] = x['depth'].astype('int')

    for i in range(len(x)):
        # Fix: this tested the undefined name `keep_col`; the rows to drop are the
        # ones without a cell count.
        if isnan(x.iloc[i, n_cells_col]):
            original_depth = x.iloc[i, depth_col]
            for j in range(i + 1, len(x)):
                if original_depth < x.iloc[j, depth_col]:
                    x.iloc[j, depth_col] -= 1
                else:
                    break

    return x

def build_ordered_cell_types_by_tissue(
    cell_counts, cell_type_orderings):
    """
    Restrict the cell type orderings to the cell types that have counts.

    :param cell_counts: DataFrame with the columns ``tissue_ontology_term_id``,
        ``cell_type_ontology_term_id`` and ``n_cells``.
    :param cell_type_orderings: DataFrame with the columns ``tissue_ontology_term_id``,
        ``cell_type_ontology_term_id`` and ``depth``, in display order.
    :return: the rows of ``cell_type_orderings`` that have counts, with ``n_cells`` attached
        and ``depth`` corrected for the removed rows.
    """

    # Find unique cell types and tissues with counts in snapshot
    distinct_tissues_cell_types = cell_counts.groupby(
        ["tissue_ontology_term_id", "cell_type_ontology_term_id"], as_index=False
    ).first()[["tissue_ontology_term_id", "cell_type_ontology_term_id", "n_cells"]]

    # Left join: ordering rows without counts get a missing n_cells.
    joined = cell_type_orderings.merge(
        distinct_tissues_cell_types, 
        on=["tissue_ontology_term_id", "cell_type_ontology_term_id"],
        how="left"
    )

    # Fix depths based on the rows that need to be removed
    joined = fixed_depths(joined)

    # Fix: the function used to end without a return statement and so returned None.
    return joined[joined['n_cells'].notnull()]

In [99]:
# Run the helper on the counts and orderings built in earlier cells, then dump the
# result for manual inspection (`.to_csv` needs the helper to return a DataFrame).
aaa = build_ordered_cell_types_by_tissue(cell_counts, cell_type_orderings)

aaa.to_csv("deleteme.csv")

In [120]:
def build_ordered_cell_types_by_tissue(
    cell_counts: DataFrame, cell_type_orderings: DataFrame
) -> Dict[str, List[Dict[str, str]]]:
    """
    Build, per tissue, the ordered list of cell types that have counts.

    :param cell_counts: frame with the columns ``tissue_ontology_term_id``,
        ``cell_type_ontology_term_id`` and ``n_cells``.
    :param cell_type_orderings: frame with the columns ``tissue_ontology_term_id``,
        ``cell_type_ontology_term_id`` and ``depth``, in display order.
    :return: mapping of tissue id to a list of entries with the keys
        ``cell_type_ontology_term_id``, ``cell_type`` (label) and ``depth``.
    """
    pair_columns = ["tissue_ontology_term_id", "cell_type_ontology_term_id"]

    # One row per (tissue, cell type) pair, keeping the first n_cells value of the pair.
    first_per_pair = cell_counts.groupby(pair_columns, as_index=False).first()
    distinct_tissues_cell_types: DataFrame = first_per_pair[pair_columns + ["n_cells"]]

    # Left join: ordering rows without counts get a missing n_cells.
    merged = cell_type_orderings.merge(distinct_tissues_cell_types, how="left", on=pair_columns)

    # Adjust the depths for the rows about to be dropped, then drop them.
    merged = build_ordered_cell_types_by_tissue_fix_depths(merged)
    with_counts = merged[merged["n_cells"].notnull()]

    structured_result: Dict[str, List[Dict[str, str]]] = defaultdict(list)
    rows = zip(
        with_counts["tissue_ontology_term_id"],
        with_counts["cell_type_ontology_term_id"],
        with_counts["depth"],
    )
    for tissue_id, cell_type_id, depth in rows:
        entry = {
            "cell_type_ontology_term_id": cell_type_id,
            "cell_type": ontology_term_label(cell_type_id),
            "depth": depth,
        }
        structured_result[tissue_id].append(entry)

    return structured_result


def build_ordered_cell_types_by_tissue_fix_depths(x):
    """
    Adjust the depths of the cell ontology tree for the cell types that are about to be
    removed because they have no counts.

    Each row whose ``n_cells`` is NaN lifts the deeper rows that directly follow it by one
    level. ``x`` is modified in place (``depth`` is cast to int) and returned.
    """
    depth_idx = x.columns.get_loc("depth")
    count_idx = x.columns.get_loc("n_cells")

    x["depth"] = x["depth"].astype("int")

    # Work on plain arrays and write the depths back once at the end.
    depths = x.iloc[:, depth_idx].to_numpy(copy=True)
    counts = x.iloc[:, count_idx].tolist()
    total = len(depths)

    for row, count in enumerate(counts):
        if not isnan(count):
            continue
        removed_depth = depths[row]
        # Lift the run of deeper rows that comes right after the removed row.
        follower = row + 1
        while follower < total and removed_depth < depths[follower]:
            depths[follower] -= 1
            follower += 1

    x["depth"] = depths
    return x

In [121]:
# Re-run with the dict-returning implementation and display the result.
# NOTE(review): the recorded output shows 'cell_type': None for every entry, so
# ontology_term_label returned None in that run; worth checking.
aaa = build_ordered_cell_types_by_tissue(cell_counts, cell_type_orderings)
aaa
# CSV export left disabled:
#aaa.to_csv("deleteme.csv")


defaultdict(list,
            {'UBERON:0002048': [{'cell_type_ontology_term_id': 'CL:0000003',
               'cell_type': None,
               'depth': 0},
              {'cell_type_ontology_term_id': 'CL:0000158',
               'cell_type': None,
               'depth': 0},
              {'cell_type_ontology_term_id': 'CL:0002063',
               'cell_type': None,
               'depth': 0},
              {'cell_type_ontology_term_id': 'CL:0019001',
               'cell_type': None,
               'depth': 0},
              {'cell_type_ontology_term_id': 'CL:1000331',
               'cell_type': None,
               'depth': 1},
              {'cell_type_ontology_term_id': 'CL:0000319',
               'cell_type': None,
               'depth': 0},
              {'cell_type_ontology_term_id': 'CL:0000160',
               'cell_type': None,
               'depth': 1},
              {'cell_type_ontology_term_id': 'CL:0002370',
               'cell_type': None,
               'depth': 

In [117]:
# Check what ontology_term_label returns for a tissue id.
ontology_term_label("UBERON:0002048")