In [8]:
import json
import numpy as np
from scipy import sparse

import cellxgene_census
import tiledbsoma as soma

census_version = "2023-10-30"
experiment_name = "homo_sapiens"
embedding_uri = "../../../../tmp/emb-cxg-contrib-0"

In [9]:
# Demo 1: select cells from the Census based on obs metadata. Given the resulting cells,
# use their soma_joinids to slice from embedding.

# First, query for the cells of interest based upon metadata.
with cellxgene_census.open_soma(census_version=census_version) as census:
    exp = census["census_data"][experiment_name]
    obs_df = (
        exp.obs.read(
            value_filter="""tissue_general == 'heart'""",
            column_names=[
                "soma_joinid",
                "cell_type",
                "tissue",
                "tissue_general",
                "nnz",
            ],
        )
        .concat()
        .to_pandas()
    )

display(obs_df)

Unnamed: 0,soma_joinid,cell_type,tissue,tissue_general,nnz
0,2613074,endothelial cell,apex of heart,heart,2467
1,2613075,mural cell,apex of heart,heart,2054
2,2613076,endothelial cell,apex of heart,heart,2426
3,2613077,mural cell,apex of heart,heart,1819
4,2613078,endothelial cell,apex of heart,heart,2330
...,...,...,...,...,...
3629947,62982880,endothelial cell,heart,heart,402
3629948,62982881,endothelial cell,heart,heart,515
3629949,62982882,endothelial cell,heart,heart,414
3629950,62982883,endothelial cell,heart,heart,300


In [10]:
# 1A: extract embedddings for the selected cells, into a SciPy csr_matrix

with soma.open(embedding_uri) as E:
    # read embedding metadata
    embedding_info = json.loads(E.metadata["CxG_contrib_metadata"])
    # read embedding as SciPy (positionally indexed)
    embedding_csr = sparse.vstack(
        sp
        for sp, _ in E.read(coords=(obs_df.index.to_numpy(),))
        .blockwise(axis=0, size=2**20)
        .scipy()
    )


# confirm this embedding is associated with this census version and experiment
assert embedding_info["census_version"] == census_version
assert embedding_info["experiment_name"] == experiment_name

display("Embedding CSR")
display(embedding_csr)

# Each cell embedding is positionally indexed a la' AnnData obsm
display("First cell metadata and embedding")
display(obs_df.iloc[0])
display(embedding_csr[0,:].toarray())



'Embedding CSR'

<3629952x8 sparse matrix of type '<class 'numpy.float32'>'
	with 29039616 stored elements in Compressed Sparse Row format>

'First cell metadata and embedding'

soma_joinid                2613074
cell_type         endothelial cell
tissue               apex of heart
tissue_general               heart
nnz                           2467
Name: 0, dtype: object

array([[0.43466, 0.63609, 0.14684, 0.68371, 0.3792 , 0.81642, 0.91252,
        0.89164]], dtype=float32)

In [11]:
# 1B: dense embedding - you can also densify the results using SciPy sparse.csr_matrix methods

embedding_dense = embedding_csr.toarray()
display(embedding_dense)

array([[0.43466, 0.63609, 0.14684, ..., 0.81642, 0.91252, 0.89164],
       [0.19788, 0.89287, 0.22958, ..., 0.54048, 0.22691, 0.01647],
       [0.96428, 0.19529, 0.58122, ..., 0.23629, 0.4647 , 0.16747],
       ...,
       [0.1045 , 0.45968, 0.57934, ..., 0.16972, 0.03613, 0.5094 ],
       [0.32792, 0.32506, 0.8825 , ..., 0.08321, 0.84946, 0.82772],
       [0.12771, 0.55175, 0.83483, ..., 0.90774, 0.29128, 0.91662]],
      dtype=float32)

todo:
* query to anndata, add embedding as obsm
* 