# Ingest CXG spatial h5ad files from a directory and create a Census object

## Create a manifest file listing the locations of the spatial datasets to load

In [1]:
import os
from pathlib import Path

from cellxgene_census_builder.build_soma.manifest import load_manifest

In [2]:
def create_manifest_csv_file(spatial_datasets_dir, manifest_file_path):
    """Write a manifest listing every ``.h5ad`` file found in a directory.

    Each manifest line has the form ``<dataset_id>, <path>``: the dataset id is
    the file name without its extension, and the path is
    ``spatial_datasets_dir`` joined with the file name. Lines are sorted by file
    name so that repeated runs produce the same manifest.

    Args:
        spatial_datasets_dir: Directory (str or path-like) holding the H5AD files.
        manifest_file_path: Destination file; overwritten if it already exists.
    """
    # List the directory exactly once so that each id and its path are derived
    # from the same entry, and skip anything that is not an H5AD file (e.g.
    # `.DS_Store`), which would otherwise end up as a bogus manifest row.
    h5ad_names = sorted(name for name in os.listdir(spatial_datasets_dir) if os.path.splitext(name)[1] == ".h5ad")
    manifest_lines = [
        ", ".join((os.path.splitext(name)[0], os.path.join(spatial_datasets_dir, name))) for name in h5ad_names
    ]

    with open(manifest_file_path, "w") as f:
        f.write("\n".join(manifest_lines).strip())

In [3]:
# H5AD assets to download, grouped by the CELLxGENE collection they belong to.
dataset_urls = [
    f"https://datasets.cellxgene.cziscience.com/{h5ad_stem}.h5ad"
    for h5ad_stem in (
        # https://cellxgene.cziscience.com/collections/e2c257e7-6f79-487c-b81c-39451cd4ab3c
        "6811c454-def2-4d9e-b360-aa8a69f843ce",
        "1fee1684-94b3-4371-8d44-6a8b937ba23f",
        # https://cellxgene.cziscience.com/collections/8191c283-0816-424b-9b61-c3e1d6258a77
        "17d9e43f-1251-4f1e-8a5b-a96f2c89e5ec",
        "5487249a-b034-446a-8d9b-0b810ecb7e91",
        # https://cellxgene.cziscience.com/collections/0c8a364b-97b5-4cc8-a593-23c38c6f0ac5
        "9e66d65a-385e-4097-9e61-2ad68a1a59fb",
        "f6fbb333-bf7a-4402-a3c0-ef2c894d4371",
    )
]

In [4]:
# # Alternative dataset list (currently disabled), from https://docs.google.com/document/d/1ZI_L83nVTEqRs5BoqmaZoXkt4CSd_p8IejIl1V3o5RQ/edit
# dataset_urls = [
#     # 'https://datasets.cellxgene.cziscience.com/c63d5cb4-1046-4948-a188-e6af50ef90f4.h5ad',  # old
#     "https://datasets.cellxgene.cziscience.com/3396c353-d720-4588-8724-75546e2f18cc.h5ad",
#     # 'https://datasets.cellxgene.cziscience.com/6ab91271-5f48-4e98-92ef-d02ee21e63e1.h5ad',  # old
#     "https://datasets.cellxgene.cziscience.com/c83e78ff-e13b-4531-ac9a-cd7d8d1d44ef.h5ad",
#     "https://datasets.cellxgene.cziscience.com/53e343af-979c-4525-a705-1b9d1a1fee14.h5ad",
#     "https://datasets.cellxgene.cziscience.com/9624a105-319c-4abf-b10b-d96ce1650100.h5ad",
#     "https://datasets.cellxgene.cziscience.com/c6f6e674-b59d-46cf-8525-73f64f9eef8c.h5ad",
#     "https://datasets.cellxgene.cziscience.com/1bb92cf8-ab3f-4bb0-a722-b241b5d377ed.h5ad",
#     "https://datasets.cellxgene.cziscience.com/fa3893cb-d420-42ac-8263-09719a26102e.h5ad",
#     "https://datasets.cellxgene.cziscience.com/07998bf8-d070-41bb-a584-f8bdd1193aef.h5ad",
# ]

In [5]:
# NOTE: Adjust these paths for your machine. They are relative, so they are
# interpreted against the current working directory of the notebook kernel.

# Repository layout
REPO_ROOT_DIR = Path("..", "..", "..")
BUILDER_DIR = REPO_ROOT_DIR / "tools" / "cellxgene_census_builder"

# Scratch area for this work: downloaded H5ADs live under `source_h5ad`
WORKING_DIR = REPO_ROOT_DIR / "issues" / "census_1127_spatial-builder"
ANNDATA_DIR = WORKING_DIR / "source_h5ad"

# Builder inputs: the manifest is generated by this notebook; the blocklist is read as-is
MANIFEST_FILE_PATH = BUILDER_DIR / "spatial_dev_tools" / "manifest.csv"
BLOCKLIST_FILE_PATH = BUILDER_DIR / "spatial_dev_tools" / "blocklist.txt"

In [6]:
# Create the download directory. `parents=True` also creates WORKING_DIR when it
# does not exist yet (e.g. on a fresh checkout); without it mkdir raises
# FileNotFoundError in that case.
ANNDATA_DIR.mkdir(parents=True, exist_ok=True)

In [7]:
for url in dataset_urls:
    file_name = url.split("/")[-1]
    file_pth = ANNDATA_DIR / file_name
    file_pth_str = str(file_pth)
    if not file_pth.is_file():
        !wget $url -O $file_pth_str

In [8]:
# Generate the manifest from the H5AD files that were just downloaded.
create_manifest_csv_file(
    spatial_datasets_dir=ANNDATA_DIR,
    manifest_file_path=MANIFEST_FILE_PATH,
)

In [9]:
# Parse the manifest back with the builder's own loader; both arguments are passed as strings.
datasets = load_manifest(
    os.fspath(MANIFEST_FILE_PATH),
    os.fspath(BLOCKLIST_FILE_PATH),
)

In [10]:
# Sanity check: the loader must return exactly one entry per requested URL.
assert len(dataset_urls) == len(datasets)
datasets

[Dataset(dataset_id='17d9e43f-1251-4f1e-8a5b-a96f2c89e5ec', dataset_asset_h5ad_uri='../../../issues/census_1127_spatial-builder/source_h5ad/17d9e43f-1251-4f1e-8a5b-a96f2c89e5ec.h5ad', dataset_version_id='', dataset_h5ad_path='', dataset_title='', citation='', collection_id='', collection_name='', collection_doi='', collection_doi_label='', asset_h5ad_filesize=-1, cell_count=-1, mean_genes_per_cell=-1.0, schema_version='', dataset_total_cell_count=0, soma_joinid=-1),
 Dataset(dataset_id='9e66d65a-385e-4097-9e61-2ad68a1a59fb', dataset_asset_h5ad_uri='../../../issues/census_1127_spatial-builder/source_h5ad/9e66d65a-385e-4097-9e61-2ad68a1a59fb.h5ad', dataset_version_id='', dataset_h5ad_path='', dataset_title='', citation='', collection_id='', collection_name='', collection_doi='', collection_doi_label='', asset_h5ad_filesize=-1, cell_count=-1, mean_genes_per_cell=-1.0, schema_version='', dataset_total_cell_count=0, soma_joinid=-1),
 Dataset(dataset_id='f6fbb333-bf7a-4402-a3c0-ef2c894d4371'

## Run the census builder to ingest the spatial datasets

In [11]:
# NOTE: Adjust these for your machine.
# The tag names the build's sub-directory inside the builder working directory.
census_build_tag = "test-spatial-build"
census_builder_working_dir = WORKING_DIR.joinpath("census-builds").absolute()

In [12]:
# Ensure the working directory does not already contain a build tag with the same name
! rm -rf {str(census_builder_working_dir)}/logs
! rm -rf {str(census_builder_working_dir)}/{census_build_tag}

In [13]:
!echo {str(census_builder_working_dir)}

/home/ubuntu/github/cellxgene-census/tools/cellxgene_census_builder/spatial_dev_tools/../../../issues/census_1127_spatial-builder/census-builds


In [14]:
# List the kernel's current working directory (the relative paths above are interpreted against it).
!ls

3396c353-d720-4588-8724-75546e2f18cc  manifest.csv
blocklist.txt			      soma-spatial
census_spatial_dataset_ingest.ipynb   tiledbsoma_spatial_dataset_ingest.ipynb


In [15]:
# Echo the three inputs of the build command, one per line.
print(census_build_tag, census_builder_working_dir, MANIFEST_FILE_PATH, sep="\n")

test-spatial-build
/home/ubuntu/github/cellxgene-census/tools/cellxgene_census_builder/spatial_dev_tools/../../../issues/census_1127_spatial-builder/census-builds
../../../tools/cellxgene_census_builder/spatial_dev_tools/manifest.csv


In [16]:
# Display the manifest location as an absolute path with the `..` components collapsed.
MANIFEST_FILE_PATH.resolve()

PosixPath('/home/ubuntu/github/cellxgene-census/tools/cellxgene_census_builder/spatial_dev_tools/manifest.csv')

In [17]:
!ls ../../../issues/census_1127_spatial-builder/census-builds/

In [18]:
!python -m cellxgene_census_builder.build_soma -v --build-tag $census_build_tag $census_builder_working_dir build --manifest $MANIFEST_FILE_PATH

2024-07-17 21:44:30 561179  DEBUG    Setting NUMEXPR_MAX_THREADS environment variable to "8"
2024-07-17 21:44:30 561179  DEBUG    Setting OMP_NUM_THREADS environment variable to "1"
2024-07-17 21:44:30 561179  DEBUG    Setting OPENBLAS_NUM_THREADS environment variable to "1"
2024-07-17 21:44:30 561179  DEBUG    Setting MKL_NUM_THREADS environment variable to "1"
2024-07-17 21:44:30 561179  DEBUG    Setting VECLIB_MAXIMUM_THREADS environment variable to "1"
2024-07-17 21:44:30 561179  INFO     CensusBuildArgs(working_dir=PosixPath('/home/ubuntu/github/cellxgene-census/tools/cellxgene_census_builder/spatial_dev_tools/../../../issues/census_1127_spatial-builder/census-builds'), config=CensusBuildConfig(verbose=1, dashboard=True, log_dir='logs', log_file='build.log', reports_dir='reports', consolidate=True, dryrun=False, cellxgene_census_S3_path='s3://cellxgene-data-public/cell-census', cellxgene_census_default_mirror_S3_path='s3://cellxgene-census-public-us-west-2/cell-census', cellxgene_

## Inspect the census object
**Work-In-Progress** 

_Note that the `census_data` and `census_spatial` collections sit side by side, and that each organism experiment under `census_spatial` contains a `spatial` collection alongside `obs` and `ms`._

In [19]:
import tiledbsoma

In [20]:
# Root SOMA collection of the build: <working dir>/<build tag>/soma
soma_root_collection_uri = "/".join([str(census_builder_working_dir), str(census_build_tag), "soma"])
soma_root_collection_uri

'/home/ubuntu/github/cellxgene-census/tools/cellxgene_census_builder/spatial_dev_tools/../../../issues/census_1127_spatial-builder/census-builds/test-spatial-build/soma'

In [21]:
# Open the build's root collection read-only and display its members.
soma_root_collection = tiledbsoma.open(soma_root_collection_uri, mode="r")
soma_root_collection

<Collection 'file:///home/ubuntu/github/cellxgene-census/issues/census_1127_spatial-builder/census-builds/test-spatial-build/soma' (open for 'r') (3 items)
    'census_data': 'file:///home/ubuntu/github/cellxgene-census/issues/census_1127_spatial-builder/census-builds/test-spatial-build/soma/census_data' (unopened)
    'census_info': 'file:///home/ubuntu/github/cellxgene-census/issues/census_1127_spatial-builder/census-builds/test-spatial-build/soma/census_info' (unopened)
    'census_spatial': 'file:///home/ubuntu/github/cellxgene-census/issues/census_1127_spatial-builder/census-builds/test-spatial-build/soma/census_spatial' (unopened)>

In [22]:
# Display the `census_spatial` member of the root collection.
soma_root_collection["census_spatial"]

<Collection 'file:///home/ubuntu/github/cellxgene-census/issues/census_1127_spatial-builder/census-builds/test-spatial-build/soma/census_spatial' (open for 'r') (2 items)
    'homo_sapiens': 'file:///home/ubuntu/github/cellxgene-census/issues/census_1127_spatial-builder/census-builds/test-spatial-build/soma/census_spatial/homo_sapiens' (unopened)
    'mus_musculus': 'file:///home/ubuntu/github/cellxgene-census/issues/census_1127_spatial-builder/census-builds/test-spatial-build/soma/census_spatial/mus_musculus' (unopened)>

In [23]:
# Display the `spatial` member of the human entry under `census_spatial`.
soma_root_collection["census_spatial"]["homo_sapiens"]["spatial"]

<Collection 'file:///home/ubuntu/github/cellxgene-census/issues/census_1127_spatial-builder/census-builds/test-spatial-build/soma/census_spatial/homo_sapiens/spatial' (open for 'r') (empty)>

## Query the census object
**Work-In-Progress**

In [30]:
import cellxgene_census

# Open the locally built census through the cellxgene_census API, using the same root URI as above.
census = cellxgene_census.open_soma(uri=soma_root_collection_uri)

In [31]:
# Display the human entry of `census_spatial` and its members.
census["census_spatial"]["homo_sapiens"]

<Experiment 'file:///home/ubuntu/github/cellxgene-census/issues/census_1127_spatial-builder/census-builds/test-spatial-build/soma/census_spatial/homo_sapiens' (open for 'r') (3 items)
    'ms': 'file:///home/ubuntu/github/cellxgene-census/issues/census_1127_spatial-builder/census-builds/test-spatial-build/soma/census_spatial/homo_sapiens/ms' (unopened)
    'obs': 'file:///home/ubuntu/github/cellxgene-census/issues/census_1127_spatial-builder/census-builds/test-spatial-build/soma/census_spatial/homo_sapiens/obs' (unopened)
    'spatial': 'file:///home/ubuntu/github/cellxgene-census/issues/census_1127_spatial-builder/census-builds/test-spatial-build/soma/census_spatial/homo_sapiens/spatial' (unopened)>

In [32]:
# Subset of `obs` columns to pull into pandas.
obs_columns = [
    "soma_joinid",
    "dataset_id",
    "assay_ontology_term_id",
    "cell_type",
    "tissue",
    "raw_sum",
    "nnz",
    "raw_mean_nnz",
    "raw_variance_nnz",
    "n_measured_vars",
]

# Read the selected columns, concatenate the result, and convert it to a DataFrame.
obs_df = (
    census["census_spatial"]["homo_sapiens"]
    .obs.read(column_names=obs_columns)
    .concat()
    .to_pandas()
)
obs_df

Unnamed: 0,soma_joinid,dataset_id,assay_ontology_term_id,cell_type,tissue,raw_sum,nnz,raw_mean_nnz,raw_variance_nnz,n_measured_vars
0,0,17d9e43f-1251-4f1e-8a5b-a96f2c89e5ec,EFO:0010961,unknown,heart left ventricle,1065.0,505,2.108911,36.045655,22492
1,1,17d9e43f-1251-4f1e-8a5b-a96f2c89e5ec,EFO:0010961,cardiac muscle myoblast,heart left ventricle,14193.0,2813,5.045503,2083.542025,22492
2,2,17d9e43f-1251-4f1e-8a5b-a96f2c89e5ec,EFO:0010961,cardiac muscle myoblast,heart left ventricle,8883.0,2384,3.726091,760.181342,22492
3,3,17d9e43f-1251-4f1e-8a5b-a96f2c89e5ec,EFO:0010961,unknown,heart left ventricle,2380.0,968,2.458678,94.614630,22492
4,4,17d9e43f-1251-4f1e-8a5b-a96f2c89e5ec,EFO:0010961,unknown,heart left ventricle,1089.0,503,2.165010,42.679889,22492
...,...,...,...,...,...,...,...,...,...,...
29947,29947,5487249a-b034-446a-8d9b-0b810ecb7e91,EFO:0010961,unknown,heart left ventricle,28548.0,6325,4.513518,169.593310,22532
29948,29948,5487249a-b034-446a-8d9b-0b810ecb7e91,EFO:0010961,immature innate lymphoid cell,heart left ventricle,15198.0,4365,3.481787,168.873923,22532
29949,29949,5487249a-b034-446a-8d9b-0b810ecb7e91,EFO:0010961,immature innate lymphoid cell,heart left ventricle,20013.0,5057,3.957485,201.442615,22532
29950,29950,5487249a-b034-446a-8d9b-0b810ecb7e91,EFO:0010961,unknown,heart left ventricle,2206.0,1180,1.869492,10.093216,22532


In [27]:
# Display the keys view of the root census collection.
census.keys()

KeysView(<Collection 'file:///home/ubuntu/github/cellxgene-census/issues/census_1127_spatial-builder/census-builds/test-spatial-build/soma' (open for 'r') (3 items)
    'census_data': 'file:///home/ubuntu/github/cellxgene-census/issues/census_1127_spatial-builder/census-builds/test-spatial-build/soma/census_data' (unopened)
    'census_info': 'file:///home/ubuntu/github/cellxgene-census/issues/census_1127_spatial-builder/census-builds/test-spatial-build/soma/census_info' (unopened)
    'census_spatial': Collection 'file:///home/ubuntu/github/cellxgene-census/issues/census_1127_spatial-builder/census-builds/test-spatial-build/soma/census_spatial' (open for 'r') (2 items)
        'homo_sapiens': Experiment 'file:///home/ubuntu/github/cellxgene-census/issues/census_1127_spatial-builder/census-builds/test-spatial-build/soma/census_spatial/homo_sapiens' (open for 'r') (3 items)
            'ms': 'file:///home/ubuntu/github/cellxgene-census/issues/census_1127_spatial-builder/census-builds/te

In [28]:
# Read the complete `var` table of the human "RNA" measurement into a DataFrame.
var_df = (
    census["census_spatial"]["homo_sapiens"]
    .ms["RNA"]
    .var.read()
    .concat()
    .to_pandas()
)

var_df

Unnamed: 0,soma_joinid,feature_id,feature_name,feature_length,nnz,n_measured_obs
0,0,ENSG00000243485,MIR1302-2HG,1021,0,0
1,1,ENSG00000237613,FAM138A,1219,0,0
2,2,ENSG00000186092,OR4F5,2618,0,0
3,3,ENSG00000238009,ENSG00000238009.6,3726,35,29952
4,4,ENSG00000239945,ENSG00000239945.1,1319,0,0
...,...,...,...,...,...,...
36967,36967,ENSG00000280081,LINC01667,4169,1,4992
36968,36968,ENSG00000235609,ENSG00000235609.7,5929,269,9984
36969,36969,ENSG00000265590,CFAP298-TCP10L,19326,0,0
36970,36970,ENSG00000249624,IFNAR2-IL10RB,3943,64,9984


In [29]:
# Release both read handles opened on the build: the census opened via
# cellxgene_census and the root collection opened directly with tiledbsoma,
# which was otherwise never closed.
census.close()
soma_root_collection.close()