# Ingest CXG spatial h5ad files from a directory and create a Census object

## Create a manifest file containing the location of spatial datasets to load

In [1]:
import os

from cellxgene_census_builder.build_soma.manifest import load_manifest

In [2]:
def create_manifest_csv_file(spatial_datasets_dir, manifest_file_path):
    """Write a manifest listing every ``.h5ad`` file found in a directory.

    The manifest contains one ``<file_id>, <file_path>`` line per dataset,
    where ``file_id`` is the file name without its extension and
    ``file_path`` is ``spatial_datasets_dir`` joined with the file name.

    Only regular files whose name ends in ``.h5ad`` are listed, so stray
    entries such as ``.DS_Store``, sub-directories or notes files never end up
    as bogus dataset rows. Entries are sorted by file name so the manifest is
    deterministic across runs and platforms.

    Args:
        spatial_datasets_dir: Directory containing the h5ad files to ingest.
        manifest_file_path: Destination path of the manifest file; any
            existing file is overwritten.

    Raises:
        OSError: If the directory cannot be listed or the manifest cannot be
            written.
    """
    # List the directory exactly once: os.listdir() returns entries in
    # arbitrary order, so ids and paths must be derived from the same listing.
    h5ad_filenames = sorted(
        filename
        for filename in os.listdir(spatial_datasets_dir)
        if filename.endswith(".h5ad") and os.path.isfile(os.path.join(spatial_datasets_dir, filename))
    )

    manifest_lines = [
        f"{os.path.splitext(filename)[0]}, {os.path.join(spatial_datasets_dir, filename)}"
        for filename in h5ad_filenames
    ]

    with open(manifest_file_path, "w") as f:
        f.write("\n".join(manifest_lines))

In [3]:
# NOTE: Change these variables to point to appropriate file paths on your machine.
# The values below are absolute paths from the original author's machine and
# will not exist elsewhere.

# Directory that create_manifest_csv_file scans for dataset files.
spatial_datasets_dir = "/Users/psridharan/code/cellxgene-census/ps_stuff/spatial_test_datasets"
# Manifest written by create_manifest_csv_file, then read by load_manifest and
# passed to the census builder via --manifest.
manifest_file_path = (
    "/Users/psridharan/code/cellxgene-census/tools/cellxgene_census_builder/spatial_dev_tools/manifest.csv"
)
# Blocklist file handed to load_manifest as its second argument.
blocklist_file_path = (
    "/Users/psridharan/code/cellxgene-census/tools/cellxgene_census_builder/spatial_dev_tools/blocklist.txt"
)

In [4]:
# Generate the manifest from the contents of the spatial datasets directory.
create_manifest_csv_file(
    spatial_datasets_dir=spatial_datasets_dir,
    manifest_file_path=manifest_file_path,
)

In [5]:
# Parse the manifest into Dataset records (see the output of the next cell).
# NOTE(review): presumably datasets named in the blocklist file are excluded
# from the result — confirm against load_manifest.
datasets = load_manifest(manifest_file_path, blocklist_file_path)

In [6]:
# Display the loaded Dataset records to check that each manifest row was parsed.
datasets

[Dataset(dataset_id='c6f6e674-b59d-46cf-8525-73f64f9eef8c', dataset_asset_h5ad_uri='/Users/psridharan/code/cellxgene-census/ps_stuff/spatial_test_datasets/c6f6e674-b59d-46cf-8525-73f64f9eef8c.h5ad', dataset_version_id='', dataset_h5ad_path='', dataset_title='', citation='', collection_id='', collection_name='', collection_doi='', asset_h5ad_filesize=-1, cell_count=-1, mean_genes_per_cell=-1.0, schema_version='', dataset_total_cell_count=0, soma_joinid=-1),
 Dataset(dataset_id='fa3893cb-d420-42ac-8263-09719a26102e', dataset_asset_h5ad_uri='/Users/psridharan/code/cellxgene-census/ps_stuff/spatial_test_datasets/fa3893cb-d420-42ac-8263-09719a26102e.h5ad', dataset_version_id='', dataset_h5ad_path='', dataset_title='', citation='', collection_id='', collection_name='', collection_doi='', asset_h5ad_filesize=-1, cell_count=-1, mean_genes_per_cell=-1.0, schema_version='', dataset_total_cell_count=0, soma_joinid=-1),
 Dataset(dataset_id='07998bf8-d070-41bb-a584-f8bdd1193aef', dataset_asset_h5ad

## Run the census builder to ingest the spatial datasets

In [7]:
# NOTE: Change these variables to point to appropriate file paths on your machine.

# Working directory passed to the census builder; the cleanup cell below
# deletes its "logs" sub-directory and the sub-directory named after the tag.
census_builder_working_dir = "/Users/psridharan/code/cellxgene-census/ps_stuff/census-builds"
# Value of --build-tag; the query section opens
# {census_builder_working_dir}/{census_build_tag}/soma.
census_build_tag = "census_spatial"

In [8]:
# Ensure the working directory does not already contain a build tag with the same name
! rm -rf {census_builder_working_dir}/logs
! rm -rf {census_builder_working_dir}/{census_build_tag}

In [9]:
! python -m cellxgene_census_builder.build_soma -v --build-tag {census_build_tag} {census_builder_working_dir} build --manifest {manifest_file_path}

2024-06-05 16:47:41 5875    DEBUG    Setting NUMEXPR_MAX_THREADS environment variable to "5"
2024-06-05 16:47:41 5875    DEBUG    Setting OMP_NUM_THREADS environment variable to "1"
2024-06-05 16:47:41 5875    DEBUG    Setting OPENBLAS_NUM_THREADS environment variable to "1"
2024-06-05 16:47:41 5875    DEBUG    Setting MKL_NUM_THREADS environment variable to "1"
2024-06-05 16:47:41 5875    DEBUG    Setting VECLIB_MAXIMUM_THREADS environment variable to "1"
2024-06-05 16:47:41 5875    INFO     CensusBuildArgs(working_dir=PosixPath('/Users/psridharan/code/cellxgene-census/ps_stuff/census-builds'), config=CensusBuildConfig(verbose=1, dashboard=True, log_dir='logs', log_file='build.log', reports_dir='reports', consolidate=True, dryrun=False, cellxgene_census_S3_path='s3://cellxgene-data-public/cell-census', cellxgene_census_default_mirror_S3_path='s3://cellxgene-census-public-us-west-2/cell-census', cellxgene_census_S3_replica_path=None, logs_S3_path='s3://cellxgene-data-public-logs/builde

## Query the generated census object
NOTE: Currently the demo only shows summary cell counts; it will showcase spatial queries once they are available.

In [10]:
import cellxgene_census

In [11]:
# Open the freshly built census from the local working directory and print the
# summary cell counts table, without the internal soma_joinid column.
with cellxgene_census.open_soma(uri=f"{census_builder_working_dir}/{census_build_tag}/soma") as census:
    census_summary_cell_counts = (
        census["census_info"]["summary_cell_counts"]
        .read()
        .concat()
        .to_pandas()
        .drop(columns=["soma_joinid"])
    )
    print(census_summary_cell_counts)

        organism        category                           label  \
0   Homo sapiens             all                              na   
1   Homo sapiens           assay  Visium Spatial Gene Expression   
2   Homo sapiens       cell_type          neuronal receptor cell   
3   Homo sapiens       cell_type                 epithelial cell   
4   Homo sapiens       cell_type                      blood cell   
..           ...             ...                             ...   
62  Homo sapiens          tissue           primary visual cortex   
63  Homo sapiens  tissue_general                           heart   
64  Homo sapiens  tissue_general                           brain   
65  Homo sapiens  tissue_general                        placenta   
66  Homo sapiens  tissue_general                           liver   

   ontology_term_id  total_cell_count  unique_cell_count  
0                na             39936              39936  
1       EFO:0010961             39936              39936  
2     