# Ingest CXG spatial h5ad files from a directory and create Census object

## Create a manifest file containing the location of spatial datasets to load

In [1]:
import os

from cellxgene_census_builder.build_soma.manifest import load_manifest

In [2]:
def create_manifest_csv_file(spatial_datasets_dir, manifest_file_path):
    file_ids = [os.path.splitext(filename)[0] for filename in os.listdir(spatial_datasets_dir)]
    file_paths = [os.path.join(spatial_datasets_dir, filename) for filename in os.listdir(spatial_datasets_dir)]
    manifest_content = "\n".join([", ".join(pair) for pair in zip(file_ids, file_paths, strict=False)])

    with open(manifest_file_path, "w") as f:
        f.write(manifest_content.strip())

In [3]:
# NOTE: Change these variables to point to appropriate file paths on your machine
spatial_datasets_dir = "/Users/psridharan/code/cellxgene-census/ps_stuff/spatial_test_datasets"
manifest_file_path = (
    "/Users/psridharan/code/cellxgene-census/tools/cellxgene_census_builder/spatial_dev_tools/manifest.csv"
)
blocklist_file_path = (
    "/Users/psridharan/code/cellxgene-census/tools/cellxgene_census_builder/spatial_dev_tools/blocklist.txt"
)

In [4]:
create_manifest_csv_file(spatial_datasets_dir, manifest_file_path)

In [5]:
datasets = load_manifest(manifest_file_path, blocklist_file_path)

In [6]:
datasets

[Dataset(dataset_id='c6f6e674-b59d-46cf-8525-73f64f9eef8c', dataset_asset_h5ad_uri='/Users/psridharan/code/cellxgene-census/ps_stuff/spatial_test_datasets/c6f6e674-b59d-46cf-8525-73f64f9eef8c.h5ad', dataset_version_id='', dataset_h5ad_path='', dataset_title='', citation='', collection_id='', collection_name='', collection_doi='', asset_h5ad_filesize=-1, cell_count=-1, mean_genes_per_cell=-1.0, schema_version='', dataset_total_cell_count=0, soma_joinid=-1),
 Dataset(dataset_id='fa3893cb-d420-42ac-8263-09719a26102e', dataset_asset_h5ad_uri='/Users/psridharan/code/cellxgene-census/ps_stuff/spatial_test_datasets/fa3893cb-d420-42ac-8263-09719a26102e.h5ad', dataset_version_id='', dataset_h5ad_path='', dataset_title='', citation='', collection_id='', collection_name='', collection_doi='', asset_h5ad_filesize=-1, cell_count=-1, mean_genes_per_cell=-1.0, schema_version='', dataset_total_cell_count=0, soma_joinid=-1),
 Dataset(dataset_id='07998bf8-d070-41bb-a584-f8bdd1193aef', dataset_asset_h5ad

## Run the census builder to ingest the spatial datasets

In [7]:
# NOTE: Change these variables to point to appropriate file paths on your machine
census_builder_working_dir = "/Users/psridharan/code/cellxgene-census/ps_stuff/census-builds"
census_build_tag = "test-spatial-build"

In [8]:
# Ensure the working directory does not already contain a build tag with the same name
! rm -rf {census_builder_working_dir}/logs
! rm -rf {census_builder_working_dir}/{census_build_tag}

In [9]:
! python -m cellxgene_census_builder.build_soma -v --build-tag {census_build_tag} {census_builder_working_dir} build --manifest {manifest_file_path}

2024-06-07 15:32:39 39783   DEBUG    Setting NUMEXPR_MAX_THREADS environment variable to "5"
2024-06-07 15:32:39 39783   DEBUG    Setting OMP_NUM_THREADS environment variable to "1"
2024-06-07 15:32:39 39783   DEBUG    Setting OPENBLAS_NUM_THREADS environment variable to "1"
2024-06-07 15:32:39 39783   DEBUG    Setting MKL_NUM_THREADS environment variable to "1"
2024-06-07 15:32:39 39783   DEBUG    Setting VECLIB_MAXIMUM_THREADS environment variable to "1"
2024-06-07 15:32:39 39783   INFO     CensusBuildArgs(working_dir=PosixPath('/Users/psridharan/code/cellxgene-census/ps_stuff/census-builds'), config=CensusBuildConfig(verbose=1, dashboard=True, log_dir='logs', log_file='build.log', reports_dir='reports', consolidate=True, dryrun=False, cellxgene_census_S3_path='s3://cellxgene-data-public/cell-census', cellxgene_census_default_mirror_S3_path='s3://cellxgene-census-public-us-west-2/cell-census', cellxgene_census_S3_replica_path=None, logs_S3_path='s3://cellxgene-data-public-logs/builde

## Inspect the census object
**Work-In-Progress** 

_Note that there is a `census_data` and `census_spatial` collection side by side and the `census_spatial` collection also contains a `spatial` collection along with `obs` and `ms`_

In [10]:
import tiledbsoma

In [11]:
soma_root_collection_uri = f"{census_builder_working_dir}/{census_build_tag}/soma"
soma_root_collection_uri

'/Users/psridharan/code/cellxgene-census/ps_stuff/census-builds/test-spatial-build/soma'

In [12]:
soma_root_collection = tiledbsoma.open(soma_root_collection_uri)
soma_root_collection

<Collection '/Users/psridharan/code/cellxgene-census/ps_stuff/census-builds/test-spatial-build/soma' (open for 'r') (3 items)
    'census_info': 'file:///Users/psridharan/code/cellxgene-census/ps_stuff/census-builds/test-spatial-build/soma/census_info' (unopened)
    'census_data': 'file:///Users/psridharan/code/cellxgene-census/ps_stuff/census-builds/test-spatial-build/soma/census_data' (unopened)
    'census_spatial': 'file:///Users/psridharan/code/cellxgene-census/ps_stuff/census-builds/test-spatial-build/soma/census_spatial' (unopened)>

In [13]:
soma_root_collection["census_spatial"]

<Collection 'file:///Users/psridharan/code/cellxgene-census/ps_stuff/census-builds/test-spatial-build/soma/census_spatial' (open for 'r') (2 items)
    'homo_sapiens': 'file:///Users/psridharan/code/cellxgene-census/ps_stuff/census-builds/test-spatial-build/soma/census_spatial/homo_sapiens' (unopened)
    'mus_musculus': 'file:///Users/psridharan/code/cellxgene-census/ps_stuff/census-builds/test-spatial-build/soma/census_spatial/mus_musculus' (unopened)>

In [14]:
soma_root_collection["census_spatial"]["homo_sapiens"]

<Experiment 'file:///Users/psridharan/code/cellxgene-census/ps_stuff/census-builds/test-spatial-build/soma/census_spatial/homo_sapiens' (open for 'r') (3 items)
    'obs': 'file:///Users/psridharan/code/cellxgene-census/ps_stuff/census-builds/test-spatial-build/soma/census_spatial/homo_sapiens/obs' (unopened)
    'ms': 'file:///Users/psridharan/code/cellxgene-census/ps_stuff/census-builds/test-spatial-build/soma/census_spatial/homo_sapiens/ms' (unopened)
    'spatial': 'file:///Users/psridharan/code/cellxgene-census/ps_stuff/census-builds/test-spatial-build/soma/census_spatial/homo_sapiens/spatial' (unopened)>

# Query the census object
**Work-In-Progress**

In [15]:
import cellxgene_census

census = cellxgene_census.open_soma(uri=soma_root_collection_uri)

In [16]:
obs_columns = [
    "soma_joinid",
    "dataset_id",
    "assay_ontology_term_id",
    "cell_type",
    "tissue",
    "raw_sum",
    "nnz",
    "raw_mean_nnz",
    "raw_variance_nnz",
    "n_measured_vars",
]
obs_df = census["census_spatial"]["homo_sapiens"].obs.read(column_names=obs_columns).concat().to_pandas()
obs_df

Unnamed: 0,soma_joinid,dataset_id,assay_ontology_term_id,cell_type,tissue,raw_sum,nnz,raw_mean_nnz,raw_variance_nnz,n_measured_vars
0,0,c6f6e674-b59d-46cf-8525-73f64f9eef8c,EFO:0010961,unknown,caudate lobe of liver,4113.0,1376,2.989099,232.399154,21082
1,1,c6f6e674-b59d-46cf-8525-73f64f9eef8c,EFO:0010961,periportal region hepatocyte,caudate lobe of liver,9551.0,2551,3.744022,669.163077,21082
2,2,c6f6e674-b59d-46cf-8525-73f64f9eef8c,EFO:0010961,unknown,caudate lobe of liver,5874.0,1795,3.272423,349.790293,21082
3,3,c6f6e674-b59d-46cf-8525-73f64f9eef8c,EFO:0010961,periportal region hepatocyte,caudate lobe of liver,9659.0,2368,4.078970,826.866174,21082
4,4,c6f6e674-b59d-46cf-8525-73f64f9eef8c,EFO:0010961,hepatocyte,caudate lobe of liver,6293.0,1846,3.408992,342.384938,21082
...,...,...,...,...,...,...,...,...,...,...
39931,39931,1bb92cf8-ab3f-4bb0-a722-b241b5d377ed,EFO:0010961,periportal region hepatocyte,caudate lobe of liver,14401.0,3291,4.375874,712.894846,20713
39932,39932,1bb92cf8-ab3f-4bb0-a722-b241b5d377ed,EFO:0010961,unknown,caudate lobe of liver,3369.0,1140,2.955263,98.843475,20713
39933,39933,1bb92cf8-ab3f-4bb0-a722-b241b5d377ed,EFO:0010961,periportal region hepatocyte,caudate lobe of liver,9040.0,2583,3.499806,334.456139,20713
39934,39934,1bb92cf8-ab3f-4bb0-a722-b241b5d377ed,EFO:0010961,unknown,caudate lobe of liver,2679.0,973,2.753340,94.605763,20713


In [17]:
var_df = census["census_spatial"]["homo_sapiens"].ms["RNA"].var.read().concat().to_pandas()

var_df

Unnamed: 0,soma_joinid,feature_id,feature_name,feature_length,nnz,n_measured_obs
0,0,ENSG00000243485,MIR1302-2HG,1021,1,4992
1,1,ENSG00000237613,FAM138A,1219,0,0
2,2,ENSG00000186092,OR4F5,2618,0,0
3,3,ENSG00000238009,ENSG00000238009.6,3726,51,39936
4,4,ENSG00000239945,ENSG00000239945.1,1319,0,0
...,...,...,...,...,...,...
36967,36967,ENSG00000280081,LINC01667,4169,1,4992
36968,36968,ENSG00000235609,ENSG00000235609.7,5929,147,9984
36969,36969,ENSG00000265590,CFAP298-TCP10L,19326,0,0
36970,36970,ENSG00000249624,IFNAR2-IL10RB,3943,65,9984


In [18]:
census.close()