# Ingest CXG spatial h5ad files from a directory and create a Census object

## Create a manifest file containing the location of spatial datasets to load

In [1]:
from pathlib import Path

from cellxgene_census_builder.build_soma.manifest import load_manifest

In [4]:
def create_manifest_csv_file(spatial_datasets_dir, manifest_file_path, *, spatial_datasets: list[Path] | None = None):
    """Params.
    ------
    spatial_datasets_dir
        Directory containing the h5ads
    manifest_file_path
        Output path
    spatial_datasets
        Optional list of datasets to select from the spatial_datasets directory. To make it easier to have smaller builds.
    """
    if spatial_datasets is None:
        spatial_datasets = sorted(Path(spatial_datasets_dir).glob("*.h5ad"))
    file_ids = [filename.stem for filename in spatial_datasets]
    # file_paths = [os.path.join(spatial_datasets_dir, filename) for filename in spatial_datasets]
    manifest_content = "\n".join([", ".join(map(str, pair)) for pair in zip(file_ids, spatial_datasets, strict=False)])
    with open(manifest_file_path, "w") as f:
        f.write(manifest_content.strip())

In [5]:
# h5ad assets to download, identified by dataset id and grouped by source collection.
DATASET_ASSET_BASE_URL = "https://datasets.cellxgene.cziscience.com"
dataset_ids = (
    # https://cellxgene.cziscience.com/collections/e2c257e7-6f79-487c-b81c-39451cd4ab3c
    "6811c454-def2-4d9e-b360-aa8a69f843ce",
    "1fee1684-94b3-4371-8d44-6a8b937ba23f",
    # https://cellxgene.cziscience.com/collections/8191c283-0816-424b-9b61-c3e1d6258a77
    "17d9e43f-1251-4f1e-8a5b-a96f2c89e5ec",
    "5487249a-b034-446a-8d9b-0b810ecb7e91",
    # https://cellxgene.cziscience.com/collections/0c8a364b-97b5-4cc8-a593-23c38c6f0ac5
    "9e66d65a-385e-4097-9e61-2ad68a1a59fb",
    "f6fbb333-bf7a-4402-a3c0-ef2c894d4371",
)
dataset_urls = [f"{DATASET_ASSET_BASE_URL}/{dataset_id}.h5ad" for dataset_id in dataset_ids]

In [7]:
# NOTE: Every location used by this notebook is defined here; change these to
# point to appropriate paths on your machine.

# Locations inside the repository, relative to the current working directory.
REPO_ROOT_DIR = Path("..", "..", "..")
BUILDER_DIR = REPO_ROOT_DIR.joinpath("tools", "cellxgene_census_builder")
WORKING_DIR = REPO_ROOT_DIR.joinpath("issues", "census_1127_spatial-builder")
ANNDATA_DIR = WORKING_DIR.joinpath("source_h5ad")  # where the source h5ads are downloaded

# Scratch location under which the census build is written.
SOMA_DIR = Path("/mnt/scratch/spatial_soma")

# Census builder input files.
MANIFEST_FILE_PATH = BUILDER_DIR.joinpath("spatial_dev_tools", "manifest.csv")
BLOCKLIST_FILE_PATH = BUILDER_DIR.joinpath("spatial_dev_tools", "blocklist.txt")

In [8]:
# Create the download directory. `parents=True` also creates WORKING_DIR when it
# does not exist yet (e.g. on a fresh checkout); without it mkdir raises
# FileNotFoundError in that case.
ANNDATA_DIR.mkdir(parents=True, exist_ok=True)

In [9]:
for url in dataset_urls:
    file_name = url.split("/")[-1]
    file_pth = ANNDATA_DIR / file_name
    file_pth_str = str(file_pth)
    if not file_pth.is_file():
        !wget $url -O $file_pth_str

In [63]:
# Build a small manifest from an explicit pair of datasets (the first two URLs).
# Slicing an unsorted `glob` result is not reproducible: directory listing order
# is filesystem-dependent, so a different pair could be picked on each machine.
create_manifest_csv_file(
    ANNDATA_DIR,
    MANIFEST_FILE_PATH,
    spatial_datasets=[ANNDATA_DIR / url.split("/")[-1] for url in dataset_urls[:2]],
)

In [64]:
# Parse the manifest written above into the builder's Dataset records. The
# blocklist path is passed as the second argument (presumably so that listed
# datasets are excluded; confirm against `load_manifest`).
datasets = load_manifest(str(MANIFEST_FILE_PATH), str(BLOCKLIST_FILE_PATH))

In [65]:
# Display the parsed manifest entries to check which datasets will be ingested.
datasets

[Dataset(dataset_id='6811c454-def2-4d9e-b360-aa8a69f843ce', dataset_asset_h5ad_uri='../../../issues/census_1127_spatial-builder/source_h5ad/6811c454-def2-4d9e-b360-aa8a69f843ce.h5ad', dataset_version_id='', dataset_h5ad_path='', dataset_title='', citation='', collection_id='', collection_name='', collection_doi='', collection_doi_label='', asset_h5ad_filesize=-1, cell_count=-1, mean_genes_per_cell=-1.0, schema_version='', dataset_total_cell_count=0, soma_joinid=-1),
 Dataset(dataset_id='1fee1684-94b3-4371-8d44-6a8b937ba23f', dataset_asset_h5ad_uri='../../../issues/census_1127_spatial-builder/source_h5ad/1fee1684-94b3-4371-8d44-6a8b937ba23f.h5ad', dataset_version_id='', dataset_h5ad_path='', dataset_title='', citation='', collection_id='', collection_name='', collection_doi='', collection_doi_label='', asset_h5ad_filesize=-1, cell_count=-1, mean_genes_per_cell=-1.0, schema_version='', dataset_total_cell_count=0, soma_joinid=-1)]

## Run the census builder to ingest the spatial datasets

In [14]:
# NOTE: Change these variables to suit your machine.
# The tag names this build; the absolute working directory is handed to the
# builder, and the build is later read back from <working dir>/<tag>/soma.
census_build_tag = "test-spatial-build"
census_builder_working_dir = SOMA_DIR.joinpath("census-builds").absolute()

In [16]:
# Show the builder working directory exactly as it is interpolated into shell commands.
!echo {str(census_builder_working_dir)}

/mnt/scratch/spatial_soma/census-builds


In [17]:
# List the current working directory (the relative paths above are resolved against it).
!ls

17d9e43f-1251-4f1e-8a5b-a96f2c89e5ec  manifest.csv
blocklist.txt			      manifest.csv.bak
census_spatial_dataset_ingest.ipynb   tiledbsoma_spatial_dataset_ingest.ipynb


In [18]:
# Echo the three values that are interpolated into the builder command line, one per line.
print(census_build_tag, census_builder_working_dir, MANIFEST_FILE_PATH, sep="\n")

test-spatial-build
/mnt/scratch/spatial_soma/census-builds
../../../tools/cellxgene_census_builder/spatial_dev_tools/manifest.csv


In [79]:
# Ensure the working directory does not already contain logs or a build with the
# same tag, so the build below starts from a clean state.
# WARNING: this irreversibly deletes <working dir>/logs and <working dir>/<build tag>;
# double-check `census_builder_working_dir` and `census_build_tag` before running.
! rm -rf {str(census_builder_working_dir)}/logs
! rm -rf {str(census_builder_working_dir)}/{census_build_tag}

In [80]:
# Run the census builder's `build` command (verbose) on the local manifest, using
# the working directory and build tag defined above.
# NOTE(review): `python` is resolved from the shell PATH, which may not be the
# kernel's interpreter; confirm the builder is installed in that environment.
!python -m cellxgene_census_builder.build_soma -v --build-tag $census_build_tag $census_builder_working_dir build --manifest $MANIFEST_FILE_PATH

2024-07-24 04:15:04 230350  DEBUG    Setting NUMEXPR_MAX_THREADS environment variable to "8"
2024-07-24 04:15:04 230350  DEBUG    Setting OMP_NUM_THREADS environment variable to "1"
2024-07-24 04:15:04 230350  DEBUG    Setting OPENBLAS_NUM_THREADS environment variable to "1"
2024-07-24 04:15:04 230350  DEBUG    Setting MKL_NUM_THREADS environment variable to "1"
2024-07-24 04:15:04 230350  DEBUG    Setting VECLIB_MAXIMUM_THREADS environment variable to "1"
2024-07-24 04:15:04 230350  INFO     CensusBuildArgs(working_dir=PosixPath('/mnt/scratch/spatial_soma/census-builds'), config=CensusBuildConfig(verbose=1, dashboard=True, log_dir='logs', log_file='build.log', reports_dir='reports', consolidate=True, dryrun=False, cellxgene_census_S3_path='s3://cellxgene-data-public/cell-census', cellxgene_census_default_mirror_S3_path='s3://cellxgene-census-public-us-west-2/cell-census', cellxgene_census_S3_replica_path=None, logs_S3_path='s3://cellxgene-data-public-logs/builder', build_tag='test-spa

## Inspect the census object
**Work-In-Progress** 

_Note that the `census_data` and `census_spatial` collections sit side by side, and that each organism experiment under `census_spatial` (e.g. `homo_sapiens`) contains a `spatial` collection and an `obs_scene` dataframe alongside `obs` and `ms`._

In [81]:
import tiledbsoma

In [82]:
# Location of the root SOMA collection of the build: <working dir>/<build tag>/soma.
soma_root_collection_uri = "/".join((str(census_builder_working_dir), census_build_tag, "soma"))
soma_root_collection_uri

'/mnt/scratch/spatial_soma/census-builds/test-spatial-build/soma'

In [83]:
# Open the root collection of the build with tiledbsoma (no mode is passed; the
# recorded repr shows it open for 'r') and display its top-level members.
soma_root_collection = tiledbsoma.open(soma_root_collection_uri)
soma_root_collection

<Collection 'file:///mnt/scratch/spatial_soma/census-builds/test-spatial-build/soma' (open for 'r') (3 items)
    'census_data': 'file:///mnt/scratch/spatial_soma/census-builds/test-spatial-build/soma/census_data' (unopened)
    'census_info': 'file:///mnt/scratch/spatial_soma/census-builds/test-spatial-build/soma/census_info' (unopened)
    'census_spatial': 'file:///mnt/scratch/spatial_soma/census-builds/test-spatial-build/soma/census_spatial' (unopened)>

In [84]:
# The spatial part of the build; in the recorded output it holds one member per organism.
soma_root_collection["census_spatial"]

<Collection 'file:///mnt/scratch/spatial_soma/census-builds/test-spatial-build/soma/census_spatial' (open for 'r') (2 items)
    'homo_sapiens': 'file:///mnt/scratch/spatial_soma/census-builds/test-spatial-build/soma/census_spatial/homo_sapiens' (unopened)
    'mus_musculus': 'file:///mnt/scratch/spatial_soma/census-builds/test-spatial-build/soma/census_spatial/mus_musculus' (unopened)>

In [85]:
# The human `spatial` collection; in the recorded output its members are keyed by dataset id.
soma_root_collection["census_spatial"]["homo_sapiens"]["spatial"]

<Collection 'file:///mnt/scratch/spatial_soma/census-builds/test-spatial-build/soma/census_spatial/homo_sapiens/spatial' (open for 'r') (2 items)
    '1fee1684-94b3-4371-8d44-6a8b937ba23f': 'file:///mnt/scratch/spatial_soma/census-builds/test-spatial-build/soma/census_spatial/homo_sapiens/spatial/1fee1684_94b3_4371_8d44_6a8b937ba23f' (unopened)
    '6811c454-def2-4d9e-b360-aa8a69f843ce': 'file:///mnt/scratch/spatial_soma/census-builds/test-spatial-build/soma/census_spatial/homo_sapiens/spatial/6811c454_def2_4d9e_b360_aa8a69f843ce' (unopened)>

In [91]:
# The human spatial experiment as a whole, to see every member it contains.
soma_root_collection["census_spatial"]["homo_sapiens"]

<Experiment 'file:///mnt/scratch/spatial_soma/census-builds/test-spatial-build/soma/census_spatial/homo_sapiens' (open for 'r') (4 items)
    'ms': 'file:///mnt/scratch/spatial_soma/census-builds/test-spatial-build/soma/census_spatial/homo_sapiens/ms' (unopened)
    'obs': 'file:///mnt/scratch/spatial_soma/census-builds/test-spatial-build/soma/census_spatial/homo_sapiens/obs' (unopened)
    'obs_scene': DataFrame 'file:///mnt/scratch/spatial_soma/census-builds/test-spatial-build/soma/census_spatial/homo_sapiens/obs_scene' (open for 'r')
    'spatial': Collection 'file:///mnt/scratch/spatial_soma/census-builds/test-spatial-build/soma/census_spatial/homo_sapiens/spatial' (open for 'r') (2 items)
        '1fee1684-94b3-4371-8d44-6a8b937ba23f': 'file:///mnt/scratch/spatial_soma/census-builds/test-spatial-build/soma/census_spatial/homo_sapiens/spatial/1fee1684_94b3_4371_8d44_6a8b937ba23f' (unopened)
        '6811c454-def2-4d9e-b360-aa8a69f843ce': 'file:///mnt/scratch/spatial_soma/census-bui

In [92]:
# Read the whole `obs_scene` dataframe into pandas; the recorded output shows it
# pairing each `soma_joinid` with a `scene_id`.
soma_root_collection["census_spatial"]["homo_sapiens"]["obs_scene"].read().concat().to_pandas()

Unnamed: 0,soma_joinid,scene_id
0,0,6811c454-def2-4d9e-b360-aa8a69f843ce
1,1,6811c454-def2-4d9e-b360-aa8a69f843ce
2,2,6811c454-def2-4d9e-b360-aa8a69f843ce
3,3,6811c454-def2-4d9e-b360-aa8a69f843ce
4,4,6811c454-def2-4d9e-b360-aa8a69f843ce
...,...,...
9979,9979,1fee1684-94b3-4371-8d44-6a8b937ba23f
9980,9980,1fee1684-94b3-4371-8d44-6a8b937ba23f
9981,9981,1fee1684-94b3-4371-8d44-6a8b937ba23f
9982,9982,1fee1684-94b3-4371-8d44-6a8b937ba23f


# Query the census object
**Work-In-Progress**

In [47]:
import cellxgene_census

# Open the same local build through the cellxgene_census API instead of tiledbsoma directly.
census = cellxgene_census.open_soma(uri=soma_root_collection_uri)

In [48]:
# Display the human spatial experiment as seen through the census handle.
census["census_spatial"]["homo_sapiens"]

<Experiment 'file:///mnt/scratch/spatial_soma/census-builds/test-spatial-build/soma/census_spatial/homo_sapiens' (open for 'r') (3 items)
    'ms': 'file:///mnt/scratch/spatial_soma/census-builds/test-spatial-build/soma/census_spatial/homo_sapiens/ms' (unopened)
    'obs': 'file:///mnt/scratch/spatial_soma/census-builds/test-spatial-build/soma/census_spatial/homo_sapiens/obs' (unopened)
    'spatial': 'file:///mnt/scratch/spatial_soma/census-builds/test-spatial-build/soma/census_spatial/homo_sapiens/spatial' (unopened)>

In [49]:
# Read a subset of the `obs` columns of the human spatial experiment into pandas.
obs_columns = [
    # identifiers
    "soma_joinid",
    "dataset_id",
    # annotations
    "assay_ontology_term_id",
    "cell_type",
    "tissue",
    # per-observation summary columns
    "raw_sum",
    "nnz",
    "raw_mean_nnz",
    "raw_variance_nnz",
    "n_measured_vars",
]
human_spatial_obs = census["census_spatial"]["homo_sapiens"].obs
obs_df = human_spatial_obs.read(column_names=obs_columns).concat().to_pandas()
obs_df

Unnamed: 0,soma_joinid,dataset_id,assay_ontology_term_id,cell_type,tissue,raw_sum,nnz,raw_mean_nnz,raw_variance_nnz,n_measured_vars
0,0,17d9e43f-1251-4f1e-8a5b-a96f2c89e5ec,EFO:0010961,unknown,heart left ventricle,1065.0,505,2.108911,36.045655,22492
1,1,17d9e43f-1251-4f1e-8a5b-a96f2c89e5ec,EFO:0010961,cardiac muscle myoblast,heart left ventricle,14193.0,2813,5.045503,2083.542025,22492
2,2,17d9e43f-1251-4f1e-8a5b-a96f2c89e5ec,EFO:0010961,cardiac muscle myoblast,heart left ventricle,8883.0,2384,3.726091,760.181342,22492
3,3,17d9e43f-1251-4f1e-8a5b-a96f2c89e5ec,EFO:0010961,unknown,heart left ventricle,2380.0,968,2.458678,94.614630,22492
4,4,17d9e43f-1251-4f1e-8a5b-a96f2c89e5ec,EFO:0010961,unknown,heart left ventricle,1089.0,503,2.165010,42.679889,22492
...,...,...,...,...,...,...,...,...,...,...
9979,9979,9e66d65a-385e-4097-9e61-2ad68a1a59fb,EFO:0010961,hepatocyte,caudate lobe of liver,582.0,347,1.677233,12.872399,21082
9980,9980,9e66d65a-385e-4097-9e61-2ad68a1a59fb,EFO:0010961,centrilobular region hepatocyte,caudate lobe of liver,149.0,120,1.241667,0.689006,21082
9981,9981,9e66d65a-385e-4097-9e61-2ad68a1a59fb,EFO:0010961,centrilobular region hepatocyte,caudate lobe of liver,357.0,231,1.545455,4.805534,21082
9982,9982,9e66d65a-385e-4097-9e61-2ad68a1a59fb,EFO:0010961,unknown,caudate lobe of liver,580.0,397,1.460957,3.435972,21082


In [50]:
# List the top-level members of the census root collection.
census.keys()

KeysView(<Collection 'file:///mnt/scratch/spatial_soma/census-builds/test-spatial-build/soma' (open for 'r') (3 items)
    'census_data': 'file:///mnt/scratch/spatial_soma/census-builds/test-spatial-build/soma/census_data' (unopened)
    'census_info': 'file:///mnt/scratch/spatial_soma/census-builds/test-spatial-build/soma/census_info' (unopened)
    'census_spatial': Collection 'file:///mnt/scratch/spatial_soma/census-builds/test-spatial-build/soma/census_spatial' (open for 'r') (2 items)
        'homo_sapiens': Experiment 'file:///mnt/scratch/spatial_soma/census-builds/test-spatial-build/soma/census_spatial/homo_sapiens' (open for 'r') (3 items)
            'ms': 'file:///mnt/scratch/spatial_soma/census-builds/test-spatial-build/soma/census_spatial/homo_sapiens/ms' (unopened)
            'obs': DataFrame 'file:///mnt/scratch/spatial_soma/census-builds/test-spatial-build/soma/census_spatial/homo_sapiens/obs' (open for 'r')
            'spatial': 'file:///mnt/scratch/spatial_soma/censu

In [51]:
# Read the full `var` dataframe of the human "RNA" measurement into pandas.
var_df = census["census_spatial"]["homo_sapiens"].ms["RNA"].var.read().concat().to_pandas()

var_df

Unnamed: 0,soma_joinid,feature_id,feature_name,feature_length,nnz,n_measured_obs
0,0,ENSG00000243485,MIR1302-2HG,1021,0,0
1,1,ENSG00000237613,FAM138A,1219,0,0
2,2,ENSG00000186092,OR4F5,2618,0,0
3,3,ENSG00000238009,ENSG00000238009.6,3726,16,9984
4,4,ENSG00000239945,ENSG00000239945.1,1319,0,0
...,...,...,...,...,...,...
36401,36401,ENSG00000277836,ENSG00000277836.1,288,0,0
36402,36402,ENSG00000278633,ENSG00000278633.1,2404,0,0
36403,36403,ENSG00000276017,ENSG00000276017.1,2404,1,4992
36404,36404,ENSG00000278817,ENSG00000278817.1,1213,1546,9984


## Seeing how we can programmatically access all the Visium h5ads

In [None]:
import requests

In [None]:
# Fetch the dataset listing from the staging CELLxGENE Discover curation API,
# restricted to schema version 5.1.
resp = requests.get(
    "https://api.cellxgene.staging.single-cell.czi.technology/curation/v1/datasets",
    params={"schema_version": "5.1"},
    timeout=60,  # requests never times out by default; avoid hanging the kernel
)
# Fail here on an HTTP error instead of on a confusing JSON decoding error later.
resp.raise_for_status()

In [None]:
# Decode the JSON response body; the cells below index and iterate it as a
# sequence of per-dataset dicts.
records = resp.json()

In [None]:
# Look at one record to learn which fields are available.
records[0]

{'assay': [{'label': "10x 3' v2", 'ontology_term_id': 'EFO:0009899'},
  {'label': "10x 3' v3", 'ontology_term_id': 'EFO:0009922'}],
 'assets': [{'filesize': 513480,
   'filetype': 'H5AD',
   'url': 'https://datasets.cellxgene.staging.single-cell.czi.technology/d3415df4-ece9-4830-a930-c563bddeb077.h5ad'},
  {'filesize': 172238,
   'filetype': 'RDS',
   'url': 'https://datasets.cellxgene.staging.single-cell.czi.technology/d3415df4-ece9-4830-a930-c563bddeb077.rds'}],
 'batch_condition': ['batchA', 'batchB'],
 'cell_count': 320,
 'cell_type': [{'label': 'endothelial cell', 'ontology_term_id': 'CL:0000115'},
  {'label': 'fibroblast', 'ontology_term_id': 'CL:0000057'}],
 'citation': 'Dataset Version: https://datasets.cellxgene.staging.single-cell.czi.technology/d3415df4-ece9-4830-a930-c563bddeb077.h5ad curated and distributed by CZ CELLxGENE Discover in Collection: https://cellxgene.staging.single-cell.czi.technology/collections/80295d46-a238-4e5d-816c-82c74316699e',
 'collection_doi': None,

In [None]:
# Records whose `assay` value is truthy (non-empty). The result is only
# displayed, not stored, and the full list can be very large.
[rec for rec in records if rec["assay"]]

In [None]:
# Keep only the records that list the Visium assay among their assay labels,
# then show how many there are.
VISIUM_ASSAY_LABEL = "Visium Spatial Gene Expression"
visium_datasets = [
    rec for rec in records if any(assay["label"] == VISIUM_ASSAY_LABEL for assay in rec["assay"])
]
len(visium_datasets)

186

In [None]:
# Display the selected Visium records. NOTE(review): this dumps every record;
# consider showing a slice to keep the notebook output small.
visium_datasets

[{'assay': [{'label': 'Visium Spatial Gene Expression',
    'ontology_term_id': 'EFO:0010961'}],
  'assets': [{'filesize': 198366604,
    'filetype': 'H5AD',
    'url': 'https://datasets.cellxgene.staging.single-cell.czi.technology/7a09bbe2-499d-4785-ac21-fa585934b4e4.h5ad'}],
  'cell_count': 4992,
  'cell_type': [{'label': 'Schwann cell', 'ontology_term_id': 'CL:0002573'},
   {'label': 'basal cell', 'ontology_term_id': 'CL:0000646'},
   {'label': 'chondroblast', 'ontology_term_id': 'CL:0000058'},
   {'label': 'chondrocyte', 'ontology_term_id': 'CL:0000138'},
   {'label': 'fibroblast of dermis', 'ontology_term_id': 'CL:0002551'},
   {'label': 'mesenchymal cell', 'ontology_term_id': 'CL:0008019'},
   {'label': 'muscle cell', 'ontology_term_id': 'CL:0000187'},
   {'label': 'primitive red blood cell', 'ontology_term_id': 'CL:0002355'},
   {'label': 'skeletal muscle myoblast', 'ontology_term_id': 'CL:0000515'},
   {'label': 'tendon cell', 'ontology_term_id': 'CL:0000388'},
   {'label': 'un

In [None]:
# Interactive help lookup used while exploring the API; it is not needed to run
# the notebook and can be dropped from a final version.
?requests.get

[0;31mSignature:[0m [0mrequests[0m[0;34m.[0m[0mget[0m[0;34m([0m[0murl[0m[0;34m,[0m [0mparams[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Sends a GET request.

:param url: URL for the new :class:`Request` object.
:param params: (optional) Dictionary, list of tuples or bytes to send
    in the query string for the :class:`Request`.
:param \*\*kwargs: Optional arguments that ``request`` takes.
:return: :class:`Response <Response>` object
:rtype: requests.Response
[0;31mFile:[0m      ~/miniforge3/envs/census-spatial-builder-dev/lib/python3.11/site-packages/requests/api.py
[0;31mType:[0m      function