To run this notebook, you need to set up an environment containing the libraries imported below.

In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
import warnings
import os
import spatialdata as sd
import anndata as ad

# Load IPython's autoreload extension and enable mode 2, so imported modules
# are reloaded before each cell executes.
%load_ext autoreload
%autoreload 2

# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")

### Setup Paths and Load Data

In [None]:
# Root directory that holds both datasets; also reused when saving the result.
path = "/data/cgobet/2026_06_01_spatial_exploratory/data/"

# Each dataset sits in its own sub-folder as a zarr store.
spatch_store = os.path.join(path, "spatch", "xenium_ovary_spatch.zarr")
tenx_store = os.path.join(path, "10x", "xenium_ovary_10x.zarr")

sdata_spatch = sd.read_zarr(spatch_store)
sdata_10x = sd.read_zarr(tenx_store)

### Process AnnData: Cell Filtering, QC Metrics, and Morphology

In [None]:
def process_adata(
    sdata, dataset_name, min_counts=100, max_quantile=0.98, min_cells=100
):
    """Build a QC-filtered table from ``sdata`` and tag it with its dataset.

    The function works on a copy of ``sdata["table"]``. Every step below
    mutates the table in place (new layer, new ``obs`` columns, cell/gene
    filtering, renamed observations), so without the copy the table stored
    inside ``sdata`` would be altered and a second call on the same object
    would filter already-filtered data and append the name suffix twice.

    Parameters
    ----------
    sdata
        Object providing ``sdata["table"]`` (with a ``cell_id`` column in
        ``.obs``) and ``sdata.shapes`` entries ``"cell_boundaries"`` and
        ``"nucleus_boundaries"``.
    dataset_name
        Label printed in the statistics, appended to every observation name
        as ``_<dataset_name>`` and stored in ``obs["dataset"]``.
    min_counts
        Forwarded to ``sc.pp.filter_cells(min_counts=...)``.
    max_quantile
        Quantile of ``obs["total_counts"]``, evaluated before any filtering,
        that is used as ``max_counts`` for ``sc.pp.filter_cells``.
    min_cells
        Forwarded to ``sc.pp.filter_genes(min_cells=...)``.

    Returns
    -------
    The filtered copy of the table, carrying a ``"counts"`` layer, the QC
    metric columns, ``cell_area``, ``nucleus_area`` and ``dataset``.
    """
    # Copy so the caller's SpatialData table stays untouched (see docstring).
    adata = sdata["table"].copy()

    # Keep the unmodified matrix available under a dedicated layer.
    adata.layers["counts"] = adata.X.copy()

    # Adds, among others, obs["total_counts"], which is used below.
    sc.pp.calculate_qc_metrics(adata, inplace=True)

    # Look up polygon areas by cell id; ids with no matching entry become NaN.
    # NOTE(review): presumably both shape tables are indexed by cell id with
    # unique values - confirm against the reader that produced the zarr.
    cell_areas = sdata.shapes["cell_boundaries"].geometry.area
    nuc_areas = sdata.shapes["nucleus_boundaries"].geometry.area
    adata.obs["cell_area"] = adata.obs["cell_id"].map(cell_areas)
    adata.obs["nucleus_area"] = adata.obs["cell_id"].map(nuc_areas)

    # Print pre-filtering stats
    print(f"\n{dataset_name} - Before filtering:")
    print(f"  Shape: {adata.shape}")
    print(f"  Median counts: {np.median(adata.obs['total_counts']):.0f}")

    # The upper bound is derived from the unfiltered distribution, i.e.
    # before low-count cells are removed.
    max_counts = np.quantile(adata.obs["total_counts"], max_quantile)
    sc.pp.filter_cells(adata, min_counts=min_counts)
    sc.pp.filter_cells(adata, max_counts=max_counts)
    sc.pp.filter_genes(adata, min_cells=min_cells)

    # Print post-filtering stats. total_counts was computed before the gene
    # filter above and is not recomputed here.
    print(f"{dataset_name} - After filtering:")
    print(f"  Shape: {adata.shape}")
    print(f"  Median counts: {np.median(adata.obs['total_counts']):.0f}")

    # Suffix observation names with the dataset label and record the label.
    adata.obs_names = adata.obs_names + f"_{dataset_name}"
    adata.obs["dataset"] = dataset_name

    return adata

In [None]:
# Both datasets share the upper quantile and gene threshold; only the
# minimum-count cut-off differs (100 for spatch, 200 for 10x).
adata_spatch = process_adata(
    sdata_spatch, "spatch", min_counts=100, max_quantile=0.98, min_cells=100
)

adata_10x = process_adata(
    sdata_10x,
    "10x",
    min_counts=200,
    max_quantile=0.98,
    min_cells=100,
)

### Merge Datasets

In [None]:
# Stack the two processed tables along observations. join="inner" is the
# anndata default, spelled out here: only variables present in both are kept.
processed_tables = [adata_spatch, adata_10x]
adata = ad.concat(processed_tables, join="inner")

# Store a copy of the merged matrix under the "counts" layer.
adata.layers["counts"] = adata.X.copy()

### Save Processed Data

In [None]:
# Write the merged table next to the input data.
merged_h5ad = os.path.join(path, "xenium_ovary_both.h5ad")
adata.write_h5ad(merged_h5ad)