To run this notebook, you need to set up an environment containing the libraries imported below.

In [11]:
import scanpy as sc
import pandas as pd
import warnings
import os
import spatialdata as sd
import spatialdata_io as sio

from spatialdata.models import TableModel, PointsModel, ShapesModel
from spatialdata.transformations import Identity
from shapely.geometry import Polygon

import geopandas as gpd

# Enable IPython's autoreload extension; mode 2 re-imports all modules before
# each cell execution so edits to local .py files are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used
# in this notebook — remove this line when debugging.
warnings.filterwarnings("ignore")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


###  Data Acquisition: SPATCH Cancer Atlas (FFPE Human Ovarian Adenocarcinoma with Xenium 5K Prime)

Download the spatial transcriptomics data for the **Ovarian Cancer** study from the [SPATCH portal](https://spatch.pku-genomics.org/#/download).

* **Study:** *Systematic benchmarking of high-throughput subcellular spatial transcriptomics platforms across human tumors.*
* **Journal:** [Ren et al., Nature Communications (2025)](https://www.nature.com/articles/s41467-025-64292-3)

#### Workspace Organization
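Organize the files under `spatch/` inside the data directory (`path`, defined below) into the following subdirectories, using the file names the code in this notebook reads:

* **`transcriptome/`**: `adata.h5ad`, the `.h5ad` count matrix.
* **`transcripts/`**: `transcripts.parquet`, the `.parquet` raw spot data.
* **`segmentation_mask/`**: `cell_boundaries.csv` and `nucleus_boundaries.csv`, the `.csv` cell and nucleus boundaries.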

In [None]:
# Root data directory for every input and output of this notebook.
# Set the SPATIAL_DATA_DIR environment variable before starting Jupyter to
# point the notebook at another location without editing it; when the variable
# is unset, the original hard-coded location is used.
path = os.environ.get(
    "SPATIAL_DATA_DIR", "/data/cgobet/2026_06_01_spatial_exploratory/data/"
)

In [None]:
# Working directory for the SPATCH dataset. Create it (and any missing parent
# directories) if needed; exist_ok=True keeps the cell safe to re-run and
# avoids the race between a separate existence check and the creation.
spatch_path = os.path.join(path, "spatch")
os.makedirs(spatch_path, exist_ok=True)

# Input locations inside the SPATCH directory.
transcript_path = os.path.join(spatch_path, "transcripts", "transcripts.parquet")
adata_path = os.path.join(spatch_path, "transcriptome", "adata.h5ad")
# Directory (not a file): the boundary CSV file names are joined onto it later.
seg_path = os.path.join(spatch_path, "segmentation_mask")

### Load and Sanitize AnnData

In [None]:
# Read the count matrix from disk.
adata = sc.read_h5ad(adata_path)

# Normalise the names of the .uns entries: spaces, ampersands and hyphens are
# each replaced by an underscore. The keys are snapshotted first because
# entries are removed and re-inserted while looping.
# (Presumably needed so the names are accepted when the table is written to
# Zarr — confirm.)
uns_keys = list(adata.uns.keys())
_uns_key_cleanup = str.maketrans({" ": "_", "&": "_", "-": "_"})
for old_name in uns_keys:
    clean_name = old_name.translate(_uns_key_cleanup)
    if clean_name == old_name:
        continue
    adata.uns[clean_name] = adata.uns.pop(old_name)

# Discard any table metadata stored under "spatialdata_attrs"
# (presumably left over from an earlier export, so that it can be regenerated
# when the table is parsed — confirm).
adata.uns.pop("spatialdata_attrs", None)

# Per-cell columns: a string copy of the observation index as the cell
# identifier, and a constant label naming the "cell_boundaries" region.
adata.obs["cell_id"] = adata.obs_names.astype(str)
adata.obs["region"] = "cell_boundaries"

### Load and Process Segmentation (CSV to Polygons)

In [None]:
def csv_to_gdf(csv_filename, seg_dir=None):
    """Build a SpatialData shapes element from a CSV of boundary vertices.

    The CSV is read with the column names ``cell_id``, ``x``, ``y`` and
    ``index`` assigned positionally. Because ``names=`` is given without
    ``header=``, pandas treats the first line as data, so the file is assumed
    to have no header row (TODO: confirm against the actual files).

    Rows are grouped by ``cell_id`` in order of first appearance, and the
    ``(x, y)`` pairs of each group, taken in file order, become the ring of one
    polygon. The ``index`` column is read but not used, so the rows are assumed
    to already be in vertex order. Groups with fewer than three rows cannot
    form a polygon and are dropped.

    Parameters
    ----------
    csv_filename : str
        Name of the CSV file, relative to ``seg_dir``.
    seg_dir : str, optional
        Directory containing the CSV file. Defaults to the notebook-level
        ``seg_path`` so existing single-argument calls keep working.

    Returns
    -------
    The result of ``ShapesModel.parse`` applied to a GeoDataFrame with one
    polygon per cell, indexed by ``cell_id`` converted to ``str`` and mapped to
    the ``"global"`` coordinate system with an identity transformation.
    """
    if seg_dir is None:
        # Fall back to the directory configured at the top of the notebook.
        seg_dir = seg_path

    csv_file = os.path.join(seg_dir, csv_filename)
    vertices = pd.read_csv(csv_file, names=["cell_id", "x", "y", "index"])

    # Select only the coordinate columns before apply(): the callback never
    # needs the grouping column, and applying over it is deprecated in
    # pandas >= 2.2.
    polygons = (
        vertices.groupby("cell_id", sort=False)[["x", "y"]]
        .apply(lambda g: Polygon(zip(g["x"], g["y"])) if len(g) >= 3 else None)
        .dropna()
    )

    gdf = gpd.GeoDataFrame({"geometry": polygons}, index=polygons.index)
    # String identifiers, matching the string cell ids used for the table.
    gdf.index = gdf.index.astype(str)
    return ShapesModel.parse(gdf, transformations={"global": Identity()})


# Convert both boundary files with the same helper: cells first, then nuclei.
cell_boundaries, nucleus_boundaries = map(
    csv_to_gdf, ("cell_boundaries.csv", "nucleus_boundaries.csv")
)

### Process Transcripts (Parquet to Points)

In [None]:
# Load the full transcript table into memory as a pandas DataFrame.
transcripts_df = pd.read_parquet(transcript_path)

# Wrap the table as a SpatialData points element:
#   * the x/y axes are taken from the "x_location"/"y_location" columns,
#   * "feature_name" is registered as the feature column,
#   * the points are placed in the "global" coordinate system unchanged.
points = PointsModel.parse(
    transcripts_df,
    coordinates=dict(x="x_location", y="y_location"),
    feature_key="feature_name",
    transformations={"global": Identity()},
)

### Final Table Parsing 


In [None]:
# Register the AnnData object as a SpatialData table annotating the
# "cell_boundaries" element: the "region" column of .obs names the annotated
# element and the "cell_id" column identifies each instance within it.
table = TableModel.parse(
    adata,
    region="cell_boundaries",
    region_key="region",
    instance_key="cell_id",
)

### Assemble SpatialData and Write to Zarr

In [None]:
# Bundle the two boundary layers, the transcript points and the annotation
# table into a single SpatialData object.
sdata = sd.SpatialData(
    shapes={
        "cell_boundaries": cell_boundaries,
        "nucleus_boundaries": nucleus_boundaries,
    },
    points={"transcripts": points},
    tables={"table": table},
)

# Persist the object as a Zarr store inside the SPATCH directory.
# overwrite=True lets a repeated run replace the previous export; without it
# write() raises because the store already exists, which breaks
# "Restart Kernel -> Run All" on every run after the first.
output_zarr = os.path.join(path, "spatch", "xenium_ovary_spatch.zarr")
sdata.write(output_zarr, overwrite=True)

### Data Acquisition: 10x Genomics

Download the spatial transcriptomics data for the **Fresh Frozen Human Ovarian Adenocarcinoma** (Xenium 5K Prime) from the [10x Genomics portal](https://www.10xgenomics.com/datasets/xenium-prime-fresh-frozen-human-ovary).


The dataset is provided as a single compressed bundle. The cell below downloads it and extracts the contents into a dedicated directory:

* **`10x/`**: Extracted output files (cell_feature_matrix, transcripts, etc.); the notebook reads them from `10x/Xenium_Prime_Human_Ovary_FF_outs/`.

In [None]:
# Create directory
!mkdir -p {os.path.join(path, '10x')}

# Download the bundle
!wget -O {os.path.join(path, '10x/outs.zip')} https://s3-us-west-2.amazonaws.com/10x.files/samples/xenium/3.0.0/Xenium_Prime_Human_Ovary_FF/Xenium_Prime_Human_Ovary_FF_outs.zip

# Unzip and clean up
!unzip {os.path.join(path, '10x/outs.zip')} -d {os.path.join(path, '10x/')}
!rm {os.path.join(path, '10x/outs.zip')}

### Write to Zarr

In [None]:
# Read the extracted Xenium output bundle into a SpatialData object.
sdata = sio.xenium(os.path.join(path, "10x/Xenium_Prime_Human_Ovary_FF_outs/"))

# Remove the "morphology_focus" element so it is not written to the Zarr store
# (presumably to keep the export small — confirm).
del sdata["morphology_focus"]

# overwrite=True lets a repeated run replace the previous export; without it
# write() raises because the store already exists.
sdata.write(os.path.join(path, "10x/xenium_ovary_10x.zarr"), overwrite=True)