# Open-ST Metastatic Lymph Node 3D

This notebook follows the same general processing pattern as `openST-mousehead.ipynb`, adapted for the metastatic lymph node 3D dataset at:

- `/Users/chrislangseth/Downloads/GSE251926_metastatic_lymph_node_3d.h5ad`

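It loads the `.h5ad`, prepares a counts layer, runs a standard Scanpy workflow (highly-variable-gene selection, PCA, neighbors, UMAP), trains an scVI model, and then clusters CellCharter neighborhood aggregates of the scVI latent space with Gaussian mixture models.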

In [3]:
import os
import warnings

import anndata as ad
import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import scvi
import squidpy as sq
import cellcharter as cc
from anndata.io import read_elem
from lightning.pytorch import seed_everything

# Single source of truth for the random seed so that Lightning and scvi-tools
# can never drift apart when the value is changed.
SEED = 12345
seed_everything(SEED)
scvi.settings.seed = SEED

Seed set to 12345
Seed set to 12345


In [4]:
# Input dataset. The default is the original local download location; set the
# OPENST_H5AD_PATH environment variable to point at another copy instead of
# editing the notebook.
H5AD_PATH = os.environ.get(
    "OPENST_H5AD_PATH",
    "/Users/chrislangseth/Downloads/GSE251926_metastatic_lymph_node_3d.h5ad",
)

def read_h5ad_robust(path):
    """Read an ``.h5ad`` file, with a manual fallback when the normal reader fails.

    ``ad.read_h5ad`` is attempted first. If it raises, a warning carrying the
    original error is emitted and the file is re-read slot by slot with
    ``h5py`` and ``read_elem``. The fallback special-cases a ``layers/raw``
    entry that is a group encoded as a complete AnnData object rather than a
    matrix: that group is kept out of ``layers``, rebuilt as its own AnnData
    (sharing the main object's ``obs``), attached as ``adata.raw`` and, when
    its shape matches the main object, its ``X`` is also stored as
    ``layers["raw"]``.

    Parameters
    ----------
    path
        Location of the ``.h5ad`` file.

    Returns
    -------
    anndata.AnnData
        The loaded object.

    Raises
    ------
    Exception
        Whatever the fallback read raises if it fails too; the error from
        ``ad.read_h5ad`` stays attached as the exception context.
    """
    try:
        return ad.read_h5ad(path)
    except Exception as err:
        # Surface the reason for the fallback instead of hiding it.
        warnings.warn(
            f"ad.read_h5ad failed ({type(err).__name__}: {err}); "
            "falling back to an element-wise h5py read.",
            stacklevel=2,
        )
        with h5py.File(path, "r") as f:
            kwargs = {}
            for key in ["X", "obs", "var", "uns", "obsm", "varm", "obsp", "varp"]:
                if key in f:
                    kwargs[key] = read_elem(f[key])

            # Set only when layers/raw is an AnnData-encoded group; such a
            # group cannot be used as a plain layer and is handled below.
            raw_group = None
            if "layers" in f:
                layers = {}
                for name, obj in f["layers"].items():
                    if (
                        name == "raw"
                        and isinstance(obj, h5py.Group)
                        and obj.attrs.get("encoding-type") == "anndata"
                    ):
                        raw_group = obj
                        continue
                    layers[name] = read_elem(obj)
                kwargs["layers"] = layers

            adata = ad.AnnData(**kwargs)

            if raw_group is not None:
                raw_varm = read_elem(raw_group["varm"]) if "varm" in raw_group else None
                raw_adata = ad.AnnData(
                    X=read_elem(raw_group["X"]),
                    obs=adata.obs.copy(),
                    var=read_elem(raw_group["var"]),
                    varm=raw_varm,
                )
                adata.raw = raw_adata
                # A layer must have the same shape as the main matrix; the raw
                # object may carry a different gene set, in which case it is
                # only reachable through adata.raw.
                if raw_adata.shape == adata.shape:
                    adata.layers["raw"] = raw_adata.X.copy()
                else:
                    warnings.warn(
                        f"Embedded raw AnnData has shape {raw_adata.shape}, which differs "
                        f"from the main object {adata.shape}; it is available as adata.raw "
                        "but was not added to adata.layers.",
                        stacklevel=2,
                    )

            return adata

adata = read_h5ad_robust(H5AD_PATH)
adata.var_names_make_unique()
adata.obs_names_make_unique()

# The whole file is treated as one sample; the column is categorical because it
# is later used as batch_key / library_key / sample_key.
adata.obs["sample_id"] = "metastatic_lymph_node_3d"
adata.obs["sample_id"] = adata.obs["sample_id"].astype("category")

# Build the "counts" layer from the first available source, in order of
# preference: the "raw" layer, then adata.raw, then X itself.
if "raw" in adata.layers:
    adata.layers["counts"] = adata.layers["raw"].copy()
elif adata.raw is not None:
    adata.layers["counts"] = adata.raw.X.copy()
else:
    adata.layers["counts"] = adata.X.copy()

# Fail early with an explicit message instead of an opaque KeyError: 'spatial'
# further down when the file carries no spatial coordinates at all.
if "spatial_3d_aligned" not in adata.obsm and "spatial" not in adata.obsm:
    raise KeyError(
        "adata.obsm has neither 'spatial_3d_aligned' nor 'spatial'; "
        f"available keys: {list(adata.obsm.keys())}"
    )

# Neighbor graphs prefer the 3D-aligned coordinates; plots prefer "spatial".
NEIGHBOR_SPATIAL_KEY = "spatial_3d_aligned" if "spatial_3d_aligned" in adata.obsm else "spatial"
PLOT_SPATIAL_KEY = "spatial" if "spatial" in adata.obsm else NEIGHBOR_SPATIAL_KEY
# Keep only the first two coordinate columns for 2D plotting.
adata.obsm["spatial_plot"] = np.asarray(adata.obsm[PLOT_SPATIAL_KEY])[:, :2].copy()

print(f"Neighbor spatial key: {NEIGHBOR_SPATIAL_KEY}")
print(f"Plot spatial key: {PLOT_SPATIAL_KEY}")
adata

Neighbor spatial key: spatial_3d_aligned
Plot spatial key: spatial


AnnData object with n_obs × n_vars = 1097769 × 28943
    obs: 'cell_ID_mask', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'n_reads', 'reads_per_counts', 'n_joined', 'exact_entropy', 'theoretical_entropy', 'exact_compression', 'theoretical_compression', 'n_counts', 'annotation', 'annotation_key', 'n_section', 'sample_id'
    var: 'n_cells_by_counts', 'mean_counts', 'total_counts', 'pct_dropout_by_counts'
    uns: 'log1p'
    obsm: 'spatial', 'spatial_3d_aligned', 'spatial_plot'
    layers: 'raw', 'counts'

In [7]:
# Sanity check on the scale of X. The recorded maximum (~6.3) together with the
# 'log1p' entry in uns suggests X is already log-transformed — TODO confirm;
# the count data used for scVI comes from the "counts" layer instead.
adata.X.max()

np.float32(6.2955923)

In [8]:
# Peek at the first rows of the per-cell metadata table.
adata.obs.head()

Unnamed: 0,cell_ID_mask,n_genes_by_counts,total_counts,total_counts_mt,pct_counts_mt,n_reads,reads_per_counts,n_joined,exact_entropy,theoretical_entropy,exact_compression,theoretical_compression,n_counts,annotation,annotation_key,n_section,sample_id
4_2434,2434,191,274.0,37.0,13.50365,748,2.729927,343,1.896968,1.909971,38,38,274.0,unknown,17.0,4,metastatic_lymph_node_3d
4_2727,2727,278,352.0,46.0,13.068181,1054,2.994318,247,1.905911,1.911943,38,38,352.0,Tumor,15.0,4,metastatic_lymph_node_3d
4_2771,2771,198,283.0,37.0,13.074204,756,2.671378,433,1.898245,1.909238,38,38,283.0,Tumor,15.0,4,metastatic_lymph_node_3d
4_2865,2865,326,433.0,54.0,12.471132,1328,3.066975,304,1.895395,1.907862,38,38,433.0,Plasma_IgG,13.0,4,metastatic_lymph_node_3d
4_2915,2915,256,346.0,48.0,13.872832,1026,2.965318,344,1.903663,1.909331,38,37,346.0,Tumor,15.0,4,metastatic_lymph_node_3d


In [9]:
# Display the AnnData summary before any feature selection is applied.
adata

AnnData object with n_obs × n_vars = 1097769 × 28943
    obs: 'cell_ID_mask', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'n_reads', 'reads_per_counts', 'n_joined', 'exact_entropy', 'theoretical_entropy', 'exact_compression', 'theoretical_compression', 'n_counts', 'annotation', 'annotation_key', 'n_section', 'sample_id'
    var: 'n_cells_by_counts', 'mean_counts', 'total_counts', 'pct_dropout_by_counts'
    uns: 'log1p'
    obsm: 'spatial', 'spatial_3d_aligned', 'spatial_plot'
    layers: 'raw', 'counts'

In [10]:
# Keep at most 2000 highly variable genes. With subset=True the other genes are
# dropped from adata in place, so everything downstream sees the reduced object.
n_top_genes = min(adata.n_vars, 2000)
sc.pp.highly_variable_genes(
    adata,
    flavor="seurat",
    n_top_genes=n_top_genes,
    subset=True,
)

adata

AnnData object with n_obs × n_vars = 1097769 × 2000
    obs: 'cell_ID_mask', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'n_reads', 'reads_per_counts', 'n_joined', 'exact_entropy', 'theoretical_entropy', 'exact_compression', 'theoretical_compression', 'n_counts', 'annotation', 'annotation_key', 'n_section', 'sample_id'
    var: 'n_cells_by_counts', 'mean_counts', 'total_counts', 'pct_dropout_by_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg'
    obsm: 'spatial', 'spatial_3d_aligned', 'spatial_plot'
    layers: 'raw', 'counts'

In [None]:
# Expression-space embedding: PCA, then a kNN graph on at most 30 components
# (fewer if PCA produced fewer), then UMAP on that graph.
sc.tl.pca(adata, svd_solver="arpack")
sc.pp.neighbors(
    adata,
    n_neighbors=15,
    n_pcs=min(adata.obsm["X_pca"].shape[1], 30),
)
sc.tl.umap(adata, min_dist=0.3)

In [None]:
# UMAP view, coloured by whichever of the annotation / section columns exist.
with plt.rc_context({"figure.figsize": (7, 5)}):
    sc.pl.umap(
        adata,
        color=[col for col in ("annotation", "n_section") if col in adata.obs.columns],
        wspace=0.4,
    )

# Same colouring on the 2D spatial coordinates stored in obsm["spatial_plot"].
with plt.rc_context({"figure.figsize": (8, 8)}):
    sc.pl.embedding(
        adata,
        basis="spatial_plot",
        color=[col for col in ("annotation", "n_section") if col in adata.obs.columns],
        s=2,
        frameon=False,
    )

In [None]:
# Register the "counts" layer as the model input, with sample_id as the batch
# covariate, then build the scVI model on the registered object.
scvi.model.SCVI.setup_anndata(adata, layer="counts", batch_key="sample_id")

model = scvi.model.SCVI(adata)

In [None]:
# Train for at most 30 epochs; early stopping may end the run sooner.
model.train(
    max_epochs=30,
    early_stopping=True,
    enable_progress_bar=True,
)

In [None]:
# Store the scVI latent space as float32 under obsm["X_scVI"].
adata.obsm["X_scVI"] = (
    model.get_latent_representation(adata).astype(np.float32)
)
adata

In [None]:
# Spatial kNN graph (6 neighbours, no Delaunay triangulation) built per
# sample_id on the coordinates selected as NEIGHBOR_SPATIAL_KEY.
sq.gr.spatial_neighbors(
    adata,
    spatial_key=NEIGHBOR_SPATIAL_KEY,
    library_key="sample_id",
    coord_type="generic",
    n_neighs=6,
    delaunay=False,
)

In [None]:
# Aggregate the scVI latent features over 3 layers of spatial neighbours and
# write the result to obsm["X_cellcharter"].
cc.gr.aggregate_neighbors(
    adata,
    n_layers=3,
    use_rep="X_scVI",
    out_key="X_cellcharter",
    sample_key="sample_id",
)

In [None]:
# Numbers of mixture components to sweep over: 5, 10, 15 and 20.
gmm_components = list(range(5, 21, 5))

In [None]:
from sklearn.mixture import GaussianMixture

# Fit one full-covariance Gaussian mixture per candidate component count on the
# CellCharter neighbourhood features, store the hard assignments in obs, and
# collect BIC/AIC for model comparison.
X = adata.obsm["X_cellcharter"]
gmm_results = []

for k in gmm_components:
    print(k)
    gmm = GaussianMixture(
        n_components=k,
        covariance_type="full",
        random_state=0,
        n_init=3,
    )
    labels = gmm.fit_predict(X)
    key = f"CellCharter_{k}"
    # Store labels as a categorical, like sample_id, rather than object-dtype
    # strings. Categories follow numeric label order ("2" before "10") and
    # include only the components that actually received cells.
    adata.obs[key] = pd.Categorical(
        labels.astype(str),
        categories=[str(label) for label in np.unique(labels)],
    )
    gmm_results.append({
        "k": k,
        "bic": gmm.bic(X),
        "aic": gmm.aic(X),
    })
    print(f"k={k}: {adata.obs[key].nunique()} clusters")

pd.DataFrame(gmm_results)

In [None]:
# Spatial map of the 15-component solution; "CellCharter_15" exists only when
# 15 is one of the component counts that were fitted.
with plt.rc_context({"figure.figsize": (8, 8)}):
    sc.pl.embedding(
        adata,
        basis="spatial_plot",
        color="CellCharter_15",
        s=2,
        frameon=False,
    )

In [None]:
# Write the processed object next to the input file, swapping the input's
# extension for ".processed.h5ad", so the output location follows H5AD_PATH
# instead of repeating a hard-coded absolute path.
OUTPUT_H5AD = os.path.splitext(H5AD_PATH)[0] + ".processed.h5ad"
adata.write_h5ad(OUTPUT_H5AD)
OUTPUT_H5AD