# Benchmark: Granule enrichment vs. extrasomatic baseline

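This notebook benchmarks whether detected RNA granules are enriched beyond the ambient extrasomatic baseline. Both metrics are computed as a **composition-based logFC** (the log2 ratio of a gene's share of transcripts in two compartments), so they are directly comparable. Granule markers that fall above the diagonal (or above the regression line fitted on non-marker genes) indicate bona fide structure rather than ambient RNA.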

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
from numpy.linalg import lstsq
from scipy.spatial import cKDTree
from scipy.stats import wilcoxon

import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides numerical warnings (e.g. divide-by-zero) raised
# by later cells — consider narrowing the filter if results look suspicious.
warnings.filterwarnings("ignore")
# Set scanpy's verbosity setting to 0.
sc.settings.verbosity = 0

In [None]:
# Dataset identifier; it is interpolated into the data and output directories below.
dataset = "MERSCOPE_WT_1"
# Directories are relative to the notebook's working directory and end with "/"
# so file names can be appended by plain string concatenation.
data_path = f"../../data/{dataset}/"
output_path = f"../../output/{dataset}/"
# Directory where this benchmark's result table is written (see the final cell).
benchmark_path = "../../output/benchmark/benchmark_diffusion/"

# Input files read by the loading cell.
transcripts_path = data_path + "processed_data/transcripts.parquet"
granules_path = output_path + "all_granules.parquet"
spots_path = data_path + "processed_data/spots.h5ad"

## 1. Load data and filter granules

In [None]:
# Load the three inputs and print their sizes / column names as a sanity check.
# Transcript table: later cells read the columns "target", "global_x",
# "global_y", "global_z" and "overlaps_nucleus" (overlaps_nucleus = 1 means in soma).
transcripts = pd.read_parquet(transcripts_path)
print(f"Transcripts: {len(transcripts)}")
print(f"Transcript columns: {list(transcripts.columns)}")

# Spot-level AnnData; later cells read spots.obs["global_x"] / ["global_y"] as spot centres.
spots = sc.read_h5ad(spots_path)
print(f"Spots: {spots.n_obs}")
print(f"Spot obs columns: {list(spots.obs.columns)}")

# Unfiltered granule table; filtering happens in the next cell.
granules = pd.read_parquet(granules_path)
print(f"All granules (raw): {len(granules)}")

In [None]:
# Granule filtering, step 1: keep granules with sphere_r <= 4.
# The remaining criteria (in_soma_ratio <= 0.1 and nc_ratio < 0.1) are applied
# further down in this cell, once nc_ratio has been computed.
granules = granules.loc[lambda df: df["sphere_r"] <= 4].copy().reset_index(drop=True)

# Negative-control probe names, read from the "Gene" column of negative_controls.csv.
nc_genes_df = pd.read_csv(data_path + "processed_data/negative_controls.csv")
nc_genes = nc_genes_df["Gene"].tolist()

def make_tree_3d(d1, d2, d3):
    """Build a cKDTree from three coordinate arrays.

    Each input is flattened, and the three flattened arrays become the
    first, second and third coordinate of the indexed points.

    Returns:
        scipy.spatial.cKDTree over the resulting (n_points, 3) array.
    """
    flat_axes = tuple(np.ravel(axis) for axis in (d1, d2, d3))
    coords = np.column_stack(flat_axes)
    return cKDTree(coords)

def compute_nc_ratio(granules_df, transcripts_df, nc_genes_list):
    """Return, per granule, the negative-control transcript count divided by granule size.

    A negative-control transcript is counted for a granule when it lies within
    ``sphere_r`` of the granule centre in 3-D.

    Args:
        granules_df: DataFrame with columns ``sphere_x``, ``sphere_y``,
            ``sphere_r``, ``size`` and either ``sphere_z`` (preferred) or
            ``layer_z``.
        transcripts_df: DataFrame with columns ``target``, ``global_x``,
            ``global_y`` and ``global_z``.
        nc_genes_list: Values of ``target`` that mark negative-control probes.

    Returns:
        1-D float ndarray with one entry per row of ``granules_df``. Entries
        are 0.0 where ``size`` is not strictly positive, and the whole array
        is zeros when no negative-control transcripts are present.
    """
    n_granules = len(granules_df)
    nc_transcripts = transcripts_df[transcripts_df["target"].isin(nc_genes_list)]
    if n_granules == 0 or nc_transcripts.empty:
        return np.zeros(n_granules)

    z_col = "sphere_z" if "sphere_z" in granules_df.columns else "layer_z"
    nc_xyz = nc_transcripts[["global_x", "global_y", "global_z"]].to_numpy(dtype=np.float64)
    tree = cKDTree(nc_xyz)

    centers = granules_df[["sphere_x", "sphere_y", z_col]].to_numpy(dtype=np.float64)
    radii = granules_df["sphere_r"].to_numpy(dtype=np.float64)
    sizes = granules_df["size"].to_numpy().astype(float)

    # One vectorised query: each centre is paired with its own radius and only
    # the neighbour count is returned, not the index lists.
    counts = tree.query_ball_point(centers, radii, return_length=True)

    # Divide only where size > 0; the other entries keep the 0.0 they start with,
    # so no division by zero is ever evaluated.
    nc_ratio = np.zeros(n_granules)
    np.divide(counts, sizes, out=nc_ratio, where=sizes > 0)
    return nc_ratio

# Granule filtering, steps 2 and 3: attach the negative-control ratio, then keep
# granules with in_soma_ratio <= 0.1 and nc_ratio < 0.1 (strict).
nc_ratio = compute_nc_ratio(granules, transcripts, nc_genes)
granules = granules.assign(nc_ratio=nc_ratio)

passes_filters = (granules["in_soma_ratio"] <= 0.1) & (granules["nc_ratio"] < 0.1)
granules = granules.loc[passes_filters].copy().reset_index(drop=True)

print(f"True RNA granules (after all filters): {len(granules)}")

## 2. Map transcripts to spots and build soma/extrasomatic pseudo counts

In [None]:
# Assign every transcript to a spot: spot i owns the half-open 50x50 square
# [sx-25, sx+25) x [sy-25, sy+25) centred on its (global_x, global_y).
# Transcripts outside every square keep spot_idx = -1. If squares overlap,
# the spot visited last wins because later assignments overwrite earlier ones.
GRID_LEN = 50
HALF_LEN = GRID_LEN / 2

spot_x = spots.obs["global_x"].values
spot_y = spots.obs["global_y"].values
tx_x = transcripts["global_x"].values
tx_y = transcripts["global_y"].values

spot_idx = np.full(len(transcripts), -1, dtype=np.int64)
for i, (sx, sy) in enumerate(zip(spot_x, spot_y)):
    within_x = (tx_x >= sx - HALF_LEN) & (tx_x < sx + HALF_LEN)
    within_y = (tx_y >= sy - HALF_LEN) & (tx_y < sy + HALF_LEN)
    spot_idx[within_x & within_y] = i

transcripts["spot_idx"] = spot_idx
n_assigned = (spot_idx >= 0).sum()
print(f"Transcripts in spot squares: {n_assigned} / {len(transcripts)}")

In [None]:
# Gene universe: unique non-null values of the first column of genes.csv.
genes_df = pd.read_csv(data_path + "processed_data/genes.csv")
gene_col = genes_df.columns[0]
genes_all = genes_df[gene_col].dropna().unique().tolist()
print(f"Genes from genes.csv: {len(genes_all)}")

# Column flagging somatic transcripts: 1 = in soma, anything else = extrasomatic.
in_soma_col = "overlaps_nucleus"
if in_soma_col not in transcripts.columns:
    raise KeyError(f"Transcripts must have '{in_soma_col}' column")

# Only transcripts that landed in a spot square contribute to the spot-level counts.
trans_in_spots = transcripts[transcripts["spot_idx"] >= 0].copy()
trans_in_spots = trans_in_spots.assign(
    in_soma=(trans_in_spots[in_soma_col] == 1).astype(int),
    extra=lambda df: 1 - df["in_soma"],
)

# Sum both 0/1 indicator columns per (spot, gene) in a single groupby, then pivot
# each into a spots x genes table whose columns follow genes_all (missing genes -> 0).
per_spot_gene = trans_in_spots.groupby(["spot_idx", "target"])[["in_soma", "extra"]].sum()
counts_soma = per_spot_gene["in_soma"].unstack(fill_value=0).reindex(columns=genes_all, fill_value=0)
counts_extra = per_spot_gene["extra"].unstack(fill_value=0).reindex(columns=genes_all, fill_value=0)

print(f"Genes: {len(genes_all)}, Spots with counts: {counts_soma.shape[0]}")

## 3. Baseline logFC (extrasomatic vs somatic)

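Composition-based: log2(frac_extra / frac_soma), where each fraction is a gene's share of all transcripts in that compartment (with a pseudocount of 0.5). The p-value comes from a spot-level paired Wilcoxon signed-rank test comparing extrasomatic and somatic counts.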

In [None]:
# Baseline: per-gene composition logFC of extrasomatic vs. somatic transcripts,
# plus a spot-level paired Wilcoxon p-value.
mask_soma = transcripts[in_soma_col] == 1
mask_extra = transcripts[in_soma_col] != 1
total_soma = mask_soma.sum()
total_extra = mask_extra.sum()

# Pseudocount added to both numerator and denominator of each fraction.
eps = 0.5

# Count transcripts per gene once per compartment instead of building a
# full-length boolean mask for every gene. Genes absent from a compartment get 0.
gene_counts_soma = transcripts.loc[mask_soma, "target"].value_counts().reindex(genes_all, fill_value=0)
gene_counts_extra = transcripts.loc[mask_extra, "target"].value_counts().reindex(genes_all, fill_value=0)
frac_soma_by_gene = (gene_counts_soma + eps) / (total_soma + eps)
frac_extra_by_gene = (gene_counts_extra + eps) / (total_extra + eps)
baseline_logFC = (np.log2(frac_extra_by_gene) - np.log2(frac_soma_by_gene)).tolist()

baseline_pval_paired = []  # keep spot-level p-value for significance
for g in genes_all:
    # The +1.0 offset on both sides cancels in the paired differences; it is
    # kept so the inputs to the test match earlier runs.
    s = counts_soma[g].values + 1.0
    e = counts_extra[g].values + 1.0
    try:
        _, p = wilcoxon(e, s, alternative="two-sided")
    except ValueError:
        # Degenerate input for the signed-rank test: treat as not significant.
        p = 1.0
    # NOTE(review): depending on the SciPy version, degenerate input may yield a
    # NaN p-value instead of raising — mapped to 1.0 as well; confirm this is desired.
    baseline_pval_paired.append(p if np.isfinite(p) else 1.0)

baseline_df = pd.DataFrame({
    "gene": genes_all,
    "baseline_logFC": baseline_logFC,
    "baseline_pval": baseline_pval_paired,
})
print(baseline_df.head(10))

## 4. Granule enrichment statistic

In [None]:
# Flag every transcript that lies inside at least one retained granule sphere.
# NOTE(review): the z coordinate is hard-coded to "layer_z" here, whereas
# compute_nc_ratio prefers "sphere_z" when that column exists — confirm which is intended.
z_col = "layer_z"
tx_xyz = transcripts[["global_x", "global_y", "global_z"]].to_numpy(dtype=np.float64)
tree_tx = cKDTree(tx_xyz)

gnl_centers = granules[["sphere_x", "sphere_y", z_col]].to_numpy(dtype=np.float64)
gnl_radii = granules["sphere_r"].to_numpy(dtype=np.float64)
in_granule = np.zeros(len(transcripts), dtype=np.int8)

for center, radius in zip(gnl_centers, gnl_radii):
    # The search radius is padded by 0.1 beyond sphere_r.
    hits = tree_tx.query_ball_point(center, radius + 0.1)
    if hits:
        in_granule[hits] = 1

transcripts["in_granule"] = in_granule
print(f"Transcripts in granules: {in_granule.sum()} / {len(transcripts)}")

In [None]:
# Granule enrichment: per-gene composition logFC of extrasomatic transcripts
# inside granules vs. extrasomatic transcripts outside granules.
mask_extra = transcripts[in_soma_col] != 1
mask_in_gnl_extra = (transcripts["in_granule"] == 1) & mask_extra
mask_non_gnl_extra = (transcripts["in_granule"] == 0) & mask_extra
total_tx_in_gnl_extra = mask_in_gnl_extra.sum()
total_tx_non_gnl_extra = mask_non_gnl_extra.sum()

# Pseudocount added to both numerator and denominator of each fraction.
eps = 0.5

# Count transcripts per gene once per compartment instead of building a
# full-length boolean mask for every gene. Genes absent from a compartment get 0.
in_gnl_by_gene = transcripts.loc[mask_in_gnl_extra, "target"].value_counts().reindex(genes_all, fill_value=0)
non_gnl_by_gene = transcripts.loc[mask_non_gnl_extra, "target"].value_counts().reindex(genes_all, fill_value=0)
frac_gnl_by_gene = (in_gnl_by_gene + eps) / (total_tx_in_gnl_extra + eps)
frac_non_by_gene = (non_gnl_by_gene + eps) / (total_tx_non_gnl_extra + eps)
granule_enrichment = (np.log2(frac_gnl_by_gene) - np.log2(frac_non_by_gene)).tolist()

enrichment_df = pd.DataFrame({"gene": genes_all, "granule_enrichment": granule_enrichment})

In [None]:
# Genes treated as granule markers in the comparison below.
granule_markers = [
    'Syn1', 'Cyfip2', 'Vamp2', 'Bsn', 'Stx1a', 'Map2', 'Nfasc', 'Slc17a7',
    'Gria1', 'Map1a', 'Ddn', 'Gap43', 'Mapt', 'Gphn', 'Homer2', 'Slc17a6',
    'Dlg3', 'Nlgn2', 'Gria2', 'Nlgn1', 'Nav1', 'Slc32a1', 'Tubb3', 'Dlg4',
    'Syt1', 'Camk2a', 'Nrxn1', 'Syp', 'Nlgn3', 'Cplx2', 'Ank3', 'Shank1',
    'Homer1', 'Shank3',
]

# One row per gene carrying both metrics; the marker subset is kept as a separate copy.
plot_df = pd.merge(baseline_df, enrichment_df, on="gene", how="inner")
is_marker = plot_df["gene"].isin(granule_markers)
plot_df_markers = plot_df.loc[is_marker].copy()

print(f"Genes in plot: {len(plot_df)}, Markers: {len(plot_df_markers)}")

### Alternative: granule enrichment vs soma

Set `USE_ALT_GRANULE_VS_SOMA = True` to use log2(frac_in_granules / frac_soma) as the granule enrichment instead of the default definition. Other options to consider: a minimum count filter or a larger pseudocount.

In [None]:
# Optional: alternative granule enrichment = "vs soma" (same reference as baseline)
# This often puts more markers above the diagonal: granules vs soma >= extrasomatic vs soma for true markers.
USE_ALT_GRANULE_VS_SOMA = False  # Set True to use this definition and replot
if USE_ALT_GRANULE_VS_SOMA:
    # Pseudocount added to both numerator and denominator of each fraction.
    eps = 0.5
    soma_mask = transcripts[in_soma_col] == 1
    # All in-granule transcripts are used here (no extrasomatic restriction).
    gnl_mask = transcripts["in_granule"] == 1
    total_soma = soma_mask.sum()
    total_in_gnl = gnl_mask.sum()

    # Count transcripts per gene once per compartment instead of building a
    # full-length boolean mask for every gene. Genes absent from a compartment get 0.
    soma_by_gene = transcripts.loc[soma_mask, "target"].value_counts().reindex(genes_all, fill_value=0)
    gnl_by_gene = transcripts.loc[gnl_mask, "target"].value_counts().reindex(genes_all, fill_value=0)
    frac_soma_by_gene = (soma_by_gene + eps) / (total_soma + eps)
    frac_gnl_by_gene = (gnl_by_gene + eps) / (total_in_gnl + eps)
    granule_enrichment_alt = (np.log2(frac_gnl_by_gene) - np.log2(frac_soma_by_gene)).tolist()

    # Assign by gene name rather than by position, so the values stay correct
    # even if plot_df's row order ever differs from genes_all.
    alt_by_gene = dict(zip(genes_all, granule_enrichment_alt))
    plot_df["granule_enrichment_alt"] = plot_df["gene"].map(alt_by_gene)
    plot_df["granule_enrichment"] = plot_df["granule_enrichment_alt"]
    print("Using alternative granule enrichment (vs soma). Re-run the next cell (Excess Δ and regression) and below.")

## 5. Excess statistic (Δ) and regression reference

Δ = granule_enrichment − baseline_logFC. Regression on non-markers yields a reference line; markers above it are enriched beyond what baseline predicts.

In [None]:
# Excess statistic: how far each gene's granule enrichment exceeds its baseline logFC.
plot_df["delta"] = plot_df["granule_enrichment"] - plot_df["baseline_logFC"]

# Fit granule_enrichment ~ slope * baseline_logFC + intercept by least squares,
# using non-marker genes only, so the line describes the background relationship.
non_marker = ~plot_df["gene"].isin(granule_markers)
background = plot_df.loc[non_marker]
X = background["baseline_logFC"].to_numpy()[:, np.newaxis]
y = background["granule_enrichment"].to_numpy()
# Design matrix: first column is baseline_logFC, second is the constant term.
X_aug = np.column_stack([X, np.ones(len(X))])
coef = lstsq(X_aug, y, rcond=None)[0]
reg_slope, reg_intercept = coef

# Evaluate the fitted line for every gene (markers included) and flag genes above it.
plot_df["expected_ge"] = reg_slope * plot_df["baseline_logFC"] + reg_intercept
plot_df["above_regression"] = plot_df["granule_enrichment"].gt(plot_df["expected_ge"])

# Rebuild the marker subset so it carries delta, expected_ge and above_regression.
plot_df_markers = plot_df.loc[plot_df["gene"].isin(granule_markers)].copy()

# Markers above the identity line (y = x) vs. markers above the fitted line.
above_diag = plot_df_markers["granule_enrichment"].gt(plot_df_markers["baseline_logFC"]).sum()
above_reg = plot_df_markers["above_regression"].sum()
print(f"Regression (non-markers): granule_enrichment = {reg_slope:.4f} * baseline_logFC + {reg_intercept:.4f}")
print(f"Granule markers above diagonal (y=x): {above_diag} / {len(plot_df_markers)}")
print(f"Granule markers above regression line: {above_reg} / {len(plot_df_markers)}")
plot_df.to_csv(benchmark_path + "benchmark_diffusion_df.csv", index=False)