# Benchmark sphere radius for profile()

Benchmark how the choice of sphere radius (used to gather transcripts and build granule expression profiles) affects per-granule metrics. Uses fine detection results exported as `granules.parquet`.

**Four radius settings:**
1. **Default**: granule-specific radius (as in mcDETECT `profile()`, no buffer)
2. **Fixed**: same radius for all granules = median of granule radii
3. **Expand**: current radius × 1.2 per granule
4. **Shrink**: current radius × 0.8 per granule

**Per-granule metrics (for each setting):**
- Number of transcripts per granule
- Number of unique genes per granule
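- Negative-control ratio (NC transcripts / granule-marker transcripts in sphere; reported as 0 when the sphere contains no granule-marker transcripts)
- In-soma ratio (granule-marker transcripts in sphere that overlap soma / all granule-marker transcripts in sphere; NaN when the sphere contains no granule-marker transcripts)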

In [None]:
import numpy as np
import os
import pandas as pd
from scipy.spatial import cKDTree

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Dataset to benchmark; the input and detection-output folders are derived from its name.
dataset = "MERSCOPE_WT_1"
data_path = f"../../data/{dataset}/"
output_path = f"../../output/{dataset}/"

# Benchmark tables go to `benchmark_path`; the per-setting h5ad files go to a
# dataset-specific sub-folder of it.
benchmark_path = "../../output/benchmark/benchmark_sphere/"
representative_dir = os.path.join(benchmark_path, f"{dataset}_representative_data")

# Create both output folders up front (no error if they already exist).
for _out_dir in (benchmark_path, representative_dir):
    os.makedirs(_out_dir, exist_ok=True)

In [None]:
# Fine detection results (with filtering applied): one row per granule.
granules = pd.read_parquet(os.path.join(output_path, "granules.parquet"))
print(f"Granules: {len(granules)}")

# Transcript table used to fill the spheres.
transcripts = pd.read_parquet(os.path.join(data_path, "processed_data/transcripts.parquet"))
if "gene" in transcripts and "target" not in transcripts:
    # Downstream code reads the gene name from "target"; fall back to "gene".
    transcripts["target"] = transcripts["gene"]

# Gene panel = first column of genes.csv.
genes = list(pd.read_csv(os.path.join(data_path, "processed_data/genes.csv")).iloc[:, 0])

# Negative-control probes = "Gene" column of negative_controls.csv.
nc_genes = list(pd.read_csv(os.path.join(data_path, "processed_data/negative_controls.csv"))["Gene"])

# Granule-marker genes used as the denominator of the NC and in-soma ratios.
gnl_genes = [
    "Camk2a", "Cplx2", "Slc17a7", "Ddn", "Syp",
    "Map1a", "Shank1", "Syn1", "Gria1", "Gria2",
    "Cyfip2", "Vamp2", "Bsn", "Slc32a1", "Nfasc",
    "Syt1", "Tubb3", "Nav1", "Shank3", "Mapt",
]

# Column aliases: only filled in when the expected column is missing and the
# alternative one is present.
if "overlaps_nucleus_5_dilation" in transcripts and "overlaps_nucleus" not in transcripts:
    transcripts["overlaps_nucleus"] = transcripts["overlaps_nucleus_5_dilation"]
if "sphere_z" in granules and "layer_z" not in granules:
    granules["layer_z"] = granules["sphere_z"]

## Helper functions

Each sphere is centered at `(sphere_x, sphere_y, layer_z)` to match mcDETECT's `profile()`.

In [None]:
def make_tree_3d(d1, d2, d3):
    """Return a cKDTree over the 3D points ``(d1[k], d2[k], d3[k])``.

    Each coordinate input is flattened before the three are stacked as columns.
    """
    xyz = np.column_stack((np.ravel(d1), np.ravel(d2), np.ravel(d3)))
    return cKDTree(xyz)


def get_radii(granules, setting):
    """Return one sphere radius per granule for the requested benchmark setting.

    Radii are derived from the ``sphere_r`` column of `granules`:

    - ``"default"``: a copy of ``sphere_r``
    - ``"fixed"``: the median of ``sphere_r``, repeated for every granule
    - ``"expand"``: ``sphere_r * 1.2``
    - ``"shrink"``: ``sphere_r * 0.8``

    Raises ValueError for any other `setting`.
    """
    base = granules["sphere_r"].to_numpy()

    # Each entry is evaluated lazily so only the requested variant is computed.
    builders = {
        "default": lambda: base.copy(),
        "fixed": lambda: np.full(len(granules), np.median(base)),
        "expand": lambda: base * 1.2,
        "shrink": lambda: base * 0.8,
    }
    try:
        build = builders[setting]
    except (KeyError, TypeError):
        # TypeError covers unhashable values, which are unknown settings as well.
        raise ValueError(f"Unknown setting: {setting}") from None
    return build()


def compute_per_granule_metrics(granules, transcripts, radii, gnl_genes, nc_genes):
    """
    Compute per-granule transcript metrics inside a sphere around each granule.

    Granule ``i`` is the sphere centered at ``(sphere_x, sphere_y, layer_z)`` of row ``i``
    of `granules` with radius ``radii[i]`` (matched by position, not by index label).

    Parameters
    ----------
    granules : DataFrame with columns ``sphere_x``, ``sphere_y``, ``layer_z``.
    transcripts : DataFrame with columns ``global_x``, ``global_y``, ``global_z``,
        ``target`` and, optionally, ``overlaps_nucleus`` (treated as all zeros when absent).
    radii : sequence of radii, one per granule, in the row order of `granules`.
    gnl_genes : granule-marker gene names.
    nc_genes : negative-control gene names.

    Returns
    -------
    DataFrame with one row per granule and the columns

    - ``n_transcripts``: number of transcripts in the sphere
    - ``n_unique_genes``: number of distinct ``target`` values in the sphere
    - ``nc_ratio``: (NC transcript count) / (granule-marker transcript count);
      0.0 when the sphere holds no granule-marker transcript
    - ``in_soma_ratio``: (sum of ``overlaps_nucleus`` over granule-marker transcripts)
      / (granule-marker transcript count); NaN when that count is zero
    """
    tree = make_tree_3d(
        transcripts["global_x"].to_numpy(),
        transcripts["global_y"].to_numpy(),
        transcripts["global_z"].to_numpy(),
    )
    centers = granules[["sphere_x", "sphere_y", "layer_z"]].to_numpy()

    # Force positional indexing: `radii[i]` on a pandas Series is a label lookup,
    # which fails or pairs a granule with the wrong radius when the index is not 0..n-1.
    radii = np.asarray(radii)

    target_arr = transcripts["target"].to_numpy()
    if "overlaps_nucleus" in transcripts.columns:
        overlaps_arr = transcripts["overlaps_nucleus"].to_numpy()
    else:
        overlaps_arr = np.zeros(len(transcripts))

    # Gene-set membership does not depend on the sphere, so classify every
    # transcript once instead of re-running np.isin for each granule.
    is_gnl = np.isin(target_arr, gnl_genes)
    is_nc = np.isin(target_arr, nc_genes)

    n_transcripts = []
    n_unique_genes = []
    nc_ratios = []
    in_soma_ratios = []

    for i, center in enumerate(centers):
        # query_ball_point returns positions into the arrays the tree was built from.
        idx = tree.query_ball_point(center, radii[i])
        if not idx:
            n_transcripts.append(0)
            n_unique_genes.append(0)
            nc_ratios.append(0.0)
            in_soma_ratios.append(np.nan)
            continue

        idx = np.asarray(idx)
        n_transcripts.append(len(idx))
        n_unique_genes.append(len(np.unique(target_arr[idx])))

        gnl_in_sphere = is_gnl[idx]
        gnl_count = gnl_in_sphere.sum()
        nc_count = is_nc[idx].sum()

        if gnl_count > 0:
            nc_ratios.append(nc_count / gnl_count)
            in_soma = overlaps_arr[idx[gnl_in_sphere]].sum()
            in_soma_ratios.append(in_soma / gnl_count)
        else:
            # No marker transcript: both ratios have a zero denominator.
            nc_ratios.append(0.0)
            in_soma_ratios.append(np.nan)

    return pd.DataFrame({
        "n_transcripts": n_transcripts,
        "n_unique_genes": n_unique_genes,
        "nc_ratio": nc_ratios,
        "in_soma_ratio": in_soma_ratios,
    })

In [None]:
# Evaluate the per-granule metrics once per radius setting.
# `results` maps the setting name to its metrics table.
settings = ["default", "fixed", "expand", "shrink"]
results = {}

for s in settings:
    results[s] = compute_per_granule_metrics(
        granules,
        transcripts,
        get_radii(granules, s),
        gnl_genes,
        nc_genes,
    )
    print(f"Done: {s}")

## Summary: per-setting aggregates

For each radius setting, report (across granules): mean and median of n_transcripts, n_unique_genes, nc_ratio, and in_soma_ratio.

In [None]:
def summary_row(name, df):
    """Return one summary record (a dict) for the radius setting `name`.

    The record starts with the setting label and is followed, for each metric
    column of `df`, by its mean and then its median across granules.
    """
    # metric column -> (key for the mean, key for the median), in output order
    stat_keys = {
        "n_transcripts": ("mean_n_transcripts", "median_n_transcripts"),
        "n_unique_genes": ("mean_n_unique_genes", "median_n_unique_genes"),
        "nc_ratio": ("mean_nc_ratio", "median_nc_ratio"),
        "in_soma_ratio": ("mean_in_soma_ratio", "median_in_soma_ratio"),
    }

    row = {"setting": name}
    for metric, (mean_key, median_key) in stat_keys.items():
        values = df[metric]
        row[mean_key] = values.mean()
        row[median_key] = values.median()
    return row

# One summary record per setting, in the order given by `settings`.
summary_rows = [summary_row(name, results[name]) for name in settings]
summary_df = pd.DataFrame(summary_rows)
# Bare name on the last line so the notebook renders the table as the cell output.
summary_df

In [None]:
# Persist the aggregate table, then one per-granule table per setting; each
# per-granule table gets a trailing "setting" column naming its radius setting.
summary_df.to_csv(os.path.join(benchmark_path, "benchmark_sphere_summary.csv"), index=False)
for s in settings:
    out = results[s].assign(setting=s)  # assign() returns a new frame; results[s] is untouched
    out.to_csv(os.path.join(benchmark_path, f"benchmark_sphere_metrics_{s}.csv"), index=False)
print("Saved benchmark_sphere_summary.csv and benchmark_sphere_metrics_<setting>.csv")

## Gene expression similarity within spheres across settings

For each gene, each setting yields an expression vector across the same spheres (granules). We compute the Pearson correlation between the default setting and each of the other three (fixed, expand, shrink) per gene, then plot a correlation heatmap: rows = genes, columns = default vs fixed / default vs expand / default vs shrink.

In [None]:
from collections import Counter

def build_expression_matrix(granules, transcripts, radii, genes):
    """Count the transcripts of each gene inside each granule's sphere.

    Granule ``i`` is the sphere centered at ``(sphere_x, sphere_y, layer_z)`` of row ``i``
    of `granules` with radius ``radii[i]`` (matched by position, not by index label).
    Only transcripts whose ``target`` is listed in `genes` are counted.
    Intended to mirror the counting done by mcDETECT's ``profile()``.

    Parameters
    ----------
    granules : DataFrame with columns ``sphere_x``, ``sphere_y``, ``layer_z``.
    transcripts : DataFrame with columns ``global_x``, ``global_y``, ``global_z``, ``target``.
    radii : sequence of radii, one per granule, in the row order of `granules`.
    genes : list of gene names; column ``j`` of the result corresponds to ``genes[j]``.

    Returns
    -------
    Dense ``float32`` array of shape ``(len(granules), len(genes))``. It is all zeros
    when no transcript matches `genes`.
    """
    n_gnl, n_gene = len(granules), len(genes)
    # Allocated first so the "no matching transcript" path returns the same dtype
    # as the normal path.
    X = np.zeros((n_gnl, n_gene), dtype=np.float32)

    # Boolean indexing already yields a new frame, so no extra copy is needed.
    trans = transcripts[transcripts["target"].isin(genes)]
    if trans.shape[0] == 0:
        return X

    tree = make_tree_3d(
        trans["global_x"].to_numpy(),
        trans["global_y"].to_numpy(),
        trans["global_z"].to_numpy(),
    )

    # Translate every kept transcript to its output column once, so each sphere
    # only needs an integer histogram. If a gene is listed twice, the last
    # position wins.
    gene_to_idx = {g: i for i, g in enumerate(genes)}
    gene_codes = trans["target"].map(gene_to_idx).to_numpy(dtype=np.intp)

    centers = granules[["sphere_x", "sphere_y", "layer_z"]].to_numpy()
    # Force positional indexing: `radii[i]` on a pandas Series is a label lookup.
    radii = np.asarray(radii)

    for i, center in enumerate(centers):
        # Positions are relative to `trans`, the frame the tree was built from.
        idx = tree.query_ball_point(center, radii[i])
        if idx:
            X[i] = np.bincount(gene_codes[idx], minlength=n_gene)
    return X

In [None]:
# One (n_granules x n_genes) expression matrix per radius setting, built in the
# order default, fixed, expand, shrink.
print("Building expression matrices...")
X_default, X_fixed, X_expand, X_shrink = (
    build_expression_matrix(granules, transcripts, get_radii(granules, setting), genes)
    for setting in ("default", "fixed", "expand", "shrink")
)
print("Done.")

In [None]:
# Write granule expression profiles (with metadata) as h5ad for each radius setting
import anndata

def expression_to_adata(X, granules, genes, setting_name):
    """Wrap an expression matrix and its granule metadata in an AnnData object.

    - ``X``: `X` cast to float32 (rows = granules, columns = genes).
    - ``obs``: a copy of `granules` plus a ``granule_id`` column (``gnl_0``, ``gnl_1``, ...),
      with ``sphere_x`` / ``sphere_y`` / ``sphere_z`` renamed to
      ``global_x`` / ``global_y`` / ``global_z``, and a ``radius_setting`` column
      holding `setting_name`.
    - ``var``: a ``genes`` column and var names taken from `genes` (made unique).
    """
    granule_ids = [f"gnl_{i}" for i in range(len(granules))]

    # Work on a copy so the caller's granule table is left unchanged.
    obs = granules.copy()
    obs["granule_id"] = granule_ids
    obs = obs.astype({"granule_id": str}).rename(
        columns={"sphere_x": "global_x", "sphere_y": "global_y", "sphere_z": "global_z"}
    )

    adata = anndata.AnnData(X=X.astype(np.float32), obs=obs)
    adata.var["genes"] = genes
    adata.var_names = genes
    adata.var_names_make_unique()
    adata.obs["radius_setting"] = setting_name
    return adata

# Export one h5ad file per radius setting into `representative_dir`.
for name, X in zip(
    ("default", "fixed", "expand", "shrink"),
    (X_default, X_fixed, X_expand, X_shrink),
):
    out_path = os.path.join(representative_dir, f"granules_expression_{name}.h5ad")
    adata = expression_to_adata(X, granules, genes, name)
    adata.write_h5ad(out_path)
    print(f"Wrote {out_path} ({adata.n_obs} granules x {adata.n_vars} genes)")

In [None]:
# Per-gene Pearson correlation between the default setting and each other setting.
# For a given gene, the two vectors are its counts across the same spheres.
n_genes = len(genes)


def _corr_with_default(X_other):
    """Return, for every gene, Pearson r between its column in X_default and in X_other."""
    per_gene = np.array(
        [np.corrcoef(X_default[:, g], X_other[:, g])[0, 1] for g in range(n_genes)]
    )
    # np.corrcoef yields NaN when a vector is constant; such entries are reported as 1.0.
    # NOTE(review): this also assigns 1.0 when only one of the two vectors is constant —
    # confirm that is the intended convention.
    return np.nan_to_num(per_gene, nan=1.0, posinf=1.0, neginf=-1.0)


corr_vs_fixed = _corr_with_default(X_fixed)
corr_vs_expand = _corr_with_default(X_expand)
corr_vs_shrink = _corr_with_default(X_shrink)

# Rows = genes, columns = the three comparisons -> shape (n_genes, 3)
corr_matrix = np.column_stack([corr_vs_fixed, corr_vs_expand, corr_vs_shrink])
corr_df = pd.DataFrame(
    corr_matrix,
    index=genes,
    columns=["default vs fixed", "default vs expand", "default vs shrink"],
)
corr_df.to_csv(os.path.join(benchmark_path, "benchmark_sphere_expression_correlation.csv"))
corr_df.head(10)

In [None]:
# import matplotlib.pyplot as plt
# import seaborn as sns

# # fig, ax = plt.subplots(figsize=(4, max(6, len(genes) * 0.04)))
# # sns.heatmap(corr_df, ax=ax, cmap="RdYlBu_r", vmin=0, vmax=1, cbar_kws={"label": "Pearson r"})
# # ax.set_xlabel("Default vs other setting")
# # ax.set_ylabel("Gene")
# # plt.tight_layout()
# # plt.savefig(benchmark_path + "benchmark_sphere_expression_correlation_heatmap.jpeg", dpi=500, bbox_inches="tight")
# # plt.close()

# fig, ax = plt.subplots(figsize=(15, 5))
# hm = sns.heatmap(corr_df.T, ax=ax, cmap="RdYlBu_r", vmin=0, vmax=1, cbar_kws={ "orientation": "horizontal", "label": "Pearson r", "pad": 0.25, "shrink": 0.5, "aspect": 30})
# ax.set_xlabel(" ")
# ax.set_ylabel(" ")
# # Ensure colorbar label & ticks are below
# cbar = hm.collections[0].colorbar
# cbar.ax.xaxis.set_label_position("bottom")
# cbar.ax.xaxis.tick_bottom()
# plt.tight_layout()
# plt.savefig(benchmark_path + "benchmark_sphere_expression_correlation_heatmap.jpeg", dpi=500, bbox_inches="tight")
# plt.close()

## Gene-by-gene correlation heatmaps (default vs others)

Three heatmaps: rows = genes (expression from **default** setting across spheres), columns = genes (expression from **other** setting). Entry (i, j) = Pearson r between default gene *i* and other-setting gene *j*. One heatmap each for default vs fixed, default vs expand, default vs shrink.

In [None]:
# # Gene-by-gene correlation: rows = default (gene i), columns = other setting (gene j); entry = corr(default_i, other_j)
# # Vectorized: stack [X_default, X_other], then corrcoef(.T) gives block matrix; top-right block = default vs other
# def gene_gene_corr_default_vs_other(X_default, X_other, genes):
#     n = len(genes)
#     M = np.hstack([X_default, X_other])  # (n_granules, 2*n_genes)
#     R = np.corrcoef(M.T)                  # (2*n_genes, 2*n_genes)
#     C = R[:n, n:]                         # (n_genes, n_genes): default rows, other cols
#     C = np.nan_to_num(C, nan=0.0, posinf=1.0, neginf=-1.0)
#     return pd.DataFrame(C, index=genes, columns=genes)

# gene_corr_default_fixed  = gene_gene_corr_default_vs_other(X_default, X_fixed, genes)
# gene_corr_default_expand = gene_gene_corr_default_vs_other(X_default, X_expand, genes)
# gene_corr_default_shrink = gene_gene_corr_default_vs_other(X_default, X_shrink, genes)

# for corr_df, title, suffix in [
#     (gene_corr_default_fixed, "default vs fixed", "default_vs_fixed"),
#     (gene_corr_default_expand, "default vs expand", "default_vs_expand"),
#     (gene_corr_default_shrink, "default vs shrink", "default_vs_shrink"),
# ]:
#     fig, ax = plt.subplots(figsize=(12, 12))
#     sns.heatmap(corr_df, ax=ax, cmap="RdYlBu_r", vmin=-1, vmax=1, center=0,
#                 square=True, cbar_kws={"label": "Pearson r"})
#     ax.set_title(title)
#     ax.set_xlabel("Gene (other setting)")
#     ax.set_ylabel("Gene (default)")
#     plt.tight_layout()
#     plt.savefig(benchmark_path + f"benchmark_sphere_gene_gene_correlation_heatmap_{suffix}.jpeg", dpi=300, bbox_inches="tight")
#     plt.close()