# Benchmark sphere radius for profile()

Benchmark how the choice of sphere radius (used to gather transcripts and build granule expression profiles) affects per-granule metrics. Uses fine detection results exported as `granules.parquet`.

**Four radius settings:**
1. **Default**: granule-specific radius (as in mcDETECT `profile()`, no buffer)
2. **Fixed**: same radius for all granules = median of granule radii
3. **Expand**: current radius × 1.2 per granule
4. **Shrink**: current radius × 0.8 per granule

**Per-granule metrics (for each setting):**
- Number of transcripts per granule
- Number of unique genes per granule
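- Negative-control ratio (NC transcripts / granule-marker transcripts in sphere; reported as 0 when the sphere contains no granule-marker transcripts)
- In-soma ratio (granule-marker transcripts in sphere that overlap soma / all granule-marker transcripts in sphere; NaN when the sphere contains no granule-marker transcripts)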

In [None]:
import numpy as np
import os
import pandas as pd
from scipy.spatial import cKDTree

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Dataset identifier; it selects the per-dataset input and output folders below.
dataset = "MERSCOPE_WT_1"

# Every path keeps its trailing slash because file names are appended with `+`.
benchmark_path = "../../output/benchmark/"
output_path = f"../../output/{dataset}/"
data_path = f"../../data/{dataset}/"

# The benchmark CSVs and figure are written here, so make sure the folder exists.
os.makedirs(benchmark_path, exist_ok=True)

In [None]:
def _alias_column(frame, wanted, fallback):
    """Copy column `fallback` into `wanted` when `wanted` is absent and `fallback` exists."""
    if wanted not in frame.columns and fallback in frame.columns:
        frame[wanted] = frame[fallback]


# Fine detection results (with filtering applied)
granules = pd.read_parquet(output_path + "granules.parquet")
print(f"Granules: {len(granules)}")

transcripts = pd.read_parquet(data_path + "processed_data/transcripts.parquet")

# Negative-control names come from the `Gene` column of the CSV; the granule-marker
# panel is listed explicitly.
nc_table = pd.read_csv(data_path + "processed_data/negative_controls.csv")
nc_genes = list(nc_table["Gene"])
gnl_genes = [
    "Camk2a", "Cplx2", "Slc17a7", "Ddn", "Syp",
    "Map1a", "Shank1", "Syn1", "Gria1", "Gria2",
    "Cyfip2", "Vamp2", "Bsn", "Slc32a1", "Nfasc",
    "Syt1", "Tubb3", "Nav1", "Shank3", "Mapt",
]

# The metric code reads `target`, `overlaps_nucleus` and `layer_z`; when a table
# only carries the alternative column name, expose it under the expected one.
_alias_column(transcripts, "target", "gene")
_alias_column(transcripts, "overlaps_nucleus", "overlaps_nucleus_5_dilation")
_alias_column(granules, "layer_z", "sphere_z")

## Helper functions

Each sphere is centered at `(sphere_x, sphere_y, layer_z)`, matching mcDETECT's `profile()`.

In [None]:
def make_tree_3d(d1, d2, d3):
    """Return a cKDTree over the points (d1[i], d2[i], d3[i]).

    Each input is flattened first, so the tree holds one 3D point per element
    and point k corresponds to the k-th flattened element of the inputs.
    """
    flat_axes = (np.ravel(d1), np.ravel(d2), np.ravel(d3))
    # Stack the three 1D coordinate arrays as the columns of an (n, 3) array.
    xyz = np.column_stack(flat_axes)
    return cKDTree(xyz)


def get_radii(granules, setting):
    """Return one sphere radius per granule for the requested benchmark setting.

    setting:
      - 'default': the granule's own `sphere_r` (returned as a copy)
      - 'fixed':   the median of `sphere_r`, repeated for every granule
      - 'expand':  `sphere_r` * 1.2
      - 'shrink':  `sphere_r` * 0.8
    Any other value raises ValueError.
    """
    base_radii = granules["sphere_r"].to_numpy()

    if setting == "default":
        # Copy so callers cannot modify the granule table through the result.
        radii = base_radii.copy()
    elif setting == "fixed":
        radii = np.full(len(granules), np.median(base_radii))
    elif setting in ("expand", "shrink"):
        factor = 1.2 if setting == "expand" else 0.8
        radii = base_radii * factor
    else:
        raise ValueError(f"Unknown setting: {setting}")
    return radii


def compute_per_granule_metrics(granules, transcripts, radii, gnl_genes, nc_genes, tree=None):
    """
    Compute per-granule transcript metrics inside a sphere around each granule.

    Granule i gets a sphere centered at (sphere_x, sphere_y, layer_z) with radius
    radii[i]; transcripts are located by (global_x, global_y, global_z).

    Parameters
    ----------
    granules : DataFrame with columns sphere_x, sphere_y, layer_z.
    transcripts : DataFrame with columns global_x, global_y, global_z, target and,
        optionally, overlaps_nucleus.
    radii : 1D array-like with one radius per granule, matched to granules by
        position (a pandas Series is used positionally, its index is ignored).
    gnl_genes : granule-marker gene names.
    nc_genes : negative-control gene names.
    tree : optional cKDTree built from the transcript coordinates in the row order
        of `transcripts`. Pass it to reuse one tree across several calls (e.g. one
        per radius setting); when None the tree is built here.

    Returns
    -------
    DataFrame with one row per granule (default integer index, same row order as
    `granules`) and columns:
      n_transcripts  - number of transcripts in the sphere
      n_unique_genes - number of distinct `target` values in the sphere
      nc_ratio       - NC transcript count / granule-marker transcript count;
                       0.0 when the sphere holds no granule-marker transcripts
      in_soma_ratio  - sum of overlaps_nucleus over granule-marker transcripts in
                       the sphere / their count; NaN when there are none. If
                       `transcripts` has no overlaps_nucleus column, every
                       transcript is treated as not overlapping (ratio 0).

    Raises
    ------
    ValueError
        If `radii` is not 1D with one entry per granule, or if a supplied `tree`
        does not hold exactly len(transcripts) points.
    """
    # Plain ndarray => positional indexing even when a pandas Series is passed.
    radii = np.asarray(radii)
    if radii.ndim != 1 or len(radii) != len(granules):
        raise ValueError(
            f"radii must be 1D with one entry per granule: got shape {radii.shape} "
            f"for {len(granules)} granules"
        )

    if tree is None:
        tree = make_tree_3d(
            transcripts["global_x"].to_numpy(),
            transcripts["global_y"].to_numpy(),
            transcripts["global_z"].to_numpy(),
        )
    elif tree.n != len(transcripts):
        # Tree hits are row positions into `transcripts`, so the sizes must agree.
        raise ValueError(
            f"tree holds {tree.n} points but transcripts has {len(transcripts)} rows"
        )

    centers = granules[["sphere_x", "sphere_y", "layer_z"]].to_numpy()
    target_arr = transcripts["target"].to_numpy()
    if "overlaps_nucleus" in transcripts.columns:
        overlaps_arr = transcripts["overlaps_nucleus"].to_numpy()
    else:
        overlaps_arr = np.zeros(len(transcripts))

    # Loop-invariant: convert the gene lists to arrays once, not once per granule.
    gnl_gene_arr = np.asarray(gnl_genes)
    nc_gene_arr = np.asarray(nc_genes)

    n_transcripts = []
    n_unique_genes = []
    nc_ratios = []
    in_soma_ratios = []

    for center, radius in zip(centers, radii):
        # query_ball_point returns row positions (not index labels) of the hits.
        hits = tree.query_ball_point(center, radius)
        if not hits:
            n_transcripts.append(0)
            n_unique_genes.append(0)
            nc_ratios.append(0.0)
            in_soma_ratios.append(np.nan)
            continue

        hits = np.asarray(hits)
        genes_in_sphere = target_arr[hits]  # subset once, reused by every metric
        n_transcripts.append(len(hits))
        n_unique_genes.append(len(np.unique(genes_in_sphere)))

        is_gnl = np.isin(genes_in_sphere, gnl_gene_arr)
        gnl_count = int(is_gnl.sum())
        nc_count = int(np.isin(genes_in_sphere, nc_gene_arr).sum())

        if gnl_count == 0:
            # No marker transcripts: both ratios have a zero denominator.
            nc_ratios.append(0.0)
            in_soma_ratios.append(np.nan)
        else:
            nc_ratios.append(nc_count / gnl_count)
            in_soma = overlaps_arr[hits[is_gnl]].sum()
            in_soma_ratios.append(in_soma / gnl_count)

    return pd.DataFrame({
        "n_transcripts": n_transcripts,
        "n_unique_genes": n_unique_genes,
        "nc_ratio": nc_ratios,
        "in_soma_ratio": in_soma_ratios,
    })

In [None]:
# Evaluate the per-granule metrics once for each of the four radius settings;
# `results` maps the setting name to its metrics DataFrame.
settings = ["default", "fixed", "expand", "shrink"]
results = {}

for setting in settings:
    results[setting] = compute_per_granule_metrics(
        granules,
        transcripts,
        get_radii(granules, setting),
        gnl_genes,
        nc_genes,
    )
    print(f"Done: {setting}")

## Summary: per-setting aggregates

For each radius setting, report (across granules): mean and median of n_transcripts, n_unique_genes, nc_ratio, and in_soma_ratio.

In [None]:
def summary_row(name, df):
    """Return one summary record for a radius setting.

    The record holds `setting` (= name) followed by `mean_<metric>` and
    `median_<metric>` for each of n_transcripts, n_unique_genes, nc_ratio and
    in_soma_ratio, computed over the rows of `df`.
    """
    row = {"setting": name}
    for metric in ("n_transcripts", "n_unique_genes", "nc_ratio", "in_soma_ratio"):
        column = df[metric]
        # Insertion order gives mean then median per metric, i.e. the column order.
        row[f"mean_{metric}"] = column.mean()
        row[f"median_{metric}"] = column.median()
    return row

# Build one summary record per radius setting, keeping the order of `settings`.
summary_rows = [summary_row(name, results[name]) for name in settings]
summary_df = pd.DataFrame(data=summary_rows)
summary_df  # last expression of the cell, so the notebook displays the table

In [None]:
# Write the summary table, then one per-granule metrics file per radius setting.
summary_csv = benchmark_path + "benchmark_sphere_summary.csv"
summary_df.to_csv(summary_csv, index=False)

for s in settings:
    # `assign` returns a new frame with the extra `setting` column, so the
    # DataFrame stored in `results[s]` is left unchanged.
    out = results[s].assign(setting=s)
    out.to_csv(benchmark_path + f"benchmark_sphere_metrics_{s}.csv", index=False)

print("Saved benchmark_sphere_summary.csv and benchmark_sphere_metrics_<setting>.csv")

## Optional: compare distributions by setting

Per-granule metrics by radius setting (boxplots).

In [None]:
import matplotlib.pyplot as plt

# One panel per metric; each panel holds one box per radius setting.
fig, axes = plt.subplots(2, 2, figsize=(10, 8))
axes = axes.flatten()
metrics = ["n_transcripts", "n_unique_genes", "nc_ratio", "in_soma_ratio"]
for ax, m in zip(axes, metrics):
    # Drop NaNs before plotting: in_soma_ratio is NaN for spheres without
    # granule-marker transcripts, and boxplot statistics are not NaN-aware, so a
    # single NaN would turn the whole box for that setting into NaN.
    data = [results[s][m].dropna().to_numpy() for s in settings]
    ax.boxplot(data)
    # Boxes sit at x = 1..len(settings). Label them through the axis instead of
    # boxplot's `labels=` keyword, which Matplotlib 3.9 deprecated in favour of
    # `tick_labels=`; the tick API works on both older and newer versions.
    ax.set_xticks(range(1, len(settings) + 1))
    ax.set_xticklabels(settings)
    ax.set_ylabel(m)
    if m == "in_soma_ratio":
        # The ratio lies in [0, 1]; pad slightly so boxes at the limits stay visible.
        ax.set_ylim(-0.05, 1.05)
plt.suptitle("Per-granule metrics by radius setting")
plt.tight_layout()
plt.savefig(benchmark_path + "benchmark_sphere_boxplots.jpeg", dpi=300, bbox_inches="tight")
plt.show()