# Benchmark filtering thresholds

Benchmark how **in_soma_thr** (first entry only) and **nc_thr** affect detection outcomes, using the pre-computed `all_granules.parquet`. The filters are applied to the granule dataframe only (detection is not rerun). Preprocessing: size filter `sphere_r < 4`; NC filtering uses all `nc_genes` and a k-d tree built from the transcripts (no top-15 selection). A threshold of 1 turns the corresponding filter off.

In [None]:
import anndata
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import scanpy as sc
from mcDETECT.model import mcDETECT
from scipy.spatial import cKDTree

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Dataset identifier; it is substituted into the data and output paths below.
dataset = "MERSCOPE_WT_1"
# Input folder for this dataset (files are read from its processed_data/ subfolder).
data_path = f"../../data/{dataset}/"
# Per-dataset output folder; all_granules.parquet is read from here.
output_path = f"../../output/{dataset}/"
# Folder for the benchmark CSV and heatmaps.
# NOTE(review): this path does not contain the dataset name, so the results CSV and
# heatmaps written under it presumably get overwritten when another dataset is run — confirm.
benchmark_path = "../../output/benchmark/benchmark_filtering/"

In [4]:
# Load the pre-computed granule table, keep only granules with sphere_r < 4,
# and renumber the surviving rows from 0.
granules = (
    pd.read_parquet(output_path + "all_granules.parquet")
    .loc[lambda table: table["sphere_r"] < 4]
    .reset_index(drop=True)
)
print(f"Granules after size filter (sphere_r < 4): {len(granules)}")

Granules after size filter (sphere_r < 4): 1149541


In [5]:
# Transcript table, plus the negative-control gene names taken from the "Gene" column.
transcripts = pd.read_parquet(
    data_path + "processed_data/transcripts.parquet"
)
nc_genes = pd.read_csv(
    data_path + "processed_data/negative_controls.csv"
)["Gene"].tolist()

In [6]:
def make_tree_3d(d1, d2, d3):
    """Build a 3D cKDTree from three coordinate arrays.

    Each input is flattened with ``np.ravel`` and the three are stacked as the
    columns of one point array, so they must hold the same number of elements.
    """
    points = np.c_[np.ravel(d1), np.ravel(d2), np.ravel(d3)]
    return cKDTree(points)


def compute_nc_ratio(granules, transcripts, nc_genes):
    """Return, per granule, (NC transcript count inside its sphere) / ``size``.

    The sphere is centred at (``sphere_x``, ``sphere_y``, ``layer_z``) with
    radius ``sphere_r``; NC transcripts are the rows of ``transcripts`` whose
    ``target`` is in ``nc_genes``, located at (``global_x``, ``global_y``,
    ``global_z``).

    Parameters
    ----------
    granules : pandas.DataFrame
        Needs columns ``sphere_x``, ``sphere_y``, ``layer_z``, ``sphere_r``
        and ``size``.
    transcripts : pandas.DataFrame
        Needs columns ``target``, ``global_x``, ``global_y``, ``global_z``.
    nc_genes : list-like
        Negative-control gene names.

    Returns
    -------
    numpy.ndarray
        One float per granule, in row order. The ratio is 0.0 wherever
        ``size`` is not > 0, and all zeros when there are no NC transcripts.
    """
    n_granules = len(granules)
    nc_trans = transcripts[transcripts["target"].isin(nc_genes)]
    # Nothing to count (or nothing to count for): skip building the tree.
    if n_granules == 0 or nc_trans.shape[0] == 0:
        return np.zeros(n_granules)
    tree = make_tree_3d(
        nc_trans["global_x"].to_numpy(),
        nc_trans["global_y"].to_numpy(),
        nc_trans["global_z"].to_numpy(),
    )
    # NOTE(review): the z coordinate of the centre comes from layer_z, not
    # sphere_z — confirm this is the intended centre for the NC query.
    centers = granules[["sphere_x", "sphere_y", "layer_z"]].to_numpy()
    radii = granules["sphere_r"].to_numpy()
    sizes = granules["size"].to_numpy().astype(float)
    # One vectorised query with a per-granule radius; return_length=True
    # yields the neighbour counts directly instead of index lists.
    counts = tree.query_ball_point(centers, radii, return_length=True)
    # Divide only where size > 0 so zero-size rows never hit a division by
    # zero; those entries keep the 0.0 they were initialised with.
    ratio = np.zeros(n_granules)
    np.divide(counts, sizes, out=ratio, where=sizes > 0)
    return ratio


def apply_filters(granules, nc_ratio, in_soma_thr, nc_thr):
    """Return a filtered copy of ``granules`` with an added ``nc_ratio`` column.

    A threshold of 1 (or more) disables the corresponding filter.
    In-soma filter: keep rows with ``in_soma_ratio < in_soma_thr``.
    NC filter: keep rows with ``nc_ratio == 0`` or ``nc_ratio < nc_thr``, so
    ``nc_thr = 0`` keeps exactly the rows whose ratio is 0.
    The input frame is not modified; the result is re-indexed from 0.
    """
    selected = granules.assign(nc_ratio=nc_ratio)
    if in_soma_thr < 1:
        selected = selected.loc[selected["in_soma_ratio"] < in_soma_thr]
    if nc_thr < 1:
        ratio = selected["nc_ratio"]
        selected = selected.loc[(ratio < nc_thr) | (ratio == 0)]
    return selected.reset_index(drop=True)

In [7]:
# NC ratio for every size-filtered granule, computed once here and passed to
# every apply_filters call in the cells below.
nc_ratio_all = compute_nc_ratio(granules, transcripts, nc_genes)

In [8]:
# 1. In-soma only (nc_thr=1); in_soma_thr in [0, 1e-4, 0.1, ..., 1]
# The 0.1-step part of the grid is built as integer / 10 instead of
# np.arange(0.1, 1.05, 0.1): a float step accumulates rounding error
# (e.g. 0.30000000000000004), which makes the strict "< threshold" test admit
# ratios exactly equal to the nominal threshold at some grid points only, and
# writes the drifted values into the results table.
in_soma_vals = np.concatenate([[0.0, 1e-4], np.arange(1, 11) / 10])


def _row(filtered, scenario, in_soma_thr, nc_thr):
    n = len(filtered)
    return {
        "scenario": scenario,
        "in_soma_thr": in_soma_thr,
        "nc_thr": nc_thr,
        "n_detections": n,
        "mean_sphere_r": filtered["sphere_r"].mean() if n else np.nan,
        "sd_sphere_r": filtered["sphere_r"].std() if n else np.nan,
        "q25_sphere_r": filtered["sphere_r"].quantile(0.25) if n else np.nan,
        "q50_sphere_r": filtered["sphere_r"].quantile(0.5) if n else np.nan,
        "q75_sphere_r": filtered["sphere_r"].quantile(0.75) if n else np.nan,
        "mean_in_soma_ratio": filtered["in_soma_ratio"].mean() if n else np.nan,
    }


# Scenario "in_soma_only": sweep in_soma_thr with the NC filter switched off (nc_thr=1.0).
rows_1 = []
for thr in in_soma_vals:
    rows_1.append(
        _row(
            apply_filters(granules, nc_ratio_all, in_soma_thr=thr, nc_thr=1.0),
            "in_soma_only",
            thr,
            1.0,
        )
    )
df_1 = pd.DataFrame(rows_1)
df_1

Unnamed: 0,scenario,in_soma_thr,nc_thr,n_detections,mean_sphere_r,sd_sphere_r,q25_sphere_r,q50_sphere_r,q75_sphere_r,mean_in_soma_ratio
0,in_soma_only,0.0,1.0,0,,,,,,
1,in_soma_only,0.0001,1.0,730585,1.009886,0.394465,0.749825,0.939781,1.188042,0.0
2,in_soma_only,0.1,1.0,737063,1.017467,0.404131,0.751496,0.943322,1.195696,0.000592
3,in_soma_only,0.2,1.0,759367,1.027075,0.411021,0.756395,0.950959,1.206527,0.004825
4,in_soma_only,0.3,1.0,797435,1.032245,0.415076,0.759126,0.954493,1.211372,0.016109
5,in_soma_only,0.4,1.0,827977,1.033971,0.419994,0.758001,0.954285,1.213418,0.028055
6,in_soma_only,0.5,1.0,849178,1.03983,0.424674,0.76104,0.958382,1.219514,0.037856
7,in_soma_only,0.6,1.0,884211,1.044719,0.42832,0.76364,0.961769,1.224734,0.056933
8,in_soma_only,0.7,1.0,929888,1.045412,0.431287,0.762623,0.961395,1.225956,0.085936
9,in_soma_only,0.8,1.0,959840,1.04876,0.434384,0.7638,0.963369,1.22967,0.106536


In [9]:
# 2. NC only (in_soma_thr=1); nc_thr in [0, 0.1, ..., 1]
# Built as integer / 10 instead of np.arange(0, 1.05, 0.1): a float step
# accumulates rounding error (e.g. 0.30000000000000004), which shifts the
# strict "< threshold" comparison and the threshold values recorded in the table.
nc_vals = np.arange(0, 11) / 10
# Scenario "nc_only": sweep nc_thr with the in-soma filter switched off (in_soma_thr=1.0).
rows_2 = []
for thr in nc_vals:
    rows_2.append(
        _row(
            apply_filters(granules, nc_ratio_all, in_soma_thr=1.0, nc_thr=thr),
            "nc_only",
            1.0,
            thr,
        )
    )
df_2 = pd.DataFrame(rows_2)
df_2

Unnamed: 0,scenario,in_soma_thr,nc_thr,n_detections,mean_sphere_r,sd_sphere_r,q25_sphere_r,q50_sphere_r,q75_sphere_r,mean_in_soma_ratio
0,nc_only,1.0,0.0,902092,0.970843,0.356431,0.732728,0.915824,1.14825,0.183843
1,nc_only,1.0,0.1,942824,1.009343,0.408891,0.741604,0.932594,1.187778,0.188656
2,nc_only,1.0,0.2,1024415,1.036317,0.427372,0.756108,0.954763,1.218521,0.205501
3,nc_only,1.0,0.3,1083904,1.039821,0.428711,0.759085,0.957455,1.221068,0.220843
4,nc_only,1.0,0.4,1105328,1.03939,0.42937,0.758465,0.956717,1.220483,0.227527
5,nc_only,1.0,0.5,1114497,1.040483,0.429743,0.759414,0.957645,1.221448,0.231244
6,nc_only,1.0,0.6,1128755,1.04071,0.429289,0.760038,0.958044,1.221334,0.237786
7,nc_only,1.0,0.7,1138463,1.039253,0.429445,0.75861,0.956619,1.22,0.242688
8,nc_only,1.0,0.8,1142610,1.039284,0.429381,0.758716,0.956641,1.219981,0.24475
9,nc_only,1.0,0.9,1144707,1.039675,0.42942,0.758997,0.957037,1.22056,0.245801


In [10]:
# 3. Both filters; nc_thr=0.1 fixed, in_soma_thr in [0, 1e-4, 0.1, ..., 1]
rows_3 = []
for thr in in_soma_vals:
    rows_3.append(
        _row(
            apply_filters(granules, nc_ratio_all, in_soma_thr=thr, nc_thr=0.1),
            "in_soma_and_nc",
            thr,
            0.1,
        )
    )
df_3 = pd.DataFrame(rows_3)
df_3

Unnamed: 0,scenario,in_soma_thr,nc_thr,n_detections,mean_sphere_r,sd_sphere_r,q25_sphere_r,q50_sphere_r,q75_sphere_r,mean_in_soma_ratio
0,in_soma_and_nc,0.0,0.1,0,,,,,,
1,in_soma_and_nc,0.0001,0.1,673103,1.002402,0.394634,0.742747,0.931343,1.17939,0.0
2,in_soma_and_nc,0.1,0.1,677954,1.008531,0.40249,0.744208,0.934124,1.185826,0.000488
3,in_soma_and_nc,0.2,0.1,693249,1.015703,0.407604,0.74797,0.939819,1.193923,0.003706
4,in_soma_and_nc,0.3,0.1,720636,1.018644,0.40962,0.749722,0.941839,1.196593,0.012704
5,in_soma_and_nc,0.4,0.1,743477,1.018339,0.412087,0.747958,0.940401,1.196501,0.022724
6,in_soma_and_nc,0.5,0.1,757316,1.021682,0.414156,0.749995,0.943047,1.200022,0.029963
7,in_soma_and_nc,0.6,0.1,780312,1.023618,0.414956,0.751325,0.944502,1.201914,0.044314
8,in_soma_and_nc,0.7,0.1,810957,1.021506,0.415036,0.749234,0.942364,1.200241,0.067132
9,in_soma_and_nc,0.8,0.1,829576,1.022056,0.415168,0.749608,0.942741,1.200729,0.08237


In [11]:
# Stack the three scenario tables into one and save it as a single CSV.
benchmark_results = pd.concat([df_1, df_2, df_3], ignore_index=True)
# to_csv does not create missing parent folders, so make sure the benchmark
# directory exists before writing into it.
os.makedirs(benchmark_path, exist_ok=True)
benchmark_results.to_csv(benchmark_path + "benchmark_filtering_results.csv", index=False)

In [12]:
sc.settings.verbosity = 0

# Gene panel: the first column of genes.csv.
genes = pd.read_csv(data_path + "processed_data/genes.csv").iloc[:, 0].tolist()

# Cell-level AnnData, restricted to the two neuronal cell types.
adata = sc.read_h5ad(data_path + "processed_data/adata.h5ad")
adata_neuron = adata[
    adata.obs["cell_type"].isin(["Glutamatergic", "GABAergic"])
].copy()

# Granule marker genes handed to mcDETECT.
gnl_genes = [
    "Camk2a", "Cplx2", "Slc17a7", "Ddn", "Syp",
    "Map1a", "Shank1", "Syn1", "Gria1", "Gria2",
    "Cyfip2", "Vamp2", "Bsn", "Slc32a1", "Nfasc",
    "Syt1", "Tubb3", "Nav1", "Shank3", "Mapt",
]

# The detector object gets no negative-control list (nc_genes=None).
mc = mcDETECT(
    type="discrete",
    transcripts=transcripts,
    gnl_genes=gnl_genes,
    nc_genes=None,
)

## Representative settings

Five settings: (1) no filter; (2) in-soma 0.1 only; (3) NC 0.1 only; (4) both 0.1; (5) in-soma 0.05, NC 0.1. A summary table comes first; then, for each setting, the filtered granules are profiled with mcDETECT, merged with the neurons, and plotted as a heatmap.

In [13]:
# Representative (in_soma_thr, nc_thr) pairs; a threshold of 1.0 disables that filter.
rep_settings = [(1.0, 1.0), (0.1, 1.0), (1.0, 0.1), (0.1, 0.1), (0.05, 0.1)]

# One summary row per setting: detection count plus mean radius and mean
# in-soma ratio (NaN when nothing survives the filters).
rep_rows = []
for soma_cut, nc_cut in rep_settings:
    kept = apply_filters(granules, nc_ratio_all, in_soma_thr=soma_cut, nc_thr=nc_cut)
    n_kept = len(kept)
    if n_kept:
        mean_radius = kept["sphere_r"].mean()
        mean_soma = kept["in_soma_ratio"].mean()
    else:
        mean_radius = np.nan
        mean_soma = np.nan
    rep_rows.append(
        {
            "setting": f"in_soma={soma_cut}, nc={nc_cut}",
            "in_soma_thr": soma_cut,
            "nc_thr": nc_cut,
            "n_detections": n_kept,
            "mean_sphere_r": mean_radius,
            "mean_in_soma_ratio": mean_soma,
        }
    )
rep_summary = pd.DataFrame(rep_rows)
rep_summary

Unnamed: 0,setting,in_soma_thr,nc_thr,n_detections,mean_sphere_r,mean_in_soma_ratio
0,"in_soma=1.0, nc=1.0",1.0,1.0,1149541,1.039695,0.248234
1,"in_soma=0.1, nc=1.0",0.1,1.0,737063,1.017467,0.000592
2,"in_soma=1.0, nc=0.1",1.0,0.1,942824,1.009343,0.188656
3,"in_soma=0.1, nc=0.1",0.1,0.1,677954,1.008531,0.000488
4,"in_soma=0.05, nc=0.1",0.05,0.1,674084,1.00428,5.2e-05


In [14]:
# Genes shown in the heatmaps, grouped by marker type.
gene_groups = {
    "Granule Markers": [
        "Camk2a", "Cplx2", "Slc17a7", "Syp", "Ddn",
        "Map1a", "Syn1", "Shank1", "Cyfip2", "Vamp2",
    ],
    "Neuron Markers": [
        "Trpc4", "Gjc3", "Plekhb1", "Ntrk2", "Ntsr2", "Fn1",
        "Gnai2", "Shc3", "Reep3", "Chd9", "Acsbg1",
    ],
}

# Flat gene list: the groups concatenated in dict order.
target_genes = []
for group_genes in gene_groups.values():
    target_genes.extend(group_genes)

In [22]:
# Output folder for the per-setting heatmaps (created if missing).
heatmap_dir = benchmark_path + "benchmark_filtering_heatmaps/"
os.makedirs(heatmap_dir, exist_ok=True)
# Seed NumPy's global RNG once so the row shuffles below are reproducible. The
# draws are consumed in loop order, so each setting's permutation depends on
# the settings processed before it.
np.random.seed(42)

# For each representative (in_soma_thr, nc_thr) pair: filter the granules,
# profile them, stack them with the neurons and save one heatmap.
for i, (a, b) in enumerate(rep_settings):
    f = apply_filters(granules, nc_ratio_all, in_soma_thr=a, nc_thr=b)
    if len(f) == 0:
        print(f"Setting {i+1}: no granules, skip")
        continue
    # presumably returns an AnnData with one observation per granule (it is
    # passed to scanpy and exposes n_obs) — confirm against mcDETECT.profile
    ga = mc.profile(f, genes=genes)
    # Normalise the granule profiles (target_sum=1e4), then log1p-transform them.
    # NOTE(review): adata_neuron is concatenated below exactly as loaded; this
    # cell applies no normalisation to it — confirm it is already on the same scale.
    sc.pp.normalize_total(ga, target_sum=1e4)
    sc.pp.log1p(ga)
    # Shuffle the granule rows (uses the global RNG seeded above).
    ga = ga[np.random.permutation(ga.n_obs)].copy()
    # Stack granules first, then neurons, along the observation axis.
    adata_all = anndata.concat([ga, adata_neuron], axis=0, merge="same")
    adata_all.var["genes"] = adata_all.var.index
    # Group labels are built in the same order as the concatenation above.
    adata_all.obs["type"] = pd.Categorical(["Granules"] * ga.n_obs + ["Neurons"] * adata_neuron.n_obs, categories=["Granules", "Neurons"], ordered=True)
    sc.set_figure_params(scanpy=True, fontsize=10)
    # NOTE(review): log=True is passed here although the granule profiles were
    # already log1p-transformed above — confirm this is intended.
    sc.pl.heatmap(adata_all, target_genes, groupby="type", log=True, cmap="Reds", standard_scale="var", swap_axes=True, figsize=(8, 4), show=False)
    # show=False leaves the figure open, so it is saved and then closed explicitly.
    plt.savefig(heatmap_dir + f"heatmap_in{a}_nc{b}.jpeg", dpi=300, bbox_inches="tight")
    plt.close()
    print(f"Setting {i+1} (in_soma={a}, nc={b}): n={len(f)}, saved.")

Setting 1 (in_soma=1.0, nc=1.0): n=1149541, saved.
Setting 2 (in_soma=0.1, nc=1.0): n=737063, saved.
Setting 3 (in_soma=1.0, nc=0.1): n=942824, saved.
Setting 4 (in_soma=0.1, nc=0.1): n=677954, saved.
Setting 5 (in_soma=0.05, nc=0.1): n=674084, saved.


In [None]:
# Export representative granule datasets
# Target folder is named after the dataset and lives under benchmark_path;
# it is created if it does not exist yet.
representative_dir = os.path.join(benchmark_path, f"{dataset}_representative_data")
os.makedirs(representative_dir, exist_ok=True)

def _thr_tag(x: float) -> str:
    x = float(x)
    if x.is_integer():
        return str(int(x))
    return str(x).replace(".", "p")

# Write one parquet file per representative setting; the thresholds are
# encoded in the file name via _thr_tag.
for soma_cut, nc_cut in rep_settings:
    subset = apply_filters(granules, nc_ratio_all, in_soma_thr=soma_cut, nc_thr=nc_cut)
    parquet_name = f"granules_inSomaThr{_thr_tag(soma_cut)}_ncThr{_thr_tag(nc_cut)}.parquet"
    parquet_path = os.path.join(representative_dir, parquet_name)
    subset.to_parquet(parquet_path, index=False)

print(f"Wrote {len(rep_settings)} parquet files to {representative_dir}")