# Benchmark filtering thresholds (in_soma_thr, nc_thr)

Benchmark the effect of **in_soma_thr** (first entry only) and **nc_thr** on detection outcomes using pre-computed `all_granules.parquet`. No mcDETECT detection is run; filtering is applied directly on the granule dataframe. Size filtering at `sphere_r < 4` is applied first. Negative control filtering uses all `nc_genes` and transcript-based kdtree counts (no top-15 marker selection).

**Note:** A threshold of 1 suppresses that filter: `in_soma_thr >= 1` means no in-soma filtering; `nc_thr >= 1` means no negative-control filtering.

In [1]:
import numpy as np
import pandas as pd
from scipy.spatial import cKDTree

import warnings
warnings.filterwarnings("ignore")

In [4]:
# File paths (same dataset as detection.ipynb)
dataset = "MERSCOPE_WT_1"
data_path = f"../data/{dataset}/"
output_path = f"../output/{dataset}/"

In [5]:
# Load granules and apply size filtering (sphere_r < 4)
granules = pd.read_parquet(output_path + "all_granules.parquet")
granules = granules[granules["sphere_r"] < 4].copy().reset_index(drop=True)
print(f"After size filter (sphere_r < 4): {len(granules)} granules")

After size filter (sphere_r < 4): 1153314 granules


In [6]:
# Load transcripts and nc_genes for negative control filtering
transcripts = pd.read_parquet(data_path + "processed_data/transcripts.parquet")
nc_genes_df = pd.read_csv(data_path + "processed_data/negative_controls.csv")
nc_genes = list(nc_genes_df["Gene"])

In [7]:
def make_tree_3d(d1, d2, d3):
    """Build 3D cKDTree from coordinate arrays."""
    points = np.c_[np.ravel(d1), np.ravel(d2), np.ravel(d3)]
    return cKDTree(points)


def compute_nc_ratio(granules, transcripts, nc_genes):
    """
    For each granule, count NC transcripts within sphere (sphere_r) and compute nc_ratio = nc_count / size.
    Uses (sphere_x, sphere_y, layer_z) as center and sphere_r as radius. Returns array of nc_ratio, same length as granules.
    """
    nc_transcripts = transcripts[transcripts["target"].isin(nc_genes)]
    if nc_transcripts.shape[0] == 0:
        return np.zeros(len(granules))
    tree = make_tree_3d(
        np.array(nc_transcripts["global_x"]),
        np.array(nc_transcripts["global_y"]),
        np.array(nc_transcripts["global_z"]),
    )
    centers = granules[["sphere_x", "sphere_y", "sphere_z"]].to_numpy()
    radii = granules["sphere_r"].to_numpy()
    sizes = granules["size"].to_numpy().astype(float)
    counts = np.array([len(tree.query_ball_point(c, r)) for c, r in zip(centers, radii)])
    nc_ratio = np.where(sizes > 0, counts / sizes, 0.0)
    return nc_ratio


def apply_filters(granules, nc_ratio, in_soma_thr, nc_thr):
    """
    When a threshold is 1, that filter is suppressed (no filtering).
    in_soma_thr < 1: keep granules with in_soma_ratio < in_soma_thr.
    nc_thr < 1: keep granules with (nc_ratio == 0) or (nc_ratio < nc_thr).
    """
    out = granules.copy()
    out["nc_ratio"] = nc_ratio
    if in_soma_thr < 1:
        out = out[out["in_soma_ratio"] < in_soma_thr].copy()
    if nc_thr < 1:
        out = out[(out["nc_ratio"] == 0) | (out["nc_ratio"] < nc_thr)].copy()
    return out.reset_index(drop=True)

In [8]:
# Precompute nc_ratio for all granules (used in scenarios 2 and 3)
nc_ratio_all = compute_nc_ratio(granules, transcripts, nc_genes)

In [9]:
# (1) In-soma filtering only (nc_thr = 1): in_soma_thr = 0, 0.1, ..., 1
in_soma_vals = np.concatenate([[0], [1e-4], np.arange(0.1, 1.05, 0.1)])
rows_1 = []
for in_soma_thr in in_soma_vals:
    filtered = apply_filters(granules, nc_ratio_all, in_soma_thr=in_soma_thr, nc_thr=1.0)
    rows_1.append({
        "scenario": "in_soma_only",
        "in_soma_thr": in_soma_thr,
        "nc_thr": 1.0,
        "n_detections": len(filtered),
        "mean_sphere_r": filtered["sphere_r"].mean() if len(filtered) > 0 else np.nan,
        "mean_in_soma_ratio": filtered["in_soma_ratio"].mean() if len(filtered) > 0 else np.nan,
    })
df_1 = pd.DataFrame(rows_1)
df_1

Unnamed: 0,scenario,in_soma_thr,nc_thr,n_detections,mean_sphere_r,mean_in_soma_ratio
0,in_soma_only,0.0,1.0,0,,
1,in_soma_only,0.0001,1.0,732636,1.01051,0.0
2,in_soma_only,0.1,1.0,739202,1.018153,0.000598
3,in_soma_only,0.2,1.0,761682,1.027799,0.004851
4,in_soma_only,0.3,1.0,799934,1.032996,0.016154
5,in_soma_only,0.4,1.0,830604,1.034736,0.028112
6,in_soma_only,0.5,1.0,851919,1.040602,0.037934
7,in_soma_only,0.6,1.0,887100,1.045497,0.057029
8,in_soma_only,0.7,1.0,932928,1.046202,0.086027
9,in_soma_only,0.8,1.0,963003,1.049562,0.106642


In [10]:
# (2) Negative control filtering only (in_soma_thr = 1 suppresses in-soma filter)
nc_vals = np.arange(0, 1.05, 0.1)
rows_2 = []
for nc_thr in nc_vals:
    filtered = apply_filters(granules, nc_ratio_all, in_soma_thr=1.0, nc_thr=nc_thr)
    rows_2.append({
        "scenario": "nc_only",
        "in_soma_thr": 1.0,
        "nc_thr": nc_thr,
        "n_detections": len(filtered),
        "mean_sphere_r": filtered["sphere_r"].mean() if len(filtered) > 0 else np.nan,
        "mean_in_soma_ratio": filtered["in_soma_ratio"].mean() if len(filtered) > 0 else np.nan,
    })
df_2 = pd.DataFrame(rows_2)
df_2

Unnamed: 0,scenario,in_soma_thr,nc_thr,n_detections,mean_sphere_r,mean_in_soma_ratio
0,nc_only,1.0,0.0,904399,0.97133,0.183905
1,nc_only,1.0,0.1,945657,1.010061,0.188757
2,nc_only,1.0,0.2,1027889,1.037099,0.205633
3,nc_only,1.0,0.3,1087553,1.040613,0.22096
4,nc_only,1.0,0.4,1109001,1.040212,0.227626
5,nc_only,1.0,0.5,1118213,1.041296,0.231348
6,nc_only,1.0,0.6,1132504,1.041531,0.237882
7,nc_only,1.0,0.7,1142206,1.040075,0.242761
8,nc_only,1.0,0.8,1146360,1.040109,0.244819
9,nc_only,1.0,0.9,1148460,1.040499,0.245867


In [11]:
# (3) Fix nc_thr at 0.1, vary in_soma_thr = 0, 0.1, ..., 1
rows_3 = []
for in_soma_thr in in_soma_vals:
    filtered = apply_filters(granules, nc_ratio_all, in_soma_thr=in_soma_thr, nc_thr=0.1)
    rows_3.append({
        "scenario": "in_soma_and_nc",
        "in_soma_thr": in_soma_thr,
        "nc_thr": 0.1,
        "n_detections": len(filtered),
        "mean_sphere_r": filtered["sphere_r"].mean() if len(filtered) > 0 else np.nan,
        "mean_in_soma_ratio": filtered["in_soma_ratio"].mean() if len(filtered) > 0 else np.nan,
    })
df_3 = pd.DataFrame(rows_3)
df_3

Unnamed: 0,scenario,in_soma_thr,nc_thr,n_detections,mean_sphere_r,mean_in_soma_ratio
0,in_soma_and_nc,0.0,0.1,0,,
1,in_soma_and_nc,0.0001,0.1,674834,1.002937,0.0
2,in_soma_and_nc,0.1,0.1,679751,1.009114,0.000493
3,in_soma_and_nc,0.2,0.1,695163,1.016313,0.003727
4,in_soma_and_nc,0.3,0.1,722679,1.019274,0.012743
5,in_soma_and_nc,0.4,0.1,745609,1.018983,0.022772
6,in_soma_and_nc,0.5,0.1,759528,1.022343,0.030032
7,in_soma_and_nc,0.6,0.1,782620,1.024284,0.044401
8,in_soma_and_nc,0.7,0.1,813355,1.022197,0.067214
9,in_soma_and_nc,0.8,0.1,832037,1.022752,0.082457


In [12]:
# Single results dataframe
benchmark_results = pd.concat([df_1, df_2, df_3], ignore_index=True)
benchmark_results

Unnamed: 0,scenario,in_soma_thr,nc_thr,n_detections,mean_sphere_r,mean_in_soma_ratio
0,in_soma_only,0.0,1.0,0,,
1,in_soma_only,0.0001,1.0,732636,1.01051,0.0
2,in_soma_only,0.1,1.0,739202,1.018153,0.000598
3,in_soma_only,0.2,1.0,761682,1.027799,0.004851
4,in_soma_only,0.3,1.0,799934,1.032996,0.016154
5,in_soma_only,0.4,1.0,830604,1.034736,0.028112
6,in_soma_only,0.5,1.0,851919,1.040602,0.037934
7,in_soma_only,0.6,1.0,887100,1.045497,0.057029
8,in_soma_only,0.7,1.0,932928,1.046202,0.086027
9,in_soma_only,0.8,1.0,963003,1.049562,0.106642


In [13]:
# Save results
benchmark_results.to_csv("../output/benchmark/benchmark_filtering_results.csv", index=False)