In [1]:
import matplotlib.pyplot as plt
import os
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import precision_recall_curve, auc, f1_score
from model import *
from simulate import *

output_dir = "output"
os.makedirs(output_dir, exist_ok=True)

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Function for saving simulated raw data
def write_simulated_data(points_all, out_csv, is_3d):
    """Export simulated transcripts to a CSV with a flat five-column schema.

    Parameters
    ----------
    points_all : pandas.DataFrame
        Simulated points. Must provide ``global_x``, ``global_y`` and
        ``target`` columns, plus ``global_z`` when ``is_3d`` is true.
    out_csv : str
        Destination path. Missing parent directories are created; a bare
        file name (no directory component) is written to the current
        working directory.
    is_3d : bool
        When true the ``z`` column is copied from ``global_z``; otherwise
        ``z`` is filled with ``0.0`` and ``global_z`` is never read.

    Notes
    -----
    The written columns are ``transcript_id`` (0-based row position after
    the index is reset), ``x``, ``y``, ``z`` and ``gene``. The input frame
    is left unmodified.
    """
    # reset_index returns a new frame, so the caller's data is never touched.
    df = points_all.reset_index(drop = True)
    out = pd.DataFrame({"transcript_id": pd.Series(df.index.astype(int), index = df.index),
                        "x": df["global_x"].astype(float),
                        "y": df["global_y"].astype(float),
                        "z": df["global_z"].astype(float) if is_3d else 0.0,
                        "gene": df["target"].astype(str)})
    # os.path.dirname returns "" for a bare file name and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when there is one.
    parent_dir = os.path.dirname(out_csv)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok = True)
    out.to_csv(out_csv, index = False)

In [3]:
# Function for calculating precision, recall, accuracy, F1 score
def calculate_metric(ground_truth_indices, matched_index):
    """Score detections against ground-truth points.

    Parameters
    ----------
    ground_truth_indices : iterable of hashable
        Identifiers of every ground-truth point. Any iterable is accepted
        (set, list, range, pandas Index, ...); it is converted to a set here.
    matched_index : sequence
        One entry per detection: ``-1`` when the detection matched nothing,
        a single ground-truth identifier when it matched exactly one point,
        or a tuple of identifiers when it matched several.

    Returns
    -------
    tuple of float
        ``(precision, recall, accuracy, f1)`` where

        * TP = number of distinct ground-truth points hit by any detection,
        * FP = number of detections that matched nothing (``-1`` entries),
        * FN = number of ground-truth points never hit,
        * precision = TP / (TP + FP), recall = TP / (TP + FN),
        * accuracy = matched detections / all detections,
        * f1 = harmonic mean of precision and recall.

        Every metric falls back to ``0.0`` when its denominator is zero.
    """
    # Coerce once so set algebra below works for any iterable input.
    ground_truth = set(ground_truth_indices)

    # Single pass: collect the distinct matched points and count the
    # detections that didn't match any ground truth.
    matched_points = set()
    false_positives = 0
    for match in matched_index:
        if isinstance(match, tuple):
            matched_points.update(match)
        elif match != -1:
            matched_points.add(match)
        else:
            false_positives += 1

    true_positives = len(matched_points & ground_truth)
    false_negatives = len(ground_truth - matched_points)

    total_detections = len(matched_index)
    # A detection counts as "matched" whenever it is not the -1 sentinel,
    # whether or not the matched identifier is in the ground-truth set.
    true_matches = total_detections - false_positives

    def _ratio(numerator, denominator):
        # Guarded division: undefined ratios are reported as 0.0.
        return numerator / denominator if denominator > 0 else 0.0

    precision = _ratio(true_positives, true_positives + false_positives)
    recall = _ratio(true_positives, true_positives + false_negatives)
    accuracy = _ratio(true_matches, total_detections)
    f1 = _ratio(2 * (precision * recall), precision + recall)

    return precision, recall, accuracy, f1

# Main metric calculation function
def metric_main(tree, ground_truth_indices, sphere):
    """Match every detected sphere to ground-truth points and score the result.

    For each row of ``sphere`` the tree is queried with the sphere centre
    (``sphere_x``, ``sphere_y``, ``sphere_z``) and radius (``sphere_r``).
    The hits are encoded as ``-1`` (no hit), the single hit index, or a tuple
    of hit indices, and the encoded list is handed to ``calculate_metric``.

    Parameters
    ----------
    tree : object exposing ``query_ball_point(point, r)``
        Spatial index built over the ground-truth points.
    ground_truth_indices : set
        Identifiers of all ground-truth points.
    sphere : pandas.DataFrame
        One detection per row.

    Returns
    -------
    tuple
        Whatever ``calculate_metric`` returns: (precision, recall, accuracy, f1).
    """
    centre_columns = ("sphere_x", "sphere_y", "sphere_z")
    matched_index = []
    for row in range(sphere.shape[0]):
        # Columns are read inside the loop so an empty frame is never indexed.
        centre = [sphere[column].iloc[row] for column in centre_columns]
        hits = tree.query_ball_point(centre, sphere["sphere_r"].iloc[row])
        hit_count = len(hits)
        if hit_count == 0:
            matched_index.append(-1)
        elif hit_count == 1:
            matched_index.extend(hits)
        else:
            matched_index.append(tuple(hits))
    return calculate_metric(ground_truth_indices, matched_index)

## Visualize intranuclear aggregate radius distribution

In [None]:
# Simulate one marker ("A") in 3D at a single seed and export the per-aggregate
# radius (and the area of a circle with that radius) of the intranuclear clusters.

# Point categories and the share of `density_overall` assigned to each
# (see `density_overall * ratio[i]` in the loop below).
point_type = ["CSR", "Extranuclear", "Intranuclear"]
ratio = [0.5, 0.25, 0.25]
# Passed as `mean_dist` / `beta` to `simulate_cluster`; their exact meaning is
# defined in simulate.py — TODO confirm.
mean_dist_extra = 1
mean_dist_intra = 4
beta_extra = (1, 19)
beta_intra = (19, 1)

simulate_z = True
name = "A"
density_overall = 0.16
num_clusters_extra = 10000
num_clusters_intra = 4000
seed = 1

# A fresh `simulation` object is built for each point type with that type's
# density. NOTE(review): all three reuse the same `seed` — confirm in
# simulate.py that this does not correlate the three point sets.
for i in range(len(point_type)):
    simulate = simulation(name = name, density = density_overall * ratio[i], shape = (2000, 2000), layer_num = 8, layer_gap = 1.5, simulate_z = simulate_z, write_path = output_dir + "/", seed = seed)
    if i == 0:
        points_CSR = simulate.simulate_CSR()
        points_CSR["type"] = [point_type[i]] * points_CSR.shape[0]
    elif i == 1:
        parents_cluster_extra, points_cluster_extra = simulate.simulate_cluster(num_clusters = num_clusters_extra, beta = beta_extra, mean_dist = mean_dist_extra)
        points_cluster_extra["type"] = [point_type[i]] * points_cluster_extra.shape[0]
    elif i == 2:
        parents_cluster_intra, points_cluster_intra = simulate.simulate_cluster(num_clusters = num_clusters_intra, beta = beta_intra, mean_dist = mean_dist_intra)
        points_cluster_intra["type"] = [point_type[i]] * points_cluster_intra.shape[0]
points_all = pd.concat([points_CSR, points_cluster_extra, points_cluster_intra], axis = 0, ignore_index = True)
# Only the extranuclear parents are kept; `points_all` and `parents_all` are not
# used further in this cell.
parents_all = parents_cluster_extra

def rg(df):
    """Return the root-mean-square distance of the points in ``df`` from their centroid.

    Reads the ``global_x``, ``global_y`` and ``global_z`` columns.
    """
    pts = df[["global_x", "global_y", "global_z"]].to_numpy()
    c = pts.mean(axis=0)
    return np.sqrt(((pts - c)**2).sum(axis=1).mean())

# One radius per intranuclear cluster, grouped on the "id" column.
radii = points_cluster_intra.groupby("id").apply(rg)
# NOTE(review): despite the name, `med` is the NaN-ignoring *mean* of the radii,
# not the median. It is only referenced by the commented-out histogram code.
med = np.nanmean(radii)

# Area of a circle with each cluster's radius.
area = np.pi * (radii) ** 2
# NOTE(review): the path is hard-coded to "output/" rather than built from
# `output_dir`; the two agree only while output_dir == "output".
pd.DataFrame({"id": radii.index, "radius": radii, "area": area}).to_csv("output/intranuclear_area.csv", index = 0)

# plt.figure(figsize = (6, 4))
# sns.histplot(radii, binwidth=0.2, kde=False, edgecolor="gray")
# if np.isfinite(med):
#     plt.axvline(med, color="red", linestyle="--", linewidth=1)
#     ymax = plt.ylim()[1]
#     plt.text(med + 0.5, ymax * 0.95, f"{med:.2f}",
#             color="red", ha="left", va="top", fontsize=10)
# plt.xlabel("Aggregate radius", fontsize=12)
# plt.ylabel("Frequency", fontsize=12)
# plt.xticks(np.arange(0, 15, 2))
# plt.savefig("output/cell_radius_hist.png", dpi=500, bbox_inches="tight")
# plt.close()

# for points, label in zip([points_cluster_intra, points_cluster_extra], ["somatic", "distal"]):
#     in_soma_ratio = points.groupby("id")["in_nucleus"].mean()
#     plt.figure(figsize=(6,4))
#     sns.histplot(in_soma_ratio, bins=50, stat="density", edgecolor="gray")
#     sns.kdeplot(in_soma_ratio, color="red", linewidth=1)
#     plt.xlabel(f"Mean in-soma ratio per aggregate", fontsize=12)
#     plt.ylabel("Frequency", fontsize=12)
#     plt.yticks(np.arange(0, 50, 5))
#     plt.savefig(f"output/mean_in_soma_ratio_hist_{label}.png", dpi=500, bbox_inches="tight")
#     plt.close()

## Single-marker CSR and aggregation

In [None]:
# Settings shared by the single-marker simulation loop in the next cell.
# `point_type` lists the three simulated categories and `ratio` gives the share
# of a marker's overall density assigned to each, in the same order.
point_type = ["CSR", "Extranuclear", "Intranuclear"]
ratio = [0.5, 0.25, 0.25]
# Forwarded to `simulate_cluster` as `mean_dist` and `beta` for the extranuclear
# and intranuclear clusters respectively; semantics live in simulate.py — TODO confirm.
mean_dist_extra = 1
mean_dist_intra = 4
beta_extra = (1, 19)
beta_intra = (19, 1)

In [None]:
# Main simulation loop
# For every dimension (3D / 2D) and every marker, simulate CSR + extranuclear +
# intranuclear points for each seed, run single-marker detection, score the
# detections against the extranuclear cluster parents, and write one CSV of
# per-seed metrics per (dimension, marker).
dimension_settings = {"3D": True, "2D": False}

marker_settings = {"A": {"density": 0.08, "num_clusters_extra": 5000, "num_clusters_intra": 2000},
            "B": {"density": 0.04, "num_clusters_extra": 3000, "num_clusters_intra": 1200},
            "C": {"density": 0.02, "num_clusters_extra": 2000, "num_clusters_intra": 800}}

# marker_settings = {"D": {"density": 0.01, "num_clusters_extra": 1250, "num_clusters_intra": 500},
#                    "E": {"density": 0.005, "num_clusters_extra": 800, "num_clusters_intra": 300},
#                    "F": {"density": 0.0025, "num_clusters_extra": 500, "num_clusters_intra": 200}}

# The target list handed to the model is derived from the active marker set, so
# switching to the alternative D/E/F settings above needs no second manual edit.
marker_names = list(marker_settings)

seed_lst = np.arange(1, 201)

for dimension, simulate_z in dimension_settings.items():
    
    print(f"Running simulations for dimension: {dimension}")

    for name, params in marker_settings.items():
        
        print(f"Running simulations for marker: {name}")
        
        density_overall = params["density"]
        num_clusters_extra = params["num_clusters_extra"]
        num_clusters_intra = params["num_clusters_intra"]
        
        # Per-seed metrics for this (dimension, marker) pair
        precision_lst = []
        recall_lst = []
        accuracy_lst = []
        f1_lst = []

        for seed in seed_lst:

            # simulate data: one simulator per point type, each with that type's
            # share of the marker density
            for i in range(len(point_type)):
                simulate = simulation(name = name, density = density_overall * ratio[i], shape = (2000, 2000), layer_num = 8, layer_gap = 1.5, simulate_z = simulate_z, write_path = output_dir + "/", seed = seed)
                if i == 0:
                    points_CSR = simulate.simulate_CSR()
                    points_CSR["type"] = [point_type[i]] * points_CSR.shape[0]
                elif i == 1:
                    parents_cluster_extra, points_cluster_extra = simulate.simulate_cluster(num_clusters = num_clusters_extra, beta = beta_extra, mean_dist = mean_dist_extra)
                    points_cluster_extra["type"] = [point_type[i]] * points_cluster_extra.shape[0]
                elif i == 2:
                    parents_cluster_intra, points_cluster_intra = simulate.simulate_cluster(num_clusters = num_clusters_intra, beta = beta_intra, mean_dist = mean_dist_intra)
                    points_cluster_intra["type"] = [point_type[i]] * points_cluster_intra.shape[0]
            points_all = pd.concat([points_CSR, points_cluster_extra, points_cluster_intra], axis = 0, ignore_index = True)
            # Ground truth = extranuclear cluster parents only
            parents_all = parents_cluster_extra
            
            # save simulated data
            write_simulated_data(points_all, f"simulated_data/single_marker/{dimension}/{name}/seed_{seed}.csv", simulate_z)
            
            # run mcDETECT
            detect = model(shape = (2000, 2000), transcripts = points_all, target_all = marker_names, eps = 1.5, in_thr = 0.25, size_thr = 4)
            sphere = detect.dbscan_single(target_name = name)
            
            # find matched index
            tree = make_tree(d1 = np.array(parents_all["global_x"]), d2 = np.array(parents_all["global_y"]), d3 = np.array(parents_all["global_z"]))
            ground_truth_indices = set(parents_all.index)
            
            # calculate all metrics
            precision, recall, accuracy, f1 = metric_main(tree, ground_truth_indices, sphere)
            precision_lst.append(precision)
            recall_lst.append(recall)
            accuracy_lst.append(accuracy)
            f1_lst.append(f1)
            
            if seed % 50 == 0:
                print(f"{seed} out of {len(seed_lst)} iterations!")

        results_df = pd.DataFrame({"Simulation": seed_lst.tolist(),
                                   "Precision": precision_lst,
                                   "Recall": recall_lst, 
                                   "Accuracy": accuracy_lst,
                                   "F1 Score": f1_lst})
        results_df.to_csv(os.path.join(output_dir, f"single_marker_{dimension}_{name}_{num_clusters_extra}_{num_clusters_intra}.csv"), index = 0)

## Multi-marker CSR and aggregation

In [11]:
# Settings for the multi-marker simulation loop in the next cell.
# NOTE: `name` is rebound here from a single marker string (single-marker cells)
# to the list of all markers; per-marker lists below follow this order.
name = ["A", "B", "C"]

# Geometry and output arguments forwarded unchanged to `simulation` /
# `multi_simulation`.
shape = (2000, 2000)
layer_num = 8
layer_gap = 1.5
write_path = ""

# Background (CSR) density per marker, indexed like `name`.
CSR_density = [0.04, 0.02, 0.01]

# Extranuclear clusters: per-marker density plus the `num_clusters`, `beta`,
# `comp_prob` and `mean_dist` arguments of `multi_simulation.simulate_cluster`.
# Exact semantics are defined in simulate.py — TODO confirm.
extra_density = [0.02, 0.01, 0.005]
extra_num_clusters = 5000
extra_beta = (1, 19)
extra_comp_prob = [0.4, 0.3, 0.3]
extra_mean_dist = 1

# Intranuclear clusters: same parameter set as above.
intra_density = [0.02, 0.01, 0.005]
intra_num_clusters = 1000
intra_beta = (19, 1)
intra_comp_prob = [0.8, 0.1, 0.1]
intra_mean_dist = 4

In [None]:
# Main simulation loop
# For each dimension and seed: simulate extranuclear clusters, intranuclear
# clusters and per-marker CSR background, then run detection four ways
# (A only, B only, C only, all markers merged) and score each against the same
# ground truth. Four CSVs of per-seed metrics are written per dimension.
dimension_settings = {"3D": True, "2D": False}
seed_lst = np.arange(1, 201)

for dimension, simulate_z in dimension_settings.items():
    
    print(f"Running multi-marker simulations for dimension: {dimension}")
    
    # Per-seed metric accumulators, one set per evaluated configuration.
    precision_lst_A, recall_lst_A, accuracy_lst_A, f1_lst_A = [], [], [], []
    precision_lst_B, recall_lst_B, accuracy_lst_B, f1_lst_B = [], [], [], []
    precision_lst_C, recall_lst_C, accuracy_lst_C, f1_lst_C = [], [], [], []
    precision_lst_all, recall_lst_all, accuracy_lst_all, f1_lst_all = [], [], [], []

    for seed in seed_lst:

        # simulate data
        # Each simulator gets a distinct seed within an iteration (offsets 0, 100,
        # 200, 300, 400). NOTE(review): with 200 seeds these offsets overlap across
        # iterations (e.g. seed 1 intra and seed 101 extra both use 101).
        multi_simulate_extra = multi_simulation(name = name, density = extra_density, shape = shape, layer_num = layer_num, layer_gap = layer_gap, simulate_z = simulate_z, write_path = write_path, seed = seed)
        parents_extra, parents_all_extra, points_extra = multi_simulate_extra.simulate_cluster(num_clusters = extra_num_clusters, beta = extra_beta, comp_prob = extra_comp_prob, mean_dist = extra_mean_dist, comp_thr = 2)
        
        multi_simulate_intra = multi_simulation(name = name, density = intra_density, shape = shape, layer_num = layer_num, layer_gap = layer_gap, simulate_z = simulate_z, write_path = write_path, seed = seed + 100)
        parents_intra, parents_all_intra, points_intra = multi_simulate_intra.simulate_cluster(num_clusters = intra_num_clusters, beta = intra_beta, comp_prob = intra_comp_prob, mean_dist = intra_mean_dist, comp_thr = 2)
        
        simulate_A = simulation(name = name[0], density = CSR_density[0], shape = shape, layer_num = layer_num, layer_gap = layer_gap, simulate_z = simulate_z, write_path = write_path, seed = seed + 200)
        points_CSR_A = simulate_A.simulate_CSR()

        simulate_B = simulation(name = name[1], density = CSR_density[1], shape = shape, layer_num = layer_num, layer_gap = layer_gap, simulate_z = simulate_z, write_path = write_path, seed = seed + 300)
        points_CSR_B = simulate_B.simulate_CSR()

        simulate_C = simulation(name = name[2], density = CSR_density[2], shape = shape, layer_num = layer_num, layer_gap = layer_gap, simulate_z = simulate_z, write_path = write_path, seed = seed + 400)
        points_CSR_C = simulate_C.simulate_CSR()
        
        # Ground truth is the first value returned by the extranuclear
        # simulate_cluster call (presumably parents passing comp_thr — TODO confirm);
        # parents_all_extra and both intranuclear parent frames are unused here.
        parents_all = parents_extra
        points_all = pd.concat([points_extra, points_intra, points_CSR_A, points_CSR_B, points_CSR_C], axis = 0, ignore_index = True)
        
        # Per-marker subsets of the combined transcripts
        points_A = points_all[points_all["target"] == "A"]
        points_B = points_all[points_all["target"] == "B"]
        points_C = points_all[points_all["target"] == "C"]
        
        # save simulated data
        write_simulated_data(points_all, f"simulated_data/multi_marker/{dimension}/all/seed_{seed}.csv", simulate_z)
        write_simulated_data(points_A, f"simulated_data/multi_marker/{dimension}/A/seed_{seed}.csv", simulate_z)
        write_simulated_data(points_B, f"simulated_data/multi_marker/{dimension}/B/seed_{seed}.csv", simulate_z)
        write_simulated_data(points_C, f"simulated_data/multi_marker/{dimension}/C/seed_{seed}.csv", simulate_z)
        
        # ground truth tree and index
        # The same tree and index set are reused for all four evaluations below.
        tree = make_tree(d1 = np.array(parents_all["global_x"]), d2 = np.array(parents_all["global_y"]), d3 = np.array(parents_all["global_z"]))
        ground_truth_indices = set(parents_all.index)
        
        # run mcDETECT on A/B/C/all
        # Single-marker runs see only that marker's transcripts.
        detect_A = model(shape = (2000, 2000), transcripts = points_A, target_all = ["A", "B", "C"], eps = 1.5, in_thr = 0.25, size_thr = 4)
        sphere_A = detect_A.dbscan_single(target_name = "A")
        precision_A, recall_A, accuracy_A, f1_A = metric_main(tree, ground_truth_indices, sphere_A)
        precision_lst_A.append(precision_A)
        recall_lst_A.append(recall_A)
        accuracy_lst_A.append(accuracy_A)
        f1_lst_A.append(f1_A)
        
        detect_B = model(shape = (2000, 2000), transcripts = points_B, target_all = ["A", "B", "C"], eps = 1.5, in_thr = 0.25, size_thr = 4)
        sphere_B = detect_B.dbscan_single(target_name = "B")
        precision_B, recall_B, accuracy_B, f1_B = metric_main(tree, ground_truth_indices, sphere_B)
        precision_lst_B.append(precision_B)
        recall_lst_B.append(recall_B)
        accuracy_lst_B.append(accuracy_B)
        f1_lst_B.append(f1_B)
        
        detect_C = model(shape = (2000, 2000), transcripts = points_C, target_all = ["A", "B", "C"], eps = 1.5, in_thr = 0.25, size_thr = 4)
        sphere_C = detect_C.dbscan_single(target_name = "C")
        precision_C, recall_C, accuracy_C, f1_C = metric_main(tree, ground_truth_indices, sphere_C)
        precision_lst_C.append(precision_C)
        recall_lst_C.append(recall_C)
        accuracy_lst_C.append(accuracy_C)
        f1_lst_C.append(f1_C)
        
        # Multi-marker run: all transcripts, merged via merge_data with
        # comp_thr = 2 and p = 0.2.
        detect_all = model(shape = (2000, 2000), transcripts = points_all, target_all = ["A", "B", "C"], eps = 1.5, in_thr = 0.25, comp_thr = 2, size_thr = 4, p = 0.2)
        sphere_all = detect_all.merge_data()
        precision_all, recall_all, accuracy_all, f1_all = metric_main(tree, ground_truth_indices, sphere_all)
        precision_lst_all.append(precision_all)
        recall_lst_all.append(recall_all)
        accuracy_lst_all.append(accuracy_all)
        f1_lst_all.append(f1_all)
        
        if seed % 50 == 0:
            print("{} out of {} iterations!".format(seed, len(seed_lst)))
    
    # One CSV per configuration. NOTE(review): the F1 column is named "F1" here
    # but "F1 Score" in the single-marker CSVs — downstream readers must handle both.
    pd.DataFrame({"Simulation": seed_lst.tolist(), "Precision": precision_lst_A, "Recall": recall_lst_A, "Accuracy": accuracy_lst_A, "F1": f1_lst_A}).to_csv(os.path.join(output_dir, f"multi_marker_{dimension}_A_{extra_num_clusters}_{intra_num_clusters}.csv"), index = 0)
    pd.DataFrame({"Simulation": seed_lst.tolist(), "Precision": precision_lst_B, "Recall": recall_lst_B, "Accuracy": accuracy_lst_B, "F1": f1_lst_B}).to_csv(os.path.join(output_dir, f"multi_marker_{dimension}_B_{extra_num_clusters}_{intra_num_clusters}.csv"), index = 0)
    pd.DataFrame({"Simulation": seed_lst.tolist(), "Precision": precision_lst_C, "Recall": recall_lst_C, "Accuracy": accuracy_lst_C, "F1": f1_lst_C}).to_csv(os.path.join(output_dir, f"multi_marker_{dimension}_C_{extra_num_clusters}_{intra_num_clusters}.csv"), index = 0)
    pd.DataFrame({"Simulation": seed_lst.tolist(), "Precision": precision_lst_all, "Recall": recall_lst_all, "Accuracy": accuracy_lst_all, "F1": f1_lst_all}).to_csv(os.path.join(output_dir, f"multi_marker_{dimension}_all_{extra_num_clusters}_{intra_num_clusters}.csv"), index = 0)

Running multi-marker simulations for dimension: 3D
50 out of 200 iterations!
100 out of 200 iterations!
150 out of 200 iterations!
200 out of 200 iterations!
Running multi-marker simulations for dimension: 2D
50 out of 200 iterations!


## Benchmark parameter p in the multi-marker scenario

**Note on raw count vs precision:** Raw detection count (e.g., 5137 at p=0.2) exceeds ground truth (~3000) because many detections are *split* (one aggregate detected as multiple overlapping spheres). Precision/recall use spatial matching: FP = detections with NO ground truth overlap. At p=0.2, precision ~95% and recall ~99% (same as Multi-marker CSR section), so few true FPs. The model always applies "drop contained" (see `remove_overlaps` in model.py); p only controls merging of overlapping-but-not-containing pairs.

In [6]:
# Set up
# Settings for the p-parameter benchmark in the next cell. The values repeat
# the multi-marker settings so this section can be run on its own.
name = ["A", "B", "C"]

# Geometry and output arguments forwarded unchanged to `simulation` /
# `multi_simulation`.
shape = (2000, 2000)
layer_num = 8
layer_gap = 1.5
write_path = ""

# Background (CSR) density per marker, indexed like `name`.
CSR_density = [0.04, 0.02, 0.01]

# Extranuclear clusters: per-marker density plus the `num_clusters`, `beta`,
# `comp_prob` and `mean_dist` arguments of `multi_simulation.simulate_cluster`.
# Exact semantics are defined in simulate.py — TODO confirm.
extra_density = [0.02, 0.01, 0.005]
extra_num_clusters = 5000
extra_beta = (1, 19)
extra_comp_prob = [0.4, 0.3, 0.3]
extra_mean_dist = 1

# Intranuclear clusters: same parameter set as above.
intra_density = [0.02, 0.01, 0.005]
intra_num_clusters = 1000
intra_beta = (19, 1)
intra_comp_prob = [0.8, 0.1, 0.1]
intra_mean_dist = 4

In [10]:
# Benchmark p parameter in multi-marker scenario across multiple seeds
# Ground truth: extrasomatic aggregates with 2+ components (~3000, 60% of 5000)
# Raw detection count vs GT: excess at low p is mostly SPLIT detections (same aggregate as multiple spheres), not true FPs
# Precision/recall (via metric_main) correctly measure true FPs: detections with NO GT overlap
#
# For every p value and every seed: simulate the 3D multi-marker data, run the
# merged detection with that p, then record the detection count, the four
# metrics from metric_main, and the mean number of detections covering each
# ground-truth point. Means over seeds and the per-seed values are both saved.
p_values = np.arange(0, 1.1, 0.1)

# Use multiple seeds for more robust benchmarking
benchmark_seeds = np.arange(1, 11)  # 10 seeds

# Results storage - will aggregate across seeds (one entry per p value)
num_detections_vs_p = []
precision_vs_p = []
recall_vs_p = []
accuracy_vs_p = []
f1_vs_p = []
avg_detections_per_GT_vs_p = []

# Storage for per-seed results (one list of per-seed values per p value)
all_num_detections = []
all_precision = []
all_recall = []
all_accuracy = []
all_f1 = []
all_avg_detections_per_GT = []

print("Benchmarking p parameter across multiple seeds...")
for p in p_values:
    # Storage for this p value across all seeds
    seed_num_detections = []
    seed_precision = []
    seed_recall = []
    seed_accuracy = []
    seed_f1 = []
    seed_avg_detections_per_GT = []
    
    for benchmark_seed in benchmark_seeds:
        
        # Simulate data for this seed
        # NOTE(review): the simulated data depend only on benchmark_seed, not on p,
        # so they are regenerated for every p value. They could be simulated once
        # per seed and reused, provided `model` does not modify `transcripts`
        # in place — confirm before hoisting.
        multi_simulate_extra = multi_simulation(name = name, density = extra_density, shape = shape, layer_num = layer_num, layer_gap = layer_gap, simulate_z = True, write_path = write_path, seed = benchmark_seed)
        parents_extra, parents_all_extra, points_extra = multi_simulate_extra.simulate_cluster(num_clusters = extra_num_clusters, beta = extra_beta, comp_prob = extra_comp_prob, mean_dist = extra_mean_dist, comp_thr = 2)
        
        multi_simulate_intra = multi_simulation(name = name, density = intra_density, shape = shape, layer_num = layer_num, layer_gap = layer_gap, simulate_z = True, write_path = write_path, seed = benchmark_seed + 10)
        parents_intra, parents_all_intra, points_intra = multi_simulate_intra.simulate_cluster(num_clusters = intra_num_clusters, beta = intra_beta, comp_prob = intra_comp_prob, mean_dist = intra_mean_dist, comp_thr = 2)
        
        simulate_A = simulation(name = name[0], density = CSR_density[0], shape = shape, layer_num = layer_num, layer_gap = layer_gap, simulate_z = True, write_path = write_path, seed = benchmark_seed + 20)
        points_CSR_A = simulate_A.simulate_CSR()
        
        simulate_B = simulation(name = name[1], density = CSR_density[1], shape = shape, layer_num = layer_num, layer_gap = layer_gap, simulate_z = True, write_path = write_path, seed = benchmark_seed + 30)
        points_CSR_B = simulate_B.simulate_CSR()
        
        simulate_C = simulation(name = name[2], density = CSR_density[2], shape = shape, layer_num = layer_num, layer_gap = layer_gap, simulate_z = True, write_path = write_path, seed = benchmark_seed + 40)
        points_CSR_C = simulate_C.simulate_CSR()
        
        parents_all = parents_extra
        points_all = pd.concat([points_extra, points_intra, points_CSR_A, points_CSR_B, points_CSR_C], axis = 0, ignore_index = True)
        
        # Ground truth for precision/recall (same as Multi-marker CSR section)
        tree = make_tree(d1 = np.array(parents_all["global_x"]), d2 = np.array(parents_all["global_y"]), d3 = np.array(parents_all["global_z"]))
        ground_truth_indices = set(parents_all.index)
        
        # Run detection with this p value
        detect_all = model(shape = (2000, 2000), transcripts = points_all, target_all = ["A", "B", "C"], eps = 1.5, in_thr = 0.25, comp_thr = 2, size_thr = 4, p = p, l = 2.5)
        sphere_all = detect_all.merge_data()
        
        # Count detections
        num_detections = sphere_all.shape[0]
        seed_num_detections.append(num_detections)
        
        # Precision, recall, accuracy, F1
        precision, recall, accuracy, f1 = metric_main(tree, ground_truth_indices, sphere_all)
        seed_precision.append(precision)
        seed_recall.append(recall)
        seed_accuracy.append(accuracy)
        seed_f1.append(f1)
        
        # For each ground truth aggregate: count how many detections contain it (overlap it)
        # A detection contains GT if dist(sphere_center, GT_center) <= sphere_r
        # Computed one detection at a time against all GT points with NumPy; this
        # replaces a nested DataFrame.iterrows() double loop (GT x detections) and
        # no longer rebinds the notebook-level name `sphere` as a loop variable.
        gt_xyz = parents_all[["global_x", "global_y", "global_z"]].to_numpy(dtype = float)
        detections_per_GT = np.zeros(gt_xyz.shape[0], dtype = int)
        if num_detections > 0:
            # Columns are only read when there are rows, so an empty result frame
            # without the sphere_* columns still yields all-zero counts.
            detection_xyzr = sphere_all[["sphere_x", "sphere_y", "sphere_z", "sphere_r"]].to_numpy(dtype = float)
            for cx, cy, cz, r in detection_xyzr:
                dist = np.sqrt((gt_xyz[:, 0] - cx)**2 + (gt_xyz[:, 1] - cy)**2 + (gt_xyz[:, 2] - cz)**2)
                detections_per_GT += dist <= r
        avg_detections_per_GT = detections_per_GT.mean() if detections_per_GT.size else 0.0
        seed_avg_detections_per_GT.append(avg_detections_per_GT)
    
    # Calculate mean across seeds for this p value
    num_detections_vs_p.append(np.mean(seed_num_detections))
    precision_vs_p.append(np.mean(seed_precision))
    recall_vs_p.append(np.mean(seed_recall))
    accuracy_vs_p.append(np.mean(seed_accuracy))
    f1_vs_p.append(np.mean(seed_f1))
    avg_detections_per_GT_vs_p.append(np.mean(seed_avg_detections_per_GT))
    
    # Store all seed results
    all_num_detections.append(seed_num_detections)
    all_precision.append(seed_precision)
    all_recall.append(seed_recall)
    all_accuracy.append(seed_accuracy)
    all_f1.append(seed_f1)
    all_avg_detections_per_GT.append(seed_avg_detections_per_GT)
    
    print(f"p = {p:.1f}: Mean {np.mean(seed_num_detections):.1f} detections, Prec {np.mean(seed_precision):.2%}, Rec {np.mean(seed_recall):.2%}, Acc {np.mean(seed_accuracy):.2%}, F1 {np.mean(seed_f1):.2%}, Avg det/GT {np.mean(seed_avg_detections_per_GT):.2f}")

# Save mean results
p_benchmark_df = pd.DataFrame({
    "p": p_values,
    "num_detections": num_detections_vs_p,
    "precision": precision_vs_p,
    "recall": recall_vs_p,
    "accuracy": accuracy_vs_p,
    "f1": f1_vs_p,
    "avg_detections_per_GT": avg_detections_per_GT_vs_p
})
p_benchmark_df.to_csv(os.path.join(output_dir, "p_benchmark_multi_marker_3D_mean.csv"), index = 0)

# Save detailed results with all seeds
# Rows are ordered p-major / seed-minor, matching how the all_* lists were filled.
p_benchmark_df_detailed = pd.DataFrame({
    "p": np.repeat(p_values, len(benchmark_seeds)),
    "seed": np.tile(benchmark_seeds, len(p_values)),
    "num_detections": [item for sublist in all_num_detections for item in sublist],
    "precision": [item for sublist in all_precision for item in sublist],
    "recall": [item for sublist in all_recall for item in sublist],
    "accuracy": [item for sublist in all_accuracy for item in sublist],
    "f1": [item for sublist in all_f1 for item in sublist],
    "avg_detections_per_GT": [item for sublist in all_avg_detections_per_GT for item in sublist]
})
p_benchmark_df_detailed.to_csv(os.path.join(output_dir, "p_benchmark_multi_marker_3D_detailed.csv"), index = 0)

Benchmarking p parameter across multiple seeds...
p = 0.0: Mean 4106.7 detections, Prec 96.12%, Rec 98.79%, Acc 97.10%, F1 97.44%, Avg det/GT 1.34
p = 0.1: Mean 3864.5 detections, Prec 96.12%, Rec 98.79%, Acc 96.92%, F1 97.44%, Avg det/GT 1.26
p = 0.2: Mean 3198.8 detections, Prec 96.23%, Rec 98.78%, Acc 96.39%, F1 97.49%, Avg det/GT 1.04
p = 0.3: Mean 3042.2 detections, Prec 96.70%, Rec 98.78%, Acc 96.69%, F1 97.73%, Avg det/GT 0.99
p = 0.4: Mean 3033.4 detections, Prec 96.83%, Rec 98.78%, Acc 96.81%, F1 97.79%, Avg det/GT 0.99
p = 0.5: Mean 3031.3 detections, Prec 96.86%, Rec 98.77%, Acc 96.85%, F1 97.81%, Avg det/GT 0.99
p = 0.6: Mean 3029.9 detections, Prec 96.86%, Rec 98.76%, Acc 96.84%, F1 97.80%, Avg det/GT 0.99
p = 0.7: Mean 3029.3 detections, Prec 96.87%, Rec 98.77%, Acc 96.85%, F1 97.81%, Avg det/GT 0.99
p = 0.8: Mean 3028.4 detections, Prec 96.87%, Rec 98.76%, Acc 96.85%, F1 97.80%, Avg det/GT 0.99
p = 0.9: Mean 3027.6 detections, Prec 96.88%, Rec 98.76%, Acc 96.86%, F1 97.8

## Benchmark ratio between CSR, extranuclear, and intranuclear aggregation

In [None]:
# Benchmark 1: Vary CSR ratio, keep extra and intra-nuclear ratios identical
# Only use A, B, C markers and 3D case
#
# For each CSR ratio the remaining density is split equally between
# extranuclear and intranuclear clusters, cluster counts are rescaled by the
# same factor as their density share, and single-marker detection is scored
# over all seeds. One CSV is written per (CSR ratio, marker).

# Original marker settings for A, B, C
marker_settings_original = {"A": {"density": 0.08, "num_clusters_extra": 5000, "num_clusters_intra": 2000},
                            "B": {"density": 0.04, "num_clusters_extra": 3000, "num_clusters_intra": 1200},
                            "C": {"density": 0.02, "num_clusters_extra": 2000, "num_clusters_intra": 800}}

# Targets handed to the model, derived from the marker settings so the two
# cannot drift apart.
benchmark_targets = list(marker_settings_original)

# Original ratio
original_ratio = [0.5, 0.25, 0.25]  # [CSR, Extra, Intra]
original_extra_ratio = original_ratio[1]
original_intra_ratio = original_ratio[2]

# Test 5 different CSR ratios
csr_ratios = [0.2, 0.3, 0.4, 0.5, 0.6]

# Settings
point_type = ["CSR", "Extranuclear", "Intranuclear"]
mean_dist_extra = 1
mean_dist_intra = 4
beta_extra = (1, 19)
beta_intra = (19, 1)
simulate_z = True  # 3D only
shape_area = 2000 * 2000  # Used for scaling cluster counts (not referenced in this cell)

seed_lst = np.arange(1, 101)

print("Benchmark 1: Varying CSR ratio (keeping extra = intra)")
print(f"Testing CSR ratios: {csr_ratios}")

for csr_ratio in csr_ratios:
    # Calculate extra and intra ratios (they must be equal and sum with CSR to 1)
    remaining_ratio = 1.0 - csr_ratio
    extra_ratio = remaining_ratio / 2.0
    intra_ratio = remaining_ratio / 2.0
    ratio = [csr_ratio, extra_ratio, intra_ratio]
    
    print(f"\nCSR ratio: {csr_ratio:.2f}, Extra ratio: {extra_ratio:.2f}, Intra ratio: {intra_ratio:.2f}")
    
    for name, params_original in marker_settings_original.items():
        density_overall = params_original["density"]
        
        # Calculate scaling factors to maintain approximately same number of transcripts per cluster
        # Points per cluster ≈ (density * shape_area) / num_clusters
        # To keep points per cluster constant: num_clusters must scale proportionally with density change
        # num_clusters_new / num_clusters_original = new_density / original_density = new_ratio / original_ratio
        extra_scale_factor = extra_ratio / original_extra_ratio
        intra_scale_factor = intra_ratio / original_intra_ratio
        
        # round() rather than int(): the scale factors are floating-point
        # quotients, so a product that should be a whole number can land just
        # below it (e.g. 6999.999...), and int() would truncate away a cluster.
        num_clusters_extra = round(params_original["num_clusters_extra"] * extra_scale_factor)
        num_clusters_intra = round(params_original["num_clusters_intra"] * intra_scale_factor)
        
        print(f"  Marker {name}: num_clusters_extra={num_clusters_extra}, num_clusters_intra={num_clusters_intra}")
        
        # Per-seed metrics for this (CSR ratio, marker) pair
        precision_lst = []
        recall_lst = []
        accuracy_lst = []
        f1_lst = []
        
        for seed in seed_lst:
            # Simulate data: one simulator per point type with that type's density share
            for i in range(len(point_type)):
                simulate = simulation(name=name, density=density_overall * ratio[i], shape=(2000, 2000), 
                                     layer_num=8, layer_gap=1.5, simulate_z=simulate_z, 
                                     write_path=output_dir + "/", seed=seed)
                if i == 0:
                    points_CSR = simulate.simulate_CSR()
                    points_CSR["type"] = [point_type[i]] * points_CSR.shape[0]
                elif i == 1:
                    parents_cluster_extra, points_cluster_extra = simulate.simulate_cluster(
                        num_clusters=num_clusters_extra, beta=beta_extra, mean_dist=mean_dist_extra)
                    points_cluster_extra["type"] = [point_type[i]] * points_cluster_extra.shape[0]
                elif i == 2:
                    parents_cluster_intra, points_cluster_intra = simulate.simulate_cluster(
                        num_clusters=num_clusters_intra, beta=beta_intra, mean_dist=mean_dist_intra)
                    points_cluster_intra["type"] = [point_type[i]] * points_cluster_intra.shape[0]
            
            points_all = pd.concat([points_CSR, points_cluster_extra, points_cluster_intra], 
                                  axis=0, ignore_index=True)
            # Ground truth = extranuclear cluster parents only
            parents_all = parents_cluster_extra
            
            # Run mcDETECT
            detect = model(shape=(2000, 2000), transcripts=points_all, target_all=benchmark_targets, 
                          eps=1.5, in_thr=0.25, size_thr=4)
            sphere = detect.dbscan_single(target_name=name)
            
            # Find matched index
            tree = make_tree(d1=np.array(parents_all["global_x"]), 
                           d2=np.array(parents_all["global_y"]), 
                           d3=np.array(parents_all["global_z"]))
            ground_truth_indices = set(parents_all.index)
            
            # Calculate all metrics
            precision, recall, accuracy, f1 = metric_main(tree, ground_truth_indices, sphere)
            precision_lst.append(precision)
            recall_lst.append(recall)
            accuracy_lst.append(accuracy)
            f1_lst.append(f1)
            
            if seed % 25 == 0:
                print(f"    {seed} out of {len(seed_lst)} iterations!")
        
        # Save results
        results_df = pd.DataFrame({
            "Simulation": seed_lst.tolist(),
            "Precision": precision_lst,
            "Recall": recall_lst,
            "Accuracy": accuracy_lst,
            "F1_Score": f1_lst,
            "CSR_ratio": [csr_ratio] * len(seed_lst),
            "Extra_ratio": [extra_ratio] * len(seed_lst),
            "Intra_ratio": [intra_ratio] * len(seed_lst)
        })
        filename = f"benchmark_ratio_csr_{name}_csr{csr_ratio:.2f}_extra{extra_ratio:.2f}_intra{intra_ratio:.2f}.csv"
        results_df.to_csv(os.path.join(output_dir, filename), index=0)
        
        print(f"  Marker {name}: Mean Precision={np.mean(precision_lst):.4f}, "
              f"Recall={np.mean(recall_lst):.4f}, F1={np.mean(f1_lst):.4f}")

print("\nBenchmark 1 completed!")

In [None]:
# Benchmark 2 configuration.
# The CSR share is held at 0.5 while the remaining 0.5 is divided between the
# extranuclear and intranuclear components in five different ways.
# Restricted to markers A, B, C and to the 3D setting.

# Baseline per-marker parameters: overall density plus the extra-/intranuclear
# cluster counts that go with the original [0.5, 0.25, 0.25] composition.
marker_settings_original = {
    marker: {"density": density, "num_clusters_extra": n_extra, "num_clusters_intra": n_intra}
    for marker, density, n_extra, n_intra in (
        ("A", 0.08, 5000, 2000),
        ("B", 0.04, 3000, 1200),
        ("C", 0.02, 2000, 800),
    )
}

# Baseline composition, ordered as [CSR, Extra, Intra]; the last two entries are
# the denominators used later when rescaling cluster counts.
original_ratio = [0.5, 0.25, 0.25]
original_extra_ratio, original_intra_ratio = original_ratio[1:]

# CSR share is fixed; each (extra, intra) pair below must therefore sum to 0.5.
# The five splits are symmetric: the extra share walks from 0.4 (extra-heavy)
# down to 0.1 (intra-heavy) while the intra share walks the same values in reverse,
# with (0.25, 0.25) in the middle being the original, equal split.
csr_ratio_fixed = 0.5
_extra_levels = [0.4, 0.325, 0.25, 0.175, 0.1]
extra_intra_splits = list(zip(_extra_levels, reversed(_extra_levels)))

# Simulation constants shared by every run
point_type = ["CSR", "Extranuclear", "Intranuclear"]  # labels, indexed like the ratio list
mean_dist_extra, mean_dist_intra = 1, 4    # forwarded as `mean_dist` to simulate_cluster
beta_extra, beta_intra = (1, 19), (19, 1)  # forwarded as `beta` to simulate_cluster
                                           # (presumably Beta shape parameters; confirm in the simulate module)
simulate_z = True                          # 3D only
shape_area = 2000 ** 2  # Area of the 2000 x 2000 field; not read anywhere else in this cell

# One seed per simulation replicate: 1, 2, ..., 100
seed_lst = np.arange(1, 101)

print("Benchmark 2: Varying extra/intra ratio (fixing CSR at 0.5)")
print(f"Testing extra/intra splits: {extra_intra_splits}")

# Benchmark 2 main loop.
# For every (extra, intra) split and every marker: run one simulation per seed,
# detect clusters with the model, score the detections against the extranuclear
# cluster parents, and write one CSV of per-seed metrics into `output_dir`.
field_shape = (2000, 2000)  # passed as `shape` to both the simulator and the detector

for extra_ratio, intra_ratio in extra_intra_splits:
    ratio = [csr_ratio_fixed, extra_ratio, intra_ratio]  # indexed like point_type

    # The three proportions divide one overall density, so they have to add up to 1;
    # fail early instead of silently benchmarking at a different total density.
    assert np.isclose(sum(ratio), 1.0), f"CSR/extra/intra ratios must sum to 1, got {ratio}"

    print(f"\nCSR ratio: {csr_ratio_fixed:.2f}, Extra ratio: {extra_ratio:.2f}, Intra ratio: {intra_ratio:.2f}")

    for name, params_original in marker_settings_original.items():
        density_overall = params_original["density"]

        # Keep the number of transcripts per cluster roughly constant:
        #   points per cluster ≈ (density * area) / num_clusters
        # so when a component's share of the density changes, its cluster count is
        # scaled by the same factor (new_ratio / original_ratio).
        extra_scale_factor = extra_ratio / original_extra_ratio
        intra_scale_factor = intra_ratio / original_intra_ratio

        # Round to the nearest integer instead of truncating: the scale factors are
        # floating-point quotients, and a product such as 3499.9999999999995 would
        # otherwise lose a whole cluster.
        num_clusters_extra = int(round(params_original["num_clusters_extra"] * extra_scale_factor))
        num_clusters_intra = int(round(params_original["num_clusters_intra"] * intra_scale_factor))

        print(f"  Marker {name}: num_clusters_extra={num_clusters_extra}, num_clusters_intra={num_clusters_intra}")

        # Per-seed metric accumulators for this (split, marker) combination
        precision_lst = []
        recall_lst = []
        accuracy_lst = []
        f1_lst = []

        for seed in seed_lst:
            # Simulate data. Each component gets its own simulator instance, built
            # with the same seed and settings but its own share of the overall density.
            sim_kwargs = dict(name=name, shape=field_shape, layer_num=8, layer_gap=1.5,
                              simulate_z=simulate_z, write_path=output_dir + "/", seed=seed)

            # Component 0: CSR points
            points_CSR = simulation(density=density_overall * ratio[0], **sim_kwargs).simulate_CSR()
            points_CSR["type"] = point_type[0]

            # Component 1: extranuclear clusters (their parents are the ground truth below)
            parents_cluster_extra, points_cluster_extra = simulation(
                density=density_overall * ratio[1], **sim_kwargs
            ).simulate_cluster(num_clusters=num_clusters_extra, beta=beta_extra, mean_dist=mean_dist_extra)
            points_cluster_extra["type"] = point_type[1]

            # Component 2: intranuclear clusters
            parents_cluster_intra, points_cluster_intra = simulation(
                density=density_overall * ratio[2], **sim_kwargs
            ).simulate_cluster(num_clusters=num_clusters_intra, beta=beta_intra, mean_dist=mean_dist_intra)
            points_cluster_intra["type"] = point_type[2]

            points_all = pd.concat([points_CSR, points_cluster_extra, points_cluster_intra],
                                   axis=0, ignore_index=True)
            # Only the extranuclear parents are scored against; the intranuclear
            # parents are deliberately not part of parents_all.
            parents_all = parents_cluster_extra

            # Run mcDETECT
            detect = model(shape=field_shape, transcripts=points_all, target_all=["A", "B", "C"],
                           eps=1.5, in_thr=0.25, size_thr=4)
            sphere = detect.dbscan_single(target_name=name)

            # Find matched index: build the lookup structure over the ground-truth
            # parent coordinates and collect the ids that count as ground truth.
            tree = make_tree(d1=np.array(parents_all["global_x"]),
                             d2=np.array(parents_all["global_y"]),
                             d3=np.array(parents_all["global_z"]))
            ground_truth_indices = set(parents_all.index)

            # Calculate all metrics
            precision, recall, accuracy, f1 = metric_main(tree, ground_truth_indices, sphere)
            precision_lst.append(precision)
            recall_lst.append(recall)
            accuracy_lst.append(accuracy)
            f1_lst.append(f1)

            # Progress message every 25 seeds
            if seed % 25 == 0:
                print(f"    {seed} out of {len(seed_lst)} iterations!")

        # Save results: one row per seed, with the ratios repeated on every row
        results_df = pd.DataFrame({
            "Simulation": seed_lst.tolist(),
            "Precision": precision_lst,
            "Recall": recall_lst,
            "Accuracy": accuracy_lst,
            "F1_Score": f1_lst,
            "CSR_ratio": [csr_ratio_fixed] * len(seed_lst),
            "Extra_ratio": [extra_ratio] * len(seed_lst),
            "Intra_ratio": [intra_ratio] * len(seed_lst)
        })
        # Three decimals for extra/intra so that 0.325 and 0.175 stay distinguishable
        filename = f"benchmark_ratio_fixedcsr_{name}_csr{csr_ratio_fixed:.2f}_extra{extra_ratio:.3f}_intra{intra_ratio:.3f}.csv"
        results_df.to_csv(os.path.join(output_dir, filename), index=False)

        print(f"  Marker {name}: Mean Precision={np.mean(precision_lst):.4f}, "
              f"Recall={np.mean(recall_lst):.4f}, F1={np.mean(f1_lst):.4f}")

print("\nBenchmark 2 completed!")