In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from model import *
from simulate import *

In [2]:
# Function for calculating precision, recall, and accuracy
def _safe_ratio(numerator, denominator, default):
    """Return numerator / denominator, or `default` when the denominator is zero."""
    if denominator == 0:
        return default
    return numerator / denominator


def calculate_metric(ground_truth_indices, matched_index, zero_division = 0.0):
    """Compute precision, recall, and accuracy of detections against ground truth.

    Parameters
    ----------
    ground_truth_indices : iterable
        Indices of the ground-truth points. Any iterable is accepted; it is
        converted to a set internally.
    matched_index : sequence
        One entry per detection: -1 when the detection matched no ground-truth
        point, a single index when it matched exactly one, or a tuple of
        indices when it matched several.
    zero_division : float, default 0.0
        Value returned for a metric whose denominator is zero (no detections
        and/or no ground-truth points) instead of raising ZeroDivisionError.

    Returns
    -------
    tuple
        (precision, recall, accuracy) where
        precision = TP / (TP + FP), recall = TP / (TP + FN), and
        accuracy = matched detections / all detections.
        TP counts unique ground-truth points hit by any detection, FP counts
        detections with no match, FN counts ground-truth points never hit.
    """
    ground_truth = set(ground_truth_indices)

    # Collect every ground-truth index hit by a detection and count the
    # detections that hit nothing, in a single pass
    matched_points = set()
    unmatched_detections = 0
    total_detections = 0
    for match in matched_index:
        total_detections += 1
        if isinstance(match, tuple):
            matched_points.update(match)
        elif match == -1:
            unmatched_detections += 1
        else:
            matched_points.add(match)

    # 1. True Positives (TP): Unique ground truth points correctly detected
    true_positives = len(matched_points & ground_truth)

    # 2. False Positives (FP): Detections that didn't match any ground truth
    false_positives = unmatched_detections

    # 3. False Negatives (FN): Ground truth points that were never matched
    false_negatives = len(ground_truth - matched_points)

    # 4. Precision and recall
    precision = _safe_ratio(true_positives, true_positives + false_positives, zero_division)
    recall = _safe_ratio(true_positives, true_positives + false_negatives, zero_division)

    # 5. Accuracy: share of detections that matched at least one ground-truth point
    true_matches = total_detections - unmatched_detections
    accuracy = _safe_ratio(true_matches, total_detections, zero_division)

    return precision, recall, accuracy


def metric_main(tree, ground_truth_indices, sphere):
    """Match every detected sphere to ground-truth points and score the result.

    Parameters
    ----------
    tree : object
        Spatial index over the ground-truth points; only its
        `query_ball_point(center, radius)` method is used.
    ground_truth_indices : set
        Indices of the ground-truth points, forwarded to `calculate_metric`.
    sphere : DataFrame
        One row per detection with columns 'sphere_x', 'sphere_y', 'sphere_z'
        (center) and 'sphere_r' (radius).

    Returns
    -------
    tuple
        (precision, recall, accuracy) as returned by `calculate_metric`.
    """
    centers_and_radii = zip(sphere['sphere_x'], sphere['sphere_y'], sphere['sphere_z'], sphere['sphere_r'])

    matched_index = []
    for center_x, center_y, center_z, radius in centers_and_radii:
        hits = tree.query_ball_point([center_x, center_y, center_z], radius)
        num_hits = len(hits)
        if num_hits == 0:
            # no ground-truth point inside this sphere
            matched_index.append(-1)
        elif num_hits == 1:
            # exactly one hit: store the bare index
            matched_index.extend(hits)
        else:
            # several hits: keep them together as one tuple
            matched_index.append(tuple(hits))

    return calculate_metric(ground_truth_indices, matched_index)

## Single-marker CSR and aggregation

In [None]:
# Point types and the share of the overall density assigned to each
# (the two lists are index-aligned)
_ratio_by_type = {'CSR': 0.5, 'Extranuclear': 0.25, 'Intranuclear': 0.25}
point_type = list(_ratio_by_type.keys())
ratio = list(_ratio_by_type.values())

In [None]:
# Mean distances passed to simulate_cluster for extranuclear / intranuclear aggregation
mean_dist_extra, mean_dist_intra = 1, 3.5

# `beta` arguments controlling the in-nucleus ratio of extranuclear / intranuclear aggregation
# (presumably Beta(a, b) shape parameters -- confirm against simulate_cluster)
beta_extra, beta_intra = (2, 8), (8, 2)

In [None]:
# Main
# Each scenario fixes the overall transcript density and the number of
# extranuclear / intranuclear clusters; pick one with `name`.
scenarios = {
    'A': {'density_overall': 0.08, 'num_clusters_extra': 5000, 'num_clusters_intra': 2000},
    'B': {'density_overall': 0.04, 'num_clusters_extra': 3000, 'num_clusters_intra': 1200},
    'C': {'density_overall': 0.02, 'num_clusters_extra': 2000, 'num_clusters_intra': 800},
}
name = 'A'
density_overall = scenarios[name]['density_overall']
num_clusters_extra = scenarios[name]['num_clusters_extra']
num_clusters_intra = scenarios[name]['num_clusters_intra']

# One switch for the 3D / 2D setting: it drives both the simulator flag and
# the output folder, so the two can no longer get out of sync
simulate_z = True
setting = '3D' if simulate_z else '2D'

shape = (2000, 2000)

# Arguments shared by the three simulators built in every iteration
sim_kwargs = dict(name = name, shape = shape, layer_num = 8, layer_gap = 1.5, simulate_z = simulate_z, write_path = 'main_output/')

# Create the output folder before the long loop; to_csv does not create
# missing directories and would otherwise fail after all iterations have run
output_dir = 'main_output/single_marker_{}/'.format(setting)
os.makedirs(output_dir, exist_ok = True)

seed_lst = np.arange(1, 201)
precision_lst = []
recall_lst = []
accuracy_lst = []

for iteration, seed in enumerate(seed_lst, start = 1):

    # simulate data: one simulator per point type, each receiving its share of
    # the overall density; all three use the same seed within an iteration
    simulate = simulation(density = density_overall * ratio[0], seed = seed, **sim_kwargs)
    points_CSR = simulate.simulate_CSR()
    points_CSR['type'] = point_type[0]

    simulate = simulation(density = density_overall * ratio[1], seed = seed, **sim_kwargs)
    parents_cluster_extra, points_cluster_extra = simulate.simulate_cluster(num_clusters = num_clusters_extra, beta = beta_extra, mean_dist = mean_dist_extra)
    points_cluster_extra['type'] = point_type[1]

    simulate = simulation(density = density_overall * ratio[2], seed = seed, **sim_kwargs)
    parents_cluster_intra, points_cluster_intra = simulate.simulate_cluster(num_clusters = num_clusters_intra, beta = beta_intra, mean_dist = mean_dist_intra)
    points_cluster_intra['type'] = point_type[2]

    points_all = pd.concat([points_CSR, points_cluster_extra, points_cluster_intra], axis = 0, ignore_index = True)
    # only the extranuclear cluster parents are used as ground truth
    parents_all = parents_cluster_extra

    # run mcDETECT
    detect = model(shape = shape, transcripts = points_all, target_all = ['A', 'B', 'C'], eps = 1.5, in_thr = 0.5, size_thr = 3.5)
    sphere = detect.dbscan_single(target_name = name)

    # find matched index
    tree = make_tree(d1 = np.array(parents_all['global_x']), d2 = np.array(parents_all['global_y']), d3 = np.array(parents_all['global_z']))
    ground_truth_indices = set(parents_all.index)

    # calculate precision, recall, and accuracy
    precision, recall, accuracy = metric_main(tree, ground_truth_indices, sphere)
    precision_lst.append(precision)
    recall_lst.append(recall)
    accuracy_lst.append(accuracy)

    # report progress by iteration count so the message stays correct for any seed list
    if iteration % 50 == 0:
        print('{} out of {} iterations!'.format(iteration, len(seed_lst)))

pd.DataFrame({'Simulation': seed_lst.tolist(), 'Precision': precision_lst, 'Recall': recall_lst, 'Accuracy': accuracy_lst}).to_csv(output_dir + name + '_{}_{}.csv'.format(num_clusters_extra, num_clusters_intra), index = False)

## Multi-marker aggregation

In [None]:
# Set up
name = ['A', 'B', 'C']

# Geometry and output location shared by every simulator
shape, layer_num, layer_gap = (2000, 2000), 8, 1.5
write_path = ''

# Per-marker densities (index-aligned with `name`) for the CSR background
CSR_density = [0.04, 0.02, 0.01]

# Extranuclear aggregation
extra_density = [0.02, 0.01, 0.005]
extra_num_clusters, extra_mean_dist = 5000, 1
extra_beta = (2, 8)
extra_comp_prob = [0.4, 0.3, 0.3]

# Intranuclear aggregation
intra_density = [0.02, 0.01, 0.005]
intra_num_clusters, intra_mean_dist = 1000, 3.5
intra_beta = (8, 2)
intra_comp_prob = [0.8, 0.1, 0.1]

In [None]:
# Main
simulate_z = True
setting = '3D' if simulate_z else '2D'

seed_lst = np.arange(1, 201)
precision_lst_A, recall_lst_A, accuracy_lst_A = [], [], []
precision_lst_B, recall_lst_B, accuracy_lst_B = [], [], []
precision_lst_C, recall_lst_C, accuracy_lst_C = [], [], []
precision_lst_all, recall_lst_all, accuracy_lst_all = [], [], []

# Result lists keyed by the label used in the output file name, so the
# per-marker runs and the merged run share one code path
metric_lists = {
    'A': (precision_lst_A, recall_lst_A, accuracy_lst_A),
    'B': (precision_lst_B, recall_lst_B, accuracy_lst_B),
    'C': (precision_lst_C, recall_lst_C, accuracy_lst_C),
    'all': (precision_lst_all, recall_lst_all, accuracy_lst_all),
}

# Arguments shared by every simulator
sim_kwargs = dict(shape = shape, layer_num = layer_num, layer_gap = layer_gap, simulate_z = simulate_z, write_path = write_path)

# Create the output folder before the long loop; to_csv does not create
# missing directories and would otherwise fail after all iterations have run
output_dir = 'main_output/multi_marker_{}/'.format(setting)
os.makedirs(output_dir, exist_ok = True)

for iteration, seed in enumerate(seed_lst, start = 1):

    # simulate data
    # NOTE(review): the offsets (+100, +200, ...) are smaller than the seed range 1..200,
    # so e.g. the intranuclear seed of replicate 1 equals the extranuclear seed of
    # replicate 101 -- confirm this overlap between replicates is acceptable
    multi_simulate_extra = multi_simulation(name = name, density = extra_density, seed = seed, **sim_kwargs)
    parents_extra, parents_all_extra, points_extra = multi_simulate_extra.simulate_cluster(num_clusters = extra_num_clusters, beta = extra_beta, comp_prob = extra_comp_prob, mean_dist = extra_mean_dist, comp_thr = 1)

    multi_simulate_intra = multi_simulation(name = name, density = intra_density, seed = seed + 100, **sim_kwargs)
    parents_intra, parents_all_intra, points_intra = multi_simulate_intra.simulate_cluster(num_clusters = intra_num_clusters, beta = intra_beta, comp_prob = intra_comp_prob, mean_dist = intra_mean_dist, comp_thr = 1)

    # CSR background per marker; seeds are seed + 200, seed + 300, seed + 400 for A, B, C
    points_CSR_list = []
    for i, marker in enumerate(name):
        simulate_marker = simulation(name = marker, density = CSR_density[i], seed = seed + 200 + 100 * i, **sim_kwargs)
        points_CSR_list.append(simulate_marker.simulate_CSR())

    # only the extranuclear parents are used as ground truth
    parents_all = parents_extra
    points_all = pd.concat([points_extra, points_intra] + points_CSR_list, axis = 0, ignore_index = True)

    # ground truth tree and index
    tree = make_tree(d1 = np.array(parents_all['global_x']), d2 = np.array(parents_all['global_y']), d3 = np.array(parents_all['global_z']))
    ground_truth_indices = set(parents_all.index)

    # run mcDETECT on each marker separately
    for marker in name:
        points_marker = points_all[points_all['target'] == marker]
        detect_marker = model(shape = shape, transcripts = points_marker, target_all = name, eps = 1.5, in_thr = 0.5, size_thr = 3.5)
        sphere_marker = detect_marker.dbscan_single(target_name = marker)
        precision, recall, accuracy = metric_main(tree, ground_truth_indices, sphere_marker)
        precision_vals, recall_vals, accuracy_vals = metric_lists[marker]
        precision_vals.append(precision)
        recall_vals.append(recall)
        accuracy_vals.append(accuracy)

    # run mcDETECT on all markers merged
    detect_all = model(shape = shape, transcripts = points_all, target_all = name, eps = 1.5, in_thr = 0.5, comp_thr = 1, size_thr = 3.5, p = 0.5)
    sphere_all = detect_all.merge_data()
    precision_all, recall_all, accuracy_all = metric_main(tree, ground_truth_indices, sphere_all)
    precision_lst_all.append(precision_all)
    recall_lst_all.append(recall_all)
    accuracy_lst_all.append(accuracy_all)

    # report progress by iteration count so the message stays correct for any seed list
    print('{} out of {} iterations!'.format(iteration, len(seed_lst)))

# one CSV per label: A, B, C, and the merged run
for label, (precision_vals, recall_vals, accuracy_vals) in metric_lists.items():
    pd.DataFrame({'Simulation': seed_lst.tolist(), 'Precision': precision_vals, 'Recall': recall_vals, 'Accuracy': accuracy_vals}).to_csv(output_dir + '{}_{}_{}.csv'.format(label, extra_num_clusters, intra_num_clusters), index = False)

## Multi-marker aggregation (20 markers)

In [3]:
max_comp = 20
comp_list = np.arange(1, max_comp + 1)


def _beta_shaped_weights(counts, upper, alpha, beta_param):
    """Evaluate x^(alpha-1) * (1-x)^(beta_param-1) at x = counts / upper and normalize.

    Returns the scaled positions, the unnormalized weights, and the weights
    normalized to sum to one.
    """
    scaled = counts / upper  # scale to [0,1]
    unnormalized = (scaled**(alpha - 1)) * ((1 - scaled)**(beta_param - 1))
    return scaled, unnormalized, unnormalized / unnormalized.sum()


# Shape parameters of the component-count weights; these are separate from the
# `extra_beta` / `intra_beta` tuples used for the in-nucleus ratio
extra_alpha, extra_beta_param = 5, 3
x_extra, pdf_extra, extra_comp_prob = _beta_shaped_weights(comp_list, max_comp, extra_alpha, extra_beta_param)

intra_alpha, intra_beta_param = 2, 5
x_intra, pdf_intra, intra_comp_prob = _beta_shaped_weights(comp_list, max_comp, intra_alpha, intra_beta_param)

In [4]:
# Set up
num_A, num_B, num_C = 7, 7, 6

# Marker names A_1..A_7, B_1..B_7, C_1..C_6, in that order
_group_sizes = (('A', num_A), ('B', num_B), ('C', num_C))
name = [f"{group}_{k}" for group, size in _group_sizes for k in range(1, size + 1)]

# Base densities per marker group
CSR_density_base = {'A': 0.04, 'B': 0.02, 'C': 0.01}
extra_density_base = {'A': 0.02, 'B': 0.01, 'C': 0.005}
intra_density_base = {'A': 0.02, 'B': 0.01, 'C': 0.005}

# Expand to one density per marker; the first character of a marker name is its group
CSR_density = [CSR_density_base[marker_name[0]] for marker_name in name]
extra_density = [extra_density_base[marker_name[0]] for marker_name in name]
intra_density = [intra_density_base[marker_name[0]] for marker_name in name]

# Geometry and output location shared by every simulator
shape, layer_num, layer_gap = (2000, 2000), 8, 1.5
write_path = ''

# Extranuclear aggregation
extra_num_clusters, extra_mean_dist = 5000, 1
extra_beta = (2, 8)

# Intranuclear aggregation
intra_num_clusters, intra_mean_dist = 1000, 3.5
intra_beta = (8, 2)

In [17]:
# Main
seed_lst = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

simulate_z = True
setting = '3D' if simulate_z else '2D'

# Per-marker metrics: one list of per-seed values for every marker name
precision_dict = {m: [] for m in name}
recall_dict = {m: [] for m in name}
accuracy_dict = {m: [] for m in name}

# Metrics of the merged (all-marker) detection, one value per seed
precision_lst_all, recall_lst_all, accuracy_lst_all = [], [], []

# Arguments shared by every simulator
sim_kwargs = dict(shape = shape, layer_num = layer_num, layer_gap = layer_gap, simulate_z = simulate_z, write_path = write_path)

for iteration, seed in enumerate(seed_lst, start = 1):

    # simulate data
    multi_simulate_extra = multi_simulation(name = name, density = extra_density, seed = seed, **sim_kwargs)
    parents_extra, parents_all_extra, points_extra = multi_simulate_extra.simulate_cluster(num_clusters = extra_num_clusters, beta = extra_beta, comp_prob = extra_comp_prob, mean_dist = extra_mean_dist, comp_thr = 1)

    multi_simulate_intra = multi_simulation(name = name, density = intra_density, seed = seed + 100, **sim_kwargs)
    parents_intra, parents_all_intra, points_intra = multi_simulate_intra.simulate_cluster(num_clusters = intra_num_clusters, beta = intra_beta, comp_prob = intra_comp_prob, mean_dist = intra_mean_dist, comp_thr = 1)

    # CSR background, one simulator per marker
    # NOTE(review): seeds are spaced by 10 but there are 20 markers, so seed + 200 + i
    # repeats across replicates (seed 10 with i = 10 and seed 20 with i = 0 both give 220)
    # -- confirm this overlap is acceptable
    points_CSR_list = []
    for i, marker in enumerate(name):
        sim = simulation(name = marker, density = CSR_density[i], seed = seed + 200 + i, **sim_kwargs)
        points_CSR_list.append(sim.simulate_CSR())

    points_all = pd.concat([points_extra, points_intra] + points_CSR_list, axis = 0, ignore_index = True)
    # only the extranuclear parents are used as ground truth
    parents_all = parents_extra

    # ground truth tree and index
    tree = make_tree(d1 = np.array(parents_all['global_x']), d2 = np.array(parents_all['global_y']), d3 = np.array(parents_all['global_z']))
    ground_truth_indices = set(parents_all.index)

    # run mcDETECT on individual markers/all
    for marker in name:
        points_marker = points_all[points_all['target'] == marker]
        detect_marker = model(shape = shape, transcripts = points_marker, target_all = name, eps = 1.5, in_thr = 0.5, size_thr = 3.5)
        sphere_marker = detect_marker.dbscan_single(target_name = marker)
        precision, recall, accuracy = metric_main(tree, ground_truth_indices, sphere_marker)

        precision_dict[marker].append(precision)
        recall_dict[marker].append(recall)
        accuracy_dict[marker].append(accuracy)

    detect_all = model(shape = shape, transcripts = points_all, target_all = name, eps = 1.5, in_thr = 0.5, comp_thr = 1, size_thr = 3.5, p = 0.5)
    sphere_all = detect_all.merge_data()
    precision_all, recall_all, accuracy_all = metric_main(tree, ground_truth_indices, sphere_all)
    precision_lst_all.append(precision_all)
    recall_lst_all.append(recall_all)
    accuracy_lst_all.append(accuracy_all)

    # report the iteration number, not the seed value (seeds here are 10, 20, ..., 100,
    # which previously produced messages such as "20 out of 10 iterations!")
    print(f"{iteration} out of {len(seed_lst)} iterations!")

10 out of 10 iterations!
20 out of 10 iterations!
30 out of 10 iterations!
40 out of 10 iterations!
50 out of 10 iterations!
60 out of 10 iterations!
70 out of 10 iterations!
80 out of 10 iterations!
90 out of 10 iterations!
100 out of 10 iterations!


In [18]:
# Mean precision, recall, and accuracy of the merged (all-marker) detection across seeds
tuple(np.mean(values) for values in (precision_lst_all, recall_lst_all, accuracy_lst_all))

(0.9112053511367643, 0.9999, 0.9124686572333627)

In [19]:
# Mean recall across seeds for each individual marker
all_recalls = list(map(np.mean, recall_dict.values()))

In [20]:
# Range of the per-marker mean recalls
recall_low = np.min(all_recalls)
recall_high = np.max(all_recalls)
recall_low, recall_high

(0.47088, 0.6128600000000001)