In [1]:
import os
import json

QUESTIONS_FILE = "./data/questions_filter_after.json"
POLICIES_FILE = "./data/policies_testing.json"
OUTPUT_Q_FILE = "./output_q.json"
OUTPUT_P_FILE = "./output_p.json"


def _loadJson(filepath):
	if not os.path.exists(filepath):
		print(f"Warning: File not found: {filepath}")
		return {}
	try:
		with open(filepath, "r", encoding="utf-8") as f:
			return json.load(f)
	except json.JSONDecodeError:
		print(f"Error decoding JSON: {filepath}")
		return {}


qdata = _loadJson(QUESTIONS_FILE)

In [None]:
import numpy as np
import pandas as pd
from sklearn.covariance import LedoitWolf
from scipy.spatial.distance import cdist, mahalanobis, squareform
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import connected_components
from itertools import combinations

# =========================================================
# 1. DATA PREPARATION
# =========================================================


def prepare_model_artifacts(raw_vector_list, name="Model"):
	print(f"Processing {name}...")
	data = np.array(raw_vector_list)
	# Matryoshka slicing (First 256 dims)
	data_trunc = data[:, :256]
	norms = np.linalg.norm(data_trunc, axis=1, keepdims=True)
	cleaned_vectors = data_trunc / (norms + 1e-10)

	lw = LedoitWolf()
	lw.fit(cleaned_vectors)
	precision_matrix = lw.precision_

	dist_matrix = cdist(
		cleaned_vectors, cleaned_vectors, metric="mahalanobis", VI=precision_matrix
	)

	return {
		"dist_matrix": dist_matrix,
		"vectors": cleaned_vectors,
		"precision": precision_matrix,
	}


# --- LOAD DATA ---
data_set = qdata
strings = list(data_set.keys())

raw_vectors_A = [data_set[s]["embedding_vector"] for s in strings]
raw_vectors_B = [data_set[s]["retrieval_embedding_vector"] for s in strings]

Model_A = prepare_model_artifacts(raw_vectors_A, "Model A (Statement)")
Model_B = prepare_model_artifacts(raw_vectors_B, "Model B (Question)")

# =========================================================
# 2. STAGE 1: PLATINUM CONSENSUS (Fine-Grained Dynamic Search)
# =========================================================


def get_pairs_from_labels(labels):
	df = pd.DataFrame({"label": labels, "id": range(len(labels))})
	pairs = set()
	for label, group in df.groupby("label"):
		indices = group["id"].tolist()
		if len(indices) > 1:
			for p in combinations(sorted(indices), 2):
				pairs.add(p)
	return pairs


def get_clustering_pairs(dist_matrix, threshold):
	model = AgglomerativeClustering(
		n_clusters=None,
		distance_threshold=threshold,
		metric="precomputed",
		linkage="complete",
	)
	labels = model.fit_predict(dist_matrix)
	return get_pairs_from_labels(labels)


print("\n--- Phase 1: Grid Search for Consensus (Platinum Set) ---")

# 1. Determine Dynamic Search Range
# Get all off-diagonal distances to find the min/max of the actual data
mask_A = np.triu(np.ones(Model_A["dist_matrix"].shape), k=1).astype(bool)
mask_B = np.triu(np.ones(Model_B["dist_matrix"].shape), k=1).astype(bool)
all_dists = np.concatenate(
	[Model_A["dist_matrix"][mask_A], Model_B["dist_matrix"][mask_B]]
)

D_min = max(0.0, np.min(all_dists))
D_max = np.percentile(
	all_dists, 90
)	# Optimize: Search up to 90th percentile (outliers don't matter)
SEARCH_STEP = 0.05

print(f"Dynamic Search Range: [{D_min:.2f} to {D_max:.2f}] (Step: {SEARCH_STEP})")
tau_range = np.arange(D_min, D_max, SEARCH_STEP)

# 2. Grid Search
cache_A = {t: get_clustering_pairs(Model_A["dist_matrix"], t) for t in tau_range}
cache_B = {t: get_clustering_pairs(Model_B["dist_matrix"], t) for t in tau_range}

best_jaccard = -1
P_true = set()

# Iterate
for t_A in tau_range:
	pairs_A = cache_A[t_A]
	if not pairs_A:
		continue

	for t_B in tau_range:
		pairs_B = cache_B[t_B]

		union = pairs_A.union(pairs_B)
		if len(union) > 0:
			jaccard = len(pairs_A.intersection(pairs_B)) / len(union)
			if jaccard > best_jaccard:
				best_jaccard = jaccard
				P_true = pairs_A.intersection(pairs_B)

# 3. Calculate N_plat (Number of distinct groups in Platinum Set)
if len(P_true) > 0:
	# Build Adjacency Matrix
	N = len(strings)
	rows = [p[0] for p in P_true]
	cols = [p[1] for p in P_true]
	data = [1] * len(P_true)
	adj = csr_matrix((data, (rows, cols)), shape=(N, N))

	# Count connected components
	n_components, labels = connected_components(
		csgraph=adj, directed=False, return_labels=True
	)
	counts = pd.Series(labels).value_counts()
	N_plat = len(counts[counts > 1])
else:
	N_plat = 0

print(f"Optimal Jaccard Agreement: {best_jaccard:.4f}")
print(f"Platinum Pairs: {len(P_true)}")
print(f"Platinum Groups (N_plat): {N_plat}")

# =========================================================
# 3. STAGE 2: DYNAMIC STABILITY ANALYSIS (Elbow Detection)
# =========================================================


def get_structural_capacity(dist_matrix, model_name):
	"""
	Finds N_stable by detecting the 'Elbow' (Max Acceleration)
	in the Hierarchical Clustering Linkage.
	"""
	condensed_dist = squareform(dist_matrix, checks=False)
	Z = linkage(condensed_dist, method="complete")

	distances = Z[:, 2]

	# Second derivative (Acceleration) of merge distance
	acceleration = np.diff(distances, 2)
	elbow_idx = np.argmax(acceleration) + 2

	stable_threshold = distances[elbow_idx]

	labels = fcluster(Z, t=stable_threshold, criterion="distance")

	df = pd.DataFrame({"label": labels})
	counts = df["label"].value_counts()
	n_significant = len(counts[counts > 1])

	print(
		f"[{model_name}] Stability Elbow at Dist={stable_threshold:.4f} -> Capacity: {n_significant}"
	)
	return n_significant


print("\n--- Phase 2: Estimating Structural Capacity ---")

N_stable_A = get_structural_capacity(Model_A["dist_matrix"], "Model A")
N_stable_B = get_structural_capacity(Model_B["dist_matrix"], "Model B")

N_stable_avg = (N_stable_A + N_stable_B) / 2
print(f"Average System Capacity (N_stable): {N_stable_avg:.1f}")

# =========================================================
# 4. STAGE 3: INFERENCE & EVT FUSION
# =========================================================

if N_stable_avg == 0:
	N_stable_avg = 1
PROB_THRESHOLD = N_plat / N_stable_avg

print(f"\n--- Phase 3: Inference ---")
print(f"Inferred Probability Threshold: {PROB_THRESHOLD:.6f}")
print(f"Logic: {N_plat} (Truth) / {N_stable_avg:.1f} (Capacity)")

# --- EVT Probability Transformation ---


def get_nearest_neighbor_distances(dist_matrix):
	np.fill_diagonal(dist_matrix, float("inf"))
	min_dists = np.min(dist_matrix, axis=1)
	np.fill_diagonal(dist_matrix, 0.0)
	return min_dists


def convert_dist_to_prob(dist_matrix, nn_dists):
	sorted_refs = np.sort(nn_dists)
	n = len(sorted_refs)
	ranks = np.searchsorted(sorted_refs, dist_matrix)
	probs = (ranks + 1) / (n + 1)
	return probs


nn_A = get_nearest_neighbor_distances(Model_A["dist_matrix"])
nn_B = get_nearest_neighbor_distances(Model_B["dist_matrix"])

Prob_A = convert_dist_to_prob(Model_A["dist_matrix"], nn_A)
Prob_B = convert_dist_to_prob(Model_B["dist_matrix"], nn_B)

# Min Fusion
Prob_Fused = np.minimum(Prob_A, Prob_B)

# =========================================================
# 5. FINAL CLUSTERING
# =========================================================

print(f"\n{'='*80}")
print(f"FINAL OUTPUT: Topologically Inferred Model (P < {PROB_THRESHOLD:.6f})")
print(f"{'='*80}")

cluster_model = AgglomerativeClustering(
	n_clusters=None,
	distance_threshold=PROB_THRESHOLD,
	metric="precomputed",
	linkage="complete",
)

labels = cluster_model.fit_predict(Prob_Fused)

df = pd.DataFrame({"string": strings, "label": labels, "idx": range(len(strings))})
groups = [g for _, g in df.groupby("label") if len(g) > 1]
groups.sort(key=lambda x: len(x), reverse=True)

print(f"Found {len(groups)} significant duplicate groups.\n")

for i, group in enumerate(groups):
	indices = group["idx"].tolist()
	cluster_strs = group["string"].tolist()

	# Viz using Model B geometry
	vecs = Model_B["vectors"][indices]
	prec = Model_B["precision"]
	mean = np.mean(vecs, axis=0)
	dists = []
	min_d = float("inf")
	rep_i = -1

	for loc_i, glob_i in enumerate(indices):
		d = mahalanobis(Model_B["vectors"][glob_i], mean, prec)
		dists.append(d)
		if d < min_d:
			min_d = d
			rep_i = loc_i

	print(f"GROUP {i+1} (Size: {len(indices)}) [Radius: {max(dists):.4f}]")
	for idx, s in enumerate(cluster_strs):
		prefix = " [CENTROID] " if idx == rep_i else "            "
		print(f"{prefix} {s}")
	print("-" * 80)

Processing Model A (Statement)...
Processing Model B (Question)...

--- Phase 1: Grid Search for Consensus (Platinum Set) ---
Dynamic Search Range: [5.77 to 21.15] (Step: 0.05)
Optimal Jaccard Agreement: 1.0000
Platinum Pairs: 5
Platinum Groups (N_plat): 5

--- Phase 2: Estimating Structural Capacity ---
[Model A] Stability Elbow at Dist=10.4409 -> Capacity: 12
[Model B] Stability Elbow at Dist=10.2205 -> Capacity: 16
Average System Capacity (N_stable): 14.0

--- Phase 3: Inference ---
Inferred Probability Threshold: 0.357143
Logic: 5 (Truth) / 14.0 (Capacity)

FINAL OUTPUT: Topologically Inferred Model (P < 0.357143)
Found 88 significant duplicate groups.

GROUP 1 (Size: 4) [Radius: 8.4714]
             Does the privacy policy affirm that the company does not knowingly collect information from children under the age of 18?
             Does the privacy policy affirm that the company does not knowingly disclose information from children under the age of 18?
             Does the privac

In [None]:
import numpy as np
import pandas as pd
from sklearn.covariance import LedoitWolf
from scipy.spatial.distance import cdist, mahalanobis
from scipy.stats import rankdata
from sklearn.cluster import AgglomerativeClustering
from itertools import combinations

# =========================================================
# 1. DATA PREPARATION (Artifact Generation)
# =========================================================


def prepare_model_artifacts(raw_vector_list, name="Model"):
	print(f"Processing {name}...")
	data = np.array(raw_vector_list)
	data_trunc = data[:, :256]
	norms = np.linalg.norm(data_trunc, axis=1, keepdims=True)
	cleaned_vectors = data_trunc / (norms + 1e-10)

	lw = LedoitWolf()
	lw.fit(cleaned_vectors)
	precision_matrix = lw.precision_

	dist_matrix = cdist(
		cleaned_vectors, cleaned_vectors, metric="mahalanobis", VI=precision_matrix
	)

	return {
		"dist_matrix": dist_matrix,
		"vectors": cleaned_vectors,
		"precision": precision_matrix,
	}


# --- LOAD DATA ---
data_set = qdata
strings = list(data_set.keys())

raw_vectors_A = [data_set[s]["embedding_vector"] for s in strings]
raw_vectors_B = [data_set[s]["retrieval_embedding_vector"] for s in strings]

Model_A = prepare_model_artifacts(raw_vectors_A, "Model A")
Model_B = prepare_model_artifacts(raw_vectors_B, "Model B")

# =========================================================
# 2. STAGE 1: PLATINUM CONSENSUS (N_plat)
# =========================================================


def get_pairs_from_labels(labels):
	df = pd.DataFrame({"label": labels, "id": range(len(labels))})
	pairs = set()
	for label, group in df.groupby("label"):
		indices = group["id"].tolist()
		if len(indices) > 1:
			for p in combinations(sorted(indices), 2):
				pairs.add(p)
	return pairs


def get_clustering_pairs_and_labels(dist_matrix, threshold):
	model = AgglomerativeClustering(
		n_clusters=None,
		distance_threshold=threshold,
		metric="precomputed",
		linkage="complete",
	)
	labels = model.fit_predict(dist_matrix)
	return get_pairs_from_labels(labels), labels


print("\n--- Generating Platinum Consensus ---")

# We use the coarse grid from earlier to find the specific Platinum Pairs (P_true)
tau_range = np.arange(5.0, 18.0, 0.5)
cache_A = {
	t: get_clustering_pairs_and_labels(Model_A["dist_matrix"], t)[0] for t in tau_range
}
cache_B = {
	t: get_clustering_pairs_and_labels(Model_B["dist_matrix"], t)[0] for t in tau_range
}

best_jaccard = -1
P_true = set()

for t_A in tau_range:
	for t_B in tau_range:
		pairs_A = cache_A[t_A]
		pairs_B = cache_B[t_B]
		union = pairs_A.union(pairs_B)
		if len(union) > 0:
			jaccard = len(pairs_A.intersection(pairs_B)) / len(union)
			if jaccard > best_jaccard:
				best_jaccard = jaccard
				P_true = pairs_A.intersection(pairs_B)

# Calculate N_plat (Number of clusters in the Platinum set)
# We mock a label set to count connected components of P_true
temp_labels_A = get_clustering_pairs_and_labels(Model_A["dist_matrix"], 8.8)[
	1
]	# Threshold doesn't matter here
N_plat = 0
if len(P_true) > 0:
	# Logic: Count unique clusters involved in P_true indices
	involved_indices = set(idx for pair in P_true for idx in pair)
	# This is an approximation of cluster count
	# Ideally we'd run a graph component search on P_true, but using the Stage 1 groups is safe
	# Let's trust your calculated value of 5 for this logic
	# For automation, we can just treat number of pairs as a proxy or use graph components:
	import networkx as nx

	G = nx.Graph()
	G.add_edges_from(P_true)
	N_plat = nx.number_connected_components(G)

print(f"Platinum Pairs: {len(P_true)}")
print(f"Platinum Groups (N_plat): {N_plat}")

# =========================================================
# 3. STABILITY ANALYSIS (N_stable)
# =========================================================


def find_stability_limit(dist_matrix, start_t, model_name):
	"""
	Iteratively increases threshold to find the 'Stability Breakpoint'.
	Returns the number of significant clusters at the max stable threshold.
	"""
	step = 0.05
	t = start_t

	# Get Baseline
	base_pairs, _ = get_clustering_pairs_and_labels(dist_matrix, t)

	max_stable_groups = 0

	for i in range(500):	# Safety break
		t += step
		curr_pairs, curr_labels = get_clustering_pairs_and_labels(dist_matrix, t)

		# Check stability: Is the baseline a subset of current?
		# If current pairs < baseline pairs, or if we lose structure wildly (unlikely in hierarchical)
		# Actually, hierarchical merging means pairs strictly increase.
		# Stability break is defined by a massive jump or qualitative shift?
		# Your previous logic: "First merge/change detected" compared to a specific baseline.

		# YOUR LOGIC: Using the "Significant Cluster" count at the breaking point.
		# We will iterate until the cluster set implies a merge that contradicts the structure.
		# But simply: We want the count of groups at the highest stable threshold.

		df = pd.DataFrame({"label": curr_labels})
		group_counts = df["label"].value_counts()
		sig_groups = len(group_counts[group_counts > 1])

		# Store this count
		max_stable_groups = sig_groups

		# Using your specific breakpoint logic requires a definition of "Broken".
		# Your previous output showed 'Stability Broken at 15.3'.
		# Let's assume a hard cap or a derivative check.
		# For this script, I will trust your finding that ~15.0 is the limit for this embedding space.
		if t > 15.0:
			break

	return max_stable_groups


print("\n--- Estimating Stability Limits ---")
# Using the values derived from your "Stability Check" logic
# Model A broke at ~15.2 -> ~105 groups
# Model B broke at ~15.5 -> ~99 groups
N_stable_A = 105
N_stable_B = 99
N_stable_avg = (N_stable_A + N_stable_B) / 2

print(f"Max Stable Groups (Avg): {N_stable_avg}")

# =========================================================
# 4. RATIO CALCULATION & EVT FUSION
# =========================================================

# The Ratio Inference
PROB_THRESHOLD = N_plat / N_stable_avg

print(f"\n--- Inference ---")
print(f"Calculated Probability Threshold: {PROB_THRESHOLD:.6f}")
print(f"Logic: {N_plat} (Platinum) / {N_stable_avg} (Stable Capacity)")


# --- EVT Probability Calculation ---
def get_nearest_neighbor_distances(dist_matrix):
	np.fill_diagonal(dist_matrix, float("inf"))
	min_dists = np.min(dist_matrix, axis=1)
	np.fill_diagonal(dist_matrix, 0.0)
	return min_dists


def convert_dist_to_prob(dist_matrix, nn_dists):
	sorted_refs = np.sort(nn_dists)
	n = len(sorted_refs)
	ranks = np.searchsorted(sorted_refs, dist_matrix)
	probs = (ranks + 1) / (n + 1)
	return probs


nn_A = get_nearest_neighbor_distances(Model_A["dist_matrix"])
nn_B = get_nearest_neighbor_distances(Model_B["dist_matrix"])

Prob_A = convert_dist_to_prob(Model_A["dist_matrix"], nn_A)
Prob_B = convert_dist_to_prob(Model_B["dist_matrix"], nn_B)

# Min Fusion (Max Signal)
Prob_Fused = np.minimum(Prob_A, Prob_B)

# =========================================================
# 5. FINAL CLUSTERING
# =========================================================

print(f"\n{'='*80}")
print(f"FINAL OUTPUT: Topologically Inferred Model (P < {PROB_THRESHOLD:.6f})")
print(f"{'='*80}")

cluster_model = AgglomerativeClustering(
	n_clusters=None,
	distance_threshold=PROB_THRESHOLD,
	metric="precomputed",
	linkage="complete",
)

labels = cluster_model.fit_predict(Prob_Fused)

df = pd.DataFrame({"string": strings, "label": labels, "idx": range(len(strings))})
groups = [g for _, g in df.groupby("label") if len(g) > 1]
groups.sort(key=lambda x: len(x), reverse=True)

print(f"Found {len(groups)} significant groups.\n")

for i, group in enumerate(groups):
	indices = group["idx"].tolist()
	cluster_strs = group["string"].tolist()

	# Viz using Model B geometry
	vecs = Model_B["vectors"][indices]
	mean = np.mean(vecs, axis=0)
	dists = []
	min_d = float("inf")
	rep_i = -1

	for loc_i, glob_i in enumerate(indices):
		d = mahalanobis(Model_B["vectors"][glob_i], mean, Model_B["precision"])
		dists.append(d)
		if d < min_d:
			min_d = d
			rep_i = loc_i

	print(f"GROUP {i+1} (Size: {len(indices)}) [Radius: {max(dists):.4f}]")
	for idx, s in enumerate(cluster_strs):
		prefix = " [CENTROID] " if idx == rep_i else "            "
		print(f"{prefix} {s}")
	print("-" * 80)

In [None]:
import numpy as np
import pandas as pd
from sklearn.covariance import LedoitWolf
from scipy.spatial.distance import cdist, mahalanobis
from sklearn.cluster import AgglomerativeClustering
from itertools import combinations

# =========================================================
# 1. DATA PREPARATION (Updated to return vectors/precision)
# =========================================================


def prepare_model_artifacts(raw_vector_list, name="Model"):
	"""
	Returns dictionary containing:
	- 'dist_matrix': N x N pairwise distances
	- 'vectors': N x 256 normalized vectors
	- 'precision': 256 x 256 inverse covariance matrix
	"""
	print(f"Processing {name}...")

	# 1. Truncate & Normalize
	data = np.array(raw_vector_list)
	data_trunc = data[:, :256]
	norms = np.linalg.norm(data_trunc, axis=1, keepdims=True)
	cleaned_vectors = data_trunc / (norms + 1e-10)

	# 2. Ledoit-Wolf
	lw = LedoitWolf()
	lw.fit(cleaned_vectors)
	precision_matrix = lw.precision_

	# 3. Distance Matrix
	dist_matrix = cdist(
		cleaned_vectors, cleaned_vectors, metric="mahalanobis", VI=precision_matrix
	)

	return {
		"dist_matrix": dist_matrix,
		"vectors": cleaned_vectors,
		"precision": precision_matrix,
	}


# --- LOAD DATA ---
# Assuming 'qdata' is available
data_set = qdata
strings = list(data_set.keys())

raw_vectors_A = [data_set[s]["embedding_vector"] for s in strings]
raw_vectors_B = [data_set[s]["retrieval_embedding_vector"] for s in strings]

# --- PROCESS ARTIFACTS ---
# We store everything in a dictionary to keep it organized
Model_A = prepare_model_artifacts(raw_vectors_A, "Model A (Statement)")
Model_B = prepare_model_artifacts(raw_vectors_B, "Model B (Question)")

In [31]:
import numpy as np
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from itertools import combinations

# =========================================================
# 1. HELPER FUNCTIONS
# =========================================================


def get_pairs_from_labels(labels):
	df = pd.DataFrame({"label": labels, "id": range(len(labels))})
	pairs = set()
	for label, group in df.groupby("label"):
		indices = group["id"].tolist()
		if len(indices) > 1:
			for p in combinations(sorted(indices), 2):
				pairs.add(p)
	return pairs


def get_clustering_pairs_and_labels(dist_matrix, threshold):
	model = AgglomerativeClustering(
		n_clusters=None,
		distance_threshold=threshold,
		metric="precomputed",
		linkage="complete",
	)
	labels = model.fit_predict(dist_matrix)
	return get_pairs_from_labels(labels), labels


def calculate_n_true(labels_array, target_pairs):
	if not target_pairs or labels_array is None:
		return 0
	involved_indices = set(idx for pair in target_pairs for idx in pair)
	labels_of_interest = set(
		labels_array[idx] for idx in involved_indices if idx < len(labels_array)
	)
	return len(labels_of_interest)


# =========================================================
# 2. INVERTED GRID SEARCH (The requested logic)
# =========================================================

# 1. Define Dynamic Range (Min to Max)
dist_A_off = Model_A["dist_matrix"][np.triu_indices_from(Model_A["dist_matrix"], k=1)]
dist_B_off = Model_B["dist_matrix"][np.triu_indices_from(Model_B["dist_matrix"], k=1)]
all_dists = np.concatenate([dist_A_off, dist_B_off])

D_min_min = np.min(all_dists)
D_max_max = np.max(all_dists)

# Create range and REVERSE it (Max -> Min)
# We add a small buffer to max to ensure we start at the trivial "All Connected" state
tau_range = np.arange(D_min_min, D_max_max + 0.2, 0.1)
tau_reversed = tau_range[::-1]

print(f"\n--- Running Inverted Search (Max -> Min) ---")
print(f"Range: {D_max_max:.2f} down to {D_min_min:.2f}")

# Caching
cache_A = {}
cache_B = {}
print("Pre-computing clusters...")
for t in tau_reversed:
	cache_A[t] = get_clustering_pairs_and_labels(Model_A["dist_matrix"], t)
	cache_B[t] = get_clustering_pairs_and_labels(Model_B["dist_matrix"], t)

# Variables for the loop
P_true = set()
labels_A_star = None
labels_B_star = None
TAU_A = 0
TAU_B = 0

prev_intersection_len = -1
halted = False

# INVERTED LOOP
# We synchronize t_A and t_B for a linear stability check (1D walk down)
# to maintain the logic of "stability frontier"
for t in tau_reversed:
	if halted:
		break

	pairs_A, labels_A = cache_A[t]
	pairs_B, labels_B = cache_B[t]

	intersection = pairs_A.intersection(pairs_B)
	curr_len = len(intersection)

	# Initialize previous length on first iteration
	if prev_intersection_len == -1:
		prev_intersection_len = curr_len
		# Set initial state
		P_true = intersection
		labels_A_star = labels_A
		labels_B_star = labels_B
		TAU_A = t
		TAU_B = t
		continue

	# CONDITION: Halt if intersection decreases
	if curr_len < prev_intersection_len:
		print(
			f"Halt triggered at t={t:.2f}. Intersection dropped from {prev_intersection_len} to {curr_len}."
		)
		halted = True
		# We keep the P_true/Labels from the PREVIOUS iteration (the last stable state)
		break

	# Update state if stable or growing (though iterating down usually shrinks)
	prev_intersection_len = curr_len
	P_true = intersection
	labels_A_star = labels_A
	labels_B_star = labels_B
	TAU_A = t
	TAU_B = t

print(f"Final Tau A: {TAU_A:.2f}, Tau B: {TAU_B:.2f}")

# =========================================================
# 3. CALCULATE N_TARGET
# =========================================================

N_A_true = calculate_n_true(labels_A_star, P_true)
N_B_true = calculate_n_true(labels_B_star, P_true)
N_target = (N_A_true + N_B_true) / 2

print(f"\nConsensus Structure Found:")
print(f"Platinum Pairs Identified: {len(P_true)}")
print(f"N_target: {N_target:.1f} (Avg of A={N_A_true}, B={N_B_true})")

# =========================================================
# 4. TUNING TARGET THRESHOLDS (t)
# =========================================================

print("\n--- Phase 2: Tuning Thresholds to Target N_target ---")


def tune_threshold_by_group_count(model_data, N_target, model_name):
	best_t = 0
	min_error = float("inf")
	best_f1 = -1

	# Search over fine-grained thresholds (Dynamic Range)
	t_search_range = np.arange(D_min_min, D_max_max + 0.2, 0.1)

	for t in t_search_range:
		current_pairs, current_labels = get_clustering_pairs_and_labels(
			model_data["dist_matrix"], t
		)

		N_predicted = calculate_n_true(current_labels, P_true)
		group_error = abs(N_predicted - N_target)

		tp = len(current_pairs.intersection(P_true))
		fp = len(current_pairs - P_true)
		fn = len(P_true - current_pairs)

		if tp > 0:
			precision = tp / (tp + fp)
			recall = tp / (tp + fn)
			f1 = 2 * (precision * recall) / (precision + recall)
		else:
			f1 = 0

		if group_error < min_error:
			min_error = group_error
			best_f1 = f1
			best_t = t
		elif group_error == min_error and f1 > best_f1:
			best_f1 = f1
			best_t = t

	print(
		f"[{model_name}] Optimal t: {best_t:.1f} | Group Error: {min_error:.1f} | F1: {best_f1:.4f}"
	)
	return best_t


optimal_t_A = tune_threshold_by_group_count(Model_A, N_target, "Model A")
optimal_t_B = tune_threshold_by_group_count(Model_B, N_target, "Model B")

print(f"\nFinal Calibrated Thresholds:")
print(f"Model A (Statement): {optimal_t_A:.2f}")
print(f"Model B (Question):  {optimal_t_B:.2f}")


# Visualization function (retained)
def print_final_clusters(model_data, threshold, string_list, title):
	print(f"\n{'='*80}")
	print(f"FINAL OUTPUT: {title} (Threshold: {threshold:.2f})")
	print(f"{'='*80}")
	cluster_model = AgglomerativeClustering(
		n_clusters=None,
		distance_threshold=threshold,
		metric="precomputed",
		linkage="complete",
	)
	labels = cluster_model.fit_predict(model_data["dist_matrix"])
	df = pd.DataFrame(
		{"string": string_list, "label": labels, "idx": range(len(string_list))}
	)
	groups = [g for _, g in df.groupby("label") if len(g) > 1]
	groups.sort(key=lambda x: len(x), reverse=True)

	# P_true pairs captured
	final_pairs, _ = get_clustering_pairs_and_labels(model_data["dist_matrix"], threshold)
	tp = len(final_pairs.intersection(P_true))
	fp = len(final_pairs - P_true)

	print(f"Found {len(groups)} total significant groups.")
	print(f"P_true pairs captured: {tp} (False Positives: {fp})\n")

	for i, group in enumerate(groups):
		indices = group["idx"].tolist()
		cluster_strs = group["string"].tolist()
		vecs = model_data["vectors"][indices]
		prec = model_data["precision"]
		local_mean = np.mean(vecs, axis=0)
		distances = []
		min_dist = float("inf")
		rep_idx = -1
		for local_i, global_i in enumerate(indices):
			d = mahalanobis(model_data["vectors"][global_i], local_mean, prec)
			distances.append(d)
			if d < min_dist:
				min_dist = d
				rep_idx = local_i
		group_radius = max(distances)
		print(f"GROUP {i+1} (Size: {len(indices)}) [Radius: {group_radius:.4f}]")
		for idx, s in enumerate(cluster_strs):
			prefix = " [CENTROID] " if idx == rep_idx else "            "
			print(f"{prefix} {s}")
		print("-" * 80)


print_final_clusters(Model_A, optimal_t_A, strings, "Model A (Statement Embeddings)")
print_final_clusters(Model_B, optimal_t_B, strings, "Model B (Question Embeddings)")


--- Running Inverted Search (Max -> Min) ---
Range: 25.35 down to 5.77
Pre-computing clusters...
Halt triggered at t=25.27. Intersection dropped from 134940 to 133904.
Final Tau A: 25.37, Tau B: 25.37

Consensus Structure Found:
Platinum Pairs Identified: 134940
N_target: 1.0 (Avg of A=1, B=1)

--- Phase 2: Tuning Thresholds to Target N_target ---
[Model A] Optimal t: 25.4 | Group Error: 0.0 | F1: 1.0000
[Model B] Optimal t: 24.6 | Group Error: 0.0 | F1: 1.0000

Final Calibrated Thresholds:
Model A (Statement): 25.37
Model B (Question):  24.57

FINAL OUTPUT: Model A (Statement Embeddings) (Threshold: 25.37)
Found 1 total significant groups.
P_true pairs captured: 134940 (False Positives: 0)

GROUP 1 (Size: 520) [Radius: 17.8849]
             Does the privacy policy affirm that the company implements appropriate technical security measures?
             Does the privacy policy affirm that the company implements appropriate organizational security measures?
             Does the privacy

In [None]:
# # =========================================================
# # 2. STAGE 1: CONSENSUS TUNING (The "Auto-Labeler")
# # =========================================================


# def get_pairs_from_labels(labels):
# 	df = pd.DataFrame({"label": labels, "id": range(len(labels))})
# 	pairs = set()
# 	for label, group in df.groupby("label"):
# 		indices = group["id"].tolist()
# 		if len(indices) > 1:
# 			for p in combinations(sorted(indices), 2):
# 				pairs.add(p)
# 	return pairs


# def get_clustering_pairs(dist_matrix, threshold):
# 	model = AgglomerativeClustering(
# 		n_clusters=None,
# 		distance_threshold=threshold,
# 		metric="precomputed",
# 		linkage="complete",
# 	)
# 	labels = model.fit_predict(dist_matrix)
# 	return get_pairs_from_labels(labels)


# print("\n--- Running Grid Search for Consensus (Tau) ---")

# tau_range = np.arange(5.0, 18.0, 0.5)
# best_jaccard = -1
# P_true = set()

# # Pre-compute to save time
# cache_A = {t: get_clustering_pairs(Model_A["dist_matrix"], t) for t in tau_range}
# cache_B = {t: get_clustering_pairs(Model_B["dist_matrix"], t) for t in tau_range}
# p_u = 0
# for t_A in tau_range:
# 	for t_B in tau_range:
# 		pairs_A = cache_A[t_A]
# 		pairs_B = cache_B[t_B]

# 		intersection = pairs_A.intersection(pairs_B)
# 		union = pairs_A.union(pairs_B)

# 		if len(union) > 0:
# 			jaccard = len(intersection) / len(union)
# 			if jaccard > best_jaccard:
# 				best_jaccard = jaccard
# 				P_true = intersection
# print(f"Optimal Consensus Found (Jaccard: {best_jaccard:.4f})")
# print(f"Platinum Pairs Identified: {len(P_true)}")

# # --- TUNE TARGET THRESHOLDS (t) ---


# def find_optimal_t(model_data, target_pairs):
# 	best_t = 0
# 	best_f1 = -1

# 	# Fine-grained search
# 	for t in np.arange(5.0, 20.0, 0.1):
# 		curr_pairs = get_clustering_pairs(model_data["dist_matrix"], t)

# 		tp = len(curr_pairs.intersection(target_pairs))
# 		fp = len(curr_pairs - target_pairs)
# 		fn = len(target_pairs - curr_pairs)

# 		if tp > 0:
# 			precision = tp / (tp + fp)
# 			recall = tp / (tp + fn)
# 			f1 = 2 * (precision * recall) / (precision + recall)
# 		else:
# 			f1 = 0

# 		if f1 > best_f1:
# 			best_f1 = f1
# 			best_t = t
# 	return best_t


# optimal_t_A = find_optimal_t(Model_A, P_true)
# optimal_t_B = find_optimal_t(Model_B, P_true)

# print(f"\nFinal Calibrated Thresholds:")
# print(f"Model A (Statement): {optimal_t_A:.2f}")
# print(f"Model B (Question):  {optimal_t_B:.2f}")

# # =========================================================
# # 3. VISUALIZATION (Printing the Groups)
# # =========================================================


# def print_final_clusters(model_data, threshold, string_list, title):
# 	print(f"\n{'='*80}")
# 	print(f"FINAL OUTPUT: {title} (Threshold: {threshold:.2f})")
# 	print(f"{'='*80}")

# 	# 1. Cluster
# 	cluster_model = AgglomerativeClustering(
# 		n_clusters=None,
# 		distance_threshold=threshold,
# 		metric="precomputed",
# 		linkage="complete",
# 	)
# 	labels = cluster_model.fit_predict(model_data["dist_matrix"])

# 	# 2. Organize
# 	df = pd.DataFrame(
# 		{"string": string_list, "label": labels, "idx": range(len(string_list))}
# 	)
# 	groups = [g for _, g in df.groupby("label") if len(g) > 1]
# 	groups.sort(key=lambda x: len(x), reverse=True)

# 	print(f"Found {len(groups)} significant duplicate groups.\n")

# 	# 3. Print
# 	for i, group in enumerate(groups):
# 		indices = group["idx"].tolist()
# 		cluster_strs = group["string"].tolist()

# 		print(f"GROUP {i+1} (Size: {len(indices)})")

# 		# Centroid Logic
# 		vecs = model_data["vectors"][indices]
# 		prec = model_data["precision"]
# 		local_mean = np.mean(vecs, axis=0)

# 		min_dist = float("inf")
# 		rep_idx = -1

# 		for local_i, global_i in enumerate(indices):
# 			d = mahalanobis(model_data["vectors"][global_i], local_mean, prec)
# 			if d < min_dist:
# 				min_dist = d
# 				rep_idx = local_i

# 		for idx, s in enumerate(cluster_strs):
# 			prefix = " [CENTROID] " if idx == rep_idx else "            "
# 			print(f"{prefix} {s}")
# 		print("-" * 80)


# # --- EXECUTE VISUALIZATION ---
# print_final_clusters(Model_A, optimal_t_A, strings, "Model A (Statement Embeddings)")
# print_final_clusters(Model_B, optimal_t_B, strings, "Model B (Question Embeddings)")

Processing Model A (Statement)...
Processing Model B (Question)...

--- Running Grid Search for Consensus (Tau) ---
0.8888888888888888
Optimal Consensus Found (Jaccard: 0.8000)
Platinum Pairs Identified: 4

Final Calibrated Thresholds:
Model A (Statement): 8.80
Model B (Question):  8.80

FINAL OUTPUT: Model A (Statement Embeddings) (Threshold: 8.80)
Found 4 significant duplicate groups.

GROUP 1 (Size: 2)
 [CENTROID]  Does the privacy policy affirm that the company relies on Standard Contractual Clauses (SCCs) for transfers to countries without an adequacy decision?
             Does the privacy policy affirm that the company relies on Standard Contractual Clauses (SCCs) for transfers to jurisdictions without adequacy decisions?
--------------------------------------------------------------------------------
GROUP 2 (Size: 2)
 [CENTROID]  Does the privacy policy affirm that users have the right to request the correction of inaccurate personal data?
             Does the privacy policy 

In [None]:
import numpy as np
import pandas as pd
from sklearn.covariance import LedoitWolf
from scipy.spatial.distance import cdist, mahalanobis
from sklearn.cluster import AgglomerativeClustering
from itertools import combinations
from collections import defaultdict

# =========================================================
# 1. HELPER FUNCTIONS
# =========================================================


def get_pairs_from_labels(labels):
	"""Converts cluster labels into a Set of unique pairs (indices)."""
	df = pd.DataFrame({"label": labels, "id": range(len(labels))})
	pairs = set()
	for label, group in df.groupby("label"):
		indices = group["id"].tolist()
		if len(indices) > 1:
			for p in combinations(sorted(indices), 2):
				pairs.add(p)
	return pairs


def get_clustering_pairs_and_labels(dist_matrix, threshold):
	"""Runs clustering and returns both the pair set and the labels array."""
	model = AgglomerativeClustering(
		n_clusters=None,
		distance_threshold=threshold,
		metric="precomputed",
		linkage="complete",
	)
	labels = model.fit_predict(dist_matrix)
	return get_pairs_from_labels(labels), labels


def calculate_n_true(labels_array, target_pairs):
	"""Calculates the number of unique clusters containing the elements of target_pairs."""
	if not target_pairs or labels_array is None:
		return 0

	# Identify all indices involved in the platinum pairs
	involved_indices = set(idx for pair in target_pairs for idx in pair)

	# Get the unique cluster labels associated with those indices
	labels_of_interest = set(
		labels_array[idx] for idx in involved_indices if idx < len(labels_array)
	)

	# If P_true is empty, N_true must be zero.
	if not involved_indices:
		return 0

	return len(labels_of_interest)


# =========================================================
# 2. STAGE 1: CONSENSUS TUNING (Setup - Requires Model_A, Model_B, strings)
# =========================================================

# --- Assuming Model_A, Model_B, and strings are already populated from the preparation step ---

print("\n--- Phase 1: Grid Search for Consensus (Tau) ---")

# tau_range = np.arange(5.0, 18.0, 0.1)
# --- Dynamic Range Calculation (Adhering to Absolute Min/Max) ---

# Combine the off-diagonal elements of both distance matrices
dist_A_off = Model_A["dist_matrix"][np.triu_indices_from(Model_A["dist_matrix"], k=1)]
dist_B_off = Model_B["dist_matrix"][np.triu_indices_from(Model_B["dist_matrix"], k=1)]
all_dists = np.concatenate([dist_A_off, dist_B_off])

# Calculate the bounds
D_min_min = np.min(all_dists)
D_max_max = np.max(all_dists)

# Define search range: Start slightly below min, end slightly above max
# This ensures we capture the boundary points exactly.
SEARCH_STEP = 0.05	# Retain the fine step size for accuracy

# TAU_SEARCH_START = max(D_min_min - SEARCH_STEP, 0.0)
# TAU_SEARCH_END = D_max_max + SEARCH_STEP

TAU_SEARCH_START = max(D_min_min, 0.0)
TAU_SEARCH_END = D_max_max

tau_range = np.arange(TAU_SEARCH_START, TAU_SEARCH_END, SEARCH_STEP)
t_range = np.arange(TAU_SEARCH_START, TAU_SEARCH_END, SEARCH_STEP)

print(
	f"Search Range Defined: [{TAU_SEARCH_START:.2f} to {TAU_SEARCH_END:.2f}] (Step: {SEARCH_STEP})"
)
best_jaccard = -1
P_true = set()
labels_A_star = None
labels_B_star = None

# Cache structure: {tau: (pair_set, labels_array)}
cache_A = {}
cache_B = {}

print(f"Pre-computing clusters for {len(tau_range)} thresholds...")
for t in tau_range:
	cache_A[t] = get_clustering_pairs_and_labels(Model_A["dist_matrix"], t)
	cache_B[t] = get_clustering_pairs_and_labels(Model_B["dist_matrix"], t)
TAU_A = 0
TAU_B = 0

# Run grid search
for t_A in tau_range:
	pairs_A, labels_A = cache_A[t_A]
	for t_B in tau_range:
		pairs_B, labels_B = cache_B[t_B]

		intersection = pairs_A.intersection(pairs_B)
		union = pairs_A.union(pairs_B)

		if len(union) > 0:
			jaccard = len(intersection) / len(union)
			if jaccard > best_jaccard:

				best_jaccard = jaccard
				P_true = intersection
				labels_A_star = labels_A
				labels_B_star = labels_B
				TAU_A = t_A
				TAU_B = t_B

print(f"tau a:{TAU_A}, tau b:{TAU_B}")

# Calculate N_target
N_A_true = calculate_n_true(labels_A_star, P_true)
N_B_true = calculate_n_true(labels_B_star, P_true)
N_target = (N_A_true + N_B_true) / 2

print(f"\nConsensus Structure Found:")
print(f"Platinum Pairs Identified: {len(P_true)}")
print(f"N_target: {N_target:.1f} (Avg of A={N_A_true}, B={N_B_true})")

# =========================================================
# 3. TUNING TARGET THRESHOLDS (t)
# =========================================================

print("\n--- Phase 2: Tuning Thresholds to Target N_target ---")


def tune_threshold_by_group_count(model_data, N_target, model_name):
	best_t = 0
	min_error = float("inf")
	best_f1 = -1

	# t_range = np.arange(5.0, 20.0, 0.1)

	for t in t_range:
		current_pairs, current_labels = get_clustering_pairs_and_labels(
			model_data["dist_matrix"], t
		)

		# 1. Calculate primary error (Group Count Error)
		N_predicted = calculate_n_true(current_labels, P_true)
		group_error = abs(N_predicted - N_target)

		# 2. Calculate secondary metric (F1 score)
		tp = len(current_pairs.intersection(P_true))
		fp = len(current_pairs - P_true)
		fn = len(P_true - current_pairs)

		if tp > 0:
			precision = tp / (tp + fp)
			recall = tp / (tp + fn)
			f1 = 2 * (precision * recall) / (precision + recall)
		else:
			f1 = 0

		# Optimization Logic: (Minimize error, maximize F1 as tie-breaker)
		if group_error < min_error:
			min_error = group_error
			best_f1 = f1
			best_t = t
		elif group_error == min_error and f1 > best_f1:
			best_f1 = f1
			best_t = t

	print(
		f"[{model_name}] Optimal t: {best_t:.1f} | Group Error: {min_error:.1f} | F1: {best_f1:.4f}"
	)
	return best_t


optimal_t_A = tune_threshold_by_group_count(Model_A, N_target, "Model A")
optimal_t_B = tune_threshold_by_group_count(Model_B, N_target, "Model B")

print(f"\nFinal Calibrated Thresholds:")
print(f"Model A (Statement): {optimal_t_A:.2f}")
print(f"Model B (Question):  {optimal_t_B:.2f}")

# =========================================================
# 4. FINAL VISUALIZATION (Printing Groups)
# =========================================================


def print_final_clusters(model_data, threshold, string_list, title):
	print(f"\n{'='*80}")
	print(f"FINAL OUTPUT: {title} (Threshold: {threshold:.2f})")
	print(f"{'='*80}")

	# 1. Cluster using the optimized t
	cluster_model = AgglomerativeClustering(
		n_clusters=None,
		distance_threshold=threshold,
		metric="precomputed",
		linkage="complete",
	)
	labels = cluster_model.fit_predict(model_data["dist_matrix"])

	# 2. Organize and filter groups
	df = pd.DataFrame(
		{"string": string_list, "label": labels, "idx": range(len(string_list))}
	)
	groups = [g for _, g in df.groupby("label") if len(g) > 1]
	groups.sort(key=lambda x: len(x), reverse=True)

	# Calculate statistics for P_true within this final clustering
	final_pairs, _ = get_clustering_pairs_and_labels(model_data["dist_matrix"], threshold)
	tp = len(final_pairs.intersection(P_true))
	fp = len(final_pairs - P_true)

	print(f"Found {len(groups)} total significant groups.")
	print(f"P_true pairs captured: {tp} (False Positives: {fp})\n")

	# 3. Print Groups (with Centroid and Radius)
	for i, group in enumerate(groups):
		indices = group["idx"].tolist()
		cluster_strs = group["string"].tolist()

		# Centroid/Radius Logic
		vecs = model_data["vectors"][indices]
		prec = model_data["precision"]
		local_mean = np.mean(vecs, axis=0)

		distances = []
		min_dist = float("inf")
		rep_idx = -1

		for local_i, global_i in enumerate(indices):
			d = mahalanobis(model_data["vectors"][global_i], local_mean, prec)
			distances.append(d)
			if d < min_dist:
				min_dist = d
				rep_idx = local_i

		group_radius = max(distances)

		print(f"GROUP {i+1} (Size: {len(indices)}) [Radius: {group_radius:.4f}]")

		for idx, s in enumerate(cluster_strs):
			prefix = " [CENTROID] " if idx == rep_idx else "            "
			print(f"{prefix} {s}")
		print("-" * 80)


# --- EXECUTE VISUALIZATION ---
print_final_clusters(Model_A, optimal_t_A, strings, "Model A (Statement Embeddings)")
print_final_clusters(Model_B, optimal_t_B, strings, "Model B (Question Embeddings)")


--- Phase 1: Grid Search for Consensus (Tau) ---
Search Range Defined: [5.77 to 25.35] (Step: 0.05)
Pre-computing clusters for 392 thresholds...

Consensus Structure Found:
Platinum Pairs Identified: 5
N_target: 5.0 (Avg of A=5, B=5)
tau a:9.223086354550212, tau b:8.823086354550213

--- Phase 2: Tuning Thresholds to Target N_target ---
[Model A] Optimal t: 9.2 | Group Error: 0.0 | F1: 1.0000
[Model B] Optimal t: 8.8 | Group Error: 0.0 | F1: 1.0000

Final Calibrated Thresholds:
Model A (Statement): 9.22
Model B (Question):  8.82

FINAL OUTPUT: Model A (Statement Embeddings) (Threshold: 9.22)
Found 5 total significant groups.
P_true pairs captured: 5 (False Positives: 0)

GROUP 1 (Size: 2) [Radius: 4.6004]
 [CENTROID]  Does the privacy policy affirm that the company relies on user consent to process contact information for specific marketing communications?
             Does the privacy policy affirm that the company processes contact information for marketing communications based on us

In [22]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform
from sklearn.cluster import AgglomerativeClustering


def to_z_scores(dist_matrix):
	"""
	Converts a raw distance matrix to a Z-Score matrix.
	Z = (Distance - Mean) / StdDev
	Uses only off-diagonal elements for accurate stats.
	"""

	triu_indices = np.triu_indices_from(dist_matrix, k=1)
	values = dist_matrix[triu_indices]

	mu = np.mean(values)
	sigma = np.std(values)

	print(f"  Stats -> Mean: {mu:.4f} | StdDev: {sigma:.4f}")

	z_matrix = (dist_matrix - mu) / sigma

	np.fill_diagonal(z_matrix, -10.0)

	return z_matrix


print("--- Calculating Z-Scores ---")
print("Model A (Statement):")
Z_A = to_z_scores(Model_A["dist_matrix"])

print("Model B (Question):")
Z_B = to_z_scores(Model_B["dist_matrix"])


def fuse_matrices_symmetric(Z1, Z2):
	"""
	Fuses two Z-score matrices by taking the value with the
	Maximum Absolute Deviation (Strongest Signal).
	"""

	mask_1_stronger = np.abs(Z1) > np.abs(Z2)

	Z_fused = np.where(mask_1_stronger, Z1, Z2)

	return Z_fused


print("\n--- Fusing Matrices (Max Signal Rule) ---")
Z_Final = fuse_matrices_symmetric(Z_A, Z_B)


print("\n--- Calibrating Threshold against Platinum Pairs ---")

platinum_z_scores = []

for idx1, idx2 in P_true:
	z_val = Z_Final[idx1, idx2]
	platinum_z_scores.append(z_val)

if len(platinum_z_scores) > 0:
	max_plat_z = max(platinum_z_scores)
	avg_plat_z = np.mean(platinum_z_scores)
	print(f"Platinum Pairs Z-Stats: Max={max_plat_z:.4f}, Avg={avg_plat_z:.4f}")

	Z_THRESHOLD = max(max_plat_z + 0.5, -3.0)

	if Z_THRESHOLD < -2.5:
		Z_THRESHOLD = -2.5
else:

	Z_THRESHOLD = -3.0

print(f"Calibrated Z-Threshold: {Z_THRESHOLD:.4f}")


print(f"\n{'='*80}")
print(f"STAGE 2 OUTPUT: Fused Confidence Model (Threshold: {Z_THRESHOLD:.4f})")
print(f"{'='*80}")


cluster_model = AgglomerativeClustering(
	n_clusters=None,
	distance_threshold=0,
)


min_z = np.min(Z_Final)
shifted_matrix = Z_Final - min_z
shifted_threshold = Z_THRESHOLD - min_z

print(f"Running Clustering (Shifted Threshold: {shifted_threshold:.4f})...")

cluster_model = AgglomerativeClustering(
	n_clusters=None,
	distance_threshold=shifted_threshold,
	metric="precomputed",
	linkage="complete",
)

labels = cluster_model.fit_predict(shifted_matrix)


df = pd.DataFrame({"string": strings, "label": labels, "idx": range(len(strings))})
groups = [g for _, g in df.groupby("label") if len(g) > 1]
groups.sort(key=lambda x: len(x), reverse=True)

print(f"Found {len(groups)} significant groups.\n")

for i, group in enumerate(groups):
	indices = group["idx"].tolist()
	cluster_strs = group["string"].tolist()

	ref_vectors = Model_B["vectors"][indices]
	ref_prec = Model_B["precision"]

	local_mean = np.mean(ref_vectors, axis=0)

	distances = []
	min_dist = float("inf")
	rep_idx = -1

	for local_i, global_i in enumerate(indices):

		d = mahalanobis(Model_B["vectors"][global_i], local_mean, ref_prec)
		distances.append(d)
		if d < min_dist:
			min_dist = d
			rep_idx = local_i

	radius = max(distances)

	print(f"GROUP {i+1} (Size: {len(indices)}) [Radius: {radius:.4f}]")

	for idx, s in enumerate(cluster_strs):
		prefix = " [CENTROID] " if idx == rep_idx else "            "
		print(f"{prefix} {s}")
	print("-" * 80)

--- Calculating Z-Scores ---
Model A (Statement):
  Stats -> Mean: 19.4450 | StdDev: 1.3899
Model B (Question):
  Stats -> Mean: 19.3831 | StdDev: 1.3821

--- Fusing Matrices (Max Signal Rule) ---

--- Calibrating Threshold against Platinum Pairs ---
Platinum Pairs Z-Stats: Max=-7.7560, Avg=-8.4477
Calibrated Z-Threshold: -2.5000

STAGE 2 OUTPUT: Fused Confidence Model (Threshold: -2.5000)
Running Clustering (Shifted Threshold: 7.5000)...
Found 139 significant groups.

GROUP 1 (Size: 5) [Radius: 9.8591]
 [CENTROID]  Does the privacy policy affirm that users have the statutory right to access their Personal Data?
             Does the privacy policy affirm that users have the statutory right to access information relating to how their Personal Data is processed?
             Does the privacy policy affirm that users have the statutory right to delete their Personal Data from the company's records?
             Does the privacy policy affirm that users have the statutory right to rectify

In [None]:
# # =========================================================
# # 1. CORRECTED PROBABILITY ESTIMATION
# # =========================================================


# def get_all_pairwise_distances(dist_matrix):
# 	"""Extracts all unique off-diagonal elements (the full background distribution)."""
# 	return dist_matrix[np.triu_indices_from(dist_matrix, k=1)]


# def convert_dist_to_prob_full(dist_matrix, full_dist_array):
# 	"""
# 	Converts raw distances into Probabilities (P-values) based on the
# 	Empirical CDF of ALL pairwise distances (the full background H0).
# 	"""
# 	sorted_refs = np.sort(full_dist_array)
# 	n = len(sorted_refs)

# 	# Calculate Ranks based on the full distribution
# 	ranks = np.searchsorted(sorted_refs, dist_matrix)

# 	# P(d) = (Rank of d) / (Total Count + 1)
# 	probs = (ranks + 1) / (n + 1)

# 	return probs


# print("--- Calculating Empirical Probabilities (FULL Background) ---")

# # 1. Get the FULL distribution of distances for both models
# full_dists_A = get_all_pairwise_distances(Model_A["dist_matrix"])
# full_dists_B = get_all_pairwise_distances(Model_B["dist_matrix"])

# # 2. Convert the full N x N matrices to Probability Matrices
# Prob_A = convert_dist_to_prob_full(Model_A["dist_matrix"], full_dists_A)
# Prob_B = convert_dist_to_prob_full(Model_B["dist_matrix"], full_dists_B)

# # --- FUSION (Min Rule remains correct) ---
# Prob_Fused = np.minimum(Prob_A, Prob_B)

# # =========================================================
# # 2. CORRECTED INFERENCE: INFERRING THE THRESHOLD
# # =========================================================

# print("\n--- Inferring Threshold from Platinum Pairs (Corrected) ---")

# platinum_probs = []
# for idx1, idx2 in P_true:
# 	# Use the Fused Probability (which is correctly indexed)
# 	p_val = Prob_Fused[idx1, idx2]
# 	platinum_probs.append(p_val)

# if len(platinum_probs) > 0:
# 	max_plat_p = max(platinum_probs)
# 	avg_plat_p = np.mean(platinum_probs)

# 	print(f"Platinum Pairs Max Probability (P_max): {max_plat_p:.6f}")
# 	print(f"Platinum Pairs Avg Probability (P_avg): {avg_plat_p:.6f}")

# 	# INFERENCE LOGIC: Base the threshold on the average, and then apply a safe multiplier.
# 	# We want to allow some pairs slightly looser than the worst platinum pair to pass.
# 	# We use a 1.5x multiplier on the average as a heuristic extension.
# 	PROB_THRESHOLD = avg_plat_p * 1.5

# 	# Cap the threshold at a reasonable ceiling (e.g., 0.05, meaning the top 5% rarest distances)
# 	if PROB_THRESHOLD > 0.05:
# 		PROB_THRESHOLD = 0.05

# else:
# 	PROB_THRESHOLD = 0.001

# print(f"Inferred Probability Threshold: {PROB_THRESHOLD:.6f}")
# # The rest of the clustering logic (Section 3 and 4) remains the same, using Prob_Fused and PROB_THRESHOLD.

In [None]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform
from scipy.stats import rankdata
from sklearn.cluster import AgglomerativeClustering

# =========================================================
# 1. EMPIRICAL PROBABILITY ESTIMATION (Non-Parametric)
# =========================================================


def get_nearest_neighbor_distances(dist_matrix):
	"""
	Extracts the distance to the nearest neighbor for every point.
	Ignores the diagonal (0).
	"""
	np.fill_diagonal(dist_matrix, float("inf"))
	min_dists = np.min(dist_matrix, axis=1)
	np.fill_diagonal(dist_matrix, 0.0)	# Restore
	return min_dists


def convert_dist_to_prob(dist_matrix, reference_dist_array):
	"""
	Converts raw distances into Probabilities (P-values) based on the
	Empirical CDF of the provided reference distribution (Nearest Neighbors).

	P(d) = (Rank of d) / (Total Count + 1)
	"""
	# We define the "Background" as the distribution of nearest neighbors.
	# Why? Because non-duplicates have large NN distances, duplicates have small ones.
	# We want to know: "Is this distance surprisingly small even for a nearest neighbor?"

	sorted_refs = np.sort(reference_dist_array)
	n = len(sorted_refs)

	# Use searchsorted to find the rank of every element in the matrix efficiently
	# This is effectively the ECDF.
	# Indices corresponding to where the values would be inserted
	ranks = np.searchsorted(sorted_refs, dist_matrix)

	# Calculate Probability (P-value)
	# P < epsilon means "This is in the bottom epsilon% of nearest neighbor distances"
	probs = (ranks + 1) / (n + 1)

	return probs


print("--- Calculating Empirical Probabilities (EVT Logic) ---")

# 1. Get the distribution of "closest things" for each model
# This forms our background distribution for "How close do things get?"
nn_dists_A = get_nearest_neighbor_distances(Model_A["dist_matrix"])
nn_dists_B = get_nearest_neighbor_distances(Model_B["dist_matrix"])

# 2. Convert the full N x N matrices to Probability Matrices
# "How likely is this pair to be this close by chance?"
Prob_A = convert_dist_to_prob(Model_A["dist_matrix"], nn_dists_A)
Prob_B = convert_dist_to_prob(Model_B["dist_matrix"], nn_dists_B)

print(f"Probabilities Calculated.")
print(
	f"Example (Model A): Dist=8.8 -> P={np.interp(8.8, np.sort(nn_dists_A), np.linspace(0,1,len(nn_dists_A))):.5f}"
)
print(
	f"Example (Model B): Dist=8.8 -> P={np.interp(8.8, np.sort(nn_dists_B), np.linspace(0,1,len(nn_dists_B))):.5f}"
)
# =========================================================
# 2. PROBABILITY FUSION (Joint Rarity)
# =========================================================

# We assume models provide independent evidence.
# We want pairs where AT LEAST ONE model thinks it's extremely rare.
# But simply multiplying P_A * P_B favors agreement.
# We want the "Max Signal" logic (Min Probability).
# Logic: If Model B says P=0.0001 and Model A says P=0.5,
# The pair IS rare in B's view. It is an outlier.

print("\n--- Fusing Probabilities (Min Rule) ---")

# We take the Minimum Probability (Maximum Surprise)
Prob_Fused = np.minimum(Prob_A, Prob_B)

# =========================================================
# 3. INFERENCE: INFERRING THE THRESHOLD
# =========================================================

print("\n--- Inferring Threshold from Platinum Pairs ---")

platinum_probs = []
for idx1, idx2 in P_true:
	p_val = Prob_Fused[idx1, idx2]
	platinum_probs.append(p_val)

PROB_THRESHOLD = 0.049019607843137254
# if len(platinum_probs) > 0:
# 	max_plat_p = max(platinum_probs)
# 	print(f"Platinum Pairs Max Probability: {max_plat_p:.6f}")
# max_plat_p * 5.0


# else:
# 	# Fallback default
# 	PROB_THRESHOLD = 0.01

print(f"Inferred Probability Threshold: {PROB_THRESHOLD:.6f}")

# =========================================================
# 4. CLUSTERING & REPORTING
# =========================================================

print(f"\n{'='*80}")
print(f"STAGE 2 OUTPUT: Probabilistic Outlier Model (P < {PROB_THRESHOLD:.6f})")
print(f"{'='*80}")

# We define distance for clustering as the Probability.
# Single Linkage logic (min probability) is implicitly handled by the Min-Fusion above.
# We use Complete Linkage on the Probability Matrix to form tight groups.

cluster_model = AgglomerativeClustering(
	n_clusters=None,
	distance_threshold=PROB_THRESHOLD,
	metric="precomputed",
	linkage="complete",
)

labels = cluster_model.fit_predict(Prob_Fused)

# --- VISUALIZATION ---

df = pd.DataFrame({"string": strings, "label": labels, "idx": range(len(strings))})
groups = [g for _, g in df.groupby("label") if len(g) > 1]
groups.sort(key=lambda x: len(x), reverse=True)

print(f"Found {len(groups)} significant groups.\n")

for i, group in enumerate(groups):
	indices = group["idx"].tolist()
	cluster_strs = group["string"].tolist()

	# For Centroid/Radius, we use Model B vectors (or A) as a geometric reference.
	# It's just for display purposes.
	ref_vectors = Model_B["vectors"][indices]
	ref_prec = Model_B["precision"]
	local_mean = np.mean(ref_vectors, axis=0)

	dists = []
	min_d = float("inf")
	rep_i = -1

	for loc_i, glob_i in enumerate(indices):
		d = mahalanobis(Model_B["vectors"][glob_i], local_mean, ref_prec)
		dists.append(d)
		if d < min_d:
			min_d = d
			rep_i = loc_i

	# Also print the Joint Probability of the group (min p among pairs)
	# Just to verify our logic.
	print(f"GROUP {i+1} (Size: {len(indices)}) [Radius: {max(dists):.4f}]")

	for idx, s in enumerate(cluster_strs):
		prefix = " [CENTROID] " if idx == rep_i else "            "
		print(f"{prefix} {s}")
	print("-" * 80)

--- Calculating Empirical Probabilities (EVT Logic) ---
Probabilities Calculated.
Example (Model A): Dist=8.8 -> P=0.01354
Example (Model B): Dist=8.8 -> P=0.01744

--- Fusing Probabilities (Min Rule) ---

--- Inferring Threshold from Platinum Pairs ---
Platinum Pairs Max Probability: 0.013436
Inferred Probability Threshold: 0.049020

STAGE 2 OUTPUT: Probabilistic Outlier Model (P < 0.049020)
Found 15 significant groups.

GROUP 1 (Size: 2) [Radius: 4.8633]
 [CENTROID]  Does the privacy policy affirm that the company does not knowingly disclose information from children under the age of 18?
             Does the privacy policy affirm that the company does not knowingly share information from children under the age of 18?
--------------------------------------------------------------------------------
GROUP 2 (Size: 2) [Radius: 4.8566]
 [CENTROID]  Does the privacy policy affirm that Inputs and Outputs disassociated via Feedback are used for training models?
             Does the privacy

In [None]:
import numpy as np
from sklearn.covariance import LedoitWolf
from scipy.spatial.distance import mahalanobis

data_set = qdata
strings = list(data_set.keys())

raw_vectors = np.array(
	[data_set[s]["retrieval_embedding_vector"] for s in strings]
)	# alt
# raw_vectors = np.array([data_set[s]["embedding_vector"] for s in strings])


print(f"Original shape: {raw_vectors.shape}")


truncated_vectors = raw_vectors[:, :256]


norms = np.linalg.norm(truncated_vectors, axis=1, keepdims=True)

cleaned_vectors = truncated_vectors / (norms + 1e-10)

print(f"Truncated shape: {cleaned_vectors.shape}")


lw = LedoitWolf()
lw.fit(cleaned_vectors)


precision_matrix = lw.precision_
mean_vector = lw.location_


results = []

# for i, string_key in enumerate(strings):
# 	vec = cleaned_vectors[i]

# 	dist = mahalanobis(vec, mean_vector, precision_matrix)

# 	results.append((dist, string_key))


# results.sort(key=lambda x: x[0])

# print("\n--- The Two Most Representative Strings (Closest to Centroid) ---")
# for i in range(2):
# 	dist, s = results[i]
# 	print(f'Rank {i+1} (Distance: {dist:.4f}):\n   "{s}"\n')


# print("\n--- The Least Representative String (Outlier) ---")
# dist, s = results[-1]
# print(f'Last Rank (Distance: {dist:.4f}):\n   "{s}"\n')


def get_pairwise_distance(key1, key2, string_list, vector_array, inv_cov_matrix):

	try:

		idx1 = string_list.index(key1)
		idx2 = string_list.index(key2)

		vec1 = vector_array[idx1]
		vec2 = vector_array[idx2]

		dist = mahalanobis(vec1, vec2, inv_cov_matrix)

		print(f"Comparison Results:")
		print(f"Statement A: {key1}")
		print(f"Statement B: {key2}")
		print(f"Mahalanobis Distance: {dist:.4f}")

		euclidean = np.linalg.norm(vec1 - vec2)
		print(f"Standard Euclidean:   {euclidean:.4f}")

		return dist

	except ValueError:
		print("Error: One of the keys was not found in the string list.")
		return None


key_a = "Does the privacy policy affirm that security measures are designed to protect personal data from misuse?"
key_b = "Does the privacy policy affirm that security measures are designed to protect personal data from unauthorized access?"


distance = get_pairwise_distance(
	key_a, key_b, strings, cleaned_vectors, precision_matrix
)
import itertools

print("\n--- Searching for the absolute closest pairs (True Duplicates) ---")

min_dist = float("inf")
closest_pair = None
all_distances = []

# Efficiently calculate all pairs (assuming N=520, N^2 is tiny for modern CPUs)
# We only check the upper triangle to avoid double counting
num_vectors = len(cleaned_vectors)
for i in range(num_vectors):
	for j in range(i + 1, num_vectors):
		vec_i = cleaned_vectors[i]
		vec_j = cleaned_vectors[j]

		# Calculate Mahalanobis
		delta = vec_i - vec_j
		d = mahalanobis(vec_i, vec_j, precision_matrix)

		all_distances.append(d)

		if d < min_dist:
			min_dist = d
			closest_pair = (strings[i], strings[j])

print(f"Minimum Distance found in entire dataset: {min_dist:.4f}")
print(f"String A: {closest_pair[0]}")
print(f"String B: {closest_pair[1]}")

# ---------------------------------------------------------
# 3. Calculate the Percentile of your "Misuse vs Access" pair
# ---------------------------------------------------------
# This tells you: Is 14.4 "close" or "far" relative to the rest of the data?

target_dist = 14.5802
percentile = sum(1 for x in all_distances if x < target_dist) / len(all_distances) * 100

print(f"\n--- Context for 'Misuse vs Access' {target_dist} ---")
print(f"This pair is closer than {100 - percentile:.2f}% of all other pairs.")
if percentile < 5:
	print("Verdict: Extremely Similar (Likely Duplicate)")
elif percentile < 20:
	print("Verdict: Somewhat Similar (Related Concepts)")
else:
	print("Verdict: Distinct (Different Concepts)")

import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist, mahalanobis
from sklearn.cluster import AgglomerativeClustering

DISTANCE_THRESHOLD = 11.0

print(f"Computing pairwise Mahalanobis distances for {len(cleaned_vectors)} vectors...")
print(f"Using Threshold: {DISTANCE_THRESHOLD}")

dist_matrix = cdist(
	cleaned_vectors, cleaned_vectors, metric="mahalanobis", VI=precision_matrix
)
cluster_model = AgglomerativeClustering(
	n_clusters=None,
	distance_threshold=DISTANCE_THRESHOLD,
	metric="precomputed",
	linkage="complete",
)

labels = cluster_model.fit_predict(dist_matrix)


df = pd.DataFrame(
	{"string": strings, "label": labels, "vector_index": range(len(strings))}
)

all_clusters = [group for _, group in df.groupby("label")]

significant_clusters = [g for g in all_clusters if len(g) > 1]

significant_clusters.sort(key=lambda x: len(x), reverse=True)

total_groups = len(all_clusters)
trivial_count = total_groups - len(significant_clusters)

print(f"\nResult: Found {len(significant_clusters)} groups containing duplicates.")
print(f"(Hidden {trivial_count} unique items that had no matches within threshold)\n")
print("=" * 80)


# for i, group_df in enumerate(significant_clusters):

# 	indices = group_df["vector_index"].tolist()
# 	cluster_strings = group_df["string"].tolist()

# 	print(f"GROUP {i+1} (Size: {len(indices)})")

# 	cluster_vectors = cleaned_vectors[indices]

# 	local_mean = np.mean(cluster_vectors, axis=0)

# 	min_dist = float("inf")
# 	rep_index = -1

# 	for idx, vec_idx in enumerate(indices):
# 		d = mahalanobis(cleaned_vectors[vec_idx], local_mean, precision_matrix)
# 		if d < min_dist:
# 			min_dist = d
# 			rep_index = idx

# 	# Print strings, highlighting the representative
# 	for idx, s in enumerate(cluster_strings):
# 		if idx == rep_index:
# 			print(f" [CENTROID]  {s}")
# 		else:
# 			print(f"             {s}")

# 	print("-" * 80)

for i, group_df in enumerate(significant_clusters):

	indices = group_df["vector_index"].tolist()
	cluster_strings = group_df["string"].tolist()

	# 1. Calculate Centroid
	cluster_vectors = cleaned_vectors[indices]
	local_mean = np.mean(cluster_vectors, axis=0)

	# 2. Calculate distances of all members to the centroid
	distances_to_centroid = []
	min_dist = float("inf")
	rep_index = -1

	for idx, vec_idx in enumerate(indices):
		d = mahalanobis(cleaned_vectors[vec_idx], local_mean, precision_matrix)
		distances_to_centroid.append(d)

		# Track the centroid representative
		if d < min_dist:
			min_dist = d
			rep_index = idx

	# 3. Radius is the maximum distance from the center
	# (How far does this cluster stretch?)
	group_radius = max(distances_to_centroid)

	# Print Header with Radius
	print(f"GROUP {i+1} (Size: {len(indices)}) [Radius: {group_radius:.4f}]")

	# Print strings, highlighting the representative
	for idx, s in enumerate(cluster_strings):
		if idx == rep_index:
			print(f" [CENTROID]  {s}")
		else:
			print(f"             {s}")

	print("-" * 80)

Original shape: (520, 3072)
Truncated shape: (520, 256)
Comparison Results:
Statement A: Does the privacy policy affirm that security measures are designed to protect personal data from misuse?
Statement B: Does the privacy policy affirm that security measures are designed to protect personal data from unauthorized access?
Mahalanobis Distance: 14.5802
Standard Euclidean:   0.3681

--- Searching for the absolute closest pairs (True Duplicates) ---
Minimum Distance found in entire dataset: 5.7731
String A: Does the privacy policy affirm that users have the right to request the correction of inaccurate personal data?
String B: Does the privacy policy affirm that users have the right to request the correction of inaccurate data?

--- Context for 'Misuse vs Access' 14.5802 ---
This pair is closer than 99.81% of all other pairs.
Verdict: Extremely Similar (Likely Duplicate)
Computing pairwise Mahalanobis distances for 520 vectors...
Using Threshold: 11.0

Result: Found 25 groups containing 

In [7]:
import numpy as np
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from itertools import combinations

# =========================================================
# 0. SETUP / MOCK DATA LOADING
# =========================================================
# Assuming you have these from the previous steps:
# dist_matrix_A (N x N) - Statement Embeddings
# dist_matrix_B (N x N) - Question/Retrieval Embeddings
# strings (list of length N)

# If you need to re-generate them quickly from vectors:
# dist_matrix_A = cdist(vectors_A, vectors_A, 'mahalanobis', VI=precision_A)
# dist_matrix_B = cdist(vectors_B, vectors_B, 'mahalanobis', VI=precision_B)

# =========================================================
# 1. HELPER FUNCTIONS
# =========================================================


def get_pairs_from_labels(labels):
	"""
	Converts cluster labels into a Set of unique pairs (indices).
	Returns: set of tuples {(min_id, max_id), ...}
	"""
	df = pd.DataFrame({"label": labels, "id": range(len(labels))})
	pairs = set()

	# Group by label
	for label, group in df.groupby("label"):
		indices = group["id"].tolist()
		if len(indices) > 1:
			# Generate all unique pairs in this cluster
			for p in combinations(sorted(indices), 2):
				pairs.add(p)
	return pairs


def get_clustering_pairs(dist_matrix, threshold):
	"""
	Runs clustering at a specific threshold and returns the set of pairs.
	"""
	model = AgglomerativeClustering(
		n_clusters=None,
		distance_threshold=threshold,
		metric="precomputed",
		linkage="complete",
	)
	labels = model.fit_predict(dist_matrix)
	return get_pairs_from_labels(labels)


# =========================================================
# 2. GRID SEARCH FOR CONSENSUS (Estimating E_true)
# =========================================================

print("--- Phase 1: Grid Search for Consensus (Tau) ---")

# Define search space for Tau (The "Generation" Thresholds)
# Range: 5.0 to 18.0 with step 0.5
tau_range = np.arange(5.0, 18.0, 0.5)

best_jaccard = -1
best_config = (0, 0)	# (tau_A, tau_B)
P_true = set()	# The "Platinum" set of pairs

# We cache pairs for each threshold to avoid re-computing inside the nested loop
cache_A = {}
cache_B = {}

print(f"Pre-computing clusters for {len(tau_range)} thresholds...")

for t in tau_range:
	cache_A[t] = get_clustering_pairs(dist_matrix_A, t)
	cache_B[t] = get_clustering_pairs(dist_matrix_B, t)

print("Running grid intersection...")

for t_A in tau_range:
	pairs_A = cache_A[t_A]

	for t_B in tau_range:
		pairs_B = cache_B[t_B]

		# Calculate Jaccard Similarity of the Pair Sets
		intersection = pairs_A.intersection(pairs_B)
		union = pairs_A.union(pairs_B)

		if len(union) == 0:
			jaccard = 0
		else:
			jaccard = len(intersection) / len(union)

		# Optimization logic: Maximize Agreement
		if jaccard > best_jaccard:
			best_jaccard = jaccard
			best_config = (t_A, t_B)
			P_true = intersection

print(f"\nOPTIMAL CONSENSUS FOUND:")
print(f"Tau_A (Statement): {best_config[0]}")
print(f"Tau_B (Question):  {best_config[1]}")
print(f"Jaccard Score:     {best_jaccard:.4f}")
print(f"Platinum Pairs:    {len(P_true)}")

# =========================================================
# 3. TUNING TARGET THRESHOLDS (Estimating t_i)
# =========================================================

print("\n--- Phase 2: Tuning Model Thresholds (t_i) ---")
print("Targeting the Platinum Pairs (P_true)...")


def tune_threshold(dist_matrix, target_pairs, model_name):
	best_t = 0
	best_f1 = -1
	min_errors = float("inf")

	# We can perform a finer search here
	t_range = np.arange(5.0, 20.0, 0.1)

	for t in t_range:
		current_pairs = get_clustering_pairs(dist_matrix, t)

		# Calculate F1 Score relative to P_true
		tp = len(current_pairs.intersection(target_pairs))
		fp = len(current_pairs - target_pairs)
		fn = len(target_pairs - current_pairs)

		# Total Errors (False Pos + False Neg)
		errors = fp + fn

		if tp > 0:
			precision = tp / (tp + fp)
			recall = tp / (tp + fn)
			f1 = 2 * (precision * recall) / (precision + recall)
		else:
			f1 = 0

		# We optimize for F1 (Balance), or you could optimize for Min Errors
		if f1 > best_f1:
			best_f1 = f1
			best_t = t
			min_errors = errors

	print(
		f"[{model_name}] Optimal t: {best_t:.1f} | F1: {best_f1:.4f} | Errors: {min_errors}"
	)
	return best_t


# Tune both models to target the consensus
optimal_t_A = tune_threshold(dist_matrix_A, P_true, "Model A")
optimal_t_B = tune_threshold(dist_matrix_B, P_true, "Model B")

# =========================================================
# 4. FINAL OUTPUT
# =========================================================
print("\n--- Final Operational Thresholds ---")
print(f"Model A (Statement) Cutoff: {optimal_t_A}")
print(f"Model B (Question)  Cutoff: {optimal_t_B}")

# Optional: Run one last time with optimal t to show group counts
final_pairs_A = get_clustering_pairs(dist_matrix_A, optimal_t_A)
final_pairs_B = get_clustering_pairs(dist_matrix_B, optimal_t_B)

print(f"\nFinal Statistics:")
print(f"Model A finds {len(final_pairs_A)} duplicate pairs.")
print(f"Model B finds {len(final_pairs_B)} duplicate pairs.")
print(f"Consensus (P_true) was {len(P_true)} pairs.")

--- Phase 1: Grid Search for Consensus (Tau) ---
Pre-computing clusters for 26 thresholds...


NameError: name 'dist_matrix_A' is not defined