In [None]:
import os
import json

import numpy as np
import pandas as pd
from sklearn.covariance import LedoitWolf
from scipy.spatial.distance import cdist, mahalanobis
from sklearn.cluster import AgglomerativeClustering
from itertools import combinations
from collections import defaultdict

import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform
from scipy.stats import rankdata
from sklearn.cluster import AgglomerativeClustering

QUESTIONS_FILE = "./data/questions_filter_after.json"
POLICIES_FILE = "./data/policies_testing.json"
OUTPUT_Q_FILE = "./output_q.json"
OUTPUT_P_FILE = "./output_p.json"


def _loadJson(filepath):
	if not os.path.exists(filepath):
		print(f"Warning: File not found: {filepath}")
		return {}
	try:
		with open(filepath, "r", encoding="utf-8") as f:
			return json.load(f)
	except json.JSONDecodeError:
		print(f"Error decoding JSON: {filepath}")
		return {}


qdata = _loadJson(QUESTIONS_FILE)


def prepare_model_artifacts(raw_vector_list, name="Model", n_dims=256):
	"""Build normalised vectors, a precision matrix and pairwise distances for one model.

	Args:
		raw_vector_list: Sequence of equal-length embedding vectors, one per item.
		name: Label used only in the progress message.
		n_dims: Number of leading embedding dimensions kept before normalising.
			Defaults to 256, the value that used to be hard-coded. Vectors
			shorter than n_dims are kept whole (plain slice semantics).

	Returns:
		dict with keys
		- "dist_matrix": N x N pairwise Mahalanobis distances,
		- "vectors": the truncated, L2-normalised vectors (N rows),
		- "precision": inverse covariance estimated by Ledoit-Wolf shrinkage
		  on the normalised vectors.
	"""
	print(f"Processing {name}...")
	data = np.array(raw_vector_list)
	data_trunc = data[:, :n_dims]

	# Row-wise L2 normalisation; the epsilon keeps all-zero rows from dividing by zero.
	norms = np.linalg.norm(data_trunc, axis=1, keepdims=True)
	cleaned_vectors = data_trunc / (norms + 1e-10)

	lw = LedoitWolf()
	lw.fit(cleaned_vectors)
	precision_matrix = lw.precision_

	# Every item against every item, using the shared precision matrix as VI.
	dist_matrix = cdist(
		cleaned_vectors, cleaned_vectors, metric="mahalanobis", VI=precision_matrix
	)

	return {
		"dist_matrix": dist_matrix,
		"vectors": cleaned_vectors,
		"precision": precision_matrix,
	}


# --- LOAD DATA ---
# The key order of qdata fixes the item order: position i in `strings` is
# row i of both models' vector and distance matrices, so every index pair
# produced later refers back to this list.
data_set = qdata
strings = list(data_set.keys())

# Two embeddings are read per item, from the "embedding_vector" and
# "retrieval_embedding_vector" fields of each record.
raw_vectors_A = [data_set[s]["embedding_vector"] for s in strings]
raw_vectors_B = [data_set[s]["retrieval_embedding_vector"] for s in strings]

# NOTE(review): _loadJson returns {} when the file is missing or invalid; the
# lists above are then empty and the 2-D slice inside prepare_model_artifacts
# will presumably fail — confirm and consider an explicit guard.
Model_A = prepare_model_artifacts(raw_vectors_A, "Model A (Statement)")
Model_B = prepare_model_artifacts(raw_vectors_B, "Model B (Question)")


def get_pairs_from_labels(labels):
	"""Return every unordered index pair that shares a cluster label.

	Args:
		labels: Sequence of cluster labels; position i is the label of item i.

	Returns:
		Set of (i, j) tuples with i < j, one for each pair of items carrying
		the same label. Singleton clusters contribute nothing.
	"""
	frame = pd.DataFrame({"label": labels, "id": range(len(labels))})
	pair_set = set()
	for _, member_ids in frame.groupby("label")["id"]:
		members = sorted(member_ids.tolist())
		if len(members) < 2:
			continue
		pair_set.update(combinations(members, 2))
	return pair_set


def get_clustering_pairs_and_labels(dist_matrix, threshold):
	"""Cluster a precomputed distance matrix and report the result two ways.

	Complete-linkage agglomerative clustering is cut at `threshold`
	(no fixed cluster count).

	Args:
		dist_matrix: Square matrix of pairwise distances.
		threshold: Linkage distance used as the cut-off.

	Returns:
		(pairs, labels): the set of co-clustered index pairs from
		get_pairs_from_labels, and the label array from fit_predict.
	"""
	clusterer = AgglomerativeClustering(
		linkage="complete",
		metric="precomputed",
		distance_threshold=threshold,
		n_clusters=None,
	)
	cluster_labels = clusterer.fit_predict(dist_matrix)
	pair_set = get_pairs_from_labels(cluster_labels)
	return pair_set, cluster_labels


def calculate_n_true(labels_array, target_pairs):
	"""Count the distinct clusters that contain at least one item of target_pairs.

	Args:
		labels_array: Cluster label per item (indexable by item position),
			or None.
		target_pairs: Collection of (i, j) index pairs.

	Returns:
		Number of unique labels carried by the items that appear in
		target_pairs; 0 when target_pairs is empty or labels_array is None.
		Indices at or beyond len(labels_array) are ignored.
	"""
	if not target_pairs or labels_array is None:
		return 0

	n_labels = len(labels_array)
	involved_indices = {idx for pair in target_pairs for idx in pair}

	# An empty index set naturally yields 0 here, so no separate check is needed.
	return len({labels_array[idx] for idx in involved_indices if idx < n_labels})


print("\n--- Phase 1: Grid Search for Consensus (Tau) ---")

# Collect every off-diagonal (i < j) distance from both models so the search
# range spans the combined distance scale of A and B.
dist_A_off = Model_A["dist_matrix"][np.triu_indices_from(Model_A["dist_matrix"], k=1)]
dist_B_off = Model_B["dist_matrix"][np.triu_indices_from(Model_B["dist_matrix"], k=1)]
all_dists = np.concatenate([dist_A_off, dist_B_off])

# Smallest and largest pairwise distance seen in either model.
D_min_min = np.min(all_dists)
D_max_max = np.max(all_dists)

# Grid resolution for candidate thresholds.
SEARCH_STEP = 0.2

# The grid starts at the smallest observed distance (clamped at 0).
# np.arange excludes its end point, so D_max_max itself is never tested.
TAU_SEARCH_START = max(D_min_min, 0.0)
TAU_SEARCH_END = D_max_max

# tau_range drives the Phase 1 search below; t_range is an identical grid
# read later by tune_threshold_by_group_count.
tau_range = np.arange(TAU_SEARCH_START, TAU_SEARCH_END, SEARCH_STEP)
t_range = np.arange(TAU_SEARCH_START, TAU_SEARCH_END, SEARCH_STEP)

print(
	f"Search Range Defined: [{TAU_SEARCH_START:.2f} to {TAU_SEARCH_END:.2f}] (Step: {SEARCH_STEP})"
)
# Best-so-far state for the grid search; -1 guarantees the first valid
# Jaccard score (>= 0) is accepted.
best_jaccard = -1
P_true = set()
labels_A_star = None
labels_B_star = None

# Cache structure: {tau: (pair_set, labels_array)}. Keys are the float values
# taken from tau_range, and the lookups below iterate the same array.
cache_A = {}
cache_B = {}

print(f"Pre-computing clusters for {len(tau_range)} thresholds...")
for t in tau_range:
	cache_A[t] = get_clustering_pairs_and_labels(Model_A["dist_matrix"], t)
	cache_B[t] = get_clustering_pairs_and_labels(Model_B["dist_matrix"], t)
TAU_A = 0
TAU_B = 0

# Exhaustive search over (t_A, t_B): keep the threshold pair whose
# co-clustered pair sets have the highest Jaccard overlap. The comparison is
# strict, so on ties the first pair encountered wins.
for t_A in tau_range:
	pairs_A, labels_A = cache_A[t_A]
	for t_B in tau_range:
		pairs_B, labels_B = cache_B[t_B]

		intersection = pairs_A.intersection(pairs_B)
		union = pairs_A.union(pairs_B)

		# Threshold pairs where neither model groups anything are skipped.
		if len(union) > 0:
			jaccard = len(intersection) / len(union)
			if jaccard > best_jaccard:

				best_jaccard = jaccard
				# "Platinum" pairs: pairs that both models co-cluster.
				P_true = intersection
				labels_A_star = labels_A
				labels_B_star = labels_B
				TAU_A = t_A
				TAU_B = t_B

# NOTE(review): in the recorded output the selected taus sit near the top of
# the search range and N_target is 1.5, which looks like the near-trivial
# "almost everything in one cluster" agreement — confirm this is intended.
print(f"tau a:{TAU_A}, tau b:{TAU_B}")

# Number of clusters (per model, at its best tau) that hold the platinum
# items; their mean is the group-count target used in Phase 2.
N_A_true = calculate_n_true(labels_A_star, P_true)
N_B_true = calculate_n_true(labels_B_star, P_true)
N_target = (N_A_true + N_B_true) / 2

print(f"\nConsensus Structure Found:")
print(f"Platinum Pairs Identified: {len(P_true)}")
print(f"N_target: {N_target:.1f} (Avg of A={N_A_true}, B={N_B_true})")


print("\n--- Phase 2: Tuning Thresholds to Target N_target ---")


def tune_threshold_by_group_count(model_data, N_target, model_name):
	"""Pick the clustering threshold whose platinum-group count is closest to N_target.

	Scans the module-level grid `t_range`. For each candidate the model's
	distance matrix is clustered and scored against the module-level
	platinum pair set `P_true`.

	Args:
		model_data: Dict holding at least "dist_matrix".
		N_target: Desired number of clusters containing platinum items.
		model_name: Label used in the summary line that is printed.

	Returns:
		The threshold with the smallest |N_predicted - N_target|; ties are
		broken by the higher pairwise F1 against P_true. Returns 0 if
		`t_range` is empty.
	"""
	best_t, best_f1 = 0, -1
	min_error = float("inf")

	for candidate_t in t_range:
		pairs_now, labels_now = get_clustering_pairs_and_labels(
			model_data["dist_matrix"], candidate_t
		)

		# Primary criterion: distance from the target group count.
		group_error = abs(calculate_n_true(labels_now, P_true) - N_target)

		# Secondary criterion: pairwise F1 versus the platinum pairs.
		tp = len(pairs_now.intersection(P_true))
		fp = len(pairs_now - P_true)
		fn = len(P_true - pairs_now)
		f1 = 0
		if tp > 0:
			precision = tp / (tp + fp)
			recall = tp / (tp + fn)
			f1 = 2 * (precision * recall) / (precision + recall)

		improves_error = group_error < min_error
		wins_tie = group_error == min_error and f1 > best_f1
		if improves_error or wins_tie:
			min_error, best_f1, best_t = group_error, f1, candidate_t

	print(
		f"[{model_name}] Optimal t: {best_t:.1f} | Group Error: {min_error:.1f} | F1: {best_f1:.4f}"
	)
	return best_t


optimal_t_A = tune_threshold_by_group_count(Model_A, N_target, "Model A")
optimal_t_B = tune_threshold_by_group_count(Model_B, N_target, "Model B")

print(f"\nFinal Calibrated Thresholds:")
print(f"Model A (Statement): {optimal_t_A:.2f}")
print(f"Model B (Question):  {optimal_t_B:.2f}")


def print_final_clusters(model_data, threshold, string_list, title):
	"""Cluster one model at `threshold` and print every multi-member group.

	Groups are listed largest first. Each group shows its radius (largest
	Mahalanobis distance from a member to the group mean) and marks the member
	closest to that mean as [CENTROID]. Pair statistics are reported against
	the module-level platinum pair set `P_true`.

	Args:
		model_data: Dict with "dist_matrix", "vectors" and "precision".
		threshold: Linkage distance cut-off for the clustering.
		string_list: Item texts, aligned with the rows of the matrices.
		title: Heading printed above the report.

	Returns:
		None; the report is written to stdout.
	"""
	print(f"\n{'='*80}")
	print(f"FINAL OUTPUT: {title} (Threshold: {threshold:.2f})")
	print(f"{'='*80}")

	# Cluster once: the same fit supplies both the labels for the group
	# listing and the pair set for the statistics.
	final_pairs, labels = get_clustering_pairs_and_labels(
		model_data["dist_matrix"], threshold
	)

	df = pd.DataFrame(
		{"string": string_list, "label": labels, "idx": range(len(string_list))}
	)
	groups = [g for _, g in df.groupby("label") if len(g) > 1]
	groups.sort(key=len, reverse=True)

	# Pairs this clustering shares with P_true, and pairs it adds beyond it.
	tp = len(final_pairs.intersection(P_true))
	fp = len(final_pairs - P_true)

	print(f"Found {len(groups)} total significant groups.")
	print(f"P_true pairs captured: {tp} (False Positives: {fp})\n")

	prec = model_data["precision"]
	for i, group in enumerate(groups):
		indices = group["idx"].tolist()
		cluster_strs = group["string"].tolist()

		vecs = model_data["vectors"][indices]
		local_mean = np.mean(vecs, axis=0)

		# Distance of every member to the group mean, in member order.
		distances = [mahalanobis(vec, local_mean, prec) for vec in vecs]
		# argmin returns the first minimum, i.e. the earliest closest member.
		rep_idx = int(np.argmin(distances))
		group_radius = max(distances)

		print(f"GROUP {i+1} (Size: {len(indices)}) [Radius: {group_radius:.4f}]")

		for idx, s in enumerate(cluster_strs):
			prefix = " [CENTROID] " if idx == rep_idx else "            "
			print(f"{prefix} {s}")
		print("-" * 80)


print_final_clusters(Model_A, optimal_t_A, strings, "Model A (Statement Embeddings)")
print_final_clusters(Model_B, optimal_t_B, strings, "Model B (Question Embeddings)")


def get_nearest_neighbor_distances(dist_matrix):
	"""Return, for every row, the distance to its nearest other item.

	The diagonal (self-distance) is excluded from the minimum. The input is
	left untouched: the work is done on a float copy, so the caller's
	diagonal values are preserved and integer matrices are accepted.

	Args:
		dist_matrix: Square array-like of pairwise distances.

	Returns:
		1-D float array; entry i is the smallest off-diagonal value in row i.
	"""
	masked = np.array(dist_matrix, dtype=float)  # np.array copies by default
	np.fill_diagonal(masked, np.inf)
	return masked.min(axis=1)


def convert_dist_to_prob(dist_matrix, reference_dist_array):
	"""Map distances to empirical-CDF scores against a reference sample.

	For each distance d the score is

		P(d) = (number of reference values strictly below d + 1) / (n + 1)

	where n is the size of the reference sample, so scores lie in
	[1 / (n + 1), 1] and small distances get small scores.

	Args:
		dist_matrix: Array of distances to convert (any shape).
		reference_dist_array: 1-D sample defining the background distribution
			(the callers pass nearest-neighbour distances).

	Returns:
		Array with the shape of dist_matrix holding the scores.
	"""
	reference_sorted = np.sort(reference_dist_array)

	# Left-sided insertion index == count of reference values strictly below d.
	below_counts = np.searchsorted(reference_sorted, dist_matrix)

	return (below_counts + 1) / (len(reference_sorted) + 1)


print("--- Calculating Empirical Probabilities (EVT Logic) ---")

# Background distribution per model: each item's distance to its nearest
# other item.
nn_dists_A = get_nearest_neighbor_distances(Model_A["dist_matrix"])
nn_dists_B = get_nearest_neighbor_distances(Model_B["dist_matrix"])

# Convert each full N x N distance matrix into scores relative to that
# model's own nearest-neighbour distribution.
Prob_A = convert_dist_to_prob(Model_A["dist_matrix"], nn_dists_A)
Prob_B = convert_dist_to_prob(Model_B["dist_matrix"], nn_dists_B)

print(f"Probabilities Calculated.")
# NOTE(review): these example lines interpolate over linspace(0, 1, n), which
# is not the (rank + 1) / (n + 1) formula used by convert_dist_to_prob, so
# the printed P need not match the matrix entries. 8.8 is an unexplained
# sample distance — confirm where it comes from.
print(
	f"Example (Model A): Dist=8.8 -> P={np.interp(8.8, np.sort(nn_dists_A), np.linspace(0,1,len(nn_dists_A))):.5f}"
)
print(
	f"Example (Model B): Dist=8.8 -> P={np.interp(8.8, np.sort(nn_dists_B), np.linspace(0,1,len(nn_dists_B))):.5f}"
)


print("\n--- Fusing Probabilities (Min Rule) ---")

# Element-wise minimum: a pair takes the smaller (more surprising) of the
# two models' scores.
Prob_Fused = np.minimum(Prob_A, Prob_B)


print("\n--- Inferring Threshold from Platinum Pairs ---")

# Fused score of every platinum pair.
# NOTE(review): platinum_probs is collected here but is not used when
# PROB_THRESHOLD is derived below — confirm whether it is still needed.
platinum_probs = []
for idx1, idx2 in P_true:
	p_val = Prob_Fused[idx1, idx2]
	platinum_probs.append(p_val)

# Earlier hard-coded variant, kept for reference:
# HARDCODED_MAX_A_SIZE = 105
# HARDCODED_MAX_STABLE_B_SIZE = 99
# AV_STABLE = (HARDCODED_MAX_A_SIZE + HARDCODED_MAX_STABLE_B_SIZE) / 2

# NOTE(review): getMaxStable is not defined anywhere above this line; the only
# visible definition is in a later cell, so a fresh Restart & Run All would
# raise NameError here. Move that definition above this cell.
# AV_STABLE = (getMaxStable(TAU_A, ...) + getMaxStable(TAU_B, ...)) / 2
AV_STABLE = (getMaxStable(TAU_A, Model_A) + getMaxStable(TAU_B, Model_B)) / 2
# NOTE(review): getMaxStable returns 0 when the baseline has no multi-member
# cluster; if both calls do, this division raises ZeroDivisionError.
# PROB_THRESHOLD = len(P_true) / (AV_STABLE)
PROB_THRESHOLD = len(P_true) / (AV_STABLE)

print(f"Inferred Probability Threshold: {PROB_THRESHOLD:.6f}")


print(f"\n{'='*80}")
print(f"STAGE 2 OUTPUT: Probabilistic Outlier Model (P < {PROB_THRESHOLD:.6f})")
print(f"{'='*80}")

# The fused score matrix is used directly as the "distance" for
# complete-linkage clustering, cut at PROB_THRESHOLD.
# NOTE(review): the scores from convert_dist_to_prob are at most 1, so a
# PROB_THRESHOLD above 1 would presumably merge everything into one cluster —
# confirm that len(P_true) / AV_STABLE stays within (0, 1].
cluster_model = AgglomerativeClustering(
	n_clusters=None,
	distance_threshold=PROB_THRESHOLD,
	metric="precomputed",
	linkage="complete",
)

labels = cluster_model.fit_predict(Prob_Fused)

# Keep only clusters with more than one member, largest first.
df = pd.DataFrame({"string": strings, "label": labels, "idx": range(len(strings))})
groups = [g for _, g in df.groupby("label") if len(g) > 1]
groups.sort(key=lambda x: len(x), reverse=True)

print(f"Found {len(groups)} significant groups.\n")

for i, group in enumerate(groups):
	indices = group["idx"].tolist()
	cluster_strs = group["string"].tolist()

	# Centroid and radius are computed from Model B's vectors and precision
	# only, whatever model drove the fused score.
	ref_vectors = Model_B["vectors"][indices]
	ref_prec = Model_B["precision"]
	local_mean = np.mean(ref_vectors, axis=0)

	dists = []
	min_d = float("inf")
	rep_i = -1

	# Track the member closest to the group mean (first one wins on ties).
	for loc_i, glob_i in enumerate(indices):
		d = mahalanobis(Model_B["vectors"][glob_i], local_mean, ref_prec)
		dists.append(d)
		if d < min_d:
			min_d = d
			rep_i = loc_i

	# Radius = largest member-to-mean distance in the group.
	print(f"GROUP {i+1} (Size: {len(indices)}) [Radius: {max(dists):.4f}]")

	for idx, s in enumerate(cluster_strs):
		prefix = " [CENTROID] " if idx == rep_i else "            "
		print(f"{prefix} {s}")
	print("-" * 80)

Processing Model A (Statement)...
Processing Model B (Question)...

--- Phase 1: Grid Search for Consensus (Tau) ---
Search Range Defined: [5.77 to 25.35] (Step: 0.2)
Pre-computing clusters for 98 thresholds...
tau a:24.97308635455024, tau b:24.57308635455024

Consensus Structure Found:
Platinum Pairs Identified: 133904
N_target: 1.5 (Avg of A=2, B=1)

--- Phase 2: Tuning Thresholds to Target N_target ---
[Model A] Optimal t: 25.0 | Group Error: 0.5 | F1: 1.0000
[Model B] Optimal t: 24.6 | Group Error: 0.5 | F1: 0.9961

Final Calibrated Thresholds:
Model A (Statement): 24.97
Model B (Question):  24.57

FINAL OUTPUT: Model A (Statement Embeddings) (Threshold: 24.97)
Found 2 total significant groups.
P_true pairs captured: 133904 (False Positives: 0)

GROUP 1 (Size: 518) [Radius: 17.6866]
             Does the privacy policy affirm that the company implements appropriate technical security measures?
             Does the privacy policy affirm that the company implements appropriate organ

In [None]:
import numpy as np
import pandas as pd
from sklearn.covariance import LedoitWolf
from scipy.spatial.distance import cdist, mahalanobis
from sklearn.cluster import AgglomerativeClustering
from itertools import combinations

# =========================================================
# 1. DATA PREPARATION (Updated to return vectors/precision)
# =========================================================


def prepare_model_artifacts(raw_vector_list, name="Model", n_dims=256):
	"""Build normalised vectors, a precision matrix and pairwise distances for one model.

	Args:
		raw_vector_list: Sequence of equal-length embedding vectors, one per item.
		name: Label used only in the progress message.
		n_dims: Number of leading embedding dimensions kept before normalising.
			Defaults to 256, the value that used to be hard-coded. Vectors
			shorter than n_dims are kept whole (plain slice semantics).

	Returns:
		dict with keys
		- "dist_matrix": N x N pairwise Mahalanobis distances,
		- "vectors": the truncated, L2-normalised vectors (N rows),
		- "precision": inverse covariance estimated by Ledoit-Wolf shrinkage
		  on the normalised vectors.
	"""
	print(f"Processing {name}...")

	# 1. Truncate & normalise (the epsilon guards all-zero rows)
	data = np.array(raw_vector_list)
	data_trunc = data[:, :n_dims]
	norms = np.linalg.norm(data_trunc, axis=1, keepdims=True)
	cleaned_vectors = data_trunc / (norms + 1e-10)

	# 2. Ledoit-Wolf shrinkage estimate of the precision matrix
	lw = LedoitWolf()
	lw.fit(cleaned_vectors)
	precision_matrix = lw.precision_

	# 3. Pairwise Mahalanobis distance matrix
	dist_matrix = cdist(
		cleaned_vectors, cleaned_vectors, metric="mahalanobis", VI=precision_matrix
	)

	return {
		"dist_matrix": dist_matrix,
		"vectors": cleaned_vectors,
		"precision": precision_matrix,
	}


# --- LOAD DATA ---
# Assuming 'qdata' is available
data_set = qdata
strings = list(data_set.keys())

raw_vectors_A = [data_set[s]["embedding_vector"] for s in strings]
raw_vectors_B = [data_set[s]["retrieval_embedding_vector"] for s in strings]

# --- PROCESS ARTIFACTS ---
# We store everything in a dictionary to keep it organized
Model_A = prepare_model_artifacts(raw_vectors_A, "Model A (Statement)")
Model_B = prepare_model_artifacts(raw_vectors_B, "Model B (Question)")
import numpy as np
import pandas as pd
from sklearn.covariance import LedoitWolf
from scipy.spatial.distance import cdist, mahalanobis
from sklearn.cluster import AgglomerativeClustering
from itertools import combinations
from collections import defaultdict

# =========================================================
# 1. HELPER FUNCTIONS
# =========================================================


def get_pairs_from_labels(labels):
	"""Return every unordered index pair that shares a cluster label.

	Args:
		labels: Sequence of cluster labels; position i is the label of item i.

	Returns:
		Set of (i, j) tuples with i < j, one for each pair of items carrying
		the same label. Singleton clusters contribute nothing.
	"""
	frame = pd.DataFrame({"label": labels, "id": range(len(labels))})
	pair_set = set()
	for _, member_ids in frame.groupby("label")["id"]:
		members = sorted(member_ids.tolist())
		if len(members) < 2:
			continue
		pair_set.update(combinations(members, 2))
	return pair_set


def get_clustering_pairs_and_labels(dist_matrix, threshold):
	"""Cluster a precomputed distance matrix and report the result two ways.

	Complete-linkage agglomerative clustering is cut at `threshold`
	(no fixed cluster count).

	Args:
		dist_matrix: Square matrix of pairwise distances.
		threshold: Linkage distance used as the cut-off.

	Returns:
		(pairs, labels): the set of co-clustered index pairs from
		get_pairs_from_labels, and the label array from fit_predict.
	"""
	clusterer = AgglomerativeClustering(
		linkage="complete",
		metric="precomputed",
		distance_threshold=threshold,
		n_clusters=None,
	)
	cluster_labels = clusterer.fit_predict(dist_matrix)
	pair_set = get_pairs_from_labels(cluster_labels)
	return pair_set, cluster_labels


def calculate_n_true(labels_array, target_pairs):
	"""Count the distinct clusters that contain at least one item of target_pairs.

	Args:
		labels_array: Cluster label per item (indexable by item position),
			or None.
		target_pairs: Collection of (i, j) index pairs.

	Returns:
		Number of unique labels carried by the items that appear in
		target_pairs; 0 when target_pairs is empty or labels_array is None.
		Indices at or beyond len(labels_array) are ignored.
	"""
	if not target_pairs or labels_array is None:
		return 0

	n_labels = len(labels_array)

	# All item indices that take part in any target pair
	involved_indices = {idx for pair in target_pairs for idx in pair}

	# Unique cluster labels of those items; an empty index set naturally
	# yields 0 here, so no separate check is needed.
	return len({labels_array[idx] for idx in involved_indices if idx < n_labels})


# =========================================================
# 2. STAGE 1: CONSENSUS TUNING (Setup - Requires Model_A, Model_B, strings)
# =========================================================

# --- Assuming Model_A, Model_B, and strings are already populated from the preparation step ---

print("\n--- Phase 1: Grid Search for Consensus (Tau) ---")

# tau_range = np.arange(5.0, 18.0, 0.1)
# --- Dynamic Range Calculation (Adhering to Absolute Min/Max) ---

# Combine the off-diagonal elements of both distance matrices
dist_A_off = Model_A["dist_matrix"][np.triu_indices_from(Model_A["dist_matrix"], k=1)]
dist_B_off = Model_B["dist_matrix"][np.triu_indices_from(Model_B["dist_matrix"], k=1)]
all_dists = np.concatenate([dist_A_off, dist_B_off])

# Calculate the bounds
D_min_min = np.min(all_dists)
D_max_max = np.max(all_dists)

# Define search range: start at the smallest observed distance (clamped at 0)
# and stop short of the largest one. np.arange excludes its end point, so the
# maximum distance itself is never tested as a threshold.
SEARCH_STEP = 0.5	# Retain the fine step size for accuracy

# TAU_SEARCH_START = max(D_min_min - SEARCH_STEP, 0.0)
# TAU_SEARCH_END = D_max_max + SEARCH_STEP

TAU_SEARCH_START = max(D_min_min, 0.0)
TAU_SEARCH_END = D_max_max

tau_range = np.arange(TAU_SEARCH_START, TAU_SEARCH_END, SEARCH_STEP)
t_range = np.arange(TAU_SEARCH_START, TAU_SEARCH_END, SEARCH_STEP)

print(
	f"Search Range Defined: [{TAU_SEARCH_START:.2f} to {TAU_SEARCH_END:.2f}] (Step: {SEARCH_STEP})"
)
best_jaccard = -1
P_true = set()
labels_A_star = None
labels_B_star = None

# Cache structure: {tau: (pair_set, labels_array)}
cache_A = {}
cache_B = {}

print(f"Pre-computing clusters for {len(tau_range)} thresholds...")
for t in tau_range:
	cache_A[t] = get_clustering_pairs_and_labels(Model_A["dist_matrix"], t)
	cache_B[t] = get_clustering_pairs_and_labels(Model_B["dist_matrix"], t)
TAU_A = 0
TAU_B = 0

# Run grid search
for t_A in tau_range:
	pairs_A, labels_A = cache_A[t_A]
	for t_B in tau_range:
		pairs_B, labels_B = cache_B[t_B]

		intersection = pairs_A.intersection(pairs_B)
		union = pairs_A.union(pairs_B)

		if len(union) > 0:
			jaccard = len(intersection) / len(union)
			if jaccard > best_jaccard:

				best_jaccard = jaccard
				P_true = intersection
				labels_A_star = labels_A
				labels_B_star = labels_B
				TAU_A = t_A
				TAU_B = t_B

print(f"tau a:{TAU_A}, tau b:{TAU_B}")

# Calculate N_target
N_A_true = calculate_n_true(labels_A_star, P_true)
N_B_true = calculate_n_true(labels_B_star, P_true)
N_target = (N_A_true + N_B_true) / 2

print(f"\nConsensus Structure Found:")
print(f"Platinum Pairs Identified: {len(P_true)}")
print(f"N_target: {N_target:.1f} (Avg of A={N_A_true}, B={N_B_true})")

# =========================================================
# 3. TUNING TARGET THRESHOLDS (t)
# =========================================================

print("\n--- Phase 2: Tuning Thresholds to Target N_target ---")


def tune_threshold_by_group_count(model_data, N_target, model_name):
	"""Pick the clustering threshold whose platinum-group count is closest to N_target.

	Scans the module-level grid `t_range`. For each candidate the model's
	distance matrix is clustered and scored against the module-level
	platinum pair set `P_true`.

	Args:
		model_data: Dict holding at least "dist_matrix".
		N_target: Desired number of clusters containing platinum items.
		model_name: Label used in the summary line that is printed.

	Returns:
		The threshold with the smallest |N_predicted - N_target|; ties are
		broken by the higher pairwise F1 against P_true. Returns 0 if
		`t_range` is empty.
	"""
	best_t, best_f1 = 0, -1
	min_error = float("inf")

	for candidate_t in t_range:
		pairs_now, labels_now = get_clustering_pairs_and_labels(
			model_data["dist_matrix"], candidate_t
		)

		# 1. Primary error: distance from the target group count
		group_error = abs(calculate_n_true(labels_now, P_true) - N_target)

		# 2. Secondary metric: pairwise F1 versus the platinum pairs
		tp = len(pairs_now.intersection(P_true))
		fp = len(pairs_now - P_true)
		fn = len(P_true - pairs_now)
		f1 = 0
		if tp > 0:
			precision = tp / (tp + fp)
			recall = tp / (tp + fn)
			f1 = 2 * (precision * recall) / (precision + recall)

		# Minimise the error first; a higher F1 only breaks exact ties
		improves_error = group_error < min_error
		wins_tie = group_error == min_error and f1 > best_f1
		if improves_error or wins_tie:
			min_error, best_f1, best_t = group_error, f1, candidate_t

	print(
		f"[{model_name}] Optimal t: {best_t:.1f} | Group Error: {min_error:.1f} | F1: {best_f1:.4f}"
	)
	return best_t


optimal_t_A = tune_threshold_by_group_count(Model_A, N_target, "Model A")
optimal_t_B = tune_threshold_by_group_count(Model_B, N_target, "Model B")

print(f"\nFinal Calibrated Thresholds:")
print(f"Model A (Statement): {optimal_t_A:.2f}")
print(f"Model B (Question):  {optimal_t_B:.2f}")

# =========================================================
# 4. FINAL VISUALIZATION (Printing Groups)
# =========================================================


def print_final_clusters(model_data, threshold, string_list, title):
	"""Cluster one model at `threshold` and print every multi-member group.

	Groups are listed largest first. Each group shows its radius (largest
	Mahalanobis distance from a member to the group mean) and marks the member
	closest to that mean as [CENTROID]. Pair statistics are reported against
	the module-level platinum pair set `P_true`.

	Args:
		model_data: Dict with "dist_matrix", "vectors" and "precision".
		threshold: Linkage distance cut-off for the clustering.
		string_list: Item texts, aligned with the rows of the matrices.
		title: Heading printed above the report.

	Returns:
		None; the report is written to stdout.
	"""
	print(f"\n{'='*80}")
	print(f"FINAL OUTPUT: {title} (Threshold: {threshold:.2f})")
	print(f"{'='*80}")

	# 1. Cluster once using the optimised t: the same fit supplies both the
	#    labels for the group listing and the pair set for the statistics.
	final_pairs, labels = get_clustering_pairs_and_labels(
		model_data["dist_matrix"], threshold
	)

	# 2. Organise and filter groups (multi-member only, largest first)
	df = pd.DataFrame(
		{"string": string_list, "label": labels, "idx": range(len(string_list))}
	)
	groups = [g for _, g in df.groupby("label") if len(g) > 1]
	groups.sort(key=len, reverse=True)

	# Pairs this clustering shares with P_true, and pairs it adds beyond it
	tp = len(final_pairs.intersection(P_true))
	fp = len(final_pairs - P_true)

	print(f"Found {len(groups)} total significant groups.")
	print(f"P_true pairs captured: {tp} (False Positives: {fp})\n")

	# 3. Print groups (with centroid and radius)
	prec = model_data["precision"]
	for i, group in enumerate(groups):
		indices = group["idx"].tolist()
		cluster_strs = group["string"].tolist()

		vecs = model_data["vectors"][indices]
		local_mean = np.mean(vecs, axis=0)

		# Distance of every member to the group mean, in member order
		distances = [mahalanobis(vec, local_mean, prec) for vec in vecs]
		# argmin returns the first minimum, i.e. the earliest closest member
		rep_idx = int(np.argmin(distances))
		group_radius = max(distances)

		print(f"GROUP {i+1} (Size: {len(indices)}) [Radius: {group_radius:.4f}]")

		for idx, s in enumerate(cluster_strs):
			prefix = " [CENTROID] " if idx == rep_idx else "            "
			print(f"{prefix} {s}")
		print("-" * 80)


# --- EXECUTE VISUALIZATION ---
print_final_clusters(Model_A, optimal_t_A, strings, "Model A (Statement Embeddings)")
print_final_clusters(Model_B, optimal_t_B, strings, "Model B (Question Embeddings)")

import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform
from scipy.stats import rankdata
from sklearn.cluster import AgglomerativeClustering

# =========================================================
# 1. EMPIRICAL PROBABILITY ESTIMATION (Non-Parametric)
# =========================================================


def get_nearest_neighbor_distances(dist_matrix):
	"""Return, for every row, the distance to its nearest other item.

	The diagonal (self-distance) is excluded from the minimum. The input is
	left untouched: the work is done on a float copy, so the caller's
	diagonal values are preserved and integer matrices are accepted.

	Args:
		dist_matrix: Square array-like of pairwise distances.

	Returns:
		1-D float array; entry i is the smallest off-diagonal value in row i.
	"""
	masked = np.array(dist_matrix, dtype=float)  # np.array copies by default
	np.fill_diagonal(masked, np.inf)
	return masked.min(axis=1)


def convert_dist_to_prob(dist_matrix, reference_dist_array):
	"""Map distances to empirical-CDF scores against a reference sample.

	For each distance d the score is

		P(d) = (number of reference values strictly below d + 1) / (n + 1)

	where n is the size of the reference sample, so scores lie in
	[1 / (n + 1), 1] and small distances get small scores.

	Args:
		dist_matrix: Array of distances to convert (any shape).
		reference_dist_array: 1-D sample defining the background distribution
			(the callers pass nearest-neighbour distances).

	Returns:
		Array with the shape of dist_matrix holding the scores.
	"""
	# The background is the sorted reference sample; a small score means the
	# distance is small even compared with typical nearest-neighbour gaps.
	reference_sorted = np.sort(reference_dist_array)

	# Left-sided insertion index == count of reference values strictly below d,
	# which is the empirical rank of d.
	below_counts = np.searchsorted(reference_sorted, dist_matrix)

	return (below_counts + 1) / (len(reference_sorted) + 1)


print("--- Calculating Empirical Probabilities (EVT Logic) ---")

# 1. Get the distribution of "closest things" for each model
# This forms our background distribution for "How close do things get?"
nn_dists_A = get_nearest_neighbor_distances(Model_A["dist_matrix"])
nn_dists_B = get_nearest_neighbor_distances(Model_B["dist_matrix"])

# 2. Convert the full N x N matrices to Probability Matrices
# "How likely is this pair to be this close by chance?"
Prob_A = convert_dist_to_prob(Model_A["dist_matrix"], nn_dists_A)
Prob_B = convert_dist_to_prob(Model_B["dist_matrix"], nn_dists_B)

print(f"Probabilities Calculated.")
print(
	f"Example (Model A): Dist=8.8 -> P={np.interp(8.8, np.sort(nn_dists_A), np.linspace(0,1,len(nn_dists_A))):.5f}"
)
print(
	f"Example (Model B): Dist=8.8 -> P={np.interp(8.8, np.sort(nn_dists_B), np.linspace(0,1,len(nn_dists_B))):.5f}"
)
# =========================================================
# 2. PROBABILITY FUSION (Joint Rarity)
# =========================================================

# We assume models provide independent evidence.
# We want pairs where AT LEAST ONE model thinks it's extremely rare.
# But simply multiplying P_A * P_B favors agreement.
# We want the "Max Signal" logic (Min Probability).
# Logic: If Model B says P=0.0001 and Model A says P=0.5,
# The pair IS rare in B's view. It is an outlier.

print("\n--- Fusing Probabilities (Min Rule) ---")

# We take the Minimum Probability (Maximum Surprise)
Prob_Fused = np.minimum(Prob_A, Prob_B)

# =========================================================
# 3. INFERENCE: INFERRING THE THRESHOLD
# =========================================================

print("\n--- Inferring Threshold from Platinum Pairs ---")

platinum_probs = []
for idx1, idx2 in P_true:
	p_val = Prob_Fused[idx1, idx2]
	platinum_probs.append(p_val)

PROB_THRESHOLD = 0.049019607843137254

print(f"Inferred Probability Threshold: {PROB_THRESHOLD:.6f}")

# =========================================================
# 4. CLUSTERING & REPORTING
# =========================================================

print(f"\n{'='*80}")
print(f"STAGE 2 OUTPUT: Probabilistic Outlier Model (P < {PROB_THRESHOLD:.6f})")
print(f"{'='*80}")

# We define distance for clustering as the Probability.
# Single Linkage logic (min probability) is implicitly handled by the Min-Fusion above.
# We use Complete Linkage on the Probability Matrix to form tight groups.

cluster_model = AgglomerativeClustering(
	n_clusters=None,
	distance_threshold=PROB_THRESHOLD,
	metric="precomputed",
	linkage="complete",
)

labels = cluster_model.fit_predict(Prob_Fused)

# --- VISUALIZATION ---

df = pd.DataFrame({"string": strings, "label": labels, "idx": range(len(strings))})
groups = [g for _, g in df.groupby("label") if len(g) > 1]
groups.sort(key=lambda x: len(x), reverse=True)

print(f"Found {len(groups)} significant groups.\n")

for i, group in enumerate(groups):
	indices = group["idx"].tolist()
	cluster_strs = group["string"].tolist()

	# For Centroid/Radius, we use Model B vectors (or A) as a geometric reference.
	# It's just for display purposes.
	ref_vectors = Model_B["vectors"][indices]
	ref_prec = Model_B["precision"]
	local_mean = np.mean(ref_vectors, axis=0)

	dists = []
	min_d = float("inf")
	rep_i = -1

	for loc_i, glob_i in enumerate(indices):
		d = mahalanobis(Model_B["vectors"][glob_i], local_mean, ref_prec)
		dists.append(d)
		if d < min_d:
			min_d = d
			rep_i = loc_i

	# Also print the Joint Probability of the group (min p among pairs)
	# Just to verify our logic.
	print(f"GROUP {i+1} (Size: {len(indices)}) [Radius: {max(dists):.4f}]")

	for idx, s in enumerate(cluster_strs):
		prefix = " [CENTROID] " if idx == rep_i else "            "
		print(f"{prefix} {s}")
	print("-" * 80)

Processing Model A (Statement)...


In [None]:
import pandas as pd
from scipy.spatial.distance import cdist
from sklearn.cluster import AgglomerativeClustering
import numpy as np
from sklearn.covariance import LedoitWolf
from scipy.spatial.distance import mahalanobis

# Single-model baseline: cluster one embedding at a fixed distance threshold.
# NOTE(review): this cell reassigns module-level names that other cells also
# use (data_set, strings, labels, cluster_model), and it depends on `qdata`
# having been loaded by an earlier cell.
data_set = qdata
strings = list(data_set.keys())

# Alternative input (retrieval embedding), currently disabled:
# raw_vectors = np.array(
# 	[data_set[s]["retrieval_embedding_vector"] for s in strings]
# )	# alt
raw_vectors = np.array([data_set[s]["embedding_vector"] for s in strings])


print(f"Original shape: {raw_vectors.shape}")

# Keep only the first 256 dimensions of every vector.
truncated_vectors = raw_vectors[:, :256]

# Row-wise L2 normalisation; the epsilon guards all-zero rows.
norms = np.linalg.norm(truncated_vectors, axis=1, keepdims=True)

cleaned_vectors = truncated_vectors / (norms + 1e-10)

print(f"Truncated shape: {cleaned_vectors.shape}")

# Ledoit-Wolf shrinkage covariance fit on the normalised vectors.
lw = LedoitWolf()
lw.fit(cleaned_vectors)


precision_matrix = lw.precision_
# NOTE(review): mean_vector is not used anywhere else in this cell.
mean_vector = lw.location_

# Fixed clustering cut-off; the trailing value is a previously tried
# alternative. NOTE(review): the origin of this constant is not shown here.
DISTANCE_THRESHOLD = 8.823086354550213	# 9.223086354550212

print(f"Computing pairwise Mahalanobis distances for {len(cleaned_vectors)} vectors...")
print(f"Using Threshold: {DISTANCE_THRESHOLD}")

# Pairwise Mahalanobis distances, then complete-linkage clustering on them.
dist_matrix = cdist(
	cleaned_vectors, cleaned_vectors, metric="mahalanobis", VI=precision_matrix
)
cluster_model = AgglomerativeClustering(
	n_clusters=None,
	distance_threshold=DISTANCE_THRESHOLD,
	metric="precomputed",
	linkage="complete",
)

labels = cluster_model.fit_predict(dist_matrix)


# One row per item: its text, its cluster label and its row index in the
# distance matrix.
df = pd.DataFrame(
	{"string": strings, "label": labels, "vector_index": range(len(strings))}
)

all_clusters = [group for _, group in df.groupby("label")]

# "Significant" means a cluster with more than one member.
significant_clusters = [g for g in all_clusters if len(g) > 1]

significant_clusters.sort(key=lambda x: len(x), reverse=True)

total_groups = len(all_clusters)
trivial_count = total_groups - len(significant_clusters)

# Each baseline cluster as a sorted tuple of member indices, so clusters can
# be compared across thresholds with set operations.
baseline_indices = {
	tuple(g["vector_index"].sort_values().tolist()) for g in significant_clusters
}

if not baseline_indices:
	print(
		"Baseline run resulted in no significant clusters (all size 1 or 0). Skipping stability check."
	)
else:
	print(
		f"\nStarting stability check against {len(baseline_indices)} baseline significant clusters."
	)

	# Raise the threshold in fixed steps until a baseline cluster stops
	# appearing unchanged, or the iteration cap is reached.
	STEP_SIZE = 0.05	# Define a reasonable step size
	MAX_ITERATIONS = 500

	t = DISTANCE_THRESHOLD
	final_stable_threshold = DISTANCE_THRESHOLD

	for i in range(MAX_ITERATIONS):
		t += STEP_SIZE

		cluster_model_t = AgglomerativeClustering(
			n_clusters=None,
			distance_threshold=t,
			metric="precomputed",
			linkage="complete",
		)

		labels_t = cluster_model_t.fit_predict(dist_matrix)

		df_t = pd.DataFrame(
			{"string": strings, "label": labels_t, "vector_index": range(len(strings))}
		)

		all_clusters_t = [group for _, group in df_t.groupby("label")]
		significant_clusters_t = [g for g in all_clusters_t if len(g) > 1]

		current_indices = {
			tuple(g["vector_index"].sort_values().tolist()) for g in significant_clusters_t
		}

		# Stable means every baseline cluster still exists with exactly the
		# same members; new multi-member clusters elsewhere are allowed.
		if not baseline_indices.issubset(current_indices):

			print("\n--- STABILITY BROKEN ---")
			print(f"First merge/change detected at proposed threshold t={t:.6f}")
			print(f"Maximum stable threshold found: {final_stable_threshold:.6f}")
			break

		else:
			final_stable_threshold = t
			# Progress line every 20th iteration only.
			if i % 20 == 0:
				print(
					f"Iteration {i}: Threshold {t:.6f} stable. Significant clusters found: {len(significant_clusters_t)}"
				)

	# for/else: this branch runs only when the loop finished without `break`.
	else:
		print(
			f"\nReached Max Iterations ({MAX_ITERATIONS}). Final stable threshold: {final_stable_threshold:.6f}"
		)

In [None]:
def getMaxStable(start_tau, model_data, step_size=None, max_iterations=500):
	"""Count multi-member clusters at the largest threshold that keeps the baseline intact.

	The distance matrix is clustered at `start_tau` to obtain the baseline
	multi-member clusters. The threshold is then raised step by step; a
	threshold is "stable" while every baseline cluster still exists with
	exactly the same members.

	Args:
		start_tau: Threshold of the baseline clustering.
		model_data: Dict holding at least "dist_matrix".
		step_size: Threshold increment per iteration. Defaults to the
			module-level SEARCH_STEP when None.
		max_iterations: Upper bound on the number of increments (default 500).

	Returns:
		The number of multi-member clusters at the last stable threshold
		(the baseline count if the very first step breaks stability), or 0
		when the baseline has no multi-member cluster.
	"""
	dist_matrix = model_data["dist_matrix"]
	n_items = len(dist_matrix)
	step = SEARCH_STEP if step_size is None else step_size

	def cluster_at(threshold):
		# Complete-linkage clustering of the precomputed distances.
		model = AgglomerativeClustering(
			n_clusters=None,
			distance_threshold=threshold,
			metric="precomputed",
			linkage="complete",
		)
		return model.fit_predict(dist_matrix)

	def multi_member_clusters(labels):
		# Each cluster with more than one member, as a sorted index tuple.
		# Clusters are disjoint, so the set size equals the cluster count.
		df = pd.DataFrame({"label": labels, "idx": range(n_items)})
		return {
			tuple(g["idx"].sort_values().tolist())
			for _, g in df.groupby("label")
			if len(g) > 1
		}

	# --- Baseline Run ---
	baseline = multi_member_clusters(cluster_at(start_tau))
	if not baseline:
		return 0

	# --- Stability Loop ---
	t = start_tau
	last_stable_count = len(baseline)

	for _ in range(max_iterations):
		t += step
		current = multi_member_clusters(cluster_at(t))

		# If baseline is not a subset of current, stability is broken
		if not baseline.issubset(current):
			return last_stable_count

		last_stable_count = len(current)

		# Once every item sits in one cluster, larger thresholds cannot change
		# anything, so the remaining iterations would return this same count.
		if len(current) == 1 and len(next(iter(current))) == n_items:
			break

	return last_stable_count


AV_STABLE = (getMaxStable(TAU_A, Model_A) + getMaxStable(TAU_B, Model_B)) / 2