In [146]:
import os
import json
import numpy as np
import concurrent.futures
from itertools import product

from scipy.spatial.distance import cdist
from scipy.stats import rankdata, expon, beta
from sklearn.covariance import LedoitWolf
from sklearn.cluster import AgglomerativeClustering
from sklearn.mixture import GaussianMixture

# ==============================================================================
# 1. DATA LOADING
# ==============================================================================

# Relative path of the questions JSON file; it is passed to _loadJson to build `qdata`.
QUESTIONS_FILE = "./data/questions_filter_after.json"


def _loadJson(filepath):
	if not os.path.exists(filepath):
		print(f"Warning: File not found: {filepath}")
		return {}
	try:
		with open(filepath, "r", encoding="utf-8") as f:
			return json.load(f)
	except json.JSONDecodeError:
		print(f"Error decoding JSON: {filepath}")
		return {}


# Announce the load, then read the questions file. _loadJson returns {} when the
# file is missing or is not valid JSON, so `qdata` may be empty after this line.
print(">>> Loading Data...")
qdata = _loadJson(QUESTIONS_FILE)

# ==============================================================================
# 2. DISTANCE & ARTIFACT PREPARATION
# ==============================================================================


def getMahalanobisDistances(vectors_a, vectors_b):
	"""
	Pairwise Mahalanobis distances between the rows of two embedding sets.

	Rows are first scaled to (approximately) unit L2 norm. The inverse
	covariance is estimated with Ledoit-Wolf shrinkage on ``vectors_a`` only
	and is then used as the ``VI`` argument of ``cdist``.

	Args:
	    vectors_a: 2-D array-like (rows are vectors); also the sample used to
	        fit the covariance.
	    vectors_b: 2-D array-like with the same number of columns. Passing
	        ``None`` or the very same object as ``vectors_a`` yields the
	        self-distance matrix (the behaviour every existing caller gets).

	Returns:
	    (dist_matrix, precision_matrix) where ``dist_matrix[i, j]`` is the
	    distance between row i of ``vectors_a`` and row j of ``vectors_b``.
	"""

	def _unit_rows(vectors):
		# Small epsilon keeps all-zero rows from dividing by zero.
		vectors = np.asarray(vectors)
		norms = np.linalg.norm(vectors, axis=1, keepdims=True)
		return vectors / (norms + 1e-10)

	cleaned_a = _unit_rows(vectors_a)
	# Previously the second argument was ignored altogether; honour it now,
	# but skip the redundant normalisation when both inputs are the same set.
	if vectors_b is None or vectors_b is vectors_a:
		cleaned_b = cleaned_a
	else:
		cleaned_b = _unit_rows(vectors_b)

	lw = LedoitWolf()
	lw.fit(cleaned_a)
	precision_matrix = lw.precision_

	dist_matrix = cdist(
		cleaned_a, cleaned_b, metric="mahalanobis", VI=precision_matrix
	)
	return dist_matrix, precision_matrix


def _cosineDistance(emb_a, emb_b):
	"""1 - cosine similarity for every (row of emb_a, row of emb_b) pair."""
	norm_products = (
		np.linalg.norm(emb_a, axis=1, keepdims=True)
		@ np.linalg.norm(emb_b, axis=1, keepdims=True).T
	)
	return 1.0 - (emb_a @ emb_b.T) / (norm_products + 1e-10)


def _l1Distance(emb_a, emb_b):
	"""Sum of absolute coordinate differences, reduced over axis 1."""
	differences = emb_a[..., np.newaxis] - emb_b.T
	return np.sum(np.abs(differences), axis=1)


def _l2Distance(emb_a, emb_b):
	"""Euclidean norm of the coordinate differences, reduced over axis 1."""
	differences = emb_a[..., np.newaxis] - emb_b.T
	return np.linalg.norm(differences, axis=1)


def _dotProduct(emb_a, emb_b):
	"""Raw inner products (a similarity score, not a distance)."""
	return emb_a @ emb_b.T


def _mahalanobis(emb_a, emb_b):
	"""Delegates to getMahalanobisDistances; returns (distances, precision)."""
	# Looked up at call time, exactly like the lambda it replaces.
	return getMahalanobisDistances(emb_a, emb_b)


# Maps a metric name to a callable taking (emb_a, emb_b). Every entry returns a
# matrix except "mahalanobis", which returns a (matrix, precision) tuple.
Distance_Processors = {
	"cosine": _cosineDistance,
	"l1": _l1Distance,
	"l2": _l2Distance,
	"dot": _dotProduct,
	"mahalanobis": _mahalanobis,
}


def _prepareModelArtifact(
	raw_vectors,
	semantic_data,
	truncation_dim=256,
	distance_metric="mahalanobis",
	debug=True,
):
	"""
	Build the artifact dict for one embedding model.

	The vectors are cut to their first ``truncation_dim`` columns, the chosen
	entry of ``Distance_Processors`` is applied to that matrix against itself,
	and each row's nearest other row is recorded.

	Returns a dict with keys "dist_matrix", "vectors", "precision" (None
	unless the metric is "mahalanobis"), "semantic_data", "metric" and
	"nn_indices".
	"""
	data_truncated = np.array(raw_vectors)
	input_dim = data_truncated.shape[1]

	if debug and input_dim < truncation_dim:
		print(
			f"Warning: Vector dimension ({input_dim}) < limit ({truncation_dim}). No truncation."
		)

	data_truncated = data_truncated[:, :truncation_dim]
	processed = Distance_Processors[distance_metric](data_truncated, data_truncated)

	# Only the mahalanobis processor hands back a (distances, precision) pair.
	if distance_metric == "mahalanobis":
		pairwise, precision = processed
	else:
		pairwise, precision = processed, None

	# Mask the diagonal on a copy so a row can never pick itself as neighbour.
	without_self = pairwise.copy()
	np.fill_diagonal(without_self, float("inf"))

	return {
		"dist_matrix": pairwise,
		"vectors": data_truncated,
		"precision": precision,
		"semantic_data": semantic_data,
		"metric": distance_metric,
		"nn_indices": np.argmin(without_self, axis=1),
	}


def prepareModelArtifacts(
	data_set, vector_keys, truncation_dim=256, distance_metric="mahalanobis", debug=True
):
	"""
	Build one artifact per vector key, running the per-key work on threads.

	Args:
	    data_set: mapping whose keys become ``semantic_data`` and whose values
	        are indexable by each entry of ``vector_keys``.
	    vector_keys: names of the vector fields to process.
	    truncation_dim, distance_metric, debug: forwarded unchanged to
	        ``_prepareModelArtifact``.

	Returns:
	    dict mapping each vector key to its artifact. An empty ``vector_keys``
	    yields an empty dict (previously ThreadPoolExecutor raised ValueError
	    because it was asked for zero workers).
	"""
	vector_keys = list(vector_keys)
	if not vector_keys:
		return {}

	semantic_data = list(data_set.keys())
	raw_vectors = {
		key: [data_set[s][key] for s in semantic_data] for key in vector_keys
	}

	futures = {}
	# The context manager waits for all submitted work and always releases the
	# worker threads, even if a submit call raises part-way through.
	with concurrent.futures.ThreadPoolExecutor(
		max_workers=len(vector_keys)
	) as executor:
		for key in vector_keys:
			if debug:
				print(f"Processing {key}...")
			futures[key] = executor.submit(
				_prepareModelArtifact,
				raw_vectors[key],
				semantic_data,
				truncation_dim,
				distance_metric,
				debug,
			)

	# .result() re-raises any exception that occurred inside a worker.
	return {key: futures[key].result() for key in vector_keys}


# ==============================================================================
# 3. ACGC CONSENSUS SEARCH (COLD START)
# ==============================================================================


def clusterAndGetArtifacts(dist_matrix, threshold):
	"""
	Complete-linkage agglomerative clustering on a precomputed distance matrix.

	Returns (groups, labels): ``groups`` lists the member indices of every
	cluster holding more than one item, in order of first appearance, and
	``labels`` is the raw label array returned by ``fit_predict``.
	"""
	clusterer = AgglomerativeClustering(
		linkage="complete",
		metric="precomputed",
		distance_threshold=threshold,
		n_clusters=None,
	)
	labels = clusterer.fit_predict(dist_matrix)

	members_by_label = {}
	for position, cluster_id in enumerate(labels):
		if cluster_id not in members_by_label:
			members_by_label[cluster_id] = []
		members_by_label[cluster_id].append(position)

	multi_member = [m for m in members_by_label.values() if len(m) > 1]
	return multi_member, labels


def getNNPairsFromGroups(groups, nn_indices):
	"""
	Collect (i, j) pairs where j is i's nearest neighbour and both share a group.

	Each pair is stored as a sorted tuple, so a mutual nearest-neighbour
	relation contributes a single entry. Groups with fewer than two members
	are ignored.
	"""
	pairs = set()
	for group in groups:
		if len(group) >= 2:
			members = set(group)
			pairs.update(
				tuple(sorted((idx, nn_indices[idx])))
				for idx in group
				if nn_indices[idx] in members
			)
	return pairs


def calculateNTrue(labels_array, target_pairs):
	"""
	Count the distinct cluster labels touched by the indices in ``target_pairs``.

	Returns 0 when there are no pairs or no label array.
	"""
	if not target_pairs or labels_array is None:
		return 0

	touched = set()
	for pair in target_pairs:
		touched.update(pair)

	# An empty `touched` set naturally yields zero distinct labels.
	distinct_labels = {labels_array[idx] for idx in touched}
	return len(distinct_labels)


def createClusteringCache(model_artifacts, tau_range):
	"""
	Cluster every model at every threshold and keep only the changes.

	For each model the thresholds in ``tau_range`` are visited in order; an
	entry ``[groups, labels, nn_pairs]`` is stored under a threshold only when
	its multi-member groups differ from the last stored ones (starting from
	an empty list).

	Returns {model_name: {threshold: [groups, labels, nn_pairs]}}.
	"""
	cache = {}
	for name, artifact in model_artifacts.items():
		per_threshold = {}
		last_stored = []
		for tau in tau_range:
			groups, labels = clusterAndGetArtifacts(artifact["dist_matrix"], tau)
			# List inequality already covers a change in the number of groups.
			if groups == last_stored:
				continue
			per_threshold[tau] = [
				groups,
				labels,
				getNNPairsFromGroups(groups, artifact["nn_indices"]),
			]
			last_stored = groups
		cache[name] = per_threshold
	return cache


def findConsensusViaACGC(clustering_cache, model_keys):
	"""
	Grid-search threshold combinations for the best cross-model consensus.

	For every combination of one cached threshold per model, the pair sets
	are intersected. A non-empty intersection is scored by the mean, over
	models, of ``calculateNTrue`` for that model's labels. The intersection
	with the strictly highest score wins; the first one found is kept on ties.

	Returns the winning pair set, or an empty set if no combination shares a pair.
	"""
	best_score = -1
	best_p_true = set()
	per_model_thresholds = [list(clustering_cache[m].keys()) for m in model_keys]

	for combo in product(*per_model_thresholds):
		chosen = dict(zip(model_keys, combo))
		entries = [clustering_cache[m][t] for m, t in chosen.items()]

		p_true = set.intersection(*(entry[2] for entry in entries))
		if p_true:
			label_count_total = sum(
				calculateNTrue(entry[1], p_true) for entry in entries
			)
			mean_label_count = label_count_total / len(model_keys)
			if mean_label_count > best_score:
				best_score, best_p_true = mean_label_count, p_true

	return best_p_true


# ==============================================================================
# 4. PROBABILISTIC MODELING (EVT & MIXTURE MODELS)
# ==============================================================================


# def computeLocalDensityRanks(dist_matrix, k_neighbors=50):
# 	# method='min' ensures ties get the lower rank
# 	ranks = rankdata(dist_matrix, axis=1, method="min")
# 	# Subtract 1 so the diagonal (self) is 0.0
# 	ranks = ranks - 1.0
# 	normalized_ranks = ranks / k_neighbors
# 	return np.clip(normalized_ranks, 0.0, 1.0)


def computeLocalDensityRanks(dist_matrix, k_neighbors=50, max_valid_dist=None):
	"""
	Row-wise neighbour ranks scaled by ``k_neighbors``, with optional distance gating.

	Args:
	    dist_matrix: 2-D array of distances; ranks are computed along each row
	        with ties sharing the lowest rank, and shifted to start at 0.
	    k_neighbors: divisor applied to the zero-based ranks.
	    max_valid_dist: when given, every entry whose raw distance exceeds it
	        is forced to 1.0 regardless of its rank.

	Returns:
	    Array of the same shape with values clipped to [0, 1].
	"""
	zero_based_ranks = rankdata(dist_matrix, axis=1, method="min") - 1.0
	scaled = zero_based_ranks / k_neighbors

	if max_valid_dist is not None:
		# Entries beyond the horizon are treated as pure background.
		scaled = np.where(dist_matrix > max_valid_dist, 1.0, scaled)

	return np.clip(scaled, 0.0, 1.0)


def computeFusedEVTStatistic(rank_matrices_dict):
	"""Element-wise minimum across all rank matrices held in the dict."""
	layers = np.stack([matrix for matrix in rank_matrices_dict.values()], axis=0)
	return layers.min(axis=0)


def _get_off_diagonal_samples(matrix):
	mask = ~np.eye(matrix.shape[0], dtype=bool)
	return matrix[mask]


def _initializeParametersViaNaiveQuantile(fused_statistic_matrix, signal_quantile=0.01):
	"""
	Seed the mixture parameters by splitting the samples at a low quantile.

	Off-diagonal values at or below the ``signal_quantile`` quantile are
	treated as signal and the rest as noise. The signal rate is the inverse
	of the signal mean. The noise Beta shapes come from moment matching,
	floored at 1.0, and fall back to (1.0, 1.0) when the variance is too
	large for a Beta fit.

	Returns a dict with keys "pi", "lambda", "alpha" and "beta".
	"""
	samples = _get_off_diagonal_samples(fused_statistic_matrix)
	cutoff = np.quantile(samples, signal_quantile)

	signal_part = samples[samples <= cutoff]
	noise_part = samples[samples > cutoff]

	rate = 1.0 / (np.mean(signal_part) + 1e-6)
	mu = np.mean(noise_part)
	var = np.var(noise_part)

	if var >= mu * (1 - mu):
		shape_a, shape_b = 1.0, 1.0
	else:
		# Method-of-moments estimate for a Beta distribution.
		common = (mu * (1 - mu) / (var + 1e-9)) - 1
		shape_a = max(mu * common, 1.0)
		shape_b = max((1 - mu) * common, 1.0)

	return {"pi": signal_quantile, "lambda": rate, "alpha": shape_a, "beta": shape_b}


def _initializeParametersViaConsensus(fused_statistic_matrix, consensus_pairs):
	"""
	Seed the mixture parameters from a set of consensus index pairs.

	With no pairs, a warning is printed and the naive-quantile initialiser is
	used instead. Otherwise the signal rate is the inverse mean of the fused
	statistic at the consensus pairs, "pi" is the number of pairs divided by
	the number of off-diagonal samples, and the noise Beta shapes are moment
	matched on all off-diagonal samples (each floored at 1.0).

	Returns a dict with keys "pi", "lambda", "alpha" and "beta".
	"""
	if not consensus_pairs:
		print("Warning: ACGC found no pairs. Falling back to naive quantile.")
		return _initializeParametersViaNaiveQuantile(fused_statistic_matrix)

	anchor_values = [fused_statistic_matrix[i, j] for i, j in consensus_pairs]
	background = _get_off_diagonal_samples(fused_statistic_matrix)

	bg_mean = np.mean(background)
	bg_var = np.var(background)
	common = (bg_mean * (1 - bg_mean) / (bg_var + 1e-9)) - 1

	return {
		"pi": len(consensus_pairs) / len(background),
		"lambda": 1.0 / (np.mean(anchor_values) + 1e-6),
		"alpha": max(bg_mean * common, 1.0),
		"beta": max((1 - bg_mean) * common, 1.0),
	}


def fitWeibullBetaMixture(
	fused_statistic_matrix, initial_params=None, max_iter=100, tol=1e-4
):
	"""
	EM fit of a two-component mixture with a learned mixing weight ``pi``.

	NOTE(review): this definition is shadowed. A later ``def
	fitWeibullBetaMixture`` in this notebook (the balanced-prior variant)
	rebinds the name, so calls made after that point never reach this body.

	The signal component is evaluated with ``expon.pdf`` (an exponential
	density, despite the function name) and the noise component with
	``beta.pdf``.

	Args:
	    fused_statistic_matrix: square matrix; its off-diagonal entries are
	        the training samples.
	    initial_params: dict with keys "pi", "lambda", "alpha", "beta"; when
	        None the naive-quantile initialiser is used.
	    max_iter: maximum number of EM iterations.
	    tol: stop once the log-likelihood changes by less than this.

	Returns:
	    (posterior, final_params): the signal posterior for every matrix
	    entry and the fitted parameter dict.
	"""
	data = _get_off_diagonal_samples(fused_statistic_matrix)
	# Keep samples strictly inside (0, 1) before evaluating the Beta density.
	data = np.clip(data, 1e-6, 1.0 - 1e-6)

	if initial_params is None:
		params = _initializeParametersViaNaiveQuantile(fused_statistic_matrix)
	else:
		params = initial_params

	pi, lamb, a, b_p = params["pi"], params["lambda"], params["alpha"], params["beta"]
	log_likelihood_old = -np.inf

	for _ in range(max_iter):
		# E-Step
		pdf_signal = expon.pdf(data, scale=1.0 / lamb)
		pdf_noise = beta.pdf(data, a, b_p)

		weighted_signal = pi * pdf_signal
		weighted_noise = (1 - pi) * pdf_noise
		# Small constant avoids division by zero and log(0) below.
		total_evidence = weighted_signal + weighted_noise + 1e-10
		gamma = weighted_signal / total_evidence

		# M-Step
		N_s = np.sum(gamma)
		pi = N_s / len(data)

		# Rate = inverse of the responsibility-weighted mean of the samples.
		weighted_sum_x = np.sum(gamma * data)
		lamb = 1.0 / (weighted_sum_x / (N_s + 1e-10))

		w_noise = 1 - gamma
		w_noise_sum = np.sum(w_noise) + 1e-10
		mu_n = np.sum(w_noise * data) / w_noise_sum
		var_n = np.sum(w_noise * (data - mu_n) ** 2) / w_noise_sum

		# Moment-match the Beta shapes only when the variance admits a Beta
		# fit; otherwise the previous shapes are kept.
		if var_n < mu_n * (1 - mu_n):
			common = (mu_n * (1 - mu_n) / (var_n + 1e-10)) - 1
			a = max(mu_n * common, 1.0)
			b_p = max((1 - mu_n) * common, 1.0)

		# The likelihood uses the evidence from this iteration's E-step,
		# i.e. it is evaluated with the parameters from before the M-step.
		log_likelihood_new = np.sum(np.log(total_evidence))
		if abs(log_likelihood_new - log_likelihood_old) < tol:
			break
		log_likelihood_old = log_likelihood_new

	final_params = {"pi": pi, "lambda": lamb, "alpha": a, "beta": b_p}

	# Compute Full Posterior
	# The exponential density sees the raw matrix; only the Beta input is clipped.
	pdf_S_full = expon.pdf(fused_statistic_matrix, scale=1.0 / lamb)
	pdf_N_full = beta.pdf(np.clip(fused_statistic_matrix, 1e-6, 1 - 1e-6), a, b_p)
	numerator = pi * pdf_S_full
	posterior = numerator / (numerator + (1 - pi) * pdf_N_full + 1e-10)

	return posterior, final_params


# def fitWeibullBetaMixture(
# 	fused_statistic_matrix, initial_params=None, max_iter=20, tol=1e-4
# ):
# 	data = _get_off_diagonal_samples(fused_statistic_matrix)
# 	data = np.clip(data, 1e-6, 1.0 - 1e-6)

# 	if initial_params is None:
# 		params = _initializeParametersViaNaiveQuantile(fused_statistic_matrix)
# 	else:
# 		params = initial_params

# 	pi, lamb, a, b_p = params["pi"], params["lambda"], params["alpha"], params["beta"]
# 	log_likelihood_old = -np.inf

# 	# --- EM LOOP (Learning Shapes with True Priors) ---
# 	for _ in range(max_iter):
# 		# E-Step
# 		pdf_signal = expon.pdf(data, scale=1.0 / lamb)
# 		pdf_noise = beta.pdf(data, a, b_p)

# 		weighted_signal = pi * pdf_signal
# 		weighted_noise = (1 - pi) * pdf_noise
# 		total_evidence = weighted_signal + weighted_noise + 1e-10
# 		gamma = weighted_signal / total_evidence

# 		# M-Step
# 		N_s = np.sum(gamma)
# 		pi = N_s / len(data)

# 		weighted_sum_x = np.sum(gamma * data)
# 		lamb = 1.0 / (weighted_sum_x / (N_s + 1e-10))

# 		w_noise = 1 - gamma
# 		w_noise_sum = np.sum(w_noise) + 1e-10
# 		mu_n = np.sum(w_noise * data) / w_noise_sum
# 		var_n = np.sum(w_noise * (data - mu_n) ** 2) / w_noise_sum

# 		if var_n < mu_n * (1 - mu_n):
# 			common = (mu_n * (1 - mu_n) / (var_n + 1e-10)) - 1
# 			a = max(mu_n * common, 1.0)
# 			b_p = max((1 - mu_n) * common, 1.0)

# 		log_likelihood_new = np.sum(np.log(total_evidence))
# 		if abs(log_likelihood_new - log_likelihood_old) < tol:
# 			break
# 		log_likelihood_old = log_likelihood_new

# 	final_params = {"pi": pi, "lambda": lamb, "alpha": a, "beta": b_p}

# 	# --- FINAL INFERENCE (Balanced Prior) ---
# 	# We calculate the probability assuming P(Signal) = 0.5 vs P(Noise) = 0.5
# 	# This turns the output into a Likelihood Ratio test normalized to [0,1].

# 	pdf_S_full = expon.pdf(fused_statistic_matrix, scale=1.0 / lamb)
# 	pdf_N_full = beta.pdf(np.clip(fused_statistic_matrix, 1e-6, 1 - 1e-6), a, b_p)

# 	# Use 0.5 instead of 'pi' here to ignore class imbalance for detection
# 	numerator = 0.5 * pdf_S_full
# 	posterior = numerator / (numerator + 0.5 * pdf_N_full + 1e-10)

# 	return posterior, final_params


def fitWeibullBetaMixture(
	fused_statistic_matrix, initial_params=None, max_iter=100, tol=1e-4
):
	"""
	Balanced-prior EM for a signal/noise mixture over the fused statistic.

	Both components are weighted 0.5 in every E-step and again at inference,
	so only the component shapes are learned; no mixing weight is estimated
	or returned. The signal density is evaluated with ``expon.pdf`` (an
	exponential, despite the function name) and the noise density with
	``beta.pdf``.

	Args:
	    fused_statistic_matrix: square matrix; off-diagonal entries, clipped
	        to [1e-6, 1 - 1e-6], are the training samples.
	    initial_params: dict providing "lambda", "alpha" and "beta" (any "pi"
	        entry is ignored); None selects the naive-quantile initialiser.
	    max_iter: maximum number of EM iterations.
	    tol: stop once the balanced log-likelihood moves by less than this.

	Returns:
	    (posterior, final_params): the balanced signal posterior for every
	    matrix entry, and a dict with keys "lambda", "alpha", "beta".
	"""
	samples = np.clip(
		_get_off_diagonal_samples(fused_statistic_matrix), 1e-6, 1.0 - 1e-6
	)

	start = (
		_initializeParametersViaNaiveQuantile(fused_statistic_matrix)
		if initial_params is None
		else initial_params
	)
	lamb, a, b_p = start["lambda"], start["alpha"], start["beta"]

	previous_ll = -np.inf
	for _ in range(max_iter):
		# E-step: responsibilities under fixed 50/50 component weights.
		half_signal = 0.5 * expon.pdf(samples, scale=1.0 / lamb)
		half_noise = 0.5 * beta.pdf(samples, a, b_p)
		evidence = half_signal + half_noise + 1e-10
		gamma = half_signal / evidence

		# M-step, signal: rate is the inverse responsibility-weighted mean.
		signal_mass = np.sum(gamma)
		lamb = 1.0 / (np.sum(gamma * samples) / (signal_mass + 1e-10))

		# M-step, noise: weighted moments, then Beta moment matching.
		noise_w = 1 - gamma
		noise_mass = np.sum(noise_w) + 1e-10
		mu_n = np.sum(noise_w * samples) / noise_mass
		var_n = np.sum(noise_w * (samples - mu_n) ** 2) / noise_mass

		# Shapes are only refreshed when the variance admits a Beta fit.
		if var_n < mu_n * (1 - mu_n):
			common = (mu_n * (1 - mu_n) / (var_n + 1e-10)) - 1
			a = max(mu_n * common, 1.0)
			b_p = max((1 - mu_n) * common, 1.0)

		# Convergence is judged on the evidence computed in this E-step.
		current_ll = np.sum(np.log(evidence))
		if abs(current_ll - previous_ll) < tol:
			break
		previous_ll = current_ll

	# Inference on the full matrix; only the Beta input is clipped.
	signal_half = 0.5 * expon.pdf(fused_statistic_matrix, scale=1.0 / lamb)
	noise_half = 0.5 * beta.pdf(
		np.clip(fused_statistic_matrix, 1e-6, 1 - 1e-6), a, b_p
	)
	posterior = signal_half / (signal_half + noise_half + 1e-10)

	return posterior, {"lambda": lamb, "alpha": a, "beta": b_p}


# NOTE: the commented-out variant below errors at the clustering step (inf/NaN values):
# "AgglomerativeClustering does not accept missing values encoded as NaN natively."
# See https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values
# def fitWeibullBetaMixture(

# 	fused_statistic_matrix, initial_params=None, max_iter=20, tol=1e-4
# ):
# 	"""
# 	Fits EVT Mixture Model using Balanced-EM.
# 	Crucial Fix: Forces 50/50 prior during training steps.
# 	"""
# 	data = _get_off_diagonal_samples(fused_statistic_matrix)
# 	data = np.clip(data, 1e-6, 1.0 - 1e-6)

# 	if initial_params is None:
# 		params = _initializeParametersViaNaiveQuantile(fused_statistic_matrix)
# 	else:
# 		params = initial_params

# 	lamb = params["lambda"]
# 	a = params["alpha"]
# 	b_p = params["beta"]

# 	log_likelihood_old = -np.inf

# 	# --- BALANCED EM LOOP ---
# 	for _ in range(max_iter):
# 		# E-Step: Force 50/50 weights
# 		pdf_signal = expon.pdf(data, scale=1.0 / lamb)
# 		pdf_noise = beta.pdf(data, a, b_p)

# 		# Use 0.5 prior to learn SHAPE, ignoring FREQUENCY
# 		weighted_signal = 0.5 * pdf_signal
# 		weighted_noise = 0.5 * pdf_noise

# 		total_evidence = weighted_signal + weighted_noise + 1e-10
# 		gamma = weighted_signal / total_evidence

# 		# M-Step
# 		N_s = np.sum(gamma)
# 		weighted_sum_x = np.sum(gamma * data)
# 		lamb = 1.0 / (weighted_sum_x / (N_s + 1e-10))

# 		w_noise = 1 - gamma
# 		w_noise_sum = np.sum(w_noise) + 1e-10
# 		mu_n = np.sum(w_noise * data) / w_noise_sum
# 		var_n = np.sum(w_noise * (data - mu_n) ** 2) / w_noise_sum

# 		if var_n < mu_n * (1 - mu_n):
# 			common = (mu_n * (1 - mu_n) / (var_n + 1e-10)) - 1
# 			a = max(mu_n * common, 1.0)
# 			b_p = max((1 - mu_n) * common, 1.0)

# 		if abs(np.sum(np.log(total_evidence)) - log_likelihood_old) < tol:
# 			break
# 		log_likelihood_old = np.sum(np.log(total_evidence))

# 	final_params = {"lambda": lamb, "alpha": a, "beta": b_p}

# 	# --- FINAL INFERENCE (Balanced) ---
# 	pdf_S_full = expon.pdf(fused_statistic_matrix, scale=1.0 / lamb)
# 	pdf_N_full = beta.pdf(np.clip(fused_statistic_matrix, 1e-6, 1 - 1e-6), a, b_p)

# 	numerator = 0.5 * pdf_S_full
# 	posterior = numerator / (numerator + 0.5 * pdf_N_full + 1e-10)

# 	return posterior, final_params


def fitGaussianMixture(fused_statistic_matrix, n_components=2, initial_params=None):
	"""
	Fit a 1-D Gaussian mixture to the off-diagonal entries and score every entry.

	The component with the smallest mean is taken as the signal component.
	``initial_params`` is accepted but not used.

	Returns:
	    (posterior, params): the signal-component probability for each matrix
	    entry, and a dict with keys "means", "signal_idx" and "weights".
	"""
	training_column = _get_off_diagonal_samples(fused_statistic_matrix).reshape(-1, 1)

	mixture = GaussianMixture(
		random_state=42, covariance_type="full", n_components=n_components
	)
	mixture.fit(training_column)

	signal_idx = np.argmin(mixture.means_.flatten())

	# Score all entries, diagonal included, then restore the matrix shape.
	every_entry = fused_statistic_matrix.reshape(-1, 1)
	responsibilities = mixture.predict_proba(every_entry)
	posterior = responsibilities[:, signal_idx].reshape(fused_statistic_matrix.shape)

	return posterior, {
		"means": mixture.means_.tolist(),
		"signal_idx": int(signal_idx),
		"weights": mixture.weights_.tolist(),
	}


# ==============================================================================
# 5. TOPOLOGY & CLUSTERING
# ==============================================================================


def computeTransitivityPrior(likelihood_matrix):
	"""
	Matrix product of the likelihood matrix with itself, scaled by its maximum.

	When the maximum is not positive the unscaled product is returned.
	"""
	two_hop = likelihood_matrix @ likelihood_matrix
	peak = np.max(two_hop)
	if peak > 0:
		return two_hop / peak
	return two_hop


def computeFinalPosterior(likelihood_matrix, prior_matrix, weight_factor=0.5):
	"""
	Scale the likelihood by the prior raised to ``weight_factor``.

	The prior is clipped to [0.1, 1.0] first, so it can shrink a likelihood
	by at most a factor of ``0.1 ** weight_factor``.
	"""
	bounded_prior = np.clip(prior_matrix, 0.1, 1.0)
	damping = bounded_prior**weight_factor
	return likelihood_matrix * damping


def extractAndPrintClusters(posterior_matrix, semantic_data, threshold=0.5):
	"""
	Cluster items from a pairwise probability matrix and print each group.

	``1 - posterior`` (clipped to [0, 1]) is used as a precomputed distance
	for complete-linkage agglomerative clustering cut at ``1 - threshold``.
	Groups with more than one member are printed largest first, each with
	its mean off-diagonal probability ("Cohesion") and its members' entries
	from ``semantic_data``. Nothing is returned.
	"""
	distances = np.clip(1.0 - posterior_matrix, 0.0, 1.0)
	clusterer = AgglomerativeClustering(
		linkage="complete",
		metric="precomputed",
		distance_threshold=(1.0 - threshold),
		n_clusters=None,
	)
	labels = clusterer.fit_predict(distances)

	members_by_label = {}
	for position, cluster_id in enumerate(labels):
		if cluster_id not in members_by_label:
			members_by_label[cluster_id] = []
		members_by_label[cluster_id].append(position)

	# sorted() is stable, so equally sized groups keep first-appearance order.
	significant_groups = sorted(
		(m for m in members_by_label.values() if len(m) > 1),
		key=len,
		reverse=True,
	)

	print(f"\n{'='*80}")
	print(f"PROBABILISTIC CLUSTERING RESULTS (Threshold P > {threshold})")
	print(f"{'='*80}")
	print(f"Found {len(significant_groups)} significant groups.\n")

	for i, indices in enumerate(significant_groups):
		within_group = posterior_matrix[np.ix_(indices, indices)]
		off_diagonal = ~np.eye(len(indices), dtype=bool)
		if len(indices) > 1:
			avg_prob = np.mean(within_group[off_diagonal])
		else:
			avg_prob = 1.0

		print(f"GROUP {i+1} (Size: {len(indices)}) [Cohesion: {avg_prob:.4f}]")
		for idx in indices:
			print(f" - {semantic_data[idx]}")
		print("-" * 80)


def findConsensusViaJaccard(clustering_cache, model_keys):
	"""
	Grid-search threshold combinations for the highest Jaccard agreement.

	For every combination of one cached threshold per model, the models' pair
	sets are intersected and united; the score is |intersection| / |union|.
	Combinations with an empty intersection are skipped. The intersection
	with the highest score is returned, with ties going to the larger set.
	An empty set is returned when no combination shares a pair.
	"""
	per_model_thresholds = [list(clustering_cache[m].keys()) for m in model_keys]
	best_score = -1
	best_p_true = set()

	for combo in product(*per_model_thresholds):
		chosen = dict(zip(model_keys, combo))
		pair_sets = [clustering_cache[m][t][2] for m, t in chosen.items()]

		shared = set.intersection(*pair_sets)
		if not shared:
			continue

		# `shared` is non-empty and contained in the union, so the union is
		# never empty and the division below is always defined.
		combined = set.union(*pair_sets)
		jaccard = len(shared) / len(combined)

		# Tuple comparison: higher score first, larger intersection on ties.
		if (jaccard, len(shared)) > (best_score, len(best_p_true)):
			best_score, best_p_true = jaccard, shared

	return best_p_true


# ==============================================================================
# 6. MAIN EXECUTION
# ==============================================================================

print(">>> Preparing Model Artifacts...")
model_keys = ["embedding_vector", "retrieval_embedding_vector"]
artifacts = prepareModelArtifacts(qdata, model_keys, truncation_dim=256, debug=False)

# Phase 1 used to be pasted twice back to back, which rebuilt the whole
# clustering cache a second time with identical inputs; it now runs once.
print("\n>>> Phase 1: Deriving Consensus Priors (ACGC)...")
# Sweep thresholds from 0.1 up to the largest distance seen in any model.
max_dist = max(np.max(a["dist_matrix"]) for a in artifacts.values())
tau_range = np.arange(0.1, max_dist, 0.05)
cache = createClusteringCache(artifacts, tau_range)

consensus_pairs = findConsensusViaACGC(cache, model_keys)
# consensus_pairs = findConsensusViaJaccard(cache, model_keys)
# ^ Does not work here: the run ends with sklearn's error "AgglomerativeClustering
#   does not accept missing values encoded as NaN natively."
# Important: ACGC does not yield precise labels, only the most agreed-upon pair sets.
print(f"Consensus Anchors Found: {len(consensus_pairs)}")


def findConsensusViaJaccard(clustering_cache, model_keys):
	"""
	Grid-search threshold combinations for the highest Jaccard agreement.

	For every combination of one cached threshold per model, the models' pair
	sets are intersected and united; the score is |intersection| / |union|.
	Combinations with an empty intersection are skipped. The intersection
	with the highest score is returned, with ties going to the larger set.
	An empty set is returned when no combination shares a pair.

	NOTE: this notebook declares this function more than once with the same
	logic; whichever definition ran last is the one in effect.
	"""
	per_model_thresholds = [list(clustering_cache[m].keys()) for m in model_keys]
	best_score = -1
	best_p_true = set()

	for combo in product(*per_model_thresholds):
		chosen = dict(zip(model_keys, combo))
		pair_sets = [clustering_cache[m][t][2] for m, t in chosen.items()]

		shared = set.intersection(*pair_sets)
		if not shared:
			continue

		# `shared` is non-empty and contained in the union, so the union is
		# never empty and the division below is always defined.
		combined = set.union(*pair_sets)
		jaccard = len(shared) / len(combined)

		# Tuple comparison: higher score first, larger intersection on ties.
		if (jaccard, len(shared)) > (best_score, len(best_p_true)):
			best_score, best_p_true = jaccard, shared

	return best_p_true


# --- STEP 1.5: Derive Strict Safety Horizon ---
# Gather, across every model, the distance of each consensus pair. The pairs
# come from findConsensusViaACGC (the Jaccard variant is currently disabled).
consensus_dists = []
for key in model_keys:
	mat = artifacts[key]["dist_matrix"]
	for i, j in consensus_pairs:
		consensus_dists.append(mat[i, j])

if consensus_dists:
	max_consensus_dist = np.max(consensus_dists)
	# Author's note from an earlier run: seed pairs were very tight (approx Dist 8.8).
	# 1.5x buffer -> ~13.2. This allows for valid fuzzy matches
	# but filters out the "Child Privacy" hallucination (Dist ~15+).
	safety_horizon = max_consensus_dist * 1.5
	print(
		f"Safety Horizon Derived: {safety_horizon:.4f} (Max Seed: {max_consensus_dist:.4f})"
	)
else:
	# Fallback when the consensus search finds no pairs: None disables the
	# distance gating inside computeLocalDensityRanks.
	safety_horizon = None
	print("Warning: No Jaccard seeds. Safety Horizon disabled.")

# --- STEP 2: Probabilistic Modeling ---
print("\n>>> Phase 2: Running EVT Pipeline...")
k_neighbors = 40

rank_matrices = {}
for key, artifact in artifacts.items():
	# The safety horizon gates out neighbours that are ranked close but lie far away.
	rank_matrices[key] = computeLocalDensityRanks(
		artifact["dist_matrix"], k_neighbors=k_neighbors, max_valid_dist=safety_horizon
	)

fused_matrix = computeFusedEVTStatistic(rank_matrices)

# Re-init params since the rank inputs have changed (gated items are now 1.0)
init_params = _initializeParametersViaConsensus(fused_matrix, consensus_pairs)
print(f"Bootstrapped Params: {init_params}")

# NOTE(review): the bootstrapped `init_params` are printed but not handed to the
# fit, which therefore falls back to the naive-quantile initialiser. Confirm
# whether `initial_params=init_params` was intended before changing it.
likelihood_matrix, final_params = fitWeibullBetaMixture(
	fused_matrix, initial_params=None
)
print("EM Model Converged.")


# --- STEP 3: Topology & Results ---
print("\n>>> Phase 3: Final Inference")
prior_matrix = computeTransitivityPrior(likelihood_matrix)
posterior_matrix = computeFinalPosterior(likelihood_matrix, prior_matrix)


final_matrix = posterior_matrix * prior_matrix
print("Applying Conservative Symmetrization...")
# Row-wise ranks make the matrix asymmetric; keep the weaker of the two
# directions so a pair only scores high when both items agree.
final_symmetric_matrix = np.minimum(final_matrix, final_matrix.T)
# Cluster on the symmetrised matrix. The asymmetric `final_matrix` was passed
# here before, leaving the symmetrisation above without any effect.
extractAndPrintClusters(
	final_symmetric_matrix, artifacts[model_keys[0]]["semantic_data"], threshold=0.5
)

>>> Loading Data...


In [131]:
# ==============================================================================
# 6. MAIN EXECUTION
# ==============================================================================

print(">>> Preparing Model Artifacts...")
model_keys = ["embedding_vector", "retrieval_embedding_vector"]
artifacts = prepareModelArtifacts(qdata, model_keys, truncation_dim=256, debug=False)

# Phase 1 used to be pasted twice back to back, which rebuilt the whole
# clustering cache a second time with identical inputs; it now runs once.
print("\n>>> Phase 1: Deriving Consensus Priors (ACGC)...")
# Sweep thresholds from 0.1 up to the largest distance seen in any model.
max_dist = max(np.max(a["dist_matrix"]) for a in artifacts.values())
tau_range = np.arange(0.1, max_dist, 0.05)
cache = createClusteringCache(artifacts, tau_range)

consensus_pairs = findConsensusViaACGC(cache, model_keys)
# consensus_pairs = findConsensusViaJaccard(cache, model_keys)
# ^ Does not work here: the run ends with sklearn's error "AgglomerativeClustering
#   does not accept missing values encoded as NaN natively."
# Important: ACGC does not yield precise labels, only the most agreed-upon pair sets.
print(f"Consensus Anchors Found: {len(consensus_pairs)}")


def findConsensusViaJaccard(clustering_cache, model_keys):
	"""
	Grid-search threshold combinations for the highest Jaccard agreement.

	For every combination of one cached threshold per model, the models' pair
	sets are intersected and united; the score is |intersection| / |union|.
	Combinations with an empty intersection are skipped. The intersection
	with the highest score is returned, with ties going to the larger set.
	An empty set is returned when no combination shares a pair.

	NOTE: this notebook declares this function more than once with the same
	logic; whichever definition ran last is the one in effect.
	"""
	per_model_thresholds = [list(clustering_cache[m].keys()) for m in model_keys]
	best_score = -1
	best_p_true = set()

	for combo in product(*per_model_thresholds):
		chosen = dict(zip(model_keys, combo))
		pair_sets = [clustering_cache[m][t][2] for m, t in chosen.items()]

		shared = set.intersection(*pair_sets)
		if not shared:
			continue

		# `shared` is non-empty and contained in the union, so the union is
		# never empty and the division below is always defined.
		combined = set.union(*pair_sets)
		jaccard = len(shared) / len(combined)

		# Tuple comparison: higher score first, larger intersection on ties.
		if (jaccard, len(shared)) > (best_score, len(best_p_true)):
			best_score, best_p_true = jaccard, shared

	return best_p_true


# --- STEP 1.5: Derive Strict Safety Horizon ---
# Gather, across every model, the distance of each consensus pair. The pairs
# come from findConsensusViaACGC (the Jaccard variant is currently disabled).
consensus_dists = []
for key in model_keys:
	mat = artifacts[key]["dist_matrix"]
	for i, j in consensus_pairs:
		consensus_dists.append(mat[i, j])

if consensus_dists:
	max_consensus_dist = np.max(consensus_dists)
	# Author's note from an earlier run: seed pairs were very tight (approx Dist 8.8).
	# 1.5x buffer -> ~13.2. This allows for valid fuzzy matches
	# but filters out the "Child Privacy" hallucination (Dist ~15+).
	safety_horizon = max_consensus_dist * 1.5
	print(
		f"Safety Horizon Derived: {safety_horizon:.4f} (Max Seed: {max_consensus_dist:.4f})"
	)
else:
	# Fallback when the consensus search finds no pairs: None disables the
	# distance gating inside computeLocalDensityRanks.
	safety_horizon = None
	print("Warning: No Jaccard seeds. Safety Horizon disabled.")

# --- STEP 2: Probabilistic Modeling ---
print("\n>>> Phase 2: Running EVT Pipeline...")
k_neighbors = 40

rank_matrices = {}
for key, artifact in artifacts.items():
	# The safety horizon gates out neighbours that are ranked close but lie far away.
	rank_matrices[key] = computeLocalDensityRanks(
		artifact["dist_matrix"], k_neighbors=k_neighbors, max_valid_dist=safety_horizon
	)

fused_matrix = computeFusedEVTStatistic(rank_matrices)

# Re-init params since the rank inputs have changed (gated items are now 1.0)
init_params = _initializeParametersViaConsensus(fused_matrix, consensus_pairs)
print(f"Bootstrapped Params: {init_params}")

# NOTE(review): the bootstrapped `init_params` are printed but not handed to the
# fit, which therefore falls back to the naive-quantile initialiser. Confirm
# whether `initial_params=init_params` was intended before changing it.
likelihood_matrix, final_params = fitWeibullBetaMixture(
	fused_matrix, initial_params=None
)
print("EM Model Converged.")


# --- STEP 3: Topology & Results ---
print("\n>>> Phase 3: Final Inference")
prior_matrix = computeTransitivityPrior(likelihood_matrix)
posterior_matrix = computeFinalPosterior(likelihood_matrix, prior_matrix)


final_matrix = posterior_matrix * prior_matrix
print("Applying Conservative Symmetrization...")
# Row-wise ranks make the matrix asymmetric; keep the weaker of the two
# directions so a pair only scores high when both items agree.
final_symmetric_matrix = np.minimum(final_matrix, final_matrix.T)
# Cluster on the symmetrised matrix. The asymmetric `final_matrix` was passed
# here before, leaving the symmetrisation above without any effect.
extractAndPrintClusters(
	final_symmetric_matrix, artifacts[model_keys[0]]["semantic_data"], threshold=0.5
)

>>> Preparing Model Artifacts...


In [None]:
print("\n>>> Phase 1: Deriving Consensus Priors (ACGC)...")
# Sweep thresholds from 0.1 up to the largest distance seen in any model.
max_dist = max(np.max(a["dist_matrix"]) for a in artifacts.values())
tau_range = np.arange(0.1, max_dist, 0.05)
cache = createClusteringCache(artifacts, tau_range)

consensus_pairs = findConsensusViaACGC(cache, model_keys)
# consensus_pairs = findConsensusViaJaccard(cache, model_keys)
# ^ Does not work here: the run ends with sklearn's error "AgglomerativeClustering
#   does not accept missing values encoded as NaN natively."
# Important: ACGC does not yield precise labels, only the most agreed-upon pair sets.
print(f"Consensus Anchors Found: {len(consensus_pairs)}")


def findConsensusViaJaccard(clustering_cache, model_keys):
	"""
	Finds the 'Platinum' set: pairs that appear in ALL models at the threshold
	configuration that maximizes Intersection-over-Union (Jaccard score).

	Every combination of one threshold per model is scored as
	|intersection| / |union| of the models' pair sets. The intersection of the
	best-scoring combination is returned; ties on the score are broken in
	favour of the larger intersection.

	Args:
		clustering_cache: Mapping model key -> {threshold: entry}. Only
			entry[2] is read here; it is assumed to be a `set` of pairs
			(TODO confirm against createClusteringCache).
		model_keys: Model keys to include in the consensus.

	Returns:
		set: The consensus pairs, or an empty set when no model is given or no
		configuration has a non-empty intersection.
	"""
	model_keys = list(model_keys)
	# With no models, product() would yield a single empty configuration and
	# set.intersection() without arguments raises TypeError; there is simply no
	# consensus to report in that case.
	if not model_keys:
		return set()

	threshold_axes = [list(clustering_cache[m].keys()) for m in model_keys]
	best_score = -1
	best_p_true = set()

	# Grid search over every per-model threshold combination.
	for thresholds in product(*threshold_axes):
		pair_sets = [clustering_cache[m][t][2] for m, t in zip(model_keys, thresholds)]

		# INTERSECTION (The Core): pairs every model agrees on.
		p_true = set.intersection(*pair_sets)
		if not p_true:
			continue

		# UNION (The Broad Net): never empty here because it contains p_true,
		# so the division below is safe.
		p_union = set.union(*pair_sets)
		current_score = len(p_true) / len(p_union)

		# Prefer the higher Jaccard score; on an exact tie prefer the larger
		# consensus set so trivial one-pair agreements do not win.
		if current_score > best_score:
			best_score = current_score
			best_p_true = p_true
		elif current_score == best_score and len(p_true) > len(best_p_true):
			best_p_true = p_true

	return best_p_true


# --- STEP 1.5: Derive Strict Safety Horizon ---
# Gather the distance of every consensus pair under each model's distance matrix.
consensus_dists = []
for key in model_keys:
	mat = artifacts[key]["dist_matrix"]
	for i, j in consensus_pairs:
		consensus_dists.append(mat[i, j])

if consensus_dists:
	max_consensus_dist = np.max(consensus_dists)
	# The horizon is the largest seed distance plus a 50% buffer.
	# Original tuning note: Jaccard pairs are very tight (approx Dist 8.8).
	# 1.5x buffer -> ~13.2. This allows for valid fuzzy matches
	# but filters out the "Child Privacy" hallucination (Dist ~15+).
	# NOTE(review): the seeds in this cell come from findConsensusViaACGC, not
	# Jaccard (recorded run: max seed 9.2009 -> horizon 13.8013); re-check the
	# figures quoted above.
	safety_horizon = max_consensus_dist * 1.5
	print(
		f"Safety Horizon Derived: {safety_horizon:.4f} (Max Seed: {max_consensus_dist:.4f})"
	)
else:
	# Fallback when no consensus seeds were found (rare): disable the gate.
	safety_horizon = None
	print("Warning: No Jaccard seeds. Safety Horizon disabled.")

# --- STEP 2: Probabilistic Modeling ---
print("\n>>> Phase 2: Running EVT Pipeline...")
k_neighbors = 40

rank_matrices = {}
for key, artifact in artifacts.items():
	# PASS THE SAFETY HORIZON HERE
	rank_matrices[key] = computeLocalDensityRanks(
		artifact["dist_matrix"], k_neighbors=k_neighbors, max_valid_dist=safety_horizon
	)

fused_matrix = computeFusedEVTStatistic(rank_matrices)

# Re-init params since the rank inputs have changed (gated items are now 1.0)
# NOTE(review): init_params is only printed; the fit below is still called with
# initial_params=None. Confirm whether the bootstrapped values should be passed.
init_params = _initializeParametersViaConsensus(fused_matrix, consensus_pairs)
print(f"Bootstrapped Params: {init_params}")

likelihood_matrix, final_params = fitWeibullBetaMixture(
	fused_matrix, initial_params=None
)
print("EM Model Converged.")


# --- STEP 3: Topology & Results ---
print("\n>>> Phase 3: Final Inference")
prior_matrix = computeTransitivityPrior(likelihood_matrix)
posterior_matrix = computeFinalPosterior(likelihood_matrix, prior_matrix)


# NOTE(review): prior_matrix is already an input to computeFinalPosterior;
# confirm that multiplying by it a second time is intended.
final_matrix = posterior_matrix * prior_matrix
print("Applying Conservative Symmetrization...")
# Element-wise minimum of the matrix and its transpose: a pair only keeps the
# score of its weaker direction.
final_symmetric_matrix = np.minimum(final_matrix, final_matrix.T)
# Cluster on the symmetrized matrix. Passing final_matrix here would silently
# discard the symmetrization computed on the previous line.
# NOTE(review): the recorded run of this cell stopped after this point with
# "ValueError: Input X contains NaN" from AgglomerativeClustering (presumably
# raised inside extractAndPrintClusters). The NaNs likely originate upstream -
# mean/var RuntimeWarnings are printed right after the bootstrap step - confirm.
extractAndPrintClusters(
	final_symmetric_matrix, artifacts[model_keys[0]]["semantic_data"], threshold=0.5
)


>>> Phase 1: Deriving Consensus Priors (ACGC)...
Consensus Anchors Found: 5
Safety Horizon Derived: 13.8013 (Max Seed: 9.2009)

>>> Phase 2: Running EVT Pipeline...
Bootstrapped Params: {'pi': 1.8526752630798872e-05, 'lambda': np.float64(39.99840006399744), 'alpha': 1.0, 'beta': 1.0}


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


EM Model Converged.

>>> Phase 3: Final Inference
Applying Conservative Symmetrization...


ValueError: Input X contains NaN.
AgglomerativeClustering does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

Safety Horizon Derived: 13.8013 (Max Seed: 9.2009)

>>> Phase 2: Running EVT Pipeline...
Bootstrapped Params: {'pi': 1.8526752630798872e-05, 'lambda': np.float64(39.99840006399744), 'alpha': 1.0, 'beta': 1.0}


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


EM Model Converged.

>>> Phase 3: Final Inference
Applying Conservative Symmetrization...


ValueError: Input X contains NaN.
AgglomerativeClustering does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

np.float64(13.80133726823928)

In [None]:
# --- STEP 2: Probabilistic Modeling ---
# Re-runs Steps 2 and 3 using safety_horizon and consensus_pairs from the
# Phase 1 / Step 1.5 cell.
print("\n>>> Phase 2: Running EVT Pipeline...")
k_neighbors = 40

rank_matrices = {}
for key, artifact in artifacts.items():
	# PASS THE SAFETY HORIZON HERE
	rank_matrices[key] = computeLocalDensityRanks(
		artifact["dist_matrix"], k_neighbors=k_neighbors, max_valid_dist=safety_horizon
	)

fused_matrix = computeFusedEVTStatistic(rank_matrices)

# Re-init params since the rank inputs have changed (gated items are now 1.0)
# NOTE(review): init_params is only printed; the fit below is still called with
# initial_params=None. Confirm whether the bootstrapped values should be passed.
init_params = _initializeParametersViaConsensus(fused_matrix, consensus_pairs)
print(f"Bootstrapped Params: {init_params}")

likelihood_matrix, final_params = fitWeibullBetaMixture(
	fused_matrix, initial_params=None
)
print("EM Model Converged.")


# --- STEP 3: Topology & Results ---
print("\n>>> Phase 3: Final Inference")
prior_matrix = computeTransitivityPrior(likelihood_matrix)
posterior_matrix = computeFinalPosterior(likelihood_matrix, prior_matrix)


# The extra multiplication by prior_matrix was disabled in this cell, which left
# final_matrix undefined here: the cell only ran because a stale final_matrix
# from an earlier cell was still in the kernel, and would raise NameError after
# Restart & Run All. Use this cell's own posterior instead.
final_matrix = posterior_matrix
print("Applying Conservative Symmetrization...")
# Element-wise minimum of the matrix and its transpose: a pair only keeps the
# score of its weaker direction.
final_symmetric_matrix = np.minimum(final_matrix, final_matrix.T)
# Cluster on the symmetrized matrix. Passing final_matrix here would silently
# discard the symmetrization computed on the previous line.
# NOTE(review): the recorded run of this cell stopped after this point with
# "ValueError: Input X contains NaN" from AgglomerativeClustering (presumably
# raised inside extractAndPrintClusters); the NaNs likely originate upstream.
extractAndPrintClusters(
	final_symmetric_matrix, artifacts[model_keys[0]]["semantic_data"], threshold=0.5
)


>>> Phase 2: Running EVT Pipeline...
Bootstrapped Params: {'pi': 1.8526752630798872e-05, 'lambda': np.float64(39.99840006399744), 'alpha': 1.0, 'beta': 1.0}


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


EM Model Converged.

>>> Phase 3: Final Inference
Applying Conservative Symmetrization...


ValueError: Input X contains NaN.
AgglomerativeClustering does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
import re


def computeLexicalVerificationPrior(semantic_data, sensitivity=0.8):
	"""
	Builds a symmetric N x N penalty matrix from token-level disagreements.

	Each text is lower-cased, split into word tokens and stripped of stopwords.
	For two texts whose token sets differ, the entry is
	exp(-sensitivity * k), where k counts the tokens present in exactly one of
	the two sets. Pairs with identical token sets, and the diagonal, stay 1.0.

	Args:
		semantic_data: Sequence of strings, one per item.
		sensitivity: Decay rate applied per mismatching token.

	Returns:
		np.ndarray of shape (N, N), dtype float64.
	"""
	count = len(semantic_data)
	penalties = np.ones((count, count), dtype=np.float64)

	# 'privacy' and 'policy' are deliberately kept out of this list (user request).
	ignored_words = frozenset(
		(
			"the", "a", "an", "of", "to", "in", "for", "on", "by", "with",
			"is", "are", "was", "were", "be", "been", "that", "this", "it",
			"not", "or", "and", "does", "affirm",
		)
	)
	word_pattern = re.compile(r"\b\w+\b")

	# One set of content words per input text.
	token_sets = [
		{word for word in word_pattern.findall(text.lower()) if word not in ignored_words}
		for text in semantic_data
	]

	# Only the upper triangle is visited; each penalty is mirrored below the diagonal.
	for row, row_tokens in enumerate(token_sets):
		for col in range(row + 1, count):
			mismatches = len(row_tokens ^ token_sets[col])
			if mismatches == 0:
				continue
			weight = np.exp(-sensitivity * mismatches)
			penalties[row, col] = weight
			penalties[col, row] = weight

	return penalties

)


>>> Phase 2: Running EVT Pipeline...
Bootstrapped Params: {'pi': 0.00047428486734845115, 'lambda': np.float64(62.555015132292255), 'alpha': np.float64(1.3946512320706785), 'beta': np.float64(3.024188090720149)}
EM Model Converged.
Applying Lexical Verification Prior...

>>> Phase 3: Final Inference
Applying Conservative Symmetrization...

PROBABILISTIC CLUSTERING RESULTS (Threshold P > 0.5)
Found 13 significant groups.

GROUP 1 (Size: 2) [Cohesion: 0.5316]
 - Does the privacy policy affirm that security measures are designed to protect personal data from unauthorized access?
 - Does the privacy policy affirm that security measures are designed to protect personal data from unauthorized disclosure?
--------------------------------------------------------------------------------
GROUP 2 (Size: 2) [Cohesion: 0.5048]
 - Does the privacy policy affirm that the Services are not directed to children under 13?
 - Does the privacy policy affirm that the Services are not intended for children u

In [None]:
# Compare the largest consensus-pair distance with the largest value in dist_data.
print(np.max(consensus_dists))
# NOTE(review): dist_data is not assigned in this cell; it must already exist in
# the kernel - confirm where it is defined.
print(np.max(dist_data))

# Values printed when this cell was run (max of consensus_dists, then max of dist_data):
# 17.06319536285627
# 25.353436297260522

17.06319536285627
25.353436297260522


In [None]:
# Experiment cell: overrides the safety horizon and neighbour count by hand,
# then re-runs Steps 2-4 and prints the resulting clusters.
# NOTE(review): this assignment replaces any safety_horizon derived from the
# consensus distances in an earlier cell.
safety_horizon = (
	1	# Changing this doesn't seem to matter; even when set to 1 the output is the same
)

# --- STEP 2: Probabilistic Modeling ---
print("\n>>> Phase 2: Running EVT Pipeline...")
k_neighbors = 1	# Changing this doesn't seem to matter either; even 1 prints the same

rank_matrices = {}
for key, artifact in artifacts.items():
	# PASS THE SAFETY HORIZON HERE
	rank_matrices[key] = computeLocalDensityRanks(
		artifact["dist_matrix"], k_neighbors=k_neighbors, max_valid_dist=safety_horizon
	)

fused_matrix = computeFusedEVTStatistic(rank_matrices)

# Re-init params since the rank inputs have changed (gated items are now 1.0)
# NOTE(review): init_params is only printed; the fit below is still called with
# initial_params=None. Confirm whether the bootstrapped values should be passed.
init_params = _initializeParametersViaConsensus(fused_matrix, consensus_pairs)
print(f"Bootstrapped Params: {init_params}")

likelihood_matrix, final_params = fitWeibullBetaMixture(
	fused_matrix, initial_params=None
)
print("EM Model Converged.")
# --- STEP 4: Lexical Verification (Inserted) ---
# Runs before Step 3 below despite its number.
# NOTE(review): lexical_prior is computed here but not applied anywhere in this
# cell, because the multiplication further down is commented out.
print("Applying Lexical Verification Prior...")
lexical_prior = computeLexicalVerificationPrior(
	artifacts[model_keys[0]]["semantic_data"],
	sensitivity=0.4,	# Tunable: a higher value such as 0.8 penalizes single-word swaps heavily
)

# --- STEP 3: Topology & Results ---
print("\n>>> Phase 3: Final Inference")
prior_matrix = computeTransitivityPrior(likelihood_matrix)
posterior_matrix = computeFinalPosterior(likelihood_matrix, prior_matrix)


final_matrix = posterior_matrix	# lexical penalty currently disabled: "* lexical_prior"
print("Applying Conservative Symmetrization...")
# Element-wise minimum of the matrix and its transpose: a pair only keeps the
# score of its weaker direction. The symmetrized matrix is what gets clustered.
final_symmetric_matrix = np.minimum(final_matrix, final_matrix.T)
extractAndPrintClusters(
	final_symmetric_matrix, artifacts[model_keys[0]]["semantic_data"], threshold=0.5
)


>>> Phase 2: Running EVT Pipeline...
Bootstrapped Params: {'pi': 0.00047428486734845115, 'lambda': np.float64(62.555015132292255), 'alpha': np.float64(1.3946512320706785), 'beta': np.float64(3.024188090720149)}
EM Model Converged.
Applying Lexical Verification Prior...

>>> Phase 3: Final Inference
Applying Conservative Symmetrization...

PROBABILISTIC CLUSTERING RESULTS (Threshold P > 0.5)
Found 64 significant groups.

GROUP 1 (Size: 20) [Cohesion: 0.8015]
 - Does the privacy policy affirm that the company implements commercially reasonable technical measures to protect Personal Data?
 - Does the privacy policy affirm that the company implements commercially reasonable organizational measures to protect Personal Data?
 - Does the privacy policy affirm that the company relies on Standard Contractual Clauses (SCCs) for transfers to countries without an adequacy decision?
 - Does the privacy policy affirm that the company relies on Standard Contractual Clauses (SCCs) for transfers to ju

In [None]:
# # --- STEP 2: Probabilistic Modeling ---
# print("\n>>> Phase 2: Running EVT Pipeline...")

# rank_matrices = {}
# for key, artifact in artifacts.items():
# 	rank_matrices[key] = computeLocalDensityRanks(artifact["dist_matrix"], k_neighbors=30)

# fused_matrix = computeFusedEVTStatistic(rank_matrices)
# init_params = _initializeParametersViaConsensus(fused_matrix, consensus_pairs)
# print(f"Bootstrapped Params: {init_params}")

# likelihood_matrix, final_params = fitWeibullBetaMixture(
# 	fused_matrix, initial_params=init_params
# )
# print("EM Model Converged.")


# # --- STEP 3: Topology & Results ---
# print("\n>>> Phase 3: Final Inference...")
# prior_matrix = computeTransitivityPrior(likelihood_matrix)
# posterior_matrix = computeFinalPosterior(likelihood_matrix, prior_matrix)

# extractAndPrintClusters(
# 	posterior_matrix, artifacts[model_keys[0]]["semantic_data"], threshold=0.5
# )