In [None]:
# This is a refined and expanded specification document. It integrates the formalized notation, the detailed breakdown of distance metrics (Mahalanobis vs. Cosine), and the domain-adaptive lexical strategies (SentencePiece) discussed in the previous steps.

# ---


# To infer equivalence rigorously, we move from loose terminology to specific random variables and sets.

# -   **Hypothesis ($H_{eq}$)**: A binary random variable where $H_{eq}(i,j)=1$ implies question $i$ and question $j$ are semantically and legally equivalent.
# -   **The View Set ($V$)**: The set of all embedding configurations (e.g., `BERT_CLS`, `GEMINI_RETRIEVAL`).
# -   **The Metric Set ($M$)**: The set of geometric interpretations of distance: $M = \{\text{COSINE}, \text{EUCLIDEAN}, \text{MAHALANOBIS}\}$.
# -   **Configuration ($C$)**: A tuple $(v, m)$ representing a specific view and metric combination.
# -   **Raw Distance ($d_{v,m}(i,j)$)**: The scalar distance between vectors of $i$ and $j$ under configuration $(v, m)$.
# -   **Normalized Distance ($\phi_{v,m}(d)$)**: The Cumulative Distribution Function (CDF) value of a distance $d$ within configuration $(v,m)$. This maps raw distances to a percentile $p \in [0,1]$, allowing comparison across different metrics.
# -   **Lexical Sequence ($T_i$)**: The sequence of integer token IDs produced by a domain-specific tokenizer (SentencePiece) trained on the **Policy Corpus**.
# -   **Information Weight ($W(t)$)**: The self-information (surprisal) of a token $t$, defined as $-\log P(t)$ from the tokenizer's unigram language model.

# ---


# We extend the input data types to support metric-specific caching and tokenization data.

# ```typescript
# // 1.1 Core Identifiers
# type TQuestionString = `Does the privacy policy affirm that ${string}?`;
# type TViewKey = string; // e.g., 'gemini_retrieval_query'
# type TMetricType = 'COSINE' | 'EUCLIDEAN' | 'MAHALANOBIS';

# // 1.2 Grounding Data (Origin)
# interface IGrounding {
#     document_hash: string;
#     subsection_hash: string;
#     substring_indices: [number, number]; // [Start, End]
#     supporting_text: string;
# }

# // 1.3 Computed Distance Cache
# // Keyed by View + Metric to allow geometric specific lookups
# interface IDistanceData {
#     [view_key: TViewKey]: {
#         [metric: TMetricType]: {
#             [question_hash: string]: Array<{
#                 target_hash: string;
#                 raw_distance: number;       // d(i,j)
#                 percentile_rank: number;    // phi(d(i,j)) - Normalized 0-1
#                 rank_index: number;         // Integer rank (1st NN, 2nd NN...)
#             }>
#         }
#     }
# }

# // 1.4 Lexical Data (SentencePiece)
# interface ILexicalData {
#     [question_hash: string]: {
#         token_ids: number[];        // The integer sequence
#         token_weights: number[];    // -log(P(t)) for each token
#         special_tokens: {           // Boolean flags for hard filters
#             has_negation: boolean;
#             has_temporal_condition: boolean; // e.g., "72 hours", "30 days"
#         }
#     }
# }
# ```

# ---


# These are the inputs for $P(H_{eq} | \textrm{factors})$. We categorize them by the signal source.

# ### Geometric Factors (Metric-Dependent)
# *Measures derived from the vector space geometry of a specific View ($v$) and Metric ($m$).*

# | ID | Factor Name | Computation | Rationale |
# | :--- | :--- | :--- | :--- |
# | **G1** | **Normalized Proximity** | $\phi_{v,m}(d(i,j))$ | Using percentiles allows us to compare "closeness" between Mahalanobis (ellipsoid) and Cosine (cone) spaces. |
# | **G2** | **Lowe’s Ratio** | $d(i,j) / d(i, \text{NN}_2)$ | Measures distinctiveness. If $j$ is much closer than the next neighbor, it's a specific match, not just a topic cluster. |
# | **G3** | **Local Density ($\rho$)** | $k / \text{Vol}(m, d(i, \text{NN}_k))$ | The density of the vector space around $i$. High density regions (common legal boilerplate) require stricter thresholds than sparse regions. |

# ### Topological Factors (Graph Structure)
# *Measures derived from the neighbor graph, robust to absolute distance scaling.*

# | ID | Factor Name | Computation | Rationale |
# | :--- | :--- | :--- | :--- |
# | **T1** | **Mutual NN ($k$-RNN)** | $\mathbb{I}(j \in \text{NN}_k(i) \land i \in \text{NN}_k(j))$ | Reciprocity is a strong filter for asymmetric relationships (entailment vs equivalence). |
# | **T2** | **Shared Neighborhood** | Jaccard($\text{NN}_k(i), \text{NN}_k(j)$) | If $i$ and $j$ see the same "world" (neighbors), they likely occupy the same semantic point. |
# | **T3** | **Cluster Stability** | Membership in optimal **ACCC** or **Jaccard** clusters. | Pre-computed cluster membership acts as a high-confidence prior. |

# ### Consensus Factors (Multi-View)
# *Measures derived from the agreement across different embedding models.*

# | ID | Factor Name | Computation | Rationale |
# | :--- | :--- | :--- | :--- |
# | **C1** | **View Support** | $\frac{1}{|V|} \sum \mathbb{I}(j \in \text{NN}_{k}(i \mid v))$ | The fraction of models that agree $j$ is relevant to $i$. Noise tends to be uncorrelated across models. |
# | **C2** | **Rank Variance** | $\text{Var}(\{ \text{rank}_v(i \to j) \})$ | Low variance implies a robust semantic link. High variance suggests the link is an artifact of one specific model's training. |

# ### Lexical Factors (Domain-Adaptive)
# *Measures derived from the **Policy-Trained SentencePiece** tokenizer. This acts as a "High-Pass Filter" for precise legal terminology.*

# | ID | Factor Name | Computation | Rationale |
# | :--- | :--- | :--- | :--- |
# | **L1** | **Info-Weighted Overlap** | $\frac{\sum_{t \in T_i \cap T_j} W(t)}{\sum_{t \in T_i \cup T_j} W(t)}$ | A Jaccard index that ignores stop-words (low $W$) and prioritizes legal terms (high $W$) naturally. |
# | **L2** | **Token Edit Distance** | Levenshtein($T_i, T_j$) | Distinguishes structural changes. "Directed to" vs "Intended for" may appear as high-cost substitutions if trained on policies. |
# | **L3** | **Hard Negation Delta** | $\text{XOR}(\text{has\_negation}_i, \text{has\_negation}_j)$ | Hard veto. "Do we share?" vs "Do we not share?" are semantically close vectors but legally opposite. |

# ### Origin Factors (Grounding)
# *Measures derived from the source document mapping.*

# | ID | Factor Name | Computation | Rationale |
# | :--- | :--- | :--- | :--- |
# | **O1** | **Exact Substring Match** | $\exists doc: \text{Sub}(i) \equiv \text{Sub}(j)$ | If both questions map to the exact same string in the same policy, $P(H_{eq}) \approx 1$. (The "Anchor"). |
# | **O2** | **Index Overlap (IoU)** | $\text{IoU}(\text{Indices}_i, \text{Indices}_j)$ | Partial overlap suggests strong correlation. |

# ---


# We explicitly define the geometries to prevent invalid comparisons.

# ## Cosine Geometry
# -   **Shape:** Directional cones radiating from the origin.
# -   **Radius:** Angular divergence.
# -   **Blind Spot:** Magnitude. Two vectors can be identical in direction but represent different "intensities" (though standard embedding usage normalizes this).

# ## Mahalanobis Geometry
# -   **Shape:** An ellipsoid defined by the covariance matrix $\Sigma_v$ of the view.
# -   **Radius:** Statistical Distance ($\sigma$). A distance of $1.0$ means the point is 1 standard deviation away from the centroid relative to the local correlation.
# -   **Truncation:** We assume truncated Mahalanobis vectors (e.g., 256 dims) represent the principal components. Distances here are cleaner than full-dimension Euclidean distances because they discard low-variance (noise) dimensions.

# ##
# When calculating **ACCC** or clustering thresholds $\tau$:
# -   $\text{Vol}_{\text{cos}}(\tau) \propto \sin(\tau)^{D-1}$
# -   $\text{Vol}_{\text{mah}}(\tau) \propto \tau^D \sqrt{\det(\Sigma)}$
# -   **Implication:** We cannot use a fixed scalar threshold $\tau$ across metrics. We must use the **Normalized Proximity (G1)** to select dynamic thresholds that represent equivalent statistical likelihoods.

# ---


# #
# Since we lack labeled $Y$, we define a proxy label $Y'$:
# $$ Y'_{ij} = 1 \iff \text{Factor O1 (Exact Substring)} \text{ is True for } > 1 \text{ Document} $$
# We assume $P(H_{eq} | Y'=1) \approx 1$. We can use this subset to calibrate the weights of geometric factors for the ungrounded pairs.

# #
# -   **Embeddings are Low-Pass Filters:** They smooth over minor syntactic variations to capture intent. They excel at recall but fail at precision (e.g., "Children under 13" $\approx$ "Children under 18").
# -   **Lexical (SP) is a High-Pass Filter:** It captures precise symbol differences but fails at intent.
# -   **Inference Rule:** $P(H_{eq})$ is maximized when geometric similarity is high **AND** lexical weighted overlap is high. High geometric similarity with low lexical overlap indicates a **Hard Negative** (e.g., "Directed" vs "Intended").

# #
# We assume that noise in the **SentencePiece** tokenization (symbolic) is independent of noise in the **Embedding** generation (semantic).
# Therefore, strict agreement between **L2 (Token Edit Distance)** and **C1 (View Support)** implies true equivalence, as accidental collision in both spaces simultaneously is statistically improbable.

In [None]:
from typing import TypedDict, Dict, List, Tuple, NewType, Literal, Union

QuestionString = NewType("QuestionString", str)
QuestionHash = NewType("QuestionHash", str)
DocumentHash = NewType("DocumentHash", str)
SubsectionHash = NewType("SubsectionHash", str)
ViewKey = NewType("ViewKey", str)

MetricType = Literal["COSINE", "EUCLIDEAN", "MAHALANOBIS"]
TaskType = Literal[
	"SEMANTIC_SIMILARITY", "FACT_VERIFICATION", "RETRIEVAL_QUERY", "RETRIEVAL_DOCUMENT"
]


StartIndex = int
EndIndex = int
SubstringIndices = Tuple[StartIndex, EndIndex]


class EvidenceSnippet(TypedDict):
	"""
	Source text location data.

	One supporting text fragment together with the index pair that locates it.
	"""

	supporting_substring: str	# the supporting text itself
	substring_indices: SubstringIndices	# (StartIndex, EndIndex) tuple; inclusivity of the end is not defined here


SubsectionData = Dict[SubsectionHash, List[EvidenceSnippet]]


PolicyMap = Dict[DocumentHash, SubsectionData]


class QuestionData(TypedDict, total=False):
	"""
	Per-question record stored as the value of InputData.

	Declared with total=False, so `policy_data` may be absent from a record.
	"""

	policy_data: PolicyMap	# DocumentHash -> SubsectionHash -> list of EvidenceSnippet


InputData = Dict[QuestionString, QuestionData]


class SpecialTokens(TypedDict):
	"""
	Boolean flags attached to a question's lexical entry.
	"""

	has_negation: bool	# consumed by the L3 "hard negation delta" factor (XOR across a pair)
	has_temporal_condition: bool	# spec example: phrases such as "72 hours", "30 days"
	is_legal_boilerplate: bool	# NOTE(review): not present in the TypeScript spec above; semantics not defined in this file


class LexicalEntry(TypedDict):
	"""
	Tokenizer output for one question (value type of LexicalData).
	"""

	token_ids: List[int]	# integer token sequence
	token_weights: List[float]	# per the spec: -log(P(t)) for each token; presumably aligned 1-1 with token_ids
	special_tokens: SpecialTokens	# boolean flags used as hard filters


LexicalData = Dict[QuestionHash, LexicalEntry]


class NeighborNode(TypedDict):
	"""
	One neighbour entry in the DistanceCache list for a source question.
	"""

	target_hash: QuestionHash	# hash of the neighbouring question
	raw_distance: float	# d(i, j) under the enclosing view/metric
	percentile_rank: float	# phi(d(i, j)); spec describes it as normalised to 0-1
	rank_index: int	# integer neighbour rank (1st NN, 2nd NN, ...)


DistanceCache = Dict[ViewKey, Dict[MetricType, Dict[QuestionHash, List[NeighborNode]]]]


class InferenceFactors(TypedDict):
	"""
	The computed vector of factors for a specific candidate pair (i, j).

	Field prefixes mirror the factor families of the specification:
	geo_* (G1-G3), top_* (T1-T3), con_* (C1-C2), lex_* (L1-L3), org_* (O1-O2).
	"""

	# --- Geometric (Metric Dependent) ---
	geo_normalized_proximity: float	# G1: phi(d), the percentile of the raw distance
	geo_lowe_ratio: float	# G2: d(NN1) / d(NN2); the spec table writes this as d(i,j) / d(i, NN_2)
	geo_local_density: float	# G3: Density estimate around i

	# --- Topological (Graph Structure) ---
	top_mutual_nn: bool	# T1: i is NN of j AND j is NN of i
	top_shared_neighbor_iou: float	# T2: Jaccard overlap of NN sets
	top_cluster_stability: bool	# T3: Co-membership in ACCC/Jaccard clusters

	# --- Consensus (Multi-View) ---
	con_view_support: float	# C1: % of views agreeing on relationship
	con_rank_variance: float	# C2: Variance of rank(j) across views

	# --- Lexical (Domain-Adaptive) ---
	lex_info_weighted_overlap: float	# L1: Weighted Jaccard
	lex_token_edit_distance: float	# L2: SP Token Levenshtein
	lex_hard_negation_delta: bool	# L3: XOR of negation presence

	# --- Origin (Grounding) ---
	# UPDATED: Replaced Overlap (Float) with Identity (Bool)
	# NOTE(review): the spec table above still lists O2 as an IoU float; these two
	# booleans supersede it according to the UPDATED note.
	org_text_equivalence: bool	# O1: Text(i) == Text(j) (Content Match)
	org_location_identity: (
		bool	# O2: Doc(i)==Doc(j) AND Idx(i)==Idx(j) (Strict Reference Match)
	)

# I will recompute the vectors more concretely



In [None]:
100 // 50

In [None]:
SOURCE_FILE = "/data/questions_filter_after.json"


def _saveJson(filepath, data):
	"""
	Write `data` to `filepath` as UTF-8 JSON with a 4-space indent.

	Values the JSON encoder cannot handle natively are converted with `str`.
	"""
	out_file = open(filepath, "w", encoding="utf-8")
	with out_file:
		json.dump(data, out_file, default=str, indent=4)


def _loadJson(filepath):
	"""
	Read and parse the JSON file at `filepath`.

	Returns the parsed object. If the path does not exist, or the content is
	not valid JSON, a message is printed and an empty dict is returned instead.
	"""
	if os.path.exists(filepath):
		try:
			with open(filepath, "r", encoding="utf-8") as handle:
				parsed = json.load(handle)
		except json.JSONDecodeError:
			print(f"Error decoding JSON: {filepath}")
			return {}
		return parsed
	print(f"Warning: File not found: {filepath}")
	return {}

In [None]:
101 // 50

101 % 50

# 101/2

I have a collection of questions that follow the grammar `Does the privacy policy affirm that {X}?`, where `X` is some variable statement.
I have some embedding model M, which can compute the embeddings for a string given 8 different `task_types`.
I additionally want to compute the embeddings for some manipulations of the string:


```python
EMBEDDING_MODELS = ["gemini-embedding-001"]
TASK_TYPES = [
	"SEMANTIC_SIMILARITY",
	"CLASSIFICATION",
	"CLUSTERING",
	"RETRIEVAL_DOCUMENT",
	"RETRIEVAL_QUERY",
	"CODE_RETRIEVAL_QUERY",
	"QUESTION_ANSWERING",
	"FACT_VERIFICATION",
]

# Core Question
CORE_PREFIX = (
	"Does the privacy policy affirm that "	# Default and form of the data-source
)
# -- alt prefixes start here --
CORE_STATEMENT = "The privacy policy affirms that "

# NEGATION
NEGATION_PREFIX = "Does the privacy policy not affirm that "
NEGATION_TO_STATEMENT = "The privacy policy does not affirm that "

# THE CONTRARY

CONTRARY_PREFIX = "Does the privacy policy deny that "
CONTRARY_STATEMENT = "The privacy policy denies that "

# THE NEGATION OF THE CONTRARY
NEG_CONTRARY_PREFIX = "Does the privacy policy not deny that "
NEG_CONTRARY_STATEMENT = "The privacy policy does not deny that "


def changePrefix(question, replacement):
	processed_text = replacement + question[len(CORE_PREFIX) :]
	return processed_text


def questionToStatement(question, replacement):
	processed_text = (replacement + question[len(CORE_PREFIX) :])[:-1] + "."
	return processed_text


# also for extraction of purely X: This should only be ran once per statement
def propositionExtraction(question):
	processed_text = (question[len(CORE_PREFIX) :].upper())[:-1] + "."
	return processed_text
```

- Per all_variations for single embedding_type: 1*9*520 = 4680
- Total: 1*9*8*520 = 37440

Given that our embedding dimension is `3072`, we do not need to truncate the vectors if we run just one test across all embedding mutations. The full-length vectors are also already normalised.

We shall use a truncation of `768` for a dataset including all mutations of the question.
Then, as before, for the single variation we will use `256`.

We also want to compute the embedding of the associated `supporting_substring` for each `TASK_TYPE` in `TASK_TYPES`, just in case we want to expand our sources for comparison.

I think we may then need to adjust our data structure. I think we should retain the `JSON` for identification, i.e. using the original, just without the `TEmbeddingData`:
```
interface IData {
	[question_string: TQuestionString]: {
		policy_data: {
			[document_hash: string]: {
				[subsection_hash: string]: {
					supporting_substring: string;
					substring_indices: [TStartIndex, TEndIndex];
				}[];
			};
		};
	};
}
```
However, the format of our embedding data will have to change to be reasonable. I do not want to use a database since I am just testing this out locally. I think npz should be sufficient for now. Let's use a separate npz file for each of the TASK_TYPES, and a separate file for supporting strings, but keep all question manipulations in the same file. We will key our arrays in the npz file by the string we used, since they're guaranteed to remain unique.

We will defer the processing of `supporting_strings`, since it isn't 1-1 and our core task is data analysis of the questions. I think we will also leave processing of the distance matrices until later.

I've had a go at writing the code, but commented out the actual API call so I don't end up calling it unnecessarily:
```
from model_management import GeminiModel
import numpy as np
import concurrent.futures

MODEL_NAME = EMBEDDING_MODELS[0]
def preprocessQuestionData(question_data):
	output_dict = {}
	for k in question_data:
		statement = propositionExtraction(k)
		negation = changePrefix(k, NEGATION_PREFIX)
		negation_statement = questionToStatement(k, NEGATION_TO_STATEMENT)
		contrary = changePrefix(k, CONTRARY_PREFIX)
		contrary_statement = questionToStatement(k, CONTRARY_STATEMENT)
		neg_contrary = changePrefix(k, NEG_CONTRARY_PREFIX)
		neg_contrary_statement = changePrefix(k, NEG_CONTRARY_STATEMENT)
		output_dict[k] = [
			k,
			statement,
			negation,
			negation_statement,
			contrary,
			contrary_statement,
			neg_contrary,
			neg_contrary_statement,
		]
	return output_dict


class ReRunException(Exception):
	pass


def reRunGuard(expected_key_length):
	if os.path.isdir(EMBEDDING_STORAGE_DIR):
		entries = os.listdir(EMBEDDING_STORAGE_DIR)
		if entries:
			try:
				loaded = np.load(f"{EMBEDDING_STORAGE_DIR}/{entries[0]}")
				if len(loaded.keys()) == expected_key_length:
					raise ReRunException("Preventing Arbitrary Re-processing")
				else:
					print(len(loaded.keys()))
			except Exception as e:
				if isinstance(e, ReRunException):
					raise e
				pass


def processEmbeddings(data, *, MAX_EMBEDDING_STRINGS=95, save_data=True):

	model = GeminiModel()

	def runEmbedding(questions, task_type):	# needs closure over model
		return questions
		# if len(questions) >= MAX_EMBEDDING_STRINGS:
		# 	raise Exception(f"More than {MAX_EMBEDDING_STRINGS} inputs")

		# model.client.models.embed_content(
		# 	model=MODEL_NAME,
		# 	contents=questions,
		# 	config=types.EmbedContentConfig(task_type=task_type),
		# )
		# embeddings = [e.values for e in _embeddings.embeddings]
		# return embeddings

	mutated_data = preprocessQuestionData(data)
	mutated_keys = list(mutated_data.keys())
	mutations = len(list(mutated_data.values())[0])

	total_strings = len(mutated_keys * mutations)
	max_embedded_strings = min(MAX_EMBEDDING_STRINGS, total_strings)

	reRunGuard(total_strings)

	iteration_amount = total_strings // max_embedded_strings

	embeddings_futures_dict = {}
	flat_data = []
	for k, v in mutated_data.items():
		flat_data.extend(v)

	# since we have ~500 questions, makes far more sense to run 8 of iterations of 100 questions, plus we save by task type anyways
	reshaped_data = []
	for i in range(iteration_amount + 1):
		data = flat_data[i * max_embedded_strings : max_embedded_strings * (i + 1)]
		if data != []:
			reshaped_data.append(data)
	data_store = {}
	for t in TASK_TYPES:
		# will move to `Gemini Batch API` at some point but that has a higher latence
		print(f"computing task type {t}")
		executor = concurrent.futures.ThreadPoolExecutor(max_workers=len(reshaped_data))
		question_futures = {}
		for i, e in enumerate(reshaped_data):
			print(f"processing batch of {len(e)} strings")

			question_futures[i] = executor.submit(runEmbedding, e, t)

		executor.shutdown(wait=True)
		for i in range(len(reshaped_data)):
			if t in data_store:
				data_store[t].extend(question_futures[i].result())
			else:
				data_store[t] = question_futures[i].result()
	output_data = {}
	for k, v in data_store.items():
		if len(v)!= total_strings:
			raise ValueError(f"Expected {total_strings} embeddings: Recieved {len(v)}")
		output_data[k] = dict(zip(flat_data, list([*np.array(v)])))

	# could be done in above loop but keep seperate for now:
	if save_data:
		for k, v in output_data.items():
			np.savez_compressed(f"{EMBEDDING_STORAGE_DIR}/{k}", **v)

	return output_data


processEmbeddings(test_keys, MAX_EMBEDDING_STRINGS=3)
```

Any glaring errors?


In [None]:
stri = "Hello everyone GDPR".capitalize()
stri

In [None]:
EMBEDDING_MODELS = ["models/gemini-embedding-001"]
TASK_TYPES = [
	"SEMANTIC_SIMILARITY",
	"CLASSIFICATION",
	"CLUSTERING",
	"RETRIEVAL_DOCUMENT",
	"RETRIEVAL_QUERY",
	"CODE_RETRIEVAL_QUERY",
	"QUESTION_ANSWERING",
	"FACT_VERIFICATION",
]

# Core Question
CORE_PREFIX = (
	"Does the privacy policy affirm that "	# Default and form of the data-source
)
# -- alt prefixes start here --
CORE_STATEMENT = "The privacy policy affirms that "

# NEGATION
NEGATION_PREFIX = "Does the privacy policy not affirm that "
NEGATION_TO_STATEMENT = "The privacy policy does not affirm that "

# THE CONTRARY

CONTRARY_PREFIX = "Does the privacy policy deny that "
CONTRARY_STATEMENT = "The privacy policy denies that "

# THE NEGATION OF THE CONTRARY
NEG_CONTRARY_PREFIX = "Does the privacy policy not deny that "
NEG_CONTRARY_STATEMENT = "The privacy policy does not deny that "


def changePrefix(question, replacement):
	"""
	Return `replacement` followed by everything in `question` after its
	first len(CORE_PREFIX) characters.

	The prefix is removed by length only; `question` is not checked to
	actually start with CORE_PREFIX.
	"""
	remainder = question[len(CORE_PREFIX) :]
	return replacement + remainder


def questionToStatement(question, replacement):
	"""
	Re-prefix `question` with `replacement` and turn it into a statement.

	The first len(CORE_PREFIX) characters are dropped, `replacement` is
	prepended, and the final character (the trailing "?" of a well-formed
	question) is swapped for ".".
	"""
	body = question[len(CORE_PREFIX) :]
	combined = replacement + body
	return combined[:-1] + "."


# also for extraction of purely X: This should only be ran once per statement
def propositionExtraction(question):
	"""
	Extract the bare proposition X from a question of the form
	CORE_PREFIX + X + "?".

	Only the first character of X is upper-cased, so acronyms and other
	casing inside X are kept. The final character of the question is
	replaced with ".".

	A question that consists of the prefix alone yields "." instead of
	raising IndexError.
	"""
	content = question[len(CORE_PREFIX) :]
	# Slice rather than index: content[:1] is "" for empty content, whereas
	# content[0] would raise IndexError.
	processed_text = content[:1].upper() + content[1:-1] + "."
	return processed_text


def preprocessQuestionData(question_data):
	"""
	Build the list of textual mutations for every question.

	Parameters:
		question_data: iterable of question strings (iterating a dict yields its keys).

	Returns:
		dict mapping each original question to a list of 8 strings, in order:
		original question, bare proposition, negated question, negated
		statement, contrary question, contrary statement, negated-contrary
		question, negated-contrary statement.

	NOTE(review): CORE_STATEMENT is defined in this notebook but is not
	among the generated mutations — confirm whether a ninth variant is intended.
	"""
	output_dict = {}
	for k in question_data:
		output_dict[k] = [
			k,
			propositionExtraction(k),
			changePrefix(k, NEGATION_PREFIX),
			questionToStatement(k, NEGATION_TO_STATEMENT),
			changePrefix(k, CONTRARY_PREFIX),
			questionToStatement(k, CONTRARY_STATEMENT),
			changePrefix(k, NEG_CONTRARY_PREFIX),
			# Statement forms must go through questionToStatement so the trailing
			# "?" becomes "."; this one previously used changePrefix and kept the "?".
			questionToStatement(k, NEG_CONTRARY_STATEMENT),
		]
	return output_dict

In [None]:
import os
import json

SOURCE_FILE = "./data/questions_filter_after.json"


def _saveJson(filepath, data):
	"""
	Dump `data` as indented (4 spaces) UTF-8 JSON into `filepath`.

	Objects that are not JSON-serialisable are stringified via `str`.
	"""
	with open(filepath, mode="w", encoding="utf-8") as sink:
		json.dump(obj=data, fp=sink, indent=4, default=str)


def _loadJson(filepath):
	"""
	Parse the JSON document stored at `filepath`.

	Best-effort loader: a missing file or malformed JSON prints a message
	and produces an empty dict rather than raising.
	"""
	fallback = {}
	if not os.path.exists(filepath):
		print(f"Warning: File not found: {filepath}")
		return fallback
	try:
		with open(filepath, "r", encoding="utf-8") as source:
			return json.load(source)
	except json.JSONDecodeError:
		print(f"Error decoding JSON: {filepath}")
	return fallback

In [None]:
inlined_requests

In [None]:
inlined_requests = types.EmbedContentBatchDict(
	model="models/gemini-embedding-001",
	content=types.Content(parts=[types.Part(text="Why is the sky blue?")]),
	config={"task_type": "SEMANTIC_SIMILARITY"},
)

In [None]:
# print the response
for i, inline_response in enumerate(batch_job_inline.dest.inlined_responses, start=1):
	print(f"\n--- Response {i} ---")

	# Check for a successful response
	if inline_response.response:
		# The .text property is a shortcut to the generated text.
		print(inline_response.response.text)

In [None]:
inline_requests = [
	{"contents": [{"parts": [{"text": "Tell me a one-sentence joke."}], "role": "user"}]},
	{"contents": [{"parts": [{"text": "Why is the sky blue?"}], "role": "user"}]},
]

In [None]:
qdata = _loadJson(SOURCE_FILE)

test_keys = list(qdata.keys())

In [None]:
def prepareModelArtifacts(
	data_set, vector_keys, truncation_dim=256, distance_metric="mahalanobis", debug=True
):
	"""
	Run `_prepareModelArtifact` once per vector key, one worker thread per key.

	Parameters:
		data_set: mapping whose keys identify the items and whose values are
			indexable by each entry of `vector_keys`.
		vector_keys: names of the vector fields to process.
		truncation_dim, distance_metric, debug: forwarded unchanged to
			`_prepareModelArtifact`; `debug` also enables a progress print here.

	Returns:
		dict mapping each vector key to the result of `_prepareModelArtifact`.
		An empty `vector_keys` returns {} (previously it raised ValueError
		because the pool was created with max_workers=0).

	Raises:
		Any exception raised inside a worker is re-raised when its result is collected.
	"""
	vector_keys = list(vector_keys)
	if not vector_keys:
		return {}

	semantic_data = list(data_set.keys())
	# Gather the inputs before the pool exists so a KeyError here cannot leak threads.
	raw_vectors = {
		key: [data_set[s][key] for s in semantic_data] for key in vector_keys
	}

	futures = {}
	# The context manager waits for all submitted work and always releases the pool.
	with concurrent.futures.ThreadPoolExecutor(max_workers=len(vector_keys)) as executor:
		for key in vector_keys:
			if debug:
				print(f"Processing {key}...")

			futures[key] = executor.submit(
				_prepareModelArtifact,
				raw_vectors[key],
				semantic_data,
				truncation_dim,
				distance_metric,
				debug,
			)

	return {key: futures[key].result() for key in vector_keys}

In [None]:
# from google.genai import types, errors
# import random


# def processEmbeddings(data, *, MAX_EMBEDDING_STRINGS=95, save_data=True):

# 	model = GeminiModel()

# 	# def runEmbedding(questions, task_type):	# needs closure over model
# 	# 	# return questions
# 	# 	if len(questions) > MAX_EMBEDDING_STRINGS:
# 	# 		raise Exception(f"More than {MAX_EMBEDDING_STRINGS} inputs, got {len(questions)}")

# 	# 	_embeddings = model.client.models.embed_content(
# 	# 		model=MODEL_NAME,
# 	# 		contents=questions,
# 	# 		config=types.EmbedContentConfig(task_type=task_type),
# 	# 	)
# 	# 	embeddings = [e.values for e in _embeddings.embeddings]
# 	# 	return embeddings
# 	# --- 1. ROBUST RETRY LOGIC ---

# 	def runEmbedding(questions, task_type):
# 		"""
# 		Wraps the API call with exponential backoff for 429 errors.
# 		"""
# 		retries = 0
# 		max_retries = 8
# 		base_delay = 2	# seconds
# 		start_delay = 30	# seconds

# 		if len(questions) > MAX_EMBEDDING_STRINGS:
# 			raise Exception(f"More than {MAX_EMBEDDING_STRINGS} inputs, got {len(questions)}")

# 		base_jitter = start_delay * random.uniform(0, 1)

# 		time.sleep(base_jitter)
# 		while True:
# 			try:
# 				# Actual API Call
# 				# Note: Adjust access syntax to match your specific wrapper's return type
# 				result = model.client.models.embed_content(
# 					model=MODEL_NAME,
# 					contents=questions,
# 					config=types.EmbedContentConfig(task_type=task_type),
# 				)
# 				# Ensure we extract the list of vectors correctly
# 				return [e.values for e in result.embeddings]

# 			except errors.ClientError as e:
# 				# This catches 429 Too Many Requests

# 				if retries >= max_retries:
# 					print(f"Max retries exceeded for batch. Error: {e}")
# 					raise e
# 				# Exponential Backoff + Jitter (to prevent thundering herd)
# 				sleep_time = (base_delay * (2**retries)) + random.uniform(0, 1)
# 				print(
# 					f"Hit 429. Retrying in {sleep_time:.2f}s... (Attempt {retries+1}/{max_retries})"
# 				)
# 				time.sleep(sleep_time)
# 				retries += 1
# 			except Exception as e:

# 				# Catch other potential errors (500s, 503s)
# 				print(f"Unexpected error: {e}")
# 				if retries >= max_retries:
# 					raise e
# 				time.sleep(5)

# 				retries += 1

# 	mutated_data = preprocessQuestionData(data)
# 	mutated_keys = list(mutated_data.keys())
# 	mutations = len(list(mutated_data.values())[0])

# 	total_strings = len(mutated_keys * mutations)
# 	max_embedded_strings = min(MAX_EMBEDDING_STRINGS, total_strings)

# 	# reRunGuard(total_strings)

# 	iteration_amount = total_strings // max_embedded_strings

# 	embeddings_futures_dict = {}
# 	flat_data = []
# 	for k, v in mutated_data.items():
# 		flat_data.extend(v)
# 	# return flat_data
# 	# since we have ~500 questions, makes far more sense to run 8 of iterations of 100 questions, plus we save by task type anyways
# 	reshaped_data = []
# 	for i in range(iteration_amount + 1):
# 		data = flat_data[i * max_embedded_strings : max_embedded_strings * (i + 1)]
# 		if data != []:
# 			reshaped_data.append(data)
# 	data_store = {}
# 	for t in TASK_TYPES:
# 		# will move to `Gemini Batch API` at some point but that has a higher latence
# 		print(f"computing task type {t}")
# 		executor = concurrent.futures.ThreadPoolExecutor(max_workers=8)
# 		question_futures = {}
# 		for i, e in enumerate(reshaped_data):
# 			print(f"processing batch of {len(e)} strings")

# 			question_futures[i] = executor.submit(runEmbedding, e, t)

# 		executor.shutdown(wait=True)
# 		for i in range(len(reshaped_data)):
# 			if t in data_store:
# 				data_store[t].extend(question_futures[i].result())
# 			else:
# 				data_store[t] = question_futures[i].result()
# 	output_data = {}
# 	for k, v in data_store.items():
# 		if len(v) != total_strings:
# 			raise ValueError(f"Expected {total_strings} embeddings: Recieved {len(v)}")
# 		output_data[k] = dict(zip(flat_data, list([*np.array(v)])))

# 	# could be done in above loop but keep seperate for now:
# 	if save_data:
# 		for k, v in output_data.items():
# 			np.savez_compressed(f"{EMBEDDING_STORAGE_DIR}/{k}", **v)

# 	return output_data


# processEmbeddings(test_keys)

In [None]:
# np.savez_compressed(f"{EMBEDDING_STORAGE_DIR}/SEMANTIC_SIMILARITY", **test_dict)
import numpy as np
from sklearn.covariance import LedoitWolf
from scipy.spatial.distance import cdist

# Directory the .npz embedding archives are read from.
EMBEDDING_STORAGE_DIR = "./embedding_storage"

# Open the SEMANTIC_SIMILARITY archive; `loaded` remains a notebook-level global.
loaded = np.load(f"{EMBEDDING_STORAGE_DIR}/SEMANTIC_SIMILARITY.npz")
# The remaining task types (every TASK_TYPES entry except SEMANTIC_SIMILARITY).
# Not used within this cell.
KEYS = [
	"CLASSIFICATION",
	"CLUSTERING",
	"RETRIEVAL_DOCUMENT",
	"RETRIEVAL_QUERY",
	"CODE_RETRIEVAL_QUERY",
	"QUESTION_ANSWERING",
	"FACT_VERIFICATION",
]
# len(list(loaded.keys()))
# Collect every stored array in archive order; the key strings are discarded.
mat = [v for k, v in loaded.items()]

# Stack into a 2-D array and keep only the first 768 columns.
# NOTE(review): assumes every stored array is a 1-D vector of equal length — confirm.
arr = np.array(mat)[:, :768]
# Row-wise L2 norms (kept 2-D for broadcasting), then scale each row to unit length.
# NOTE(review): no guard against zero-norm rows here, unlike the helper functions
# elsewhere in this notebook that add 1e-10.
norms = np.linalg.norm(arr, axis=1, keepdims=True)
normalised = arr / norms

In [15]:
import numpy as np
from scipy.spatial.distance import cdist
from sklearn.covariance import LedoitWolf
import concurrent.futures

executor = concurrent.futures.ThreadPoolExecutor(max_workers=7)

KEYS = [
	"CLASSIFICATION",
	"CLUSTERING",
	"RETRIEVAL_DOCUMENT",
	"RETRIEVAL_QUERY",
	"CODE_RETRIEVAL_QUERY",
	"QUESTION_ANSWERING",
	"FACT_VERIFICATION",
]


def processMal(key):
	"""
	Compute and persist the pairwise Mahalanobis distance matrix for one
	embedding archive.

	Steps: load every array from `{EMBEDDING_STORAGE_DIR}/{key}.npz`, keep the
	first 768 columns, L2-normalise each row, fit a Ledoit-Wolf covariance
	estimator, and use its precision matrix as `VI` for scipy's `cdist`.

	Parameters:
		key: archive name without extension; also used as the array name in
			the saved output.

	Returns:
		The square distance matrix (rows and columns in archive order).

	Side effects:
		Writes `./distances/{key}_mahalanobis_lw.npz`, creating `./distances`
		if it does not exist.
	"""
	import os

	# Read inside a context manager so the underlying zip handle is closed;
	# this function runs in several worker threads and previously leaked one
	# open file per call.
	with np.load(f"{EMBEDDING_STORAGE_DIR}/{key}.npz") as loaded:
		mat = [loaded[name] for name in loaded.files]

	arr = np.array(mat)[:, :768]
	norms = np.linalg.norm(arr, axis=1, keepdims=True)
	# NOTE(review): a zero-norm row would divide by zero here — confirm inputs are non-zero.
	normalised = arr / norms

	lw = LedoitWolf()

	lw.fit(normalised)
	precision_matrix = lw.precision_

	dist_matrix = cdist(normalised, normalised, metric="mahalanobis", VI=precision_matrix)

	# savez_compressed does not create missing parent directories.
	os.makedirs("./distances", exist_ok=True)
	np.savez_compressed(f"./distances/{key}_mahalanobis_lw", **{key: dist_matrix})
	return dist_matrix


# Submit one processMal job per task type, wait for the pool to drain,
# then collect each finished matrix under its key.
matrix_futures = dict()
for key in KEYS:
	matrix_futures.update({key: executor.submit(processMal, key)})
executor.shutdown(wait=True)
save_res = dict()
for k in matrix_futures:
	save_res.update({k: matrix_futures[k].result()})

In [3]:
dist_matrix = cdist(normalised, normalised, metric="mahalanobis", VI=precision_matrix)

In [None]:
# np.savez_compressed(f"./distances/mahalanobis_lw", SEMANTIC_SIMILARITY=dist_matrix)

In [None]:
# np.savez_compressed(f"./distances/test", b=np.array((1, 2)))
np.savez_compressed(f"./distances/test", **{"a": np.array((1, 2))})


loaded = np.load(f"./distances/test.npz")
loaded["a"]

array([1, 2])

In [None]:
N_rows = arr.shape[0]
D = arr.shape[1]

In [None]:
dist_matrix = np.empty((N_rows, N_rows), dtype=normalised.dtype)

In [None]:
block_size = 500

In [None]:
# for i_start in range(0, N_rows, block_size):

# 	i_end = min(i_start + block_size, N_rows)

# 	Block_A = normalised[i_start:i_end, :]

# 	for j_start in range(0, N_rows, block_size):
# 		j_end = min(j_start + block_size, N_rows)

# 		Block_B = normalised[j_start:j_end, :]

# 		diff_block = Block_A[:, None, :] - Block_B[None, :, :]

# 		VI_diff = diff_block @ precision_matrix

# 		mahalanobis_sq_block = np.sum(diff_block * VI_diff, axis=-1)

# 		dist_block = np.sqrt(mahalanobis_sq_block)
# 		dist_matrix[i_start:i_end, j_start:j_end] = dist_block
# 		print(j_start)

In [None]:
# diff_matrix = normalised[:, None, :] - normalised[None, :, :]

In [None]:
# VI_diff = diff_matrix @ precision_matrix

In [None]:
mahalanobis_sq_matrix = np.sum(diff_matrix * VI_diff, axis=-1)

In [None]:
dist_matrix = np.sqrt(mahalanobis_sq_matrix)

In [None]:
dist_matrix = np.maximum(dist_matrix, 0.0)

# return dist_matrix, precision_matrix

In [None]:
dist_matrix

In [None]:
from scipy.spatial.distance import cdist

In [None]:
import numpy as np
from sklearn.covariance import LedoitWolf
from scipy.spatial.distance import cdist


def getMahalanobisDistances_Custom(vectors_a, vectors_b):
	"""
	Pairwise Mahalanobis distances between the rows of `vectors_a`, computed
	with plain numpy broadcasting.

	Rows are L2-normalised (with a 1e-10 guard against zero norms), a
	Ledoit-Wolf estimator is fitted on them, and its precision matrix is used
	as the inverse covariance.

	Parameters:
		vectors_a: 2-D array, one vector per row.
		vectors_b: accepted for interface compatibility but not used; the
			original code took the same path whether or not it was
			`vectors_a`. NOTE(review): confirm whether cross-set distances
			(a vs. b) were intended.

	Returns:
		(dist_matrix, precision_matrix): an (N, N) distance matrix and the
		fitted precision matrix.

	Memory: the intermediate difference tensor is (N, N, D).
	"""
	norms = np.linalg.norm(vectors_a, axis=1, keepdims=True)
	cleaned_vectors = vectors_a / (norms + 1e-10)

	lw = LedoitWolf()
	lw.fit(cleaned_vectors)
	precision_matrix = lw.precision_

	diff_matrix = cleaned_vectors[:, None, :] - cleaned_vectors[None, :, :]

	VI_diff = diff_matrix @ precision_matrix

	mahalanobis_sq_matrix = np.sum(diff_matrix * VI_diff, axis=-1)

	# Clamp BEFORE the square root: rounding can make a squared distance
	# slightly negative, and sqrt of that is NaN, which a later maximum()
	# cannot repair.
	mahalanobis_sq_matrix = np.maximum(mahalanobis_sq_matrix, 0.0)

	dist_matrix = np.sqrt(mahalanobis_sq_matrix)

	return dist_matrix, precision_matrix

In [None]:
dist_matrix = cdist(normalised, normalised, metric="mahalanobis", VI=precision_matrix)

In [None]:
dist_matrix = cdist(normalised, normalised, metric="mahalanobis")

In [None]:
def getMahalanobisDistances(vectors_a, vectors_b):
	"""
	Pairwise Mahalanobis distances between the L2-normalised rows of `vectors_a`.

	Every row is divided by its Euclidean norm (plus 1e-10 so an all-zero row does
	not divide by zero), a Ledoit-Wolf estimator is fitted on the normalised rows,
	and its precision matrix is handed to `cdist` as `VI`.

	Parameters:
	    vectors_a: 2-D array, one vector per row.
	    vectors_b: accepted but never read; all distances are between rows of
	        `vectors_a`.

	Returns:
	    (distances, precision): the (n, n) distance matrix and the fitted
	    precision matrix.

	NOTE(review): this notebook defines this function in more than one cell;
	whichever cell ran last is the one in effect.
	"""
	row_lengths = np.linalg.norm(vectors_a, axis=1, keepdims=True)
	unit_rows = vectors_a / (row_lengths + 1e-10)

	inverse_covariance = LedoitWolf().fit(unit_rows).precision_

	distances = cdist(unit_rows, unit_rows, metric="mahalanobis", VI=inverse_covariance)
	return distances, inverse_covariance

In [None]:
norms

In [None]:
a = {1: 2, 3: 4}
"Hello wORld".upper()

In [None]:
# Scratch check of how many entries the list has; the identical list is
# assigned again at the top of the next cell.
# NOTE(review): the names look like embedding task types - confirm against the
# embedding client that consumes them.
EMBEDDING_TYPES = [
	"SEMANTIC_SIMILARITY",
	"CLASSIFICATION",
	"CLUSTERING",
	"RETRIEVAL_DOCUMENT",
	"RETRIEVAL_QUERY",
	"CODE_RETRIEVAL_QUERY",
	"QUESTION_ANSWERING",
	"FACT_VERIFICATION",
]

len(EMBEDDING_TYPES)

In [None]:
# NOTE(review): the names look like embedding task types - confirm against the
# embedding client that consumes them.
EMBEDDING_TYPES = [
	"SEMANTIC_SIMILARITY",
	"CLASSIFICATION",
	"CLUSTERING",
	"RETRIEVAL_DOCUMENT",
	"RETRIEVAL_QUERY",
	"CODE_RETRIEVAL_QUERY",
	"QUESTION_ANSWERING",
	"FACT_VERIFICATION",
]

# Each logical form below has a question prefix (*_PREFIX) and the matching
# declarative opening (*_STATEMENT) used when a question is turned into a statement.

# Core Question
CORE_PREFIX = (
	"Does the privacy policy affirm that "	# Default and form of the data-source
)
CORE_STATEMENT = "The privacy policy affirms that "

# NEGATION
NEGATION_PREFIX = "Does the privacy policy not affirm that "
NEGATION_TO_STATEMENT = "The privacy policy does not affirm that "

# THE CONTRARY
# The declarative form used to be assigned to CONTRARY_PREFIX as well, which
# silently replaced the question prefix; it now has its own name.
CONTRARY_PREFIX = "Does the privacy policy deny that "
CONTRARY_STATEMENT = "The privacy policy denies that "

# THE NEGATION OF THE CONTRARY
NEG_CONTRARY_PREFIX = "Does the privacy policy not deny that "
NEG_CONTRARY_STATEMENT = "The privacy policy does not deny that "


def statementConversion(question, prefix, replacement):
	"""
	Turn a question into a statement by swapping its opening and final character.

	The first `len(prefix)` characters of `question` are dropped (they are not
	compared against `prefix`), `replacement` is put in front of what remains,
	the last character of the result is removed and "." is appended.
	"""
	remainder = question[len(prefix) :]
	without_last_char = (replacement + remainder)[:-1]
	return without_last_char + "."


# Extracts only the bare proposition X; this should be run only once per question.
def propositionExtraction(question):
	"""
	Strip the core question prefix and return the remaining proposition as a sentence.

	The first `len(CORE_PREFIX)` characters of `question` are dropped (they are not
	compared against CORE_PREFIX), the final character is removed, the first
	remaining character is upper-cased and "." is appended.

	Only the first character is changed: `str.capitalize()` would also lower-case
	the rest of the text and so mangle proper nouns and acronyms.
	"""
	proposition = question[len(CORE_PREFIX) :][:-1]
	processed_text = proposition[:1].upper() + proposition[1:] + "."
	return processed_text

In [None]:
# # Keys corresponding to data dictionary
# import os
# import json

# QUESTIONS_FILE = "./data/questions_filter_after.json"
# POLICIES_FILE = "./data/policies_testing.json"
# OUTPUT_Q_FILE = "./output_q.json"
# OUTPUT_P_FILE = "./output_p.json"


# def _loadJson(filepath):
# 	if not os.path.exists(filepath):
# 		print(f"Warning: File not found: {filepath}")
# 		return {}
# 	try:
# 		with open(filepath, "r", encoding="utf-8") as f:
# 			return json.load(f)
# 	except json.JSONDecodeError:
# 		print(f"Error decoding JSON: {filepath}")
# 		return {}


# qdata = _loadJson(QUESTIONS_FILE)
# model_keys = ["embedding_vector", "retrieval_embedding_vector"]
# metric = "Jaccard"
# artifacts = prepareModelArtifacts(qdata, model_keys)

In [None]:
a = np.array(((1, 0), (0, 1)))
import scipy

type(scipy.linalg.norm(a, axis=1, keepdims=True))
a[:, :None].shape == a.shape

In [None]:
def getMahalanobisDistances(vectors_a, vectors_b):
	"""
	Pairwise Mahalanobis distances between the L2-normalised rows of `vectors_a`.

	Every row is divided by its Euclidean norm (plus 1e-10 so an all-zero row does
	not divide by zero), a Ledoit-Wolf estimator is fitted on the normalised rows,
	and its precision matrix is handed to `cdist` as `VI`.

	Parameters:
	    vectors_a: 2-D array, one vector per row.
	    vectors_b: accepted but never read; all distances are between rows of
	        `vectors_a`.

	Returns:
	    (distances, precision): the (n, n) distance matrix and the fitted
	    precision matrix.

	NOTE(review): this notebook defines this function in more than one cell;
	whichever cell ran last is the one in effect.
	"""
	row_lengths = np.linalg.norm(vectors_a, axis=1, keepdims=True)
	unit_rows = vectors_a / (row_lengths + 1e-10)

	inverse_covariance = LedoitWolf().fit(unit_rows).precision_

	distances = cdist(unit_rows, unit_rows, metric="mahalanobis", VI=inverse_covariance)
	return distances, inverse_covariance


def _prepareModelArtifact(
	raw_vectors,
	semantic_data,
	truncation_dim=256,
	distance_metric="mahalanobis",
	debug=True,
):
	data_matrix = np.array(raw_vectors)
	data_truncated = data_matrix[:, :truncation_dim]

	dist_output = Distance_Processors[distance_metric](data_truncated, data_truncated)

	if distance_metric == "mahalanobis":
		dist_matrix, precision_matrix = dist_output
	else:
		dist_matrix = dist_output
		precision_matrix = None

	return {
		"dist_matrix": dist_matrix,
		"vectors": data_truncated,
		"precision": precision_matrix,
		"semantic_data": semantic_data,
		"metric": distance_metric,
	}

In [None]:
import numpy as np
from scipy.spatial.distance import cdist
from sklearn.covariance import LedoitWolf
import concurrent.futures
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots


from sklearn.cluster import AgglomerativeClustering
import concurrent.futures
from itertools import combinations
from collections import defaultdict, Counter

# ---------------------------------------------------------
# 1. Distance & Model Utilities
# ---------------------------------------------------------


def getMahalanobisDistances(vectors_a, vectors_b):
	"""
	Pairwise Mahalanobis distances between the L2-normalised rows of `vectors_a`.

	Every row is divided by its Euclidean norm (plus 1e-10 so an all-zero row does
	not divide by zero), a Ledoit-Wolf estimator is fitted on the normalised rows,
	and its precision matrix is handed to `cdist` as `VI`.

	Parameters:
	    vectors_a: 2-D array, one vector per row.
	    vectors_b: accepted but never read; all distances are between rows of
	        `vectors_a`.

	Returns:
	    (distances, precision): the (n, n) distance matrix and the fitted
	    precision matrix.

	NOTE(review): this notebook defines this function in more than one cell;
	whichever cell ran last is the one in effect.
	"""
	row_lengths = np.linalg.norm(vectors_a, axis=1, keepdims=True)
	unit_rows = vectors_a / (row_lengths + 1e-10)

	inverse_covariance = LedoitWolf().fit(unit_rows).precision_

	distances = cdist(unit_rows, unit_rows, metric="mahalanobis", VI=inverse_covariance)
	return distances, inverse_covariance


# Maps a metric name to a callable (emb_a, emb_b) -> pairwise result, where both
# arguments are 2-D arrays with one vector per row.
#   - "cosine", "l1", "l2" return an (n_a, n_b) distance matrix.
#   - "dot" returns the raw inner products (a similarity, not a distance).
#   - "mahalanobis" returns whatever getMahalanobisDistances returns, i.e. a
#     (distance matrix, precision matrix) tuple.
Distance_Processors = {
	# 1 - cosine similarity; 1e-10 keeps an all-zero row from dividing by zero.
	"cosine": lambda emb_a, emb_b: 1.0
	- (emb_a @ emb_b.T)
	/ (
		np.linalg.norm(emb_a, axis=1, keepdims=True)
		@ np.linalg.norm(emb_b, axis=1, keepdims=True).T
		+ 1e-10
	),
	# cdist avoids the (n_a, dim, n_b) temporary that broadcasting
	# emb_a[..., np.newaxis] - emb_b.T would allocate. Results are float64.
	"l1": lambda emb_a, emb_b: cdist(emb_a, emb_b, metric="cityblock"),
	"l2": lambda emb_a, emb_b: cdist(emb_a, emb_b, metric="euclidean"),
	"dot": lambda emb_a, emb_b: emb_a @ emb_b.T,
	"mahalanobis": lambda emb_a, emb_b: getMahalanobisDistances(emb_a, emb_b),
}


def _prepareModelArtifact(
	raw_vectors,
	semantic_data,
	truncation_dim=256,
	distance_metric="mahalanobis",
	debug=True,
):
	data_matrix = np.array(raw_vectors)
	data_truncated = data_matrix[:, :truncation_dim]

	dist_output = Distance_Processors[distance_metric](data_truncated, data_truncated)

	if distance_metric == "mahalanobis":
		dist_matrix, precision_matrix = dist_output
	else:
		dist_matrix = dist_output
		precision_matrix = None

	np.fill_diagonal(dist_matrix, float("inf"))
	nn_indices = np.argmin(dist_matrix, axis=1)
	np.fill_diagonal(dist_matrix, 0.0)

	return {
		"dist_matrix": dist_matrix,
		"vectors": data_truncated,
		"precision": precision_matrix,
		"semantic_data": semantic_data,
		"metric": distance_metric,
		"nn_indices": nn_indices,
	}


def prepareModelArtifacts(
	data_set, vector_keys, truncation_dim=256, distance_metric="mahalanobis", debug=True
):
	"""
	Build one model artifact per vector key, running the keys in parallel threads.

	Parameters:
	    data_set: mapping item -> record, where `record[key]` is that item's
	        vector for every `key` in `vector_keys`.
	    vector_keys: names of the vector fields to process.
	    truncation_dim, distance_metric, debug: forwarded unchanged to
	        `_prepareModelArtifact`.

	Returns:
	    dict mapping each vector key to the result of `_prepareModelArtifact`.
	    An empty `vector_keys` yields an empty dict.

	Raises:
	    KeyError: if a record lacks one of the requested keys.
	    Any exception raised inside a worker is re-raised when its result is read.
	"""
	vector_keys = list(vector_keys)
	if not vector_keys:
		# ThreadPoolExecutor refuses max_workers=0, so there is nothing to schedule.
		return {}

	semantic_data = list(data_set.keys())
	# Collect every column up front so a missing key fails before any work starts.
	raw_vectors = {
		key: [data_set[item][key] for item in semantic_data] for key in vector_keys
	}

	futures = {}
	# The context manager waits for all submitted work and always releases the
	# worker threads, even if a submit call raises.
	with concurrent.futures.ThreadPoolExecutor(max_workers=len(vector_keys)) as executor:
		for key in vector_keys:
			if debug:
				print(f"Processing {key}...")
			futures[key] = executor.submit(
				_prepareModelArtifact,
				raw_vectors[key],
				semantic_data,
				truncation_dim,
				distance_metric,
				debug,
			)

	return {key: futures[key].result() for key in vector_keys}


# ---------------------------------------------------------
# 2. Clustering Utilities
# ---------------------------------------------------------


def getGroupsFromLabels(labels):
	"""
	Collect the positions that share a label and keep only groups of two or more.

	Returns a list of index lists, ordered by the first appearance of each label;
	indices inside a group are in ascending order.
	"""
	members_by_label = defaultdict(list)
	for position, label in enumerate(labels):
		members_by_label[label].append(position)
	return [members for members in members_by_label.values() if len(members) >= 2]


def getNNPairsFromGroups(groups, nn_indices):
	"""
	Return the nearest-neighbour links that stay inside a group.

	For every member of every group with at least two members, the pair
	(member, nn_indices[member]) is kept when the neighbour belongs to the same
	group. Pairs are stored as ascending tuples, so a mutual link appears once.
	"""
	linked = set()
	for members in groups:
		if len(members) >= 2:
			member_lookup = set(members)
			linked.update(
				tuple(sorted((member, nn_indices[member])))
				for member in members
				if nn_indices[member] in member_lookup
			)
	return linked


def clusterAndGetArtifacts(dist_matrix, threshold):
	"""
	Cluster a precomputed distance matrix and return the multi-member groups.

	Uses AgglomerativeClustering with complete linkage, no fixed cluster count,
	and `threshold` as the distance threshold.

	Returns:
	    (groups, labels): the output of `getGroupsFromLabels` for the predicted
	    labels, and the labels themselves.
	"""
	clusterer = AgglomerativeClustering(
		linkage="complete",
		metric="precomputed",
		distance_threshold=threshold,
		n_clusters=None,
	)
	cluster_labels = clusterer.fit_predict(dist_matrix)
	multi_member_groups = getGroupsFromLabels(cluster_labels)
	return multi_member_groups, cluster_labels


def getPairsFromLablesCombinations(labels):
	"""
	Return every unordered pair of positions that carry the same label.

	Each pair is an ascending (i, j) tuple; labels that occur only once
	contribute nothing.
	"""
	members_by_label = defaultdict(list)
	for position, label in enumerate(labels):
		members_by_label[label].append(position)
	return {
		pair
		for members in members_by_label.values()
		if len(members) > 1
		for pair in combinations(sorted(members), 2)
	}


def getComponentsFromPairs(pairs):
	"""
	Group the endpoints of the given edges into connected components.

	`pairs` is an iterable of (u, v) edges. The result is a list with one list of
	nodes per component; only nodes that appear in some edge are included.
	"""
	neighbours = defaultdict(set)
	vertices = set()
	for left, right in pairs:
		neighbours[left].add(right)
		neighbours[right].add(left)
		vertices.add(left)
		vertices.add(right)

	seen = set()

	def _collect(start):
		# Depth-first walk from `start`; nodes are marked when pushed so each one
		# enters the stack at most once.
		seen.add(start)
		pending = [start]
		members = []
		while pending:
			current = pending.pop()
			members.append(current)
			for other in neighbours[current]:
				if other not in seen:
					seen.add(other)
					pending.append(other)
		return members

	# The filter is evaluated lazily, so nodes swallowed by an earlier walk are skipped.
	return [_collect(vertex) for vertex in vertices if vertex not in seen]


# ---------------------------------------------------------
# 3. Stochastic Analysis Logic
# ---------------------------------------------------------


# def getCharacteristicLength(dist_matrix, nn_indices):
# 	"""
# 	Returns the scalar distance to the nearest neighbor (r).
# 	"""
# 	N = dist_matrix.shape[0]
# 	sigmas = dist_matrix[np.arange(N), nn_indices]
# 	sigmas[sigmas == 0] = 1e-9
# 	return sigmas.reshape(-1, 1)
def getCharacteristicLength(dist_matrix, k=10, dim=None):
	"""
	Returns the Characteristic Length Scale (sigma) derived from local density.

	Formula: sigma ~ rho^(-1/D)
	Where rho = k / Vol(r_k) ~ k / (r_k ** D)
	Therefore: sigma = r_k / (k ** (1/D))

	This normalizes the k-th neighbor distance to the average inter-point spacing.

	Parameters:
	    dist_matrix: (N, N) array of pairwise distances. The k-th neighbour is read
	        as the entry at sorted position k of each row, which assumes the row's
	        own self-distance is its smallest entry.
	    k: neighbour rank; clamped to the range [1, N - 1].
	    dim: dimension D for the density scaling. When None, 0 or negative no
	        scaling is applied and r_k itself is returned.

	Returns:
	    (N, 1) float array of sigma values. Exact zeros (duplicates) are replaced
	    by 1e-9. With fewer than two points there is no neighbour, so every entry
	    is 1e-9.
	"""
	N = dist_matrix.shape[0]

	# 1. Find Distance to k-th Neighbor (r_k)
	# Clamp k to valid range
	valid_k = min(max(1, k), N - 1)
	if valid_k < 1:
		# N < 2: nothing to partition, and 0 ** (-1 / dim) would raise.
		return np.full((N, 1), 1e-9)

	# Efficiently find the k-th smallest distance in each row
	r_k = np.partition(dist_matrix, valid_k, axis=1)[:, valid_k]
	if not np.issubdtype(r_k.dtype, np.floating):
		# An integer array would silently truncate the 1e-9 floor back to 0.
		r_k = r_k.astype(float)

	# Handle zeros (duplicates)
	r_k[r_k == 0] = 1e-9
	r_k = r_k.reshape(-1, 1)

	# 2. Apply Density Scaling: sigma = r_k * k^(-1/D)
	if dim and dim > 0:
		return r_k * valid_k ** (-1.0 / dim)
	return r_k


def runStochasticAnalysis(
	artifacts,
	optimal_taus,
	n_iterations=200,
	noise_fraction=0.15,
	active_tests=("NN",),
	dim=256,
	seed=None,
):
	"""
	Runs noise iterations on the distance matrices directly.

	In every iteration each model's distance matrix gets symmetric Gaussian noise
	added (scaled by the per-point characteristic length times `noise_fraction`),
	is re-clustered at that model's threshold, and the pairs produced by each
	active test are intersected across models. The surviving pairs are counted
	and merged into consensus clusters whose diameters are measured on the
	unperturbed distance matrix of the first model.

	Parameters:
	    artifacts: dict {model_key: artifact}; each artifact provides
	        "dist_matrix" and, for the "NN" test, "nn_indices".
	    optimal_taus: dict {model_key: clustering distance threshold}.
	    n_iterations: number of noise iterations.
	    noise_fraction: multiplier applied to the characteristic length.
	    active_tests: iterable containing "NN" and/or "Combinations".
	    dim: dimension forwarded to getCharacteristicLength.
	    seed: optional seed for a private random generator. When None the global
	        np.random state is used, as before.

	Returns:
	    diameter_distributions: Dict {test_mode: [list_of_all_diameters]}
	    pair_counters: Dict {test_mode: Counter((a,b): count)}
	    all_pair_sets: Dict {test_mode: [set_iter_1, set_iter_2, ...]}

	Raises:
	    ValueError: if `active_tests` contains an unknown test mode.
	"""
	active_tests = list(active_tests)
	unknown_tests = [t for t in active_tests if t not in ("NN", "Combinations")]
	if unknown_tests:
		# Without this check an unknown mode would reuse (or lack) the pair set of
		# the previous mode inside the loop.
		raise ValueError(f"Unknown test mode(s): {unknown_tests}")

	model_keys = list(artifacts.keys())
	draw_normal = np.random.normal if seed is None else np.random.default_rng(seed).normal

	diameter_distributions = {t: [] for t in active_tests}
	pair_counters = {t: Counter() for t in active_tests}
	all_pair_sets = {t: [] for t in active_tests}	# Storing raw sets as requested

	# Pre-calculate Characteristic Lengths (k-th neighbour rank is fixed at 10)
	model_sigmas = {
		key: getCharacteristicLength(artifacts[key]["dist_matrix"], k=10, dim=dim)
		for key in model_keys
	}

	# Diameters are always measured on the first model's unperturbed matrix.
	ref_key = model_keys[0]
	ref_dist_matrix = artifacts[ref_key]["dist_matrix"]

	print(f"Running {n_iterations} iterations (Noise Fraction: {noise_fraction})...")

	for i in range(n_iterations):
		if i % 50 == 0:
			print(i)
		iter_groups = {}
		iter_labels = {}

		# 1. Perturb Matrices & Cluster
		for key in model_keys:
			art = artifacts[key]
			sigma = model_sigmas[key]
			N = art["dist_matrix"].shape[0]

			# Symmetric Noise Scaled by Fraction
			scale = ((sigma + sigma.T) / 2.0) * noise_fraction
			noise = draw_normal(0, 1, size=(N, N)) * scale
			sym_noise = (noise + noise.T) / 2.0

			noisy_dist = art["dist_matrix"] + sym_noise
			np.fill_diagonal(noisy_dist, 0.0)
			# NOTE(review): perturbed distances may go negative; the clamp below was
			# already disabled in the original code.
			# noisy_dist[noisy_dist < 0] = 0.0

			groups, labels = clusterAndGetArtifacts(noisy_dist, optimal_taus[key])
			iter_groups[key] = groups
			iter_labels[key] = labels

		# 2. Process active tests
		for test_mode in active_tests:
			pair_sets = []
			for key in model_keys:
				if test_mode == "NN":
					p = getNNPairsFromGroups(iter_groups[key], artifacts[key]["nn_indices"])
				else:	# "Combinations" (validated above)
					p = getPairsFromLablesCombinations(iter_labels[key])
				pair_sets.append(p)

			if pair_sets:
				# Keep only the pairs every model agrees on.
				p_true = set.intersection(*pair_sets)

				# A. Store Raw Set (Constraint check)
				all_pair_sets[test_mode].append(p_true)

				# B. Update Pair Counter
				pair_counters[test_mode].update(p_true)

				# C. Build Consensus Clusters (Connected Components)
				consensus_clusters = getComponentsFromPairs(p_true)

				# D. Calculate Diameter
				for cluster_indices in consensus_clusters:
					if len(cluster_indices) > 1:
						sub_dist = ref_dist_matrix[np.ix_(cluster_indices, cluster_indices)]
						diameter_distributions[test_mode].append(np.max(sub_dist))

	return diameter_distributions, pair_counters, all_pair_sets


def plotStochasticResults(diameter_data, pair_counters, n_iterations):
	"""
	Draw a two-panel figure summarising a stochastic analysis run.

	Left panel: for each test mode, the sorted consensus-cluster diameters against
	their running count divided by `n_iterations` (average number of clusters per
	run with diameter at or below x).
	Right panel: histogram of how many times each pair was counted.

	Test modes without any diameters are left out of both panels. The figure is
	shown and nothing is returned.
	"""
	panel_titles = (
		"Avg Cumulative Clusters vs Diameter",
		"Pair Co-occurrence Distribution",
	)
	fig = make_subplots(rows=1, cols=2, subplot_titles=panel_titles)

	for test_mode, diameters in diameter_data.items():
		counts = list(pair_counters[test_mode].values())
		if not diameters:
			continue

		# Left panel: cumulative count per diameter, averaged over the runs.
		ordered = np.sort(diameters)
		avg_cumulative = np.arange(1, len(ordered) + 1) / n_iterations
		cdf_trace = go.Scatter(
			x=ordered,
			y=avg_cumulative,
			mode="lines",
			name=f"{test_mode} (CDF)",
			legendgroup=test_mode,	# one legend click toggles both traces of a mode
			opacity=0.8,
		)
		fig.add_trace(cdf_trace, row=1, col=1)

		# Right panel: distribution of the per-pair counts.
		count_trace = go.Histogram(
			x=counts,
			name=f"{test_mode} (Counts)",
			legendgroup=test_mode,
			opacity=0.6,
			nbinsx=50,
		)
		fig.add_trace(count_trace, row=1, col=2)

	# Axis titles and overall layout
	fig.update_xaxes(title_text="Cluster Diameter (Physical)", row=1, col=1)
	fig.update_yaxes(title_text="Avg Count of Clusters ≤ Diameter", row=1, col=1)
	fig.update_xaxes(
		title_text=f"Co-occurrence Count (Max: {n_iterations})", row=1, col=2
	)
	fig.update_yaxes(title_text="Number of Pairs", row=1, col=2)
	fig.update_layout(
		title="Stochastic Resonance Analysis", hovermode="x unified", width=1200, height=600
	)
	fig.show()

In [None]:
# 1. Configuration
active_tests = ["NN", "Combinations"]	# Either or both of "NN" and "Combinations"
iterations = 2000
# Clustering distance threshold per model key.
optimal_taus = {"embedding_vector": 17.0, "retrieval_embedding_vector": 17.0}

# Seed NumPy's global generator so the noise drawn inside runStochasticAnalysis
# (and therefore every count and diameter below) is repeatable across runs.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# NOTE(review): in this part of the notebook `artifacts` is only assigned inside
# a commented-out cell; it has to exist in the kernel before this cell runs.
dist_data, counters, all_sets = runStochasticAnalysis(
	artifacts,
	optimal_taus,
	n_iterations=iterations,
	noise_fraction=0.4,
	active_tests=active_tests,
	dim=256,
)

# 2. Plot with Subplots & Averaging: see the plotStochasticResults call in the next cell

In [None]:
# 2. Plot with Subplots & Averaging
plotStochasticResults(dist_data, counters, iterations)

# 3. Print Top 5 Pairs
# Indices in the counters refer to positions in the first model's semantic data.
semantic_list = artifacts[list(artifacts.keys())[0]]["semantic_data"]

print("\n--- Top 5 Coincident Pairs ---")
for mode in active_tests:
	print(f"\n[Mode: {mode}]")
	top_5 = counters[mode].most_common(5)
	for (idx_a, idx_b), count in top_5:
		str_a = semantic_list[idx_a]
		str_b = semantic_list[idx_b]
		print(f" ({count}/{iterations}) {str_a} <--> {str_b}")

# 4. Print Pairs at or above Threshold
threshold_X = 5	# Adjust this value as needed

# The filter below is inclusive (count >= threshold_X), and the messages say so.
print(f"\n--- Coincident Pairs (Count >= {threshold_X}) ---")
for mode in active_tests:
	print(f"\n[Mode: {mode}]")

	# most_common() without an argument lists every pair by descending count
	sorted_pairs = counters[mode].most_common()

	found_any = False
	for (idx_a, idx_b), count in sorted_pairs:
		if count < threshold_X:
			# Counts only decrease from here on, so nothing else can qualify.
			break
		found_any = True
		# Both questions are printed in full, one per line
		str_a = semantic_list[idx_a]
		str_b = semantic_list[idx_b]
		print(f" {count}:")
		print(f"    - {str_a}")
		print(f"    - {str_b}")

	if not found_any:
		print(" No pairs found at or above threshold.")
# dist_data, counters, all_pair_sets = runStochasticAnalysis(
# 	artifacts, optimal_taus, n_iterations=iterations, active_tests=active_tests
# )

# # 2. Plot CDF
# plotDiameterCDF(dist_data)

# # 3. Print Top 5 Pairs
# semantic_list = artifacts[list(artifacts.keys())[0]]["semantic_data"]

# print("\n--- Top 5 Coincident Pairs ---")
# for mode in active_tests:
# 	print(f"\n[Mode: {mode}]")
# 	top_5 = counters[mode].most_common(10)
# 	for (idx_a, idx_b), count in top_5:
# 		str_a = semantic_list[idx_a]
# 		str_b = semantic_list[idx_b]
# 		print(f" ({count}/{iterations}) {str_a} <--> {str_b}")

# prior on freq
# prior on length

In [None]:
fig = go.Figure(go.Scatter(x=[1, 2]))

In [None]:
# all_pair_sets