In [None]:
import os
from dotenv import load_dotenv
from google import genai
from google.genai import types

# Embedding model identifiers keyed by a short label;
# GeminiModel.DEFAULT_EMBEDDING_MODEL reads the "Current" entry.
embedding_models = {
	"Legacy_1": "models/embedding-001",
	"Legacy_2": "models/text-embedding-004",
	"Current": "models/gemini-embedding-001",
}

# Chat model identifiers keyed by a short label;
# GeminiModel.DEFAULT_CHAT_MODEL reads the "FLASH_LATEST" entry.
chat_models = {
	"FLASH_LATEST": "gemini-2.5-flash-preview-09-2025",
	"PRO": "gemini-2.5-pro",
}


class GeminiModel:
	"""Small wrapper around a ``genai.Client`` for chat and embedding calls."""

	DEFAULT_CHAT_MODEL = chat_models["FLASH_LATEST"]
	DEFAULT_EMBEDDING_MODEL = embedding_models["Current"]

	def __init__(self, *, api_key=None):
		"""Create the underlying client.

		Args:
			api_key: key passed to ``genai.Client``. When omitted, the value of
				the ``GEMINI_API_KEY`` environment variable is used instead.
		"""
		if api_key is None:
			# getEnvAPIKey() loads the .env file itself; no extra load_dotenv() needed.
			api_key = self.getEnvAPIKey()
		self.client = genai.Client(api_key=api_key)

	def getEnvAPIKey(self):
		"""Load the .env file and return ``GEMINI_API_KEY`` (``None`` when unset)."""
		load_dotenv()
		return os.environ.get("GEMINI_API_KEY")

	def startChat(self, model_name=None):
		"""Open a chat session on ``model_name`` (default: ``DEFAULT_CHAT_MODEL``)."""
		if model_name is None:
			model_name = self.DEFAULT_CHAT_MODEL
		return self.client.chats.create(model=model_name)

	def send_prompt(self, chat, prompt):
		"""Send ``prompt`` on ``chat`` and return the ``text`` of the reply."""
		return chat.send_message(prompt).text

	def getSemanticEmbedding(self, semantic_datum, *, model_name=None, task_type=None):
		"""Embed ``semantic_datum`` and return the values of the first embedding.

		Args:
			semantic_datum: content forwarded to ``embed_content`` as ``contents``.
			model_name: embedding model; defaults to ``DEFAULT_EMBEDDING_MODEL``.
			task_type: embedding task type; defaults to ``"SEMANTIC_SIMILARITY"``.
		"""
		if task_type is None:
			task_type = "SEMANTIC_SIMILARITY"
		if model_name is None:
			model_name = self.DEFAULT_EMBEDDING_MODEL
		result = self.client.models.embed_content(
			model=model_name,
			contents=semantic_datum,
			config=types.EmbedContentConfig(task_type=task_type),
		)
		return result.embeddings[0].values

	@staticmethod
	def _asSentence(text):
		"""Upper-case only the first character and end ``text`` with one full stop."""
		body = text.strip().rstrip("?.").rstrip()
		# str.capitalize() would lower-case the rest (e.g. "Canada" -> "canada"),
		# so only the first character is touched.
		return body[:1].upper() + body[1:] + "."

	@staticmethod
	def preprocessStatement(statement):
		"""Turn a "Does the privacy policy affirm that ...?" question into a bare statement.

		The prefix is removed only when it is actually present, and only trailing
		"?"/"." characters are replaced by the final full stop, so text that does
		not follow the template is no longer truncated.
		"""
		prefix = "Does the privacy policy affirm that "
		body = statement.strip()
		if body.startswith(prefix):
			body = body[len(prefix) :]
		return GeminiModel._asSentence(body)

	@staticmethod
	def preprocessStatement2(statement):
		"""Rewrite a "Does the privacy policy affirm ...?" question as
		"The privacy policy affirms ...".

		Text that does not start with the prefix is only normalised to end with a
		full stop.
		"""
		prefix = "Does the privacy policy affirm"
		replacement = "The privacy policy affirms"
		body = statement.strip()
		if body.startswith(prefix):
			body = replacement + body[len(prefix) :]
		return GeminiModel._asSentence(body)


# Probe questions for the similarity experiment below:
# - question_1_prime differs from question_1 only in "user data" vs "personal data";
# - question_2 differs from question_1 in "processing is" vs "transfers are".
question_1 = "Does the privacy policy affirm that personal data transfers are automatically consented to by using the service?"
question_1_prime = "Does the privacy policy affirm that user data transfers are automatically consented to by using the service?"
question_2 = "Does the privacy policy affirm that personal data processing is automatically consented to by using the service?"

# Statement forms of the same three questions, produced by GeminiModel.preprocessStatement.
s_1 = GeminiModel.preprocessStatement(question_1)
s_1_p = GeminiModel.preprocessStatement(question_1_prime)
s_2 = GeminiModel.preprocessStatement(question_2)

In [None]:
import numpy as np

# NOTE(review): this is the string "None", not the None sentinel, and none of the
# getSemanticEmbedding() calls in this cell pass it on — confirm whether it should be
# forwarded as task_type or removed.
task_type = "None"


def process_sim(emb_a, emb_b):
	"""Return the cosine similarity of two embedding vectors.

	Both arguments are converted with ``np.array``; the result is their dot
	product divided by the product of their Euclidean norms.
	"""
	vec_a, vec_b = np.array(emb_a), np.array(emb_b)
	magnitude = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
	return np.dot(vec_a, vec_b) / magnitude


# Create the wrapper here so the cell also runs on a fresh kernel: no other cell
# defines `model`. With no api_key argument the key is read from GEMINI_API_KEY.
model = GeminiModel()

# Question forms: paraphrase pair (1 vs 1 prime) against a different claim (1 vs 2).
q_1_embed = model.getSemanticEmbedding(question_1)
q_1_prime_embed = model.getSemanticEmbedding(question_1_prime)
q_2_embed = model.getSemanticEmbedding(question_2)

sim_a = process_sim(q_1_embed, q_1_prime_embed)
sim_b = process_sim(q_1_embed, q_2_embed)
print(f"1 vs 1 prime: {sim_a}")
print(f"1 vs 2: {sim_b}")
print(f"delta = {sim_a-sim_b}")

# Same comparison on the statement forms produced by preprocessStatement.
s_1_embed = model.getSemanticEmbedding(s_1)
s_1_prime_embed = model.getSemanticEmbedding(s_1_p)
s2_embed = model.getSemanticEmbedding(s_2)
sim_c = process_sim(s_1_embed, s_1_prime_embed)
sim_d = process_sim(s_1_embed, s2_embed)
print(f"statement 1 vs 1 prime: {sim_c}")
print(f"statement 1 vs 2: {sim_d}")
print(f"delta = {sim_c-sim_d}")

1 vs 1 prime: 0.9920240458129393
1 vs 2: 0.970932779302992
delta = 0.021091266509947304
statement 1 vs 1 prime: 0.9873116243207628
statement 1 vs 2: 0.9590774326955188
delta = 0.028234191625243987


In [None]:
import datetime
import hashlib
import re

import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md

# current_date = datetime.datetime.now().__str__()

In [None]:
# # # String methods


# def isInString(string, substring):
# 	return substring in string


# def getSubstringIndices(string, substring):
# 	start_index = string.find(substring)
# 	end_index = start_index + len(substring)
# 	return (start_index, end_index)


# # def getHash(data):
# # 	# return hash(data)
# # 	return hashlib.sha256(data.encode("utf-8")).hexdigest()

policy_breakdown = {}

# Illustrative shape of the stored policy data; keys and values are placeholders
# describing names/types, not real entries.
policy_template = {
	"policy_name:string": {
		"version_hash:string": {"date": "", "policy_md": "string", "policy_breakdown": [""]}
	}
}

# Schema notes for the question store. They are kept as comments because, written
# as bare text in this code cell, they are not valid Python and stop the cell from
# running.
#
# question_relation: {
# 	policy_name,
# 	version_hash,
# 	relevant_substring?,
# 	relevant_substring_idx?,
# }
# question_data needs to include:
# 	- question:string
# 	- origin:question_relation
# 	- other_policies:question_relation[]

In [None]:
# Markdown Methods


def splitMarkdown(markdown_text):
	"""Split markdown into the non-empty text bodies that follow each heading.

	Heading lines themselves are dropped, as is any text before the first
	heading. Each returned body is stripped of surrounding whitespace.
	"""
	heading_re = re.compile(r"^#{1,6}\s+.*", re.MULTILINE)
	sections = []
	# Element 0 of the split is whatever precedes the first heading; skip it.
	for body in heading_re.split(markdown_text)[1:]:
		body = body.strip()
		if body:
			sections.append(body)
	return sections


def removePreamble(markdown_text):
	"""Remove everything that precedes the first line starting with "#" + whitespace.

	Text without such a line is returned unchanged.
	"""
	preamble_re = re.compile(r"\A.*?(?=^#\s)", re.DOTALL | re.MULTILINE)
	return preamble_re.sub("", markdown_text)


# Matches "[label](target)": group 1 is the bracketed label, group 2 the target.
MARKDOWN_LINK_PATTERN = re.compile(r"(\[.*?\])\((.*?)\)")
# Fixed text substituted for every link target.
URL_PLACEHOLDER = "(DYNAMIC_URL_REMOVED)"


def normalize_markdown_links(markdown_text):
	"""Replace the target of every markdown link with URL_PLACEHOLDER, keeping its label."""
	return MARKDOWN_LINK_PATTERN.sub(
		lambda link: link.group(1) + URL_PLACEHOLDER, markdown_text
	)


def getHash(data):
	"""Return the SHA-256 hex digest of ``data`` after link targets are normalised.

	The text is passed through normalize_markdown_links() first, so two texts
	that differ only in their link targets hash to the same value.
	"""
	digest = hashlib.sha256()
	digest.update(normalize_markdown_links(data).encode("utf-8"))
	return digest.hexdigest()

In [None]:
# URL Methods

# url = "https://openai.com/policies/privacy-policy/"
# url = "https://www.gemini.com/en-SG/legal/privacy-policy"
# url = "https://www.anthropic.com/legal/privacy"

# Privacy-policy URLs keyed by a short provider name.
# NOTE(review): no live cell in view reads this dict (processPolicy is called with a
# literal URL) — confirm it is still needed.
# NOTE(review): gemini.com does not look like a Google domain — confirm this is the
# policy intended for the "gemini" key.
data_source = {
	"gemini": "https://www.gemini.com/en-SG/legal/privacy-policy",
	"openai": "https://openai.com/policies/privacy-policy/",
	"anthropic": "https://www.anthropic.com/legal/privacy",
}


def extractContent(url, headers=None):
	"""Download ``url`` and return the page's ``<main>`` element.

	Args:
		url: page to fetch.
		headers: request headers; when omitted a desktop-browser User-Agent is sent.

	Returns:
		The first ``<main>`` element found by BeautifulSoup, or ``None`` when the
		page has none.

	Raises:
		Whatever ``requests.get`` / ``raise_for_status`` raise (e.g. on a non-2xx
		status or a timeout after 10 seconds).
	"""
	if headers is None:
		headers = {
			"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
		}

	# The fetch must run for caller-supplied headers as well, so it lives outside
	# the default-headers branch.
	response = requests.get(url, headers=headers, timeout=10)
	response.raise_for_status()

	soup = BeautifulSoup(response.text, "lxml")
	return soup.find("main")


def extractMarkdown(main_content):
	"""Convert ``main_content`` (stringified first) to markdown with ATX ("#") headings."""
	html_text = str(main_content)
	return md(html_text, heading_style="ATX")

In [None]:
# Template for one stored policy version; _processPolicy() copies it and then
# overwrites all three fields.
_policy_data = {"policy_markdown": "", "fetch_date": "", "chunks": []}

# NOTE(review): `chunk` is not referenced by any other cell in view — presumably a
# sketch of the {hash: text} entries stored under "chunks"; confirm or remove.
chunk = {"": ""}
# Policy store: policy name -> {version hash -> policy data}; written by _processPolicy().
policy_breakdowns = {}

In [None]:
# Policy Methods


def _processPolicy(markdown_content, policy_name, policy_url, policy_hash, all_policy_data=None):
	"""Build the record for one policy version and store it.

	Args:
		markdown_content: policy text as markdown.
		policy_name: first-level key in the store.
		policy_url: accepted for call compatibility; not used here.
		policy_hash: second-level key (version hash) in the store.
		all_policy_data: store to write into; defaults to the module-level
			``policy_breakdowns`` so existing four-argument calls keep working.
	"""
	if all_policy_data is None:
		all_policy_data = policy_breakdowns

	data = _policy_data.copy()
	data["policy_markdown"] = markdown_content
	data["fetch_date"] = str(datetime.datetime.now())
	# One {hash: text} entry per section returned by splitMarkdown().
	data["chunks"] = [{getHash(section): section} for section in splitMarkdown(markdown_content)]

	all_policy_data.setdefault(policy_name, {})[policy_hash] = data


def processPolicy(all_policy_data, policy_name, policy_url):
	"""Fetch a policy page and store it in ``all_policy_data`` if this version is new.

	The page's main content is converted to markdown, its preamble removed, and
	the result hashed; a version whose hash is already stored under
	``policy_name`` is left untouched.
	"""
	main_content = extractContent(policy_url)
	markdown_content = removePreamble(extractMarkdown(main_content))
	_hash = getHash(markdown_content)

	known_versions = all_policy_data.setdefault(policy_name, {})
	if _hash not in known_versions:
		# Write into the store that was passed in, not into a global.
		_processPolicy(markdown_content, policy_name, policy_url, _hash, all_policy_data)


processPolicy(policy_breakdowns, "anthropic", "https://www.anthropic.com/legal/privacy")
# extractContent("https://www.anthropic.com/legal/privacy")

KeyboardInterrupt: 

In [None]:
import os
from dotenv import load_dotenv

# Load variables from a .env file into the environment before reading the key.
load_dotenv()

# None when GEMINI_API_KEY is not set.
_GOOGLE_API_KEY = os.environ.get("GEMINI_API_KEY")
# client = genai.Client(api_key=_GOOGLE_API_KEY)

In [None]:
from google import genai
import numpy as np
import os
from dotenv import load_dotenv
from model_parameters import DEFAULT_MODEL_NAME

# Load variables from a .env file into the environment before reading the key.
load_dotenv()

# None when GEMINI_API_KEY is not set.
_GOOGLE_API_KEY = os.environ.get("GEMINI_API_KEY")

# Shared client used by startChat() below.
client = genai.Client(api_key=_GOOGLE_API_KEY)


def send_prompt(chat, prompt):
	"""Send ``prompt`` on ``chat`` and return the ``text`` attribute of the reply."""
	return chat.send_message(prompt).text


def startChat(GOOGLE_API_KEY=None, model_name=None):
	"""Open a new chat session.

	Args:
		GOOGLE_API_KEY: optional API key. When given, a dedicated client is built
			with it; otherwise the module-level ``client`` is used.
		model_name: model to chat with; defaults to ``DEFAULT_MODEL_NAME``.

	Returns:
		The chat object returned by ``chats.create``.
	"""
	if model_name is None:
		model_name = DEFAULT_MODEL_NAME

	# An explicitly supplied key must actually be used, so it gets its own client
	# instead of silently falling back to the shared one.
	chat_client = client if GOOGLE_API_KEY is None else genai.Client(api_key=GOOGLE_API_KEY)
	return chat_client.chats.create(model=model_name)


def f():
	"""Smoke test: open a default chat, send "hello" and print the reply text."""
	print(send_prompt(startChat(), "hello"))


f()

In [None]:
from model_parameters import DEFAULT_MODEL_NAME
from _prompts import ANALYSIS_PROMPT, SUBSTRING_PROMPT

import numpy as np

import google.generativeai as embedding_genai

from google import genai
import numpy as np


def send_prompt(chat, prompt):
	"""Send ``prompt`` on ``chat`` and return the ``text`` attribute of the reply."""
	reply = chat.send_message(prompt)
	reply_text = reply.text
	return reply_text


def startChat(GOOGLE_API_KEY=None, model_name=None):
	"""Open a new chat session.

	Args:
		GOOGLE_API_KEY: optional API key. When given, a dedicated client is built
			with it; otherwise the module-level ``client`` is used.
		model_name: model to chat with; defaults to ``DEFAULT_MODEL_NAME``.

	Returns:
		The chat object returned by ``chats.create``.
	"""
	if model_name is None:
		model_name = DEFAULT_MODEL_NAME

	if GOOGLE_API_KEY is None:
		chat_client = client
	else:
		# Honour an explicitly supplied key rather than ignoring it.
		chat_client = genai.Client(api_key=GOOGLE_API_KEY)
	return chat_client.chats.create(model=model_name)


def _startChat(GOOGLE_API_KEY=None, model_name=None):
	"""Build a fresh client, open a chat, print the reply to a "hello" probe and
	return the chat.

	Missing arguments fall back to ``_GOOGLE_API_KEY`` and ``DEFAULT_MODEL_NAME``.
	"""
	api_key = _GOOGLE_API_KEY if GOOGLE_API_KEY is None else GOOGLE_API_KEY
	chosen_model = DEFAULT_MODEL_NAME if model_name is None else model_name

	session = genai.Client(api_key=api_key).chats.create(model=chosen_model)
	print(session.send_message("hello").text)
	return session


import google.generativeai as embedding_genai


def getEmbedding(semantic_datum, GOOGLE_API_KEY):
	"""Embed ``semantic_datum`` with "models/embedding-001" and return it as an array.

	The library is (re)configured with ``GOOGLE_API_KEY`` on every call; an
	AttributeError from that step is reported with a printed hint and the
	embedding request is attempted regardless.
	"""
	try:
		embedding_genai.configure(api_key=GOOGLE_API_KEY)
	except AttributeError:
		print("Please set your GEMINI_API_KEY environment variable.")

	response = embedding_genai.embed_content(
		content=semantic_datum,
		model="models/embedding-001",
		task_type="SEMANTIC_SIMILARITY",
	)
	return np.array(response["embedding"])


import ast

# Question store filled by produceQuestions(): embedding tuple ->
# {"question": text, <policy hash>: section text, ...}.
policy_qs = {}
# Similarity score above which produceQuestions() treats a new question as a
# duplicate of a stored one.
THRESHOLD = 0.97


def verifyResponse(response):
	"""Parse a model reply that is wrapped in a ```json ... ``` fence.

	Surrounding whitespace is ignored. The payload is parsed as JSON first (so
	``true``/``false``/``null`` work) and, failing that, as a Python literal so
	replies using single quotes are still accepted.

	Returns:
		The parsed object, or ``None`` when the reply is not fenced as ```json.

	Raises:
		The parser's exception, after printing the payload, when a fenced payload
		cannot be parsed either way.
	"""
	import json  # local import: this cell only imports ast

	text = response.strip()
	if not (text.startswith("```json") and text.endswith("```")):
		return None

	payload = text[len("```json") : -len("```")]
	try:
		return json.loads(payload)
	except json.JSONDecodeError:
		pass  # not strict JSON; try the Python-literal form below

	try:
		return ast.literal_eval(payload)
	except Exception:
		# Show what could not be parsed, then let the caller see the error.
		print(payload)
		raise


def getQuestions(subsection, max_attempts=5):
	"""Ask the model for questions about ``subsection`` and return the parsed reply.

	Each attempt opens a new chat, sends ANALYSIS_PROMPT followed by
	``str(subsection)``, and parses the second reply with verifyResponse().

	Args:
		subsection: policy text to analyse.
		max_attempts: how many chats to try before giving up. Bounded so that a
			model which never returns a ```json block cannot cause endless API
			calls or a RecursionError.

	Raises:
		RuntimeError: when no attempt produced a ```json-fenced reply.
	"""
	for _ in range(max_attempts):
		chat = startChat()
		send_prompt(chat, ANALYSIS_PROMPT)
		proto_questions = verifyResponse(send_prompt(chat, str(subsection)))
		if proto_questions is not None:
			return proto_questions
	raise RuntimeError(f"no parsable question set after {max_attempts} attempts")


def getSubstrings(questions, max_attempts=5):
	"""Ask the model for supporting substrings for ``questions`` and return the parsed reply.

	Each attempt opens a new chat, sends SUBSTRING_PROMPT followed by the
	questions wrapped in a ``` fence, and parses the second reply with
	verifyResponse().

	Args:
		questions: the question mapping to send (the function's own argument is
			used; there is no ``subsection`` in scope here).
		max_attempts: how many chats to try before giving up.

	Raises:
		RuntimeError: when no attempt produced a ```json-fenced reply.
	"""
	for _ in range(max_attempts):
		chat = startChat()
		send_prompt(chat, SUBSTRING_PROMPT)
		response = send_prompt(chat, f"```{str(questions)}```")
		proto_strings = verifyResponse(response)
		if proto_strings is not None:
			return proto_strings
	raise RuntimeError(f"no parsable substring set after {max_attempts} attempts")


def _cosineSimilarity(vec_a, vec_b):
	"""Return the cosine similarity of two 1-D vectors (any array-like)."""
	vec_a = np.asarray(vec_a)
	vec_b = np.asarray(vec_b)
	return np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))


def produceQuestions(policy_hash, policy_name, policy_breakdown, max_sections=1):
	"""Generate questions for a policy's sections and merge them into ``policy_qs``.

	For each processed section the model is asked for questions; every question
	is embedded and either attached to an existing ``policy_qs`` entry whose
	similarity exceeds THRESHOLD or stored as a new entry. Afterwards the
	questions linked to ``policy_hash`` are sent to getSubstrings() and every
	returned substring must occur in the stored policy markdown.

	Args:
		policy_hash: version hash used as the link key inside ``policy_qs`` entries.
		policy_name: key of the policy in ``policy_breakdowns``.
		policy_breakdown: iterable of single-item ``{hash: section_text}`` dicts.
		max_sections: number of leading sections to process. Defaults to 1 (only
			the first section); pass ``None`` to process all of them.

	Raises:
		ValueError: when a returned substring is not found in the policy markdown.
			The links to ``policy_hash`` are removed from ``policy_qs`` first.
	"""
	for index, chunk_entry in enumerate(policy_breakdown):
		if max_sections is not None and index >= max_sections:
			break
		section = list(chunk_entry.values())[0]
		proto_questions = getQuestions(section)
		for question in proto_questions.values():
			embedding = getEmbedding(question, _GOOGLE_API_KEY)
			# Find the first stored question close enough to count as the same one.
			match_key = None
			for known in policy_qs:
				if _cosineSimilarity(embedding, known) > THRESHOLD:
					match_key = known
					break
			if match_key is None:
				policy_qs[tuple(embedding)] = {policy_hash: section, "question": question}
			else:
				policy_qs[match_key][policy_hash] = section

	# Report a count only: the keys are full embedding vectors, far too large to print.
	print(f"{len(policy_qs)} question(s) tracked")

	qs_to_validate = [entry["question"] for entry in policy_qs.values() if policy_hash in entry]
	if not qs_to_validate:
		return

	qs_object = {str(i): question for i, question in enumerate(qs_to_validate)}
	proto_strings = getSubstrings(qs_object)  # TODO: also pass the policy snippet
	# TODO: sanitise internal `"` characters in the returned substrings
	policy_markdown = policy_breakdowns[policy_name][policy_hash]["policy_markdown"]
	for substring in proto_strings.values():
		if substring not in policy_markdown:
			# Unlink this policy version from every question before failing.
			for entry in policy_qs.values():
				entry.pop(policy_hash, None)
			raise ValueError(f"substring not found in policy markdown: {substring!r}")

	# TODO: add substring logic
	return

In [None]:
# policy_breakdowns["anthropic"]

# NOTE(review): the version hash is hard-coded. The lookup below raises KeyError
# unless policy_breakdowns["anthropic"] holds exactly this version — confirm it still
# matches what processPolicy() stores.
produceQuestions(
	"99d2042def1972a39015ce3aabba58396d3836e653a9b424044cf35bf0a7989f",
	"anthropic",
	policy_breakdowns["anthropic"][
		"99d2042def1972a39015ce3aabba58396d3836e653a9b424044cf35bf0a7989f"
	]["chunks"],
)

In [None]:
{
	"1": "Does the privacy policy affirm that it explains how the company collects, uses, discloses, and processes personal data?",
	"2": "Does the privacy policy affirm that it applies when the company acts as a data controller for its 'Services' and 'Commercial Services'?",
	"3": "Does the privacy policy affirm that using products like Claude.ai for personal use is considered one of its 'Services'?",
	"4": "Does the privacy policy affirm that it does not apply when the company acts as a data processor on behalf of commercial customers?",
	"5": "Does the privacy policy affirm that in cases where it acts as a data processor, the commercial customer is the data controller?",
	"6": "Does the privacy policy affirm that a separate 'Non-User Privacy Policy' provides information on how its large language models are trained?",
	"7": "Does the privacy policy affirm that it describes the user's privacy rights?",
	"8": "Does the privacy policy affirm that Section 4 ('Rights and Choices') contains more information on how to exercise privacy rights?",
	"9": "Does the privacy policy affirm that Section 11 contains specific provisions for users located in Canada?",
	"10": "Does the privacy policy affirm that Section 12 contains specific provisions for users located in Brazil?",
}

In [None]:
# def saveHash(key, content):
# 	sig = hash(content)
# 	# save to json with key = key, v = hash
# 	_id = sig
# 	if _id == sig:
# 		return True
# 	# pass
# 	return False


# def extractContent(url, headers=None):
# 	if headers is None:
# 		headers = {
# 			"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
# 		}

# 		response = requests.get(url, headers=headers, timeout=10)
# 		response.raise_for_status()
# 		html_content = response.text

# 		soup = BeautifulSoup(html_content, "lxml")

# 		main_content_element = soup.find("main")

# 		_markdown_content = md(str(main_content_element), heading_style="ATX")
# 		markdown_content = removePreamble(_markdown_content)
# 		return markdown_content


# def saveMdFile(content, name):
# 	if not name.endswith(".md"):
# 		name = name + ".md"

# 	with open(name, "w", encoding="utf-8") as file:
# 		file.write(content)


# def collatePolicy(data_source):
# 	for k, v in data_source.items():
# 		markdown_content = extractContent(v)
# 		if saveHash(k, markdown_content):
# 			# Runs LLM Question analysis of content:
# 			# adds data and new version questions to question json
# 			# reanalyses all saved privacy policies against that # (Do at end)
# 			# updates gui
# 			pass

# 		saveMdFile(markdown_content, k)
# 	return splitMarkdown(markdown_content)

In [None]:
# url = "https://openai.com/policies/privacy-policy/"
# url = "https://www.gemini.com/en-SG/legal/privacy-policy"
# url = "https://www.anthropic.com/legal/privacy"

# data_source = {
# 	# "gemini": "https://www.gemini.com/en-SG/legal/privacy-policy",
# 	# "openai": "https://openai.com/policies/privacy-policy/",
# 	"anthropic": "https://www.anthropic.com/legal/privacy",
# }
# import requests
# from bs4 import BeautifulSoup
# from markdownify import markdownify as md
# import re


# def saveHash(key, content):
# 	sig = hash(content)
# 	# save to json with key = key, v = hash
# 	_id = sig
# 	if _id == sig:
# 		return True
# 	# pass
# 	return False


# def splitMarkdown(markdown_text, removeheaders=True):
# 	heading_pattern = r"^#{1,6}\s+.*"
# 	parts = re.split(heading_pattern, markdown_text, flags=re.MULTILINE)
# 	content_list = [part.strip() for part in parts[1:] if part.strip()]
# 	return content_list


# def removePreamble(markdown_text):
# 	pattern = r"\A.*?(?=^#\s)"
# 	cleaned_text = re.sub(pattern, "", markdown_text, flags=re.DOTALL | re.MULTILINE)
# 	return cleaned_text


# def extractContent(url, headers=None):
# 	if headers is None:
# 		headers = {
# 			"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
# 		}

# 		response = requests.get(url, headers=headers, timeout=10)
# 		response.raise_for_status()
# 		html_content = response.text

# 		soup = BeautifulSoup(html_content, "lxml")

# 		main_content_element = soup.find("main")

# 		_markdown_content = md(str(main_content_element), heading_style="ATX")
# 		markdown_content = removePreamble(_markdown_content)
# 		return markdown_content


# def saveMdFile(content, name):
# 	if not name.endswith(".md"):
# 		name = name + ".md"

# 	with open(name, "w", encoding="utf-8") as file:
# 		file.write(content)


# def collatePolicy(data_source):
# 	split_markdown = {}
# 	md = ""
# 	for k, v in data_source.items():
# 		markdown_content = extractContent(v)
# 		# if saveHash(k, markdown_content):
# 		# 	# Runs LLM Question analysis of content:
# 		# 	# adds data and new version questions to question json
# 		# 	# reanalyses all saved privacy policies against that # (Do at end)
# 		# 	# updates gui
# 		# 	pass

# 		# saveMdFile(markdown_content, k)
# 		split_markdown[k] = splitMarkdown(markdown_content)
# 		md = markdown_content

# 	# doc_str = f"""
# 	# ```json
# 	# {split_markdown}
# 	# ```
# 	# """
# 	# saveMdFile(doc_str, "output")
# 	# return split_markdown

# 	return md

In [None]:
# collatePolicy(data_source)

After each prompt/response pair, regex-check the returned substrings against the policy text and retry until 100% of them match.


Parallelise

In [None]:
# import os
# import google.generativeai as genai
# import numpy as np


# def calculate_similarity(question1, question2):
# 	try:
# 		genai.configure(api_key="")
# 	except AttributeError:
# 		print("Please set your GEMINI_API_KEY environment variable.")

# 	result1 = genai.embed_content(
# 		model="models/embedding-001", content=question1, task_type="SEMANTIC_SIMILARITY"
# 	)
# 	result2 = genai.embed_content(
# 		model="models/embedding-001", content=question2, task_type="SEMANTIC_SIMILARITY"
# 	)

# 	embedding1 = np.array(result1["embedding"])
# 	embedding2 = np.array(result2["embedding"])

# 	dot_product = np.dot(embedding1, embedding2)
# 	norm1 = np.linalg.norm(embedding1)
# 	norm2 = np.linalg.norm(embedding2)

# 	similarity_score = dot_product / (norm1 * norm2)

# 	return similarity_score


# # If threshold > 0.95 flag as similar