In [26]:
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md
import re
import datetime
import hashlib

# Timestamp string captured once, when this cell runs.
current_date = str(datetime.datetime.now())

In [27]:
# # String methods


def isInString(string, substring):
	"""Return True when ``substring`` occurs anywhere inside ``string``.

	This is a thin wrapper around the ``in`` operator, so an empty
	``substring`` is always reported as present.
	"""
	found = substring in string
	return found


def getSubstringIndices(string, substring):
	"""Locate the first occurrence of ``substring`` inside ``string``.

	Returns:
		A ``(start_index, end_index)`` tuple such that
		``string[start_index:end_index] == substring``. When ``substring``
		does not occur, ``(-1, -1)`` is returned.
	"""
	start_index = string.find(substring)
	if start_index == -1:
		# str.find reports "not found" as -1; adding len(substring) to it
		# would produce a misleading end index, so return an explicit sentinel.
		return (-1, -1)
	return (start_index, start_index + len(substring))


# def getHash(data):
# 	# return hash(data)
# 	return hashlib.sha256(data.encode("utf-8")).hexdigest()

# Empty dict. NOTE(review): no visible code reads this singular module-level
# name (later cells use `policy_breakdowns`) — confirm whether it is still needed.
policy_breakdown = {}

# Illustrative shape of the stored policy data:
#   policy name -> version hash -> record.
# The "name:type" keys and the values are placeholders, not real data.
# NOTE(review): the record here uses the key "date", whereas _processPolicy
# stores the timestamp under "access_data" — confirm which one is intended.
policy_template = {
	"policy_name:string": {
		"version_hash:string": {"date": "", "policy_md": "string", "policy_breakdown": [""]}
	}
}
question_relation: {
policy_name,
version_hash,
relevant_substring?,
relevant_substring_idx?,
}
question_data needs to include:
	- question:string
	- origin:question_relation
	- other_policies:question_relation[]

In [28]:
# Markdown Methods


def splitMarkdown(markdown_text):
	"""Split Markdown into the body text found under each ATX heading.

	The heading lines themselves (``#`` to ``######``) are discarded. Any
	text before the first heading is dropped, and sections that are empty
	after stripping whitespace are skipped.

	Returns:
		A list of stripped section bodies in document order; an empty list
		when the text contains no headings.
	"""
	# The separator after the hashes must be horizontal whitespace only.
	# A plain `\s+` can consume the newline after an empty heading ("## ")
	# and then swallow the first line of the following section.
	heading_pattern = r"^#{1,6}[^\S\r\n]+.*"
	parts = re.split(heading_pattern, markdown_text, flags=re.MULTILINE)
	# parts[0] is whatever precedes the first heading; it is not a section.
	return [part.strip() for part in parts[1:] if part.strip()]


def removePreamble(markdown_text):
	"""Drop everything that precedes the first level-1 heading line.

	The text is returned unchanged when no line starts with ``#`` followed
	by whitespace, or when it already begins with such a line.
	"""
	# Lazily consume from the very start up to (not including) the first
	# line that opens with "#" + whitespace.
	preamble = re.compile(r"\A.*?(?=^#\s)", re.DOTALL | re.MULTILINE)
	return preamble.sub("", markdown_text)


# Inline Markdown link "[label](target)": group 1 is the bracketed label
# (brackets included), group 2 is the target between the parentheses.
MARKDOWN_LINK_PATTERN = re.compile(r"(\[.*?\])\((.*?)\)")
# Fixed text substituted for every link target, parentheses included.
URL_PLACEHOLDER = "(DYNAMIC_URL_REMOVED)"


def normalize_markdown_links(markdown_text):
	"""Replace the target of every inline Markdown link with a placeholder.

	The bracketed label is kept as-is; only the parenthesised target is
	swapped for ``URL_PLACEHOLDER``. Text without links is returned unchanged.
	"""
	return MARKDOWN_LINK_PATTERN.sub(
		lambda link: f"{link.group(1)}{URL_PLACEHOLDER}",
		markdown_text,
	)


def getHash(data):
	"""Return the SHA-256 hex digest of ``data`` after link normalisation.

	Link targets are replaced via ``normalize_markdown_links`` before
	hashing, so two texts that differ only in link targets hash the same.
	"""
	digest = hashlib.sha256()
	digest.update(normalize_markdown_links(data).encode("utf-8"))
	return digest.hexdigest()

In [29]:
# URL Methods

# url = "https://openai.com/policies/privacy-policy/"
# url = "https://www.gemini.com/en-SG/legal/privacy-policy"
# url = "https://www.anthropic.com/legal/privacy"

# Short provider name -> URL of that provider's privacy-policy page.
data_source = {
	"gemini": "https://www.gemini.com/en-SG/legal/privacy-policy",
	"openai": "https://openai.com/policies/privacy-policy/",
	"anthropic": "https://www.anthropic.com/legal/privacy",
}


def extractContent(url, headers=None):
	"""Download ``url`` and return the ``<main>`` element of the page.

	Args:
		url: Address of the page to fetch.
		headers: Optional request headers. When omitted, a desktop-browser
			``User-Agent`` header is sent.

	Returns:
		The first ``<main>`` element found by BeautifulSoup, or ``None`` when
		the page has no such element.

	Raises:
		requests.HTTPError: If the server answers with an error status.
	"""
	if headers is None:
		# Presumably a browser-like User-Agent avoids bot-blocking pages —
		# TODO confirm against the target sites.
		headers = {
			"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
		}

	# Fetching and parsing must happen regardless of where the headers came
	# from; only the default-header assignment belongs under the `if`.
	response = requests.get(url, headers=headers, timeout=10)
	response.raise_for_status()

	soup = BeautifulSoup(response.text, "lxml")
	return soup.find("main")


def extractMarkdown(main_content):
	"""Convert ``main_content`` to Markdown with ATX (``#``) headings.

	The argument is stringified first and then passed to markdownify.
	"""
	html_text = str(main_content)
	return md(html_text, heading_style="ATX")

In [30]:
# Store of processed policies: policy name -> content hash -> record
# (records are written by _processPolicy). Re-running this cell empties it.
policy_breakdowns = {}

In [31]:
# Policy Methods


def _processPolicy(markdown_content, policy_name, policy_url, policy_hash, all_policy_data=None):
	"""Build the record for one policy version and store it.

	Args:
		markdown_content: Markdown text of the policy.
		policy_name: Key of the policy in the store.
		policy_url: Source URL. NOTE(review): currently not written into the
			record — confirm whether it should be.
		policy_hash: Content hash identifying this version.
		all_policy_data: Store to write into. Defaults to the module-level
			``policy_breakdowns`` so existing four-argument calls keep working.
	"""
	if all_policy_data is None:
		all_policy_data = policy_breakdowns

	record = {
		"policy_md": markdown_content,
		"policy_breakdown": splitMarkdown(markdown_content),
		# NOTE(review): the key is spelled "access_data" although it holds a
		# timestamp; kept unchanged because stored data may rely on it.
		"access_data": datetime.datetime.now().__str__(),
	}
	# setdefault tolerates a policy name that has no entry yet.
	all_policy_data.setdefault(policy_name, {})[policy_hash] = record


def processPolicy(all_policy_data, policy_name, policy_url):
	"""Fetch a policy page and record it if this exact version is new.

	The page is downloaded, converted to Markdown, stripped of its preamble
	and hashed. A record is added under ``all_policy_data[policy_name]`` only
	when that hash is not already present.

	Raises:
		ValueError: If the page has no ``<main>`` element to extract.
	"""
	main_content = extractContent(policy_url)
	if main_content is None:
		# Without this guard the literal text "None" would be converted,
		# hashed and stored as if it were a policy.
		raise ValueError(f"No <main> element found at {policy_url}")

	markdown_content = removePreamble(extractMarkdown(main_content))
	_hash = getHash(markdown_content)

	known_versions = all_policy_data.setdefault(policy_name, {})
	if _hash not in known_versions:
		# Write into the store the caller passed in, not the module global.
		_processPolicy(markdown_content, policy_name, policy_url, _hash, all_policy_data)
	return


# Fetch and record the current Anthropic policy. The URL is read from
# data_source so that it is defined in exactly one place.
processPolicy(policy_breakdowns, "anthropic", data_source["anthropic"])

In [None]:
def produceQuestions(policy_breakdown):
	"""Generate questions for each item of ``policy_breakdown`` and embed them.

	Iterates the argument it is given (not the module-level store), calls
	``generate_qs`` on every item and ``computEmbedding`` on every question.

	NOTE(review): ``generate_qs`` and ``computEmbedding`` are not defined in
	the visible notebook — they must exist before this is called with a
	non-empty argument. Nothing is returned or saved yet.
	"""
	for item in policy_breakdown:
		for question in generate_qs(item):
			computEmbedding(question)


# after each q generated, we will compute the embedding vector and save.
# find and replace similarity >0.98 with reference to the first

In [32]:
# Show every stored version of the "anthropic" policy, keyed by content hash.
policy_breakdowns["anthropic"]

{'99d2042def1972a39015ce3aabba58396d3836e653a9b424044cf35bf0a7989f': {'policy_md': '# Privacy Policy\n\nEffective October 8, 2025[Previous Version](/legal/archive/fbe2e87e-8f4e-4bba-9e5e-a6a8b15afc0e)\n\nEnglish\n\nAnthropic is an AI safety and research company working to build reliable, interpretable, and steerable AI systems.\n\nThis Privacy Policy explains how we collect, use, disclose, and process your personal data when you use our website and other places where Anthropic acts as a\xa0*data controller*—for example, when you interact with Claude.ai or other products as a consumer for personal use ("**Services**") or when Anthropic operates and provides our commercial customers and their end users with access to our commercial products, such as the Claude Team plan (“**Commercial Services**”).\n\nThis Privacy Policy does not apply where Anthropic acts as a\xa0*data processor*and processes personal data on behalf of commercial customers using Anthropic’s Commercial Services – for exa

In [33]:
# def saveHash(key, content):
# 	sig = hash(content)
# 	# save to json with key = key, v = hash
# 	_id = sig
# 	if _id == sig:
# 		return True
# 	# pass
# 	return False


# def extractContent(url, headers=None):
# 	if headers is None:
# 		headers = {
# 			"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
# 		}

# 		response = requests.get(url, headers=headers, timeout=10)
# 		response.raise_for_status()
# 		html_content = response.text

# 		soup = BeautifulSoup(html_content, "lxml")

# 		main_content_element = soup.find("main")

# 		_markdown_content = md(str(main_content_element), heading_style="ATX")
# 		markdown_content = removePreamble(_markdown_content)
# 		return markdown_content


# def saveMdFile(content, name):
# 	if not name.endswith(".md"):
# 		name = name + ".md"

# 	with open(name, "w", encoding="utf-8") as file:
# 		file.write(content)


# def collatePolicy(data_source):
# 	for k, v in data_source.items():
# 		markdown_content = extractContent(v)
# 		if saveHash(k, markdown_content):
# 			# Runs LLM Question analysis of content:
# 			# adds data and new version questions to question json
# 			# reanalyses all saved privacy policies against that # (Do at end)
# 			# updates gui
# 			pass

# 		saveMdFile(markdown_content, k)
# 	return splitMarkdown(markdown_content)

In [34]:
# url = "https://openai.com/policies/privacy-policy/"
# url = "https://www.gemini.com/en-SG/legal/privacy-policy"
# url = "https://www.anthropic.com/legal/privacy"

# data_source = {
# 	# "gemini": "https://www.gemini.com/en-SG/legal/privacy-policy",
# 	# "openai": "https://openai.com/policies/privacy-policy/",
# 	"anthropic": "https://www.anthropic.com/legal/privacy",
# }
# import requests
# from bs4 import BeautifulSoup
# from markdownify import markdownify as md
# import re


# def saveHash(key, content):
# 	sig = hash(content)
# 	# save to json with key = key, v = hash
# 	_id = sig
# 	if _id == sig:
# 		return True
# 	# pass
# 	return False


# def splitMarkdown(markdown_text, removeheaders=True):
# 	heading_pattern = r"^#{1,6}\s+.*"
# 	parts = re.split(heading_pattern, markdown_text, flags=re.MULTILINE)
# 	content_list = [part.strip() for part in parts[1:] if part.strip()]
# 	return content_list


# def removePreamble(markdown_text):
# 	pattern = r"\A.*?(?=^#\s)"
# 	cleaned_text = re.sub(pattern, "", markdown_text, flags=re.DOTALL | re.MULTILINE)
# 	return cleaned_text


# def extractContent(url, headers=None):
# 	if headers is None:
# 		headers = {
# 			"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
# 		}

# 		response = requests.get(url, headers=headers, timeout=10)
# 		response.raise_for_status()
# 		html_content = response.text

# 		soup = BeautifulSoup(html_content, "lxml")

# 		main_content_element = soup.find("main")

# 		_markdown_content = md(str(main_content_element), heading_style="ATX")
# 		markdown_content = removePreamble(_markdown_content)
# 		return markdown_content


# def saveMdFile(content, name):
# 	if not name.endswith(".md"):
# 		name = name + ".md"

# 	with open(name, "w", encoding="utf-8") as file:
# 		file.write(content)


# def collatePolicy(data_source):
# 	split_markdown = {}
# 	md = ""
# 	for k, v in data_source.items():
# 		markdown_content = extractContent(v)
# 		# if saveHash(k, markdown_content):
# 		# 	# Runs LLM Question analysis of content:
# 		# 	# adds data and new version questions to question json
# 		# 	# reanalyses all saved privacy policies against that # (Do at end)
# 		# 	# updates gui
# 		# 	pass

# 		# saveMdFile(markdown_content, k)
# 		split_markdown[k] = splitMarkdown(markdown_content)
# 		md = markdown_content

# 	# doc_str = f"""
# 	# ```json
# 	# {split_markdown}
# 	# ```
# 	# """
# 	# saveMdFile(doc_str, "output")
# 	# return split_markdown

# 	return md

In [35]:
# collatePolicy(data_source)

After each response pair, regex until 100% match on substrings.


Parallelise

In [36]:
# import os
# import google.generativeai as genai
# import numpy as np


# def calculate_similarity(question1, question2):
# 	try:
# 		genai.configure(api_key="")
# 	except AttributeError:
# 		print("Please set your GEMINI_API_KEY environment variable.")

# 	result1 = genai.embed_content(
# 		model="models/embedding-001", content=question1, task_type="SEMANTIC_SIMILARITY"
# 	)
# 	result2 = genai.embed_content(
# 		model="models/embedding-001", content=question2, task_type="SEMANTIC_SIMILARITY"
# 	)

# 	embedding1 = np.array(result1["embedding"])
# 	embedding2 = np.array(result2["embedding"])

# 	dot_product = np.dot(embedding1, embedding2)
# 	norm1 = np.linalg.norm(embedding1)
# 	norm2 = np.linalg.norm(embedding2)

# 	similarity_score = dot_product / (norm1 * norm2)

# 	return similarity_score


# # If threshold > 0.95 flag as similar