# In [None]:
# Prompt instructing the model to turn every affirmation found in a privacy
# policy excerpt into an ID-keyed JSON object of "Does the privacy policy
# affirm that..." questions. The worked example must itself obey the rules it
# illustrates: every value starts with the exact prefix, and each question
# covers a different affirmation of the excerpt.
prompt2 = """

**Role:** You are a Data Protection Officer (DPO) specializing in analyzing and tracking the evolution of Privacy Policies. Your expertise lies in identifying actionable statements and core privacy principles within legal texts.

**Objective:** To thoroughly analyze an excerpt of a privacy policy and extract all concrete statements (affirmations) made within it, then rephrase these affirmations as a set of concise, "True" questions, each with a unique identifier.

**Task Breakdown:**

1.  **Policy Excerpt Analysis:**
    *   Carefully read the provided `Privacy Policy Excerpt`.
    *   Identify all explicit statements, declarations, or affirmations made by the policy regarding the collection, processing, storage, sharing, security, transfer, or user rights concerning personal data.
    *   Focus on what the policy *states as a fact* or *affirms as its practice or intention*.

2.  **Identification of Key Affirmations:**
    *   For each identified statement, determine if it represents a significant action, condition, or commitment related to personal data handling.
    *   Assign an identifier (e.g., "Q1", "Q2", "Q3") to each distinct affirmation.

3.  **Question Formulation:**
    *   For each key affirmation identified in Step 2, formulate a question that:
        *   **Starts with the precise prefix:** `Does the privacy policy affirm that...`
        *   **Is followed by a statement that is *verifiably true* based *only* on the provided excerpt.** The statement must directly reflect an affirmation from the text.
        *   **Ends with a question mark (`?`).**
        *   Is concise and directly to the point, avoiding speculation or information not explicitly stated in the excerpt.
    *   Present each question with its assigned ID.

4.  **Output Generation:**
    *   Present the formulated questions as a JSON, where each item includes the question ID and the question itself. The order of questions in this list does not matter for this step; simply list them as they are identified.

**Important Considerations:**
*   **Truthfulness:** Every statement embedded within your question *must* be directly supported and affirmed by the exact wording of the provided `Privacy Policy Excerpt`. Do not infer or speculate.
*   **Completeness (within excerpt):** Aim to capture all significant affirmations related to personal data within the given text.
*   **Conciseness:** Keep each question focused on a single, clear point.

**Example Input**
```Privacy Policy Excerpt
We may store and process personal information collected on our site in the United States or any other country in which Corperation Inc. or its agents maintain facilities. By using our services, you consent to the transfer of your information among these facilities, including those located outside your country.
```
**Example Response**
```
{
"Q1": "Does the privacy policy affirm that personal data can be transferred outside of the user's country of origin?",
"Q2": "Does the privacy policy affirm that personal data transfers are automatically consented to by using the service?",
"Q3": "Does the privacy policy affirm that personal information may be stored and processed in the United States or any other country in which the company or its agents maintain facilities?"
}

```
"""


# Privacy-policy pages to track, keyed by a short provider name. The key is
# also what collatePolicy uses as the Markdown file name for each policy.
data_source = dict(
	gemini="https://www.gemini.com/en-SG/legal/privacy-policy",
	openai="https://openai.com/policies/privacy-policy/",
	anthropic="https://www.anthropic.com/legal/privacy",
)

# Single-page target for ad-hoc runs; currently the Anthropic policy.
url = data_source["anthropic"]
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md
import re


def saveHash(key, content):
	"""Placeholder for recording a hash of ``content`` under ``key``.

	Persistence is not implemented yet: the hash is computed but never
	written anywhere or compared with a previous value, so every call
	reports True.

	Args:
		key: Name the hash is meant to be stored under (currently unused).
		content: Hashable value passed to the built-in ``hash()``.

	Returns:
		Always True.
	"""
	digest = hash(content)
	# TODO: save to json with key = key, v = hash
	# NOTE(review): built-in hash() of a str is salted per interpreter
	# process (unless PYTHONHASHSEED is fixed), so a value persisted across
	# runs would need a stable digest such as one from hashlib - confirm
	# before implementing the save.
	stored = digest  # stands in for the value that would be read back
	return stored == digest


def splitMarkdown(markdown_text):
	"""Split Markdown into the body text found under each ATX heading.

	Heading lines themselves are discarded, and so is any text that precedes
	the first heading. Sections that are empty after stripping are dropped.

	Args:
		markdown_text: Markdown source using ATX (``#`` .. ``######``) headings.

	Returns:
		A list of stripped section bodies in document order; empty when the
		text contains no heading.
	"""
	# The marker must be followed by a space/tab or the end of the line.
	# Matching generic whitespace here would let an empty-titled heading
	# ("## " or "#") run across the newline and swallow the next line.
	heading_pattern = r"^#{1,6}(?:[ \t]+.*)?$"
	parts = re.split(heading_pattern, markdown_text, flags=re.MULTILINE)
	# parts[0] is whatever came before the first heading (possibly "").
	return [part.strip() for part in parts[1:] if part.strip()]


def removePreamble(markdown_text):
	"""Drop everything that precedes the first top-level (``#``) heading.

	Args:
		markdown_text: Markdown source to trim.

	Returns:
		The text starting at the first line that begins with a single ``#``
		followed by whitespace, or the text unchanged when no such line
		exists.
	"""
	first_h1 = re.search(r"^#\s", markdown_text, flags=re.MULTILINE)
	if first_h1 is None:
		return markdown_text
	return markdown_text[first_h1.start():]


def extractContent(url, headers=None):
	"""Download a web page and return its main content as Markdown.

	Args:
		url: Address of the page to fetch.
		headers: Optional HTTP request headers. When omitted, a desktop
			Chrome ``User-Agent`` is sent.

	Returns:
		The Markdown rendering (ATX headings) of the page's ``<main>``
		element, passed through removePreamble. When the page has no
		``<main>`` element, ``<body>`` (or failing that the whole parsed
		document) is converted instead.

	Raises:
		requests.HTTPError: If the server answers with a 4xx/5xx status.
		requests.RequestException: For connection failures or when the
			10-second timeout expires.
	"""
	if headers is None:
		headers = {
			"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
		}

	# The fetch must run whether or not the caller supplied headers; it used
	# to sit inside the branch above, so custom headers returned None.
	response = requests.get(url, headers=headers, timeout=10)
	response.raise_for_status()
	html_content = response.text

	soup = BeautifulSoup(html_content, "lxml")

	main_content_element = soup.find("main")
	if main_content_element is None:
		# Without this fallback str(None) would be converted, yielding the
		# literal text "None" instead of the page content.
		main_content_element = soup.body if soup.body is not None else soup

	_markdown_content = md(str(main_content_element), heading_style="ATX")
	markdown_content = removePreamble(_markdown_content)
	return markdown_content


def saveMdFile(content, name):
	"""Write ``content`` to a UTF-8 Markdown file.

	Args:
		content: Text to write.
		name: Target path; ``.md`` is appended unless the name already ends
			with it. An existing file is overwritten.
	"""
	target = name if name.endswith(".md") else name + ".md"
	with open(target, "w", encoding="utf-8") as out:
		out.write(content)


def collatePolicy(data_source):
	"""Fetch every policy in ``data_source`` and save each as a Markdown file.

	Args:
		data_source: Mapping of a short name to the policy page URL. The
			name is used as the output file name.

	Returns:
		The heading-delimited sections (see splitMarkdown) of the LAST policy
		processed, or an empty list when ``data_source`` is empty.
	"""
	# Stays None when the mapping is empty, so the final return no longer
	# fails on an unbound local.
	markdown_content = None
	for k, v in data_source.items():
		markdown_content = extractContent(v)
		if saveHash(k, markdown_content):
			# Runs LLM Question analysis of content:
			# adds data and new version questions to question json
			# reanalyses all saved privacy policies against that # (Do at end)
			# updates gui
			pass

		saveMdFile(markdown_content, k)

	if markdown_content is None:
		return []
	# NOTE(review): only the final policy's sections are returned; confirm
	# whether callers expect sections for every policy instead.
	return splitMarkdown(markdown_content)