In [None]:
!pip install openai==0.28.0



In [None]:
import openai
import os
from google.colab import userdata


In [None]:
# Fetch the value stored under the name 'openai' through google.colab's userdata helper;
# the checks in the following cells treat a falsy result as "key not configured".
openai_api_key = userdata.get('openai')

In [None]:
# Check if the key is correctly obtained
# Fail fast with an explicit message when the lookup returned nothing (None or an empty string).
if not openai_api_key:
    raise ValueError("OpenAI API key not found. Please check the key setup.")

# Set the OpenAI key
# Stored as a module-level attribute of the openai package for the calls made later in the notebook.
openai.api_key = openai_api_key

In [None]:
# Reset the key-file path setting so that only the key assigned to openai.api_key is configured.
openai.api_key_path = None  # Ensure it's not looking for a file path


In [None]:
# Set model name to a fallback if needed
# Name of the OpenAI chat model intended for the GPT-based Q&A generation in this notebook.
model_name = "gpt-3.5-turbo"

In [None]:
# Verify API key setup
# Reports whether openai.api_key currently holds a truthy value; it does not contact the API,
# so it cannot tell whether the key is valid or has remaining quota.
if openai.api_key:
    print("OpenAI API Key is set correctly!")
else:
    print("Issue setting the OpenAI API Key.")

OpenAI API Key is set correctly!


In [None]:
!pip install PyPDF2



In [None]:
import PyPDF2

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [None]:
# Function to extract text from PDF
def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page in a PDF file.

    Args:
        file_path: Path of the PDF to read; the file is opened in binary mode.

    Returns:
        str: The text of all pages joined in page order with no separator
        between pages. A page whose ``extract_text()`` result is empty or
        ``None`` contributes an empty string.
    """
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Pages without a text layer can come back empty (and, depending on the
        # PyPDF2 version, possibly as None); `or ""` keeps the join from raising
        # TypeError on such pages.
        return "".join(page.extract_text() or "" for page in reader.pages)


In [None]:
# Setup T5 model and tokenizer
# The checkpoint id gets its own name so that loading T5 does not overwrite `model_name`,
# which an earlier cell set to the OpenAI chat model name ("gpt-3.5-turbo").
t5_model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(t5_model_name)
model = T5ForConditionalGeneration.from_pretrained(t5_model_name)

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Function to generate Q&A with the custom prompt template and difficulty levels
def generate_qa(input_text, difficulty="Easy", num_questions=2):
    """Sample Q&A text from the notebook-level ``model`` using a structured prompt.

    The prompt is built from fixed instruction sentences, a per-difficulty hint
    and ``input_text``. It is encoded with the notebook-level ``tokenizer``
    (truncated to 1024 tokens) and passed to ``model.generate`` with sampling
    enabled, so repeated calls can return different text.

    Args:
        input_text: Context text placed at the end of the prompt.
        difficulty: "Easy", "Medium" or "Hard"; selects the hint sentence.
        num_questions: Number written into the instruction sentence and also
            passed to ``model.generate`` as ``num_return_sequences``.

    Returns:
        list[str]: One decoded string (special tokens removed) per sequence
        returned by ``model.generate``.

    Raises:
        KeyError: If ``difficulty`` is not one of the three supported labels.
    """
    difficulty_hints = {
        "Easy": "Basic understanding and direct application of GDPR rules.",
        "Medium": "Moderate interpretation of GDPR compliance practices.",
        "Hard": "In-depth analysis and handling of complex GDPR compliance scenarios."
    }
    # Look the hint up first so an unsupported difficulty fails before any model work.
    hint = difficulty_hints[difficulty]

    # Every sentence carries its own final period; the template below adds none,
    # which avoids the doubled ".." the prompt previously contained.
    opening = (
        "You are very capable.",
        "You are a compliance and legal expert.",
        f"Generate {num_questions} {difficulty} level questions and answers.",
    )
    guidance = (
        "The questions are aimed at compliance and legal experts working with startups and law firms.",
        "Each answer should be a clear single sentence, followed by bullet points for additional details as necessary.",
        "Ensure clarity by avoiding excessive legal jargon, while maintaining necessary technical precision.",
    )

    # Combine the inputs to create a custom prompt
    full_prompt = (
        " ".join(opening) + "\n"
        + " ".join(guidance) + "\n"
        + f"{hint}\nHere is the context:\n\n{input_text}\n"
    )

    # Encode and generate responses
    tokens = tokenizer.encode(full_prompt, return_tensors="pt", max_length=1024, truncation=True)
    output = model.generate(
        tokens,
        max_length=200,
        num_return_sequences=num_questions,
        do_sample=True,  # Enable sampling
        temperature=0.8,  # Increase for more diverse answers
        top_p=0.9        # Focus on high-probability but diverse responses
    )
    return [tokenizer.decode(ans, skip_special_tokens=True) for ans in output]


In [None]:
# Extract GDPR text from the uploaded GDPR PDF
# The file is expected at /content/GDPR.pdf; the resulting string is embedded in the
# prompts built by the generation loops in later cells.
gdpr_text = extract_text_from_pdf("/content/GDPR.pdf")

# List of uploaded privacy policy PDF files in the same directory
# Only bitpay.pdf is active. Every entry ends with a comma so that re-enabling a
# commented-out path adds a new list element instead of being silently concatenated
# onto the previous string literal.
pdf_files = [
    "/content/bitpay.pdf",
    # "/content/bilt.pdf",
    # "/content/plaid.pdf",
    # "/content/moonpay.pdf",
    # "/content/gemini.pdf",
    # "/content/circle.pdf",
    # "/content/remitly.pdf",
    # "/content/stripe.pdf",
    # "/content/klarna.pdf",
    # "/content/revolut.pdf",
]

In [None]:
# Loop through the files, extract text, and generate Q&A
# NOTE(review): the result variables are reassigned on every iteration, so once the loop
# finishes only the last file's Q&A is still available to later cells.
for file_path in pdf_files:
    # Extract privacy policy text
    privacy_policy_text = extract_text_from_pdf(file_path)

    # Generate Easy, Medium, and Hard Q&A from the GDPR text followed by the policy text.
    # No num_questions is passed, so generate_qa's default count applies to each call.
    # NOTE(review): generate_qa truncates its prompt during tokenization and gdpr_text is
    # placed first, so the policy text is probably cut off before it reaches the model —
    # TODO confirm.
    policy_based_input = (
        f"Based on the following GDPR guidelines:\n\n{gdpr_text}\n\n"
        f"Generate questions that would apply based on GDPR rules and the privacy policy:\n\n{privacy_policy_text}"
    )
    easy_policy_qa = generate_qa(policy_based_input, difficulty="Easy")
    medium_policy_qa = generate_qa(policy_based_input, difficulty="Medium")
    hard_policy_qa = generate_qa(policy_based_input, difficulty="Hard")

    # Generate Easy, Medium, and Hard Q&A about GDPR application using only the privacy
    # policy content (again with generate_qa's default count).
    gdpr_based_input = (
        f"Based on the following privacy policy:\n\n{privacy_policy_text}\n\n"
        f"Generate questions about GDPR compliance and answer them using information from the privacy policy."
    )
    easy_gdpr_qa = generate_qa(gdpr_based_input, difficulty="Easy")
    medium_gdpr_qa = generate_qa(gdpr_based_input, difficulty="Medium")
    hard_gdpr_qa = generate_qa(gdpr_based_input, difficulty="Hard")


In [None]:
# Print results
def _print_level(heading, qa_items):
    """Print a difficulty heading followed by one ' - ' bullet per Q&A string."""
    print(heading)
    for item in qa_items:
        print(f" - {item}")


# Q&A generated from the GDPR text plus the policy
print(f"Q&A for {file_path} (Policy-based):\n")
_print_level("Easy Questions:", easy_policy_qa)
_print_level("\nMedium Questions:", medium_policy_qa)
_print_level("\nHard Questions:", hard_policy_qa)

# Q&A generated from the policy alone
print(f"\nQ&A for {file_path} (GDPR-specific application):\n")
_print_level("Easy Questions:", easy_gdpr_qa)
_print_level("\nMedium Questions:", medium_gdpr_qa)
_print_level("\nHard Questions:", hard_gdpr_qa)

# Separator rule closing the report
print("\n" + "=" * 50 + "\n")

Q&A for /content/bitpay.pdf (Policy-based):

Easy Questions:
 - Ensure clarity by avoiding excessive legal jargon, while maintaining necessary technical precision
 - Ensure clarity by avoiding excessive legal jargon, while maintaining necessary technical precision.
 - Ensure clarity by avoiding excessive legal jargon
 - Directive 95/46/EC of the European Parliament and of the Council (General Data Protection Regulation)

Medium Questions:
 - Ensure clarity by avoiding excessive legal jargon, while maintaining necessary technical precision..
 - European Parliament and the Council of 14 Apr il 2016

Hard Questions:
 - EU 2016/679

Q&A for /content/bitpay.pdf (GDPR-specific application):

Easy Questions:
 - BitPay is considered the controller for processing personal data as described in this Notice.
 - BitPay will process your personal data for you only as of the date that you submit it to us.

Medium Questions:
 - BitPay is a controller for processing personal data by us
 - a person’s le

In [None]:
# Function to generate Q&A using GPT-3.5 Turbo
def generate_qa_gpt(input_text, difficulty="Easy", chat_model="gpt-3.5-turbo"):
    """Ask an OpenAI chat model for Q&A text about the given context.

    Args:
        input_text: Context text appended to the end of the user prompt.
        difficulty: Label written into the instruction sentence (e.g. "Easy",
            "Medium", "Hard"); it is not validated here.
        chat_model: OpenAI chat model id to use. It is an explicit parameter
            rather than the notebook-level ``model_name`` because other cells
            rebind that name to Hugging Face checkpoint ids.

    Returns:
        str: The message content of the first choice in the API response.

    Errors raised by ``openai.ChatCompletion.create`` propagate to the caller.
    """
    prompt = (
        f"You are a compliance and legal expert. Generate {difficulty} level questions and answers.\n"
        f"The questions are aimed at compliance and legal experts working with startups and law firms. "
        f"Each answer should be a clear single sentence, followed by bullet points for additional details as necessary. "
        f"Ensure clarity by avoiding excessive legal jargon, while maintaining necessary technical precision.\n"
        f"Here is the context:\n\n{input_text}"
    )

    response = openai.ChatCompletion.create(
        model=chat_model,
        messages=[
            {"role": "system", "content": "You are a highly skilled compliance expert."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=1500,
        temperature=0.7
    )

    return response.choices[0]['message']['content']

In [None]:
# Loop through the files, extract text, and generate Q&A
# NOTE(review): the result lists are reassigned on every iteration, so only the last
# file's results remain after the loop.
for file_path in pdf_files:
    # Extract privacy policy text
    privacy_policy_text = extract_text_from_pdf(file_path)

    # Policy-to-GDPR direction: 7 generate_qa_gpt calls (4 Easy, 2 Medium, 1 Hard).
    # Each call returns one response string; the prompt does not fix how many Q&A pairs
    # that string contains.
    # NOTE(review): the full gdpr_text is embedded in every request, which may exceed the
    # model's context window and consumes quota quickly — TODO confirm.
    policy_based_input = (
        f"Based on the following GDPR guidelines:\n\n{gdpr_text}\n\n"
        f"Generate questions examining how the privacy policy aligns with specific GDPR rules. "
        f"Focus on specific GDPR compliance requirements like data subject rights, consent, and data processing.\n\n"
        f"{privacy_policy_text}"
    )
    easy_policy_qa = [generate_qa_gpt(policy_based_input, difficulty="Easy") for _ in range(4)]
    medium_policy_qa = [generate_qa_gpt(policy_based_input, difficulty="Medium") for _ in range(2)]
    hard_policy_qa = [generate_qa_gpt(policy_based_input, difficulty="Hard") for _ in range(1)]

    # GDPR-to-policy direction: 5 generate_qa_gpt calls (2 Easy, 2 Medium, 1 Hard),
    # using only the policy text as context.
    gdpr_based_input = (
        f"Based on the following privacy policy:\n\n{privacy_policy_text}\n\n"
        f"Generate questions about GDPR compliance and answer them using information from the privacy policy. "
        f"Focus on how the policy addresses specific GDPR requirements and whether it meets compliance standards."
    )
    easy_gdpr_qa = [generate_qa_gpt(gdpr_based_input, difficulty="Easy") for _ in range(2)]
    medium_gdpr_qa = [generate_qa_gpt(gdpr_based_input, difficulty="Medium") for _ in range(2)]
    hard_gdpr_qa = [generate_qa_gpt(gdpr_based_input, difficulty="Hard") for _ in range(1)]


RateLimitError: You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.

In [None]:
# Diagnostic cell: print the ids returned by openai.Model.list(). Any exception is caught
# and printed rather than raised, so this cell never stops a full notebook run.
try:
    # List available models
    models = openai.Model.list()
    # The comprehension variable `model` is scoped to the comprehension and does not
    # rebind the notebook-level `model` object.
    available_models = [model.id for model in models['data']]
    print("Available models:", available_models)
except Exception as e:
    print("Error retrieving models:", e)

Available models: ['tts-1', 'tts-1-1106', 'dall-e-2', 'whisper-1', 'gpt-3.5-turbo-instruct', 'gpt-3.5-turbo', 'gpt-3.5-turbo-0125', 'babbage-002', 'davinci-002', 'gpt-4o-mini-2024-07-18', 'dall-e-3', 'gpt-4o-mini', 'tts-1-hd', 'tts-1-hd-1106', 'text-embedding-ada-002', 'gpt-3.5-turbo-16k', 'text-embedding-3-small', 'text-embedding-3-large', 'gpt-3.5-turbo-1106', 'gpt-3.5-turbo-instruct-0914']


In [None]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch

# Initialize Pegasus
# The checkpoint id gets its own name so that loading Pegasus does not overwrite
# `model_name`, which an earlier cell set to the OpenAI chat model name.
# NOTE(review): `tokenizer` and `model` are still rebound here, so generate_qa (which reads
# those notebook-level names) will use the Pegasus objects if it is called after this cell.
pegasus_model_name = "google/pegasus-billsum"  # You can use other Pegasus variants like "google/pegasus-large"
tokenizer = PegasusTokenizer.from_pretrained(pegasus_model_name)
model = PegasusForConditionalGeneration.from_pretrained(pegasus_model_name)


tokenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-billsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [None]:
# Function to extract text from PDF
# Re-definition of the helper already defined earlier in this notebook; it rebinds the
# same name when this cell runs.
def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page in a PDF file.

    Args:
        file_path: Path of the PDF to read; the file is opened in binary mode.

    Returns:
        str: The text of all pages joined in page order with no separator
        between pages. A page whose ``extract_text()`` result is empty or
        ``None`` contributes an empty string.
    """
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Pages without a text layer can come back empty (and, depending on the
        # PyPDF2 version, possibly as None); `or ""` keeps the join from raising
        # TypeError on such pages.
        return "".join(page.extract_text() or "" for page in reader.pages)


In [None]:
# Improved function to generate Q&A using Pegasus
def generate_qa_pegasus(input_text, prompt_type="policy"):
    """Generate Q&A-style text for a privacy policy with the notebook-level model.

    Args:
        input_text: Privacy policy text appended to the instruction prompt.
        prompt_type: "policy" selects the policy-content prompt; any other
            value selects the GDPR-alignment prompt.

    Returns:
        str: Text decoded from the first generated sequence. If it contains
        lines starting with a ``Q<n>:`` / ``A<n>:`` label, the result has one
        labelled entry per line, with unlabelled follow-on lines folded into
        the preceding entry; blank lines and any text before the first label
        are dropped. If no labelled line is found, the stripped raw output is
        returned unchanged.
    """
    import re  # local import: `re` is not imported in the visible notebook cells

    # More explicit and detailed prompt to guide Q&A generation
    if prompt_type == "policy":
        prompt = (
            f"From this privacy policy, generate 4 clear Q&A pairs about its content, "
            f"focusing on compliance aspects such as user consent, data rights, retention, and security. "
            f"Label each question as 'Q1:', 'Q2:', etc., and each answer as 'A1:', 'A2:', etc. Each question should be brief and specific, followed by a detailed answer.\n\n"
            f"Example:\n"
            f"Q1: What personal data does the policy cover?\n"
            f"A1: The policy covers data such as names, addresses, and transaction details.\n"
            f"Q2: Why is user consent important?\n"
            f"A2: User consent ensures compliance with data protection regulations.\n\n"
            f"Input:\n{input_text}"
        )
    else:
        prompt = (
            f"Based on the provided privacy policy, generate 4 Q&A pairs focused on how it aligns with GDPR compliance. "
            f"Make sure each answer provides specific information from the policy and avoids assumptions. Label questions as 'Q1:', 'Q2:', etc., and answers as 'A1:', 'A2:', etc.\n\n"
            f"Example:\n"
            f"Q1: How does the policy address data subject rights?\n"
            f"A1: The policy includes provisions for users to access, correct, or delete their data upon request.\n\n"
            f"{input_text}"
        )

    # Tokenize input and generate Q&A using Pegasus
    inputs = tokenizer(prompt, truncation=True, padding="longest", return_tensors="pt")
    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=400,  # Adjust max length to allow for more detailed answers
        num_beams=5,
        early_stopping=True
    )

    output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Ensure structured Q&A display.
    # A line counts as a new entry only when it *starts* with a Q/A label such as
    # "Q1:" or "A2:". A plain substring test for "Q"/"A" would also match ordinary
    # sentences containing those capitals and break continuation handling.
    label_pattern = re.compile(r"[QA]\d*\s*:")
    entries = []
    for raw_line in output.strip().split("\n"):
        line = raw_line.strip()
        if not line:
            # Blank lines carry no content; skipping them avoids trailing spaces.
            continue
        if label_pattern.match(line):
            entries.append(line)
        elif entries:
            # Unlabelled text continues the most recent question or answer.
            entries[-1] += f" {line}"

    # If parsing fails, return the original output
    if not entries:
        return output.strip()

    return "\n".join(entries)

In [None]:
# Sample file paths
# Only bitpay.pdf is active. Every entry ends with a comma so that re-enabling a
# commented-out path adds a new list element instead of being silently concatenated
# onto the previous string literal.
pdf_files = [
    "/content/bitpay.pdf",
    # "/content/bilt.pdf",
    # "/content/plaid.pdf",
    # "/content/moonpay.pdf",
    # "/content/gemini.pdf",
    # "/content/circle.pdf",
    # "/content/remitly.pdf",
    # "/content/stripe.pdf",
    # "/content/klarna.pdf",
    # "/content/revolut.pdf",
]

# Extract GDPR text from the uploaded GDPR PDF
# NOTE(review): this repeats an extraction already done in an earlier cell, and gdpr_text
# is not read by the Pegasus cells visible below — possibly leftover; confirm before removing.
gdpr_text = extract_text_from_pdf("/content/GDPR.pdf")


In [None]:

# Loop through the files, extract text, and generate Q&A
# Results are printed inside the loop, so every file in pdf_files gets its own report.
for file_path in pdf_files:
    # Extract privacy policy text
    privacy_policy_text = extract_text_from_pdf(file_path)

    # Run both prompt variants on the same policy text: "policy" asks about the policy's
    # content; "gdpr" (any value other than "policy") selects the GDPR-alignment prompt.
    policy_qa = generate_qa_pegasus(privacy_policy_text, prompt_type="policy")
    gdpr_qa = generate_qa_pegasus(privacy_policy_text, prompt_type="gdpr")

    # Print results
    print(f"Q&A for {file_path} (Policy-based):\n{policy_qa}\n")
    print(f"Q&A for {file_path} (GDPR-specific application):\n{gdpr_qa}\n")
    print("="*50)


Q&A for /content/bitpay.pdf (Policy-based):
This privacy notice applies to all personal data on the website www.bitpay.com and all other BitPay sites on which it appears.

Q&A for /content/bitpay.pdf (GDPR-specific application):
The European Union’s General Data Protection Regulation (GDPR) comes into force on 25 May, 2018.



In [None]:

# Testing on the same policy PDF
# NOTE(review): this cell depends on `privacy_policy_text` and `file_path` left over from
# the last iteration of the preceding loop; on a fresh kernel it only works if that loop
# cell ran first.
policy_qa = generate_qa_pegasus(privacy_policy_text, prompt_type="policy")
gdpr_qa = generate_qa_pegasus(privacy_policy_text, prompt_type="gdpr")

print(f"Q&A for {file_path} (Policy-based):\n{policy_qa}\n")
print(f"Q&A for {file_path} (GDPR-specific application):\n{gdpr_qa}\n")

Q&A for /content/bitpay.pdf (Policy-based):
This privacy notice applies to all personal data on the website www.bitpay.com and all other BitPay sites on which it appears.

Q&A for /content/bitpay.pdf (GDPR-specific application):
The European Union’s General Data Protection Regulation (GDPR) comes into force on 25 May, 2018.



In [None]:

# Testing with a single privacy policy PDF
# Self-contained variant of the loop above: it names its own input file and re-extracts
# the text, so it does not rely on variables left behind by the loop.
pdf_file = "/content/bitpay.pdf"
privacy_policy_text = extract_text_from_pdf(pdf_file)

# Generate refined Q&A based on the policy to GDPR compliance
# (both prompt variants of generate_qa_pegasus are run on the same policy text)
policy_qa = generate_qa_pegasus(privacy_policy_text, prompt_type="policy")
gdpr_qa = generate_qa_pegasus(privacy_policy_text, prompt_type="gdpr")

# Display results
print(f"Q&A for {pdf_file} (Policy-based):\n{policy_qa}\n")
print(f"Q&A for {pdf_file} (GDPR-specific application):\n{gdpr_qa}\n")
print("="*50)
