In [17]:
from langchain.document_loaders import PyPDFLoader

pdf_path = "google_terms_of_service_en_in.pdf"
output_text_file = "extracted_text.txt"

loader = PyPDFLoader(pdf_path)
pages = loader.load()

extracted_text = ""
for page in pages:
    extracted_text += page.page_content + "\n"


with open(output_text_file, "w", encoding="utf-8") as text_file:
    text_file.write(extracted_text)

print(f"Text extracted and saved to {output_text_file}")

Text extracted and saved to extracted_text.txt


In [18]:
with open("extracted_text.txt", "r") as file:
    document_text = file.read()

print(document_text[:500])  # preview the first 500 characters

GOOGLE TERMS OF SERVICE
Effective May 22, 2024 | Archived versions
What’s covered in these terms
We know it’s tempting to skip these Terms of
Service, but it’s important to establish what you
can expect from us as you use Google services,
and what we expect from you.
These Terms of Service re ect the way Google’s business works, the laws that apply to
our company, and certain things we’ve always believed to be true. As a result, these Terms
of Service help de ne Google’s relationship with you as


In [19]:
import os
from langchain_google_genai import GoogleGenerativeAI
from dotenv import load_dotenv
load_dotenv()

True

In [20]:
os.environ['GOOGLE_API_KEY']=os.getenv('GOOGLE_API_KEY')
os.environ['HF_TOKEN']=os.getenv('HF_TOKEN')
HF_TOKEN = os.getenv('HF_TOKEN')

In [21]:
from transformers import pipeline

summarizer = pipeline("summarization", model="t5-small")

summary = summarizer(document_text[:1000], max_length=200, min_length=30, do_sample=False)
print("Summary:", summary[0]['summary_text'])

Device set to use mps:0
Both `max_new_tokens` (=256) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Summary: these Terms of Service reect the way Google’s business works, the laws that apply to our company, and certain things we’ve always believed to be true . these terms include: what you can expect from us, which describes how we provide and develop our services What we expect from you, which establishes certain rules for using our services Content in Google services .


In [22]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

# spliting text into sentences
sentences = sent_tokenize(document_text)

# combining sentences into passages
passages = []
current_passage = ""
for sentence in sentences:
    if len(current_passage.split()) + len(sentence.split()) < 200:  # adjust the word limit as needed
        current_passage += " " + sentence
    else:
        passages.append(current_passage.strip())
        current_passage = sentence
if current_passage:
    passages.append(current_passage.strip())

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/lokeshdash/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [23]:
import ollama

def generate_questions(passage, min_questions=3):
    prompt = f"""
Generate at least {min_questions} questions based on the following passage:

\"\"\"{passage}\"\"\"

List each question on a new line.
"""
    response = ollama.generate(model='llama3.2', prompt=prompt)
    output = response['response']
    
    # Splitting the output by lines
    lines = output.split('\n')
    questions = []
    for line in lines:
        if line.strip():  # skip empty lines
            clean_line = line.strip().lstrip('-- ') 
            questions.append(clean_line)

    
    return questions[:min_questions]

# Generating and printing questions
for idx, passage in enumerate(passages):
    questions = generate_questions(passage)
    print(f"Passage {idx+1}:\n{passage}\n")
    print("Generated Questions:")
    for q in questions:
        print(f"- {q}")
    print("\n" + "-"*50 + "\n")


Passage 1:
GOOGLE TERMS OF SERVICE
Effective May 22, 2024 | Archived versions
What’s covered in these terms
We know it’s tempting to skip these Terms of
Service, but it’s important to establish what you
can expect from us as you use Google services,
and what we expect from you. These Terms of Service re ect the way Google’s business works, the laws that apply to
our company, and certain things we’ve always believed to be true. As a result, these Terms
of Service help de ne Google’s relationship with you as you interact with our services. For
example, these terms include the following topic headings:
What you can expect from us, which describes how we provide and develop our
services
What we expect from you, which establishes certain rules for using our services
Content in Google services, which describes the intellectual property rights to the
content you  nd in our services — whether that content belongs to you, Google, or
others
In case of problems or disagreements, which describes o

In [16]:
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

# function to track and answer only unique questions
def answer_unique_questions(passages, qa_pipeline):
    answered_questions = set()  # to store unique questions

    for idx, passage in enumerate(passages):
        questions = generate_questions(passage)

        for question in questions:
            if question not in answered_questions:  # check if the question has already been answered
                answer = qa_pipeline({'question': question, 'context': passage})
                print(f"Q: {question}")
                print(f"A: {answer['answer']}\n")
                answered_questions.add(question)  # add the question to the set to avoid repetition
        print(f"{'='*50}\n")
              
answer_unique_questions(passages, qa_pipeline)

Device set to use mps:0


Q: Here are three questions based on the passage:
A: certain things we’ve always believed to be true

Q: 1. What is the main purpose of the Google Terms of Service document?
A: to establish what you
can expect from us as you use Google services

Q: 2. How do these Terms of Service affect the relationship between Google and its users?
A: help de ne


Q: 1. Who is responsible for managing a child's activity on Google services if they are under the age required to manage their own account?
A: a parent or legal guardian

Q: 2. What is the name of the organization that provides Google services and with which you're contracting, according to these terms?
A: Google LLC


Q: 1. Who is referred to as "we" in the context of Google LLC and its affiliates?
A: us

Q: 2. What types of services does Google provide, according to the passage?
A: apps and sites (like Search and Maps)


Q: Here are 3 questions based on the passage:
A: downloadable or preloaded software

Q: 1. What types of services may r