<a href="https://colab.research.google.com/github/classic-21/Context-Driven-Question-Generation-from-PDFs/blob/main/Analytica.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the PDF and model libraries (PyPDF2, transformers, pdfplumber, torch)
# into the runtime with pip.
# NOTE(review): PyPDF2 is not imported by any cell visible here, while sklearn
# and numpy are imported below without being installed here — presumably the
# hosted runtime already ships them; confirm, and consider pinning versions.
!pip install PyPDF2 transformers pdfplumber torch

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfplumber-0.11.4-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.2/59.2 kB[0m [31m1.

In [None]:
import pdfplumber
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import textwrap
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re

In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Page texts are concatenated in page order with no separator, so the last
    line of one page runs directly into the first line of the next page.

    Args:
        pdf_path: Location of the PDF; passed straight to ``pdfplumber.open``.

    Returns:
        str: The concatenated page text, ``""`` when no page yields any text.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # A page without extractable text (e.g. a scanned image) may come back
        # as None instead of "" depending on the pdfplumber release; `or ""`
        # keeps such a page from raising TypeError during concatenation.
        return "".join(page.extract_text() or "" for page in pdf.pages)

def split_text_into_paragraphs(text, running_header="Karma Yoga"):
    """Split extracted PDF text into one chunk per printed page.

    The source PDF is a browser print-out, so every page ends with a footer of
    the form ``file:///<url> (<n> of <total>)<m/d/yyyy> <h:mm:ss> AM|PM`` and
    the following page starts with a running header line. The footer (with
    the header that follows it, if any) is used as the chunk boundary and is
    removed from the result; all other text is kept untouched.

    Args:
        text: Raw text, e.g. as returned by ``extract_text_from_pdf``.
        running_header: Header line repeated at the top of each page. It is
            dropped only when it directly follows a footer and fills the rest
            of its line. Pass ``None`` or ``""`` to keep headers.

    Returns:
        list[str]: Whitespace-stripped, non-empty chunks in document order.
    """
    if not text:
        return []

    footer = (
        r'file:///\S+ '                # URL of the printed HTML page
        r'\(\d+ of \d+\)'              # page counter, e.g. "(12 of 88)"
        r'\d{1,2}/\d{1,2}/\d{4} '      # print date
        r'\d{1,2}:\d{2}:\d{2} [AP]M'   # print time
    )
    header = ''
    if running_header:
        # Only treat it as a header when nothing else follows on that line,
        # so body text that merely starts with the same words is preserved.
        header = r'(?:' + re.escape(running_header) + r'[ \t]*(?=\r?\n|\Z))?'
    boundary = re.compile(footer + r'\s*' + header)

    pieces = (piece.strip() for piece in boundary.split(text))
    return [piece for piece in pieces if piece]

def retrieve_documents(user_input, documents, top_n=10):
    """Return the documents most similar to ``user_input`` by TF-IDF cosine score.

    Args:
        user_input: Query string typed by the user.
        documents: Sequence of document strings to rank.
        top_n: Maximum number of documents to return. ``None`` returns every
            document, ranked. Zero or a negative value returns ``[]``.

    Returns:
        list[str]: Up to ``top_n`` documents, highest similarity first.
        Documents with a zero score are still eligible, so the result can
        contain unrelated text when nothing matches the query.
    """
    documents = list(documents)
    if not documents or (top_n is not None and top_n <= 0):
        # Nothing to rank. Without this guard an empty corpus reaches
        # cosine_similarity as a matrix with zero rows, which it rejects.
        return []

    vectorizer = TfidfVectorizer()
    # The query is vectorised together with the corpus so that both share one
    # vocabulary; it is appended last, hence it is the final matrix row.
    tfidf_matrix = vectorizer.fit_transform(documents + [user_input])
    query_vector = tfidf_matrix[-1]
    document_vectors = tfidf_matrix[:-1]

    scores = cosine_similarity(query_vector, document_vectors)[0]
    # argsort is ascending, so reverse it before taking the best top_n.
    ranked = np.argsort(scores)[::-1][:top_n]
    return [documents[i] for i in ranked]

def generate_refined_prompt(relevant_documents, user_input):
    """Merge the retrieved passages into a single model input string.

    Args:
        relevant_documents: Iterable of passage strings, in output order.
        user_input: The user's topic. Accepted for interface compatibility
            but not used by the current implementation.

    Returns:
        str: The passages joined with single spaces; no instruction text is
        added around them.
    """
    separator = " "
    return separator.join(relevant_documents)


In [None]:
def split_text_into_chunks(text, max_chunk_size=512):
    """Wrap ``text`` into pieces of at most ``max_chunk_size`` characters.

    The limit counts characters, not model tokens. Wrapping is done by the
    standard ``textwrap`` module with its default settings.

    Args:
        text: String to split.
        max_chunk_size: Maximum length of each piece, in characters.

    Returns:
        list[str]: The wrapped pieces in order; ``[]`` for empty text.
    """
    wrapper = textwrap.TextWrapper(width=max_chunk_size)
    return wrapper.wrap(text)

def load_model(language_code):
    """Build a text2text-generation pipeline for the requested language.

    Args:
        language_code: ``"en"``, ``"hi"`` or ``"sa"``.

    Returns:
        The ``transformers`` pipeline built from the checkpoint mapped to
        ``language_code``.

    Raises:
        ValueError: If ``language_code`` is not one of the supported codes.
    """
    # Checkpoint per language code.
    # NOTE(review): the "sa" entry is labelled a placeholder elsewhere in this
    # notebook — confirm the "hi" and "sa" checkpoints exist before relying on them.
    checkpoints = (
        ("en", "valhalla/t5-small-e2e-qg"),
        ("hi", "ai4bharat/indic-t5-v2-qg"),
        ("sa", "sanskrit-ai/sanskrit-qa"),
    )
    model_name = next(
        (checkpoint for code, checkpoint in checkpoints if language_code == code),
        None,
    )
    if model_name is None:
        raise ValueError("Unsupported language code!")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return pipeline("text2text-generation", model=seq2seq_model, tokenizer=tokenizer)


def generate_questions(text, language_code="en"):
    """Generate questions for ``text`` with the model for ``language_code``.

    The text is cut into chunks by ``split_text_into_chunks`` and each chunk
    is sent to the pipeline separately, prefixed with ``"generate questions: "``.

    Args:
        text: Source passage(s) to generate questions from.
        language_code: Language code understood by ``load_model``.

    Returns:
        list: The pipeline's outputs for all chunks, flattened into a single
        list in chunk order.
    """
    generator = load_model(language_code)
    return [
        result
        for piece in split_text_into_chunks(text)
        for result in generator(f"generate questions: {piece}")
    ]

def process_pdf_for_questions(pdf_path,user_input, language_code="en", verbose=False):
    """Run the whole pipeline: PDF -> relevant passages -> generated questions.

    Args:
        pdf_path: Location of the PDF to read.
        user_input: Topic used to select the most relevant passages.
        language_code: Language code understood by ``load_model``.
        verbose: When true, print the full list of extracted passages and the
            retrieved passages. Off by default because the first of these
            dumps the entire document to the output.

    Returns:
        list: The outputs of ``generate_questions``; ``[]`` when the PDF
        yields no text, in which case no model is loaded.
    """
    raw_text = extract_text_from_pdf(pdf_path)
    paragraphs = split_text_into_paragraphs(raw_text)
    if verbose:
        print(paragraphs)
    if not paragraphs:
        # Nothing to rank or to ask about; stop before the retrieval step,
        # which cannot work on an empty corpus.
        return []

    relevant_documents = retrieve_documents(user_input, paragraphs)
    if verbose:
        print(relevant_documents)

    refined_prompt = generate_refined_prompt(relevant_documents, user_input)
    return generate_questions(refined_prompt, language_code)


# Ask for a topic, run the pipeline on the bundled PDF and list the questions.
user_input = input("enter the topic: ").strip()  # drop accidental surrounding spaces
pdf_path = "/content/KarmaYoga.pdf"
language_code = "en"
questions = process_pdf_for_questions(pdf_path, user_input, language_code)

if not questions:
    print("No questions were generated.")
for idx, q in enumerate(questions, start=1):
    print(f"Question {idx}: {q['generated_text']}")

enter the topic: all the knowledge humans have has come from their minds
<class 'str'>
["Karma Yoga\nKarma Yoga\nA book by Swami Vivekananda\nBased on lectures the Swami delivered in his rented rooms at 228 W\n39th Street in December, 1895 and January, 1896. The classes were\nfree of charge. Generally the Swami held two classes daily- morning\nand evening.\nAlthough the Swami delivered many lectures and held numerous classes\nin the two years and five months he had been in America, these lectures\nconstituted a departure in the way they were recorded. Just prior to the\ncommencement of his Winter -95-96 season in NYC, his friends and\nsupporters aided him by advertising for and ultimately hiring a\nprofessional stenographer: The man selected, Joseph Josiah Goodwin,\nlater became a disciple of the Swami and followed him to England andGoodwin's transcriptions of the Swami's lectures form the basis of five\nbooks.\nCHAPTER 1\nKarma in its effect on\ncharacter\nfile:///C|/Documents%20and%2

In [None]:
# Step 1: Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Page texts are concatenated in page order with no separator, so the last
    line of one page runs directly into the first line of the next page.

    Args:
        pdf_path: Location of the PDF; passed straight to ``pdfplumber.open``.

    Returns:
        str: The concatenated page text, ``""`` when no page yields any text.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # A page without extractable text (e.g. a scanned image) may come back
        # as None instead of "" depending on the pdfplumber release; `or ""`
        # keeps such a page from raising TypeError during concatenation.
        return "".join(page.extract_text() or "" for page in pdf.pages)

def split_text_into_paragraphs(text, running_header="Karma Yoga"):
    """Split extracted PDF text into one chunk per printed page.

    The source PDF is a browser print-out, so every page ends with a footer of
    the form ``file:///<url> (<n> of <total>)<m/d/yyyy> <h:mm:ss> AM|PM`` and
    the following page starts with a running header line. The footer (with
    the header that follows it, if any) is used as the chunk boundary and is
    removed from the result; all other text is kept untouched.

    Args:
        text: Raw text, e.g. as returned by ``extract_text_from_pdf``.
        running_header: Header line repeated at the top of each page. It is
            dropped only when it directly follows a footer and fills the rest
            of its line. Pass ``None`` or ``""`` to keep headers.

    Returns:
        list[str]: Whitespace-stripped, non-empty chunks in document order.
    """
    if not text:
        return []

    footer = (
        r'file:///\S+ '                # URL of the printed HTML page
        r'\(\d+ of \d+\)'              # page counter, e.g. "(12 of 88)"
        r'\d{1,2}/\d{1,2}/\d{4} '      # print date
        r'\d{1,2}:\d{2}:\d{2} [AP]M'   # print time
    )
    header = ''
    if running_header:
        # Only treat it as a header when nothing else follows on that line,
        # so body text that merely starts with the same words is preserved.
        header = r'(?:' + re.escape(running_header) + r'[ \t]*(?=\r?\n|\Z))?'
    boundary = re.compile(footer + r'\s*' + header)

    pieces = (piece.strip() for piece in boundary.split(text))
    return [piece for piece in pieces if piece]

# Function to retrieve relevant documents
def retrieve_documents(user_input, documents, top_n=10):
    """Return the documents most similar to ``user_input`` by TF-IDF cosine score.

    Args:
        user_input: Query string typed by the user.
        documents: Sequence of document strings to rank.
        top_n: Maximum number of documents to return. ``None`` returns every
            document, ranked. Zero or a negative value returns ``[]``.

    Returns:
        list[str]: Up to ``top_n`` documents, highest similarity first.
        Documents with a zero score are still eligible, so the result can
        contain unrelated text when nothing matches the query.
    """
    documents = list(documents)
    if not documents or (top_n is not None and top_n <= 0):
        # Nothing to rank. Without this guard an empty corpus reaches
        # cosine_similarity as a matrix with zero rows, which it rejects.
        return []

    vectorizer = TfidfVectorizer()
    # The query is vectorised together with the corpus so that both share one
    # vocabulary; it is appended last, hence it is the final matrix row.
    tfidf_matrix = vectorizer.fit_transform(documents + [user_input])
    query_vector = tfidf_matrix[-1]
    document_vectors = tfidf_matrix[:-1]

    # Compute cosine similarity between user input and documents
    scores = cosine_similarity(query_vector, document_vectors)[0]
    # argsort is ascending, so reverse it before taking the best top_n.
    ranked = np.argsort(scores)[::-1][:top_n]
    return [documents[i] for i in ranked]

# Generate a refined prompt using retrieved documents
def generate_refined_prompt(relevant_documents, user_input):
    """Merge the retrieved passages into a single model input string.

    Args:
        relevant_documents: Iterable of passage strings, in output order.
        user_input: The user's topic. Accepted for interface compatibility
            but not used by the current implementation.

    Returns:
        str: The passages joined with single spaces; no instruction text is
        added around them.
    """
    separator = " "
    return separator.join(relevant_documents)


In [None]:
# Step 2: Split the text into smaller chunks
def split_text_into_chunks(text, max_chunk_size=512):
    """Wrap ``text`` into pieces of at most ``max_chunk_size`` characters.

    The limit counts characters, not model tokens. Wrapping is done by the
    standard ``textwrap`` module with its default settings.

    Args:
        text: String to split.
        max_chunk_size: Maximum length of each piece, in characters.

    Returns:
        list[str]: The wrapped pieces in order; ``[]`` for empty text.
    """
    wrapper = textwrap.TextWrapper(width=max_chunk_size)
    return wrapper.wrap(text)

# Step 3: Load models for question generation
def load_model(language_code):
    """Build a text2text-generation pipeline for the requested language.

    Args:
        language_code: ``"en"`` (English), ``"hi"`` (Hindi) or ``"sa"``
            (Sanskrit).

    Returns:
        The ``transformers`` pipeline built from the checkpoint mapped to
        ``language_code``.

    Raises:
        ValueError: If ``language_code`` is not one of the supported codes.
    """
    # Checkpoint per language code. The Sanskrit entry is a placeholder, to be
    # used only if such a model is available.
    checkpoints = (
        ("en", "valhalla/t5-small-e2e-qg"),
        ("hi", "ai4bharat/indic-t5-v2-qg"),
        ("sa", "sanskrit-ai/sanskrit-qa"),
    )
    model_name = next(
        (checkpoint for code, checkpoint in checkpoints if language_code == code),
        None,
    )
    if model_name is None:
        raise ValueError("Unsupported language code!")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return pipeline("text2text-generation", model=seq2seq_model, tokenizer=tokenizer)

# Step 4: Generate questions based on extracted text
def generate_questions(text, language_code="en"):
    """Generate questions for ``text`` with the model for ``language_code``.

    The text is cut into chunks by ``split_text_into_chunks`` and each chunk
    is sent to the pipeline separately, prefixed with ``"generate questions: "``.

    Args:
        text: Source passage(s) to generate questions from.
        language_code: Language code understood by ``load_model``.

    Returns:
        list: The pipeline's outputs for all chunks, flattened into a single
        list in chunk order.
    """
    generator = load_model(language_code)
    return [
        result
        for piece in split_text_into_chunks(text)
        for result in generator(f"generate questions: {piece}")
    ]

# Step 5: Main function for processing PDF and generating questions
def process_pdf_for_questions(pdf_path,user_input, language_code="en", verbose=False):
    """Run the whole pipeline: PDF -> relevant passages -> generated questions.

    Args:
        pdf_path: Location of the PDF to read.
        user_input: Topic used to select the most relevant passages.
        language_code: Language code understood by ``load_model``.
        verbose: When true, print the full list of extracted passages and the
            retrieved passages. Off by default because the first of these
            dumps the entire document to the output.

    Returns:
        list: The outputs of ``generate_questions``; ``[]`` when the PDF
        yields no text, in which case no model is loaded.
    """
    raw_text = extract_text_from_pdf(pdf_path)
    paragraphs = split_text_into_paragraphs(raw_text)
    if verbose:
        print(paragraphs)
    if not paragraphs:
        # Nothing to rank or to ask about; stop before the retrieval step,
        # which cannot work on an empty corpus.
        return []

    # Retrieve relevant documents
    relevant_documents = retrieve_documents(user_input, paragraphs)
    if verbose:
        print(relevant_documents)

    # Generate the refined prompt
    refined_prompt = generate_refined_prompt(relevant_documents, user_input)
    return generate_questions(refined_prompt, language_code)

# Example usage: ask for a topic, run the pipeline and list the questions.
user_input = input("enter the topic: ").strip()  # drop accidental surrounding spaces
pdf_path = "/content/KarmaYoga.pdf"  # Replace with your PDF file path
language_code = "en"  # Change to 'hi' for Hindi, 'sa' for Sanskrit
questions = process_pdf_for_questions(pdf_path, user_input, language_code)

# Output questions
if not questions:
    print("No questions were generated.")
for idx, q in enumerate(questions, start=1):
    print(f"Question {idx}: {q['generated_text']}")

enter the topic: all the knowledge humans have 
<class 'str'>
["Karma Yoga\nKarma Yoga\nA book by Swami Vivekananda\nBased on lectures the Swami delivered in his rented rooms at 228 W\n39th Street in December, 1895 and January, 1896. The classes were\nfree of charge. Generally the Swami held two classes daily- morning\nand evening.\nAlthough the Swami delivered many lectures and held numerous classes\nin the two years and five months he had been in America, these lectures\nconstituted a departure in the way they were recorded. Just prior to the\ncommencement of his Winter -95-96 season in NYC, his friends and\nsupporters aided him by advertising for and ultimately hiring a\nprofessional stenographer: The man selected, Joseph Josiah Goodwin,\nlater became a disciple of the Swami and followed him to England andGoodwin's transcriptions of the Swami's lectures form the basis of five\nbooks.\nCHAPTER 1\nKarma in its effect on\ncharacter\nfile:///C|/Documents%20and%20Settings/Chitra%20Selv..

In [None]:
# Step 1: Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Page texts are concatenated in page order with no separator, so the last
    line of one page runs directly into the first line of the next page.

    Args:
        pdf_path: Location of the PDF; passed straight to ``pdfplumber.open``.

    Returns:
        str: The concatenated page text, ``""`` when no page yields any text.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # A page without extractable text (e.g. a scanned image) may come back
        # as None instead of "" depending on the pdfplumber release; `or ""`
        # keeps such a page from raising TypeError during concatenation.
        return "".join(page.extract_text() or "" for page in pdf.pages)

def split_text_into_paragraphs(text, running_header="Karma Yoga"):
    """Split extracted PDF text into one chunk per printed page.

    The source PDF is a browser print-out, so every page ends with a footer of
    the form ``file:///<url> (<n> of <total>)<m/d/yyyy> <h:mm:ss> AM|PM`` and
    the following page starts with a running header line. The footer (with
    the header that follows it, if any) is used as the chunk boundary and is
    removed from the result; all other text is kept untouched.

    Args:
        text: Raw text, e.g. as returned by ``extract_text_from_pdf``.
        running_header: Header line repeated at the top of each page. It is
            dropped only when it directly follows a footer and fills the rest
            of its line. Pass ``None`` or ``""`` to keep headers.

    Returns:
        list[str]: Whitespace-stripped, non-empty chunks in document order.
    """
    if not text:
        return []

    footer = (
        r'file:///\S+ '                # URL of the printed HTML page
        r'\(\d+ of \d+\)'              # page counter, e.g. "(12 of 88)"
        r'\d{1,2}/\d{1,2}/\d{4} '      # print date
        r'\d{1,2}:\d{2}:\d{2} [AP]M'   # print time
    )
    header = ''
    if running_header:
        # Only treat it as a header when nothing else follows on that line,
        # so body text that merely starts with the same words is preserved.
        header = r'(?:' + re.escape(running_header) + r'[ \t]*(?=\r?\n|\Z))?'
    boundary = re.compile(footer + r'\s*' + header)

    pieces = (piece.strip() for piece in boundary.split(text))
    return [piece for piece in pieces if piece]

# Function to retrieve relevant documents
def retrieve_documents(user_input, documents, top_n=10):
    """Return the documents most similar to ``user_input`` by TF-IDF cosine score.

    Args:
        user_input: Query string typed by the user.
        documents: Sequence of document strings to rank.
        top_n: Maximum number of documents to return. ``None`` returns every
            document, ranked. Zero or a negative value returns ``[]``.

    Returns:
        list[str]: Up to ``top_n`` documents, highest similarity first.
        Documents with a zero score are still eligible, so the result can
        contain unrelated text when nothing matches the query.
    """
    documents = list(documents)
    if not documents or (top_n is not None and top_n <= 0):
        # Nothing to rank. Without this guard an empty corpus reaches
        # cosine_similarity as a matrix with zero rows, which it rejects.
        return []

    vectorizer = TfidfVectorizer()
    # The query is vectorised together with the corpus so that both share one
    # vocabulary; it is appended last, hence it is the final matrix row.
    tfidf_matrix = vectorizer.fit_transform(documents + [user_input])
    query_vector = tfidf_matrix[-1]
    document_vectors = tfidf_matrix[:-1]

    # Compute cosine similarity between user input and documents
    scores = cosine_similarity(query_vector, document_vectors)[0]
    # argsort is ascending, so reverse it before taking the best top_n.
    ranked = np.argsort(scores)[::-1][:top_n]
    return [documents[i] for i in ranked]

# Generate a refined prompt using retrieved documents
def generate_refined_prompt(relevant_documents, user_input):
    """Merge the retrieved passages into a single model input string.

    Args:
        relevant_documents: Iterable of passage strings, in output order.
        user_input: The user's topic. Accepted for interface compatibility
            but not used by the current implementation.

    Returns:
        str: The passages joined with single spaces; no instruction text is
        added around them.
    """
    separator = " "
    return separator.join(relevant_documents)


In [None]:
# Step 2: Split the text into smaller chunks
def split_text_into_chunks(text, max_chunk_size=512):
    """Wrap ``text`` into pieces of at most ``max_chunk_size`` characters.

    The limit counts characters, not model tokens. Wrapping is done by the
    standard ``textwrap`` module with its default settings.

    Args:
        text: String to split.
        max_chunk_size: Maximum length of each piece, in characters.

    Returns:
        list[str]: The wrapped pieces in order; ``[]`` for empty text.
    """
    wrapper = textwrap.TextWrapper(width=max_chunk_size)
    return wrapper.wrap(text)

# Step 3: Load models for question generation
def load_model(language_code):
    """Build a text2text-generation pipeline for the requested language.

    Args:
        language_code: ``"en"`` (English), ``"hi"`` (Hindi) or ``"sa"``
            (Sanskrit).

    Returns:
        The ``transformers`` pipeline built from the checkpoint mapped to
        ``language_code``.

    Raises:
        ValueError: If ``language_code`` is not one of the supported codes.
    """
    # Checkpoint per language code. The Sanskrit entry is a placeholder, to be
    # used only if such a model is available.
    checkpoints = (
        ("en", "valhalla/t5-small-e2e-qg"),
        ("hi", "ai4bharat/indic-t5-v2-qg"),
        ("sa", "sanskrit-ai/sanskrit-qa"),
    )
    model_name = next(
        (checkpoint for code, checkpoint in checkpoints if language_code == code),
        None,
    )
    if model_name is None:
        raise ValueError("Unsupported language code!")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return pipeline("text2text-generation", model=seq2seq_model, tokenizer=tokenizer)

# Step 4: Generate questions based on extracted text
def generate_questions(text, language_code="en"):
    """Generate questions for ``text`` with the model for ``language_code``.

    The text is cut into chunks by ``split_text_into_chunks`` and each chunk
    is sent to the pipeline separately, prefixed with ``"generate questions: "``.

    Args:
        text: Source passage(s) to generate questions from.
        language_code: Language code understood by ``load_model``.

    Returns:
        list: The pipeline's outputs for all chunks, flattened into a single
        list in chunk order.
    """
    generator = load_model(language_code)
    return [
        result
        for piece in split_text_into_chunks(text)
        for result in generator(f"generate questions: {piece}")
    ]

# Step 5: Main function for processing PDF and generating questions
def process_pdf_for_questions(pdf_path,user_input, language_code="en", verbose=False):
    """Run the whole pipeline: PDF -> relevant passages -> generated questions.

    Args:
        pdf_path: Location of the PDF to read.
        user_input: Topic used to select the most relevant passages.
        language_code: Language code understood by ``load_model``.
        verbose: When true, print the full list of extracted passages and the
            retrieved passages. Off by default because the first of these
            dumps the entire document to the output.

    Returns:
        list: The outputs of ``generate_questions``; ``[]`` when the PDF
        yields no text, in which case no model is loaded.
    """
    raw_text = extract_text_from_pdf(pdf_path)
    paragraphs = split_text_into_paragraphs(raw_text)
    if verbose:
        print(paragraphs)
    if not paragraphs:
        # Nothing to rank or to ask about; stop before the retrieval step,
        # which cannot work on an empty corpus.
        return []

    # Retrieve relevant documents
    relevant_documents = retrieve_documents(user_input, paragraphs)
    if verbose:
        print(relevant_documents)

    # Generate the refined prompt
    refined_prompt = generate_refined_prompt(relevant_documents, user_input)
    return generate_questions(refined_prompt, language_code)

# Example usage: ask for a topic, run the pipeline and list the questions.
user_input = input("enter the topic: ").strip()  # drop accidental surrounding spaces
pdf_path = "/content/KarmaYoga.pdf"  # Replace with your PDF file path
language_code = "en"  # Change to 'hi' for Hindi, 'sa' for Sanskrit
questions = process_pdf_for_questions(pdf_path, user_input, language_code)

# Output questions
if not questions:
    print("No questions were generated.")
for idx, q in enumerate(questions, start=1):
    print(f"Question {idx}: {q['generated_text']}")

enter the topic: our principles lead the way to our destiny
<class 'str'>
["Karma Yoga\nKarma Yoga\nA book by Swami Vivekananda\nBased on lectures the Swami delivered in his rented rooms at 228 W\n39th Street in December, 1895 and January, 1896. The classes were\nfree of charge. Generally the Swami held two classes daily- morning\nand evening.\nAlthough the Swami delivered many lectures and held numerous classes\nin the two years and five months he had been in America, these lectures\nconstituted a departure in the way they were recorded. Just prior to the\ncommencement of his Winter -95-96 season in NYC, his friends and\nsupporters aided him by advertising for and ultimately hiring a\nprofessional stenographer: The man selected, Joseph Josiah Goodwin,\nlater became a disciple of the Swami and followed him to England andGoodwin's transcriptions of the Swami's lectures form the basis of five\nbooks.\nCHAPTER 1\nKarma in its effect on\ncharacter\nfile:///C|/Documents%20and%20Settings/Chi

Link to the implementation of the question generator for Hindi PDFs:
https://colab.research.google.com/drive/1mBYBP2MxfZboo8UqsBZTHxSwFkVxQ7xr#scrollTo=1_bv-6alT5R8