In [1]:
import fitz
from nltk.tokenize import sent_tokenize
from transformers import T5ForConditionalGeneration, T5Tokenizer
import random

In [2]:
def extract_text_from_pdf(pdf_path):
    """Return the concatenated plain text of every page in a PDF.

    Args:
        pdf_path: Location of the PDF; handed unchanged to ``fitz.open``.

    Returns:
        One string containing each page's ``get_text()`` output in page
        order, with nothing inserted between pages.
    """
    # Open via a context manager so the document is closed even when a page
    # fails to extract; previously the handle was never released.
    with fitz.open(pdf_path) as document:
        # join() avoids rebuilding an ever-growing string once per page.
        return "".join(page.get_text() for page in document)

def process_text(text):
    """Split *text* into sentences.

    Args:
        text: The raw text to segment.

    Returns:
        Whatever ``nltk.tokenize.sent_tokenize`` produces for *text*.
    """
    return sent_tokenize(text)

def _generate_text(prompt, model, tokenizer, num_outputs=1):
    """Tokenise *prompt* once and return ``num_outputs`` decoded generations."""
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    return [
        tokenizer.decode(model.generate(input_ids)[0], skip_special_tokens=True)
        for _ in range(num_outputs)
    ]


def _first_segment(generated):
    """Return the part of *generated* before the first ``<sep>``, stripped."""
    return generated.split("<sep>")[0].strip()


def generate_question_and_answer(text, model, tokenizer):
    """Build one multiple-choice item (question plus four options) from *text*.

    The model is prompted three ways, each prompt being a task prefix followed
    by *text*: ``"generate question: "``, ``"generate answer: "`` and
    ``"generate incorrect answer: "`` (the last one is generated three times).
    Every generated string is cut at its first ``<sep>`` marker and stripped.

    Args:
        text: Source passage the question and answers are generated from.
        model: Object exposing ``generate(input_ids)``; element ``[0]`` of
            the result is passed to ``tokenizer.decode``.
        tokenizer: Object exposing ``encode(text, return_tensors="pt")`` and
            ``decode(ids, skip_special_tokens=True)``.

    Returns:
        A ``(question, correct_answer, all_answers)`` tuple where
        ``all_answers`` is a shuffled list of the three incorrect answers and
        the correct one.  ``correct_answer`` is always an element of
        ``all_answers``.
    """
    # NOTE(review): in the sample run recorded with this notebook every answer
    # option is itself a question, which suggests the checkpoint chosen by the
    # caller does not understand the two "answer" prefixes -- confirm against
    # the model card before relying on the answer key.
    (question,) = _generate_text("generate question: " + text, model, tokenizer)
    (correct_answer,) = _generate_text("generate answer: " + text, model, tokenizer)
    # The distractor prompt is identical for all three generations, so it is
    # tokenised once inside the helper instead of once per iteration.
    incorrect_answers = _generate_text(
        "generate incorrect answer: " + text, model, tokenizer, num_outputs=3
    )

    question = _first_segment(question)
    correct_answer = _first_segment(correct_answer)

    all_answers = [_first_segment(answer) for answer in incorrect_answers]
    all_answers.append(correct_answer)
    # Shuffle so the correct option does not always sit in the last slot.
    random.shuffle(all_answers)

    return question, correct_answer, all_answers

def main(pdf_path, max_sentences=10, model_name="valhalla/t5-small-e2e-qg"):
    """Generate multiple-choice items from the leading sentences of a PDF.

    Args:
        pdf_path: Path of the PDF to read.
        max_sentences: How many sentences from the start of the document are
            turned into items.  Defaults to 10; ``None`` keeps every sentence.
        model_name: Identifier passed to ``from_pretrained`` for both the T5
            model and its tokenizer.

    Returns:
        A list with one ``(question, correct_answer, all_answers)`` tuple per
        processed sentence, in document order.
    """
    text = extract_text_from_pdf(pdf_path)
    # Slicing with None as the bound keeps the whole list.
    sentences = process_text(text)[:max_sentences]

    model = T5ForConditionalGeneration.from_pretrained(model_name)
    tokenizer = T5Tokenizer.from_pretrained(model_name)

    return [
        generate_question_and_answer(sentence, model, tokenizer)
        for sentence in sentences
    ]

# Run the pipeline on the novel's PDF and print the result as a lettered quiz.
pdf_path = "./The-Hound-of-the-Baskervilles.pdf"
qa_pairs = main(pdf_path)

for number, (question, correct_answer, all_answers) in enumerate(qa_pairs, start=1):
    print(f"Question {number}: {question}")
    # Options are labelled A, B, C, ... in the order they appear in the list.
    option_letters = [chr(ord("A") + offset) for offset in range(len(all_answers))]
    for letter, answer in zip(option_letters, all_answers):
        print(f"{letter}. {answer}")
    # index() returns the first matching option, so duplicates resolve to the
    # earliest letter.
    answer_key = option_letters[all_answers.index(correct_answer)]
    print(f"Answer key: {answer_key}")
    print()



Question 1: Who wrote The Hound of the Baskervilles?
A. Who wrote The Hound of the Baskervilles?
B. Who wrote The Hound of the Baskervilles?
C. Who wrote The Hound of the Baskervilles?
D. Who wrote The Hound of the Baskervilles?
Answer key: A

Question 2: Who is A. Conan Doyle?
A. What is the name of the person who is the most famous person?
B. What is the name of the person who is the most famous person?
C. What is the name of the person who is the most truly a person?
D. What is the name of the person who is the most famous person?
Answer key: C

Question 3: What chapter is the Curse of the Baskervilles Chapter 3 The Problem Chapter 4
A. What chapter is the Curse of the Baskervilles?
B. What chapter is the Curse of the Baskervilles?
C. What chapter is the Curse of the Baskervilles?
D. What chapter is the Curse of the Baskervilles?
Answer key: A

Question 4: What was the name of the stick that was left behind by the visitor?
A. What was the name of the stick that was left behind by th