In [None]:
import concurrent.futures
import os
import re
from difflib import SequenceMatcher

import openai
import pdftotext
from tqdm import tqdm

In [None]:
# Set up the OpenAI API client.
# The key is taken from the OPENAI_API_KEY environment variable so that a real
# secret never has to be saved inside the notebook; the placeholder string is
# only used as a fallback when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "Your_API_Key")

In [None]:
# Similarity threshold: a newly generated question is rejected when its
# SequenceMatcher ratio against any already accepted question exceeds this value
similarity = 0.8

# How many chunks of the document should be processed
# (-1 is intended to mean "no limit", i.e. the whole document)
num_chunks = 6

# Number of questions to generate per chunk
num_questions = 2

# Number of characters per chunk of document text
chunk_size = 4000


In [None]:
# Load the source PDF (opened in binary mode and handed to pdftotext)
with open("regelung.pdf", "rb") as f:
    pdf = pdftotext.PDF(f)

# Variant for a password-protected PDF
#with open("secure.pdf", "rb") as f:
#    pdf = pdftotext.PDF(f, "secret")

# Number of pages
#print(len(pdf))

# Iterate over all the pages
#for page in pdf:
#    print(page)

# Read individual pages by index
#print(pdf[0])
#print(pdf[1])

# Print the whole document as one string
#print("\n\n".join(pdf))

# Join the text of all pages into one string, separated by blank lines
text_long = "\n\n".join(pdf)

In [None]:
# Cut the text into chunks of chunk_size characters and put the chunks into a list.
chunk_list = [text_long[i:i+chunk_size] for i in range(0, len(text_long), chunk_size)]

# Use only the first num_chunks elements of the chunk list.
# A negative value (-1) means "no limit": it must be special-cased, because a
# plain chunk_list[:-1] would silently drop the last chunk instead.
if num_chunks >= 0:
    chunk_list = chunk_list[:num_chunks]

# Print the list
#print(chunk_list)

In [None]:
def generate_flashcards(chunk, existing_questions):
    """Generate one question/answer flashcard for a text chunk via the OpenAI API.

    Args:
        chunk: Text taken from the document; runs of whitespace are collapsed
            before it is placed into the prompt.
        existing_questions: Collection of already accepted question strings.
            It may be mutated by other threads while this function runs.

    Returns:
        A dict with the keys "question" and "answer", or None when the
        generated question is too similar (SequenceMatcher ratio above the
        module-level ``similarity`` threshold) to an existing question.
        In the None case no answer is requested from the API.
    """
    # Clean the input chunk: trim it and collapse every whitespace run to one space
    clean_chunk = re.sub(r"\s+", " ", chunk.strip())

    # Generate a question using the OpenAI API
    question_prompt = f"Generate a specific technical postgrad level german question based on the following text:\n\n{clean_chunk}\n\n"
    question = openai.Completion.create(
        engine="text-davinci-003",
        prompt=question_prompt,
        temperature=0.5,
        max_tokens=400,
        n=1,
        stop=None,
    ).choices[0].text.strip()

    # Take a snapshot before comparing: the collection is shared between worker
    # threads, and iterating a set while another thread adds to it raises
    # "RuntimeError: Set changed size during iteration".
    known_questions = tuple(existing_questions)

    # Reject the question if it is too similar to one that already exists;
    # the caller decides whether to try again.
    if any(SequenceMatcher(None, question, known).ratio() > similarity for known in known_questions):
        return None

    # Generate an answer using the OpenAI API (prompted with the question only)
    answer_prompt = f"\n\n{question}\n\n Antworte einfach, technisch und spezifisch."
    answer = openai.Completion.create(
        engine="text-davinci-003",
        prompt=answer_prompt,
        temperature=0.8,
        max_tokens=3500,
        n=1,
        stop=None,
    ).choices[0].text.strip()

    return {"question": question, "answer": answer}

In [None]:
# Shared result list: every accepted flashcard dict is appended here
question_answer_list = []

# Questions accepted so far; used to reject questions that are too similar
existing_questions = set()

# Progress bar whose total is the number of chunks to process
progress_bar = tqdm(total=len(chunk_list))

# Define a function to generate flashcards for a single chunk
def generate_flashcards_for_chunk(chunk, max_attempts=None):
    """Generate up to ``num_questions`` flashcards for one text chunk.

    Accepted flashcards are appended to the module-level
    ``question_answer_list`` and their questions are added to
    ``existing_questions``. The progress bar is advanced once when the chunk
    is finished, because its total is the number of chunks.

    Args:
        chunk: Text of the chunk to generate flashcards for.
        max_attempts: Upper bound on calls to ``generate_flashcards`` for this
            chunk. Defaults to five attempts per requested flashcard. Without
            a bound the loop would never end (and keep calling the API) when
            every generated question is rejected as too similar.

    Returns:
        None. Results are delivered through the shared list and set.
    """
    num_flashcards = num_questions
    if max_attempts is None:
        max_attempts = num_flashcards * 5

    flashcards_generated = 0
    attempts = 0
    while flashcards_generated < num_flashcards and attempts < max_attempts:
        attempts += 1
        # Generate a flashcard for the chunk (None means "too similar, rejected")
        flashcard = generate_flashcards(chunk, existing_questions)
        if flashcard:
            # Add the flashcard to the question answer list
            question_answer_list.append(flashcard)
            # Remember the question so later candidates are compared against it
            existing_questions.add(flashcard["question"])
            flashcards_generated += 1

    # One tick per finished chunk, matching the bar's total
    progress_bar.update(1)

In [10]:
# Use concurrent futures to generate flashcards for all chunks in parallel
with concurrent.futures.ThreadPoolExecutor() as executor:
    # For each chunk, submit a task to generate flashcards for that chunk
    futures = [executor.submit(generate_flashcards_for_chunk, chunk) for chunk in chunk_list]

    # Wait for all tasks to complete
    concurrent.futures.wait(futures)

# Close the progressbar
progress_bar.close()

# An exception raised inside a worker is stored on its future and would
# otherwise be lost silently, so report every chunk that failed.
for chunk_index, future in enumerate(futures):
    error = future.exception()
    if error is not None:
        print(f"Chunk {chunk_index} failed: {error!r}")

# Print the generated question answer pairs
for qa_pair in question_answer_list:
    print(qa_pair)


  0%|          | 0/7 [00:01<?, ?it/s]
