This notebook's purpose is to create a set of questions by choosing ten chunks of text at random and using an LLM to generate a question and an answer for each chunk.

# Import libraries

In [1]:
import os
import random
import re

import pandas as pd
from dotenv import load_dotenv
from langchain.docstore.document import Document as LangchainDocument
from langchain.text_splitter import RecursiveCharacterTextSplitter
from openai import AzureOpenAI
from tqdm.auto import tqdm

In [None]:
# Load variables from a local .env file (if one is present) into the process environment,
# so that the os.environ lookups used for the OpenAI settings can find them.
load_dotenv()

# Retrieve Wikipedia pages to interrogate

In [3]:
# Read every file found in sample_data/ into memory, one string per page.
#
# - Entries are visited in sorted order: os.listdir() returns names in arbitrary
#   order, which would otherwise make the page order (and therefore the chunk
#   order downstream) differ from one run or machine to another.
# - Anything that is not a regular file (e.g. a sub-directory) is skipped,
#   because open() would raise on it.
list_wikipedia_page = []
for document in sorted(os.listdir("sample_data")):
    document_path = os.path.join("sample_data", document)
    if not os.path.isfile(document_path):
        continue
    with open(document_path, encoding="utf-8") as f:
        list_wikipedia_page.append(f.read())

In [5]:
# Wrap each raw page in a LangChain document, then cut every document into chunks.

langchain_docs = [LangchainDocument(page_content=page_text) for page_text in tqdm(list_wikipedia_page)]

# Splitter configuration: chunk size and overlap are the values passed below;
# add_start_index=True asks the splitter to record where each chunk starts.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", " ", ""],
    chunk_size=2000,
    chunk_overlap=200,
    add_start_index=True,
)

# Split the documents one at a time and gather all resulting chunks in a single flat list.
docs_processed = []
for doc in langchain_docs:
    docs_processed.extend(text_splitter.split_documents([doc]))

  0%|          | 0/10 [00:00<?, ?it/s]

In [6]:
# Sanity check: report the total number of chunks collected in docs_processed.
print(f"When we split our documents, we end up with {len(docs_processed)} chunks.")

When we split our documents, we end up with 341 chunks.


# Create our LLM Client
---

We will need it in order to generate our questions / answers from our chunks.

In [None]:
# Connection settings are read from the environment; a variable that is not set yields None.
OPENAI_API_ENDPOINT = os.getenv("OPENAI_API_ENDPOINT")
OPENAI_API_VERSION = os.getenv("OPENAI_API_VERSION")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Read here but not passed to the client built below.
OPENAI_EMBEDDING_MODEL = os.getenv("OPENAI_EMBEDDING_MODEL")

# Azure OpenAI client used for every chat completion request in this notebook.
aoai_client = AzureOpenAI(
    api_version=OPENAI_API_VERSION,
    api_key=OPENAI_API_KEY,
    azure_endpoint=OPENAI_API_ENDPOINT,
)

In [None]:
# Prompt template used to generate one question / answer couple per chunk.
# Its single placeholder, {context}, is filled with the chunk text through str.format().
# The "Factoid question: " and "Answer: " labels requested below are the markers the
# generation loop splits the model reply on, so keep them in sync if the wording changes.
# Can be modified if we want to generate more complicated questions.

QA_generation_prompt = """
Your task is to write a complicated factoid question and an answer given a context.
The question should demand a good understanding of the context to be answered correctly.
Your factoid question should be answerable with one specific, concise piece of factual information from the context.
Your factoid question should be formulated in the same style as questions users could ask in a search engine.
This means that your factoid question MUST NOT mention something like "according to the passage" or "context".

Provide your answer as follows:

Output:::
Factoid question: (your factoid question)
Answer: (your answer to the factoid question)

Now here is the context.

Context: {context}\n
Output:::"""

In [None]:
# Generating QA couples
#
# For each randomly sampled chunk, ask the model for a question / answer couple and
# store it in `outputs` as a dict with the keys "context", "question" and "answer".
# Replies that do not follow the layout requested by QA_generation_prompt are skipped
# (with a message) instead of being stored as a bogus couple.

n_generations = 10  # We intentionally generate only 10 QA couples here for cost and time considerations

# random.sample() raises ValueError when asked for more items than the population
# holds, so never request more chunks than are actually available.
n_sampled_chunks = min(n_generations, len(docs_processed))

print(f"Generating {n_sampled_chunks} QA couples...")

outputs = []
for sampled_context in tqdm(random.sample(docs_processed, n_sampled_chunks)):
    # Generate QA couple
    completion = aoai_client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": QA_generation_prompt.format(context=sampled_context.page_content)}],
        temperature=0.2,
        top_p=1,
        max_tokens=800,
    )
    # The message content is optional in the OpenAI SDK, so guard against None.
    output_QA_couple = completion.choices[0].message.content or ""

    # The question is the text between the last "Factoid question: " label and the
    # "Answer: " label that follows it; the answer is whatever comes after that label.
    # str.split() never fails on a missing label (it just returns the whole text),
    # so the presence of both labels has to be checked explicitly.
    question_parts = output_QA_couple.rpartition("Factoid question: ")
    answer_parts = question_parts[2].partition("Answer: ")
    # Strip the newline / spaces surrounding each field.
    question = answer_parts[0].strip()
    answer = answer_parts[2].strip()

    if not (question_parts[1] and answer_parts[1] and question and answer):
        print("Skipping one chunk: the model reply does not follow the 'Factoid question: ... Answer: ...' layout.")
        continue

    # More constraints could be put on the generated couples here (e.g. a maximum answer length).
    outputs.append(
        {
            "context": sampled_context.page_content,
            "question": question,
            "answer": answer,
        }
    )

In [None]:
# Display the generated QA couples (context, question, answer) for a quick visual check.
outputs

[{'context': '== Future ==\n\n\n=== Superintelligence and the singularity ===\nA superintelligence is a hypothetical agent that would possess intelligence far surpassing that of the brightest and most gifted human mind.\nIf research into artificial general intelligence produced sufficiently intelligent software, it might be able to reprogram and improve itself. The improved software would be even better at improving itself, leading to what I. J. Good called an "intelligence explosion" and Vernor Vinge called a "singularity".\nHowever, technologies cannot improve exponentially indefinitely, and typically follow an S-shaped curve, slowing when they reach the physical limits of what the technology can do.\n\n\n=== Transhumanism ===\nRobot designer Hans Moravec, cyberneticist Kevin Warwick, and inventor Ray Kurzweil have predicted that humans and machines will merge in the future into cyborgs that are more capable and powerful than either. This idea, called transhumanism, has roots in Aldo

# Evaluating our questions

In [None]:
# Groundedness evaluates whether the question can be answered unambiguously from the text chunk.
# Placeholders filled through str.format(): {question} and {context}.
# The "Evaluation: " and "Total rating: " labels are the markers the critique loop parses.
question_groundedness_critique_prompt = """
You will be given a context and a question.
Your task is to provide a 'total rating' scoring how well one can answer the given question unambiguously with the given context.
Give your answer on a scale of 1 to 5, where 1 means that the question is not answerable at all given the context, and 5 means that the question is clearly and unambiguously answerable with the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here are the question and context.

Question: {question}\n
Context: {context}\n
Answer::: """

# Relevance evaluates whether the question is interesting in our context
# (questions that are too precise or too vague for the user can then be discarded based on this score).
# Placeholder filled through str.format(): {question}.
question_relevance_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how useful this question can be to machine learning and computer enthusiast trying to know more about the domain and scientists who worked on it.
Give your answer on a scale of 1 to 5, where 1 means that the question is not useful at all, and 5 means that the question is extremely useful.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.

Question: {question}\n
Answer::: """

# Standalone checks that the question is formulated in a way that is independent from any context.
# Placeholder filled through str.format(): {question}.
# NOTE(review): the examples inside the prompt (Gradio, Hub, Hugging Face, ViT checkpoint) look like
# they were carried over from another dataset - consider adapting them to the Wikipedia pages used here.
question_standalone_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how context-independant this question is.
Give your answer on a scale of 1 to 5, where 1 means that the question depends on additional information to be understood, and 5 means that the question makes sense by itself.
For instance, if the question refers to a particular setting, like 'in the context' or 'in the document', the rating must be 1.
The questions can contain obscure technical nouns or acronyms like Gradio, Hub, Hugging Face or Space and still be a 5: it must simply be clear to an operator with access to documentation what the question is about.

For instance, "What is the name of the checkpoint from which the ViT model is imported?" should receive a 1, since there is an implicit mention of a context, thus the question is not independant from the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.

Question: {question}\n
Answer::: """

In [None]:
# Ask the model to critique every QA couple on three criteria (groundedness, relevance,
# standalone) and store, for each criterion whose reply could be parsed, two extra keys
# on the couple: "<criterion>_score" (int) and "<criterion>_eval" (rationale text).
print("Generating critique for each QA couple...")

# Rating line requested by the critique prompts. The pattern tolerates missing or extra
# whitespace after the colon and trailing text after the digits (e.g. "4/5" or "4.").
rating_pattern = re.compile(r"Total rating:\s*(\d+)")

for output in tqdm(outputs):
    critique_prompts = {
        "groundedness": question_groundedness_critique_prompt.format(
            context=output["context"], question=output["question"]
        ),
        "relevance": question_relevance_critique_prompt.format(question=output["question"]),
        "standalone": question_standalone_critique_prompt.format(question=output["question"]),
    }

    # All three critiques are requested before anything is parsed, so an API error
    # still surfaces before the couple has been partially updated.
    evaluations = {
        criterion: aoai_client.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": critique_prompt}],
            temperature=0.2,
            top_p=1,
            max_tokens=800,
        ).choices[0].message.content
        for criterion, critique_prompt in critique_prompts.items()
    }

    # Each criterion is parsed independently: one malformed reply must not discard
    # the scores of the other criteria for the same couple.
    for criterion, evaluation in evaluations.items():
        evaluation = evaluation or ""  # the message content is optional in the OpenAI SDK
        rating_matches = list(rating_pattern.finditer(evaluation))
        if not rating_matches:
            print(f"Skipping the {criterion} critique of one question: no 'Total rating:' found in the model reply.")
            continue

        # The last rating line is the one that counts; the rationale is the text between
        # the "Evaluation: " label and that rating line.
        last_rating = rating_matches[-1]
        rationale_parts = evaluation[: last_rating.start()].partition("Evaluation: ")
        if not rationale_parts[1]:
            print(f"Skipping the {criterion} critique of one question: no 'Evaluation: ' found in the model reply.")
            continue

        output.update(
            {
                f"{criterion}_score": int(last_rating.group(1)),
                f"{criterion}_eval": rationale_parts[2].strip(),
            }
        )

# Save our questions to a pandas DataFrame and export them to CSV

In [None]:
# Collect the QA couples (one row per dict in `outputs`) into a DataFrame and
# export them as a semicolon-separated CSV file, without the index column.
question_dataframe = pd.DataFrame(data=outputs)
question_dataframe.to_csv(
    path_or_buf="question_benchmark.csv",
    sep=";",
    index=False,
)