# QA Dataset Generation
Given a raw text file, this notebook generates a custom Hugging Face question-answering (QA) dataset from its contents.

In [None]:
# Load (or reload) IPython's autoreload extension and set it to mode 2, which
# re-imports all modules before each cell executes, so edits to source files
# are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

from IPython.core import ultratb

# Override the style IPython uses to highlight lines in verbose tracebacks
# (a dark purple background).
ultratb.VerboseTB.tb_highlight = "bg:#3e0054"

In [None]:
# `os` is imported unconditionally: later cells call `os.getenv`, which would
# raise NameError if `os` were only imported in the fallback branch below.
import os

try:
    # Probe whether the `syftr` package can be imported from the current
    # working directory.
    from syftr.configuration import cfg  # noqa: F401
except Exception:
    # The import (or the configuration loading it triggers) failed, so move one
    # directory up; later cells import `syftr` again from there. `Exception` is
    # caught instead of using a bare `except:` so that KeyboardInterrupt and
    # SystemExit are not swallowed.
    os.chdir('./../')

In [None]:
# User configuration: edit the values in this cell before running the notebook.
DATA_FILEPATH = "data.md"  # Path to the raw text file
CHUNK_SIZE = 200  # Size of each text chunk, in characters
LLMS = [  # adjust to LLMs you want to use for question generation
    "gpt-4o-mini",
    "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
    "Qwen/Qwen3-32B",
    "google/gemma-3-27b-it",
    "microsoft/Phi-4-multimodal-instruct",
]  # We randomly select one of the provided LLMs per chunk
NUM_PARALLEL = 50  # Number of worker threads used to process chunks concurrently
CUSTOM_QA_INSTRUCTIONS = None  # Add instructions that are specific to your QA generation task
# This assertion fails while CUSTOM_QA_INSTRUCTIONS is None or empty, so the
# notebook cannot proceed until instructions have been filled in above.
assert CUSTOM_QA_INSTRUCTIONS, "Please provide custom instructions for the QA generation."

DATASET_NAME = None
# This assertion fails while DATASET_NAME is None or empty; set a name above.
assert DATASET_NAME, "Please set the DATASET_NAME variable to a valid dataset name."
HF_DATASET_NAME = f"DataRobot-Research/{DATASET_NAME}"  # Adjust name of the dataset on Hugging Face Hub
# Read the token from the environment; requires `os` to have been imported by an earlier cell.
HF_TOKEN = os.getenv("HF_TOKEN")  # provide your HF token with write access

assert HF_TOKEN, "Please set the HF_TOKEN environment variable with your Hugging Face token."

# Show only the first and last four characters so the full token is not printed.
print(f"Using Hugging Face token: {HF_TOKEN[:4]}...{HF_TOKEN[-4:]}")

In [None]:
def load_text(file_path: str) -> str:
    """Return the entire contents of the UTF-8 text file at ``file_path``.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded file contents as a single string.
    """
    with open(file_path, encoding="utf-8", mode="r") as source:
        contents = source.read()
    return contents

In [None]:
# Read the whole input document into memory and report its length in characters.
raw_text = load_text(DATA_FILEPATH)
print(f"Loaded {len(raw_text)} characters from {DATA_FILEPATH}")

In [None]:
def chunk_text(text: str, chunk_size: int = 1000) -> list[str]:
    """Split ``text`` into consecutive, non-overlapping character chunks.

    Chunk boundaries are purely positional: every chunk has ``chunk_size``
    characters except possibly the last one, which holds the remainder.

    Args:
        text: The text to split.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        The chunks in document order. An empty ``text`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not a positive integer.
    """
    # Without this check a negative size silently produces no chunks at all
    # (dropping the whole document) and zero fails with an opaque range() error.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
    return [text[start : start + chunk_size] for start in range(0, len(text), chunk_size)]

In [None]:
# Split the document into fixed-size character chunks; the last chunk may be
# shorter than CHUNK_SIZE.
chunks = chunk_text(raw_text, CHUNK_SIZE)
print(f"Created {len(chunks)} chunks of size {CHUNK_SIZE} characters.")

In [None]:
from tenacity import retry, stop_after_attempt, wait_fixed
from syftr.llm import get_llm


@retry(stop=stop_after_attempt(5), wait=wait_fixed(2))
def generate(prompt: str, llm_name: str, **kwargs):
    """Complete ``prompt`` with the named LLM and return the generated text.

    The call is wrapped in a tenacity retry: up to 5 attempts with a fixed
    2-second wait between them.

    Args:
        prompt: The prompt to send to the model.
        llm_name: Name used to look the model up via ``get_llm``.
        **kwargs: Extra keyword arguments forwarded to ``llm.complete``.

    Returns:
        The ``text`` attribute of the completion response.
    """
    model = get_llm(llm_name)
    assert model is not None, f"LLM {llm_name} not found."
    return model.complete(prompt=prompt, **kwargs).text

In [None]:
def generate_qa_from_chunk(
    chunk: str, llm_name: str, **kwargs
) -> str:
    """Ask an LLM to write one question/answer pair about ``chunk``.

    The prompt asks for plain text in the form ``Question: ...`` followed by
    ``Answer: ...`` and appends the notebook-level ``CUSTOM_QA_INSTRUCTIONS``.

    Args:
        chunk: The text excerpt the question and answer must be based on.
        llm_name: Name of the LLM to use; passed through to ``generate``.
        **kwargs: Extra generation arguments forwarded to ``generate``.

    Returns:
        The model output with leading and trailing whitespace removed.
    """
    # Prompt wording fixes: "plan text" -> "plain text", "whereever" -> "wherever".
    prompt = f"""Generate a question and answer based on the text below. Make sure to not use special formatting, like markdown, but formulate the question and the answer in a plain text format. Start with the question followed by the answer. The question should be clear and concise, and the answer should be informative and directly related to the question, for instance,
    
    Question: Who is in charge of the project SuperGold?

    Answer: The project is led by Dr. Jane Smith.

    Note that the question should always be specific, for instance, don't use generic terms like "the text" but always be specific about what you mean and use concrete names wherever possible. Same with images and tables: make sure you can specify which table or image your question is about or do not ask this question. The answer should be a direct response to the question, providing relevant information from the text chunk provided below.
    If you cannot generate a question and answer based on the text, return an empty string.
    Moreover, follow these custom instructions: \n\n{CUSTOM_QA_INSTRUCTIONS}\n\n

    Chunk: \n\n{chunk}"""
    response = generate(prompt, llm_name, **kwargs)
    return response.strip()

In [None]:
import re
import typing as T

def parse_qa_pairs(text: str, llm_name: str | None) -> T.List[T.Dict[str, str]]:  
    pattern = r"Question:\s*(.*?)\s*Answer:\s*(.*)"
    matches = re.findall(pattern, text, re.DOTALL)
    parsed_pairs = []
    for question, answer in matches:
        pair = {"question": question.strip(), "answer": answer.strip()}
        if llm_name:
            pair["llm_name"] = llm_name
        parsed_pairs.append(pair)
    return parsed_pairs

In [None]:
import random
from concurrent.futures import ThreadPoolExecutor, as_completed


def get_qa_pairs_from_chunks(chunks: T.List[str]) -> T.List[T.Dict[str, str]]:
    """Generate Q&A pairs for all chunks concurrently on a thread pool.

    Each chunk is handled by one task that picks a random model from ``LLMS``,
    asks it for a question/answer pair, and parses the output.

    Args:
        chunks: Text chunks to generate questions and answers from.

    Returns:
        All parsed pairs, collected in task-completion order (not chunk order).
        An exception raised by any task propagates to the caller.
    """

    def _qa_for_chunk(piece: str) -> T.List[T.Dict[str, str]]:
        # Choose the model per chunk so the dataset mixes several LLMs.
        model_name = random.choice(LLMS)
        raw_output = generate_qa_from_chunk(
            piece, model_name, max_tokens=1024, temperature=0.7
        )
        return parse_qa_pairs(raw_output, model_name)

    collected = []
    with ThreadPoolExecutor(max_workers=NUM_PARALLEL) as pool:
        pending = [pool.submit(_qa_for_chunk, piece) for piece in chunks]
        for finished in as_completed(pending):
            collected += finished.result()
    return collected

In [None]:
# Run Q&A generation over every chunk (this calls the configured LLMs) and
# report how many pairs were parsed from the model outputs.
qa_pairs = get_qa_pairs_from_chunks(chunks)
print(f"Generated {len(qa_pairs)} Q&A pairs from {len(chunks)} chunks.")

In [None]:
# for pair in qa_pairs:
#     print("-"* 40)
#     print(f"LLM: {pair.get('llm_name', 'Unknown')}\nQuestion: {pair['question']}\nAnswer: {pair['answer']}\n")

**Adjust the parameters to make a custom split based on your needs and the amount of data generated.**

In [None]:
import datasets

from syftr.configuration import cfg


def gen_partitions(
    qa_pairs: T.List[T.Dict[str, str]],
    *,
    train_size: int = 200,
    test_size: int = 200,
    sample_size: int = 5,
) -> datasets.DatasetDict:
    """Split the Q&A pairs into train/test/holdout/sample partitions.

    Partitions are taken by position: the first ``train_size`` pairs form
    "train", the next ``test_size`` pairs form "test", and everything after
    that forms "holdout". "sample" repeats the first ``sample_size`` pairs (it
    overlaps "train") and is meant for quick testing.

    Args:
        qa_pairs: Question/answer records to partition.
        train_size: Number of pairs in the "train" split.
        test_size: Number of pairs in the "test" split.
        sample_size: Number of pairs in the "sample" split.

    Returns:
        A ``DatasetDict`` with the keys "train", "test", "holdout" and "sample".

    Raises:
        ValueError: If any of the sizes is negative.
    """
    if min(train_size, test_size, sample_size) < 0:
        # Negative values would be interpreted as from-the-end slice indices
        # and silently produce overlapping or truncated partitions.
        raise ValueError("train_size, test_size and sample_size must be non-negative")
    test_end = train_size + test_size
    data = datasets.DatasetDict(
        {
            "train": datasets.Dataset.from_list(qa_pairs[:train_size]),
            "test": datasets.Dataset.from_list(qa_pairs[train_size:test_end]),
            "holdout": datasets.Dataset.from_list(qa_pairs[test_end:]),
            "sample": datasets.Dataset.from_list(qa_pairs[:sample_size]),  # for quick testing
        }
    )
    return data


# Build the partitioned dataset and report the split sizes (the "sample" split
# is not included in the message).
dataset = gen_partitions(qa_pairs)
print(f"Generated dataset with {len(dataset['train'])} training samples, {len(dataset['test'])} test samples, and {len(dataset['holdout'])} holdout samples.")

# Persist the dataset under the configured datasets directory, in a
# sub-directory named after DATASET_NAME.
dataset.save_to_disk(cfg.paths.datasets_dir / DATASET_NAME)
print(f"Dataset saved to {cfg.paths.datasets_dir / DATASET_NAME}")

In [None]:
# Upload all splits to the Hugging Face Hub as a private dataset repository,
# authenticating with HF_TOKEN.
dataset.push_to_hub(HF_DATASET_NAME, private=True, token=HF_TOKEN)
print(f"Dataset pushed to Hugging Face Hub as '{HF_DATASET_NAME}'.")