# QA Dataset Generation
Given a raw text file and a JSON file of question–answer pairs, this notebook builds a custom Hugging Face QA dataset (QA examples plus grounding data) and pushes it to the Hugging Face Hub.

In [None]:
# Reload edited project modules before each cell runs, so changes to imported
# code are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

from IPython.core import ultratb

# Override the highlight style used by IPython's verbose tracebacks
# (presumably chosen for readability in the author's theme; purely cosmetic).
ultratb.VerboseTB.tb_highlight = "bg:#3e0054"

In [None]:
import os

# Make the working directory end with "syftr" so project-relative imports and
# paths resolve. Presumably the notebook lives one directory below the repo
# root -- verify. NOTE: only one level is climbed and the result is not
# re-checked, so starting deeper (or re-running from an unrelated directory)
# keeps moving up one level per execution.
if not os.getcwd().endswith("syftr"):
    os.chdir(os.path.dirname(os.getcwd()))
    print(f"Changed working directory to: {os.getcwd()}")

from syftr.configuration import cfg

In [None]:
# Input locations. These are absolute local paths; point them at your own files
# before running the notebook.
DATA_FILEPATH = "/Users/debadeepta.dey/datasets/barclays/rise-insights-report-making-data-count-with-ai-DIGITAL.md"  # Path to the raw text file
QA_PAIRS_FILEPATH = "/Users/debadeepta.dey/datasets/barclays/rise-insights-report-making-data-count-with-ai-DIGITAL-qapairs.json"  # Path to the QA pairs file
CHUNK_SIZE = 8148  # Size of each text chunk, in characters

# Provide a valid dataset name (used as the repository name on the Hugging Face Hub)
DATASET_NAME = "making-data-count-with-ai-2"
# Fail fast on an empty name before any upload is attempted.
assert DATASET_NAME, "Please set the DATASET_NAME variable to a valid dataset name."
# -------------------------------------------------------------------------------------------

DATASET_IS_PRIVATE = True  # Set to False if you want to share the dataset publicly

HF_DATASET_NAME = f"DataRobot-Research/{DATASET_NAME}"  # Adjust name of the dataset on Hugging Face Hub
HF_TOKEN = cfg.hf_datasets.api_key.get_secret_value()  # Get Hugging Face token from configuration

# Fail fast if the configuration did not supply a token.
assert HF_TOKEN, "Please set the HF_TOKEN environment variable with your Hugging Face token."

# Confirm the token was loaded without echoing any part of it: cell outputs are
# saved with the notebook, and a prefix/suffix preview would reveal a short
# token in full.
print("Hugging Face token loaded.")

In [None]:
def load_text(file_path: str) -> str:
    """Read a UTF-8 text file and return its entire contents as one string.

    Args:
        file_path: Location of the text file to read.

    Returns:
        The full decoded contents of the file.
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents

In [None]:
# Read the whole source document into memory; later cells chunk this string.
raw_text = load_text(DATA_FILEPATH)
print(f"Loaded {len(raw_text)} characters from {DATA_FILEPATH}")

In [None]:
def chunk_text(text: str, chunk_size: int = 1000) -> list[str]:
    """Split ``text`` into consecutive, non-overlapping chunks of characters.

    Chunks are cut purely by character count, so a boundary may fall in the
    middle of a word or sentence. The final chunk is shorter than
    ``chunk_size`` when the text length is not an exact multiple of it.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        The chunks in document order. An empty ``text`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
    """
    # A negative step would silently produce no chunks and a zero step makes
    # range() fail with an unrelated message, so reject both explicitly.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}.")
    return [text[start : start + chunk_size] for start in range(0, len(text), chunk_size)]

In [None]:
# Split the document into fixed-size character chunks used as grounding data.
# The reported size is the maximum; the last chunk may be shorter.
chunks = chunk_text(raw_text, CHUNK_SIZE)
print(f"Created {len(chunks)} chunks of size {CHUNK_SIZE} characters.")

In [None]:
# Load QA pairs from the JSON file
import json
def load_qa_pairs(file_path: str) -> list:
    """Parse a UTF-8 encoded JSON file and return the decoded content.

    Args:
        file_path: Location of the JSON file holding the QA pairs.

    Returns:
        Whatever the JSON document decodes to; callers in this notebook
        treat it as a list of QA records.
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        raw_json = handle.read()
    return json.loads(raw_json)

qa_pairs = load_qa_pairs(QA_PAIRS_FILEPATH)
# Preview the first three entries as a quick sanity check
# (assumes the JSON top level is a list).
print(qa_pairs[:3])

**Adjust the split boundaries in `prepare_hf_data` to make a custom train/test/holdout/sample split based on your needs and the amount of data generated.**

In [None]:
import datasets
import typing as T

def get_context(chunks: T.List[str]) -> str:
    """Merge text chunks into a single context string.

    Args:
        chunks: Text pieces to combine, in the order they should appear.

    Returns:
        The chunks joined with a newline between consecutive items; an
        empty list gives an empty string.
    """
    return "\n".join(chunks)

def prepare_hf_data(
    qa_pairs: T.List[T.Dict[str, str]],
    chunks: T.List[str] | None = None,
    all_grounding_data_for_each_partition: bool = True,
    qa_split: T.Tuple[int, int] = (50, 180),
    grounding_split: T.Tuple[int, int] = (100, 200),
    sample_size: int = 5,
) -> T.Tuple[datasets.DatasetDict, datasets.DatasetDict]:
    """Build the QA and grounding ``DatasetDict`` objects for upload.

    Both results have four partitions: ``train``, ``test``, ``holdout`` and
    ``sample`` (a small slice from the start of the data, for quick testing).

    Args:
        qa_pairs: QA records, one dict per example.
        chunks: Text chunks of the source document. Required and non-empty.
        all_grounding_data_for_each_partition: When true, every partition
            except ``sample`` receives all chunks, one chunk per row. When
            false, the chunks are divided between partitions according to
            ``grounding_split`` and each partition holds a single row with
            its chunks joined by newlines.
        qa_split: ``(train_end, test_end)`` indices into ``qa_pairs``:
            train is ``[:train_end]``, test is ``[train_end:test_end]`` and
            holdout is ``[test_end:]``.
        grounding_split: Same convention as ``qa_split``, applied to
            ``chunks``; only used when the chunks are divided.
        sample_size: Number of leading items placed in the ``sample``
            partitions.

    Returns:
        A ``(qa_data, grounding_data)`` tuple.

    Raises:
        ValueError: If ``chunks`` is ``None`` or empty.

    Note:
        Slices past the end of the input are empty, so with the default
        boundaries a short ``qa_pairs`` list leaves ``test``/``holdout``
        without rows. Adjust the split arguments to the amount of data.
    """
    # Validate up front: previously a missing ``chunks`` crashed with a
    # TypeError in the default mode and the message named a parameter
    # (``raw_text``) that this function does not have.
    if not chunks:
        raise ValueError("A non-empty list of chunks must be provided.")

    qa_train_end, qa_test_end = qa_split

    if all_grounding_data_for_each_partition:
        grounding_rows = {
            "train": chunks,
            "test": chunks,
            "holdout": chunks,
            "sample": chunks[:sample_size],
        }
    else:
        chunk_train_end, chunk_test_end = grounding_split
        # Each column handed to Dataset.from_dict must be a sequence of rows,
        # so the joined context is wrapped in a one-element list rather than
        # passed as a bare string.
        grounding_rows = {
            "train": [get_context(chunks[:chunk_train_end])],
            "test": [get_context(chunks[chunk_train_end:chunk_test_end])],
            "holdout": [get_context(chunks[chunk_test_end:])],
            "sample": [get_context(chunks[:sample_size])],
        }

    qa_data = datasets.DatasetDict(
        {
            "train": datasets.Dataset.from_list(qa_pairs[:qa_train_end]),
            "test": datasets.Dataset.from_list(qa_pairs[qa_train_end:qa_test_end]),
            "holdout": datasets.Dataset.from_list(qa_pairs[qa_test_end:]),
            "sample": datasets.Dataset.from_list(qa_pairs[:sample_size]),  # for quick testing
        }
    )
    grounding_data = datasets.DatasetDict(
        {
            partition: datasets.Dataset.from_dict({"text": rows})
            for partition, rows in grounding_rows.items()
        }
    )
    return qa_data, grounding_data

In [None]:
qa_data, grounding_data = prepare_hf_data(qa_pairs, chunks=chunks)

# Both uploads target the same Hub repository with the same visibility and
# credentials; only the config name and data directory differ.
shared_push_kwargs = {
    "repo_id": HF_DATASET_NAME,
    "private": DATASET_IS_PRIVATE,
    "token": HF_TOKEN,
}

# QA examples go under the "qa" config.
qa_data.push_to_hub(config_name="qa", data_dir="examples", **shared_push_kwargs)
print("QA data pushed to Hugging Face Hub.")

# Grounding text goes under the "grounding" config of the same repository.
grounding_data.push_to_hub(
    config_name="grounding", data_dir="grounding_data", **shared_push_kwargs
)
print("Grounding data pushed to Hugging Face Hub.")