In [1]:
from dataclasses import dataclass, field
from datasets import Dataset

In [2]:
def format_options(options):
    """Render answer options as lettered lines.

    Each option is prefixed with a consecutive letter starting at "A"
    (``"A: ..."``, ``"B: ..."``, ...) and the lines are joined with
    newline characters.

    Args:
        options: Iterable of option values; each is interpolated with
            its default string formatting.

    Returns:
        A single string with one ``"<letter>: <option>"`` line per option,
        or an empty string when ``options`` is empty.
    """
    first_label = ord("A")
    labelled = []
    for index, option in enumerate(options):
        # Labels are produced by offsetting from the code point of "A".
        label = chr(first_label + index)
        labelled.append(f'{label}: {option}')
    return '\n'.join(labelled)

In [3]:
@dataclass
class QuestionItem:
    question: str
    options: list[str]
    reasoning: str | None = field(default=None)
    answer: str | None = field(default=None)

@dataclass
class Entry:
    """A problem together with the questions attached to it.

    Attributes:
        problem: The problem text; defaults to an empty string.
        questions: The ``QuestionItem`` objects for this problem; each
            instance gets its own fresh empty list by default.
    """

    problem: str = ""
    # default_factory keeps instances from sharing one mutable list.
    questions: list[QuestionItem] = field(default_factory=list)

In [4]:
def create_train_dataset(entries: list[Entry]) -> Dataset:
    """Flatten entries into one training row per fully-populated question.

    Questions whose ``reasoning`` or ``answer`` is ``None`` are skipped.
    Every remaining question yields a row with the keys ``"problem"``,
    ``"question"``, ``"options"`` (formatted by ``format_options``),
    ``"reasoning"`` and ``"answer"``.

    Args:
        entries: The entries whose questions should be flattened.

    Returns:
        A ``Dataset`` built from the collected rows via ``Dataset.from_list``.
    """
    rows = [
        {
            "problem": entry.problem,
            "question": item.question,
            "options": format_options(item.options),
            "reasoning": item.reasoning,
            "answer": item.answer,
        }
        for entry in entries
        for item in entry.questions
        # Keep only questions that carry both a reasoning and an answer.
        if item.reasoning is not None and item.answer is not None
    ]
    return Dataset.from_list(rows)

In [5]:
import pickle

In [6]:
# Load the pickled entry lists. Context managers ensure both file handles
# are closed as soon as each load finishes (the bare open() calls leaked them).
# NOTE: pickle.load can execute arbitrary code during deserialization; only
# load files that come from a trusted source.
with open("./entries.pkl", "rb") as entries_file:
    et = pickle.load(entries_file)
with open("./entries_aug.pkl", "rb") as entries_aug_file:
    et_aug = pickle.load(entries_aug_file)

In [7]:
# Build the training dataset from the entries loaded from entries.pkl followed
# by those from entries_aug.pkl. The `+` assumes both unpickled objects are
# lists (or otherwise support concatenation) -- TODO confirm.
ds = create_train_dataset(et + et_aug)

In [9]:
# Write the assembled dataset to the local directory ./train_aug_dataset.
ds.save_to_disk("./train_aug_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/1367 [00:00<?, ? examples/s]