## Load additional QA collection

In [5]:
import re
from pathlib import Path
from typing import Dict
from evaluator.models.qa import QA, QACollection


def parse_questions(filepath: Path) -> Dict[int, str]:
    """Read a numbered question file and return ``{question number: text}``.

    The file is decoded as UTF-8 and cut into entries at every newline that
    is directly followed by ``<digits>.`` and a whitespace character. Chunks
    that do not start with such a number (e.g. a heading before the first
    question) are ignored. Inside each question, a line break immediately
    before a choice marker ``(A)``..``(D)`` is replaced by a single space so
    the choices follow the stem on one line; any other line breaks are kept.

    Args:
        filepath: Path of the text file holding the questions.

    Returns:
        Mapping from question number to the cleaned question text. If a
        number occurs more than once, the last occurrence wins.
    """
    entry_pattern = re.compile(r"(?P<num>\d+)\.\s+(?P<text>.+)", re.DOTALL)
    choice_break = re.compile(r"\n\((?=[A-D]\))")

    text = filepath.read_text(encoding="utf-8").strip()
    parsed: Dict[int, str] = {}

    # Lookahead split: the number stays at the start of the following chunk.
    for chunk in re.split(r"\n(?=\d+\.\s)", text):
        found = entry_pattern.match(chunk)
        if found is None:
            continue

        body = found.group("text").strip()
        # Pull each "(A)".."(D)" choice up onto the same line as the stem.
        parsed[int(found.group("num"))] = choice_break.sub(" (", body)

    return parsed


def parse_answers(filepath: Path) -> Dict[int, str]:
    """Read an answer-key file and return ``{question number: letter}``.

    The file is decoded as UTF-8 and scanned line by line. A line counts
    only if it starts with ``<digits>.``, optional whitespace, and a single
    letter ``A``-``D`` that ends at a word boundary (so ``5. Dog`` is not
    read as answer ``D``). All other lines are skipped.

    Args:
        filepath: Path of the text file holding the answer key.

    Returns:
        Mapping from question number to its answer letter. If a number
        occurs more than once, the last occurrence wins.
    """
    answer_line = re.compile(r"(?P<num>\d+)\.\s*(?P<ans>[A-D])\b")
    lines = filepath.read_text(encoding="utf-8").strip().splitlines()

    # match() anchors at the start of each line; non-matching lines give None.
    hits = (answer_line.match(line) for line in lines)
    return {int(hit.group("num")): hit.group("ans") for hit in hits if hit}


def build_qa_collection(questions: Dict[int, str], answers: Dict[int, str]) -> QACollection:
    """Pair each question with its answer and wrap the result in a QACollection.

    Args:
        questions: Question text keyed by question number.
        answers: Answer keyed by question number.

    Returns:
        A ``QACollection`` whose ``qa_map`` holds one ``QA`` per question
        number that has a non-empty answer. Questions with no (or an empty)
        answer are left out silently; answers with no question are ignored.
    """
    paired: Dict[int, QA] = {
        number: QA(question=text, answer=answers[number])
        for number, text in questions.items()
        if answers.get(number)  # skips missing and empty answers alike
    }
    return QACollection(qa_map=paired)

In [6]:
# Source files: the numbered multiple-choice questions and their answer file.
# Both paths are relative to the current working directory.
questions_path = Path("raw/mc_from_guide.txt")
answers_path = Path("raw/mc_from_guide_ans.txt")

# Both maps are keyed by question number.
questions = parse_questions(questions_path)
answers = parse_answers(answers_path)

# Questions without a matching answer are left out of the collection.
collection = build_qa_collection(questions, answers)

In [7]:
# Inspecting data: print the whole collection as indented JSON.
# The output grows with the number of questions in the collection.
print(collection.model_dump_json(indent=2))

{
  "qa_map": {
    "1": {
      "question": "Before 800 B.C.E. Indo-European steppe tribes were different from Chinese, Indian, and Middle Eastern societies in which of these ways? (A) Steppe societies were more likely to have built architectural monuments with religious symbolism. (B) Indo-European societies were ruled by oligarchies, while the other societies were governed by monarchies. (C) Indo-European tribes did not develop a common religion on which to base social bonds. (D) Chinese, Indian, and Middle Eastern societies formed permanent settlements with wealth based on land.",
      "answer": "D"
    },
    "2": {
      "question": "A major factor in the spread of Eastern Orthodoxy was (A) the Mongol invasions of the Balkans and Kiev Russia (B) the development of the Cyrillic alphabet (C) the use of icons and symbols in religious ceremonies (D) integration of folk customs and practices into religious doctrine",
      "answer": "B"
    },
    "3": {
      "question": "During the

## Add to existing collection

In [8]:
import json

def write_qa_collection_to_file(filepath: Path, new_collection: QACollection):
    """Append ``new_collection`` to the JSON collection stored at ``filepath``.

    The entries already in the file keep their keys. The entries of
    ``new_collection`` are renumbered consecutively, starting right after the
    largest existing key (or at 1 when there is nothing yet); their original
    keys are discarded. The merged collection is written back as indented
    JSON, creating the parent directory if needed.

    If the existing file cannot be read or validated, the error is printed
    and the merge starts from an empty collection. The unreadable file is
    first moved to ``<name>.bak`` (replacing any earlier backup) so its
    contents are not destroyed by the write.

    Note: the function is not idempotent; calling it twice with the same
    collection stores the entries twice under different keys.

    Args:
        filepath: JSON file to read the existing collection from and to
            write the merged collection to.
        new_collection: Collection whose entries are appended.
    """
    existing_collection = QACollection(qa_map={})

    if filepath.exists():
        try:
            existing_data = json.loads(filepath.read_text(encoding="utf-8"))
            existing_collection = QACollection(**existing_data)
        except Exception as e:
            print(f"Error loading existing file: {e}")
            existing_collection = QACollection(qa_map={})
            # The write below would otherwise overwrite data we failed to
            # load; keep a copy so it can be repaired by hand.
            if filepath.is_file():
                backup_path = filepath.with_name(filepath.name + ".bak")
                filepath.replace(backup_path)
                print(f"Unreadable file preserved as: {backup_path}")

    # Determine offset for new keys (0 when there are no existing entries).
    # Assumes qa_map keys are integers after validation - TODO confirm
    # against the QACollection model.
    max_key = max(existing_collection.qa_map, default=0)

    # Renumber the new entries so they continue after the existing ones.
    shifted_new_map = dict(
        enumerate(new_collection.qa_map.values(), start=max_key + 1)
    )

    # Merge maps; the key ranges are disjoint, so nothing is overwritten.
    merged_map = {**existing_collection.qa_map, **shifted_new_map}
    final_collection = QACollection(qa_map=merged_map)

    # Write back to file, making sure the target directory exists first.
    filepath.parent.mkdir(parents=True, exist_ok=True)
    filepath.write_text(final_collection.model_dump_json(indent=2), encoding="utf-8")



# Append the parsed collection to the processed file. Every run of this cell
# appends the entries again under fresh keys, so re-running it duplicates them.
write_qa_collection_to_file(Path("processed/ap_history_qa.json"), collection)