In [319]:
import json
from pathlib import Path
import re

In [320]:
# Locate the exported dump: the JSON file sits inside the folder of the same name.
output_dir = Path("Microsoft.AZ-204.vDec-2023.by_.Tracy_.132q")
data_path = output_dir / "Microsoft.AZ-204.vDec-2023.by_.Tracy_.132q.json"

# Decode explicitly as UTF-8 so that loading does not depend on the platform's
# locale encoding (the built-in open() otherwise falls back to it).
with open(data_path, "r", encoding="utf-8") as fin:
    data = json.load(fin)

In [321]:
# Reduce every record to its text. Records typed "image" are wrapped in
# "== ... ==" markers (plus a trailing newline) so later cells can spot them.
data_out = []
for record in data:
    text = record["content"]
    data_out.append(f"== {text} ==\n" if record["type"] == "image" else text)

In [322]:
# A case header is an entry that starts with "NN - <title>" followed by a newline.
case_start_ptn = re.compile(r"^[0-9]{2} - .+\n")

# Positions of every header entry, followed by a sentinel one past the last
# entry so that consecutive values can be paired into [start, end) ranges.
case_start_idx = [
    position for position, text in enumerate(data_out) if case_start_ptn.match(text)
]
case_start_idx.append(len(data_out))

In [323]:
# Show the header positions; the final value is the len(data_out) sentinel.
case_start_idx

[4, 21, 94, 102, 126, 141, 164, 181, 223, 244, 441, 453, 471, 795]

In [324]:
# Preview the sliding two-element windows over the start indices. The last
# window holds only the sentinel, which is why it prints a single value.
for offset, _ in enumerate(case_start_idx):
    window = case_start_idx[offset : offset + 2]
    print(window)

[4, 21]
[21, 94]
[94, 102]
[102, 126]
[126, 141]
[141, 164]
[164, 181]
[181, 223]
[223, 244]
[244, 441]
[441, 453]
[453, 471]
[471, 795]
[795]


In [325]:
# Pair each start index with its successor. zip() stops at the shorter input,
# so the sentinel never produces a one-element tail.
case_idx_ranges = [
    [begin, stop] for begin, stop in zip(case_start_idx, case_start_idx[1:])
]
case_idx_ranges

[[4, 21],
 [21, 94],
 [94, 102],
 [102, 126],
 [126, 141],
 [141, 164],
 [164, 181],
 [181, 223],
 [223, 244],
 [244, 441],
 [441, 453],
 [453, 471],
 [471, 795]]

In [326]:
# Dump each case's raw entries to "<n>_<start>-<end>.txt" inside the cases folder.
cases_dir = output_dir / "cases"
cases_dir.mkdir(parents=True, exist_ok=True)
for idx, (start, end) in enumerate(case_idx_ranges, start=1):
    # Write UTF-8 explicitly: the locale default encoding may be unable to
    # represent some characters in the text and would then raise on write.
    with open(cases_dir / f"{idx}_{start}-{end}.txt", "w", encoding="utf-8") as fout:
        fout.writelines(data_out[start:end])

In [327]:
def extract_content(case_txt_list: str) -> dict[str, list[str | dict[str, list[str]]]]:
    question_ptn = re.compile(r"^question \d{1,2}\n", flags=re.IGNORECASE)
    answer_ptn = re.compile(r"^(answer area|correct answer)", flags=re.IGNORECASE)

    title_ele = case_txt_list[0].split("\n")
    content = {"d": [], "qa": [], "t": title_ele.pop(0)}

    case_txt_list[0] = "\n".join(title_ele)
    q_counter = 0
    flag = "d"
    for ln in case_txt_list:
        if re.match(question_ptn, ln):
            if flag == "a":
                q_counter += 1
            flag = "q"
        if re.match(answer_ptn, ln):
            flag = "a"

        if flag in "qa":
            if len(content["qa"]) > q_counter:
                content["qa"][q_counter][flag].append(ln)
            else:
                data = {f: [] for f in "qa"}
                data[flag] = [ln]
                content["qa"].append(data)
        else:
            content[flag].append(ln)
    return content

In [328]:
# Bucket every case's entries into title / description / Q&A, keyed by a
# "<n>_<start>-<end>" label built from the case number and its index range.
cases_data = {
    f"{case_no}_{first}-{last}": extract_content(data_out[first:last])
    for case_no, (first, last) in enumerate(case_idx_ranges, start=1)
}

In [329]:
def extract_question(question_list: list[str]) -> dict[str, str | list[str]]:
    options_idx = 0
    is_mc = False
    for idx, item in enumerate(question_list):
        if "hot area:" in item.lower() or "select and place:" in item.lower():
            options_idx = idx + 1
            break
        elif "A." in item:
            options_idx = idx
            is_mc = True
            break
    question_str = "\n".join(question_list[: options_idx if is_mc else options_idx - 1])
    options_str = "\n".join(question_list[options_idx:])
    return {
        "question": question_str.strip(),
        "options": options_str.strip().split("\n")
        if not is_mc
        else [
            opt[0].strip() for opt in re.findall(r"([A-Z]\..+(\n{0,2}?)(?![A-Z]\.).+)", options_str)
        ],
    }


def clean_explanation_txt(txt: str) -> str:
    """Strip "== ... ==" markers and squeeze runs of newlines.

    Every "== <something> ==" span is removed, then any run of three or more
    consecutive newlines is reduced step by step until at most two remain.
    """
    cleaned = re.sub(r"== (?P<answer>.+) ==", "", txt)
    # Splitting on the triple and re-joining with the double newline is the
    # same as a replace; repeat until no triple newline is left.
    while cleaned.find("\n\n\n") != -1:
        cleaned = "\n\n".join(cleaned.split("\n\n\n"))
    return cleaned


def extract_answer(answer_list: list[str]) -> dict[str, str | list[str]]:
    """Pull the answer(s) and the explanation text out of an answer section.

    The entries are joined with newlines. If the joined text starts with
    "Correct Answer: <value>", the value on that line is split on commas
    (spaces removed) to give the answers. The marker is matched
    case-insensitively, consistent with how ``extract_content`` detects the
    start of an answer section. Otherwise the contents of every "== ... =="
    marker in the text are used as the answers.

    Args:
        answer_list: Text entries making up the answer part.

    Returns:
        ``{"answer": <list[str]>, "explanation": <str>}`` where the explanation
        is the joined text passed through ``clean_explanation_txt``.
    """
    answer_ptn = re.compile(r"Correct Answer: (?P<answer>.+)", flags=re.IGNORECASE)
    answer_str = "\n".join(answer_list)
    # match() anchors at the start of the text, i.e. at the section's first line.
    match = answer_ptn.match(answer_str)
    if match:
        answer = match.group("answer").replace(" ", "").split(",")
    else:
        answer = re.findall(r"== (?P<answer>.+) ==", answer_str)
    return {"answer": answer, "explanation": clean_explanation_txt(answer_str)}


def parse_qa(qa_item: dict[str, list[str]]) -> dict[str, str | list[str]]:
    """Combine the parsed question and parsed answer of one Q/A group.

    ``qa_item["q"]`` is handed to ``extract_question`` and ``qa_item["a"]`` to
    ``extract_answer``; the two resulting dicts are merged into a new dict,
    with the answer's keys winning on any clash.
    """
    merged = dict(extract_question(qa_item["q"]))
    merged.update(extract_answer(qa_item["a"]))
    return merged

In [330]:
# For every case: join the description entries into one string and parse each
# Q/A group into a question / options / answer / explanation record.
parsed_case_data = {
    case_key: {
        "description": "\n".join(case["d"]),
        "questions": [parse_qa(pair) for pair in case["qa"]],
    }
    for case_key, case in cases_data.items()
}

In [331]:
from pprint import pprint

# Write one "<case key>_qa.json" file per case. ensure_ascii=False emits
# non-ASCII characters verbatim, so the file is opened as UTF-8 explicitly
# instead of relying on the platform's locale encoding.
for k, case_record in parsed_case_data.items():
    with open(cases_dir / f"{k}_qa.json", "w", encoding="utf-8") as fout:
        json.dump(case_record, fout, indent=2, ensure_ascii=False)