In [1]:
import os

In [2]:
# Input text file holding the outline that the cells below parse.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
source_file_path = "/home/poyuan/workspace/MCQG/Category/NBME.txt"

In [4]:
# Read the outline explicitly as UTF-8: it contains non-ASCII punctuation
# (em dashes, curly apostrophes), so relying on the platform's default
# encoding could mis-decode the text or raise UnicodeDecodeError.
with open(source_file_path, 'r', encoding='utf-8') as f:
    lines = f.readlines()
len(lines)

303

In [5]:
# Peek at the first line of the file.
lines[0]

'# APPENDIX B: SAMPLE LEAD-INS BASED ON TASK COMPETENCIES\n'

In [6]:
# Drop the leading "# ..." document title so the outline starts at the first
# "## " section. The guard keeps the cell idempotent: re-running it will not
# strip further lines once the title is gone.
if lines and lines[0].startswith('# '):
    lines = lines[1:]

In [7]:
# Confirm the first remaining line is a "## " (L2) heading.
lines[0]

'## Patient Care: Diagnosis—Causes and Mechanisms \n'

In [8]:
# Output directory for the generated JSON files; created if it does not exist.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
DEST_DIR = "/home/poyuan/workspace/MCQG/Category/NBME"
os.makedirs(DEST_DIR, exist_ok=True)

In [9]:
import re
from typing import Any, Dict, List, Optional

def parse_outline(text: str) -> List[Dict[str, Any]]:
    """Parse a Markdown-style outline into nested L2 -> L3 -> L4 records.

    Every line is whitespace-stripped and then classified by its prefix:

    * ``## title``  starts a new L2 section,
    * ``### title`` starts a new L3 subsection under the current L2,
    * ``- item``    appends an L4 item to the current L3.

    A subsection or item that shows up before its parent level exists gets an
    anonymous parent whose title is ``None``. Blank lines and lines matching
    none of the prefixes are ignored.

    Args:
        text: The outline text.

    Returns:
        A list of ``{"l2": title, "l3": [{"l3": title, "l4": [item, ...]}, ...]}``
        dicts in document order.
    """
    section_re = re.compile(r'^##\s+(.*)$')
    subsection_re = re.compile(r'^###\s+(.*)$')
    item_re = re.compile(r'^-\s+(.*)$')

    outline: List[Dict[str, Any]] = []
    section = None
    subsection = None

    for raw_line in text.splitlines():
        stripped = raw_line.strip()
        if stripped == "":
            continue  # skip blank lines

        # L2 heading: open a new section and reset the current subsection.
        section_hit = section_re.match(stripped)
        if section_hit is not None:
            section = {"l2": section_hit.group(1).strip(), "l3": []}
            outline.append(section)
            subsection = None
            continue

        subsection_hit = subsection_re.match(stripped)
        item_hit = item_re.match(stripped) if subsection_hit is None else None
        if subsection_hit is None and item_hit is None:
            continue  # unrecognised line: ignored

        # Both L3 headings and L4 items need an enclosing L2; create an
        # anonymous one when none has been seen yet.
        if section is None:
            section = {"l2": None, "l3": []}
            outline.append(section)

        if subsection_hit is not None:
            # L3 heading: attach to the most recent L2.
            subsection = {"l3": subsection_hit.group(1).strip(), "l4": []}
            section["l3"].append(subsection)
        else:
            # L4 item: attach to the most recent L3, creating an anonymous
            # L3 under the current L2 when there is none.
            if subsection is None:
                subsection = {"l3": None, "l4": []}
                section["l3"].append(subsection)
            subsection["l4"].append(item_hit.group(1).strip())

    return outline


def flatten_outline(sections: List[Dict[str, Any]]) -> List[Dict[str, Optional[str]]]:
    """Flatten the nested outline into row records suitable for a CSV/table.

    Args:
        sections: Nested structure as returned by ``parse_outline``:
            ``[{"l2": ..., "l3": [{"l3": ..., "l4": [...]}, ...]}, ...]``.

    Returns:
        A list of ``{"l2": ..., "l3": ..., "l4": ...}`` rows in input order:

        * one row per L4 item;
        * one row with ``"l4": None`` for a subsection that has no items;
        * one row with ``"l3": None`` and ``"l4": None`` for a section that
          has no subsections.

        Any of the three values may be ``None`` (anonymous or missing level).
        A missing or ``None`` ``"l3"``/``"l4"`` entry is treated as empty.
    """
    rows: List[Dict[str, Optional[str]]] = []
    for sec in sections:
        l2 = sec.get("l2")
        subsections = sec.get("l3") or []
        if not subsections:
            # Section without subsections still yields a placeholder row.
            rows.append({"l2": l2, "l3": None, "l4": None})
            continue
        for sub in subsections:
            l3 = sub.get("l3")
            # An item-less subsection contributes a single row with l4=None.
            items = sub.get("l4") or [None]
            rows.extend({"l2": l2, "l3": l3, "l4": it} for it in items)
    return rows

In [11]:
# readlines() keeps each line's trailing "\n", so concatenate the lines as-is
# instead of inserting a second newline between them.
rv = parse_outline(''.join(lines))

In [12]:
# parse_outline returns a list of section dicts.
type(rv)

list

In [13]:
# Number of top-level (L2) sections parsed.
len(rv)

14

In [14]:
# Spot-check the first parsed section.
rv[0]

{'l2': 'Patient Care: Diagnosis—Causes and Mechanisms',
 'l3': [{'l3': 'Identifies the cause/infectious agent or predisposing factor(s) or, given an effect, determines the cause.',
   'l4': ['Which of the following pathogens is the most likely cause of this patient’s condition?',
    'Which of the following is the most likely infectious agent?',
    'This patient most likely acquired the infectious agent via which of the following modes of transmission?',
    'This patient most likely has a defect in which of the following?',
    'Which of the following is the most likely cause/mechanism of this effect?']},
  {'l3': 'Identifies the underlying processes/pathways that account for, or contribute to, the expression or resolution of a given condition.',
   'l4': ['Which of the following is the most likely underlying cause of this patient’s condition?',
    'Which of the following is the most likely explanation for this patient’s condition?',
    'Which of the following cell types most likel

In [15]:
# Spot-check the second parsed section.
rv[1]

{'l2': 'Patient Care: Diagnosis—Obtaining and Predicting History and Physical Examination',
 'l3': [{'l3': 'Knows signs/symptoms of selected disorders. Response options are signs and symptoms. The item asks which signs and symptoms are characteristic of the patient’s condition. Typically used when patient presents with the condition.',
   'l4': ['Which of the following signs/symptoms is most consistent with the underlying diagnosis in this patient?']},
  {'l3': 'Knows individual’s risk factors for development of condition. Given current symptoms in presented history, identifies pertinent factor in the history. Typically used when patient presents with the condition.',
   'l4': ['Which of the following factors in this patient’s history most increased the risk for developing this condition?']},
  {'l3': 'Given a specific problem, knows what to ask to obtain pertinent additional history. The response options should not be referenced in the vignette and should not include details that woul

In [17]:
# Flatten the nested outline into a list of {"l2", "l3", "l4"} rows.
flatten = flatten_outline(rv)

In [18]:
# Number of flattened rows.
len(flatten)

214

In [29]:
# Rows that have no L3 title fall back to their L2 title (edited in place).
for row in flatten:
    if row['l3']:
        continue
    row['l3'] = row['l2']

In [30]:
# Collect the distinct values that occur at each outline level.
l2 = {row['l2'] for row in flatten}
l3 = {row['l3'] for row in flatten}
l4 = {row['l4'] for row in flatten}

In [31]:
# Number of distinct L2 values.
len(l2)

14

In [32]:
# Number of distinct L3 values.
len(l3)

75

In [33]:
# Number of distinct L4 values.
len(l4)

210

In [34]:
# De-duplicated level lists in first-appearance (document) order; they hold
# the same elements as the l2/l3/l4 sets. Converting those sets with list()
# would make the element order, and therefore the JSON files written from
# these lists, vary between kernel runs, because string hashing is randomized
# per process. dict.fromkeys keeps insertion order and tolerates None values.
l2_lst = list(dict.fromkeys(row['l2'] for row in flatten))
l3_lst = list(dict.fromkeys(row['l3'] for row in flatten))
l4_lst = list(dict.fromkeys(row['l4'] for row in flatten))

In [35]:
# Display the distinct L2 values.
l2

{'Communication and Interpersonal Skills',
 'Patient Care: Diagnosis—Causes and Mechanisms',
 'Patient Care: Diagnosis—Determining Prognosis/Outcome',
 'Patient Care: Diagnosis—Formulating the Diagnosis',
 'Patient Care: Diagnosis—Obtaining and Predicting History and Physical Examination',
 'Patient Care: Diagnosis—Selecting and Interpreting Laboratory and Diagnostic Studies',
 'Patient Care: Management—Clinical Interventions/Treatments',
 'Patient Care: Management—Health Maintenance and Disease Prevention',
 'Patient Care: Management—Monitoring/Surveillance for Disease Recurrence or Progression',
 'Patient Care: Management—Selecting Clinical Interventions (Mixed Management)',
 'Patient Care: Management—Selecting and Monitoring Pharmacotherapy',
 'Practice-based Learning—Applied Biostatistics and Clinical Epidemiology',
 'Professionalism and Legal/Ethical Issues',
 'Systems-based Practice and Patient Safety'}

In [36]:
import json
def json_dump(file_path, data):
    """Serialize ``data`` as JSON to ``file_path`` and print a confirmation.

    Args:
        file_path: Destination path; an existing file is overwritten.
        data: Any object that ``json.dump`` can serialize.

    The file is opened as UTF-8 explicitly: ``ensure_ascii=False`` emits
    non-ASCII characters verbatim, so relying on the platform's default
    encoding could raise ``UnicodeEncodeError`` or yield a non-UTF-8 file.
    """
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=1, ensure_ascii=False)
    print(f"Data successfully written to {file_path}")

In [37]:
# Write each level's value list to its own JSON file inside DEST_DIR.
l2_file_path, l3_file_path, l4_file_path = (
    os.path.join(DEST_DIR, file_name)
    for file_name in ('l2.json', 'l3.json', 'l4.json')
)
for target_path, level_values in (
    (l2_file_path, l2_lst),
    (l3_file_path, l3_lst),
    (l4_file_path, l4_lst),
):
    json_dump(target_path, level_values)

Data successfully written to /home/poyuan/workspace/MCQG/Category/NBME/l2.json
Data successfully written to /home/poyuan/workspace/MCQG/Category/NBME/l3.json
Data successfully written to /home/poyuan/workspace/MCQG/Category/NBME/l4.json


In [39]:
# Persist the full list of flattened {"l2", "l3", "l4"} rows.
flatten_file_path = os.path.join(DEST_DIR, 'nbme.json')
json_dump(flatten_file_path, flatten)

Data successfully written to /home/poyuan/workspace/MCQG/Category/NBME/nbme.json
