In [1]:
import os
import re
import json
from tqdm import tqdm
from utils import json_load, json_dump

In [2]:
# Folder layout: raw model outputs are read from SOURCE_DIR, parsed results go to DEST_DIR.
SOURCE_DIR = os.path.join('data', 'temp')
DEST_DIR = os.path.join('data', 'deliverables')

# Show which raw response files are available before doing any work.
print(os.listdir(SOURCE_DIR))

# exist_ok=True keeps a re-run of this cell from failing once the folder exists.
os.makedirs(DEST_DIR, exist_ok=True)

['4_gemma.json', '3_llama_chat.json', '5_json_gemma.json', '2_llama_zh.json', '.ipynb_checkpoints', '1_llama_en.json']


In [3]:
# helper
def extract_markdown_json(raw):
    """Parse a JSON document that may be wrapped in a Markdown code fence.

    Accepts either bare JSON text or JSON enclosed in a fence that opens with
    three backticks (optionally tagged ``json``, in any letter case) and closes
    with three backticks. Whitespace around the whole payload is ignored.

    Args:
        raw: The text to parse.

    Returns:
        Whatever ``json.loads`` produces for the unwrapped text.

    Raises:
        json.JSONDecodeError: If the unwrapped text is not valid JSON.
    """
    # Trim first: a leading newline/space would otherwise hide the opening fence,
    # and trailing whitespace would keep the closing fence from matching at `$`.
    text = raw.strip()
    if text.startswith("```"):
        # Remove the opening fence together with its optional "json" tag ...
        text = re.sub(r"^```(?:json)?", "", text, flags=re.IGNORECASE)
        # ... and the closing fence at the very end of the text.
        text = re.sub(r"```$", "", text)
        text = text.strip()

    return json.loads(text)

# 1. gemma parser

In [4]:
# Destination file for the parsed gemma responses, inside DEST_DIR.
save_file_path = os.path.join(DEST_DIR, '4_gemma_response.json')
# Accumulator for the parsed records; the parsing loop further down fills it, keyed by qid_source.
rvs = {}

In [5]:
# Load the raw gemma responses from SOURCE_DIR.
file_path = os.path.join(SOURCE_DIR, '4_gemma.json')
data = json_load(file_path)
# Sanity check: number of records, then display the first one to inspect its fields.
print(len(data))
data[0]

80


{'qid_source': 1,
 'qset_txt': '1.常見針灸配穴法中，所指的「四關穴」，為下列何穴位之組合？\n\xa0\nA.上星、日月\nB.合谷、太衝\nC.內關、外關\nD.上關、下關',
 'profiling': {'model': 'gemma3:12b',
  'created_at': '2025-10-02T18:06:23.43249882Z',
  'done': True,
  'done_reason': 'stop',
  'total_duration': 20068842006,
  'load_duration': 8571495865,
  'prompt_eval_count': 376,
  'prompt_eval_duration': 3052538796,
  'eval_count': 95,
  'eval_duration': 8443841338,
  'usage': {'prompt_tokens': 376,
   'completion_tokens': 95,
   'total_tokens': 471}},
 'response': '```json\n{\n  "qid": 1,\n  "stem": "常見針灸配穴法中，所指的「四關穴」，為下列何穴位之組合？",\n  "A": "上星、日月",\n  "B": "合谷、太衝",\n  "C": "內關、外關",\n  "D": "上關、下關"\n}\n```',
 'py_times': 17.757858514785767}

In [6]:
# Parse every raw gemma response into a dict and collect the results in `rvs`.
error_cnt = 0

for example in tqdm(data):
    # get qid_source, qset_txt and the raw response text
    qid_source = example['qid_source']
    qset_txt = example['qset_txt']
    response = example['response']
    try:
        response_json = extract_markdown_json(response)
    except Exception:
        # Best effort: an unparseable response is stored as an empty dict and counted.
        # `except Exception` (rather than a bare `except:`) lets KeyboardInterrupt and
        # SystemExit propagate, so the cell can still be interrupted.
        response_json = {}
        error_cnt += 1
        print("cnt: {}".format(error_cnt))
    rv = {
        'qset_text': qset_txt,
        'response': response_json
    }
    # Later records with the same qid_source overwrite earlier ones.
    rvs[qid_source] = rv

print(len(rvs))

json_dump(save_file_path, rvs)

100%|███████████████████████████████████████████████████████████████████████████████| 80/80 [00:00<00:00, 166111.05it/s]

80
write result to: data/deliverables/4_gemma_response.json





# 2. json_gemma parser

In [7]:
# Destination file for the parsed json_gemma responses, inside DEST_DIR.
save_file_path = os.path.join(DEST_DIR, '5_json_gemma_response.json')

In [8]:
# Load the raw json_gemma responses from SOURCE_DIR.
file_path = os.path.join(SOURCE_DIR, '5_json_gemma.json')
data = json_load(file_path)
# Sanity check: number of records, then display the first one to inspect its fields.
print(len(data))
data[0]

80


{'qid_source': 1,
 'qset_txt': '1.常見針灸配穴法中，所指的「四關穴」，為下列何穴位之組合？\n\xa0\nA.上星、日月\nB.合谷、太衝\nC.內關、外關\nD.上關、下關',
 'profiling': {'model': 'gemma3:12b',
  'created_at': '2025-10-02T18:19:48.689635996Z',
  'done': True,
  'done_reason': 'stop',
  'total_duration': 8750207631,
  'load_duration': 46727309,
  'prompt_eval_count': 376,
  'prompt_eval_duration': 564764160,
  'eval_count': 90,
  'eval_duration': 8046163137,
  'usage': {'prompt_tokens': 376,
   'completion_tokens': 90,
   'total_tokens': 466}},
 'response': {'qid': 1,
  'stem': '常見針灸配穴法中，所指的「四關穴」，為下列何穴位之組合？',
  'A': '上星、日月',
  'B': '合谷、太衝',
  'C': '內關、外關',
  'D': '上關、下關'},
 'py_times': 8.767213582992554}

In [11]:
def dataset_dump(data):
    """Re-key a list of response records by their ``qid_source``.

    Each record must provide ``'qid_source'`` and ``'qset_txt'``. Its
    ``'response'`` value is copied through unchanged; when that key is missing
    an empty dict is stored instead, and a running count of such records is
    printed. The number of collected entries is printed before returning.

    Args:
        data: Iterable of mapping-like records.

    Returns:
        dict: ``{qid_source: {'qset_text': <qset_txt>, 'response': <response>}}``.
        Note the output key is ``'qset_text'`` while the input key is
        ``'qset_txt'``. Records sharing a ``qid_source`` overwrite each other.

    Raises:
        KeyError: If a record lacks ``'qid_source'`` or ``'qset_txt'``.
    """
    rvs = {}
    error_cnt = 0
    for example in tqdm(data):
        # get qid_source and qset_txt
        qid_source = example['qid_source']
        qset_txt = example['qset_txt']
        try:
            response_json = example['response']
        except KeyError:
            # A missing 'response' is the only failure expected from this lookup;
            # catching just KeyError keeps unrelated errors and interrupts visible.
            response_json = {}
            error_cnt += 1
            print("cnt: {}".format(error_cnt))
        rv = {
            'qset_text': qset_txt,
            'response': response_json
        }
        rvs[qid_source] = rv
    print(len(rvs))
    return rvs

In [12]:
# Re-key the loaded records by qid_source, then hand the result to json_dump with the output path.
rvs = dataset_dump(data)
json_dump(save_file_path, rvs)

100%|███████████████████████████████████████████████████████████████████████████████| 80/80 [00:00<00:00, 304763.23it/s]

80
write result to: data/deliverables/5_json_gemma_response.json





# llama_1 (1_llama_en.json)

In [13]:
# Load the raw 1_llama_en.json responses and set where the parsed copy will be written.
save_file_path = os.path.join(DEST_DIR, '1_llama_en_response.json')
file_path = os.path.join(SOURCE_DIR, '1_llama_en.json')
data = json_load(file_path)
# Sanity check: number of entries, then display the first one.
# NOTE(review): an entry can be a bare int rather than a dict — presumably a marker
# for a failed generation; confirm against the code that produced this file.
print(len(data))
data[0]

80


20

In [18]:
def get_error_qid(data):
    """Collect ``entry + 1`` for every integer entry of ``data``.

    Non-integer entries are ignored and the original order is preserved.
    NOTE(review): the ``+ 1`` suggests the stored ints are 0-based positions
    being turned into 1-based question ids — confirm with the data producer.

    Args:
        data: Iterable of entries, some of which may be plain ints.

    Returns:
        list: One incremented value per integer entry.
    """
    return [entry + 1 for entry in data if isinstance(entry, int)]

In [21]:
# Count the entries that get_error_qid treats as errors (bare ints).
error_ids = get_error_qid(data)
num_error = len(error_ids)
print(f'find {num_error} error_ids')

# Drop the int entries wherever they occur. Slicing off the first `num_error`
# items would only be correct if every int sat at the front of the list.
data = [qset for qset in data if not isinstance(qset, int)]
rvs = dataset_dump(data)
json_dump(save_file_path, rvs)

find 7 error_ids


100%|███████████████████████████████████████████████████████████████████████████████| 73/73 [00:00<00:00, 605107.10it/s]

73
write result to: data/deliverables/1_llama_en_response.json





# llama_2 (2_llama_zh.json)

In [22]:
# Load the raw 2_llama_zh.json responses and set where the parsed copy will be written.
save_file_path = os.path.join(DEST_DIR, '2_llama_zh_response.json')
file_path = os.path.join(SOURCE_DIR, '2_llama_zh.json')
data = json_load(file_path)
# Sanity check: number of entries, then display the first one.
# NOTE(review): an entry can be a bare int rather than a dict — presumably a marker
# for a failed generation; confirm against the code that produced this file.
print(len(data))
data[0]

80


1

In [23]:
# Count the entries that get_error_qid treats as errors (bare ints).
error_ids = get_error_qid(data)
num_error = len(error_ids)
print(f'find {num_error} error_ids')

# Drop the int entries wherever they occur. Slicing off the first `num_error`
# items would only be correct if every int sat at the front of the list.
data = [qset for qset in data if not isinstance(qset, int)]
rvs = dataset_dump(data)
json_dump(save_file_path, rvs)

find 7 error_ids


100%|███████████████████████████████████████████████████████████████████████████████| 73/73 [00:00<00:00, 684975.82it/s]

73
write result to: data/deliverables/2_llama_zh_response.json





# llama_3 (3_llama_chat.json)

In [24]:
# Load the raw 3_llama_chat.json responses and set where the parsed copy will be written.
save_file_path = os.path.join(DEST_DIR, '3_llama_chat_response.json')
file_path = os.path.join(SOURCE_DIR, '3_llama_chat.json')
data = json_load(file_path)
# Sanity check: number of entries, then display the first one to inspect its fields.
print(len(data))
data[0]

80


{'qid_source': 1,
 'qset_txt': '1.常見針灸配穴法中，所指的「四關穴」，為下列何穴位之組合？\n\xa0\nA.上星、日月\nB.合谷、太衝\nC.內關、外關\nD.上關、下關',
 'profiling': {'model': 'llama3.1:latest',
  'created_at': '2025-10-02T18:00:12.319634632Z',
  'done': True,
  'done_reason': 'stop',
  'total_duration': 4030284384,
  'load_duration': 24253572,
  'prompt_eval_count': 1260,
  'prompt_eval_duration': 391713275,
  'eval_count': 119,
  'eval_duration': 3613294930,
  'usage': {'prompt_tokens': 1260,
   'completion_tokens': 119,
   'total_tokens': 1379}},
 'response': {'A': '膀胱經',
  'B': '膽經',
  'C': '胃經',
  'D': '肝經',
  'qid': 1,
  'stem': '常見針灸配穴法中，所指的「四關穴」，為下列何穴位之組合？\n\nA.上星、日月\nB.合谷、太衝\nC.內關、外關\nD.上關、下關'},
 'py_times': 1.7432334423065186}

In [25]:
# Count the entries that get_error_qid treats as errors (bare ints).
error_ids = get_error_qid(data)
num_error = len(error_ids)
print(f'find {num_error} error_ids')

# Drop the int entries wherever they occur. Slicing off the first `num_error`
# items would only be correct if every int sat at the front of the list.
data = [qset for qset in data if not isinstance(qset, int)]
rvs = dataset_dump(data)
json_dump(save_file_path, rvs)

find 0 error_ids


100%|███████████████████████████████████████████████████████████████████████████████| 80/80 [00:00<00:00, 789516.05it/s]

80
write result to: data/deliverables/3_llama_chat_response.json



