In [9]:
import os
import json
import csv
#import shutil

def _split_entity(value, fallback_type):
    """Split an ``name|type`` string into ``(name, type)``.

    Only the first ``|`` is treated as the separator, so a value containing
    several pipes no longer breaks the two-way unpacking; everything after the
    first pipe is kept as the type. Without a pipe, ``fallback_type`` is used.
    """
    if "|" in value:
        name, entity_type = value.split("|", 1)
        return name, entity_type
    return value, fallback_type


def _as_cell(value):
    """Serialise dict values to a JSON string; return anything else unchanged."""
    if isinstance(value, dict):
        return json.dumps(value, ensure_ascii=False)
    return value


def process_relationship(writer, filename, relationship):
    """Write one CSV row for every (event, relation item) pair of a relationship group.

    Args:
        writer: Object with a ``writerow(list)`` method, e.g. a ``csv.writer``.
        filename: Name of the source JSON file; emitted as the first cell.
        relationship: Dict holding the events under ``事件`` and the relation
            items under ``關係列表`` (or ``關係`` when ``關係列表`` is absent).

    Each row has 16 cells: filename, event, subject name/type, relation,
    P-item, object name/type, then review / original / human review /
    human original for the subject and the same four for the object.
    Missing optional values are written as the string ``'null'``.

    Items may use either the compact form (``主體: "name|type"``,
    ``關係: "relation|P-item"``, shared ``human`` / ``human_review`` keys) or
    the expanded form written by ``csv_to_json`` (``主體類別``, ``p-items``,
    ``主體human_review`` ...). Expanded keys take precedence for the human
    columns; the compact keys remain the fallback.

    Raises:
        KeyError: If a relation item lacks ``主體``, ``客體`` or ``關係``.
    """
    event_list = relationship.get('事件', [])
    if isinstance(event_list, str):
        # A bare string would otherwise be iterated character by character.
        event_list = [event_list]

    relation_key = '關係列表' if '關係列表' in relationship else '關係'
    relation_items = relationship.get(relation_key, [])

    for event in event_list:
        for rel in relation_items:
            subject_name, subject_type = _split_entity(rel['主體'], rel.get('主體類別', 'null'))
            object_name, object_type = _split_entity(rel['客體'], rel.get('客體類別', 'null'))

            relation = rel['關係']
            p_items = rel.get('p-items', 'null')
            if "|" in relation:
                # Unchanged from the original: only the first two parts are used.
                relation_parts = relation.split("|")
                relation, p_items = relation_parts[0], relation_parts[1]

            # Legacy inputs carry one shared pair of human keys for both sides.
            shared_human = rel.get('human', 'null')
            shared_human_review = rel.get('human_review', 'null')

            writer.writerow([
                filename, event,
                subject_name, subject_type,
                relation, p_items,
                object_name, object_type,
                rel.get('主體review', 'null'),
                _as_cell(rel.get('主體original', 'null')),
                rel.get('主體human_review', shared_human),
                rel.get('主體human_original', shared_human_review),
                rel.get('客體review', 'null'),
                _as_cell(rel.get('客體original', 'null')),
                rel.get('客體human_review', shared_human),
                rel.get('客體human_original', shared_human_review),
            ])

def json_to_csv(json_data, csv_file):
    """Flatten loaded review JSON documents into one CSV file.

    Args:
        json_data: List of ``{"filename": ..., "data": ...}`` entries.
        csv_file: Output path; written as UTF-8 with a BOM (``utf-8-sig``).

    The header row is always written. For each entry, every relationship
    group found in ``data`` is passed to ``process_relationship``. Recognised
    layouts of ``data``:

    * ``{"review": {"relationships": [...]}}`` - groups are taken unfiltered
    * ``{"review": [...]}`` - only groups with ``事件`` and ``關係列表``/``關係``
    * ``{"relationships": [...]}`` (no ``review`` key) - same filter
    * a list whose dict items are either ``{"review": {"relationships": [...]}}``
      wrappers or groups themselves (same filter); non-dict items are skipped

    Any other layout contributes no rows.
    """
    header = [
        "檔名", "事件", "主體", "主體類別", "關係", "P-items", "客體", "客體類別",
        "S_review", "S_original", "S_human_review", "S_human_original",
        "O_review", "O_original", "O_human_review", "O_human_original"
    ]

    def is_group(node):
        # A group needs its events plus relation items under either key.
        return '事件' in node and ('關係列表' in node or '關係' in node)

    def iter_groups(content):
        # Lazily yields groups so rows are written in the same order, and at
        # the same moment relative to the shape checks, as a nested loop would.
        if isinstance(content, dict):
            if 'review' in content:
                review = content['review']
                if isinstance(review, dict) and 'relationships' in review:
                    yield from review['relationships']
                elif isinstance(review, list):
                    yield from (node for node in review if is_group(node))
            elif 'relationships' in content and isinstance(content['relationships'], list):
                yield from (node for node in content['relationships'] if is_group(node))
        elif isinstance(content, list):
            for item in content:
                if not isinstance(item, dict):
                    continue
                if 'review' in item and isinstance(item['review'], dict) and 'relationships' in item['review']:
                    yield from item['review']['relationships']
                elif is_group(item):
                    yield item

    with open(csv_file, mode='w', newline='', encoding='utf-8-sig') as out:
        writer = csv.writer(out)
        writer.writerow(header)

        for entry in json_data:
            filename = entry["filename"]
            for group in iter_groups(entry["data"]):
                process_relationship(writer, filename, group)

def read_all_json_files(directory):
    """Load every ``*.json`` file directly inside ``directory``.

    Args:
        directory: Folder to scan (not recursive).

    Returns:
        A list of ``{"filename": <file name>, "data": <parsed JSON>}`` dicts,
        ordered by file name. Files that cannot be opened or parsed are
        reported on stdout and skipped.

    Raises:
        OSError: If ``directory`` itself cannot be listed.
    """
    json_files = []
    # os.listdir gives no ordering guarantee; sorting keeps the resulting CSV
    # row order reproducible across platforms and runs.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".json"):
            continue

        file_path = os.path.join(directory, filename)
        try:
            # utf-8-sig reads plain UTF-8 and also drops a leading BOM, which
            # json.load would otherwise reject.
            with open(file_path, 'r', encoding='utf-8-sig') as f:
                json_data = json.load(f)
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole batch.
            print(f"無法讀取 {filename}: {e}")
            continue

        print(f"讀取檔案 {filename}")
        json_files.append({"filename": filename, "data": json_data})

    return json_files

### 產生CSV review

In [10]:
# Main flow: load the review JSON files from the input folder and flatten them into one CSV.
directory_path = "./docs/output/4_llm_resolution/llama_v7"
#directory_path = "./docs/output/4_llm_resolution/llama__feature_gemini"
#done_path = "./docs/output/4_llm_resolution/llama_v3/done"
csv_output_path = "./docs/output/4_llm_resolution/review_v3.csv"

json_data = read_all_json_files(directory_path)
if not json_data:
    # Nothing was loaded: the folder had no .json files or none could be read.
    print("未找到任何 JSON 檔案或無法讀取檔案")
else:
    json_to_csv(json_data, csv_output_path)


讀取檔案 228事件(20).json
讀取檔案 「友仔」是什麼？光復初期臺北地區非法組織調查報告告訴您(37).json
讀取檔案 「回首向來蕭瑟處，歸去，也無風雨也無晴」—民國38年國軍遷臺紀事(30).json
讀取檔案 「威海衛」租借地的收回(42).json
讀取檔案 「快速」發展的年代：麥克阿瑟公路通車一甲子(203).json
讀取檔案 「日暮鄉關何處是」─「留越國軍」的返台路(31).json
讀取檔案 「賽德克．巴萊」重現的霧社事件(17).json
讀取檔案 「醫者仁也‧仁者人也」─光復初期臺灣醫學教育(29).json
讀取檔案 ㄋㄟㄋㄟ補給站：美援牛奶的供應(127).json
讀取檔案 一紙命令，臺灣命運大不同─中國台灣省行政長官公署警備總司令部第一號令(1).json
讀取檔案 不用手機也可哈拉一整天─45年度公用電話擴充計畫(2).json
讀取檔案 不能少了你—臺灣光復後首次戶口清查(35).json
讀取檔案 世界人權日(18).json
讀取檔案 世紀糖鐵穿鄉越鎮的五分車(148).json
讀取檔案 中元普渡與法國軍墓在基隆(170).json
讀取檔案 中華商場：見證大臺北繁華歲月(166).json
讀取檔案 中華航空：從軍事化管理到以客為尊經營(152).json
讀取檔案 九年國教：春風化雨五十載(134).json
讀取檔案 亞東關係協會：臺日友好關係的樞紐(185).json
讀取檔案 人民頭家—公民直選總統(23).json
讀取檔案 任重道遠─民國36年台灣省鐵路圖(11).json
讀取檔案 低鈉鹽的由來─從臺鹽公司檔案見端倪(60).json
讀取檔案 保存學術的火種—中央研究院播遷來臺(76).json
讀取檔案 保育與觀光—從國立公園到國家公園(141).json
讀取檔案 傳播知識的種子—臺灣總督府圖書館的故事(62).json
讀取檔案 元氣補給：美軍大兵在臺灣(163).json
讀取檔案 光復初期的山區管制與遷村(41).json
讀取檔案 光復初期臺灣鐵路的復原(86).json
讀取檔案 光輝十月．榮耀臺灣(28).json
讀取檔案 全民瘋棒球：回味臺灣棒球精彩史頁(83).json
讀取檔案 八七水災與家園重建(146).json
讀取檔案 公路趴趴走！GO~GO！(58).json
讀取

### 處理CSV to JSON

In [11]:
import csv
import json
import os

# 讀取 CSV 檔案
def read_csv(csv_file):
    """Read the review CSV into a list of dicts with cleaned keys and values.

    Args:
        csv_file: Path of a comma-separated file with a header row.

    Returns:
        One dict per data row. Keys are the header names with any BOM and
        surrounding whitespace removed; values are whitespace-stripped
        strings. Cells missing from a short row become ``""``. An empty file
        yields ``[]``.

    Raises:
        ValueError: If a row has more cells than the header, since its
            columns can no longer be matched to names reliably.
    """
    def clean_key(name):
        # Drop the BOM first: str.strip() does not treat U+FEFF as whitespace,
        # so stripping whitespace first would leave blanks that follow a BOM.
        return name.lstrip('\ufeff').strip()

    data = []
    # utf-8-sig matches how the CSV is written and also reads BOM-less UTF-8;
    # newline='' is what the csv module requires to parse quoted line breaks.
    with open(csv_file, mode='r', encoding='utf-8-sig', newline='') as file:
        reader = csv.DictReader(file, delimiter=',')
        # fieldnames is None for an empty file.
        cleaned_fieldnames = [clean_key(name) for name in (reader.fieldnames or [])]
        print(f"讀取的欄位名稱: {cleaned_fieldnames}")

        for row in reader:
            if None in row:
                # DictReader collects surplus cells under the key None.
                raise ValueError(
                    f"{csv_file}: row ending at line {reader.line_num} has more cells than the header"
                )
            data.append({
                # DictReader pads short rows with None values.
                clean_key(key): "" if value is None else value.strip()
                for key, value in row.items()
            })

    return data

# 將 CSV 轉換為 JSON
def csv_to_json(csv_data, output_directory):
    """Rebuild one review JSON file per source document from CSV rows.

    Args:
        csv_data: Iterable of row dicts keyed by the review CSV column names.
        output_directory: Folder for the JSON files; created if missing.

    Every row becomes its own relationship group (a single event with a
    single relation item). Groups are collected per ``檔名`` value and written
    to ``<output_directory>/<檔名>`` as
    ``{"review": {"relationships": [...]}}``.

    Raises:
        KeyError: If a row lacks one of the expected columns.
    """
    # (CSV column, JSON key) pairs whose values are rendered as text.
    text_fields = (
        ("主體", "主體"),
        ("主體類別", "主體類別"),
        ("關係", "關係"),
        ("P-items", "p-items"),
        ("客體", "客體"),
        ("客體類別", "客體類別"),
    )
    # (CSV column, JSON key) pairs copied through unchanged.
    review_fields = (
        ("S_review", "主體review"),
        ("S_original", "主體original"),
        ("S_human_review", "主體human_review"),
        ("S_human_original", "主體human_original"),
        ("O_review", "客體review"),
        ("O_original", "客體original"),
        ("O_human_review", "客體human_review"),
        ("O_human_original", "客體human_original"),
    )

    os.makedirs(output_directory, exist_ok=True)
    documents = {}

    for row in csv_data:
        filename = row["檔名"]
        event = row["事件"]

        item = {json_key: f"{row[column]}" for column, json_key in text_fields}
        item.update((json_key, row[column]) for column, json_key in review_fields)

        document = documents.setdefault(filename, {"review": {"relationships": []}})
        document["review"]["relationships"].append({"事件": [event], "關係列表": [item]})

    for filename, document in documents.items():
        target = os.path.join(output_directory, filename)
        with open(target, 'w', encoding='utf-8') as out:
            json.dump(document, out, ensure_ascii=False, indent=4)

# Run the conversion: read the review CSV at csv_output_path (set in the CSV-export cell)
# and write one JSON file per distinct 檔名 value into output_directory.
csv_file = csv_output_path 
output_directory = "./docs/output/4_human_review/llama_v7"

csv_data = read_csv(csv_file)
csv_to_json(csv_data, output_directory)


讀取的欄位名稱: ['檔名', '事件', '主體', '主體類別', '關係', 'P-items', '客體', '客體類別', 'S_review', 'S_original', 'S_human_review', 'S_human_original', 'O_review', 'O_original', 'O_human_review', 'O_human_original']


### 顯示處理完成檔案

In [12]:
import os
import json

json_dir = output_directory  # folder that holds the JSON files to display

# Collect every JSON file name. os.listdir returns entries in arbitrary order,
# so sort them to keep the printed report reproducible between runs.
files = sorted(f for f in os.listdir(json_dir) if f.endswith(".json"))

for file in files:
    full_path = os.path.join(json_dir, file)

    with open(full_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # The file handle is no longer needed once the document is parsed.
    print(f"\n===== 檔案名稱: {file} =====")
    print(json.dumps(data, indent=2, ensure_ascii=False))


===== 檔案名稱: 228事件(20).json =====
{
  "review": {
    "relationships": [
      {
        "事件": [
          "二次大戰"
        ],
        "關係列表": [
          {
            "主體": "二次大戰",
            "主體類別": "事件",
            "關係": "時間",
            "p-items": "P585",
            "客體": "二次大戰末期",
            "客體類別": "時間",
            "主體review": "1",
            "主體original": "二次大戰末期|時間",
            "主體human_review": "null",
            "主體human_original": "null",
            "客體review": "1",
            "客體original": "二次大戰|事件",
            "客體human_review": "null",
            "客體human_original": "null"
          }
        ]
      },
      {
        "事件": [
          "二次大戰"
        ],
        "關係列表": [
          {
            "主體": "臺灣",
            "主體類別": "地點",
            "關係": "地點",
            "p-items": "P131",
            "客體": "國民政府",
            "客體類別": "組織",
            "主體review": "0",
            "主體original": "null",
            "主體human_review": "null",
            "主體human_ori

### 產生報表

In [13]:
import pandas as pd
from tabulate import tabulate

def calculate_metrics(group, pred_col, true_col):
    """Compute confusion counts and precision / recall / F1 for one column pair.

    The label ``0`` is treated as the positive class:

    * TP - predicted 0, reference 0
    * FP - predicted 0, reference 1
    * FN - predicted 1, reference 0

    Args:
        group: DataFrame (or group slice) containing both columns.
        pred_col: Name of the predicted-label column.
        true_col: Name of the reference-label column.

    Returns:
        ``(TP, FP, FN, Precision, Recall, F1)``; a ratio whose denominator is
        zero is reported as ``0``.
    """
    def ratio(numerator, denominator):
        # Guard against division by zero when a class never occurs.
        if denominator > 0:
            return numerator / denominator
        return 0

    predicted = group[pred_col]
    reference = group[true_col]
    predicted_positive = predicted == 0
    reference_positive = reference == 0

    TP = (predicted_positive & reference_positive).sum()
    FP = (predicted_positive & (reference == 1)).sum()
    FN = ((predicted == 1) & reference_positive).sum()

    Precision = ratio(TP, TP + FP)
    Recall = ratio(TP, TP + FN)
    F1 = ratio(2 * Precision * Recall, Precision + Recall)
    return TP, FP, FN, Precision, Recall, F1

def calculate_metrics_per_file(csv_file):
    """Compute subject and object metrics separately for each source file.

    Args:
        csv_file: Path of the review CSV.

    Returns:
        DataFrame with two rows per ``檔名`` value (``位置`` is ``主體`` then
        ``客體``) and the columns TP, FP, FN, Precision, Recall, F1.

    Empty human-review cells fall back to the automatic review value, so an
    unreviewed row counts as agreement.
    """
    review_cols = ["S_review", "S_human_review", "O_review", "O_human_review"]
    roles = (
        ("主體", "S_review", "S_human_review"),
        ("客體", "O_review", "O_human_review"),
    )

    df = pd.read_csv(csv_file)
    for _, auto_col, human_col in roles:
        df[human_col] = df[human_col].fillna(df[auto_col])
    df[review_cols] = df[review_cols].astype(int)

    rows = []
    for fname, group in df.groupby("檔名"):
        for role, pred_col, true_col in roles:
            TP, FP, FN, P, R, F1 = calculate_metrics(group, pred_col, true_col)
            record = {"檔名": fname, "位置": role}
            record.update({"TP": TP, "FP": FP, "FN": FN})
            record.update({"Precision": P, "Recall": R, "F1": F1})
            rows.append(record)
    return pd.DataFrame(rows)

def calculate_overall_metrics(csv_file):
    """Compute subject and object metrics over every row of the review CSV.

    Args:
        csv_file: Path of the review CSV.

    Returns:
        DataFrame with two rows (``位置`` is ``主體`` then ``客體``), both labelled
        ``總計`` in the ``檔名`` column, with TP, FP, FN, Precision, Recall, F1.

    Empty human-review cells fall back to the automatic review value.
    """
    review_cols = ["S_review", "S_human_review", "O_review", "O_human_review"]
    roles = (
        ("主體", "S_review", "S_human_review"),
        ("客體", "O_review", "O_human_review"),
    )

    df = pd.read_csv(csv_file)
    for _, auto_col, human_col in roles:
        df[human_col] = df[human_col].fillna(df[auto_col])
    df[review_cols] = df[review_cols].astype(int)

    rows = []
    for role, pred_col, true_col in roles:
        TP, FP, FN, P, R, F1 = calculate_metrics(df, pred_col, true_col)
        record = {"檔名": "總計", "位置": role}
        record.update({"TP": TP, "FP": FP, "FN": FN})
        record.update({"Precision": P, "Recall": R, "F1": F1})
        rows.append(record)
    return pd.DataFrame(rows)

def calculate_metrics_by_type(df):
    """Compute metrics per entity type, separately for subjects and objects.

    Args:
        df: Review DataFrame with the columns ``主體類別``, ``客體類別``,
            ``S_review``, ``S_human_review``, ``O_review`` and
            ``O_human_review``. It is not modified.

    Returns:
        DataFrame with one row per (role, entity type) and the columns
        類型, 位置, TP, FP, FN, Precision, Recall, F1. Rows whose type is
        missing are left out.
    """
    # Work on a copy: filling and re-typing the review columns must not leak
    # into the caller's frame.
    df = df.copy()
    df["S_human_review"] = df["S_human_review"].fillna(df["S_review"])
    df["O_human_review"] = df["O_human_review"].fillna(df["O_review"])
    review_cols = ["S_review", "S_human_review", "O_review", "O_human_review"]
    df[review_cols] = df[review_cols].astype(int)

    results = []
    for role, type_col, review_col, human_col in [
        ("主體", "主體類別", "S_review", "S_human_review"),
        ("客體", "客體類別", "O_review", "O_human_review")
    ]:
        for entity_type in df[type_col].dropna().unique():
            subset = df[df[type_col] == entity_type]
            TP, FP, FN, P, R, F1 = calculate_metrics(subset, review_col, human_col)
            results.append({
                "類型": entity_type,
                "位置": role,
                "TP": TP, "FP": FP, "FN": FN,
                "Precision": P, "Recall": R, "F1": F1
            })
    return pd.DataFrame(results)

def run_and_print_metrics(csv_file):
    """Print the metric tables for a review CSV.

    Two grid tables are printed: first the per-file rows followed by the
    overall totals, then the per-entity-type rows. Precision, Recall and F1
    are shown as percentages with two decimals; TP, FP and FN as integers.

    Args:
        csv_file: Path of the review CSV.
    """
    def format_for_display(frame):
        # Render ratios as percentages and counts as plain integers.
        for col in ["Precision", "Recall", "F1"]:
            frame[col] = frame[col].apply("{:.2%}".format)
        for col in ["TP", "FP", "FN"]:
            frame[col] = frame[col].astype(int)
        return frame

    df = pd.read_csv(csv_file)

    # Per-file rows followed by the overall totals
    combined = format_for_display(
        pd.concat([
            calculate_metrics_per_file(csv_file),
            calculate_overall_metrics(csv_file),
        ])
    )
    print("\n📄 [按檔名與總計]")
    print(tabulate(combined, headers="keys", tablefmt="grid"))

    # Breakdown by entity type
    type_metrics = format_for_display(calculate_metrics_by_type(df))
    print("\n🏷️ [按類型分類統計]")
    print(tabulate(type_metrics, headers="keys", tablefmt="grid"))


In [14]:
# Print the per-file/overall table and the per-type table for the review CSV at csv_output_path.
run_and_print_metrics(csv_output_path)


📄 [按檔名與總計]
+-----+-----------------------------------------------------------------------------+--------+------+------+------+-------------+----------+---------+
|     | 檔名                                                                        | 位置   |   TP |   FP |   FN | Precision   | Recall   | F1      |
|   0 | 228事件(20).json                                                            | 主體   |   44 |    0 |    0 | 100.00%     | 100.00%  | 100.00% |
+-----+-----------------------------------------------------------------------------+--------+------+------+------+-------------+----------+---------+
|   1 | 228事件(20).json                                                            | 客體   |   46 |    0 |    0 | 100.00%     | 100.00%  | 100.00% |
+-----+-----------------------------------------------------------------------------+--------+------+------+------+-------------+----------+---------+
|   2 | 「友仔」是什麼？光復初期臺北地區非法組織調查報告告訴您(37).json             | 主體   |   31 |    0 |    0 | 100.00%