In [None]:
# 先產生一個CSV檔案，給後面的程式讀取
import json
import pandas as pd
import os
import re

# Folder whose *.json files are read as input
directory_path = "./docs/output/1_clean_json/llama"
# Destination path of the generated CSV file
csv_path = "./docs/output/2_metadata/processed_data.csv"

# Load a single JSON file
def read_json(file_path):
    """Parse the UTF-8 encoded JSON file at ``file_path``.

    Returns the decoded JSON value on success. Failures never propagate:
    malformed JSON and any other error each print their own message to
    stdout, and ``None`` is returned in both cases.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            return json.load(handle)
    except json.JSONDecodeError as err:
        print(f"Error decoding JSON in file {file_path}: {err}")
    except Exception as err:
        print(f"Error reading file {file_path}: {err}")
    return None

# Keyword patterns that mark the start of a metadata field inside a reference
# string. Group 1 captures the keyword together with its full-width colon so
# the substitution can re-emit it right after the separator.
target_patterns = {
    "案名": r"(案名：)",
    "檔案內容": r"(檔案內容：)",
    "檔號": r"(檔號：)",
    "管有機關": r"(管有機關：)",
    "檔案來源機關": r"(檔案來源機關：)",
    # The lookbehind keeps this shorter keyword from matching the tail of
    # "檔案來源機關：", which would otherwise be split a second time.
    "來源機關": r"(?<!檔案)(來源機關：)",
}


# Process the `reference` field
def process_reference(reference):
    """Insert a " | " separator in front of every known field keyword.

    Args:
        reference: Value of a record's ``reference`` field. Only lists are
            transformed.

    Returns:
        For a list: a new list in which every string entry has " | " placed
        before each keyword from ``target_patterns`` and is stripped of
        leading/trailing whitespace. Non-string entries are kept unchanged.
        Any non-list input is returned as is.
    """
    if not isinstance(reference, list):
        return reference

    modified_ref_list = []
    for ref in reference:
        if not isinstance(ref, str):
            # Nothing to split; keep the entry rather than failing on it.
            modified_ref_list.append(ref)
            continue
        # The overlap between "檔案來源機關" and "來源機關" is resolved inside
        # the pattern itself, so each entry is handled independently of the
        # other entries in the list.
        for pattern in target_patterns.values():
            ref = re.sub(pattern, r" | \1", ref)
        modified_ref_list.append(ref.strip())

    return modified_ref_list

# Flatten the JSON content of one file into (file_id, reference) rows
def process_data(data, file_id):
    """Build a DataFrame with one row per processed reference entry.

    Args:
        data: Decoded JSON content of one file; a single record (dict) or a
            list of records. Any other type yields an empty DataFrame.
        file_id: Identifier stored in the ``file_id`` column of every row.

    Returns:
        A DataFrame with the columns ``file_id`` and ``reference``.
    """
    if isinstance(data, dict):
        data = [data]

    references = []  # processed reference entries of all records

    if isinstance(data, list):
        for item in data:
            if not isinstance(item, dict):
                continue  # only mapping records can carry a 'reference' field

            reference = item.get('reference')
            if not reference:
                continue  # missing, null or empty: contributes no rows

            if isinstance(reference, str):
                # A bare string is one reference; iterating it directly
                # would emit one row per character.
                reference = [reference]

            references.extend(process_reference(reference))

    return pd.DataFrame({
        'file_id': [file_id] * len(references),
        'reference': references,
    })

# Read every JSON file in a directory
def read_all_json_files(directory_path):
    """Load all ``*.json`` files found directly inside ``directory_path``.

    Args:
        directory_path: Folder to scan (not recursive).

    Returns:
        A list of ``(data, filename)`` tuples in filename order. Files that
        ``read_json`` could not load, or whose content is empty/falsy, are
        left out.
    """
    all_data = []

    # os.listdir() returns names in arbitrary order; sorting keeps the rows
    # of the resulting CSV reproducible between runs and platforms.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith('.json'):
            continue
        file_path = os.path.join(directory_path, filename)
        print(f"Reading file: {file_path}")
        data = read_json(file_path)
        if data:
            all_data.append((data, filename))

    return all_data

# Read every JSON file from the input folder
all_data = read_all_json_files(directory_path)

# Accumulators for the rows of all files
all_file_ids = []
all_references = []

# Flatten each file into rows and collect them
for data, file_id in all_data:
    df = process_data(data, file_id)
    all_file_ids.extend(df['file_id'])
    all_references.extend(df['reference'])

# Build the final DataFrame
final_df = pd.DataFrame({
    'file_id': all_file_ids,
    'reference': all_references
})

# to_csv() does not create missing folders, so make sure the target exists.
output_dir = os.path.dirname(csv_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# 'utf-8-sig' prefixes the file with a BOM (presumably so spreadsheet tools
# detect the encoding of the Chinese text - confirm with the consumer).
final_df.to_csv(csv_path, index=False, encoding='utf-8-sig')

print(f"CSV file has been saved to {csv_path}")


Reading file: ./docs/output/1_clean_json/llama\228事件(20).json
Reading file: ./docs/output/1_clean_json/llama\「友仔」是什麼？光復初期臺北地區非法組織調查報告告訴您(37).json
Reading file: ./docs/output/1_clean_json/llama\「回首向來蕭瑟處，歸去，也無風雨也無晴」—民國38年國軍遷臺紀事(30).json
Reading file: ./docs/output/1_clean_json/llama\「威海衛」租借地的收回(42).json
Reading file: ./docs/output/1_clean_json/llama\「快速」發展的年代：麥克阿瑟公路通車一甲子(203).json
Reading file: ./docs/output/1_clean_json/llama\「日暮鄉關何處是」─「留越國軍」的返台路(31).json
Reading file: ./docs/output/1_clean_json/llama\「賽德克．巴萊」重現的霧社事件(17).json
Reading file: ./docs/output/1_clean_json/llama\「醫者仁也‧仁者人也」─光復初期臺灣醫學教育(29).json
Reading file: ./docs/output/1_clean_json/llama\ㄋㄟㄋㄟ補給站：美援牛奶的供應(127).json
Reading file: ./docs/output/1_clean_json/llama\一紙命令，臺灣命運大不同─中國台灣省行政長官公署警備總司令部第一號令(1).json
Reading file: ./docs/output/1_clean_json/llama\不用手機也可哈拉一整天─45年度公用電話擴充計畫(2).json
Reading file: ./docs/output/1_clean_json/llama\不能少了你—臺灣光復後首次戶口清查(35).json
Reading file: ./docs/output/1_clean_json/llama\世界人權日(18).json
Reading fil