In [5]:
import requests
import os
import json
import time
import shutil
from langchain_community.tools import WikipediaQueryRun
from langchain_community.utilities import WikipediaAPIWrapper

# Set up the Chinese-language ("zh") Wikipedia API wrapper and the query tool
# built on top of it. In the visible part of this file the tool is only
# referenced by the commented-out Wikipedia fallback.
api_wrapper = WikipediaAPIWrapper(
    top_k_results=1,
    doc_content_chars_max=300,
    lang="zh",
)
wiki_tool = WikipediaQueryRun(api_wrapper=api_wrapper)

# Wikidata lookup
def wikidata_search(entity_name):
    """Search Wikidata for *entity_name* and return the first described match.

    Calls the ``wbsearchentities`` action with ``language=zh`` and asks for at
    most three candidates. Candidates without a description are skipped.

    Args:
        entity_name: Label to search for. Empty/``None`` is not sent to the API.

    Returns:
        A ``(qid, description)`` tuple for the first candidate that has a
        non-empty description, or ``(None, 'Not found')`` when there is no such
        candidate or the request fails.
    """
    # Nothing to search for: avoid a pointless (and error-returning) API call.
    if not entity_name:
        return None, 'Not found'

    url = 'https://www.wikidata.org/w/api.php'
    params = {
        'action': 'wbsearchentities',
        'search': entity_name,
        'language': 'zh',
        'format': 'json',
        'limit': 3,
    }
    try:
        # A timeout keeps one stalled connection from hanging the whole batch.
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        payload = response.json()
    except (requests.RequestException, ValueError) as e:
        # Best-effort lookup, mirroring the DBpedia helper: report and move on
        # instead of aborting the file currently being processed.
        print(f"❌ Wikidata 查詢錯誤: {e}")
        return None, 'Not found'

    for result in payload.get('search', []):
        desc = result.get('description', '')
        if desc and result.get('id'):
            return result['id'], desc
    return None, 'Not found'

# # Wikipedia fallback 查詢
# def wikipedia_fallback(keyword):
#     try:
#         page_result = wiki_tool.invoke({"query": keyword})
#         if "Page:" in page_result and "Summary:" in page_result:
#             page_title = page_result.split("Summary:")[0].replace("Page:", "").strip()
#             summary = page_result.split("Summary:")[1].strip()
#             if page_title == keyword:
#                 return page_title, summary
#     except Exception as e:
#         print(f"❌ Wikipedia 查詢失敗: {e}")
#     return None, 'Not found'

# DBpedia URI lookup
def get_dbpedia_uri_from_dbpedia(entity_name):
    """Return the DBpedia resource URI whose Chinese ``rdfs:label`` is *entity_name*.

    Sends a SPARQL ``SELECT`` (``LIMIT 1``) to the DBpedia endpoint, matching
    the label exactly with the ``@zh`` language tag.

    Args:
        entity_name: Label to match. Empty/``None`` is not sent to the endpoint.

    Returns:
        The URI string of the first binding, or ``None`` when there is no
        match or the query fails (the failure is printed, not raised).
    """
    if not entity_name:
        return None

    # Escape characters that would otherwise terminate or corrupt the
    # double-quoted SPARQL string literal (quote, backslash, line breaks).
    escaped_label = (
        str(entity_name)
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )

    sparql_url = "http://dbpedia.org/sparql"
    sparql_query = """
    SELECT ?subject WHERE {
        ?subject rdfs:label "%s"@zh.
    }
    LIMIT 1
    """ % escaped_label
    try:
        # A timeout keeps one stalled connection from hanging the whole batch.
        response = requests.get(
            sparql_url,
            params={"query": sparql_query, "format": "json"},
            timeout=10,
        )
        response.raise_for_status()
        results = response.json().get("results", {}).get("bindings", [])
        if results:
            return results[0]["subject"]["value"]
    except (requests.RequestException, ValueError, KeyError,
            TypeError, AttributeError) as e:
        # Network errors, non-JSON bodies and unexpected result shapes are all
        # treated as "no URI available" so the batch keeps running.
        print(f"❌ DBpedia 查詢錯誤: {e}")
    return None

# Entity categories that are never looked up
SKIP_CATEGORIES = {
    "貨幣",
    "單位",
    "數量",
    "比例",
    "時間",
}


# Combined lookup (by entity name and category)
def enrich_entity(entity_name, entity_type):
    """Resolve an entity to a ``(qid, description, dbpedia_uri)`` triple.

    Categories listed in ``SKIP_CATEGORIES`` are not queried at all. For every
    other entity, Wikidata is queried first and DBpedia second. The result is:

    * ``(qid, description, dbpedia_uri)`` when Wikidata has a match
      (``dbpedia_uri`` may be ``None``);
    * ``(None, "From DBpedia", dbpedia_uri)`` when only DBpedia has a match;
    * ``(None, 'Not found', None)`` when neither does.

    A third fallback to a Wikipedia summary exists in the file only as
    commented-out code and is not performed here.
    """
    # Excluded category: report it as skipped without touching the network.
    if entity_type in SKIP_CATEGORIES:
        return None, f"Skipped ({entity_type})", None

    # Both sources are consulted regardless of the Wikidata outcome.
    wikidata_id, wikidata_desc = wikidata_search(entity_name)
    dbpedia_link = get_dbpedia_uri_from_dbpedia(entity_name)

    if wikidata_id:
        return wikidata_id, wikidata_desc, dbpedia_link
    if dbpedia_link:
        return None, "From DBpedia", dbpedia_link
    return None, 'Not found', None

# Enrich the relations found in the parsed JSON
def process_relationships(relationships):
    """Annotate every relation in *relationships* with lookup results, in place.

    Each element of *relationships* is expected to be a dict holding a list
    under ``"關係列表"``. For every relation dict in that list, the subject
    (``"主體"`` / ``"主體類別"``) and object (``"客體"`` / ``"客體類別"``) are
    passed to ``enrich_entity`` and the results are stored back on the relation
    under the ``"主體 QID"``, ``"主體 描述"``, ``"主體 DBpedia"``, ``"客體 QID"``,
    ``"客體 描述"`` and ``"客體 DBpedia"`` keys. A summary is printed per
    relation and the loop pauses 0.5 s between relations.

    Args:
        relationships: List of relationship groups; ``None`` is treated as
            empty. Entries that are not dicts, and ``"關係列表"`` values that
            are ``None``, are skipped instead of raising.

    Returns:
        None. The input dicts are mutated.
    """
    for relationship in relationships or []:
        # Malformed group (e.g. JSON null): nothing to enrich.
        if not isinstance(relationship, dict):
            continue
        for rel in relationship.get("關係列表") or []:
            if not isinstance(rel, dict):
                continue

            subject = rel.get("主體", "")
            subject_type = rel.get("主體類別", "")
            obj = rel.get("客體", "")
            obj_type = rel.get("客體類別", "")
            relation = rel.get("關係", "")

            sub_qid, sub_desc, sub_dbpedia = enrich_entity(subject, subject_type)
            obj_qid, obj_desc, obj_dbpedia = enrich_entity(obj, obj_type)

            rel["主體 QID"] = sub_qid
            rel["主體 描述"] = sub_desc
            rel["主體 DBpedia"] = sub_dbpedia
            rel["客體 QID"] = obj_qid
            rel["客體 描述"] = obj_desc
            rel["客體 DBpedia"] = obj_dbpedia

            print(f"主體: {subject}（{subject_type}） → QID: {sub_qid} / 描述: {sub_desc} / DBpedia: {sub_dbpedia}")
            print(f"客體: {obj}（{obj_type}） → QID: {obj_qid} / 描述: {obj_desc} / DBpedia: {obj_dbpedia}")
            print(f"關係: {relation}")
            print("-" * 50)
            # Pause between relations to go easy on the remote APIs.
            time.sleep(0.5)

# Batch-process files
def process_all_json(json_dir, out_dir, done_dir):
    """Enrich every ``*.json`` file in *json_dir* and archive the originals.

    For each JSON file (processed in sorted name order) the document is
    loaded, the relationships found under ``review -> relationships`` are
    passed to ``process_relationships`` (for a top-level dict, or for each
    dict element of a top-level list), the resulting document is written to
    *out_dir* under the same file name, and the source file is moved to
    *done_dir*.

    Args:
        json_dir: Directory scanned (non-recursively) for ``.json`` files.
        out_dir: Directory receiving the enriched files; created if missing.
        done_dir: Directory receiving the processed originals; created if
            missing.
    """
    # Create the target directories up front so that neither the write nor
    # the move fails after the (slow) lookups have already been done.
    os.makedirs(out_dir, exist_ok=True)
    os.makedirs(done_dir, exist_ok=True)

    for filename in sorted(os.listdir(json_dir)):
        file_path = os.path.join(json_dir, filename)
        # Only regular .json files; sub-directories are ignored.
        if not filename.endswith(".json") or not os.path.isfile(file_path):
            continue

        print(f"🚀 處理檔案: {filename}")
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # Normalise to a list of candidate documents: a single dict, or the
        # elements of a top-level list. Anything else has nothing to enrich.
        if isinstance(data, dict):
            items = [data]
        elif isinstance(data, list):
            items = data
        else:
            items = []

        for item in items:
            if not isinstance(item, dict):
                continue
            review = item.get("review")
            # "review" may be absent or JSON null; treat both as empty.
            if not isinstance(review, dict):
                review = {}
            process_relationships(review.get("relationships") or [])

        output_file = os.path.join(out_dir, filename)
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=4)

        shutil.move(file_path, os.path.join(done_dir, filename))
        print(f"✅ 已移動到完成目錄: {filename}")


In [None]:
# Directory layout: reviewed input files, enriched output, and the archive
# that processed inputs are moved into.
json_dir = "./docs/output/4_human_review/v7"
out_dir = "./docs/output/5_wiki/v7/"
done_dir = "./docs/output/4_human_review/v7/done/"
# os.makedirs(out_dir, exist_ok=True)  # left disabled, as in the original cell

# Process every JSON file in the input directory and save the results to the
# output directory.
process_all_json(json_dir=json_dir, out_dir=out_dir, done_dir=done_dir)





🚀 處理檔案: 直衝雲霄：花蓮機場走過一甲子歲月(179).json
主體: 沈怡（人物） → QID: Q8276171 / 描述: Chinese politician (1901-1980) / DBpedia: None
客體: 花蓮航空站落成啟用（事件） → QID: None / 描述: Not found / DBpedia: None
關係: 參與
--------------------------------------------------
主體: 郝樂遜（人物） → QID: None / 描述: Not found / DBpedia: None
客體: 花蓮航空站落成啟用（事件） → QID: None / 描述: Not found / DBpedia: None
關係: 參與
--------------------------------------------------
主體: 擴建跑道及助航設備（事件） → QID: None / 描述: Not found / DBpedia: None
客體: 1,250萬元（數量） → QID: None / 描述: Skipped (數量) / DBpedia: None
關係: 成本
--------------------------------------------------
主體: 日本投降（事件） → QID: Q6540361 / 描述: end of World War II, 2 September 1945 / DBpedia: http://dbpedia.org/resource/Surrender_of_Japan
客體: 臺灣（地點） → QID: Q865 / 描述: country in East Asia / DBpedia: http://dbpedia.org/resource/Republic_of_China_(1912–1949)
關係: 地點
--------------------------------------------------
主體: 花蓮航空站落成啟用（事件） → QID: None / 描述: Not found / DBpedia: None
客體: 花蓮（地點） → QID: Q249868 / 描述: coun