In [1]:
import hashlib
import json
import os
from datetime import datetime
from pathlib import Path

from rdflib import BNode, Graph, Literal, Namespace, URIRef
from rdflib.namespace import DC, DCTERMS, FOAF, RDF, RDFS, XSD


# Base URI for every resource, class and property minted by this notebook;
# it is also passed as the `base` when the graphs are serialized.
BASE_URI = "http://example.org/"
EX = Namespace(BASE_URI)
# Wikidata direct-property namespace (used for wdt:P217 on archive files).
WDT = Namespace("http://www.wikidata.org/prop/direct/")
# Creative Commons rights-expression namespace (license / attribution terms).
CC = Namespace("http://creativecommons.org/ns#")
# Wikidata entity namespace.
WD = Namespace("http://www.wikidata.org/entity/")
# Dublin Core *Terms* namespace IRI (note: not the elements/1.1 namespace
# that the imported `DC` refers to).
dc1 = Namespace("http://purl.org/dc/terms/")

def init_graph():
    """Create an empty rdflib ``Graph`` with the notebook's prefixes bound.

    The ``dc`` prefix is bound to the Dublin Core elements namespace (``DC``)
    because that is the vocabulary the generated triples use for titles,
    creators, dates and so on. The Dublin Core Terms namespace (``dc1``) gets
    its conventional ``dcterms`` prefix. Binding ``dc`` to the terms IRI, as
    was done before, made the serialized prefixes misleading.

    Returns:
        A new, empty ``Graph`` ready to receive article triples.
    """
    g = Graph()
    prefix_bindings = (
        ("ex", EX),
        ("wd", WD),
        ("wdt", WDT),
        ("xsd", XSD),
        ("cc", CC),
        ("foaf", FOAF),
        ("dc", DC),
        ("dcterms", dc1),
    )
    for prefix, namespace in prefix_bindings:
        g.bind(prefix, namespace)
    return g

In [2]:
def generate_blank_node(graph, subject):
    """Return a blank node whose label is derived deterministically from ``subject``.

    The label is a SHA-1 digest of ``subject``, so the same subject always
    maps to the same node, both within a run and across runs. The built-in
    ``hash()`` is salted per process for strings, and wrapping ``"_:b..."`` in
    a ``URIRef`` produced an invalid IRI rather than a blank node.

    Args:
        graph: Unused; kept so existing callers keep working.
        subject: Any value identifying the anonymous resource; it is
            converted with ``str()`` before hashing.

    Returns:
        An rdflib ``BNode`` labelled ``b<sha1-hex>``.
    """
    digest = hashlib.sha1(str(subject).encode("utf-8")).hexdigest()
    # Leading letter keeps the label a valid blank-node label in Turtle.
    return BNode(f"b{digest}")

def convert_article_to_rdf(graph, article_data):
    """Add the RDF description of one article record to ``graph``.

    Args:
        graph: rdflib ``Graph`` that receives the triples (mutated in place).
        article_data: dict describing one article. ``"filename"`` is
            required. Optional keys: ``"titles"``, ``"URL"``,
            ``"published_date"``, ``"author"``, ``"metadata"``, ``"source"``,
            ``"license"``, ``"type"``, ``"theme"``, ``"place"``, ``"time"``.

    Returns:
        The same ``graph`` object that was passed in.

    Raises:
        KeyError: if ``article_data`` has no ``"filename"`` key.
    """
    file_id = article_data["filename"].replace(" ", "_")
    article_uri = EX[f"doc/{file_id}"]

    graph.add((article_uri, RDF.type, FOAF.Document))

    # Title (dc:title): only the first entry is used. A bare string is
    # treated as a one-element list so it is not reduced to its first
    # character.
    titles = article_data.get("titles")
    if isinstance(titles, str):
        titles = [titles]
    if isinstance(titles, (list, tuple)) and titles:
        graph.add((article_uri, DC.title, Literal(titles[0])))

    # Source URL (dc:identifier).
    url = article_data.get("URL")
    if url:
        graph.add((article_uri, DC.identifier, URIRef(url)))

    # Publication date (dc:date), accepted as YYYY-MM-DD or YYYY/MM/DD.
    raw_date = article_data.get("published_date")
    if raw_date:
        # str() guards against non-string values, which strptime rejects
        # with TypeError instead of ValueError.
        date_text = str(raw_date).strip()
        for fmt in ("%Y-%m-%d", "%Y/%m/%d"):
            try:
                parsed_date = datetime.strptime(date_text, fmt)
            except ValueError:
                continue
            graph.add((
                article_uri,
                DC.date,
                Literal(parsed_date.date().isoformat(), datatype=XSD.date),
            ))
            break
        else:
            print(f"⚠️ 日期格式錯誤（跳過）: {raw_date} in {article_data['filename']}")

    # Authors (dc:creator): a single string or a list of strings.
    author = article_data.get("author")
    if isinstance(author, str):
        authors = [author]
    elif isinstance(author, list):
        authors = author
    else:
        authors = []
    for name in authors:
        if name:  # skip empty / null placeholders
            graph.add((article_uri, DC.creator, Literal(name)))

    # Archive-file metadata: a single dict or a list of dicts.
    metadata_raw = article_data.get("metadata", [])
    if isinstance(metadata_raw, dict):
        metadata_list = [metadata_raw]
    elif isinstance(metadata_raw, list):
        metadata_list = [item for item in metadata_raw if isinstance(item, dict)]
    else:
        metadata_list = []

    # Optional metadata keys and the predicate each one maps to.
    archive_predicates = (
        ("案名", DC.title),
        ("來源機關", DC.creator),
        ("管有機關", DC.publisher),
        ("檔案影像", EX.imageNumber),
    )

    linked_archive_file = False
    for metadata in metadata_list:
        file_number = metadata.get("檔號")
        if not file_number:
            continue

        file_node = generate_blank_node(graph, f"file_{file_number}")
        graph.add((file_node, RDF.type, EX.ArchiveFile))
        graph.add((file_node, WDT["P217"], Literal(file_number)))
        graph.add((article_uri, EX.relatedFile, file_node))

        for key, predicate in archive_predicates:
            if key in metadata:
                graph.add((file_node, predicate, Literal(metadata[key])))

        linked_archive_file = True

    # The rights statements do not depend on the individual file, so they
    # are added once instead of on every loop iteration.
    # NOTE(review): as before, they are only attached when at least one
    # archive file was linked -- confirm whether every article should carry them.
    if linked_archive_file:
        graph.add((article_uri, DCTERMS.rights, Literal("本資料依據創用 CC 姓名標示-非商業性 3.0 授權條款釋出")))
        graph.add((article_uri, CC["license"], URIRef("http://creativecommons.org/licenses/by-nc/3.0/")))
        graph.add((article_uri, CC["attributionName"], Literal("檔案管理局")))
        graph.add((article_uri, CC["attributionURL"], URIRef("https://www.archives.gov.tw")))

    # Supplementary fields stored as plain literals.
    literal_fields = {
        "source": DC.source,
        "type": EX.type,
        "theme": EX.theme,
        "place": EX.place,
        "time": EX.time,
    }
    for field, predicate in literal_fields.items():
        value = article_data.get(field)
        if not value:
            continue
        values = value if isinstance(value, list) else [value]
        for item in values:
            graph.add((article_uri, predicate, Literal(item)))

    # The license is handled by field name. Every predicate above is itself
    # a URIRef, so telling the license apart with isinstance(..., URIRef)
    # matched all fields and emitted `dc:rights <predicate>` for each.
    if article_data.get("license"):
        graph.add((article_uri, DC.rights, URIRef("http://creativecommons.org/licenses/by-nc/3.0/")))

    return graph

In [None]:

## 4. 主程式邏輯（選擇單檔或多檔輸出）


def convert_json_to_rdf(json_path, output_dir="output", one_file=True):
    """Convert a JSON file holding a list of article records into Turtle.

    Args:
        json_path: Path of the JSON file to read (UTF-8).
        output_dir: Directory for the Turtle output; created if missing.
        one_file: When true, all articles go into ``all_articles.ttl``.
            Otherwise one ``<filename>.ttl`` is written per article, with
            spaces in the filename replaced by underscores.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    with open(json_path, "r", encoding="utf-8") as handle:
        articles = json.load(handle)

    if not one_file:
        # One fresh graph, and one Turtle file, per article.
        for record in articles:
            article_graph = init_graph()
            convert_article_to_rdf(article_graph, record)
            stem = record["filename"].replace(" ", "_")
            article_graph.serialize(
                destination=target_dir / f"{stem}.ttl",
                format="turtle",
                base=BASE_URI,
            )
        return

    # A single graph shared by every article.
    merged_graph = init_graph()
    for record in articles:
        convert_article_to_rdf(merged_graph, record)
    merged_graph.serialize(
        destination=target_dir / "all_articles.ttl",
        format="turtle",
        base=BASE_URI,
    )


## 5. 批次讀取目錄中 JSON 檔案並提取欄位


def extract_json_fields(json_dir):
    """Collect the article fields used for RDF conversion from a directory.

    Every ``*.json`` file directly inside ``json_dir`` is read as UTF-8 JSON.
    Files are visited in sorted name order so the result is reproducible
    (``os.listdir`` gives no ordering guarantee). Files that are not valid
    JSON, or whose top-level value is not an object, are reported and skipped.

    This single definition replaces two earlier ones with the same name, of
    which the later silently shadowed the former; the defaults match that
    later definition.

    Args:
        json_dir: Directory containing the per-article JSON files.

    Returns:
        A list of dicts with the keys ``"filename"``, ``"titles"``,
        ``"author"``, ``"published_date"``, ``"URL"`` and ``"metadata"``.
        Missing fields are ``None``, except ``"metadata"``, which defaults
        to ``[]``.
    """
    results = []

    for filename in sorted(os.listdir(json_dir)):
        if not filename.endswith('.json'):
            continue

        path = os.path.join(json_dir, filename)
        try:
            with open(path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except json.JSONDecodeError as e:
            print(f"❌ JSON格式錯誤：{filename} — {e}")
            continue

        # A list or scalar at the top level has no .get(); skip it instead
        # of crashing the whole batch.
        if not isinstance(data, dict):
            print(f"⚠️ JSON 內容不是物件（跳過）：{filename}")
            continue

        results.append({
            "filename": filename,
            "titles": data.get("titles"),
            "author": data.get("author"),
            "published_date": data.get("published_date"),
            "URL": data.get("URL"),
            "metadata": data.get("metadata", []),
        })

    return results


## 6. 批次轉換整個資料夾內 JSON 為 RDF

def convert_json_directory_to_rdf(json_dir, output_dir="output", one_file=True):
    """Convert every JSON article file in ``json_dir`` into Turtle output.

    The extracted records are first written to ``_temp_articles.json`` inside
    ``output_dir``. That file is then handed to ``convert_json_to_rdf`` and is
    left in place afterwards.

    Args:
        json_dir: Directory containing the per-article JSON files.
        output_dir: Directory for the intermediate JSON and the Turtle output;
            created if missing.
        one_file: Forwarded to ``convert_json_to_rdf``.
    """
    records = extract_json_fields(json_dir)

    out_root = Path(output_dir)
    staging_path = out_root / "_temp_articles.json"
    out_root.mkdir(parents=True, exist_ok=True)
    staging_path.write_text(
        json.dumps(records, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )

    convert_json_to_rdf(staging_path, output_dir=output_dir, one_file=one_file)

In [None]:
# Input directory holding the per-article JSON files, and the directory that
# receives the generated Turtle output.
json_dir = "./docs/output/2_metadata/done"
out_dir = "./docs/output/6_rdf"
# one_file=True writes all articles into a single all_articles.ttl in out_dir.
convert_json_directory_to_rdf(json_dir, out_dir, one_file=True)



In [None]:
# === Step 1: load the TTL file ===
g = Graph()
ttl_path = "./docs/output/6_rdf/all_articles.ttl"  # 👉 change this to your own file name, e.g. "data/228事件.ttl"
g.parse(ttl_path, format="turtle")
print(f"Graph has {len(g)} triples.")

In [30]:
# Collect the distinct triples of the loaded graph `g` and compare the count
# with len(g). In the recorded run both numbers are equal (8824).
unique_triples = set(g)
print(f"總共有 {len(g)} 筆三元組")
print(f"不重複三元組數量：{len(unique_triples)}")

總共有 8824 筆三元組
不重複三元組數量：8824
