### 將Metadata轉換RDF格式

In [38]:
from rdflib import Graph, URIRef, Literal, RDF, Namespace, BNode
from rdflib.namespace import DCTERMS, RDF, RDFS, XSD, FOAF, DC, OWL
import json
import os
from pathlib import Path
from datetime import datetime
import requests

In [39]:
# 定義命名空間
# BASE URI
BASE_URI = "http://example.org/"
EX = Namespace(BASE_URI)
WDT = Namespace("http://www.wikidata.org/prop/direct/")
CC = Namespace("http://creativecommons.org/ns#")
dc1 = Namespace("http://purl.org/dc/terms/")
# 連結 Wikidata 和 DBpedia 的命名空間
WD = Namespace("http://www.wikidata.org/entity/")
DBR = Namespace("http://dbpedia.org/resource/")

def init_graph():
    g = Graph()
    g.bind("ex", EX)
    g.bind("wd", WD)
    g.bind("wdt", WDT)
    g.bind("xsd", XSD)
    g.bind("cc", CC)
    g.bind("foaf", FOAF)
    g.bind("dc", dc1)
    return g

def generate_blank_node(graph, subject):
    # 使用 Blank Node 來生成匿名節點
    blank_node = URIRef(f"_:b{hash(subject)}")
    return blank_node

def convert_article_to_rdf(graph, article_data):
    file_id = article_data["filename"].replace(" ", "_")
    article_uri = URIRef(EX[f"doc/{file_id}"])
    
    # 使用標準類型
    graph.add((article_uri, RDF.type, FOAF.Document))

    # 標題（使用 dc:title）
    title = article_data.get("titles", [])
    if title:
        graph.add((article_uri, DC.title, Literal(title[0])))

    # 網址（使用 dc:source 或 dc:identifier）
    url = article_data.get("URL")
    if url:
        graph.add((article_uri, DC.identifier, URIRef(url)))

    # 日期（dc:date）
    raw_date = article_data.get("published_date")
    if raw_date:
        for fmt in ("%Y-%m-%d", "%Y/%m/%d"):
            try:
                parsed_date = datetime.strptime(raw_date, fmt)
                graph.add((article_uri, DC.date, Literal(parsed_date.date().isoformat(), datatype=XSD.date)))
                break
            except ValueError:
                continue
        else:
            print(f"⚠️ 日期格式錯誤（跳過）: {raw_date} in {article_data['filename']}")

    # 作者（dc:creator）
    author = article_data.get("author")
    if isinstance(author, str):
        graph.add((article_uri, DC.creator, Literal(author)))
    elif isinstance(author, list):
        for a in author:
            graph.add((article_uri, DC.creator, Literal(a)))

    # Metadata 中的檔案資訊
    metadata_raw = article_data.get("metadata", [])
    if isinstance(metadata_raw, dict):
        metadata_list = [metadata_raw]
    elif isinstance(metadata_raw, list):
        metadata_list = [item for item in metadata_raw if isinstance(item, dict)]
    else:
        metadata_list = []

    for metadata in metadata_list:
        file_number = metadata.get("檔號")
        if not file_number:
            continue

        file_uri = generate_blank_node(graph, f"file_{file_number}")
        graph.add((file_uri, RDF.type, EX.ArchiveFile))
        graph.add((file_uri, WDT["P217"], Literal(file_number)))

        # 與文件關聯
        graph.add((article_uri, EX.relatedFile, file_uri))

        if "案名" in metadata:
            graph.add((file_uri, DC.title, Literal(metadata["案名"])))
        if "來源機關" in metadata:
            graph.add((file_uri, DC.creator, Literal(metadata["來源機關"])))
        if "管有機關" in metadata:
            graph.add((file_uri, DC.publisher, Literal(metadata["管有機關"])))
        if "檔案影像" in metadata:
            graph.add((file_uri, EX.imageNumber, Literal(metadata["檔案影像"])))
        
        graph.add((article_uri, DCTERMS.rights, Literal("本資料依據創用 CC 姓名標示-非商業性 3.0 授權條款釋出")))
        graph.add((article_uri, URIRef("http://creativecommons.org/ns#license"), URIRef("http://creativecommons.org/licenses/by-nc/3.0/")))
        graph.add((article_uri, URIRef("http://creativecommons.org/ns#attributionName"), Literal("檔案管理局")))
        graph.add((article_uri, URIRef("http://creativecommons.org/ns#attributionURL"), URIRef("https://www.archives.gov.tw")))

    # 處理其他補充欄位
    other_fields = {
        "source": DC.source,
        "license": URIRef("http://creativecommons.org/licenses/by-nc/3.0/"),
        "type": EX.type,
        "theme": EX.theme,
        "place": EX.place,
        "time": EX.time,
    }

    for field, predicate in other_fields.items():
        value = article_data.get(field)
        if value:
            if isinstance(predicate, URIRef):
                # 如果是 license URI，直接加入
                graph.add((article_uri, DC.rights, predicate))
            else:
                if isinstance(value, list):
                    for v in value:
                        graph.add((article_uri, predicate, Literal(v)))
                else:
                    graph.add((article_uri, predicate, Literal(value)))

    return graph



## 4. 主程式邏輯（選擇單檔或多檔輸出）


def convert_json_to_rdf(json_path, output_dir="output", one_file=True):
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    if one_file:
        g = init_graph()
        for article in data:
            convert_article_to_rdf(g, article)
        g.serialize(destination=output_dir / "all_articles.ttl", format="turtle", base=BASE_URI)
    else:
        for article in data:
            g = init_graph()
            convert_article_to_rdf(g, article)
            filename = article["filename"].replace(" ", "_")
            g.serialize(destination=output_dir / f"{filename}.ttl", format="turtle", base=BASE_URI)


## 5. 批次讀取目錄中 JSON 檔案並提取欄位


def extract_json_fields(json_dir):
    results = []
    for filename in os.listdir(json_dir):
        if filename.endswith('.json'):
            path = os.path.join(json_dir, filename)
            try:
                with open(path, 'r', encoding='utf-8') as f:
                    data = json.load(f)

                result = {
                    "filename": filename,
                    "titles": data.get("titles", []),
                    "author": data.get("author", []),  # 注意這裡是 'author' 且是列表
                    "published_date": data.get("published_date", ""),
                    "URL": data.get("URL", ""),
                    "metadata": data.get("metadata", [])
                }
                results.append(result)

            except json.JSONDecodeError as e:
                print(f"❌ JSON格式錯誤：{filename} — {e}")

    return results


def extract_json_fields(json_dir):
    results = []

    for filename in os.listdir(json_dir):
        if filename.endswith('.json'):
            path = os.path.join(json_dir, filename)
            try:
                with open(path, 'r', encoding='utf-8') as f:
                    data = json.load(f)

                result = {
                    "filename": filename,
                    "titles": data.get("titles"),
                    "author": data.get("author"),
                    "published_date": data.get("published_date"),
                    "URL": data.get("URL"),
                    "metadata": data.get("metadata", [])
                }
                results.append(result)

            except json.JSONDecodeError as e:
                print(f"❌ JSON格式錯誤：{filename} — {e}")

    return results


## 6. 批次轉換整個資料夾內 JSON 為 RDF
def convert_json_directory_to_rdf(json_dir, output_dir="output", one_file=True):
    articles = extract_json_fields(json_dir)

    intermediate_file = Path(output_dir) / "_temp_articles.json"
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    with open(intermediate_file, "w", encoding="utf-8") as f:
        json.dump(articles, f, ensure_ascii=False, indent=2)

    convert_json_to_rdf(intermediate_file, output_dir=output_dir, one_file=one_file)

In [40]:
json_dir = "./docs/output/2_metadata/done"
out_dir = "./docs/output/6_rdf"
convert_json_directory_to_rdf(json_dir, out_dir, one_file=True)


### 顯示metadata RDF 數量

In [41]:
# 載入 TTL 檔案 ===
g = Graph()
ttl_path = "./docs/output/6_rdf/all_articles.ttl"  
g.parse(ttl_path, format="turtle")

print(f"Graph has {len(g)} triples.")

#unique_triples = set(g)
#print(f"總共有 {len(g)} 筆三元組")
#print(f"不重複三元組數量：{len(unique_triples)}")


Graph has 9592 triples.


In [42]:
with open(ttl_path, "r", encoding="utf-8") as f:
    content = f.read()
    print(content)

@base <http://example.org/> .
@prefix cc: <http://creativecommons.org/ns#> .
@prefix dc: <http://purl.org/dc/elements/1.1/> .
@prefix dc1: <http://purl.org/dc/terms/> .
@prefix ex: <http://example.org/> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix wdt: <http://www.wikidata.org/prop/direct/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

<doc/228事件(20).json> a foaf:Document ;
    cc:attributionName "檔案管理局" ;
    cc:attributionURL <https://www.archives.gov.tw> ;
    cc:license <http://creativecommons.org/licenses/by-nc/3.0/> ;
    <relatedFile> <_:b-1555885546779317632> ;
    dc:creator "◎資料蒐集：應用服務組" ;
    dc:date "2009-02-16"^^xsd:date ;
    dc:identifier <https://www.archives.gov.tw/wSite/public/Attachment/0/f1718086516443.pdf> ;
    dc:title "228事件" ;
    dc1:rights "本資料依據創用 CC 姓名標示-非商業性 3.0 授權條款釋出" .

<doc/「友仔」是什麼？光復初期臺北地區非法組織調查報告告訴您(37).json> a foaf:Document ;
    cc:attributionName "檔案管理局" ;
    cc:attributionURL <https://www.archives.gov.tw> ;
    cc:license <http

In [20]:
# from rdflib import Graph

# g = Graph()
# g.parse(ttl_path, format="turtle")
#     #print(metadata)
# print(g.serialize(format="nt"))  # N-Triples
# #print(g.serialize(format="xml"))  # RDF/XML


### 將文件收集的資訊JSON轉換RDF格式

In [43]:
# 定義命名空間
EX = Namespace("http://example.org/schema#")
#BASE_URI = "http://example.org/"
#EX = Namespace(BASE_URI)

#WD = Namespace("http://www.wikidata.org/entity/")
#WDT = Namespace("http://www.wikidata.org/prop/direct/")
#DBR = Namespace("http://dbpedia.org/resource/")


In [44]:
LITERAL_CLASSES = {"數量", "比例", "單位", "貨幣", "時間"}

# 載入 JSON 檔案
def load_json(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        return json.load(file)

# 從 Wikidata 取得標籤和描述
def get_wikidata_info(qid):
    url = f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json"
    try:
        response = requests.get(url)
        if response.status_code == 200:
            data = response.json()
            entity = data['entities'].get(qid, {})
            labels = entity.get('labels', {})
            descriptions = entity.get('descriptions', {})
            label = labels.get('zh-tw', {}).get('value') or labels.get('zh', {}).get('value')
            description = descriptions.get('zh-tw', {}).get('value') or descriptions.get('zh', {}).get('value')
            return label, description
    except Exception as e:
        print(f"Error fetching Wikidata info for {qid}: {e}")
    return None, None

# 從 DBpedia 取得標籤和描述
def get_dbpedia_info(uri):
    sparql = f"""
    SELECT ?label ?comment WHERE {{
        <{uri}> rdfs:label ?label .
        OPTIONAL {{ <{uri}> rdfs:comment ?comment . }}
        FILTER (lang(?label) = 'zh' || lang(?label) = 'zh-tw')
    }}
    """
    endpoint = "https://dbpedia.org/sparql"
    try:
        response = requests.get(endpoint, params={'query': sparql, 'format': 'json'})
        if response.status_code == 200:
            results = response.json().get('results', {}).get('bindings', [])
            if results:
                label = results[0].get('label', {}).get('value')
                comment = results[0].get('comment', {}).get('value')
                return label, comment
    except Exception as e:
        print(f"Error fetching DBpedia info for {uri}: {e}")
    return None, None

# 實體資訊加到 RDF 圖中
def add_entity_info_to_graph(g, entity_uri, entity_id):
    label, description = get_wikidata_info(entity_id)
    if label:
        g.add((entity_uri, RDFS.label, Literal(label, lang='zh-TW')))
    if description:
        g.add((entity_uri, RDFS.comment, Literal(description, lang='zh-TW')))
    if not label or not description:
        dbpedia_uri = f"http://dbpedia.org/resource/{entity_id}"
        label, description = get_dbpedia_info(dbpedia_uri)
        if label:
            g.add((entity_uri, RDFS.label, Literal(label, lang='zh-TW')))
        if description:
            g.add((entity_uri, RDFS.comment, Literal(description, lang='zh-TW')))
            
# 建立 Literal 的 Blank Node
def create_literal_blank_node(g, label, entity_type):
    bnode = BNode()
    try:
        if "成" in label:
            numeric_value = float(label.replace("成", "")) / 10
            g.add((bnode, RDF.value, Literal(numeric_value, datatype=XSD.float)))
        elif label.replace(".", "", 1).isdigit():
            g.add((bnode, RDF.value, Literal(float(label), datatype=XSD.float)))
        else:
            g.add((bnode, RDF.value, Literal(label, lang='zh-TW')))
    except:
        g.add((bnode, RDF.value, Literal(label, lang='zh-TW')))
    g.add((bnode, RDF.type, EX[entity_type]))
    g.add((bnode, RDFS.label, Literal(label, lang='zh-TW')))
    return bnode

# 轉換 JSON 內容為 Turtle 格式
def convert_json_to_turtle(data, filename, output_dir):
    g = Graph()
    g.bind("ex", EX)
    g.bind("wd", WD)
    g.bind("wdt", WDT)
    g.bind("rdfs", RDFS)
    g.bind("dbr", DBR)

    for entry in data['review']['relationships']:
        for event in entry['事件']:
            event_uri = URIRef(f"http://example.org/doc/{filename}#event_{event}")
            g.add((event_uri, RDF.type, EX.Event))
            g.add((event_uri, EX.describedIn, URIRef(f"http://example.org/doc/{filename}.json")))
            g.add((event_uri, RDFS.label, Literal(event, lang='zh-TW')))
            g.add((event_uri, RDFS.comment, Literal(f"事件：{event}", lang='zh-TW')))

            for relationship in entry['關係列表']:
                subj_label = relationship.get('主體')
                subj_qid = relationship.get('主體 QID')
                subj_dbpedia = relationship.get('主體 DBpedia')
                subj_type = relationship.get('主體類別')

                obj_label = relationship.get('客體')
                obj_qid = relationship.get('客體 QID')
                obj_dbpedia = relationship.get('客體 DBpedia')
                obj_type = relationship.get('客體類別')

                predicate = relationship.get('p-items')
                predicate_uri = WDT[predicate]

                if subj_type in LITERAL_CLASSES:
                    subj_node = create_literal_blank_node(g, subj_label, subj_type)
                elif subj_qid:
                    subj_node = WD[subj_qid]
                    add_entity_info_to_graph(g, subj_node, subj_qid)
                    if subj_dbpedia:
                        g.add((subj_node, OWL.sameAs, URIRef(subj_dbpedia)))
                else:
                    subj_node = Literal(subj_label, lang='zh-TW')

                if obj_type in LITERAL_CLASSES:
                    obj_node = create_literal_blank_node(g, obj_label, obj_type)
                elif obj_qid:
                    obj_node = WD[obj_qid]
                    add_entity_info_to_graph(g, obj_node, obj_qid)
                    if obj_dbpedia:
                        g.add((obj_node, OWL.sameAs, URIRef(obj_dbpedia)))
                else:
                    obj_node = Literal(obj_label, lang='zh-TW')

                triple_node = BNode()
                g.add((triple_node, RDF.type, EX.Triple))
                g.add((triple_node, EX.subject, subj_node))
                g.add((triple_node, EX.predicate, predicate_uri))
                g.add((triple_node, EX.object, obj_node))

                g.add((event_uri, EX.hasTriple, triple_node))

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    ttl_filename = os.path.join(output_dir, f"{filename}.ttl")
    g.serialize(destination=ttl_filename, format="turtle", encoding="utf-8")
    print(f"RDF Turtle file saved as {ttl_filename}")

# 處理整個資料夾的 JSON 檔案
def convert_json_folder_to_turtle(input_folder, output_folder):
    for filename in os.listdir(input_folder):
        if filename.endswith(".json"):
            json_filepath = os.path.join(input_folder, filename)
            data = load_json(json_filepath)
            convert_json_to_turtle(data, filename[:-5], output_folder)

### 顯示 RDF 數量

In [45]:
# 載入 TTL 檔案 ===
import os
from rdflib import Graph

ttl_dir = "./docs/output/6_rdf/event"  # 你的 TTL 檔案所在目錄

# 列出所有 .ttl 檔案
ttl_files = [f for f in os.listdir(ttl_dir) if f.endswith(".ttl")]

total_triples = 0

for ttl_file in ttl_files:
    ttl_path = os.path.join(ttl_dir, ttl_file)
    
    g = Graph()
    g.parse(ttl_path, format="turtle")
    
    triple_count = len(g)
    total_triples += triple_count
    
    print(f"{ttl_file}: {triple_count} triples")

print(f"Total triples in directory: {total_triples}")

228事件(20).ttl: 536 triples
「友仔」是什麼？光復初期臺北地區非法組織調查報告告訴您(37).ttl: 332 triples
「回首向來蕭瑟處，歸去，也無風雨也無晴」—民國38年國軍遷臺紀事(30).ttl: 1226 triples
「威海衛」租借地的收回(42).ttl: 279 triples
「快速」發展的年代：麥克阿瑟公路通車一甲子(203).ttl: 419 triples
「日暮鄉關何處是」─「留越國軍」的返台路(31).ttl: 302 triples
「賽德克．巴萊」重現的霧社事件(17).ttl: 182 triples
「醫者仁也‧仁者人也」─光復初期臺灣醫學教育(29).ttl: 178 triples
ㄋㄟㄋㄟ補給站：美援牛奶的供應(127).ttl: 164 triples
一紙命令，臺灣命運大不同─中國台灣省行政長官公署警備總司令部第一號令(1).ttl: 1002 triples
不用手機也可哈拉一整天─45年度公用電話擴充計畫(2).ttl: 142 triples
不能少了你—臺灣光復後首次戶口清查(35).ttl: 367 triples
世界人權日(18).ttl: 132 triples
世紀糖鐵穿鄉越鎮的五分車(148).ttl: 640 triples
中元普渡與法國軍墓在基隆(170).ttl: 829 triples
中華商場：見證大臺北繁華歲月(166).ttl: 247 triples
中華航空：從軍事化管理到以客為尊經營(152).ttl: 802 triples
九年國教：春風化雨五十載(134).ttl: 228 triples
亞東關係協會：臺日友好關係的樞紐(185).ttl: 229 triples
人民頭家—公民直選總統(23).ttl: 729 triples
任重道遠─民國36年台灣省鐵路圖(11).ttl: 135 triples
低鈉鹽的由來─從臺鹽公司檔案見端倪(60).ttl: 244 triples
保存學術的火種—中央研究院播遷來臺(76).ttl: 320 triples
保育與觀光—從國立公園到國家公園(141).ttl: 1446 triples
傳播知識的種子—臺灣總督府圖書館的故事(62).ttl: 220 triples
元氣補給：美軍大兵在臺灣

In [46]:

# 設定輸入和輸出的資料夾路徑
input_folder = "./docs/output/5_wiki/v7"  # 你的 JSON 檔案資料夾
output_folder = "./docs/output/6_rdf/event"  # 轉換後的 Turtle 檔案輸出資料夾

# 執行轉換
convert_json_folder_to_turtle(input_folder, output_folder)

RDF Turtle file saved as ./docs/output/6_rdf/event\「友仔」是什麼？光復初期臺北地區非法組織調查報告告訴您(37).ttl


KeyboardInterrupt: 