# 第四階段：主題映射與「數位韌性」指數計算（LLM Prompt）

In [None]:
# %pip install pandas numpy tqdm openai==1.* plotly

In [None]:
import os, json, re
import numpy as np
import pandas as pd
from pathlib import Path
from openai import OpenAI
# Shared chat-completions client used by the LLM helper functions below.
# It is constructed without explicit arguments, so configuration comes from
# the library defaults (presumably the environment; confirm for your setup).
client = OpenAI()

BASE = Path('/mnt/data')
# Candidate corpus files in priority order: the first one that exists is used.
CANDS = [
    BASE / 'part3_corpus_with_topics_v2.csv',
    BASE / 'part2_corpus_with_topics.csv',
]
# Resolve to None when nothing is found so the failure is an explicit,
# descriptive error instead of a NameError on an unbound variable.
CORPUS = next((p for p in CANDS if p.exists()), None)
if CORPUS is None:
    raise FileNotFoundError(
        'No corpus file found; looked for: ' + ', '.join(str(p) for p in CANDS)
    )

df = pd.read_csv(CORPUS)
# Normalise headers so later column lookups are case/whitespace insensitive.
df.columns = [c.strip().lower() for c in df.columns]

# Prefer the refined topic column when present, otherwise the original one.
TOP = 'topic_v2' if 'topic_v2' in df.columns else 'topic'
if TOP not in df.columns:
    raise KeyError(
        f"Corpus {CORPUS} has neither a 'topic_v2' nor a 'topic' column"
    )
print('使用主題欄位：', TOP)

In [None]:
# Topic -> digital-resilience dimension mapping (customisable).
# DIMENSIONS lists the dimension codes that topics and texts are rated against.
DIMENSIONS = [
    "ITC",
    "ACAP",
    "DC",
    "GOVSEC",
    "DATA",
    "ECO",
    "OTHER",
]
# Hand-written topic -> dimension assignments; empty by default.
# Example entry: {"雲端與DevOps": "ITC"}
MANUAL_MAP = dict()

def llm_map_topics_to_dims(topic_labels):
    """Ask the chat model to map topic labels onto digital-resilience dimensions.

    Args:
        topic_labels: List of topic label strings to classify.

    Returns:
        The JSON value decoded from the model's reply. The prompt requests a
        JSON object mapping topics to one of DIMENSIONS; the reply is not
        validated against that list here.

    Raises:
        ValueError: If no JSON object can be recovered from the reply.
    """
    system_prompt='你是研究助理，請把主題標籤映射到數位韌性構面：ITC/ACAP/DC/GOVSEC/DATA/ECO/OTHER。只輸出 JSON 物件。'
    usr = {"dimensions": DIMENSIONS, "topics": topic_labels}
    r = client.chat.completions.create(
        model='gpt-4o-mini',
        temperature=0,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": json.dumps(usr, ensure_ascii=False)},
        ],
    )
    # The content may be None (e.g. an empty reply); treat that as empty text.
    raw = r.choices[0].message.content or ''
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        pass

    # Fallback: take everything from the first '{' to the last '}'. The match
    # is deliberately NOT anchored to the end of the string, so replies wrapped
    # in a Markdown code fence or followed by trailing text still parse.
    found = re.search(r'\{[\s\S]*\}', raw)
    if found is None:
        raise ValueError(f'Model reply contains no JSON object: {raw!r}')
    try:
        return json.loads(found.group(0))
    except json.JSONDecodeError as err:
        raise ValueError(f'Model reply is not valid JSON: {raw!r}') from err

# Distinct topic ids, coerced to int BEFORE dropping the -1 outlier bucket so
# that values stored as floats or strings ("-1", -1.0) are filtered as well.
labels = sorted({int(t) for t in df[TOP].dropna().unique()} - {-1})
label_text = {tid: f"Topic {tid}" for tid in labels}

# Skip the API round-trip entirely when there is nothing to map.
auto_map = llm_map_topics_to_dims(list(label_text.values())) if labels else {}
# Manual assignments take precedence over the model's suggestions; with the
# default empty MANUAL_MAP the model output is left untouched.
if MANUAL_MAP:
    auto_map = {**auto_map, **MANUAL_MAP}
auto_map

In [None]:
# 以 LLM 依規則對文本×構面打分（0–5）
from tqdm import tqdm
# Scoring instructions embedded in every scoring request: a 0-5 scale with
# anchor descriptions for 0, 3 and 5, and a JSON-only reply containing
# "score" and "evidence".
RUBRIC = (
    "請你以 0–5 分量表評分該段文字對於某構面的實質性與強度"
    "(0=無關/非常空泛;3=有具體行動或量化指標的一部分;"
    "5=明確、量化、可稽核且與策略/投資/制度化直接相關)。\n"
    "只輸出 JSON：{\"score\":數字, \"evidence\":\"代表性原文\"}"
)

def score_snippet(text, dim):
    """Have the chat model rate one text snippet on one dimension.

    Args:
        text: The snippet to rate; it is converted to ``str`` and only the
            first 4000 characters are sent.
        dim: The dimension code to rate the snippet against.

    Returns:
        A ``(score, evidence)`` tuple. ``score`` is a float clamped to the
        range 0-5; a missing, null, non-numeric or NaN score counts as 0.
        ``evidence`` is the reply's "evidence" field ('' when absent).

    Raises:
        ValueError: If no JSON object can be recovered from the reply.
    """
    import math

    system_prompt='你是審稿人，依據評分規則對文本片段在指定構面上的實質性打分。'
    usr = {"dimension": dim, "rubric": RUBRIC, "text": str(text)[:4000]}
    r = client.chat.completions.create(
        model='gpt-4o-mini',
        temperature=0,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": json.dumps(usr, ensure_ascii=False)},
        ],
    )
    # The content may be None (e.g. an empty reply); treat that as empty text.
    raw = r.choices[0].message.content or ''
    try:
        d = json.loads(raw)
    except json.JSONDecodeError:
        # Unanchored first-'{'-to-last-'}' match so fenced replies or replies
        # with trailing text are still recovered.
        found = re.search(r'\{[\s\S]*\}', raw)
        if found is None:
            raise ValueError(f'Model reply contains no JSON object: {raw!r}')
        try:
            d = json.loads(found.group(0))
        except json.JSONDecodeError as err:
            raise ValueError(f'Model reply is not valid JSON: {raw!r}') from err
    if not isinstance(d, dict):
        raise ValueError(f'Model reply is not a JSON object: {raw!r}')

    try:
        s = float(d.get('score', 0))
    except (TypeError, ValueError):
        # null or free-text scores are treated like a missing score.
        s = 0.0
    if math.isnan(s):
        # min()/max() do not order NaN, so clamping alone would turn it into 5.
        s = 0.0
    s = max(0.0, min(5.0, s))
    return s, d.get('evidence', '')

DIMS = DIMENSIONS
scores_list = []
evid_list = []
# Only the 'text' column is needed, so iterate it directly rather than
# materialising every row with iterrows().
for raw_text in tqdm(df['text'], total=len(df)):
    if pd.isna(raw_text):
        # A missing text would otherwise be sent to the model as the literal
        # string 'nan'. Record it as unscored (NaN) with no evidence instead;
        # NaN scores are skipped by the later mean-based aggregation.
        scores_list.append({dim: np.nan for dim in DIMS})
        evid_list.append({dim: '' for dim in DIMS})
        continue
    t = str(raw_text)
    srow = {}
    erow = {}
    for dim in DIMS:
        s, ev = score_snippet(t, dim)
        srow[dim] = s
        erow[dim] = ev
    scores_list.append(srow)
    evid_list.append(erow)

# Passing columns=DIMS keeps the dimension columns present (and ordered) even
# when the corpus is empty.
scores_df = pd.DataFrame(scores_list, columns=DIMS)
evid_df = pd.DataFrame(evid_list, columns=DIMS).add_prefix('evi_')
# Reset the corpus index so the three frames align positionally.
out = pd.concat([df.reset_index(drop=True), scores_df, evid_df], axis=1)
out.to_csv('/mnt/data/part4_doc_dimension_scores.csv', index=False, encoding='utf-8')
out.head(2)

In [None]:
# Aggregate by entity/time and compute the DRI as the equal-weight mean of the
# dimension scores (swap in your own weights if desired).
entity_col = next((c for c in ('company', 'firm') if c in out.columns), None)
time_col = next((c for c in ('year', 'date') if c in out.columns), None)
cols = [c for c in [entity_col, time_col] if c]
if cols:
    agg = out[cols + DIMS].groupby(cols).mean().reset_index()
else:
    # groupby() rejects an empty key list, so the promised overall aggregate
    # is built directly as a single-row frame of column means.
    print('未偵測到 company/year，以下示範整體聚合')
    agg = out[DIMS].mean().to_frame().T
agg['DRI'] = agg[DIMS].mean(axis=1)
agg.to_csv('/mnt/data/part4_entity_time_dri.csv', index=False, encoding='utf-8')
agg.head(10)

In [None]:
import plotly.express as px
# Pick the chart from the grouping columns that were actually detected.
if len(cols) == 2:
    # Entity and time both present: one line per entity over time.
    fig = px.line(agg, x=cols[-1], y='DRI', color=cols[0], markers=True, title='DRI 時序')
elif cols and cols[0] == time_col:
    # Time only: a single series. Colouring by the x column would split it
    # into one-point traces with no connecting line.
    fig = px.line(agg, x=cols[0], y='DRI', markers=True, title='DRI 時序')
elif cols:
    # Entity only: there is no time axis, so compare entities with bars.
    fig = px.bar(agg, x=cols[0], y='DRI', title='整體 DRI')
else:
    # No grouping columns: plot the overall aggregate by row position.
    fig = px.bar(agg, x=list(range(len(agg))), y='DRI', title='整體 DRI')
fig.show()
print('輸出：/mnt/data/part4_doc_dimension_scores.csv, /mnt/data/part4_entity_time_dri.csv')