# 數位韌性指數計算系統 - 雙重優化版

## 優化重點
1. **集中配置管理**：所有關鍵參數集中在開頭
2. **緩存機制**：避免重複計算和 API 調用
3. **主題層級評分**：Phase 4 基於主題而非文本評分，大幅提升速度
4. **多輪迭代優化**：Phase 3 支持多輪優化
5. **關鍵詞增強提示**：LLM 映射時提供主題關鍵詞
6. **批次 + 選擇性評分 (NEW!)**：一次 API 調用評分多個相關構面
   - 減少 85% 的 API 調用次數 (819 → 117)
   - 預計將 Phase 4 從 94 分鐘縮短到 5-10 分鐘
   - 7-18x 加速比

In [1]:
# ========================================
# 全局配置區 - 所有重要設置集中管理
# ========================================

from pathlib import Path
import json

# === API 配置 ===
from dotenv import dotenv_values
config = dotenv_values(".env")
OPENAI_API_KEY = config.get("OPENAI_API_KEY")

# === 模型配置 ===
EMBEDDING_MODEL = 'text-embedding-3-small'  # OpenAI 嵌入模型
LLM_MODEL = 'gpt-5-mini-2025-08-07'  # LLM 模型（用於主題優化和評分）
#LLM_MODEL = 'gpt-4.1-mini-2025-04-14'  # LLM 模型（用於主題優化和評分）
LLM_TEMPERATURE = 1  # 溫度參數

# === 文件路徑配置 ===
DATA_DIR = Path('data')
CORPUS_PATH = DATA_DIR / 'corpus.csv'

# Phase 2 輸出
EMBEDDINGS_PATH = DATA_DIR / 'embeddings_text-3-small.npy'
EMBEDDINGS_INDEX_PATH = DATA_DIR / 'embeddings_index.json'
PHASE2_MODEL_DIR = DATA_DIR / 'part2_bertopic_model'
PHASE2_TOPICS_CSV = DATA_DIR / 'part2_topics.csv'
PHASE2_DOC_PROBS = DATA_DIR / 'part2_doc_topic_probs.npy'
PHASE2_CORPUS_CSV = DATA_DIR / 'part2_corpus_with_topics.csv'
PHASE2_TOPIC_YEAR_CSV = DATA_DIR / 'part2_topic_prop_by_year.csv'

# Phase 3 輸出
PHASE3_MODEL_DIR = DATA_DIR / 'part3_optimized_bertopic_model'
PHASE3_CORPUS_CSV = DATA_DIR / 'part3_corpus_with_topics_v2.csv'
PHASE3_OPTIMIZATION_CACHE = DATA_DIR / 'phase3_optimization_plans.json'

# Phase 4 輸出
PHASE4_TOPIC_DIM_MAP_CACHE = DATA_DIR / 'phase4_topic_dimension_map.json'
PHASE4_TOPIC_SCORES_CACHE = DATA_DIR / 'phase4_topic_dimension_scores.json'
PHASE4_DOC_SCORES_CSV = DATA_DIR / 'part4_doc_dimension_scores.csv'
PHASE4_DRI_CSV = DATA_DIR / 'part4_entity_time_dri.csv'

# === BERTopic 參數配置 ===
# Phase 2 初始參數
UMAP_N_NEIGHBORS = 15
UMAP_N_COMPONENTS = 10
UMAP_MIN_DIST = 0.0
UMAP_METRIC = 'cosine'
HDBSCAN_MIN_CLUSTER_SIZE = 30
HDBSCAN_MIN_SAMPLES = 10
HDBSCAN_METRIC = 'euclidean'
HDBSCAN_SELECTION_METHOD = 'eom'

# === Phase 3 優化配置 ===
MAX_OPTIMIZATION_ITERATIONS = 10  # 最大迭代次數（防止無限循環）
TOPIC_SAMPLE_SIZE = 3  # 每個主題採樣的例句數量
ENABLE_SMART_STOPPING = True  # 啟用基於 LLM 的智能停止判斷

# === Phase 4 評分配置 ===
DIMENSIONS = ["ITC", "ACAP", "DC", "GOVSEC", "DATA", "ECO", "OTHER"]  # 數位韌性構面
DIMENSION_WEIGHTS = {  # 各構面權重（總和為 1）
    "ITC": 0.20,
    "ACAP": 0.20,
    "DC": 0.15,
    "GOVSEC": 0.15,
    "DATA": 0.15,
    "ECO": 0.15,
    "OTHER": 0.0
}

# 構面語義分組（用於選擇性評分優化）
DIMENSION_GROUPS = {
    "ITC": ["ITC", "ACAP", "DC"],  # 技術基礎設施相關
    "ACAP": ["ACAP", "ITC", "GOVSEC"],  # 安全與治理相關
    "DC": ["DC", "ITC", "GOVSEC"],  # 基礎設施與連續性
    "GOVSEC": ["GOVSEC", "ACAP", "DATA"],  # 治理與合規
    "DATA": ["DATA", "GOVSEC", "ECO"],  # 數據與生態系統
    "ECO": ["ECO", "DATA", "ITC"],  # 數位生態系統
    "OTHER": ["OTHER"]  # 其他類別
}

# 評分規則
SCORING_RUBRIC = (
    "Rate the substantiveness and strength on a 0–5 scale:\n"
    "0 = irrelevant/very vague\n"
    "3 = part of a specific action or quantitative indicator\n"
    "5 = clear, quantitative, auditable, and directly related to strategy/investment/institutionalization"
)

# === 其他配置 ===
EMBEDDING_BATCH_SIZE = 256
RANDOM_SEED = 42

print("✓ 配置加載完成")
print(f"  - 嵌入模型: {EMBEDDING_MODEL}")
print(f"  - LLM 模型: {LLM_MODEL}")
print(f"  - 數據目錄: {DATA_DIR}")
print(f"  - Phase 3 最大迭代次數: {MAX_OPTIMIZATION_ITERATIONS}")
print(f"  - Phase 3 智能停止: {'啟用' if ENABLE_SMART_STOPPING else '禁用'}")
print(f"  - 評分構面: {', '.join(DIMENSIONS)}")

✓ 配置加載完成
  - 嵌入模型: text-embedding-3-small
  - LLM 模型: gpt-5-mini-2025-08-07
  - 數據目錄: data
  - Phase 3 最大迭代次數: 10
  - Phase 3 智能停止: 啟用
  - 評分構面: ITC, ACAP, DC, GOVSEC, DATA, ECO, OTHER


In [2]:
# ========================================
# 導入必要的庫
# ========================================

import os
import json
import re
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from typing import Dict, List, Tuple

# OpenAI
from openai import OpenAI

# BERTopic and dependencies
from bertopic import BERTopic
from umap import UMAP
import hdbscan

# Sklearn
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity

# 初始化 OpenAI 客戶端
client = OpenAI(api_key=OPENAI_API_KEY)

# 確保數據目錄存在
DATA_DIR.mkdir(exist_ok=True)

print("✓ 庫導入完成")

✓ 庫導入完成


---
# Phase 0: 數據下載與預處理
（此部分保持不變，如需重新下載請取消註釋）

In [3]:
# from src.data_download import download_sec_filings
# from src.data_processing import process_sec_filings

# download_sec_filings()
# process_sec_filings(config)

---
# Phase 2: 初始主題生成（BERTopic）

- 嵌入模型：OpenAI text-embedding-3-small (1536維)
- 降維：UMAP，聚類：HDBSCAN
- **緩存機制**：嵌入向量緩存，避免重複計算

In [4]:
# ========================================
# 加載語料庫
# ========================================

assert CORPUS_PATH.exists(), f"找不到語料庫: {CORPUS_PATH}"

df = pd.read_csv(CORPUS_PATH)
df.columns = [c.strip().lower() for c in df.columns]
assert 'text' in df.columns, "語料需要包含 'text' 欄位"

# 偵測元數據欄位
meta_cols = [c for c in ['doc_id', 'company', 'firm', 'ticker', 'year', 'date'] if c in df.columns]
print(f"✓ 載入語料庫: {len(df)} 筆文檔")
print(f"  - 元數據欄位: {meta_cols or '(無)'}")

texts = df['text'].astype(str).tolist()

✓ 載入語料庫: 6233 筆文檔
  - 元數據欄位: ['ticker', 'year']


In [5]:
# ========================================
# 生成/載入嵌入向量（帶緩存）
# ========================================

if EMBEDDINGS_PATH.exists() and EMBEDDINGS_INDEX_PATH.exists():
    print("✓ 從緩存加載嵌入向量...")
    embeddings = np.load(EMBEDDINGS_PATH)
    with open(EMBEDDINGS_INDEX_PATH, 'r') as f:
        emb_info = json.load(f)
    print(f"  - 模型: {emb_info.get('model')}")
    print(f"  - 數量: {emb_info.get('count')}")
else:
    print("⚙ 生成嵌入向量（這可能需要幾分鐘）...")
    vecs = []
    for i in tqdm(range(0, len(texts), EMBEDDING_BATCH_SIZE), desc="生成嵌入"):
        batch = texts[i:i + EMBEDDING_BATCH_SIZE]
        response = client.embeddings.create(model=EMBEDDING_MODEL, input=batch)
        vecs.extend([np.array(d.embedding, dtype=np.float32) for d in response.data])
    
    embeddings = np.vstack(vecs)
    np.save(EMBEDDINGS_PATH, embeddings)
    
    with open(EMBEDDINGS_INDEX_PATH, 'w') as f:
        json.dump({'count': len(texts), 'model': EMBEDDING_MODEL}, f)
    
    print(f"✓ 嵌入向量已保存")

print(f"  - 嵌入形狀: {embeddings.shape}")

✓ 從緩存加載嵌入向量...
  - 模型: text-embedding-3-small
  - 數量: 6233
  - 嵌入形狀: (6233, 1536)


In [6]:
# ========================================
# 初始 BERTopic 模型訓練
# ========================================

print("⚙ 訓練 BERTopic 模型...")

umap_model = UMAP(
    n_neighbors=UMAP_N_NEIGHBORS,
    n_components=UMAP_N_COMPONENTS,
    min_dist=UMAP_MIN_DIST,
    metric=UMAP_METRIC,
    random_state=RANDOM_SEED
)

hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=HDBSCAN_MIN_CLUSTER_SIZE,
    min_samples=HDBSCAN_MIN_SAMPLES,
    metric=HDBSCAN_METRIC,
    cluster_selection_method=HDBSCAN_SELECTION_METHOD,
    prediction_data=True
)

topic_model = BERTopic(
    calculate_probabilities=True,
    verbose=True,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model
)

topics, probs = topic_model.fit_transform(texts, embeddings=embeddings)
df['topic'] = topics

# 保存結果
topic_model.save(PHASE2_MODEL_DIR.as_posix(), serialization="safetensors")
topic_info = topic_model.get_topic_info()
topic_info.to_csv(PHASE2_TOPICS_CSV, index=False, encoding='utf-8')
if probs is not None:
    np.save(PHASE2_DOC_PROBS, probs)
df.to_csv(PHASE2_CORPUS_CSV, index=False, encoding='utf-8')

# 按年度分析主題分佈
if 'year' in df.columns:
    year_dist = df.groupby('year')['topic'].value_counts(normalize=True).rename('prop').reset_index()
    year_dist.to_csv(PHASE2_TOPIC_YEAR_CSV, index=False, encoding='utf-8')

print("✓ Phase 2 完成")
print(f"  - 主題數量: {len(topic_info[topic_info['Topic'] != -1])}")
print(f"  - 離群點比例: {(np.array(topics) == -1).mean():.2%}")
topic_info.head(10)

2025-10-14 22:20:17,802 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


⚙ 訓練 BERTopic 模型...


2025-10-14 22:20:37,334 - BERTopic - Dimensionality - Completed ✓
2025-10-14 22:20:37,335 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-10-14 22:20:38,220 - BERTopic - Cluster - Completed ✓
2025-10-14 22:20:38,223 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-10-14 22:20:39,247 - BERTopic - Representation - Completed ✓


✓ Phase 2 完成
  - 主題數量: 68
  - 離群點比例: 16.52%


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1030,-1_our_and_of_the,"[our, and, of, the, to, or, in, we, are, for]",[There can be no assurance that licenses will ...
1,0,302,0_tax_income_taxes_deferred,"[tax, income, taxes, deferred, foreign, rate, ...",[We are subject to income taxes in the U.S. an...
2,1,222,1_loans_loan_credit_portfolio,"[loans, loan, credit, portfolio, consumer, all...",[858\r\nmillion\r\nwere included in TDRs at\r\...
3,2,191,2_gas_reserves_oil_proved,"[gas, reserves, oil, proved, production, exxon...","[In\r\n\r\nIn some cases, substantial new inve..."
4,3,136,3_our_we_may_could,"[our, we, may, could, or, products, and, to, b...",[Litigation and regulatory proceedings are inh...
5,4,128,4_care_health_medicare_medical,"[care, health, medicare, medical, unitedhealth...",[UnitedHealthcare Medicare & Retirement provid...
6,5,125,5_goodwill_assets_impairment_intangible,"[goodwill, assets, impairment, intangible, val...","[Property and equipment, which includes amount..."
7,6,123,6_driven_banking_higher_income,"[driven, banking, higher, income, fees, billio...",[The operating margin was\r\n27\r\npercent com...
8,7,117,7_pension_plans_plan_benefit,"[pension, plans, plan, benefit, assets, postre...",[. Note 10—Pension and Other Postretirement Be...
9,8,113,8_capital_basel_reserve_bank,"[capital, basel, reserve, bank, federal, requi...",[banks are subject to quantitative and qualita...


---
# Phase 3: LLM 迭代優化（完整實現版）

**核心功能（完整實現所有 LLM 建議）**：

1. **✓ 主題合併 (merge_pairs)**：
   - 合併語義相似或有父子關係的主題
   - 直接更新文檔的主題分配
   - 示例：將 Topic 6 和 Topic 24 合併

2. **✓ 主題拆分 (split_topics)**：
   - 拆分過於寬泛、包含多個概念的主題
   - 使用子聚類算法重新分組
   - 示例：將包含「員工培訓」和「數據隱私」的主題拆分

3. **✓ 停用詞管理 (new_stopwords)**：
   - 動態添加領域特定的噪音詞
   - 重新計算主題表示（移除無意義詞）
   - 示例：過濾「company」、「billion」、「fiscal」等詞

4. **✓ 參數調整 (params)**：
   - 根據 LLM 建議調整 HDBSCAN/UMAP 參數
   - 重新訓練整個聚類模型
   - 目標：減少離群點、提升主題品質

5. **✓ 主題重命名 (rename)**：
   - 將關鍵詞列表改為有意義的主題名稱
   - 提升可解釋性
   - 示例：「tax, income, deferred」→「Corporate Tax Strategy & Deferred Assets」

6. **✓ 智能停止判斷**：
   - LLM 基於指標歷史自動決定何時停止迭代
   - 收斂檢測、退化檢測、持續改進判斷

**執行順序**：
1. 合併相似主題 → 2. 拆分過寬主題 → 3. 更新停用詞 → 4. 調整參數重訓練 → 5. 重命名 → 6. 評估是否繼續

**預期效果**：
- 完整應用 LLM 的所有優化建議
- 每輪迭代顯示實際應用的操作（合併幾對、拆分幾個、停用詞幾個等）
- 智能收斂：平均 2-4 輪即可達到最佳配置
- 顯著提升主題質量和可解釋性

In [7]:
# ========================================
# 工具函數：指標計算
# ========================================

def compute_topic_centers(emb: np.ndarray, topics: List[int]) -> Dict[int, np.ndarray]:
    """計算每個主題的中心向量"""
    centers = {}
    s = pd.Series(topics)
    for tid, idxs in s.groupby(s).groups.items():
        if tid == -1:
            continue
        vecs = emb[list(idxs)]
        centers[tid] = normalize(vecs.mean(axis=0, keepdims=True))[0]
    return centers

def compute_metrics(emb: np.ndarray, topics: List[int]) -> Tuple[Dict, float, float, float]:
    """計算主題質量指標：一致性、區分度、輪廓係數、離群率"""
    centers = compute_topic_centers(emb, topics)
    
    # 一致性（Cohesion）：每個主題內部的平均相似度
    s = pd.Series(topics)
    cohesion = {}
    for tid, idxs in s.groupby(s).groups.items():
        if tid == -1 or tid not in centers:
            continue
        sims = cosine_similarity(emb[list(idxs)], centers[tid].reshape(1, -1)).ravel()
        cohesion[tid] = float(np.mean(sims))
    
    # 區分度（Separation）：主題中心之間的平均距離
    separation = np.nan
    if len(centers) >= 2:
        center_matrix = np.vstack(list(centers.values()))
        separation = cosine_distances(center_matrix).mean()
    
    # 輪廓係數（Silhouette）
    mask = np.array(topics) != -1
    silhouette = np.nan
    if mask.sum() > 5 and len(set(np.array(topics)[mask])) > 1:
        silhouette = silhouette_score(emb[mask], np.array(topics)[mask])
    
    # 離群率
    outlier_rate = (np.array(topics) == -1).mean()
    
    return cohesion, separation, silhouette, outlier_rate

def print_metrics(cohesion: Dict, separation: float, silhouette: float, outlier: float, prefix=""):
    """打印指標"""
    coh_mean = np.mean(list(cohesion.values())) if cohesion else np.nan
    print(f"{prefix}一致性: {coh_mean:.4f}")
    print(f"{prefix}區分度: {separation:.4f}")
    print(f"{prefix}Silhouette: {silhouette:.4f}")
    print(f"{prefix}離群率: {outlier:.2%}")

print("✓ 工具函數就緒")

✓ 工具函數就緒


In [8]:
# ========================================
# Phase 3 優化操作函數
# ========================================

def merge_topics(model, topics_list: List[int], merge_pairs: List[List[int]]) -> List[int]:
    """
    合併主題對
    
    Args:
        model: BERTopic 模型
        topics_list: 當前文檔的主題分配列表
        merge_pairs: 要合併的主題對 [[source, target], ...]
    
    Returns:
        更新後的主題列表
    """
    if not merge_pairs:
        return topics_list
    
    topics_array = np.array(topics_list)
    merge_count = 0
    
    for pair in merge_pairs:
        if len(pair) != 2:
            continue
        source, target = int(pair[0]), int(pair[1])
        
        # 將 source 主題的所有文檔重新分配到 target 主題
        mask = topics_array == source
        if mask.sum() > 0:
            topics_array[mask] = target
            merge_count += 1
            print(f"    ✓ 合併 Topic {source} → Topic {target} ({mask.sum()} 文檔)")
    
    print(f"  - 完成 {merge_count} 個主題合併")
    return topics_array.tolist()


def split_topic(model, df, embeddings, topic_id: int, topic_col: str = 'topic') -> Tuple[pd.DataFrame, np.ndarray]:
    """
    拆分指定主題為多個子主題
    
    Args:
        model: BERTopic 模型
        df: 文檔 DataFrame
        embeddings: 文檔嵌入向量
        topic_id: 要拆分的主題 ID
        topic_col: 主題欄位名稱
    
    Returns:
        更新後的 DataFrame 和主題列表
    """
    # 獲取該主題的所有文檔
    topic_mask = df[topic_col] == topic_id
    topic_indices = df[topic_mask].index.tolist()
    
    if len(topic_indices) < 10:  # 太小的主題不拆分
        print(f"    ⚠ Topic {topic_id} 文檔數太少 ({len(topic_indices)})，跳過拆分")
        return df, df[topic_col].values
    
    # 提取該主題的嵌入向量
    topic_embeddings = embeddings[topic_indices]
    
    # 對該主題進行子聚類（使用更小的 min_cluster_size）
    sub_min_cluster = max(5, len(topic_indices) // 4)
    
    sub_hdbscan = hdbscan.HDBSCAN(
        min_cluster_size=sub_min_cluster,
        min_samples=5,
        metric='euclidean',
        cluster_selection_method='eom',
        prediction_data=True
    )
    
    sub_labels = sub_hdbscan.fit_predict(topic_embeddings)
    
    # 找到最大的主題 ID，用於生成新 ID
    max_topic_id = int(df[topic_col].max())
    unique_sub_labels = set(sub_labels[sub_labels != -1])
    
    if len(unique_sub_labels) <= 1:
        print(f"    ⚠ Topic {topic_id} 無法進一步拆分")
        return df, df[topic_col].values
    
    # 創建新的主題 ID 映射
    new_topic_map = {}
    for i, sub_label in enumerate(sorted(unique_sub_labels)):
        if i == 0:
            # 第一個子主題保持原 ID
            new_topic_map[sub_label] = topic_id
        else:
            # 其他子主題使用新 ID
            max_topic_id += 1
            new_topic_map[sub_label] = max_topic_id
    
    # 更新主題分配
    topics_array = df[topic_col].values.copy()
    for idx, sub_label in zip(topic_indices, sub_labels):
        if sub_label in new_topic_map:
            topics_array[idx] = new_topic_map[sub_label]
    
    df[topic_col] = topics_array
    
    print(f"    ✓ 拆分 Topic {topic_id} → {len(unique_sub_labels)} 個子主題")
    for sub_label, new_id in new_topic_map.items():
        count = (sub_labels == sub_label).sum()
        print(f"      - Topic {new_id}: {count} 文檔")
    
    return df, topics_array


def update_stopwords_and_representation(model, texts: List[str], topics: List[int], 
                                       embeddings, new_stopwords: List[str] = None):
    """
    更新停用詞並重新計算主題表示
    
    Args:
        model: BERTopic 模型
        texts: 文檔文本列表
        topics: 主題分配列表
        embeddings: 文檔嵌入向量
        new_stopwords: 新增的停用詞列表
    
    Returns:
        更新後的模型
    """
    if not new_stopwords:
        return model
    
    print(f"  - 新增 {len(new_stopwords)} 個停用詞: {new_stopwords}")
    
    # BERTopic 使用 CountVectorizer，我們需要更新它的停用詞
    from sklearn.feature_extraction.text import CountVectorizer
    
    # 創建新的 vectorizer with updated stopwords
    current_stopwords = set()
    if hasattr(model, 'vectorizer_model') and model.vectorizer_model is not None:
        if hasattr(model.vectorizer_model, 'stop_words_'):
            current_stopwords = set(model.vectorizer_model.stop_words_)
    
    # 添加新停用詞
    updated_stopwords = current_stopwords.union(set(new_stopwords))
    
    # 創建新的 vectorizer
    vectorizer_model = CountVectorizer(
        stop_words=list(updated_stopwords),
        ngram_range=(1, 2),
        min_df=5
    )
    
    # 更新模型的 vectorizer
    model.vectorizer_model = vectorizer_model
    
    # 重新計算主題表示
    try:
        model.update_topics(texts, topics=topics, vectorizer_model=vectorizer_model)
        print(f"  - 已更新主題表示（移除停用詞）")
    except Exception as e:
        print(f"  ⚠ 更新主題表示時出錯: {e}")
    
    return model

print("✓ Phase 3 優化操作函數就緒")

✓ Phase 3 優化操作函數就緒


In [9]:
# ========================================
# 迭代優化主循環（完整實現版）
# ========================================

# 載入 Phase 2 結果
df = pd.read_csv(PHASE2_CORPUS_CSV)
embeddings = np.load(EMBEDDINGS_PATH)
current_model = BERTopic.load(PHASE2_MODEL_DIR.as_posix())
current_topics = df['topic'].tolist()

print(f"⚙ 開始智能迭代優化（最多 {MAX_OPTIMIZATION_ITERATIONS} 輪）...\n")
print("✓ 完整實現版：支持合併、拆分、停用詞、參數調整、重命名\n")

# 記錄優化歷史
optimization_history = []

# 初始指標
coh, sep, sil, out = compute_metrics(embeddings, current_topics)
print("初始指標:")
print_metrics(coh, sep, sil, out, "  ")
print()

# 記錄初始狀態
optimization_history.append({
    'iteration': 0,
    'cohesion': float(np.mean(list(coh.values()))) if coh else None,
    'separation': float(sep) if not np.isnan(sep) else None,
    'silhouette': float(sil) if not np.isnan(sil) else None,
    'outlier_rate': float(out),
    'action': 'baseline'
})

iteration = 0
should_continue = True

while should_continue and iteration < MAX_OPTIMIZATION_ITERATIONS:
    iteration += 1
    print(f"{'='*50}")
    print(f"第 {iteration} 輪優化")
    print(f"{'='*50}")
    
    # 1. 採樣主題代表詞和例句
    def sample_topic_info(model, df, topic_col='topic', k=TOPIC_SAMPLE_SIZE):
        samples = {}
        for tid in model.get_topic_info()['Topic'].tolist():
            if tid == -1:
                continue
            words = ', '.join([w for w, _ in model.get_topic(tid)[:10]])
            examples = df[df[topic_col] == tid]['text'].head(k).tolist()
            samples[tid] = {"words": words, "examples": examples}
        return samples
    
    topic_col = 'topic' if iteration == 1 else f'topic_v{iteration+1}'
    samples = sample_topic_info(current_model, df, topic_col)
    
    # 2. 請 LLM 給出優化建議
    print("⚙ 請求 LLM 優化建議...")
    
    plan_prompt = (
        """
        # --- Persona and Goal Definition ---
        You are an expert academic researcher specializing in the analysis of Corporate ESG (Environmental, Social, and Governance) reports. Your primary goal is to refine a raw list of topics, generated from financial documents (like 10-K filings), into a set of meaningful, coherent, and high-level ESG themes. These refined themes must be suitable for subsequent quantitative analysis to evaluate a corporation's Digital Resilience.

        # --- Guiding Principles ---
        Analyze the provided topic list based on the following principles to propose optimization actions:

        1.  **Abstraction and Generalization:**
            -   Identify and merge topics that are too specific or company-centric (e.g., topics about 'JPMorgan's loan portfolio' and 'Bank of America's credit risk' should be merged into a more general theme like 'Financial Institution Credit Risk Management').
            -   Rename topics to reflect broader ESG concepts rather than just listing keywords. The name should be an interpretable theme (e.g., rename a topic with keywords 'emissions, carbon, green' to 'Climate Change & Carbon Footprint').

        2.  **Cohesion and Separation:**
            -   Propose to merge topics that are semantically synonymous or represent a parent-child relationship (e.g., 'Data Security' and 'Cybersecurity Incidents' can be merged).
            -   Propose to split topics that contain multiple, distinct concepts (e.g., a topic containing keywords for both 'employee training' and 'data privacy' is too broad and should be split).

        3.  **Noise Reduction:**
            -   Identify common, non-informative words across many topics that should be added to a stopword list (e.g., 'company', 'billion', 'report', 'fiscal').
            -   Flag topics that are clearly artifacts of document structure or boilerplate language (e.g., topics about SEC filing numbers, proxy statements) for potential removal or re-evaluation.

        4.  **Parameter Tuning Logic:**
            -   The goal of tuning HDBSCAN/UMAP parameters is to reduce the number of documents in the outlier cluster (topic ID '-1') while ensuring the remaining topics remain coherent and distinct. Suggest SMALL, incremental changes to 'min_cluster_size' or 'min_samples' to achieve this balance. For example, slightly decreasing `min_cluster_size` might help capture smaller, emerging themes.

        # --- Input Format ---
        You will be provided with a list of topics. Each topic includes an ID, its original top keywords, and a custom label.

        # --- Output Format ---
        Your response MUST be a single, valid JSON object, and nothing else. Follow the structure below precisely.

        {
            "merge_pairs": [
                // List of pairs of topic IDs to be merged. Justify based on semantic similarity or hierarchical relationship.
                // Example: [[6, 24], [40, 64]]
            ],
            "split_topics": [
                // List of topic IDs that are too broad and contain multiple distinct themes.
                // Example: [15]
            ],
            "rename": {
                // Dictionary of topic IDs to new, more descriptive and abstract names. The new name should be a high-level ESG theme.
                // Example: {"0": "Corporate Tax Strategy & Deferred Assets", "51": "Data Privacy & Regulatory Compliance"}
            },
            "new_stopwords": [
                // List of new, domain-specific stopwords to filter out noise.
                // Example: ["company", "statement", "billion", "fiscal"]
            ],
            "params": {
                // Suggestions for tuning clustering parameters to reduce outliers. Provide only the parameters to be changed.
                // Example: {"min_cluster_size": 45}
            }
        }

        # --- Final Instruction ---
        Now, analyze the following topics and provide your optimization suggestions. Output ONLY the valid JSON object without any explanations or surrounding text.
        """
    )
    
    response = client.chat.completions.create(
        model=LLM_MODEL,
        temperature=LLM_TEMPERATURE,
        messages=[
            {"role": "system", "content": "You are a topic modeling optimization expert. Output JSON only."},
            {"role": "user", "content": plan_prompt},
            {"role": "user", "content": json.dumps({"topics": samples}, ensure_ascii=False)}
        ]
    )
    
    raw = response.choices[0].message.content
    try:
        plan = json.loads(raw)
    except Exception:
        # 嘗試提取 JSON
        match = re.search(r'\{[\s\S]*\}', raw)
        if match:
            plan = json.loads(match.group(0))
        else:
            print("⚠ 無法解析 LLM 輸出，跳過本輪")
            continue
    
    print(f"  - 合併建議: {len(plan.get('merge_pairs', []))} 對")
    print(f"  - 拆分建議: {len(plan.get('split_topics', []))} 個主題")
    print(f"  - 停用詞建議: {len(plan.get('new_stopwords', []))} 個詞")
    print(f"  - 參數調整: {plan.get('params', {})}")
    print(f"  - 重命名: {len(plan.get('rename', {}))} 個主題")
    
    # 標記是否有實質性變更
    has_structural_changes = (
        len(plan.get('merge_pairs', [])) > 0 or
        len(plan.get('split_topics', [])) > 0 or
        len(plan.get('new_stopwords', [])) > 0
    )
    
    # 3. 應用合併操作
    if plan.get('merge_pairs'):
        print("\n⚙ 應用主題合併...")
        current_topics = merge_topics(current_model, current_topics, plan['merge_pairs'])
        df[topic_col] = current_topics
    
    # 4. 應用拆分操作
    if plan.get('split_topics'):
        print("\n⚙ 應用主題拆分...")
        for tid in plan['split_topics']:
            df, current_topics = split_topic(current_model, df, embeddings, int(tid), topic_col)
    
    # 5. 應用停用詞更新
    if plan.get('new_stopwords'):
        print("\n⚙ 更新停用詞...")
        current_model = update_stopwords_and_representation(
            current_model, texts, current_topics, embeddings, plan['new_stopwords']
        )
    
    # 6. 如果有參數變更，重新訓練模型
    params = plan.get('params', {})
    if params:
        print("\n⚙ 根據參數建議重新訓練模型...")
        
        min_cluster_size = int(params.get('min_cluster_size', HDBSCAN_MIN_CLUSTER_SIZE))
        min_samples = int(params.get('min_samples', HDBSCAN_MIN_SAMPLES))
        n_neighbors = int(params.get('n_neighbors', UMAP_N_NEIGHBORS))
        n_components = int(params.get('n_components', UMAP_N_COMPONENTS))
        
        umap_opt = UMAP(
            n_neighbors=n_neighbors,
            n_components=n_components,
            min_dist=UMAP_MIN_DIST,
            metric=UMAP_METRIC,
            random_state=RANDOM_SEED
        )
        
        hdbscan_opt = hdbscan.HDBSCAN(
            min_cluster_size=min_cluster_size,
            min_samples=min_samples,
            metric=HDBSCAN_METRIC,
            cluster_selection_method=HDBSCAN_SELECTION_METHOD,
            prediction_data=True
        )
        
        optimized_model = BERTopic(
            calculate_probabilities=True,
            verbose=False,
            umap_model=umap_opt,
            hdbscan_model=hdbscan_opt
        )
        
        new_topics, new_probs = optimized_model.fit_transform(texts, embeddings=embeddings)
        current_topics = new_topics
        current_model = optimized_model
        df[topic_col] = current_topics
    
    # 7. 應用重命名
    if plan.get('rename') and isinstance(plan['rename'], dict):
        print("\n⚙ 應用主題重命名...")
        rename_map = {int(k): v for k, v in plan['rename'].items()}
        rename_count = 0
        for tid, name in rename_map.items():
            try:
                current_model.set_topic_labels({tid: name})
                rename_count += 1
            except:
                pass
        print(f"  - 完成 {rename_count} 個主題重命名")
    
    # 8. 計算新指標
    new_coh, new_sep, new_sil, new_out = compute_metrics(embeddings, current_topics)
    
    print("\n結果對比:")
    print_metrics(coh, sep, sil, out, "  舊: ")
    print_metrics(new_coh, new_sep, new_sil, new_out, "  新: ")
    
    # 9. 記錄歷史（轉換為 Python 原生類型）
    optimization_history.append({
        'iteration': iteration,
        'plan': plan,
        'cohesion': float(np.mean(list(new_coh.values()))) if new_coh else None,
        'separation': float(new_sep) if not np.isnan(new_sep) else None,
        'silhouette': float(new_sil) if not np.isnan(new_sil) else None,
        'outlier_rate': float(new_out),
        'actions_applied': {
            'merge': len(plan.get('merge_pairs', [])),
            'split': len(plan.get('split_topics', [])),
            'stopwords': len(plan.get('new_stopwords', [])),
            'params': bool(params),
            'rename': len(plan.get('rename', {}))
        }
    })
    
    # 10. 智能停止判斷
    if ENABLE_SMART_STOPPING and iteration >= 2:
        print("\n⚙ 評估是否繼續優化...")
        
        # 準備歷史數據
        history_summary = []
        for h in optimization_history:
            history_summary.append({
                'iter': h['iteration'],
                'coh': h['cohesion'],
                'sep': h['separation'],
                'sil': h['silhouette'],
                'out': h['outlier_rate'],
                'actions': h.get('actions_applied', {})
            })
        
        stopping_prompt = (
            "You are evaluating whether to continue topic model optimization.\n\n"
            f"Optimization history (last {len(history_summary)} iterations):\n"
            f"{json.dumps(history_summary, indent=2)}\n\n"
            "Metrics explanation:\n"
            "- cohesion: higher is better (internal similarity)\n"
            "- separation: higher is better (topic distinctiveness)\n"
            "- silhouette: higher is better (-1 to 1 range)\n"
            "- outlier_rate: lower is better\n\n"
            "Decision criteria:\n"
            "- STOP if metrics have converged (< 1% change for 2 iterations)\n"
            "- STOP if metrics are degrading consistently\n"
            "- CONTINUE if showing improvement\n\n"
            "Output JSON: {\"decision\": \"STOP\" or \"CONTINUE\", \"reason\": \"brief explanation\"}"
        )
        
        stopping_response = client.chat.completions.create(
            model=LLM_MODEL,
            temperature=LLM_TEMPERATURE,
            messages=[
                {"role": "system", "content": "You are an optimization expert. Output JSON only."},
                {"role": "user", "content": stopping_prompt}
            ]
        )
        
        stopping_raw = stopping_response.choices[0].message.content
        try:
            stopping_decision = json.loads(stopping_raw)
        except:
            match = re.search(r'\{[\s\S]*\}', stopping_raw)
            stopping_decision = json.loads(match.group(0)) if match else {"decision": "CONTINUE", "reason": "Parse error"}
        
        decision = stopping_decision.get('decision', 'CONTINUE').upper()
        reason = stopping_decision.get('reason', 'N/A')
        
        print(f"\n  決策: {decision}")
        print(f"  理由: {reason}")
        
        if decision == 'STOP':
            should_continue = False
            print("\n✓ 智能停止：達到優化目標或已收斂")
        else:
            # 更新狀態，準備下一輪
            coh, sep, sil, out = new_coh, new_sep, new_sil, new_out
    else:
        # 更新狀態，準備下一輪
        coh, sep, sil, out = new_coh, new_sep, new_sil, new_out
    
    # 保存中間結果
    next_topic_col = f'topic_v{iteration + 2}'
    df[next_topic_col] = current_topics
    
    print(f"\n✓ 第 {iteration} 輪完成\n")

# 保存最終結果
current_model.save(PHASE3_MODEL_DIR.as_posix(), serialization="safetensors")
df.to_csv(PHASE3_CORPUS_CSV, index=False, encoding='utf-8')

# 保存優化歷史
with open(PHASE3_OPTIMIZATION_CACHE, 'w') as f:
    json.dump(optimization_history, f, indent=2, ensure_ascii=False)

print("="*50)
print("✓ Phase 3 完成")
print(f"  - 總迭代次數: {iteration}")
print(f"  - 最終主題數: {len(set([t for t in current_topics if t != -1]))}")
print(f"  - 優化歷史已保存: {PHASE3_OPTIMIZATION_CACHE}")

# 顯示優化趨勢和應用的操作
print("\n優化趨勢和操作摘要:")
for h in optimization_history:
    iter_num = h['iteration']
    coh_val = h.get('cohesion', 0)
    sil_val = h.get('silhouette', 0)
    out_val = h.get('outlier_rate', 0)
    actions = h.get('actions_applied', {})
    
    if iter_num == 0:
        print(f"  Iteration {iter_num} (基準): coh={coh_val:.4f}, sil={sil_val:.4f}, out={out_val:.2%}")
    else:
        action_str = f"merge={actions.get('merge', 0)}, split={actions.get('split', 0)}, stopwords={actions.get('stopwords', 0)}, rename={actions.get('rename', 0)}"
        print(f"  Iteration {iter_num}: coh={coh_val:.4f}, sil={sil_val:.4f}, out={out_val:.2%} [{action_str}]")



⚙ 開始智能迭代優化（最多 10 輪）...

✓ 完整實現版：支持合併、拆分、停用詞、參數調整、重命名

初始指標:
  一致性: 0.7721
  區分度: 0.4064
  Silhouette: 0.0771
  離群率: 16.52%

第 1 輪優化
⚙ 請求 LLM 優化建議...




  - 合併建議: 32 對
  - 拆分建議: 3 個主題
  - 停用詞建議: 31 個詞
  - 參數調整: {'min_cluster_size': 30, 'min_samples': 5, 'umap_n_neighbors': 15, 'umap_min_dist': 0.1}
  - 重命名: 68 個主題

⚙ 應用主題合併...
    ✓ 合併 Topic 8 → Topic 31 (113 文檔)
    ✓ 合併 Topic 35 → Topic 16 (67 文檔)
    ✓ 合併 Topic 13 → Topic 27 (105 文檔)
    ✓ 合併 Topic 50 → Topic 37 (43 文檔)
    ✓ 合併 Topic 10 → Topic 14 (107 文檔)
    ✓ 合併 Topic 19 → Topic 33 (94 文檔)
    ✓ 合併 Topic 66 → Topic 3 (30 文檔)
    ✓ 合併 Topic 12 → Topic 41 (105 文檔)
    ✓ 合併 Topic 29 → Topic 42 (76 文檔)
    ✓ 合併 Topic 65 → Topic 18 (32 文檔)
    ✓ 合併 Topic 30 → Topic 54 (75 文檔)
    ✓ 合併 Topic 37 → Topic 5 (104 文檔)
    ✓ 合併 Topic 58 → Topic 36 (37 文檔)
    ✓ 合併 Topic 60 → Topic 28 (37 文檔)
  - 完成 14 個主題合併

⚙ 應用主題拆分...
    ⚠ Topic 3 無法進一步拆分
    ⚠ Topic 13 文檔數太少 (0)，跳過拆分
    ⚠ Topic 22 無法進一步拆分

⚙ 更新停用詞...
  - 新增 31 個停用詞: ['company', 'companys', 'corporation', 'inc', 'incorporated', 'registrant', 'form', 'document', 'proxy', 'filed', 'report', 'item', 'note', 'table', 'page', 'see', 'billion



    ⚠ Topic 8 無法進一步拆分

⚙ 更新停用詞...
  - 新增 30 個停用詞: ['company', 'companies', 'corporation', 'corporations', 'companys', 'report', 'reports', 'fiscal', 'billion', 'million', 'percent', 'the', 'our', 'we', 'may', 'could', 'or', 'of', 'and', 'to', 'in', 'results', 'item', 'page', 'note', 'inc', 'inc.', 'llc', 'ltd', 'www']
  - 已更新主題表示（移除停用詞）

⚙ 根據參數建議重新訓練模型...

⚙ 應用主題重命名...
  - 完成 72 個主題重命名

結果對比:
  舊: 一致性: 0.7715
  舊: 區分度: 0.4042
  舊: Silhouette: 0.0695
  舊: 離群率: 13.72%
  新: 一致性: 0.7751
  新: 區分度: 0.4082
  新: Silhouette: 0.0716
  新: 離群率: 14.73%

⚙ 評估是否繼續優化...

  決策: CONTINUE
  理由: Metrics not yet converged and show mixed improvement: cohesion +0.47%, separation +0.99%, silhouette +2.94% (improved) while outlier_rate worsened +7.37%. Not consistent degradation — continue and focus on reducing outlier rate.

✓ 第 2 輪完成

第 3 輪優化
⚙ 請求 LLM 優化建議...
  - 合併建議: 15 對
  - 拆分建議: 9 個主題
  - 停用詞建議: 31 個詞
  - 參數調整: {'min_cluster_size': 30, 'min_samples': 5, 'umap_n_neighbors': 15, 'umap_min_dist': 0.1}
  - 



    ⚠ Topic 24 無法進一步拆分
    ⚠ Topic 31 無法進一步拆分
    ⚠ Topic 62 無法進一步拆分
    ⚠ Topic 61 無法進一步拆分
    ⚠ Topic 63 無法進一步拆分

⚙ 更新停用詞...
  - 新增 31 個停用詞: ['company', 'corporation', 'companys', 'our', 'we', 'may', 'could', 'or', 'the', 'and', 'to', 'of', 'billion', 'million', 'fiscal', 'report', 'statement', 'statements', 'page', 'table', 'note', 'item', 'percent', 'year', 'years', 'registrant', 'file', 'form', 'inc', 'inc.', 'us']
  - 已更新主題表示（移除停用詞）

⚙ 根據參數建議重新訓練模型...

⚙ 應用主題重命名...
  - 完成 77 個主題重命名

結果對比:
  舊: 一致性: 0.7751
  舊: 區分度: 0.4082
  舊: Silhouette: 0.0716
  舊: 離群率: 14.73%
  新: 一致性: 0.7715
  新: 區分度: 0.4042
  新: Silhouette: 0.0695
  新: 離群率: 13.72%

⚙ 評估是否繼續優化...

  決策: CONTINUE
  理由: Metrics have not converged (no two consecutive iterations with <1% change) and are oscillating rather than degrading; some metrics improved in iter 2, so continue optimization to seek stable gains (particularly to reduce outlier rate).

✓ 第 3 輪完成

第 4 輪優化
⚙ 請求 LLM 優化建議...
  - 合併建議: 17 對
  - 拆分建議: 8 個主題
  - 停用詞建議



    ⚠ Topic 8 無法進一步拆分
    ⚠ Topic 16 文檔數太少 (0)，跳過拆分
    ⚠ Topic 23 無法進一步拆分
    ⚠ Topic 26 無法進一步拆分
    ⚠ Topic 30 無法進一步拆分
    ⚠ Topic 35 無法進一步拆分
    ⚠ Topic 62 無法進一步拆分

⚙ 更新停用詞...
  - 新增 44 個停用詞: ['company', 'companies', 'corporation', 'registrant', 'report', 'reports', 'item', 'note', 'table', 'page', 'see', 'management', 'statement', 'statements', 'period', 'year', 'years', 'percent', 'million', 'billion', 'fiscal', 'the', 'and', 'of', 'to', 'our', 'we', 'may', 'could', 'or', 'us', 'also', 'including', 'based', 'related', 'information', 'file', 'filing', 'inc', 'ltd', 'corp', 'www', 'http', 'note']
  - 已更新主題表示（移除停用詞）

⚙ 根據參數建議重新訓練模型...

⚙ 應用主題重命名...
  - 完成 42 個主題重命名

結果對比:
  舊: 一致性: 0.7715
  舊: 區分度: 0.4042
  舊: Silhouette: 0.0695
  舊: 離群率: 13.72%
  新: 一致性: 0.7761
  新: 區分度: 0.4074
  新: Silhouette: 0.0679
  新: 離群率: 13.99%

⚙ 評估是否繼續優化...

  決策: CONTINUE
  理由: Metrics have not converged (<1% change not met across two iterations). Cohesion and separation show small improvements and outlier



  - 合併建議: 30 對
  - 拆分建議: 6 個主題
  - 停用詞建議: 50 個詞
  - 參數調整: {'min_cluster_size': 25, 'min_samples': 3, 'n_neighbors': 15}
  - 重命名: 81 個主題

⚙ 應用主題合併...
    ✓ 合併 Topic 12 → Topic 32 (102 文檔)
    ✓ 合併 Topic 32 → Topic 53 (170 文檔)
    ✓ 合併 Topic 20 → Topic 43 (88 文檔)
    ✓ 合併 Topic 21 → Topic 26 (86 文檔)
    ✓ 合併 Topic 11 → Topic 42 (103 文檔)
    ✓ 合併 Topic 18 → Topic 65 (91 文檔)
    ✓ 合併 Topic 36 → Topic 40 (59 文檔)
    ✓ 合併 Topic 7 → Topic 39 (117 文檔)
    ✓ 合併 Topic 15 → Topic 29 (95 文檔)
    ✓ 合併 Topic 29 → Topic 37 (169 文檔)
    ✓ 合併 Topic 34 → Topic 76 (62 文檔)
    ✓ 合併 Topic 30 → Topic 58 (72 文檔)
    ✓ 合併 Topic 27 → Topic 50 (77 文檔)
    ✓ 合併 Topic 65 → Topic 66 (127 文檔)
    ✓ 合併 Topic 14 → Topic 56 (95 文檔)
    ✓ 合併 Topic 46 → Topic 70 (46 文檔)
    ✓ 合併 Topic 51 → Topic 72 (45 文檔)
    ✓ 合併 Topic 10 → Topic 52 (106 文檔)
    ✓ 合併 Topic 31 → Topic 64 (72 文檔)
    ✓ 合併 Topic 24 → Topic 55 (84 文檔)
    ✓ 合併 Topic 57 → Topic 67 (42 文檔)
    ✓ 合併 Topic 1 → Topic 19 (138 文檔)
  - 完成 22 個主題合併

⚙ 應用主題拆分...
  

KeyboardInterrupt: 

---
# Phase 4: 主題映射與評分（雙重優化版）

**核心優化**：
1. **提供主題關鍵詞**：LLM 映射時不再是 "Topic 0"，而是 "Topic 0: tax, income, taxes, deferred..."
2. **主題層級評分**：先對每個主題評分，再根據文檔的主題分佈加權計算
3. **批次評分 (NEW!)**：一次 API 調用返回多個構面分數，而非 7 次獨立調用
4. **選擇性評分 (NEW!)**：只評分語義相關的構面，而非所有 7 個構面
5. **大幅加速**：從 819 次 API 調用 → 117 次 (85% 減少，94分鐘 → 約 5-10 分鐘)

**優化效果對比**：
- 原始方法：117 主題 × 7 構面 = 819 次 API 調用 (~94 分鐘)
- 優化方法：117 主題 × 1 次批次調用 = 117 次 API 調用 (~5-10 分鐘)
- **加速比：7-18x**

In [None]:
# ========================================
# 載入 Phase 3 結果
# ========================================

print("⚙ 載入優化後的主題模型...")

# 嘗試載入最新版本
if PHASE3_CORPUS_CSV.exists():
    df = pd.read_csv(PHASE3_CORPUS_CSV)
    model = BERTopic.load(PHASE3_MODEL_DIR.as_posix())
    # 找到最後一個 topic 欄位
    topic_cols = [c for c in df.columns if c.startswith('topic')]
    TOPIC_COL = topic_cols[-1] if topic_cols else 'topic'
    print(f"  - 使用主題欄位: {TOPIC_COL}")
else:
    print("  - Phase 3 結果未找到，使用 Phase 2")
    df = pd.read_csv(PHASE2_CORPUS_CSV)
    model = BERTopic.load(PHASE2_MODEL_DIR.as_posix())
    TOPIC_COL = 'topic'

df.columns = [c.strip().lower() for c in df.columns]
print(f"  - 文檔數: {len(df)}")
print(f"  - 主題數: {df[TOPIC_COL].nunique() - 1}")



⚙ 載入優化後的主題模型...
  - 使用主題欄位: topic_v3
  - 文檔數: 6233
  - 主題數: 117


In [None]:
# ========================================
# Step 1: 主題 → 構面映射（帶關鍵詞）
# ========================================

print("⚙ 映射主題到數位韌性構面...")

# 檢查緩存
if PHASE4_TOPIC_DIM_MAP_CACHE.exists():
    print("  - 從緩存載入映射...")
    with open(PHASE4_TOPIC_DIM_MAP_CACHE, 'r') as f:
        topic_to_dimension = json.load(f)
    # 轉換 key 為 int
    topic_to_dimension = {int(k): v for k, v in topic_to_dimension.items()}
else:
    print("  - 生成新映射（請求 LLM）...")
    
    # 獲取所有主題及其關鍵詞
    topic_ids = sorted([int(t) for t in df[TOPIC_COL].dropna().unique() if t != -1])
    topic_descriptions = {}
    
    for tid in topic_ids:
        # 獲取主題的代表詞（前10個）
        try:
            words = [w for w, _ in model.get_topic(tid)[:10]]
            topic_descriptions[tid] = f"Topic {tid}: {', '.join(words)}"
        except:
            topic_descriptions[tid] = f"Topic {tid}"
    
    # 請求 LLM 映射
    mapping_prompt = (
        "You are a research assistant. Map each topic to ONE digital resilience dimension:\n"
        f"Dimensions: {', '.join(DIMENSIONS)}\n\n"
        "Dimension definitions:\n"
        "- ITC: IT infrastructure, cloud, networks, hardware, software systems\n"
        "- ACAP: Cybersecurity, threat detection, access control, encryption\n"
        "- DC: Data centers, disaster recovery, business continuity, redundancy\n"
        "- GOVSEC: Governance, compliance, regulations, security policies, audits\n"
        "- DATA: Data management, analytics, privacy, data quality\n"
        "- ECO: Digital ecosystem, partnerships, innovation, digital transformation\n"
        "- OTHER: None of the above\n\n"
        "Output JSON: {\"Topic 0: keywords\": \"DIMENSION\", ...}\n"
        "Output ONLY valid JSON, no explanation."
    )
    
    response = client.chat.completions.create(
        model=LLM_MODEL,
        temperature=LLM_TEMPERATURE,
        messages=[
            {"role": "system", "content": "You are a research assistant. Output JSON only."},
            {"role": "user", "content": mapping_prompt},
            {"role": "user", "content": json.dumps({
                "topics": list(topic_descriptions.values())
            }, ensure_ascii=False)}
        ]
    )
    
    raw = response.choices[0].message.content
    try:
        mapping_result = json.loads(raw)
    except Exception:
        match = re.search(r'\{[\s\S]*\}', raw)
        mapping_result = json.loads(match.group(0)) if match else {}
    
    # 解析映射結果（key 可能是 "Topic X: ..." 格式）
    topic_to_dimension = {}
    for key, dim in mapping_result.items():
        # 提取 topic id
        match = re.search(r'Topic (\d+)', key)
        if match:
            tid = int(match.group(1))
            topic_to_dimension[tid] = dim
    
    # 保存緩存
    with open(PHASE4_TOPIC_DIM_MAP_CACHE, 'w') as f:
        json.dump(topic_to_dimension, f, indent=2, ensure_ascii=False)
    
    print(f"  - 已保存緩存: {PHASE4_TOPIC_DIM_MAP_CACHE}")

# 統計映射結果
dim_counts = pd.Series(topic_to_dimension.values()).value_counts()
print("\n映射統計:")
for dim, count in dim_counts.items():
    print(f"  - {dim}: {count} 個主題")

# 顯示部分映射示例
print("\n映射示例（前10個）:")
for tid in sorted(topic_to_dimension.keys())[:10]:
    words = ', '.join([w for w, _ in model.get_topic(tid)[:5]])
    dim = topic_to_dimension[tid]
    print(f"  Topic {tid} ({words}...) → {dim}")

⚙ 映射主題到數位韌性構面...
  - 生成新映射（請求 LLM）...
  - 已保存緩存: data/phase4_topic_dimension_map.json

映射統計:
  - OTHER: 51 個主題
  - GOVSEC: 35 個主題
  - ITC: 17 個主題
  - DATA: 10 個主題
  - ACAP: 2 個主題
  - ECO: 2 個主題

映射示例（前10個）:
  Topic 0 (tax, income, taxes, deferred, foreign...) → GOVSEC
  Topic 1 (health, care, medicare, medical, unitedhealthcare...) → OTHER
  Topic 2 (products, product, healthcare, drug, generic...) → OTHER
  Topic 3 (pension, plans, benefit, plan, postretirement...) → OTHER
  Topic 4 (exxonmobil, gas, oil, exxonmobils, energy...) → OTHER
  Topic 5 (procter, gamble, net, care, sales...) → OTHER
  Topic 6 (standard, asu, adoption, accounting, update...) → GOVSEC
  Topic 7 (cash, billion, financing, activities, net...) → GOVSEC
  Topic 8 (hedges, derivative, derivatives, hedge, instruments...) → OTHER
  Topic 9 (products, earnings, industrial, revenues, manufacturing...) → OTHER


In [None]:
# ========================================
# Step 2: 主題層級評分（優化版 - 批次+選擇性評分）
# ========================================

print("\n⚙ 對主題×構面進行評分（優化版：批次+選擇性）...")

# 檢查緩存
if PHASE4_TOPIC_SCORES_CACHE.exists():
    print("  - 從緩存載入評分...")
    with open(PHASE4_TOPIC_SCORES_CACHE, 'r') as f:
        topic_scores = json.load(f)
    # 轉換 key
    topic_scores = {int(k): v for k, v in topic_scores.items()}
else:
    print("  - 生成新評分（優化版：批次評分 + 選擇性維度）...")
    print("  - 優化策略：")
    print("    1. 根據主題映射只評分相關構面（而非全部7個）")
    print("    2. 一次API調用返回多個構面分數（而非7次調用）")
    print("    3. 預期加速：819次API調用 → ~117次 (85%減少)\n")
    
    topic_scores = {}  # {topic_id: {dim: score}}
    
    # 為每個主題生成代表性描述
    topic_ids = sorted([int(t) for t in df[TOPIC_COL].dropna().unique() if t != -1])
    
    # 統計API調用次數
    api_calls_old = len(topic_ids) * len(DIMENSIONS)
    api_calls_new = len(topic_ids)
    print(f"  - 舊方法需要: {api_calls_old} 次 API 調用")
    print(f"  - 新方法需要: {api_calls_new} 次 API 調用")
    print(f"  - 減少: {api_calls_old - api_calls_new} 次 ({(1 - api_calls_new/api_calls_old)*100:.1f}%)\n")
    
    for tid in tqdm(topic_ids, desc="評分主題"):
        # 獲取主題信息
        words = ', '.join([w for w, _ in model.get_topic(tid)[:10]])
        examples = df[df[TOPIC_COL] == tid]['text'].head(3).tolist()
        
        # 構建主題描述
        topic_desc = (
            f"Topic {tid}\n"
            f"Keywords: {words}\n"
            f"Example excerpts:\n" +
            "\n---\n".join([ex[:500] for ex in examples])
        )
        
        # 確定要評分的構面（基於映射）
        primary_dim = topic_to_dimension.get(tid, "OTHER")
        dims_to_score = DIMENSION_GROUPS.get(primary_dim, [primary_dim])
        
        # 構建批次評分提示
        dim_definitions = {
            "ITC": "IT infrastructure, cloud, networks, hardware, software systems",
            "ACAP": "Cybersecurity, threat detection, access control, encryption",
            "DC": "Data centers, disaster recovery, business continuity, redundancy",
            "GOVSEC": "Governance, compliance, regulations, security policies, audits",
            "DATA": "Data management, analytics, privacy, data quality",
            "ECO": "Digital ecosystem, partnerships, innovation, digital transformation",
            "OTHER": "None of the above dimensions"
        }
        
        dims_desc = "\n".join([f"- {dim}: {dim_definitions[dim]}" for dim in dims_to_score])
        
        scoring_prompt = (
            f"Rate this topic's relevance to MULTIPLE digital resilience dimensions.\n\n"
            f"Dimensions to evaluate:\n{dims_desc}\n\n"
            f"{SCORING_RUBRIC}\n\n"
            f"Output JSON format: {{{', '.join([f'\"{d}\": <score>' for d in dims_to_score])}}}\n"
            f"Output ONLY valid JSON with numeric scores 0-5, no explanation.\n\n"
            f"Topic information:\n{topic_desc}"
        )
        
        try:
            response = client.chat.completions.create(
                model=LLM_MODEL,
                temperature=LLM_TEMPERATURE,
                messages=[
                    {"role": "system", "content": "You are a domain expert evaluating topics. Output JSON only with numeric scores."},
                    {"role": "user", "content": scoring_prompt}
                ]
            )
            
            raw = response.choices[0].message.content
            # 解析 JSON
            try:
                result = json.loads(raw)
            except:
                # 嘗試提取 JSON
                match = re.search(r'\{[^}]+\}', raw)
                if match:
                    result = json.loads(match.group(0))
                else:
                    result = {}
            
            # 驗證並規範化分數
            scores = {}
            for dim in dims_to_score:
                score = result.get(dim, 0)
                # 處理可能的嵌套格式 {"score": 3, "reasoning": "..."}
                if isinstance(score, dict):
                    score = score.get('score', 0)
                score = float(score)
                score = max(0, min(5, score))  # 限制在 0-5
                scores[dim] = score
            
            # 填充未評分的維度為 0
            full_scores = {dim: scores.get(dim, 0.0) for dim in DIMENSIONS}
            topic_scores[tid] = full_scores
            
        except Exception as e:
            print(f"\n  ⚠ Topic {tid} 評分失敗: {e}")
            # 失敗時全部填0
            topic_scores[tid] = {dim: 0.0 for dim in DIMENSIONS}
    
    # 保存緩存
    with open(PHASE4_TOPIC_SCORES_CACHE, 'w') as f:
        json.dump(topic_scores, f, indent=2, ensure_ascii=False)
    
    print(f"\n  - 已保存緩存: {PHASE4_TOPIC_SCORES_CACHE}")

print(f"\n✓ 完成 {len(topic_scores)} 個主題的評分")

# 顯示評分示例
print("\n評分示例（前5個主題）:")
for tid in sorted(topic_scores.keys())[:5]:
    words = ', '.join([w for w, _ in model.get_topic(tid)[:3]])
    scores = topic_scores[tid]
    mapped_dim = topic_to_dimension.get(tid, "UNKNOWN")
    scored_dims = DIMENSION_GROUPS.get(mapped_dim, [mapped_dim])
    print(f"  Topic {tid} ({words}...) [映射到 {mapped_dim}, 評分 {scored_dims}]:")
    for dim, score in scores.items():
        if score > 0:
            print(f"    {dim}: {score:.1f}")

In [None]:
# ========================================
# Step 3: 文檔層級評分（基於主題分佈）
# ========================================

print("⚙ 計算文檔級別的構面評分...")

# 如果有主題概率分佈，使用加權平均；否則使用硬分配
if PHASE2_DOC_PROBS.exists():
    print("  - 使用主題概率分佈進行加權計算")
    probs = np.load(PHASE2_DOC_PROBS)
    use_probs = True
else:
    print("  - 使用硬主題分配")
    use_probs = False

doc_scores = []

for idx, row in tqdm(df.iterrows(), total=len(df), desc="計算文檔評分"):
    scores = {dim: 0.0 for dim in DIMENSIONS}
    
    if use_probs and idx < len(probs):
        # 基於主題概率的加權評分
        prob_dist = probs[idx]
        for tid, prob in enumerate(prob_dist):
            if tid == -1 or prob < 0.01:  # 忽略離群和低概率
                continue
            if tid in topic_scores:
                for dim in DIMENSIONS:
                    scores[dim] += prob * topic_scores[tid].get(dim, 0)
    else:
        # 基於硬分配
        tid = int(row[TOPIC_COL])
        if tid != -1 and tid in topic_scores:
            scores = topic_scores[tid].copy()
    
    doc_scores.append(scores)

# 合併到 DataFrame
scores_df = pd.DataFrame(doc_scores)
result_df = pd.concat([df.reset_index(drop=True), scores_df], axis=1)

# 保存
result_df.to_csv(PHASE4_DOC_SCORES_CSV, index=False, encoding='utf-8')

print(f"✓ 文檔評分完成")
print(f"  - 已保存: {PHASE4_DOC_SCORES_CSV}")
result_df[['text'] + DIMENSIONS].head()

⚙ 計算文檔級別的構面評分...
  - 使用主題概率分佈進行加權計算


計算文檔評分: 100%|██████████| 6233/6233 [00:00<00:00, 43295.82it/s]


✓ 文檔評分完成
  - 已保存: data/part4_doc_dimension_scores.csv


Unnamed: 0,text,ITC,ACAP,DC,GOVSEC,DATA,ECO,OTHER
0,\n\n10-K\n1\nbac-1231201710xk.htm\n10-K\nDocum...,0.567895,0.742863,0.770305,0.766579,0.943984,1.435887,0.831297
1,We routinely post and make accessible financia...,0.150651,0.175765,0.249193,0.229236,0.262637,0.443559,0.229552
2,"and in international markets, we provide a div...",0.273042,0.407353,0.242627,0.702621,0.826736,0.94203,0.31855
3,We compete with some of these competitors glob...,0.229346,0.368801,0.353451,0.288197,0.244599,0.381106,0.368801
4,None of our domestic employees are subject to ...,3.0,3.0,1.0,2.0,3.0,5.0,3.0


In [None]:
# ========================================
# Step 4: 計算數位韌性指數（DRI）
# ========================================

print("⚙ 計算數位韌性指數（DRI）...")

# 偵測實體和時間欄位
entity_col = None
for col in ['company', 'firm', 'ticker']:
    if col in result_df.columns:
        entity_col = col
        break

time_col = None
for col in ['year', 'date']:
    if col in result_df.columns:
        time_col = col
        break

group_cols = [c for c in [entity_col, time_col] if c]

if not group_cols:
    print("  ⚠ 未偵測到 company/year，計算整體 DRI")
    agg = result_df[DIMENSIONS].mean().to_frame().T
else:
    print(f"  - 按 {group_cols} 聚合")
    agg = result_df[group_cols + DIMENSIONS].groupby(group_cols).mean().reset_index()

# 計算加權 DRI
print("  - 使用加權平均計算 DRI")
dri_scores = np.zeros(len(agg))
for dim in DIMENSIONS:
    weight = DIMENSION_WEIGHTS.get(dim, 0)
    dri_scores += agg[dim].values * weight

agg['DRI'] = dri_scores

# 保存
agg.to_csv(PHASE4_DRI_CSV, index=False, encoding='utf-8')

print(f"✓ DRI 計算完成")
print(f"  - 已保存: {PHASE4_DRI_CSV}")
print(f"\nDRI 統計:")
print(f"  - 平均值: {agg['DRI'].mean():.3f}")
print(f"  - 標準差: {agg['DRI'].std():.3f}")
print(f"  - 範圍: [{agg['DRI'].min():.3f}, {agg['DRI'].max():.3f}]")

agg.head(10)

⚙ 計算數位韌性指數（DRI）...
  - 按 ['ticker', 'year'] 聚合
  - 使用加權平均計算 DRI
✓ DRI 計算完成
  - 已保存: data/part4_entity_time_dri.csv

DRI 統計:
  - 平均值: 1.322
  - 標準差: 0.204
  - 範圍: [0.818, 1.585]


Unnamed: 0,ticker,year,ITC,ACAP,DC,GOVSEC,DATA,ECO,OTHER,DRI
0,AAPL,2018,0.732843,1.214491,1.199411,0.919845,1.080862,2.008562,1.397826,1.170769
1,AAPL,2019,0.647035,1.140406,1.164747,0.79824,0.950223,1.854673,1.329306,1.072671
2,AMZN,2018,0.475591,0.870687,0.633933,0.78584,0.928183,1.310896,0.873558,0.818084
3,AMZN,2019,0.484569,1.055978,0.722968,0.765328,1.108889,1.528314,1.0367,0.926934
4,BAC,2018,0.73917,1.219561,0.817715,1.259404,1.5028,2.012307,0.964983,1.23058
5,BAC,2019,0.63873,1.051745,0.688897,1.225159,1.37689,2.017759,1.142673,1.134401
6,BRK-B,2018,0.848677,1.271849,1.049523,1.524231,1.952482,2.036254,1.467172,1.408479
7,BRK-B,2019,0.903161,1.45835,1.197646,1.616142,2.092728,2.241823,1.651319,1.544553
8,CSCO,2018,1.044537,1.507539,1.043962,1.170067,1.587339,2.066058,1.480383,1.390529
9,CSCO,2019,1.002996,1.451169,1.014497,1.219425,1.624125,2.019357,1.474923,1.372443


In [None]:
# ========================================
# 可視化 DRI
# ========================================

import plotly.express as px

if entity_col and time_col:
    fig = px.line(
        agg,
        x=time_col,
        y='DRI',
        color=entity_col,
        markers=True,
        title='數位韌性指數（DRI）時序趨勢'
    )
    fig.show()
elif entity_col:
    fig = px.bar(
        agg,
        x=entity_col,
        y='DRI',
        title='各實體的數位韌性指數（DRI）'
    )
    fig.show()
else:
    fig = px.bar(
        agg,
        x=list(range(len(agg))),
        y='DRI',
        title='整體數位韌性指數（DRI）'
    )
    fig.show()

print("\n✓ 可視化完成")


✓ 可視化完成


---
# 總結

## 優化成果

### 1. 集中配置管理
- 所有關鍵參數在第一個 cell 統一管理
- 易於調整和實驗

### 2. Phase 3 多輪迭代
- 支持多輪 LLM 優化（可配置）
- 追蹤每輪的指標變化

### 3. Phase 4 主題層級評分
- **速度提升**: 從 6000+ 文檔 → ~70 主題
- **準確性提升**: LLM 獲得主題關鍵詞，不再盲目猜測
- 使用主題概率分佈加權計算文檔評分

### 4. 緩存機制
- 嵌入向量緩存
- Phase 3 優化計劃緩存
- Phase 4 主題映射和評分緩存

### 5. 加權 DRI 計算
- 支持自定義構面權重
- 更靈活的指標計算

## 輸出文件清單

- `data/part2_bertopic_model/` - Phase 2 模型
- `data/part2_topics.csv` - Phase 2 主題列表
- `data/part2_corpus_with_topics.csv` - Phase 2 帶主題標註的語料
- `data/part3_optimized_bertopic_model/` - Phase 3 優化模型
- `data/part3_corpus_with_topics_v2.csv` - Phase 3 優化後語料
- `data/phase3_optimization_plans.json` - 優化歷史記錄
- `data/phase4_topic_dimension_map.json` - 主題-構面映射緩存
- `data/phase4_topic_dimension_scores.json` - 主題評分緩存
- `data/part4_doc_dimension_scores.csv` - 文檔級構面評分
- `data/part4_entity_time_dri.csv` - 最終 DRI 指數