In [1]:
from src.data_download import download_sec_filings
from src.data_processing import process_sec_filings

import pandas as pd
from dotenv import dotenv_values
import os
import math
from typing import List, Iterable
import numpy as np
from openai import OpenAI


config = dotenv_values(".env")
client = OpenAI(
    api_key=config.get("OPENAI_API_KEY")
)

---

# 原始資料下載

In [2]:
# download_sec_filings()

---

# 第一階段：語義前處理
過濾、語義分塊
輸出:data/corpus.csv

In [3]:
# process_sec_filings(config)

---

# 第二階段：初始主題生成（BERTopic）

- 嵌入模型：OpenAI **text-3-small**（1536 維）
- 降維：UMAP，聚類：HDBSCAN，表示：c-TF-IDF（BERTopic）
- 輸出：
  - `data/part2_bertopic_model/`（可供後續載入）
  - `data/part2_topics.csv`（主題概覽）
  - `data/part2_doc_topic_probs.npy`（doc×topic 機率）
  - `data/part2_corpus_with_topics.csv`（語料含 topic）

In [4]:
import os, json
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from openai import OpenAI

CSV_PATH = Path('data/corpus.csv')
assert CSV_PATH.exists(), '找不到 data/corpus.csv'

df = pd.read_csv(CSV_PATH)
df.columns = [c.strip().lower() for c in df.columns]
assert 'text' in df.columns, "語料需要包含 'text' 欄位"
meta_cols = [c for c in ['doc_id','company','firm','ticker','year','date'] if c in df.columns]
print('偵測到 metadata 欄位：', meta_cols or '(無)')

client = OpenAI(
    api_key=config.get("OPENAI_API_KEY")
)  # 需先設 OPENAI_API_KEY
MODEL_EMB = 'text-embedding-3-small'
EMB_PATH = Path('data/embeddings_text-3-small.npy')
IDX_PATH = Path('data/embeddings_index.json')
texts = df['text'].astype(str).tolist()

if EMB_PATH.exists() and IDX_PATH.exists():
    embeddings = np.load(EMB_PATH)
else:
    BATCH=256
    vecs=[]
    for i in tqdm(range(0,len(texts),BATCH)):
        batch=texts[i:i+BATCH]
        r=client.embeddings.create(model=MODEL_EMB, input=batch)
        vecs.extend([np.array(d.embedding, dtype=np.float32) for d in r.data])
    embeddings=np.vstack(vecs)
    np.save(EMB_PATH, embeddings)
    with open(IDX_PATH,'w') as f: json.dump({'count':len(texts),'model':MODEL_EMB},f)
print('embeddings shape:', embeddings.shape)

偵測到 metadata 欄位： ['ticker', 'year']
embeddings shape: (6233, 1536)


In [5]:
from bertopic import BERTopic
from umap import UMAP
import hdbscan

umap_model = UMAP(n_neighbors=15, n_components=10, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=30, min_samples=10, metric='euclidean',
                                 cluster_selection_method='eom', prediction_data=True)

topic_model = BERTopic(calculate_probabilities=True, verbose=True,
                       umap_model=umap_model, hdbscan_model=hdbscan_model)
topics, probs = topic_model.fit_transform(texts, embeddings=embeddings)
df['topic'] = topics
topic_info = topic_model.get_topic_info(); topic_info.head(10)

2025-09-30 14:28:36,106 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-30 14:28:46,522 - BERTopic - Dimensionality - Completed ✓
2025-09-30 14:28:46,523 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-30 14:28:47,002 - BERTopic - Cluster - Completed ✓
2025-09-30 14:28:47,004 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-09-30 14:28:47,503 - BERTopic - Representation - Completed ✓


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,920,-1_our_of_and_the,"[our, of, and, the, to, or, in, we, as, on]",[Our Supplier Relationships Subject Us to a Nu...
1,0,314,0_tax_income_taxes_deferred,"[tax, income, taxes, deferred, foreign, the, r...",[We record provision for income taxes for the ...
2,1,205,1_gas_production_oil_reserves,"[gas, production, oil, reserves, proved, exxon...",[These excluded amounts for both\nconsolidated...
3,2,203,2_currency_foreign_derivative_exchange,"[currency, foreign, derivative, exchange, hedg...",[A master netting arrangement allows counterpa...
4,3,162,3_goodwill_assets_impairment_intangible,"[goodwill, assets, impairment, intangible, val...","[Upon the occurrence of a triggering event, we..."
5,4,154,4_capital_basel_bank_liquidity,"[capital, basel, bank, liquidity, reserve, fed...",[banks are subject to quantitative and qualita...
6,5,153,5_our_may_we_could,"[our, may, we, could, or, products, results, a...",[Demand for our products is variable and hard ...
7,6,152,6_driven_banking_income_higher,"[driven, banking, income, higher, net, fees, r...","[Net revenue was\n$52.1 billion\n, an increase..."
8,7,150,7_products_product_or_our,"[products, product, or, our, healthcare, to, a...",[presidential administration’s policy proposal...
9,8,137,8_health_care_medicare_medical,"[health, care, medicare, medical, unitedhealth...",[UnitedHealthcare Medicare & Retirement provid...


In [6]:
OUT_DIR = Path('data/part2_bertopic_model')
topic_model.save(OUT_DIR.as_posix(), serialization="safetensors")
topic_info.to_csv('data/part2_topics.csv', index=False, encoding='utf-8')
if probs is not None:
    np.save('data/part2_doc_topic_probs.npy', probs)
df.to_csv('data/part2_corpus_with_topics.csv', index=False, encoding='utf-8')
print('已輸出 part2_bertopic_model / part2_topics.csv / part2_doc_topic_probs.npy / part2_corpus_with_topics.csv')

已輸出 part2_bertopic_model / part2_topics.csv / part2_doc_topic_probs.npy / part2_corpus_with_topics.csv


In [7]:
fig = topic_model.visualize_barchart(top_n_topics=20); fig.show()
fig = topic_model.visualize_hierarchy(top_n_topics=50); fig.show()
fig = topic_model.visualize_topics(); fig.show()

if 'year' in df.columns:
    year_dist = df.groupby('year')['topic'].value_counts(normalize=True).rename('prop').reset_index()
    year_dist.to_csv('data/part2_topic_prop_by_year.csv', index=False, encoding='utf-8')
    year_dist.head()

# 第三階段：LLM Agent 迭代優化（自我批改 / 合併 / 重生）

In [8]:
import os, json, math
from pathlib import Path
import numpy as np
import pandas as pd
from openai import OpenAI
from bertopic import BERTopic
from umap import UMAP
import hdbscan
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity

client = OpenAI(
    api_key=config.get("OPENAI_API_KEY")
)
BASE=Path('data')
MODEL_DIR=BASE/'part2_bertopic_model'
CORPUS=BASE/'part2_corpus_with_topics.csv'
EMB=BASE/'embeddings_text-3-small.npy'
assert MODEL_DIR.exists() and CORPUS.exists() and EMB.exists()

df=pd.read_csv(CORPUS)
embeddings=np.load(EMB)
model=BERTopic.load(MODEL_DIR.as_posix())
topics=df['topic'].tolist()

def centers(emb, topics):
    m={}
    s=pd.Series(topics)
    for tid, idxs in s.groupby(s).groups.items():
        if tid==-1: continue
        vecs=emb[list(idxs)]
        m[tid]=normalize(vecs.mean(axis=0, keepdims=True))[0]
    return m

def metrics(emb, topics):
    c=centers(emb, topics)
    # cohesion
    s=pd.Series(topics)
    coh={}
    for tid, idxs in s.groupby(s).groups.items():
        if tid==-1 or tid not in c: continue
        sims=cosine_similarity(emb[list(idxs)], c[tid].reshape(1,-1)).ravel()
        coh[tid]=float(np.mean(sims))
    # separation
    sep=np.nan
    if len(c)>=2:
        mat=np.vstack(list(c.values()))
        sep=cosine_distances(mat).mean()
    # silhouette
    mask=np.array(topics)!=-1
    sil=np.nan
    if mask.sum()>5 and len(set(np.array(topics)[mask]))>1:
        sil=silhouette_score(emb[mask], np.array(topics)[mask])
    out=(np.array(topics)==-1).mean()
    return coh, sep, sil, out

coh, sep, sil, out = metrics(embeddings, topics)
print('一致性均值:', np.mean(list(coh.values())) if coh else np.nan)
print('區分度均值:', sep)
print('Silhouette:', sil)
print('Outlier 比例:', out)



一致性均值: 0.7667531050168551
區分度均值: 0.4119281
Silhouette: 0.07075929641723633
Outlier 比例: 0.14760147601476015


In [10]:
def sample_topic_snippets(topic_model, df, k_each=3):
    samples={}
    for tid in topic_model.get_topic_info()['Topic'].tolist():
        if tid==-1: continue
        words=', '.join([w for w,_ in topic_model.get_topic(tid)[:10]])
        ex=df[df['topic']==tid]['text'].head(k_each).tolist()
        samples[tid]={"words":words, "examples":ex}
    return samples


"""
你是一位主題建模審稿人。以下是主題代表詞與例句，請輸出 JSON
僅輸出 JSON，不要解釋
"""
samples=sample_topic_snippets(model, df)
plan_prompt='''You are a reviewer for topic modeling. Below are the representative words and example sentences of the topic. Please output the JSON：
- "merge_pairs": [[a,b],...]
- "split_topics": [id,...]
- "rename": {id: "new name"}
- "new_stopwords": ["...", ...]
- "params": {"min_cluster_size": 25, "min_samples": 7, "n_neighbors": 15, "n_components": 10}
Just output JSON, no interpretation'''

"""
你會產出可機器讀取且可執行的最佳化建議。
"""
resp=client.chat.completions.create(model='gpt-5-nano-2025-08-07', temperature=1,
    messages=[{"role":"system","content":"You will produce machine-readable and actionable optimization recommendations."},
             {"role":"user","content": plan_prompt},
             {"role":"user","content": json.dumps({"topics":samples}, ensure_ascii=False)}])
raw=resp.choices[0].message.content
try:
    plan=json.loads(raw)
except Exception:
    import re
    plan=json.loads(re.search(r'\{[\s\S]*\}$', raw).group(0))
plan

{'merge_pairs': [[22, 25], [25, 34], [34, 54]],
 'split_topics': [6, 7, 11, 18, 31, 39, 50, 57, 58, 60, 61, 62, 63, 64],
 'rename': {'4': 'Regulatory Basel and Bank Regulation',
  '8': 'UnitedHealthcare and Optum Segments',
  '10': 'Pension and Postretirement Plans',
  '11': 'Procter & Gamble Brand Portfolio',
  '18': 'Disney: Media Networks and Direct-to-Consumer',
  '31': 'Johnson & Johnson Corporate Overview',
  '34': 'Visa: Core Payments and Products',
  '25': 'Mastercard: Global Payments Network',
  '35': 'Verizon: Wireless and Wireline',
  '37': 'Facebook: Advertising and User Engagement',
  '50': 'Brexit and European Economic Conditions',
  '55': 'Litigation and Legal Matters (Visa/Interchange)',
  '56': 'Intel: Data-centric Strategy and IP',
  '60': 'Securitization and VIEs'},
 'new_stopwords': ['NOTE',
  'Table',
  'Contents',
  'PART',
  'I',
  'II',
  'III',
  'MD&A',
  'to',
  'the',
  'and',
  'of',
  'our',
  'we'],
 'params': {'min_cluster_size': 25,
  'min_samples': 7,


In [11]:
# 依建議重跑（示範：參數+命名；合併/拆分可再擴充）
from bertopic import BERTopic

p=plan.get('params', {})
min_cluster_size=int(p.get('min_cluster_size',30))
min_samples=int(p.get('min_samples',10))
n_neighbors=int(p.get('n_neighbors',15))
n_components=int(p.get('n_components',10))

umap2=UMAP(n_neighbors=n_neighbors, n_components=n_components, min_dist=0.0, metric='cosine', random_state=42)
hs2=hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, min_samples=min_samples, metric='euclidean',
                    cluster_selection_method='eom', prediction_data=True)
model2=BERTopic(calculate_probabilities=True, verbose=True, umap_model=umap2, hdbscan_model=hs2)

top2, prob2 = model2.fit_transform(df['text'].astype(str).tolist(), embeddings=embeddings)

# 命名
if isinstance(plan.get('rename',{}), dict):
    for k,v in {int(k):v for k,v in plan['rename'].items()}.items():
        try: model2.set_topic_labels({k:v})
        except: pass

df['topic_v2']=top2
coh2, sep2, sil2, out2 = metrics(embeddings, top2)
print('Before -> After')
print('一致性均值:', np.mean(list(coh.values())) if coh else np.nan, '->', np.mean(list(coh2.values())) if coh2 else np.nan)
print('區分度均值:', sep, '->', sep2)
print('Silhouette:', sil, '->', sil2)
print('Outlier 比例:', out, '->', out2)

OUT=Path('data/part3_optimized_bertopic_model'); model2.save(OUT.as_posix(), serialization="safetensors")
df.to_csv('data/part3_corpus_with_topics_v2.csv', index=False, encoding='utf-8')
print('已輸出 part3_optimized_bertopic_model / part3_corpus_with_topics_v2.csv')

2025-09-30 14:34:40,877 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-30 14:34:45,664 - BERTopic - Dimensionality - Completed ✓
2025-09-30 14:34:45,665 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-30 14:34:46,194 - BERTopic - Cluster - Completed ✓
2025-09-30 14:34:46,196 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-09-30 14:34:46,695 - BERTopic - Representation - Completed ✓


Before -> After
一致性均值: 0.7667531050168551 -> 0.7695309309892251
區分度均值: 0.4119281 -> 0.41552007
Silhouette: 0.07075929641723633 -> 0.07346311956644058
Outlier 比例: 0.14760147601476015 -> 0.1501684582063212
已輸出 part3_optimized_bertopic_model / part3_corpus_with_topics_v2.csv


In [12]:
# （選用）找出「模糊」文件 id（最近兩中心距離差小），可配合 LITA 只在這些文件上請 LLM 判讀

def ambiguous_indices(emb, topics, margin_th=0.02, cap=200):
    c=centers(emb, topics)
    ids=[]
    if not c: return ids
    C=np.vstack(list(c.values()))
    for i,tid in enumerate(topics):
        if tid==-1 or tid not in c: continue
        d=cosine_distances(emb[i].reshape(1,-1), C).ravel()
        s=np.sort(d)
        if s[1]-s[0] < margin_th: ids.append(i)
        if len(ids)>=cap: break
    return ids

amb_ids=ambiguous_indices(embeddings, top2)
len(amb_ids), amb_ids[:10]

(200, [5, 13, 17, 21, 67, 79, 88, 120, 122, 153])

# 第四階段：主題映射與「數位韌性」指數計算（LLM Prompt）

In [13]:
import os, json, re
import numpy as np
import pandas as pd
from pathlib import Path
from openai import OpenAI
client = OpenAI(
    api_key=config.get("OPENAI_API_KEY")
)
BASE=Path('data')
CANDS=[BASE/'part3_corpus_with_topics_v2.csv', BASE/'part2_corpus_with_topics.csv']
for p in CANDS:
    if p.exists(): CORPUS=p; break
assert CORPUS.exists()
df=pd.read_csv(CORPUS)
df.columns=[c.strip().lower() for c in df.columns]
TOP = 'topic_v2' if 'topic_v2' in df.columns else 'topic'
print('使用主題欄位：', TOP)

使用主題欄位： topic_v2


In [14]:
# 主題→數位韌性構面 對映（可自訂）
DIMENSIONS=["ITC","ACAP","DC","GOVSEC","DATA","ECO","OTHER"]
MANUAL_MAP={}  # 例如 {"雲端與DevOps":"ITC"}

"""
你是研究助理，請把主題標籤映射到數位韌性構面：ITC/ACAP/DC/GOVSEC/DATA/ECO/OTHER。只輸出 JSON 物件。
"""
def llm_map_topics_to_dims(topic_labels):
    sys='If you are a research assistant, please map the topic labels to the digital resilience facets: ITC/ACAP/DC/GOVSEC/DATA/ECO/OTHER. Output is JSON object only.'
    usr={"dimensions":DIMENSIONS, "topics":topic_labels}
    r=client.chat.completions.create(model='gpt-5-nano-2025-08-07', temperature=1, messages=[
        {"role":"system","content":sys}, {"role":"user","content": json.dumps(usr, ensure_ascii=False)}])
    raw=r.choices[0].message.content
    try: m=json.loads(raw)
    except Exception:
        m=json.loads(re.search(r'\{[\s\S]*\}$', raw).group(0))
    return m

labels=sorted([int(t) for t in df[TOP].dropna().unique().tolist() if t!=-1])
label_text={tid: f"Topic {tid}" for tid in labels}
auto_map=llm_map_topics_to_dims(list(label_text.values()))
auto_map

{'Topic 0': 'OTHER',
 'Topic 1': 'OTHER',
 'Topic 2': 'OTHER',
 'Topic 3': 'OTHER',
 'Topic 4': 'OTHER',
 'Topic 5': 'OTHER',
 'Topic 6': 'OTHER',
 'Topic 7': 'OTHER',
 'Topic 8': 'OTHER',
 'Topic 9': 'OTHER',
 'Topic 10': 'OTHER',
 'Topic 11': 'OTHER',
 'Topic 12': 'OTHER',
 'Topic 13': 'OTHER',
 'Topic 14': 'OTHER',
 'Topic 15': 'OTHER',
 'Topic 16': 'OTHER',
 'Topic 17': 'OTHER',
 'Topic 18': 'OTHER',
 'Topic 19': 'OTHER',
 'Topic 20': 'OTHER',
 'Topic 21': 'OTHER',
 'Topic 22': 'OTHER',
 'Topic 23': 'OTHER',
 'Topic 24': 'OTHER',
 'Topic 25': 'OTHER',
 'Topic 26': 'OTHER',
 'Topic 27': 'OTHER',
 'Topic 28': 'OTHER',
 'Topic 29': 'OTHER',
 'Topic 30': 'OTHER',
 'Topic 31': 'OTHER',
 'Topic 32': 'OTHER',
 'Topic 33': 'OTHER',
 'Topic 34': 'OTHER',
 'Topic 35': 'OTHER',
 'Topic 36': 'OTHER',
 'Topic 37': 'OTHER',
 'Topic 38': 'OTHER',
 'Topic 39': 'OTHER',
 'Topic 40': 'OTHER',
 'Topic 41': 'OTHER',
 'Topic 42': 'OTHER',
 'Topic 43': 'OTHER',
 'Topic 44': 'OTHER',
 'Topic 45': 'OTHER'

In [15]:
# 以 LLM 依規則對文本×構面打分（0–5）
from tqdm import tqdm
"""
請你以 0–5 分量表評分該段文字對於某構面的實質性與強度(0=無關/非常空泛;3=有具體行動或量化指標的一部分;5=明確、量化、可稽核且與策略/投資/制度化直接相關)。\n只輸出 JSON：{\"score\":數字, \"evidence\":\"代表性原文\"}
"""
RUBRIC=("Please rate the substantiveness and strength of this paragraph on a 0–5 scale (0 = irrelevant/very vague; 3 = part of a specific action or quantitative indicator; 5 = clear, quantitative, auditable, and directly related to strategy/investment/institutionalization). \nOnly output JSON: {\"score\": number, \"evidence\":\"representative original text\"}")


"""
你是審稿人，依據評分規則對文本片段在指定構面上的實質性打分。
"""
def score_snippet(text, dim):
    sys='You are the reviewer and you score the substantiveness of the text fragments on the specified dimensions according to the scoring rules.'
    usr={"dimension":dim, "rubric":RUBRIC, "text": str(text)[:4000]}
    r=client.chat.completions.create(model='gpt-5-nano-2025-08-07', temperature=1, messages=[
        {"role":"system","content":sys},{"role":"user","content": json.dumps(usr, ensure_ascii=False)}])
    raw=r.choices[0].message.content
    try: d=json.loads(raw)
    except Exception: d=json.loads(re.search(r'\{[\s\S]*\}$', raw).group(0))
    s=max(0, min(5, float(d.get('score',0))))
    return s, d.get('evidence','')

DIMS=DIMENSIONS
scores_list=[]; evid_list=[]
for _,row in tqdm(df.iterrows(), total=len(df)):
    t=str(row['text'])
    srow={}; erow={}
    for dim in DIMS:
        s, ev = score_snippet(t, dim)
        srow[dim]=s; erow[dim]=ev
    scores_list.append(srow); evid_list.append(erow)

scores_df=pd.DataFrame(scores_list); evid_df=pd.DataFrame(evid_list).add_prefix('evi_')
out=pd.concat([df.reset_index(drop=True), scores_df, evid_df], axis=1)
out.to_csv('data/part4_doc_dimension_scores.csv', index=False, encoding='utf-8')
out.head(2)

  0%|          | 6/6233 [06:04<105:03:48, 60.74s/it]


KeyboardInterrupt: 

In [None]:
# 依公司/年度聚合，計算 DRI（等權平均；可自行換權重）
entity_col='company' if 'company' in out.columns else ('firm' if 'firm' in out.columns else None)
time_col='year' if 'year' in out.columns else ('date' if 'date' in out.columns else None)
cols=[c for c in [entity_col,time_col] if c]
if not cols: print('未偵測到 company/year，以下示範整體聚合')
agg=out[cols + DIMS].groupby(cols).mean().reset_index()
agg['DRI']=agg[DIMS].mean(axis=1)
agg.to_csv('data/part4_entity_time_dri.csv', index=False, encoding='utf-8')
agg.head(10)

In [None]:
import plotly.express as px
if cols:
    fig=px.line(agg, x=cols[-1], y='DRI', color=cols[0], markers=True, title='DRI 時序'); fig.show()
else:
    fig=px.bar(agg, x=list(range(len(agg))), y='DRI', title='整體 DRI'); fig.show()
print('輸出：data/part4_doc_dimension_scores.csv, data/part4_entity_time_dri.csv')