# 第二階段：初始主題生成（BERTopic）

- 嵌入模型：OpenAI **text-embedding-3-small**（1536 維）
- 降維：UMAP，聚類：HDBSCAN，表示：c-TF-IDF（BERTopic）
- 輸出：
  - `/mnt/data/part2_bertopic_model/`（可供後續載入）
  - `/mnt/data/part2_topics.csv`（主題概覽）
  - `/mnt/data/part2_doc_topic_probs.npy`（doc×topic 機率）
  - `/mnt/data/part2_corpus_with_topics.csv`（語料含 topic）

> 執行前請先設定環境變數 `OPENAI_API_KEY`。

In [None]:
# 第一次執行請取消註解安裝依賴，裝完重啟 kernel
# %pip install pandas numpy tqdm plotly umap-learn hdbscan bertopic openai==1.*

In [None]:
import os, json
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from openai import OpenAI

# Load the corpus CSV and normalise its column names.
CSV_PATH = Path('/mnt/data/corpus.csv')
# Raise explicitly rather than `assert`: assertions are stripped under
# `python -O`, which would let a missing file or column surface later as a
# much less helpful error.
if not CSV_PATH.exists():
    raise FileNotFoundError('找不到 /mnt/data/corpus.csv')

df = pd.read_csv(CSV_PATH)
# Lower-case and strip the headers so the column checks below are insensitive
# to case and stray whitespace; str() guards against non-string headers.
df.columns = [str(c).strip().lower() for c in df.columns]
if 'text' not in df.columns:
    raise ValueError("語料需要包含 'text' 欄位")

# Optional metadata columns, reported in this fixed order when present.
meta_cols = [c for c in ['doc_id', 'company', 'firm', 'ticker', 'year', 'date'] if c in df.columns]
print('偵測到 metadata 欄位：', meta_cols or '(無)')

# Embed every document with the OpenAI embeddings API, caching the matrix on
# disk so that re-running the cell does not call the API again.
client = OpenAI()  # requires OPENAI_API_KEY to be set beforehand
# The OpenAI model id is 'text-embedding-3-small'; 'text-3-small' is not a
# model id the embeddings endpoint accepts.
MODEL_EMB = 'text-embedding-3-small'
# Cache file names are intentionally left unchanged so existing caches are
# still found.
EMB_PATH = Path('/mnt/data/embeddings_text-3-small.npy')
IDX_PATH = Path('/mnt/data/embeddings_index.json')
# NOTE: astype(str) turns missing values into the literal string 'nan'.
texts = df['text'].astype(str).tolist()

embeddings = None
if EMB_PATH.exists() and IDX_PATH.exists():
    with open(IDX_PATH, encoding='utf-8') as f:
        cache_meta = json.load(f)
    cached = np.load(EMB_PATH)
    # Reuse the cache only when it holds exactly one row per document;
    # a cache built from a different corpus size would misalign rows and texts.
    if cache_meta.get('count') == len(texts) and cached.shape[0] == len(texts):
        embeddings = cached
    else:
        print('快取的 embeddings 與目前語料筆數不符，重新計算')

if embeddings is None:
    if not texts:
        # np.vstack([]) would otherwise fail with an opaque error.
        raise ValueError('語料為空，無法計算 embeddings')
    BATCH = 256  # number of documents sent per API request
    vecs = []
    for i in tqdm(range(0, len(texts), BATCH)):
        batch = texts[i:i + BATCH]
        r = client.embeddings.create(model=MODEL_EMB, input=batch)
        vecs.extend(np.array(d.embedding, dtype=np.float32) for d in r.data)
    embeddings = np.vstack(vecs)
    np.save(EMB_PATH, embeddings)
    # Record what the cache was built from so later runs can validate it.
    with open(IDX_PATH, 'w', encoding='utf-8') as f:
        json.dump({'count': len(texts), 'model': MODEL_EMB}, f)
print('embeddings shape:', embeddings.shape)

In [None]:
from bertopic import BERTopic
from umap import UMAP
import hdbscan

# Dimensionality reduction: UMAP down to 10 components using cosine distance,
# with a fixed random_state.
umap_model = UMAP(
    metric='cosine',
    n_components=10,
    n_neighbors=15,
    min_dist=0.0,
    random_state=42,
)

# Clustering: HDBSCAN over the reduced vectors. prediction_data=True is set
# alongside calculate_probabilities=True on the BERTopic model below.
hdbscan_model = hdbscan.HDBSCAN(
    metric='euclidean',
    min_cluster_size=30,
    min_samples=10,
    cluster_selection_method='eom',
    prediction_data=True,
)

topic_model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    calculate_probabilities=True,
    verbose=True,
)

# Fit on the raw texts while supplying the precomputed embeddings.
topics, probs = topic_model.fit_transform(texts, embeddings=embeddings)
df['topic'] = topics

topic_info = topic_model.get_topic_info()
topic_info.head(10)

In [None]:
# Persist the fitted model and the tabular outputs of this stage.
OUT_DIR = Path('/mnt/data/part2_bertopic_model')
OUT_DIR.mkdir(parents=True, exist_ok=True)
# OUT_DIR is a directory, so a directory-based serialization is required:
# the default pickle serialization opens the path as a single file and fails
# on an existing directory. The embedding model is not saved because the
# embeddings were computed outside BERTopic.
# NOTE(review): per the BERTopic docs, safetensors serialization does not store
# the fitted UMAP/HDBSCAN models — confirm later stages do not rely on them.
topic_model.save(
    OUT_DIR.as_posix(),
    serialization='safetensors',
    save_ctfidf=True,
    save_embedding_model=False,
)
topic_info.to_csv('/mnt/data/part2_topics.csv', index=False, encoding='utf-8')
# probs may be None, in which case no probability matrix is written.
if probs is not None:
    np.save('/mnt/data/part2_doc_topic_probs.npy', probs)
df.to_csv('/mnt/data/part2_corpus_with_topics.csv', index=False, encoding='utf-8')
print('已輸出 part2_bertopic_model / part2_topics.csv / part2_doc_topic_probs.npy / part2_corpus_with_topics.csv')

In [None]:
# Render the three built-in BERTopic visualisations one after another.
# Each figure is built and shown before the next one is created, and `fig`
# ends up bound to the last figure.
for plot_method, plot_kwargs in (
    ('visualize_barchart', {'top_n_topics': 20}),
    ('visualize_hierarchy', {'top_n_topics': 50}),
    ('visualize_topics', {}),
):
    fig = getattr(topic_model, plot_method)(**plot_kwargs)
    fig.show()

# When the corpus has a 'year' column, export each topic's share of documents
# within every year.
if 'year' in df.columns:
    year_dist = (
        df.groupby('year')['topic']
        .value_counts(normalize=True)  # proportions within each year
        .rename('prop')
        .reset_index()
    )
    year_dist.to_csv('/mnt/data/part2_topic_prop_by_year.csv', index=False, encoding='utf-8')
    # A bare expression inside an `if` body is not echoed by the notebook,
    # so print the preview explicitly.
    print(year_dist.head())