# 第三階段：LLM Agent 迭代優化（自我批改 / 合併 / 重生）

In [None]:
# %pip install pandas numpy tqdm bertopic umap-learn hdbscan openai==1.* scikit-learn

In [None]:
import os, json, math
from pathlib import Path
import numpy as np
import pandas as pd
from openai import OpenAI
from bertopic import BERTopic
from umap import UMAP
import hdbscan
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity

# Client used later for the chat-completion request.
client = OpenAI()

# Locations of the inputs this notebook loads.
BASE = Path('/mnt/data')
MODEL_DIR = BASE / 'part2_bertopic_model'
CORPUS = BASE / 'part2_corpus_with_topics.csv'
EMB = BASE / 'embeddings_text-3-small.npy'

# Fail fast and name every missing input. An explicit raise (rather than an
# assert) still runs under `python -O` and tells the user what to regenerate.
_missing_inputs = [p.as_posix() for p in (MODEL_DIR, CORPUS, EMB) if not p.exists()]
if _missing_inputs:
    raise FileNotFoundError('Missing required input(s): ' + ', '.join(_missing_inputs))

df = pd.read_csv(CORPUS)
embeddings = np.load(EMB)

# Rows of `embeddings` are indexed by corpus row position everywhere below,
# so a length mismatch would silently misalign documents and vectors.
if len(embeddings) != len(df):
    raise ValueError(
        f'Embeddings have {len(embeddings)} rows but the corpus has {len(df)} rows'
    )

model = BERTopic.load(MODEL_DIR.as_posix())
topics = df['topic'].tolist()

def centers(emb, topics):
    """Return one unit-length centroid per topic.

    Args:
        emb: 2-D array of document vectors, indexed by document position.
        topics: sequence of topic ids, one per document; ``-1`` is skipped.

    Returns:
        dict mapping topic id to the L2-normalised mean vector of the
        documents assigned to that topic.
    """
    labels = pd.Series(topics)
    centroids = {}
    for topic_id, rows in labels.groupby(labels).groups.items():
        if topic_id != -1:
            # keepdims keeps the mean 2-D, which is the shape normalize() takes.
            mean_vec = emb[list(rows)].mean(axis=0, keepdims=True)
            centroids[topic_id] = normalize(mean_vec)[0]
    return centroids

def metrics(emb, topics):
    """Compute cluster-quality scores for a topic assignment.

    Args:
        emb: 2-D array of document vectors, indexed by document position.
        topics: sequence of topic ids, one per document; ``-1`` marks outliers.

    Returns:
        Tuple ``(coh, sep, sil, out)``:
          * ``coh`` - dict of topic id -> mean cosine similarity between the
            topic's documents and its centroid.
          * ``sep`` - mean cosine distance over distinct centroid pairs, or
            NaN when fewer than two centroids exist.
          * ``sil`` - silhouette score over non-outlier documents, or NaN
            when it cannot be computed.
          * ``out`` - fraction of documents labelled ``-1`` (NaN when
            ``topics`` is empty).
    """
    labels = np.asarray(topics)
    c = centers(emb, topics)

    # Cohesion: how tightly each topic's documents sit around its centroid.
    s = pd.Series(topics)
    coh = {}
    for tid, idxs in s.groupby(s).groups.items():
        if tid == -1 or tid not in c:
            continue
        sims = cosine_similarity(emb[list(idxs)], c[tid].reshape(1, -1)).ravel()
        coh[tid] = float(np.mean(sims))

    # Separation: average only the distinct pairs. Including the zero
    # diagonal would scale the score by (k-1)/k and make runs with different
    # topic counts incomparable.
    sep = np.nan
    if len(c) >= 2:
        dist = cosine_distances(np.vstack(list(c.values())))
        sep = float(dist[np.triu_indices_from(dist, k=1)].mean())

    # Silhouette: needs at least two labels and fewer labels than samples.
    keep = labels != -1
    n_kept = int(keep.sum())
    sil = np.nan
    if n_kept > 5:
        kept_labels = labels[keep]
        n_labels = len(set(kept_labels))
        if 1 < n_labels < n_kept:
            sil = silhouette_score(emb[keep], kept_labels)

    out = float((labels == -1).mean()) if labels.size else np.nan
    return coh, sep, sil, out

# Baseline scores for the topic assignment loaded from the corpus file.
coh, sep, sil, out = metrics(embeddings, topics)
baseline_rows = (
    ('一致性均值:', np.mean(list(coh.values())) if coh else np.nan),
    ('區分度均值:', sep),
    ('Silhouette:', sil),
    ('Outlier 比例:', out),
)
for row_label, row_value in baseline_rows:
    print(row_label, row_value)

In [None]:
def sample_topic_snippets(topic_model, df, k_each=3):
    """Collect keywords and a few example texts for every non-outlier topic.

    Args:
        topic_model: object exposing ``get_topic_info()`` (with a ``Topic``
            column) and ``get_topic(topic_id)`` returning (word, score) pairs.
        df: DataFrame with ``topic`` and ``text`` columns.
        k_each: number of example texts to take per topic.

    Returns:
        dict mapping topic id to ``{"words": <up to ten words joined by
        ", ">, "examples": <first k_each texts of that topic>}``.
    """
    topic_ids = topic_model.get_topic_info()['Topic'].tolist()
    result = {}
    for topic_id in topic_ids:
        if topic_id == -1:
            continue
        top_terms = topic_model.get_topic(topic_id)[:10]
        keyword_str = ', '.join(term for term, _ in top_terms)
        in_topic = df['topic'] == topic_id
        snippets = df.loc[in_topic, 'text'].head(k_each).tolist()
        result[topic_id] = {"words": keyword_str, "examples": snippets}
    return result

import re


def _parse_plan(text):
    """Parse the reviewer reply into a dict, tolerating text around the JSON.

    Raises:
        ValueError: if no JSON object can be found or the parsed value is not
            a JSON object (``json.JSONDecodeError`` is a ``ValueError`` too).
    """
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        # Greedy match from the first '{' to the last '}', so code fences or
        # trailing prose after the object do not defeat the fallback.
        found = re.search(r'\{[\s\S]*\}', text)
        if found is None:
            raise ValueError(f'LLM reply contains no JSON object: {text[:200]!r}') from None
        parsed = json.loads(found.group(0))
    if not isinstance(parsed, dict):
        raise ValueError(f'Expected a JSON object, got {type(parsed).__name__}')
    return parsed


# Keywords plus example texts for each topic of the loaded model.
samples = sample_topic_snippets(model, df)
plan_prompt = '''你是一位主題建模審稿人。以下是主題代表詞與例句，請輸出 JSON：
- "merge_pairs": [[a,b],...]
- "split_topics": [id,...]
- "rename": {id: "新名稱"}
- "new_stopwords": ["...", ...]
- "params": {"min_cluster_size": 25, "min_samples": 7, "n_neighbors": 15, "n_components": 10}
僅輸出 JSON，不要解釋。'''

resp = client.chat.completions.create(
    model='gpt-4o-mini',
    temperature=0.2,
    # Ask the API for a JSON object so the reply is machine-readable.
    response_format={"type": "json_object"},
    messages=[
        {"role": "system", "content": "你會產出可機器讀取且可執行的最佳化建議。"},
        {"role": "user", "content": plan_prompt},
        {"role": "user", "content": json.dumps({"topics": samples}, ensure_ascii=False)},
    ],
)
# `content` may be None; normalise it so parsing fails with a clear error.
raw = resp.choices[0].message.content or ''
plan = _parse_plan(raw)
plan

In [None]:
# Re-run following the suggestions (demo: parameters + naming; merging and
# splitting can be added later).
from bertopic import BERTopic


def _int_param(params, key, default, minimum=1):
    """Return ``params[key]`` as an int >= ``minimum``, else ``default``.

    The values come from LLM output, so anything missing, non-numeric or
    below ``minimum`` falls back to the default instead of crashing the fit.
    """
    try:
        value = int(params.get(key, default))
    except (TypeError, ValueError, OverflowError):
        return default
    return value if value >= minimum else default


p = plan.get('params')
if not isinstance(p, dict):
    p = {}
min_cluster_size = _int_param(p, 'min_cluster_size', 30, minimum=2)
min_samples = _int_param(p, 'min_samples', 10)
n_neighbors = _int_param(p, 'n_neighbors', 15, minimum=2)
n_components = _int_param(p, 'n_components', 10)

umap2 = UMAP(n_neighbors=n_neighbors, n_components=n_components, min_dist=0.0,
             metric='cosine', random_state=42)
hs2 = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, min_samples=min_samples,
                      metric='euclidean', cluster_selection_method='eom',
                      prediction_data=True)
model2 = BERTopic(calculate_probabilities=True, verbose=True,
                  umap_model=umap2, hdbscan_model=hs2)

top2, prob2 = model2.fit_transform(df['text'].astype(str).tolist(), embeddings=embeddings)

# Naming.
# NOTE(review): the ids in plan['rename'] were chosen by looking at the
# topics of the previously loaded model; after refitting, model2's topic ids
# need not describe the same clusters - confirm the mapping before trusting
# these labels.
rename = plan.get('rename')
if isinstance(rename, dict):
    rename_map = {}
    for k, v in rename.items():
        try:
            rename_map[int(k)] = v
        except (TypeError, ValueError):
            print(f'Skipping rename entry with non-integer topic id: {k!r}')
    if rename_map:
        # Best effort: a labelling failure should not abort the evaluation
        # and export below, but it should be visible.
        try:
            model2.set_topic_labels(rename_map)
        except Exception as err:
            print(f'Could not apply topic labels: {err}')

df['topic_v2'] = top2
coh2, sep2, sil2, out2 = metrics(embeddings, top2)
print('Before -> After')
print('一致性均值:', np.mean(list(coh.values())) if coh else np.nan, '->', np.mean(list(coh2.values())) if coh2 else np.nan)
print('區分度均值:', sep, '->', sep2)
print('Silhouette:', sil, '->', sil2)
print('Outlier 比例:', out, '->', out2)

OUT = Path('/mnt/data/part3_optimized_bertopic_model')
model2.save(OUT.as_posix())
df.to_csv('/mnt/data/part3_corpus_with_topics_v2.csv', index=False, encoding='utf-8')
print('已輸出 part3_optimized_bertopic_model / part3_corpus_with_topics_v2.csv')

In [None]:
# (Optional) Find the ids of "ambiguous" documents (small gap between the
# distances to the two nearest centres); can be combined with LITA so the
# LLM is only asked to judge these documents.

def ambiguous_indices(emb, topics, margin_th=0.02, cap=200):
    """Return positions of documents lying almost equally close to two centres.

    Args:
        emb: 2-D array of document vectors, indexed by document position.
        topics: sequence of topic ids, one per document; ``-1`` is skipped.
        margin_th: a document is ambiguous when the cosine distance to its
            second-nearest centre exceeds the nearest by less than this.
        cap: maximum number of positions to return; ``None`` means no limit.

    Returns:
        List of document positions in input order, at most ``cap`` long.
        Empty when fewer than two centres exist, since a margin needs two.
    """
    c = centers(emb, topics)
    # With one centre there is no runner-up distance to compare against.
    if len(c) < 2 or (cap is not None and cap <= 0):
        return []
    C = np.vstack(list(c.values()))
    ids = []
    for i, tid in enumerate(topics):
        if tid == -1 or tid not in c:
            continue
        d = cosine_distances(emb[i].reshape(1, -1), C).ravel()
        # Only the two smallest distances matter; no need for a full sort.
        nearest, runner_up = np.partition(d, 1)[:2]
        if runner_up - nearest < margin_th:
            ids.append(i)
            if cap is not None and len(ids) >= cap:
                break
    return ids

# Flag borderline documents under the re-fitted assignment; the trailing
# expression yields the count and the first ten positions.
amb_ids = ambiguous_indices(emb=embeddings, topics=top2)
(len(amb_ids), amb_ids[:10])