In [None]:
# ===== alpha_sweep_w2v_text8_with_loss_and_overlay.py =====
import os, time, re
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
from gensim.models import Word2Vec, KeyedVectors
from gensim.models.callbacks import CallbackAny2Vec

# --------- 路径与超参 ---------
TEXT8 = Path("data/text8")
assert TEXT8.exists(), "缺少 data/text8，请先准备语料（http://mattmahoney.net/dc/textdata.html）。"

OUT = Path("outputs"); OUT.mkdir(exist_ok=True)
ALPHAS = [0.5, 0.6, 0.75, 0.9, 1.0]   # ← 5 个 α
VECTOR_SIZE = 300
WINDOW = 5
NEGATIVE = 10
SAMPLE = 1e-3
MIN_COUNT = 5
EPOCHS = 10
SEED = 42
WORKERS = (os.cpu_count() and max(1, os.cpu_count()-1)) or 1

# --------- 数据读取：把 text8 切成“伪句子”块 ---------
def read_text8(path: Path, chunk=10_000):
    with open(path, "r", encoding="utf-8") as f:
        toks = f.read().strip().split()
    for i in range(0, len(toks), chunk):
        yield toks[i:i+chunk]

SENTS = list(read_text8(TEXT8))  # 列表化，便于多次遍历

# --------- 训练损失回调：逐 epoch 记录“增量损失” ---------
class EpochLossLogger(CallbackAny2Vec):
    def __init__(self):
        self.losses = []
        self.prev = 0.0
    def on_epoch_end(self, model):
        loss_cum = model.get_latest_training_loss()
        self.losses.append(loss_cum - self.prev)  # 本 epoch 的增量
        self.prev = loss_cum

# --------- 可视化所需工具：子集选择 + PCA ---------
def collect_subset(wv, seed_words, nn_per_seed=12, limit=300):
    chosen, seen = [], set()
    seeds = [w for w in seed_words if w in wv]
    for s in seeds:
        if s not in seen:
            chosen.append(s); seen.add(s)
        for w, _ in wv.most_similar(s, topn=nn_per_seed):
            if w not in seen:
                chosen.append(w); seen.add(w)
            if len(chosen) >= limit: break
        if len(chosen) >= limit: break
    if len(chosen) < limit:
        for w in wv.index_to_key:
            if w not in seen:
                chosen.append(w); seen.add(w)
            if len(chosen) >= limit: break
    return chosen

def pca_reduce(X, n_components=2):
    Xc = X - X.mean(axis=0, keepdims=True)
    U, S, Vt = np.linalg.svd(Xc, full_matrices=False)
    return Xc @ Vt[:n_components].T

# --------- 主流程：训练 5 个 α，记录损失并保存向量 ---------
loss_curves = {}         # alpha -> [loss_epoch1, ..., loss_epochE]
models_kv = {}           # alpha -> KeyedVectors
vec_paths = {}           # alpha -> path

for alpha in ALPHAS:
    print(f"[train] alpha={alpha} | ns_exponent={alpha} | epochs={EPOCHS}")
    loss_cb = EpochLossLogger()
    t0 = time.time()
    model = Word2Vec(
        sg=1, vector_size=VECTOR_SIZE, window=WINDOW,
        negative=NEGATIVE, ns_exponent=alpha,
        sample=SAMPLE, min_count=MIN_COUNT,
        workers=WORKERS, seed=SEED,
        compute_loss=True
    )
    model.build_vocab(SENTS)
    model.train(
        SENTS, total_examples=model.corpus_count, epochs=EPOCHS,
        callbacks=[loss_cb]
    )
    dt = time.time() - t0
    loss_curves[alpha] = loss_cb.losses
    print(f"  → done in {dt/60:.1f} min | losses per epoch: {[round(x,2) for x in loss_cb.losses]}")
    # 保存
    model_path = OUT / f"w2v_text8_alpha{alpha:.3f}.model"
    vec_path   = OUT / f"w2v_text8_alpha{alpha:.3f}.vec"
    # model.save(str(model_path))
    # model.wv.save_word2vec_format(str(vec_path), binary=False)
    models_kv[alpha] = model.wv
    vec_paths[alpha] = str(vec_path)

# --------- 图1：不同 α 的训练损失曲线 ---------
plt.figure(figsize=(10,6))
for alpha in ALPHAS:
    y = loss_curves[alpha]
    x = list(range(1, len(y)+1))
    plt.plot(x, y, marker='o', linewidth=1, label=f"α={alpha}")
plt.xlabel("Epoch")
plt.ylabel("Training loss increment per epoch (↓)")
plt.title("SGNS on Text8 — Loss vs Epoch for different α")
plt.grid(alpha=0.3)
plt.legend()
out_loss = OUT / "loss_vs_epoch_by_alpha.png"
plt.tight_layout(); plt.savefig(out_loss, dpi=200); plt.close()
print(f"✅ Saved loss figure → {out_loss}")

# --------- 图2/图3：五种 α 的 2D/3D 叠绘（联合 PCA，坐标一致）---------
# 选一批词（基于中间一个 α 的近邻，例如 0.75；若不存在就用第一个）
alpha_ref = 0.75 if 0.75 in models_kv else ALPHAS[0]
wv_ref = models_kv[alpha_ref]
seed_words = ["king","queen","man","woman","london","paris","france","england",
              "computer","software","data","science","music","art","city","country","river","mountain"]
WORDS = collect_subset(wv_ref, seed_words, nn_per_seed=12, limit=300)

# 联合 PCA：把 5 组 [len(WORDS) x D] 拼起来做一次 PCA
blocks = []
for alpha in ALPHAS:
    wv = models_kv[alpha]
    X = np.stack([wv[w] if w in wv else np.zeros(VECTOR_SIZE, dtype=np.float32) for w in WORDS], axis=0)
    blocks.append(X)
X_all = np.vstack(blocks)                               # [5*W, D]
Z2_all = pca_reduce(X_all, n_components=2)              # [5*W, 2]
Z3_all = pca_reduce(X_all, n_components=3)              # [5*W, 3]

# 按块切回每个 α
W = len(WORDS)
coords2 = {alpha: Z2_all[i*W:(i+1)*W] for i, alpha in enumerate(ALPHAS)}
coords3 = {alpha: Z3_all[i*W:(i+1)*W] for i, alpha in enumerate(ALPHAS)}

# 2D 叠绘
plt.figure(figsize=(12,9))
for alpha in ALPHAS:
    Z = coords2[alpha]
    plt.scatter(Z[:,0], Z[:,1], s=12, label=f"α={alpha}", alpha=0.8)
# 只标注部分词（种子），避免过密
for w in seed_words:
    if w in WORDS:
        j = WORDS.index(w)
        # 用参考 α 的坐标标注，防止多次重叠
        z = coords2[alpha_ref][j]
        plt.annotate(w, (z[0], z[1]), fontsize=8, alpha=0.9)
plt.title("Word Embeddings (PCA 2D) — overlay of 5 α")
plt.tight_layout()
out_2d = OUT / "emb_compare_2d_5alpha.png"
plt.legend()
plt.savefig(out_2d, dpi=200); plt.close()
print(f"✅ Saved 2D overlay → {out_2d}")

# 3D 叠绘
from mpl_toolkits.mplot3d import Axes3D  # noqa
fig = plt.figure(figsize=(12,9))
ax = fig.add_subplot(111, projection='3d')
for alpha in ALPHAS:
    Z = coords3[alpha]
    ax.scatter(Z[:,0], Z[:,1], Z[:,2], s=12, label=f"α={alpha}", alpha=0.85)
# 参考 α 的种子词标注
for w in seed_words:
    if w in WORDS:
        j = WORDS.index(w)
        z = coords3[alpha_ref][j]
        ax.text(z[0], z[1], z[2], w, fontsize=7)
ax.set_title("Word Embeddings (PCA 3D) — overlay of 5 α")
plt.tight_layout()
out_3d = OUT / "emb_compare_3d_5alpha.png"
plt.legend()
plt.savefig(out_3d, dpi=200); plt.close()
print(f"✅ Saved 3D overlay → {out_3d}")

print("\nAll done.")


[train] alpha=0.5 | ns_exponent=0.5 | epochs=10


KeyboardInterrupt: 

In [2]:
# ===== alpha_sweep_w2v_text8_with_loss_and_grads.py =====
import os, time, math, re
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
from gensim.models import Word2Vec, KeyedVectors
from gensim.models.callbacks import CallbackAny2Vec

# ---------------- 路径与超参 ----------------
TEXT8 = Path("data/text8")
assert TEXT8.exists(), "缺少 data/text8，请先下载放到 data/text8"

OUT = Path("outputs"); OUT.mkdir(exist_ok=True)
ALPHAS = [0.5, 0.6, 0.75, 0.9, 1.0]   # 5个 α
VECTOR_SIZE = 300
WINDOW = 5
NEGATIVE = 10
SAMPLE = 1e-3
MIN_COUNT = 5
EPOCHS = 10
SEED = 42
WORKERS = (os.cpu_count() and max(1, os.cpu_count()-1)) or 1

# 梯度探针采样规模（适当即可，过大计算会慢）
GRAD_PROBE_SAMPLES = 1000    # 每个 epoch 采样多少正样本对 (w,c)
GRAD_PROBE_NEG_K   = NEGATIVE

# ---------------- 读取 text8 → “伪句子” ----------------
def read_text8(path: Path, chunk=10_000):
    with open(path, "r", encoding="utf-8") as f:
        toks = f.read().strip().split()
    for i in range(0, len(toks), chunk):
        yield toks[i:i+chunk]

SENTS = list(read_text8(TEXT8))

# ---------------- 简易工具 ----------------
def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def sample_positive_pairs_indices(sents, word2id, window, n_samples, rng):
    """随机抽 n_samples 个 (w,c) 索引对，使用动态窗口；过滤 OOV。"""
    pairs = []
    V = len(word2id)
    for _ in range(n_samples * 3):  # 多采一点尝试，直到凑够 n_samples
        sent = sents[rng.integers(0, len(sents))]
        if len(sent) < 3:
            continue
        i = rng.integers(0, len(sent))
        w = sent[i]
        if w not in word2id:
            continue
        r = int(rng.integers(1, window + 1))
        left = max(0, i - r); right = min(len(sent), i + r + 1)
        ctx_positions = [j for j in range(left, right) if j != i]
        if not ctx_positions:
            continue
        j = ctx_positions[rng.integers(0, len(ctx_positions))]
        c = sent[j]
        if c not in word2id:
            continue
        pairs.append((word2id[w], word2id[c]))
        if len(pairs) >= n_samples:
            break
    return pairs

def build_neg_sampler_probs(model, alpha):
    """按 α 计算负采样分布 P_neg ∝ count^α （按词的索引顺序）。"""
    wv = model.wv
    counts = np.array([wv.get_vecattr(w, "count") for w in wv.index_to_key], dtype=np.float64)
    probs = counts ** float(alpha)
    probs_sum = probs.sum()
    if probs_sum <= 0:
        probs = np.ones_like(probs) / len(probs)
    else:
        probs /= probs_sum
    return probs

def probe_grad_norms(model, sents, alpha, window, n_samples=1000, K=10, seed=0):
    """
    对当前模型参数做一次 SGNS 梯度探针：
      返回 {'vw': mean ||dL/dv_w||, 'uc': mean ||dL/du_c||, 'un': mean ||dL/du_n||}
    """
    rng = np.random.default_rng(seed)
    wv = model.wv
    V = len(wv.index_to_key)
    word2id = wv.key_to_index
    V_in  = wv.vectors                         # [V, D]  输入向量 v_w
    U_out = model.syn1neg                      # [V, D]  输出向量 u_*（负采样使用）
    assert U_out is not None, "model.syn1neg 缺失（确保 negative>0）"

    # 采样正样本对 (w,c)
    pairs = sample_positive_pairs_indices(sents, word2id, window, n_samples, rng)
    if not pairs:
        return {'vw': None, 'uc': None, 'un': None}

    # 负采样分布
    p_neg = build_neg_sampler_probs(model, alpha)

    vw_norms, uc_norms, un_norms = [], [], []

    for (wi, ci) in pairs:
        v_w = V_in[wi]             # (D,)
        u_c = U_out[ci]            # (D,)
        # 采 K 个负样本索引（允许重复，不排除 c，本探针近似即可）
        neg_idx = rng.choice(len(p_neg), size=K, replace=True, p=p_neg)
        u_neg = U_out[neg_idx]     # (K, D)

        # 打分
        s_pos = float(np.dot(u_c, v_w))
        s_neg = u_neg @ v_w        # (K,)

        # sigmoids
        sp = sigmoid(s_pos)        # 标量
        sn = sigmoid(s_neg)        # (K,)

        # 梯度（见 SGNS 闭式公式）
        # dL/dv_w = (σ(s_pos)-1)*u_c + Σ σ(s_neg_i)*u_neg_i
        grad_vw = (sp - 1.0) * u_c + (sn.reshape(-1,1) * u_neg).sum(axis=0)
        # dL/du_c = (σ(s_pos)-1)*v_w
        grad_uc = (sp - 1.0) * v_w
        # dL/du_{n_i} = σ(s_neg_i) * v_w
        grad_un_each = (sn.reshape(-1,1) * v_w.reshape(1,-1))  # (K, D)

        vw_norms.append(np.linalg.norm(grad_vw))
        uc_norms.append(np.linalg.norm(grad_uc))
        un_norms.append(np.linalg.norm(grad_un_each, axis=1).mean())  # 对 K 个负样本取均值

    return {
        'vw': float(np.mean(vw_norms)),
        'uc': float(np.mean(uc_norms)),
        'un': float(np.mean(un_norms)),
    }

# ---------------- 训练回调：记录损失 + 梯度探针 ----------------
class LossAndGradLogger(CallbackAny2Vec):
    def __init__(self, sents, alpha, window, n_samples=1000, K=10, seed=0):
        self.prev_loss = 0.0
        self.losses = []
        self.grads_vw = []
        self.grads_uc = []
        self.grads_un = []
        self.sents = sents
        self.alpha = alpha
        self.window = window
        self.n_samples = n_samples
        self.K = K
        self.seed = seed

    def on_epoch_end(self, model):
        # 累计损失差分 → 本 epoch 增量
        cur = model.get_latest_training_loss()
        self.losses.append(cur - self.prev_loss)
        self.prev_loss = cur
        # 梯度探针
        stats = probe_grad_norms(
            model, self.sents, self.alpha, self.window,
            n_samples=self.n_samples, K=self.K, seed=self.seed
        )
        self.grads_vw.append(stats['vw'])
        self.grads_uc.append(stats['uc'])
        self.grads_un.append(stats['un'])

# ---------------- 可视化工具：子集+PCA ----------------
def collect_subset(wv, seed_words, nn_per_seed=12, limit=300):
    chosen, seen = [], set()
    seeds = [w for w in seed_words if w in wv]
    for s in seeds:
        if s not in seen:
            chosen.append(s); seen.add(s)
        for w, _ in wv.most_similar(s, topn=nn_per_seed):
            if w not in seen:
                chosen.append(w); seen.add(w)
            if len(chosen) >= limit: break
        if len(chosen) >= limit: break
    if len(chosen) < limit:
        for w in wv.index_to_key:
            if w not in seen:
                chosen.append(w); seen.add(w)
            if len(chosen) >= limit: break
    return chosen

def pca_reduce(X, n_components=2):
    Xc = X - X.mean(axis=0, keepdims=True)
    U, S, Vt = np.linalg.svd(Xc, full_matrices=False)
    return Xc @ Vt[:n_components].T

# ---------------- 主流程：训练五个 α，记录损失/梯度并保存模型 ----------------
loss_curves = {}          # alpha -> [E]
grad_curves = {}          # alpha -> dict{'vw':[E],'uc':[E],'un':[E]}
models_kv = {}            # alpha -> KeyedVectors

for alpha in ALPHAS:
    print(f"[train] alpha={alpha} | epochs={EPOCHS}")
    cb = LossAndGradLogger(
        sents=SENTS, alpha=alpha, window=WINDOW,
        n_samples=GRAD_PROBE_SAMPLES, K=GRAD_PROBE_NEG_K,
        seed=SEED
    )
    t0 = time.time()
    # 显式 build_vocab + train，才能在回调中看到逐 epoch 损失
    model = Word2Vec(
        sg=1, vector_size=VECTOR_SIZE, window=WINDOW,
        negative=NEGATIVE, ns_exponent=alpha,
        sample=SAMPLE, min_count=MIN_COUNT,
        workers=WORKERS, seed=SEED,
        compute_loss=True
    )
    model.build_vocab(SENTS)
    model.train(
        SENTS, total_examples=model.corpus_count, epochs=EPOCHS,
        callbacks=[cb]
    )
    print(f"  -> time {(time.time()-t0)/60:.1f} min | loss per epoch: {[round(x,2) for x in cb.losses]}")
    # 保存
    # model.save(str(OUT / f"w2v_text8_alpha{alpha:.3f}.model"))
    # model.wv.save_word2vec_format(str(OUT / f"w2v_text8_alpha{alpha:.3f}.vec"), binary=False)
    models_kv[alpha] = model.wv
    loss_curves[alpha] = cb.losses
    grad_curves[alpha] = {'vw': cb.grads_vw, 'uc': cb.grads_uc, 'un': cb.grads_un}

# ---------------- 图：损失曲线（不同 α） ----------------
plt.figure(figsize=(10,6))
for alpha in ALPHAS:
    y = loss_curves[alpha]; x = range(1, len(y)+1)
    plt.plot(x, y, marker='o', linewidth=1, label=f"α={alpha}")
plt.xlabel("Epoch"); plt.ylabel("Training loss increment per epoch (↓)")
plt.title("SGNS on Text8 — Loss vs Epoch (different α)")
plt.grid(alpha=0.3); plt.legend()
plt.tight_layout(); plt.savefig(OUT/"loss_vs_epoch_by_alpha.png", dpi=200); plt.close()
print("✅ Saved → outputs/loss_vs_epoch_by_alpha.png")

# ---------------- 图：梯度范数曲线（不同 α） ----------------
def plot_grad(key, ylabel, out_name):
    plt.figure(figsize=(10,6))
    for alpha in ALPHAS:
        y = grad_curves[alpha][key]; x = range(1, len(y)+1)
        plt.plot(x, y, marker='o', linewidth=1, label=f"α={alpha}")
    plt.xlabel("Epoch"); plt.ylabel(ylabel)
    plt.title(f"SGNS on Text8 — {ylabel} vs Epoch (different α)")
    plt.grid(alpha=0.3); plt.legend()
    plt.tight_layout(); plt.savefig(OUT/out_name, dpi=200); plt.close()
    print(f"✅ Saved → outputs/{out_name}")

plot_grad('vw', "‖∂L/∂v_w‖ (mean over samples) (↓)", "grad_vw_vs_epoch.png")
plot_grad('uc', "‖∂L/∂u_c‖ (mean over samples) (↓)", "grad_uc_vs_epoch.png")
plot_grad('un', "‖∂L/∂u_n‖ (mean over negatives) (↓)", "grad_un_vs_epoch.png")

# ---------------- 词向量可视化：同图叠绘（2D/3D） ----------------
alpha_ref = 0.75 if 0.75 in models_kv else ALPHAS[0]
wv_ref = models_kv[alpha_ref]
seed_words = ["king","queen","man","woman","london","paris","france","england",
              "computer","software","data","science","music","art","city","country","river","mountain"]
WORDS = collect_subset(wv_ref, seed_words, nn_per_seed=12, limit=300)

# 联合 PCA（保证坐标一致）
blocks = []
for alpha in ALPHAS:
    wv = models_kv[alpha]
    X = np.stack([wv[w] if w in wv else np.zeros(VECTOR_SIZE, dtype=np.float32) for w in WORDS], axis=0)
    blocks.append(X)
X_all = np.vstack(blocks)
Z2_all = pca_reduce(X_all, n_components=2)
Z3_all = pca_reduce(X_all, n_components=3)

W = len(WORDS)
coords2 = {alpha: Z2_all[i*W:(i+1)*W] for i, alpha in enumerate(ALPHAS)}
coords3 = {alpha: Z3_all[i*W:(i+1)*W] for i, alpha in enumerate(ALPHAS)}

# 2D overlay
plt.figure(figsize=(12,9))
for alpha in ALPHAS:
    Z = coords2[alpha]
    plt.scatter(Z[:,0], Z[:,1], s=12, label=f"α={alpha}", alpha=0.8)
# 只标注种子词（参考 α 的坐标），避免过密
for w in seed_words:
    if w in WORDS:
        j = WORDS.index(w)
        z = coords2[alpha_ref][j]
        plt.annotate(w, (z[0], z[1]), fontsize=8, alpha=0.9)
plt.title("Word Embeddings (PCA 2D) — overlay of 5 α")
plt.tight_layout()
plt.legend()
plt.savefig(OUT/"emb_compare_2d_5alpha.png", dpi=200); plt.close()
print("✅ Saved → outputs/emb_compare_2d_5alpha.png")

# 3D overlay
from mpl_toolkits.mplot3d import Axes3D  # noqa
fig = plt.figure(figsize=(12,9))
ax = fig.add_subplot(111, projection='3d')
for alpha in ALPHAS:
    Z = coords3[alpha]
    ax.scatter(Z[:,0], Z[:,1], Z[:,2], s=12, label=f"α={alpha}", alpha=0.85)
for w in seed_words:
    if w in WORDS:
        j = WORDS.index(w)
        z = coords3[alpha_ref][j]
        ax.text(z[0], z[1], z[2], w, fontsize=7)
ax.set_title("Word Embeddings (PCA 3D) — overlay of 5 α")
plt.tight_layout(); plt.legend()
plt.savefig(OUT/"emb_compare_3d_5alpha.png", dpi=200); plt.close()
print("✅ Saved → outputs/emb_compare_3d_5alpha.png")

print("\nAll done.")


[train] alpha=0.5 | epochs=10
  -> time 4.2 min | loss per epoch: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[train] alpha=0.6 | epochs=10
  -> time 4.3 min | loss per epoch: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[train] alpha=0.75 | epochs=10
  -> time 4.2 min | loss per epoch: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[train] alpha=0.9 | epochs=10
  -> time 4.4 min | loss per epoch: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[train] alpha=1.0 | epochs=10
  -> time 5.0 min | loss per epoch: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
✅ Saved → outputs/loss_vs_epoch_by_alpha.png
✅ Saved → outputs/grad_vw_vs_epoch.png
✅ Saved → outputs/grad_uc_vs_epoch.png
✅ Saved → outputs/grad_un_vs_epoch.png
✅ Saved → outputs/emb_compare_2d_5alpha.png
✅ Saved → outputs/emb_compare_3d_5alpha.png

All done.
