In [1]:
# eval_suite.py
import os, sys, math, json, random
import numpy as np

# ---------- 尝试用 gensim 读取 .vec；失败则手写解析 ----------
def load_vectors(vec_path):
    """Load word vectors from a word2vec-style text file.

    gensim's ``KeyedVectors.load_word2vec_format`` is tried first. If gensim
    cannot be imported or raises for any reason, a warning is printed and a
    plain-text parser is used instead.

    The fallback parser accepts an optional ``"V D"`` header line followed by
    one ``word v1 v2 ...`` row per line. Rows with fewer than three fields
    (including blank lines) are skipped, wherever they appear.

    Args:
        vec_path: Path to the text vector file.

    Returns:
        ``(vocab, W)``: the words and a 2-D array whose i-th row is the
        vector of ``vocab[i]``. When the fallback parser finds no rows it
        returns ``([], W)`` with ``W.shape == (0, 0)``.
    """
    try:
        from gensim.models import KeyedVectors
        wv = KeyedVectors.load_word2vec_format(vec_path, binary=False)
        vocab = wv.index_to_key
        W = np.stack([wv[w] for w in vocab], axis=0)
        return vocab, W
    except Exception as e:
        # Deliberate best-effort: any gensim problem falls through to the
        # hand-written parser below.
        print("[WARN] gensim 读取失败，改用手写解析:", e)

    vocab, vecs = [], []

    def add_row(parts):
        # Same filter for every row, so a blank/short first line no longer crashes.
        if len(parts) < 3:
            return
        vocab.append(parts[0])
        vecs.append([float(x) for x in parts[1:]])

    with open(vec_path, "r", encoding="utf-8", errors="ignore") as f:
        first = f.readline().split()
        # A first line made of exactly two integers is the "V D" header.
        is_header = len(first) == 2 and all(s.isdigit() for s in first)
        if not is_header:
            add_row(first)
        for line in f:
            add_row(line.split())

    if not vecs:
        # Keep the result 2-D so callers can still read W.shape[1].
        return vocab, np.zeros((0, 0), dtype=np.float32)
    W = np.array(vecs, dtype=np.float32)
    return vocab, W

def l2_normalize(mat, eps=1e-9):
    """Divide every row of ``mat`` by its Euclidean norm plus ``eps``.

    ``eps`` keeps all-zero rows from dividing by zero.
    """
    row_norms = np.linalg.norm(mat, axis=1, keepdims=True)
    return mat / (row_norms + eps)

def cosine_matrix(A, B):
    """Return the matrix of dot products between rows of ``A`` and rows of ``B``.

    The result is a cosine-similarity matrix only if both inputs were
    row-normalised beforehand; no normalisation happens here.
    """
    B_t = B.T
    return A @ B_t

# ---------- Intrinsic 1: kNN Coherence ----------
def knn_coherence(W, sample_size=1000, k=10, seed=42):
    """Mean cosine similarity between sampled vectors and their top-k neighbours.

    A random subset of rows is drawn without replacement, and neighbours are
    searched only inside that subset (not the full vocabulary).

    Args:
        W: 2-D array of word vectors, one row per word.
        sample_size: Maximum number of rows to sample.
        k: Number of neighbours per row. Clamped to ``n - 1`` where ``n`` is
            the number of sampled rows, so a row can never count itself.
        seed: Seed for the sampling RNG.

    Returns:
        The mean of the top-k similarities over all sampled rows, as a float.

    Raises:
        ValueError: If fewer than two rows are sampled or ``k < 1``.
    """
    rng = np.random.default_rng(seed)
    V = W.shape[0]
    idx = rng.choice(V, size=min(sample_size, V), replace=False)
    n = len(idx)
    if n < 2 or k < 1:
        raise ValueError(
            f"knn_coherence needs at least 2 sampled vectors and k >= 1 (got n={n}, k={k})"
        )
    # With k >= n the masked diagonal (-inf) would leak into the top-k.
    k = min(k, n - 1)
    X = l2_normalize(W[idx])
    S = cosine_matrix(X, X)
    # Exclude self-similarity: -inf on the diagonal can never be in the top-k.
    np.fill_diagonal(S, -np.inf)
    # The last k columns after partitioning hold the (unsorted) top-k values.
    part = np.partition(S, -k, axis=1)[:, -k:]
    return float(part.mean())

# ---------- Intrinsic 2: Hubness ----------
def hubness(W, sample_size=2000, k=10, seed=42):
    """Measure how unevenly sampled vectors appear in each other's top-k lists.

    A random subset of rows is drawn without replacement. For every sampled
    row the k most similar other sampled rows are found, and each row's
    number of appearances in those lists is counted.

    Args:
        W: 2-D array of word vectors, one row per word.
        sample_size: Maximum number of rows to sample.
        k: Number of neighbours per row. Clamped to ``n - 1`` where ``n`` is
            the number of sampled rows, so a row can never select itself.
        seed: Seed for the sampling RNG.

    Returns:
        ``{"skew": ..., "gini": ...}``: skewness (third central moment over
        the cubed standard deviation) and Gini coefficient of the normalised
        appearance counts. A Gini closer to 0 means a more even distribution.

    Raises:
        ValueError: If fewer than two rows are sampled or ``k < 1``.
    """
    rng = np.random.default_rng(seed)
    V = W.shape[0]
    idx = rng.choice(V, size=min(sample_size, V), replace=False)
    n = len(idx)
    if n < 2 or k < 1:
        raise ValueError(
            f"hubness needs at least 2 sampled vectors and k >= 1 (got n={n}, k={k})"
        )
    # With k >= n the masked diagonal would be selected as a "neighbour".
    k = min(k, n - 1)
    X = l2_normalize(W[idx])
    S = cosine_matrix(X, X)
    np.fill_diagonal(S, -np.inf)
    # Count how often each point is chosen as a top-k neighbour by the others.
    topk_idx = np.argpartition(S, -k, axis=1)[:, -k:]  # [n, k]
    counts = np.bincount(topk_idx.reshape(-1), minlength=n).astype(np.float64)
    # Normalise, then skewness = third central moment / std^3.
    c = counts / counts.sum()
    mu = c.mean()
    sd = c.std() + 1e-12
    skew = (((c - mu) ** 3).mean()) / (sd ** 3)
    # Gini coefficient over the ascending-sorted shares.
    sorted_c = np.sort(c)
    gini = 1 - 2 * np.sum((n - np.arange(1, n + 1) + 0.5) * sorted_c) / (n * sorted_c.sum() + 1e-12)
    return {"skew": float(skew), "gini": float(gini)}

# ---------- Intrinsic 3: Isotropy ----------
def isotropy(W):
    """Return 1 minus the norm of the mean unit-normalised row, clipped to [0, 1]."""
    unit_rows = l2_normalize(W)
    mean_length = float(np.linalg.norm(unit_rows.mean(axis=0)))
    score = 1.0 - mean_length
    # Clip to [0, 1].
    return max(0.0, min(1.0, score))

# ---------- Similarity (Spearman, 无 SciPy 版) ----------
def _rankdata(a):
    # 稳健的秩（考虑并列：取平均秩）
    temp = np.argsort(a)
    ranks = np.empty_like(temp, dtype=np.float64)
    ranks[temp] = np.arange(len(a))
    # 处理 ties
    unique, inv, counts = np.unique(a, return_inverse=True, return_counts=True)
    cum = np.cumsum(counts)
    start = cum - counts
    avg = (start + cum - 1) / 2.0
    return avg[inv]

def spearmanr_no_scipy(x, y):
    """Spearman correlation of ``x`` and ``y``: Pearson correlation of their ranks.

    A small constant is added to the denominator, so constant inputs give 0.0
    instead of dividing by zero.
    """
    centered = []
    for values in (x, y):
        ranks = _rankdata(values)
        centered.append(ranks - ranks.mean())
    cx, cy = centered
    scale = np.linalg.norm(cx) * np.linalg.norm(cy) + 1e-12
    return float(np.dot(cx, cy) / scale)

def evaluate_similarity(vec_path, sim_path):
    """Score the vectors against a word-similarity dataset with Spearman correlation.

    The dataset is a CSV/TSV/whitespace-separated file whose first three
    fields per line are ``word1, word2, score``. Lines with fewer than three
    fields, and lines whose score is not numeric (such as a header row), are
    skipped.

    Args:
        vec_path: Path to the vector file, loaded with ``load_vectors``.
        sim_path: Path to the similarity dataset.

    Returns:
        A dict with ``"spearman"`` (float, or ``None`` when no pair is
        usable), ``"used"`` (pairs with both words in the vocabulary) and
        ``"total"`` (rows with a numeric score, in or out of vocabulary).
    """
    vocab, W = load_vectors(vec_path)
    word2id = {w: i for i, w in enumerate(vocab)}
    pairs, gold = [], []
    total = 0
    with open(sim_path, "r", encoding="utf-8") as f:
        for line in f:
            # Commas become whitespace so CSV and TSV split the same way.
            parts = line.replace(",", "\t").split()
            if len(parts) < 3:
                continue
            a, b, s = parts[:3]
            try:
                score = float(s)
            except ValueError:
                # Header row or malformed score: skip instead of crashing.
                continue
            total += 1
            if a in word2id and b in word2id:
                pairs.append((word2id[a], word2id[b]))
                gold.append(score)
    if not pairs:
        return {"spearman": None, "used": 0, "total": total}
    X = l2_normalize(W)
    sim = [float(np.dot(X[i], X[j])) for i, j in pairs]
    rho = spearmanr_no_scipy(np.array(sim), np.array(gold))
    return {"spearman": rho, "used": len(pairs), "total": total}

# ---------- Analogy (Google questions-words.txt 格式) ----------
def evaluate_analogy(vec_path, qwords_path, case_insensitive=True):
    """Compute analogy accuracy on a file of ``a b c d`` questions.

    Lines starting with ``:`` (category headers) and lines that do not have
    exactly four tokens are ignored, as are questions containing any word
    missing from the vocabulary. For each remaining question the prediction
    is the row most similar to the normalised ``b - a + c``, with a, b and c
    themselves excluded.

    Args:
        vec_path: Path to the vector file, loaded with ``load_vectors``.
        qwords_path: Path to the questions file.
        case_insensitive: When true, question words are lower-cased before
            lookup (the vocabulary itself is not changed).

    Returns:
        ``{"accuracy": ..., "used": ...}``; accuracy is ``None`` when no
        question could be evaluated.
    """
    vocab, W = load_vectors(vec_path)
    word2id = {w: i for i, w in enumerate(vocab)}
    X = l2_normalize(W)
    n_used = 0
    n_hit = 0
    with open(qwords_path, "r", encoding="utf-8", errors="ignore") as fh:
        for raw in fh:
            if raw.startswith(":"):  # category header
                continue
            words = raw.strip().split()
            if len(words) != 4:
                continue
            if case_insensitive:
                words = [w.lower() for w in words]
            if any(w not in word2id for w in words):
                continue
            ia, ib, ic, i_gold = (word2id[w] for w in words)
            query = X[ib] - X[ia] + X[ic]
            query /= (np.linalg.norm(query) + 1e-9)
            scores = X @ query
            # Mask the three question words so they cannot be predicted.
            scores[[ia, ib, ic]] = -np.inf
            n_used += 1
            if np.argmax(scores) == i_gold:
                n_hit += 1
    accuracy = (n_hit / n_used) if n_used > 0 else None
    return {"accuracy": accuracy, "used": n_used}

# ---------- CLI ----------
def main():
    """CLI entry point: load vectors, print intrinsic metrics, then optional benchmarks.

    The similarity and analogy evaluations run only when their paths are given.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--vec", type=str, default="outputs/w2v_text8_sgns.vec")
    parser.add_argument("--similarity", type=str, default="", help="词相似度数据集路径（csv/tsv: w1,w2,score）")
    parser.add_argument("--analogy", type=str, default="", help="Google questions-words.txt 路径")
    parser.add_argument("--sample-size", type=int, default=1000)
    parser.add_argument("--knn", type=int, default=10)
    # Unrecognised arguments are tolerated rather than rejected.
    args, _unknown = parser.parse_known_args()

    print(f"[Load] {args.vec}")
    vocab, W = load_vectors(args.vec)
    print(f"Vocab={len(vocab)}, dim={W.shape[1]}")

    print("\n[Intrinsic] kNN Coherence …")
    coh = knn_coherence(W, sample_size=args.sample_size, k=args.knn)
    print(f"  coherence@{args.knn} = {coh:.4f}  (越高越紧密)")

    print("\n[Intrinsic] Hubness …")
    # Hubness uses twice the coherence sample, capped at 2000 rows.
    hub_sample = min(2000, args.sample_size * 2)
    hub = hubness(W, sample_size=hub_sample, k=args.knn)
    print(f"  hubness skew={hub['skew']:.4f}  gini={hub['gini']:.4f}  (越低越好)")

    print("\n[Intrinsic] Isotropy …")
    iso = isotropy(W)
    print(f"  isotropy score = {iso:.4f}  (越接近 1 越好)")

    if args.similarity:
        print("\n[Similarity] Spearman ρ …")
        sim = evaluate_similarity(args.vec, args.similarity)
        if sim["spearman"] is not None:
            print(f"  spearman = {sim['spearman']:.4f}  (越高越好)  | used={sim['used']}")
        else:
            print("  无有效样本（可能是 OOV 太多）")

    if args.analogy:
        print("\n[Analogy] 3CosAdd Accuracy …")
        ana = evaluate_analogy(args.vec, args.analogy)
        if ana["accuracy"] is not None:
            print(f"  accuracy = {ana['accuracy']*100:.2f}%  | used={ana['used']}")
        else:
            print("  无有效样本（可能是 OOV 太多）")

# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()


[Load] outputs/w2v_text8_sgns.vec
Vocab=71290, dim=300

[Intrinsic] kNN Coherence …
  coherence@10 = 0.6936  (越高越紧密)

[Intrinsic] Hubness …
  hubness skew=2.1793  gini=0.6105  (越低越好)

[Intrinsic] Isotropy …
  isotropy score = 0.3758  (越接近 1 越好)


# 复评（ABTT）

In [2]:
# postprocess_abtt.py
import numpy as np, os
from pathlib import Path

def load_vec(path):
    """Parse a word2vec-style text file into a vocabulary and a matrix.

    An optional ``"V D"`` header line (exactly two integers) is recognised
    and ignored. Every other line is ``word v1 v2 ...``; lines with fewer
    than three fields (including blank lines) are skipped, wherever they
    appear.

    Args:
        path: Path to the text vector file.

    Returns:
        ``(vocab, W)``: the list of words and a float32 array whose i-th row
        is the vector of ``vocab[i]``. A file with no usable rows yields
        ``([], W)`` with ``W.shape == (0, 0)``.
    """
    vocab, vecs = [], []

    def add_row(parts):
        # Same filter for every row, so a blank/short first line no longer crashes.
        if len(parts) < 3:
            return
        vocab.append(parts[0])
        vecs.append([float(x) for x in parts[1:]])

    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        head = f.readline().split()
        is_header = len(head) == 2 and all(s.isdigit() for s in head)
        if not is_header:
            add_row(head)
        for line in f:
            add_row(line.split())

    if not vecs:
        # Keep the result 2-D so callers can still read W.shape[1].
        return vocab, np.zeros((0, 0), dtype=np.float32)
    W = np.asarray(vecs, dtype=np.float32)
    return vocab, W

def save_vec(path, vocab, W):
    """Write vectors as text: a ``"V D"`` header, then one ``word v1 ... vD`` row per word.

    Values are formatted with six decimal places.
    """
    with open(path, "w", encoding="utf-8") as out:
        out.write(str(len(vocab)) + " " + str(W.shape[1]) + "\n")
        for word, row in zip(vocab, W):
            values = " ".join(format(x, ".6f") for x in row)
            out.write(word + " " + values + "\n")

def l2norm(W, eps=1e-9):
    """Divide every row of ``W`` by its Euclidean norm plus ``eps``.

    ``eps`` keeps all-zero rows from dividing by zero.
    """
    row_norms = np.linalg.norm(W, axis=1, keepdims=True)
    return W / (row_norms + eps)

def abtt(W, r=2):
    """Post-process vectors by removing the mean and the top ``r`` principal directions.

    Steps: unit-normalise rows, subtract the mean row, take the SVD of the
    centred matrix, project out the first ``r`` right singular vectors, and
    unit-normalise again.

    Args:
        W: 2-D array of word vectors, one row per word.
        r: Number of leading principal directions to remove.

    Returns:
        The processed matrix, same shape as ``W``.
    """
    unit = l2norm(W)
    centered = unit - unit.mean(axis=0, keepdims=True)
    # Only the right singular vectors are needed.
    _, _, vt = np.linalg.svd(centered, full_matrices=False)
    top = vt[:r].T   # [D, r]
    residual = centered - (centered @ top) @ top.T
    return l2norm(residual)

def main(in_path="outputs/w2v_text8_sgns.vec", out_path="outputs/w2v_text8_sgns_abtt.vec", r=2):
    """Load vectors, apply ``abtt`` and write the result.

    Args:
        in_path: Input vector file.
        out_path: Output vector file. Its parent directory is created if
            missing.
        r: Number of principal directions passed to ``abtt``.
    """
    vocab, W = load_vec(in_path)
    W2 = abtt(W, r=r)
    # Create the directory that will actually hold the output, rather than
    # a hard-coded "outputs" folder.
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    save_vec(out_path, vocab, W2)
    print(f"✅ ABTT done: r={r}\n→ {out_path}")

if __name__ == "__main__":
    # Run directly: python postprocess_abtt.py
    main()


✅ ABTT done: r=2
→ outputs/w2v_text8_sgns_abtt.vec


In [3]:
# eval_suite.py
import os, sys, math, json, random
import numpy as np

# ---------- 尝试用 gensim 读取 .vec；失败则手写解析 ----------
def load_vectors(vec_path):
    """Load word vectors from a word2vec-style text file.

    gensim's ``KeyedVectors.load_word2vec_format`` is tried first. If gensim
    cannot be imported or raises for any reason, a warning is printed and a
    plain-text parser is used instead.

    The fallback parser accepts an optional ``"V D"`` header line followed by
    one ``word v1 v2 ...`` row per line. Rows with fewer than three fields
    (including blank lines) are skipped, wherever they appear.

    Args:
        vec_path: Path to the text vector file.

    Returns:
        ``(vocab, W)``: the words and a 2-D array whose i-th row is the
        vector of ``vocab[i]``. When the fallback parser finds no rows it
        returns ``([], W)`` with ``W.shape == (0, 0)``.
    """
    try:
        from gensim.models import KeyedVectors
        wv = KeyedVectors.load_word2vec_format(vec_path, binary=False)
        vocab = wv.index_to_key
        W = np.stack([wv[w] for w in vocab], axis=0)
        return vocab, W
    except Exception as e:
        # Deliberate best-effort: any gensim problem falls through to the
        # hand-written parser below.
        print("[WARN] gensim 读取失败，改用手写解析:", e)

    vocab, vecs = [], []

    def add_row(parts):
        # Same filter for every row, so a blank/short first line no longer crashes.
        if len(parts) < 3:
            return
        vocab.append(parts[0])
        vecs.append([float(x) for x in parts[1:]])

    with open(vec_path, "r", encoding="utf-8", errors="ignore") as f:
        first = f.readline().split()
        # A first line made of exactly two integers is the "V D" header.
        is_header = len(first) == 2 and all(s.isdigit() for s in first)
        if not is_header:
            add_row(first)
        for line in f:
            add_row(line.split())

    if not vecs:
        # Keep the result 2-D so callers can still read W.shape[1].
        return vocab, np.zeros((0, 0), dtype=np.float32)
    W = np.array(vecs, dtype=np.float32)
    return vocab, W

def l2_normalize(mat, eps=1e-9):
    """Divide every row of ``mat`` by its Euclidean norm plus ``eps``.

    ``eps`` keeps all-zero rows from dividing by zero.
    """
    row_norms = np.linalg.norm(mat, axis=1, keepdims=True)
    return mat / (row_norms + eps)

def cosine_matrix(A, B):
    """Return the matrix of dot products between rows of ``A`` and rows of ``B``.

    The result is a cosine-similarity matrix only if both inputs were
    row-normalised beforehand; no normalisation happens here.
    """
    B_t = B.T
    return A @ B_t

# ---------- Intrinsic 1: kNN Coherence ----------
def knn_coherence(W, sample_size=1000, k=10, seed=42):
    """Mean cosine similarity between sampled vectors and their top-k neighbours.

    A random subset of rows is drawn without replacement, and neighbours are
    searched only inside that subset (not the full vocabulary).

    Args:
        W: 2-D array of word vectors, one row per word.
        sample_size: Maximum number of rows to sample.
        k: Number of neighbours per row. Clamped to ``n - 1`` where ``n`` is
            the number of sampled rows, so a row can never count itself.
        seed: Seed for the sampling RNG.

    Returns:
        The mean of the top-k similarities over all sampled rows, as a float.

    Raises:
        ValueError: If fewer than two rows are sampled or ``k < 1``.
    """
    rng = np.random.default_rng(seed)
    V = W.shape[0]
    idx = rng.choice(V, size=min(sample_size, V), replace=False)
    n = len(idx)
    if n < 2 or k < 1:
        raise ValueError(
            f"knn_coherence needs at least 2 sampled vectors and k >= 1 (got n={n}, k={k})"
        )
    # With k >= n the masked diagonal (-inf) would leak into the top-k.
    k = min(k, n - 1)
    X = l2_normalize(W[idx])
    S = cosine_matrix(X, X)
    # Exclude self-similarity: -inf on the diagonal can never be in the top-k.
    np.fill_diagonal(S, -np.inf)
    # The last k columns after partitioning hold the (unsorted) top-k values.
    part = np.partition(S, -k, axis=1)[:, -k:]
    return float(part.mean())

# ---------- Intrinsic 2: Hubness ----------
def hubness(W, sample_size=2000, k=10, seed=42):
    """Measure how unevenly sampled vectors appear in each other's top-k lists.

    A random subset of rows is drawn without replacement. For every sampled
    row the k most similar other sampled rows are found, and each row's
    number of appearances in those lists is counted.

    Args:
        W: 2-D array of word vectors, one row per word.
        sample_size: Maximum number of rows to sample.
        k: Number of neighbours per row. Clamped to ``n - 1`` where ``n`` is
            the number of sampled rows, so a row can never select itself.
        seed: Seed for the sampling RNG.

    Returns:
        ``{"skew": ..., "gini": ...}``: skewness (third central moment over
        the cubed standard deviation) and Gini coefficient of the normalised
        appearance counts. A Gini closer to 0 means a more even distribution.

    Raises:
        ValueError: If fewer than two rows are sampled or ``k < 1``.
    """
    rng = np.random.default_rng(seed)
    V = W.shape[0]
    idx = rng.choice(V, size=min(sample_size, V), replace=False)
    n = len(idx)
    if n < 2 or k < 1:
        raise ValueError(
            f"hubness needs at least 2 sampled vectors and k >= 1 (got n={n}, k={k})"
        )
    # With k >= n the masked diagonal would be selected as a "neighbour".
    k = min(k, n - 1)
    X = l2_normalize(W[idx])
    S = cosine_matrix(X, X)
    np.fill_diagonal(S, -np.inf)
    # Count how often each point is chosen as a top-k neighbour by the others.
    topk_idx = np.argpartition(S, -k, axis=1)[:, -k:]  # [n, k]
    counts = np.bincount(topk_idx.reshape(-1), minlength=n).astype(np.float64)
    # Normalise, then skewness = third central moment / std^3.
    c = counts / counts.sum()
    mu = c.mean()
    sd = c.std() + 1e-12
    skew = (((c - mu) ** 3).mean()) / (sd ** 3)
    # Gini coefficient over the ascending-sorted shares.
    sorted_c = np.sort(c)
    gini = 1 - 2 * np.sum((n - np.arange(1, n + 1) + 0.5) * sorted_c) / (n * sorted_c.sum() + 1e-12)
    return {"skew": float(skew), "gini": float(gini)}

# ---------- Intrinsic 3: Isotropy ----------
def isotropy(W):
    """Return 1 minus the norm of the mean unit-normalised row, clipped to [0, 1]."""
    unit_rows = l2_normalize(W)
    mean_length = float(np.linalg.norm(unit_rows.mean(axis=0)))
    score = 1.0 - mean_length
    # Clip to [0, 1].
    return max(0.0, min(1.0, score))

# ---------- Similarity (Spearman, 无 SciPy 版) ----------
def _rankdata(a):
    # 稳健的秩（考虑并列：取平均秩）
    temp = np.argsort(a)
    ranks = np.empty_like(temp, dtype=np.float64)
    ranks[temp] = np.arange(len(a))
    # 处理 ties
    unique, inv, counts = np.unique(a, return_inverse=True, return_counts=True)
    cum = np.cumsum(counts)
    start = cum - counts
    avg = (start + cum - 1) / 2.0
    return avg[inv]

def spearmanr_no_scipy(x, y):
    """Spearman correlation of ``x`` and ``y``: Pearson correlation of their ranks.

    A small constant is added to the denominator, so constant inputs give 0.0
    instead of dividing by zero.
    """
    centered = []
    for values in (x, y):
        ranks = _rankdata(values)
        centered.append(ranks - ranks.mean())
    cx, cy = centered
    scale = np.linalg.norm(cx) * np.linalg.norm(cy) + 1e-12
    return float(np.dot(cx, cy) / scale)

def evaluate_similarity(vec_path, sim_path):
    """Score the vectors against a word-similarity dataset with Spearman correlation.

    The dataset is a CSV/TSV/whitespace-separated file whose first three
    fields per line are ``word1, word2, score``. Lines with fewer than three
    fields, and lines whose score is not numeric (such as a header row), are
    skipped.

    Args:
        vec_path: Path to the vector file, loaded with ``load_vectors``.
        sim_path: Path to the similarity dataset.

    Returns:
        A dict with ``"spearman"`` (float, or ``None`` when no pair is
        usable), ``"used"`` (pairs with both words in the vocabulary) and
        ``"total"`` (rows with a numeric score, in or out of vocabulary).
    """
    vocab, W = load_vectors(vec_path)
    word2id = {w: i for i, w in enumerate(vocab)}
    pairs, gold = [], []
    total = 0
    with open(sim_path, "r", encoding="utf-8") as f:
        for line in f:
            # Commas become whitespace so CSV and TSV split the same way.
            parts = line.replace(",", "\t").split()
            if len(parts) < 3:
                continue
            a, b, s = parts[:3]
            try:
                score = float(s)
            except ValueError:
                # Header row or malformed score: skip instead of crashing.
                continue
            total += 1
            if a in word2id and b in word2id:
                pairs.append((word2id[a], word2id[b]))
                gold.append(score)
    if not pairs:
        return {"spearman": None, "used": 0, "total": total}
    X = l2_normalize(W)
    sim = [float(np.dot(X[i], X[j])) for i, j in pairs]
    rho = spearmanr_no_scipy(np.array(sim), np.array(gold))
    return {"spearman": rho, "used": len(pairs), "total": total}

# ---------- Analogy (Google questions-words.txt 格式) ----------
def evaluate_analogy(vec_path, qwords_path, case_insensitive=True):
    """Compute analogy accuracy on a file of ``a b c d`` questions.

    Lines starting with ``:`` (category headers) and lines that do not have
    exactly four tokens are ignored, as are questions containing any word
    missing from the vocabulary. For each remaining question the prediction
    is the row most similar to the normalised ``b - a + c``, with a, b and c
    themselves excluded.

    Args:
        vec_path: Path to the vector file, loaded with ``load_vectors``.
        qwords_path: Path to the questions file.
        case_insensitive: When true, question words are lower-cased before
            lookup (the vocabulary itself is not changed).

    Returns:
        ``{"accuracy": ..., "used": ...}``; accuracy is ``None`` when no
        question could be evaluated.
    """
    vocab, W = load_vectors(vec_path)
    word2id = {w: i for i, w in enumerate(vocab)}
    X = l2_normalize(W)
    n_used = 0
    n_hit = 0
    with open(qwords_path, "r", encoding="utf-8", errors="ignore") as fh:
        for raw in fh:
            if raw.startswith(":"):  # category header
                continue
            words = raw.strip().split()
            if len(words) != 4:
                continue
            if case_insensitive:
                words = [w.lower() for w in words]
            if any(w not in word2id for w in words):
                continue
            ia, ib, ic, i_gold = (word2id[w] for w in words)
            query = X[ib] - X[ia] + X[ic]
            query /= (np.linalg.norm(query) + 1e-9)
            scores = X @ query
            # Mask the three question words so they cannot be predicted.
            scores[[ia, ib, ic]] = -np.inf
            n_used += 1
            if np.argmax(scores) == i_gold:
                n_hit += 1
    accuracy = (n_hit / n_used) if n_used > 0 else None
    return {"accuracy": accuracy, "used": n_used}

# ---------- CLI ----------
def main():
    """CLI entry point: load vectors, print intrinsic metrics, then optional benchmarks.

    The similarity and analogy evaluations run only when their paths are given.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--vec", type=str, default="outputs/w2v_text8_sgns_abtt.vec")
    parser.add_argument("--similarity", type=str, default="", help="词相似度数据集路径（csv/tsv: w1,w2,score）")
    parser.add_argument("--analogy", type=str, default="", help="Google questions-words.txt 路径")
    parser.add_argument("--sample-size", type=int, default=1000)
    parser.add_argument("--knn", type=int, default=10)
    # Unrecognised arguments are tolerated rather than rejected.
    args, _unknown = parser.parse_known_args()

    print(f"[Load] {args.vec}")
    vocab, W = load_vectors(args.vec)
    print(f"Vocab={len(vocab)}, dim={W.shape[1]}")

    print("\n[Intrinsic] kNN Coherence …")
    coh = knn_coherence(W, sample_size=args.sample_size, k=args.knn)
    print(f"  coherence@{args.knn} = {coh:.4f}  (越高越紧密)")

    print("\n[Intrinsic] Hubness …")
    # Hubness uses twice the coherence sample, capped at 2000 rows.
    hub_sample = min(2000, args.sample_size * 2)
    hub = hubness(W, sample_size=hub_sample, k=args.knn)
    print(f"  hubness skew={hub['skew']:.4f}  gini={hub['gini']:.4f}  (越低越好)")

    print("\n[Intrinsic] Isotropy …")
    iso = isotropy(W)
    print(f"  isotropy score = {iso:.4f}  (越接近 1 越好)")

    if args.similarity:
        print("\n[Similarity] Spearman ρ …")
        sim = evaluate_similarity(args.vec, args.similarity)
        if sim["spearman"] is not None:
            print(f"  spearman = {sim['spearman']:.4f}  (越高越好)  | used={sim['used']}")
        else:
            print("  无有效样本（可能是 OOV 太多）")

    if args.analogy:
        print("\n[Analogy] 3CosAdd Accuracy …")
        ana = evaluate_analogy(args.vec, args.analogy)
        if ana["accuracy"] is not None:
            print(f"  accuracy = {ana['accuracy']*100:.2f}%  | used={ana['used']}")
        else:
            print("  无有效样本（可能是 OOV 太多）")

# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()


[Load] outputs/w2v_text8_sgns_abtt.vec
Vocab=71290, dim=300

[Intrinsic] kNN Coherence …
  coherence@10 = 0.3983  (越高越紧密)

[Intrinsic] Hubness …
  hubness skew=1.4041  gini=0.4672  (越低越好)

[Intrinsic] Isotropy …
  isotropy score = 0.9673  (越接近 1 越好)
