In [10]:
# eval_authoritative.py
# 评测权威基准：SimLex-999 / WordSim-353 / Google analogies / BATS
# 依赖：numpy, gensim（无需 scipy）。Jupyter 友好（无 argparse）。

import os, json
from pathlib import Path
import numpy as np
from collections import defaultdict, Counter

# ----------------- 通用工具 -----------------
def l2_normalize(mat, eps=1e-9):
    nrm = np.linalg.norm(mat, axis=1, keepdims=True) + eps
    return mat / nrm

def _rankdata(a):
    # 稳健的秩（考虑并列，取平均秩）
    temp = np.argsort(a)
    ranks = np.empty_like(temp, dtype=np.float64)
    ranks[temp] = np.arange(len(a))
    uniq, inv, cnt = np.unique(a, return_inverse=True, return_counts=True)
    cum = np.cumsum(cnt)
    start = cum - cnt
    avg = (start + cum - 1) / 2.0
    return avg[inv]

def spearman_no_scipy(x, y):
    rx, ry = _rankdata(x), _rankdata(y)
    rx -= rx.mean(); ry -= ry.mean()
    denom = (np.linalg.norm(rx) * np.linalg.norm(ry) + 1e-12)
    return float(np.dot(rx, ry) / denom)

# ----------------- 载入词向量 -----------------
def load_vectors(vec_path, lowercase=True):
    from gensim.models import KeyedVectors
    wv = KeyedVectors.load_word2vec_format(vec_path, binary=False)
    vocab = wv.index_to_key
    if lowercase:
        # 构建小写映射：若词表是小写的，这样可兼容大小写数据集
        mapping = {}
        for w in vocab:
            wl = w.lower()
            if wl not in mapping:  # 第一次出现的小写形式
                mapping[wl] = w
        return wv, mapping
    else:
        return wv, None

def get_vec(wv, mapping, token, case_insensitive=True):
    if not case_insensitive:
        return (wv[token] if token in wv else None)
    # CI：优先小写映射
    wl = token.lower()
    if wl in mapping:
        return wv[mapping[wl]]
    return (wv[token] if token in wv else None)

# ----------------- 数据集：SimLex-999 -----------------
def load_simlex(path):
    """
    适配官方 SimLex-999 txt（tab分隔）格式：
    word1, word2, POS, SimLex999, conc(w1), conc(w2), concQ, Assoc(USF), SimAssoc333, SD(SimLex)
    返回: (pairs, gold) 或 None
    """
    from pathlib import Path
    p = Path(path)
    if not p.exists():
        return None

    def norm(s):
        return s.strip().lower().replace(" ", "").replace("\ufeff", "")

    pairs, scores = [], []
    with open(p, "r", encoding="utf-8", errors="ignore") as f:
        header = f.readline()
        if not header:
            return None
        # 主要是 \t；容错逗号/空白
        sep = "\t" if "\t" in header else ("," if "," in header else None)
        def split(line):
            if sep is None:
                return [t for t in line.strip().replace("\t", " ").split() if t]
            else:
                return [t.strip() for t in line.strip().split(sep)]

        cols = [norm(c) for c in split(header)]
        # 期望至少包含 word1、word2、simlex999
        try:
            i_w1 = cols.index("word1")
            i_w2 = cols.index("word2")
        except ValueError:
            # 列名不对就直接放弃（上层会显示 SimLex=N）
            return None
        i_sc = cols.index("simlex999") if "simlex999" in cols else None
        if i_sc is None:
            # 少数变体用 score
            if "score" in cols:
                i_sc = cols.index("score")
            else:
                return None

        # 逐行读取
        for line in f:
            if not line.strip():
                continue
            ps = split(line)
            # 有些行尾可能多空列，确保够长
            if len(ps) <= max(i_w1, i_w2, i_sc):
                # 再尝试宽松按空白切分一次
                ps = [t for t in line.strip().replace("\t", " ").split() if t]
                if len(ps) <= max(i_w1, i_w2, i_sc):
                    continue
            w1, w2 = ps[i_w1], ps[i_w2]
            try:
                s = float(ps[i_sc])
            except Exception:
                continue
            if not w1 or not w2:
                continue
            pairs.append((w1, w2))
            scores.append(s)

    if not pairs:
        return None
    return pairs, np.array(scores, dtype=np.float64)

def eval_similarity(wv, mapping, pairs, gold, case_insensitive=True):
    # 计算余弦并与 gold 做 Spearman；返回 ρ 与覆盖率
    vecs = []
    used = []
    for (a, b), s in zip(pairs, gold):
        va = get_vec(wv, mapping, a, case_insensitive)
        vb = get_vec(wv, mapping, b, case_insensitive)
        if va is None or vb is None: 
            continue
        # 归一化余弦（向量通常已近似零均值）
        va_u = va / (np.linalg.norm(va) + 1e-12)
        vb_u = vb / (np.linalg.norm(vb) + 1e-12)
        vecs.append(np.dot(va_u, vb_u))
        used.append(s)
    if len(vecs) == 0:
        return None, 0.0
    rho = spearman_no_scipy(np.array(vecs), np.array(used, dtype=np.float64))
    cov = len(vecs) / len(gold)
    return rho, cov

# ----------------- 数据集：WordSim-353 -----------------
def load_wordsim(path):
    p = Path(path)
    if not p.exists():
        return None
    pairs, scores = [], []
    with open(p, "r", encoding="utf-8", errors="ignore") as f:
        header = f.readline()
        # 格式兼容：CSV（有表头）、空格/制表分隔、或 "word1 word2 score"
        sep = "," if "," in header else None
        def parse(line):
            if sep is not None:
                ps = [x.strip() for x in line.strip().split(",")]
            else:
                ps = [x.strip() for x in line.strip().replace("\t"," ").split()]
            if len(ps) < 3: return None
            try:
                return (ps[0], ps[1], float(ps[2]))
            except:
                return None
        first = parse(header)
        if first:
            pairs.append((first[0], first[1])); scores.append(first[2])
        for line in f:
            it = parse(line)
            if it:
                pairs.append((it[0], it[1])); scores.append(it[2])
    return pairs, np.array(scores, dtype=np.float64)

# ----------------- 数据集：Google analogies -----------------
def load_google_analogies(path):
    p = Path(path)
    if not p.exists():
        return None
    sections = []
    current = []
    current_name = "ALL"
    with open(p, "r", encoding="utf-8", errors="ignore") as f:
        for line in f:
            line = line.strip()
            if not line: 
                continue
            if line.startswith(":"):
                # 存上一个 section
                if current:
                    sections.append((current_name, current))
                    current = []
                current_name = line[1:].strip()
            else:
                parts = line.split()
                if len(parts) == 4:
                    current.append(tuple(parts))
    if current:
        sections.append((current_name, current))
    return sections

def eval_analogy(wv, mapping, sections, case_insensitive=True):
    # 返回总准确率 + 分章节准确率
    Xnorm_cache = {}  # 缓存归一化后的向量，减小重复计算
    def vec(word):
        if case_insensitive:
            wl = word.lower()
            if wl in Xnorm_cache: 
                return Xnorm_cache[wl]
            v = get_vec(wv, mapping, word, case_insensitive)
            if v is None: 
                Xnorm_cache[wl] = None
                return None
            vu = v / (np.linalg.norm(v) + 1e-12)
            Xnorm_cache[wl] = vu
            return vu
        else:
            if word in Xnorm_cache: 
                return Xnorm_cache[word]
            if word not in wv:
                Xnorm_cache[word] = None
                return None
            v = wv[word]; vu = v / (np.linalg.norm(v) + 1e-12)
            Xnorm_cache[word] = vu
            return vu

    total = correct = 0
    per_sec = []
    vocab_mat = l2_normalize(wv.vectors)   # [|V|, D]
    for name, items in sections:
        t = c = 0
        for a,b,c_,d in items:
            va, vb, vc, vd = vec(a), vec(b), vec(c_), vec(d)
            if va is None or vb is None or vc is None or vd is None:
                continue
            target = vb - va + vc
            target /= (np.linalg.norm(target) + 1e-12)
            scores = vocab_mat @ target
            # 屏蔽 a, b, c 本身
            for w in (a,b,c_):
                idx = wv.key_to_index.get((mapping[w.lower()] if (case_insensitive and w.lower() in mapping) else w), None)
                if idx is not None:
                    scores[idx] = -np.inf
            pred = int(np.argmax(scores))
            t += 1
            if np.allclose(vocab_mat[pred], vd, atol=0):  # 直接比较向量不可靠；改成索引比较
                pass
            # 比较词索引
            didx = wv.key_to_index.get((mapping[d.lower()] if (case_insensitive and d.lower() in mapping) else d), None)
            if didx is not None and pred == didx:
                c += 1
        total += t; correct += c
        acc = (c / t) if t > 0 else None
        per_sec.append((name, acc, t))
    overall = (correct / total) if total > 0 else None
    return overall, per_sec, total

# ----------------- 数据集：BATS（目录） -----------------
# ==== 放到 eval_authoritative.py，替换原来的 load_bats / 保留 eval_bats 或一起替换 ====
import re
from pathlib import Path

def load_bats(root_dir, include_groups=None, max_pairs_per_file=None, random_state=42):
    """
    读取 BATS_3.0：
      - 支持“三种”格式：
        1) 两列成对： album  albums
        2) 4元组一行： a b c d
        3) '::' 左右两侧成对集合： a b :: c d [c2 d2 ...]
      - 每个文件视为一个任务；将文件内的所有 (a,b) 与 (c,d) 做笛卡尔积，生成 (a,b,c,d)
      - include_groups: 仅包含目录前缀，如 ('1_','2_')
      - max_pairs_per_file: 若文件对数过多，可随机下采样到该数量（避免 OOM）
    返回: [(task_name, [(a,b,c,d), ...]), ...] 或 None
    """
    rng = np.random.default_rng(random_state)
    root = Path(root_dir)
    if not root.exists() or not root.is_dir():
        return None

    # 收集要读的 .txt 文件
    files = []
    for sub in sorted(root.iterdir()):
        if not sub.is_dir():
            continue
        if include_groups and not any(sub.name.startswith(p) for p in include_groups):
            continue
        files += sorted(sub.glob("*.txt"))

    tasks = []
    for f in files:
        name = f.relative_to(root).as_posix()
        pairs = []         # 存 (a,b)
        quads_direct = []  # 若文件直接给了 4 元组

        with open(f, "r", encoding="utf-8", errors="ignore") as fh:
            for line in fh:
                line = line.strip()
                if not line:
                    continue

                # 格式3： '::' 左右成对集合
                if "::" in line:
                    L, R = line.split("::", 1)
                    ltoks = [t for t in re.split(r"\s+", L.strip()) if t]
                    rtoks = [t for t in re.split(r"\s+", R.strip()) if t]
                    lpairs = [(ltoks[i], ltoks[i+1]) for i in range(0, len(ltoks)-1, 2)]
                    rpairs = [(rtoks[i], rtoks[i+1]) for i in range(0, len(rtoks)-1, 2)]
                    for a,b in lpairs:
                        for c,d in rpairs:
                            quads_direct.append((a,b,c,d))
                    continue

                # 其它：按空白切分
                toks = [t for t in re.split(r"\s+", line) if t]

                # 格式2：4元组
                if len(toks) >= 4 and len(toks) % 2 == 0 and len(toks) % 4 == 0:
                    # 每4个为一组
                    for i in range(0, len(toks), 4):
                        quads_direct.append((toks[i], toks[i+1], toks[i+2], toks[i+3]))
                    continue

                # 格式1：两列成对（最常见）
                if len(toks) >= 2:
                    a, b = toks[0], toks[1]
                    pairs.append((a, b))
                    continue
                # 其他异常行：忽略

        items = []
        if pairs:
            # 可选：对数过多时下采样，避免笛卡尔积过大
            if (max_pairs_per_file is not None) and (len(pairs) > max_pairs_per_file):
                idx = rng.choice(len(pairs), size=max_pairs_per_file, replace=False)
                pairs = [pairs[i] for i in sorted(idx)]

            # 生成笛卡尔积（避免同一对与自身配对）
            for i, (a,b) in enumerate(pairs):
                for j, (c,d) in enumerate(pairs):
                    if i == j: 
                        continue
                    items.append((a,b,c,d))

        # 合并直接的 4 元组
        if quads_direct:
            items.extend(quads_direct)

        if items:
            tasks.append((name, items))

    return tasks if tasks else None

def eval_bats(wv, mapping, tasks, case_insensitive=True, verbose=False):
    """
    评测 BATS：返回 (micro, macro, per_task, total_used)
      - micro: 所有题目联合准确率
      - macro: 各任务准确率的平均
      - per_task: [(task, acc, used, oov), ...]
    """
    def get_vec_ci(word):
        v = get_vec(wv, mapping, word, case_insensitive)
        if v is None:
            return None
        return v / (np.linalg.norm(v) + 1e-12)

    vocab_mat = l2_normalize(wv.vectors)
    all_t = all_c = 0
    per_task = []

    for name, items in tasks:
        t = c = 0
        oov = 0
        for a,b,c_,d in items:
            va, vb, vc, vd = map(get_vec_ci, (a,b,c_,d))
            if va is None or vb is None or vc is None or vd is None:
                oov += 1
                continue
            target = vb - va + vc
            target /= (np.linalg.norm(target) + 1e-12)
            scores = vocab_mat @ target
            # 屏蔽 a,b,c
            for w in (a,b,c_):
                key = (mapping[w.lower()] if (case_insensitive and w.lower() in mapping) else w)
                idx = wv.key_to_index.get(key, None)
                if idx is not None:
                    scores[idx] = -np.inf
            pred = int(np.argmax(scores))
            didx = wv.key_to_index.get((mapping[d.lower()] if (case_insensitive and d.lower() in mapping) else d), None)
            if didx is not None:
                t += 1
                if pred == didx:
                    c += 1
        acc = (c / t) if t > 0 else None
        per_task.append((name, acc, t, oov))
        all_t += t; all_c += c

    micro = (all_c / all_t) if all_t > 0 else None
    valid = [acc for _, acc, used, _ in per_task if (acc is not None and used > 0)]
    macro = (sum(valid) / len(valid)) if valid else None

    if verbose:
        for name, acc, used, oov in per_task[:5]:
            print(f"  {name:40s} acc={('%.2f%%'%(acc*100)) if acc is not None else 'None':>7s}  used={used:5d}  oov={oov:5d}")
    return micro, macro, per_task, all_t

# ----------------- 主流程：自动发现模型并评测 -----------------
def evaluate_models(vec_paths=None, data_dir="data", case_insensitive=True, save_json="outputs/bench_results.json"):
    data_dir = Path(data_dir)
    # 自动发现模型
    if vec_paths is None:
        vec_paths = sorted([str(p) for p in Path("outputs").glob("*.vec")])
    if not vec_paths:
        print("未找到 outputs/*.vec，请先训练或指定 vec_paths。"); return

    # 尝试加载各数据集
    simlex = (load_simlex(data_dir/"SimLex-999.txt") or load_simlex(data_dir/"SimLex-999.csv"))
    ws353 = (load_wordsim(data_dir/"wordsim353.csv") or load_wordsim(data_dir/"wordsim_similarity_goldstandard.txt"))
    goog = load_google_analogies(data_dir/"questions-words.txt")
    bats = load_bats(data_dir/"BATS_3.0")

    print("Datasets found:",
          f"SimLex={'Y' if simlex else 'N'}",
          f"WS353={'Y' if ws353 else 'N'}",
          f"Google={'Y' if goog else 'N'}",
          f"BATS={'Y' if bats else 'N'}")

    all_results = []
    for vec in vec_paths:
        print(f"\n=== Evaluating: {Path(vec).name} ===")
        wv, mapping = load_vectors(vec, lowercase=case_insensitive)

        model_res = {"model": Path(vec).name}

        # SimLex-999
        if simlex:
            pairs, gold = simlex
            rho, cov = eval_similarity(wv, mapping, pairs, gold, case_insensitive)
            print(f"SimLex-999 Spearman: {None if rho is None else round(rho,4)} | coverage={cov:.2%}")
            model_res["simlex_rho"] = rho; model_res["simlex_cov"] = cov

        # WordSim-353
        if ws353:
            pairs, gold = ws353
            rho, cov = eval_similarity(wv, mapping, pairs, gold, case_insensitive)
            print(f"WordSim-353 Spearman: {None if rho is None else round(rho,4)} | coverage={cov:.2%}")
            model_res["ws353_rho"] = rho; model_res["ws353_cov"] = cov

        # Google analogies
        if goog:
            overall, per_sec, total = eval_analogy(wv, mapping, goog, case_insensitive)
            acc_str = f"{overall*100:.2f}%" if overall is not None else "None"
            print(f"Google Analogies Acc: {acc_str} | used={total}")
            # 可选：打印前几类
            for name, acc, t in per_sec[:5]:
                if acc is not None:
                    print(f"  - {name:20s} {acc*100:6.2f}% (n={t})")
            model_res["google_acc"] = overall; model_res["google_used"] = total

        # BATS
        if bats:
            micro, macro, per_task, total = eval_bats(wv, mapping, bats, case_insensitive)
            mi_str = f"{micro*100:.2f}%" if micro is not None else "None"
            ma_str = f"{macro*100:.2f}%" if macro is not None else "None"
            print(f"BATS micro: {mi_str} | macro: {ma_str} | used={total}")
            model_res["bats_micro"] = micro; model_res["bats_macro"] = macro; model_res["bats_used"] = total

        all_results.append(model_res)

    # 保存结果 JSON 方便后续对比/作图
    os.makedirs("outputs", exist_ok=True)
    with open(save_json, "w", encoding="utf-8") as f:
        json.dump(all_results, f, ensure_ascii=False, indent=2)
    print(f"\n✅ Done. Results saved to {save_json}")
    return all_results

# ----------------- 直接运行 -----------------
if __name__ == "__main__":
    evaluate_models()


Datasets found: SimLex=Y WS353=Y Google=Y BATS=Y

=== Evaluating: w2v_text8_alpha0.500.vec ===
SimLex-999 Spearman: 0.3263 | coverage=99.30%
WordSim-353 Spearman: 0.6574 | coverage=99.43%
Google Analogies Acc: 27.58% | used=17827
  - capital-common-countries  46.64% (n=506)
  - capital-world         19.33% (n=3564)
  - currency              14.93% (n=596)
  - city-in-state         17.60% (n=2330)
  - family                53.33% (n=420)
BATS micro: 24.08% | macro: 14.87% | used=41970

=== Evaluating: w2v_text8_alpha0.526.vec ===
SimLex-999 Spearman: 0.3389 | coverage=99.30%
WordSim-353 Spearman: 0.659 | coverage=99.43%
Google Analogies Acc: 27.82% | used=17827
  - capital-common-countries  48.81% (n=506)
  - capital-world         19.92% (n=3564)
  - currency              12.92% (n=596)
  - city-in-state         17.21% (n=2330)
  - family                53.81% (n=420)
BATS micro: 24.80% | macro: 15.27% | used=41970

=== Evaluating: w2v_text8_alpha0.553.vec ===
SimLex-999 Spearman: 0.344

In [12]:
# plot_benchmarks_alpha_all_and_separate.py
import json, re
import matplotlib.pyplot as plt
from pathlib import Path

# ------- 读取结果 -------
with open("outputs/bench_results.json","r",encoding="utf-8") as f:
    rows = json.load(f)

def parse_alpha(name: str):
    stem = Path(name).stem  # 去扩展名，避免 "0.500."
    m = re.search(r'alpha([0-9]+(?:\.[0-9]+)?)', stem)
    return float(m.group(1)) if m else None

pts = [(parse_alpha(r["model"]), r) for r in rows if parse_alpha(r["model"]) is not None]
pts.sort(key=lambda x: x[0])
if not pts:
    raise SystemExit("No alpha-tagged models in outputs/bench_results.json")

alph = [a for a,_ in pts]
simlex = [r.get("simlex_rho") for _,r in pts]
ws353  = [r.get("ws353_rho")  for _,r in pts]
goog   = [r.get("google_acc") for _,r in pts]
bats_mi = [r.get("bats_micro") for _,r in pts]
bats_ma = [r.get("bats_macro") for _,r in pts]

def add_line(xs_all, ys_all, label):
    xs = [x for x,y in zip(xs_all, ys_all) if y is not None]
    ys = [y for y in ys_all if y is not None]
    if xs and ys:
        plt.plot(xs, ys, marker='o', markersize=3, linewidth=1, label=label)
        return True
    return False

Path("outputs").mkdir(exist_ok=True)

# ------- 合并一张图（一起画） -------
plt.figure(figsize=(10,6))
any_plotted = False
any_plotted |= add_line(alph, simlex, "SimLex-999 ρ (↑)")
any_plotted |= add_line(alph, ws353,  "WordSim-353 ρ (↑)")
any_plotted |= add_line(alph, goog,   "Google analogies acc (↑)")
any_plotted |= add_line(alph, bats_mi,"BATS micro acc (↑)")
any_plotted |= add_line(alph, bats_ma,"BATS macro acc (↑)")
plt.xlabel("ns_exponent α")
plt.ylabel("score")
plt.title("Benchmarks vs α (combined)")
plt.grid(alpha=0.3)
if any_plotted:
    plt.legend()
plt.tight_layout()
plt.savefig("outputs/alpha_benchmarks_all.png", dpi=200)
plt.close()
print("✅ Saved → outputs/alpha_benchmarks_all.png")

# ------- 分别单画（五张图，自动跳过缺失） -------
def plot_one(xs_all, ys_all, title, ylabel, out_path):
    xs = [x for x,y in zip(xs_all, ys_all) if y is not None]
    ys = [y for y in ys_all if y is not None]
    if not xs:
        print(f"⚠️ {title}: 无可用数据，跳过。"); return
    plt.figure(figsize=(9,5))
    plt.plot(xs, ys, marker='o', markersize=3, linewidth=1)
    plt.xlabel("ns_exponent α")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.savefig(out_path, dpi=200)
    plt.close()
    print(f"✅ Saved → {out_path}")

plot_one(alph, simlex, "SimLex-999 vs α",        "Spearman ρ (↑)",   "outputs/alpha_simlex.png")
plot_one(alph, ws353,  "WordSim-353 vs α",       "Spearman ρ (↑)",   "outputs/alpha_ws353.png")
plot_one(alph, goog,   "Google Analogies vs α",  "Accuracy (↑)",     "outputs/alpha_google.png")
plot_one(alph, bats_mi,"BATS Micro vs α",        "Accuracy (↑)",     "outputs/alpha_bats_micro.png")
plot_one(alph, bats_ma,"BATS Macro vs α",        "Accuracy (↑)",     "outputs/alpha_bats_macro.png")


✅ Saved → outputs/alpha_benchmarks_all.png
✅ Saved → outputs/alpha_simlex.png
✅ Saved → outputs/alpha_ws353.png
✅ Saved → outputs/alpha_google.png
✅ Saved → outputs/alpha_bats_micro.png
✅ Saved → outputs/alpha_bats_macro.png
