In [1]:
# download_text8.py
import os, zipfile, urllib.request
# Fetch the Text8 corpus once; later runs reuse the already-extracted file.
os.makedirs("data", exist_ok=True)
url = "http://mattmahoney.net/dc/text8.zip"
zip_path = "data/text8.zip"
txt_path = "data/text8"

if not os.path.exists(txt_path):
    print("Downloading text8.zip ...")
    urllib.request.urlretrieve(url, zip_path)
    print("Unzipping ...")
    # Extract under a temporary name and rename only after the copy has
    # finished. The skip check above tests nothing but the existence of
    # txt_path, so writing straight to it would let an interrupted extraction
    # leave a truncated corpus that every later run silently accepts.
    part_path = txt_path + ".part"
    with zipfile.ZipFile(zip_path) as zf:
        # Assumes the archive stores the corpus as a top-level member named
        # like txt_path's basename ("text8"); a KeyError here means it does not.
        member = os.path.basename(txt_path)
        with zf.open(member) as src, open(part_path, "wb") as dst:
            # Copy in 1 MiB pieces so the whole corpus is never held in memory.
            for piece in iter(lambda: src.read(1 << 20), b""):
                dst.write(piece)
    os.replace(part_path, txt_path)
    print("OK → data/text8")
else:
    print("Found data/text8, skip.")


Found data/text8, skip.


In [2]:
# train_text8_gensim.py
import os, time, multiprocessing as mp
from pathlib import Path
from gensim.models import Word2Vec

# Location of the extracted Text8 corpus produced by download_text8.py.
TEXT8 = Path("data/text8")
# Fail fast with an explicit exception rather than `assert`, which is removed
# when Python runs with -O and would let the script continue without the corpus.
if not TEXT8.exists():
    raise FileNotFoundError("缺少 data/text8，请先运行 download_text8.py")

def read_text8(path: Path, chunk=10_000):
    """Yield the corpus at *path* as consecutive lists of tokens.

    The file is read in full, split on whitespace, and handed out in slices
    of at most *chunk* tokens; the final slice may be shorter. Text8 is a
    single line of space-separated words, so these slices serve as
    "pseudo-sentences" that are convenient to iterate over.

    Args:
        path: UTF-8 text file to read.
        chunk: Maximum number of tokens per yielded list.

    Yields:
        Lists of token strings, in file order.
    """
    with open(path, "r", encoding="utf-8") as handle:
        raw = handle.read()
    tokens = raw.strip().split()
    starts = range(0, len(tokens), chunk)
    yield from (tokens[lo:lo + chunk] for lo in starts)

def main():
    """Train a skip-gram Word2Vec model on Text8, save it, and print neighbours.

    Creates the ``outputs`` directory, loads the corpus through
    ``read_text8``, trains with negative sampling, writes both the gensim
    model file and a text-format vector file, then prints the ten most
    similar words for a few probe words that are present in the vocabulary.
    """
    os.makedirs("outputs", exist_ok=True)
    # The generator is materialised into a list before being handed to Word2Vec.
    sentences = list(read_text8(TEXT8))

    # Recommended hyperparameters (SGNS)
    min_count = 5
    workers = max(1, mp.cpu_count() - 1)  # leave one core free, but use at least one
    hyperparams = dict(
        vector_size=300,  # embedding dimensionality
        window=5,         # context window size
        sg=1,             # 1 = Skip-Gram
        negative=10,      # number of negative samples
        sample=1e-3,      # subsampling threshold
        min_count=min_count,
        workers=workers,
        epochs=5,
        seed=42,
    )

    print(f"Training SGNS on Text8 ... (vocab pruning: min_count={min_count})")
    started = time.time()
    model = Word2Vec(sentences=sentences, **hyperparams)
    elapsed = time.time() - started
    print(f"✅ Done. Training time: {elapsed/60:.1f} min with workers={workers}")

    # Persist both the full model and the plain-text vectors
    model.save("outputs/w2v_text8_sgns.model")
    model.wv.save_word2vec_format("outputs/w2v_text8_sgns.vec", binary=False)
    print("Saved to outputs/w2v_text8_sgns.model / .vec")

    # Quick look at the nearest neighbours of a few words
    for query in ("king", "queen", "london", "computer"):
        if query not in model.wv:
            continue
        print(f"\nTop-10 for '{query}':")
        for neighbour, score in model.wv.most_similar(query, topn=10):
            print(f"  {neighbour:15s} {score:.4f}")

# Entry point: run training only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Training SGNS on Text8 ... (vocab pruning: min_count=5)
✅ Done. Training time: 1.1 min with workers=31
Saved to outputs/w2v_text8_sgns.model / .vec

Top-10 for 'king':
  queen           0.5844
  canute          0.5761
  kings           0.5650
  haakon          0.5608
  vasa            0.5554
  valdemar        0.5469
  montferrat      0.5410
  corvinus        0.5396
  nemanja         0.5373
  valois          0.5367

Top-10 for 'queen':
  elizabeth       0.6709
  boleyn          0.6055
  margrethe       0.6046
  highness        0.6018
  regnant         0.5978
  consort         0.5896
  hrh             0.5893
  king            0.5844
  isabella        0.5720
  monarch         0.5659

Top-10 for 'london':
  kensington      0.6006
  manchester      0.5840
  newham          0.5738
  guildhall       0.5723
  piccadilly      0.5708
  southwark       0.5645
  surrey          0.5574
  hertfordshire   0.5573
  paddington      0.5563
  holborn         0.5532

Top-10 for 'computer':
  computers    