In [None]:
import numpy as np
import torch

import os
import sys
import json

sys.path.insert(0, "./../")
from FileManager import FileManager
import utils

In [2]:
# Locate the QWEM data under $DATASETPATH. Fail early with a clear message
# instead of the opaque TypeError os.path.join raises when given None.
if os.getenv("DATASETPATH") is None:
    raise EnvironmentError("DATASETPATH environment variable is not set.")
data_dir = os.path.join(os.getenv("DATASETPATH"), "qwem")
data_fm = FileManager(data_dir)

# The code below treats a None result from FileManager.load as "file missing".
analogy_dict = data_fm.load("analogies.pickle")
if analogy_dict is None:
    raise FileNotFoundError("Analogy file not found.")

# NOTE(review): presumably points data_fm at the "enwiki500" subdirectory for
# all subsequent loads -- confirm against FileManager.set_filepath.
data_fm.set_filepath("enwiki500")
word_counts = data_fm.load("word_counts.pickle")
# Same guard as for the analogies: without it a missing file only surfaces
# later, as a confusing error when word_counts is sliced.
if word_counts is None:
    raise FileNotFoundError("Word counts file not found.")

In [3]:
# Experiment directories of the small/large QWEM and SGNS runs, all located
# under $EXPTPATH/qwem.
qwem_dir, sgns_dir, qweml_dir, sgnsl_dir = (
    os.path.join(os.getenv("EXPTPATH"), "qwem", run_name)
    for run_name in ("qwem-small", "sgns-small", "qwem-large", "sgns-large")
)

# Hyperparameters stored with the small QWEM run fix the vocabulary size and
# the embedding dimension used by the rest of the notebook.
fm = FileManager(qwem_dir)
with open(fm.get_filename("hypers.json")) as f:
    H = json.load(f)
VOCAB_SZ, EMBEDDIM = H["vocab_sz"], H["embeddim"]

# Vocabulary restricted to the first VOCAB_SZ entries of word_counts, and the
# counts normalized to sum to one.
vocab = utils.Vocabulary(word_counts[:VOCAB_SZ])
unigram = vocab.counts / vocab.counts.sum()
analogy_dataset = utils.AnalogyDataset(analogy_dict, vocab)

# Look for previously cached embeddings for this (EMBEDDIM, VOCAB_SZ) pair;
# the next cell rebuilds them when `models` comes back as None.
benchmark_fm = FileManager('./benchmarks')
models = benchmark_fm.load(f"models_d{EMBEDDIM}_V{VOCAB_SZ}.pickle")

In [4]:
if models is None:
    # No cached embeddings for this (EMBEDDIM, VOCAB_SZ): build them all and
    # cache the result at the end of this cell.

    def get_W(expt_dir):
        """Load an experiment's final weights, expressed in their SVD basis.

        Args:
            expt_dir: Experiment directory; the matrix is read from
                "W_final.npy" after selecting the "models" filepath.

        Returns:
            V * S, i.e. V @ diag(S), where V and S are the left singular
            vectors and singular values of the stored matrix (thin SVD).

        Raises:
            FileNotFoundError: If the weight file could not be loaded.
        """
        expt_fm = FileManager(expt_dir)
        expt_fm.set_filepath("models")
        W = expt_fm.load("W_final.npy")
        # The notebook treats a None result from FileManager.load as "missing".
        if W is None:
            raise FileNotFoundError(f"W_final.npy not found for {expt_dir}.")
        V, S, _ = np.linalg.svd(W, full_matrices=False)
        # Scaling the columns of V by S equals V @ np.diag(S) without
        # materializing the dense diagonal matrix.
        return V * S

    def get_W_from_M(M, d):
        """Build a d-column embedding from the top eigenpairs of a symmetric M.

        Args:
            M: Square symmetric array-like target matrix.
            d: Number of leading (largest-eigenvalue) eigenpairs to keep.

        Returns:
            NumPy array whose k-th column is the k-th leading eigenvector
            scaled by the square root of its eigenvalue.
        """
        print("starting.. ", end='')
        # Use the GPU when one is present, but do not require it.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        M = torch.tensor(M, dtype=torch.float64, device=device)
        eigvals, eigvecs = torch.linalg.eigh(M)
        # eigh returns eigenvalues in ascending order; flip to descending.
        eigvals, eigvecs = eigvals.flip(dims=(0,)), eigvecs.flip(dims=(1,))
        eigvals, eigvecs = eigvals.cpu().numpy(), eigvecs.cpu().numpy()
        # Negative eigenvalues cannot be represented by W @ W.T; clamp them to
        # zero so their columns vanish instead of turning into NaN.
        top_eigvals = np.clip(eigvals[:d], 0, None)
        W = eigvecs[:, :d] * np.sqrt(top_eigvals)
        print("done.")
        return W

    W_QWEM = get_W(qwem_dir)
    W_SGNS = get_W(sgns_dir)

    print("Computing M*... ", end="")
    corpus_stats = data_fm.load("corpus_stats.pickle")
    if corpus_stats is None:
        raise FileNotFoundError("Corpus statistics file not found.")
    cL = corpus_stats["context_len"]
    Cij, Crwij = corpus_stats["counts"], corpus_stats["counts_reweight"]
    # Restrict the statistics to the first VOCAB_SZ words and normalize the
    # reweighted counts by the raw total times (cL + 1)/2.
    numcounts = Cij[:VOCAB_SZ, :VOCAB_SZ].sum()
    Pij = Crwij[:VOCAB_SZ, :VOCAB_SZ] / (numcounts * (cL + 1)/2)
    # Product of unigram probabilities, compared against Pij below.
    PiPj = np.outer(unigram, unigram)
    Mstar = 2*(Pij - PiPj)/(Pij + PiPj)
    # The tiny offset keeps log() finite where Pij is exactly zero.
    PMI = np.log((Pij / PiPj) + 1e-25)
    print("done.")

    W_Mstar = get_W_from_M(Mstar, EMBEDDIM)
    W_PMI = get_W_from_M(PMI, EMBEDDIM)
    # Positive PMI: negative entries are zeroed before factorizing.
    W_PPMI = get_W_from_M(np.maximum(0, PMI), EMBEDDIM)

    models = {
        "SGNS": W_SGNS,
        "QWEM": W_QWEM,
        "Mstar": W_Mstar,
        "PPMI": W_PPMI,
        "PMI": W_PMI,
    }
    benchmark_fm.save(models, f"models_d{EMBEDDIM}_V{VOCAB_SZ}.pickle")

Computing M*... done.
starting.. done.
starting.. done.
starting.. done.


## Eval benchmarks

In [5]:
from scipy.stats import spearmanr


def read_similarity_data(file_path, vocab):
    """Read a word-similarity benchmark file as token-index triples.

    Every line is whitespace-split. Only lines with exactly three fields
    (word1, word2, score) are used, and a pair is kept only when both words
    are keys of ``vocab.word2token``.

    Args:
        file_path: Path of the UTF-8 text file to read.
        vocab: Object exposing a ``word2token`` mapping from word to token.

    Returns:
        List of ``[token1, token2, score]`` entries in file order, with the
        score converted to ``float``.
    """
    triples = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split()
            if len(fields) != 3:
                continue
            first, second, score_text = fields
            # The score is parsed before the vocabulary lookup, so a
            # non-numeric third field raises even for unknown words.
            score = float(score_text)
            if first not in vocab.word2token or second not in vocab.word2token:
                continue
            triples.append(
                [vocab.word2token[first], vocab.word2token[second], score]
            )
    return triples


def evaluate_similarity(W, similarity_data):
    """Spearman correlation between embedding cosine similarity and ratings.

    Args:
        W: 2-D array of embeddings, one row per token.
        similarity_data: Iterable of ``(token1, token2, rating)`` triples
            whose tokens index rows of ``W``.

    Returns:
        The ``correlation`` attribute of ``spearmanr`` applied to the
        predicted similarities and the ratings.

    Raises:
        ValueError: If ``similarity_data`` yields no pairs.
    """
    # Row-normalize so a plain dot product is the cosine similarity; the small
    # constant guards against division by zero for all-zero rows.
    row_norms = np.linalg.norm(W, axis=1, keepdims=True)
    unit_rows = W / (row_norms + 1e-10)

    scored = [
        (np.dot(unit_rows[left], unit_rows[right]).item(), rating)
        for left, right, rating in similarity_data
    ]
    if not scored:
        raise ValueError("No valid word pairs found in embeddings.")

    model_scores, human_scores = zip(*scored)
    return spearmanr(model_scores, human_scores).correlation

# Load the two word-similarity benchmarks stored under
# $DATASETPATH/qwem/benchmarks, keeping only pairs covered by `vocab`.
dataset_dir = os.getenv("DATASETPATH")

mendir = os.path.join(dataset_dir, "qwem/benchmarks/MEN.txt")
men_dataset = read_similarity_data(mendir, vocab)

ws353dir = os.path.join(dataset_dir, "qwem/benchmarks/ws353.txt")
ws353_dataset = read_similarity_data(ws353dir, vocab)

# Number of usable pairs per benchmark, one count per line.
print(len(men_dataset), len(ws353_dataset), sep="\n")

1588
249


In [6]:
benchmarks = ["Google analogies", "MEN", "ws353"]
# One row per model (in `models` iteration order), one column per benchmark
# (in the order listed in `benchmarks`).
results = np.empty((len(models), len(benchmarks)))
for i, (k, W) in enumerate(models.items()):
    print(k)

    # Column 0: analogy accuracy, printed as a percentage.
    acc = results[i, 0] = analogy_dataset.eval_accuracy(W)
    print(f"Analogy acc: {100*acc:.1f}")

    # Column 1: Spearman correlation on MEN.
    rho = results[i, 1] = evaluate_similarity(W, men_dataset).mean().item()
    print(f"MEN score: {rho:.3f}")

    # Column 2: Spearman correlation on ws353.
    rho = results[i, 2] = evaluate_similarity(W, ws353_dataset).mean().item()
    print(f"ws353 score: {rho:.4f}")

    print()

SGNS
Analogy acc: 67.9
MEN score: 0.743
ws353 score: 0.6962

QWEM
Analogy acc: 65.0
MEN score: 0.753
ws353 score: 0.6814

Mstar
Analogy acc: 66.5
MEN score: 0.756
ws353 score: 0.6829

PPMI
Analogy acc: 50.7
MEN score: 0.744
ws353 score: 0.6900

PMI
Analogy acc: 8.6
MEN score: 0.444
ws353 score: 0.2034



## Eigenfeatures

In [7]:
# Inspection settings.
NORMALIZE = True   # divide each embedding row by its norm before inspecting
TOP_WORDS = 4000   # only the first TOP_WORDS vocabulary entries are ranked
N_AVG = 10         # number of extreme entries averaged into the printed score
N_SHOW = 20        # number of extreme words printed on each side

# Embedding to inspect: any key of `models`, e.g. "QWEM", "PPMI" or "SGNS".
W = models["PPMI"]

# The columns of W are expected to already be its singular directions, in
# which case the right singular vectors are the identity up to sign.
V, S, Ut = np.linalg.svd(W, full_matrices=False)
assert np.allclose(np.abs(Ut), np.eye(EMBEDDIM)), (
    "Columns of W are not aligned with its singular directions."
)
# Same 1e-10 guard as evaluate_similarity, so an all-zero row normalizes to
# zeros instead of NaN.
norms = (np.linalg.norm(W, axis=1, keepdims=True) + 1e-10) if NORMALIZE else 1
embeds = W / norms

# 1-based indices of the directions to print; directions beyond the embedding
# dimension are dropped rather than raising IndexError.
dd = [1, 2, 3, 9, 10, 11, 12, 13, 14, 15, 100]
dd = [d for d in dd if d <= embeds.shape[1]]
for d in dd:
    vec = embeds[:, d-1]
    # Rank the first TOP_WORDS words by their coordinate, largest first.
    idxs = np.argsort(vec[:TOP_WORDS])[::-1]
    vec_sort = vec[idxs]
    print(f"PCA dir {d}")
    # Most positive words, then most negative words (most extreme first).
    print(f'{(vec_sort[:N_AVG]).mean():.3f} {vocab.to_words(idxs[:N_SHOW])}')
    print(f'{(vec_sort[-N_AVG:]).mean():.3f} {vocab.to_words(idxs[-N_SHOW:][::-1])}')
    print()

PCA dir 1
-0.025 lemmon kitt socorro spacewatch fefefe id sort km median right peak households expatriate mount establishments hispanic footballers survey bgcolor census
-0.507 eric cooper jones sam dennis oliver tom robinson roberts thompson harris jack miller lewis scott michael taylor moore wilson barry

PCA dir 2
0.468 furthermore requires can specific useful require therefore particular processes example typically such component whereas types specifically components possible additionally appropriate
-0.598 jones dennis eric robinson scott taylor oliver michael roberts david miller smith harris lewis thompson cooper moore russell mitchell wilson

PCA dir 3
0.322 like can uses simple makes soft typical baby typically shape combination featuring eyes usually using surface happy dark similar charlie
-0.410 government establishment governments foreign authorities leaders declared civil officials behalf political independence citizens sought federal union commission relations organisati