In [1]:
import numpy as np
import torch

import os
import sys
sys.path.insert(0, './../')
import json
import utils

from FileManager import FileManager

In [2]:
# Resolve the dataset root from the environment. Fail early with a readable
# message instead of the opaque TypeError os.path.join raises when handed None.
dataset_root = os.getenv("DATASETPATH")
if not dataset_root:
    raise EnvironmentError("Environment variable DATASETPATH is not set.")

data_dir = os.path.join(dataset_root, "qwem")
data_fm = FileManager(data_dir)

# A None result from load() is treated as "file missing".
analogy_dict = data_fm.load("analogies.pickle")
if analogy_dict is None:
    raise FileNotFoundError("Analogy file not found.")

# Word counts are read after re-pointing the file manager at "min500"
# (presumably a sub-location of the dataset directory -- TODO confirm).
data_fm.set_filepath("min500")
word_counts = data_fm.load("word_counts.pickle")
if word_counts is None:
    # Same guard as for the analogies: later cells slice word_counts directly.
    raise FileNotFoundError("Word counts file not found.")

In [None]:
# Hyperparameters of the stepwise SGNS experiment fix the vocabulary size and
# the embedding dimension used by every later cell.
expt_dir = os.path.join(os.getenv("QWEMPATH"), "stepwise-sgns")
fm = FileManager(expt_dir)
hypers_path = fm.get_filename("hypers.json")
with open(hypers_path) as f:
    H = json.load(f)
VOCAB_SZ, EMBEDDIM = H["vocab_sz"], H["embeddim"]

# Vocabulary built from the first VOCAB_SZ word-count entries; `unigram` is
# the vocabulary counts rescaled to sum to one.
vocab = utils.Vocabulary(word_counts[:VOCAB_SZ])
total_count = vocab.counts.sum()
unigram = vocab.counts / total_count
analogy_dataset = utils.AnalogyDataset(analogy_dict, vocab)

# Load the saved model collection matching this (dimension, vocabulary) pair.
save_fm = FileManager('../analysis/bench_models')
models_file = f"models_d{EMBEDDIM}_V{VOCAB_SZ}.pickle"
models = save_fm.load(models_file)


## Create and save models

In [None]:
def get_W(model_dir):
    """Load a trained model's final weights and express them in their singular basis.

    Parameters
    ----------
    model_dir : str
        Directory handed to ``FileManager``; expected to hold ``W_final.npy``.

    Returns
    -------
    numpy.ndarray
        ``V @ diag(S)``, where ``V`` and ``S`` are the left singular vectors
        and singular values of the thin SVD of the loaded matrix.

    Raises
    ------
    FileNotFoundError
        If the file manager returns ``None`` for ``W_final.npy``.
    """
    model_fm = FileManager(model_dir)
    W = model_fm.load("W_final.npy")
    if W is None:
        # Without this guard the None reaches np.linalg.svd and surfaces as an
        # unrelated linear-algebra error.
        raise FileNotFoundError(f"W_final.npy not found in {model_dir!r}.")
    # Dropping the right singular vectors rotates the embedding so that its
    # columns are ordered by decreasing singular value.
    V, S, _ = np.linalg.svd(W, full_matrices=False)
    return V @ np.diag(S)

# Both trained models live under $QWEMPATH; read the variable once and load
# the final embeddings of the QWEM run first, then those of the SGNS run.
qwem_root = os.getenv("QWEMPATH")

model_dir = os.path.join(qwem_root, "stepwise-qwem/models")
W_QWEM = get_W(model_dir)

model_dir = os.path.join(qwem_root, "stepwise-sgns/models")
W_SGNS = get_W(model_dir)

In [81]:
# Build the two target matrices, M* and PMI, from the saved corpus
# co-occurrence statistics, restricted to the first VOCAB_SZ rows/columns.
print(f"Computing M*... ", end="")
corpus_stats = data_fm.load("corpus_stats.pickle")
cL = corpus_stats["context_len"]
Cij, Crwij = corpus_stats["counts"], corpus_stats["counts_reweight"]
# Sum of the raw counts inside the vocabulary block.
numcounts = Cij[:VOCAB_SZ, :VOCAB_SZ].sum()
# NOTE(review): the (cL + 1)/2 factor presumably turns the reweighted counts
# into a joint probability estimate -- confirm against the reweighting scheme.
Pij = Crwij[:VOCAB_SZ, :VOCAB_SZ] / (numcounts * (cL + 1)/2)
# Outer product of the unigram vector with itself.
PiPj = np.outer(unigram, unigram)
# M* = 2 (Pij - PiPj) / (Pij + PiPj), element-wise.
Mstar = 2*(Pij - PiPj)/(Pij + PiPj)
# The 1e-25 offset keeps the logarithm finite where the ratio is zero.
PMI = np.log((Pij / PiPj) + 1e-25)
print("done.")

Computing M*... done.


In [82]:
def get_W_from_M(M, d):
    """Return rank-``d`` embeddings ``W`` whose Gram matrix approximates ``M``.

    The symmetric matrix ``M`` is eigendecomposed and the ``d`` algebraically
    largest eigenpairs are kept, giving ``W = U_d * sqrt(lambda_d)`` so that
    ``W @ W.T`` is the corresponding truncated reconstruction of ``M``.

    Parameters
    ----------
    M : array_like
        Square symmetric matrix (converted to a float64 tensor).
    d : int
        Number of leading eigen-directions to keep.

    Returns
    -------
    numpy.ndarray
        Array with one row per row of ``M`` and up to ``d`` columns.

    Notes
    -----
    * The decomposition runs on the GPU when CUDA is available and on the CPU
      otherwise, so the function no longer fails on machines without a GPU.
    * Negative eigenvalues among the kept ones are clamped to zero before the
      square root. For an indefinite ``M`` the affected columns are therefore
      zero instead of NaN.
    """
    print("starting.. ", end='')
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    M = torch.tensor(M, dtype=torch.float64, device=device)
    eigvals, eigvecs = torch.linalg.eigh(M)
    # eigh returns eigenvalues in ascending order; flip to descending.
    eigvals, eigvecs = eigvals.flip(dims=(0,)), eigvecs.flip(dims=(1,))
    eigvals, eigvecs = eigvals.cpu().numpy(), eigvecs.cpu().numpy()
    # Clamp so the square root of a negative eigenvalue cannot produce NaN.
    scales = np.sqrt(np.clip(eigvals[:d], 0.0, None))
    # Column-wise scaling by broadcasting; equivalent to `@ np.diag(scales)`.
    W = eigvecs[:, :d] * scales
    print("done.")
    return W

# Factorise each target matrix into EMBEDDIM-dimensional embeddings.
# PPMI is PMI with its negative entries replaced by zero.
W_Mstar = get_W_from_M(M=Mstar, d=EMBEDDIM)
W_PMI = get_W_from_M(M=PMI, d=EMBEDDIM)
W_PPMI = get_W_from_M(M=np.maximum(0, PMI), d=EMBEDDIM)

starting.. done.
starting.. done.
starting.. done.


In [83]:
# Gather every embedding matrix under its display name. Insertion order is
# the order in which models.items() yields them.
models = dict(
    SGNS=W_SGNS,
    QWEM=W_QWEM,
    Mstar=W_Mstar,
    PPMI=W_PPMI,
    PMI=W_PMI,
)

# Persist the collection, keyed by embedding dimension and vocabulary size.
save_fm = FileManager('../analysis/bench_models')
models_file = f"models_d{EMBEDDIM}_V{VOCAB_SZ}.pickle"
save_fm.save(models, models_file)

## Eval benchmarks

In [84]:
from scipy.stats import spearmanr


def read_similarity_data(file_path, vocab):
    """Parse a word-similarity benchmark file into token-index triples.

    Each usable line holds exactly three whitespace-separated fields:
    ``word1 word2 score``. Lines with a different field count are ignored, as
    are pairs where either word is missing from ``vocab.word2token``.

    Parameters
    ----------
    file_path : str
        Path of the UTF-8 encoded benchmark file.
    vocab : object
        Anything exposing a ``word2token`` mapping from word to index.

    Returns
    -------
    list of [token1, token2, float]
        One entry per retained pair, in file order.
    """
    pairs = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split()
            if len(fields) != 3:
                continue
            first, second = fields[0], fields[1]
            # Parse the score before the vocabulary check, so a malformed
            # number raises even for out-of-vocabulary pairs.
            score = float(fields[2])
            if first not in vocab.word2token or second not in vocab.word2token:
                continue
            pairs.append([vocab.word2token[first], vocab.word2token[second], score])
    return pairs


def evaluate_similarity(W, similarity_data):
    """Spearman correlation between embedding cosine similarities and human scores.

    Parameters
    ----------
    W : numpy.ndarray
        Embedding matrix with one row per token.
    similarity_data : iterable of (token1, token2, score)
        Row-index pairs together with their human-rated similarity.

    Returns
    -------
    float
        Spearman rank correlation between the cosine similarities and the
        human scores.

    Raises
    ------
    ValueError
        If ``similarity_data`` yields no pairs.
    """
    # Unit-normalise the rows; the small offset guards against all-zero rows.
    row_norms = np.linalg.norm(W, axis=1, keepdims=True)
    unit_rows = W / (row_norms + 1e-10)

    scored = [
        (np.dot(unit_rows[a], unit_rows[b]).item(), gold)
        for a, b, gold in similarity_data
    ]
    if not scored:
        raise ValueError("No valid word pairs found in embeddings.")

    model_scores, gold_scores = zip(*scored)
    return spearmanr(model_scores, gold_scores).correlation

# Read the MEN and WS-353 word-similarity benchmarks from $DATASETPATH and
# report how many pairs of each are covered by the vocabulary.
dataset_dir = os.getenv("DATASETPATH")

mendir = os.path.join(dataset_dir, "qwem/benchmarks/MEN.txt")
men_dataset = read_similarity_data(mendir, vocab)

ws353dir = os.path.join(dataset_dir, "qwem/benchmarks/ws353.txt")
ws353_dataset = read_similarity_data(ws353dir, vocab)

print(len(men_dataset), len(ws353_dataset), sep="\n")

1588
249


In [87]:
# Score every model on the analogy task and both similarity benchmarks.
# results[i, j] holds the score of the i-th model (in `models` order) on the
# j-th entry of `benchmarks`.
benchmarks = ["Google analogies", "MEN", "ws353"]
results = np.empty((len(models), len(benchmarks)))
for i, (k, W) in enumerate(models.items()):
    print(k)

    acc = results[i, 0] = analogy_dataset.eval_accuracy(W)
    print(f"Analogy acc: {100*acc:.1f}")

    rho = results[i, 1] = evaluate_similarity(W, men_dataset).mean().item()
    print(f"MEN score: {rho:.3f}")

    rho = results[i, 2] = evaluate_similarity(W, ws353_dataset).mean().item()
    print(f"ws353 score: {rho:.4f}")

    print()

SGNS
Analogy acc: 68.0
MEN score: 0.744
ws353 score: 0.6976

QWEM
Analogy acc: 65.1
MEN score: 0.755
ws353 score: 0.6815

Mstar
Analogy acc: 66.3
MEN score: 0.755
ws353 score: 0.6829

PPMI
Analogy acc: 50.6
MEN score: 0.744
ws353 score: 0.6904

PMI
Analogy acc: 8.4
MEN score: 0.448
ws353 score: 0.2206



## Eigenfeatures

In [79]:
# Inspect single embedding coordinates: for each chosen direction, list the
# words with the largest and the smallest coordinate values.
NORMALIZE = True
# Model under inspection; alternatives are models["QWEM"] and models["SGNS"].
W = models["PPMI"]

V, S, Ut = np.linalg.svd(W, full_matrices=False)
# Sanity check: the right singular vectors are the identity up to sign, so
# the columns of W are already its principal directions.
assert np.allclose(np.abs(Ut), np.eye(EMBEDDIM))
if NORMALIZE:
    norms = np.linalg.norm(W, axis=1, keepdims=True)
else:
    norms = 1
embeds = W / norms

dd = [1, 2, 3, 9, 10, 11, 12, 13, 14, 15, 100]
for d in dd:
    vec = embeds[:, d-1]
    # Rank only the first 4000 vocabulary entries, largest coordinate first.
    idxs = np.argsort(vec[:4000])[::-1]
    vec_sort = vec[idxs]
    top_words = vocab.to_words(idxs[:20])
    bottom_words = vocab.to_words(idxs[-20:][::-1])
    print(f"PCA dir {d}")
    # Printed mean covers the 10 most extreme values; 20 words are shown.
    print(f'{(vec_sort[:10]).mean():.3f} {top_words}')
    print(f'{(vec_sort[-10:]).mean():.3f} {bottom_words}')
    print()

PCA dir 1
-0.025 lemmon kitt socorro spacewatch fefefe id sort km median right peak households expatriate mount establishments hispanic survey footballers census races
-0.519 eric cooper jones dennis oliver sam tom robinson roberts jack michael thompson miller harris scott lewis taylor alex barry wilson

PCA dir 2
0.475 furthermore requires can specific useful require therefore particular processes example typically such whereas component types specifically components additionally possible instance
-0.589 jones dennis eric robinson scott michael oliver taylor roberts david miller harris smith lewis russell mitchell wilson thompson frank cooper

PCA dir 3
0.320 like can uses makes soft simple baby typical typically shape featuring eyes combination usually using surface dark charlie happy similar
-0.410 government establishment governments foreign authorities leaders declared officials behalf civil political independence citizens sought union federal relations commission regime organisat