In [4]:
import numpy as np
import torch

import os
import sys
sys.path.insert(0, "./../")
import utils

from FileManager import FileManager

In [2]:
# Number of vocabulary entries kept; every matrix below is truncated to
# its leading VOCAB_SZ x VOCAB_SZ block.
VOCAB_SZ = 10_000

# os.getenv returns None when the variable is unset, which would make
# os.path.join fail with an opaque TypeError. Fail early with a clear message.
dataset_root = os.getenv("DATASETPATH")
if dataset_root is None:
    raise RuntimeError(
        "Environment variable DATASETPATH is not set; "
        "it must point to the directory containing the 'qwem' dataset."
    )
data_dir = os.path.join(dataset_root, "qwem")
data_fm = FileManager(data_dir)

data_fm.set_filepath("min500")
# NOTE(review): the .pickle files are presumably unpickled by FileManager.load;
# only load files from a trusted source.
word_counts = data_fm.load("word_counts.pickle")
# NOTE(review): taking the first VOCAB_SZ entries assumes word_counts is
# ordered consistently with the rows/columns of the count matrices — confirm.
vocab = utils.Vocabulary(word_counts[:VOCAB_SZ])
# Normalize the vocabulary counts so they sum to 1.
unigram = vocab.counts / vocab.counts.sum()

print("Computing M*... ", end="")
corpus_stats = data_fm.load("corpus_stats.pickle")
cL = corpus_stats["context_len"]
Cij, Crwij = corpus_stats["counts"], corpus_stats["counts_reweight"]
# Total raw count mass inside the truncated vocabulary block.
numcounts = Cij[:VOCAB_SZ, :VOCAB_SZ].sum()
# Reweighted counts scaled by numcounts * (cL + 1) / 2.
# NOTE(review): (cL + 1) / 2 is presumably the normalizer matching how
# "counts_reweight" was accumulated — confirm against the code that builds it.
Pij = Crwij[:VOCAB_SZ, :VOCAB_SZ] / (numcounts * (cL + 1)/2)
# Independence baseline: outer product of the unigram distribution with itself.
PiPj = np.outer(unigram, unigram)
# Elementwise difference between Pij and the baseline, divided by their mean.
Mstar = 2*(Pij - PiPj)/(Pij + PiPj)
print("done.")

Computing M*... done.


In [None]:
print("Computing eigenfeatures... ", end="")
# Use the GPU when one is present, otherwise fall back to the CPU instead of
# crashing on an unconditional .cuda() call.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# as_tensor accepts both the NumPy array produced upstream and the tensor left
# behind by a previous run of this cell, so re-running the cell is safe.
Mstar = torch.as_tensor(Mstar, dtype=torch.float64, device=device)
# eigh returns eigenvalues in ascending order; flip both outputs so that the
# largest eigenvalue (and its eigenvector column) comes first.
eigvals, eigvecs = torch.linalg.eigh(Mstar)
eigvals, eigvecs = eigvals.flip(dims=(0,)), eigvecs.flip(dims=(1,))
# Sanity check: V diag(w) V^T must reproduce Mstar. Scaling the columns of V by
# w via broadcasting gives the same product as V @ diag(w) without building a
# dense n x n diagonal matrix. Because eigh reads only one triangle of its
# input, this check also fails if Mstar is not symmetric.
assert torch.allclose(Mstar, (eigvecs * eigvals) @ eigvecs.T), \
    "eigendecomposition does not reconstruct Mstar (is Mstar symmetric?)"
eigvecs, eigvals = eigvecs.cpu().numpy(), eigvals.cpu().numpy()
print("done.")

# Persist the sorted eigenvectors and eigenvalues as NumPy arrays.
analysis_fm = FileManager("../analysis")
analysis_fm.save(eigvecs, "mstar-eigvecs.npy")
analysis_fm.save(eigvals, "mstar-eigvals.npy")

Computing eigenfeatures... done.
