In [1]:
import gensim.downloader as api
import torch
import random
import numpy as np
import os
from torch.nn.functional import normalize
from torch import cosine_similarity
from pytorch_lightning import seed_everything
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import requests, zipfile
from gensim.models import KeyedVectors
from sklearn.manifold import TSNE



# choosing top_k among top_similar words to pivot words
NO_quantitative_shared_words = 20000 # 20K

# Ensure reproducibility
seed = 100

# input range in visualizing plots of latents
plot_lim = 5

# number of anchors for relative represntation dimensions to compute the similarity
num_anchors = 300

In [2]:
def relative_projection(x, anchors):
    x = normalize(x, p=2, dim=-1)
    anchors = normalize(anchors, p=2, dim=-1)
    return torch.einsum("bm, am -> ba", x, anchors)


def get_relative(vectors, anchor_ids):
    anchor_latents = vectors[anchor_ids]
    relative_vectors = relative_projection(x=vectors, anchors=anchor_latents)
    return relative_vectors

In [3]:
# method of choosing anchors (uniform sampling)
def get_anchor_ids(num_anchors, num_latents):
    # Select anchors
    seed_everything(seed)
    anchor_set = random.sample(range(num_latents), num_anchors)
    result = sorted(anchor_set)
    return result

# given two encoders returns shared vocabulary
def get_shared_vocabulary(encoder1, encoder2):
    vocab1 = set(encoder1.key_to_index.keys())
    vocab2 = set(encoder2.key_to_index.keys())
    return list(vocab1.intersection(vocab2))

In [4]:
# Step 1: Download the file
dataset = "wiki-news-300d-1M.vec"
url = f"https://dl.fbaipublicfiles.com/fasttext/vectors-english/{dataset}.zip"
response = requests.get(url)
with open("wiki-news-300d-1M.vec.zip", "wb") as f:
    f.write(response.content)

# Step 2: Extract the file
with zipfile.ZipFile("wiki-news-300d-1M.vec.zip", "r") as zip_ref:
    zip_ref.extractall()


# Step 3: Load the vectors into gensim
fasttext_model = KeyedVectors.load_word2vec_format(dataset, binary=False)

In [5]:
ENCODERS = {
    "fasttext": fasttext_model,
    "word2vec": api.load("word2vec-google-news-300"),
}



In [6]:
shared_vocab = get_shared_vocabulary(ENCODERS["word2vec"], ENCODERS["fasttext"])

In [7]:
# Select 20K random pivot words
seed_everything(seed)
quantitative_words = random.sample(shared_vocab, NO_quantitative_shared_words)

In [8]:
# get absolute representations of selected words among the shared vocabulary
word2vec_absolute_latents = torch.tensor([ENCODERS["fasttext"][word]
                                       for word in quantitative_words])
fasttext_absolute_latents = torch.tensor([ENCODERS["word2vec"][word]
                                       for word in quantitative_words])
      
# The shape is [num_samples, hidden_dim]
print(word2vec_absolute_latents.shape, fasttext_absolute_latents.shape)

# Compare the absolute latents --> low similarity since there is an isometry
cosine_similarity(word2vec_absolute_latents, fasttext_absolute_latents).mean()

  word2vec_absolute_latents = torch.tensor([ENCODERS["fasttext"][word]


torch.Size([20000, 300]) torch.Size([20000, 300])


tensor(-0.0032)

In [9]:
# Get some anchors
anchor_ids = get_anchor_ids(num_anchors, num_latents=NO_quantitative_shared_words)

# Transform both spaces w.r.t. the same anchors
word2vec_relative_latents = get_relative(word2vec_absolute_latents, anchor_ids)
fasttext_relative_latents = get_relative(fasttext_absolute_latents, anchor_ids)

# The shape is [num_samples, num_anchors]
print(word2vec_relative_latents.shape, fasttext_relative_latents.shape)

# Compare the relative spaces --> perfect similarity, since we are invariant to isometries
cosine_similarity(word2vec_relative_latents, fasttext_relative_latents).mean()

torch.Size([20000, 300]) torch.Size([20000, 300])


tensor(0.7779)