In [None]:
import numpy as np
import sys
import time
import random
from tqdm.notebook import tqdm, trange
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import pickle 
import nltk
from nltk.corpus import wordnet
nltk.download("stopwords")
nltk.download("wordnet")

import torch
import torch.nn as nn
import torch.optim as optim

PATH = '/content/drive/MyDrive/word2vec_eltdm'

sys.path.append(PATH)

%load_ext autoreload
%autoreload 2

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
from word2vec_eltdm.common import (
    Tokenizer, VocabCreator, DataLoader, TokenCleaner, 
    Preprocessor, Subsampler, evaluate, visualization_tsne
)

In [None]:
# Load the pickled PyTorch checkpoint from the project folder configured in PATH
# (defined in the first cell), instead of repeating the absolute path.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load checkpoints that were produced by this project.
filepath = f"{PATH}/models/pytorch_best.p"
with open(filepath, "rb") as file:
    pytorch_mod = pickle.load(file)

In [None]:
# Load the pickled NumPy checkpoint from the project folder configured in PATH
# (defined in the first cell), instead of repeating the absolute path.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load checkpoints that were produced by this project.
filepath = f"{PATH}/models/numpy_best.p"
with open(filepath, "rb") as file:
    numpy_mod = pickle.load(file)

In [None]:
# Each unpickled checkpoint is indexed by a string key to retrieve the stored model object.
# NOTE(review): the keys look like the model class names — confirm against the training notebooks.
pytorch_model = pytorch_mod['PytorchNegWord2Vec']
numpy_model = numpy_mod['NegWord2Vec']

In [None]:
# Pull the embedding matrices out of both models as NumPy arrays.
# detach() + cpu() keep the conversion working when the checkpoint's weights live
# on a GPU or still track gradients; Tensor.numpy() only accepts CPU tensors.
pytorch_embeddings = pytorch_model.embedding_input.weight.detach().cpu().numpy()
# NOTE(review): best_W1 is presumably the NumPy model's input-embedding matrix — confirm.
numpy_embeddings = numpy_model.best_W1

In [None]:
# Location of the text8 corpus on the mounted drive.
datapath = "/content/drive/MyDrive/word2vec_eltdm/data/text8"

# RATIO and return_only_train are forwarded positionally to Preprocessor below.
# NOTE(review): RATIO is presumably the fraction of the corpus to keep and
# return_only_train presumably skips the validation/test split — confirm against
# word2vec_eltdm.common.Preprocessor.
RATIO = 1
return_only_train = True
tokenizer = Tokenizer(datapath)
token_cleaner = TokenCleaner(freq_threshold=5)
vocab_creator = VocabCreator()
# The resulting dataset provides the tokens_to_id / id_to_tokens mappings used by the cells below.
text8_dataset = Preprocessor(tokenizer, token_cleaner, vocab_creator, RATIO, return_only_train).preprocess()

## Utils

In [None]:
import itertools
import pandas as pd
import numpy as np
import more_itertools

def cosine_similarity(x, y):
    """Return the cosine of the angle formed by the vectors x and y.

    The result lies in the range [-1, 1]:
    - 1 when x and y point in the same direction;
    - 0 when x and y are orthogonal;
    - -1 when x and y point in opposite directions.
    """
    inner_product = np.dot(x, y)
    norm_product = np.linalg.norm(x) * np.linalg.norm(y)
    return inner_product / norm_product

def cosine_distance(x, y):
    """Return the cosine distance between the vectors x and y.

    The cosine distance is the cosine similarity flipped and rescaled so that it
    lies in the range [0, 1]:
    - 0 when x and y are positively colinear;
    - 1/2 when x and y are orthogonal;
    - 1 when x and y are negatively colinear.
    """
    rescaled_similarity = (cosine_similarity(x, y) + 1) / 2
    return 1 - rescaled_similarity

def get_metric_function(metric):
    """Resolve a metric specification into a two-argument callable.

    Args:
        metric (str or callable): A callable is returned unchanged. Otherwise it
            must be one of the string aliases "l0", "l1", "l2", "linf", "dot",
            "cosine_sim", "cosine_dist", "cov", "corr", "eq_sim", "eq_dist",
            "bool_sim" or "bool_dist".

    Returns:
        callable: Function taking two objects and returning their metric value.

    Raises:
        AssertionError: If `metric` is neither callable nor a known alias.
    """
    if callable(metric):
        return metric

    def norm(x, ord):
        # Scalars have no axis to reduce over and make numpy complain, so they
        # are handled by hand.
        if np.ndim(x) == 0:
            if ord == 0:
                return 1*(x != 0)
            return np.abs(x)
        # Flatten first: with an explicit `ord`, np.linalg.norm computes *matrix*
        # norms on 2-D input (e.g. ord=2 is the largest singular value), rejects
        # ord=0 for matrices and rejects >2-D input. The distances advertised
        # here are element-wise vector norms for arrays of any shape.
        return np.linalg.norm(np.ravel(x), ord=ord)

    metric_functions = {
        "l0": lambda x, y: norm(x-y, ord=0),
        "l2": lambda x, y: norm(x-y, ord=2),
        "l1": lambda x, y: norm(x-y, ord=1),
        "linf": lambda x, y: norm(x-y, ord=np.inf),
        "dot": np.dot,
        "cosine_sim": cosine_similarity,
        "cosine_dist": cosine_distance,
        # Biased estimators (n in the denominator).
        "cov": lambda x, y: ((x - x.mean()) * (y - y.mean())).mean(),
        "corr": lambda x, y: ((x - x.mean()) * (y - y.mean())).mean() / np.sqrt(np.var(x) * np.var(y)),
        "eq_sim": lambda x, y: (x == y).mean(),
        "eq_dist": lambda x, y: (x != y).mean(),
        "bool_sim": lambda x, y: (x.astype("bool") == y.astype("bool")).mean(),
        "bool_dist": lambda x, y: (x.astype("bool") != y.astype("bool")).mean(),
    }

    assert metric in metric_functions, f"Invalid metric string alias: {metric}."
    return metric_functions[metric]

def get_pairwise_metric_matrix(vectors, metric="l2", symmetric=False, labels=None, return_series=False, exclude_diagonal=False, verbose=False):
    """Compute pairwise metrics in one or two lists of objects and return them as a matrix.

    Args:
        vectors (iterable of objects or tuple): Objects to compare.
            An iterable of objects can be a 2-D numpy array, with its rows the objects to compare.
            An iterable of objects can also be a pd.DataFrame, in which case the objects to compare are its columns,
            and the returned matrix will be a DataFrame with appropriate column and index labels
            (unless the argument `labels` is set, in which case the labels are taken from that argument instead).
            A pd.Series is also accepted; its values are the objects and its index provides the default labels.
            This argument can also be a 2-tuple of two iterables of objects to compare, which can be of different lengths.
            These two iterables can also be DataFrames.
            If 2-tuple, the matrix will have the first iterable as its rows and the second as its columns.
        metric (str or callable): Metric to use.
            The provided string aliases work for numpy arrays of arbitrary shape,
            provided any two numpy arrays to compare have the same shape.
            Currently supported string aliases:
            - "l0": L0 distance (number of non-equal components)
            - "l1": L1 distance
            - "l2": L2 distance
            - "linf": L-infinity distance
            - "dot": Dot product
            - "cosine_sim": Cosine similarity (cosine of the angle between 2 vectors)
            - "cosine_dist": Cosine distance (1 - cosine similarity, normalized to be between 0 and 1)
            - "cov": Covariance (calculated with n in the denominator, i.e. the biased estimator)
            - "corr": Correlation (calculated with n in the denominator, i.e. the biased estimator)
            - "eq_sim": Proportion of times the two vectors are equal.
            - "eq_dist": Proportion of times the two vectors are not equal.
            - "bool_sim": Taking each vector as boolean, proportion of times the two vectors agree.
            - "bool_dist": Taking each vector as boolean, proportion of times the two vectors disagree.
            If callable, must take two objects as positional arguments and return a single number.
            Default: "l2".
        symmetric (bool): Whether the provided metric function is assumed to be symmetric.
            If True, will only make about half of the calculations and automatically fill the rest.
            If `metric` is provided as a string, this argument will be ignored and automatically inferred.
            The shortcut is only applied when one collection is compared with itself; when `vectors`
            is a tuple of two distinct iterables every pair is computed, because entry (j, i) then
            compares different objects than entry (i, j).
            Default: False.
        labels (list of str, tuple or None): Labels to add to the returned matrix, which will then be returned as a DataFrame
            rather than a numpy array.
            If tuple, must be of length two and be of the form (row_labels, column_labels).
            If list of strings, then the two sets of items to compare must be of the same length, and the length of this argument
            must be equal to that as well.
            If None, return result as a numpy array unless one of the iterables of objects to compare is a DataFrame.
        return_series (bool): Whether to return a flattened pd.Series with a MultiIndex instead of a full matrix.
            The Series holds the lower triangle of the matrix and is indexed by (row label, column label).
            Default: False.
        exclude_diagonal (bool): Only active if return_series is True.
            Whether to skip all diagonal entries (self-distances) from the returned Series.
            Default: False.
        verbose (bool): Whether to display a progress bar.
            Default: False.

    Returns:
        np.array, pd.DataFrame or pd.Series: Pairwise metric matrix.
            If `return_series` is True, this is a Series.
            Otherwise, if the argument `labels` is set or if one of the iterables of objects is a
            DataFrame or Series, this is a DataFrame. Otherwise, it is a numpy array.
    """
    # Define iterables
    if isinstance(vectors, tuple):
        rows, columns = vectors
    else:
        rows, columns = vectors, vectors
    # Recorded before any conversion below replaces `rows`/`columns` with new arrays.
    same_items = rows is columns

    if isinstance(labels, tuple):
        row_labels, column_labels = labels
    else:
        row_labels, column_labels = labels, labels

    if isinstance(rows, pd.DataFrame):
        if row_labels is None:
            row_labels = rows.columns.tolist()
        rows = rows.values.T
    if isinstance(columns, pd.DataFrame):
        if column_labels is None:
            column_labels = columns.columns.tolist()
        columns = columns.values.T

    if isinstance(rows, pd.Series):
        if row_labels is None:
            row_labels = rows.index.tolist()
        rows = rows.values
    if isinstance(columns, pd.Series):
        if column_labels is None:
            column_labels = columns.index.tolist()
        columns = columns.values

    # Define distance function
    symmetric_metrics = ["l0", "l2", "l1", "linf", "dot", "cosine_sim", "cosine_dist", "cov", "corr", "eq_sim", "eq_dist", "bool_sim", "bool_dist"]
    if isinstance(metric, str):
        symmetric = metric in symmetric_metrics
    # metric(a, b) == metric(b, a) only allows mirroring entry (i, j) into (j, i)
    # when rows[k] and columns[k] are the same object for every k.
    symmetric = symmetric and same_items

    metric = get_metric_function(metric)

    # Construct matrix
    n, m = len(rows), len(columns)
    if symmetric:
        # Same collection on both axes, so n == m: visit the upper triangle only.
        pairs = itertools.combinations_with_replacement(range(n), 2)
        nb_pairs = n * (n + 1) // 2
    else:
        pairs = itertools.product(range(n), range(m))
        nb_pairs = n * m
    matrix = np.zeros((n, m))

    for i, j in tqdm(pairs, total=nb_pairs, disable=not verbose):
        res = metric(rows[i], columns[j])
        matrix[i, j] = res
        if symmetric:
            matrix[j, i] = res

    # Transform to DF if any labels are available or if return_series is True.
    # Explicit `is not None` checks avoid element-wise comparisons when labels are arrays.
    if (row_labels is not None) or (column_labels is not None) or return_series:
        matrix = pd.DataFrame(matrix, index=row_labels, columns=column_labels)

    # Exclude repeated elements in the Series
    if return_series:
        diag_offset = -1 if exclude_diagonal else 0
        row_indices, col_indices = np.tril_indices(n, k=diag_offset, m=m)
        indices = list(zip(matrix.index[row_indices], matrix.columns[col_indices]))
        # DataFrame.unstack() yields a (column, row) MultiIndex. Transposing first
        # gives (row, column) keys, matching the order used to build `indices`.
        matrix = matrix.T.unstack().loc[indices]

    return matrix


## Code

In [None]:
# Qualitative sanity check of the PyTorch embeddings.
# NOTE(review): judging by the output below, evaluate prints nearby words for
# nb_words vocabulary words — confirm against word2vec_eltdm.common.evaluate.
evaluate(pytorch_embeddings, text8_dataset.id_to_tokens, nb_words=20)

x | os, v, f, macintosh, z
long | length, tracks, like, double, longer
british | english, actor, american, canadian, britain
five | four, six, seven, three, one
often | sometimes, typically, certain, considered, frequently
form | called, non, commonly, forms, means
language | languages, dialect, dialects, spoken, vocabulary
early | began, late, century, later, became
made | since, also, turned, prepared, producing
work | published, life, study, wrote, works
via | systems, access, based, interface, terminal
physics | physicists, chemistry, mechanics, dirac, theory
environment | environmental, habitat, biodiversity, pollution, virtual
convention | conventions, amendment, treaty, amend, berne
refers | refer, terminology, confused, sometimes, term
speech | speeches, propaganda, televised, hatred, freedom
conflict | conflicts, political, violence, armed, parties
places | entrance, arches, pagans, rare, oldest
critical | studies, reviews, homeopathy, perception, criticism
electric | electric

In [None]:
# Display the token -> id mapping used to index the embedding matrices.
# NOTE(review): this dumps the whole vocabulary into the output; consider
# previewing only the first few items instead.
text8_dataset.tokens_to_id

{'one': 0,
 'zero': 1,
 'nine': 2,
 'two': 3,
 'eight': 4,
 'five': 5,
 'three': 6,
 'four': 7,
 'six': 8,
 'seven': 9,
 'also': 10,
 'first': 11,
 'many': 12,
 'new': 13,
 'used': 14,
 'american': 15,
 'time': 16,
 'see': 17,
 'may': 18,
 'world': 19,
 'b': 20,
 'would': 21,
 'however': 22,
 'years': 23,
 'states': 24,
 'people': 25,
 'war': 26,
 'united': 27,
 'known': 28,
 'called': 29,
 'use': 30,
 'th': 31,
 'system': 32,
 'often': 33,
 'state': 34,
 'history': 35,
 'city': 36,
 'english': 37,
 'made': 38,
 'well': 39,
 'e': 40,
 'number': 41,
 'government': 42,
 'later': 43,
 'since': 44,
 'part': 45,
 'name': 46,
 'c': 47,
 'century': 48,
 'x': 49,
 'university': 50,
 'early': 51,
 'life': 52,
 'british': 53,
 'year': 54,
 'like': 55,
 'including': 56,
 'became': 57,
 'example': 58,
 'day': 59,
 'even': 60,
 'work': 61,
 'language': 62,
 'although': 63,
 'several': 64,
 'form': 65,
 'john': 66,
 'u': 67,
 'national': 68,
 'much': 69,
 'g': 70,
 'french': 71,
 'general': 72,
 'n'

In [None]:
def get_synonyms(word):
    """Collect the WordNet lemma names associated with `word`.

    Every lemma of every synset that WordNet returns for `word` is gathered in
    lookup order. Entries equal to `word` itself are dropped; duplicates are kept.
    """
    lemma_names = (
        lemma.name()
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    )
    return [name for name in lemma_names if name != word]

def get_random_words(embeddings, labels, nb_words=20):
    """Draw `nb_words` distinct random entries from an embedding matrix.

    Args:
        embeddings: Array indexable by a list of integer ids.
        labels: Mapping or sequence giving the label of each id in range(len(labels)).
        nb_words (int): Number of ids to sample without replacement.

    Returns:
        tuple: (embeddings[ids], labels of those ids), in sampling order.
    """
    picked_ids = random.sample(range(len(labels)), nb_words)
    picked_labels = list(map(labels.__getitem__, picked_ids))
    return embeddings[picked_ids], picked_labels

def get_word_sim(vectors1, labels1, vectors2, labels2):
    """Return the cosine-similarity matrix between two sets of word vectors.

    Rows correspond to (vectors1, labels1) and columns to (vectors2, labels2).
    """
    vector_sets = (vectors1, vectors2)
    label_sets = (labels1, labels2)
    return get_pairwise_metric_matrix(vector_sets, metric="cosine_sim", labels=label_sets)

def get_vector(word, embeddings):
    """Return the row of `embeddings` for `word`, using the text8 vocabulary ids.

    `word` must be a key of text8_dataset.tokens_to_id.
    """
    token_id = text8_dataset.tokens_to_id[word]
    return embeddings[token_id]

def get_vectors(word_list, embeddings):
    """Return the in-vocabulary words of `word_list` together with their embedding rows.

    Words that are not keys of text8_dataset.tokens_to_id are silently dropped.

    Returns:
        tuple: (kept words, embeddings indexed by their ids), in the original order.
    """
    vocab = text8_dataset.tokens_to_id
    known_words, ids = [], []
    for token in word_list:
        if token in vocab:
            known_words.append(token)
            ids.append(vocab[token])
    return known_words, embeddings[ids]

def sim_with_syns(word, embeddings):
    """Cosine similarities between `word` and its in-vocabulary WordNet synonyms.

    The result has a single row (labelled `word`) and one column per retained synonym.
    """
    word_vector = get_vector(word, embeddings)
    known_syns, syn_vectors = get_vectors(get_synonyms(word), embeddings)
    return get_word_sim([word_vector], [word], syn_vectors, known_syns)

def compare_sim(embeddings, nb_words=20):
    """Compare synonym similarity with random-pair similarity for `embeddings`.

    Draws `nb_words` random vocabulary words and computes:
    - the mean cosine similarity between each word and its in-vocabulary WordNet
      synonyms, averaged over the words that have at least one such synonym;
    - the mean cosine similarity between distinct words of the random sample.

    Args:
        embeddings: Embedding matrix indexed by the token ids of text8_dataset.
        nb_words (int): Number of random words to draw. Default: 20.

    Returns:
        tuple: (syn_sim_mean, other_sim_mean). other_sim_mean is NaN when fewer
            than two words are drawn, since there is then no pair to compare.
    """
    # Use the matrix passed in. The previous version always read the global
    # pytorch_embeddings, so every model was scored with the PyTorch vectors.
    vectors, labels = get_random_words(embeddings, text8_dataset.id_to_tokens, nb_words=nb_words)

    # One value per word: the row mean of its 1 x n_synonyms similarity matrix.
    # .iloc[0] is positional; the row is labelled with the word, not with 0.
    # Words without in-vocabulary synonyms produce NaN and are filtered out below.
    syn_sim = np.array([sim_with_syns(word, embeddings).mean(axis=1).iloc[0] for word in labels])
    syn_sim_mean = np.mean(syn_sim[~np.isnan(syn_sim)])

    other_sim = get_pairwise_metric_matrix(vectors, labels=labels, metric="cosine_sim").to_numpy()
    # Average over off-diagonal entries only. Zeroing the diagonal and averaging
    # everything would count every self-pair as a similarity of 0.
    off_diagonal = ~np.eye(len(labels), dtype=bool)
    other_sim_mean = np.mean(other_sim[off_diagonal]) if off_diagonal.any() else np.nan
    return syn_sim_mean, other_sim_mean


# One-off comparison with the default nb_words=20: each line prints the tuple
# returned by compare_sim, i.e. (synonym mean, random-pair mean), for one model.
print(f"pytorch : {compare_sim(pytorch_embeddings)}")
print(f"numpy : {compare_sim(numpy_embeddings)}")

pytorch : (0.2089091070792101, 0.0152484581844692)
numpy : (0.1214741871684358, 0.02788230531707086)


In [None]:
word_per_epoch = 100
epochs = 100

# Average compare_sim over `epochs` independent random draws of `word_per_epoch`
# words, alternating between the two models at every draw.
results_pytorch, results_numpy = [], []
for epoch in trange(epochs):
    results_pytorch += [compare_sim(pytorch_embeddings, nb_words=word_per_epoch)]
    results_numpy += [compare_sim(numpy_embeddings, nb_words=word_per_epoch)]
results_numpy = np.asarray(results_numpy).mean(axis=0)
results_pytorch = np.asarray(results_pytorch).mean(axis=0)
pd.DataFrame(
    [results_pytorch, results_numpy],
    index=["pytorch", "numpy"],
    columns=["synonyms", "random"],
)

  0%|          | 0/100 [00:00<?, ?it/s]

Unnamed: 0,synonyms,random
pytorch,0.11915,0.02038
numpy,0.117582,0.020905


In [None]:
word_per_epoch = 100
epochs = 1000

# Longer run for the PyTorch embeddings only: one (synonym mean, random-pair mean)
# pair per draw, averaged column-wise at the end.
results_pytorch = []
for epoch in trange(epochs):
    results_pytorch += [compare_sim(pytorch_embeddings, nb_words=word_per_epoch)]
results_pytorch = np.asarray(results_pytorch).mean(axis=0)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
word_per_epoch = 100
epochs = 1000

# Longer run for the NumPy embeddings only: one (synonym mean, random-pair mean)
# pair per draw, averaged column-wise at the end.
results_numpy = []
for epoch in trange(epochs):
    results_numpy += [compare_sim(numpy_embeddings, nb_words=word_per_epoch)]
results_numpy = np.asarray(results_numpy).mean(axis=0)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
# Summary table: one row per model, with the averaged synonym similarity and
# random-pair similarity computed by the two previous cells.
pd.DataFrame([results_pytorch, results_numpy], columns=["synonyms", "random"], index=["pytorch", "numpy"])

Unnamed: 0,synonyms,random
pytorch,0.116839,0.020376
numpy,0.116991,0.0207
