In [None]:
import gc
from collections import Counter

import numpy as np
import torch
from gensim.models import KeyedVectors
from irproject.semantic_shifts import (
    compute_cosine_shifts, compute_freqs, 
    compute_nn_shifts, compute_senses_frequencies, 
    compute_targets_embeddings, evaluate_shifts, 
    get_autoencoded_embeddings, get_most_freq_targets, 
    get_targets, get_sentences_with_targets, 
    get_umap_embeddings, intersect_vocabulary, 
    load_data, load_target_embeddings, 
    load_target_sentences_indices, load_semeval_targets, 
    perform_clustering, plot_targets_senses, 
    procrustes_align_gensim, save_context_emb_results, 
    save_static_emb_results, save_targets_embeddings, 
    tokenize_sentences
)
from scipy.spatial import distance
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModel

In [None]:
# Seed every RNG this notebook can touch so that a fresh
# "Restart & Run All" is reproducible. torch is seeded first so the
# cell ends on a call that returns None and shows no output.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Semantic shifts

## Orthogonal Procrustes Approach

In [None]:
# Static word vectors for the two epochs. The old-epoch file is read
# as binary word2vec; the new-epoch file uses the loader's defaults.
old_vectors_path = "datasets/histo-fast-300d.bin"
new_vectors_path = "datasets/wiki-news-300d-1M.vec"
model_old = KeyedVectors.load_word2vec_format(old_vectors_path, binary=True)
model_new = KeyedVectors.load_word2vec_format(new_vectors_path)

# SemEval-2020 target words, then the subset selected by
# get_most_freq_targets for this pair of models.
semeval_dir = "datasets/semeval2020/"
semeval_targets = load_semeval_targets(semeval_dir)
targets = get_most_freq_targets(
    model_old, model_new, semeval_targets
)

In [None]:
# Orthogonal Procrustes alignment of the two models. The result
# rebinds model_new, so re-running this cell without reloading the
# models first would align an already-aligned model again.
model_new = procrustes_align_gensim(
    model_old, model_new, targets
)

In [None]:
# Cosine-based shift score per target between the two models.
op_results = compute_cosine_shifts(
    model_old, model_new, targets
)

# Preview only the first 15 (target, score) pairs.
op_preview = list(op_results.items())
op_preview[:15]

In [None]:
# First element returned by evaluate_shifts, rounded to 3 decimals.
# NOTE(review): presumably the evaluation score -- confirm in evaluate_shifts.
op_score = evaluate_shifts(op_results)[0]
np.round(op_score, 3)

In [None]:
# Persist the Orthogonal Procrustes results under their own name.
save_static_emb_results(
    op_results,
    model_old,
    model_new,
    "orthogonal_procrustes",
)

## Nearest Neighbors Approach 

With this approach we do not need the alignment we performed with Orthogonal Procrustes, so we reload the original (unaligned) models before computing the shifts.

In [None]:
# Reload both sets of vectors from disk so this section starts from
# freshly loaded models rather than the ones rebound earlier.
old_vectors_path = "datasets/histo-fast-300d.bin"
model_old = KeyedVectors.load_word2vec_format(old_vectors_path, binary=True)

new_vectors_path = "datasets/wiki-news-300d-1M.vec"
model_new = KeyedVectors.load_word2vec_format(new_vectors_path)

# Recompute the target list against the reloaded models.
semeval_targets = load_semeval_targets("datasets/semeval2020/")
targets = get_most_freq_targets(
    model_old, model_new, semeval_targets
)

In [None]:
# Restrict both models via intersect_vocabulary, then score each
# target using its 15 nearest neighbours (topn=15).
model_old, model_new = intersect_vocabulary(model_old, model_new)
nn_results = compute_nn_shifts(
    model_old, model_new, targets, topn=15
)

# Preview only the first 15 (target, score) pairs.
nn_preview = list(nn_results.items())
nn_preview[:15]

In [None]:
# First element returned by evaluate_shifts, rounded to 3 decimals.
# NOTE(review): presumably the evaluation score -- confirm in evaluate_shifts.
nn_score = evaluate_shifts(nn_results)[0]
np.round(nn_score, 3)

In [None]:
# Persist the nearest-neighbours results under their own name.
save_static_emb_results(
    nn_results,
    model_old,
    model_new,
    "nearest_neighbors",
)

## Jensen-Shannon Distance Approach

In [None]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Load the SemEval-2020 data and derive the frequencies from it.
data = load_data("datasets/semeval2020/")
freqs_dict = compute_freqs(data)

# get_targets returns the tagged and the clean target lists; both
# are stored back into `data` under their own keys.
targets_tagged, targets_clean = get_targets(data, freqs_dict)
data["targets_tagged"] = targets_tagged
data["targets_clean"] = targets_clean

# Sentences selected by get_sentences_with_targets for the tagged
# targets; later cells index this by corpus name.
sentences_with_trg = get_sentences_with_targets(
    data, data["targets_tagged"]
)

# Names of the two epochs, old first.
corpora_names = ["corpus_old", "corpus_new"]

In [None]:
# We need to truncate because we have a sentence
# with 524 tokens, which is greater than 512, the
# maximum number of tokens allowed by the
# pretrained BERT model.
#
# `truncation` and `max_length` are encoding-time arguments of the
# tokenizer call; `model_max_length` is the load-time setting that
# bounds the length used when truncation is requested. The original
# keyword arguments are kept for backward compatibility.
# NOTE(review): confirm that tokenize_sentences requests truncation
# when it encodes the sentences.
tokenizer = AutoTokenizer.from_pretrained(
    "pretrained/bert-semeval2020-tokenizer",
    truncation=True, max_length=512,
    model_max_length=512
)
model = AutoModel.from_pretrained(
    "pretrained/bert-semeval2020"
)
model.to(device)
# Inference mode (disables dropout); the trailing semicolon keeps
# the cell from printing the full model repr.
model.eval();

Here we tokenize the sentences and compute the embeddings. This is a memory-intensive step: systems with less than 16GB of RAM will likely crash. Consider computing the embeddings for the two corpora in two separate cells, making sure enough memory is available in between, for instance by triggering the garbage collector.

We are saving the embeddings on disk after the computation, so we can skip this cell on subsequent executions.

In [None]:
# Tokenize each corpus, compute the embeddings of its targets and
# save them to disk, one corpus at a time.
for corpus_name in tqdm(
    corpora_names,
    desc="Tokenizing corpus",
    leave=False
):
    tokenized_sentences = tokenize_sentences(
        sentences_with_trg[corpus_name], 
        tokenizer,
        data["targets_tagged"],
        data["targets_clean"]
    )

    targets_embeddings_dict, targets_sentences_dict = compute_targets_embeddings(
        model, tokenizer, corpus_name, data, tokenized_sentences
    )

    save_targets_embeddings(
        targets_embeddings_dict, 
        targets_sentences_dict,
        corpus_name
    )

    # Everything has been saved, so drop the references and collect
    # garbage before the next corpus: otherwise this corpus's data
    # stays alive while the next one is tokenized and embedded.
    del tokenized_sentences, targets_embeddings_dict, targets_sentences_dict
    gc.collect()

Here we process the embeddings of each target word. First, we pass the embeddings through an autoencoder. Then we further reduce their dimensionality with UMAP. Finally, we cluster the embeddings with HDBSCAN, estimate the frequencies of the senses (clusters), and compute the Jensen-Shannon distance between the sense frequency distributions of the two epochs (corpora).

In [None]:
# Reload the SemEval targets with remove_tags=False (presumably
# keeping the tags on the target words -- confirm in
# load_semeval_targets). The function is already imported at the
# top of the notebook, so it is not imported again here.
semeval_targets = load_semeval_targets(
    "datasets/semeval2020", remove_tags=False
)

In [None]:
# Targets whose stacked embeddings have more rows than this cannot
# be processed further with 16GB of RAM and are skipped.
MAX_EMBEDDINGS_PER_TARGET = 27500
# min_cluster_size is this fraction of the number of embeddings,
# capped at MAX_MIN_CLUSTER_SIZE and never below 2.
MIN_CLUSTER_FRACTION = 0.1
MAX_MIN_CLUSTER_SIZE = 80
# Number of highest-probability sentences kept for each sense.
num_sentences = 3


def _sorted_label_counts(epoch_labels):
    """Count cluster labels and return {int(label): count} sorted by label."""
    counts = {
        int(key): val for key, val in Counter(epoch_labels).items()
    }
    return dict(sorted(counts.items(), key=lambda item: item[0]))


results = dict()
top_sentences = dict()
targets_senses_frequencies = dict()
targets_senses_counts = dict()

# Targets skipped by the loop, grouped by the reason.
one_epoch_targets = []
hdbscan_errors = []
memory_issues = []

# Iterating the targets directly (no enumerate) lets tqdm show the
# total when the container is sized.
for target in tqdm(
    data["targets_tagged"],
    desc="Computing score for targets",
    leave=False
):
    # Skip the proper nouns, since they are too noisy.
    if target[-2:] == "pn":
        continue

    targets_senses_frequencies[target] = dict()
    targets_senses_counts[target] = dict()

    # One block of embeddings per corpus, in corpora_names order
    # (old first, then new).
    embeddings = [
        load_target_embeddings(target, corpus_name)
        for corpus_name in corpora_names
    ]
    embeddings_num_per_epoch = [
        len(corpus_embeddings) for corpus_embeddings in embeddings
    ]

    try:
        X = np.vstack(embeddings)
    except Exception:
        # The target has embeddings only for one
        # corpus. `Exception` instead of a bare except so that
        # KeyboardInterrupt / SystemExit still stop the notebook.
        one_epoch_targets.append(target)
        continue

    if X.shape[0] > MAX_EMBEDDINGS_PER_TARGET:
        # Unable to further compute these
        # targets with 16GB of RAM due to
        # high number of embeddings.
        memory_issues.append(target)
        continue

    X = get_autoencoded_embeddings(X, target)

    num_old = embeddings_num_per_epoch[0]
    num_new = embeddings_num_per_epoch[1]

    # Epoch id of every row of X: 0 for corpus_old, 1 for corpus_new.
    embeddings_epochs = [0] * num_old + [1] * num_new

    # Autoencoded embeddings old and autoencoded embeddings new.
    # NOTE(review): this split-and-restack looks like it rebuilds X
    # unchanged; kept in case get_autoencoded_embeddings does not
    # return a NumPy array -- confirm and drop.
    X = np.vstack([X[:num_old], X[num_old:]])
    X = get_umap_embeddings(X, target)

    try:
        min_cluster_size = max(
            min(MAX_MIN_CLUSTER_SIZE, int(MIN_CLUSTER_FRACTION * len(X))), 2
        )
        labels, probas = perform_clustering(X, min_cluster_size)
    except Exception:
        # Clustering failed for this target; record it and move on.
        hdbscan_errors.append(target)
        continue

    senses_frequencies = compute_senses_frequencies(
        labels, embeddings_epochs, embeddings_num_per_epoch
    )

    # Key "0" in senses_frequencies is for corpus_old,
    # while key "1" is for corpus_new.
    targets_senses_frequencies[target]["corpus_old"] = senses_frequencies[0]
    targets_senses_frequencies[target]["corpus_new"] = senses_frequencies[1]

    # Per-epoch label counts: the first num_old labels belong to
    # corpus_old, the remaining ones to corpus_new.
    targets_senses_counts[target]["corpus_old"] = _sorted_label_counts(
        labels[:num_old]
    )
    targets_senses_counts[target]["corpus_new"] = _sorted_label_counts(
        labels[num_old:]
    )

    # Base-2 Jensen-Shannon distance between the sense frequencies
    # of the two epochs.
    jsd = distance.jensenshannon(
        list(senses_frequencies[0].values()), 
        list(senses_frequencies[1].values()), 
        2.0
    )

    results[target] = jsd
    top_sentences[target] = []

    # The sentence indices of a (target, corpus) pair do not change
    # while we pick sentences, so load them at most once per corpus
    # instead of once per selected sentence.
    sentences_indices_by_corpus = dict()

    # Here we get the sentences for each sense. Labels are visited
    # in ascending order so the order of the per-sense lists is
    # deterministic.
    for label in sorted(set(labels)):
        if label == -1:
            # We skip the "noise" label.
            continue

        # (row index, membership probability) of the rows in this
        # sense, most probable first.
        label_probas = sorted(
            (
                (i, probas[i])
                for i, x_label in enumerate(labels)
                if x_label == label
            ),
            key=lambda item: item[1],
            reverse=True
        )

        top_label_sentences = []
        for idx, _ in label_probas[:num_sentences]:
            if embeddings_epochs[idx] == 0:
                corpus_name = "corpus_old"
                idx_in_epoch = idx
            else:
                corpus_name = "corpus_new"
                # Each embedding has its own sentence. We have
                # stacked the embeddings of the two epochs, so the
                # position inside "corpus_new" is the stacked index
                # minus the number of "corpus_old" embeddings.
                idx_in_epoch = idx - num_old

            if corpus_name not in sentences_indices_by_corpus:
                sentences_indices_by_corpus[corpus_name] = (
                    load_target_sentences_indices(target, corpus_name)
                )

            sentence_idx = sentences_indices_by_corpus[corpus_name][idx_in_epoch]
            sentence = sentences_with_trg[corpus_name][sentence_idx]
            top_label_sentences.append(sentence)

        top_sentences[target].append(top_label_sentences)

In [None]:
# First element returned by evaluate_shifts (called with
# remove_tags=False), rounded to 3 decimals.
# NOTE(review): presumably the evaluation score -- confirm in evaluate_shifts.
jsd_score = evaluate_shifts(results, remove_tags=False)[0]
np.round(jsd_score, 3)

In [None]:
# Persist the scores, the example sentences, the skipped-target
# lists and the per-sense statistics under the "jensen_shannon" name.
save_context_emb_results(
    results,
    top_sentences,
    one_epoch_targets,
    hdbscan_errors,
    memory_issues,
    targets_senses_frequencies,
    targets_senses_counts,
    fname="jensen_shannon",
)

In [None]:
# Plot the target senses using the same name the results were saved under.
plot_targets_senses(
    fname="jensen_shannon"
)