In [None]:
import os
import random
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from scipy.stats import pearsonr
from sklearn.metrics.pairwise import cosine_similarity

from evolvemb import load_diachronic_dataset, compute_emb_snapshots, most_changed_tokens, analyze_emb_over_time

%load_ext autoreload
%autoreload 2

In [None]:
def get_emb_snapshots(snapshots, start_date="2019-01-01", local_emb_name="dummy", min_freq=100, n_tokens=10000,
                      saveemb=False, datapath="data/nytimes_dataset.txt"):
    """
    Load cached embedding snapshots from disk or compute them from the dataset.

    Inputs:
        - snapshots: list of snapshot dates (strings); the last entry is used as the end date
              when loading the dataset and is part of the cache file name
        - start_date: start date (string) passed to load_diachronic_dataset
        - local_emb_name: embedding model name passed to compute_emb_snapshots; a leading "data/"
              (e.g. for a fine-tuned model saved in the data folder) is stripped for the cache file name
        - min_freq, n_tokens: passed through to compute_emb_snapshots
        - saveemb: if True, newly computed snapshots are pickled to the cache file
        - datapath: path of the dataset file passed to load_diachronic_dataset
    Returns:
        - snapshot_emb: the object loaded from the cache file or returned by compute_emb_snapshots

    NOTE: the cache file name only encodes local_emb_name, start_date, snapshots[-1] and min_freq,
    i.e. a cache created with a different n_tokens, datapath, or different intermediate snapshots is reused as is.
    """
    emb_name = local_emb_name.lower()
    if local_emb_name.startswith("data/"):
        # e.g. for a fine-tuned model saved in the data folder
        emb_name = emb_name[5:]
    savepath = f"data/snapshot_emb_{emb_name}_{start_date}_{snapshots[-1]}_{min_freq}.pkl"
    # see if we can just load the embeddings
    if os.path.exists(savepath):
        try:
            # NOTE: unpickling can execute arbitrary code; only load cache files you created yourself
            with open(savepath, "rb") as f:
                return pickle.load(f)
        except Exception as e:
            # best effort: fall through and recompute if the cache cannot be read
            print("could not load embeddings:", e)
    # load dataset
    sentences, dates = load_diachronic_dataset(datapath, start_date, snapshots[-1])
    # compute snapshots
    snapshot_emb = compute_emb_snapshots(sentences, dates, snapshots, local_emb_name, min_freq, n_tokens)
    # possibly save embeddings
    if saveemb:
        try:
            with open(savepath, "wb") as f:
                pickle.dump(snapshot_emb, f, -1)
            print(f"successfully saved embeddings at {savepath}")
        except Exception as e:
            # best effort: a failed save must not discard the computed embeddings
            print("error saving embeddings:", e)
    return snapshot_emb

## Minimal Example (to create embedding snapshots for the Dash App)

In [None]:
# desired snapshot dates: pre- and post-corona outbreak in detail
# (every month is labelled with day "31", so these are date labels, not necessarily valid calendar dates)
snapshots = [f"2019-{i:02}-31" for i in range(6, 13)] + [f"2020-{i:02}-31" for i in range(1, 13)]
# compute embedding snapshots with "bert-base-uncased" (can be abbreviated as "bert"; only works for bert and roberta)
snapshot_emb = get_emb_snapshots(snapshots, start_date="2019-04-01", local_emb_name="bert", min_freq=50)
# save embeddings to use with app.py (context manager makes sure the file is flushed and closed)
with open("snapshot_emb.pkl", "wb") as f:
    pickle.dump(snapshot_emb, f, -1)

In [None]:
# see which words have changed the most at some point in the time period
most_changed = most_changed_tokens(snapshot_emb, ignore_zeros=True)
print("most changed tokens (ignoring new words)")
# one line per entry for the first 25 entries: entry[0] left-aligned, entry[1] with four decimals
top_lines = [f"{entry[0]:15} ({entry[1]:.4f})" for entry in most_changed[:25]]
print("\n".join(top_lines))

In [None]:
# create both interactive plots for the word "category" and display them one after the other
fig_time, fig_pca = analyze_emb_over_time(snapshot_emb, "category")
for fig in (fig_time, fig_pca):
    fig.show()

## Full Analysis (to reproduce results from paper)

#### Compare different transformer architectures

In [None]:
def test_cosine_sim_knn(snapshot_emb, k=10):
    """
    Check the overlap between cosine similarity and kNN intersection score (Gonen et al., 2020).

    For every token, both scores are computed between the first and the last snapshot
    (in sorted key order) and shown in a scatter plot with their Pearson correlation as title.

    Inputs:
        - snapshot_emb: mapping from snapshot key to an embedding object that supports emb[token],
              emb.get_nneighbors(token, k, include_simscore=False) and emb.input_model.index2token
        - k: number of nearest neighbors used for the intersection score
    Returns:
        - tokens: list of evaluated tokens (those with a non-zero embedding in the first snapshot)
        - token_sim: np.array with the cosine similarity of each token's first and last embedding
        - token_knn_score: np.array with the fraction of the k nearest neighbors shared by both snapshots
    """
    ordered_keys = sorted(snapshot_emb)
    emb_first = snapshot_emb[ordered_keys[0]]
    emb_last = snapshot_emb[ordered_keys[-1]]
    # ignore words that had a zero embedding in the beginning
    tokens = [tok for tok in emb_first.input_model.index2token if np.any(emb_first[tok] != 0)]
    sims = []
    overlaps = []
    for idx, tok in enumerate(tokens):
        # progress output every 100 tokens (overwritten in place)
        if idx % 100 == 0:
            print(f"Processing {idx+1:6}/{len(tokens)}", end="\r")
        # cosine_similarity expects 2D inputs, hence the added leading axis
        vec_first = emb_first[tok][None, :]
        vec_last = emb_last[tok][None, :]
        sims.append(cosine_similarity(vec_first, vec_last)[0, 0])
        neighbors_first = set(emb_first.get_nneighbors(tok, k, include_simscore=False))
        neighbors_last = set(emb_last.get_nneighbors(tok, k, include_simscore=False))
        overlaps.append(len(neighbors_first & neighbors_last)/k)
    print(f"Processing {len(tokens):6}/{len(tokens)}")
    token_sim = np.array(sims)
    token_knn_score = np.array(overlaps)
    # scatter plot of both scores with their correlation in the title
    plt.figure()
    plt.scatter(token_sim, token_knn_score)
    plt.xlabel("cosine similarity")
    plt.ylabel(f"intersection of NN @ k={k}")
    plt.title(f"correlation: {pearsonr(token_sim, token_knn_score)[0]:.3f}")
    return tokens, token_sim, token_knn_score


def compare_most_changed_tokens(tokens1, tokens2, name1, name2, c="#7C0033", new_fig=True):
    """
    Compare the similarity scores of the most changed tokens from two models in a scatter plot.

    Inputs:
        - tokens1, tokens2: (token, score) pairs (anything accepted by dict()) of the two models
        - name1, name2: axis labels for the scores of the first / second model
        - c: color of the scatter points
        - new_fig: if True, a new 6x6 figure with grid is created; otherwise the points are
              added to the currently active matplotlib figure
    Returns:
        - corr: Pearson correlation of both models' scores on the tokens they have in common
    """
    scores_by_token1 = dict(tokens1)
    scores_by_token2 = dict(tokens2)
    # only tokens scored by both models can be compared; sort for a fixed order
    shared_tokens = sorted(set(scores_by_token1) & set(scores_by_token2))
    scores1 = np.array([scores_by_token1[tok] for tok in shared_tokens])
    scores2 = np.array([scores_by_token2[tok] for tok in shared_tokens])
    if new_fig:
        plt.figure(figsize=(6, 6))
        plt.grid()
    plt.scatter(scores1, scores2, s=10, c=c, alpha=0.5)
    plt.xlabel(name1, fontsize=14)
    plt.ylabel(name2, fontsize=14)
    pearson_result = pearsonr(scores1, scores2)
    corr = pearson_result[0]
    plt.title(f"correlation: {corr:.3f}")
    return corr
    

In [None]:
# desired snapshot dates: pre- and post-corona outbreak in detail
# (one label per month: June-December 2019 followed by January-December 2020, each with day "31")
snapshots = [
    f"{year}-{month:02}-31"
    for year, first_month in ((2019, 6), (2020, 1))
    for month in range(first_month, 13)
]

def run_analysis(local_emb_name="dummy", savefigs="", check_knn_score=False):
    """
    Run the analysis for one embedding model on the module-level `snapshots`.

    Inputs:
        - local_emb_name: embedding model name passed to get_emb_snapshots
        - savefigs: passed to analyze_emb_over_time for the "positive" and "category" plots
        - check_knn_score: if True, additionally run test_cosine_sim_knn for k in 10, 100, 1000
    Returns:
        - snapshot_emb: the embedding snapshots generated/loaded by get_emb_snapshots
        - most_changed: result of most_changed_tokens(snapshot_emb, ignore_zeros=True)
    """
    def print_top(ranking):
        # print the first 25 entries, one per line
        print("\n".join([f"{x[0]:15} ({x[1]:.4f})" for x in ranking[:25]]))

    # generate/load embeddings (newly computed embeddings are cached on disk)
    snapshot_emb = get_emb_snapshots(
        snapshots,
        start_date="2019-04-01",
        local_emb_name=local_emb_name,
        min_freq=50,
        n_tokens=10000,
        saveemb=True,
    )
    # see which words have changed the most at some point in the time period
    most_changed = most_changed_tokens(snapshot_emb, ignore_zeros=True)
    print("most changed tokens (ignore_zeros=True)")
    print_top(most_changed)
    # see which words are new
    print("most changed tokens (ignore_zeros=False)")
    print_top(most_changed_tokens(snapshot_emb, ignore_zeros=False))
    if check_knn_score:
        # see how far the cosine similarity and knn intersection score agree (only the plots are of interest)
        for k in (10, 100, 1000):
            test_cosine_sim_knn(snapshot_emb, k=k)
    # create plots from paper
    for word in ("positive", "category"):
        fig_time, fig_pca = analyze_emb_over_time(snapshot_emb, word, k=5, savefigs=savefigs)
        fig_time.show()
        fig_pca.show()
    return snapshot_emb, most_changed


In [None]:
# run analysis for bert; the same label is used for the model name and for savefigs
bert_name = "bert"
snapshot_emb, bert_most_changed = run_analysis(local_emb_name=bert_name, savefigs=bert_name)
# additionally analyze the word "biden" (the returned figures are not shown here)
_ = analyze_emb_over_time(snapshot_emb, "biden", k=5, savefigs=bert_name)

In [None]:
# same analysis for roberta (only the most changed tokens, i.e. the second return value, are kept)
roberta_most_changed = run_analysis(local_emb_name="roberta")[1]
# and both finetuned models
snapshot_emb, bert_ft_most_changed = run_analysis(local_emb_name="data/nyt_bert")
# additionally analyze the word "biden" with the fine-tuned bert snapshots
_ = analyze_emb_over_time(snapshot_emb, "biden", k=5, savefigs="nyt_bert")
roberta_ft_most_changed = run_analysis(local_emb_name="data/nyt_roberta")[1]

In [None]:
# see how far the most changed tokens agree: first each model before vs. after fine-tuning ...
_ = compare_most_changed_tokens(bert_most_changed, bert_ft_most_changed, "BERT", "BERT (fine-tuned)")
_ = compare_most_changed_tokens(roberta_most_changed, roberta_ft_most_changed, "RoBERTa", "RoBERTa (fine-tuned)")
# ... then BERT vs. RoBERTa; the fine-tuned scores are drawn into the same figure as the pre-trained ones
corr1 = compare_most_changed_tokens(bert_most_changed, roberta_most_changed, "BERT", "RoBERTa")
corr_ft = compare_most_changed_tokens(
    bert_ft_most_changed, roberta_ft_most_changed, "BERT", "RoBERTa", c="#00537C", new_fig=False
)
# replace the per-call title by a legend that lists both correlations
plt.title("")
legend_labels = [f"pre-trained $(r: {corr1:.3f})$", f"fine-tuned $(r: {corr_ft:.3f})$"]
plt.legend(legend_labels, fontsize=14)
plt.savefig("diachr_score_agreement.pdf", dpi=300, bbox_inches="tight")

#### Evaluate on data with artificial semantic (non-)shifts

In [None]:
# check most changed tokens when sentences are shuffled
# (i.e. determine threshold on cosine similarity to avoid false positives)
savepath = f"data/snapshot_emb_shuffled_2019-04-01_{snapshots[-1]}_50.pkl"
# see if we can just load the embeddings
if os.path.exists(savepath):
    # NOTE: unpickling can execute arbitrary code; only load cache files you created yourself
    with open(savepath, "rb") as f:
        snapshot_emb = pickle.load(f)
else:
    # load dataset
    sentences, dates = load_diachronic_dataset("data/nytimes_dataset.txt", "2019-04-01", snapshots[-1])
    # shuffle sentences (but leave dates as they were!)
    random.seed(10)
    random.shuffle(sentences)  # inplace
    # compute snapshots as before with shuffled sentences
    snapshot_emb = compute_emb_snapshots(sentences, dates, snapshots, "bert", 50)
    # cache the result; the context manager makes sure the file is flushed and closed
    with open(savepath, "wb") as f:
        pickle.dump(snapshot_emb, f, -1)
# check which are now the most changed words
most_changed = most_changed_tokens(snapshot_emb, ignore_zeros=True)
print("most changed tokens (ignore_zeros=True)")
print("\n".join([f"{x[0]:15} ({x[1]:.4f})" for x in most_changed[:25]]))
# example plot for our previous most changed token
fig_time, fig_pca = analyze_emb_over_time(snapshot_emb, "category", k=5, savefigs="shuffled")
fig_time.show()
fig_pca.show()

In [None]:
# load original bert checkpoint (this is the cache file that get_emb_snapshots writes for
# local_emb_name="bert", start_date="2019-04-01", min_freq=50 when called with saveemb=True, as run_analysis does)
savepath = f"data/snapshot_emb_bert_2019-04-01_{snapshots[-1]}_50.pkl"
# NOTE: unpickling can execute arbitrary code; only load cache files you created yourself
with open(savepath, "rb") as f:
    snapshot_emb = pickle.load(f)
# select two words that occur fairly often and that don't have too much in common
# the input model of the embeddings already contains counts of the tokens, check the 100 most frequent
print(snapshot_emb[snapshots[-1]].input_model.token_counts.most_common(100))
# select two words which we believe aren't too similar
word1 = "president"
word2 = "coronavirus"
# check their cosine similarity in the last snapshot to be sure they really are not very similar
print(f"cosine similarity between {word1} and {word2}", cosine_similarity(snapshot_emb[snapshots[-1]][word1][None, :], snapshot_emb[snapshots[-1]][word2][None, :]))

In [None]:
# look at the over-time plot of each word to check its original nearest neighbors
# (the second figure returned by analyze_emb_over_time is not needed here)
for query_word in (word1, word2):
    fig_time, _ = analyze_emb_over_time(snapshot_emb, query_word)
    fig_time.show()

In [None]:
# load dataset
sentences, dates = load_diachronic_dataset("data/nytimes_dataset.txt", "2019-04-01", snapshots[-1])
# create an artificial new word as a combination of both words
newword = f"{word1}{word2}"
# split the original sentences into 3 lists: those with word1, with word2, and without any of the words
# (dates are only kept for the sentences without the words)
sentences_word1, sentences_word2 = [], []
sentences_without, dates_without = [], []
for idx, sent in enumerate(sentences):
    has_word1 = word1 in sent
    has_word2 = word2 in sent
    if has_word1 and has_word2:
        # ignore sentences with both words
        continue
    if has_word1:
        # replace original word with artificial word
        sentences_word1.append([newword if tok == word1 else tok for tok in sent])
    elif has_word2:
        sentences_word2.append([newword if tok == word2 else tok for tok in sent])
    else:
        sentences_without.append(sent)
        dates_without.append(dates[idx])
print(f"number of sentences with {word1}:", len(sentences_word1))
print(f"number of sentences with {word2}:", len(sentences_word2))
print("number of sentences without the words:", len(sentences_without))

In [None]:
# sigmoid function based on which we'll draw the sentences
def sigm(i, n):
    """
    Logistic function over positions i in a sequence of length n.

    The curve is centered at n/2 (value 0.5) with a scale of n/10, so its shape
    relative to the sequence is the same for every n.

    Inputs:
        - i: position (number or numpy array)
        - n: length of the sequence
    Returns:
        - value(s) of the sigmoid at position(s) i
    """
    midpoint = n/2
    scale = n/10
    return 1/(1+np.exp(-(i-midpoint)/scale))

# check that it looks correctly independent of the number of sentences
# for n in [1000, 10000]:
#     x = np.arange(n)
#     plt.figure()
#     plt.plot(x, sigm(x, n));

In [None]:
# shuffle both sets of sentences and take the same number from each
# (work on copies: the lists from the previous cell stay untouched, so re-running this cell
# gives the same result instead of operating on already shuffled/emptied lists)
random.seed(23)
pool_word1 = list(sentences_word1)
random.shuffle(pool_word1)
pool_word2 = list(sentences_word2)
random.shuffle(pool_word2)
min_len = min(len(pool_word1), len(pool_word2))
pool_word1, pool_word2 = pool_word1[:min_len], pool_word2[:min_len]
# combine both lists into a single list where we first have a high probability of choosing sentences from
# the first word and then from the second
sentences_both = []
n = len(pool_word1)+len(pool_word2)
for i in range(n):
    # a sentence with word1 is forced while more sentences with word1 than with word2 are left;
    # otherwise (and only if sentences with word1 are left) word1 is chosen when a random draw is >= sigm(i, n),
    # i.e. with probability 1 - sigm(i, n); random.random() is only called in that second case
    if (len(pool_word1) > len(pool_word2)) or (len(pool_word1) and random.random() >= sigm(i, n)):
        sentences_both.append(pool_word1.pop())
    else:
        sentences_both.append(pool_word2.pop())

In [None]:
# check some sentences at the beginning ... all about word1
first_sentences = sentences_both[:10]
print("\n".join(" ".join(sent) for sent in first_sentences))

In [None]:
# ... and some at the end; they are about word2
last_sentences = sentences_both[-10:]
print("\n".join(" ".join(sent) for sent in last_sentences))

In [None]:
# interleave the new sentences with the originals
sentences_new = []
dates_new = []
n = len(sentences_without)
# every r_th sentence should be from our artificial list; divisor and result are clamped to >= 1 so that
# neither an empty artificial list nor more artificial than original sentences causes a ZeroDivisionError
r = max(1, n // max(1, len(sentences_both)))
i_both = 0
for i in range(n):
    # always add the original sentence
    sentences_new.append(sentences_without[i])
    dates_new.append(dates_without[i])
    # in between add a sentence from the artificial list
    if not i % r and i_both < len(sentences_both):
        sentences_new.append(sentences_both[i_both])
        # add the same date again
        dates_new.append(dates_without[i])
        i_both += 1
# append all artificial sentences that did not fit into the loop (only possible if there are more
# artificial than original sentences); they get the date of the last original sentence
if i_both < len(sentences_both):
    leftover = sentences_both[i_both:]
    sentences_new.extend(leftover)
    dates_new.extend([dates_without[-1]] * len(leftover))
    i_both = len(sentences_both)

In [None]:
# save new sentences as a dataset to fine tune bert on
# (one line per sentence: the date, a tab, then the elements of the sentence joined by spaces)
with open(f"data/nytimes_dataset_{newword}.txt", "w") as f:
    dataset_lines = [f"{date}\t{' '.join(sentences_new[i])}" for i, date in enumerate(dates_new)]
    f.write("\n".join(dataset_lines))

In [None]:
# compute snapshots from our new sentences
savepath = f"data/snapshot_emb_bert_{newword}_2019-04-01_{snapshots[-1]}_50.pkl"
# see if we can just load the embeddings
if os.path.exists(savepath):
    # NOTE: unpickling can execute arbitrary code; only load cache files you created yourself
    with open(savepath, "rb") as f:
        snapshot_emb = pickle.load(f)
else:
    snapshot_emb = compute_emb_snapshots(sentences_new, dates_new, snapshots, "bert", 50)
    # cache the result; the context manager makes sure the file is flushed and closed
    with open(savepath, "wb") as f:
        pickle.dump(snapshot_emb, f, -1)
# check which are now the most changed words
most_changed = most_changed_tokens(snapshot_emb, ignore_zeros=True)
print("most changed tokens (ignore_zeros=True)")
print("\n".join([f"{x[0]:15} ({x[1]:.4f})" for x in most_changed[:25]]))
# example plot for our new word
fig_time, fig_pca = analyze_emb_over_time(snapshot_emb, newword, k=5, savefigs="bert")
fig_time.show()
fig_pca.show()

In [None]:
# compute snapshots from our new sentences with the fine-tuned model
savepath = f"data/snapshot_emb_nyt_bert_{newword}_2019-04-01_{snapshots[-1]}_50.pkl"
# see if we can just load the embeddings
if os.path.exists(savepath):
    # NOTE: unpickling can execute arbitrary code; only load cache files you created yourself
    with open(savepath, "rb") as f:
        snapshot_emb = pickle.load(f)
else:
    snapshot_emb = compute_emb_snapshots(sentences_new, dates_new, snapshots, f"data/nyt_bert_{newword}", 50)
    # cache the result; the context manager makes sure the file is flushed and closed
    with open(savepath, "wb") as f:
        pickle.dump(snapshot_emb, f, -1)
# check which are now the most changed words
most_changed = most_changed_tokens(snapshot_emb, ignore_zeros=True)
print("most changed tokens (ignore_zeros=True)")
print("\n".join([f"{x[0]:15} ({x[1]:.4f})" for x in most_changed[:25]]))
# example plot for our new word
fig_time, fig_pca = analyze_emb_over_time(snapshot_emb, newword, k=5, savefigs="nyt_bert")
fig_time.show()
fig_pca.show()