In [None]:
import os
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from scipy.stats import pearsonr
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import KernelPCA
import plotly.express as px
from flair.embeddings import TransformerWordEmbeddings

from evolvemb import EvolvingEmbeddings, DummyEmbeddings
from emb_noflair import SimplePretrainedEmbeddings

%load_ext autoreload
%autoreload 2

In [None]:
def load_nyt(start_date="2019-01-01", end_date="2020-12-31"):
    """Read the tokenized NYT sentences dated between start_date and end_date (both inclusive).

    Every line of "data/nytimes_dataset.txt" is expected to hold a date and a
    whitespace-tokenized sentence separated by a tab. Dates are compared as plain
    strings, so they have to be in a format that sorts lexicographically
    (the defaults use YYYY-MM-DD). Reading stops at the first date after end_date,
    i.e. the file is assumed to be sorted by date (not verified here).

    Args:
        start_date: lines with an earlier date are skipped.
        end_date: reading stops at the first line with a later date.

    Returns:
        (sentences, dates): two lists of the same length; sentences[i] is a list of
        lowercased tokens and dates[i] is the date string of that sentence.
    """
    sentences = []
    dates = []
    # explicit encoding so the result does not depend on the platform default
    with open("data/nytimes_dataset.txt", encoding="utf-8") as f:
        for line in f:
            # split on the first tab only: a stray tab inside the sentence must not break the unpacking
            parts = line.strip().split("\t", 1)
            if len(parts) != 2:
                # blank line or a date without a sentence -> nothing to learn from
                continue
            d, s = parts
            if d < start_date:
                continue
            if d > end_date:
                break
            dates.append(d)
            # lowercase! and some longer words mistakenly can end with "." due to the tokenizer; remove this!
            sentences.append([w if len(w) <= 3 or not w.endswith(".") else w[:-1] for w in s.lower().split()])
    print("Dataset contains %i sentences between %s and %s" % (len(sentences), start_date, end_date))
    return sentences, dates


def get_emb_snapshots(snapshots, start_date="2019-01-01", local_emb_name="dummy", min_freq=100, n_tokens=10000, saveemb=False):
    """Compute (or load from cache) snapshots of the evolving embeddings at the given dates.

    The sentences between start_date and snapshots[-1] are fed one by one into an
    EvolvingEmbeddings model; whenever the date of the next sentence lies past the
    current snapshot date, a copy of the embeddings is stored under the date of the
    last sentence that was processed before the boundary.

    Args:
        snapshots: sorted list of date strings at which a snapshot should be taken;
            they are compared as strings against the dataset dates.
        start_date: first date of the dataset that is used.
        local_emb_name: "dummy", "bert", "roberta", or the name of a model folder inside "data".
        min_freq: passed to EvolvingEmbeddings as min_freq.
        n_tokens: passed to EvolvingEmbeddings as n_tokens.
        saveemb: whether to pickle the computed snapshots for later calls.

    Returns:
        dict mapping the actual snapshot date to a SimplePretrainedEmbeddings object
        (or whatever the cache file contains, if one could be loaded).

    Note:
        The cache file name encodes the model name, start date, last snapshot and
        min_freq, but neither n_tokens nor the intermediate snapshot dates - a cache
        written with different values for those is loaded as is.
    """
    savepath = "data/snapshot_emb_%s_%s_%s_%i.pkl" % (local_emb_name.lower(), start_date, snapshots[-1], min_freq)
    # see if we can just load the embeddings
    if os.path.exists(savepath):
        try:
            # NOTE: unpickling can execute arbitrary code - only use cache files you created yourself
            with open(savepath, "rb") as f:
                return pickle.load(f)
        except Exception as e:
            # best effort: an unreadable cache is not fatal, the embeddings are recomputed below
            print("could not load embeddings from %s (%s); recomputing" % (savepath, e))
    # load dataset
    sentences, dates = load_nyt(start_date, snapshots[-1])
    # transformer model to generate the local embeddings
    model_name = local_emb_name.lower()
    if model_name == "dummy":
        local_emb = DummyEmbeddings(50, "testemb")  # for some quick testing only
    else:
        if model_name == "bert":
            model_path = "bert-base-uncased"
        elif model_name == "roberta":
            model_path = "roberta-base"
        else:
            # anything else is interpreted as a (fine-tuned) model stored in the data folder
            model_path = os.path.join("data", local_emb_name)
        local_emb = TransformerWordEmbeddings(model_path, layers="all", use_scalar_mix=True, pooling_operation="mean", fine_tune=False)
    # pass sentences directly to generate input model from all texts
    # so we know which words are of interest and their count (to set alphas manually afterwards)
    emb = EvolvingEmbeddings(local_emb, sentences, alpha=None, min_freq=min_freq, n_tokens=n_tokens, update_index=False)
    print("Number of words in the vocabulary (for which we'll learn an embedding):", emb.input_model.n_tokens)
    # create counts dict based on token_counts/interval
    counts_dict = {t: emb.input_model.token_counts[t]/(1.*len(snapshots)) for t in emb.input_model.index2token}
    # manually create max_counts array with entries for individual words
    emb._set_max_count(counts_dict)
    # compute evolving embeddings and take snapshot at the end of each split
    snapshot_emb = {}

    def _take_snapshot(snapshot_date):
        # store the current state of the embeddings under the given date
        temp = emb.as_pretrained()
        # save as a model without flair dependency
        snap = SimplePretrainedEmbeddings(temp.embeddings, temp.input_model)
        # set OOV embedding to zeros
        snap.embeddings[-1] = np.zeros(snap.embeddings.shape[1])
        snapshot_emb[snapshot_date] = snap

    current_snap = 0
    for i, s in enumerate(sentences):
        if not i % 100:
            print("Processing sentence %8i/%i" % (i, len(sentences)), end="\r")
        # check if we need to take the snapshot
        if dates[i] > snapshots[current_snap]:
            # the given snapshot might be at the 31st but the month could have only 30 days,
            # therefore use the date of the last sentence before the boundary;
            # if there is no such sentence (i == 0) there is nothing to store yet
            # (dates[i-1] would silently wrap around to the last date of the dataset)
            if i > 0:
                _take_snapshot(dates[i-1])
            current_snap += 1
        # update embeddings with sentence
        emb.update_evolving_embeddings(s)
    # possibly take last snapshot (nothing to store if the dataset was empty)
    if current_snap < len(snapshots) and dates:
        _take_snapshot(dates[-1])
    print("Processing sentence %8i/%i...done!" % (len(sentences), len(sentences)))
    # reduce file size by ensuring dtype of numpy arrays is float32; transform into objects with less dependencies
    for s in snapshot_emb:
        snapshot_emb[s].embeddings = np.array(snapshot_emb[s].embeddings, dtype=np.float32)
    # possibly save embeddings
    if saveemb:
        try:
            with open(savepath, "wb") as f:
                pickle.dump(snapshot_emb, f, -1)
            print("successfully saved embeddings at %s" % savepath)
        except Exception as e:
            print("error saving embeddings:", e)
    return snapshot_emb

In [None]:
def test_cosine_sim_knn(snapshot_emb, k=10):
    # check the overlap between cosine similarity and knn intersection score (Gonen et al., 2020)
    snapshots = sorted(snapshot_emb)
    f, l = snapshots[0], snapshots[-1]
    token_sim = []
    token_knn_score = []
    # ignore words that had a zero embedding in the beginning
    tokens = [t for t in snapshot_emb[f].input_model.index2token if np.any(snapshot_emb[f][t] != 0)]
    for i, t in enumerate(tokens):
        if not i%100: print("Processing %6i/%i" % (i+1, len(tokens)), end="\r")
        token_sim.append(cosine_similarity(snapshot_emb[f][t][None, :], snapshot_emb[l][t][None, :])[0, 0])
        knn1 = set(snapshot_emb[f].get_nneighbors(t, k, include_simscore=False))
        knn2 = set(snapshot_emb[l].get_nneighbors(t, k, include_simscore=False))
        token_knn_score.append(len(knn1.intersection(knn2))/k)
    print("Processing %6i/%i" % (len(tokens), len(tokens)))
    token_sim, token_knn_score = np.array(token_sim), np.array(token_knn_score)
    plt.figure()
    plt.scatter(token_sim, token_knn_score)
    plt.xlabel("cosine similarity")
    plt.ylabel("intersection of NN @ k=%i" % k)
    plt.title("correlation: %.3f" % pearsonr(token_sim, token_knn_score)[0])
    return tokens, token_sim, token_knn_score
            
            
def most_changed_tokens(snapshot_emb, ignore_zeros=True):
    """Rank the tokens by how much their embedding changed over the whole time period.

    For every token in the vocabulary of the first snapshot, the pairwise cosine
    similarities between its embeddings at all snapshots are computed and reduced
    to a single score:
      - ignore_zeros=True: all-zero embeddings are left out and the minimum
        similarity is used.
      - ignore_zeros=False: all embeddings are used and the mean similarity is taken.
    Tokens with fewer than two embeddings to compare get a score of 1 (no change).

    Args:
        snapshot_emb: dict mapping snapshot dates to embedding objects.
        ignore_zeros: see above.

    Returns:
        list of (token, score) tuples sorted from smallest (most changed) to largest
        score, restricted to alphanumeric tokens.
    """
    snapshots = sorted(snapshot_emb)
    index2token = snapshot_emb[snapshots[0]].input_model.index2token
    token_sim = []
    for t in index2token:
        token_vecs = [snapshot_emb[s][t] for s in snapshots]
        if ignore_zeros:
            # ignore the zero embeddings
            token_vecs = [v for v in token_vecs if np.any(v != 0)]
        # overall sim = min/mean of upper triangular similarity values
        # -> take into account similarity of all emb to one another at all time points
        if len(token_vecs) > 1:
            sim = cosine_similarity(np.vstack(token_vecs))
            pairwise = sim[np.triu_indices(sim.shape[0], k=1)]
            token_sim.append(pairwise.min() if ignore_zeros else pairwise.mean())
        else:
            # zero or one embedding left: nothing to compare against
            # (np.vstack would raise on an empty list, so this is checked before stacking)
            token_sim.append(1)
    # sort index from smallest to largest - the more different the word, the smaller the sim
    token_idx = np.argsort(token_sim)
    tokens = [(index2token[i], token_sim[i]) for i in token_idx]
    return [t for t in tokens if t[0].isalnum()]


def compare_most_changed_tokens(tokens1, tokens2, name1, name2, c="#7C0033", new_fig=True):
    """Scatter the change scores of two models against each other and return their correlation.

    Args:
        tokens1, tokens2: iterables of (token, score) pairs, one per model;
            only tokens that occur in both are compared.
        name1, name2: axis labels for the scores of the first / second model.
        c: color of the scatter points.
        new_fig: open a new figure (with grid) if True, otherwise draw into the current one.

    Returns:
        Pearson correlation between the two score vectors.
    """
    lookup1 = dict(tokens1)
    lookup2 = dict(tokens2)
    # alphabetically ordered tokens that both models have a score for
    shared = sorted(lookup1.keys() & lookup2.keys())
    scores1 = np.array([lookup1[tok] for tok in shared])
    scores2 = np.array([lookup2[tok] for tok in shared])
    if new_fig:
        plt.figure(figsize=(6, 6))
        plt.grid()
    plt.scatter(scores1, scores2, s=10, c=c, alpha=0.5)
    plt.xlabel(name1, fontsize=14)
    plt.ylabel(name2, fontsize=14)
    corr = pearsonr(scores1, scores2)[0]
    plt.title("correlation: %.3f" % corr)
    return corr
    

def analyze_emb_over_time(snapshot_emb, token, k=5, savefigs=""):
    """Visualize how the embedding of a token and of its nearest neighbors evolves over time.

    The two snapshots at which the (non-zero) embeddings of the token differ the
    most are selected as reference points ("first" and "last"). Two kinds of plots
    are created, each with matplotlib and as an interactive plotly figure:
      1. a timeline with the cosine similarity of the token at every snapshot to
         its own embedding at the two reference snapshots and to its k nearest
         neighbors at these two snapshots;
      2. a 2D kernel PCA (cosine kernel) of the embeddings of the token and these
         neighbors at all snapshots (marker size grows with time).

    Args:
        snapshot_emb: dict mapping snapshot dates (strings in %Y-%m-%d format) to embedding objects.
        token: the token to analyze.
        k: number of nearest neighbors taken from each of the two reference snapshots.
        savefigs: file name prefix for the matplotlib figures; nothing is saved if empty.

    Raises:
        ValueError: if the token has an all-zero embedding in every snapshot.
    """
    snapshots = sorted(snapshot_emb)
    # get the two snapshots where the embeddings of the token are the most different
    snapshots_nonz = [s for s in snapshots if np.any(snapshot_emb[s][token] != 0)]
    if not snapshots_nonz:
        raise ValueError("token %r has an all-zero embedding in every snapshot" % token)
    if len(snapshots_nonz) > 1:
        token_emb = np.vstack([snapshot_emb[s][token] for s in snapshots_nonz])
        sim = cosine_similarity(token_emb)
        rowidx, colidx = np.triu_indices(sim.shape[0], k=1)
        minidx = sim[rowidx, colidx].argmin()
        first, last = snapshots_nonz[rowidx[minidx]], snapshots_nonz[colidx[minidx]]
    else:
        first, last = snapshots[0], snapshots_nonz[0]
    # rows of the two reference snapshots in the per-token embedding matrices below
    idx_first, idx_last = snapshots.index(first), snapshots.index(last)
    label_first = "%s (%s)" % (token, first)
    label_last = "%s (%s)" % (token, last)

    # get the corresponding nearest neighbors
    nn_first = snapshot_emb[first].get_nneighbors(token, k, include_simscore=False)
    nn_last = snapshot_emb[last].get_nneighbors(token, k, include_simscore=False)

    # get colors for plots later
    colors = {}
    colors[token] = (0., 0., 0., 1.)
    colors[label_first] = (0., 0., 0., 1.)
    colors[label_last] = (0., 0., 0., 1.)
    cmap = plt.get_cmap("RdBu")
    # spread the neighbors over the outer 40% of the colormap; guard against k=1 (division by zero)
    color_steps = max(k - 1, 1)
    for i, t in enumerate(nn_first):
        colors[t] = cmap(0.4*(i/color_steps))
    for i, t in enumerate(nn_last):
        if t not in colors:
            colors[t] = cmap(1-0.4*(i/color_steps))
    # plotly colors (careful - wants css colors)
    color_plotly = {t: "rgb(%i,%i,%i)" % tuple(channel*255 for channel in v[:3]) for t, v in colors.items()}
    # make sure nn_last only contains tokens not in nn_first
    nn_last = [t for t in nn_last if t not in nn_first]

    # create embedding matrices per token over time
    token_emb = {}
    for t in [token] + nn_first + nn_last:
        token_emb[t] = np.vstack([snapshot_emb[s][t] for s in snapshots])

    # compute similarity of each nn to the token
    sim_scores = {}
    for t in nn_first + nn_last:
        sim_scores[t] = np.diag(cosine_similarity(token_emb[token], token_emb[t]))
    # similarity of token itself to its embedding at the two reference snapshots
    # (indexed by the position of first/last so that the curves match their labels)
    sim_scores[label_first] = cosine_similarity(token_emb[token], token_emb[token][[idx_first]]).flatten()
    sim_scores[label_last] = cosine_similarity(token_emb[token], token_emb[token][[idx_last]]).flatten()

    # plot evolution of similarity scores over time
    snapshot_dates = list(range(len(snapshots)))
    plot_tokens = [label_first, label_last] + nn_first + nn_last
    plt.figure(figsize=(8, 5))
    for t in plot_tokens:
        plt.plot(snapshot_dates, sim_scores[t], "--" if t == label_first else "-", color=colors[t], label=t)
    legend = plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0., fontsize=14)
    plt.xticks(snapshot_dates, snapshots, rotation=90 if len(snapshots) > 5 else 0)
    plt.ylabel("cosine similarity")
    # plt.title(token)
    if savefigs:
        plt.savefig("%s_%s_%s_%s_time.pdf" % (savefigs, token, snapshots[0], snapshots[-1]), dpi=300, bbox_inches="tight", bbox_extra_artists=[legend])
    # interactive timelines with plotly
    df_temp = pd.DataFrame({
        "snapshot date": [datetime.strptime(s, '%Y-%m-%d') for t in plot_tokens for s in snapshots],
        "cosine similarity": np.array([sim_scores[t] for t in plot_tokens]).flatten(),
        "token": [t for t in plot_tokens for s in snapshots],
        "line": ["dash" if t == label_first else "solid" for t in plot_tokens for s in snapshots]
    })
    fig_time = px.line(df_temp, x="snapshot date", y="cosine similarity",
                       color="token", color_discrete_map=color_plotly, hover_name="token",
                       line_dash="line", line_dash_map='identity')
    fig_time.show()

    # plot 2D PCA vis of embeddings
    full_embedding_mat = []
    labels = []
    color_keys = []
    size = []
    for t in [token] + nn_first + nn_last:
        full_embedding_mat.append(token_emb[t])
        labels.extend(["%s (%s)" % (t, s) for s in snapshots])
        color_keys.extend(len(snapshots)*[t])
        # marker size encodes time: later snapshots get larger markers
        size.extend(list(range(1, len(snapshots) + 1)))
    full_embedding_mat = np.vstack(full_embedding_mat)
    X_kpca = KernelPCA(n_components=2, kernel="cosine").fit_transform(full_embedding_mat)
    # with matplotlib
    plt.figure(figsize=(6, 6))
    plt.scatter(x=X_kpca[:, 0], y=X_kpca[:, 1], s=10*np.array(size), c=[colors[t] for t in color_keys], alpha=0.6)
    plt.xticks([], [])
    plt.yticks([], [])
    plt.xlabel("PC 1")
    plt.ylabel("PC 2")
    # plt.title(token)
    if savefigs:
        plt.savefig("%s_%s_%s_%s_pca.pdf" % (savefigs, token, snapshots[0], snapshots[-1]), dpi=300, bbox_inches="tight")
    # interactive with plotly
    fig_pca = px.scatter(x=X_kpca[:, 0], y=X_kpca[:, 1], color=color_keys, size=np.sqrt(size), color_discrete_map=color_plotly, hover_name=labels)
    fig_pca.update_traces(hovertemplate='%{hovertext}')  # only show our text, no additional info
    fig_pca.show()
    

In [None]:
# Snapshot dates to look at the time before and after the corona outbreak in detail:
# the nominal end of every month from June 2019 up to December 2020. The day is
# always written as "31", even for shorter months.
snapshots = [
    "%i-%02i-31" % (year, month)
    for year, first_month in ((2019, 6), (2020, 1))
    for month in range(first_month, 13)
]

def run_analysis(local_emb_name="dummy", savefigs="", check_knn_score=False):
    """Run the complete analysis for one embedding model.

    Generates (or loads) the embedding snapshots for the module-level `snapshots`
    dates, prints the 25 most changed tokens for both scoring variants and creates
    the plots for the tokens "positive" and "category".

    Args:
        local_emb_name: name of the model, passed on to get_emb_snapshots.
        savefigs: file name prefix for the figures, passed on to analyze_emb_over_time.
        check_knn_score: additionally compare cosine similarity and kNN intersection
            score for k = 10, 100, 1000.

    Returns:
        (snapshot_emb, most_changed): the embedding snapshots and the list of
        (token, score) tuples computed with ignore_zeros=True.
    """
    def _format_top(scored_tokens, n_top=25):
        # one right-aligned "token (score)" line for each of the first n_top entries
        return "\n".join(["%15s (%.4f)" % (tok, score) for tok, score in scored_tokens[:n_top]])

    # generate/load embeddings
    snapshot_emb = get_emb_snapshots(
        snapshots,
        start_date="2019-04-01",
        local_emb_name=local_emb_name,
        min_freq=50,
        n_tokens=10000,
        saveemb=True,
    )
    # see which words have changed the most at some point in the time period
    most_changed = most_changed_tokens(snapshot_emb, ignore_zeros=True)
    print("most changed tokens (ignore_zeros=True)")
    print(_format_top(most_changed))
    # see which words are new
    print("most changed tokens (ignore_zeros=False)")
    most_changed_with_zeros = most_changed_tokens(snapshot_emb, ignore_zeros=False)
    print(_format_top(most_changed_with_zeros))
    if check_knn_score:
        # see in how far the cosine similarity and knn intersection score agree
        for n_neighbors in (10, 100, 1000):
            test_cosine_sim_knn(snapshot_emb, k=n_neighbors)
    # create plots from paper
    for word in ("positive", "category"):
        analyze_emb_over_time(snapshot_emb, word, k=5, savefigs=savefigs)
    return snapshot_emb, most_changed


In [None]:
# Run the analysis for the pre-trained BERT model; the figures are saved with the prefix "bert".
# `snapshot_emb` is kept so that a further token can be plotted from the same snapshots,
# `bert_most_changed` is compared against the other models further below.
snapshot_emb, bert_most_changed = run_analysis(local_emb_name="bert", savefigs="bert")
# additional plots for the token "biden" based on the BERT snapshots
analyze_emb_over_time(snapshot_emb, "biden", k=5, savefigs="bert")

In [None]:
# Same analysis for the pre-trained RoBERTa model (no figures are saved since savefigs is not given;
# the snapshots themselves are not needed afterwards).
_, roberta_most_changed = run_analysis(local_emb_name="roberta")
# And for both fine-tuned models; any name other than "dummy"/"bert"/"roberta" is loaded
# from the "data" folder by get_emb_snapshots.
# NOTE: this rebinds `snapshot_emb`, which from here on holds the fine-tuned BERT snapshots.
snapshot_emb, bert_ft_most_changed = run_analysis(local_emb_name="nyt_bert")
# additional plots for the token "biden" based on the fine-tuned BERT snapshots (saved with prefix "nyt_bert")
analyze_emb_over_time(snapshot_emb, "biden", k=5, savefigs="nyt_bert")
_, roberta_ft_most_changed = run_analysis(local_emb_name="nyt_roberta")

In [None]:
# See in how far the most changed tokens from BERT and RoBERTa agree (before and after fine-tuning).
# The first two calls compare each model with its fine-tuned version, each in a figure of its own.
_ = compare_most_changed_tokens(bert_most_changed, bert_ft_most_changed, "BERT", "BERT (fine-tuned)")
_ = compare_most_changed_tokens(roberta_most_changed, roberta_ft_most_changed, "RoBERTa", "RoBERTa (fine-tuned)")
# BERT vs. RoBERTa: the pre-trained models open a new figure ...
corr1 = compare_most_changed_tokens(bert_most_changed, roberta_most_changed, "BERT", "RoBERTa")
# ... and the fine-tuned models are drawn into the same figure (new_fig=False) in a different color.
corr_ft = compare_most_changed_tokens(bert_ft_most_changed, roberta_ft_most_changed, "BERT", "RoBERTa", c="#00537C", new_fig=False)
# every compare call sets a title with its own correlation; clear it and report both values in the legend instead
plt.title("")
# legend entries are given in the order in which the two scatter plots were drawn
plt.legend(["pre-trained $(r: %.3f)$" % corr1, "fine-tuned $(r: %.3f)$" % corr_ft], fontsize=14)
plt.savefig("diachr_score_agreement.pdf", dpi=300, bbox_inches="tight")