# Analyze Word2Vec by Decades Run

This notebook calculates statistics on the fully trained word2vec models produced by [01_word2vec_decade_runner.ipynb](01_word2vec_decade_runner.ipynb). The statistics calculated are the cosine distances between tokens at a global level and at a local level. Cosine distance is a helpful metric because it is not affected by the magnitude of the vectors.

In [1]:
%load_ext autoreload
%autoreload 2

from pathlib import Path
import pickle
import itertools

from gensim.models import Word2Vec
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist, pdist, squareform
from scipy.linalg import orthogonal_procrustes
import tqdm
import plotnine as p9

from biovectors_modules.word2vec_analysis_helper import (
    get_global_distance,
    get_local_distance,
)

## Load Models and Parse Performance

In [2]:
# Year window for the analysis: models from `year_cutoff` onward are aligned
# against the `latest_year` model.
year_cutoff, latest_year = 2005, 2020

# Pickle file that caches the aligned word vectors for this year window.
aligned_model_file_path = f"output/aligned_word_vectors_{year_cutoff}_{latest_year}_replace.pkl"

# TSV file recording, per token, the years in which the token appears.
token_occurence_file = "output/earliest_token_occurence.tsv"

In [3]:
def _model_year(model_path):
    """Return the integer year encoded in a ``<prefix>_<year>.model`` file name."""
    return int(model_path.stem.split("_")[1])


# Keep every model from the cutoff year onward, newest first.
# The 2021 model is left out because it is too small to analyze
# (revisit in December 2021).
word_models = sorted(
    (
        model_path
        for model_path in Path("output/models").rglob("*model")
        if _model_year(model_path) >= year_cutoff and _model_year(model_path) != 2021
    ),
    key=_model_year,
    reverse=True,
)
print(word_models)

[PosixPath('output/models/word2vec_2020.model'), PosixPath('output/models/word2vec_2019.model'), PosixPath('output/models/word2vec_2018.model'), PosixPath('output/models/word2vec_2017.model'), PosixPath('output/models/word2vec_2016.model'), PosixPath('output/models/word2vec_2015.model'), PosixPath('output/models/word2vec_2014.model'), PosixPath('output/models/word2vec_2013.model'), PosixPath('output/models/word2vec_2012.model'), PosixPath('output/models/word2vec_2011.model'), PosixPath('output/models/word2vec_2010.model'), PosixPath('output/models/word2vec_2009.model'), PosixPath('output/models/word2vec_2008.model'), PosixPath('output/models/word2vec_2007.model'), PosixPath('output/models/word2vec_2006.model'), PosixPath('output/models/word2vec_2005.model')]


In [4]:
# Build a token -> "year|year|..." table listing every year whose model
# vocabulary contains the token. Skipped when the TSV already exists.
if not Path(token_occurence_file).exists():
    earliest_token_occurence = dict()

    # word_models is sorted newest-first; reverse it so years are visited in
    # ascending order and the first year recorded for a token is its earliest.
    for model_path in reversed(word_models):
        year = model_path.stem.split("_")[1]
        model = Word2Vec.load(str(model_path))
        for token in model.wv.vocab.keys():
            if token not in earliest_token_occurence:
                earliest_token_occurence[token] = f"{year}"
            else:
                earliest_token_occurence[token] += f"|{year}"

    # Write the table once, after every model has been processed. Writing inside
    # the loop re-serialized the whole table per year and could leave a partial
    # file behind that the exists() guard above would later treat as complete.
    # Nothing is written when no tokens were collected, so an empty run does not
    # create a cache file.
    if earliest_token_occurence:
        (
            pd.DataFrame(
                list(earliest_token_occurence.items()),
                columns=["token", "year_occured"],
            ).to_csv(token_occurence_file, sep="\t", index=False)
        )

In [5]:
# Load every yearly model and compute the vocabulary shared by all of them.
# Skipped when the aligned-vector cache already exists.
if not Path(aligned_model_file_path).exists():
    word_model_dict = dict()

    # None marks "no model seen yet". Using an empty set as the sentinel would
    # reset the running intersection to the next model's full vocabulary
    # whenever the intersection legitimately became empty.
    shared_tokens = None
    for model in word_models:
        year = model.stem.split("_")[1]
        word_model_dict[year] = Word2Vec.load(str(model))
        year_vocab = set(word_model_dict[year].wv.vocab.keys())
        if shared_tokens is None:
            shared_tokens = year_vocab
        else:
            shared_tokens &= year_vocab

    # Sorted list gives a fixed token order for indexing the embedding matrices;
    # falls back to an empty list when no models were found.
    shared_tokens = sorted(shared_tokens or [])

## Calculate Global and Local Distances Between Time Periods

### Align Models via Orthogonal Procrustes

In [6]:
# Align every yearly embedding matrix onto the latest year's embedding space.
# Skipped when the aligned-vector cache already exists.
if not Path(aligned_model_file_path).exists():
    years_analyzed = sorted(list(word_model_dict.keys()), reverse=True)

    # word_model_dict is keyed by year strings. Use a local string key instead of
    # rebinding `latest_year`, so the config value keeps the same type whether or
    # not this cache-guarded cell runs.
    reference_year = str(latest_year)

    # Target matrix for the alignment, restricted to the shared vocabulary.
    # It is the same for every year, so build it once.
    reference_vectors = word_model_dict[reference_year].wv[shared_tokens]

    aligned_models = {}
    for year in years_analyzed:

        if year == reference_year:
            # The reference year defines the target space; no rotation needed.
            aligned_models[year] = reference_vectors

        else:
            year_vectors = word_model_dict[year].wv[shared_tokens]

            # Align A to B subject to the translation matrix being
            # orthogonal, which preserves the cosine similarities.
            # The returned scale value is not needed here.
            translation_matrix, _ = orthogonal_procrustes(
                year_vectors,
                reference_vectors,
            )

            # Matrix multiplication to project this year onto the reference year
            aligned_models[year] = year_vectors @ translation_matrix

    # Keep the token order alongside the matrices so rows can be mapped to tokens.
    aligned_models["shared_tokens"] = shared_tokens

In [7]:
# Cache the aligned matrices so the alignment is not recomputed on later runs.
if not Path(aligned_model_file_path).exists():
    # Context manager guarantees the file is flushed and closed before the
    # following cell reads it back.
    with open(aligned_model_file_path, "wb") as outfile:
        pickle.dump(aligned_models, outfile)

### Calculate the Global and Local Distances between Words

In [8]:
# Load the cached aligned matrices. NOTE: pickle.load can execute arbitrary
# code, so only load cache files produced by this notebook.
with open(aligned_model_file_path, "rb") as infile:
    aligned_models = pickle.load(infile)

# Year keys in descending order. The "shared_tokens" entry is excluded by name
# instead of relying on it sorting ahead of the year strings.
years_analyzed = sorted(
    (key for key in aligned_models.keys() if key != "shared_tokens"),
    reverse=True,
)
origin_year = years_analyzed[-1]  # grab the earliest year to date
n_neighbors = 25  # neighbor count passed to get_local_distance
year_distance_folder = f"year_distances_{year_cutoff}_{latest_year}"

In [9]:
shared_tokens = sorted(aligned_models["shared_tokens"])

# Create the output folder up front: each iteration is expensive, and to_csv
# fails if the target directory does not exist.
output_folder = Path(f"output/{year_distance_folder}")
output_folder.mkdir(parents=True, exist_ok=True)

# Compare the origin (earliest) year against every later year.
# years_analyzed is in descending order, so [:-1] drops the origin year itself.
for key in tqdm.tqdm(years_analyzed[:-1]):

    global_distance = get_global_distance(
        aligned_models[origin_year],
        aligned_models[key],
        aligned_models["shared_tokens"],
    )

    local_distance = get_local_distance(
        aligned_models[origin_year],
        aligned_models[key],
        aligned_models["shared_tokens"],
        neighbors=n_neighbors,
    )

    label = f"{origin_year}_{key}"
    output_filepath = output_folder / Path(f"{label}_dist.tsv")

    # merge() without `on` joins on the columns the two frames have in common;
    # the result must contain `global_dist` and `local_dist` columns.
    (
        global_distance.merge(local_distance)
        .assign(shift=lambda x: x.global_dist.values - x.local_dist.values)
        .to_csv(str(output_filepath), index=False, sep="\t")
    )

100%|██████████| 15/15 [2:38:32<00:00, 634.16s/it] 
