# Determine Word to PCA Associations using Doc2vec Model

This notebook runs PCA over the document embeddings generated by the Doc2Vec model and calculates the word-to-principal-component association for each principal component. From visual inspection of the concepts captured, linear methods are easier to interpret than non-linear ones.

In [1]:
from pathlib import Path
import os
import re

from gensim.models import Doc2Vec
import itertools
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import plotnine as p9
from PIL import ImageColor
from scipy.spatial.distance import cdist
from sklearn.decomposition import PCA
from tqdm import tqdm_notebook
import wordcloud

# Select the matplotlib backend once, right after the imports.
matplotlib.use("SVG")  # "SVG" is the backend name handed to matplotlib

In [2]:
# Article-level metadata table (tab-separated); the preview below shows the
# author_type, heading, category, document and doi columns.
metadata_path = "../exploratory_data_analysis/output/biorxiv_article_metadata.tsv"
journal_map_df = pd.read_csv(metadata_path, sep="\t")
journal_map_df.head()

Unnamed: 0,author_type,heading,category,document,doi
0,regular article,new results,genetics,440735_v1.xml,10.1101/440735
1,regular article,new results,systems biology,775270_v1.xml,10.1101/775270
2,regular article,new results,genetics,242404_v1.xml,10.1101/242404
3,regular article,new results,neuroscience,872994_v1.xml,10.1101/2019.12.11.872994
4,regular article,new results,developmental biology,080853_v2.xml,10.1101/080853


# Get the Word Vectors

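Load the trained Doc2Vec model from disk. The model holds both the document vectors (used for PCA below) and the word vectors (used for the word-to-PC similarity calculation).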

In [3]:
# Trained Doc2Vec model; both its document vectors and its word vectors are
# used further down in this notebook.
doc2vec_model_path = "../word_vector_experiment/output/doc2vec_output/biorxiv_300.model"
model = Doc2Vec.load(doc2vec_model_path)

# PCA the Documents

Run PCA over the document embeddings. The number of principal components is controlled by `n_components` (50 here) and can be increased or decreased.

In [4]:
# Number of principal components to extract; also determines how many
# pca<N> / pca<N>_cossim columns the later cells create.
n_components: int = 50
# Seed handed to PCA's `random_state` argument.
random_state: int = 100

In [5]:
# Fit PCA on the Doc2Vec document vectors and project every document onto the
# fitted components in one step.
reducer = PCA(n_components=n_components, random_state=random_state)
embedding = reducer.fit_transform(model.docvecs.vectors_docs)

# Column labels pca1 ... pca<n_components>, one per principal component.
pc_column_names = [f"pca{dim}" for dim in range(1, n_components + 1, 1)]

# Document tags become file names ("<tag>.xml") so they can be joined against
# the `document` column of the metadata table.
# NOTE(review): this assumes iterating `docvecs.doctags` yields tags in the same
# order as the rows of `vectors_docs` -- confirm for the gensim version in use.
document_file_names = [f"{str(tag)}.xml" for tag in model.docvecs.doctags]

# Default (inner) merge: documents without a metadata row are dropped.
pca_df = (
    pd.DataFrame(embedding, columns=pc_column_names)
    .assign(document=document_file_names)
    .merge(journal_map_df[["category", "document", "doi"]], on="document")
)

In [6]:
# Component loadings taken straight from the fitted PCA: one row per principal
# component, one column per embedding dimension. Columns are labelled with the
# 1-based dimension number as a string ("1", "2", ...).
embedding_dim_labels = [
    str(dim) for dim in range(1, reducer.components_.shape[1] + 1)
]
pca_comp_df = pd.DataFrame(reducer.components_, columns=embedding_dim_labels)
pca_comp_df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,291,292,293,294,295,296,297,298,299,300
0,0.099897,-0.002881,-0.007836,-0.054143,-0.027895,-0.016281,-0.021364,0.00123,0.00187,-0.070017,...,0.00519,-0.009552,-0.031998,-0.026097,-0.039476,-0.021258,-0.058824,0.041455,0.142442,-0.023566
1,0.070495,-0.144251,-0.086693,0.030134,0.020505,-0.032827,0.166508,0.07547,0.028479,0.001221,...,0.07592,-0.040243,0.098385,-0.040933,-0.043147,0.018124,0.034849,-0.001243,-0.053133,-0.072973
2,-0.002864,0.061922,-0.059614,-0.07579,0.027099,-0.038238,-0.023545,0.081939,-0.077146,0.000967,...,0.035804,-0.015126,0.063605,0.078057,-0.068316,0.042266,-0.031661,-0.032429,-0.104353,-0.034932
3,-0.132097,-0.060392,-0.028922,0.076926,-0.020094,0.089415,-0.008016,-0.020158,-0.002205,0.001816,...,-0.004885,-0.02716,0.029111,0.026868,0.006972,-0.025206,-0.03954,0.080746,-0.04768,-0.043767
4,0.096456,-0.000621,-0.064191,0.065095,0.022163,0.151746,0.076366,-0.00437,0.029303,0.014173,...,-0.014936,0.005223,0.033488,0.05906,-0.031153,-0.060356,-0.051431,0.031248,0.083952,-0.009739


In [7]:
# Persist the component loadings as a tab-separated file (row index omitted).
pca_components_path = "output/word_pca_similarity/pca_components_doc2vec.tsv"
pca_comp_df.to_csv(pca_components_path, sep="\t", index=False)

# Calculate Word-PCA Cosine Similarity

Once PCA has finished, there are now 50 different principal components. The association between every word and principal component is calculated via [cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity) (cosine of the angle between two vectors).

In [7]:
# cdist reports cosine *distance*, so subtracting it from 1 gives cosine
# similarity. Rows are the word vectors in model.wv.vectors (every unique token
# learned by doc2vec, sorted by frequency); columns are the principal components.
word_pca_similarity = 1 - cdist(
    model.wv.vectors, reducer.components_, metric="cosine"
)

word_pca_similarity.shape

(675764, 50)

In [8]:
# index2word is a list of tokens sorted by frequency.
# Gensim provides this attribute to make accessing word vectors easier; here it
# labels each similarity row with the token it belongs to.
word_pca_sim_df = pd.DataFrame(
    word_pca_similarity,
    columns=[f"pca{dim}_cossim" for dim in range(1, n_components + 1, 1)],
).assign(word=model.wv.index2word)

# The saved file holds the complete table; the citation filter below is applied
# only to the in-memory frame used by the rest of the notebook.
word_pca_sim_df.to_csv(
    f"output/word_pca_similarity/word_pca_cos_sim_{n_components}_pcs_doc2vec.tsv",
    sep="\t",
    index=False,
)

# Remove those pesky citations from the word pca similarity: drop every token
# that starts with "(" or "[". A raw-string regex with plain boolean selection
# replaces the former query string, whose "\(" and "\[" were invalid escape
# sequences in a non-raw Python literal. The original row index is preserved.
word_pca_sim_df = word_pca_sim_df.loc[
    lambda df: ~df["word"].str.match(r"^(\(|\[)", na=False)
]
word_pca_sim_df.head()

Unnamed: 0,pca1_cossim,pca2_cossim,pca3_cossim,pca4_cossim,pca5_cossim,pca6_cossim,pca7_cossim,pca8_cossim,pca9_cossim,pca10_cossim,...,pca42_cossim,pca43_cossim,pca44_cossim,pca45_cossim,pca46_cossim,pca47_cossim,pca48_cossim,pca49_cossim,pca50_cossim,word
0,-0.000729,0.002413,0.001125,0.031829,-0.015342,-0.007332,0.04993,-0.016376,-0.021773,0.00374,...,0.000917,-0.035307,0.093203,0.009636,0.037358,0.002742,0.034404,-0.001231,-0.025573,","
1,-0.032081,-0.057046,-0.055136,0.008052,-0.028604,-0.031474,0.049815,-0.008854,-0.018201,0.040472,...,0.018408,-0.005689,0.046438,0.014136,0.049979,0.01796,0.038572,0.038916,-0.024154,.
2,-0.032362,-0.013502,-0.005475,-0.036873,-0.006743,0.015549,0.005882,-0.006486,0.009503,-0.017492,...,-0.007684,-0.012067,-0.031199,-0.018405,-0.064531,-0.047717,-0.005243,-0.005004,0.010247,)
4,-0.065627,-0.100649,-0.09423,-0.066972,-0.042747,-0.021028,0.002573,0.024973,0.007476,-0.006744,...,-0.001724,0.022482,0.04235,-0.032296,0.020789,-0.0077,0.048178,-0.045664,0.014878,-
5,0.02453,5.7e-05,0.013701,0.027553,-0.034045,0.01205,0.016951,-0.004791,-0.017976,-0.01964,...,0.011876,0.012081,0.014475,0.001788,0.003556,0.063859,-0.006535,0.011429,-0.03044,-pron-


# Generate Word Clouds for the PC dimensions

Given the word-to-principal-component associations, the next step is to generate a word cloud for each principal component. In each word cloud, orange marks the words most similar to the principal component and blue marks the words most dissimilar to it.

In [9]:
class PolarityColorFunc:
    """
    Callable colour scheme that paints a word according to its polarity.

    An instance is handed to ``WordCloud.recolor(color_func=...)``; for every
    word it looks the word up in the positive and negative collections of
    ``word_class_map`` and returns the matching colour as parsed by
    ``ImageColor.getrgb``. Words found in neither collection get the default
    colour.

    Parameters
    ----------
    word_class_map : mapping
        Maps ``positive_key`` and ``negative_key`` to containers of words
        (anything supporting the ``in`` operator).
    positive_key, negative_key : str
        Keys of ``word_class_map`` that hold the positive / negative words.
    positive, negative, default : str
        Colour specifications passed to ``ImageColor.getrgb``.
    """

    def __init__(
        self,
        word_class_map,
        positive_key="positive",
        positive="#ef8a62",
        negative_key="negative",
        negative="#67a9cf",
        default="#f7f7f7",
    ):
        self.words_to_color_map = word_class_map

        # Lookup keys and the colour attached to each polarity.
        self.positive_key, self.positive_class = positive_key, positive
        self.negative_key, self.negative_class = negative_key, negative
        self.default_class = default

    def get_color_mapper(self, word):
        """Return the parsed colour for ``word`` (positive is checked first)."""
        chosen_color = self.default_class
        if word in self.words_to_color_map[self.positive_key]:
            chosen_color = self.positive_class
        elif word in self.words_to_color_map[self.negative_key]:
            chosen_color = self.negative_class

        return ImageColor.getrgb(chosen_color)

    def __call__(self, word, **kwargs):
        """Colour-function entry point; extra keyword arguments are ignored."""
        return self.get_color_mapper(word)

In [10]:
# Names of the similarity columns, pca1_cossim ... pca<n_components>_cossim.
pca_dimensions = [f"pca{dim}_cossim" for dim in range(1, n_components + 1, 1)]

# tqdm wraps the list itself rather than the enumerate object: a list has a
# length, so the progress bar can show a total instead of an open-ended counter.
for pc, component in enumerate(tqdm_notebook(pca_dimensions), start=1):
    # Only the word and the current component are needed, so sort this
    # two-column slice instead of reordering every similarity column each pass.
    component_scores = word_pca_sim_df[["word", component]]

    word_class_map = {}

    # The 100 lowest-scoring words. Their scores are converted to magnitudes and
    # scaled by the largest magnitude among them, so the top weight is 1.
    word_class_map["negative"] = (
        component_scores.sort_values(component, ascending=True)
        .head(100)
        .assign(**{component: lambda x: x[component].abs()})
        .assign(**{component: lambda x: x[component] / x[component].max()})
        .to_dict(orient="records")
    )

    # The 100 highest-scoring words, scaled by the highest score. After a
    # descending sort the first row is the column maximum, so scaling after
    # head(100) gives the same weights as scaling the whole column first.
    word_class_map["positive"] = (
        component_scores.sort_values(component, ascending=False)
        .head(100)
        .assign(**{component: lambda x: x[component] / x[component].max()})
        .to_dict(orient="records")
    )

    # Word sets per polarity, used by the colour function for membership tests.
    polarity_color_map = PolarityColorFunc(
        {
            word_class: {record["word"] for record in records}
            for word_class, records in word_class_map.items()
        }
    )

    # Word -> weight mapping; negative records are inserted first, then positive.
    word_weights = {
        record["word"]: record[component]
        for records in word_class_map.values()
        for record in records
    }

    # The component number is zero-padded to two digits in the file name.
    polarity_cloud = (
        wordcloud.WordCloud(
            background_color="white", width=1024, height=768, collocations=False
        )
        .generate_from_frequencies(word_weights)
        .recolor(color_func=polarity_color_map)
        .to_file(
            f"output/word_pca_similarity/figure_pieces/pca_{pc:02d}_cossim_word_cloud_doc2vec.png"
        )
    )

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


