# Determine Word to PCA Associations

This notebook runs PCA over the document embeddings and then calculates word-to-PC and document-centroid-to-PC associations for each principal component.

In [1]:
from pathlib import Path
import os
import re

from gensim.models import Word2Vec
import itertools
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import plotnine as p9
from PIL import ImageColor
from scipy.spatial.distance import cdist
from sklearn.decomposition import PCA
from tqdm import tqdm_notebook
import wordcloud

# Select matplotlib's SVG backend for any figures produced by this notebook.
# NOTE(review): pyplot has already been imported above when this call runs --
# confirm the backend switch takes effect with the installed matplotlib version.
matplotlib.use("SVG")  # set the backend to SVG

In [2]:
# Tab-separated metadata table with one row per bioRxiv document.
metadata_path = "../exploratory_data_analysis/output/biorxiv_article_metadata.tsv"
journal_map_df = pd.read_csv(metadata_path, sep="\t")
journal_map_df.head()

Unnamed: 0,author_type,heading,category,document,doi
0,regular article,new results,genetics,440735_v1.xml,10.1101/440735
1,regular article,new results,systems biology,775270_v1.xml,10.1101/775270
2,regular article,new results,genetics,242404_v1.xml,10.1101/242404
3,regular article,new results,neuroscience,872994_v1.xml,10.1101/2019.12.11.872994
4,regular article,new results,developmental biology,080853_v2.xml,10.1101/080853


# Get the Word Vectors

Save the word vectors to disk, so later sections have easy access.

In [3]:
# The trained model is only needed to export the word vectors, so skip loading
# it when the exported table is already on disk.
word_vector_file = Path("output/word_pca_similarity/word_vectors_300.tsv.xz")
if not word_vector_file.exists():
    model = Word2Vec.load(
        "../word_vector_experiment/output/word2vec_models/300/biorxiv_300.model"
    )

In [4]:
# Build a {word: vector} lookup for every token in the model's vocabulary.
# Skipped when the exported word-vector table already exists on disk.
if not Path("output/word_pca_similarity/word_vectors_300.tsv.xz").exists():
    # gensim >= 4 exposes the vocabulary as `wv.key_to_index`, while older
    # releases use `wv.vocab`. Support both so this cell runs on either version.
    vocabulary = getattr(model.wv, "key_to_index", None)
    if vocabulary is None:
        vocabulary = model.wv.vocab

    word_vector_map = {
        word: model.wv[word] for word in tqdm_notebook(vocabulary.keys())
    }

In [5]:
# Write the word vectors (one row per word, one column per dimension) to an
# xz-compressed TSV, unless that file was already produced by an earlier run.
word_vector_file = Path("output/word_pca_similarity/word_vectors_300.tsv.xz")
if not word_vector_file.exists():
    word_vector_df = pd.DataFrame.from_dict(word_vector_map, orient="index")
    word_vector_df.to_csv(word_vector_file, sep="\t", compression="xz")

    print(word_vector_df.head())

# PCA the Documents

Run PCA over the document embeddings. This generates 50 principal components by default; change `n_components` below to generate more or fewer.

In [6]:
# Number of principal components to extract; also determines how many
# `pca{N}` / `pca{N}_cossim` columns are generated further down.
n_components = 50
# Seed passed to the PCA estimator.
random_state = 100

In [7]:
# Location of the 300-dimensional document embeddings.
document_embedding_path = Path(
    "..",
    "word_vector_experiment",
    "output/word2vec_output",
    "biorxiv_all_articles_300_fixed.tsv.xz",
)

# Rows containing missing values are dropped right after loading
# (original author's note: these are the withdrawn documents).
biorxiv_articles_df = pd.read_csv(document_embedding_path, sep="\t").dropna()
biorxiv_articles_df.head()

Unnamed: 0,document,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,...,feat_290,feat_291,feat_292,feat_293,feat_294,feat_295,feat_296,feat_297,feat_298,feat_299
0,440735_v1.xml,0.063813,0.012187,-0.156229,-0.158987,0.16548,-0.346612,0.001741,-0.107139,0.684055,...,-0.140772,-0.143405,-0.081427,-0.24822,-0.77147,-0.235455,0.547801,0.859663,-0.063151,0.012667
1,775270_v1.xml,-0.201064,0.01187,-0.354809,-0.150633,0.095305,-0.507254,-0.339986,0.303313,0.944561,...,-0.115199,-0.259798,0.272955,-0.346138,-0.844158,-0.596267,0.121678,0.95916,-0.747369,-0.073314
2,242404_v1.xml,-0.204741,-0.107589,-0.062853,0.140793,0.066307,-0.46329,-0.106492,-0.040269,0.557687,...,-0.229839,-0.325765,-0.064586,-0.300529,-0.707948,-0.47939,0.381809,0.968333,-0.402499,-0.10925
3,872994_v1.xml,-0.25699,-0.055972,-0.300816,0.495731,0.29007,-0.310817,-0.481578,-0.161373,0.483896,...,0.148916,-0.217426,0.195183,-0.069086,-0.545689,-0.517949,0.26523,0.645038,-0.843499,-0.142928
4,080853_v2.xml,0.167641,0.046808,-0.709753,0.232242,-0.139213,-0.319284,-0.314807,0.31018,0.317704,...,0.540693,0.264286,0.256323,0.137184,-0.754344,-0.145583,0.032544,0.50743,-0.37432,0.47306


In [8]:
# Column names for the 300 embedding features and the projected components.
feature_columns = [f"feat_{idx}" for idx in range(300)]
pca_columns = [f"pca{dim}" for dim in range(1, n_components + 1, 1)]

# Fit PCA on the document embeddings and project every document onto the PCs.
reducer = PCA(n_components=n_components, random_state=random_state)
embedding = reducer.fit_transform(biorxiv_articles_df[feature_columns].values)

# Attach the document ids, then bring in each document's category and DOI.
pca_df = pd.DataFrame(embedding, columns=pca_columns)
pca_df["document"] = biorxiv_articles_df.document.values.tolist()
pca_df = pca_df.merge(
    journal_map_df[["category", "document", "doi"]], on="document"
)

In [9]:
# Save the fitted component matrix: one row per principal component and one
# column (labelled "1", "2", ...) per embedding dimension.
component_columns = [f"{dim+1}" for dim in range(reducer.components_.shape[1])]
pca_components_df = pd.DataFrame(reducer.components_, columns=component_columns)
pca_components_df.to_csv(
    "output/word_pca_similarity/pca_components.tsv", sep="\t", index=False
)

# Calculate Word-PCA Cosine Similarity

Once PCA has finished, there are now 50 different principal components. The association between every word and principal component is calculated via [cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity) (cosine of the angle between two vectors).

In [10]:
# Reload the exported word vectors; the first column (the word) becomes the index.
# Default NA parsing is disabled on purpose: the vocabulary can contain literal
# tokens such as "NA", "null" or "nan", which pandas would otherwise turn into
# missing index labels and thereby lose the word itself.
word_vector_df = pd.read_csv(
    "output/word_pca_similarity/word_vectors_300.tsv.xz",
    sep="\t",
    index_col=0,
    keep_default_na=False,
)
word_vector_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
AbstractAdaptation,-0.285844,-0.080816,0.131058,-0.164448,0.21379,0.289946,0.198592,-0.613607,-0.233766,0.169285,...,-0.00508,-0.127008,-0.108292,0.227025,0.219927,0.020532,-0.016353,-0.153878,0.047264,0.304174
response,1.98298,-0.921032,0.322336,-1.819926,1.794423,-2.737348,-1.29125,-1.636407,1.895059,-1.916617,...,2.644842,0.406725,1.360475,-2.210262,-3.219634,-1.696714,0.730268,-1.190313,-0.815162,-1.846863
selection,-0.991861,-1.071227,-0.182926,-4.167827,2.136675,2.197686,-0.744608,-0.422629,2.331867,-1.475493,...,1.926673,1.632876,-2.516936,0.276007,-3.51885,-2.68509,0.832754,4.678514,-0.146672,-2.214616
polygenic,0.410014,-1.32864,0.99285,-1.669407,1.523946,-0.893682,0.45054,-3.377987,1.740303,3.0221,...,-3.074039,-2.214306,-1.203037,-0.051227,-2.269419,1.449286,0.433501,4.2979,1.160446,-0.868293
phenotypes,-0.161185,-0.631166,2.041816,-1.221121,0.177153,0.605349,1.185401,-1.300211,0.637753,-0.737132,...,2.64048,3.109912,-0.83852,0.948513,-3.318489,-2.088322,2.560162,-0.631192,1.67985,0.125102


In [11]:
# cdist returns the cosine *distance* between every word vector (rows) and
# every principal component (columns); similarity is its complement.
word_matrix = word_vector_df.values
word_pca_similarity = 1 - cdist(word_matrix, reducer.components_, metric="cosine")

word_pca_similarity.shape

(1158806, 50)

In [12]:
# One similarity column per principal component, plus the word itself.
similarity_columns = [f"pca{dim}_cossim" for dim in range(1, n_components + 1, 1)]
word_pca_sim_df = pd.DataFrame(
    word_pca_similarity, columns=similarity_columns
).assign(word=word_vector_df.index.tolist())

# Original author's note: with more than 40 components the file grows beyond
# 1GB, so in that case it is written xz-compressed.
output_file = f"output/word_pca_similarity/word_pca_cos_sim_{n_components}_pcs.tsv"
if n_components > 40:
    word_pca_sim_df.to_csv(
        f"{output_file}.xz", sep="\t", index=False, compression="xz"
    )
else:
    word_pca_sim_df.to_csv(output_file, sep="\t", index=False)

# Remove those pesky citations from the word pca similarity: drop every token
# that starts with "(" or "[". A raw-string regex on a boolean mask replaces the
# previous query string, whose "\(" / "\[" were invalid escape sequences in a
# regular string literal. Missing words (na=False) are kept, as before.
starts_like_citation = word_pca_sim_df["word"].str.match(r"^(\(|\[)", na=False)
word_pca_sim_df = word_pca_sim_df[~starts_like_citation]
word_pca_sim_df.head()

Unnamed: 0,pca1_cossim,pca2_cossim,pca3_cossim,pca4_cossim,pca5_cossim,pca6_cossim,pca7_cossim,pca8_cossim,pca9_cossim,pca10_cossim,...,pca42_cossim,pca43_cossim,pca44_cossim,pca45_cossim,pca46_cossim,pca47_cossim,pca48_cossim,pca49_cossim,pca50_cossim,word
0,-0.053392,-0.074065,0.106587,0.136574,0.029004,0.061082,-0.151184,-0.047392,0.072516,-0.01373,...,-0.046665,-0.07766,-0.096686,0.098052,-0.006737,-0.080047,0.036344,-0.030962,0.100903,AbstractAdaptation
1,0.113727,-0.368244,0.154407,-0.344464,0.222648,-0.049117,0.18555,-0.154327,0.207214,-0.159015,...,-0.115045,0.050043,0.087124,0.013899,-0.121364,-0.095683,0.000557,-0.072848,0.007897,response
2,0.387475,0.262903,0.081759,-0.011203,0.284174,-0.128667,0.149459,-0.012951,0.243314,-0.009093,...,-0.095298,0.171603,-0.002386,0.078225,-0.044906,0.018615,0.001475,-0.067983,0.083021,selection
3,0.350396,0.210712,0.211186,-0.146657,0.074213,-0.134556,0.148512,-0.052609,-0.070375,0.169842,...,-0.023232,0.149557,-0.013585,0.01115,0.036175,0.025637,0.00331,-0.013658,-0.043605,polygenic
4,0.086763,0.190385,0.318258,-0.323507,0.145257,-0.076234,0.124734,0.094024,0.010925,0.063133,...,-0.039955,0.101386,0.040046,-0.028833,0.019462,0.026448,0.028133,-1.7e-05,0.008358,phenotypes


# Generate Word Clouds for the PC dimensions

Given the word-to-principal-component associations, the next step is to generate a word cloud for each principal component. In each word cloud, orange marks the words most similar to the principal component and blue marks the words most dissimilar to it.

In [13]:
class PolarityColorFunc:
    """Callable that picks a word's color from the polarity class it belongs to.

    Instances are meant to be passed as a word cloud ``color_func``: calling
    one with a word returns the color of the class that contains the word, or
    the default color when the word is in neither class.
    """

    def __init__(
        self,
        word_class_map,
        positive_key="positive",
        positive="#ef8a62",
        negative_key="negative",
        negative="#67a9cf",
        default="#f7f7f7",
    ):
        """Store the class membership lookup and the color of each class.

        Args:
            word_class_map: mapping from class key to a container of words;
                membership is checked with ``in``.
            positive_key: key of the positive class in ``word_class_map``.
            positive: color string used for words in the positive class.
            negative_key: key of the negative class in ``word_class_map``.
            negative: color string used for words in the negative class.
            default: color string used for words in neither class.
        """
        self.words_to_color_map = word_class_map

        self.positive_key, self.negative_key = positive_key, negative_key
        self.positive_class, self.negative_class = positive, negative
        self.default_class = default

    def get_color_mapper(self, word):
        """Return ``ImageColor.getrgb`` of the color assigned to ``word``.

        The positive class is checked first, then the negative class; any
        other word receives the default color.
        """
        if word in self.words_to_color_map[self.positive_key]:
            color = self.positive_class
        elif word in self.words_to_color_map[self.negative_key]:
            color = self.negative_class
        else:
            color = self.default_class

        return ImageColor.getrgb(color)

    def __call__(self, word, **kwargs):
        """Color-function entry point; extra keyword arguments are ignored."""
        return self.get_color_mapper(word)

In [14]:
pca_dimensions = [f"pca{dim}_cossim" for dim in range(1, n_components + 1, 1)]

# Create the image folder up front so saving the first cloud cannot fail
# because the directory is missing.
figure_dir = Path("output/word_pca_similarity/figure_pieces")
figure_dir.mkdir(parents=True, exist_ok=True)

# Wrap the list itself rather than the enumerate object: enumerate has no
# length, which left the progress bar without a total.
for pc, component in enumerate(tqdm_notebook(pca_dimensions), start=1):
    word_class_map = {}

    # The 100 words with the lowest similarity to this component. Their scores
    # are converted to absolute values and divided by the largest of them.
    word_class_map["negative"] = (
        word_pca_sim_df.sort_values(component, ascending=True)
        .head(100)
        .assign(**{component: lambda x: x[component].abs().values.tolist()})
        .assign(**{component: lambda x: x[component] / x[component].max()})[
            ["word", component]
        ]
        .to_dict(orient="records")
    )

    # The 100 words with the highest similarity to this component, with scores
    # divided by the column maximum.
    word_class_map["positive"] = (
        word_pca_sim_df.sort_values(component, ascending=False)
        .assign(**{component: lambda x: x[component] / x[component].max()})
        .head(100)[["word", component]]
        .to_dict(orient="records")
    )

    # The color function only needs the set of words in each class.
    polarity_color_map = PolarityColorFunc(
        {
            word_class: {record["word"] for record in records}
            for word_class, records in word_class_map.items()
        }
    )

    # Word -> weight lookup covering both classes; it drives the word sizes.
    word_weights = {
        record["word"]: record[component]
        for records in word_class_map.values()
        for record in records
    }

    # Zero-pad the component number to two digits (pca_01, pca_02, ...).
    polarity_cloud = (
        wordcloud.WordCloud(
            background_color="white", width=1024, height=768, collocations=False
        )
        .generate_from_frequencies(word_weights)
        .recolor(color_func=polarity_color_map)
        .to_file(str(figure_dir / f"pca_{pc:02d}_cossim_word_cloud.png"))
    )

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




# Document Centroid Cosine Similarity

Finally, this section calculates document-centroid-to-principal-component associations. The higher the score, the stronger the association between a document category and the given principal component.

In [15]:
# A category's centroid is the mean of every embedding feature over the
# documents that belong to that category.
feature_columns = [f"feat_{dim}" for dim in range(300)]
documents_with_category = journal_map_df[["document", "category"]].merge(
    biorxiv_articles_df, on="document"
)
document_centroid_df = (
    documents_with_category.groupby("category")[feature_columns].mean().reset_index()
)
document_centroid_df.head()

Unnamed: 0,category,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,...,feat_290,feat_291,feat_292,feat_293,feat_294,feat_295,feat_296,feat_297,feat_298,feat_299
0,animal behavior and cognition,-0.067102,0.006245,-0.166934,0.244088,-0.024791,-0.307849,-0.199946,-0.066602,0.425728,...,0.066075,-0.048537,0.287565,-0.220762,-0.673418,-0.339518,0.333853,0.702956,-0.745664,-0.075889
1,biochemistry,-0.034316,-0.053148,-0.544352,0.01929,-0.109152,-0.388323,-0.308966,0.12925,0.504788,...,0.348292,-0.142577,0.328045,-0.215893,-0.387695,-0.121766,0.271929,0.503924,-0.608495,0.482355
2,bioengineering,0.07386,-0.125204,-0.474422,0.35839,-0.099635,-0.449946,-0.233647,0.137219,0.562315,...,0.289395,-0.171282,0.287534,-0.268522,-0.42938,-0.169883,0.153069,0.463736,-0.724947,0.196278
3,bioinformatics,-0.033937,0.010944,-0.256139,0.080992,-0.134498,-0.459363,-0.143522,0.133023,0.516498,...,0.09279,-0.095201,0.119077,-0.001623,-0.538245,-0.173349,0.39715,0.747096,-0.508269,0.241085
4,biophysics,-0.143168,-0.122253,-0.603587,0.340979,0.05795,-0.444904,-0.476023,0.04609,0.559676,...,0.232531,-0.225747,0.394882,-0.07084,-0.395552,-0.197208,0.227844,0.629617,-0.679095,0.206965


In [16]:
# 1 - cosine distance = cosine similarity
centroid_pca_similarity = 1 - cdist(
    document_centroid_df[[f"feat_{dim}" for dim in range(300)]].values,
    reducer.components_,
    "cosine",
)

centroid_pca_similarity.shape

(29, 50)

In [17]:
# Label each similarity row with its category (as the first column), then save.
similarity_columns = [f"pca{dim}_cossim" for dim in range(1, n_components + 1, 1)]

centroid_pca_df = pd.DataFrame(centroid_pca_similarity, columns=similarity_columns)
centroid_pca_df.insert(0, "category", document_centroid_df.category.tolist())

centroid_pca_df.to_csv(
    "output/word_pca_similarity/centroid_pca_cos_sim.tsv", sep="\t", index=False
)
centroid_pca_df.head()

Unnamed: 0,category,pca1_cossim,pca2_cossim,pca3_cossim,pca4_cossim,pca5_cossim,pca6_cossim,pca7_cossim,pca8_cossim,pca9_cossim,...,pca41_cossim,pca42_cossim,pca43_cossim,pca44_cossim,pca45_cossim,pca46_cossim,pca47_cossim,pca48_cossim,pca49_cossim,pca50_cossim
0,animal behavior and cognition,0.570731,-0.134138,-0.047122,-0.117175,0.144135,-0.195521,0.203488,0.001325,0.088645,...,0.110908,-0.063468,0.14164,-0.036175,0.014203,-0.14215,-0.069484,-0.011198,-0.154146,0.046125
1,biochemistry,0.126778,0.051383,-0.315872,-0.10277,0.21452,-0.218283,0.262517,-0.127188,-0.015786,...,0.125014,-0.089375,0.171611,-0.037691,0.018286,-0.182387,-0.071546,-0.018325,-0.186889,0.053927
2,bioengineering,0.341896,-0.082508,-0.254776,-0.068848,0.101629,-0.184302,0.359868,0.047101,0.039834,...,0.121561,-0.090466,0.164318,-0.04146,0.022428,-0.167361,-0.075161,-0.014209,-0.177462,0.052864
3,bioinformatics,0.551289,0.22242,-0.239134,-0.172629,0.068064,-0.175018,0.2667,0.014189,0.015259,...,0.095436,-0.071802,0.134066,-0.032896,0.01437,-0.141805,-0.066615,-0.012146,-0.152024,0.044632
4,biophysics,0.338997,-0.082934,-0.351103,-0.101333,0.251271,-0.225578,0.281077,-0.008876,-0.027261,...,0.104815,-0.07347,0.143359,-0.039904,0.017568,-0.16025,-0.065506,-0.010309,-0.166023,0.049482
