# Get Token Counts and Word Vectors

This notebook is designed to calculate token frequencies for each token within processed preprints. Plus, this notebook generates document vector representations for preprints to be analyzed downstream of the pipeline.

In [1]:
import csv
from pathlib import Path

from gensim.models import Word2Vec
import pandas as pd
from tqdm import tqdm_notebook

from annorxiver_modules.corpora_comparison_helper import get_word_stats
from annorxiver_modules.document_helper import generate_doc_vector

# BioRxiv

In [2]:
mapped_documents_df = pd.read_csv("output/polka_et_al_pmc_mapped_subset.tsv", sep="\t")
mapped_documents_df.head()

Unnamed: 0,biorxiv_doi,published_doi,PMID,PMCID,Version,MID,IsCurrent,IsLive,ReleaseDate,Msg
0,10.1101/2019.12.18.881391,10.1128/JVI.00426-20,32295925.0,PMC7307142,,,,1,,
1,10.1101/2019.12.19.882274,10.3389/fpls.2020.00355,32373138.0,PMC7176908,,,,1,,
2,10.1101/2020.01.13.905190,10.1182/blood.2019002867,32128578.0,PMC7243144,,,,1,,
3,10.1101/2020.01.21.914929,10.1128/AAC.00086-20,32284379.0,PMC7269492,,,,1,,
4,10.1101/2020.01.22.914952,10.1038/s41586-020-2012-7,32015507.0,PMC7095418,,,,1,,


In [3]:
biorxiv_documents = [
    Path(x.name) for x in list(Path("output/biorxiv_xml_files").rglob("*xml"))
]

## BioRxiv -> Term counts

In [4]:
Path("output/biorxiv_word_counts").mkdir(exist_ok=True)
sentence_length = get_word_stats(
    document_list=biorxiv_documents,
    document_folder="output/biorxiv_xml_files",
    tag_path="//abstract/p|//abstract/title|//body/sec//p|//body/sec//title",
    output_folder="output/biorxiv_word_counts",
)

HBox(children=(IntProgress(value=0, max=60), HTML(value='')))




## BioRxiv -> Doc Embeddings

In [5]:
biorxiv_xpath_str = "//abstract/p|//abstract/title|//body/sec//p|//body/sec//title"
word_model = Word2Vec.load(
    str(Path("../word_vector_experiment/output/word2vec_models/300/biorxiv_300.model"))
)

In [6]:
biorxiv_document_map = {
    document: generate_doc_vector(
        word_model,
        document_path=str(Path("output/biorxiv_xml_files") / document),
        xpath=biorxiv_xpath_str,
    )
    for document in tqdm_notebook(biorxiv_documents)
}

HBox(children=(IntProgress(value=0, max=60), HTML(value='')))




In [7]:
biorxiv_vec_df = (
    pd.DataFrame.from_dict(biorxiv_document_map, orient="index")
    .rename(columns={col: f"feat_{col}" for col in range(int(300))})
    .rename_axis("document")
    .reset_index()
)

biorxiv_vec_df.to_csv(
    "output/polka_et_al_biorxiv_embeddings.tsv", sep="\t", index=False
)

biorxiv_vec_df.head().T

Unnamed: 0,0,1,2,3,4
document,838870_v1.xml,2020.02.13.945485_v1.xml,2020.01.13.905190_v1.xml,865089_v1.xml,832675_v1.xml
feat_0,-0.236496,-0.20954,0.229247,0.259484,0.331504
feat_1,-0.15839,0.0621202,0.10372,0.187865,0.3973
feat_2,-0.137343,-0.0134206,-0.675217,-0.476495,-0.662221
feat_3,-0.00101136,-0.00027926,-0.279651,0.104272,-0.0822386
...,...,...,...,...,...
feat_295,-0.345592,-0.0448133,0.0678894,-0.00910428,-0.768766
feat_296,0.0396399,0.324615,0.266495,0.15263,0.229174
feat_297,0.73011,0.655335,0.434768,0.0261503,0.504789
feat_298,-0.348575,-0.402705,-0.672433,-0.4789,-0.708815


# PMCOA

In [8]:
pmcoa_documents = [
    Path(f"{x.parent.stem}/{x.name}")
    for x in list(Path("output/pmcoa_xml_files").rglob("*nxml"))
]

## PMCOA -> Term counts

In [9]:
Path("output/pmcoa_word_counts").mkdir(exist_ok=True)
sentence_length = get_word_stats(
    document_list=pmcoa_documents,
    document_folder="output/pmcoa_xml_files",
    tag_path="//abstract/sec/*|//abstract/p|//body/sec/*|//body/p",
    output_folder="output/pmcoa_word_counts",
)

HBox(children=(IntProgress(value=0, max=39), HTML(value='')))




## PMCOA -> Doc Vectors

In [10]:
pmcoa_vec_map = {
    document.stem: generate_doc_vector(
        word_model,
        str(Path("output/pmcoa_xml_files") / Path(document)),
        "//abstract/sec/*|//abstract/p|//body/sec/*|//body/p",
    )
    for document in pmcoa_documents
}

In [11]:
pmcoa_vec_df = (
    pd.DataFrame.from_dict(pmcoa_vec_map, orient="index")
    .rename(columns={col: f"feat_{col}" for col in range(int(300))})
    .rename_axis("document")
    .reset_index()
)

pmcoa_vec_df.to_csv("output/polka_et_al_pmcoa_embeddings.tsv", sep="\t", index=False)

pmcoa_vec_df.head().T

Unnamed: 0,0,1,2,3,4
document,PMC7095418,PMC7054013,PMC7182430,PMC7176908,PMC6907167
feat_0,0.0248885,0.0235554,-0.0556303,0.117712,0.226095
feat_1,-0.0709754,0.215864,-0.159898,-0.00885073,0.16165
feat_2,-0.377182,-0.443536,-0.455295,-0.116698,-0.38245
feat_3,-0.102496,0.0476431,0.131003,0.319624,0.504041
...,...,...,...,...,...
feat_295,-0.146963,-0.11197,-0.262596,-0.389951,0.104617
feat_296,0.204912,0.282327,0.429216,-0.118042,0.139984
feat_297,0.419672,0.564426,0.556789,0.743982,0.281341
feat_298,-0.734921,-0.316237,-0.663488,-0.15828,-0.474767
