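# Get Token Counts and Word Vectors

This notebook computes token counts and Word2Vec-based document embeddings for the bioRxiv preprints and the PMCOA articles of the Polka et al. mapped subset.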

In [1]:
import csv
from pathlib import Path

from gensim.models import Word2Vec
import pandas as pd
from tqdm import tqdm_notebook
from tqdm.auto import tqdm

from annorxiver_modules.corpora_comparison_helper import get_word_stats
from annorxiver_modules.document_helper import generate_doc_vector

# BioRxiv

In [2]:
# Load the tab-separated table that maps bioRxiv DOIs to their published
# DOIs / PMC identifiers, then preview the first rows.
mapped_documents_path = "output/polka_et_al_pmc_mapped_subset.tsv"
mapped_documents_df = pd.read_csv(mapped_documents_path, sep="\t")
mapped_documents_df.head()

Unnamed: 0,biorxiv_doi,published_doi,PMID,PMCID,Version,MID,IsCurrent,IsLive,ReleaseDate,Msg
0,10.1101/2019.12.18.881391,10.1128/JVI.00426-20,32295925.0,PMC7307142,,,,1,,
1,10.1101/2019.12.19.882274,10.3389/fpls.2020.00355,32373138.0,PMC7176908,,,,1,,
2,10.1101/2020.01.13.905190,10.1182/blood.2019002867,32128578.0,PMC7243144,,,,1,,
3,10.1101/2020.01.21.914929,10.1128/AAC.00086-20,32284379.0,PMC7269492,,,,1,,
4,10.1101/2020.01.22.914952,10.1038/s41586-020-2012-7,32015507.0,PMC7095418,,,,1,,


In [3]:
# Recursively find every bioRxiv XML file and keep only its bare file name
# (any sub-directory component is dropped).
biorxiv_xml_dir = Path("output/biorxiv_xml_files")
biorxiv_documents = [
    Path(xml_file.name) for xml_file in biorxiv_xml_dir.rglob("*xml")
]

## BioRxiv -> Term counts

In [4]:
# Make sure the destination folder exists before handing it to the helper.
biorxiv_word_count_dir = "output/biorxiv_word_counts"
Path(biorxiv_word_count_dir).mkdir(exist_ok=True)

# Run the word-statistics helper over every bioRxiv document.
# NOTE(review): presumably per-document counts are written into
# output_folder -- confirm against get_word_stats.
sentence_length = get_word_stats(
    document_list=biorxiv_documents,
    document_folder="output/biorxiv_xml_files",
    tag_path="//abstract/p|//abstract/title|//body/sec//p|//body/sec//title",
    output_folder=biorxiv_word_count_dir,
)

HBox(children=(IntProgress(value=0, max=60), HTML(value='')))




## BioRxiv -> Doc Embeddings

In [5]:
# XPath expression selecting abstract and body paragraphs/titles in a
# bioRxiv XML document.
biorxiv_xpath_str = "//abstract/p|//abstract/title|//body/sec//p|//body/sec//title"

# Load the pre-trained Word2Vec model from the word-vector experiment
# (presumably 300-dimensional, judging by the path -- confirm).
word_model_path = Path(
    "../word_vector_experiment/output/word2vec_models/300/biorxiv_300.model"
)
word_model = Word2Vec.load(str(word_model_path))

In [6]:
# Build one document vector per bioRxiv XML file, keyed by the file-name Path.
# `tqdm.auto.tqdm` replaces the deprecated `tqdm.tqdm_notebook`, which emits a
# TqdmDeprecationWarning on recent tqdm releases.
biorxiv_xml_dir = Path("output/biorxiv_xml_files")
biorxiv_document_map = {
    document: generate_doc_vector(
        word_model,
        document_path=str(biorxiv_xml_dir / document),
        xpath=biorxiv_xpath_str,
    )
    for document in tqdm(biorxiv_documents)
}

HBox(children=(IntProgress(value=0, max=60), HTML(value='')))




In [7]:
# Turn the {document: vector} mapping into a table with one row per document
# and one `feat_<i>` column per vector component.
# `add_prefix` labels every component column regardless of the model's
# dimensionality, instead of relying on a hard-coded width of 300.
biorxiv_vec_df = (
    pd.DataFrame
    .from_dict(biorxiv_document_map, orient="index")
    .add_prefix("feat_")
    .rename_axis("document")
    .reset_index()
)

# Persist the embeddings as a tab-separated file (no index column).
biorxiv_vec_df.to_csv(
    "output/polka_et_al_biorxiv_embeddings.tsv",
    sep="\t", index=False
)

# Transposed preview: documents as columns, features as rows.
biorxiv_vec_df.head().T

Unnamed: 0,0,1,2,3,4
document,838870_v1.xml,2020.02.13.945485_v1.xml,2020.01.13.905190_v1.xml,865089_v1.xml,832675_v1.xml
feat_0,-0.480256,-0.526527,0.0497441,0.133945,0.120574
feat_1,-0.1716,0.0138942,0.0582557,0.149573,0.435998
feat_2,-0.0388579,-0.0591003,-0.828798,-0.497015,-0.730727
feat_3,-0.0210492,0.0415092,-0.477814,0.186168,-0.144449
...,...,...,...,...,...
feat_295,-0.294772,-0.0705802,0.177389,0.126316,-0.809388
feat_296,0.209652,0.509606,0.374656,0.233592,0.447944
feat_297,0.638616,0.487253,0.136399,-0.127426,0.241587
feat_298,-0.0210951,-0.0329004,-0.516485,-0.238472,-0.461618


# PMCOA

In [8]:
# Recursively find every PMCOA .nxml file and keep "<parent folder>/<file name>"
# as its relative path.
# NOTE(review): `.stem` strips anything after the last dot in the parent
# folder's name -- fine as long as folder names contain no dots.
pmcoa_xml_dir = Path("output/pmcoa_xml_files")
pmcoa_documents = [
    Path(nxml_file.parent.stem) / nxml_file.name
    for nxml_file in pmcoa_xml_dir.rglob("*nxml")
]

## PMCOA -> Term counts

In [9]:
# Make sure the destination folder exists before handing it to the helper.
pmcoa_word_count_dir = "output/pmcoa_word_counts"
Path(pmcoa_word_count_dir).mkdir(exist_ok=True)

# Run the word-statistics helper over every PMCOA document.
# NOTE(review): this rebinds `sentence_length`, replacing the value
# assigned by the bioRxiv term-count cell.
sentence_length = get_word_stats(
    document_list=pmcoa_documents,
    document_folder="output/pmcoa_xml_files",
    tag_path="//abstract/sec/*|//abstract/p|//body/sec/*|//body/p",
    output_folder=pmcoa_word_count_dir,
)

HBox(children=(IntProgress(value=0, max=39), HTML(value='')))




## PMCOA -> Doc Embeddings

In [10]:
# Build one document vector per PMCOA article, keyed by the file stem,
# using the same word model as the bioRxiv embeddings.
pmcoa_xml_dir = Path("output/pmcoa_xml_files")
pmcoa_xpath_str = "//abstract/sec/*|//abstract/p|//body/sec/*|//body/p"

pmcoa_vec_map = {}
for document in pmcoa_documents:
    pmcoa_vec_map[document.stem] = generate_doc_vector(
        word_model,
        str(pmcoa_xml_dir / document),
        pmcoa_xpath_str,
    )

In [11]:
# Turn the {document: vector} mapping into a table with one row per document
# and one `feat_<i>` column per vector component.
# `add_prefix` labels every component column regardless of the model's
# dimensionality, instead of relying on a hard-coded width of 300.
pmcoa_vec_df = (
    pd.DataFrame
    .from_dict(pmcoa_vec_map, orient="index")
    .add_prefix("feat_")
    .rename_axis("document")
    .reset_index()
)

# Persist the embeddings as a tab-separated file (no index column).
# The path is a plain string: the original f-prefix had no placeholders.
pmcoa_vec_df.to_csv(
    "output/polka_et_al_pmcoa_embeddings.tsv",
    sep="\t", index=False
)

# Transposed preview: documents as columns, features as rows.
pmcoa_vec_df.head().T

Unnamed: 0,0,1,2,3,4
document,PMC7095418,PMC7054013,PMC7182430,PMC7176908,PMC6907167
feat_0,-0.196777,-0.242958,-0.420515,-0.179555,-0.0898645
feat_1,-0.0923691,0.176935,-0.296674,-0.0550115,0.0552031
feat_2,-0.342167,-0.379532,-0.526647,-0.240822,-0.515187
feat_3,-0.115627,-0.187553,0.0424938,0.320286,0.64707
...,...,...,...,...,...
feat_295,-0.166095,-0.103187,-0.243082,-0.486879,0.169077
feat_296,0.368638,0.551733,0.603597,-0.0650531,0.277399
feat_297,0.203294,0.478303,0.453615,0.538904,0.128918
feat_298,-0.425629,0.0482001,-0.30829,0.191572,-0.0715847
