In [1]:
from pathlib import Path
import re
import sys

from gensim.models import Word2Vec
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from tqdm import tqdm_notebook
import umap

from annorxiver_modules.document_helper import (
    generate_doc_vector,
    DocIterator,
    dump_article_text,
)

In [2]:
# Load the bioRxiv article metadata (tab-separated). As the preview below
# shows, each row carries an author_type, heading, category, the article's
# XML file name (`document`) and its DOI. The file's first column has no
# header, so pandas surfaces it as `Unnamed: 0`.
journal_map_df = pd.read_csv(
    filepath_or_buffer="../exploratory_data_analysis/output/biorxiv_article_metadata.tsv",
    sep="\t",
)
# Bare last expression so the notebook renders the first rows as a table.
journal_map_df.head()

Unnamed: 0,author_type,heading,category,document,doi
0,regular article,new results,genetics,440735_v1.xml,10.1101/440735
1,regular article,new results,systems biology,775270_v1.xml,10.1101/775270
2,regular article,new results,genetics,242404_v1.xml,10.1101/242404
3,regular article,new results,neuroscience,872994_v1.xml,10.1101/2019.12.11.872994
4,regular article,new results,developmental biology,080853_v2.xml,10.1101/080853


In [3]:
# XPath union selecting the abstract content of an article's XML. Adjacent
# string literals are concatenated by Python, so the value is one expression
# with three `|`-separated alternatives.
biorxiv_xpath_str = (
    "//abstract/p"  # paragraphs that are direct children of <abstract>
    "|//abstract/title"  # titles that are direct children of <abstract>
    "|//abstract/sec/*"  # every child element of a <sec> inside <abstract>
)

In [4]:
# --- Locate and load the trained word2vec model -----------------------------
# rglob yields matches in filesystem order, which is not stable across
# machines; sorting makes the selected model deterministic when several
# *.model files are present.
word_model_candidates = sorted(Path().rglob("output/word2vec_models/300/*.model"))
if not word_model_candidates:
    # Fail with an actionable message instead of a bare IndexError.
    raise FileNotFoundError(
        "No word2vec model matching 'output/word2vec_models/300/*.model' "
        f"was found under {Path().resolve()}"
    )
word_model_path = word_model_candidates[0]

# The name of the directory holding the model (here "300") is reused as the
# embedding dimensionality for the column names and the output file name.
model_dim = word_model_path.parents[0].stem
word_model = Word2Vec.load(str(word_model_path.resolve()))

# --- Prepare the output location up front -----------------------------------
# The embedding pass below iterates over every document in the metadata table,
# so create the destination directory first: a missing directory should fail
# (or be fixed) now, not after all vectors have been computed.
biorxiv_vec_output_path = Path(
    f"output/word2vec_output/biorxiv_all_articles_{model_dim}_abstract_only.tsv.xz"
)
biorxiv_vec_output_path.parent.mkdir(parents=True, exist_ok=True)

# --- Embed every document ---------------------------------------------------
# Maps document file name -> result of generate_doc_vector for the text
# selected by the abstract XPath.
# NOTE(review): generate_doc_vector is a project helper not visible here;
# presumably it returns one vector of length `model_dim` per document --
# confirm, including what it returns for documents without an abstract.
biorxiv_document_map = {
    document: generate_doc_vector(
        word_model,
        document_path=f"../biorxiv_articles/{document}",
        xpath=biorxiv_xpath_str,
    )
    for document in tqdm_notebook(journal_map_df.document.tolist())
}

# One row per document; the positional columns 0..model_dim-1 become
# feat_0..feat_{model_dim-1}, and the dict keys become a `document` column.
biorxiv_vec_df = (
    pd.DataFrame.from_dict(biorxiv_document_map, orient="index")
    .rename(columns={col: f"feat_{col}" for col in range(int(model_dim))})
    .rename_axis("document")
    .reset_index()
)

# Persist as an xz-compressed, tab-separated file without the row index.
biorxiv_vec_df.to_csv(
    str(biorxiv_vec_output_path),
    sep="\t",
    index=False,
    compression="xz",
)

HBox(children=(IntProgress(value=0, max=98023), HTML(value='')))


