# Compare PubMed Central Corpus with bioRxiv Corpus

In [1]:
%load_ext autoreload
%autoreload 2

from collections import Counter
import csv
from pathlib import Path
import pickle
import string

import pandas as pd
import spacy
from tqdm import tqdm

from annorxiver_modules.document_helper import dump_article_text
from annorxiver_modules.corpora_comparison_helper import get_word_stats

In [2]:
# Load spaCy's small English pipeline.
# NOTE(review): `lemma_model` is not referenced by any later cell visible in
# this notebook — confirm it is still needed.
lemma_model = spacy.load("en_core_web_sm")

# Raise spaCy's per-text character limit (the library default is 1,000,000)
# so that longer texts are accepted.
lemma_model.max_length = 9_000_000

# Calculate Word Frequency of bioRxiv

In [3]:
# Read the bioRxiv article metadata and collapse it to one row per DOI.
# For each DOI the first `document` value is kept; `doi` is also aggregated,
# so it remains available as a regular column in addition to the index.
biorxiv_map_df = (
    pd.read_csv(
        "../exploratory_data_analysis/output/biorxiv_article_metadata.tsv",
        sep="\t",
    )
    .groupby("doi")
    .agg(dict(document="first", doi="last"))
)
print(biorxiv_map_df.shape)
biorxiv_map_df.head()

(71118, 2)


Unnamed: 0_level_0,document,doi
doi,Unnamed: 1_level_1,Unnamed: 2_level_1
10.1101/000026,000026_v1.xml,10.1101/000026
10.1101/000042,000042_v1.xml,10.1101/000042
10.1101/000067,000067_v1.xml,10.1101/000067
10.1101/000091,000091_v1.xml,10.1101/000091
10.1101/000109,000109_v1.xml,10.1101/000109


In [4]:
# Create the directory (and any missing parents) that is passed to
# get_word_stats as `output_folder`; a no-op if it already exists.
Path("output/biorxiv_word_counts/").mkdir(exist_ok=True, parents=True)

In [5]:
# Run the word-statistics helper over every bioRxiv document listed in
# `biorxiv_map_df`; the returned value is kept as `sentence_length`.
# NOTE(review): `document_folder` and `output_folder` point at the same
# directory, which the preceding cell creates if missing — confirm that the
# source XML files are really expected to live there.
sentence_length = get_word_stats(
    output_folder="output/biorxiv_word_counts/",
    tag_path="//abstract/p|//abstract/title|//body/sec//p|//body/sec//title",
    document_folder="output/biorxiv_word_counts/",
    document_list=biorxiv_map_df["document"].tolist(),
)

100%|██████████| 71118/71118 [9:26:59<00:00,  2.09it/s]   


In [6]:
# Persist the bioRxiv `sentence_length` result to disk.
# The context manager guarantees the file handle is flushed and closed, even
# if pickling raises part-way through (the bare `open(...)` call previously
# left the handle to be closed by garbage collection).
with open("output/biorxiv_sentence_length.pkl", "wb") as outfile:
    pickle.dump(sentence_length, outfile)

# Calculate Word Frequency of PubMed Central

In [3]:
# Read the PubMed Central journal/paper map and keep only the rows whose
# article_type is 'research-article'. The original row index is preserved.
pmc_map_df = pd.read_csv(
    "../../pmc/exploratory_data_analysis/output/pubmed_central_journal_paper_map.tsv.xz",
    sep="\t",
).loc[lambda frame: frame["article_type"] == "research-article"]
print(pmc_map_df.shape)
pmc_map_df.head()

(1977651, 4)


Unnamed: 0,journal,article_type,doi,pmcid
0,Environ_Health,research-article,10.1186/1476-069X-5-22,PMC1552054
1,Environ_Health,research-article,10.1186/1476-069X-4-12,PMC1226148
3,Environ_Health,research-article,10.1186/s12940-017-0316-3,PMC5635510
4,Environ_Health,research-article,10.1186/1476-069X-10-46,PMC3125232
5,Environ_Health,research-article,10.1186/1476-069X-11-91,PMC3533997


In [4]:
# Create the directory (and any missing parents) that is passed to
# get_word_stats as `output_folder`; a no-op if it already exists.
Path("../../pmc/pmc_corpus/pmc_word_counts/").mkdir(exist_ok=True, parents=True)

In [None]:
# Build the relative path "<journal>/<pmcid>.nxml" for every research article.
pmc_path_list = [
    Path(f"{journal}/{pmcid}.nxml")
    for journal, pmcid in pmc_map_df[["journal", "pmcid"]].itertuples(
        index=False, name=None
    )
]

# Run the word-statistics helper over the PubMed Central documents.
# This rebinds `sentence_length`, the same name used in the bioRxiv section.
# NOTE(review): the first clause of `skip_condition` is True whenever the
# source file exists under `folder`. If get_word_stats skips a document when
# this callable returns True, existing inputs would be skipped — confirm
# whether `not Path(...).exists()` was intended. The second clause is True
# when a .tsv named after the document stem is already in the output folder.
sentence_length = get_word_stats(
    output_folder="../../pmc/pmc_corpus/pmc_word_counts/",
    tag_path="//abstract/sec/*|//body/sec/*",
    document_folder="../../pmc/journals/",
    document_list=pmc_path_list,
    skip_condition=lambda folder, document: (
        Path(f"{folder}/{str(document)}").exists()
        or Path(f"../../pmc/pmc_corpus/pmc_word_counts/{document.stem}.tsv").exists()
    ),
)

 69%|██████▉   | 1362827/1977651 [58:02:12<55:23:48,  3.08it/s]  

In [None]:
# Persist the PubMed Central `sentence_length` result to disk.
# The context manager guarantees the file handle is flushed and closed, even
# if pickling raises part-way through (the bare `open(...)` call previously
# left the handle to be closed by garbage collection).
with open("../../pmc/pmc_corpus/pmc_sentence_length.pkl", "wb") as outfile:
    pickle.dump(sentence_length, outfile)