# Compare PubMed Central Corpus with bioRxiv Corpus

In [1]:
%load_ext autoreload
%autoreload 2

from collections import Counter, defaultdict
import csv
from pathlib import Path
import pickle
import string
import sys

sys.path.append("../../modules/")

import gensim
import pandas as pd
import spacy
from tqdm import tqdm_notebook

from document_helper import dump_article_text

In [2]:
# Load spaCy's small English pipeline; it supplies the lemmas counted below.
lemma_model = spacy.load("en_core_web_sm")
# Raise spaCy's per-text length limit (measured in characters) so that long
# full-text articles can be processed in a single call.
lemma_model.max_length = 9_000_000

In [None]:
def fix_pronouns(x):
    """
    Return the lowercased lemma of a token, except for pronouns.

    spaCy can assign every pronoun the placeholder lemma -PRON-, which
    would make "me", "I" and "we" indistinguishable in the word counts.
    For those tokens the lowercased surface text is returned instead.

    Arguments:
        x - the token parsed by spacy (must expose `lemma_` and `text`)

    Returns:
        the lowercased lemma, or the lowercased token text when the
        lemma is the pronoun placeholder
    """
    lemma = x.lemma_.lower()
    if lemma == '-pron-':
        # Placeholder lemma: fall back to the word as it was written.
        return x.text.lower()
    return lemma

# Calculate Word Frequency of bioRxiv

In [3]:
# Collapse the bioRxiv metadata to one row per DOI, keeping the first
# document file listed for that DOI (and the DOI itself as a column).
biorxiv_map_df = pd.read_csv(
    "../exploratory_data_analysis/output/biorxiv_article_metadata.tsv",
    sep="\t",
).groupby("doi").agg({"document": "first", "doi": "last"})
biorxiv_map_df.head()

Unnamed: 0_level_0,document,doi
doi,Unnamed: 1_level_1,Unnamed: 2_level_1
10.1101/000026,000026_v1.xml,10.1101/000026
10.1101/000042,000042_v1.xml,10.1101/000042
10.1101/000067,000067_v1.xml,10.1101/000067
10.1101/000091,000091_v1.xml,10.1101/000091
10.1101/000109,000109_v1.xml,10.1101/000109


In [4]:
# Create the output folder for the per-article bioRxiv counts.
# parents=True also creates a missing `output/` directory, so the cell
# works on a fresh checkout instead of raising FileNotFoundError.
Path('output/biorxiv_word_counts').mkdir(parents=True, exist_ok=True)

In [5]:
# Write one lemma-frequency TSV (columns: lemma, count) per bioRxiv article.
for document in tqdm_notebook(biorxiv_map_df.document.tolist()):

    # Extract abstract and body paragraphs/titles from the article XML.
    # The helper's result is joined with spaces below, so it is presumably
    # an iterable of text strings -- confirm against document_helper.
    document_text = dump_article_text(
        file_path=f"../biorxiv_articles/{document}",
        xpath_str="//abstract/p|//abstract/title|//body/sec//p|//body/sec//title",
        remove_stop_words=False
    )

    # Only lemmas are needed, so NER and the parser are disabled for this call.
    doc = lemma_model(" ".join(document_text), disable=['ner', 'parser'])

    # NOTE(review): this is a substring test against string.punctuation, so
    # it drops single punctuation characters and also any token that is a
    # contiguous run of that string (e.g. "()").
    tokens = [tok for tok in doc if tok.text.lower() not in string.punctuation]

    # Count before opening the output file so that a failure here cannot
    # leave a header-only TSV behind.
    lemma_freq = Counter(map(fix_pronouns, tokens))

    # newline="" is what the csv module requires of files it writes to;
    # utf-8 keeps non-ASCII lemmas independent of the platform default.
    with open(
        f"output/biorxiv_word_counts/{Path(document).stem}.tsv",
        "w",
        newline="",
        encoding="utf-8",
    ) as file:
        writer = csv.DictWriter(file, fieldnames=["lemma", "count"], delimiter="\t")
        writer.writeheader()
        writer.writerows(
            {"lemma": lemma, "count": count}
            for lemma, count in lemma_freq.items()
        )

HBox(children=(IntProgress(value=0, max=71118), HTML(value='')))




# Calculate Word Frequency of PubMed Central

In [3]:
# Load the PMC journal/paper map and keep only the rows whose
# article_type is 'research-article'.
pmc_map_df = pd.read_csv(
    "../../pmc/exploratory_data_analysis/output/pubmed_central_journal_paper_map.tsv.xz",
    sep="\t",
).query("article_type=='research-article'")
print(pmc_map_df.shape)
pmc_map_df.head()

(1977651, 4)


Unnamed: 0,journal,article_type,doi,pmcid
0,Environ_Health,research-article,10.1186/1476-069X-5-22,PMC1552054
1,Environ_Health,research-article,10.1186/1476-069X-4-12,PMC1226148
3,Environ_Health,research-article,10.1186/s12940-017-0316-3,PMC5635510
4,Environ_Health,research-article,10.1186/1476-069X-10-46,PMC3125232
5,Environ_Health,research-article,10.1186/1476-069X-11-91,PMC3533997


In [4]:
# Create the output folder for the per-article PMC counts.
# parents=True also creates a missing `output/` directory, so the cell
# works on a fresh checkout instead of raising FileNotFoundError.
Path("output/pmc_word_counts").mkdir(parents=True, exist_ok=True)

In [5]:
# Write one lemma-frequency TSV (columns: lemma, count) per PMC research article.
for document in tqdm_notebook(pmc_map_df[["journal", "pmcid"]].values.tolist()):
    journal, pmcid = document
    article_file = f"../../pmc/journals/{journal}/{pmcid}.nxml"
    output_file = f"output/pmc_word_counts/{pmcid}.tsv"

    # Skip files that don't exist or files already parsed
    if not Path(article_file).exists() or Path(output_file).exists():
        continue

    # Extract the children of abstract and body sections from the article.
    # The helper's result is joined with spaces below, so it is presumably
    # an iterable of text strings -- confirm against document_helper.
    document_text = dump_article_text(
        file_path=article_file,
        xpath_str="//abstract/sec/*|//body/sec/*",
        remove_stop_words=False
    )

    # Only lemmas are needed, so NER and the parser are disabled for this call.
    doc = lemma_model(" ".join(document_text), disable=['ner', 'parser'])

    # NOTE(review): this is a substring test against string.punctuation, so
    # it drops single punctuation characters and also any token that is a
    # contiguous run of that string (e.g. "()").
    tokens = [tok for tok in doc if tok.text.lower() not in string.punctuation]

    # Count before opening the output file: an existing output file makes the
    # check above skip the article, so a failure must not leave a
    # header-only TSV behind.
    lemma_freq = Counter(map(fix_pronouns, tokens))

    # newline="" is what the csv module requires of files it writes to;
    # utf-8 keeps non-ASCII lemmas independent of the platform default.
    with open(output_file, "w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=["lemma", "count"], delimiter="\t")
        writer.writeheader()
        writer.writerows(
            {"lemma": lemma, "count": count}
            for lemma, count in lemma_freq.items()
        )
        

HBox(children=(IntProgress(value=0, max=1977651), HTML(value='')))


