# Comparative Linguistic Analysis of bioRxiv and PMC

In [1]:
%load_ext autoreload
%autoreload 2

from collections import defaultdict, Counter
import csv
import itertools
from pathlib import Path

import numpy as np
import pandas as pd
import pickle
import spacy
from scipy.stats import chi2_contingency
from tqdm import tqdm_notebook

from annorxiver_modules.corpora_comparison_helper import(
    aggregate_word_counts,
    dump_to_dataframe,
    get_term_statistics,
    KL_divergence
)

# Full Text Comparison (Global)

## Gather Word Frequencies

In [2]:
# Locations of the cached corpus-wide word-count tables (one TSV per corpus).
biorxiv_count_path = Path("output", "total_word_counts", "biorxiv_total_count.tsv")
pmc_count_path = Path("output", "total_word_counts", "pmc_total_count.tsv")
nytac_count_path = Path("output", "total_word_counts", "nytac_total_count.tsv")

In [3]:
# Aggregate the per-document bioRxiv word counts only when the cached
# corpus-wide table is missing.
if not biorxiv_count_path.exists():
    biorxiv_corpus_count = aggregate_word_counts(
        list(Path("output/biorxiv_word_counts").rglob("*tsv"))
    )
    # Write to the same path the guard checks (and that the notebook reads back
    # later); dumping anywhere else leaves the cache permanently "missing".
    biorxiv_count_path.parent.mkdir(parents=True, exist_ok=True)
    dump_to_dataframe(biorxiv_corpus_count, str(biorxiv_count_path))
    # NOTE: a bare expression inside an `if` body is not rendered as cell output.
    biorxiv_corpus_count.most_common(10)

In [4]:
# Aggregate the per-document PMC word counts only when the cached
# corpus-wide table is missing.
if not pmc_count_path.exists():
    pmc_corpus_count = aggregate_word_counts(
        list(Path("../../pmc/pmc_corpus/pmc_word_counts").rglob("*tsv"))
    )
    # Write to the same path the guard checks (and that the notebook reads back
    # later); dumping anywhere else leaves the cache permanently "missing".
    pmc_count_path.parent.mkdir(parents=True, exist_ok=True)
    dump_to_dataframe(pmc_corpus_count, str(pmc_count_path))
    # NOTE: a bare expression inside an `if` body is not rendered as cell output.
    pmc_corpus_count.most_common(10)

In [5]:
# Aggregate the per-document NYTAC word counts only when the cached
# corpus-wide table is missing.
if not nytac_count_path.exists():
    nytac_corpus_count = aggregate_word_counts(
        list(Path("../../nytac/corpora_stats/output").rglob("*tsv"))
    )
    # Write to the same path the guard checks (and that the notebook reads back
    # later); dumping anywhere else leaves the cache permanently "missing".
    nytac_count_path.parent.mkdir(parents=True, exist_ok=True)
    dump_to_dataframe(nytac_corpus_count, str(nytac_count_path))
    # NOTE: a bare expression inside an `if` body is not rendered as cell output.
    nytac_corpus_count.most_common(10)

In [6]:
# Read the three cached corpus-wide count tables (tab-separated), in the order
# bioRxiv, PMC, NYTAC.
# NOTE(review): pandas' default NA parsing turns cells such as "NA" or "null"
# into NaN; if those strings occur as lemmas they would be lost downstream —
# confirm whether that matters for these tables.
biorxiv_total_count_df, pmc_total_count_df, nytac_total_count_df = (
    pd.read_csv(count_path.resolve(), sep="\t")
    for count_path in (biorxiv_count_path, pmc_count_path, nytac_count_path)
)

In [7]:
# Load the pickled sentence-length tables for each corpus. Every file is opened
# in a `with` block so its handle is closed as soon as it has been read.
# NOTE(security): unpickling can execute arbitrary code; only load pickle files
# that were produced by this project.
with open("output/biorxiv_sentence_length.pkl", "rb") as pickle_file:
    biorxiv_sentence_length = pickle.load(pickle_file)

with open("../../pmc/pmc_corpus/pmc_sentence_length.pkl", "rb") as pickle_file:
    pmc_sentence_length = pickle.load(pickle_file)

with open("../../nytac/corpora_stats/nytac_sentence_length.pkl", "rb") as pickle_file:
    nytac_sentence_length = pickle.load(pickle_file)

In [8]:
# Load spaCy's small English pipeline and copy its default stop-word set into a
# list so it can be referenced from the DataFrame queries below.
spacy_nlp = spacy.load("en_core_web_sm")
stop_word_list = [*spacy_nlp.Defaults.stop_words]

## Get Corpora Comparison Stats

In [9]:
# (key, sentence-lengths) pairs, one entry per bioRxiv document.
biorxiv_sentence_len_list = list(biorxiv_sentence_length.items())

# Every aggregate below is a full scan of the count table, so each one is
# computed once and reused for both the raw count and its share of all tokens.
biorxiv_sentences_per_doc = [
    len(sentence_lengths) for doc_key, sentence_lengths in biorxiv_sentence_len_list
]
biorxiv_token_total = biorxiv_total_count_df["count"].sum()
biorxiv_cc_total = (
    biorxiv_total_count_df
    .query("dep_tag == 'cc'")
    ["count"]
    .sum()
)
biorxiv_pronoun_total = (
    biorxiv_total_count_df
    .query("pos_tag == 'PRON'")
    ["count"]
    .sum()
)
biorxiv_passive_total = (
    biorxiv_total_count_df
    .query("dep_tag in ['auxpass', 'nsubjpass', 'csubjpass']")
    ["count"]
    .sum()
)

# Key order is significant: it becomes the row order of the summary table.
biorxiv_data = {
    "document_count": len(biorxiv_sentence_length),
    "sentence_count": sum(biorxiv_sentences_per_doc),
    "token_count": biorxiv_token_total,
    # `@stop_word_list` resolves the Python list by name instead of pasting its
    # repr into the query string.
    "stop_word_count": (
        biorxiv_total_count_df
        .query("lemma in @stop_word_list")
        ["count"]
        .sum()
    ),
    # Mean number of sentences per document.
    "avg_document_length": np.mean(biorxiv_sentences_per_doc),
    # Mean sentence length over every sentence of every document.
    "avg_sentence_length": np.mean(
        list(
            itertools.chain.from_iterable(
                sentence_lengths
                for doc_key, sentence_lengths in biorxiv_sentence_len_list
            )
        )
    ),
    "negatives": (
        biorxiv_total_count_df
        .query("dep_tag == 'neg'")
        ["count"]
        .sum()
    ),
    "coordinating_conjunctions": biorxiv_cc_total,
    "coordinating_conjunctions%": biorxiv_cc_total / biorxiv_token_total,
    "pronouns": biorxiv_pronoun_total,
    "pronouns%": biorxiv_pronoun_total / biorxiv_token_total,
    "passives": biorxiv_passive_total,
    "passive%": biorxiv_passive_total / biorxiv_token_total,
}

In [10]:
# (key, sentence-lengths) pairs, one entry per PMC document.
pmc_sentence_len_list = list(pmc_sentence_length.items())

# Every aggregate below is a full scan of the count table, so each one is
# computed once and reused for both the raw count and its share of all tokens.
pmc_sentences_per_doc = [
    len(sentence_lengths) for doc_key, sentence_lengths in pmc_sentence_len_list
]
pmc_token_total = pmc_total_count_df["count"].sum()
pmc_cc_total = (
    pmc_total_count_df
    .query("dep_tag == 'cc'")
    ["count"]
    .sum()
)
pmc_pronoun_total = (
    pmc_total_count_df
    .query("pos_tag == 'PRON'")
    ["count"]
    .sum()
)
pmc_passive_total = (
    pmc_total_count_df
    .query("dep_tag in ['auxpass', 'nsubjpass', 'csubjpass']")
    ["count"]
    .sum()
)

# Key order is significant: it becomes the row order of the summary table.
pmc_data = {
    "document_count": len(pmc_sentence_length),
    "sentence_count": sum(pmc_sentences_per_doc),
    "token_count": pmc_token_total,
    # `@stop_word_list` resolves the Python list by name instead of pasting its
    # repr into the query string.
    "stop_word_count": (
        pmc_total_count_df
        .query("lemma in @stop_word_list")
        ["count"]
        .sum()
    ),
    # Mean number of sentences per document.
    "avg_document_length": np.mean(pmc_sentences_per_doc),
    # Mean sentence length over every sentence of every document.
    "avg_sentence_length": np.mean(
        list(
            itertools.chain.from_iterable(
                sentence_lengths
                for doc_key, sentence_lengths in pmc_sentence_len_list
            )
        )
    ),
    "negatives": (
        pmc_total_count_df
        .query("dep_tag == 'neg'")
        ["count"]
        .sum()
    ),
    "coordinating_conjunctions": pmc_cc_total,
    "coordinating_conjunctions%": pmc_cc_total / pmc_token_total,
    "pronouns": pmc_pronoun_total,
    "pronouns%": pmc_pronoun_total / pmc_token_total,
    "passives": pmc_passive_total,
    "passive%": pmc_passive_total / pmc_token_total,
}

In [11]:
# (key, sentence-lengths) pairs, one entry per NYTAC document.
nytac_sentence_len_list = list(nytac_sentence_length.items())

# Every aggregate below is a full scan of the count table, so each one is
# computed once and reused for both the raw count and its share of all tokens.
nytac_sentences_per_doc = [
    len(sentence_lengths) for doc_key, sentence_lengths in nytac_sentence_len_list
]
nytac_token_total = nytac_total_count_df["count"].sum()
nytac_cc_total = (
    nytac_total_count_df
    .query("dep_tag == 'cc'")
    ["count"]
    .sum()
)
nytac_pronoun_total = (
    nytac_total_count_df
    .query("pos_tag == 'PRON'")
    ["count"]
    .sum()
)
nytac_passive_total = (
    nytac_total_count_df
    .query("dep_tag in ['auxpass', 'nsubjpass', 'csubjpass']")
    ["count"]
    .sum()
)

# Key order is significant: it becomes the row order of the summary table.
nytac_data = {
    "document_count": len(nytac_sentence_length),
    "sentence_count": sum(nytac_sentences_per_doc),
    "token_count": nytac_token_total,
    # `@stop_word_list` resolves the Python list by name instead of pasting its
    # repr into the query string.
    "stop_word_count": (
        nytac_total_count_df
        .query("lemma in @stop_word_list")
        ["count"]
        .sum()
    ),
    # Mean number of sentences per document.
    "avg_document_length": np.mean(nytac_sentences_per_doc),
    # Mean sentence length over every sentence of every document.
    "avg_sentence_length": np.mean(
        list(
            itertools.chain.from_iterable(
                sentence_lengths
                for doc_key, sentence_lengths in nytac_sentence_len_list
            )
        )
    ),
    "negatives": (
        nytac_total_count_df
        .query("dep_tag == 'neg'")
        ["count"]
        .sum()
    ),
    "coordinating_conjunctions": nytac_cc_total,
    "coordinating_conjunctions%": nytac_cc_total / nytac_token_total,
    "pronouns": nytac_pronoun_total,
    "pronouns%": nytac_pronoun_total / nytac_token_total,
    "passives": nytac_passive_total,
    "passive%": nytac_passive_total / nytac_token_total,
}

In [12]:
# Summary table of the per-corpus statistics (rows = statistic, columns = corpus):
#   document_count             - number of documents in the corpus
#   sentence_count             - number of sentences in the corpus
#   token_count                - number of tokens in the corpus
#   stop_word_count            - number of stop-word tokens in the corpus
#   avg_document_length        - mean number of sentences per document
#   avg_sentence_length        - mean number of words per sentence
#   negatives                  - tokens tagged as negations (e.g. "not")
#   coordinating_conjunctions  - tokens tagged as coordinating conjunctions (and, but, for, ...)
#   pronouns                   - tokens tagged as pronouns
#   passives                   - tokens carrying a passive dependency tag
#   "...%" rows                - the matching count divided by token_count
token_stats_df = pd.DataFrame.from_records(
    [biorxiv_data, pmc_data, nytac_data],
    index=["bioRxiv", "PMC", "NYTAC"],
).transpose()

token_stats_df.to_csv("output/figures/corpora_token_stats.tsv", sep="\t")
token_stats_df

Unnamed: 0,bioRxiv,PMC,NYTAC
document_count,71118.0,1977647.0,1855658.0
sentence_count,22195740.0,480489800.0,72171040.0
token_count,420969900.0,8597101000.0,1218673000.0
stop_word_count,158429400.0,3153077000.0,559391100.0
avg_document_length,312.0973,242.9604,38.89242
avg_sentence_length,22.70775,21.46228,19.89098
negatives,1148382.0,24928800.0,7272401.0
coordinating_conjunctions,14295740.0,307082300.0,38730050.0
coordinating_conjunctions%,0.03395904,0.03571929,0.0317805
pronouns,4604432.0,74994120.0,46712550.0


## LogLikelihood + Odds Ratio + KL Divergence Calculations

The goal here is to compare word frequencies between bioRxiv and PubMed Central (PMC). The problem with comparing raw word frequencies is that non-meaningful words (i.e., stop words) such as "the", "of", "and", and "be" appear most often. To account for this, the first step is to remove those words from the analysis.

### Remove Stop words

In [13]:
# Drop stop-word lemmas, collapse the remaining rows to one total per lemma,
# and order the lemmas from most to least frequent.
biorxiv_total_count_df = (
    biorxiv_total_count_df
    .query("lemma not in @stop_word_list")
    .groupby("lemma")["count"]
    .sum()
    .reset_index()
    .sort_values("count", ascending=False)
)
biorxiv_total_count_df

Unnamed: 0,lemma,count
1192554,et,1762717
632487,al,1754311
885798,cells,1281939
1032848,data,1054600
1265968,fig,1031811
...,...,...
1541922,"i,3a",1
1541921,"i,3",1
1541919,"i,14,i",1
1541918,"i,1016",1


In [14]:
# Drop stop-word lemmas, collapse the remaining rows to one total per lemma,
# and order the lemmas from most to least frequent.
#
# The raw table still carries the per-tag `dep_tag` column; after this cell has
# run only `lemma`/`count` remain. Guarding on that column keeps the cell safe
# to re-run: without it, every extra execution would strip two more lemmas via
# the `.iloc[2:]` below.
if "dep_tag" in pmc_total_count_df.columns:
    pmc_total_count_df = (
        pmc_total_count_df
        .query("lemma not in @stop_word_list")
        .groupby("lemma")["count"]
        .sum()
        .reset_index()
        .sort_values("count", ascending=False)
        # Skip the two most frequent lemmas — presumably extraction artefacts
        # rather than real words; TODO confirm which tokens these are.
        .iloc[2:]
    )
pmc_total_count_df

Unnamed: 0,lemma,count
44829210,cells,23853661
6832340,1,20800303
90301906,study,20300433
78312111,patients,18263142
3,\t\t\t\t,18199324
...,...,...
36210476,a2.6±0.4,1
36210474,a2.6±0.1n,1
36210472,a2.6±0.08,1
36210471,a2.6±0,1


In [15]:
# Drop stop-word lemmas, collapse the remaining rows to one total per lemma,
# and order the lemmas from most to least frequent.
nytac_total_count_df = (
    nytac_total_count_df
    .query("lemma not in @stop_word_list")
    .groupby("lemma")["count"]
    .sum()
    .reset_index()
    .sort_values("count", ascending=False)
)
nytac_total_count_df

Unnamed: 0,lemma,count
1780479,said,7731500
1555930,mr.,5817945
1585217,new,4031910
2,,3356883
3685,--,2835532
...,...,...
1224054,gaietes,1
226116,17:18.5,1
226115,17:18.3,1
1224058,gaieté,1


### Calculate LogLikelihoods and Odds ratios

In [16]:
# Log-likelihood and odds-ratio statistics for bioRxiv (corpus one) against
# PMC (corpus two), saved as a TSV and shown below.
# NOTE(review): the third argument is presumably the number of top terms taken
# from each corpus — confirm against get_term_statistics.
biorxiv_vs_pmc = get_term_statistics(biorxiv_total_count_df, pmc_total_count_df, 100)
biorxiv_vs_pmc.to_csv(
    "output/comparison_stats/biorxiv_vs_pmc_comparison.tsv", sep="\t", index=False
)
biorxiv_vs_pmc

HBox(children=(IntProgress(value=0, max=126), HTML(value='')))




Unnamed: 0,lemma,corpus_one_a,corpus_two_b,corpus_one_c,corpus_two_d,log_likelihood,odds_ratio
0,time,615363,11258014,262494660,5365015381,6916.010681,1.117172
1,test,360585,6824993,262494660,5365015381,1971.939941,1.079831
2,increased,243041,6530384,262494660,5365015381,19089.367004,0.760661
3,model,730123,8136668,262494660,5365015381,208450.000793,1.834003
4,al,1754312,17451861,262494660,5365015381,678998.794312,2.054545
...,...,...,...,...,...,...,...
121,g,231888,4731600,262494660,5365015381,0.608002,1.001661
122,studies,324372,9318438,262494660,5365015381,40407.557037,0.711460
123,cancer,157764,5656957,262494660,5365015381,58210.641693,0.570001
124,high,440917,9086405,262494660,5365015381,28.673370,0.991780


In [17]:
# Log-likelihood and odds-ratio statistics for bioRxiv (corpus one) against
# NYTAC (corpus two), saved as a TSV and shown below.
# NOTE(review): the third argument is presumably the number of top terms taken
# from each corpus — confirm against get_term_statistics.
biorxiv_vs_nytac = get_term_statistics(biorxiv_total_count_df, nytac_total_count_df, 100)
biorxiv_vs_nytac.to_csv(
    "output/comparison_stats/biorxiv_nytac_comparison.tsv", sep="\t", index=False
)
biorxiv_vs_nytac

HBox(children=(IntProgress(value=0, max=192), HTML(value='')))




Unnamed: 0,lemma,corpus_one_a,corpus_two_b,corpus_one_c,corpus_two_d,log_likelihood,odds_ratio
0,time,615363,1578029,262494660,659277495,1.917716e+02,0.979409
1,united,8293,1024026,262494660,659277495,6.102781e+05,0.020340
2,test,360585,138836,262494660,659277495,4.082828e+05,6.523090
3,increased,243041,109599,262494660,659277495,2.468211e+05,5.569558
4,office,2303,489148,262494660,659277495,3.042661e+05,0.011825
...,...,...,...,...,...,...,...
187,night,8662,589751,262494660,659277495,3.263641e+05,0.036889
188,high,440917,710856,262494660,659277495,5.122786e+04,1.557840
189,mr.,120,5817946,262494660,659277495,3.882737e+06,0.000052
190,old,49991,1003174,262494660,659277495,3.953994e+05,0.125159


In [18]:
# Log-likelihood and odds-ratio statistics for PMC (corpus one) against
# NYTAC (corpus two), saved as a TSV and shown below.
# NOTE(review): the third argument is presumably the number of top terms taken
# from each corpus — confirm against get_term_statistics.
pmc_vs_nytac = get_term_statistics(pmc_total_count_df, nytac_total_count_df, 100)
pmc_vs_nytac.to_csv(
    "output/comparison_stats/pmc_nytac_comparison.tsv", sep="\t", index=False
)
pmc_vs_nytac

HBox(children=(IntProgress(value=0, max=190), HTML(value='')))




Unnamed: 0,lemma,corpus_one_a,corpus_two_b,corpus_one_c,corpus_two_d,log_likelihood,odds_ratio
0,time,11258014,1578029,5365015381,659277495,2.314005e+04,0.876686
1,united,551100,1024026,5365015381,659277495,2.618264e+06,0.066133
2,test,6824993,138836,5365015381,659277495,8.336842e+05,6.040841
3,increased,6530384,109599,5365015381,659277495,8.810869e+05,7.321995
4,office,161045,489148,5365015381,659277495,1.473495e+06,0.040458
...,...,...,...,...,...,...,...
185,high,9086405,710856,5365015381,659277495,1.528909e+05,1.570752
186,mr.,8037,5817946,5365015381,659277495,2.557775e+07,0.000170
187,old,1323085,1003174,5365015381,659277495,1.563860e+06,0.162072
188,treatment,9392163,106985,5365015381,659277495,1.476342e+06,10.787977


## Calculate KL Divergence

In [19]:
# KL divergence for every corpus pair at each vocabulary size in `term_grid`.
# Records are appended per size in the order: bioRxiv vs PMC, bioRxiv vs NYTAC,
# PMC vs NYTAC.
term_grid = [100, 200, 300, 400, 500, 1000, 1500, 2000, 3000, 5000]
kl_data = []
for num_terms in tqdm_notebook(term_grid):
    kl_data.extend(
        {
            "num_terms": num_terms,
            "KL_divergence": KL_divergence(
                first_corpus_df,
                second_corpus_df,
                num_terms=num_terms,
            ),
            "comparison": comparison_label,
        }
        for comparison_label, first_corpus_df, second_corpus_df in (
            ("biorxiv_vs_pmc", biorxiv_total_count_df, pmc_total_count_df),
            ("biorxiv_vs_nytac", biorxiv_total_count_df, nytac_total_count_df),
            ("pmc_vs_nytac", pmc_total_count_df, nytac_total_count_df),
        )
    )

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




In [20]:
# Tabulate the KL-divergence records (one row per vocabulary size and corpus
# pair), save them as a TSV, and show the table.
kl_metrics = pd.DataFrame.from_records(data=kl_data)
kl_metrics.to_csv("output/comparison_stats/corpora_kl_divergence.tsv", sep="\t", index=False)
kl_metrics

Unnamed: 0,num_terms,KL_divergence,comparison
0,100,0.02343,biorxiv_vs_pmc
1,100,0.473538,biorxiv_vs_nytac
2,100,0.29421,pmc_vs_nytac
3,200,0.037528,biorxiv_vs_pmc
4,200,0.639116,biorxiv_vs_nytac
5,200,0.459137,pmc_vs_nytac
6,300,0.047375,biorxiv_vs_pmc
7,300,0.77364,biorxiv_vs_nytac
8,300,0.541992,pmc_vs_nytac
9,400,0.178005,biorxiv_vs_pmc


# Preprint to Published View

In [21]:
# Preprints that have both a published DOI and a PMC id, reduced to one row per
# preprint DOI: descriptive fields come from the first row of each group, the
# identifier fields from the last.
mapped_doi_df = (
    pd.read_csv("../journal_tracker/output/mapped_published_doi.tsv", sep="\t")
    .dropna(subset=["published_doi", "pmcid"])
    .groupby("preprint_doi")
    .agg({
        **dict.fromkeys(["author_type", "heading", "category", "document"], "first"),
        **dict.fromkeys(["preprint_doi", "published_doi", "pmcid"], "last"),
    })
    .reset_index(drop=True)
)
mapped_doi_df.tail()

Unnamed: 0,author_type,heading,category,document,preprint_doi,published_doi,pmcid
30922,regular article,new results,microbiology,872325_v1.xml,10.1101/872325,10.1128/mbio.03197-19,PMC7078482
30923,regular article,new results,cell biology,872408_v1.xml,10.1101/872408,10.1186/s13072-020-00335-x,PMC7057672
30924,regular article,new results,ecology,872549_v1.xml,10.1101/872549,10.1093/aob/mcaa101,PMC7539359
30925,regular article,new results,biochemistry,872879_v1.xml,10.1101/872879,10.1038/s41467-020-14898-6,PMC7048817
30926,regular article,new results,developmental biology,873232_v1.xml,10.1101/873232,10.1534/g3.119.400967,PMC7056964


In [22]:
# Report how many preprints were mapped to a published PMC article. The
# denominator is the number of bioRxiv documents loaded earlier in the notebook
# rather than a hard-coded corpus size, and the share is formatted as a
# percentage to match its label.
print(f"Total # of Preprints Mapped: {mapped_doi_df.shape[0]}")
print(f"Total % of Mapped: {mapped_doi_df.shape[0] / len(biorxiv_sentence_length):.2%}")

Total # of Preprints Mapped: 30927
Total % of Mapped: 0.43486880958407154


In [23]:
# Aggregate word counts over the mapped preprints; documents whose word-count
# file does not exist are skipped.
preprint_count = aggregate_word_counts(
    list(
        filter(
            Path.exists,
            (
                Path(f"output/biorxiv_word_counts/{Path(file)}.tsv")
                for file in mapped_doi_df.document.values.tolist()
            ),
        )
    )
)

# One record per counted token key, then the same stop-word removal and
# per-lemma totals as used for the corpus-wide tables.
preprint_count_df = (
    pd.DataFrame.from_records(
        [
            {
                "lemma": token[0],
                "pos_tag": token[1],
                "dep_tag": token[2],
                "count": token_count,
            }
            for token, token_count in preprint_count.items()
        ]
    )
    .query("lemma not in @stop_word_list")
    .groupby("lemma")["count"]
    .sum()
    .reset_index()
    .sort_values("count", ascending=False)
)

preprint_count_df.head()

HBox(children=(IntProgress(value=0, max=30924), HTML(value='')))




Unnamed: 0,lemma,count
656470,et,778414
349385,al,773497
487606,cells,593026
568358,data,495464
694737,fig,490004


In [24]:
# Aggregate word counts over the published (PMC) versions of the mapped
# preprints; articles whose word-count file does not exist are skipped.
published_count = aggregate_word_counts(
    list(
        filter(
            Path.exists,
            (
                Path(f"../../pmc/pmc_corpus/pmc_word_counts/{file}.tsv")
                for file in mapped_doi_df.pmcid.values.tolist()
            ),
        )
    )
)

# One record per counted token key, then the same stop-word removal and
# per-lemma totals as used for the corpus-wide tables.
published_count_df = (
    pd.DataFrame.from_records(
        [
            {
                "lemma": token[0],
                "pos_tag": token[1],
                "dep_tag": token[2],
                "count": token_count,
            }
            for token, token_count in published_count.items()
        ]
    )
    .query("lemma not in @stop_word_list")
    .groupby("lemma")["count"]
    .sum()
    .reset_index()
    .sort_values("count", ascending=False)
)

published_count_df.head()

HBox(children=(IntProgress(value=0, max=17555), HTML(value='')))




Unnamed: 0,lemma,count
2292722,,785242
2340935,,644560
1070309,et,446730
638491,al,438015
1,\t\t\t\t,413820


In [25]:
# Log-likelihood and odds-ratio statistics for the mapped preprints (corpus one)
# against their published versions (corpus two), saved as a TSV and shown below.
# NOTE(review): the third argument is presumably the number of top terms taken
# from each corpus — confirm against get_term_statistics.
preprint_vs_published = get_term_statistics(preprint_count_df, published_count_df, 100)
preprint_vs_published.to_csv(
    "output/comparison_stats/preprint_to_published_comparison.tsv", sep="\t", index=False
)
preprint_vs_published

HBox(children=(IntProgress(value=0, max=107), HTML(value='')))




Unnamed: 0,lemma,corpus_one_a,corpus_two_b,corpus_one_c,corpus_two_d,log_likelihood,odds_ratio
0,time,275599,180575,120680972,82761371,227.157404,1.046668
1,test,164653,110948,120680972,82761371,20.503205,1.017745
2,increased,105992,61736,120680972,82761371,1053.576725,1.177398
3,model,333119,215834,120680972,82761371,423.091934,1.058445
4,al,773498,438016,120680972,82761371,10360.943830,1.211039
...,...,...,...,...,...,...,...
102,levels,151751,97000,120680972,82761371,293.890493,1.072874
103,effects,127977,77206,120680972,82761371,798.358985,1.136763
104,based,225596,149164,120680972,82761371,119.765736,1.037185
105,studies,146478,87301,120680972,82761371,1087.868207,1.150647


Main takeaways from this analysis:
1. On a global scale, bioRxiv contains more field-specific articles, as its top words include neuron, gene, genome, and network.
2. "Patients" appears more strongly associated with PMC, as most preprints involving patients are posted to medRxiv instead.
3. Many words associated with PMC are health-related, which ties back to the medRxiv point above.
4. Citation styles change as preprints transition to published versions: "et al." is more strongly associated with bioRxiv than with PMC.
5. On a local scale, published articles contain more statistical concepts (e.g., t-test) as well as quantitative measures (e.g., degree signs). (Highly associated lemmas include t, -, and the degree sign.)
6. Published articles shift focus toward mentioning figures, adding supplementary data, etc., compared to preprints.
7. Preprints tend to cite published works in a uniform way, using the "et al." citation style. It is hard to pinpoint whether the leading factor is peer review or journal style, but this will be an interesting point to discuss in the paper.