# Comparative Linguistic Analysis of bioRxiv and PMC

In [1]:
from collections import defaultdict, Counter
import csv
from pathlib import Path

import numpy as np
import pandas as pd
import pickle
import spacy
from scipy.stats import chi2_contingency
from tqdm import tqdm_notebook

In [2]:
def get_term_statistics(corpus_one, corpus_two, term, psudeocount=1, eps=1e-20):
    """
    This function is designed to perform the folllowing calculations:
        - chi square contingency test 
          - log pvalue + an epsilon (1e-20)
        - log likelihood of contingency table
        - log odds ratio
        
    keywords:
        corpus_one - a Counter object with terms as keys and count as values
        corpus_two - a Counter object with terms as keys and count as values
        term - the word of interest
    """
    observed_contingency_table = np.array([
        [corpus_one[term], corpus_two[term]],
        [sum(corpus_one.values()), sum(corpus_two.values())]
    ])
    
    # Chi Squared Test
    (chi_test_stat, p_val, dof, exp) = chi2_contingency(
        observed_contingency_table, 
        correction=False
    )
    
    # Log Likelihood
    
    ## add psudeocount to prevent log(0)
    observed_contingency_table += psudeocount
    
    a, b, c, d = (
        observed_contingency_table[0][0],
        observed_contingency_table[0][1],
        observed_contingency_table[1][0],
        observed_contingency_table[1][1]
    )
    
    # Obtained from (Kilgarriff, 2001) - Comparing Corpora
    LL = lambda a,b,c,d: 2*(
        a*np.log(a) + b*np.log(b) + c*np.log(c) + d*np.log(d)
        - (a+b)*np.log(a+b) - (a+c)*np.log(a+c) - (b+d)*np.log(b+d)
        - (c+d)*np.log(c+d) + (a+b+c+d)*np.log(a+b+c+d)
    )
    log_likelihood = LL(a,b,c,d)
    
    
    # Log Odds
    log_ratio = float((a*d)/(b*c))
    
    return {
        "chi_sq": (
            chi_test_stat, np.log(p_val+eps), dof,
            (observed_contingency_table-psudeocount), exp
        ),
        "log_likelihood":log_likelihood,
        "odds_ratio":log_ratio
    }

In [3]:
def aggregate_word_counts(doc_iterator):
    global_word_counter = Counter()
    
    for doc in tqdm_notebook(doc_iterator):
        with open(doc, "r") as tsvfile:
            reader = csv.DictReader(tsvfile, delimiter="\t")
            global_word_counter.update({
                row['lemma']:int(row['count'])
                for row in reader
            })

    return global_word_counter

In [4]:
def remove_stop_words(corpus_one, corpus_two):
    spacy_nlp = spacy.load('en_core_web_sm')
    stop_word_list = list(spacy_nlp.Defaults.stop_words)
    stop_word_list += ['  ', '\t\t\t\t', '\u2009', ' ']
    
    for stopword in tqdm_notebook(stop_word_list):
        if stopword in corpus_one:
            del corpus_one[stopword]

        if stopword in corpus_two:
            del corpus_two[stopword]
            
    return corpus_one, corpus_two

# Full Text Comparison (Global)

## Gather Word Frequencies

In [4]:
biorxiv_corpus_count = (
    aggregate_word_counts(
        list(Path("output/biorxiv_word_counts").rglob("*tsv"))
    )
)

HBox(children=(IntProgress(value=0, max=71118), HTML(value='')))




In [5]:
pmc_corpus_count = (
    aggregate_word_counts(
        list(Path("output/pmc_word_counts").rglob("*tsv"))
    )
)

HBox(children=(IntProgress(value=0, max=1977647), HTML(value='')))




In [8]:
biorxiv_corpus_count.most_common(10)

[('the', 22645305),
 ('of', 14639481),
 ('be', 12811427),
 ('and', 11981224),
 ('in', 10135406),
 ('to', 8146337),
 ('a', 6603914),
 ('for', 4530456),
 ('with', 3974186),
 ('that', 3571258)]

In [9]:
pmc_corpus_count.most_common(10)

[('the', 455469538),
 ('of', 305684946),
 ('be', 286477836),
 ('and', 258669662),
 ('in', 209068921),
 ('to', 154692995),
 ('a', 119114407),
 ('with', 87644270),
 ('for', 86185660),
 ('that', 61166926)]

In [6]:
pickle.dump(biorxiv_corpus_count, open("output/biorxiv_total_count.pkl", "wb"))
pickle.dump(pmc_corpus_count, open("output/pmc_total_count.pkl", "wb"))

## Analysis without Stop Words

The goal here is to compare word frequencies between bioRxiv and pubmed central. The problem when comparing word frequencies is that non-meaningful words (aka stopwords) such as the, of, and, be, etc., appear the most often. To account for this problem the first step here is to remove those words from analyses. 

In [5]:
biorxiv_corpus_count = pickle.load(open("output/biorxiv_total_count.pkl", "rb"))
pmc_corpus_count = pickle.load(open("output/pmc_total_count.pkl", "rb"))

In [6]:
biorxiv_corpus_count, pmc_corpus_count = remove_stop_words(
    biorxiv_corpus_count,
    pmc_corpus_count
)

HBox(children=(IntProgress(value=0, max=330), HTML(value='')))




In [7]:
top_ten_biorxiv = biorxiv_corpus_count.most_common(100)
top_ten_biorxiv[0:10]

[('cell', 2244256),
 ('use', 2206407),
 ('et', 1762805),
 ('al', 1754536),
 ('gene', 1347906),
 ('model', 1056802),
 ('fig', 1048216),
 ('figure', 987374),
 ('1', 946363),
 ('datum', 905227)]

In [8]:
top_ten_pmc = pmc_corpus_count.most_common(100)
top_ten_pmc[0:10]

[('use', 41761817),
 ('cell', 38244783),
 ('study', 30963261),
 ('patient', 22691935),
 ('1', 20819358),
 ('result', 18720685),
 ('et', 18125240),
 ('group', 17766474),
 ('al', 17456991),
 ('high', 17388204)]

In [9]:
print("Number of words in biorxiv but not in Pubmed Central:")
biorxiv_difference = set(list(biorxiv_corpus_count.keys())) - set(list(pmc_corpus_count.keys()))
print(len(biorxiv_difference))

Number of words in biorxiv but not in Pubmed Central:
1096878


In [10]:
[
    biorxiv_difference.pop()
    for i in range(10)
]

['wavelength22,24',
 'chylomicrons[9',
 'l_sbr-3',
 'giulia20',
 'v0.36.3',
 'giantdatabase',
 'ma_336364g0010_6123',
 'ddseq_surecell.fa:2:30:10',
 'siz1-mrfp',
 'inhibition(cash']

In [11]:
print("Number of words in Pubmed Central but not in biorxiv:")
pmc_difference = set(list(pmc_corpus_count.keys())) - set(list(biorxiv_corpus_count.keys()))
print(len(pmc_difference))

Number of words in Pubmed Central but not in biorxiv:
99591968


In [12]:
[
    pmc_difference.pop()
    for i in range(10)
]

['gse6018',
 'ametekmode',
 'arthroplastycomplicationorifarthroplastyp',
 'ngtx.for',
 'conductivityrush',
 'guttataxp_00219807958%gallus',
 'ttaldh1a3',
 'philippines280,3065.9',
 'cjapanese151',
 '35004435.48±12.823501']

In [13]:
total_words = set(list(dict(top_ten_biorxiv).keys()) + list(dict(top_ten_pmc).keys()))
data = []
for word in tqdm_notebook(total_words):
    
    word_stat = get_term_statistics(
        biorxiv_corpus_count,
        pmc_corpus_count,
        word
    )
    
    data.append({
        "lemma": word,
        "biorxiv_count":biorxiv_corpus_count[word] if word in biorxiv_corpus_count else 0,
        "pmc_count":pmc_corpus_count[word] if word in pmc_corpus_count else 0,
        "biorxiv_total":word_stat['chi_sq'][3][1,0],
        "pmc_total":word_stat['chi_sq'][3][1,1],
        "log_p": word_stat['chi_sq'][1],
        "log_likelihood": word_stat['log_likelihood'],
        "odds_ratio": word_stat['odds_ratio']
    })

HBox(children=(IntProgress(value=0, max=124), HTML(value='')))




In [14]:
total_word_stats_df = pd.DataFrame.from_records(data)
total_word_stats_df.to_csv(
    "output/full_corpus_comparison_stats.tsv", 
    sep="\t", index=False
)
total_word_stats_df.head()

Unnamed: 0,lemma,biorxiv_count,pmc_count,biorxiv_total,pmc_total,log_p,log_likelihood,odds_ratio
0,specie,384720,4254993,262487093,5343153698,-46.051702,111070.702393,1.840504
1,sample,666822,11482320,262487093,5343153698,-46.051702,16759.337433,1.182145
2,bind,332636,4368443,262487093,5343153698,-46.051702,52430.971863,1.550006
3,day,249939,7322853,262487093,5343153698,-46.051702,35943.777496,0.694776
4,level,621245,14983353,262487093,5343153698,-46.051702,18035.299988,0.844004


In [15]:
(
    total_word_stats_df
    .sort_values("log_likelihood", ascending=False)
    .head(20)
)

Unnamed: 0,lemma,biorxiv_count,pmc_count,biorxiv_total,pmc_total,log_p,log_likelihood,odds_ratio
92,patient,213034,22691935,262487093,5343153698,-46.051702,1060795.0,0.191104
90,al,1754536,17456991,262487093,5343153698,-46.051702,671885.1,2.045893
58,et,1762805,18125240,262487093,5343153698,-46.051702,620939.0,1.97975
40,gene,1347906,14688675,262487093,5343153698,-46.051702,405305.2,1.867957
87,±,146921,11016755,262487093,5343153698,-46.051702,391258.1,0.271471
110,study,829495,30963261,262487093,5343153698,-46.051702,360682.8,0.545327
116,health,53602,6392583,262487093,5343153698,-46.051702,320851.8,0.170688
7,genome,397888,2713361,262487093,5343153698,-46.051702,316957.3,2.984997
73,model,1056802,11568141,262487093,5343153698,-46.051702,313918.1,1.859603
80,neuron,305653,1968004,262487093,5343153698,-46.051702,265066.6,3.161503


In [16]:
(
    total_word_stats_df
    .sort_values("log_likelihood", ascending=True)
    .head(20)
)

Unnamed: 0,lemma,biorxiv_count,pmc_count,biorxiv_total,pmc_total,log_p,log_likelihood,odds_ratio
82,associate,335363,6918452,262487093,5343153698,-30.792952,57.262299,0.986727
85,c,707026,14198718,262487093,5343153698,-46.051702,122.483429,1.013623
120,effect,629481,12594004,262487093,5343153698,-46.051702,177.879028,1.017441
6,activity,415982,8252446,262487093,5343153698,-46.051702,260.129364,1.026083
76,system,312928,6616212,262487093,5343153698,-46.051702,434.387848,0.962778
121,perform,392223,8320829,262487093,5343153698,-46.051702,646.440918,0.959527
107,mouse,442439,8638058,262487093,5343153698,-46.051702,722.993317,1.042625
13,mean,364616,7791231,262487093,5343153698,-46.051702,831.607605,0.952622
70,determine,267848,5819940,262487093,5343153698,-46.051702,1111.032471,0.936831
21,t,316000,6047726,262487093,5343153698,-46.051702,1120.201904,1.063619


In [17]:
(
    total_word_stats_df
    .sort_values("odds_ratio", ascending=False)
    .head(20)
)

Unnamed: 0,lemma,biorxiv_count,pmc_count,biorxiv_total,pmc_total,log_p,log_likelihood,odds_ratio
80,neuron,305653,1968004,262487093,5343153698,-46.051702,265066.560577,3.161503
7,genome,397888,2713361,262487093,5343153698,-46.051702,316957.302246,2.984997
103,network,327163,2817271,262487093,5343153698,-46.051702,173460.80954,2.363886
90,al,1754536,17456991,262487093,5343153698,-46.051702,671885.110535,2.045893
58,et,1762805,18125240,262487093,5343153698,-46.051702,620939.022369,1.97975
43,single,357273,3825621,262487093,5343153698,-46.051702,113198.720154,1.90103
40,gene,1347906,14688675,262487093,5343153698,-46.051702,405305.237457,1.867957
44,rna,305858,3333698,262487093,5343153698,-46.051702,92167.882568,1.867603
73,model,1056802,11568141,262487093,5343153698,-46.051702,313918.083496,1.859603
0,specie,384720,4254993,262487093,5343153698,-46.051702,111070.702393,1.840504


In [18]:
(
    total_word_stats_df
    .sort_values("odds_ratio", ascending=True)
    .head(20)
)

Unnamed: 0,lemma,biorxiv_count,pmc_count,biorxiv_total,pmc_total,log_p,log_likelihood,odds_ratio
116,health,53602,6392583,262487093,5343153698,-46.051702,320851.8,0.170688
92,patient,213034,22691935,262487093,5343153698,-46.051702,1060795.0,0.191104
87,±,146921,11016755,262487093,5343153698,-46.051702,391258.1,0.271471
23,risk,98853,5700168,262487093,5343153698,-46.051702,150818.5,0.353017
19,year,118833,6734833,262487093,5343153698,-46.051702,174090.9,0.359173
108,treatment,238340,10425186,262487093,5343153698,-46.051702,175808.7,0.465377
56,group,442718,17766474,262487093,5343153698,-46.051702,248475.5,0.507244
110,study,829495,30963261,262487093,5343153698,-46.051702,360682.8,0.545327
16,h,166524,6053056,262487093,5343153698,-46.051702,65793.34,0.560008
10,age,194619,7050986,262487093,5343153698,-46.051702,75914.45,0.561859


# Preprint to Published View

In [19]:
mapped_doi_df = (
    pd.read_csv("../journal_tracker/output/mapped_published_doi.tsv", sep="\t")
    .query("published_doi.notnull()")
    .query("pmcid.notnull()")
    .groupby("doi")
    .agg({
        "author_type":"first",
        "heading":"first",
        "category":"first",
        "document":"first",
        "doi":"last",
        "published_doi":"last",
        "journal":"last",
        "pmcid":"last"
    })
    .reset_index(drop=True)
)
mapped_doi_df.tail()

Unnamed: 0,author_type,heading,category,document,doi,published_doi,journal,pmcid
17115,regular article,new results,animal behavior and cognition,852350_v1.xml,10.1101/852350,10.1371/journal.pone.0226774,PLOS ONE,PMC6961851
17116,regular article,new results,pathology,856542_v1.xml,10.1101/856542,10.1038/s41598-019-57046-x,Scientific Reports,PMC6969030
17117,regular article,new results,neuroscience,858100_v2.xml,10.1101/858100,10.3389/fnsyn.2019.00035,Frontiers in Synaptic Neuroscience,PMC6932971
17118,regular article,new results,genomics,862847_v1.xml,10.1101/862847,10.1186/s12864-019-6379-5,BMC Genomics,PMC6933653
17119,regular article,new results,bioinformatics,867903_v1.xml,10.1101/867903,10.1186/s13059-019-1915-9,Genome Biology,PMC6927177


In [20]:
print(f"Total # of Preprints Mapped: {mapped_doi_df.shape[0]}")
print(f"Total % of Mapped: {mapped_doi_df.shape[0]/71118}")

Total # of Preprints Mapped: 17120
Total % of Mapped: 0.24072667960291347


In [21]:
preprint_count = aggregate_word_counts([
    Path(f"output/biorxiv_word_counts/{Path(file).stem}.tsv")
    for file in mapped_doi_df.document.values.tolist()
    if Path(f"output/biorxiv_word_counts/{Path(file).stem}.tsv").exists()
])

HBox(children=(IntProgress(value=0, max=17120), HTML(value='')))




In [22]:
published_count = aggregate_word_counts([
    Path(f"output/pmc_word_counts/{file}.tsv")
    for file in mapped_doi_df.pmcid.values.tolist()
    if Path(f"output/pmc_word_counts/{file}.tsv").exists()
])

HBox(children=(IntProgress(value=0, max=16460), HTML(value='')))




In [23]:
preprint_count, published_count = remove_stop_words(
    preprint_count,
    published_count
)

HBox(children=(IntProgress(value=0, max=330), HTML(value='')))




In [24]:
top_hundred_preprint = preprint_count.most_common(100)
top_hundred_preprint[0:10]

[('use', 583640),
 ('cell', 537363),
 ('gene', 400044),
 ('et', 389408),
 ('al', 386267),
 ('model', 266728),
 ('fig', 261836),
 ('figure', 246034),
 ('datum', 245169),
 ('1', 238367)]

In [25]:
top_hundred_published = published_count.most_common(100)
top_hundred_published[0:10]

[('use', 674681),
 ('cell', 603715),
 ('gene', 430794),
 ('et', 388022),
 ('al', 379852),
 ('fig', 378927),
 ('model', 292296),
 ('datum', 286438),
 ('1', 286425),
 ('figure', 246296)]

In [26]:
print("Number of words in preprint but not in published version:")
preprint_difference = set(list(preprint_count.keys())) - set(list(published_count.keys()))
print(len(preprint_difference))

Number of words in preprint but not in published version:
359827


In [27]:
[
    preprint_difference.pop()
    for i in range(10)
]

['wavelength22,24',
 '13,955',
 'l_sbr-3',
 'd=0.100',
 'xapa',
 'giantdatabase',
 'rate,2',
 'expression71',
 'terms33,34',
 'etijk']

In [28]:
print("Number of words in published version but not in preprint:")
published_difference = set(list(published_count.keys())) - set(list(preprint_count.keys()))
print(len(published_difference))

Number of words in published version but not in preprint:
1508103


In [29]:
[
    published_difference.pop()
    for i in range(10)
]

['htal',
 '13368a',
 '1.720.6280.020.130.87lonelinessbody',
 'dipgs8,21,45,56,57',
 '3incorporate',
 '0.14]1.32',
 '2017).parameter',
 'expressionptpr',
 '114early',
 '2017‐29766‐sci‐sci']

In [30]:
total_words = set(list(dict(top_hundred_preprint).keys()) + list(dict(top_hundred_published).keys()))
data = []
for word in tqdm_notebook(total_words):
    
    word_stat = get_term_statistics(
        preprint_count,
        published_count,
        word
    )
    
    data.append({
        "lemma": word,
        "preprint_count":preprint_count[word] if word in preprint_count else 0,
        "published_count":published_count[word] if word in published_count else 0,
        "preprint_total":word_stat['chi_sq'][3][1,0],
        "published_total":word_stat['chi_sq'][3][1,1],
        "log_p": word_stat['chi_sq'][1],
        "log_likelihood": word_stat['log_likelihood'],
        "odds_ratio": word_stat['odds_ratio']
    })

HBox(children=(IntProgress(value=0, max=108), HTML(value='')))




In [31]:
published_comparison_stats_df = pd.DataFrame.from_records(data)
published_comparison_stats_df.to_csv(
    "output/preprint_to_published_comparison.tsv", 
    sep="\t", index=False
)
published_comparison_stats_df.head()

Unnamed: 0,lemma,preprint_count,published_count,preprint_total,published_total,log_p,log_likelihood,odds_ratio
0,specie,89276,90762,65471550,74812843,-46.051702,613.354474,1.123969
1,sample,180653,207222,65471550,74812843,-1.453792,1.417912,0.996169
2,bind,86229,92972,65471550,74812843,-46.051702,150.603146,1.059803
3,level,153617,167457,65471550,74812843,-46.051702,177.273604,1.048238
4,file,32899,108465,65471550,74812843,-46.051702,33096.135285,0.346598


In [32]:
(
    published_comparison_stats_df
    .sort_values("log_likelihood", ascending=False)
    .head(20)
)

Unnamed: 0,lemma,preprint_count,published_count,preprint_total,published_total,log_p,log_likelihood,odds_ratio
4,file,32899,108465,65471550,74812843,-46.051702,33096.135285,0.346598
46,additional,41916,105486,65471550,74812843,-46.051702,20491.416656,0.454062
51,–,52382,109366,65471550,74812843,-46.051702,13615.990573,0.547303
59,data,40873,88444,65471550,74812843,-46.051702,12136.172468,0.528077
62,supplementary,69865,129419,65471550,74812843,-46.051702,10999.092961,0.616862
100,n,65277,117848,65471550,74812843,-46.051702,9096.74168,0.632943
21,fig,261836,378927,65471550,74812843,-46.051702,8708.58064,0.789583
78,al,386267,379852,65471550,74812843,-46.051702,4290.353059,1.161975
72,p,113094,165249,65471550,74812843,-46.051702,4103.299726,0.782034
50,et,389408,388022,65471550,74812843,-46.051702,3622.004288,1.146759


In [33]:
(
    published_comparison_stats_df
    .sort_values("log_likelihood", ascending=True)
    .head(20)
)

Unnamed: 0,lemma,preprint_count,published_count,preprint_total,published_total,log_p,log_likelihood,odds_ratio
91,estimate,85647,97781,65471550,74812843,-0.160969,0.035248,1.00088
11,follow,94652,107750,65471550,74812843,-0.921051,0.714491,1.003776
90,read,87522,100456,65471550,74812843,-1.091986,0.926851,0.995556
1,sample,180653,207222,65471550,74812843,-1.453792,1.417912,0.996169
20,size,88733,100775,65471550,74812843,-1.69098,1.762853,1.006136
81,strain,75640,85591,65471550,74812843,-2.992179,3.835939,1.009828
60,difference,92909,105212,65471550,74812843,-3.093586,4.006355,1.009059
10,line,96347,108966,65471550,74812843,-3.911443,5.410989,1.010349
77,represent,82287,95111,65471550,74812843,-4.124661,5.784316,0.988609
96,indicate,116938,135070,65471550,74812843,-4.957014,7.264571,0.989284


In [34]:
(
    published_comparison_stats_df
    .sort_values("odds_ratio", ascending=False)
    .head(20)
)

Unnamed: 0,lemma,preprint_count,published_count,preprint_total,published_total,log_p,log_likelihood,odds_ratio
53,non,93542,88998,65471550,74812843,-46.051702,1528.371982,1.201019
78,al,386267,379852,65471550,74812843,-46.051702,4290.353059,1.161975
50,et,389408,388022,65471550,74812843,-46.051702,3622.004288,1.146759
76,figure,246034,246296,65471550,74812843,-46.051702,2145.501128,1.141462
40,',122663,123498,65471550,74812843,-46.051702,983.64811,1.134951
0,specie,89276,90762,65471550,74812843,-46.051702,613.354474,1.123969
98,human,88889,91578,65471550,74812843,-46.051702,482.78126,1.109125
8,genome,134896,139140,65471550,74812843,-46.051702,716.074085,1.107824
85,structure,79298,82063,65471550,74812843,-46.051702,395.220964,1.104177
22,suggest,98728,102562,65471550,74812843,-46.051702,455.532713,1.099962


In [35]:
(
    published_comparison_stats_df
    .sort_values("odds_ratio", ascending=True)
    .head(20)
)

Unnamed: 0,lemma,preprint_count,published_count,preprint_total,published_total,log_p,log_likelihood,odds_ratio
4,file,32899,108465,65471550,74812843,-46.051702,33096.135285,0.346598
46,additional,41916,105486,65471550,74812843,-46.051702,20491.416656,0.454062
59,data,40873,88444,65471550,74812843,-46.051702,12136.172468,0.528077
51,–,52382,109366,65471550,74812843,-46.051702,13615.990573,0.547303
62,supplementary,69865,129419,65471550,74812843,-46.051702,10999.092961,0.616862
100,n,65277,117848,65471550,74812843,-46.051702,9096.74168,0.632943
72,p,113094,165249,65471550,74812843,-46.051702,4103.299726,0.782034
21,fig,261836,378927,65471550,74812843,-46.051702,8708.58064,0.789583
32,°,71831,93395,65471550,74812843,-46.051702,679.751598,0.878847
44,table,106141,130461,65471550,74812843,-46.051702,311.476383,0.929666


Main takeaways from this analysis:
1. On a global scale bioRxiv contains more field specific articles as top words consist of: neuron, gene, genome, network
2. "Patients" appear more correlated with PMC as most preprints involving patients are shipped over to medRxiv.
3. Many words associated with PMC are health related which ties back to the medRxiv note.
4. Citation styles change as preprints transition to published versions. Et Al. has a greater association within bioRxiv compared to PMC.
5. On a local scale published articles contain more statistical concepts (e.g., t-test) as well as quantitative measures (e.g. degree signs). (High associated lemmas are t, -, degree sign etc.)
6. Publish articles have a focus shift on mentioning figures, adding supplementary data etc compared to preprints.
7. Preprints have a universal way of citing published works by using the et al. citation. Hard to pinpoint if leading factor is because of peer review or journal style, but it will be an interesting point to discuss in the paper.