# Comparative Linguistic Analysis of bioRxiv and PMC

In [1]:
from collections import defaultdict, Counter
import csv
from pathlib import Path

import numpy as np
import pandas as pd
import pickle
import spacy
from scipy.stats import chi2_contingency
from tqdm import tqdm_notebook

In [2]:
def get_term_statistics(corpus_one, corpus_two, term, psudeocount=1, eps=1e-20):
    """
    This function is designed to perform the folllowing calculations:
        - chi square contingency test 
          - log pvalue + an epsilon (1e-20)
        - log likelihood of contingency table
        - log odds ratio
        
    keywords:
        corpus_one - a Counter object with terms as keys and count as values
        corpus_two - a Counter object with terms as keys and count as values
        term - the word of interest
    """
    observed_contingency_table = np.array([
        [corpus_one[term], corpus_two[term]],
        [sum(corpus_one.values()), sum(corpus_two.values())]
    ])
    
    # Chi Squared Test
    (chi_test_stat, p_val, dof, exp) = chi2_contingency(
        observed_contingency_table, 
        correction=False
    )
    
    # Log Likelihood
    
    ## add psudeocount to prevent log(0)
    observed_contingency_table += psudeocount
    
    a, b, c, d = (
        observed_contingency_table[0][0],
        observed_contingency_table[0][1],
        observed_contingency_table[1][0],
        observed_contingency_table[1][1]
    )
    
    # Obtained from (Kilgarriff, 2001) - Comparing Corpora
    LL = lambda a,b,c,d: 2*(
        a*np.log(a) + b*np.log(b) + c*np.log(c) + d*np.log(d)
        - (a+b)*np.log(a+b) - (a+c)*np.log(a+c) - (b+d)*np.log(b+d)
        - (c+d)*np.log(c+d) + (a+b+c+d)*np.log(a+b+c+d)
    )
    log_likelihood = LL(a,b,c,d)
    
    
    # Log Odds
    log_ratio = float((a*d)/(b*c))
    
    return {
        "chi_sq": (
            chi_test_stat, np.log(p_val+eps), dof,
            (observed_contingency_table-psudeocount), exp
        ),
        "log_likelihood":log_likelihood,
        "odds_ratio":log_ratio
    }

In [3]:
def aggregate_word_counts(doc_iterator):
    global_word_counter = Counter()
    
    for doc in tqdm_notebook(doc_iterator):
        with open(doc, "r") as tsvfile:
            reader = csv.DictReader(tsvfile, delimiter="\t")
            global_word_counter.update({
                row['lemma']:int(row['count'])
                for row in reader
            })

    return global_word_counter

In [4]:
def remove_stop_words(corpus_one, corpus_two):
    spacy_nlp = spacy.load('en_core_web_sm')
    stop_word_list = list(spacy_nlp.Defaults.stop_words)
    stop_word_list += ['  ', '\t\t\t\t', '\u2009', ' ']
    
    for stopword in tqdm_notebook(stop_word_list):
        if stopword in corpus_one:
            del corpus_one[stopword]

        if stopword in corpus_two:
            del corpus_two[stopword]
            
    return corpus_one, corpus_two

# Full Text Comparison (Global)

## Gather Word Frequencies

In [4]:
biorxiv_corpus_count = (
    aggregate_word_counts(
        list(Path("output/biorxiv_word_counts").rglob("*tsv"))
    )
)

HBox(children=(IntProgress(value=0, max=71118), HTML(value='')))




In [5]:
pmc_corpus_count = (
    aggregate_word_counts(
        list(Path("output/pmc_word_counts").rglob("*tsv"))
    )
)

HBox(children=(IntProgress(value=0, max=1977647), HTML(value='')))




In [8]:
biorxiv_corpus_count.most_common(10)

[('the', 22645305),
 ('of', 14639481),
 ('be', 12811427),
 ('and', 11981224),
 ('in', 10135406),
 ('to', 8146337),
 ('a', 6603914),
 ('for', 4530456),
 ('with', 3974186),
 ('that', 3571258)]

In [9]:
pmc_corpus_count.most_common(10)

[('the', 455469538),
 ('of', 305684946),
 ('be', 286477836),
 ('and', 258669662),
 ('in', 209068921),
 ('to', 154692995),
 ('a', 119114407),
 ('with', 87644270),
 ('for', 86185660),
 ('that', 61166926)]

In [6]:
pickle.dump(biorxiv_corpus_count, open("output/biorxiv_total_count.pkl", "wb"))
pickle.dump(pmc_corpus_count, open("output/pmc_total_count.pkl", "wb"))

## Analysis without Stop Words

In [5]:
biorxiv_corpus_count = pickle.load(open("output/biorxiv_total_count.pkl", "rb"))
pmc_corpus_count = pickle.load(open("output/pmc_total_count.pkl", "rb"))

In [6]:
biorxiv_corpus_count, pmc_corpus_count = remove_stop_words(
    biorxiv_corpus_count,
    pmc_corpus_count
)

HBox(children=(IntProgress(value=0, max=330), HTML(value='')))




In [7]:
top_ten_biorxiv = biorxiv_corpus_count.most_common(100)
top_ten_biorxiv[0:10]

[('cell', 2244256),
 ('use', 2206407),
 ('et', 1762805),
 ('al', 1754536),
 ('gene', 1347906),
 ('model', 1056802),
 ('fig', 1048216),
 ('figure', 987374),
 ('1', 946363),
 ('datum', 905227)]

In [8]:
top_ten_pmc = pmc_corpus_count.most_common(100)
top_ten_pmc[0:10]

[('use', 41761817),
 ('cell', 38244783),
 ('study', 30963261),
 ('patient', 22691935),
 ('1', 20819358),
 ('result', 18720685),
 ('et', 18125240),
 ('group', 17766474),
 ('al', 17456991),
 ('high', 17388204)]

In [9]:
print("Number of words in biorxiv but not in Pubmed Central:")
biorxiv_difference = set(list(biorxiv_corpus_count.keys())) - set(list(pmc_corpus_count.keys()))
print(len(biorxiv_difference))

Number of words in biorxiv but not in Pubmed Central:
1096878


In [10]:
[
    biorxiv_difference.pop()
    for i in range(10)
]

['daf-2-/--status',
 'pαstαcttime',
 'chlredraft_177061',
 '5’-gattcatcccagccaccagac',
 '11-rklxxxrrxxrwxxxx',
 '56–58,65,66',
 'roxlitinib',
 'ahn[2',
 'genome25,27',
 'facs(figures']

In [11]:
print("Number of words in Pubmed Central but not in biorxiv:")
pmc_difference = set(list(pmc_corpus_count.keys())) - set(list(biorxiv_corpus_count.keys()))
print(len(pmc_difference))

Number of words in Pubmed Central but not in biorxiv:
99591968


In [12]:
[
    pmc_difference.pop()
    for i in range(10)
]

['photographedsince',
 'headblocks16518',
 'plasmidit',
 '21.0)0.002',
 '10)fahi3.63(0.82)1–5k63.82(1.00)1–5s',
 '0c.guilliermondii',
 'dataset;set',
 '\\bar{p}(a_2,b_1)+\\bar{p}(a_2,b_2)&=',
 '−137.86',
 '0.67–1.00)0.53']

In [13]:
total_words = set(list(dict(top_ten_biorxiv).keys()) + list(dict(top_ten_pmc).keys()))
data = []
for word in tqdm_notebook(total_words):
    
    word_stat = get_term_statistics(
        biorxiv_corpus_count,
        pmc_corpus_count,
        word
    )
    
    data.append({
        "lemma": word,
        "biorxiv_count":biorxiv_corpus_count[word] if word in biorxiv_corpus_count else 0,
        "pmc_count":pmc_corpus_count[word] if word in pmc_corpus_count else 0,
        "biorxiv_total":word_stat['chi_sq'][3][1,0],
        "pmc_total":word_stat['chi_sq'][3][1,1],
        "log_p": word_stat['chi_sq'][1],
        "log_likelihood": word_stat['log_likelihood'],
        "odds_ratio": word_stat['odds_ratio']
    })

HBox(children=(IntProgress(value=0, max=124), HTML(value='')))




In [14]:
total_word_stats_df = pd.DataFrame.from_records(data)
total_word_stats_df.to_csv(
    "output/full_corpus_comparison_stats.tsv", 
    sep="\t", index=False
)
total_word_stats_df.head()

Unnamed: 0,lemma,biorxiv_count,pmc_count,biorxiv_total,pmc_total,log_p,log_likelihood,odds_ratio
0,effect,629481,12594004,262487093,5343153698,-46.051702,177.879028,1.017441
1,age,194619,7050986,262487093,5343153698,-46.051702,75914.454681,0.561859
2,low,469088,10757812,262487093,5343153698,-46.051702,6615.132568,0.887608
3,disease,203108,6586937,262487093,5343153698,-46.051702,49606.12854,0.627676
4,function,424578,5970101,262487093,5343153698,-46.051702,48763.820831,1.44766


In [15]:
(
    total_word_stats_df
    .sort_values("log_likelihood", ascending=False)
    .head(20)
)

Unnamed: 0,lemma,biorxiv_count,pmc_count,biorxiv_total,pmc_total,log_p,log_likelihood,odds_ratio
63,patient,213034,22691935,262487093,5343153698,-46.051702,1060795.0,0.191104
92,al,1754536,17456991,262487093,5343153698,-46.051702,671885.1,2.045893
122,et,1762805,18125240,262487093,5343153698,-46.051702,620939.0,1.97975
27,gene,1347906,14688675,262487093,5343153698,-46.051702,405305.2,1.867957
9,±,146921,11016755,262487093,5343153698,-46.051702,391258.1,0.271471
91,study,829495,30963261,262487093,5343153698,-46.051702,360682.8,0.545327
31,health,53602,6392583,262487093,5343153698,-46.051702,320851.8,0.170688
47,genome,397888,2713361,262487093,5343153698,-46.051702,316957.3,2.984997
53,model,1056802,11568141,262487093,5343153698,-46.051702,313918.1,1.859603
12,neuron,305653,1968004,262487093,5343153698,-46.051702,265066.6,3.161503


In [16]:
(
    total_word_stats_df
    .sort_values("odds_ratio", ascending=False)
    .head(20)
)

Unnamed: 0,lemma,biorxiv_count,pmc_count,biorxiv_total,pmc_total,log_p,log_likelihood,odds_ratio
12,neuron,305653,1968004,262487093,5343153698,-46.051702,265066.560577,3.161503
47,genome,397888,2713361,262487093,5343153698,-46.051702,316957.302246,2.984997
108,network,327163,2817271,262487093,5343153698,-46.051702,173460.80954,2.363886
92,al,1754536,17456991,262487093,5343153698,-46.051702,671885.110535,2.045893
122,et,1762805,18125240,262487093,5343153698,-46.051702,620939.022369,1.97975
29,single,357273,3825621,262487093,5343153698,-46.051702,113198.720154,1.90103
27,gene,1347906,14688675,262487093,5343153698,-46.051702,405305.237457,1.867957
20,rna,305858,3333698,262487093,5343153698,-46.051702,92167.882568,1.867603
53,model,1056802,11568141,262487093,5343153698,-46.051702,313918.083496,1.859603
89,specie,384720,4254993,262487093,5343153698,-46.051702,111070.702393,1.840504


# Preprint to Published View

In [17]:
mapped_doi_df = (
    pd.read_csv("../journal_tracker/output/mapped_published_doi.tsv", sep="\t")
    .query("published_doi.notnull()")
    .query("pmcid.notnull()")
    .groupby("doi")
    .agg({
        "author_type":"first",
        "heading":"first",
        "category":"first",
        "document":"first",
        "doi":"last",
        "published_doi":"last",
        "journal":"last",
        "pmcid":"last"
    })
    .reset_index(drop=True)
)
mapped_doi_df.tail()

Unnamed: 0,author_type,heading,category,document,doi,published_doi,journal,pmcid
17115,regular article,new results,animal behavior and cognition,852350_v1.xml,10.1101/852350,10.1371/journal.pone.0226774,PLOS ONE,PMC6961851
17116,regular article,new results,pathology,856542_v1.xml,10.1101/856542,10.1038/s41598-019-57046-x,Scientific Reports,PMC6969030
17117,regular article,new results,neuroscience,858100_v2.xml,10.1101/858100,10.3389/fnsyn.2019.00035,Frontiers in Synaptic Neuroscience,PMC6932971
17118,regular article,new results,genomics,862847_v1.xml,10.1101/862847,10.1186/s12864-019-6379-5,BMC Genomics,PMC6933653
17119,regular article,new results,bioinformatics,867903_v1.xml,10.1101/867903,10.1186/s13059-019-1915-9,Genome Biology,PMC6927177


In [18]:
print(f"Total # of Preprints Mapped: {mapped_doi_df.shape[0]}")
print(f"Total % of Mapped: {mapped_doi_df.shape[0]/71118}")

Total # of Preprints Mapped: 17120
Total % of Mapped: 0.24072667960291347


In [19]:
preprint_count = aggregate_word_counts([
    Path(f"output/biorxiv_word_counts/{Path(file).stem}.tsv")
    for file in mapped_doi_df.document.values.tolist()
    if Path(f"output/biorxiv_word_counts/{Path(file).stem}.tsv").exists()
])

HBox(children=(IntProgress(value=0, max=17120), HTML(value='')))




In [20]:
published_count = aggregate_word_counts([
    Path(f"output/pmc_word_counts/{file}.tsv")
    for file in mapped_doi_df.pmcid.values.tolist()
    if Path(f"output/pmc_word_counts/{file}.tsv").exists()
])

HBox(children=(IntProgress(value=0, max=16460), HTML(value='')))




In [21]:
preprint_count, published_count = remove_stop_words(
    preprint_count,
    published_count
)

HBox(children=(IntProgress(value=0, max=330), HTML(value='')))




In [22]:
top_hundred_preprint = preprint_count.most_common(100)
top_hundred_preprint[0:10]

[('use', 583640),
 ('cell', 537363),
 ('gene', 400044),
 ('et', 389408),
 ('al', 386267),
 ('model', 266728),
 ('fig', 261836),
 ('figure', 246034),
 ('datum', 245169),
 ('1', 238367)]

In [23]:
top_hundred_published = published_count.most_common(100)
top_hundred_published[0:10]

[('use', 674681),
 ('cell', 603715),
 ('gene', 430794),
 ('et', 388022),
 ('al', 379852),
 ('fig', 378927),
 ('model', 292296),
 ('datum', 286438),
 ('1', 286425),
 ('figure', 246296)]

In [24]:
print("Number of words in preprint but not in published version:")
preprint_difference = set(list(preprint_count.keys())) - set(list(published_count.keys()))
print(len(preprint_difference))

Number of words in preprint but not in published version:
359827


In [25]:
[
    preprint_difference.pop()
    for i in range(10)
]

['5’-gattcatcccagccaccagac',
 'tomography45,46',
 '0.3/0.8',
 'raf-144',
 '3180=107.5',
 'he.ac.uk',
 'ae040',
 'zenodo.2538594',
 '45,61,68,69',
 'ijsselmeer']

In [26]:
print("Number of words in published version but not in preprint:")
published_difference = set(list(published_count.keys())) - set(list(preprint_count.keys()))
print(len(published_difference))

Number of words in published version but not in preprint:
1508103


In [27]:
[
    published_difference.pop()
    for i in range(10)
]

['whyit',
 'pabpc1sigma',
 'genemania26–32',
 'upassume',
 'fr\u2062e\u2062s>0',
 '621401708',
 'gsparticipant',
 'elife.14859.01210.7554',
 '\\begin{document}$${n}_{s}$$\\end{document}ns',
 'µm/10']

In [28]:
total_words = set(list(dict(top_hundred_preprint).keys()) + list(dict(top_hundred_published).keys()))
data = []
for word in tqdm_notebook(total_words):
    
    word_stat = get_term_statistics(
        preprint_count,
        published_count,
        word
    )
    
    data.append({
        "lemma": word,
        "preprint_count":preprint_count[word] if word in preprint_count else 0,
        "published_count":published_count[word] if word in published_count else 0,
        "preprint_total":word_stat['chi_sq'][3][1,0],
        "published_total":word_stat['chi_sq'][3][1,1],
        "log_p": word_stat['chi_sq'][1],
        "log_likelihood": word_stat['log_likelihood'],
        "odds_ratio": word_stat['odds_ratio']
    })

HBox(children=(IntProgress(value=0, max=108), HTML(value='')))




In [29]:
published_comparison_stats_df = pd.DataFrame.from_records(data)
published_comparison_stats_df.to_csv(
    "output/preprint_to_published_comparison.tsv", 
    sep="\t", index=False
)
published_comparison_stats_df.head()

Unnamed: 0,lemma,preprint_count,published_count,preprint_total,published_total,log_p,log_likelihood,odds_ratio
0,effect,148196,160219,65471550,74812843,-46.051702,235.299123,1.05693
1,low,117431,130171,65471550,74812843,-30.696837,56.835171,1.030843
2,function,102866,107604,65471550,74812843,-46.051702,409.433948,1.092363
3,compare,115045,128548,65471550,74812843,-17.161924,30.385125,1.022648
4,2,167174,195250,65471550,74812843,-23.615229,42.99265,0.978367


In [30]:
(
    published_comparison_stats_df
    .sort_values("log_likelihood", ascending=False)
    .head(20)
)

Unnamed: 0,lemma,preprint_count,published_count,preprint_total,published_total,log_p,log_likelihood,odds_ratio
76,file,32899,108465,65471550,74812843,-46.051702,33096.135285,0.346598
7,additional,41916,105486,65471550,74812843,-46.051702,20491.416656,0.454062
67,–,52382,109366,65471550,74812843,-46.051702,13615.990573,0.547303
83,data,40873,88444,65471550,74812843,-46.051702,12136.172468,0.528077
15,supplementary,69865,129419,65471550,74812843,-46.051702,10999.092961,0.616862
33,n,65277,117848,65471550,74812843,-46.051702,9096.74168,0.632943
72,fig,261836,378927,65471550,74812843,-46.051702,8708.58064,0.789583
78,al,386267,379852,65471550,74812843,-46.051702,4290.353059,1.161975
43,p,113094,165249,65471550,74812843,-46.051702,4103.299726,0.782034
106,et,389408,388022,65471550,74812843,-46.051702,3622.004288,1.146759


In [31]:
(
    published_comparison_stats_df
    .sort_values("log_likelihood", ascending=True)
    .head(20)
)

Unnamed: 0,lemma,preprint_count,published_count,preprint_total,published_total,log_p,log_likelihood,odds_ratio
49,estimate,85647,97781,65471550,74812843,-0.160969,0.035248,1.00088
92,follow,94652,107750,65471550,74812843,-0.921051,0.714491,1.003776
22,read,87522,100456,65471550,74812843,-1.091986,0.926851,0.995556
79,sample,180653,207222,65471550,74812843,-1.453792,1.417912,0.996169
50,size,88733,100775,65471550,74812843,-1.69098,1.762853,1.006136
88,strain,75640,85591,65471550,74812843,-2.992179,3.835939,1.009828
28,difference,92909,105212,65471550,74812843,-3.093586,4.006355,1.009059
18,line,96347,108966,65471550,74812843,-3.911443,5.410989,1.010349
91,represent,82287,95111,65471550,74812843,-4.124661,5.784316,0.988609
25,indicate,116938,135070,65471550,74812843,-4.957014,7.264571,0.989284


In [32]:
(
    published_comparison_stats_df
    .sort_values("odds_ratio", ascending=False)
    .head(20)
)

Unnamed: 0,lemma,preprint_count,published_count,preprint_total,published_total,log_p,log_likelihood,odds_ratio
14,non,93542,88998,65471550,74812843,-46.051702,1528.371982,1.201019
78,al,386267,379852,65471550,74812843,-46.051702,4290.353059,1.161975
106,et,389408,388022,65471550,74812843,-46.051702,3622.004288,1.146759
52,figure,246034,246296,65471550,74812843,-46.051702,2145.501128,1.141462
69,',122663,123498,65471550,74812843,-46.051702,983.64811,1.134951
74,specie,89276,90762,65471550,74812843,-46.051702,613.354474,1.123969
26,human,88889,91578,65471550,74812843,-46.051702,482.78126,1.109125
38,genome,134896,139140,65471550,74812843,-46.051702,716.074085,1.107824
31,structure,79298,82063,65471550,74812843,-46.051702,395.220964,1.104177
107,suggest,98728,102562,65471550,74812843,-46.051702,455.532713,1.099962


In [33]:
(
    published_comparison_stats_df
    .sort_values("odds_ratio", ascending=True)
    .head(20)
)

Unnamed: 0,lemma,preprint_count,published_count,preprint_total,published_total,log_p,log_likelihood,odds_ratio
76,file,32899,108465,65471550,74812843,-46.051702,33096.135285,0.346598
7,additional,41916,105486,65471550,74812843,-46.051702,20491.416656,0.454062
83,data,40873,88444,65471550,74812843,-46.051702,12136.172468,0.528077
67,–,52382,109366,65471550,74812843,-46.051702,13615.990573,0.547303
15,supplementary,69865,129419,65471550,74812843,-46.051702,10999.092961,0.616862
33,n,65277,117848,65471550,74812843,-46.051702,9096.74168,0.632943
43,p,113094,165249,65471550,74812843,-46.051702,4103.299726,0.782034
72,fig,261836,378927,65471550,74812843,-46.051702,8708.58064,0.789583
64,°,71831,93395,65471550,74812843,-46.051702,679.751598,0.878847
48,table,106141,130461,65471550,74812843,-46.051702,311.476383,0.929666
