# Comparative Linguistic Analysis of bioRxiv and PMC

In [1]:
from collections import defaultdict, Counter
import csv
from pathlib import Path

import numpy as np
import pandas as pd
import pickle
import spacy
from scipy.stats import chi2_contingency
from tqdm import tqdm_notebook

In [2]:
def get_term_statistics(corpus_one, corpus_two, term, psudeocount=1, eps=1e-20):
    """
    Compare how often a term occurs in two corpora.

    This function is designed to perform the following calculations on a
    2x2 table whose first row holds the term's count in each corpus and whose
    second row holds each corpus' total count:
        - chi square contingency test (without continuity correction)
          - log pvalue + an epsilon (1e-20 by default)
        - log likelihood of the contingency table
        - odds ratio (the raw ratio; no logarithm is applied)

    keywords:
        corpus_one - a Counter object with terms as keys and count as values
        corpus_two - a Counter object with terms as keys and count as values
        term - the word of interest
        psudeocount - value added to every cell before the log likelihood and
            the odds ratio are computed; may be fractional (e.g. 0.5)
        eps - value added to the p-value before its log is taken, so that a
            p-value of 0 gives log(eps) instead of -inf

    returns:
        a dictionary with the keys
            "chi_sq" - tuple of (test statistic, log(p-value + eps), degrees
                of freedom, observed table without the pseudocount, expected
                frequencies)
            "log_likelihood" - log likelihood statistic of the smoothed table
            "odds_ratio" - (a*d)/(b*c) of the smoothed table

    NOTE(review): the second row is the full corpus total (the term itself
    included) rather than the count of all other terms -- confirm that this
    is the intended table layout.
    NOTE(review): chi2_contingency is expected to raise ValueError when an
    expected frequency is zero (e.g. the term is absent from both corpora)
    -- verify against the scipy documentation.
    """
    observed_contingency_table = np.array([
        [corpus_one[term], corpus_two[term]],
        [sum(corpus_one.values()), sum(corpus_two.values())]
    ])

    # Chi Squared Test
    (chi_test_stat, p_val, dof, exp) = chi2_contingency(
        observed_contingency_table,
        correction=False
    )

    # Log Likelihood

    ## add psudeocount to prevent log(0)
    ## The smoothed table is a float copy: an in-place add on the integer
    ## table rejects fractional pseudocounts, and integer cells can overflow
    ## in the products used for the odds ratio.
    smoothed_table = observed_contingency_table.astype(float) + psudeocount
    a, b, c, d = smoothed_table.ravel()

    # Obtained from (Kilgarriff, 2001) - Comparing Corpora
    log_likelihood = 2*(
        a*np.log(a) + b*np.log(b) + c*np.log(c) + d*np.log(d)
        - (a+b)*np.log(a+b) - (a+c)*np.log(a+c) - (b+d)*np.log(b+d)
        - (c+d)*np.log(c+d) + (a+b+c+d)*np.log(a+b+c+d)
    )

    # Odds ratio
    odds_ratio = float((a*d)/(b*c))

    return {
        "chi_sq": (
            chi_test_stat, np.log(p_val+eps), dof,
            # the untouched integer table, i.e. the counts without pseudocount
            observed_contingency_table, exp
        ),
        "log_likelihood":log_likelihood,
        "odds_ratio":odds_ratio
    }

In [3]:
def aggregate_word_counts(doc_iterator):
    """
    Sum per-document lemma counts into a single Counter.

    Each file is read as a tab-separated table with a header row that has at
    least the columns `lemma` and `count`. Counts are accumulated row by row,
    so a lemma that appears on several rows of one file contributes the sum
    of those rows (collecting a file into a dict first would keep only the
    last row for that lemma).

    keywords:
        doc_iterator - an iterable of paths to tab-separated word count files

    returns:
        a Counter mapping each lemma to its total count over all files
    """
    global_word_counter = Counter()

    for doc in tqdm_notebook(doc_iterator):
        # newline="" leaves line-ending handling to the csv module
        with open(doc, "r", newline="") as tsvfile:
            reader = csv.DictReader(tsvfile, delimiter="\t")
            for row in reader:
                global_word_counter[row['lemma']] += int(row['count'])

    return global_word_counter

In [4]:
def remove_stop_words(corpus_one, corpus_two):
    """
    Delete stop words from two word-count corpora.

    The stop words are spaCy's English defaults (taken from the
    `en_core_web_sm` model) plus a few whitespace-only tokens. Both corpora
    are modified in place, and the same two objects are returned.

    keywords:
        corpus_one - a Counter object with terms as keys and count as values
        corpus_two - a Counter object with terms as keys and count as values
    """
    nlp = spacy.load('en_core_web_sm')
    whitespace_tokens = ['  ', '\t\t\t\t', '\u2009', ' ']
    stop_word_list = [*nlp.Defaults.stop_words, *whitespace_tokens]

    for stopword in tqdm_notebook(stop_word_list):
        for corpus in (corpus_one, corpus_two):
            # pop with a default is a no-op when the stop word is absent
            corpus.pop(stopword, None)

    return corpus_one, corpus_two

# Full Text Comparison (Global)

## Gather Word Frequencies

In [4]:
# Gather every per-document word count file below the bioRxiv output folder
# and add them up into one corpus-wide Counter.
biorxiv_count_files = list(Path("output/biorxiv_word_counts").rglob("*tsv"))
biorxiv_corpus_count = aggregate_word_counts(biorxiv_count_files)

HBox(children=(IntProgress(value=0, max=71118), HTML(value='')))




In [5]:
# Gather every per-document word count file below the PMC output folder
# and add them up into one corpus-wide Counter.
pmc_count_files = list(Path("output/pmc_word_counts").rglob("*tsv"))
pmc_corpus_count = aggregate_word_counts(pmc_count_files)

HBox(children=(IntProgress(value=0, max=1977647), HTML(value='')))




In [8]:
# Ten most frequent lemmas in the bioRxiv corpus.
biorxiv_corpus_count.most_common(n=10)

[('the', 22645305),
 ('of', 14639481),
 ('be', 12811427),
 ('and', 11981224),
 ('in', 10135406),
 ('to', 8146337),
 ('a', 6603914),
 ('for', 4530456),
 ('with', 3974186),
 ('that', 3571258)]

In [9]:
# Ten most frequent lemmas in the PMC corpus.
pmc_corpus_count.most_common(n=10)

[('the', 455469538),
 ('of', 305684946),
 ('be', 286477836),
 ('and', 258669662),
 ('in', 209068921),
 ('to', 154692995),
 ('a', 119114407),
 ('with', 87644270),
 ('for', 86185660),
 ('that', 61166926)]

In [6]:
# Cache both corpus-wide counters on disk. The context managers close (and
# thereby flush) each file as soon as its pickle has been written.
with open("output/biorxiv_total_count.pkl", "wb") as biorxiv_pickle:
    pickle.dump(biorxiv_corpus_count, biorxiv_pickle)

with open("output/pmc_total_count.pkl", "wb") as pmc_pickle:
    pickle.dump(pmc_corpus_count, pmc_pickle)

## Analysis without Stop Words

In [5]:
# Reload the cached corpus-wide counters. The context managers close each
# file once it has been read. Only unpickle files you wrote yourself:
# pickle.load can execute arbitrary code from an untrusted file.
with open("output/biorxiv_total_count.pkl", "rb") as biorxiv_pickle:
    biorxiv_corpus_count = pickle.load(biorxiv_pickle)

with open("output/pmc_total_count.pkl", "rb") as pmc_pickle:
    pmc_corpus_count = pickle.load(pmc_pickle)

In [6]:
# remove_stop_words deletes the stop words in place and hands back the same
# two counters, so from here on both names refer to the filtered counts.
(biorxiv_corpus_count, pmc_corpus_count) = remove_stop_words(
    corpus_one=biorxiv_corpus_count,
    corpus_two=pmc_corpus_count,
)

HBox(children=(IntProgress(value=0, max=330), HTML(value='')))




In [7]:
# Despite its name, top_ten_biorxiv holds the 100 most frequent lemmas;
# only the first ten of them are displayed here.
top_ten_biorxiv = biorxiv_corpus_count.most_common(n=100)
top_ten_biorxiv[:10]

[('cell', 2244256),
 ('use', 2206407),
 ('et', 1762805),
 ('al', 1754536),
 ('gene', 1347906),
 ('model', 1056802),
 ('fig', 1048216),
 ('figure', 987374),
 ('1', 946363),
 ('datum', 905227)]

In [8]:
# Despite its name, top_ten_pmc holds the 100 most frequent lemmas;
# only the first ten of them are displayed here.
top_ten_pmc = pmc_corpus_count.most_common(n=100)
top_ten_pmc[:10]

[('use', 41761817),
 ('cell', 38244783),
 ('study', 30963261),
 ('patient', 22691935),
 ('1', 20819358),
 ('result', 18720685),
 ('et', 18125240),
 ('group', 17766474),
 ('al', 17456991),
 ('high', 17388204)]

In [9]:
print("Number of words in biorxiv but not in Pubmed Central:")
# Dict key views support set difference directly and return a plain set,
# so no intermediate list/set copies of the two vocabularies are needed.
biorxiv_difference = biorxiv_corpus_count.keys() - pmc_corpus_count.keys()
print(len(biorxiv_difference))

Number of words in biorxiv but not in Pubmed Central:
1096878


In [10]:
# Peek at up to ten arbitrary members without removing them from the set,
# so the cell can be re-run and does not fail when fewer than ten exist.
[
    word
    for position, word in zip(range(10), biorxiv_difference)
]

['f4,70=0.22',
 'p(cac',
 '8821×',
 '5awl',
 'n+2)(n+1)]/2',
 'hbb2-la',
 'cytichrome',
 'https://tinyurl.com',
 'ubccreert2/+:r26smom2/+',
 'e1.2ca']

In [11]:
print("Number of words in Pubmed Central but not in biorxiv:")
# Dict key views support set difference directly and return a plain set,
# so no intermediate list/set copies of the two vocabularies are needed.
pmc_difference = pmc_corpus_count.keys() - biorxiv_corpus_count.keys()
print(len(pmc_difference))

Number of words in Pubmed Central but not in biorxiv:
99591968


In [12]:
# Peek at up to ten arbitrary members without removing them from the set,
# so the cell can be re-run and does not fail when fewer than ten exist.
[
    word
    for position, word in zip(range(10), pmc_difference)
]

['matrix:(10)k=(kij)i',
 'min(cr',
 'd.fmultiple',
 '60.1)diabetes',
 'smokingsalt',
 '470.2699.2-((4-fluorophenyl)amino)-2-oxoethyl(4r,4as,6ar,9s,11ar,11bs)-4,9,11b',
 'product.**all',
 "5'-gcaagcttagaaacgatagcc-3",
 'tosurgery',
 'β2ftypyvqvkipgpgatyviwac']

In [13]:
# Score every lemma that is among the most frequent lemmas of either corpus.
total_words = set(
    [lemma for lemma, count in top_ten_biorxiv]
    + [lemma for lemma, count in top_ten_pmc]
)

data = []
for word in tqdm_notebook(total_words):
    word_stat = get_term_statistics(biorxiv_corpus_count, pmc_corpus_count, word)

    # chi_sq is (statistic, log p-value, dof, observed table, expected table)
    chi_sq_result = word_stat['chi_sq']
    observed_table = chi_sq_result[3]

    data.append({
        "lemma": word,
        "biorxiv_count": biorxiv_corpus_count.get(word, 0),
        "pmc_count": pmc_corpus_count.get(word, 0),
        # second row of the observed table holds the corpus totals
        "biorxiv_total": observed_table[1,0],
        "pmc_total": observed_table[1,1],
        "log_p": chi_sq_result[1],
        "log_likelihood": word_stat['log_likelihood'],
        "odds_ratio": word_stat['odds_ratio']
    })

HBox(children=(IntProgress(value=0, max=124), HTML(value='')))




In [14]:
# One row per lemma; save as a tab-separated file and preview the first rows.
total_word_stats_df = pd.DataFrame.from_records(data)
total_word_stats_df.to_csv(
    "output/full_corpus_comparison_stats.tsv", index=False, sep="\t"
)
total_word_stats_df.head(5)

Unnamed: 0,lemma,biorxiv_count,pmc_count,biorxiv_total,pmc_total,log_p,log_likelihood,odds_ratio
0,network,327163,2817271,262487093,5343153698,-46.051702,173460.80954,2.363886
1,mm,238892,5766104,262487093,5343153698,-46.051702,7011.558929,0.843355
2,provide,313503,5998002,262487093,5343153698,-46.051702,1122.893677,1.063962
3,time,741727,13700113,262487093,5343153698,-46.051702,6440.151367,1.102073
4,al,1754536,17456991,262487093,5343153698,-46.051702,671885.110535,2.045893


In [15]:
# The 20 rows with the largest log likelihood.
total_word_stats_df.sort_values(by="log_likelihood", ascending=False).iloc[:20]

Unnamed: 0,lemma,biorxiv_count,pmc_count,biorxiv_total,pmc_total,log_p,log_likelihood,odds_ratio
67,patient,213034,22691935,262487093,5343153698,-46.051702,1060795.0,0.191104
4,al,1754536,17456991,262487093,5343153698,-46.051702,671885.1,2.045893
87,et,1762805,18125240,262487093,5343153698,-46.051702,620939.0,1.97975
88,gene,1347906,14688675,262487093,5343153698,-46.051702,405305.2,1.867957
105,±,146921,11016755,262487093,5343153698,-46.051702,391258.1,0.271471
48,study,829495,30963261,262487093,5343153698,-46.051702,360682.8,0.545327
62,health,53602,6392583,262487093,5343153698,-46.051702,320851.8,0.170688
37,genome,397888,2713361,262487093,5343153698,-46.051702,316957.3,2.984997
122,model,1056802,11568141,262487093,5343153698,-46.051702,313918.1,1.859603
21,neuron,305653,1968004,262487093,5343153698,-46.051702,265066.6,3.161503


In [16]:
# The 20 rows with the largest odds ratio.
total_word_stats_df.sort_values(by="odds_ratio", ascending=False).iloc[:20]

Unnamed: 0,lemma,biorxiv_count,pmc_count,biorxiv_total,pmc_total,log_p,log_likelihood,odds_ratio
21,neuron,305653,1968004,262487093,5343153698,-46.051702,265066.560577,3.161503
37,genome,397888,2713361,262487093,5343153698,-46.051702,316957.302246,2.984997
0,network,327163,2817271,262487093,5343153698,-46.051702,173460.80954,2.363886
4,al,1754536,17456991,262487093,5343153698,-46.051702,671885.110535,2.045893
87,et,1762805,18125240,262487093,5343153698,-46.051702,620939.022369,1.97975
71,single,357273,3825621,262487093,5343153698,-46.051702,113198.720154,1.90103
88,gene,1347906,14688675,262487093,5343153698,-46.051702,405305.237457,1.867957
72,rna,305858,3333698,262487093,5343153698,-46.051702,92167.882568,1.867603
122,model,1056802,11568141,262487093,5343153698,-46.051702,313918.083496,1.859603
63,specie,384720,4254993,262487093,5343153698,-46.051702,111070.702393,1.840504


# Preprint to Published View

In [17]:
# Keep only rows that have both a published DOI and a PMC id, then collapse
# to a single row per preprint DOI: the first value of the descriptive
# columns and the last value of the remaining ones.
mapped_doi_df = (
    pd.read_csv("../journal_tracker/output/mapped_published_doi.tsv", sep="\t")
    .dropna(subset=["published_doi", "pmcid"])
    .groupby("doi")
    .agg({
        **dict.fromkeys(["author_type", "heading", "category"], "first"),
        **dict.fromkeys(
            ["document", "doi", "published_doi", "journal", "pmcid"], "last"
        ),
    })
    .reset_index(drop=True)
)
mapped_doi_df.tail()

Unnamed: 0,author_type,heading,category,document,doi,published_doi,journal,pmcid
17115,regular article,new results,animal behavior and cognition,852350_v1.xml,10.1101/852350,10.1371/journal.pone.0226774,PLOS ONE,PMC6961851
17116,regular article,new results,pathology,856542_v1.xml,10.1101/856542,10.1038/s41598-019-57046-x,Scientific Reports,PMC6969030
17117,regular article,new results,neuroscience,858100_v1.xml,10.1101/858100,10.3389/fnsyn.2019.00035,Frontiers in Synaptic Neuroscience,PMC6932971
17118,regular article,new results,genomics,862847_v1.xml,10.1101/862847,10.1186/s12864-019-6379-5,BMC Genomics,PMC6933653
17119,regular article,new results,bioinformatics,867903_v1.xml,10.1101/867903,10.1186/s13059-019-1915-9,Genome Biology,PMC6927177


In [18]:
# Word counts of a single preprint.
preprint_tsv = Path("output/biorxiv_word_counts/862847_v1.tsv")
preprint_count = aggregate_word_counts([preprint_tsv])

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))




In [19]:
# Word counts of a single PMC article.
published_tsv = Path("output/pmc_word_counts/PMC6933653.tsv")
published_count = aggregate_word_counts([published_tsv])

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))




In [20]:
# remove_stop_words deletes the stop words in place and hands back the same
# two counters, so from here on both names refer to the filtered counts.
(preprint_count, published_count) = remove_stop_words(
    corpus_one=preprint_count,
    corpus_two=published_count,
)

HBox(children=(IntProgress(value=0, max=330), HTML(value='')))




In [21]:
# Despite its name, top_ten_preprint holds the 100 most frequent lemmas;
# only the first ten of them are displayed here.
top_ten_preprint = preprint_count.most_common(n=100)
top_ten_preprint[:10]

[('gene', 118),
 ('cell', 117),
 ('ctcf', 96),
 ('expression', 59),
 ('variation', 55),
 ('cellular', 32),
 ('knockdown', 31),
 ('kd', 29),
 ('level', 24),
 ('use', 24)]

In [22]:
# Despite its name, top_ten_published holds the 100 most frequent lemmas;
# only the first ten of them are displayed here.
top_ten_published = published_count.most_common(n=100)
top_ten_published[:10]

[('cell', 144),
 ('gene', 138),
 ('ctcf', 116),
 ('expression', 71),
 ('variation', 59),
 ('cellular', 37),
 ('kd', 37),
 ('knockdown', 35),
 ('wt', 30),
 ('single', 27)]

In [23]:
print("Number of words in preprint but not in published version:")
# Dict key views support set difference directly and return a plain set,
# so no intermediate list/set copies of the two vocabularies are needed.
preprint_difference = preprint_count.keys() - published_count.keys()
print(len(preprint_difference))

Number of words in preprint but not in published version:
44


In [24]:
# Peek at up to ten arbitrary members without removing them from the set,
# so the cell can be re-run and does not fail when fewer than ten exist.
[
    word
    for position, word in zip(range(10), preprint_difference)
]

['likely',
 '0.05',
 '13–16',
 'r2=0.87',
 '1,490',
 '2c',
 '40µg',
 '0.16×10−6',
 '729',
 'r2=0.86']

In [25]:
print("Number of words in published version but not in preprint:")
# Dict key views support set difference directly and return a plain set,
# so no intermediate list/set copies of the two vocabularies are needed.
published_difference = published_count.keys() - preprint_count.keys()
print(len(published_difference))

Number of words in published version but not in preprint:
109


In [26]:
# Peek at up to ten arbitrary members without removing them from the set,
# so the cell can be re-run and does not fail when fewer than ten exist.
[
    word
    for position, word in zip(range(10), published_difference)
]

['shrnaknockdown',
 '\\setlength{\\oddsidemargin}{-69pt',
 'systemin',
 'unclear',
 'c{v}_{kd}-c{v}_{wt',
 '0.8',
 'sd',
 '\\usepackage{upgreek',
 'comparison',
 'cultureel4']

In [27]:
# Score every lemma that is among the most frequent lemmas of either document.
total_words = set(
    [lemma for lemma, count in top_ten_preprint]
    + [lemma for lemma, count in top_ten_published]
)

data = []
for word in tqdm_notebook(total_words):
    word_stat = get_term_statistics(preprint_count, published_count, word)

    # chi_sq is (statistic, log p-value, dof, observed table, expected table)
    chi_sq_result = word_stat['chi_sq']
    observed_table = chi_sq_result[3]

    data.append({
        "lemma": word,
        "preprint_count": preprint_count.get(word, 0),
        "published_count": published_count.get(word, 0),
        # second row of the observed table holds the document totals
        "preprint_total": observed_table[1,0],
        "published_total": observed_table[1,1],
        "log_p": chi_sq_result[1],
        "log_likelihood": word_stat['log_likelihood'],
        "odds_ratio": word_stat['odds_ratio']
    })

HBox(children=(IntProgress(value=0, max=116), HTML(value='')))




In [28]:
# One row per lemma; save as a tab-separated file and preview the first rows.
# NOTE(review): the file name mentions 544536_v2 / PMC6687187, whereas the
# counts compared in this section were read from 862847_v1.tsv and
# PMC6933653.tsv -- confirm which pair the output file should be named after.
published_comparison_stats_df = pd.DataFrame.from_records(data)
published_comparison_stats_df.to_csv(
    "output/544536_v2_PMC6687187_comparison.tsv", index=False, sep="\t"
)
published_comparison_stats_df.head(5)

Unnamed: 0,lemma,preprint_count,published_count,preprint_total,published_total,log_p,log_likelihood,odds_ratio
0,landscape,5,7,2190,2670,-0.206573,0.027547,0.914309
1,zinc,12,13,2190,2670,-0.263482,0.102826,1.132001
2,4,4,4,2190,2670,-0.249612,0.097742,1.219078
3,enrichment,5,6,2190,2670,-0.021087,0.006215,1.044924
4,western,6,6,2190,2670,-0.312917,0.136726,1.219078


In [29]:
# The 20 rows with the largest log likelihood.
published_comparison_stats_df.sort_values(by="log_likelihood", ascending=False).iloc[:20]

Unnamed: 0,lemma,preprint_count,published_count,preprint_total,published_total,log_p,log_likelihood,odds_ratio
29,file,0,11,2190,2670,-5.917748,8.899627,0.10159
31,figure,0,11,2190,2670,-5.917748,8.899627,0.10159
84,additional,0,10,2190,2670,-5.468921,7.87195,0.110825
96,17,7,1,2190,2670,-4.13855,5.13129,4.876312
36,p,1,9,2190,2670,-3.650212,4.345213,0.243816
28,10−,0,6,2190,2670,-3.626721,3.946189,0.174154
17,fig,23,17,2190,2670,-2.159152,2.438819,1.625437
103,plot,1,6,2190,2670,-2.283293,2.036108,0.348308
114,1,12,24,2190,2670,-1.840554,1.836137,0.633921
67,18,2,7,2190,2670,-1.778814,1.472369,0.457154


In [30]:
# The 20 rows with the largest odds ratio.
published_comparison_stats_df.sort_values(by="odds_ratio", ascending=False).iloc[:20]

Unnamed: 0,lemma,preprint_count,published_count,preprint_total,published_total,log_p,log_likelihood,odds_ratio
96,17,7,1,2190,2670,-4.13855,5.13129,4.876312
17,fig,23,17,2190,2670,-2.159152,2.438819,1.625437
38,quality,5,4,2190,2670,-0.639962,0.395966,1.462894
65,different,6,5,2190,2670,-0.639113,0.401341,1.422258
77,category,6,5,2190,2670,-0.639113,0.401341,1.422258
68,test,8,7,2190,2670,-0.653296,0.421972,1.371463
45,control,9,8,2190,2670,-0.664468,0.435133,1.354531
58,interaction,4,4,2190,2670,-0.249612,0.097742,1.219078
85,play,5,5,2190,2670,-0.282524,0.117242,1.219078
100,find,4,4,2190,2670,-0.249612,0.097742,1.219078
