# Comparative Linguistic Analysis of bioRxiv and PMC

In [1]:
%load_ext autoreload
%autoreload 2

from collections import defaultdict, Counter
import csv
import itertools
from pathlib import Path

import numpy as np
import pandas as pd
import pickle
import spacy
from scipy.stats import chi2_contingency
from tqdm import tqdm_notebook

from annorxiver_modules.corpora_comparison_helper import(
    aggregate_word_counts,
    dump_to_dataframe,
    get_term_statistics,
    KL_divergence
)

# Full Text Comparison (Global)

## Gather Word Frequencies

In [2]:
# Locations of the cached corpus-wide token count tables (one TSV per corpus).
total_count_dir = Path("output/total_word_counts")
biorxiv_count_path = total_count_dir / "biorxiv_total_count.tsv"
pmc_count_path = total_count_dir / "pmc_total_count.tsv"
nytac_count_path = total_count_dir / "nytac_total_count.tsv"

In [3]:
# Build the corpus-wide bioRxiv token counts only when the cached table is missing.
if not biorxiv_count_path.exists():
    biorxiv_corpus_count = aggregate_word_counts(
        list(Path("output/biorxiv_word_counts").rglob("*tsv"))
    )
    # Write to the same path that the guard checks and that is read back later;
    # dumping to a different location would leave the cache permanently missing.
    biorxiv_count_path.parent.mkdir(parents=True, exist_ok=True)
    dump_to_dataframe(biorxiv_corpus_count, str(biorxiv_count_path))
    # A bare expression inside an `if` body is not rendered by the notebook,
    # so print the preview explicitly.
    print(biorxiv_corpus_count.most_common(10))

In [4]:
# Build the corpus-wide PMC token counts only when the cached table is missing.
if not pmc_count_path.exists():
    pmc_corpus_count = aggregate_word_counts(
        list(Path("../../pmc/pmc_corpus/pmc_word_counts").rglob("*tsv"))
    )
    # Write to the same path that the guard checks and that is read back later;
    # dumping to a different location would leave the cache permanently missing.
    pmc_count_path.parent.mkdir(parents=True, exist_ok=True)
    dump_to_dataframe(pmc_corpus_count, str(pmc_count_path))
    # A bare expression inside an `if` body is not rendered by the notebook,
    # so print the preview explicitly.
    print(pmc_corpus_count.most_common(10))

In [5]:
# Build the corpus-wide NYTAC token counts only when the cached table is missing.
if not nytac_count_path.exists():
    nytac_corpus_count = aggregate_word_counts(
        list(Path("../../nytac/corpora_stats/output").rglob("*tsv"))
    )
    # Write to the same path that the guard checks and that is read back later;
    # dumping to a different location would leave the cache permanently missing.
    nytac_count_path.parent.mkdir(parents=True, exist_ok=True)
    dump_to_dataframe(nytac_corpus_count, str(nytac_count_path))
    # A bare expression inside an `if` body is not rendered by the notebook,
    # so print the preview explicitly.
    print(nytac_corpus_count.most_common(10))

In [6]:
# Load the cached, tab-separated token count table of each corpus.
# NOTE(review): with pandas' default NA handling, literal tokens such as
# "null" or "nan" may be parsed as missing values -- confirm this is acceptable.
biorxiv_total_count_df = pd.read_csv(biorxiv_count_path.resolve(), sep="\t")
pmc_total_count_df = pd.read_csv(pmc_count_path.resolve(), sep="\t")
nytac_total_count_df = pd.read_csv(nytac_count_path.resolve(), sep="\t")

In [7]:
# Load the per-document sentence lengths of each corpus.
# Each file is opened in a `with` block so the handle is closed after loading.
# NOTE(review): pickle can execute arbitrary code on load; these files are
# presumably produced by this project's own pipeline -- do not point these
# paths at untrusted data.
with open("output/biorxiv_sentence_length.pkl", "rb") as biorxiv_infile:
    biorxiv_sentence_length = pickle.load(biorxiv_infile)

with open("../../pmc/pmc_corpus/pmc_sentence_length.pkl", "rb") as pmc_infile:
    pmc_sentence_length = pickle.load(pmc_infile)

with open("../../nytac/corpora_stats/nytac_sentence_length.pkl", "rb") as nytac_infile:
    nytac_sentence_length = pickle.load(nytac_infile)

In [8]:
# Load the small English spaCy model and take its default stop words as a list;
# the list is matched against the `lemma` column of the count tables below.
spacy_nlp = spacy.load("en_core_web_sm")
stop_word_list = [*spacy_nlp.Defaults.stop_words]

## Get Corpora Comparison Stats

In [9]:
# Summary statistics for the bioRxiv corpus, derived from its token count table
# and its per-document sentence lengths.
biorxiv_sentence_len_list = list(biorxiv_sentence_length.items())

# Number of sentences in each document.
biorxiv_sentences_per_doc = [
    len(sentence_lengths)
    for _, sentence_lengths in biorxiv_sentence_len_list
]

biorxiv_token_counts = biorxiv_total_count_df["count"]
biorxiv_token_total = biorxiv_token_counts.sum()

# Token totals per category; each is computed once and reused for the
# matching fraction-of-all-tokens entry.
biorxiv_cc_total = biorxiv_token_counts[
    biorxiv_total_count_df["dep_tag"] == "cc"
].sum()
biorxiv_pronoun_total = biorxiv_token_counts[
    biorxiv_total_count_df["pos_tag"] == "PRON"
].sum()
biorxiv_passive_total = biorxiv_token_counts[
    biorxiv_total_count_df["dep_tag"].isin(["auxpass", "nsubjpass", "csubjpass"])
].sum()

biorxiv_data = {
    "document_count": len(biorxiv_sentence_length),
    "sentence_count": sum(biorxiv_sentences_per_doc),
    "token_count": biorxiv_token_total,
    "stop_word_count": biorxiv_token_counts[
        biorxiv_total_count_df["lemma"].isin(stop_word_list)
    ].sum(),
    # Mean number of sentences per document.
    "avg_document_length": np.mean(biorxiv_sentences_per_doc),
    # Mean over every individual sentence length in the corpus.
    "avg_sentence_length": np.mean(
        list(
            itertools.chain.from_iterable(
                sentence_lengths
                for _, sentence_lengths in biorxiv_sentence_len_list
            )
        )
    ),
    "negatives": biorxiv_token_counts[
        biorxiv_total_count_df["dep_tag"] == "neg"
    ].sum(),
    "coordinating_conjunctions": biorxiv_cc_total,
    "coordinating_conjunctions%": biorxiv_cc_total / biorxiv_token_total,
    "pronouns": biorxiv_pronoun_total,
    "pronouns%": biorxiv_pronoun_total / biorxiv_token_total,
    "passives": biorxiv_passive_total,
    "passive%": biorxiv_passive_total / biorxiv_token_total,
}

In [10]:
# Summary statistics for the PMC corpus, derived from its token count table
# and its per-document sentence lengths.
pmc_sentence_len_list = list(pmc_sentence_length.items())

# Number of sentences in each document.
pmc_sentences_per_doc = [
    len(sentence_lengths)
    for _, sentence_lengths in pmc_sentence_len_list
]

pmc_token_counts = pmc_total_count_df["count"]
pmc_token_total = pmc_token_counts.sum()

# Token totals per category; each is computed once and reused for the
# matching fraction-of-all-tokens entry.
pmc_cc_total = pmc_token_counts[
    pmc_total_count_df["dep_tag"] == "cc"
].sum()
pmc_pronoun_total = pmc_token_counts[
    pmc_total_count_df["pos_tag"] == "PRON"
].sum()
pmc_passive_total = pmc_token_counts[
    pmc_total_count_df["dep_tag"].isin(["auxpass", "nsubjpass", "csubjpass"])
].sum()

pmc_data = {
    "document_count": len(pmc_sentence_length),
    "sentence_count": sum(pmc_sentences_per_doc),
    "token_count": pmc_token_total,
    "stop_word_count": pmc_token_counts[
        pmc_total_count_df["lemma"].isin(stop_word_list)
    ].sum(),
    # Mean number of sentences per document.
    "avg_document_length": np.mean(pmc_sentences_per_doc),
    # Mean over every individual sentence length in the corpus.
    "avg_sentence_length": np.mean(
        list(
            itertools.chain.from_iterable(
                sentence_lengths
                for _, sentence_lengths in pmc_sentence_len_list
            )
        )
    ),
    "negatives": pmc_token_counts[
        pmc_total_count_df["dep_tag"] == "neg"
    ].sum(),
    "coordinating_conjunctions": pmc_cc_total,
    "coordinating_conjunctions%": pmc_cc_total / pmc_token_total,
    "pronouns": pmc_pronoun_total,
    "pronouns%": pmc_pronoun_total / pmc_token_total,
    "passives": pmc_passive_total,
    "passive%": pmc_passive_total / pmc_token_total,
}

In [11]:
# Summary statistics for the NYTAC corpus, derived from its token count table
# and its per-document sentence lengths.
nytac_sentence_len_list = list(nytac_sentence_length.items())

# Number of sentences in each document.
nytac_sentences_per_doc = [
    len(sentence_lengths)
    for _, sentence_lengths in nytac_sentence_len_list
]

nytac_token_counts = nytac_total_count_df["count"]
nytac_token_total = nytac_token_counts.sum()

# Token totals per category; each is computed once and reused for the
# matching fraction-of-all-tokens entry.
nytac_cc_total = nytac_token_counts[
    nytac_total_count_df["dep_tag"] == "cc"
].sum()
nytac_pronoun_total = nytac_token_counts[
    nytac_total_count_df["pos_tag"] == "PRON"
].sum()
nytac_passive_total = nytac_token_counts[
    nytac_total_count_df["dep_tag"].isin(["auxpass", "nsubjpass", "csubjpass"])
].sum()

nytac_data = {
    "document_count": len(nytac_sentence_length),
    "sentence_count": sum(nytac_sentences_per_doc),
    "token_count": nytac_token_total,
    "stop_word_count": nytac_token_counts[
        nytac_total_count_df["lemma"].isin(stop_word_list)
    ].sum(),
    # Mean number of sentences per document.
    "avg_document_length": np.mean(nytac_sentences_per_doc),
    # Mean over every individual sentence length in the corpus.
    "avg_sentence_length": np.mean(
        list(
            itertools.chain.from_iterable(
                sentence_lengths
                for _, sentence_lengths in nytac_sentence_len_list
            )
        )
    ),
    "negatives": nytac_token_counts[
        nytac_total_count_df["dep_tag"] == "neg"
    ].sum(),
    "coordinating_conjunctions": nytac_cc_total,
    "coordinating_conjunctions%": nytac_cc_total / nytac_token_total,
    "pronouns": nytac_pronoun_total,
    "pronouns%": nytac_pronoun_total / nytac_token_total,
    "passives": nytac_passive_total,
    "passive%": nytac_passive_total / nytac_token_total,
}

In [12]:
# Corpus-level statistics table: one row per statistic, one column per corpus.
#   document_count             - number of documents in the corpus
#   sentence_count             - number of sentences in the corpus
#   token_count                - number of tokens in the corpus
#   stop_word_count            - number of stop-word tokens in the corpus
#   avg_document_length        - average number of sentences per document
#   avg_sentence_length        - average number of words per sentence
#   negatives                  - number of negation tokens (e.g. "not")
#   coordinating_conjunctions  - number of coordinating conjunctions (and, but, for, etc.)
#   pronouns                   - number of pronoun tokens
#   passives                   - number of tokens carrying a passive dependency tag
#   coordinating_conjunctions%, pronouns%, passive%
#                              - the matching count divided by token_count

corpus_labels = ["bioRxiv", "PMC", "NYTAC"]
corpus_records = [biorxiv_data, pmc_data, nytac_data]

# Records become rows, so transpose to put the corpora in the columns.
token_stats_df = pd.DataFrame.from_records(
    corpus_records,
    index=corpus_labels,
).transpose()

token_stats_df.to_csv("output/figures/corpora_token_stats.tsv", sep="\t")
token_stats_df

Unnamed: 0,bioRxiv,PMC,NYTAC
document_count,71118.0,1977647.0,1855658.0
sentence_count,22195740.0,480489800.0,72171040.0
token_count,420969900.0,8597101000.0,1218673000.0
stop_word_count,158429400.0,3153077000.0,559391100.0
avg_document_length,312.0973,242.9604,38.89242
avg_sentence_length,22.70775,21.46228,19.89098
negatives,1148382.0,24928800.0,7272401.0
coordinating_conjunctions,14295740.0,307082300.0,38730050.0
coordinating_conjunctions%,0.03395904,0.03571929,0.0317805
pronouns,4604432.0,74994120.0,46712550.0


## LogLikelihood + Odds Ratio + KL Divergence Calculations

The goal here is to compare word frequencies between bioRxiv and PubMed Central (PMC). The problem with comparing raw word frequencies is that non-meaningful words (aka stop words) such as the, of, and, be, etc., appear the most often. To account for this problem, the first step is to remove those words from the analysis.

### Remove Stop words

In [13]:
# Drop stop-word lemmas, then collapse the remaining rows to a single total
# per lemma, ordered from most to least frequent.
biorxiv_is_stop_word = biorxiv_total_count_df["lemma"].isin(stop_word_list)
biorxiv_total_count_df = (
    biorxiv_total_count_df[~biorxiv_is_stop_word]
    .groupby("lemma")[["count"]]
    .sum()
    .reset_index()
    .sort_values("count", ascending=False)
)
biorxiv_total_count_df

Unnamed: 0,lemma,count
1192554,et,1762717
632487,al,1754311
885798,cells,1281939
1032848,data,1054600
1265968,fig,1031811
...,...,...
1541922,"i,3a",1
1541921,"i,3",1
1541919,"i,14,i",1
1541918,"i,1016",1


In [14]:
# Drop stop-word lemmas, then collapse the remaining rows to a single total
# per lemma, ordered from most to least frequent.
pmc_is_stop_word = pmc_total_count_df["lemma"].isin(stop_word_list)
pmc_total_count_df = (
    pmc_total_count_df[~pmc_is_stop_word]
    .groupby("lemma")[["count"]]
    .sum()
    .reset_index()
    .sort_values("count", ascending=False)
    # Skips the two most frequent remaining lemmas.
    # NOTE(review): the reason is not documented -- presumably they are
    # artefact tokens; confirm. Because this cell reassigns its own input,
    # re-running it drops two further rows each time.
    .iloc[2:]
)
pmc_total_count_df

Unnamed: 0,lemma,count
44829210,cells,23853661
6832340,1,20800303
90301906,study,20300433
78312111,patients,18263142
3,\t\t\t\t,18199324
...,...,...
36210476,a2.6±0.4,1
36210474,a2.6±0.1n,1
36210472,a2.6±0.08,1
36210471,a2.6±0,1


In [15]:
# Drop stop-word lemmas, then collapse the remaining rows to a single total
# per lemma, ordered from most to least frequent.
nytac_is_stop_word = nytac_total_count_df["lemma"].isin(stop_word_list)
nytac_total_count_df = (
    nytac_total_count_df[~nytac_is_stop_word]
    .groupby("lemma")[["count"]]
    .sum()
    .reset_index()
    .sort_values("count", ascending=False)
)
nytac_total_count_df

Unnamed: 0,lemma,count
1780479,said,7731500
1555930,mr.,5817945
1585217,new,4031910
2,,3356883
3685,--,2835532
...,...,...
1224054,gaietes,1
226116,17:18.5,1
226115,17:18.3,1
1224058,gaieté,1


### Calculate LogLikelihoods and Odds ratios

In [16]:
# Term-level comparison statistics (log likelihood, odds ratio) for bioRxiv vs PMC.
# NOTE(review): the third argument (100) is presumably the number of top terms
# taken from each corpus -- confirm against get_term_statistics.
biorxiv_vs_pmc = get_term_statistics(biorxiv_total_count_df, pmc_total_count_df, 100)

# Persist the comparison table without the row index.
biorxiv_vs_pmc.to_csv(
    "output/comparison_stats/biorxiv_vs_pmc_comparison.tsv",
    sep="\t",
    index=False,
)

biorxiv_vs_pmc

HBox(children=(IntProgress(value=0, max=126), HTML(value='')))




Unnamed: 0,term,corpus_one_a,corpua_two_b,corpus_one_c,corpus_two_d,log_likelihood,odds_ratio
0,’,345652,4837734,262494660,5365015381,41488.085876,1.460319
1,–,211069,7614840,262494660,5365015381,79745.759155,0.566519
2,growth,240308,3933709,262494660,5365015381,10453.263824,1.248580
3,risk,94155,5338328,262494660,5365015381,136800.666016,0.360486
4,rna,280074,3073361,262494660,5365015381,83766.772949,1.862558
...,...,...,...,...,...,...,...
121,regions,276621,2613208,262494660,5365015381,121417.507721,2.163524
122,non,362862,6145600,262494660,5365015381,11440.261810,1.206779
123,changes,253264,4224365,262494660,5365015381,9290.785034,1.225357
124,shown,393296,8110411,262494660,5365015381,29.864899,0.991122


In [17]:
# Term-level comparison statistics (log likelihood, odds ratio) for bioRxiv vs NYTAC.
# NOTE(review): the third argument (100) is presumably the number of top terms
# taken from each corpus -- confirm against get_term_statistics.
biorxiv_vs_nytac = get_term_statistics(biorxiv_total_count_df, nytac_total_count_df, 100)

# Persist the comparison table without the row index.
biorxiv_vs_nytac.to_csv(
    "output/comparison_stats/biorxiv_nytac_comparison.tsv",
    sep="\t",
    index=False,
)

biorxiv_vs_nytac

HBox(children=(IntProgress(value=0, max=192), HTML(value='')))




Unnamed: 0,term,corpus_one_a,corpua_two_b,corpus_one_c,corpus_two_d,log_likelihood,odds_ratio
0,house,11025,802719,262494660,659277495,448757.653778,0.034496
1,yesterday,65,816741,262494660,659277495,545985.714767,0.000200
2,best,82201,501641,262494660,659277495,68168.151474,0.411559
3,’,345652,1070,262494660,659277495,854209.302498,811.340349
4,net,22817,504167,262494660,659277495,207272.925217,0.113666
...,...,...,...,...,...,...,...
187,director,271,548286,262494660,659277495,363403.238731,0.001241
188,like,148698,1809044,262494660,659277495,532894.593819,0.206445
189,end,120660,527968,262494660,659277495,33784.450325,0.573989
190,shown,393296,91152,262494660,659277495,580266.632233,10.836801


In [18]:
# Term-level comparison statistics (log likelihood, odds ratio) for PMC vs NYTAC.
# NOTE(review): the third argument (100) is presumably the number of top terms
# taken from each corpus -- confirm against get_term_statistics.
pmc_vs_nytac = get_term_statistics(pmc_total_count_df, nytac_total_count_df, 100)

# Persist the comparison table without the row index.
pmc_vs_nytac.to_csv(
    "output/comparison_stats/pmc_nytac_comparison.tsv",
    sep="\t",
    index=False,
)

pmc_vs_nytac

HBox(children=(IntProgress(value=0, max=190), HTML(value='')))




Unnamed: 0,term,corpus_one_a,corpua_two_b,corpus_one_c,corpus_two_d,log_likelihood,odds_ratio
0,house,236735,802719,5365015381,659277495,2.490533e+06,0.036241
1,yesterday,3190,816741,5365015381,659277495,3.572001e+06,0.000480
2,best,1323985,501641,5365015381,659277495,3.796544e+05,0.324330
3,’,4837734,1070,5365015381,659277495,1.105503e+06,555.591365
4,–,7614840,1,5365015381,659277495,1.763922e+06,935746.178437
...,...,...,...,...,...,...,...
185,director,35553,548286,5365015381,659277495,2.166017e+06,0.007968
186,like,2545436,1809044,5365015381,659277495,2.680255e+06,0.172906
187,end,2065223,527968,5365015381,659277495,1.938481e+05,0.480681
188,shown,8110411,91152,5365015381,659277495,1.280659e+06,10.933871


## Calculate KL Divergence

In [19]:
# Numbers of terms at which KL divergence is evaluated.
term_grid = [100, 200, 300, 400, 500, 1000, 1500, 2000, 3000, 5000]

# Corpus pairs to compare, in the order their rows are recorded.
corpus_pairs = [
    ("biorxiv_vs_pmc", biorxiv_total_count_df, pmc_total_count_df),
    ("biorxiv_vs_nytac", biorxiv_total_count_df, nytac_total_count_df),
    ("pmc_vs_nytac", pmc_total_count_df, nytac_total_count_df),
]

# One record per (term count, corpus pair).
kl_data = []
for num_terms in tqdm_notebook(term_grid):
    for comparison_label, first_corpus_df, second_corpus_df in corpus_pairs:
        divergence = KL_divergence(
            first_corpus_df,
            second_corpus_df,
            num_terms=num_terms,
        )
        kl_data.append({
            "num_terms": num_terms,
            "KL_divergence": divergence,
            "comparison": comparison_label,
        })

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




In [20]:
# Tabulate the KL divergence records and save them without the row index.
kl_metrics = pd.DataFrame.from_records(kl_data)
kl_metrics.to_csv("output/comparison_stats/corpora_kl_divergence.tsv", sep="\t", index=False)
kl_metrics

Unnamed: 0,num_terms,KL_divergence,comparison
0,100,0.02343,biorxiv_vs_pmc
1,100,0.473538,biorxiv_vs_nytac
2,100,0.29421,pmc_vs_nytac
3,200,0.037528,biorxiv_vs_pmc
4,200,0.639116,biorxiv_vs_nytac
5,200,0.459137,pmc_vs_nytac
6,300,0.047375,biorxiv_vs_pmc
7,300,0.77364,biorxiv_vs_nytac
8,300,0.541992,pmc_vs_nytac
9,400,0.178005,biorxiv_vs_pmc


# Preprint to Published View

In [21]:
# Preprint-to-publication mapping table produced by the journal tracker.
published_map_df = pd.read_csv(
    "../journal_tracker/output/mapped_published_doi.tsv",
    sep="\t",
)

# For each preprint DOI: the first value of the preprint metadata columns and
# the last value of the publication columns.
mapped_doi_aggregation = {
    "author_type": "first",
    "heading": "first",
    "category": "first",
    "document": "first",
    "doi": "last",
    "published_doi": "last",
    "journal": "last",
    "pmcid": "last",
}

# Keep only rows that have both a published DOI and a PMC id, then reduce to
# one row per preprint DOI.
mapped_doi_df = (
    published_map_df
    .dropna(subset=["published_doi", "pmcid"])
    .groupby("doi")
    .agg(mapped_doi_aggregation)
    .reset_index(drop=True)
)
mapped_doi_df.tail()

Unnamed: 0,author_type,heading,category,document,doi,published_doi,journal,pmcid
23266,regular article,new results,neuroscience,866855_v1.xml,10.1101/866855,10.1162/netn_a_00122,Network Neuroscience,PMC7069064
23267,regular article,new results,bioinformatics,867903_v1.xml,10.1101/867903,10.1186/s13059-019-1915-9,Genome Biology,PMC6927177
23268,regular article,new results,genomics,869339_v1.xml,10.1101/869339,10.3390/genes11010027,Genes,PMC7017358
23269,regular article,new results,neuroscience,870345_v1.xml,10.1101/870345,10.1152/jn.00399.2019,Journal of Neurophysiology,PMC7099478
23270,regular article,new results,physiology,872127_v1.xml,10.1101/872127,10.1152/ajpcell.00568.2019,American Journal of Physiology-Cell Physiology,PMC7099523


In [22]:
# Report how many preprints were mapped to a published PMC article.
# The denominator is the number of bioRxiv documents loaded above rather than
# a hard-coded total, so it stays correct if the corpus snapshot changes.
mapped_preprint_total = mapped_doi_df.shape[0]
biorxiv_document_total = len(biorxiv_sentence_length)

print(f"Total # of Preprints Mapped: {mapped_preprint_total}")
# The label promises a percentage, so format the ratio as one.
print(f"Total % of Mapped: {mapped_preprint_total / biorxiv_document_total:.2%}")

Total # of Preprints Mapped: 23271
Total % of Mapped: 0.3272167383784696


In [24]:
# Word-count files of the mapped preprints, keeping only those present on disk.
# NOTE(review): `document` values look like "866855_v1.xml", so these paths end
# in ".xml.tsv" -- confirm that matches how the count files are named.
preprint_count_files = [
    count_file
    for count_file in (
        Path(f"output/biorxiv_word_counts/{Path(file)}.tsv")
        for file in mapped_doi_df.document.values.tolist()
    )
    if count_file.exists()
]
preprint_count = aggregate_word_counts(preprint_count_files)

# One record per aggregated token key; the key's first three fields are used
# as lemma, POS tag and dependency tag.
preprint_token_records = [
    {
        "lemma": token[0],
        "pos_tag": token[1],
        "dep_tag": token[2],
        "count": preprint_count[token],
    }
    for token in preprint_count
]
preprint_token_df = pd.DataFrame.from_records(preprint_token_records)

# Drop stop-word lemmas and total the counts per lemma, most frequent first.
preprint_count_df = (
    preprint_token_df[~preprint_token_df["lemma"].isin(stop_word_list)]
    .groupby("lemma")[["count"]]
    .sum()
    .reset_index()
    .sort_values("count", ascending=False)
)

preprint_count_df.head()

HBox(children=(IntProgress(value=0, max=23271), HTML(value='')))




Unnamed: 0,lemma,pos_tag,dep_tag,count
0,abstract,PROPN,compound,9225
1,adaptation,PROPN,nsubj,24
2,in,ADP,prep,3368754
3,response,NOUN,pobj,39876
4,to,ADP,prep,1300958


In [25]:
# Word-count files of the published (PMC) versions, keeping only those present on disk.
published_count_files = [
    count_file
    for count_file in (
        Path(f"../../pmc/pmc_corpus/pmc_word_counts/{file}.tsv")
        for file in mapped_doi_df.pmcid.values.tolist()
    )
    if count_file.exists()
]
published_count = aggregate_word_counts(published_count_files)

# One record per aggregated token key; the key's first three fields are used
# as lemma, POS tag and dependency tag.
published_token_records = [
    {
        "lemma": token[0],
        "pos_tag": token[1],
        "dep_tag": token[2],
        "count": published_count[token],
    }
    for token in published_count
]
published_token_df = pd.DataFrame.from_records(published_token_records)

# Drop stop-word lemmas and total the counts per lemma, most frequent first.
published_count_df = (
    published_token_df[~published_token_df["lemma"].isin(stop_word_list)]
    .groupby("lemma")[["count"]]
    .sum()
    .reset_index()
    .sort_values("count", ascending=False)
)

published_count_df.head()

HBox(children=(IntProgress(value=0, max=17275), HTML(value='')))




Unnamed: 0,lemma,pos_tag,dep_tag,count
0,introduction,PROPN,compound,2421
1,population,PROPN,nsubj,29
2,and,CCONJ,cc,3540569
3,quantitative,ADJ,amod,11904
4,genetics,NOUN,conj,339


In [26]:
# Term-level comparison statistics (log likelihood, odds ratio) between the
# mapped preprints and their published versions.
# NOTE(review): the third argument (100) is presumably the number of top terms
# taken from each corpus -- confirm against get_term_statistics.
preprint_vs_published = get_term_statistics(preprint_count_df, published_count_df, 100)

# Persist the comparison table without the row index.
preprint_vs_published.to_csv(
    "output/comparison_stats/preprint_to_published_comparison.tsv",
    sep="\t",
    index=False,
)

preprint_vs_published

HBox(children=(IntProgress(value=0, max=126), HTML(value='')))




Unnamed: 0,term,corpus_one_a,corpua_two_b,corpus_one_c,corpus_two_d,log_likelihood,odds_ratio
0,’,345652,4837734,262494660,5365015381,41488.085876,1.460319
1,–,211069,7614840,262494660,5365015381,79745.759155,0.566519
2,growth,240308,3933709,262494660,5365015381,10453.263824,1.248580
3,risk,94155,5338328,262494660,5365015381,136800.666016,0.360486
4,rna,280074,3073361,262494660,5365015381,83766.772949,1.862558
...,...,...,...,...,...,...,...
121,regions,276621,2613208,262494660,5365015381,121417.507721,2.163524
122,non,362862,6145600,262494660,5365015381,11440.261810,1.206779
123,changes,253264,4224365,262494660,5365015381,9290.785034,1.225357
124,shown,393296,8110411,262494660,5365015381,29.864899,0.991122


Main takeaways from this analysis:
1. On a global scale, bioRxiv contains more field-specific articles, as its top words include: neuron, gene, genome, network.
2. "Patients" appears more strongly associated with PMC, as most preprints involving patients are posted to medRxiv instead.
3. Many words associated with PMC are health related which ties back to the medRxiv note.
4. Citation styles change as preprints transition to published versions. "et al." has a greater association with bioRxiv than with PMC.
5. On a local scale, published articles contain more statistical concepts (e.g., t-test) as well as quantitative measures (e.g., degree signs). (Highly associated lemmas are t, -, degree sign, etc.)
6. Published articles shift their focus toward mentioning figures, adding supplementary data, etc., compared to preprints.
7. Preprints have a universal way of citing published works: the "et al." citation. It is hard to pinpoint whether the leading factor is peer review or journal style, but it will be an interesting point to discuss in the paper.