# Download the bioRxiv preprint table from the PrePubMed repository

In [1]:
import os
import json
import logging

import pandas
import requests

import utilities

In [2]:
# Configure logging to write to a file. The log directory is created first so
# that a fresh checkout (without logs/) does not fail with FileNotFoundError
# when the file handler opens its target.
# NOTE(review): 'donwload.log' looks like a typo for 'download.log'; the name
# is kept unchanged in case other tooling references it -- confirm and rename.
log_path = os.path.join('logs', 'donwload.log')
os.makedirs(os.path.dirname(log_path), exist_ok=True)
logging.basicConfig(level=logging.INFO, filename=log_path, filemode='w')

## Get `OmnesRes/prepub` version

In [3]:
# Look up the git ref for the master branch of OmnesRes/prepub so that the
# upstream commit is recorded in this notebook's output.
url = 'https://api.github.com/repos/OmnesRes/prepub/git/refs/heads/master'
# A timeout keeps the notebook from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error (e.g. API rate limiting) rather than letting it
# surface below as a confusing KeyError on the error payload.
response.raise_for_status()
response = response.json()
response['object']

{'sha': '1d4f9118202a9d8e8979bb97705b13d72e639362',
 'type': 'commit',
 'url': 'https://api.github.com/repos/OmnesRes/prepub/git/commits/1d4f9118202a9d8e8979bb97705b13d72e639362'}

## Load bioRxiv data

In [4]:
# Read the tab-separated bioRxiv license table directly from GitHub
url = 'https://github.com/OmnesRes/prepub/raw/master/biorxiv/biorxiv_licenses.tsv'
biorxiv_df = pandas.read_csv(url, sep='\t')

In [5]:
# Limit to preprints dated on or before 2017-03-25.
# An explicit copy is taken so that the column assignments performed later in
# the notebook act on an independent frame instead of on the result of a
# filter (which otherwise triggers pandas' SettingWithCopyWarning).
biorxiv_df = biorxiv_df.query("Date <= '2017-03-25'").copy()

In [6]:
# Preview the first two rows of the filtered table
biorxiv_df.head(n=2)

Unnamed: 0,DOI,Date,Subjects,License,Title,Authors,Affiliations
0,http://dx.doi.org/10.1101/049031,2016-04-16,Microbiology,CC BY-NC,Alternative Growth Behavior of Mycobacterium A...,Peilin Zhang|Lawrence M Minardi|J. Todd Kuenst...,"PZM Diagnostics, LLC"
1,http://dx.doi.org/10.1101/049049,2016-04-16,Genomics,CC BY-NC-ND,Lateral genetic transfers between eukaryotes a...,Sarah R Bordenstein|Seth R Bordenstein,Vanderbilt University


## Processing

In [7]:
# Normalize the DOI and License columns.
# .assign builds a new frame instead of setting columns attribute-style on the
# output of the earlier .query filter, which avoids SettingWithCopyWarning and
# keeps the cell safe to re-run.
biorxiv_df = biorxiv_df.assign(
    # Remove URL from DOIs, keeping only the part matched by the pattern
    DOI=lambda df: df['DOI'].str.extract(r'(10\.[0-9]+/[0-9]+)', expand=False),
    # Harmonize the 'CC-BY' spelling and label missing licenses as 'None'
    License=lambda df: df['License'].str.replace('CC-BY', 'CC BY').fillna('None'),
)

## Authors

In [8]:
# Build the DOI/Author table.
# NOTE(review): utilities.tidy_split presumably expands the delimited Authors
# field into one row per author -- confirm against the utilities module.
author_rows = utilities.tidy_split(biorxiv_df, column='Authors')
author_rows = author_rows.rename(columns={'Authors': 'Author'})
author_df = (
    author_rows[['DOI', 'Author']]
    .sort_values(['DOI', 'Author'])
    .drop_duplicates()
    .reset_index(drop=True)
)

In [9]:
# Add a standardized form of every author name
standard_names = author_df['Author'].map(utilities.get_standard_author)
author_df['Standard_Author'] = standard_names

In [10]:
# Show the last two rows of the author table
author_df.tail(n=2)

Unnamed: 0,DOI,Author,Standard_Author
56816,10.1101/120600,Nils Gehlenborg,Nils Gehlenborg
56817,10.1101/120634,Kenneth W Witwer,Kenneth Witwer


In [11]:
# The two standardized author names that occur most often
author_counts = author_df['Standard_Author'].value_counts()
author_counts.head(n=2)

Mark Daly      36
Alkes Price    34
Name: Standard_Author, dtype: int64

## Subjects

In [12]:
# Build the preprint-subject table (written out as a TSV at the end).
# NOTE(review): utilities.tidy_split presumably expands the delimited Subjects
# field into one row per subject -- confirm against the utilities module.
subject_rows = utilities.tidy_split(biorxiv_df, column='Subjects')
subject_rows = subject_rows.rename(columns={'Subjects': 'Subject'})
subject_df = (
    subject_rows[['DOI', 'Subject']]
    .sort_values(['DOI', 'Subject'])
    .reset_index(drop=True)
)
subject_df.tail(n=2)

Unnamed: 0,DOI,Subject
8589,10.1101/120600,Bioinformatics
8590,10.1101/120634,Molecular Biology


In [13]:
# How many preprints have 1, 2, 3, ... subject rows
subjects_per_preprint = subject_df['DOI'].value_counts()
subjects_per_preprint.value_counts()

1    8534
2      22
5       2
3       1
Name: DOI, dtype: int64

In [14]:
# Count of rows for each subject, most common first
subject_df['Subject'].value_counts()

Bioinformatics                            1316
Evolutionary Biology                      1150
Neuroscience                              1094
Genomics                                   995
Genetics                                   712
Ecology                                    437
Microbiology                               424
Systems Biology                            329
Biophysics                                 283
Cell Biology                               238
Developmental Biology                      189
Cancer Biology                             189
Plant Biology                              186
Molecular Biology                          155
Biochemistry                               145
Animal Behavior and Cognition              128
Epidemiology                               121
Synthetic Biology                          111
Immunology                                  95
Bioengineering                              80
Physiology                                  44
Zoology      

## Preprints

In [15]:
# Keep the preprint-level columns, ordered by DOI with a fresh index
preprint_columns = ['DOI', 'Date', 'License']
preprint_df = biorxiv_df[preprint_columns].sort_values('DOI').reset_index(drop=True)
preprint_df.tail(n=4)

Unnamed: 0,DOI,Date,License
9334,10.1101/120543,2017-03-25,CC BY
9335,10.1101/120584,2017-03-25,
9336,10.1101/120600,2017-03-25,CC BY-NC
9337,10.1101/120634,2017-03-25,CC BY


In [16]:
# Total number of rows in the preprint table
preprint_df.shape[0]

9338

In [17]:
# Fraction of preprints carrying each license value
license_share = preprint_df['License'].value_counts(normalize=True)
license_share.reset_index()

Unnamed: 0,index,License
0,CC BY-NC-ND,0.359713
1,,0.295674
2,CC BY,0.18655
3,CC BY-NC,0.090169
4,CC BY-ND,0.067895


In [18]:
# Preprints by year
def _year_of(date):
    """Return the portion of a date string that precedes the first '-'."""
    return date.partition('-')[0]

preprint_df['Date'].map(_year_of).value_counts()

2016    4901
2017    1848
2015    1706
2014     806
2013      77
Name: Date, dtype: int64

## Save as TSVs

In [19]:
# Write each table to the data/ directory as tab-separated text, omitting the
# DataFrame index. The directory is created first so the export also works on
# a fresh checkout where data/ does not exist yet.
os.makedirs('data', exist_ok=True)
tables = [
    ('preprints.tsv', preprint_df),
    ('subjects.tsv', subject_df),
    ('authors.tsv', author_df),
]
for filename, table in tables:
    path = os.path.join('data', filename)
    table.to_csv(path, sep='\t', index=False)