# Download bioRxiv preprint table from the PrePubMed repository

In [1]:
import os
import json
import logging

import pandas
import requests

import utilities

In [2]:
# Configure logging to write to file. The log directory is created first because
# logging.basicConfig does not create missing parent directories.
# NOTE(review): 'donwload' looks like a typo of 'download'; the name is kept so
# the log path stays the same as before.
os.makedirs('logs', exist_ok=True)
logging.basicConfig(level=logging.INFO, filename=os.path.join('logs', 'donwload.log'), filemode='w')

## Get `OmnesRes/prepub` version

In [3]:
# Look up the commit that the master branch of OmnesRes/prepub currently points
# to, so the notebook output records which version of the data was available.
url = 'https://api.github.com/repos/OmnesRes/prepub/git/refs/heads/master'
# A timeout keeps the cell from hanging indefinitely if the API does not answer
api_response = requests.get(url, timeout=30)
# Fail here on an HTTP error instead of with a confusing KeyError below
api_response.raise_for_status()
response = api_response.json()
response['object']

{'sha': 'e97cbaf6bf09b87f35f6e907ac625b3d57731a0d',
 'type': 'commit',
 'url': 'https://api.github.com/repos/OmnesRes/prepub/git/commits/e97cbaf6bf09b87f35f6e907ac625b3d57731a0d'}

## Load bioRxiv data

In [4]:
# Read the tab-separated license table directly from the master branch on GitHub
url = 'https://github.com/OmnesRes/prepub/raw/master/biorxiv/biorxiv_licenses.tsv'
biorxiv_df = pandas.read_csv(url, sep='\t')

In [5]:
# Limit to preprints dated on or before 2017-05-03.
# .copy() yields an independent frame, so the column assignments made during
# processing do not raise SettingWithCopyWarning.
biorxiv_df = biorxiv_df.query("Date <= '2017-05-03'").copy()

In [6]:
# Preview the first two rows of the filtered table
biorxiv_df.iloc[:2]

Unnamed: 0,DOI,Date,Subjects,License,Title,Authors,Affiliations
0,http://dx.doi.org/10.1101/049031,2016-04-16,Microbiology,CC BY-NC,Alternative Growth Behavior of Mycobacterium A...,Peilin Zhang|Lawrence M Minardi|J. Todd Kuenst...,"PZM Diagnostics, LLC"
1,http://dx.doi.org/10.1101/049049,2016-04-16,Genomics,CC BY-NC-ND,Lateral genetic transfers between eukaryotes a...,Sarah R Bordenstein|Seth R Bordenstein,Vanderbilt University


## Processing

In [7]:
# Strip the URL prefix so only the bare DOI (e.g. 10.1101/049031) remains, and
# normalize licenses: 'CC-BY' is respelled 'CC BY' and missing values become 'None'.
# assign() builds a new frame rather than setting columns on a filtered one, and
# regex=False pins literal matching, whose default differs between pandas versions.
biorxiv_df = biorxiv_df.assign(
    DOI=lambda df: df['DOI'].str.extract(r'(10\.[0-9]+/[0-9]+)', expand=False),
    License=lambda df: (
        df['License']
        .str.replace('CC-BY', 'CC BY', regex=False)
        .fillna('None')
    ),
)

## Authors

In [8]:
# Build a table of unique (DOI, Author) rows, sorted by DOI and then author.
# NOTE(review): utilities.tidy_split presumably expands the Authors column into
# one row per author -- confirm against the utilities module.
author_df = (
    utilities.tidy_split(biorxiv_df, column='Authors')
    .rename(columns={'Authors': 'Author'})
    .loc[:, ['DOI', 'Author']]
    .sort_values(by=['DOI', 'Author'])
    .drop_duplicates()
    .reset_index(drop=True)
)

In [9]:
# Add a Standard_Author column by passing every author name through
# utilities.get_standard_author
author_df = author_df.assign(
    Standard_Author=author_df['Author'].map(utilities.get_standard_author),
)

In [10]:
# Preview the last two rows of the author table
author_df.iloc[-2:]

Unnamed: 0,DOI,Author,Standard_Author
63042,10.1101/133512,Hernan Lopez-Fernandez,Hernan Lopez-Fernandez
63043,10.1101/133512,Katriina L. Ilves,Katriina Ilves


In [11]:
# The two standardized author names that appear on the most preprints
author_df['Standard_Author'].value_counts().iloc[:2]

Mark Daly      41
Alkes Price    34
Name: Standard_Author, dtype: int64

## Subjects

In [12]:
# Build a table with a row per preprint-subject pair.
# Duplicate pairs are dropped, as is done for author_df, so that repeated source
# rows cannot inflate the per-preprint and per-subject counts taken from this table.
subject_df = (biorxiv_df
    .pipe(utilities.tidy_split, column='Subjects')
    .rename(columns={'Subjects': 'Subject'})
    [['DOI', 'Subject']]
    .sort_values(['DOI', 'Subject'])
    .drop_duplicates()
    .reset_index(drop=True)
)
subject_df.tail(2)

Unnamed: 0,DOI,Subject
9350,10.1101/133371,Neuroscience
9351,10.1101/133405,Zoology


In [13]:
# Distribution of subject counts: the inner value_counts gives the number of
# rows per DOI, the outer one tallies how many DOIs share each row count
(
    subject_df['DOI']
    .value_counts()
    .value_counts()
)

1    9295
2      22
5       2
3       1
Name: DOI, dtype: int64

In [14]:
# Count of rows for each subject, most common first
subject_df['Subject'].value_counts()

Bioinformatics                            1415
Neuroscience                              1230
Evolutionary Biology                      1195
Genomics                                  1048
Genetics                                   759
Microbiology                               472
Ecology                                    460
Systems Biology                            362
Biophysics                                 316
Cell Biology                               283
Cancer Biology                             213
Developmental Biology                      211
Plant Biology                              208
Molecular Biology                          176
Biochemistry                               171
Epidemiology                               147
Animal Behavior and Cognition              135
Synthetic Biology                          116
Immunology                                 104
Bioengineering                              89
Physiology                                  53
Zoology      

## Preprints

In [15]:
# Keep the DOI, date, and license of every record, ordered by DOI
preprint_df = (
    biorxiv_df
    .loc[:, ['DOI', 'Date', 'License']]
    .sort_values(by='DOI')
    .reset_index(drop=True)
)
preprint_df.iloc[-4:]

Unnamed: 0,DOI,Date,License
10318,10.1101/133413,2017-05-02,
10319,10.1101/133462,2017-05-02,CC BY
10320,10.1101/133488,2017-05-02,CC BY-NC
10321,10.1101/133512,2017-05-02,CC BY-NC-ND


In [16]:
# Number of rows in the preprint table
preprint_df.shape[0]

10322

In [17]:
# Share of preprints under each license, as a two-column frame
(
    preprint_df['License']
    .value_counts(normalize=True)
    .reset_index()
)

Unnamed: 0,index,License
0,CC BY-NC-ND,0.352839
1,,0.294517
2,CC BY,0.193083
3,CC BY-NC,0.091746
4,CC BY-ND,0.067816


In [18]:
# Preprints by year
# Date is a 'YYYY-MM-DD' string, so the year is the text before the first hyphen.
# The vectorized .str accessor passes a missing date through as NaN, whereas
# calling .split on it inside a lambda would raise.
preprint_df['Date'].str.split('-').str[0].value_counts()

2016    4901
2017    2832
2015    1706
2014     806
2013      77
Name: Date, dtype: int64

## Save as TSVs

In [19]:
# Write each table to a tab-separated file in the data directory.
# The directory is created first because DataFrame.to_csv does not create it.
os.makedirs('data', exist_ok=True)
tables = (
    ('preprints.tsv', preprint_df),
    ('subjects.tsv', subject_df),
    ('authors.tsv', author_df),
)
for filename, table_df in tables:
    path = os.path.join('data', filename)
    table_df.to_csv(path, sep='\t', index=False)