# Download biorxiv preprint table from the PrePubMed repository

In [1]:
import os
import json
import logging

import pandas
import requests
import altair

import utilities

In [2]:
# Configure logging to write to file
logging.basicConfig(level=logging.INFO, filename=os.path.join('logs/donwload.log'), filemode='w')

## Get `OmnesRes/prepub` version

In [3]:
url = 'https://api.github.com/repos/OmnesRes/prepub/git/refs/heads/master'
response = requests.get(url)
response = response.json()
response['object']

{'sha': 'c7137fe288a7cbf3885fc7507060440d2e80e801',
 'type': 'commit',
 'url': 'https://api.github.com/repos/OmnesRes/prepub/git/commits/c7137fe288a7cbf3885fc7507060440d2e80e801'}

## Load bioRxiv data

In [4]:
url = 'https://github.com/OmnesRes/prepub/raw/master/biorxiv/biorxiv_licenses.tsv'
biorxiv_df = pandas.read_table(url, error_bad_lines=False)

In [5]:
biorxiv_df.head(2)

Unnamed: 0,DOI,Date,Subjects,License,Title,Authors,Affiliations
0,http://dx.doi.org/10.1101/049031,2016-04-16,Microbiology,CC BY-NC,Alternative Growth Behavior of Mycobacterium A...,Peilin Zhang|Lawrence M Minardi|J. Todd Kuenst...,"PZM Diagnostics, LLC"
1,http://dx.doi.org/10.1101/049049,2016-04-16,Genomics,CC BY-NC-ND,Lateral genetic transfers between eukaryotes a...,Sarah R Bordenstein|Seth R Bordenstein,Vanderbilt University


## Processing

In [6]:
# Remove URL from DOIs
biorxiv_df.DOI = biorxiv_df.DOI.str.extract(r'(10\.[0-9]+/[0-9]+)', expand=False)
biorxiv_df.License = biorxiv_df.License.str.replace('CC-BY', 'CC BY')
biorxiv_df.License = biorxiv_df.License.fillna('None')

## Authors

In [7]:
author_df = (biorxiv_df
    .pipe(utilities.tidy_split, column='Authors')
    .rename(columns={'Authors': 'Author'})
    [['DOI', 'Author']]
    .sort_values(['DOI', 'Author'])
    .drop_duplicates()
    .reset_index(drop=True)
)

In [8]:
# Standardize author names
author_df['Standard_Author'] = author_df.Author.map(utilities.get_standard_author)

In [9]:
author_df.tail(2)

Unnamed: 0,DOI,Author,Standard_Author
41452,10.1101/091280,Mathieu Videlier,Mathieu Videlier
41453,10.1101/091280,Nicolas Pollet,Nicolas Pollet


In [10]:
# Authors with the most preprints
author_df.Standard_Author.value_counts().head(2)

Mark Daly      34
Alkes Price    31
Name: Standard_Author, dtype: int64

## Subjects

In [11]:
# Create a TSV with a row per preprint-subject pair
subject_df = (biorxiv_df
    .pipe(utilities.tidy_split, column='Subjects')
    .rename(columns={'Subjects': 'Subject'})
    [['DOI', 'Subject']]
    .sort_values(['DOI', 'Subject'])
    .reset_index(drop=True)
)
subject_df.tail(2)

Unnamed: 0,DOI,Subject
6980,10.1101/091272,Cell Biology
6981,10.1101/091280,Bioinformatics


In [12]:
# Number of subjects per preprint
subject_df.DOI.value_counts().value_counts()

1    6925
2      22
5       2
3       1
Name: DOI, dtype: int64

In [13]:
# Number of preprints by subject
subject_df.Subject.value_counts()

Bioinformatics                            1122
Evolutionary Biology                       998
Genomics                                   838
Neuroscience                               832
Genetics                                   598
Ecology                                    372
Microbiology                               304
Systems Biology                            281
Biophysics                                 225
Cell Biology                               180
Cancer Biology                             152
Plant Biology                              148
Developmental Biology                      133
Molecular Biology                          116
Biochemistry                               110
Animal Behavior and Cognition              101
Synthetic Biology                           91
Immunology                                  77
Epidemiology                                72
Bioengineering                              64
Physiology                                  37
Zoology      

## Preprints

In [14]:
preprint_df = (biorxiv_df
    [['DOI', 'Date', 'License']]
    .sort_values('DOI')
    .reset_index(drop=True)
)
preprint_df.tail(4)

Unnamed: 0,DOI,Date,License
6968,10.1101/091256,2016-12-02,
6969,10.1101/091264,2016-12-02,
6970,10.1101/091272,2016-12-02,
6971,10.1101/091280,2016-12-02,


In [15]:
len(preprint_df)

6972

In [16]:
# Preprints by license
preprint_df.License.value_counts(normalize=True).reset_index()

Unnamed: 0,index,License
0,CC BY-NC-ND,0.368474
1,,0.297762
2,CC BY,0.177711
3,CC BY-NC,0.084624
4,CC BY-ND,0.071429


In [17]:
# Preprints by year
preprint_df.Date.map(lambda date: date.split('-')[0]).value_counts()

2016    4383
2015    1706
2014     806
2013      77
Name: Date, dtype: int64

## Save as a TSVs

In [18]:
path = os.path.join('data', 'preprints.tsv')
preprint_df.to_csv(path, sep='\t', index=False)

path = os.path.join('data', 'subjects.tsv')
subject_df.to_csv(path, sep='\t', index=False)

path = os.path.join('data', 'authors.tsv')
author_df.to_csv(path, sep='\t', index=False)