# Download biorxiv preprint table from the PrePubMed repository

In [1]:
import os
import json
import logging

import pandas
import requests
import altair

import utilities

## Get `OmnesRes/prepub` version

In [2]:
url = 'https://api.github.com/repos/OmnesRes/prepub/git/refs/heads/master'
response = requests.get(url)
response = response.json()
response['object']

{'sha': '9f49f9c52e618f1cabc115ab49e86d6dbd0ff40c',
 'type': 'commit',
 'url': 'https://api.github.com/repos/OmnesRes/prepub/git/commits/9f49f9c52e618f1cabc115ab49e86d6dbd0ff40c'}

## Load bioRxiv data

In [3]:
url = 'https://github.com/OmnesRes/prepub/raw/master/biorxiv/biorxiv_licenses.tsv'
biorxiv_df = pandas.read_table(url, error_bad_lines=False)

In [4]:
biorxiv_df.head(2)

Unnamed: 0,DOI,Date,Subjects,License,Title,Authors,Affiliations
0,http://dx.doi.org/10.1101/049031,2016-04-16,Microbiology,CC BY-NC,Alternative Growth Behavior of Mycobacterium A...,Peilin Zhang|Lawrence M Minardi|J. Todd Kuenst...,"PZM Diagnostics, LLC"
1,http://dx.doi.org/10.1101/049049,2016-04-16,Genomics,CC BY-NC-ND,Lateral genetic transfers between eukaryotes a...,Sarah R Bordenstein|Seth R Bordenstein,Vanderbilt University


## Processing

In [5]:
# Remove URL from DOIs
biorxiv_df.DOI = biorxiv_df.DOI.str.extract(r'(10\.[0-9]+/[0-9]+)', expand=False)
biorxiv_df.License = biorxiv_df.License.str.replace('CC-BY', 'CC BY')
biorxiv_df.License = biorxiv_df.License.fillna('None')

## Authors

In [6]:
author_df = (biorxiv_df
    .pipe(utilities.tidy_split, column='Authors')
    .rename(columns={'Authors': 'Author'})
    [['DOI', 'Author']]
    .sort_values(['DOI', 'Author'])
    .drop_duplicates()
    .reset_index(drop=True)
)

In [7]:
# Standardize author names
author_df['Standard_Author'] = author_df.Author.map(utilities.get_standard_author)



In [8]:
author_df.tail(2)

Unnamed: 0,DOI,Author,Standard_Author
41025,10.1101/090209,Kathleen A Martin,Kathleen Martin
41026,10.1101/090225,David W Armitage,David Armitage


In [9]:
# Authors with the most preprints
author_df.Standard_Author.value_counts().head(2)

Mark Daly      34
Alkes Price    31
Name: Standard_Author, dtype: int64

## Subjects

In [10]:
# Create a TSV with a row per preprint-subject pair
subject_df = (biorxiv_df
    .pipe(utilities.tidy_split, column='Subjects')
    .rename(columns={'Subjects': 'Subject'})
    [['DOI', 'Subject']]
    .sort_values(['DOI', 'Subject'])
    .reset_index(drop=True)
)
subject_df.tail(2)

Unnamed: 0,DOI,Subject
6907,10.1101/090209,Neuroscience
6908,10.1101/090225,Ecology


In [11]:
# Number of subjects per preprint
subject_df.DOI.value_counts().value_counts()

1    6852
2      22
5       2
3       1
Name: DOI, dtype: int64

In [12]:
# Number of preprints by subject
subject_df.Subject.value_counts()

Bioinformatics                            1112
Evolutionary Biology                       983
Genomics                                   829
Neuroscience                               824
Genetics                                   594
Ecology                                    369
Microbiology                               300
Systems Biology                            279
Biophysics                                 224
Cell Biology                               177
Cancer Biology                             151
Plant Biology                              146
Developmental Biology                      129
Molecular Biology                          116
Biochemistry                               108
Animal Behavior and Cognition              100
Synthetic Biology                           91
Immunology                                  76
Epidemiology                                71
Bioengineering                              63
Physiology                                  37
Zoology      

## Preprints

In [13]:
preprint_df = (biorxiv_df
    [['DOI', 'Date', 'License']]
    .sort_values('DOI')
    .reset_index(drop=True)
)
preprint_df.tail(4)

Unnamed: 0,DOI,Date,License
6894,10.1101/090183,2016-11-28,CC BY-NC-ND
6895,10.1101/090191,2016-11-28,CC BY-NC-ND
6896,10.1101/090209,2016-11-28,CC BY-NC-ND
6897,10.1101/090225,2016-11-28,CC BY-NC-ND


In [14]:
len(preprint_df)

6898

In [15]:
preprint_df.License.value_counts(normalize=True)

CC BY-NC-ND    0.368078
None           0.297767
CC BY          0.178168
CC BY-NC       0.084517
CC BY-ND       0.071470
Name: License, dtype: float64

In [16]:
preprint_df.Date.map(lambda date: date.split('-')[0]).value_counts()

2016    4309
2015    1706
2014     806
2013      77
Name: Date, dtype: int64

## Save as a TSVs

In [17]:
path = os.path.join('data', 'preprints.tsv')
preprint_df.to_csv(path, sep='\t', index=False)

path = os.path.join('data', 'subjects.tsv')
subject_df.to_csv(path, sep='\t', index=False)

path = os.path.join('data', 'authors.tsv')
author_df.to_csv(path, sep='\t', index=False)