# Download bioRxiv preprint table from the PrePubMed repository

In [1]:
import os
import json

import pandas
import requests
import altair

## Get `OmnesRes/prepub` version

In [2]:
# Look up the git ref for the master branch of OmnesRes/prepub so the notebook
# output records which upstream commit was current when it was run.
url = 'https://api.github.com/repos/OmnesRes/prepub/git/refs/heads/master'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Surface HTTP errors directly instead of as a KeyError on 'object' below.
response.raise_for_status()
response = response.json()
response['object']

{'sha': '9f49f9c52e618f1cabc115ab49e86d6dbd0ff40c',
 'type': 'commit',
 'url': 'https://api.github.com/repos/OmnesRes/prepub/git/commits/9f49f9c52e618f1cabc115ab49e86d6dbd0ff40c'}

## Load bioRxiv data

In [3]:
# Read the bioRxiv license table straight from the master branch of the
# PrePubMed repository.
url = 'https://github.com/OmnesRes/prepub/raw/master/biorxiv/biorxiv_licenses.tsv'
# Malformed rows are skipped with a warning rather than aborting the load.
# `on_bad_lines='warn'` (pandas >= 1.3) replaces `error_bad_lines=False`,
# which was deprecated in pandas 1.3 and removed in pandas 2.0.
biorxiv_df = pandas.read_table(url, on_bad_lines='warn')

In [4]:
# Preview the first two rows of the raw table
biorxiv_df.iloc[:2]

Unnamed: 0,DOI,Date,Subjects,License,Title,Authors,Affiliations
0,http://dx.doi.org/10.1101/049031,2016-04-16,Microbiology,CC BY-NC,Alternative Growth Behavior of Mycobacterium A...,Peilin Zhang|Lawrence M Minardi|J. Todd Kuenst...,"PZM Diagnostics, LLC"
1,http://dx.doi.org/10.1101/049049,2016-04-16,Genomics,CC BY-NC-ND,Lateral genetic transfers between eukaryotes a...,Sarah R Bordenstein|Seth R Bordenstein,Vanderbilt University


## Process

In [5]:
# Keep only the bare DOI (prefix/suffix digits), dropping the resolver URL
biorxiv_df['DOI'] = biorxiv_df['DOI'].str.extract(r'(10\.[0-9]+/[0-9]+)', expand=False)
# Use the "CC BY" spelling for licenses and label missing licenses as 'None'
biorxiv_df['License'] = (
    biorxiv_df['License']
    .str.replace('CC-BY', 'CC BY')
    .fillna('None')
)

In [6]:
# Restrict to the four columns that are saved, order rows by DOI, and
# renumber the index from zero so it follows the sorted order.
biorxiv_df = biorxiv_df.loc[:, ['DOI', 'Date', 'Subjects', 'License']]
biorxiv_df = biorxiv_df.sort_values(by='DOI')
biorxiv_df = biorxiv_df.reset_index(drop=True)
# Show the last four rows of the sorted table
biorxiv_df.iloc[-4:]

Unnamed: 0,DOI,Date,Subjects,License
6894,10.1101/090183,2016-11-28,Genomics,CC BY-NC-ND
6895,10.1101/090191,2016-11-28,Evolutionary Biology,CC BY-NC-ND
6896,10.1101/090209,2016-11-28,Neuroscience,CC BY-NC-ND
6897,10.1101/090225,2016-11-28,Ecology,CC BY-NC-ND


In [7]:
# Number of rows in the processed table
biorxiv_df.shape[0]

6898

In [8]:
# Save as a TSV
path = os.path.join('data', 'biorxiv-prepubmed.tsv')
# Create the output directory if needed; to_csv does not create missing
# parent directories and would otherwise raise.
os.makedirs(os.path.dirname(path), exist_ok=True)
biorxiv_df.to_csv(path, sep='\t', index=False)