# Analyse license choices for bioRxiv preprints

In [1]:
import json
import os

import pandas
import altair

import utilities

## Read data

In [2]:
# Each license name is paired with its hex color; the pairs are listed in the
# order used at https://creativecommons.org/licenses/
licenses, license_colors = map(list, zip(
    ('CC BY', '#F68212'),
    ('CC BY-ND', '#AC5B0C'),
    ('CC BY-NC', '#934E0A'),
    ('CC BY-NC-ND', '#492705'),
    ('None', '#000000'),
))

In [3]:
# Load the preprint table, parsing the Date column as datetimes.
# The literal string 'None' is a genuine license value (it is one of the
# entries in `licenses`). Recent pandas releases include 'None' among the
# default NA markers, which would turn those rows into NaN and silently drop
# them from the license frequencies. Restrict NA detection to empty fields so
# the literal value survives regardless of the pandas version.
biorxiv_df = pandas.read_table(
    'data/biorxiv-prepubmed.tsv',
    parse_dates=['Date'],
    keep_default_na=False,
    na_values=[''],
)
# Convert License to a categorical whose category order follows `licenses`;
# any value not listed in `licenses` becomes NaN.
biorxiv_df['License'] = pandas.Categorical(biorxiv_df['License'], categories=licenses)
# Preview the first two rows
biorxiv_df.head(2)

Unnamed: 0,DOI,Date,Subjects,License
0,10.1101/000026,2014-09-08,Genetics,CC BY
1,10.1101/000042,2013-12-01,Genomics,CC BY


## Frequency by license type

In [4]:
# Fraction of preprints under each license (proportions rather than raw counts)
biorxiv_df['License'].value_counts(normalize=True)

CC BY-NC-ND    0.368078
None           0.297767
CC BY          0.178168
CC BY-NC       0.084517
CC BY-ND       0.071470
Name: License, dtype: float64

## License distribution over time

In [5]:
# Hand the full preprint table to the project helper, targeting the
# license-vs-time figure directory
path = os.path.join(
    'figure',
    'license-vs-time',
    'vega-lite-data.json',
)
utilities.df_to_vega_lite(biorxiv_df, path)

## License distribution by subject

In [6]:
# Build a DataFrame with one row per preprint-subject pair
subject_df = utilities.tidy_split(biorxiv_df, column='Subjects')
# Each row now refers to a single subject, so use the singular column name
subject_df = subject_df.rename(columns={'Subjects': 'Subject'})
subject_df = subject_df.sort_values(['DOI', 'Subject'])
# Replace every missing value in the frame with the literal string 'None'
subject_df = subject_df.fillna('None')
subject_df.tail(2)

Unnamed: 0,DOI,Date,Subject,License
6896,10.1101/090209,2016-11-28,Neuroscience,CC BY-NC-ND
6897,10.1101/090225,2016-11-28,Ecology,CC BY-NC-ND


In [7]:
# Distribution of subject counts: the inner value_counts gives the number of
# rows (subjects) per DOI, the outer one tallies how many DOIs have each count
subject_df['DOI'].value_counts().value_counts()

1    6852
2      22
5       2
3       1
Name: DOI, dtype: int64

In [8]:
# Tally how many preprints fall under each subject, most frequent first
subject_counts = subject_df['Subject'].value_counts()
subject_counts

Bioinformatics                            1112
Evolutionary Biology                       983
Genomics                                   829
Neuroscience                               824
Genetics                                   594
Ecology                                    369
Microbiology                               300
Systems Biology                            279
Biophysics                                 224
Cell Biology                               177
Cancer Biology                             151
Plant Biology                              146
Developmental Biology                      129
Molecular Biology                          116
Biochemistry                               108
Animal Behavior and Cognition              100
Synthetic Biology                           91
Immunology                                  76
Epidemiology                                71
Bioengineering                              63
Physiology                                  37
Zoology      

In [9]:
# Keep only subjects that have at least 100 preprints
popular_subjects = [
    subject for subject, n_preprints in subject_counts.items()
    if n_preprints >= 100
]
# Restrict the preprint-subject table to rows for those subjects
is_popular = subject_df['Subject'].isin(popular_subjects)
popular_subject_df = subject_df[is_popular]
# Number of subjects that passed the threshold
len(popular_subjects)

16

In [10]:
# Hand the popular-subject rows to the project helper, targeting the
# license-vs-subject figure directory (for vega-lite)
path = os.path.join(
    'figure',
    'license-vs-subject',
    'vega-lite-data.json',
)
utilities.df_to_vega_lite(popular_subject_df, path)