# Analyse license choices for bioRxiv preprints

In [1]:
import json
import os

import pandas
import altair

import utilities

## Read data

In [2]:
# One row per license, ordered according to https://creativecommons.org/licenses/
# Columns: license label, plot color, numeric score (CC BY highest, 'None' lowest).
_license_table = [
    ('CC BY', '#F68212', 5),
    ('CC BY-ND', '#AC5B0C', 3),
    ('CC BY-NC', '#934E0A', 3),
    ('CC BY-NC-ND', '#492705', 2),
    ('None', '#000000', 1),
]

# Derive the parallel structures from the single table so they cannot drift apart.
licenses = [label for label, _color, _score in _license_table]
license_colors = [color for _label, color, _score in _license_table]
license_scores = {label: score for label, _color, score in _license_table}

In [3]:
# Load the preprint table (one row per DOI) and make License an ordered-by-list categorical.
path = os.path.join('data', 'preprints.tsv')
# The license column appears to use the literal string 'None' for unlicensed
# preprints (it is one of the categories in `licenses`). Current pandas lists
# 'None' among the default NA markers, which would turn those rows into NaN and
# silently drop them from the 'None' category. Restrict NA detection to empty
# fields so the label survives regardless of pandas version.
preprint_df = pandas.read_table(
    path,
    parse_dates=['Date'],
    keep_default_na=False,
    na_values=[''],
)
# Values not present in `licenses` become NaN in the categorical.
preprint_df.License = pandas.Categorical(preprint_df.License, licenses)
preprint_df.head(2)

Unnamed: 0,DOI,Date,License
0,10.1101/000026,2014-09-08,CC BY
1,10.1101/000042,2013-12-01,CC BY


In [4]:
# Table of licensing choices
# size() counts rows per license directly (no Python-level apply per group).
# observed=False is spelled out so every category in `licenses` gets a row,
# independent of the pandas default for categorical group keys.
counts = preprint_df.groupby('License', observed=False).size()
counts.name = 'Count'
count_df = counts.reset_index()
# Fraction of all preprints carrying each license.
count_df['Percent'] = count_df['Count'] / count_df['Count'].sum()
count_df.License = pandas.Categorical(count_df.License, licenses)
# Display-only copy: formatted strings for Count/Percent plus the license score.
# count_df itself keeps numeric Count/Percent for the cells that follow.
count_df.assign(
    Count=count_df['Count'].map('{:,}'.format),
    Percent=count_df['Percent'].map('{:.1%}'.format),
    Score=count_df['License'].map(license_scores),
)

Unnamed: 0,License,Count,Percent,Score
0,CC BY,1239,17.8%,5
1,CC BY-ND,498,7.1%,3
2,CC BY-NC,590,8.5%,3
3,CC BY-NC-ND,2569,36.8%,2
4,,2076,29.8%,1


In [5]:
# Preprints that forbid derivatives
# The set covers the two ND licenses and the 'None' (no license) label.
ND_licenses = {'CC BY-ND', 'CC BY-NC-ND', 'None'}
forbids_derivatives = count_df['License'].isin(ND_licenses)
# Total Count and Percent over the selected license rows.
count_df.loc[forbids_derivatives].sum(numeric_only=True)

Count      5143.000000
Percent       0.737665
dtype: float64

In [6]:
# Preprints that forbid commercial use
# Named NC_licenses: this set was previously assigned to ND_licenses, which
# mislabelled it and overwrote the no-derivatives set defined for the
# preceding summary.
NC_licenses = {'CC BY-NC', 'CC BY-NC-ND', 'None'}
# Total Count and Percent over the selected license rows.
count_df.query("License in @NC_licenses").sum(numeric_only=True)

Count      5235.000000
Percent       0.750861
dtype: float64

## License distribution over time

In [7]:
# Export preprint_df for the license-vs-time figure.
# NOTE(review): df_to_vega_lite comes from the project-local `utilities` module,
# which is not shown here; presumably it serialises the frame to JSON at `path`
# — confirm against that module.
path = os.path.join('figure', 'license-vs-time', 'vega-lite-data.json')
utilities.df_to_vega_lite(preprint_df, path)

## License distribution by subject

In [8]:
# Attach subject labels to the preprint table.
path = os.path.join('data', 'subjects.tsv')
subject_table = pandas.read_table(path)
# No key is given, so pandas joins on whichever columns the two tables share
# (default inner join).
subject_df = preprint_df.merge(subject_table)
subject_df.tail(2)

Unnamed: 0,DOI,Date,License,Subject
6980,10.1101/091272,2016-12-02,,Cell Biology
6981,10.1101/091280,2016-12-02,,Bioinformatics


In [9]:
# Subset subject_df for subjects with 100+ preprints
subject_counts = subject_df['Subject'].value_counts()
# value_counts() is indexed by subject, so filtering its index by the counts
# yields the names of the popular subjects.
popular_subjects = list(subject_counts.index[subject_counts >= 100])
is_popular = subject_df['Subject'].isin(popular_subjects)
popular_subject_df = subject_df[is_popular]
# Number of subjects that met the threshold.
len(popular_subjects)

16

In [10]:
# Export for vega-lite
# Only the popular-subject subset is passed on for the license-vs-subject figure.
# NOTE(review): df_to_vega_lite is defined in the project-local `utilities`
# module (not shown here); presumably it writes JSON to `path` — confirm there.
path = os.path.join('figure', 'license-vs-subject', 'vega-lite-data.json')
utilities.df_to_vega_lite(popular_subject_df, path)

## Licensing by author

In [11]:
# Attach author names to the preprint table.
path = os.path.join('data', 'authors.tsv')
author_table = pandas.read_table(path)
# No key is given, so pandas joins on whichever columns the two tables share
# (default inner join).
author_df = preprint_df.merge(author_table)

In [12]:
# Add a numeric `score` column by looking up each row's License in license_scores.
author_df['score'] = author_df.License.map(license_scores)

In [13]:
# Preview the last two rows of author_df.
author_df.tail(2)

Unnamed: 0,DOI,Date,License,Author,Standard_Author,score
41452,10.1101/091280,2016-12-02,,Mathieu Videlier,Mathieu Videlier,1
41453,10.1101/091280,2016-12-02,,Nicolas Pollet,Nicolas Pollet,1


In [14]:
def summarize(df):
    """Summarize one group of rows as a preprint count and a total score.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to summarize; must contain a ``score`` column.

    Returns
    -------
    pandas.Series
        Two entries, in this order: ``Preprints`` (number of rows in ``df``)
        and ``Score`` (sum of the ``score`` column).
    """
    # Build the row in a single step. Growing an empty, dtype-less
    # ``pandas.Series()`` key by key triggers a deprecation warning on
    # pandas 1.x and makes the resulting dtype depend on the pandas version.
    return pandas.Series({
        'Preprints': len(df),
        # skipna=False keeps the builtin sum()'s behaviour: a missing score
        # yields NaN rather than being silently ignored.
        'Score': df['score'].sum(skipna=False),
    })

# One summary row per standardized author name; reset_index() turns the
# author name from the group index back into an ordinary column.
rows_by_author = author_df.groupby('Standard_Author')
author_score_df = rows_by_author.apply(summarize).reset_index()

In [15]:
# Authors with the highest total Score first; head() shows the top five rows.
author_score_df.sort_values('Score', ascending=False).head()

Unnamed: 0,Standard_Author,Preprints,Score
17572,Mark Daly,34,77
18856,Michael Inouye,16,74
12348,Jeffrey Leek,15,69
9823,Graham Coop,15,67
3249,Benjamin Neale,28,62


In [16]:
# Save the per-author summary as tab-separated text, without the row index.
path = os.path.join('data', 'author-scores.tsv')
author_score_df.to_csv(path, sep='\t', index=False)

In [17]:
# Export the per-author summary a second time, to a .json path.
# NOTE(review): df_to_datatables is defined in the project-local `utilities`
# module (not shown here); presumably it writes JSON in the layout a DataTables
# front end expects — confirm there.
path = os.path.join('data', 'author-scores.json')
utilities.df_to_datatables(author_score_df, path)