# Analyse license choices for bioRxiv preprints

In [1]:
import json
import os

import pandas
import altair

import utilities

## Read data

In [2]:
# License categories, ordered according to https://creativecommons.org/licenses/
# with 'None' listed last.
licenses = [
    'CC BY',
    'CC BY-ND',
    'CC BY-NC',
    'CC BY-NC-ND',
    'None',
]

# One hex color per entry of `licenses` (same length; presumably matched by
# position when plotting -- confirm against the figure specs).
license_colors = [
    '#F68212',
    '#AC5B0C',
    '#934E0A',
    '#492705',
    '#000000',
]

# Integer score assigned to each license, given in the same order as
# `licenses`. 'CC BY' scores highest and 'None' lowest.
license_scores = dict(zip(licenses, (5, 3, 3, 2, 1)))

In [3]:
# Load the per-preprint table (tab-separated) and parse the Date column.
path = os.path.join('data', 'preprints.tsv')
preprint_df = pandas.read_table(
    path,
    parse_dates=['Date'],
    # 'None' is a genuine license category in this analysis. Recent pandas
    # releases (2.0+) list the string "None" among the default NA markers,
    # which would silently turn those rows into missing values, so only
    # empty fields are treated as NA here.
    keep_default_na=False,
    na_values=[''],
)
# Fix the category set and order to `licenses`; values outside that list
# become NaN.
preprint_df['License'] = pandas.Categorical(
    preprint_df['License'], categories=licenses)
preprint_df.head(2)

Unnamed: 0,DOI,Date,License
0,10.1101/000026,2014-09-08,CC BY
1,10.1101/000042,2013-12-01,CC BY


In [4]:
# Table of licensing choices: one row per license with its preprint count,
# share of all preprints, and score.
# size() is the built-in per-group row count, so there is no need to call
# Python's len() once per group through apply().
counts = preprint_df.groupby('License').size().reset_index(name='Count')
# Compute the share from the numeric counts before they are turned into
# display strings below.
counts['Percent'] = counts['Count'] / counts['Count'].sum()
counts['Count'] = counts['Count'].map('{:,}'.format)
counts['Percent'] = counts['Percent'].map('{:.1%}'.format)
counts['Score'] = counts['License'].map(license_scores)
# Keep License as a categorical ordered like `licenses`.
counts['License'] = pandas.Categorical(counts['License'], categories=licenses)
counts

Unnamed: 0,License,Count,Percent,Score
0,CC BY,1229,17.8%,5
1,CC BY-ND,493,7.1%,3
2,CC BY-NC,583,8.5%,3
3,CC BY-NC-ND,2539,36.8%,2
4,,2054,29.8%,1


## License distribution over time

In [5]:
# Hand the full preprint table to the project helper, which presumably
# writes it as the data file for the license-vs-time figure (see utilities).
time_figure_dir = os.path.join('figure', 'license-vs-time')
path = os.path.join(time_figure_dir, 'vega-lite-data.json')
utilities.df_to_vega_lite(preprint_df, path)

## License distribution by subject

In [6]:
# Attach subjects to the preprint table. No join key is given, so pandas
# merges on whatever columns the two tables have in common (inner join).
path = os.path.join('data', 'subjects.tsv')
subject_df = pandas.merge(preprint_df, pandas.read_table(path))
subject_df.tail(2)

Unnamed: 0,DOI,Date,License,Subject
6907,10.1101/090209,2016-11-28,CC BY-NC-ND,Neuroscience
6908,10.1101/090225,2016-11-28,CC BY-NC-ND,Ecology


In [7]:
# Keep only the rows whose subject has at least 100 preprints.
subject_counts = subject_df['Subject'].value_counts()
is_popular = subject_counts >= 100
popular_subjects = subject_counts.loc[is_popular].index.tolist()
popular_subject_df = subject_df[subject_df['Subject'].isin(popular_subjects)]
# Number of subjects that passed the threshold.
len(popular_subjects)

16

In [8]:
# Export for vega-lite: pass the popular-subject subset to the project
# helper together with the target path of the license-vs-subject figure data.
subject_figure_dir = os.path.join('figure', 'license-vs-subject')
path = os.path.join(subject_figure_dir, 'vega-lite-data.json')
utilities.df_to_vega_lite(popular_subject_df, path)

## Licensing by author

In [9]:
# Attach authors to the preprint table. No join key is given, so pandas
# merges on the columns shared by both tables (inner join).
path = os.path.join('data', 'authors.tsv')
author_df = pandas.merge(preprint_df, pandas.read_table(path))

In [10]:
# Look up each row's license in license_scores and store it as `score`.
author_df['score'] = author_df['License'].map(license_scores)

In [11]:
# Preview the last two rows of the author table.
author_df.iloc[-2:]

Unnamed: 0,DOI,Date,License,Author,Standard_Author,score
41025,10.1101/090209,2016-11-28,CC BY-NC-ND,Kathleen A Martin,Kathleen Martin,2
41026,10.1101/090225,2016-11-28,CC BY-NC-ND,David W Armitage,David Armitage,2


In [12]:
def summarize(df):
    """Summarize a group of rows as a two-entry Series.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of one group; must provide a ``score`` column.

    Returns
    -------
    pandas.Series
        ``Preprints`` is the number of rows in ``df`` and ``Score`` is the
        sum of ``df['score']``.
    """
    # Build the Series in one step rather than growing an empty, dtype-less
    # ``pandas.Series()`` by item assignment (the bare constructor is
    # deprecated without an explicit dtype).
    return pandas.Series({
        'Preprints': len(df),
        # skipna=False matches the builtin sum(): a missing score makes the
        # total NaN instead of being silently dropped.
        'Score': df['score'].sum(skipna=False),
    })

# One row per standardized author name, with the columns produced by
# summarize(); reset_index() turns the author name back into a column.
rows_by_author = author_df.groupby('Standard_Author')
author_score_df = rows_by_author.apply(summarize)
author_score_df = author_score_df.reset_index()

In [13]:
# Show the five authors with the highest total score.
author_score_df.sort_values(by='Score', ascending=False).iloc[:5]

Unnamed: 0,Standard_Author,Preprints,Score
17411,Mark Daly,34,77
18682,Michael Inouye,16,74
12224,Jeffrey Leek,15,69
9723,Graham Coop,15,67
3220,Benjamin Neale,28,62


In [14]:
# Save the per-author summary as a tab-separated file, without the index.
path = os.path.join('data', 'author-scores.tsv')
author_score_df.to_csv(path_or_buf=path, index=False, sep='\t')

In [15]:
# Pass the per-author summary and the JSON target path to the project
# helper (presumably writes a DataTables-compatible file -- see utilities).
datatables_path_parts = ('data', 'author-scores.json')
path = os.path.join(*datatables_path_parts)
utilities.df_to_datatables(author_score_df, path)