# Analyse license choices for bioRxiv preprints

In [1]:
import json
import os

import pandas
import altair

import utilities

## Read data

In [2]:
# Ordered according to https://creativecommons.org/licenses/
licenses = ['CC BY', 'CC BY-ND', 'CC BY-NC', 'CC BY-NC-ND', 'None']
license_colors = ['#F68212', '#AC5B0C', '#934E0A', '#492705', '#000000']

In [3]:
path = os.path.join('data', 'preprints.tsv')
preprint_df = pandas.read_table(path, parse_dates=['Date'])
preprint_df.License = pandas.Categorical(preprint_df.License, licenses)
preprint_df.head(2)

Unnamed: 0,DOI,Date,License
0,10.1101/000026,2014-09-08,CC BY
1,10.1101/000042,2013-12-01,CC BY


## License distribution over time

In [4]:
path = os.path.join('figure', 'license-vs-time', 'vega-lite-data.json')
utilities.df_to_vega_lite(preprint_df, path)

## License distribution by subject

In [5]:
path = os.path.join('data', 'subjects.tsv')
subject_df = preprint_df.merge(
    pandas.read_table(path)
)
subject_df.tail(2)

Unnamed: 0,DOI,Date,License,Subject
6907,10.1101/090209,2016-11-28,CC BY-NC-ND,Neuroscience
6908,10.1101/090225,2016-11-28,CC BY-NC-ND,Ecology


In [6]:
# Subset subject_df for subjects with 100+ preprints
subject_counts = subject_df.Subject.value_counts()
popular_subjects = subject_counts[subject_counts >= 100].index.tolist()
popular_subject_df = subject_df.query("Subject in @popular_subjects")
len(popular_subjects)

16

In [7]:
# Export for vega-lite
path = os.path.join('figure', 'license-vs-subject', 'vega-lite-data.json')
utilities.df_to_vega_lite(popular_subject_df, path)

## Licensing by author

In [8]:
path = os.path.join('data', 'authors.tsv')
author_df = preprint_df.merge(
    pandas.read_table(path)
)
author_df.head()

Unnamed: 0,DOI,Date,License,Author
0,10.1101/000026,2014-09-08,CC BY,Graham Coop
1,10.1101/000026,2014-09-08,CC BY,Jeremy J Berg
2,10.1101/000042,2013-12-01,CC BY,Arvind Narayanan
3,10.1101/000042,2013-12-01,CC BY,Yaniv Erlich
4,10.1101/000067,2013-11-07,,Arthur H Shockley


In [9]:
license_scores = {
    'CC BY': 5,
    'CC BY-ND': 3,
    'CC BY-NC': 3,
    'CC BY-NC-ND': 2,
    'None': 1,
}
author_df['score'] = author_df.License.map(license_scores)

In [10]:
def summarize(df):
    row = pandas.Series()
    row['Preprints'] = len(df)
    row['Score'] = sum(df.score)
    return row

author_score_df = author_df.groupby('Author').apply(summarize).reset_index()

In [11]:
author_score_df.sort_values('Score', ascending=False).head()

Unnamed: 0,Author,Preprints,Score
20158,Michael Inouye,16,74
10491,Graham Coop,15,67
28637,Timothee Poisot,12,54
4554,Casey S Greene,11,48
24311,Richard Durbin,17,48


In [12]:
path = os.path.join('data', 'author-scores.tsv')
author_score_df.to_csv(path, sep='\t', index=False)

In [13]:
path = os.path.join('data', 'author-scores.json')
utilities.df_to_datatables(author_score_df, path)