# Process Unpaywall data to determine the access status of articles

In [1]:
import collections
import csv
import gzip
import itertools
import json
import lzma

import pandas

## Read and process jsonl snapshot

In [2]:
def _process_unpaywall_record(row):
    row['doi'] = row['doi'].lower()
    row['journal_access'] = False
    row['journal_access_license'] = None
    row['journal_access_evidence'] = None
    for location in row['oa_locations']:
        if location['host_type'] == 'publisher':
            row['journal_access'] = True
            row['journal_access_license'] = location.get('license')
            row['journal_access_evidence'] = location.get('evidence')

def read_unpaywall_snapshot(path, doi_subset=None):
    """
    Stream records from an Unpaywall snapshot in JSON Lines format.

    https://unpaywall.org/data-format
    https://unpaywall.org/products/snapshot

    `path` is opened with gzip when its name ends in '.gz' and as plain text
    otherwise. When `doi_subset` is not None, only records whose lowercased
    DOI is contained in it are kept. For large inputs pass a set, since
    membership is tested once per record.

    Yields one dict per retained line, after it has been annotated in place
    by `_process_unpaywall_record`. Blank lines are skipped.
    """
    opener = gzip.open if str(path).endswith('.gz') else open
    # Decode as UTF-8 explicitly instead of relying on the platform's
    # locale encoding.
    with opener(path, 'rt', encoding='utf-8') as read_file:
        for line in read_file:
            if not line.strip():
                # A blank line (e.g. a trailing newline) is not a record;
                # json.loads would raise on it.
                continue
            row = json.loads(line)
            if doi_subset is not None and row['doi'].lower() not in doi_subset:
                continue
            _process_unpaywall_record(row)
            yield row

# Record key -> output column name for the fields kept in the reduced table.
# The entry order here is preserved and doubles as the output column order.
record_renamer = dict(
    doi='doi',
    is_oa='unpaywall_access',
    journal_access='journal_access',
    journal_access_evidence='journal_access_evidence',
    journal_access_license='journal_access_license',
    journal_is_oa='journal_fully_oa',
)

def _reduce_unpaywall_record(row):
    """
    Project a processed record onto the columns named in `record_renamer`.

    Returns a new OrderedDict keyed by the renamed columns, in
    `record_renamer` order. Boolean values are converted to 0/1 integers;
    every other value is passed through unchanged. A key listed in
    `record_renamer` but absent from `row` raises KeyError.
    """
    def _flag_to_int(value):
        # Booleans are stored as 0/1; everything else is kept as-is.
        return int(value) if isinstance(value, bool) else value

    return collections.OrderedDict(
        (column, _flag_to_int(row[field]))
        for field, column in record_renamer.items()
    )

In [3]:
# Input path
path_jsonl = 'downloads/unpaywall/unpaywall_snapshot_2018-09-24T232615.jsonl.gz'
# Output path
path_tsv = 'data/02.unpaywall-access.tsv.xz'

# Both steps are lazy (a generator wrapped in map), so records are streamed
# from the snapshot to the writer one at a time rather than held in memory.
articles = read_unpaywall_snapshot(path_jsonl)
articles = map(_reduce_unpaywall_record, articles)
# Uncomment following line for development
# articles = itertools.islice(articles, 100)
# newline='' hands line-ending control to the csv module, as its documentation
# requires; without it the writer's '\r\n' terminator would be translated
# again on platforms whose os.linesep is not '\n'. The encoding is pinned so
# the output does not depend on the platform's locale.
with lzma.open(path_tsv, 'wt', encoding='utf-8', newline='') as write_file:
    writer = csv.DictWriter(write_file, delimiter='\t', fieldnames=list(record_renamer.values()))
    writer.writeheader()
    writer.writerows(articles)

## Test reading `02.unpaywall-access.tsv.xz`

In [4]:
# Read the TSV at path_tsv back into a DataFrame as a sanity check; no
# compression argument is passed, so pandas infers it from the file name.
# The trailing expression displays the last rows of the table.
article_df = pandas.read_csv(path_tsv, sep='\t')
article_df.tail()

Unnamed: 0,doi,unpaywall_access,journal_access,journal_access_evidence,journal_access_license,journal_fully_oa
99940224,10.1002/nadc.19970450625,0,0,,,0
99940225,10.1371/journal.pbio.1001712.g004,1,1,oa journal (via publisher name),,1
99940226,10.1364/opex.12.002220.m005,0,0,,,0
99940227,10.2105/ajph.10.6.536,1,1,open (via free pdf),,0
99940228,10.1002/asi.20570,1,0,,,0


In [5]:
# Green OA only articles: Unpaywall reports the article as open
# (unpaywall_access==1) but none of its OA locations is publisher-hosted
# (journal_access==0). Shows the first three matches.
article_df.query("journal_access==0 and unpaywall_access==1").head(3)

Unnamed: 0,doi,unpaywall_access,journal_access,journal_access_evidence,journal_access_license,journal_fully_oa
0,10.1080/21645515.2017.1330236,1,0,,,0
49,10.1016/j.drugalcdep.2016.08.636,1,0,,,0
54,10.1109/icecs.2001.957596,1,0,,,0


In [6]:
# Hybrid/Bronze OA articles: a publisher-hosted OA location exists
# (journal_access==1) while the journal itself is not flagged as fully OA
# (journal_fully_oa==0). Shows the first three matches.
article_df.query("journal_access==1 and journal_fully_oa==0").head(3)

Unnamed: 0,doi,unpaywall_access,journal_access,journal_access_evidence,journal_access_license,journal_fully_oa
3,10.1088/0004-6256/135/4/1201,1,1,open (via free pdf),,0
5,10.2478/v10172-012-0058-8,1,1,open (via free pdf),,0
13,10.1038/313176c0,1,1,open (via free pdf),,0
