# Process Unpaywall data to determine the access status of articles

In [1]:
import json
import gzip

import pandas

In [2]:
def _process_unpaywall_record(row):
    row['journal_access'] = False
    row['journal_access_license'] = None
    row['journal_access_evidence'] = None
    for location in row['oa_locations']:
        if location['host_type'] == 'publisher':
            row['journal_access'] = True
            row['journal_access_license'] = location.get('license')
            row['journal_access_evidence'] = location.get('evidence')

def read_unpaywall_snapshot(path, doi_subset=None):
    """
    Lazily yield processed records from an Unpaywall snapshot file.

    The file is read as JSON Lines (one JSON object per line). It is opened
    with gzip when `path` ends in '.gz' and as plain text otherwise. Blank
    lines are skipped. Every yielded record has been passed through
    _process_unpaywall_record, which adds the journal_access* keys in place.

    path: location of the snapshot file (str or path-like).
    doi_subset: optional container of DOIs to keep. Membership is tested with
        row['doi'].lower(), so the container should hold lowercase DOIs.
        When None, every record is yielded.

    https://unpaywall.org/data-format
    https://unpaywall.org/products/snapshot
    """
    opener = gzip.open if str(path).endswith('.gz') else open
    # Decode as UTF-8 explicitly instead of relying on the platform default.
    with opener(path, 'rt', encoding='utf-8') as read_file:
        for line in read_file:
            if not line.strip():
                # Tolerate blank lines, e.g. an extra newline at end of file.
                continue
            row = json.loads(line)
            if doi_subset is None or row['doi'].lower() in doi_subset:
                _process_unpaywall_record(row)
                yield row

In [3]:
# Snapshot subset to load (plain, uncompressed JSON Lines file)
path = 'downloads/unpaywall/unpaywall_snapshot_2018-09-24T232615-subset.jsonl'
# Materialize every processed record; no DOI filter is applied
articles = [*read_unpaywall_snapshot(path)]

In [4]:
# Build the per-DOI access table: rename Unpaywall's fields, keep only the
# access-related columns, lowercase the DOIs and order the rows by DOI.
article_df = (
    pandas.DataFrame(articles)
    .rename(columns={'is_oa': 'unpaywall_access', 'journal_is_oa': 'journal_fully_oa'})
    [[
        'doi',
        'unpaywall_access',
        'journal_access',
        'journal_access_evidence',
        'journal_access_license',
        'journal_fully_oa',
    ]]
    .assign(doi=lambda frame: frame['doi'].str.lower())
    .sort_values(by='doi')
)
# Each DOI must appear exactly once after lowercasing
assert not article_df['doi'].duplicated().any()
article_df.head(10)

Unnamed: 0,doi,unpaywall_access,journal_access,journal_access_evidence,journal_access_license,journal_fully_oa
87,10.1001/archneur.1965.00470040029004,False,False,,,False
85,10.1002/1615-1003(200105)30:3<234::aid-pauz234...,False,False,,,False
86,10.1002/adsc.201300332,False,False,,,False
50,10.1002/au.3650020210,False,False,,,False
72,10.1007/978-3-0348-6310-0_43,False,False,,,False
96,10.1007/978-3-319-68675-2_2,False,False,,,False
94,10.1007/978-3-322-89635-3_1,False,False,,,False
95,10.1007/978-3-476-05018-2,False,False,,,False
93,10.1007/978-3-540-29925-7_7075,False,False,,,False
97,10.1007/978-3-540-31751-7_7,False,False,,,False


In [5]:
# Green OA only articles: Unpaywall reports the article as open (is_oa),
# but none of its OA locations is hosted by the publisher.
article_df.query("journal_access==False and unpaywall_access==True").head(2)

Unnamed: 0,doi,unpaywall_access,journal_access,journal_access_evidence,journal_access_license,journal_fully_oa
49,10.1016/j.drugalcdep.2016.08.636,True,False,,,False
0,10.1080/21645515.2017.1330236,True,False,,,False


In [6]:
# Hybrid/Bronze OA articles: a publisher-hosted OA location exists,
# but the journal itself is not fully open access (journal_is_oa is False).
article_df.query("journal_access==True and journal_fully_oa==False").head(3)

Unnamed: 0,doi,unpaywall_access,journal_access,journal_access_evidence,journal_access_license,journal_fully_oa
58,10.1016/s0006-3495(97)78333-4,True,True,open (via crossref license),elsevier-specific: oa user license,False
13,10.1038/313176c0,True,True,open (via free pdf),,False
64,10.1080/07391102.2011.10524954,True,True,open (via page says Open Access),implied-oa,False


In [7]:
# Export the access table as tab-separated values, without the index column.
# Compression is passed explicitly so the file is xz-compressed regardless of
# whether the installed pandas infers it from the '.xz' suffix.
article_df.to_csv('data/02.unpaywall-access.tsv.xz', sep='\t', index=False, compression='xz')