# Process Unpaywall data to determine the access status of articles

In [1]:
import json
import gzip

import pandas

In [2]:
def read_unpaywall_snapshot(path, doi_subset=None):
    """
    Yield article records from an Unpaywall snapshot in JSON Lines format.

    Parameters
    ----------
    path : str or path-like
        Snapshot location. Paths ending in '.gz' are read through gzip;
        anything else is opened as a plain text file.
    doi_subset : container of str, optional
        When given, only records whose lowercased DOI is a member of this
        container are yielded, so its members must already be lowercase.
        A set keeps the per-line membership test cheap. The default, None,
        yields every record.

    Yields
    ------
    dict
        The parsed record with an added boolean 'journal_access' key that is
        True when at least one entry of 'oa_locations' has a 'host_type' of
        'publisher'.
    """
    opener = gzip.open if str(path).endswith('.gz') else open
    # Decode as UTF-8 explicitly rather than relying on the platform's
    # default text encoding.
    with opener(path, 'rt', encoding='utf-8') as read_file:
        for line in read_file:
            if not line.strip():
                # Skip blank lines instead of failing in json.loads.
                continue
            row = json.loads(line)
            if doi_subset is not None and row['doi'].lower() not in doi_subset:
                continue
            # Treat a missing or null 'oa_locations' as an empty list
            # instead of raising.
            locations = row.get('oa_locations') or []
            row['journal_access'] = any(
                location.get('host_type') == 'publisher'
                for location in locations
            )
            yield row

In [3]:
# Read every record of the snapshot subset into an in-memory list.
path = 'downloads/unpaywall/unpaywall_snapshot_2018-09-24T232615-subset.jsonl'
articles = [*read_unpaywall_snapshot(path)]

In [4]:
# Build a table of access flags per article: Unpaywall's `is_oa` field is
# exposed as `unpaywall_access`, only the DOI and the three access columns are
# kept, and DOIs are lowercased.
article_df = pandas.DataFrame(articles).rename(columns={'is_oa': 'unpaywall_access'})
article_df = article_df[['doi', 'unpaywall_access', 'journal_access', 'journal_is_oa']].assign(
    doi=lambda frame: frame['doi'].str.lower(),
)
article_df.head()

Unnamed: 0,doi,unpaywall_access,journal_access,journal_is_oa
0,10.1080/21645515.2017.1330236,True,False,False
1,10.1371/journal.pone.0061390.g002,True,True,True
2,10.1371/journal.pone.0082270.t001,True,True,True
3,10.1088/0004-6256/135/4/1201,True,True,False
4,10.1088/0022-3727/48/43/435001,False,False,False


In [5]:
# Articles flagged as open access by Unpaywall (`is_oa`) that have no
# OA location with host_type 'publisher'.
article_df.query("unpaywall_access==True and journal_access==False")

Unnamed: 0,doi,unpaywall_access,journal_access,journal_is_oa
0,10.1080/21645515.2017.1330236,True,False,False
49,10.1016/j.drugalcdep.2016.08.636,True,False,False
54,10.1109/icecs.2001.957596,True,False,False
