# Compile journal homepage links from Scopus

+ https://github.com/dhimmel/journalmetrics/issues/2
+ https://twitter.com/dhimmel/status/871007923010490369
+ https://api.elsevier.com/documentation/SerialTitleAPI.wadl#N102B5

In [1]:
import os
import xml.etree.ElementTree
import shelve
import logging
import json
import lzma

import pandas
import requests

## Read Scopus ISSNs

In [2]:
# Load the tab-separated table of ISSNs and preview the first rows.
path = os.path.join('data', 'issn.tsv')
issn_df = pandas.read_csv(path, sep='\t')
issn_df.head(n=3)

Unnamed: 0,scopus_id,issn_type,issn
0,12000,print,1527-6228
1,12001,print,0022-5002
2,12002,electronic,1520-6696


In [3]:
# Distinct ISSNs, sorted so that the query order is deterministic.
issns = list(set(issn_df['issn']))
issns.sort()
len(issns)

47154

## Query Scopus API for title metadata

Responses are cached in a `shelve` database so that queries are not repeated across executions.

In [4]:
# Process-wide fallback memo, used only when the caller supplies no cache.
# Replaces the former mutable default argument with an explicit, named object.
_DEFAULT_RESPONSE_CACHE = {}


def get_response_text(issn, cache=None, timeout=60):
    """
    Return the response body of the serial title API lookup for `issn`.

    Parameters
    ----------
    issn : str
        ISSN interpolated into the request URL.
    cache : mutable mapping, optional
        Mapping from ISSN to response text (a dict or an open shelve).
        A hit is returned without issuing a request. When omitted, a
        module-level dict shared by all calls is used.
    timeout : float or tuple, optional
        Passed to `requests.get` so a stalled connection raises instead
        of blocking forever.

    Returns
    -------
    str
        The response text, returned for unsuccessful responses as well.

    Notes
    -----
    Unsuccessful responses are logged at INFO level. Responses with status
    429 or 5xx are treated as transient and are NOT written to the cache,
    so that a later execution retries them instead of reusing the error
    body. All other responses (including e.g. 404) are cached.
    """
    if cache is None:
        cache = _DEFAULT_RESPONSE_CACHE
    if issn in cache:
        return cache[issn]
    url = f'https://api.elsevier.com/content/serial/title/issn/{issn}'
    params = {
        'httpAccept': 'text/xml',
    }
    response = requests.get(url, params=params, timeout=timeout)
    if not response.ok:
        logging.info(f'{response.url} returned {response.status_code}:\n{response.text}')
    text = response.text
    # Rate limiting and server-side errors say nothing about the ISSN itself;
    # persisting them would poison the cache across executions.
    is_transient = response.status_code == 429 or response.status_code >= 500
    if not is_transient:
        cache[issn] = text
    return text

def get_homepage(text):
    """
    Extract the homepage URL from an XML response body.

    Parameters
    ----------
    text : str
        XML document to search.

    Returns
    -------
    str or None
        The `href` attribute of the first `entry/link` element whose `ref`
        attribute is "homepage". None when `text` is empty, is not
        well-formed XML (for instance a non-XML error page), or contains
        no such element.
    """
    if not text:
        return None
    try:
        tree = xml.etree.ElementTree.fromstring(text)
    except xml.etree.ElementTree.ParseError as error:
        # A malformed body must not abort a loop over many responses;
        # report it and treat it as "no homepage".
        logging.info(f'could not parse response as XML: {error}')
        return None
    elem = tree.find('entry/link[@ref="homepage"]')
    return None if elem is None else elem.get('href')

In [5]:
# Persistent ISSN -> response-text cache backed by a shelve on disk.
path = os.path.join('data', 'homepages', 'issn-scopus-api.shelve')
# shelve.open does not create parent directories; make sure the output
# directory exists so a fresh checkout can run this cell.
os.makedirs(os.path.dirname(path), exist_ok=True)
cache = shelve.open(path, protocol=4)

In [6]:
# Map each ISSN to its homepage URL, skipping ISSNs whose response is
# empty or yields no homepage.
issn_to_url = {}
for issn in issns:
    text = get_response_text(issn, cache)
    if text:
        url = get_homepage(text)
        if url:
            issn_to_url[issn] = url
len(issn_to_url)

27392

In [7]:
# Number of ISSNs that have a response stored in the cache
len(cache)

47154

In [8]:
# Copy the cache contents into a plain in-memory dict for later export.
cache_to_export = {issn: text for issn, text in cache.items()}

In [9]:
# Close the shelve; its contents were already copied into cache_to_export
cache.close()

## Create homepage mapping TSVs

In [10]:
# One row per ISSN that has a homepage URL.
issn_homepage_df = pandas.DataFrame(
    data=list(issn_to_url.items()),
    columns=['issn', 'homepage'],
)
issn_homepage_df.head(n=2)

Unnamed: 0,issn,homepage
0,0001-0782,http://www.acm.org/pubs/contents/journals/cacm/
1,0001-1452,http://www.aiaa.org/content.cfm?pageid=322&lup...


In [11]:
# Attach homepages to Scopus IDs through the shared issn column, then keep
# each distinct (scopus_id, homepage) pair once.
scopus_homepage_df = (
    issn_df
    .merge(issn_homepage_df, on='issn', how='inner')
    [['scopus_id', 'homepage']]
    .drop_duplicates()
)
scopus_homepage_df.head(n=3)

Unnamed: 0,scopus_id,homepage
0,12000,http://jtc.colstate.edu
1,12001,http://seab.envmed.rochester.edu/jeab/index.html
2,12002,http://www.interscience.wiley.com/jpages/0022-...


In [12]:
# Number of distinct (scopus_id, homepage) rows
len(scopus_homepage_df)

20992

In [13]:
# Journals with multiple homepage URLs.
# scopus_homepage_df is already unique on (scopus_id, homepage), so a
# whole-row duplicate check can never match; a journal has several
# homepages exactly when its scopus_id appears on more than one row.
scopus_homepage_df[scopus_homepage_df.duplicated(subset='scopus_id', keep=False)]

Unnamed: 0,scopus_id,homepage


In [14]:
# Write both homepage mappings as tab-separated files without the index.
for filename, frame in [
    ('issn-homepages.tsv', issn_homepage_df),
    ('scopus-homepages.tsv', scopus_homepage_df),
]:
    path = os.path.join('data', 'homepages', filename)
    frame.to_csv(path, sep='\t', index=False)

## Export cache to compressed JSON

In [15]:
# Serialize the cached responses as xz-compressed JSON with sorted keys.
path = os.path.join('data', 'homepages', 'issn-scopus-api.json.xz')
with lzma.open(path, mode='wt') as xz_file:
    json.dump(cache_to_export, fp=xz_file, sort_keys=True, indent=2)