In [1]:
import asyncio
import json
import os
import re
import time
import xml.etree.ElementTree as ET
from collections import defaultdict, deque
from datetime import datetime
from glob import glob
from pathlib import Path

import aiohttp
import pandas as pd
import requests
from lxml import etree
from pandas import DataFrame

This code uses asynchronous Python libraries to query the Scopus APIs efficiently and retrieve the following:

1. Affiliation codes associated with the string "George Washington University"

2. Article search results where any of the authors are tagged with any of those affiliations

3. Author profile information for each author in the above associated with those affiliations

Results are retrieved in XML format and parsed using the lxml library. 

In [2]:
# This code is necessary to make the aiohttp requests in rate-limited fashion.
# The Scopus APIs have rate limits associated with them; exceeding those limits will raise errors.
# From https://github.com/hallazzang/asyncio-throttle
class Throttler:
    '''Async context manager that limits how often a block of code may be entered.

    At most ``rate_limit`` acquisitions are allowed within any sliding window of
    ``period`` seconds. A caller that finds the window full polls every
    ``retry_interval`` seconds until a slot frees up.

    Raises ValueError if ``rate_limit`` is not greater than 0.
    '''

    def __init__(self, rate_limit, period=1.0, retry_interval=0.01):
        # A non-positive limit can never be satisfied: acquire() would poll forever
        if rate_limit <= 0:
            raise ValueError('rate_limit must be greater than 0')
        self.rate_limit = rate_limit
        self.period = period
        self.retry_interval = retry_interval

        # Timestamps of the acquisitions still inside the window, oldest first
        self._task_logs = deque()

    def flush(self):
        '''Discards the timestamps that have aged out of the current window.'''
        # The monotonic clock never jumps backwards or forwards when the system time is
        # adjusted, so the window cannot be stretched or emptied by a clock change
        now = time.monotonic()
        while self._task_logs and now - self._task_logs[0] > self.period:
            self._task_logs.popleft()

    async def acquire(self):
        '''Waits until fewer than rate_limit acquisitions are in the window, then records one.'''
        while True:
            self.flush()
            if len(self._task_logs) < self.rate_limit:
                break
            await asyncio.sleep(self.retry_interval)

        self._task_logs.append(time.monotonic())

    async def __aenter__(self):
        await self.acquire()

    async def __aexit__(self, exc_type, exc, tb):
        # Nothing to release: slots expire on their own as time passes
        pass

In [3]:
# Testing the above 
# The code should run significantly faster if the throttler's rate_limit argument is increased by a factor of 10, e.g., to 100
async def test_throttle(throttler, i):
    async with throttler:
        return i
async def run_test(loop):
    '''Pushes 100 trivial tasks through a throttler limited to 10 per period and returns their results.'''
    throttler = Throttler(rate_limit=10)
    tasks = []
    for i in range(100):
        # Schedule each coroutine on the supplied loop so they all compete for throttler slots
        tasks.append(loop.create_task(test_throttle(throttler, i)))
    return await asyncio.gather(*tasks)
# Drive the throttling test to completion on the current event loop;
# run_until_complete passes along the list returned by run_test
loop = asyncio.get_event_loop()
results = loop.run_until_complete(run_test(loop))

### SCOPUS data ####

Retrieving data from the Elsevier developer API

The **headers** contain the API key (from https://dev.elsevier.com/)

In [3]:
# Headers sent with every Scopus request: ask for XML and authenticate with the API key.
# The key is read from the SCOPUS_API_KEY environment variable so that it never has to be
# saved in the notebook; it falls back to an empty string when the variable is not set.
SCOP_HEADERS = {'Accept': 'application/xml',
                 'X-ELS-APIKey' : os.environ.get('SCOPUS_API_KEY', '')}

#### Step 1: Affiliation search

**fields to retrieve:**
- affiliation-name
- dc:identifier (=afid in Scopus search)
- document-count

In [4]:
# Query parameters for the affiliation search (handed to requests.get below):
# the search expression and the fields to return
params = dict(
    query='affil({george washington university})',
    field='affiliation-name,dc:identifier,document-count',
)

In [5]:
# Affiliation Search endpoint. HTTPS keeps the API key carried in the request headers
# from being sent over the network in clear text.
base_url = 'https://api.elsevier.com/content/search/affiliation'

In [6]:
# We join the keys and values from the dict with the equals sign and separate each entry with the ampersand
# This code is NOT asynchronous -- we're just making one request here
# The timeout keeps an unresponsive server from hanging the notebook indefinitely
r = requests.get(base_url,
                 headers=SCOP_HEADERS,
                 params='&'.join([k+'='+v for k,v in params.items()]),
                 timeout=60)
# Stop here, with the HTTP status in the error, rather than later while parsing an error page
r.raise_for_status()

In [None]:
# Response = 200: good
# TO DO --> error handling on requests

In [7]:
def parse_xml(xmldoc):
    '''Parses an XML document (str) with lxml.

    Returns a tuple: the root element and that element's namespace map.
    '''
    # Drop the first default-namespace declaration so elements can be addressed without a prefix
    stripped = re.sub(r'\sxmlns="[^"]+"', '', xmldoc, count=1)
    # Hand lxml explicitly encoded bytes and a parser told to expect UTF-8, to avoid encoding errors
    root = etree.fromstring(stripped.encode('utf8'),
                            parser=etree.XMLParser(encoding='utf-8'))
    return root, root.nsmap

In [9]:
# Parse the affiliation search result; only the root element is needed here.
# Taking element [0] instead of unpacking into `_` leaves IPython's last-output variable untouched.
tree = parse_xml(r.text)[0]

In [10]:
# Output CSV for customization of affiliations
# We will use the affiliation codes to find GW authors
# One row per (name, identifier) pair. The rows are built from a list of pairs rather than
# a dict keyed on the name, so affiliations that share a name but carry different
# identifiers are all kept.
aff_tbl = pd.DataFrame(
    list(
        zip(
            [node.text for node in tree.xpath('//affiliation-name')],
            [node.text for node in tree.xpath('//dc:identifier', namespaces=tree.nsmap)]
        )
    ),
    # Two columns even when the search returned nothing, labelled the way
    # from_dict(..., orient='index').reset_index() would label them
    columns=['index', 0],
)

In [14]:
# Preview the table. The saved output (execution count 14) was produced after the
# column-rename and ID clean-up cells that follow had already run.
aff_tbl

Unnamed: 0,affiliation_name,affiliation_id
0,George Washington University,60003088
1,George Washington University Medical Center,100931046
2,George Washington University School of Medicin...,113202420
3,George Washington University Hospital,100851611
4,Elliott School of International Affairs,60077343
5,District of Columbia General Hospital,60001971
6,George Washington University School of Public ...,60032302
7,D.C. General Hospital,60009393
8,National Capital Poison Center,60031795
9,George Washington University Biostatistics Center,101758582


In [12]:
# Label the two columns: the affiliation's name and its identifier
aff_tbl.columns=['affiliation_name', 'affiliation_id']

In [13]:
# Now get rid of the extraneous text on the ID column
# Keep what follows the last colon. A value without a colon is left unchanged,
# so re-running this cell is harmless.
aff_tbl['affiliation_id'] = aff_tbl['affiliation_id'].str.split(':').str[-1]

In [None]:
# Save the file to disk
# index=False leaves the DataFrame's row index out of the CSV
aff_tbl.to_csv('../data/scholcomm/faculty_pubs/affiliation_tbl_092018.csv', index=False)

#### Step 2: Document retrieval

**Fields to include:**

https://dev.elsevier.com/guides/ScopusSearchViews.htm

In [15]:
# Read a table of affiliations we want to use
# This table may be modified in a spreadsheet or text editor to exclude unwanted affiliation codes
# The IDs are read as strings because they are later spliced into query strings and
# compared against attribute values taken from the XML
aff_tbl = pd.read_csv('../data/scholcomm/faculty_pubs/affiliation_tbl_092018.csv',
                     dtype={'affiliation_id': 'str'})

In [16]:
# Scopus Search endpoint. HTTPS keeps the API key carried in the request headers
# from being sent over the network in clear text.
BASE_URL = 'https://api.elsevier.com/content/search/scopus'

In [17]:
# Where we want to save the data
# (a relative path, so it is resolved against the notebook's working directory)
DATA_DIR = '../data/scholcomm/faculty_pubs/'

In [38]:
def dump_xml(doc_list, path):
    '''Writes each XML document in a list to its own file in a folder.

    doc_list: list of XML documents (str).
    path: folder to write to (pathlib object or string). It is created, along with
          any missing parent folders, if it does not exist yet.

    Files are named results_<i>.xml, where <i> is the document's position in doc_list.
    '''
    folder = Path(path)
    folder.mkdir(parents=True, exist_ok=True)
    # Store the XML output to disk
    for i, d in enumerate(doc_list):
        # Always write UTF-8 (the encoding parse_xml uses) rather than the platform default,
        # which could silently drop characters it cannot represent
        with open(folder / 'results_{}.xml'.format(i), 'w', encoding='utf-8', errors='ignore') as f:
            f.write(d)

In [18]:
def load_xml(path):
    '''Loads every *.xml file in a folder.

    path: folder to read from (pathlib object or string).

    Returns the file contents as a list of strings, ordered by file path so the result
    is the same on every run. The ordering is alphabetical, so results_10.xml sorts
    before results_2.xml.
    '''
    doc_list = []
    for p in sorted(Path(path).glob('*.xml')):
        # Decode as UTF-8 explicitly instead of relying on the platform default encoding
        with open(p, 'r', encoding='utf-8') as f:
            doc_list.append(f.read())
    return doc_list

In [None]:
# To DO:
# 1. Use cursor to retrieve more than 5000 results

In [19]:
async def do_request(url, params, throttler, start=None):
    '''Makes one asynchronous GET request with the aiohttp library.

    url: address to request.
    params: dict of query parameters (string keys and values). It is not modified.
    throttler: instance of the Throttler class (defined above) with a rate limit set.
               This will keep the requests bounded by the rate limit.
    start: optional offset, sent as the "start" query parameter in place of any
           "start" value already in params. Left out of the override when None.

    Returns the response body as text. The HTTP status is not checked.
    '''
    # Work on a private copy. Every request in a batch is handed the same dict, so writing
    # the offset into it would leak one request's "start" into the others and leave a stale
    # offset behind in the caller's dict after the batch finishes.
    query = dict(params)
    if start is not None:
        query['start'] = str(start)
    async with throttler:
        async with aiohttp.ClientSession() as client:
            async with client.get(url,
                                  headers=SCOP_HEADERS,
                                  params='&'.join([k+'='+v for k,v in query.items()])) as resp:
                response = await resp.text()
    return response

In [20]:
async def batched_search(num_results, params, loop, start=25, rate_limit=6):
    '''Helper for the Scopus Search query: fetches the remaining pages concurrently.

    One request is made for every offset from start up to (but not including) num_results,
    in steps of 25. The default start of 25 assumes the first 25 results have already been
    retrieved; pass a different offset to resume an interrupted search.

    Returns the list of response bodies, in offset order.
    '''
    # Limit the requests to rate_limit per second, as per the Scopus API specs
    throttler = Throttler(rate_limit=rate_limit)
    # Schedule one request task per page offset
    awaitables = []
    for offset in range(start, int(num_results), 25):
        awaitables.append(loop.create_task(do_request(BASE_URL, params, throttler, offset)))
    # Run the requests concurrently and wait for all of them
    return await asyncio.gather(*awaitables)

In [21]:
def run_full_works_search(params):
    '''Given a set of affiliations and a date range, returns all the results (works) for that query
       on the Scopus Search API.

    params: dict of query parameters (string keys and values) for the search. It is not modified.

    Returns a list of XML documents (str): the first page, followed by the remaining pages.

    Raises RuntimeError if the initial request does not return HTTP 200, or if its response
    has no opensearch:totalResults element.
    '''
    # Using the complete view, we can get only 25 results at a time.
    # So we iterate in batches
    # Perform an initial query to get the number of results
    r = requests.get(BASE_URL,
                     headers=SCOP_HEADERS,
                     params='&'.join([k+'='+v for k,v in params.items()]),
                     timeout=60)
    if r.status_code != 200:
        # Fail loudly: without a first page there is no result count to iterate over
        # TO DO: implement logging
        raise RuntimeError('Scopus search failed with HTTP status {}: {}'.format(r.status_code, r.text))
    tree, nsmap = parse_xml(r.text)
    # Get the total number of results
    total_elem = tree.find('opensearch:totalResults', namespaces=nsmap)
    if total_elem is None or total_elem.text is None:
        raise RuntimeError('Scopus search response did not report a total number of results')
    num_results = total_elem.text
    # Store the results in a running list, starting with the page we already have
    doc_list = [r.text]
    # Initialize the event loop for the asynchronous queries
    loop = asyncio.get_event_loop()
    print("Retrieving {} results".format(num_results))
    # loop.run_until_complete will pass along the return value of the called function
    # The batch gets its own copy of params so the caller's dict is never altered by the paging
    doc_list.extend(loop.run_until_complete(batched_search(num_results, dict(params), loop)))
    return doc_list

In [22]:
def parse_works(tree, nsmap, aff_xpath_strs):
    '''Parses a page of search results, extracting metadata and affiliated authors.

    tree: root element of one page of search results.
    nsmap: namespace map for that page.
    aff_xpath_strs: dict whose 'afid' entry is an XPath predicate matching the
                    affiliation IDs of interest.

    Returns a tuple (work_authors, author_works, works):
      work_authors: work ID -> list of profile URLs of its affiliated authors
      author_works: author ID -> list of IDs of the works they are attached to
      works: list of dicts of article-level metadata, one per work

    Entries without a dc:identifier are skipped. A work with no cited-by count gets
    None for 'citedby_count'.
    '''
    # Dictionaries to look up a) a list of works by Author ID and b) a list of authors by Work ID
    work_authors = defaultdict(list)
    author_works = defaultdict(list)
    # The list of article-level metadata for each work
    works = []
    # Iterate over the entries in the XML list of results
    for entry in tree.xpath('//entry', namespaces=nsmap):
        # Unique Scopus work ID
        id_elem = entry.find('dc:identifier', namespaces=nsmap)
        if id_elem is None:
            # Not a work we can key on -- presumably a placeholder such as the entry returned
            # for an empty result set (TODO confirm against the API)
            continue
        work_id = id_elem.text
        # Authors whose affiliations match those in our list
        # B/c the Scopus search only returns at most 100 authors per work, affiliated authors may be missed if
        #    a) the list of authors has > 100 elements and
        #    b) the affiliated author's place in the sequence is > 100
        aff_authors = entry.xpath('author[{}]'.format(aff_xpath_strs['afid']),
                                  namespaces=nsmap)
        # For each affiliated author, link the work to the author's profile URL and the author's ID to the work
        # This will be useful for connecting up the more specific affiliation info we can get from the author profiles
        for auth_elem in aff_authors:
            url_elem = auth_elem.find('author-url', namespaces=nsmap)
            if url_elem is None or not url_elem.text:
                # No profile URL means no author ID to record and no profile to fetch
                continue
            auth_url = url_elem.text
            # The author's ID is at the end of the URL to their profile
            auth_id = auth_url.split('/')[-1]
            work_authors[work_id].append(auth_url)
            author_works[auth_id].append(work_id)
        # Extract the rest of the citation metadata, exclusive of the authors
        # QName.localname drops the namespace prefix
        work = {etree.QName(element).localname: element.text for element
                in entry.xpath('dc:*|prism:*', namespaces=nsmap)}
        # Add the complete list of authors
        work['authors'] = [element.text for element
                           in entry.xpath('author/authname', namespaces=nsmap)]
        # Add the cited-by count
        cited_elem = entry.find('citedby-count', namespaces=nsmap)
        work['citedby_count'] = cited_elem.text if cited_elem is not None else None
        # Store this work in the list
        works.append(work)
    return (work_authors, author_works, works)

In [23]:
# Restricts the search to works tagged with any of the affiliation IDs in our table
aff_str = '+or+'.join('af-id({})'.format(code) for code in aff_tbl.affiliation_id)
# The query uses "greater than", so set this to one less than the first year wanted,
# e.g., enter 2017 to find everything published in 2018 or more recently
start_year = 2018
# Complete view, starting from the first result
params = dict(
    query='({})+PUBYEAR+>+{}'.format(aff_str, str(start_year)),
    view='complete',
    start='0',
)

In [24]:
# Run the search; doc_list holds one XML document (str) per page of results
doc_list = run_full_works_search(params)

Retrieving 465 results


In [26]:
# String for efficient XPATH queries on the affiliation ids we're looking for
# Allows us to ignore co-authors not associated with our affiliations
aff_xpath_strs = {'afid': ' or '.join(['afid="{}"'.format(a) for a in aff_tbl.affiliation_id]),
                               'parent_id': ' or '.join(['@parent="{}"'.format(a) for a in aff_tbl.affiliation_id])}
works = []
author_works = {}
work_authors = {}
# Process each page of results
for d in doc_list:
    tree, nsmap = parse_xml(d)
    work_authors_batch, author_works_batch, works_batch = parse_works(tree, nsmap, aff_xpath_strs)
    works.extend(works_batch)
    # Merge the per-page lists instead of dict.update(): an author usually has works on
    # several pages, and update() would replace the works found on earlier pages with
    # those from the latest page
    for work_id, urls in work_authors_batch.items():
        work_authors.setdefault(work_id, []).extend(urls)
    for auth_id, work_ids in author_works_batch.items():
        author_works.setdefault(auth_id, []).extend(work_ids)

In [30]:
# Test: the number of parsed works should equal the number reported by our initial search
# The reported total is read back from the first page of results rather than hard-coded,
# so the check stays valid whenever the search is re-run
first_page, first_nsmap = parse_xml(doc_list[0])
reported_total = int(first_page.find('opensearch:totalResults', namespaces=first_nsmap).text)
assert len(works) == reported_total, \
    'parsed {} works but the search reported {}'.format(len(works), reported_total)

In [35]:
# Test: are all the documents captured in our author list? If not, why not?
id_list = []
for i, d in enumerate(doc_list):
    tree, nsmap = parse_xml(d)
    # Tag every work identifier with the position of the results page it came from
    for work_ident in tree.xpath('//entry/dc:identifier/text()', namespaces=nsmap):
        id_list.append({'doc_num': i, 'id': work_ident})
# Works for which no affiliated author was recorded
[rec for rec in id_list if rec['id'] not in work_authors] # Should return an empty list

[{'doc_num': 2, 'id': 'SCOPUS_ID:85061593910'},
 {'doc_num': 2, 'id': 'SCOPUS_ID:85060016891'},
 {'doc_num': 4, 'id': 'SCOPUS_ID:85060236337'},
 {'doc_num': 7, 'id': 'SCOPUS_ID:85052723351'},
 {'doc_num': 11, 'id': 'SCOPUS_ID:85059443714'},
 {'doc_num': 12, 'id': 'SCOPUS_ID:85057551299'},
 {'doc_num': 14, 'id': 'SCOPUS_ID:85061055309'},
 {'doc_num': 14, 'id': 'SCOPUS_ID:85059232340'},
 {'doc_num': 17, 'id': 'SCOPUS_ID:85058847142'}]

In [36]:
# Build the output folder with pathlib, which takes care of platform-specific separators
# and makes the filesystem easier to navigate
path_to_tests = Path(DATA_DIR, 'tests/scopus_search')

In [39]:
# Keep a copy of the raw search responses on disk, one file per page
dump_xml(doc_list, path_to_tests)

**Step 3:** Retrieve author profiles for each author

In [40]:
# Get the URLs for the author profiles in this set of works
# Using a set since the same author may be attached to more than one work
# Sorted so the order is the same on every run: positions in this list determine the
# results_<i>.xml file names and the indexes of the parsed profiles
author_urls = sorted({url for urls in work_authors.values() for url in urls})

In [42]:
# For the Authors API, the only necessary parameter is the type of view
# The URL contains the author ID, so we need to make one request per author
throttler = Throttler(rate_limit=2)
params={'view': 'STANDARD'}
# Testing a small batch
loop = asyncio.get_event_loop()
awaitables = [loop.create_task(do_request(url, params, throttler)) for url in author_urls]
results = await asyncio.gather(*awaitables)

In [43]:
# Keep a copy of the raw author-profile responses on disk, one file per author
path_to_tests = Path(DATA_DIR, 'tests/author_search')
dump_xml(results, path_to_tests)

In [44]:
# Replace each raw response with its parsed (root element, namespace map) pair.
# Items that are already pairs are left alone, so re-running this cell does not fail.
results = [r if isinstance(r, tuple) else parse_xml(r) for r in results]

In [49]:
# Turn each parsed (root element, namespace map) pair into an author profile dict.
# Note: parse_author_profile is defined in a cell further down, which has to be run first.
profiles = [parse_author_profile(root, ns, aff_tbl, aff_xpath_strs) for root, ns in results]

In [56]:
# Spot-check a single parsed profile
profiles[275]

{'auth_id': '6603418598',
 'index_name': 'Samango-Sprouse C.',
 'surname': 'Samango-Sprouse',
 'given-name': 'Carole A.',
 'departments': [{'Department': 'Department of Pediatrics',
   'Type': 'Current',
   'Parent': 'George Washington University'}]}

In [47]:
def parse_author_profile(tree, nsmap, aff_tbl, aff_xpath_strs):
    '''Parses an author profile, extracting name and relevant affiliations.

    tree: root element of one author profile.
    nsmap: namespace map for that profile.
    aff_tbl: DataFrame whose affiliation_id column lists the target affiliation IDs.
    aff_xpath_strs: not used at present; kept so existing calls continue to work.

    Returns a dict with the keys 'auth_id', 'index_name', 'surname', 'given-name' and
    'departments'. A name part missing from the profile is None. 'departments' is a list
    with one dict per current affiliation whose own ID or parent ID is in the target list.
    '''
    def text_or_none(xpath):
        # Text of the first matching element, or None when the profile has no such element
        node = tree.find(xpath)
        return node.text if node is not None else None

    author = {'auth_id': tree.xpath('//dc:identifier',
                                      namespaces=nsmap)[0].text.split(':')[1]}
    # Get the indexed name as well as surname and given name
    author['index_name'] = text_or_none('author-profile/preferred-name/indexed-name')
    author['surname'] = text_or_none('author-profile/preferred-name/surname')
    author['given-name'] = text_or_none('author-profile/preferred-name/given-name')
    # List to store the author's affiliations
    author['departments'] = []
    # Build the lookup of target IDs once, instead of scanning the column for every affiliation
    target_ids = set(aff_tbl.affiliation_id.values)
    # Get the value of the author's current affiliation
    # Need to loop through the current affiliations, which may contain more than one
    for aff in tree.xpath('//affiliation-current/affiliation'):
        aff_current_id = aff.get('affiliation-id')
        aff_current_parent = aff.get('parent')
        # Is this author affiliation in our target list?
        if aff_current_id in target_ids or aff_current_parent in target_ids:
            # Capture the preferred name listed for the current department
            name_elem = aff.find('ip-doc/preferred-name')
            current_dept = {'Department': name_elem.text if name_elem is not None else None,
                            'Type': 'Current'}
            # If there's a parent entity listed, capture that, too
            parent_dept = aff.find('ip-doc/parent-preferred-name')
            if parent_dept is not None:
                current_dept['Parent'] = parent_dept.text
            author['departments'].append(current_dept)
        # If it's an author who is possibly no longer affiliated
        #author['other_depts'] = [e.text for e in tree.xpath('//affiliation[{}]/ip-doc/preferred-name'.format(aff_xpath_strs['parent_id']))]
    return author