## OpenAlex API - Fetching Data

In [1]:
# basic
import sys
import os
import json
import requests
from tqdm import tqdm

# debug
import pdb
from loguru import logger

In [2]:
# 0. OpenAlex API Docs: https://docs.openalex.org/api
# 0. OpenAlex API Tutorials: https://github.com/ourresearch/openalex-api-tutorials
# 1. additional request packages: https://stackoverflow.com/a/18579484
# 2. custom tqdm: https://stackoverflow.com/questions/45808140/using-tqdm-progress-bar-in-a-while-loop 
# 3. loguru tutorial: https://medium.com/analytics-vidhya/a-quick-guide-to-using-loguru-4042dc5437a5
# 3.1 multiple loggers: https://github.com/Delgan/loguru/issues/333

In [3]:
# Example work: https://api.openalex.org/works?filter=publication_year:2021&per-page=2

## Preprocessing Utils
def get_id(url):
    return url.split('https://openalex.org/')[1]

def get_url(id_):
    return f'https://openalex.org/{id_}'

def get_cited_by_url(work_id_):
    return f'https://api.openalex.org/works?filter=cites:{work_id_}'


def int_author_position(pos):
    if pos.lower() == 'first':
        return 1
    elif pos.lower() == 'middle':
        return 2
    elif pos.lower() == 'last':
        return 3
    
def parse_authorships(authorships):
    return [
        [
            get_id(auth['author']['id']), 
            [get_id(inst['id']) if inst['id'] else None for inst in auth['institutions']]
        ]
        for auth in sorted(authorships, key=lambda x: int_author_position(x['author_position']))
    ]

def parse_concepts(concepts):
    top_k = 5
    return [
        [
            get_id(con['id']), 
            con['score']
        ]
        for con in sorted(concepts, key=lambda x: x['score'], reverse=True)
    ][:top_k]
    
def parse_abstract(abstract_inverted_index):
    max_len = max([j for _, i in abstract_inverted_index.items() for j in i]) + 1
    abstract = ['' for _ in range(max_len)]
    for key, value in abstract_inverted_index.items():
        for idx in value:
            abstract[idx] = key
    return ' '.join(abstract)

def parse_counts_by_year(counts_by_year):
    return [
        [
            count['year'],
            count['cited_by_count']
        ]
        for count in counts_by_year
    ]

def preprocess(works):
    processed_works = []
    for work in works:
        if work['id'] is None:
            continue
        if work['host_venue']['id'] is None:
            continue
        if None in [
            auth['author']['id'] 
            for auth in work['authorships']
        ]:
            continue
        if None in [
            inst['id'] 
            for auth in work['authorships'] 
                for inst in auth['institutions']
        ]:
            continue
        if None in [
            con['id'] 
            for con in work['concepts']
        ]:
            continue

        processed_works.append({
            'id': get_id(work['id']),
            'doi': work['doi'],
            'title': work['title'],
            'type': work['type'],
            'publication_date': work['publication_date'],
            'host_venue': get_id(work['host_venue']['id']),
            'open_access_is_oa': work['open_access']['is_oa'],
            'open_access_oa_status': work['open_access']['oa_status'],
            'authorships': parse_authorships(work['authorships']),
            'cited_by_count': work['cited_by_count'],
            'concepts': parse_concepts(work['concepts']),
            'referenced_works': [get_id(ref) for ref in work['referenced_works']],
            'abstract': parse_abstract(work['abstract_inverted_index']),
            'counts_by_year': parse_counts_by_year(work['counts_by_year']),
        })
    return processed_works

In [4]:
## Init Params

LOG_PATH = './log.v3.txt'
DATA_PATH = './data.v3.txt'
MAX_RESULTS = 100000
PER_PAGE = 200

INIT_PAGE, INIT_CURSOR = 0, '*'
# INIT_PAGE, INIT_CURSOR = 298, 'Ils5MCwgJ2h0dHBzOi8vb3BlbmFsZXgub3JnL1cyNjI0MTg2MjY4J10i'

DATA_URL = 'https://api.openalex.org/works?'\
                'filter=from_publication_date:2016-01-01,'\
                'to_publication_date:2020-12-31,'\
                'has_abstract:true,'\
                'has_references:true,'\
                'is_paratext:false,'\
                'is_retracted:false,'\
                'concepts.id:C41008148'

email = 'armaan.1997@cs.iitr.ac.in'

expected_results = requests.get(DATA_URL).json()['meta']
print ('dry run:', expected_results)
results_remain = PER_PAGE * INIT_PAGE <= min(MAX_RESULTS, expected_results['count'])

dry run: {'count': 4185273, 'db_response_time_ms': 120, 'page': 1, 'per_page': 25}


In [5]:
## Scraping Code

logger.remove()
logger.add(LOG_PATH, backtrace=True, diagnose=True, catch=True, 
           filter=lambda record: record["level"].name == "INFO")
logger.add(DATA_PATH, rotation='500 MB', 
           filter=lambda record: record["level"].name == "SUCCESS",
           format="{message}")

headers = {
    "Accept": "application/json",
    "User-Agent": f"mailto:{email}"
}


with tqdm(total=expected_results['count']) as pbar:
    pbar.update(PER_PAGE*INIT_PAGE)
    page = INIT_PAGE
    cursor = INIT_CURSOR
    while results_remain:
        hit_url = DATA_URL + f'&per_page={PER_PAGE}&cursor={cursor}'
        response = requests.get(hit_url, headers=headers) # scrape data, API call
        
        try:
            assert (response.status_code == 200), 'oops'
            works = response.json()['results'] # parse data
            page += 1
            
            processed_works = preprocess(works)
            logger.success (processed_works) # data logging
            logger.info ('page: {}, works:{}, processed works: {}, hit url: {}'
                         .format(page, len(works), len(processed_works), hit_url)) # scrape logging
            pbar.update(len(works))
            
            cursor = response.json()['meta']['next_cursor'] # parse next pointer to data
            results_remain = PER_PAGE * page <= min(MAX_RESULTS, expected_results['count'])
        except Exception as err:
            print (f"Got {response.json()} from OpenAlex")
            print (f"Unexpected {err}, {type(err)}")
            break
            

print (f'Last log: \n  - page: {page}, \n  - cursor: {cursor}, \n  - hit_url: {hit_url}')
print (f'Logs saved to file: {LOG_PATH}')
print (f'Data saved to file: {DATA_PATH}')

  2%|▏         | 100200/4185273 [09:06<6:11:29, 183.27it/s]

Last log: 
  - page: 501, 
  - cursor: Ils2NiwgJ2h0dHBzOi8vb3BlbmFsZXgub3JnL1cyOTQ0Mjk3NTUyJ10i, 
  - hit_url: https://api.openalex.org/works?filter=from_publication_date:2016-01-01,to_publication_date:2020-12-31,has_abstract:true,has_references:true,is_paratext:false,is_retracted:false,concepts.id:C41008148&per_page=200&cursor=Ils2NiwgJ2h0dHBzOi8vb3BlbmFsZXgub3JnL1cyOTAxODQ5NTcxJ10i
Logs saved to file: ./log.v3.txt
Data saved to file: ./data.v3.txt





In [None]:
## to resume, check line of log file for {cursor, page, hit_url}