In [21]:
import json
import warnings
import numpy as np
import requests
import time
import concurrent.futures
import urllib.request
from ratelimit import limits, sleep_and_retry
from tqdm import tqdm
import os
import logging as log

### Data retrieval from Semantic Scholar

In [22]:
# Read the Semantic Scholar API key from the local credentials file.
# The file must hold a JSON object with a 'semantic_scholar' entry.
with open('api_keys.json') as f:
    api_keys = json.load(f)
    key = api_keys['semantic_scholar']

In [23]:
# Running count of API requests; incremented by the fetch helpers.
req_counter = 0
# Throttle budget: at most CALLS invocations per RATE_LIMIT-second period.
CALLS, RATE_LIMIT = 950, 1


@sleep_and_retry
@limits(calls=CALLS, period=RATE_LIMIT)
def check_limit():
    """No-op whose decorators do the work: call it before each API request
    so the request is counted against the CALLS / RATE_LIMIT budget."""

In [24]:
def get_paper_information(paperId, fields='authors,venue,year,isOpenAccess,references,references.year,citations,citations.year'):
    """Fetch one paper record from the Semantic Scholar Graph API.

    Args:
        paperId: identifier interpolated into the ``/paper/{paperId}`` URL path.
        fields: comma-separated field list sent as the ``fields`` query parameter.

    Returns:
        Tuple ``(paperId, payload)`` where ``payload`` is the decoded JSON body.

    Raises:
        Exception: when the decoded body contains a 'message' key; the message
            is logged before raising.
    """
    global req_counter
    req_counter += 1
    # Go through the rate limiter, then pause briefly before the network call.
    check_limit()
    time.sleep(0.01)
    url = f'https://api.semanticscholar.org/graph/v1/paper/{paperId}?fields={fields}&limit=1000'
    response = requests.get(url, headers={'x-api-key': key}, timeout=30)
    payload = response.json()
    if 'message' in payload.keys():
        log.error(payload['message'])
        raise Exception('Probs too many requests :(')
    return paperId, payload
def get_author_citations(author_id='47237027', fields='year,citations.year'):
    """Fetch an author's papers from the Semantic Scholar Graph API.

    Args:
        author_id: identifier interpolated into the ``/author/{author_id}/papers``
            URL path.
        fields: comma-separated field list sent as the ``fields`` query parameter.

    Returns:
        Tuple ``(author_id, payload)`` where ``payload`` is the decoded JSON body.

    Raises:
        Exception: when the decoded body contains a 'message' key; the message
            is logged before raising.
    """
    global req_counter
    req_counter += 1
    check_limit()
    url = f'https://api.semanticscholar.org/graph/v1/author/{author_id}/papers?fields={fields}&limit=1000'
    # Same 30 s timeout as get_paper_information: without one, a stalled
    # connection would block the calling worker thread indefinitely.
    response = requests.get(url, headers={'x-api-key': key}, timeout=30)
    payload = response.json()
    if 'message' in payload:
        log.error(payload['message'])
        raise Exception('Probs too many requests :(')
    return author_id, payload
#get_author_citations(author_id='47237027', fields='year,citations.year')

In [25]:
def original_paper_side(paper):
    """Attach the API record of ``paper`` to it under the 'paperInfo' key.

    Looks up ``paper["paperId"]`` via ``get_paper_information`` and updates
    ``paper`` in place.

    Returns:
        ``(paper, True)`` when the record was attached, ``(paper, False)`` when
        anything raised along the way (the exception is logged, not re-raised).
    """
    try:
        _, details = get_paper_information(paper["paperId"])
        paper.update(paperInfo=details)
    except Exception as err:
        log.error(err)
        return paper, False
    return paper, True


In [26]:
def requests_w_workers(paper, function=get_paper_information, side='references', field='citations.year', max_workers=500):
    """Fetch extra data for every entry of ``paper['paperInfo'][side]`` in parallel.

    One call ``function(entry_id, fields=field)`` is submitted per entry, where
    ``entry_id`` is the entry's 'authorId' (when ``side == 'authors'``) or
    'paperId' (otherwise). ``function`` must return ``(entry_id, response)``.
    For each successful response, every entry whose id equals the returned id
    is updated in place:

    * ``side == 'authors'``: ``entry['authorsPapers'] = response['data']``
    * otherwise:             ``entry['citations'] = response['citations']``

    Responses containing an 'error' key are skipped. Exceptions raised by a
    request (or while merging its result) are logged and do not abort the
    remaining requests.

    Args:
        paper: dict with a 'paperInfo' entry holding the ``side`` list.
        function: fetcher called once per entry.
        side: key of the list inside ``paper['paperInfo']`` to enrich.
        field: value forwarded to ``function`` as its ``fields`` argument.
        max_workers: size of the thread pool.

    Returns:
        The same ``paper`` object, mutated in place.
    """
    entries = paper['paperInfo'][side]
    if side == 'authors':
        id_key, insertion_key, insertion_value = 'authorId', 'authorsPapers', 'data'
    else:
        id_key, insertion_key, insertion_value = 'paperId', 'citations', 'citations'

    # Index entries by id once so each result is merged without rescanning
    # the whole list; duplicates of an id all receive the result.
    entries_by_id = {}
    for entry in entries:
        entries_by_id.setdefault(entry[id_key], []).append(entry)

    log.info(f'Starting requests for {side}')
    completed = 0
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(function, entry[id_key], fields=field) for entry in entries]
        for future in concurrent.futures.as_completed(futures):
            completed += 1
            try:
                idx, response = future.result()
                if 'error' in response:
                    continue
                for entry in entries_by_id.get(idx, ()):
                    entry.update({insertion_key: response[insertion_value]})
            except Exception as exc:
                # Only reference names that are guaranteed to be bound here:
                # the response does not exist when future.result() itself raised.
                log.error('%s generated an exception: %s', paper.get('id'), exc)
    # Sanity check on the number of futures that completed.
    if completed != len(entries):
        log.error('Somehow lost a request on the way')
        log.error(completed - len(entries))
    return paper

In [27]:
def get_all_data_for_paper(req_paper):
    """Retrieve all relevant data for one paper and add it to its dict.

    First attaches the paper's own record ('paperInfo'). If that succeeds, the
    'references', 'citations' and 'authors' lists inside it are each enriched
    through ``requests_w_workers``. A side that is absent from 'paperInfo' is
    skipped and reported in a warning instead of being requested.

    Args:
        req_paper: dict describing the paper; must provide 'id' and 'paperId'.

    Returns:
        The (mutated) ``req_paper`` dict.
    """
    t1 = time.time()
    req_paper, has_paper_info = original_paper_side(req_paper)
    missing_info = []
    if has_paper_info:
        # (side, fetcher, fields) for every list that gets enriched.
        plan = (
            ('references', get_paper_information, 'citations.year'),
            ('citations', get_paper_information, 'citations.year'),
            ('authors', get_author_citations, 'year,citations.year'),
        )
        for side, fetcher, field in plan:
            # Each side is guarded by its own key; the authors side must not
            # depend on the presence of 'citations'.
            if side in req_paper['paperInfo']:
                req_paper = requests_w_workers(req_paper, function=fetcher, side=side, field=field)
            else:
                missing_info.append((req_paper['id'], side))
    else:
        missing_info.append((req_paper['id'], 'paperInfo'))
    if missing_info:
        log.warning(f'Missing information: {missing_info}')
    t2 = time.time()
    log.info(f"Took {t2 - t1} seconds for {req_paper['id']}")
    return req_paper

In [28]:
verbose = True
# Every record is printed as "LEVEL: message"; verbose mode additionally
# lowers the threshold to DEBUG and announces itself.
log_options = {"format": "%(levelname)s: %(message)s"}
if verbose:
    log_options["level"] = log.DEBUG
log.basicConfig(**log_options)
if verbose:
    log.info("Verbose output.")

In [29]:
file_name = 'data/data_sorted_w_altmetrics.json'
# Trial run: load the paper list and enrich a single record (index 8),
# then display the fetched paper info as the cell output.
with open(file_name, "r") as ds:
    data = json.loads(ds.read())
res = get_all_data_for_paper(data[8])
res['paperInfo']

{'paperId': '22fe619996b59c09cb73be40103a123d2e328111',
 'venue': 'The 2011 International Joint Conference on Neural Networks',
 'year': 2011,
 'isOpenAccess': False,
 'authors': [{'authorId': '69539592',
   'name': 'J. Stallkamp',
   'authorsPapers': [{'paperId': '5746d310948a247255846b044a2bbdda903425ed',
     'year': 2013,
     'citations': [{'paperId': '0e8f260fb38b1bf08a40d9e5cbdcb80c563900d0',
       'year': 2022},
      {'paperId': 'e9d9771a6fb46565e60135403ad30b40deee4f9c', 'year': 2021},
      {'paperId': '0fc79b52b6195fc2a41832c494ed36c37e0a9482', 'year': 2021},
      {'paperId': 'a33dd82dfa785c86aaae3dd0dc67b6e7f0b8aa2f', 'year': 2021},
      {'paperId': '0d7450b74a27ac4b0f93851db29eeb70ba6d40e5', 'year': 2021},
      {'paperId': 'bad7a903d68cb0111d33d220533c542ae659ae77', 'year': 2021},
      {'paperId': '405cce0af28a9a9f4e267271896f114765f2228d', 'year': 2020},
      {'paperId': 'f290009faa0c42a58e0d65ba662d55b301ea1827', 'year': 2020},
      {'paperId': 'e9750f9858a32ecbd

In [30]:
# Enrich every paper of the input file and write one JSON file per paper.
with open(file_name, "r") as ds:
    data = json.load(ds)
# Make sure the output directory exists; otherwise every write below fails.
os.makedirs('data/requests', exist_ok=True)
for o_paper_idx, paper in enumerate(tqdm(data)):
    paper_id = paper["id"]
    log.info(f'starting for paper {paper_id}')
    try:
        res = get_all_data_for_paper(paper)
        # Keep only alphanumeric characters of the id for the file name.
        name = ''.join(ch for ch in paper_id if ch.isalnum())
        with open(f'data/requests/req{name}.json', "w") as f:
            json.dump(res, f, indent=4, sort_keys=True)
    except Exception as exc:
        # Best effort: log the failure and continue with the next paper.
        # Placeholders are required so logging can format the arguments.
        log.error('Failed with req %s: %s', paper_id, exc)

100%|██████████| 151/151 [47:27<00:00, 18.85s/it] 
