In [10]:
import json
import requests
import time
import concurrent.futures
from ratelimit import limits, sleep_and_retry
from tqdm import tqdm
import logging as log
import os

### Data retrieval from Semantic Scholar

In [11]:
#Load your api key
with open('api_keys.json') as f:
    api_keys = json.load(f)
key = api_keys['semantic_scholar']

In [12]:
suffix = '' #'_04_01_2023'

In [13]:
path = os.path.join(os.getcwd(), 'data', 'requests', suffix)
os.mkdir(path)

In [14]:
global req_counter
req_counter = 0
# CLASS calls per second
CALLS = 500 # 950
RATE_LIMIT = 1

@sleep_and_retry
@limits(calls=CALLS, period=RATE_LIMIT)
def check_limit():
    """ Empty function just to check for calls to API """
    return

In [15]:
def request_paper_information(paperId, fields):
    """requests GENERAL INFO for a given paper in data"""
    global key
    global req_counter
    req_counter += 1
    check_limit()
    time.sleep(0.01)
    req_str = f'https://api.semanticscholar.org/graph/v1/paper/{paperId}?fields={fields}'
    print(req_str)
    req = requests.get(req_str, headers={'x-api-key':key}, timeout=600).json()
    if 'message' in req.keys():
        log.error(req['message'])
        raise Exception('Probs too many requests :(')
    return paperId, req

def request_citation_information(paperId, offset, fields, limit):
    """requests CITATION INFO for a given paper in data"""
    global key
    global req_counter
    req_counter += 1
    check_limit()
    time.sleep(0.01)
    req = requests.get(f'https://api.semanticscholar.org/graph/v1/paper/{paperId}/citations?fields={fields}&offset={offset}&limit={limit}', headers={'x-api-key':key}, timeout=600).json()
    if 'message' in req.keys():
        log.error(req['message'])
        raise Exception('Probs too many requests :(')
    if 'error' in req.keys():
        req = 'error'
        return paperId, req
    if 'data' in req.keys():
        for i in range(len(req['data'])):
            req['data'][i] = req['data'][i]['citingPaper']
        req = req['data']
    else:
        log.info(f'data not in keys: Response keys: {req["error"]}')
    return paperId, req

def request_author_citations(authorId, offset, fields):
    """returns a paper+year and its citations+year for a given authorId"""
    global key
    global req_counter
    req_counter += 1
    check_limit()
    req = requests.get(f'https://api.semanticscholar.org/graph/v1/author/{authorId}/papers?fields={fields}&offset={offset}&limit=1000',headers={'x-api-key':key}, timeout=600).json()
    if 'message' in req.keys():
        log.error(req['message'])
        raise Exception('Probs too many requests :(')
    if 'error' in req.keys():
        req = 'error'
        return authorId, req
    if 'data' in req.keys():
        log.info(req['data'])
        #for id in range(len(req['data'])):
        #    req['data'][id] = req['data'][id]['citingPaper']
        req = req['data']
    else:
        log.info(f'data not in keys: Response keys: {req["error"]}')
    return authorId, req
#request_author_citations(author_id='47237027', fields='year,citations.year')

In [16]:
def get_author_citations(authorId, fields):
    """handels the request and ensures that for papers with more than 1k citations all requests are made"""
    offset = 0
    limit = 1000
    authorId, req = request_author_citations(authorId, offset, fields)
    while len(req)%1000 == 0:
        offset += 1000
        if offset+limit >= 10000:
            break
        authorId, next_request = request_author_citations(authorId, offset, fields)
        req = req + next_request

    return authorId, req

def get_citation_information(paperId, fields):
    """handels the request and ensures that for papers with more than 1k citations all requests are made, returns list with the requests"""
    offset = 0
    limit = 1000
    paperId, req = request_citation_information(paperId, offset, fields, limit)
    while len(req)%1000 == 0:
        offset += 1000
        if offset+limit >= 10000:
            break
        paperId, next_request = request_citation_information(paperId, offset, fields, limit)
        req = req + next_request

    return paperId, req

In [17]:
def original_paper_side(paper):
    """gets general information, e.g. authors,venue... , on the paper that first presented a dataset"""
    try:
        idx, paperInfo = request_paper_information(paper["paperId"], fields='authors,venue,year,isOpenAccess,citationCount,references,references.year')
        if 'paperInfo' in paper.keys():
            paperInfo.append(paper['paperInfo']) #entire new function that iterates over all potential parts cits, refs, atuhs and adds TODO: ask on slack?
        paper.update({'paperInfo': paperInfo})
        return  paper, True
    except Exception as exec:
        log.error(exec)
        return paper, False


In [18]:
def requests_w_workers(paper, function=request_paper_information, side='references', field='year'):
    """coordinates multithreading for many parallel requests for a paper and a function (e.g. author side)"""
    org_number = len(paper['paperInfo'][side])
    c=0
    if side == 'authors':
        interestId = 'authorId'
        insertion_key = 'authorsPapers'
        insertion_value ='data'
    else:
        interestId = 'paperId'
        insertion_key = 'citations'
        insertion_value ='citations'
    log.info(f'Starting requests for {side}')
    with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
        # Start the load operations and mark each future with its URL
        future_to_url = {executor.submit(function, paper['paperInfo'][side][ref_idx][interestId], fields=field):ref_idx for ref_idx, reference in enumerate(paper['paperInfo'][side])}
        for future in concurrent.futures.as_completed(future_to_url):
            #res = future_to_url[future]
            c += 1
            try:
                 paperId, ls_response = future.result()
                 #log.warning(ls_response)
                 if ls_response == 'error':
                     continue
                 for ref_idx, reference in enumerate(paper['paperInfo'][side]):
                     if paper['paperInfo'][side][ref_idx][interestId] == paperId:
                         #checking for matching reference paperId to insert at correct position because ref_idx is the position in the list of my pata structure and not the sescho id.
                         paper['paperInfo'][side][ref_idx][insertion_key] = ls_response # paper['paperInfo']['citations'][0]['citations']
            except Exception as exc:
                log.error(f'{paper["id"]} generated an exception: {exc}', ls_response)
    if c != org_number:
        #raise Exception('Somehow lost a request on the way', c-org_number)
        log.error('Somehow lost a request on the way')
        log.error(c-org_number)
    return paper

In [19]:
def get_all_data_for_paper(req_paper):
    """"Retrieves all relevant data and adds it to the json"""
    t1 = time.time()
    #array to track if and what is missing. Usually empty
    missing_info = []
    # get general Information
    req_paper, has_paper_info = original_paper_side(req_paper)
    # get all citations for paper (also if there is more than 1k)
    paperId, req = get_citation_information(req_paper['paperId'], 'year')
    req_paper['paperInfo']['citations'] = req
    #check if we got all citations that are counted in the citationCount by SemanticScholar
    if req_paper['paperInfo']['citationCount'] != len(req_paper['paperInfo']['citations']):
        log.error('Missing a/some citation! Semantic scholar implied', req_paper['paperInfo']['citationCount'] , 'but only got', len(req_paper['paperInfo']['citations']))
        log.error(req_paper['paperInfo']['citationCount'] != len(req_paper['paperInfo']['citations']))
    if has_paper_info:
        if 'citations' in req_paper['paperInfo'].keys():
           req_paper = requests_w_workers(req_paper, function=get_citation_information, side='citations', field='year')
        else:
            missing_info.append((req_paper['id'], 'citations'))
        if 'references' in req_paper['paperInfo'].keys():
            req_paper = requests_w_workers(req_paper, function=get_citation_information, side='references', field='year')
        else:
            missing_info.append((req_paper['id'], 'references'))
        if 'citations' in req_paper['paperInfo'].keys():
           req_paper = requests_w_workers(req_paper, function=get_author_citations, side='authors', field='year,citations.year')
        else:
            missing_info.append((req_paper['id'], 'citations'))
    else:
        missing_info.append((req_paper['id'], 'paperInfo'))
    t2 = time.time()
    log.info(f"Took {t2 - t1} seconds for {req_paper['id']}")
    return req_paper, missing_info

In [20]:
# Set verbosity level
verbose = False
if verbose:
    log.basicConfig(format="%(levelname)s: %(message)s", level=log.DEBUG)
    log.info("Verbose output.")
else:
    log.basicConfig(format="%(levelname)s: %(message)s")
    log.getLogger("requests").setLevel(log.WARNING)
log.getLogger("requests").setLevel(log.WARNING)
log.getLogger("urllib3").setLevel(log.WARNING)

In [None]:
# Open json with datasets

file_name = f'data/data_sorted_only_w_ids{suffix}_altmetrics.json'
with open(file_name, "r") as ds:
    data = json.load(ds)
res, missing_info = get_all_data_for_paper(data[9])
#res['paperInfo']

https://api.semanticscholar.org/graph/v1/paper/4f0b8f730273e9f11b2bfad2415485414b96299f?fields=authors,venue,year,isOpenAccess,citationCount,references,references.year


In [None]:
with open(file_name, "r") as ds:
    data = json.load(ds)
for o_paper_idx in tqdm(range(len(data))):
    log.info(f'starting for paper {data[o_paper_idx]["id"]}')
    try:
        res, missing_info = get_all_data_for_paper(data[o_paper_idx])
        name = ''.join(e for e in data[o_paper_idx]["id"] if e.isalnum())
        with open(f'data/requests{suffix}/req{name}.json', "w") as f:
            json.dump(res, f, indent=4, sort_keys=True)
    except Exception as exec:
        log.error('Failed with req', data[o_paper_idx]['id'], exec)