In [1]:
import json
import requests
from tqdm import tqdm
from os import getcwd

### Get Semantic Scholar IDs for further processing
Each paper is looked up by its DOI or, failing that, by its arXiv / Semantic Scholar link.

In [2]:
# Download the latest dataset list and store it where the later cells read it from.
url = "https://raw.githubusercontent.com/daniel-bogdoll/ad-datasets/main/my-app/src/data_sorted.json"
# Forward slashes work on every platform and match the '../data/...' paths used further down;
# the former backslash path contained the invalid escape sequence '\d'.
filename = '../data/data_sorted.json'
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON
response.raise_for_status()
r = response.json()
# A single context-managed handle; nothing is left open after the write
with open(filename, 'w', encoding='utf-8') as f:
    json.dump(r, f)

In [3]:
# Suffix inserted into the data file names that are read and written below.
# Leave it empty to work with the new data; set it to e.g. '_04_01_2023' to use the files saved under that suffix.
suffix = ''

In [4]:
# Load your API key from the local JSON file; the Semantic Scholar key is stored under 'semantic_scholar'
with open('../api_keys.json') as f:
    api_keys = json.load(f)
# Sent as the 'x-api-key' header with the Semantic Scholar requests below
key = api_keys['semantic_scholar']

In [5]:
def add_ids(data_w_ids, paper_idx, r):
    """Store the identifiers from an API response on one paper entry.

    The entry ``data_w_ids[paper_idx]`` is updated in place:

    * 'paperId' is always set from ``r['paperId']``.
    * 'arxivId' is set from ``r['externalIds']['ArXiv']`` unless the entry
      already has an 'arxivId', in which case the existing value is kept.
    * 'DOI' is set from ``r['externalIds']['DOI']`` when present.

    Args:
        data_w_ids: list of paper dicts.
        paper_idx: index of the paper to update.
        r: response dict of the paper lookup; must contain 'paperId',
            'externalIds' is optional.

    Returns:
        Always ``False``. Callers assign the result to their ``searching``
        flag, so a call to this function marks the paper as found.

    Raises:
        KeyError: if ``r`` has no 'paperId'.
    """
    entry = data_w_ids[paper_idx]
    entry.update({'paperId': r['paperId']})

    # 'externalIds' may be absent or null; treat both as "no external ids"
    # instead of hiding every possible error behind a bare except.
    external_ids = r.get('externalIds')
    if not isinstance(external_ids, dict):
        external_ids = {}

    no_arxiv = False
    if 'arxivId' in entry:
        print('existing arxiv_id')
    elif 'ArXiv' in external_ids:
        entry['arxivId'] = external_ids['ArXiv']
    else:
        no_arxiv = True

    no_doi = 'DOI' not in external_ids
    if not no_doi:
        entry['DOI'] = external_ids['DOI']

    if no_doi and no_arxiv:
        print('No DOI AND No Arxiv found')
    return False

In [6]:
def new_get_values(data, payload=None):
    """Look up every paper in ``data`` on the Semantic Scholar Graph API.

    Each paper is queried by its DOI first. If that gives no result, its
    'relatedPaper' link is tried, provided the link points to arXiv or
    Semantic Scholar. For every successful lookup ``add_ids`` stores the
    returned identifiers on a copy of the paper entry.

    Args:
        data: list of paper dicts. An entry may carry 'DOI' (the value '-'
            is skipped) and 'relatedPaper' (a URL string); a paper that
            cannot be found is reported through its 'id'.
        payload: query parameters sent with every request. Defaults to
            ``{'fields': 'citationCount'}``. ``add_ids`` reads 'externalIds'
            from the response, so request that field (e.g.
            ``{'fields': 'paperId,externalIds'}``) if DOI/arXiv ids are wanted.

    Returns:
        Tuple ``(data_w_ids, missing)``: the copied paper entries, enriched
        where a lookup succeeded, and the list of 'id' values of the papers
        that could not be found.
    """
    # Copy every entry so that the lookups do not modify the caller's dicts
    # (list.copy() alone would still share them).
    data_w_ids = [dict(paper) for paper in data]
    if payload is None:
        payload = {'fields': 'citationCount'}
    api_url = 'https://api.semanticscholar.org/graph/v1/paper'
    headers = {'x-api-key': key}
    missing = []
    # One session for all requests so that connections are reused
    with requests.Session() as session:
        for paper_idx, paper in enumerate(tqdm(data)):
            searching = True
            if 'DOI' in paper and paper['DOI'] != '-':
                r = session.get(f'{api_url}/DOI:{paper["DOI"]}', headers=headers,
                                timeout=30, params=payload).json()
                if 'error' not in r:
                    searching = add_ids(data_w_ids, paper_idx, r)
            if searching and 'relatedPaper' in paper:
                related = paper['relatedPaper']
                # Both substrings have to be tested explicitly:
                # `'arxiv' or ... in related` is always true.
                if 'arxiv' in related or 'semanticscholar' in related:
                    url = related.replace('.pdf', '')
                    r = session.get(f'{api_url}/URL:{url}', headers=headers,
                                    timeout=30, params=payload).json()
                    if 'error' not in r:
                        searching = add_ids(data_w_ids, paper_idx, r)
            if searching:
                missing.append(paper['id'])
    print('Number of Missing Papers:', len(missing), '/', len(data))
    return data_w_ids, missing

In [7]:
def save_only_papers_w_ids(suffix):
    """Resolve the ids of the stored papers and save only the papers that got one.

    Reads ``../data/data_sorted{suffix}.json``, passes its content to
    ``new_get_values`` requesting the 'paperId' and 'externalIds' fields, and
    writes the entries that have a 'paperId' afterwards to
    ``../data/data_sorted_only_w_ids{suffix}.json``.
    """
    source_path = f'../data/data_sorted{suffix}.json'
    target_path = f'../data/data_sorted_only_w_ids{suffix}.json'

    with open(source_path, "r", encoding='utf-8') as source:
        papers = json.load(source)

    fields = {'fields': 'paperId,externalIds'}
    # The list of missing papers is already reported by new_get_values itself
    papers_w_ids, _ = new_get_values(papers, fields)

    matched = [entry for entry in papers_w_ids if 'paperId' in entry.keys()]

    with open(target_path, 'w', encoding='utf-8') as target:
        json.dump(matched, target)

In [8]:
# Look up the ids for the data file selected by `suffix` and save the papers that have one
save_only_papers_w_ids(suffix)

100%|██████████| 235/235 [01:57<00:00,  2.00it/s]

Number of Missing Papers: 29 / 235



