In [1]:
import matplotlib.pyplot as plt
from matplotlib import colors
from matplotlib.colors import Normalize
import json
import numpy as np
import pandas as pd
import requests
import warnings
import pickle
from sklearn.cluster import KMeans, DBSCAN
from scipy.optimize import curve_fit
import time
from tqdm import tqdm

In [2]:
#Load your api key
with open('api_keys.json') as f:
    api_keys = json.load(f)
semantic_scholar_key = api_keys['semantic_scholar']
alt_metric_key = api_keys['alt_metric']

In [3]:
def add_ids(data_w_ids, paper_idx, r):
    """Add semantic scholar ids, DOIs and ArxivId if available"""
    no_arxiv = False
    no_doi = False
    data_w_ids[paper_idx].update({'paperId': r['paperId']})
    try:
        data_w_ids[paper_idx].update({'arxivId': r['externalIds']['ArXiv']})
    except:
        no_arxiv = True
    try:
        data_w_ids[paper_idx].update({'DOI': r['externalIds']['DOI']})
    except:
        no_doi = True
    if no_doi and no_arxiv:
        print('No DOI AND No Arxiv found')
    return False

In [4]:
def new_get_values(data, payload=None):
    """get number of citations for all datasets by accessing the semantic scholar api
    number of citations are obtained by link of paper if it's an arxiv paper or alternatively by
    the papers DOI"""
    data_w_ids = data.copy()
    if payload is None:
        payload = {'fields': 'citationCount'}
    missing = []
    for paper_idx, paper in enumerate(tqdm(data)):
        searching = True
        if 'DOI' in paper.keys() and paper['DOI'] != '-' and searching:
            with requests.Session() as s:
                r = s.get(f'https://api.semanticscholar.org/graph/v1/paper/DOI:{paper["DOI"]}',headers={'x-api-key':semantic_scholar_key}, timeout=30,
                             params=payload).json()
                if 'error' not in r.keys():
                    searching = add_ids(data_w_ids, paper_idx, r)
                #else:
                    #print('failed doi', r)
        if searching and 'relatedPaper' in paper.keys() :
            if 'arxiv' or 'semanticscholar' in paper['relatedPaper']:
                url = (paper['relatedPaper'].replace('.pdf', ''))
                r = requests.get(f'https://api.semanticscholar.org/graph/v1/paper/URL:{url}',headers={'x-api-key':semantic_scholar_key}, timeout=30, params=payload).json()
                if 'error' not in r.keys():
                    #print('success!!!!', url)
                    searching = add_ids(data_w_ids, paper_idx, r)
        if searching:
            missing.append(paper['id'])
    else:
        print('Number of Missing Papers:', len(missing), '/', len(data))
    return data_w_ids, missing

In [6]:
def save_only_papers_w_ids():
    """Searches for the ids of papers and saves those which have ids"""
    file_name = 'data/data_sorted_new.json'
    with open(file_name, "r") as ds:
        data = json.load(ds)
        payload = {'fields': 'paperId,externalIds'}
    ids, missing_data = new_get_values(data, payload)
    c = 0
    only_papers_w_ids = []
    for id in ids:
        if 'paperId' in id.keys():
            only_papers_w_ids.append(id)
            c+=1
    with open('data/data_sorted_only_w_ids.json', 'w') as f:
        json.dump(only_papers_w_ids, f)
    return only_papers_w_ids

In [7]:
only_papers_w_ids = save_only_papers_w_ids()

100%|██████████| 211/211 [01:27<00:00,  2.41it/s]

Number of Missing Papers: 60 / 211





In [9]:
only_papers_w_ids[0].keys()

dict_keys(['id', 'href', 'size_hours', 'size_storage', 'frames', 'numberOfScenes', 'samplingRate', 'lengthOfScenes', 'sensors', 'sensorDetail', 'benchmark', 'annotations', 'licensing', 'relatedDatasets', 'publishDate', 'lastUpdate', 'paperTitle', 'relatedPaper', 'location', 'rawData', 'DOI', 'citationCount', 'completionStatus', 'paperId'])

In [18]:
# not needed to reproduce results.
def number_vars(variable):
    """counts how many datasets have a value for a variable, e.g. 151 data sets have an ID but only 73 a value for number of frames."""
    number = 0
    for i in range(len(only_papers_w_ids)):
        try:
            if len(only_papers_w_ids[i][variable]) > 1:
                number += 1
        except:
            pass
    print(variable, number)
def number_frames_and_sensors():
    """Counts how many datasets have both senors and frames"""
    number = 0
    for i in range(len(only_papers_w_ids)):
        try:
            if len(only_papers_w_ids[i]['sensors']) > 1 and len(only_papers_w_ids[i]['frames']) > 1:
                number += 1
        except:
            pass
    print(number)
number_frames_and_sensors()
for key in only_papers_w_ids[0].keys():
    number_vars(key)

73
id 151
href 149
size_hours 38
size_storage 42
frames 73
numberOfScenes 57
samplingRate 45
lengthOfScenes 14
sensors 120
sensorDetail 95
benchmark 48
annotations 94
licensing 102
relatedDatasets 47
publishDate 119
lastUpdate 33
paperTitle 92
relatedPaper 150
location 95
rawData 50
DOI 132
citationCount 0
completionStatus 151
paperId 151


In [8]:
def altmetric_doi_requests(doi):
    global alt_metric_key
    r = requests.get(f'https://api.altmetric.com/v1/fetch/doi/{doi}?key={alt_metric_key}')
    return r

def altmetric_arxiv_requests(arxiv):
    global alt_metric_key
    r = requests.get(f'https://api.altmetric.com/v1/fetch/arxiv_id/{arxiv}?key={alt_metric_key}')
    return r

def get_alt_values(r):
    """Extracts altmetirc score, percentile, percentile after 3 months and readers"""
    total_readers = 0
    for key in r['counts']['readers'].keys():
        total_readers += int(r['counts']['readers'][key])
    try:
        score = r['score']
    except:
        score = 0
    try:
        percentile = r['altmetric_score']['context_for_score']['all']['percentile']
    except:
        percentile = 0
    try:
        similar_age_3m_percentile = r['altmetric_score']['context_for_score']['similar_age_3m']['percentile']
    except:
        similar_age_3m_percentile = 0
    alt_values = {'score': score}, {'percentile': percentile}, {
        'similar_age_3m_percentile': similar_age_3m_percentile}, {'total_readers': total_readers}
    return alt_values

def get_altmetrics(data):
    """iterates over papers, calls api requests, and updates the relevant data"""
    c = 0
    c_doi = 0
    r = None
    for idx_paper, paper in enumerate(data):
        c += 1
        if 'DOI' in paper.keys():
            try:
                r = altmetric_doi_requests(paper['DOI'])
            except Exception as ex:
                warnings.warn(f'{paper["id"]} caused: {ex}')
        if 'arxivId' in paper.keys():
            try:
                r = altmetric_arxiv_requests(paper['arxivId'])
            except Exception as ex:
                warnings.warn(f'{paper["id"]} caused: {ex}')
        if r is not None and r.ok:
            r = r.json()
            #print(paper['id'])
            c_doi += 1
            alt_values = get_alt_values(r)
            data[idx_paper].update({'altmetrics': alt_values})
        elif not r.ok:
            print(r)
    with open('data/data_sorted_w_altmetrics.json', 'w') as f:
        json.dump(data, f)
    print('found:', c_doi, '/', c)
    return data

In [94]:
import urllib.parse
doi =  '10.1177/0278364913507326'
doi = urllib.parse.unquote(doi)
url = f'https://api.altmetric.com/v1/fetch/doi/{doi}?key={alt_metric_key}'
#url = urllib.parse.quote(url)
print(url)
requests.get(url)

https://api.altmetric.com/v1/fetch/doi/10.1177/0278364913507326?key=f98d8c1ae2ee25733dfe36811c83faa9


<Response [404]>

In [95]:
data = get_altmetrics(only_papers_w_ids)

<Response [404]>
<Response [404]>
<Response [404]>
<Response [404]>
<Response [404]>
<Response [404]>
<Response [404]>
<Response [404]>
<Response [404]>
<Response [404]>
<Response [404]>
<Response [404]>
<Response [404]>
<Response [404]>
<Response [404]>
<Response [404]>
<Response [404]>
<Response [404]>
<Response [404]>
<Response [404]>
found: 131 / 151
