In [1]:
import requests
import pandas as pd
import numpy as np
import plotly.express as px
from tqdm import tqdm
import backoff

In [2]:
@backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_time=30)
@backoff.on_predicate(backoff.expo, lambda x: x.status_code >= 429, max_time=30)
def make_request(url, params=None, debug=False, timeout=10):
    """Send a GET request, retrying with exponential backoff.

    Two retry layers wrap the call:

    * a response whose status code is 429 or any higher value is retried
      for up to 30 seconds; once that budget is spent the last response is
      returned as-is, so callers should still check the status;
    * any ``requests.exceptions.RequestException`` (which includes timeouts)
      is retried for up to 30 seconds and then re-raised.

    Args:
        url: Address to request.
        params: Optional mapping of query-string parameters.
        debug: When true, print ``url`` and ``params`` before each attempt.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without one, a stalled connection would block forever and the
            retry decorators would never get a chance to run.

    Returns:
        The ``requests.Response`` of the final attempt.
    """
    if debug:
        print(url, params)
    # ``params=None`` is the default of requests.get, so one call covers both
    # the "with params" and "without params" cases.
    return requests.get(url, params=params, timeout=timeout)


def paginate_openalex(url, params=None, per_page=200, debug=False):
    """Yield one response per page, following the ``meta.next_cursor`` chain.

    The first request is sent with ``cursor="*"``; each following request
    uses the ``next_cursor`` value found in the previous page's ``meta``
    object. Iteration stops when that value is empty or null.

    Args:
        url: Endpoint to page through.
        params: Optional query parameters. The mapping is copied, so the
            caller's object is never modified.
        per_page: Page size sent as ``per-page`` unless ``params`` already
            contains that key; a falsy value leaves the key out.
        debug: Forwarded to ``make_request``.

    Yields:
        The response object returned by ``make_request`` for each page.
    """
    # Work on a private copy: writing "per-page" and "cursor" into the
    # caller's dict would leak paging state into whatever reuses it.
    params = dict(params) if params is not None else {}
    if "per-page" not in params and per_page:
        params["per-page"] = per_page
    cursor = "*"
    while cursor:
        params["cursor"] = cursor
        r = make_request(url, params, debug=debug)
        yield r

        # The cursor for the next request comes from the page just yielded.
        cursor = r.json()["meta"]["next_cursor"]



In [5]:
# Fetch the list of work types. Going through make_request gives this call the
# same retry/backoff handling as the rest of the notebook.
r = make_request("https://api.openalex.org/types")
# Fail here with a clear HTTP error instead of a KeyError on 'results' below.
r.raise_for_status()
work_types = [t["display_name"] for t in r.json()["results"]]

In [7]:
# For every work type, ask the works endpoint for counts grouped by the type of
# the primary location's source, and collect one row per (work type, source type).
url = "https://api.openalex.org/works"
data = []
for work_type in tqdm(work_types):
    params = {
        "mailto": "jportenoy@ourresearch.org",
        "filter": f"type:{work_type}",
        "group_by": "primary_location.source.type",
    }
    # One request is sent per work type in quick succession, so use the
    # retrying helper rather than a bare requests.get.
    r = make_request(url, params)
    # Surface HTTP errors directly rather than as a KeyError on "group_by".
    r.raise_for_status()
    data.extend(
        {
            "work_type": work_type,
            "source_type": group["key"],
            "source_type_display_name": group["key_display_name"],
            "works_count": group["count"],
        }
        for group in r.json()["group_by"]
    )
# Naming the columns keeps the frame's schema stable even if no rows came back.
df = pd.DataFrame(
    data,
    columns=["work_type", "source_type", "source_type_display_name", "works_count"],
)

100%|██████████| 18/18 [00:04<00:00,  4.41it/s]


In [9]:
# Copy the table to the system clipboard, leaving out the index column.
df.to_clipboard(index=False)