In [1]:
from copy import deepcopy
import csv
import json
import requests

In [2]:
bso_name = "A2F"
per_page = 200 # Any number between 1 and 200
query = "https://openalex.org/works?page=1&filter=authorships.institutions.lineage:i4210104468|i4210102596|i4210133191|i4210124901|i4387154115|i4387155501,publication_year:2017-2025"

In [3]:
# Transform user query into params to query OpenAlex API
params = {}
for param in query.split("?")[1].split("&"):
  [key, value] = param.split("=")
  params[key] = value
# Add default config
params["mailto"] = "bso@recherche.gouv.fr"
params["per-page"] = per_page
params["select"] = "doi,id,locations"

In [4]:
def get_count_from_openalex():
  params_count = deepcopy(params)
  del params_count["per-page"]
  response = requests.request("GET", params=params_count, url="https://api.openalex.org/works")
  return response.json().get("meta", {}).get("count", 0)

In [5]:
def get_works_from_openalex(cursor="*"):
  params_works = deepcopy(params)
  params_works["cursor"] = cursor
  response = requests.request("GET", params=params_works, url="https://api.openalex.org/works")
  next_cursor = response.json().get("meta", {}).get("next_cursor")
  dois = []
  hal_ids = []
  nnt_ids = []
  no_ids = []
  for result in response.json().get("results", []):
    # Collect DOI
    doi = result.get("doi")
    if doi is not None:
      dois.append(doi.replace("https://doi.org/", ""))
    hal_id = None
    nnt_id = None
    for location in result.get("locations", []):
        # Collect HAL ids
        if "pmh:oai:HAL:" in location.get("id"):
            hal_id = location.get("id").replace("pmh:oai:HAL:", "")
            # Delete HAL version if any
            if hal_id[-2] == 'v':
               hal_id = hal_id[0:-2]
            hal_ids.append(hal_id)
        if location.get("landing_page_url") and "http://www.theses.fr/" in location.get("landing_page_url"):
            nnt_id = location.get("landing_page_url").replace("http://www.theses.fr/", "").replace("/document", "")
            nnt_ids.append(nnt_id)
    # No collected id
    if doi is None and hal_id is None and nnt_id is None:
       no_ids.append(result)
  return { "next_cursor": next_cursor, "dois": dois, "hal_ids": hal_ids, "nnt_ids": nnt_ids,"no_ids": no_ids }

In [6]:
count = get_count_from_openalex()
cursor = "*"
works_dois = []
works_hal_ids = []
works_nnt_ids = []
works_no_ids = []
print(f"Publications count : {count}")
page = 0
while len(works_dois) < count:
  page += 1
  d = get_works_from_openalex(cursor)
  cursor = d.get("next_cursor")
  works_dois += d.get("dois")
  works_hal_ids += d.get("hal_ids")
  works_nnt_ids += d.get("nnt_ids")
  works_no_ids += d.get("no_ids")
  print("Page : ", page)
# Flat HAL ids array
# works_hal_ids = [element for sublist in works_hal_ids for element in sublist]
# Remove duplicates
works_hal_ids = list(set(works_hal_ids))
print(f"DOI count : {len(works_dois)}")
print(f"HAL Ids count : {len(works_hal_ids)}")
print(f"NNT count : {len(works_nnt_ids)}")
print(f"No DOI count : {len(works_no_ids)}")

Publications count : 3309
Page :  1
Page :  2
Page :  3
Page :  4
Page :  5
Page :  6
Page :  7
Page :  8
Page :  9
Page :  10
Page :  11
Page :  12
Page :  13
Page :  14
Page :  15
Page :  16
Page :  17
Page :  18
Page :  19
Page :  20
Page :  21
Page :  22
Page :  23
Page :  24
DOI count : 3484
HAL Ids count : 3093
NNT count : 37
No DOI count : 41


In [7]:
with open(f"{bso_name}_errors.json", "w") as f:
    json.dump({ "errrors": [error.get("id") for error in works_no_ids] }, f)

In [8]:
max_count = max(len(works_dois), len(works_hal_ids), len(works_nnt_ids))
print(max_count)
data = []
for i in range(max_count):
    tmp = {}
    if len(works_dois) > i:
        tmp["doi"] = works_dois[i]
    if len(works_hal_ids) > i:
        tmp["hal_id"] = works_hal_ids[i]
    if len(works_nnt_ids) > i:
        tmp["nnt_id"] = works_nnt_ids[i]
    data.append(tmp)
print(len(data))
with open(f"{bso_name}.csv", "w", newline="") as f:
  writer = csv.DictWriter(f, delimiter=";", fieldnames=["doi", "hal_id", "nnt_id"])
  writer.writeheader()
  writer.writerows(data)

3484
3484
