In [None]:
from dotenv import load_dotenv
import json
import numpy as np
import os
import pandas as pd
from pathlib import Path
import requests
import shutil

In [None]:
# Load variables from a local .env file (python-dotenv) before anything reads the environment.
load_dotenv()

# Every input and output of this notebook lives under a single root folder.
# Switch to the commented value to run against the current working directory.
root_folder_name = "/upw_data/openalex_dump"
# root_folder_name = "."

# Folder receiving one JSON file per OpenAlex work.
data_folder_name = root_folder_name + "/data"
# Name of the archive built from the data folder (created next to it, under the root).
zip_file_name = "data.zip"
# Workbook listing the journals (ISSN-E / ISSN-P columns) to harvest.
issns_file_name = root_folder_name + "/issns.xlsx"
# Resume log: one "issn,count" row per ISSN already harvested.
done_file_name = root_folder_name + "/done.csv"

In [None]:
def getOpenAlexWorks(issn):
  """Download every OpenAlex work linked to an ISSN, one JSON file per work.

  Pages through the OpenAlex `/works` endpoint with cursor pagination, see
  https://github.com/ourresearch/openalex-api-tutorials/blob/main/notebooks/getting-started/paging.ipynb
  Each work that is not already on disk is written to
  `{data_folder_name}/<lower-cased OpenAlex id>.json`.

  Args:
    issn: ISSN used in the `locations.source.issn` filter.

  Returns:
    The `meta.count` value reported by the first page, or None when the
    response carries no count.

  Raises:
    requests.HTTPError: if OpenAlex answers with an error status. Raising
      (instead of silently ending the loop) prevents a partially downloaded
      ISSN from being reported as complete.
    requests.RequestException: on network failure or timeout.
  """
  # Honour a notebook-level OPENALEX_API_KEY if some cell defines one, otherwise
  # fall back to the environment (populated by load_dotenv()). The key is optional.
  api_key = globals().get("OPENALEX_API_KEY") or os.getenv("OPENALEX_API_KEY")

  # The output folder may not exist yet when reset() has never been run.
  output_dir = Path(data_folder_name)
  output_dir.mkdir(parents=True, exist_ok=True)

  per_page = 100
  cursor = "*"  # "*" asks OpenAlex for the first page
  count = None
  while cursor:
    # Let requests build the query string so the cursor is URL-encoded.
    params = {
      "filter": f"locations.source.issn:{issn}",
      "per_page": per_page,
      "cursor": cursor,
    }
    if api_key:
      params["api_key"] = api_key
    # HTTPS so the API key never travels in cleartext; the timeout keeps a
    # stalled connection from hanging the whole harvest.
    response = requests.get("https://api.openalex.org/works", params=params, timeout=60)
    response.raise_for_status()
    payload = response.json()
    meta = payload.get("meta") or {}
    # The total is read once, from the first page.
    if cursor == "*":
      count = meta.get("count")
    for result in payload.get("results") or []:
      work_id = result.get("id")
      if not work_id:
        # Without an id there is no file name to write to.
        continue
      file_name = f"{work_id.lower().replace('https://openalex.org/', '')}.json"
      target = output_dir / file_name
      # Works already on disk are kept as they are.
      if not target.is_file():
        with open(target, "w") as file:
          json.dump(result, file, indent=4)
    # A missing/None next_cursor means the last page has been reached.
    cursor = meta.get("next_cursor")
  return count

def reset():
  """Initialise or reset the folders and files the harvest relies on.

  Recreates an empty data folder, deletes the zip archive if present and
  replaces the done file with an empty one.
  """
  # Data folder: drop it with its content, then create it again.
  data_dir = Path(data_folder_name)
  if data_dir.is_dir():
    shutil.rmtree(data_folder_name)
  os.mkdir(data_folder_name)

  # Archive produced by a previous run.
  archive_name = f"{root_folder_name}/{zip_file_name}"
  if Path(archive_name).is_file():
    os.remove(archive_name)

  # Done file: remove any existing one, then create it empty.
  if Path(done_file_name).is_file():
    os.remove(done_file_name)
  open(done_file_name, "w").close()

def main ():
  """Harvest OpenAlex works for every journal of the ISSN workbook, then zip them.

  For each row the electronic ISSN ("ISSN-E") is preferred and the print ISSN
  ("ISSN-P") is the fallback. ISSNs already listed in the done file are
  skipped, so an interrupted run can be restarted. Each harvested ISSN is
  appended to the done file as an "issn,count" row. Finally the data folder
  is packed into a zip archive next to it.
  """
  # NOTE(review): the sheet name looks mis-encoded (UTF-8 text decoded as Latin-1?).
  # It is left untouched here; confirm it against the actual workbook.
  df = pd.read_excel(issns_file_name, sheet_name="liste dedoublonnÃ©e et avis")
  df = df.replace("nan", "").replace(np.nan, "")
  total = df.shape[0]

  # Read the resume log once instead of on every row. A missing or empty file
  # simply means nothing has been harvested yet.
  try:
    df_done = pd.read_csv(done_file_name, header=None, dtype=str)
    done_issns = set(df_done[0].dropna().str.strip())
  except (FileNotFoundError, pd.errors.EmptyDataError):
    done_issns = set()

  # Append mode: earlier rows are preserved and each new row costs one write.
  with open(done_file_name, "a") as done_file:
    for index, row in df.iterrows():
      # Coerce to str and strip so numeric cells or stray spaces cannot break
      # the emptiness test or the lookup in the done set.
      issn_e = str(row.get("ISSN-E", "") or "").strip()
      issn_p = str(row.get("ISSN-P", "") or "").strip()
      issn = issn_e or issn_p
      print(f"{index} / {total} - ISSN: {issn}")
      if not issn:
        print(f"Error to retrieve issn for: {issn_e} or {issn_p}")
        continue
      if issn in done_issns:
        continue
      count = getOpenAlexWorks(issn)
      # As before, an ISSN with no (or zero) count is not recorded and will be
      # queried again on the next run.
      if count:
        done_file.write(f"{issn},{count}\n")
        # Flush right away so progress survives an interruption.
        done_file.flush()
        done_issns.add(issn)
  shutil.make_archive(data_folder_name, "zip", data_folder_name)

In [None]:
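# Uncomment to start from scratch: this deletes the data folder, the zip archive and the done file.
# reset()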

In [None]:
# Run the harvest. ISSNs already listed in the done file are skipped,
# so this cell can be re-executed after an interruption.
main()