In [None]:
import ast
import csv
import json
import os

import requests
from dotenv import load_dotenv

In [None]:
# Make the variables declared in the local .env file visible through the process environment
load_dotenv()

# Elasticsearch cluster and index to query
ES_HOST = "https://cluster.elasticsearch.dataesr.ovh"
ES_INDEX = "bso-publications"

# Token read from the environment; None when ES_TOKEN is not set
ES_TOKEN = os.environ.get("ES_TOKEN")

In [None]:
# Enrich every row of doi_error_reason.csv with the French affiliations found in
# FOSM (Elasticsearch) and in OpenAlex, then rewrite the file in place with three
# extra columns. The rows are also kept in memory in `errors` as positional lists:
# [doi, error_code, error_reason, fosm_institutions, openalex_french_institutions, openalex_french_raw_affiliations]
with open("doi_error_reason.csv", newline='', encoding="UTF8") as csvfile:
  rows = csv.reader(csvfile)
  # Skip headers
  next(rows, None)
  errors = []
  for row in rows:
    # csv.reader yields an empty list for a blank line (e.g. a trailing newline)
    if not row:
      continue
    doi = row[0].lower()
    error_code = row[1]
    error_reason = row[2]
    openalex_work = requests.get(f"https://api.openalex.org/works/https://doi.org/{doi}").json()
    # A key present with a JSON null comes back as None, which a .get() default
    # does not replace; the "or" fallbacks below cover that case.
    openalex_authorships = openalex_work.get("authorships") or []
    openalex_french_institutions = []
    openalex_french_raw_affiliations = []
    for openalex_author in openalex_authorships:
      for openalex_institution in openalex_author.get("institutions") or []:
        if (openalex_institution.get("country_code") or "").lower() == "fr":
          openalex_french_institutions.append(openalex_institution.get("display_name"))
      if "FR" in (openalex_author.get("countries") or []):
        openalex_french_raw_affiliations += openalex_author.get("raw_affiliation_strings") or []
    # De-duplicate while keeping first-seen order, so that two runs write identical files
    openalex_french_institutions = list(dict.fromkeys(openalex_french_institutions))
    openalex_french_raw_affiliations = list(dict.fromkeys(openalex_french_raw_affiliations))
    fosm_data = requests.get(f"{ES_HOST}/{ES_INDEX}/_search?q=all_ids:\"doi{doi}\"", headers={"Authorization": f"Basic {ES_TOKEN}"}).json()
    fosm_hits = fosm_data.get("hits", {}).get("hits", [])
    fosm_affiliations = []
    if fosm_hits:
      # Only the first hit is considered
      fosm_source_affiliations = fosm_hits[0].get("_source", {}).get("affiliations") or []
      # ";" is the separator used when joining below, so it is replaced inside each name
      fosm_affiliations = [(aff.get("name") or "").replace(";", ",") for aff in fosm_source_affiliations if "fr" in (aff.get("detected_countries") or [])]
    errors.append([doi, error_code, error_reason, "; ".join(fosm_affiliations), openalex_french_institutions, openalex_french_raw_affiliations])
# The input file is closed at this point, so it can safely be overwritten.
# newline='' lets the csv module control line endings itself.
with open("doi_error_reason.csv", "w", newline='', encoding="UTF8") as f:
  writer = csv.writer(f)
  writer.writerow(["doi", "error_code", "error_reason", "fosm_institutions", "openalex_french_institutions", "openalex_french_raw_affiliations"])
  # List-valued cells are written as their Python repr by csv.writer
  writer.writerows(errors)

In [None]:
# For every row of doi_error_reason.csv, send each raw affiliation string stored in
# the last column to the affiliation matcher and print the distinct countries it returns.
# The file is read as UTF8 because that is the encoding it is written with.
with open("doi_error_reason.csv", newline='', encoding="UTF8") as csvfile:
  rows = csv.reader(csvfile)
  # Skip headers
  next(rows, None)
  for row in rows:
    # csv.reader yields an empty list for a blank line (e.g. a trailing newline)
    if not row:
      continue
    # The last column holds the repr of a Python list. ast.literal_eval parses
    # literals only, so nothing found in the file can be executed (unlike eval).
    openalex_french_raw_affiliations = ast.literal_eval(row[-1])
    countries = []
    for openalex_french_raw_affiliation in openalex_french_raw_affiliations:
      r = requests.post("https://affiliation-matcher.staging.dataesr.ovh/match", json={ "type": "country", "year": "2022", "verbose": True, "query": openalex_french_raw_affiliation })
      countries += r.json().get("results", [])
    # De-duplicate while keeping first-seen order, so the printed output is stable across runs
    countries = list(dict.fromkeys(countries))
    print(countries)

In [None]:
# Ask the affiliation matcher which countries it detects for each DOI, once from the
# OpenAlex raw affiliation strings and once from the OpenAlex institution names.
# A DOI is flagged as a false positive when "fr" is not among the countries detected
# from its raw affiliations. The report is saved to data_file.json.
data = []
for error in errors:
  # Each entry of `errors` is a positional list, not a dict:
  # [doi, error_code, error_reason, fosm_institutions, openalex_french_institutions, openalex_french_raw_affiliations]
  error_doi = error[0]
  openalex_institutions = error[4]
  openalex_affiliations = error[5]
  affiliation_matcher_countries_affiliations = []
  for openalex_affiliation in openalex_affiliations:
    r = requests.post("https://affiliation-matcher.staging.dataesr.ovh/match", json={ "type": "country", "year": "2022", "verbose": True, "query": openalex_affiliation })
    affiliation_matcher_countries_affiliations += r.json().get("results", [])
  # De-duplicate while keeping first-seen order, so the JSON file is identical from one run to the next
  affiliation_matcher_countries_affiliations = list(dict.fromkeys(affiliation_matcher_countries_affiliations))
  affiliation_matcher_countries_institutions = []
  for openalex_institution in openalex_institutions:
    r = requests.post("https://affiliation-matcher.staging.dataesr.ovh/match", json={ "type": "country", "query": openalex_institution })
    affiliation_matcher_countries_institutions += r.json().get("results", [])
  affiliation_matcher_countries_institutions = list(dict.fromkeys(affiliation_matcher_countries_institutions))
  is_false_positive = "fr" not in affiliation_matcher_countries_affiliations
  data.append({
    "doi": error_doi,
    "openalex_institutions": openalex_institutions,
    "openalex_affiliations": openalex_affiliations,
    "affiliation_matcher_countries_affiliations": affiliation_matcher_countries_affiliations,
    "affiliation_matcher_countries_institutions": affiliation_matcher_countries_institutions,
    "is_false_positive": is_false_positive
  })
with open("data_file.json", "w") as jsonfile:
  json.dump(data, jsonfile, indent=4)
# Total number of DOIs, then how many of them are flagged as false positives
print(len(data))
print(len([d for d in data if d.get("is_false_positive")]))