## 1. Collect

In [1]:
import csv
import os

from elasticsearch import Elasticsearch
import pandas as pd
import requests

In [2]:
# --- Elasticsearch settings ---
ES_HOST = "https://cluster.elasticsearch.dataesr.ovh/"
ES_INDEX = "bso-publications"
# Credentials are read from the environment so that no secret is stored in the
# notebook. The password that used to be hard-coded here must be considered
# leaked and should be rotated.
ES_USER = os.environ.get("ES_USER", "BSO")
ES_PASSWORD = os.environ.get("ES_PASSWORD", "")
if not ES_PASSWORD:
    print("ES_PASSWORD is not set: export it before running the Elasticsearch cells.")

# --- OpenAlex settings ---
OPENALEX_API = "https://api.openalex.org/works"
# Country code used in the `institutions.country_code` filter.
OPENALEX_COUNTRIES = "fr"
# NOTE(review): not referenced by the visible cells; kept for compatibility.
OPENALEX_FIELD = "open_access.oa_status"
# Contact address sent as the `mailto` query parameter.
OPENALEX_MAILTO = "bso@recherche.gouv.fr"
# Upper bound on the number of works requested per filter; together with
# OPENALEX_PER_PAGE it determines how many pages are fetched.
OPENALEX_MAX_RESULTS = 10000
OPENALEX_PER_PAGE = 200
# Publication year used in the `publication_year` filter.
OPENALEX_YEAR = "2020"

In [3]:
# Client for the BSO Elasticsearch cluster, authenticated with the user and
# password defined in the configuration cell.
es = Elasticsearch(
    hosts=ES_HOST,
    http_auth=(ES_USER, ES_PASSWORD),
)

In [4]:
def get_works_by_status(filter, label):
    """Fetch OpenAlex works matching an extra filter and label them.

    The request always restricts works to the configured country and
    publication year, to works having a DOI and to non-paratext works;
    `filter` is appended to that base filter. Pages are requested one by one
    until OPENALEX_MAX_RESULTS works have been asked for or an empty page is
    returned.

    Args:
        filter: Additional OpenAlex filter expression, appended verbatim to
            the base filter (the name shadows the builtin but is part of the
            call interface).
        label: Value stored as "openalex_type" for every collected DOI.

    Returns:
        A `(works, results)` tuple. `works` is the flat list of work records
        returned by the API. `results` maps each DOI (without the
        "https://doi.org/" prefix) to
        `{"openalex_type": label, "bso_type": "not_found"}`.
    """
    works = []
    results = {}
    url = f"{OPENALEX_API}?filter=institutions.country_code:{OPENALEX_COUNTRIES},publication_year:{OPENALEX_YEAR},has_doi:true,is_paratext:false,{filter}&mailto={OPENALEX_MAILTO}&per-page={OPENALEX_PER_PAGE}&page="
    number_of_pages = OPENALEX_MAX_RESULTS // OPENALEX_PER_PAGE
    for page in range(1, number_of_pages + 1):
        url_with_page = f"{url}{page}"
        try:
            # A timeout keeps a stalled connection from hanging the notebook.
            response = requests.get(url_with_page, timeout=60)
            # Without this check an HTTP error carrying a JSON body (e.g. a
            # rate-limit response) would be silently read as an empty page.
            response.raise_for_status()
            page_works = response.json().get("results", [])
        except Exception as error:
            # Best effort: report the failing page and carry on with the next.
            print(f"An exception occurred | {url_with_page} | {error}")
            continue
        if not page_works:
            # An empty page means there is nothing left to fetch.
            break
        works.extend(page_works)
    for work in works:
        doi = work.get("doi")
        if not doi:
            # Defensive: a record without DOI cannot be matched later on.
            continue
        results[doi.replace("https://doi.org/", "")] = { "openalex_type": label, "bso_type": "not_found" }
    return works, results

In [5]:
# Collect the works for each open-access category in turn. `works` accumulates
# every record; `results` is keyed by DOI, so a DOI returned by several
# categories keeps the label of the last one.
works, results = [], {}

# Open access through a repository only (green).
works_repo, results_repo = get_works_by_status(
    filter="open_access.oa_status:green",
    label="repository",
)
works.extend(works_repo)
results.update(results_repo)

# Open access at the publisher with a full text in a repository as well.
works_repopubli, results_repopubli = get_works_by_status(
    filter="open_access.is_oa:true,open_access.oa_status:!green,open_access.any_repository_has_fulltext:true",
    label="publisher;repository",
)
works.extend(works_repopubli)
results.update(results_repopubli)

# Open access at the publisher only (the variable names are reused on purpose
# to leave the notebook namespace as it was).
works_repopubli, results_repopubli = get_works_by_status(
    filter="open_access.is_oa:true,open_access.oa_status:!green,open_access.any_repository_has_fulltext:false",
    label="publisher",
)
works.extend(works_repopubli)
results.update(results_repopubli)

In [6]:
# Compare the number of fetched records with the number of distinct DOIs.
print(
    f"Number of collected works: {len(works)}",
    f"Number of results: {len(results)}",
    sep="\n",
)

Number of collected works: 30000
Number of results: 29999


In [7]:
# Look up the collected DOIs in the BSO index, `n` DOIs per request, and store
# the BSO host type (snapshot "2023Q1") next to the OpenAlex label. DOIs that
# are not returned keep their initial "bso_type".
dois = list(results)
n = 20
dois_chunks = [dois[start:start + n] for start in range(0, len(dois), n)]
for chunk in dois_chunks:
    query = {"bool": {"should": [{"terms": {"doi.keyword": chunk}}]}}
    response = es.search(index=ES_INDEX, query=query, size=n)
    hits = response.get("hits", {}).get("hits", [])
    for hit in hits:
        source = hit.get("_source", {})
        doi = source.get("doi")
        bso_status = source.get("oa_details", {}).get("2023Q1", {}).get("oa_host_type")
        # Skip hits lacking a DOI or a host type for the snapshot.
        if not (doi and bso_status):
            continue
        if doi in results:
            results[doi]["bso_type"] = bso_status
len(results)

29999

In [8]:
# A DOI is reported when the OpenAlex label differs from the BSO host type;
# this includes DOIs that still carry the initial "not_found" value.
errors = {k: v for k, v in results.items() if v.get("openalex_type") != v.get("bso_type")}
print(f"Number of detected errors: {len(errors)}")
outputfile = f"errors_{OPENALEX_COUNTRIES}_{OPENALEX_YEAR}.csv"
# The encoding is explicit so the file does not depend on the platform default
# and matches the UTF-8 default used by pandas when it is read back.
with open(outputfile, "w", newline="", encoding="utf-8") as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(["doi", "openalex_type", "bso_type"])
    csvwriter.writerows(
        [doi, types.get("openalex_type"), types.get("bso_type")]
        for doi, types in errors.items()
    )

Number of detected errors: 2722


## 2. Analyze

In [9]:
# Reload the mismatches exported at the end of the collect section.
filepath = f"errors_{OPENALEX_COUNTRIES}_{OPENALEX_YEAR}.csv"
df = pd.read_csv(filepath_or_buffer=filepath)

In [10]:
# Number of mismatching DOIs (one row per DOI).
len(df)

2722

In [11]:
# Mismatches broken down by the OpenAlex label.
df.groupby("openalex_type").size()

openalex_type
publisher               1892
publisher;repository     379
repository               451
dtype: int64

In [12]:
# Mismatches broken down by (OpenAlex label, BSO host type) pair.
df.groupby(["openalex_type", "bso_type"]).size()

openalex_type         bso_type            
publisher             closed                  100
                      not_found               804
                      publisher;repository    983
                      repository                5
publisher;repository  not_found               129
                      publisher               213
                      repository               37
repository            closed                  145
                      not_found               248
                      publisher                 3
                      publisher;repository     55
dtype: int64