In [None]:
from dotenv import load_dotenv
import os
import pymongo
import requests

In [None]:
# --- French OSM (Elasticsearch) settings ---
FOSM_BASE_URL = "https://cluster.elasticsearch.dataesr.ovh"
FOSM_INDEX = "bso-publications-20230728"
FOSM_LIMIT = 0  # Set to 0 for no limit
FOSM_PAGE_SIZE = 10000  # Maximum is 10000

# --- MongoDB target ---
MONGO_DB = "bsocoverage"
MONGO_COLLECTION = "publications"

# --- OpenAlex settings ---
OA_LIMIT = 0  # Set to 0 for no limit
OA_MAX_RETRY = 5
OA_PAGE_SIZE = 200  # Maximum is 200

# Load the variables declared in the .env file into the process environment.
# Without this call os.environ only contains what the shell already exported,
# so both secrets below would silently be None.
load_dotenv()

# Access the environment variables from the .env file
FOSM_AUTHORIZATION = os.environ.get("FOSM_AUTHORIZATION")
OA_API_KEY = os.environ.get("OA_API_KEY")

In [None]:
# Open a client with the driver's default connection settings, then resolve
# the database and the collection every later cell writes into.
mongo_client = pymongo.MongoClient()
mongo_database = mongo_client[MONGO_DB]
mongo_collection = mongo_database[MONGO_COLLECTION]
# Uncomment the two lines below to start from an empty collection with a
# unique index on "doi".
# mongo_collection.drop()
# mongo_collection.create_index([("doi", pymongo.ASCENDING)], unique=True)

In [None]:
# Count every document of the French OSM index (no query filter)
r = requests.get(
    f"{FOSM_BASE_URL}/{FOSM_INDEX}/_count",
    headers={"Authorization": f"Basic {FOSM_AUTHORIZATION}"},
)
fosm_count_payload = r.json()
fosm_total_count = fosm_count_payload.get("count")
fosm_total_count

In [None]:
# Count the documents of the French OSM index that match the query "doi:*".
# This value is the denominator of the progress percentage printed while
# collecting the publications.
r = requests.get(
    f"{FOSM_BASE_URL}/{FOSM_INDEX}/_count?q=doi:*",
    headers={"Authorization": f"Basic {FOSM_AUTHORIZATION}"},
)
fosm_doi_payload = r.json()
fosm_doi_count = fosm_doi_payload.get("count")
fosm_doi_count

In [None]:
def get_fosm_publications(pit, total_results_count, search_after=None):
    """Copy the French OSM publications that have a DOI into MongoDB.

    Pages through the Elasticsearch `_search` endpoint with a Point In Time
    and `search_after`, and upserts one document per DOI into
    `mongo_collection`, flagged with `is_in_fosm`.

    Args:
        pit: Point In Time id used for the first request.
        total_results_count: Number of publications already processed
            (0 for a fresh run).
        search_after: Sort values of the last hit of the previous page, or
            None to start from the first page.

    Returns:
        The number of publications processed, `total_results_count` included.
        Collection stops when a page comes back empty or, if `FOSM_LIMIT` is
        not 0, as soon as that many publications have been processed.

    Raises:
        requests.HTTPError: If the search request answers with an error status.
    """
    search_url = "/".join([FOSM_BASE_URL, "_search"])
    headers = {"Authorization": f"Basic {FOSM_AUTHORIZATION}"}
    # Loop instead of recursing: one stack frame per page is not needed and
    # the previous page's objects are released on each iteration.
    while True:
        body = {"size": FOSM_PAGE_SIZE, "query": {"exists": {"field": "doi"}},
                "pit": {"id": pit, "keep_alive": "1m"}, "sort": ["_doc"]}
        if search_after:
            body["search_after"] = search_after
            body["track_total_hits"] = False
        r = requests.get(search_url, headers=headers, json=body)
        # Fail loudly on an HTTP error instead of mistaking it for an empty page.
        r.raise_for_status()
        response = r.json()
        results = (response.get("hits") or {}).get("hits") or []
        if not results:
            # Nothing left to read: without this check the function could only
            # end by raising when no limit is configured.
            return total_results_count
        actions = []
        for hit in results:
            source = hit.get("_source")
            doi = source.get("doi")
            publication = {
                "all_ids": source.get("external_ids"),
                "doi": doi,
                "is_in_fosm": True,
            }
            actions.append(pymongo.UpdateOne(
                {"doi": doi}, {"$set": publication}, upsert=True))
        mongo_collection.bulk_write(actions, ordered=False)
        total_results_count += len(results)
        search_after = results[-1].get("sort")
        # Use the id returned by the latest response for the next request,
        # keeping the current one if the response does not carry any.
        pit = response.get("pit_id") or pit
        if fosm_doi_count:
            # Progress is skipped when the DOI count is missing or zero.
            print('{:.0f} %'.format((total_results_count / fosm_doi_count) * 100))
        if FOSM_LIMIT != 0 and total_results_count >= FOSM_LIMIT:
            return total_results_count

In [None]:
# Get Point In Time
r = requests.post("/".join([FOSM_BASE_URL, FOSM_INDEX, "_pit?keep_alive=1m"]),
                  headers={"Authorization": f"Basic {FOSM_AUTHORIZATION}"})
pit = r.json().get("id")
try:
    # Collect all publications with DOI in French OSM
    fosm_publications = get_fosm_publications(pit, 0)
    print(fosm_publications)
finally:
    # Delete Point In Time, even when the collection above raised, so that it
    # is not left open on the cluster.
    # NOTE(review): this deletes the id obtained when the PIT was opened;
    # confirm whether the id returned by the last search should be used.
    r = requests.delete("/".join([FOSM_BASE_URL, "_pit"]),
                        headers={"Authorization": f"Basic {FOSM_AUTHORIZATION}"},
                        json={"id": pit})

In [None]:
# Ask OpenAlex how many works match the filter institutions.country_code:FR;
# the figure is read from the "meta" section of the response.
r = requests.get(
    f"https://api.openalex.org/works?filter=institutions.country_code:FR&api_key={OA_API_KEY}"
)
openalex_meta = r.json().get("meta")
openalex_total_count = openalex_meta.get("count")
openalex_total_count

In [None]:
def get_openalex_publications(cursor, total_results_count, retry_count):
    """Copy the French OpenAlex works that have a DOI into MongoDB.

    Follows the OpenAlex cursor from page to page and upserts one document
    per DOI into `mongo_collection`, flagged with `is_in_openalex`. Works
    without a DOI are counted but not stored.

    Args:
        cursor: Cursor of the first page to fetch ("*" to start from scratch).
        total_results_count: Number of works already processed (0 for a
            fresh run).
        retry_count: Number of retries already consumed. The budget of
            `OA_MAX_RETRY` is shared by the whole run; it is not reset after
            a successful page.

    Returns:
        The number of works processed, `total_results_count` included, or
        None when the retry budget is exhausted.
    """
    # Loop instead of recursing: with one call per page a long run would
    # exceed Python's recursion limit.
    while True:
        try:
            print(f"{total_results_count} {cursor}")
            r = requests.get(
                f"https://api.openalex.org/works?filter=institutions.country_code:FR&per-page={OA_PAGE_SIZE}&api_key={OA_API_KEY}&cursor={cursor}")
            # Turn HTTP errors into an exception so they go through the retry path.
            r.raise_for_status()
            response = r.json()
            results = response.get("results")
            actions = []
            for work in results:
                doi = work.get("doi")
                if doi:
                    doi = doi.replace("https://doi.org/", "")
                    publication = {
                        "all_ids": [{"id_type": k, "id_value": v} for k, v in work.get("ids").items()],
                        "doi": doi,
                        "is_in_openalex": True,
                    }
                    actions.append(pymongo.UpdateOne(
                        {"doi": doi}, {"$set": publication}, upsert=True))
            if actions:
                # bulk_write refuses an empty list of operations, so pages
                # without any DOI (or without results) are skipped.
                mongo_collection.bulk_write(actions, ordered=False)
            results_count = len(results)
            next_cursor = response.get("meta").get("next_cursor")
        except Exception as e:
            print('error')
            print(getattr(e, 'message', e))
            if retry_count < OA_MAX_RETRY:
                # Retry the same page with the same cursor.
                retry_count = retry_count + 1
                continue
            print("Too many retries")
            return None
        total_results_count += results_count
        # total_results_count is an int: compare it directly with the limit.
        limit_reached = OA_LIMIT != 0 and total_results_count >= OA_LIMIT
        if next_cursor is None or results_count == 0 or limit_reached:
            return total_results_count
        cursor = next_cursor

In [None]:
# Run the OpenAlex collection from the initial cursor "*", with no work
# processed yet and no retry consumed; the cell displays the returned value.
openalex_publications = get_openalex_publications(
    cursor="*",
    total_results_count=0,
    retry_count=0,
)
openalex_publications