In [None]:
# FOSM (French OSM) examples
# With DOI
# Without DOI
# OpenAlex examples
# With DOI : https://api.openalex.org/works/https://doi.org/10.35562/arabesques.3084
# Without DOI : https://api.openalex.org/works/W3204678669

In [1]:
from dotenv import load_dotenv
import os
import pymongo
import re
import requests
from retry import retry

In [2]:
# Load the variables declared in the local .env file into the process environment
load_dotenv()

# --- French OSM (Elasticsearch) ---
FOSM_BASE_URL = "https://cluster.elasticsearch.dataesr.ovh"
FOSM_INDEX = "bso-publications-20230728"
FOSM_PAGE_SIZE = 10000  # Maximum is 10000
FOSM_LIMIT = 0  # Set to 0 for no limit
# Credentials sent in the "Authorization: Basic ..." header (read from .env)
FOSM_AUTHORIZATION = os.environ.get("FOSM_AUTHORIZATION")

# --- OpenAlex ---
OA_PAGE_SIZE = 200  # Maximum is 200
OA_LIMIT = 0  # Set to 0 for no limit
# API key appended to every OpenAlex request (read from .env)
OA_API_KEY = os.environ.get("OA_API_KEY")

# --- MongoDB target ---
MONGO_DB = "bsocoverage"
MONGO_COLLECTION = "publications"

In [3]:
# Open the MongoDB collection that the harvesting cells write into.
# MongoClient() is called without arguments, so pymongo's default connection settings apply.
mongo_client = pymongo.MongoClient()
mongo_database = mongo_client[MONGO_DB]
mongo_collection = mongo_database[MONGO_COLLECTION]
# One-off reset of the collection (kept commented out on purpose):
# mongo_collection.drop()
# mongo_collection.create_index([("id", pymongo.ASCENDING)], unique=True)

In [None]:
# Number of publications in French OSM
# The filter below is the same one get_fosm_publications uses, so the count
# can serve as the denominator of its progress percentage.
fosm_count_query = {
    "query": {"bool": {"must": [
        {"range": {"year": {"gte": 2013, "lte": 2021}}},
        {"term": {"bso_country_corrected": "fr"}},
        {"terms": {"genre.keyword": ["journal-article", "proceedings", "book-chapter", "book", "preprint"]}},
    ]}},
}
r = requests.get("/".join([FOSM_BASE_URL, FOSM_INDEX, "_count"]),
                 headers={"Authorization": f"Basic {FOSM_AUTHORIZATION}"}, json=fosm_count_query)
# Fail here on bad credentials / wrong index instead of silently storing None
r.raise_for_status()
fosm_total_count = r.json().get("count")
fosm_total_count

In [None]:
@retry(Exception, tries=5, delay=10)
def _fetch_fosm_page(pit, search_after=None):
    """Fetch one page of French OSM publications and upsert it into Mongo.

    Only this single-page step is retried, so a transient failure re-runs one
    request instead of a whole chain of nested calls.

    Args:
        pit: Point-in-time id to search against.
        search_after: "sort" values of the last hit of the previous page, or
            None for the first page.

    Returns:
        A tuple (number of hits in the page, point-in-time id to use for the
        next request, "sort" values of the last hit or None if the page is empty).

    Raises:
        requests.HTTPError: if Elasticsearch keeps answering with an error
            status after all retries.
    """
    body = {
        "pit": {"id": pit, "keep_alive": "1m"},
        "query": {"bool": {"must": [
            {"range": {"year": {"gte": 2013, "lte": 2021}}},
            {"term": {"bso_country_corrected": "fr"}},
            {"terms": {"genre.keyword": ["journal-article", "proceedings", "book-chapter", "book", "preprint"]}},
        ]}},
        "size": FOSM_PAGE_SIZE,
        "sort": ["_doc"],
    }
    if search_after:
        body["search_after"] = search_after
        body["track_total_hits"] = False
    r = requests.get("/".join([FOSM_BASE_URL, "_search"]),
                     headers={"Authorization": f"Basic {FOSM_AUTHORIZATION}"}, json=body)
    r.raise_for_status()
    response = r.json()
    results = response.get("hits", {}).get("hits", [])
    actions = []
    for hit in results:
        source = hit.get("_source", {})
        doi = source.get("doi")
        hal_id = source.get("hal_id")
        publication_id = doi if doi else hal_id
        if not publication_id:
            # Neither DOI nor HAL id: nothing to key the Mongo document on
            continue
        publication_id = publication_id.lower()
        publication = {
            "all_ids": source.get("external_ids"),
            "id": publication_id,
            "is_in_fosm": True,
            "year_fosm": source.get("year"),
        }
        if doi:
            publication["doi"] = doi
        if hal_id:
            publication["hal_id"] = hal_id
        actions.append(pymongo.UpdateOne(
            {"id": publication_id}, {"$set": publication}, upsert=True))
    if actions:
        mongo_collection.bulk_write(actions, ordered=False)
    next_search_after = results[-1].get("sort") if results else None
    # Keep the current id if the response does not carry a refreshed one
    next_pit = response.get("pit_id") or pit
    return len(results), next_pit, next_search_after


def get_fosm_publications(pit, total_results_count, search_after=None):
    """Harvest French OSM publications page by page into Mongo.

    Pages are fetched in a loop until an empty page is returned or, when
    FOSM_LIMIT is not 0, until at least FOSM_LIMIT hits have been fetched.
    Progress is printed as a percentage of the module-level fosm_total_count,
    which must have been computed beforehand.

    Args:
        pit: Point-in-time id to start from.
        total_results_count: Number of hits already fetched (0 for a fresh run).
        search_after: "sort" values to resume after, or None to start at the
            beginning.

    Returns:
        The total number of hits fetched, including total_results_count.
    """
    while True:
        print(f"{pit}, {total_results_count}, {search_after}")
        page_count, pit, search_after = _fetch_fosm_page(pit, search_after)
        total_results_count += page_count
        if fosm_total_count:
            print('{:.0f} %'.format((total_results_count / fosm_total_count) * 100))
        if page_count == 0:
            # Empty page: every matching publication has been read
            break
        if FOSM_LIMIT != 0 and total_results_count >= FOSM_LIMIT:
            break
    return total_results_count

In [None]:
# Maintenance snippets, kept commented out on purpose.
# "is_in_fosm" is stored as the boolean True, so the filter must use True and not the string "true".
# Delete all documents from FOSM only, in Mongo
# delete = mongo_collection.delete_many({ "is_in_fosm": True, "is_in_openalex": { "$exists": False } })
# print(delete.deleted_count, " documents deleted")
# Delete "is_in_fosm" field for all documents in Mongo
# updated = mongo_collection.update_many({}, { "$unset": { "is_in_fosm": 1 } })
# print(updated.modified_count, " documents modified")
# Get Point In Time
r = requests.post("/".join([FOSM_BASE_URL, FOSM_INDEX, "_pit?keep_alive=1m"]),
                  headers={"Authorization": f"Basic {FOSM_AUTHORIZATION}"})
# Stop here if the point in time could not be opened (bad credentials, wrong index, ...)
r.raise_for_status()
pit = r.json().get("id")
try:
    # Collect all publications (keyed by DOI, or by HAL id when there is no DOI) in French OSM
    fosm_publications = get_fosm_publications(pit, 0)
    print(fosm_publications)
finally:
    # Delete Point In Time, even when the collection above failed
    r = requests.delete("/".join([FOSM_BASE_URL, "_pit"]), headers={"Authorization": f"Basic {FOSM_AUTHORIZATION}"}, json={"id": pit})

In [None]:
# Number of French publications in OpenAlex
# Query parameters go through `params` so requests URL-encodes them and drops
# the API key when OA_API_KEY is not set (instead of sending "api_key=None").
r = requests.get(
    "https://api.openalex.org/works",
    params={
        "filter": "institutions.country_code:FR,is_paratext:false,publication_year:2013-2021",
        "api_key": OA_API_KEY,
    })
# Surface HTTP errors here rather than as an AttributeError on a missing "meta"
r.raise_for_status()
openalex_total_count = r.json().get("meta", {}).get("count")
print(openalex_total_count)

In [None]:
# Landing pages on hal.science or hal.archives-ouvertes.fr; group 2 is the HAL id
_HAL_LANDING_PAGE_PATTERN = re.compile(r"^https://hal\.(science|archives-ouvertes\.fr)/(hal-\d+)")


def _extract_openalex_hal_id(publication):
    """Return the single HAL id found in the work's locations, or None.

    None is returned both when no HAL landing page is present and when several
    distinct HAL ids are found (the ambiguity is reported on stdout).
    """
    hal_ids = set()
    for location in publication.get("locations") or []:
        # "landing_page_url" may be present but null, hence the `or ""`
        match = _HAL_LANDING_PAGE_PATTERN.match(location.get("landing_page_url") or "")
        if match:
            hal_ids.add(match.group(2))
    if len(hal_ids) > 1:
        print(f"More than one hal_id in OpenAlex work : {publication.get('id')}")
        return None
    return hal_ids.pop() if hal_ids else None


def _build_openalex_action(publication):
    """Build the Mongo upsert for one OpenAlex work, or None if it has no usable id."""
    open_alex_id = publication.get("id")
    doi = (publication.get("doi") or "").replace("https://doi.org/", "") or None
    hal_id = _extract_openalex_hal_id(publication)
    # Same key priority as the FOSM side (DOI, then HAL id), with the OpenAlex id as last resort
    publication_id = doi or hal_id or open_alex_id
    if not publication_id:
        return None
    publication_id = publication_id.lower()
    all_ids = [{"id_type": k, "id_value": v} for k, v in (publication.get("ids") or {}).items()]
    known_types = {entry["id_type"] for entry in all_ids}
    for id_type, id_value in (("openalex", open_alex_id), ("doi", doi), ("hal_id", hal_id)):
        if id_value and id_type not in known_types:
            all_ids.append({"id_type": id_type, "id_value": id_value})
    document = {
        "all_ids": all_ids,
        "id": publication_id,
        "is_in_openalex": True,
        "year_openalex": publication.get("publication_year"),
    }
    return pymongo.UpdateOne({"id": publication_id}, {"$set": document}, upsert=True)


@retry(Exception, tries=5, delay=30)
def _fetch_openalex_page(cursor):
    """Fetch one page of OpenAlex works and upsert it into Mongo.

    Only this single-page step is retried on failure.

    Returns:
        A tuple (number of works in the page, next cursor or None).
    """
    r = requests.get(
        "https://api.openalex.org/works",
        # `params` URL-encodes the cursor and omits api_key when it is None
        params={
            "filter": "institutions.country_code:FR,is_paratext:false,publication_year:2013-2021",
            "per-page": OA_PAGE_SIZE,
            "api_key": OA_API_KEY,
            "cursor": cursor,
        })
    r.raise_for_status()
    response = r.json()
    results = response.get("results") or []
    actions = [action for action in map(_build_openalex_action, results) if action is not None]
    if actions:
        mongo_collection.bulk_write(actions, ordered=False)
    return len(results), (response.get("meta") or {}).get("next_cursor")


def get_openalex_publications(cursor, total_results_count):
    """Harvest French OpenAlex works page by page into Mongo.

    Pages are fetched in a loop (not by recursion, so the number of pages is
    not bounded by the interpreter's recursion limit) until OpenAlex returns no
    next cursor, an empty page, or, when OA_LIMIT is not 0, until at least
    OA_LIMIT works have been fetched. Progress is printed as a percentage of
    the module-level openalex_total_count, which must have been computed beforehand.

    Args:
        cursor: OpenAlex cursor to start from ("*" for the first page).
        total_results_count: Number of works already fetched (0 for a fresh run).

    Returns:
        The total number of works fetched, including total_results_count.
    """
    while True:
        print(f"\"{cursor}\", {total_results_count}")
        results_count, next_cursor = _fetch_openalex_page(cursor)
        total_results_count += results_count
        if openalex_total_count:
            print('{:.0f} %'.format((total_results_count / openalex_total_count) * 100))
        if next_cursor is None or results_count == 0:
            break
        if OA_LIMIT != 0 and total_results_count >= OA_LIMIT:
            break
        cursor = next_cursor
    return total_results_count

In [None]:
# Maintenance snippets, kept commented out on purpose.
# "is_in_openalex" is stored as the boolean True, so the filter uses True and not the string "true".
# Delete all documents from OpenAlex only, in Mongo
# delete = mongo_collection.delete_many({ "is_in_openalex": True, "is_in_fosm": { "$exists": False } })
# print(delete.deleted_count, " documents deleted")
# Delete "is_in_openalex" field for all documents in Mongo
# updated = mongo_collection.update_many({}, { "$unset": { "is_in_openalex": 1 } })
# print(updated.modified_count, " documents updated")
# Collect all French publications in OpenAlex
# Fresh run from the first page:
# openalex_publications = get_openalex_publications("*", 0)
# Current call resumes an earlier run from a saved cursor and its running count.
resume_cursor = "IlswLCAnaHR0cHM6Ly9vcGVuYWxleC5vcmcvVzQzMTgzMTg0MTEnXSI="
resume_count = 1813200
openalex_publications = get_openalex_publications(resume_cursor, resume_count)
print(openalex_publications)

In [None]:
# Dump the whole database
# mongodump --uri="mongodb://localhost:27017" --archive=bsocoverage.20230917.gz --gzip --db=bsocoverage
# Restore a gzipped database
# mongorestore --uri="mongodb://localhost:27017" --archive=bsocoverage.20230917.gz --gzip