In [None]:
import csv
from dotenv import load_dotenv
import matplotlib.pyplot as plt
import os
import pymongo

In [None]:
# Load the variables declared in the .env file before reading any setting.
load_dotenv()

# Name of the database and of the collection queried by this notebook.
MONGO_DB = "bsocoverage"
MONGO_COLLECTION = "publications"

# Connection string: read from the environment, with a fallback default
# used when MONGO_URI is not set.
MONGO_URI = os.environ.get("MONGO_URI", "mongodb://localhost:27017/")

In [None]:
# Build the client from the configured URI, then resolve the database and
# the collection handles used by every following cell.
mongo_client = pymongo.MongoClient(MONGO_URI)
mongo_database = mongo_client[MONGO_DB]
mongo_collection = mongo_database[MONGO_COLLECTION]

In [21]:
def _percent_of(part, whole):
    """Return ``part`` as a percentage of ``whole``, or 0.0 when ``whole`` is 0."""
    # Guard against an empty collection, which would otherwise raise
    # ZeroDivisionError in every percentage below.
    return (part / whole) * 100 if whole else 0.0


# Total in database
total_count = mongo_collection.count_documents({})
print(f"Total count : {total_count}")
# FOSM: documents flagged with is_in_fosm == True
total_fosm = mongo_collection.count_documents({"is_in_fosm": True})
total_fosm_percent = _percent_of(total_fosm, total_count)
print(
    f"Total FOSM : {total_fosm} (i.e. {total_fosm_percent:.0f} % of the total)")
# OpenAlex: documents flagged with is_in_openalex == True
total_openalex = mongo_collection.count_documents({"is_in_openalex": True})
total_openalex_percent = _percent_of(total_openalex, total_count)
print(
    f"Total OpenAlex : {total_openalex} (i.e. {total_openalex_percent:.0f} % of the total)")
# In FOSM and in OpenAlex: documents carrying both flags
total_fosm_openalex = mongo_collection.count_documents(
    {"is_in_fosm": True, "is_in_openalex": True})
total_fosm_openalex_percent = _percent_of(total_fosm_openalex, total_count)
print(
    f"Total FOSM and OpenAlex : {total_fosm_openalex} (i.e. {total_fosm_openalex_percent:.0f} % of the total)")

In [23]:
# Distribution, by OpenAlex publication year, of the publications flagged as
# present in OpenAlex and carrying no "is_in_fosm" field.
agg_results = list(mongo_collection.aggregate([
    {"$match": {"is_in_openalex": True, "is_in_fosm": {"$exists": False}}},
    {"$group": {"_id": "$openalex.publication_year", "count": {"$sum": 1}}},
    {"$sort": {"_id": 1}}
]))
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
# One bar per year; the label embeds the count so it can be read off the axis.
labels = [f"{d.get('_id')} ({d.get('count')})" for d in agg_results]
count = [d.get("count") for d in agg_results]
ax.bar(labels, count)
plt.xticks(rotation=90)
# savefig does not create missing directories: make sure "output" exists so
# the cell also works on a fresh checkout.
os.makedirs("output", exist_ok=True)
plt.savefig("output/repartition_by_year_of_publications_from_openalex_missing_in_fosm.png", bbox_inches='tight')
plt.show()

In [24]:
# List errors about publication year consistency across FOSM and OpenAlex:
# one CSV row per publication whose two (truthy) years differ.
# open() does not create missing directories, so make sure "output" exists.
os.makedirs("output", exist_ok=True)
# newline="" is required by the csv module; without it, platforms that
# translate "\n" (e.g. Windows) end up with a blank line after every row.
with open("output/errors_publication_year.csv", "w", encoding="UTF8", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["id", "year_fosm", "year_openalex"])
    # Only the three fields written below are needed, so project them instead
    # of transferring every full document.
    projection = {"id": 1, "year_fosm": 1, "year_openalex": 1}
    for publication in mongo_collection.find({}, projection):
        year_fosm = publication.get("year_fosm")
        year_openalex = publication.get("year_openalex")
        # Skip publications where either year is missing/falsy: only genuine
        # disagreements between the two sources are reported.
        if year_fosm and year_openalex and year_fosm != year_openalex:
            writer.writerow([publication.get("id"), year_fosm, year_openalex])

In [None]:
# List up to 200 publications with year == 2021 that are flagged as present in
# OpenAlex and carry no "is_in_fosm" field.
# NOTE(review): the filter uses a top-level "year" field while other cells read
# "year_fosm" / "year_openalex" / "openalex.publication_year" - confirm that
# "year" exists in the documents.
results = mongo_collection.find(
    {"is_in_openalex": True, "is_in_fosm": {"$exists": False}, "year": 2021}
).limit(200)
# open() does not create missing directories, so make sure "output" exists.
os.makedirs("output", exist_ok=True)
# newline="" is required by the csv module; without it, platforms that
# translate "\n" (e.g. Windows) end up with a blank line after every row.
with open("output/200_doi_of_publications_from_openalex_missing_in_fosm_from_2021.csv", "w", encoding="UTF8", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["doi"])
    for result in results:
        # NOTE(review): the column is titled "doi" but is filled from the "id"
        # field - presumably "id" holds the DOI; verify against the data.
        writer.writerow([result.get("id")])