In [3]:
from dotenv import load_dotenv
import os
import json
import re
import requests
from retry import retry
import duckdb
import pandas as pd
import bso_coverage_tools as bct
import mercury

In [2]:
# Collect all French publications in OpenAlex
def reduce_french_openalex(openalex_files, yearmin = 2018, yearmax = 2023, data_dir = "data/openalex"):
    """Collect a reduced record for every French publication in OpenAlex JSONL files.

    A publication is kept when its ``publication_year`` lies in
    ``[yearmin, yearmax]``, it is not flagged ``is_paratext``, and at least one
    of its authorships lists the country code ``"FR"``.

    Parameters
    ----------
    openalex_files : iterable of str
        File names without extension; each is read from
        ``{data_dir}/{name}.jsonl`` (one JSON work per line).
    yearmin, yearmax : int
        Inclusive bounds on ``publication_year``.
    data_dir : str
        Directory holding the JSONL files.

    Returns
    -------
    list of dict
        One dict per kept publication with the keys ``id``, ``doi``, ``year``,
        ``type``, ``type_crossref``, ``institutions``, ``countries``,
        ``primary_location``, ``is_oa`` and ``coverage`` (itself a dict with
        ``last_state``, ``last_error`` and ``last_error_data``).
        ``institutions`` holds the display name of the *first* institution of
        each authorship; both lists are de-duplicated in first-seen order.
    """
    publications_fr = []
    for file in openalex_files:
        # Explicit UTF-8 so the result does not depend on the platform default encoding
        with open(f"{data_dir}/{file}.jsonl", encoding="utf-8") as f:
            for line in f:
                # Tolerate blank lines (e.g. a trailing newline at the end of the file)
                if not line.strip():
                    continue

                publication = json.loads(line)

                # Check dates
                publication_year = publication.get("publication_year")
                if publication_year is None or publication_year < yearmin or publication_year > yearmax:
                    continue

                # Check paratext
                if publication.get("is_paratext"):
                    continue

                # Gather countries and first institution of every authorship.
                # Any of these fields may be missing or null in the input, hence the `or []`.
                countries = []
                institutions = []
                for authorship in publication.get("authorships") or []:
                    countries += authorship.get("countries") or []
                    authorship_institutions = authorship.get("institutions") or []
                    if authorship_institutions:
                        institutions.append(authorship_institutions[0].get("display_name"))

                # Check if french publication
                if "FR" not in countries:
                    continue

                # Primary location: display name of the source, when both levels are present
                source = (publication.get("primary_location") or {}).get("source") or {}
                primary_location = source.get("display_name")

                # Check coverage (only attempted for publications that carry a DOI)
                coverage = None
                if publication.get("doi") is not None:
                    coverage = bct.analyse_from_openalex_work(publication, cli=False, as_pandas=False)
                if coverage is None:
                    coverage = {}

                # Publication data
                publications_fr.append({
                    "id": publication.get("id"),
                    "doi": publication.get("doi"),
                    "year": publication_year,
                    "type": publication.get("type"),
                    "type_crossref": publication.get("type_crossref"),
                    # dict.fromkeys de-duplicates while keeping a reproducible order
                    "institutions": list(dict.fromkeys(institutions)),
                    "countries": list(dict.fromkeys(countries)),
                    "primary_location": primary_location,
                    "is_oa": (publication.get("open_access") or {}).get("is_oa"),
                    "coverage": {
                        "last_state": coverage.get("last_state"),
                        "last_error": coverage.get("last_error"),
                        "last_error_data": coverage.get("last_error_data"),
                    },
                })

    return publications_fr

In [30]:
# Build the reduced list of French publications from one OpenAlex part file
publications_fr = reduce_french_openalex(openalex_files=["part_019"])

In [31]:
# Flatten the publication records into a DataFrame and write it to parquet
print("Number of publications", len(publications_fr))
print(publications_fr[0])
publications_df = pd.json_normalize(publications_fr).assign(
    # Stored as text; presumably because the raw values mix tuples and None,
    # which does not map onto a single parquet column type - TODO confirm
    **{"coverage.last_error_data": lambda frame: frame["coverage.last_error_data"].astype(str)}
)
display(publications_df.head(2))
publications_df.to_parquet("data/openalex/part_019.parquet")

Number of publications 10080
{'id': 'https://openalex.org/W4367178356', 'doi': 'https://doi.org/10.1080/15298868.2023.2202413', 'year': 2023, 'type': 'article', 'type_crossref': 'journal-article', 'institutions': ['Istanbul Bilgi University', 'Kyoto University', 'Nord University', 'University of Cagliari', 'University of Sussex', 'University of Georgia', 'University of Ghana', 'Baze University', 'Renmin University of China', 'Czech Academy of Sciences, Institute of Psychology', 'Carleton University', "King's College School", 'San Sebastián University', 'King Saud University', 'University of Essex', 'Universidad de Salamanca', 'ZHAW Zurich University of Applied Sciences', 'Tilburg University', 'Icesi University', 'University of Iceland', 'University of Limerick', 'Iscte – Instituto Universitário de Lisboa', 'Hong Kong Polytechnic University', 'University of Brawijaya', 'Universidad Nacional de La Matanza', 'Johannes Kepler University of Linz', 'Palacký University, Olomouc', 'Nagoya Univ

Unnamed: 0,id,doi,year,type,type_crossref,institutions,countries,primary_location,is_oa,coverage.last_state,coverage.last_error,coverage.last_error_data
0,https://openalex.org/W4367178356,https://doi.org/10.1080/15298868.2023.2202413,2023,article,journal-article,"[Istanbul Bilgi University, Kyoto University, ...","[MX, AR, NL, ES, SA, CO, DE, CH, TW, PL, GT, U...",Self and Identity,False,DOI_FOUND,DOI_LATE_PUBLICATION_YEAR,"('publication_year', 2023)"
1,https://openalex.org/W4367178472,https://doi.org/10.21203/rs.3.rs-2836023/v1,2023,article,posted-content,"[Institut Polytechnique de Paris, Institut Pas...",[FR],Research Square (Research Square),True,PARSED,NOT_PARSED_FR,


: 

In [None]:
# Reset the openalex_fr table in the local DuckDB database.
# IF EXISTS keeps this cell re-runnable on a fresh database, where a plain
# DROP TABLE would raise because the table has not been created yet.
with duckdb.connect("bso_coverage.db") as con:
    con.sql("DROP TABLE IF EXISTS openalex_fr")

In [52]:
with duckdb.connect("bso_coverage.db") as con:
    # Preview the openalex_fr table stored in the local DuckDB database
    openalex_fr_table = con.table("openalex_fr")
    openalex_fr_table.show()

┌──────────────────────┬──────────────────────┬──────────────────────┬───┬──────────────────────┬──────────────────────┐
│          id          │         doi          │     coverage.doi     │ … │ coverage.last_error  │ coverage.last_erro…  │
│       varchar        │       varchar        │       varchar        │   │       varchar        │      varchar[]       │
├──────────────────────┼──────────────────────┼──────────────────────┼───┼──────────────────────┼──────────────────────┤
│ https://openalex.o…  │ https://doi.org/10…  │ 10.1038/nmeth.2019   │ … │ DOI_EARLY_PUBLICAT…  │ [publication_year,…  │
│ https://openalex.o…  │ https://doi.org/10…  │ 10.1038/s41592-019…  │ … │ MISMATCH_TYPE        │ [journal-article, …  │
│ https://openalex.o…  │ https://doi.org/10…  │ 10.1136/bmj.l4898    │ … │ MISMATCH_TYPE        │ [journal-article, …  │
│ https://openalex.o…  │ https://doi.org/10…  │ 10.1051/0004-6361/…  │ … │ MISMATCH_TYPE        │ [journal-article, …  │
│ https://openalex.o…  │ https:/