In [3]:
from dotenv import load_dotenv
import os
import json
import re
import requests
from retry import retry
import duckdb
import pandas as pd
import bso_coverage_tools as bct
import mercury

In [2]:
# Collect all French publications in OpenAlex
def reduce_french_openalex(openalex_files, yearmin = 2018, yearmax = 2023):
    publications_fr = []
    for file in openalex_files:
        with open(f"data/openalex/{file}.jsonl") as f:
            # Iterate over json lines
            for line in f:

                # Get publication
                publication = json.loads(line)

                # Check dates
                publication_year = publication.get("publication_year")
                if publication_year is None or publication_year < yearmin or publication_year > yearmax:
                    continue
                
                # Check paratext
                is_paratext = publication.get("is_paratext")
                if is_paratext:
                    continue
            
                
                # Check if french publication
                if "authorships" in publication:
                    countries = []
                    institutions = []
                    for authorship in publication.get("authorships"):
                        countries += authorship.get("countries")
                        if len(authorship.get("institutions")):
                            institutions += [authorship.get("institutions")[0].get("display_name")]

                    if "FR" in countries:
                        # Primary location
                        primary_location = None
                        if "primary_location" in publication:
                            if publication.get("primary_location") is not None:
                                if "source" in publication.get("primary_location"):
                                    if publication.get("primary_location").get("source") is not None:
                                        primary_location = publication.get("primary_location").get("source").get("display_name")
                        
                        # Check coverage
                        coverage_last_state = None
                        coverage_last_error = None
                        coverage_last_error_data = None
                        if publication.get("doi") is not None:
                             coverage = bct.analyse_from_openalex_work(publication, cli=False, as_pandas=False)
                             if coverage is not None:
                                coverage_last_state = coverage.get("last_state")
                                coverage_last_error = coverage.get("last_error")
                                coverage_last_error_data = coverage.get("last_error_data")
                    
                        # Publication data
                        data = {
                            "id": publication.get("id"),
                            "doi": publication.get("doi"),
                            "year": publication.get("publication_year"),
                            "type": publication.get("type"),
                            "type_crossref": publication.get("type_crossref"),
                            "institutions": list(set(institutions)),
                            "countries": list(set(countries)),
                            "primary_location": primary_location,
                            "is_oa": publication.get("open_access").get("is_oa") if "open_access" in publication else None,
                            "coverage": {"last_state": coverage_last_state, "last_error": coverage_last_error, "last_error_data": coverage_last_error_data}
                        }
                        # Add to list of publications
                        publications_fr.append(data)
                
    return publications_fr

In [61]:
# Collect all French publications in OpenAlex
def collect_french_openalex(openalex_files, con, startindex = 0, yearmin = 2018, yearmax = 2023, forcebreak=False):
    publications_fr = []
    for file in openalex_files:
        with open(f"data/openalex/{file}.jsonl") as f:
            index = -1
            # Iterate over json lines
            for line in f:
                index +=1
                if index < startindex:
                    continue
                
                # Get publication
                publication = json.loads(line)

                # Check dates
                publication_year = publication.get("publication_year")
                if publication_year is None or publication_year < yearmin or publication_year > yearmax:
                    continue
                
                # Check if french publication
                if "authorships" in publication:
                    countries = []
                    for authorship in publication.get("authorships"):
                        countries += authorship.get("countries")

                    if "FR" in countries:
                        # Get relevant data 
                        print(f"{publication.get('id')}: french publication detected")
                        data = {
                            "id": publication.get("id"),
                            "doi": publication.get("doi"),
                            "year": publication_year,
                            "type": publication.get("type"),
                            "coverage": bct.analyse_from_openalex_work(publication, cli=False, as_pandas=False) if publication.get("doi") else {"doi": publication.get("doi"), "last_state": None, "last_error": None, "last_error_data": None}
                        }
                        df = pd.json_normalize(data)
                        # print("data", data)
                        
                        # Add publication to duckdb
                        if not con.query(f"SELECT * FROM openalex_fr WHERE id = '{data.get('id')}'"):
                            print(f"added index {index}")
                            con.sql("INSERT INTO openalex_fr SELECT * FROM df")
                
                if forcebreak:
                    break

In [8]:
# Get french publications
publications_fr = reduce_french_openalex(["part_012"])

In [7]:
# Export to parquet
print("Number of publications", len(publications_fr))
print(publications_fr[0])
publications_df = pd.json_normalize(publications_fr)
publications_df["coverage.last_error_data"] = publications_df["coverage.last_error_data"].astype(str)
display(publications_df.head(2))
publications_df.to_parquet("data/openalex/part_012.parquet")

Number of publications 5180
{'id': 'https://openalex.org/W2968185027', 'doi': 'https://doi.org/10.1103/physrevb.100.060401', 'year': 2019, 'type': 'article', 'type_crossref': 'journal-article', 'institutions': ['Université de Lorraine', 'Institute for High Pressure Physics', 'Institut Laue-Langevin', 'Laboratoire Léon Brillouin'], 'countries': ['FR', 'RU'], 'primary_location': 'Physical review', 'is_oa': True, 'coverage': {'last_state': 'IN_FOSM_FR', 'last_error': 'MISMATCH_TYPE', 'last_error_data': ('journal-article', 'article')}}


Unnamed: 0,id,doi,year,type,type_crossref,institutions,countries,primary_location,is_oa,coverage.last_state,coverage.last_error,coverage.last_error_data
0,https://openalex.org/W2968185027,https://doi.org/10.1103/physrevb.100.060401,2019,article,journal-article,"[Université de Lorraine, Institute for High Pr...","[FR, RU]",Physical review,True,IN_FOSM_FR,MISMATCH_TYPE,"('journal-article', 'article')"
1,https://openalex.org/W2968592665,https://doi.org/10.2514/1.c035343,2019,article,journal-article,"[University of Paris-Saclay, Safran (France)]",[FR],Journal of Aircraft,True,IN_FOSM_FR,MISMATCH_TYPE,"('journal-article', 'article')"


In [72]:
with duckdb.connect("bso_coverage.db") as con:
    con.sql("DROP TABLE openalex_fr")

CatalogException: Catalog Error: Table with name openalex_fr does not exist!
Did you mean "openalex"?

In [59]:
files = ["part_000"]
with duckdb.connect("bso_coverage.db") as con:
    collect_french_openalex(files, con, startindex=67026, forcebreak=False)

https://openalex.org/W2799933121: french publication detected
https://openalex.org/W2883916703: french publication detected
added index 67030
https://openalex.org/W2905736690: french publication detected
added index 67039
https://openalex.org/W2919006054: french publication detected
added index 67042
https://openalex.org/W2941313896: french publication detected
added index 67044
https://openalex.org/W2947520615: french publication detected
added index 67461
https://openalex.org/W2961319233: french publication detected
added index 67464
https://openalex.org/W2997720498: french publication detected
added index 67465
https://openalex.org/W3103215654: french publication detected
added index 67472
https://openalex.org/W2945715301: french publication detected
added index 67855
https://openalex.org/W2789389013: french publication detected
added index 68237
https://openalex.org/W2884149298: french publication detected
added index 68243
https://openalex.org/W3008170328: french publication detec

ClientException: Authorization Failure. Authorization failed: HTTP Client Error (HTTP 429)

In [52]:
with duckdb.connect("bso_coverage.db") as con:
    con.table("openalex_fr").show()

┌──────────────────────┬──────────────────────┬──────────────────────┬───┬──────────────────────┬──────────────────────┐
│          id          │         doi          │     coverage.doi     │ … │ coverage.last_error  │ coverage.last_erro…  │
│       varchar        │       varchar        │       varchar        │   │       varchar        │      varchar[]       │
├──────────────────────┼──────────────────────┼──────────────────────┼───┼──────────────────────┼──────────────────────┤
│ https://openalex.o…  │ https://doi.org/10…  │ 10.1038/nmeth.2019   │ … │ DOI_EARLY_PUBLICAT…  │ [publication_year,…  │
│ https://openalex.o…  │ https://doi.org/10…  │ 10.1038/s41592-019…  │ … │ MISMATCH_TYPE        │ [journal-article, …  │
│ https://openalex.o…  │ https://doi.org/10…  │ 10.1136/bmj.l4898    │ … │ MISMATCH_TYPE        │ [journal-article, …  │
│ https://openalex.o…  │ https://doi.org/10…  │ 10.1051/0004-6361/…  │ … │ MISMATCH_TYPE        │ [journal-article, …  │
│ https://openalex.o…  │ https:/

In [None]:

    with duckdb.connect("bso_coverage.db") as con:
        con.execute("CREATE TABLE openalex_coverage AS SELECT * FROM 'data/part_000/part_000.jsonl'")
        con.table("openalex_coverage").show()