In [1]:
import duckdb
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import requests
import mercury
sns.set_theme(palette="tab10")

In [2]:
# Merge parquet files
duckdb.execute("""
COPY (SELECT * FROM 'data/openalex/*.parquet') TO 'merge.parquet' (FORMAT 'parquet');
""")

<duckdb.duckdb.DuckDBPyConnection at 0x24d9266a630>

In [3]:
# Build dataframe
df = pd.read_parquet("merge.parquet")
display(df.head(5))
print(df.shape)

Unnamed: 0,id,doi,year,type,type_crossref,institutions,countries,primary_location,is_oa,coverage.last_state,coverage.last_error,coverage.last_error_data
0,https://openalex.org/W3103145119,https://doi.org/10.1038/s41592-019-0686-2,2020,article,journal-article,"[Universidade Federal de Minas Gerais, Univers...","[FR, NL, AU, JP, RU, CA, FI, BR, GB, CZ, EE, US]",Nature Methods,True,IN_FOSM_FR,MISMATCH_TYPE,"('journal-article', 'article')"
1,https://openalex.org/W2970684805,https://doi.org/10.1136/bmj.l4898,2019,article,journal-article,"[Cochrane, Monash University, Inserm, Harvard–...","[FR, AU, CA, GB, DK, US]",BMJ,True,IN_FOSM_FR,MISMATCH_TYPE,"('journal-article', 'article')"
2,https://openalex.org/W2798336535,https://doi.org/10.1051/0004-6361/201833051,2018,article,journal-article,"[Laboratoire de Physique de l'ENS, Laboratoire...","[CN, RU, CU, AT, SE, FR, RS, IL, BR, ZA, MA, S...",Astronomy and Astrophysics,True,IN_FOSM_FR,MISMATCH_TYPE,"('journal-article', 'article')"
3,https://openalex.org/W2900756811,https://doi.org/10.1080/20013078.2018.1535750,2018,article,journal-article,"[Women & Infants Hospital of Rhode Island, Cas...","[TW, CN, GR, RU, MT, AT, NO, GM, IR, SI, SE, F...",Journal of extracellular vesicles,True,IN_FOSM_FR,MISMATCH_TYPE,"('journal-article', 'article')"
4,https://openalex.org/W2777074421,https://doi.org/10.1016/s1474-4422(17)30470-2,2018,article,journal-article,"[University of California, San Francisco, Medi...","[DE, AR, FR, ES, NL, AU, CA, JP, AT, CH, GB, D...",Lancet Neurology,True,IN_FOSM_FR,MISMATCH_TYPE,"('journal-article', 'article')"


(74174, 12)


# Sankey graph

In [4]:
# Build Sankey data
labels = ["OpenAlex", "UNDEFINED", "DOI_MISSING", "DOI_FOUND", "CRAWLED", "PARSED", "PARSED_FR", "IN_FOSM", "IN_FOSM_FR", 
          "BAD_DOI", "DOI_NO_ACCESS", "DOI_NO_CROSSREF", "DOI_NO_UNPAYWALL", "DOI_NO_PUBLICATION_YEAR",
          "DOI_EARLY_PUBLICATION_YEAR", "DOI_LATE_PUBLICATION_YEAR", "NOT_PARSED_FR", "NOT_PARSED", 
          "NOT_CRAWLED", "ALEX_DOI_NOT_FOUND", "ALEX_AUTHORSHIPS_NOT_FOUND", "ALEX_YEAR_NOT_FOUND", 
          "ALEX_TYPE_NOT_FOUND", "FOSM_YEAR_NOT_FOUND", "FOSM_TYPE_NOT_FOUND", "MISMATCH_YEAR", 
          "MISMATCH_TYPE", "MISMATCH_FRENCH_AFFILIATION", "OK"]

sources = []
targets = []
values = []

In [5]:
# Valid journal-article / article error
df.loc[df["coverage.last_error_data"] == "('journal-article', 'article')", "coverage.last_error"] = "OK"
df.loc[df["coverage.last_error"] == "OK", "coverage.last_error_data"] = None

In [6]:
# Add missing dois
sources.append(labels.index("OpenAlex"))
targets.append(labels.index("DOI_MISSING"))
values.append(df["doi"].isnull().sum())

In [7]:
# Add publications states
states_dict = df["coverage.last_state"].value_counts().to_dict()
print(states_dict)
for key in states_dict:
    sources.append(labels.index("OpenAlex"))
    targets.append(labels.index(key))
    values.append(states_dict.get(key))

print(sources, targets, values)

{'IN_FOSM_FR': 67665, 'PARSED': 1669, 'PARSED_FR': 1559, 'DOI_FOUND': 812, 'UNDEFINED': 71, 'IN_FOSM': 42, 'CRAWLED': 35}
[0, 0, 0, 0, 0, 0, 0, 0] [2, 8, 5, 6, 3, 1, 7, 4] [2321, 67665, 1669, 1559, 812, 71, 42, 35]


In [8]:
# Add publications errors
grouped_df = df.groupby(by=["coverage.last_state", "coverage.last_error"], as_index=False).size()
display(grouped_df)
for index, row in grouped_df.iterrows():
    sources.append(labels.index(row["coverage.last_state"]))
    targets.append(labels.index(row["coverage.last_error"]))
    values.append(row["size"])

print(sources, targets, values)

Unnamed: 0,coverage.last_state,coverage.last_error,size
0,CRAWLED,NOT_PARSED,35
1,DOI_FOUND,DOI_LATE_PUBLICATION_YEAR,427
2,DOI_FOUND,DOI_NO_CROSSREF,58
3,DOI_FOUND,DOI_NO_PUBLICATION_YEAR,2
4,DOI_FOUND,NOT_CRAWLED,325
5,IN_FOSM,MISMATCH_FRENCH_AFFILIATION,29
6,IN_FOSM,MISMATCH_TYPE,8
7,IN_FOSM,OK,5
8,IN_FOSM_FR,MISMATCH_TYPE,3528
9,IN_FOSM_FR,MISMATCH_YEAR,92


[0, 0, 0, 0, 0, 0, 0, 0, 4, 3, 3, 3, 3, 7, 7, 7, 8, 8, 8, 5, 6, 1] [2, 8, 5, 6, 3, 1, 7, 4, 17, 15, 11, 13, 18, 27, 26, 28, 26, 25, 28, 16, 28, 10] [2321, 67665, 1669, 1559, 812, 71, 42, 35, 35, 427, 58, 2, 325, 29, 8, 5, 3528, 92, 64045, 1669, 1559, 71]


In [9]:
grouped_df = df.groupby(by=["coverage.last_error", "coverage.last_error_data"], as_index=False).size()
display(grouped_df)

Unnamed: 0,coverage.last_error,coverage.last_error_data,size
0,DOI_LATE_PUBLICATION_YEAR,"('publication_year', 2022)",38
1,DOI_LATE_PUBLICATION_YEAR,"('publication_year', 2023)",389
2,DOI_NO_ACCESS,"('doi_status_code', 404)",67
3,DOI_NO_ACCESS,"('doi_status_code', 500)",2
4,DOI_NO_ACCESS,"('doi_status_code', 502)",2
...,...,...,...
61,MISMATCH_YEAR,"(2022, 2021)",4
62,MISMATCH_YEAR,"(2022, 2023)",11
63,NOT_CRAWLED,,325
64,NOT_PARSED,,35


In [10]:
fig = go.Figure(data=[go.Sankey(
    node = dict(
      pad = 15,
      thickness = 20,
      line = dict(color = "black", width = 0.5),
      label = labels,
      color = "blue"
    ),
    link = dict(
      source = sources,
      target = targets,
      value = values
  ))])

fig.update_layout(title_text="OpenAlex french publications coverage", font_size=10)
fig.show()

# French affiliations analysis

In [11]:
bad_affiliations_df = df[df["coverage.last_error"] == "MISMATCH_FRENCH_AFFILIATION"]
display(bad_affiliations_df[["id", "doi", "coverage.last_error_data"]])
bad_affiliations = bad_affiliations_df["coverage.last_error_data"].to_list()

Unnamed: 0,id,doi,coverage.last_error_data
2049,https://openalex.org/W2805870922,https://doi.org/10.1681/asn.2017121260,"['Division of Nephrology and Hypertension, Cen..."
10381,https://openalex.org/W2889373178,https://doi.org/10.1039/c8nr05787g,['1015 Lausanne; EPFL; Laboratoire des Matéria...
11188,https://openalex.org/W2887463283,https://doi.org/10.12688/f1000research.14417.1,"['Clinique du Vertige, Centre Hospitalier Emil..."
11933,https://openalex.org/W4225140177,https://doi.org/10.1038/s41467-022-29959-1,"['Department of Sciences and Engineering, Sorb..."
15831,https://openalex.org/W3080555168,https://doi.org/10.1145/3407023.3409219,['Montimage']
17668,https://openalex.org/W4223896490,https://doi.org/10.1007/s00500-022-07068-x,"['Sorbonne Center of Artificial Intelligence, ..."
23124,https://openalex.org/W4224233561,https://doi.org/10.1002/anie.202203938,['Department of Sciences and Engineering Sorbo...
25604,https://openalex.org/W3101925414,https://doi.org/10.1039/c9na00323a,['Foundation of Research and Technology-Hellas...
26874,https://openalex.org/W2992815577,https://doi.org/10.1039/c9nr08453c,['EPFL; Faculty of Engineering; Institute of M...
27790,https://openalex.org/W4226275118,https://doi.org/10.1021/acs.cgd.2c00225,"['Department of Sciences and Engineering, Sorb..."


In [12]:
def clean_affiliations(affiliation):
    return affiliation.removeprefix("['").removesuffix("']").split("', '")

In [13]:
cleaned_affiliations = list(set(sum([clean_affiliations(affiliation) for affiliation in bad_affiliations], [])))
cleaned_affiliations

['Medicine and Endocrinology, Pondicherry Institute of Medical Sciences, Pondicherry, IND.',
 'Science and Engineering Department, Sorbonne University Abu Dhabi, Abu Dhabi, UAE; Smart Materials Lab, New York University Abu Dhabi, Abu Dhabi, UAE',
 '1015 Lausanne; EPFL; Laboratoire des Matériaux Semiconducteurs; Switzerland; École Polytechnique Fédérale de Lausanne',
 'EPFL; Faculty of Engineering; Institute of Materials; Laboratoire des Matériaux Semiconducteurs; École Polytechnique Fédérale de Lausanne',
 'Laboratoire de Biochimie, CHU Habib Bourguiba, Sfax, Tunisie; UR 12ES17 « Bases moléculaires de la pathologie humaine », Faculté de Médecine de Sfax, Université de Sfax, Sfax, Tunisie',
 'Présidente de la SFP.',
 'Department of Sciences and Engineering, Sorbonne University Abu Dhabi, Abu Dhabi, UAE; Smart Materials Lab, New York University Abu Dhabi, Abu Dhabi, UAE',
 'Montimage',
 'Department of Sciences and Engineering Sorbonne University Abu Dhabi  38044 Abu Dhabi United Arab Emi

In [None]:
id = "W3080555168"
response = requests.get(f"https://api.openalex.org/works/{id}").json()
mercury.JSON(response)