In [1]:
import duckdb
import pandas as pd

# Record the library versions next to the results so the outputs can be tied
# to a specific environment.
for label, version in (("DuckDB:", duckdb.__version__), ("Pandas:", pd.__version__)):
    print(label, version)


DuckDB: 1.4.3
Pandas: 2.3.3


In [2]:
from pathlib import Path

# Connection opened without a database path; the views created below are
# registered on this connection and used by every later cell.
con = duckdb.connect()

# Resolve the project root from the working directory so the notebook runs
# both from the repository root and from inside its "notebooks" folder.
ROOT = Path.cwd()
if ROOT.name == "notebooks":
    ROOT = ROOT.parent

BASE = ROOT / "data" / "all_backends"

# One view per backend, each reading every Parquet file under
# BASE/all_<backend>/ recursively with hive partitioning enabled.
for backend_name in ("crossref", "datacite", "openalex"):
    # Join the glob with pathlib and render it with forward slashes instead of
    # appending hard-coded Windows "\" separators to the path.
    parquet_glob = (BASE / f"all_{backend_name}" / "**" / "*.parquet").as_posix()
    # Double single quotes so a quote in the directory name cannot terminate
    # the SQL string literal early.
    parquet_glob = parquet_glob.replace("'", "''")
    con.execute(f"""
CREATE VIEW {backend_name} AS
SELECT * FROM read_parquet(
    '{parquet_glob}',
    hive_partitioning=true
);
""")


<_duckdb.DuckDBPyConnection at 0x7f4617246e70>

In [3]:
# Per backend: total number of rows, and how many of those rows have a
# non-NULL record_id (COUNT(column) skips NULLs, COUNT(*) does not).
rows_per_backend_sql = """
SELECT backend, COUNT(*) AS total, COUNT(record_id) AS with_record_id
FROM (
    SELECT backend, record_id FROM crossref
    UNION ALL
    SELECT backend, record_id FROM datacite
    UNION ALL
    SELECT backend, record_id FROM openalex
)
GROUP BY backend
"""
con.execute(rows_per_backend_sql).df()


Unnamed: 0,backend,total,with_record_id
0,crossref,3612833,3612833
1,datacite,3523368,3523368
2,openalex,2366645,2366645


In [4]:
# How many unique records do we have per backend?

In [5]:
# Number of distinct record_id values within each backend, largest first.
dedup_per_backend_sql = """
SELECT
    backend,
    COUNT(DISTINCT record_id) AS dedup_records
FROM (
    SELECT backend, record_id FROM crossref
    UNION ALL
    SELECT backend, record_id FROM datacite
    UNION ALL
    SELECT backend, record_id FROM openalex
)
GROUP BY backend
ORDER BY dedup_records DESC
"""
con.execute(dedup_per_backend_sql).df()


Unnamed: 0,backend,dedup_records
0,datacite,3520391
1,crossref,3187171
2,openalex,1718485


In [6]:
# Deduplicated count across all backends

In [7]:
# Number of distinct record_id values over the union of all three backends.
# NOTE(review): in the recorded outputs this total equals the sum of the three
# per-backend distinct counts, so record_id does not appear to be shared
# between backends -- confirm it is the intended cross-backend dedup key.
dedup_total_sql = """
SELECT COUNT(DISTINCT record_id) AS total_dedup_records
FROM (
    SELECT record_id FROM crossref
    UNION ALL
    SELECT record_id FROM datacite
    UNION ALL
    SELECT record_id FROM openalex
)
"""
con.execute(dedup_total_sql).df()


Unnamed: 0,total_dedup_records
0,8426047


In [8]:
# Record count per server (raw)

In [9]:
# Raw (non-deduplicated) row count for every (backend, server_name) pair,
# grouped by backend and sorted by descending row count within each backend.
rows_per_server_sql = """
SELECT
    backend,
    server_name,
    COUNT(*) AS n_records
FROM (
    SELECT backend, server_name FROM crossref
    UNION ALL
    SELECT backend, server_name FROM datacite
    UNION ALL
    SELECT backend, server_name FROM openalex
)
GROUP BY backend, server_name
ORDER BY backend, n_records DESC
"""
con.execute(rows_per_server_sql).df()


Unnamed: 0,backend,server_name,n_records
0,crossref,SSRN,1258958
1,crossref,Research Square,870976
2,crossref,bioRxiv,306948
3,crossref,eLife,247558
4,crossref,Qeios,172316
...,...,...,...
109,openalex,National Bureau of Economic Research,1856
110,openalex,CogPrints,1537
111,openalex,Electron Colloquium Comput Complex,227
112,openalex,LSE Research Online Documents on Economics,119


In [10]:
# Deduplicated records per server (record_id)

In [11]:
# Distinct record_id count for every (backend, server_name) pair, ranked from
# the largest to the smallest across all backends.
dedup_per_server_sql = """
SELECT
    backend,
    server_name,
    COUNT(DISTINCT record_id) AS dedup_records
FROM (
    SELECT backend, server_name, record_id FROM crossref
    UNION ALL
    SELECT backend, server_name, record_id FROM datacite
    UNION ALL
    SELECT backend, server_name, record_id FROM openalex
)
GROUP BY backend, server_name
ORDER BY dedup_records DESC
"""
con.execute(dedup_per_server_sql).df()


Unnamed: 0,backend,server_name,dedup_records
0,datacite,arXiv,2920797
1,crossref,SSRN,1258958
2,openalex,HAL,1056424
3,crossref,Research Square,450818
4,openalex,RePEc: Research Papers in Economics,389398
...,...,...,...
109,crossref,EmeRI,8
110,crossref,Therapoid,7
111,crossref,NewAddictionsX,7
112,openalex,Bepress Legal Repository,6


In [13]:
# Distinct record_id count per (backend, server_name) pair, largest first.
# The result is kept in `df` because later cells inspect it.
df = con.execute("""
SELECT
    backend,
    server_name,
    COUNT(DISTINCT record_id) AS dedup_records
FROM (
    SELECT backend, server_name, record_id FROM crossref
    UNION ALL
    SELECT backend, server_name, record_id FROM datacite
    UNION ALL
    SELECT backend, server_name, record_id FROM openalex
)
GROUP BY backend, server_name
ORDER BY dedup_records DESC
""").df()

# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists before writing (otherwise a fresh checkout fails here).
out_path = Path("outputs") / "dedup_counts_by_server_record_id.csv"
out_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_path, index=False)


In [14]:
# Top of the ranking: the first rows of the frame, which the query sorted by
# descending dedup_records.
n_top_rows = 60
df.head(n_top_rows)

Unnamed: 0,backend,server_name,dedup_records
0,datacite,arXiv,2920797
1,crossref,SSRN,1258958
2,openalex,HAL,1056424
3,crossref,Research Square,450818
4,openalex,RePEc: Research Papers in Economics,389398
5,crossref,bioRxiv,306948
6,crossref,eLife,247558
7,datacite,AgEcon Search,188173
8,datacite,ResearchGate,181231
9,crossref,Qeios,172316


In [15]:
# Bottom of the ranking: the last rows of the frame, i.e. the pairs with the
# fewest distinct records.
n_bottom_rows = 55
df.tail(n_bottom_rows)

Unnamed: 0,backend,server_name,dedup_records
59,crossref,PREPRINTS.RU,1415
60,crossref,Keldysh Institute Preprints,1258
61,crossref,HRB Open Research,1012
62,datacite,CERN document server,973
63,crossref,ARPHA Preprints,890
64,crossref,MetaArXiv,880
65,crossref,SportRxiv,878
66,crossref,Gates Open Research,863
67,crossref,AgriRxiv,818
68,crossref,Beilstein Archives,697
