In [1]:
import duckdb
import pandas as pd

print("DuckDB:", duckdb.__version__)
print("Pandas:", pd.__version__)


DuckDB: 1.4.3
Pandas: 2.3.3


In [2]:
import duckdb
from pathlib import Path

# 1) Always create ONE connection and keep using it
con = duckdb.connect()  # in-memory is fine for analysis

# 2) Build paths robustly from the notebook folder
ROOT = Path.cwd()
if ROOT.name == "notebooks":
    ROOT = ROOT.parent

BASE = ROOT / "data" / "all_backends"

# Your structure is: all_crossref/backend=crossref/data_0.parquet
crossref_glob = (BASE / "all_crossref" / "*" / "*.parquet").as_posix()
datacite_glob = (BASE / "all_datacite" / "*" / "*.parquet").as_posix()
openalex_glob = (BASE / "all_openalex" / "*" / "*.parquet").as_posix()

print("BASE:", BASE)
print("crossref_glob:", crossref_glob)
print("datacite_glob:", datacite_glob)
print("openalex_glob:", openalex_glob)


BASE: /mnt/c/SCHOLCOMMLAB/APPs/preprint-harvester/data/all_backends
crossref_glob: /mnt/c/SCHOLCOMMLAB/APPs/preprint-harvester/data/all_backends/all_crossref/*/*.parquet
datacite_glob: /mnt/c/SCHOLCOMMLAB/APPs/preprint-harvester/data/all_backends/all_datacite/*/*.parquet
openalex_glob: /mnt/c/SCHOLCOMMLAB/APPs/preprint-harvester/data/all_backends/all_openalex/*/*.parquet


In [3]:
con.execute(f"""
CREATE OR REPLACE VIEW crossref AS
SELECT * FROM read_parquet('{crossref_glob}', hive_partitioning=true, union_by_name=true);
""")

con.execute(f"""
CREATE OR REPLACE VIEW datacite AS
SELECT * FROM read_parquet('{datacite_glob}', hive_partitioning=true, union_by_name=true);
""")

con.execute(f"""
CREATE OR REPLACE VIEW openalex AS
SELECT * FROM read_parquet('{openalex_glob}', hive_partitioning=true, union_by_name=true);
""")


<_duckdb.DuckDBPyConnection at 0x7c0d71b01470>

In [4]:
con.execute("SHOW TABLES").fetchall()


[('crossref',), ('datacite',), ('openalex',)]

In [5]:
con.execute("SELECT COUNT(*) FROM crossref").fetchone()


(3612833,)

In [6]:
con.execute("""
CREATE OR REPLACE VIEW all_rows AS
SELECT * FROM crossref
UNION ALL
SELECT * FROM datacite
UNION ALL
SELECT * FROM openalex;
""")

con.execute("""
CREATE OR REPLACE VIEW all_dedup AS
SELECT * EXCLUDE(rn)
FROM (
  SELECT
    *,
    row_number() OVER (
      PARTITION BY record_id
      ORDER BY COALESCE(date_updated, date_indexed, date_deposited, date_created) DESC NULLS LAST
    ) AS rn
  FROM all_rows
)
WHERE rn = 1;
""")


<_duckdb.DuckDBPyConnection at 0x7c0d71b01470>

In [7]:
total_raw   = con.execute("SELECT COUNT(*) FROM all_rows").fetchone()[0]
total_dedup = con.execute("SELECT COUNT(*) FROM all_dedup").fetchone()[0]
(total_raw, total_dedup, total_raw - total_dedup)


(9502846, 8426047, 1076799)

In [8]:
con.execute("""
SELECT server_name, COUNT(*) AS n
FROM all_dedup
GROUP BY 1
ORDER BY n DESC;
""").df()


Unnamed: 0,server_name,n
0,arXiv,2920797
1,SSRN,1258958
2,HAL,1056424
3,Research Square,450818
4,RePEc: Research Papers in Economics,389398
...,...,...
107,Prepublicaciones OpenCiencia,8
108,Therapoid,7
109,NewAddictionsX,7
110,Bepress Legal Repository,6


In [9]:
con.execute("SHOW TABLES").fetchall()


[('all_dedup',), ('all_rows',), ('crossref',), ('datacite',), ('openalex',)]

In [10]:
con.execute("""
SELECT column_name
FROM information_schema.columns
WHERE table_name = 'all_dedup'
  AND (
    lower(column_name) LIKE '%url%'
    OR lower(column_name) LIKE '%landing%'
    OR lower(column_name) LIKE '%primary%'
  )
ORDER BY 1;
""").df()


Unnamed: 0,column_name
0,doi_url
1,landing_page_url


In [11]:
con.execute(r"""
CREATE OR REPLACE TABLE version_detection AS
WITH base AS (
  SELECT
    backend,
    server_name,
    record_id,

    /* ---- FORCE ALL STRINGS ---- */
    lower(coalesce(CAST(doi AS VARCHAR), ''))               AS doi_l,
    lower(coalesce(CAST(landing_page_url AS VARCHAR), '')) AS landing_l,
    lower(coalesce(CAST(version_label AS VARCHAR), ''))    AS vlabel_l,

    /* ---- FORCE BOOLEAN SAFELY ---- */
    CASE
      WHEN CAST(is_version_of AS VARCHAR) IN ('true','1','t','yes') THEN TRUE
      ELSE FALSE
    END AS is_version_of

  FROM all_dedup
),

flags AS (
  SELECT
    backend,
    server_name,
    record_id,

    /* DOI version patterns */
    regexp_matches(doi_l, '[-._]v[0-9]{1,3}([^0-9]|$)') AS doi_sep_vN,
    regexp_matches(doi_l, '[-._][0-9]{1,3}([^0-9]|$)')  AS doi_sep_N,

    /* URL version patterns */
    regexp_matches(landing_l, '([?&](version|ver|v)=([0-9]{1,3}))') AS landing_param_version,
    regexp_matches(landing_l, '(/v[0-9]{1,3})(/|$)')                AS landing_path_vN,

    /* Metadata signal */
    (vlabel_l <> '' AND vlabel_l <> 'null') AS version_label_present,

    /* Relationship signal */
    is_version_of AS metadata_is_version_of
  FROM base
),

scored AS (
  SELECT
    *,
    (CASE WHEN doi_sep_vN THEN 1 ELSE 0 END
     + CASE WHEN doi_sep_N THEN 1 ELSE 0 END
     + CASE WHEN landing_param_version THEN 1 ELSE 0 END
     + CASE WHEN landing_path_vN THEN 1 ELSE 0 END
     + CASE WHEN version_label_present THEN 1 ELSE 0 END
     + CASE WHEN metadata_is_version_of THEN 1 ELSE 0 END
    ) AS n_signals
  FROM flags
)

SELECT
  backend,
  server_name,
  record_id,

  CASE
    WHEN n_signals = 0 THEN 'no_signal'
    WHEN n_signals > 1 THEN 'multi_signal'
    WHEN doi_sep_vN THEN 'doi_sep_vN'
    WHEN doi_sep_N  THEN 'doi_sep_N'
    WHEN landing_param_version THEN 'landing_param_version'
    WHEN landing_path_vN THEN 'landing_path_vN'
    WHEN version_label_present THEN 'metadata_version_label'
    WHEN metadata_is_version_of THEN 'metadata_is_version_of'
    ELSE 'unknown'
  END AS version_signal_type,

  /* audit columns */
  n_signals,
  doi_sep_vN,
  doi_sep_N,
  landing_param_version,
  landing_path_vN,
  version_label_present,
  metadata_is_version_of

FROM scored;
""")


<_duckdb.DuckDBPyConnection at 0x7c0d71b01470>

In [12]:
con.execute("""
SELECT version_signal_type, COUNT(*) AS n
FROM version_detection
GROUP BY 1
ORDER BY n DESC;
""").df()


Unnamed: 0,version_signal_type,n
0,no_signal,3991578
1,metadata_version_label,2959535
2,doi_sep_N,792447
3,multi_signal,607660
4,doi_sep_vN,74445
5,landing_path_vN,382


In [13]:
# Create a clean version-detection table
con.execute(r"""
CREATE OR REPLACE TABLE version_detection AS
WITH base AS (
  SELECT
    backend,
    server_name,
    record_id,

    -- normalize strings safely across drifted schemas
    lower(coalesce(CAST(doi AS VARCHAR), '')) AS doi_l,
    lower(coalesce(CAST(landing_page_url AS VARCHAR), '')) AS landing_l,
    lower(coalesce(CAST(version_label AS VARCHAR), '')) AS vlabel_l,

    -- normalize boolean safely
    CASE
      WHEN CAST(is_version_of AS VARCHAR) IN ('true','1','t','yes') THEN TRUE
      ELSE FALSE
    END AS is_version_of
  FROM all_dedup
),
flags AS (
  SELECT
    backend,
    server_name,
    record_id,

    -- DOI patterns (common: -v1, .v2, _v3, -1, .2, etc.)
    regexp_matches(doi_l, '[-._]v[0-9]{1,3}([^0-9]|$)') AS doi_sep_vN,
    regexp_matches(doi_l, '[-._][0-9]{1,3}([^0-9]|$)')  AS doi_sep_N,

    -- Landing URL patterns (query params or path versions)
    regexp_matches(landing_l, '([?&](version|ver|v)=([0-9]{1,3}))') AS landing_param_version,
    regexp_matches(landing_l, '(/v[0-9]{1,3})(/|$)')                AS landing_path_vN,

    -- Metadata signal (string present)
    (vlabel_l <> '' AND vlabel_l <> 'null') AS version_label_present,

    -- Relationship signal
    is_version_of AS metadata_is_version_of
  FROM base
),
scored AS (
  SELECT
    *,
    -- count how many signals are true
    (CASE WHEN doi_sep_vN THEN 1 ELSE 0 END
     + CASE WHEN doi_sep_N THEN 1 ELSE 0 END
     + CASE WHEN landing_param_version THEN 1 ELSE 0 END
     + CASE WHEN landing_path_vN THEN 1 ELSE 0 END
     + CASE WHEN version_label_present THEN 1 ELSE 0 END
     + CASE WHEN metadata_is_version_of THEN 1 ELSE 0 END
    ) AS n_signals
  FROM flags
)
SELECT
  backend,
  server_name,
  record_id,

  -- Primary label (single best category) + multi/no signal handling
  CASE
    WHEN n_signals = 0 THEN 'no_signal'
    WHEN n_signals > 1 THEN 'multi_signal'

    -- single-signal cases (priority order)
    WHEN doi_sep_vN THEN 'doi_sep_vN'
    WHEN doi_sep_N  THEN 'doi_sep_N'
    WHEN landing_param_version THEN 'landing_param_version'
    WHEN landing_path_vN       THEN 'landing_path_vN'
    WHEN version_label_present THEN 'metadata_version_label'
    WHEN metadata_is_version_of THEN 'metadata_is_version_of'
    ELSE 'unknown'
  END AS version_signal_type,

  -- Helpful audit columns (keep these!)
  n_signals,
  doi_sep_vN,
  doi_sep_N,
  landing_param_version,
  landing_path_vN,
  version_label_present,
  metadata_is_version_of

FROM scored;
""")

# quick peek
con.execute("SELECT version_signal_type, COUNT(*) AS n FROM version_detection GROUP BY 1 ORDER BY n DESC;").df()


Unnamed: 0,version_signal_type,n
0,no_signal,3991578
1,metadata_version_label,2959535
2,doi_sep_N,792447
3,multi_signal,607660
4,doi_sep_vN,74445
5,landing_path_vN,382


In [14]:
con.execute(r"""
CREATE OR REPLACE TABLE version_detection AS
WITH base AS (
  SELECT
    backend, server_name, record_id,
    lower(coalesce(CAST(doi AS VARCHAR), '')) AS doi_l,
    lower(coalesce(CAST(landing_page_url AS VARCHAR), '')) AS landing_l,
    lower(coalesce(CAST(version_label AS VARCHAR), '')) AS vlabel_l,
    
    CASE
      WHEN CAST(is_version_of AS VARCHAR) IN ('true','1','t','yes') THEN TRUE
      ELSE FALSE
    END AS is_version_of
  FROM all_dedup
),
flags AS (
  SELECT
    backend, server_name, record_id,
    regexp_matches(doi_l, '[-._]v[0-9]{1,3}([^0-9]|$)') AS doi_sep_vN,
    regexp_matches(doi_l, '[-._][0-9]{1,3}([^0-9]|$)')  AS doi_sep_N,
    regexp_matches(landing_l, '([?&](version|ver|v)=([0-9]{1,3}))') AS landing_param_version,
    regexp_matches(landing_l, '(/v[0-9]{1,3})(/|$)')                AS landing_path_vN,
    (vlabel_l <> '' AND vlabel_l <> 'null') AS version_label_present,
    is_version_of AS metadata_is_version_of
  FROM base
),
scored AS (
  SELECT
    *,
    (CASE WHEN doi_sep_vN THEN 1 ELSE 0 END
     + CASE WHEN doi_sep_N THEN 1 ELSE 0 END
     + CASE WHEN landing_param_version THEN 1 ELSE 0 END
     + CASE WHEN landing_path_vN THEN 1 ELSE 0 END
     + CASE WHEN version_label_present THEN 1 ELSE 0 END
     + CASE WHEN metadata_is_version_of THEN 1 ELSE 0 END
    ) AS n_signals
  FROM flags
),
reasons AS (
  SELECT
    *,
    trim(both '|' from
      (CASE WHEN doi_sep_vN THEN 'doi_sep_vN|' ELSE '' END) ||
      (CASE WHEN doi_sep_N  THEN 'doi_sep_N|'  ELSE '' END) ||
      (CASE WHEN landing_param_version THEN 'landing_param_version|' ELSE '' END) ||
      (CASE WHEN landing_path_vN       THEN 'landing_path_vN|'       ELSE '' END) ||
      (CASE WHEN version_label_present THEN 'metadata_version_label|' ELSE '' END) ||
      (CASE WHEN metadata_is_version_of THEN 'metadata_is_version_of|' ELSE '' END)
    ) AS version_signal_reasons
  FROM scored
)
SELECT
  backend,
  server_name,
  record_id,
  CASE
    WHEN n_signals = 0 THEN 'no_signal'
    WHEN n_signals > 1 THEN 'multi_signal'
    WHEN doi_sep_vN THEN 'doi_sep_vN'
    WHEN doi_sep_N  THEN 'doi_sep_N'
    WHEN landing_param_version THEN 'landing_param_version'
    WHEN landing_path_vN       THEN 'landing_path_vN'
    WHEN version_label_present THEN 'metadata_version_label'
    WHEN metadata_is_version_of THEN 'metadata_is_version_of'
    ELSE 'unknown'
  END AS version_signal_type,
  version_signal_reasons,
  n_signals,
  doi_sep_vN, doi_sep_N, landing_param_version, landing_path_vN, version_label_present, metadata_is_version_of
FROM reasons;
""")


<_duckdb.DuckDBPyConnection at 0x7c0d71b01470>

In [15]:
con.execute("""
SELECT server_name, version_signal_type, COUNT(*) AS n
FROM version_detection
GROUP BY 1,2
ORDER BY server_name, n DESC;
""").df()


Unnamed: 0,server_name,version_signal_type,n
0,AIJR Preprints,doi_sep_N,143
1,AMRC Open Research,multi_signal,38
2,AMRC Open Research,no_signal,1
3,APSA Preprints,no_signal,772
4,APSA Preprints,doi_sep_N,391
...,...,...,...
244,searchRxiv,no_signal,2
245,viXra,no_signal,24275
246,viXra,doi_sep_N,1128
247,viXra,doi_sep_vN,165


In [16]:
con.execute("""
SELECT server_name,
       SUM(CASE WHEN version_signal_type <> 'no_signal' THEN 1 ELSE 0 END) AS n_with_signal,
       COUNT(*) AS n_total
FROM version_detection
GROUP BY 1
ORDER BY n_with_signal DESC
LIMIT 30;
""").df()


Unnamed: 0,server_name,n_with_signal,n_total
0,arXiv,2920797.0,2920797
1,Research Square,450818.0,450818
2,bioRxiv,239579.0,306948
3,eLife,209257.0,247558
4,ResearchGate,181231.0,181231
5,Preprints.org,115815.0,115815
6,medRxiv,74951.0,75743
7,Zenodo,38122.0,166786
8,ChemRxiv,27959.0,46475
9,EGUsphere,18002.0,60396


In [17]:
con.execute(r"""
CREATE OR REPLACE TABLE version_detection AS
WITH base AS (
  SELECT
    backend,
    server_name,
    record_id,

    /* ---- normalize strings safely ---- */
    lower(coalesce(CAST(doi AS VARCHAR), ''))               AS doi_l,
    lower(coalesce(CAST(landing_page_url AS VARCHAR), '')) AS landing_l,
    lower(coalesce(CAST(version_label AS VARCHAR), ''))    AS vlabel_l,

    /* ---- normalize boolean safely ---- */
    CASE
      WHEN CAST(is_version_of AS VARCHAR) IN ('true','1','t','yes') THEN TRUE
      ELSE FALSE
    END AS is_version_of

  FROM all_dedup
),

flags AS (
  SELECT
    backend,
    server_name,
    record_id,

    /* DOI patterns */
    regexp_matches(doi_l, '[-._]v[0-9]{1,3}([^0-9]|$)') AS doi_sep_vN,
    regexp_matches(doi_l, '[-._][0-9]{1,3}([^0-9]|$)')  AS doi_sep_N,

    /* Landing URL patterns */
    regexp_matches(landing_l, '([?&](version|ver|v)=([0-9]{1,3}))') AS landing_param_version,
    regexp_matches(landing_l, '(/v[0-9]{1,3})(/|$)')                AS landing_path_vN,

    /* Metadata signal */
    (vlabel_l <> '' AND vlabel_l <> 'null') AS metadata_version_label,

    /* Relationship signal */
    is_version_of AS metadata_is_version_of
  FROM base
),

reasons AS (
  SELECT
    backend,
    server_name,
    record_id,

    /* Build explicit reason list */
    trim(both '|' FROM
      concat(
        CASE WHEN doi_sep_vN THEN 'doi_sep_vN|' ELSE '' END,
        CASE WHEN doi_sep_N THEN 'doi_sep_N|' ELSE '' END,
        CASE WHEN landing_param_version THEN 'landing_param_version|' ELSE '' END,
        CASE WHEN landing_path_vN THEN 'landing_path_vN|' ELSE '' END,
        CASE WHEN metadata_version_label THEN 'metadata_version_label|' ELSE '' END,
        CASE WHEN metadata_is_version_of THEN 'metadata_is_version_of|' ELSE '' END
      )
    ) AS version_signal_reasons,

    /* signal count (useful downstream) */
    (CASE WHEN doi_sep_vN THEN 1 ELSE 0 END
     + CASE WHEN doi_sep_N THEN 1 ELSE 0 END
     + CASE WHEN landing_param_version THEN 1 ELSE 0 END
     + CASE WHEN landing_path_vN THEN 1 ELSE 0 END
     + CASE WHEN metadata_version_label THEN 1 ELSE 0 END
     + CASE WHEN metadata_is_version_of THEN 1 ELSE 0 END
    ) AS n_signals

  FROM flags
)

SELECT
  backend,
  server_name,
  record_id,

  /* Final normalized output */
  CASE
    WHEN n_signals = 0 THEN 'no_signal'
    ELSE version_signal_reasons
  END AS version_signal_reasons,

  n_signals

FROM reasons;
""")


<_duckdb.DuckDBPyConnection at 0x7c0d71b01470>

In [18]:
con.execute("""
SELECT version_signal_reasons, COUNT(*) AS n
FROM version_detection
GROUP BY 1
ORDER BY n DESC;
""").df()


Unnamed: 0,version_signal_reasons,n
0,no_signal,3991578
1,metadata_version_label,2959535
2,doi_sep_N,792447
3,doi_sep_N|landing_path_vN,467897
4,doi_sep_vN|landing_path_vN,115982
5,doi_sep_vN,74445
6,doi_sep_vN|doi_sep_N,8304
7,doi_sep_N|landing_path_vN|metadata_version_label,8265
8,doi_sep_N|metadata_version_label,6416
9,doi_sep_vN|doi_sep_N|landing_path_vN,565


In [19]:
con.execute("""
SELECT *
FROM version_detection
WHERE n_signals > 1
LIMIT 20;
""").df()


Unnamed: 0,backend,server_name,record_id,version_signal_reasons,n_signals
0,openalex,RePEc: Research Papers in Economics,openalex::W3124501944,doi_sep_vN|doi_sep_N,2
1,datacite,ResearchGate,datacite::10.13140/rg.2.2.15213.56805/1,doi_sep_N|metadata_version_label,2
2,datacite,ResearchGate,datacite::10.13140/rg.2.2.15288.57604/1,doi_sep_N|metadata_version_label,2
3,datacite,ResearchGate,datacite::10.13140/rg.2.2.16595.40484/1,doi_sep_N|metadata_version_label,2
4,datacite,ResearchGate,datacite::10.13140/rg.2.2.16647.65443/1,doi_sep_N|metadata_version_label,2
5,datacite,ResearchGate,datacite::10.13140/rg.2.2.16770.91843/1,doi_sep_N|metadata_version_label,2
6,datacite,ResearchGate,datacite::10.13140/rg.2.2.17465.93281/1,doi_sep_N|metadata_version_label,2
7,datacite,ResearchGate,datacite::10.13140/rg.2.2.18040.11524/2,doi_sep_N|metadata_version_label,2
8,datacite,ResearchGate,datacite::10.13140/rg.2.2.19963.69922/1,doi_sep_N|metadata_version_label,2
9,datacite,ResearchGate,datacite::10.13140/rg.2.2.20013.44002/2,doi_sep_N|metadata_version_label,2


In [20]:
con.execute("DESCRIBE all_dedup").df()


Unnamed: 0,column_name,column_type,null,key,default,extra
0,record_id,VARCHAR,YES,,,
1,server_name,VARCHAR,YES,,,
2,doi,VARCHAR,YES,,,
3,doi_url,VARCHAR,YES,,,
4,landing_page_url,VARCHAR,YES,,,
5,title,VARCHAR,YES,,,
6,type_backend_raw,VARCHAR,YES,,,
7,subtype_backend_raw,VARCHAR,YES,,,
8,type_canonical,VARCHAR,YES,,,
9,is_preprint_candidate,VARCHAR,YES,,,


In [21]:
con.execute("""
SELECT column_name, data_type
FROM information_schema.columns
WHERE table_name = 'all_dedup'
ORDER BY ordinal_position
""").df()


Unnamed: 0,column_name,data_type
0,record_id,VARCHAR
1,server_name,VARCHAR
2,doi,VARCHAR
3,doi_url,VARCHAR
4,landing_page_url,VARCHAR
5,title,VARCHAR
6,type_backend_raw,VARCHAR
7,subtype_backend_raw,VARCHAR
8,type_canonical,VARCHAR
9,is_preprint_candidate,VARCHAR


In [22]:
# con.execute("""
# SELECT *
# FROM all_dedup
# LIMIT 100
# """).df()


In [23]:
# con.execute("""
# SELECT *
# FROM all_dedup
# USING SAMPLE 100 ROWS
# """).df()


In [24]:
# con.execute("""
# SELECT *
# FROM all_dedup
# USING SAMPLE 0.01 PERCENT
# """).df()


In [25]:
# PAGE_SIZE = 1000

# for i in range(0, 5000, PAGE_SIZE):
#     df = con.execute(f"""
#         SELECT *
#         FROM all_dedup
#         LIMIT {PAGE_SIZE} OFFSET {i}
#     """).df()
#     display(df)


In [26]:
# df = con.execute("""
# SELECT *
# FROM all_dedup
# LIMIT 50_000
# """).df()


In [27]:
# con.execute("""
# COPY all_dedup
# TO 'all_dedup_full.parquet'
# (FORMAT PARQUET)
# """)


In [28]:
con.execute("""
SELECT backend, COUNT(*) 
FROM all_dedup 
GROUP BY backend
""").df()



Unnamed: 0,backend,count_star()
0,crossref,3187171
1,datacite,3520391
2,openalex,1718485


In [29]:
con.execute("""
SELECT server_name, COUNT(*) 
FROM all_dedup 
GROUP BY server_name
ORDER BY COUNT(*) DESC
""").df()


Unnamed: 0,server_name,count_star()
0,arXiv,2920797
1,SSRN,1258958
2,HAL,1056424
3,Research Square,450818
4,RePEc: Research Papers in Economics,389398
...,...,...
107,EmeRI,8
108,Therapoid,7
109,NewAddictionsX,7
110,Bepress Legal Repository,6


In [30]:
import duckdb
from pathlib import Path

# con = duckdb.connect()  # or your existing connection

# Make DuckDB less memory-hungry during export
con.execute("PRAGMA threads=1;")                    # single thread uses less memory
con.execute("PRAGMA preserve_insertion_order=false;")
con.execute("PRAGMA enable_object_cache=false;")

# If you can: cap memory so it spills to disk earlier (tune if needed)
con.execute("PRAGMA memory_limit='2GB';")           # try 2GB; if still OOM, try 1GB
con.execute("PRAGMA temp_directory='data/tmp_duckdb';")


<_duckdb.DuckDBPyConnection at 0x7c0d71b01470>

In [31]:
import math
from pathlib import Path

OUTDIR = Path("data/all_backends/all_dedup_parts")
OUTDIR.mkdir(parents=True, exist_ok=True)

ROWS_PER_PART = 200_000  # start small; increase later if stable

total = con.execute("SELECT COUNT(*) FROM all_dedup").fetchone()[0]
n_parts = math.ceil(total / ROWS_PER_PART)

print("total rows:", total, "parts:", n_parts)

for i in range(n_parts):
    offset = i * ROWS_PER_PART
    out_path = OUTDIR / f"part_{i:06d}.parquet"
    print("writing", out_path.name, "offset", offset)

    con.execute(f"""
        COPY (
            SELECT *
            FROM all_dedup
            LIMIT {ROWS_PER_PART} OFFSET {offset}
        )
        TO '{out_path.as_posix()}'
        (FORMAT PARQUET);
    """)


total rows: 8426047 parts: 43
writing part_000000.parquet offset 0


OutOfMemoryException: Out of Memory Error: could not allocate block of size 256.0 KiB (1.8 GiB/1.8 GiB used)

Possible solutions:
* Reducing the number of threads (SET threads=X)
* Disabling insertion-order preservation (SET preserve_insertion_order=false)
* Increasing the memory limit (SET memory_limit='...GB')

See also https://duckdb.org/docs/stable/guides/performance/how_to_tune_workloads