In [1]:
import pandas as pd
# Never truncate wide frames: render every column when a DataFrame is displayed
pd.options.display.max_columns = None


In [2]:
import duckdb
from pathlib import Path

con = duckdb.connect()

# Resolve the project root first so that EVERY path used below (spill directory
# and parquet data) is anchored to it, whether the kernel was started in the
# project root or in its "notebooks" sub-folder.
ROOT = Path.cwd()
if ROOT.name == "notebooks":
    ROOT = ROOT.parent

BASE = ROOT / "data" / "by_server"
TMP_DIR = ROOT / "data" / "tmp_duckdb"

# Paths are embedded in SQL as single-quoted literals, so double any quote
# characters they contain.
tmp_dir_sql = TMP_DIR.as_posix().replace("'", "''")

# Low-memory settings
con.execute("PRAGMA threads=1;")
# With insertion order not preserved, the row order of query results is not
# guaranteed unless a query sorts explicitly.
con.execute("PRAGMA preserve_insertion_order=false;")
con.execute("PRAGMA enable_object_cache=false;")
con.execute("PRAGMA memory_limit='2GB';")           # try 1GB if still unstable
# Spill to the project's data folder instead of a cwd-relative one.
con.execute(f"PRAGMA temp_directory='{tmp_dir_sql}';")

# The glob expects exactly one directory level below data/by_server, i.e.
#   data/by_server/<partition dir>/<file>.parquet
# NOTE(review): hive_partitioning=true presumes those directories are named
# key=value — confirm against the files on disk.
all_backends = (BASE / "*" / "*.parquet").as_posix()
parquet_glob_sql = all_backends.replace("'", "''")

con.execute(f"""
CREATE OR REPLACE VIEW all_backends AS
SELECT * FROM read_parquet('{parquet_glob_sql}', hive_partitioning=true, union_by_name=true);
""")

# A unified "all_rows" view (currently a plain alias of all_backends)
con.execute("""
CREATE OR REPLACE VIEW all_rows AS
SELECT * FROM all_backends
""")

print(con.execute("SHOW TABLES").fetchall())


[('all_backends',), ('all_rows',)]


In [3]:
# Sanity check per backend: total rows versus rows that carry a non-null
# record_id (COUNT(col) skips NULLs, COUNT(*) does not).
backend_counts_sql = """
SELECT backend, COUNT(*) AS total, COUNT(record_id) AS with_record_id
FROM (
    SELECT backend, record_id FROM all_backends
)
GROUP BY backend
"""
backend_counts = con.execute(backend_counts_sql).df()
backend_counts


Unnamed: 0,backend,total,with_record_id
0,crossref,3612833,3612833
1,datacite,3523368,3523368
2,openalex,2350692,2350692


In [4]:
# Narrow, all-text projection of the parquet data: identifiers, DOI/URL fields
# and every candidate date column, each cast to VARCHAR so that the backends
# expose one uniform schema.
# The statement is a plain string (it has no placeholders, so no f-prefix) and
# the select list has no trailing comma, which keeps it valid standard SQL.
# NOTE(review): publication_year is cast to text as-is; the rendered values
# look like "2020.0", so the source column is presumably floating point —
# confirm whether an integer cast is wanted before relying on it.
con.execute("""
CREATE OR REPLACE VIEW server_thin AS
SELECT
  CAST(record_id AS VARCHAR)           AS record_id,
  CAST(server_name AS VARCHAR)         AS server_name,
  CAST(backend AS VARCHAR)             AS backend,

  CAST(doi AS VARCHAR)                 AS doi,
  CAST(doi_url AS VARCHAR)             AS doi_url,
  CAST(landing_page_url AS VARCHAR)    AS landing_page_url,

  -- Dates (helpful for temporal patterns)
  CAST(publication_year AS VARCHAR)    AS publication_year,
  CAST(date_created AS VARCHAR)        AS date_created,
  CAST(date_posted AS VARCHAR)         AS date_posted,
  CAST(date_deposited AS VARCHAR)      AS date_deposited,
  CAST(date_published AS VARCHAR)      AS date_published,
  CAST(date_published_online AS VARCHAR)      AS date_published_online,
  CAST(date_issued AS VARCHAR)         AS date_issued,
  CAST(date_indexed AS VARCHAR)        AS date_indexed,
  CAST(date_updated AS VARCHAR)        AS date_updated,
  CAST(date_registered AS VARCHAR)     AS date_registered

FROM all_backends
""")

# Row count of the view; it should equal the sum of the per-backend totals.
con.execute("SELECT COUNT(*) AS n FROM server_thin").df()


Unnamed: 0,n
0,9486893


In [5]:
# Materialise the thin view in pandas and drop rows that are identical in
# EVERY column. This is deliberately not a de-duplication on record_id alone:
# records sharing an id but differing in any other field are all kept.
data = (
    con.execute("SELECT * FROM server_thin")
    .df()
    .drop_duplicates()
)
data

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,publication_year,date_created,date_posted,date_deposited,date_published,date_published_online,date_issued,date_indexed,date_updated,date_registered
0,crossref::10.21467/preprints.48,AIJR Preprints,crossref,10.21467/preprints.48,https://doi.org/10.21467/preprints.48,https://preprints.aijr.org/index.php/ap/prepri...,2020.0,2020-09-15,2020-05-03,2020-09-15,2020-05-03,,2020-05-03,2025-05-14,,
1,crossref::10.21467/preprints.43,AIJR Preprints,crossref,10.21467/preprints.43,https://doi.org/10.21467/preprints.43,https://preprints.aijr.org/index.php/ap/prepri...,2020.0,2020-09-15,2020-04-25,2020-09-15,2020-04-25,,2020-04-25,2025-05-14,,
2,crossref::10.21467/preprints.39,AIJR Preprints,crossref,10.21467/preprints.39,https://doi.org/10.21467/preprints.39,https://preprints.aijr.org/index.php/ap/prepri...,2020.0,2020-09-15,2020-04-16,2020-09-15,2020-04-16,,2020-04-16,2025-05-14,,
3,crossref::10.21467/preprints.38,AIJR Preprints,crossref,10.21467/preprints.38,https://doi.org/10.21467/preprints.38,https://preprints.aijr.org/index.php/ap/prepri...,2020.0,2020-09-17,2020-04-15,2020-09-17,2020-04-15,,2020-04-15,2022-12-13,,
4,crossref::10.21467/preprints.36,AIJR Preprints,crossref,10.21467/preprints.36,https://doi.org/10.21467/preprints.36,https://preprints.aijr.org/index.php/ap/prepri...,2020.0,2020-09-17,2020-04-15,2020-09-17,2020-04-15,,2020-04-15,2024-08-11,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9486888,openalex::W999325625,viXra,openalex,,,https://vixra.org/pdf/1409.0090v1.pdf,2014.0,2025-10-10T00:00:00,,,2014-09-01,,,,2025-10-10T17:16:08.811792,
9486889,openalex::W999460032,viXra,openalex,,,https://vixra.org/abs/1112.0094,2011.0,2025-10-10T00:00:00,,,2011-12-01,,,,2025-10-10T17:16:08.811792,
9486890,openalex::W99967155,viXra,openalex,,,https://vixra.org/pdf/1406.0019v1.pdf,2014.0,2025-10-10T00:00:00,,,2014-06-01,,,,2025-10-10T17:16:08.811792,
9486891,openalex::W999790414,viXra,openalex,,,https://vixra.org/pdf/1306.0105v3.pdf,2013.0,2025-10-10T00:00:00,,,2013-06-01,,,,2025-10-10T17:16:08.811792,


In [6]:
import pandas as pd

def _parse_date_series(values: pd.Series) -> pd.Series:
    """Parse one column of date-like values into timezone-naive UTC timestamps.

    Values that cannot be parsed become NaT instead of raising.
    """
    options = {"errors": "coerce", "utc": True}
    # pandas >= 2 infers ONE format from the first non-null value and, with
    # errors="coerce", silently turns every value in a different layout into
    # NaT (e.g. "2025-10-10T00:00:00" in a column that starts with
    # "2020-09-15"). format="ISO8601" accepts any ISO 8601 layout per value.
    # Older pandas versions parse element by element and do not know this
    # format keyword, so it is only passed when available.
    if int(pd.__version__.split(".", 1)[0]) >= 2:
        options["format"] = "ISO8601"
    parsed = pd.to_datetime(values, **options)
    # utc=True puts offset-bearing and offset-free values on one clock; the
    # timezone is then dropped so the result stays a naive datetime column.
    return parsed.dt.tz_localize(None)


def compute_earliest_date_and_year(
    df: pd.DataFrame,
    date_cols,
    date_col_out: str = "date_first_seen",
    year_col_out: str = "publication_year_first_seen",
) -> pd.DataFrame:
    """
    Compute the earliest known date for each record and extract its year.

    This function:
    ----------------
    1) Takes multiple candidate date columns (e.g. date_created, date_posted, etc.)
    2) Parses them safely into datetime objects
    3) Finds the earliest valid date PER ROW
    4) Stores it in a single canonical column (date_first_seen)
    5) Extracts the publication year from that date (nullable Int64)

    Why this matters:
    -----------------
    - Different sources expose different date semantics
    - The "true" first appearance of a record is often NOT date_created
    - A single, reliable date greatly improves:
        * duplicate resolution
        * parent selection (oldest record)
        * longitudinal analyses

    Parameters
    ----------
    df : pd.DataFrame
        Input records. The frame is not modified; a copy is returned.
    date_cols : iterable of str, or a single str
        Names of the candidate date columns. Names that are not columns of
        ``df`` are ignored.
    date_col_out : str
        Name of the output column holding the earliest timestamp.
    year_col_out : str
        Name of the output column holding the year of that timestamp.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with two extra columns: ``date_col_out`` (timezone-naive
        datetime, NaT when no candidate could be parsed) and ``year_col_out``
        (nullable ``Int64``, ``<NA>`` when the date is NaT).

    Notes
    -----
    - Values with a UTC offset are converted to UTC before comparison; values
      without one are taken as they are.
    - Partial dates such as "2020" or "2020-05" are completed to the first
      month/day, so they compare as the start of that period.
    """

    # Work on a copy to avoid mutating the original DataFrame
    df = df.copy()

    # A bare string would otherwise be iterated character by character
    if isinstance(date_cols, str):
        date_cols = [date_cols]

    # ---------------------------------------------------------
    # Step 1: Keep only date columns that actually exist
    # ---------------------------------------------------------
    # This avoids KeyErrors and allows flexible schemas
    existing_cols = [c for c in date_cols if c in df.columns]

    # If none of the requested date columns exist,
    # create empty, correctly typed outputs and exit early
    if not existing_cols:
        df[date_col_out] = pd.Series(pd.NaT, index=df.index, dtype="datetime64[ns]")
        df[year_col_out] = pd.Series(pd.NA, index=df.index, dtype="Int64")
        return df

    # ---------------------------------------------------------
    # Step 2: Parse each date column safely
    # ---------------------------------------------------------
    # Temporary frame: rows = records, columns = parsed date fields.
    # Every column is parsed value by value, so mixed precisions
    # (YYYY, YYYY-MM, YYYY-MM-DD, full timestamps) can coexist.
    parsed_dates = pd.DataFrame(
        {col: _parse_date_series(df[col]) for col in existing_cols},
        index=df.index,
    )

    # ---------------------------------------------------------
    # Step 3: Compute the earliest date per record
    # ---------------------------------------------------------
    # .min(axis=1) skips NaT values:
    #   - datetime if at least one valid date exists
    #   - NaT if all date fields are missing/invalid
    df[date_col_out] = parsed_dates.min(axis=1)

    # ---------------------------------------------------------
    # Step 4: Extract publication year from earliest date
    # ---------------------------------------------------------
    # Nullable Int64 keeps <NA> for rows without any valid date
    df[year_col_out] = df[date_col_out].dt.year.astype("Int64")

    # ---------------------------------------------------------
    # Step 5: Return enriched DataFrame
    # ---------------------------------------------------------
    return df


In [7]:
# Every date-like field exposed by the thin view; the earliest parsable value
# among them becomes the record's "first seen" date.
DATE_COLUMNS = [
    "date_created", "date_posted", "date_deposited",
    "date_published", "date_published_online", "date_issued",
    "date_indexed", "date_updated", "date_registered",
]

data_date_first_seen = compute_earliest_date_and_year(
    df=data,
    date_cols=DATE_COLUMNS,
    date_col_out="date_first_seen",
    year_col_out="publication_year_first_seen",
)
data_date_first_seen

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,publication_year,date_created,date_posted,date_deposited,date_published,date_published_online,date_issued,date_indexed,date_updated,date_registered,date_first_seen,publication_year_first_seen
0,crossref::10.21467/preprints.48,AIJR Preprints,crossref,10.21467/preprints.48,https://doi.org/10.21467/preprints.48,https://preprints.aijr.org/index.php/ap/prepri...,2020.0,2020-09-15,2020-05-03,2020-09-15,2020-05-03,,2020-05-03,2025-05-14,,,2020-05-03,2020
1,crossref::10.21467/preprints.43,AIJR Preprints,crossref,10.21467/preprints.43,https://doi.org/10.21467/preprints.43,https://preprints.aijr.org/index.php/ap/prepri...,2020.0,2020-09-15,2020-04-25,2020-09-15,2020-04-25,,2020-04-25,2025-05-14,,,2020-04-25,2020
2,crossref::10.21467/preprints.39,AIJR Preprints,crossref,10.21467/preprints.39,https://doi.org/10.21467/preprints.39,https://preprints.aijr.org/index.php/ap/prepri...,2020.0,2020-09-15,2020-04-16,2020-09-15,2020-04-16,,2020-04-16,2025-05-14,,,2020-04-16,2020
3,crossref::10.21467/preprints.38,AIJR Preprints,crossref,10.21467/preprints.38,https://doi.org/10.21467/preprints.38,https://preprints.aijr.org/index.php/ap/prepri...,2020.0,2020-09-17,2020-04-15,2020-09-17,2020-04-15,,2020-04-15,2022-12-13,,,2020-04-15,2020
4,crossref::10.21467/preprints.36,AIJR Preprints,crossref,10.21467/preprints.36,https://doi.org/10.21467/preprints.36,https://preprints.aijr.org/index.php/ap/prepri...,2020.0,2020-09-17,2020-04-15,2020-09-17,2020-04-15,,2020-04-15,2024-08-11,,,2020-04-15,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9486888,openalex::W999325625,viXra,openalex,,,https://vixra.org/pdf/1409.0090v1.pdf,2014.0,2025-10-10T00:00:00,,,2014-09-01,,,,2025-10-10T17:16:08.811792,,2014-09-01,2014
9486889,openalex::W999460032,viXra,openalex,,,https://vixra.org/abs/1112.0094,2011.0,2025-10-10T00:00:00,,,2011-12-01,,,,2025-10-10T17:16:08.811792,,2011-12-01,2011
9486890,openalex::W99967155,viXra,openalex,,,https://vixra.org/pdf/1406.0019v1.pdf,2014.0,2025-10-10T00:00:00,,,2014-06-01,,,,2025-10-10T17:16:08.811792,,2014-06-01,2014
9486891,openalex::W999790414,viXra,openalex,,,https://vixra.org/pdf/1306.0105v3.pdf,2013.0,2025-10-10T00:00:00,,,2013-06-01,,,,2025-10-10T17:16:08.811792,,2013-06-01,2013


In [8]:
# Spot-check one server: SSRN rows only
data_date_first_seen.loc[data_date_first_seen["server_name"].eq("SSRN")]

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,publication_year,date_created,date_posted,date_deposited,date_published,date_published_online,date_issued,date_indexed,date_updated,date_registered,date_first_seen,publication_year_first_seen
4376023,crossref::10.2139/ssrn.127308,SSRN,crossref,10.2139/ssrn.127308,https://doi.org/10.2139/ssrn.127308,http://www.ssrn.com/abstract=127308,1998.0,2005-11-03,,2007-02-13,1998-01-01,,1998-01-01,2022-04-03,,,1998-01-01,1998
4376024,crossref::10.2139/ssrn.48285,SSRN,crossref,10.2139/ssrn.48285,https://doi.org/10.2139/ssrn.48285,http://www.ssrn.com/abstract=48285,1997.0,2005-08-19,,2007-02-13,1997-01-01,,1997-01-01,2022-04-04,,,1997-01-01,1997
4376025,crossref::10.2139/ssrn.681174,SSRN,crossref,10.2139/ssrn.681174,https://doi.org/10.2139/ssrn.681174,http://www.ssrn.com/abstract=681174,2005.0,2005-07-07,,2007-02-13,2005-01-01,,2005-01-01,2022-04-02,,,2005-01-01,2005
4376026,crossref::10.2139/ssrn.480691,SSRN,crossref,10.2139/ssrn.480691,https://doi.org/10.2139/ssrn.480691,http://www.ssrn.com/abstract=480691,2003.0,2005-07-07,,2007-02-13,2003-01-01,,2003-01-01,2022-04-06,,,2003-01-01,2003
4376027,crossref::10.2139/ssrn.161192,SSRN,crossref,10.2139/ssrn.161192,https://doi.org/10.2139/ssrn.161192,http://www.ssrn.com/abstract=161192,1999.0,2005-07-07,,2007-02-13,1999-01-01,,1999-01-01,2022-04-06,,,1999-01-01,1999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5634976,crossref::10.2139/ssrn.5996464,SSRN,crossref,10.2139/ssrn.5996464,https://doi.org/10.2139/ssrn.5996464,https://www.ssrn.com/abstract=5996464,2025.0,2025-12-31,2025-01-01,2025-12-31,2025-01-01,,2025-01-01,2025-12-31,,,2025-01-01,2025
5634977,crossref::10.2139/ssrn.5996465,SSRN,crossref,10.2139/ssrn.5996465,https://doi.org/10.2139/ssrn.5996465,https://www.ssrn.com/abstract=5996465,2025.0,2025-12-31,2025-01-01,2025-12-31,2025-01-01,,2025-01-01,2025-12-31,,,2025-01-01,2025
5634978,crossref::10.2139/ssrn.5996467,SSRN,crossref,10.2139/ssrn.5996467,https://doi.org/10.2139/ssrn.5996467,https://www.ssrn.com/abstract=5996467,2025.0,2025-12-31,2025-01-01,2025-12-31,2025-01-01,,2025-01-01,2025-12-31,,,2025-01-01,2025
5634979,crossref::10.2139/ssrn.5996468,SSRN,crossref,10.2139/ssrn.5996468,https://doi.org/10.2139/ssrn.5996468,https://www.ssrn.com/abstract=5996468,2025.0,2025-12-31,2025-01-01,2025-12-31,2025-01-01,,2025-01-01,2025-12-31,,,2025-01-01,2025


In [9]:
# Persist the per-record "first seen" date and year, once as CSV and once as a
# pickle (the pickle keeps the datetime / Int64 dtypes).
OUTPUT_DIR = Path("outputs")
# to_csv / to_pickle do not create missing folders, so make sure it exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Select the exported columns once so both files are guaranteed to match.
first_seen_export = data_date_first_seen[
    ["record_id", "server_name", "date_first_seen", "publication_year_first_seen"]
]
first_seen_export.to_csv(OUTPUT_DIR / "date_first_seen.csv", index=False)
first_seen_export.to_pickle(OUTPUT_DIR / "date_first_seen.pkl")

In [10]:
# Compact view of the derived columns next to the record identifier
data_date_first_seen.loc[
    :, ["record_id", "date_first_seen", "publication_year_first_seen"]
]

Unnamed: 0,record_id,date_first_seen,publication_year_first_seen
0,crossref::10.21467/preprints.48,2020-05-03,2020
1,crossref::10.21467/preprints.43,2020-04-25,2020
2,crossref::10.21467/preprints.39,2020-04-16,2020
3,crossref::10.21467/preprints.38,2020-04-15,2020
4,crossref::10.21467/preprints.36,2020-04-15,2020
...,...,...,...
9486888,openalex::W999325625,2014-09-01,2014
9486889,openalex::W999460032,2011-12-01,2011
9486890,openalex::W99967155,2014-06-01,2014
9486891,openalex::W999790414,2013-06-01,2013
