In [1]:
import pandas as pd
# Render every column of a DataFrame instead of eliding the middle ones
pd.options.display.max_columns = None


# Import data

In [2]:
import duckdb
from pathlib import Path

con = duckdb.connect()

# Resolve the project root FIRST so that every path below (the parquet data
# and DuckDB's spill directory) is anchored to it, whether the kernel was
# started in the project root or in its "notebooks" subfolder.
ROOT = Path.cwd()
if ROOT.name == "notebooks":
    ROOT = ROOT.parent

BASE = ROOT / "data" / "by_server"
TMP_DIR = ROOT / "data" / "tmp_duckdb"


def _sql_literal(text: str) -> str:
    """Return ``text`` as a single-quoted SQL string literal (quotes doubled)."""
    return "'" + text.replace("'", "''") + "'"


# Low-memory settings
con.execute("PRAGMA threads=1;")
con.execute("PRAGMA preserve_insertion_order=false;")
con.execute("PRAGMA enable_object_cache=false;")
con.execute("PRAGMA memory_limit='2GB';")           # try 1GB if still unstable
# Previously the relative path 'data/tmp_duckdb' was resolved against the
# current working directory, which is not the project root when the notebook
# runs from "notebooks/".
con.execute(f"PRAGMA temp_directory={_sql_literal(TMP_DIR.as_posix())};")

# The parquet files sit exactly one directory level below BASE:
#   data/by_server/<partition dir>/<file>.parquet
# hive_partitioning=true asks DuckDB to derive columns from the directory
# names (presumably key=value folders -- confirm against the data layout);
# union_by_name=true aligns files by column name rather than by position.
all_backends = (BASE / "*" / "*.parquet").as_posix()

con.execute(f"""
CREATE OR REPLACE VIEW all_backends AS
SELECT * FROM read_parquet({_sql_literal(all_backends)}, hive_partitioning=true, union_by_name=true);
""")

# A unified "all_rows" view (currently a plain alias of all_backends)
con.execute("""
CREATE OR REPLACE VIEW all_rows AS
SELECT * FROM all_backends
""")

print(con.execute("SHOW TABLES").fetchall())


[('all_backends',), ('all_rows',)]


In [3]:
# Rows per backend, next to the number of those rows that carry a record_id.
# COUNT(record_id) skips NULLs, so equal columns mean no record_id is missing.
backend_counts_sql = """
SELECT backend, COUNT(*) AS total, COUNT(record_id) AS with_record_id
FROM (
    SELECT backend, record_id FROM all_backends
)
GROUP BY backend
"""
con.execute(backend_counts_sql).df()


Unnamed: 0,backend,total,with_record_id
0,crossref,3612833,3612833
1,datacite,3523368,3523368
2,openalex,2350692,2350692


In [4]:
# "Thin" projection of all_backends: identifiers, links and every date field,
# each one cast to VARCHAR so the columns share a single type.
# The statement has no placeholders, so a plain (non-f) string is used.
server_thin_sql = """
CREATE OR REPLACE VIEW server_thin AS
SELECT
  CAST(record_id AS VARCHAR)           AS record_id,
  CAST(server_name AS VARCHAR)         AS server_name,
  CAST(backend AS VARCHAR)             AS backend,

  CAST(doi AS VARCHAR)                 AS doi,
  CAST(doi_url AS VARCHAR)             AS doi_url,
  CAST(landing_page_url AS VARCHAR)    AS landing_page_url,
  
  -- Dates (helpful for temporal patterns)
  CAST(publication_year AS VARCHAR)    AS publication_year,
  CAST(date_created AS VARCHAR)        AS date_created,
  CAST(date_posted AS VARCHAR)         AS date_posted,
  CAST(date_deposited AS VARCHAR)      AS date_deposited,
  CAST(date_published AS VARCHAR)      AS date_published,
  CAST(date_published_online AS VARCHAR)      AS date_published_online,
  CAST(date_issued AS VARCHAR)         AS date_issued,
  CAST(date_indexed AS VARCHAR)        AS date_indexed,
  CAST(date_updated AS VARCHAR)        AS date_updated,
  CAST(date_registered AS VARCHAR)     AS date_registered,

FROM all_backends
"""
con.execute(server_thin_sql)

# Sanity check: number of rows exposed by the new view
row_count_sql = "SELECT COUNT(*) AS n FROM server_thin"
con.execute(row_count_sql).df()


Unnamed: 0,n
0,9486893


In [5]:
# Pull the thin view into pandas and discard rows that are exact duplicates
# across every column (first occurrence kept, original index labels retained).
data = (
    con.execute("SELECT * FROM server_thin")
    .df()
    .drop_duplicates()
)
data

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,publication_year,date_created,date_posted,date_deposited,date_published,date_published_online,date_issued,date_indexed,date_updated,date_registered
0,crossref::10.21467/preprints.48,AIJR Preprints,crossref,10.21467/preprints.48,https://doi.org/10.21467/preprints.48,https://preprints.aijr.org/index.php/ap/prepri...,2020.0,2020-09-15,2020-05-03,2020-09-15,2020-05-03,,2020-05-03,2025-05-14,,
1,crossref::10.21467/preprints.43,AIJR Preprints,crossref,10.21467/preprints.43,https://doi.org/10.21467/preprints.43,https://preprints.aijr.org/index.php/ap/prepri...,2020.0,2020-09-15,2020-04-25,2020-09-15,2020-04-25,,2020-04-25,2025-05-14,,
2,crossref::10.21467/preprints.39,AIJR Preprints,crossref,10.21467/preprints.39,https://doi.org/10.21467/preprints.39,https://preprints.aijr.org/index.php/ap/prepri...,2020.0,2020-09-15,2020-04-16,2020-09-15,2020-04-16,,2020-04-16,2025-05-14,,
3,crossref::10.21467/preprints.38,AIJR Preprints,crossref,10.21467/preprints.38,https://doi.org/10.21467/preprints.38,https://preprints.aijr.org/index.php/ap/prepri...,2020.0,2020-09-17,2020-04-15,2020-09-17,2020-04-15,,2020-04-15,2022-12-13,,
4,crossref::10.21467/preprints.36,AIJR Preprints,crossref,10.21467/preprints.36,https://doi.org/10.21467/preprints.36,https://preprints.aijr.org/index.php/ap/prepri...,2020.0,2020-09-17,2020-04-15,2020-09-17,2020-04-15,,2020-04-15,2024-08-11,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9486888,openalex::W999325625,viXra,openalex,,,https://vixra.org/pdf/1409.0090v1.pdf,2014.0,2025-10-10T00:00:00,,,2014-09-01,,,,2025-10-10T17:16:08.811792,
9486889,openalex::W999460032,viXra,openalex,,,https://vixra.org/abs/1112.0094,2011.0,2025-10-10T00:00:00,,,2011-12-01,,,,2025-10-10T17:16:08.811792,
9486890,openalex::W99967155,viXra,openalex,,,https://vixra.org/pdf/1406.0019v1.pdf,2014.0,2025-10-10T00:00:00,,,2014-06-01,,,,2025-10-10T17:16:08.811792,
9486891,openalex::W999790414,viXra,openalex,,,https://vixra.org/pdf/1306.0105v3.pdf,2013.0,2025-10-10T00:00:00,,,2013-06-01,,,,2025-10-10T17:16:08.811792,


# Compute the earliest known date for each record and extract its year

In [6]:
import pandas as pd

def compute_earliest_date_and_year(
    df: pd.DataFrame,
    date_cols,
    date_col_out: str = "date_first_seen",
    year_col_out: str = "publication_year_first_seen",
) -> pd.DataFrame:
    """
    Compute the earliest known date for each record and extract its year.

    This function:
    ----------------
    1) Takes multiple candidate date columns (e.g. date_created, date_posted, etc.)
    2) Parses them safely into datetime objects
    3) Finds the earliest valid date PER ROW
    4) Stores it in a single canonical column (date_first_seen)
    5) Extracts the publication year from that date (nullable Int64)

    Why this matters:
    -----------------
    - Different sources expose different date semantics
    - The "true" first appearance of a record is often NOT date_created
    - A single, reliable date greatly improves:
        * duplicate resolution
        * parent selection (oldest record)
        * longitudinal analyses
    """

    # Work on a copy to avoid mutating the original DataFrame
    df = df.copy()

    # ---------------------------------------------------------
    # Step 1: Keep only date columns that actually exist
    # ---------------------------------------------------------
    # This avoids KeyErrors and allows flexible schemas
    existing_cols = [c for c in date_cols if c in df.columns]

    # If none of the requested date columns exist,
    # create empty outputs and exit early
    if not existing_cols:
        df[date_col_out] = pd.NaT
        df[year_col_out] = pd.NA
        df[year_col_out] = df[year_col_out].astype("Int64")
        return df

    # ---------------------------------------------------------
    # Step 2: Parse each date column safely
    # ---------------------------------------------------------
    # - pd.to_datetime handles:
    #     * YYYY
    #     * YYYY-MM
    #     * YYYY-MM-DD
    #     * timestamps with hours
    # - errors="coerce" converts invalid values to NaT
    #
    # We build a temporary DataFrame where:
    #   rows = records
    #   columns = parsed date fields
    parsed_dates = pd.DataFrame(
        {
            col: pd.to_datetime(df[col], errors="coerce", utc=False)
            for col in existing_cols
        },
        index=df.index,
    )

    # ---------------------------------------------------------
    # Step 3: Compute the earliest date per record
    # ---------------------------------------------------------
    # .min(axis=1) ignores NaT values automatically
    # Result:
    #   - datetime if at least one valid date exists
    #   - NaT if all date fields are missing/invalid
    df[date_col_out] = parsed_dates.min(axis=1)

    # ---------------------------------------------------------
    # Step 4: Extract publication year from earliest date
    # ---------------------------------------------------------
    # - .dt.year extracts the year component
    # - Using pandas nullable Int64 preserves <NA>
    df[year_col_out] = (
        df[date_col_out]
        .dt.year
        .astype("Int64")
    )

    # ---------------------------------------------------------
    # Step 5: Return enriched DataFrame
    # ---------------------------------------------------------
    return df


In [7]:
# Every date field that may hold a record's first appearance; all of them are
# scanned and the per-row minimum wins.
DATE_COLUMNS = [
    "date_created",
    "date_posted",
    "date_deposited",
    "date_published",
    "date_published_online",
    "date_issued",
    "date_indexed",
    "date_updated",
    "date_registered",
]

# Enriched copy of `data` with the earliest date and its year appended
data_date_first_seen = compute_earliest_date_and_year(
    df=data,
    date_cols=DATE_COLUMNS,
    date_col_out="date_first_seen",
    year_col_out="publication_year_first_seen",
)
data_date_first_seen

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,publication_year,date_created,date_posted,date_deposited,date_published,date_published_online,date_issued,date_indexed,date_updated,date_registered,date_first_seen,publication_year_first_seen
0,crossref::10.21467/preprints.48,AIJR Preprints,crossref,10.21467/preprints.48,https://doi.org/10.21467/preprints.48,https://preprints.aijr.org/index.php/ap/prepri...,2020.0,2020-09-15,2020-05-03,2020-09-15,2020-05-03,,2020-05-03,2025-05-14,,,2020-05-03,2020
1,crossref::10.21467/preprints.43,AIJR Preprints,crossref,10.21467/preprints.43,https://doi.org/10.21467/preprints.43,https://preprints.aijr.org/index.php/ap/prepri...,2020.0,2020-09-15,2020-04-25,2020-09-15,2020-04-25,,2020-04-25,2025-05-14,,,2020-04-25,2020
2,crossref::10.21467/preprints.39,AIJR Preprints,crossref,10.21467/preprints.39,https://doi.org/10.21467/preprints.39,https://preprints.aijr.org/index.php/ap/prepri...,2020.0,2020-09-15,2020-04-16,2020-09-15,2020-04-16,,2020-04-16,2025-05-14,,,2020-04-16,2020
3,crossref::10.21467/preprints.38,AIJR Preprints,crossref,10.21467/preprints.38,https://doi.org/10.21467/preprints.38,https://preprints.aijr.org/index.php/ap/prepri...,2020.0,2020-09-17,2020-04-15,2020-09-17,2020-04-15,,2020-04-15,2022-12-13,,,2020-04-15,2020
4,crossref::10.21467/preprints.36,AIJR Preprints,crossref,10.21467/preprints.36,https://doi.org/10.21467/preprints.36,https://preprints.aijr.org/index.php/ap/prepri...,2020.0,2020-09-17,2020-04-15,2020-09-17,2020-04-15,,2020-04-15,2024-08-11,,,2020-04-15,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9486888,openalex::W999325625,viXra,openalex,,,https://vixra.org/pdf/1409.0090v1.pdf,2014.0,2025-10-10T00:00:00,,,2014-09-01,,,,2025-10-10T17:16:08.811792,,2014-09-01,2014
9486889,openalex::W999460032,viXra,openalex,,,https://vixra.org/abs/1112.0094,2011.0,2025-10-10T00:00:00,,,2011-12-01,,,,2025-10-10T17:16:08.811792,,2011-12-01,2011
9486890,openalex::W99967155,viXra,openalex,,,https://vixra.org/pdf/1406.0019v1.pdf,2014.0,2025-10-10T00:00:00,,,2014-06-01,,,,2025-10-10T17:16:08.811792,,2014-06-01,2014
9486891,openalex::W999790414,viXra,openalex,,,https://vixra.org/pdf/1306.0105v3.pdf,2013.0,2025-10-10T00:00:00,,,2013-06-01,,,,2025-10-10T17:16:08.811792,,2013-06-01,2013


# Matching SSRN dates

In [8]:
# Restrict the enriched records to those served by SSRN
is_ssrn = data_date_first_seen["server_name"].eq("SSRN")
data_date_first_seen_ssrn = data_date_first_seen.loc[is_ssrn]
data_date_first_seen_ssrn

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,publication_year,date_created,date_posted,date_deposited,date_published,date_published_online,date_issued,date_indexed,date_updated,date_registered,date_first_seen,publication_year_first_seen
4376023,crossref::10.2139/ssrn.127308,SSRN,crossref,10.2139/ssrn.127308,https://doi.org/10.2139/ssrn.127308,http://www.ssrn.com/abstract=127308,1998.0,2005-11-03,,2007-02-13,1998-01-01,,1998-01-01,2022-04-03,,,1998-01-01,1998
4376024,crossref::10.2139/ssrn.48285,SSRN,crossref,10.2139/ssrn.48285,https://doi.org/10.2139/ssrn.48285,http://www.ssrn.com/abstract=48285,1997.0,2005-08-19,,2007-02-13,1997-01-01,,1997-01-01,2022-04-04,,,1997-01-01,1997
4376025,crossref::10.2139/ssrn.681174,SSRN,crossref,10.2139/ssrn.681174,https://doi.org/10.2139/ssrn.681174,http://www.ssrn.com/abstract=681174,2005.0,2005-07-07,,2007-02-13,2005-01-01,,2005-01-01,2022-04-02,,,2005-01-01,2005
4376026,crossref::10.2139/ssrn.480691,SSRN,crossref,10.2139/ssrn.480691,https://doi.org/10.2139/ssrn.480691,http://www.ssrn.com/abstract=480691,2003.0,2005-07-07,,2007-02-13,2003-01-01,,2003-01-01,2022-04-06,,,2003-01-01,2003
4376027,crossref::10.2139/ssrn.161192,SSRN,crossref,10.2139/ssrn.161192,https://doi.org/10.2139/ssrn.161192,http://www.ssrn.com/abstract=161192,1999.0,2005-07-07,,2007-02-13,1999-01-01,,1999-01-01,2022-04-06,,,1999-01-01,1999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5634976,crossref::10.2139/ssrn.5996464,SSRN,crossref,10.2139/ssrn.5996464,https://doi.org/10.2139/ssrn.5996464,https://www.ssrn.com/abstract=5996464,2025.0,2025-12-31,2025-01-01,2025-12-31,2025-01-01,,2025-01-01,2025-12-31,,,2025-01-01,2025
5634977,crossref::10.2139/ssrn.5996465,SSRN,crossref,10.2139/ssrn.5996465,https://doi.org/10.2139/ssrn.5996465,https://www.ssrn.com/abstract=5996465,2025.0,2025-12-31,2025-01-01,2025-12-31,2025-01-01,,2025-01-01,2025-12-31,,,2025-01-01,2025
5634978,crossref::10.2139/ssrn.5996467,SSRN,crossref,10.2139/ssrn.5996467,https://doi.org/10.2139/ssrn.5996467,https://www.ssrn.com/abstract=5996467,2025.0,2025-12-31,2025-01-01,2025-12-31,2025-01-01,,2025-01-01,2025-12-31,,,2025-01-01,2025
5634979,crossref::10.2139/ssrn.5996468,SSRN,crossref,10.2139/ssrn.5996468,https://doi.org/10.2139/ssrn.5996468,https://www.ssrn.com/abstract=5996468,2025.0,2025-12-31,2025-01-01,2025-12-31,2025-01-01,,2025-01-01,2025-12-31,,,2025-01-01,2025


In [9]:
# Non-null count per column for the SSRN subset: shows which date fields are populated
data_date_first_seen_ssrn.count()

record_id                      1258958
server_name                    1258958
backend                        1258958
doi                            1258958
doi_url                        1258958
landing_page_url               1258958
publication_year               1258958
date_created                   1258958
date_posted                     515391
date_deposited                 1258958
date_published                 1258958
date_published_online           117276
date_issued                    1258958
date_indexed                   1258958
date_updated                         0
date_registered                      0
date_first_seen                1258958
publication_year_first_seen    1258958
dtype: int64

In [10]:
# Records whose earliest known date falls in a year before 1990
is_pre_1990 = data_date_first_seen["publication_year_first_seen"].lt(1990)
df_before1990 = data_date_first_seen[is_pre_1990]
df_before1990

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,publication_year,date_created,date_posted,date_deposited,date_published,date_published_online,date_issued,date_indexed,date_updated,date_registered,date_first_seen,publication_year_first_seen
2010343,crossref::10.20948/prepr-1981-43,Keldysh Institute Preprints,crossref,10.20948/prepr-1981-43,https://doi.org/10.20948/prepr-1981-43,http://keldysh.ru/papers/1981/prep1981_43.pdf,1981.0,2021-07-22,,2021-07-22,1981-01-01,1981-01-01,1981-01-01,2025-02-21,,,1981-01-01,1981
2010344,crossref::10.20948/prepr-1984-102,Keldysh Institute Preprints,crossref,10.20948/prepr-1984-102,https://doi.org/10.20948/prepr-1984-102,http://keldysh.ru/papers/1984/prep1984_102.pdf,1984.0,2021-07-22,,2021-07-22,1984-01-01,1984-01-01,1984-01-01,2025-02-21,,,1984-01-01,1984
4377730,crossref::10.2139/ssrn.820064,SSRN,crossref,10.2139/ssrn.820064,https://doi.org/10.2139/ssrn.820064,http://www.ssrn.com/abstract=820064,1975.0,2011-12-28,,2012-07-05,1975-01-01,,1975-01-01,2022-03-29,,,1975-01-01,1975
4377972,crossref::10.2139/ssrn.869196,SSRN,crossref,10.2139/ssrn.869196,https://doi.org/10.2139/ssrn.869196,http://www.ssrn.com/abstract=869196,1980.0,2011-12-28,,2012-07-05,1980-01-01,,1980-01-01,2022-03-30,,,1980-01-01,1980
4377988,crossref::10.2139/ssrn.903219,SSRN,crossref,10.2139/ssrn.903219,https://doi.org/10.2139/ssrn.903219,http://www.ssrn.com/abstract=903219,1986.0,2011-12-28,,2012-07-05,1986-01-01,,1986-01-01,2022-04-03,,,1986-01-01,1986
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5214460,crossref::10.2139/ssrn.2726577,SSRN,crossref,10.2139/ssrn.2726577,https://doi.org/10.2139/ssrn.2726577,http://www.ssrn.com/abstract=2726577,1983.0,2016-02-03,,2024-06-14,1983-01-01,,1983-01-01,2025-07-08,,,1983-01-01,1983
5214919,crossref::10.2139/ssrn.2740530,SSRN,crossref,10.2139/ssrn.2740530,https://doi.org/10.2139/ssrn.2740530,http://www.ssrn.com/abstract=2740530,1989.0,2016-03-09,,2024-06-14,1989-01-01,,1989-01-01,2024-06-14,,,1989-01-01,1989
5214921,crossref::10.2139/ssrn.2740544,SSRN,crossref,10.2139/ssrn.2740544,https://doi.org/10.2139/ssrn.2740544,http://www.ssrn.com/abstract=2740544,1989.0,2016-03-09,,2024-06-14,1989-01-01,,1989-01-01,2024-06-14,,,1989-01-01,1989
5401624,crossref::10.2139/ssrn.689301,SSRN,crossref,10.2139/ssrn.689301,https://doi.org/10.2139/ssrn.689301,http://www.ssrn.com/abstract=689301,1981.0,2011-12-28,,2025-03-16,1981-01-01,,1981-01-01,2025-03-16,,,1981-01-01,1981


In [11]:
# SSRN share of the pre-1990 records
df_before1990.loc[df_before1990["server_name"].eq("SSRN")]

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,publication_year,date_created,date_posted,date_deposited,date_published,date_published_online,date_issued,date_indexed,date_updated,date_registered,date_first_seen,publication_year_first_seen
4377730,crossref::10.2139/ssrn.820064,SSRN,crossref,10.2139/ssrn.820064,https://doi.org/10.2139/ssrn.820064,http://www.ssrn.com/abstract=820064,1975.0,2011-12-28,,2012-07-05,1975-01-01,,1975-01-01,2022-03-29,,,1975-01-01,1975
4377972,crossref::10.2139/ssrn.869196,SSRN,crossref,10.2139/ssrn.869196,https://doi.org/10.2139/ssrn.869196,http://www.ssrn.com/abstract=869196,1980.0,2011-12-28,,2012-07-05,1980-01-01,,1980-01-01,2022-03-30,,,1980-01-01,1980
4377988,crossref::10.2139/ssrn.903219,SSRN,crossref,10.2139/ssrn.903219,https://doi.org/10.2139/ssrn.903219,http://www.ssrn.com/abstract=903219,1986.0,2011-12-28,,2012-07-05,1986-01-01,,1986-01-01,2022-04-03,,,1986-01-01,1986
4378006,crossref::10.2139/ssrn.868891,SSRN,crossref,10.2139/ssrn.868891,https://doi.org/10.2139/ssrn.868891,http://www.ssrn.com/abstract=868891,1980.0,2011-12-28,,2012-07-05,1980-01-01,,1980-01-01,2022-04-04,,,1980-01-01,1980
4378266,crossref::10.2139/ssrn.742714,SSRN,crossref,10.2139/ssrn.742714,https://doi.org/10.2139/ssrn.742714,http://www.ssrn.com/abstract=742714,1987.0,2011-12-28,,2012-07-05,1987-01-01,,1987-01-01,2023-10-12,,,1987-01-01,1987
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5214460,crossref::10.2139/ssrn.2726577,SSRN,crossref,10.2139/ssrn.2726577,https://doi.org/10.2139/ssrn.2726577,http://www.ssrn.com/abstract=2726577,1983.0,2016-02-03,,2024-06-14,1983-01-01,,1983-01-01,2025-07-08,,,1983-01-01,1983
5214919,crossref::10.2139/ssrn.2740530,SSRN,crossref,10.2139/ssrn.2740530,https://doi.org/10.2139/ssrn.2740530,http://www.ssrn.com/abstract=2740530,1989.0,2016-03-09,,2024-06-14,1989-01-01,,1989-01-01,2024-06-14,,,1989-01-01,1989
5214921,crossref::10.2139/ssrn.2740544,SSRN,crossref,10.2139/ssrn.2740544,https://doi.org/10.2139/ssrn.2740544,http://www.ssrn.com/abstract=2740544,1989.0,2016-03-09,,2024-06-14,1989-01-01,,1989-01-01,2024-06-14,,,1989-01-01,1989
5401624,crossref::10.2139/ssrn.689301,SSRN,crossref,10.2139/ssrn.689301,https://doi.org/10.2139/ssrn.689301,http://www.ssrn.com/abstract=689301,1981.0,2011-12-28,,2025-03-16,1981-01-01,,1981-01-01,2025-03-16,,,1981-01-01,1981


In [12]:
# Number of pre-1990 records per server, most frequent first
df_before1990['server_name'].value_counts()

server_name
SSRN                           670
Keldysh Institute Preprints      2
Name: count, dtype: int64

In [13]:
# List the column names available on the pre-1990 subset
df_before1990.columns

Index(['record_id', 'server_name', 'backend', 'doi', 'doi_url',
       'landing_page_url', 'publication_year', 'date_created', 'date_posted',
       'date_deposited', 'date_published', 'date_published_online',
       'date_issued', 'date_indexed', 'date_updated', 'date_registered',
       'date_first_seen', 'publication_year_first_seen'],
      dtype='object')

## Import SSRN data

In [14]:
import pandas as pd

# Tab-separated SSRN listing. It is resolved relative to the project root
# (ROOT, computed in the DuckDB setup cell) so the notebook is not tied to one
# machine; the original absolute location is kept as a fallback in case the
# file does not live under ROOT.
file_path = ROOT / "data" / "SSRNData" / "SSRNData.txt"
if not file_path.exists():
    file_path = Path("/mnt/c/SCHOLCOMMLAB/APPS/preprint-harvester/data/SSRNData/SSRNData.txt")

# The file is tab-delimited, hence sep="\t"
SSRNData = pd.read_csv(file_path, sep="\t")

# Display the first 5 rows to make sure it loaded correctly
SSRNData.head()

Unnamed: 0,rank,url,title,authors,datePosted
0,1,https://papers.ssrn.com/sol3/papers.cfm?abstra...,Monetary Tightening and U.S. Bank Fragility in...,"<a href=""https://papers.ssrn.com/sol3/cf_dev/A...",24 Mar 2023
1,2,https://papers.ssrn.com/sol3/papers.cfm?abstra...,Why Do People Migrate? A Review of the Theoret...,"<a href=""https://papers.ssrn.com/sol3/cf_dev/A...",14 Mar 2008
2,3,https://papers.ssrn.com/sol3/papers.cfm?abstra...,The Sweep and Force of Section Three,"<a href=""https://papers.ssrn.com/sol3/cf_dev/A...",14 Aug 2023
3,4,https://papers.ssrn.com/sol3/papers.cfm?abstra...,Navigating the Jagged Technological Frontier: ...,"<a href=""https://papers.ssrn.com/sol3/cf_dev/A...",18 Sep 2023
4,5,https://papers.ssrn.com/sol3/papers.cfm?abstra...,Can ChatGPT Forecast Stock Price Movements? Re...,"<a href=""https://papers.ssrn.com/sol3/cf_dev/A...",10 Apr 2023


In [15]:
# Non-null count per column of the raw SSRN listing
SSRNData.count()

rank          1137036
url           1137036
title         1137032
authors       1137036
datePosted    1137015
dtype: int64

In [16]:
# Distinct URLs in the SSRN listing (compare the array length with the row count)
SSRNData["url"].unique()

array(['https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4387676',
       'https://papers.ssrn.com/sol3/papers.cfm?abstract_id=1105657',
       'https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4532751', ...,
       'https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3655838',
       'https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3655866',
       'https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3647635'],
      shape=(1137036,), dtype=object)

In [17]:
# Inspect one raw `authors` value (index label 2) to see its markup
SSRNData.loc[2, "authors"]

'<a href="https://papers.ssrn.com/sol3/cf_dev/AbsByAuth.cfm?per_id=398074" target="_blank">William Baude</a> and <a href="https://papers.ssrn.com/sol3/cf_dev/AbsByAuth.cfm?per_id=967471" target="_blank">Michael Stokes Paulsen</a><br>University of Chicago - Law School and University of St. Thomas School of Law<br>'

In [18]:
import pandas as pd
import re

# Derive join keys (SSRN id, DOI, landing page URL) and a parsed posting date
# from the raw SSRN listing, working on a copy so `SSRNData` stays untouched.
SSRNData_clean = SSRNData.copy()

# SSRN abstract id: the digits following "abstract_id=" in the listing URL.
# expand=False returns a Series (one capture group) instead of a 1-column DataFrame.
SSRNData_clean["ssrn_id"] = SSRNData_clean["url"].str.extract(r"abstract_id=(\d+)", expand=False)

# Same id on the main frame, taken from SSRN DOIs of the form "10.2139/ssrn.<id>".
# The dots are escaped so they match a literal "." rather than any character.
data_date_first_seen["ssrn_id"] = data_date_first_seen["doi"].str.extract(
    r"10\.2139/ssrn\.(\d+)", expand=False
)

# Build the DOI from the SSRN id
SSRNData_clean["doi"] = "10.2139/ssrn." + SSRNData_clean["ssrn_id"]

# Build the landing page URL from the SSRN id
SSRNData_clean["landing_page_url"] = "https://www.ssrn.com/abstract=" + SSRNData_clean["ssrn_id"]

# Parse the posting date; values that cannot be parsed become NaT
SSRNData_clean["date_posted_ssrn"] = pd.to_datetime(SSRNData_clean["datePosted"], errors="coerce")

# Posting year; NaN wherever the posting date is NaT
SSRNData_clean["publication_year_ssrn"] = SSRNData_clean["date_posted_ssrn"].dt.year

SSRNData_clean.head()

Unnamed: 0,rank,url,title,authors,datePosted,ssrn_id,doi,landing_page_url,date_posted_ssrn,publication_year_ssrn
0,1,https://papers.ssrn.com/sol3/papers.cfm?abstra...,Monetary Tightening and U.S. Bank Fragility in...,"<a href=""https://papers.ssrn.com/sol3/cf_dev/A...",24 Mar 2023,4387676,10.2139/ssrn.4387676,https://www.ssrn.com/abstract=4387676,2023-03-24,2023.0
1,2,https://papers.ssrn.com/sol3/papers.cfm?abstra...,Why Do People Migrate? A Review of the Theoret...,"<a href=""https://papers.ssrn.com/sol3/cf_dev/A...",14 Mar 2008,1105657,10.2139/ssrn.1105657,https://www.ssrn.com/abstract=1105657,2008-03-14,2008.0
2,3,https://papers.ssrn.com/sol3/papers.cfm?abstra...,The Sweep and Force of Section Three,"<a href=""https://papers.ssrn.com/sol3/cf_dev/A...",14 Aug 2023,4532751,10.2139/ssrn.4532751,https://www.ssrn.com/abstract=4532751,2023-08-14,2023.0
3,4,https://papers.ssrn.com/sol3/papers.cfm?abstra...,Navigating the Jagged Technological Frontier: ...,"<a href=""https://papers.ssrn.com/sol3/cf_dev/A...",18 Sep 2023,4573321,10.2139/ssrn.4573321,https://www.ssrn.com/abstract=4573321,2023-09-18,2023.0
4,5,https://papers.ssrn.com/sol3/papers.cfm?abstra...,Can ChatGPT Forecast Stock Price Movements? Re...,"<a href=""https://papers.ssrn.com/sol3/cf_dev/A...",10 Apr 2023,4412788,10.2139/ssrn.4412788,https://www.ssrn.com/abstract=4412788,2023-04-10,2023.0


In [19]:
# Spot check: find the main-frame rows whose DOI contains one known SSRN DOI
# (plain substring match, missing DOIs treated as non-matching).
pattern = "10.2139/ssrn.4387676"

mask = data_date_first_seen["doi"].str.contains(pattern, na=False, regex=False)
result = data_date_first_seen.loc[mask]
result

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,publication_year,date_created,date_posted,date_deposited,date_published,date_published_online,date_issued,date_indexed,date_updated,date_registered,date_first_seen,publication_year_first_seen,ssrn_id
5234824,crossref::10.2139/ssrn.4387676,SSRN,crossref,10.2139/ssrn.4387676,https://doi.org/10.2139/ssrn.4387676,https://www.ssrn.com/abstract=4387676,2023.0,2023-03-24,,2024-07-13,2023-01-01,2023-01-01,2023-01-01,2025-12-13,,,2023-01-01,2023,4387676


In [20]:
# Preview the DOI key together with the parsed posting date and year derived from the SSRN listing.
SSRNData_clean[['doi','date_posted_ssrn','publication_year_ssrn']]

Unnamed: 0,doi,date_posted_ssrn,publication_year_ssrn
0,10.2139/ssrn.4387676,2023-03-24,2023.0
1,10.2139/ssrn.1105657,2008-03-14,2008.0
2,10.2139/ssrn.4532751,2023-08-14,2023.0
3,10.2139/ssrn.4573321,2023-09-18,2023.0
4,10.2139/ssrn.4412788,2023-04-10,2023.0
...,...,...,...
1137031,10.2139/ssrn.3655672,2020-08-24,2020.0
1137032,10.2139/ssrn.3655759,2020-07-30,2020.0
1137033,10.2139/ssrn.3655838,2020-08-25,2020.0
1137034,10.2139/ssrn.3655866,2020-07-31,2020.0


## Merge SSRN posting dates into the main data

In [21]:
# Lookup Series of the SSRN posting date, keyed by each candidate join key.
ssrn_map_doi, ssrn_map_url, ssrn_map_id = (
    SSRNData_clean.set_index(ssrn_key)["date_posted_ssrn"]
    for ssrn_key in ("doi", "landing_page_url", "ssrn_id")
)

# Posting year, keyed by DOI.
ssrn_map_year = SSRNData_clean.set_index("doi")["publication_year_ssrn"]

# (new column, key column on the main frame, lookup) — applied in this order so
# the new columns are appended in the same sequence as before.
for new_col, key_col, lookup in (
    ("date_posted_ssrn_doi", "doi", ssrn_map_doi),
    ("date_posted_ssrn_url", "landing_page_url", ssrn_map_url),
    ("date_posted_ssrn_id", "ssrn_id", ssrn_map_id),
    ("publication_year_ssrn", "doi", ssrn_map_year),
):
    data_date_first_seen[new_col] = data_date_first_seen[key_col].map(lookup)

In [22]:
# Display the main frame to confirm the SSRN lookup columns were added.
data_date_first_seen

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,publication_year,date_created,date_posted,date_deposited,date_published,date_published_online,date_issued,date_indexed,date_updated,date_registered,date_first_seen,publication_year_first_seen,ssrn_id,date_posted_ssrn_doi,date_posted_ssrn_url,date_posted_ssrn_id,publication_year_ssrn
0,crossref::10.21467/preprints.48,AIJR Preprints,crossref,10.21467/preprints.48,https://doi.org/10.21467/preprints.48,https://preprints.aijr.org/index.php/ap/prepri...,2020.0,2020-09-15,2020-05-03,2020-09-15,2020-05-03,,2020-05-03,2025-05-14,,,2020-05-03,2020,,NaT,NaT,NaT,
1,crossref::10.21467/preprints.43,AIJR Preprints,crossref,10.21467/preprints.43,https://doi.org/10.21467/preprints.43,https://preprints.aijr.org/index.php/ap/prepri...,2020.0,2020-09-15,2020-04-25,2020-09-15,2020-04-25,,2020-04-25,2025-05-14,,,2020-04-25,2020,,NaT,NaT,NaT,
2,crossref::10.21467/preprints.39,AIJR Preprints,crossref,10.21467/preprints.39,https://doi.org/10.21467/preprints.39,https://preprints.aijr.org/index.php/ap/prepri...,2020.0,2020-09-15,2020-04-16,2020-09-15,2020-04-16,,2020-04-16,2025-05-14,,,2020-04-16,2020,,NaT,NaT,NaT,
3,crossref::10.21467/preprints.38,AIJR Preprints,crossref,10.21467/preprints.38,https://doi.org/10.21467/preprints.38,https://preprints.aijr.org/index.php/ap/prepri...,2020.0,2020-09-17,2020-04-15,2020-09-17,2020-04-15,,2020-04-15,2022-12-13,,,2020-04-15,2020,,NaT,NaT,NaT,
4,crossref::10.21467/preprints.36,AIJR Preprints,crossref,10.21467/preprints.36,https://doi.org/10.21467/preprints.36,https://preprints.aijr.org/index.php/ap/prepri...,2020.0,2020-09-17,2020-04-15,2020-09-17,2020-04-15,,2020-04-15,2024-08-11,,,2020-04-15,2020,,NaT,NaT,NaT,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9486888,openalex::W999325625,viXra,openalex,,,https://vixra.org/pdf/1409.0090v1.pdf,2014.0,2025-10-10T00:00:00,,,2014-09-01,,,,2025-10-10T17:16:08.811792,,2014-09-01,2014,,NaT,NaT,NaT,
9486889,openalex::W999460032,viXra,openalex,,,https://vixra.org/abs/1112.0094,2011.0,2025-10-10T00:00:00,,,2011-12-01,,,,2025-10-10T17:16:08.811792,,2011-12-01,2011,,NaT,NaT,NaT,
9486890,openalex::W99967155,viXra,openalex,,,https://vixra.org/pdf/1406.0019v1.pdf,2014.0,2025-10-10T00:00:00,,,2014-06-01,,,,2025-10-10T17:16:08.811792,,2014-06-01,2014,,NaT,NaT,NaT,
9486891,openalex::W999790414,viXra,openalex,,,https://vixra.org/pdf/1306.0105v3.pdf,2013.0,2025-10-10T00:00:00,,,2013-06-01,,,,2025-10-10T17:16:08.811792,,2013-06-01,2013,,NaT,NaT,NaT,


In [23]:
# Rows whose DOI-based lookup produced an SSRN posting date.
date_posted_ssrn_doi = data_date_first_seen.dropna(subset=["date_posted_ssrn_doi"])
date_posted_ssrn_doi

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,publication_year,date_created,date_posted,date_deposited,date_published,date_published_online,date_issued,date_indexed,date_updated,date_registered,date_first_seen,publication_year_first_seen,ssrn_id,date_posted_ssrn_doi,date_posted_ssrn_url,date_posted_ssrn_id,publication_year_ssrn
345081,openalex::W3124716296,Digital Access to Scholarship at Harvard (DASH...,openalex,10.2139/ssrn.2102794,https://doi.org/10.2139/ssrn.2102794,http://nrs.harvard.edu/urn-3:HUL.InstRepos:940...,2012.0,2025-10-10T00:00:00,,,2012-01-01,,,,2025-12-10T02:49:46.989445,,2012-01-01,2012,2102794,2012-07-10,NaT,2012-07-10,2012.0
457000,openalex::W3123789860,EconStor Preprints,openalex,10.2139/ssrn.2573596,https://doi.org/10.2139/ssrn.2573596,http://hdl.handle.net/10419/107595,2015.0,2025-10-10T00:00:00,,,2015-01-01,,,,2025-10-10T17:16:08.811792,,2015-01-01,2015,2573596,2015-03-05,NaT,2015-03-05,2015.0
479249,openalex::W3125554759,EconStor Preprints,openalex,10.2139/ssrn.3192474,https://doi.org/10.2139/ssrn.3192474,http://hdl.handle.net/10419/271218,2018.0,2021-02-01T00:00:00,,,2018-01-01,,,,2025-11-06T03:46:38.306776,,2018-01-01,2018,3192474,2018-06-27,NaT,2018-06-27,2018.0
480294,openalex::W2894453084,EconStor Preprints,openalex,10.2139/ssrn.2797203,https://doi.org/10.2139/ssrn.2797203,http://hdl.handle.net/10419/179123,2018.0,2018-10-05T00:00:00,,,2018-01-01,,,,2025-11-06T03:46:38.306776,,2018-01-01,2018,2797203,2016-06-19,NaT,2016-06-19,2016.0
480465,openalex::W3124251451,EconStor Preprints,openalex,10.2139/ssrn.3347766,https://doi.org/10.2139/ssrn.3347766,http://hdl.handle.net/10419/193667,2019.0,2021-02-01T00:00:00,,,2019-01-01,,,,2025-11-06T03:46:38.306776,,2019-01-01,2019,3347766,2019-04-01,NaT,2019-04-01,2019.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5634768,crossref::10.2139/ssrn.3474720,SSRN,crossref,10.2139/ssrn.3474720,https://doi.org/10.2139/ssrn.3474720,https://www.ssrn.com/abstract=3474720,2019.0,2019-11-06,,2025-12-31,2019-01-01,,2019-01-01,2025-12-31,,,2019-01-01,2019,3474720,2019-11-04,2019-11-04,2019-11-04,2019.0
5634775,crossref::10.2139/ssrn.3221252,SSRN,crossref,10.2139/ssrn.3221252,https://doi.org/10.2139/ssrn.3221252,https://www.ssrn.com/abstract=3221252,2018.0,2018-08-14,,2025-12-31,2018-01-01,,2018-01-01,2025-12-31,,,2018-01-01,2018,3221252,2018-08-14,2018-08-14,2018-08-14,2018.0
5634914,crossref::10.2139/ssrn.3362066,SSRN,crossref,10.2139/ssrn.3362066,https://doi.org/10.2139/ssrn.3362066,https://www.ssrn.com/abstract=3362066,2019.0,2019-05-01,,2025-12-31,2019-01-01,,2019-01-01,2025-12-31,,,2019-01-01,2019,3362066,2019-04-30,2019-04-30,2019-04-30,2019.0
5634967,crossref::10.2139/ssrn.4668027,SSRN,crossref,10.2139/ssrn.4668027,https://doi.org/10.2139/ssrn.4668027,https://www.ssrn.com/abstract=4668027,2023.0,2023-12-19,,2025-12-31,2023-01-01,2023-01-01,2023-01-01,2025-12-31,,,2023-01-01,2023,4668027,2023-12-19,2023-12-19,2023-12-19,2023.0


In [24]:
# Number of DOI-matched rows per server.
date_posted_ssrn_doi['server_name'].value_counts()

server_name
SSRN                                                                    730023
RePEc: Research Papers in Economics                                         14
EconStor Preprints                                                          11
Digital Access to Scholarship at Harvard (DASH) (Harvard University)         1
HAL                                                                          1
Name: count, dtype: int64

In [25]:
# Non-null count per column among the DOI-matched rows.
date_posted_ssrn_doi.count()

record_id                      730050
server_name                    730050
backend                        730050
doi                            730050
doi_url                        730050
landing_page_url               730050
publication_year               730050
date_created                   730050
date_posted                    125366
date_deposited                 730023
date_published                 730050
date_published_online           45303
date_issued                    730023
date_indexed                   730023
date_updated                       27
date_registered                     0
date_first_seen                730050
publication_year_first_seen    730050
ssrn_id                        730050
date_posted_ssrn_doi           730050
date_posted_ssrn_url           493004
date_posted_ssrn_id            730050
publication_year_ssrn          730050
dtype: int64

In [26]:
# Show up to the first 60 DOI-matched rows.
date_posted_ssrn_doi.head(60)

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,publication_year,date_created,date_posted,date_deposited,date_published,date_published_online,date_issued,date_indexed,date_updated,date_registered,date_first_seen,publication_year_first_seen,ssrn_id,date_posted_ssrn_doi,date_posted_ssrn_url,date_posted_ssrn_id,publication_year_ssrn
345081,openalex::W3124716296,Digital Access to Scholarship at Harvard (DASH...,openalex,10.2139/ssrn.2102794,https://doi.org/10.2139/ssrn.2102794,http://nrs.harvard.edu/urn-3:HUL.InstRepos:940...,2012.0,2025-10-10T00:00:00,,,2012-01-01,,,,2025-12-10T02:49:46.989445,,2012-01-01,2012,2102794,2012-07-10,NaT,2012-07-10,2012.0
457000,openalex::W3123789860,EconStor Preprints,openalex,10.2139/ssrn.2573596,https://doi.org/10.2139/ssrn.2573596,http://hdl.handle.net/10419/107595,2015.0,2025-10-10T00:00:00,,,2015-01-01,,,,2025-10-10T17:16:08.811792,,2015-01-01,2015,2573596,2015-03-05,NaT,2015-03-05,2015.0
479249,openalex::W3125554759,EconStor Preprints,openalex,10.2139/ssrn.3192474,https://doi.org/10.2139/ssrn.3192474,http://hdl.handle.net/10419/271218,2018.0,2021-02-01T00:00:00,,,2018-01-01,,,,2025-11-06T03:46:38.306776,,2018-01-01,2018,3192474,2018-06-27,NaT,2018-06-27,2018.0
480294,openalex::W2894453084,EconStor Preprints,openalex,10.2139/ssrn.2797203,https://doi.org/10.2139/ssrn.2797203,http://hdl.handle.net/10419/179123,2018.0,2018-10-05T00:00:00,,,2018-01-01,,,,2025-11-06T03:46:38.306776,,2018-01-01,2018,2797203,2016-06-19,NaT,2016-06-19,2016.0
480465,openalex::W3124251451,EconStor Preprints,openalex,10.2139/ssrn.3347766,https://doi.org/10.2139/ssrn.3347766,http://hdl.handle.net/10419/193667,2019.0,2021-02-01T00:00:00,,,2019-01-01,,,,2025-11-06T03:46:38.306776,,2019-01-01,2019,3347766,2019-04-01,NaT,2019-04-01,2019.0
483809,openalex::W2265447265,EconStor Preprints,openalex,10.2139/ssrn.2458855,https://doi.org/10.2139/ssrn.2458855,http://hdl.handle.net/10419/203278,2015.0,2025-10-10T00:00:00,,,2015-01-01,,,,2025-11-06T03:46:38.306776,,2015-01-01,2015,2458855,2014-06-27,NaT,2014-06-27,2014.0
487274,openalex::W2296153767,EconStor Preprints,openalex,10.2139/ssrn.2715366,https://doi.org/10.2139/ssrn.2715366,http://hdl.handle.net/10419/146789,2016.0,2025-10-10T00:00:00,,,2016-01-01,,,,2025-11-06T03:46:38.306776,,2016-01-01,2016,2715366,2016-01-15,NaT,2016-01-15,2016.0
494028,openalex::W3034197722,EconStor Preprints,openalex,10.2139/ssrn.3354400,https://doi.org/10.2139/ssrn.3354400,http://hdl.handle.net/10419/225342,2020.0,2020-06-19T00:00:00,,,2020-01-01,,,,2025-11-06T03:46:38.306776,,2020-01-01,2020,3354400,2019-04-18,NaT,2019-04-18,2019.0
513871,openalex::W2792595594,EconStor Preprints,openalex,10.2139/ssrn.3187331,https://doi.org/10.2139/ssrn.3187331,http://hdl.handle.net/10419/179465,2017.0,2025-10-10T00:00:00,,,2017-12-01,,,,2025-10-10T17:16:08.811792,,2017-12-01,2017,3187331,2018-05-30,NaT,2018-05-30,2018.0
514193,openalex::W2803691386,EconStor Preprints,openalex,10.2139/ssrn.3175871,https://doi.org/10.2139/ssrn.3175871,http://hdl.handle.net/10419/178645,2018.0,2025-10-10T00:00:00,,,2018-05-30,,,,2025-11-06T04:12:42.849631,,2018-05-30,2018,3175871,2018-05-09,NaT,2018-05-09,2018.0


In [27]:
# Rows whose landing-page-URL lookup produced an SSRN posting date.
date_posted_ssrn_url = data_date_first_seen.dropna(subset=["date_posted_ssrn_url"])
date_posted_ssrn_url

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,publication_year,date_created,date_posted,date_deposited,date_published,date_published_online,date_issued,date_indexed,date_updated,date_registered,date_first_seen,publication_year_first_seen,ssrn_id,date_posted_ssrn_doi,date_posted_ssrn_url,date_posted_ssrn_id,publication_year_ssrn
4413418,crossref::10.2139/ssrn.2742086,SSRN,crossref,10.2139/ssrn.2742086,https://doi.org/10.2139/ssrn.2742086,https://www.ssrn.com/abstract=2742086,2016.0,2017-06-28,,2017-06-28,2016-01-01,,2016-01-01,2022-04-05,,,2016-01-01,2016,2742086,2016-03-07,2016-03-07,2016-03-07,2016.0
4413419,crossref::10.2139/ssrn.2742024,SSRN,crossref,10.2139/ssrn.2742024,https://doi.org/10.2139/ssrn.2742024,https://www.ssrn.com/abstract=2742024,2016.0,2017-06-28,,2017-06-28,2016-01-01,,2016-01-01,2024-06-10,,,2016-01-01,2016,2742024,2016-10-20,2016-10-20,2016-10-20,2016.0
4413423,crossref::10.2139/ssrn.2746644,SSRN,crossref,10.2139/ssrn.2746644,https://doi.org/10.2139/ssrn.2746644,https://www.ssrn.com/abstract=2746644,2016.0,2017-06-28,,2017-06-28,2016-01-01,,2016-01-01,2025-10-27,,,2016-01-01,2016,2746644,2016-03-14,2016-03-14,2016-03-14,2016.0
4413426,crossref::10.2139/ssrn.2748258,SSRN,crossref,10.2139/ssrn.2748258,https://doi.org/10.2139/ssrn.2748258,https://www.ssrn.com/abstract=2748258,2016.0,2017-06-28,,2017-06-28,2016-01-01,,2016-01-01,2025-10-08,,,2016-01-01,2016,2748258,2016-03-17,2016-03-17,2016-03-17,2016.0
4413431,crossref::10.2139/ssrn.2752895,SSRN,crossref,10.2139/ssrn.2752895,https://doi.org/10.2139/ssrn.2752895,https://www.ssrn.com/abstract=2752895,2016.0,2017-06-28,,2017-06-28,2016-01-01,,2016-01-01,2022-04-05,,,2016-01-01,2016,2752895,2016-03-22,2016-03-22,2016-03-22,2016.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5634768,crossref::10.2139/ssrn.3474720,SSRN,crossref,10.2139/ssrn.3474720,https://doi.org/10.2139/ssrn.3474720,https://www.ssrn.com/abstract=3474720,2019.0,2019-11-06,,2025-12-31,2019-01-01,,2019-01-01,2025-12-31,,,2019-01-01,2019,3474720,2019-11-04,2019-11-04,2019-11-04,2019.0
5634775,crossref::10.2139/ssrn.3221252,SSRN,crossref,10.2139/ssrn.3221252,https://doi.org/10.2139/ssrn.3221252,https://www.ssrn.com/abstract=3221252,2018.0,2018-08-14,,2025-12-31,2018-01-01,,2018-01-01,2025-12-31,,,2018-01-01,2018,3221252,2018-08-14,2018-08-14,2018-08-14,2018.0
5634914,crossref::10.2139/ssrn.3362066,SSRN,crossref,10.2139/ssrn.3362066,https://doi.org/10.2139/ssrn.3362066,https://www.ssrn.com/abstract=3362066,2019.0,2019-05-01,,2025-12-31,2019-01-01,,2019-01-01,2025-12-31,,,2019-01-01,2019,3362066,2019-04-30,2019-04-30,2019-04-30,2019.0
5634967,crossref::10.2139/ssrn.4668027,SSRN,crossref,10.2139/ssrn.4668027,https://doi.org/10.2139/ssrn.4668027,https://www.ssrn.com/abstract=4668027,2023.0,2023-12-19,,2025-12-31,2023-01-01,2023-01-01,2023-01-01,2025-12-31,,,2023-01-01,2023,4668027,2023-12-19,2023-12-19,2023-12-19,2023.0


In [28]:
# Number of URL-matched rows per server.
date_posted_ssrn_url['server_name'].value_counts()

server_name
SSRN    493004
Name: count, dtype: int64

In [29]:
# Non-null count per column among the URL-matched rows.
date_posted_ssrn_url.count()

record_id                      493004
server_name                    493004
backend                        493004
doi                            493004
doi_url                        493004
landing_page_url               493004
publication_year               493004
date_created                   493004
date_posted                    125366
date_deposited                 493004
date_published                 493004
date_published_online           45299
date_issued                    493004
date_indexed                   493004
date_updated                        0
date_registered                     0
date_first_seen                493004
publication_year_first_seen    493004
ssrn_id                        493004
date_posted_ssrn_doi           493004
date_posted_ssrn_url           493004
date_posted_ssrn_id            493004
publication_year_ssrn          493004
dtype: int64

In [30]:
# Rows whose SSRN-id lookup produced an SSRN posting date.
date_posted_ssrn_id = data_date_first_seen.dropna(subset=["date_posted_ssrn_id"])
date_posted_ssrn_id

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,publication_year,date_created,date_posted,date_deposited,date_published,date_published_online,date_issued,date_indexed,date_updated,date_registered,date_first_seen,publication_year_first_seen,ssrn_id,date_posted_ssrn_doi,date_posted_ssrn_url,date_posted_ssrn_id,publication_year_ssrn
345081,openalex::W3124716296,Digital Access to Scholarship at Harvard (DASH...,openalex,10.2139/ssrn.2102794,https://doi.org/10.2139/ssrn.2102794,http://nrs.harvard.edu/urn-3:HUL.InstRepos:940...,2012.0,2025-10-10T00:00:00,,,2012-01-01,,,,2025-12-10T02:49:46.989445,,2012-01-01,2012,2102794,2012-07-10,NaT,2012-07-10,2012.0
457000,openalex::W3123789860,EconStor Preprints,openalex,10.2139/ssrn.2573596,https://doi.org/10.2139/ssrn.2573596,http://hdl.handle.net/10419/107595,2015.0,2025-10-10T00:00:00,,,2015-01-01,,,,2025-10-10T17:16:08.811792,,2015-01-01,2015,2573596,2015-03-05,NaT,2015-03-05,2015.0
479249,openalex::W3125554759,EconStor Preprints,openalex,10.2139/ssrn.3192474,https://doi.org/10.2139/ssrn.3192474,http://hdl.handle.net/10419/271218,2018.0,2021-02-01T00:00:00,,,2018-01-01,,,,2025-11-06T03:46:38.306776,,2018-01-01,2018,3192474,2018-06-27,NaT,2018-06-27,2018.0
480294,openalex::W2894453084,EconStor Preprints,openalex,10.2139/ssrn.2797203,https://doi.org/10.2139/ssrn.2797203,http://hdl.handle.net/10419/179123,2018.0,2018-10-05T00:00:00,,,2018-01-01,,,,2025-11-06T03:46:38.306776,,2018-01-01,2018,2797203,2016-06-19,NaT,2016-06-19,2016.0
480465,openalex::W3124251451,EconStor Preprints,openalex,10.2139/ssrn.3347766,https://doi.org/10.2139/ssrn.3347766,http://hdl.handle.net/10419/193667,2019.0,2021-02-01T00:00:00,,,2019-01-01,,,,2025-11-06T03:46:38.306776,,2019-01-01,2019,3347766,2019-04-01,NaT,2019-04-01,2019.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5634768,crossref::10.2139/ssrn.3474720,SSRN,crossref,10.2139/ssrn.3474720,https://doi.org/10.2139/ssrn.3474720,https://www.ssrn.com/abstract=3474720,2019.0,2019-11-06,,2025-12-31,2019-01-01,,2019-01-01,2025-12-31,,,2019-01-01,2019,3474720,2019-11-04,2019-11-04,2019-11-04,2019.0
5634775,crossref::10.2139/ssrn.3221252,SSRN,crossref,10.2139/ssrn.3221252,https://doi.org/10.2139/ssrn.3221252,https://www.ssrn.com/abstract=3221252,2018.0,2018-08-14,,2025-12-31,2018-01-01,,2018-01-01,2025-12-31,,,2018-01-01,2018,3221252,2018-08-14,2018-08-14,2018-08-14,2018.0
5634914,crossref::10.2139/ssrn.3362066,SSRN,crossref,10.2139/ssrn.3362066,https://doi.org/10.2139/ssrn.3362066,https://www.ssrn.com/abstract=3362066,2019.0,2019-05-01,,2025-12-31,2019-01-01,,2019-01-01,2025-12-31,,,2019-01-01,2019,3362066,2019-04-30,2019-04-30,2019-04-30,2019.0
5634967,crossref::10.2139/ssrn.4668027,SSRN,crossref,10.2139/ssrn.4668027,https://doi.org/10.2139/ssrn.4668027,https://www.ssrn.com/abstract=4668027,2023.0,2023-12-19,,2025-12-31,2023-01-01,2023-01-01,2023-01-01,2025-12-31,,,2023-01-01,2023,4668027,2023-12-19,2023-12-19,2023-12-19,2023.0


In [31]:
# Number of id-matched rows per server.
date_posted_ssrn_id['server_name'].value_counts()

server_name
SSRN                                                                    730023
RePEc: Research Papers in Economics                                         14
EconStor Preprints                                                          11
Digital Access to Scholarship at Harvard (DASH) (Harvard University)         1
HAL                                                                          1
Name: count, dtype: int64

In [32]:
# Non-null count per column among the id-matched rows.
date_posted_ssrn_id.count()

record_id                      730050
server_name                    730050
backend                        730050
doi                            730050
doi_url                        730050
landing_page_url               730050
publication_year               730050
date_created                   730050
date_posted                    125366
date_deposited                 730023
date_published                 730050
date_published_online           45303
date_issued                    730023
date_indexed                   730023
date_updated                       27
date_registered                     0
date_first_seen                730050
publication_year_first_seen    730050
ssrn_id                        730050
date_posted_ssrn_doi           730050
date_posted_ssrn_url           493004
date_posted_ssrn_id            730050
publication_year_ssrn          730050
dtype: int64

In [33]:
# Rows that received no SSRN posting year from the DOI lookup.
date_posted_ssrn_yearna = data_date_first_seen.loc[
    lambda frame: frame["publication_year_ssrn"].isna()
]
date_posted_ssrn_yearna

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,publication_year,date_created,date_posted,date_deposited,date_published,date_published_online,date_issued,date_indexed,date_updated,date_registered,date_first_seen,publication_year_first_seen,ssrn_id,date_posted_ssrn_doi,date_posted_ssrn_url,date_posted_ssrn_id,publication_year_ssrn
0,crossref::10.21467/preprints.48,AIJR Preprints,crossref,10.21467/preprints.48,https://doi.org/10.21467/preprints.48,https://preprints.aijr.org/index.php/ap/prepri...,2020.0,2020-09-15,2020-05-03,2020-09-15,2020-05-03,,2020-05-03,2025-05-14,,,2020-05-03,2020,,NaT,NaT,NaT,
1,crossref::10.21467/preprints.43,AIJR Preprints,crossref,10.21467/preprints.43,https://doi.org/10.21467/preprints.43,https://preprints.aijr.org/index.php/ap/prepri...,2020.0,2020-09-15,2020-04-25,2020-09-15,2020-04-25,,2020-04-25,2025-05-14,,,2020-04-25,2020,,NaT,NaT,NaT,
2,crossref::10.21467/preprints.39,AIJR Preprints,crossref,10.21467/preprints.39,https://doi.org/10.21467/preprints.39,https://preprints.aijr.org/index.php/ap/prepri...,2020.0,2020-09-15,2020-04-16,2020-09-15,2020-04-16,,2020-04-16,2025-05-14,,,2020-04-16,2020,,NaT,NaT,NaT,
3,crossref::10.21467/preprints.38,AIJR Preprints,crossref,10.21467/preprints.38,https://doi.org/10.21467/preprints.38,https://preprints.aijr.org/index.php/ap/prepri...,2020.0,2020-09-17,2020-04-15,2020-09-17,2020-04-15,,2020-04-15,2022-12-13,,,2020-04-15,2020,,NaT,NaT,NaT,
4,crossref::10.21467/preprints.36,AIJR Preprints,crossref,10.21467/preprints.36,https://doi.org/10.21467/preprints.36,https://preprints.aijr.org/index.php/ap/prepri...,2020.0,2020-09-17,2020-04-15,2020-09-17,2020-04-15,,2020-04-15,2024-08-11,,,2020-04-15,2020,,NaT,NaT,NaT,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9486888,openalex::W999325625,viXra,openalex,,,https://vixra.org/pdf/1409.0090v1.pdf,2014.0,2025-10-10T00:00:00,,,2014-09-01,,,,2025-10-10T17:16:08.811792,,2014-09-01,2014,,NaT,NaT,NaT,
9486889,openalex::W999460032,viXra,openalex,,,https://vixra.org/abs/1112.0094,2011.0,2025-10-10T00:00:00,,,2011-12-01,,,,2025-10-10T17:16:08.811792,,2011-12-01,2011,,NaT,NaT,NaT,
9486890,openalex::W99967155,viXra,openalex,,,https://vixra.org/pdf/1406.0019v1.pdf,2014.0,2025-10-10T00:00:00,,,2014-06-01,,,,2025-10-10T17:16:08.811792,,2014-06-01,2014,,NaT,NaT,NaT,
9486891,openalex::W999790414,viXra,openalex,,,https://vixra.org/pdf/1306.0105v3.pdf,2013.0,2025-10-10T00:00:00,,,2013-06-01,,,,2025-10-10T17:16:08.811792,,2013-06-01,2013,,NaT,NaT,NaT,


In [34]:
# First-seen year distribution of the rows without an SSRN posting year.
date_posted_ssrn_yearna['publication_year_first_seen'].value_counts()

publication_year_first_seen
2022    2502348
2025    1025264
2024     794115
2023     571006
2020     407863
2019     372728
2021     323097
2018     198472
2017     173482
2016     139162
2015     119492
2014     108089
2013      96149
2012      88233
2011      81623
2010      76335
2009      70222
2008      64745
2007      60657
2006      57184
2005      51157
2004      40543
2003      34959
2002      28878
2001      24386
2000      23296
1999      20632
1998      19476
1997      17780
1996      16839
1995      15773
1994      14592
1993      12574
1992      12305
1991      10462
1990       9534
1987          4
1984          3
1981          3
1971          2
1983          2
1976          2
1978          2
1967          1
1982          1
1972          1
1961          1
1969          1
1988          1
Name: count, dtype: Int64

### SSRN records in the main data with no match in the SSRN file

In [35]:
# Of the rows without an SSRN posting year, keep only those served by SSRN.
date_posted_ssrn_yearna_ssrn = date_posted_ssrn_yearna.loc[
    lambda frame: frame["server_name"].eq("SSRN")
]
date_posted_ssrn_yearna_ssrn

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,publication_year,date_created,date_posted,date_deposited,date_published,date_published_online,date_issued,date_indexed,date_updated,date_registered,date_first_seen,publication_year_first_seen,ssrn_id,date_posted_ssrn_doi,date_posted_ssrn_url,date_posted_ssrn_id,publication_year_ssrn
4376025,crossref::10.2139/ssrn.681174,SSRN,crossref,10.2139/ssrn.681174,https://doi.org/10.2139/ssrn.681174,http://www.ssrn.com/abstract=681174,2005.0,2005-07-07,,2007-02-13,2005-01-01,,2005-01-01,2022-04-02,,,2005-01-01,2005,681174,NaT,NaT,NaT,
4376026,crossref::10.2139/ssrn.480691,SSRN,crossref,10.2139/ssrn.480691,https://doi.org/10.2139/ssrn.480691,http://www.ssrn.com/abstract=480691,2003.0,2005-07-07,,2007-02-13,2003-01-01,,2003-01-01,2022-04-06,,,2003-01-01,2003,480691,NaT,NaT,NaT,
4376027,crossref::10.2139/ssrn.161192,SSRN,crossref,10.2139/ssrn.161192,https://doi.org/10.2139/ssrn.161192,http://www.ssrn.com/abstract=161192,1999.0,2005-07-07,,2007-02-13,1999-01-01,,1999-01-01,2022-04-06,,,1999-01-01,1999,161192,NaT,NaT,NaT,
4376029,crossref::10.2139/ssrn.686132,SSRN,crossref,10.2139/ssrn.686132,https://doi.org/10.2139/ssrn.686132,http://www.ssrn.com/abstract=686132,2005.0,2005-07-07,,2007-02-13,2005-01-01,,2005-01-01,2022-04-06,,,2005-01-01,2005,686132,NaT,NaT,NaT,
4376031,crossref::10.2139/ssrn.141356,SSRN,crossref,10.2139/ssrn.141356,https://doi.org/10.2139/ssrn.141356,http://www.ssrn.com/abstract=141356,1998.0,2005-08-19,,2007-02-13,1998-01-01,,1998-01-01,2022-04-05,,,1998-01-01,1998,141356,NaT,NaT,NaT,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5634975,crossref::10.2139/ssrn.5996462,SSRN,crossref,10.2139/ssrn.5996462,https://doi.org/10.2139/ssrn.5996462,https://www.ssrn.com/abstract=5996462,2025.0,2025-12-31,2025-01-01,2025-12-31,2025-01-01,,2025-01-01,2025-12-31,,,2025-01-01,2025,5996462,NaT,NaT,NaT,
5634976,crossref::10.2139/ssrn.5996464,SSRN,crossref,10.2139/ssrn.5996464,https://doi.org/10.2139/ssrn.5996464,https://www.ssrn.com/abstract=5996464,2025.0,2025-12-31,2025-01-01,2025-12-31,2025-01-01,,2025-01-01,2025-12-31,,,2025-01-01,2025,5996464,NaT,NaT,NaT,
5634977,crossref::10.2139/ssrn.5996465,SSRN,crossref,10.2139/ssrn.5996465,https://doi.org/10.2139/ssrn.5996465,https://www.ssrn.com/abstract=5996465,2025.0,2025-12-31,2025-01-01,2025-12-31,2025-01-01,,2025-01-01,2025-12-31,,,2025-01-01,2025,5996465,NaT,NaT,NaT,
5634978,crossref::10.2139/ssrn.5996467,SSRN,crossref,10.2139/ssrn.5996467,https://doi.org/10.2139/ssrn.5996467,https://www.ssrn.com/abstract=5996467,2025.0,2025-12-31,2025-01-01,2025-12-31,2025-01-01,,2025-01-01,2025-12-31,,,2025-01-01,2025,5996467,NaT,NaT,NaT,


In [36]:
# Non-null count per column among the unmatched SSRN rows.
date_posted_ssrn_yearna_ssrn.count()

record_id                      528935
server_name                    528935
backend                        528935
doi                            528935
doi_url                        528935
landing_page_url               528935
publication_year               528935
date_created                   528935
date_posted                    390025
date_deposited                 528935
date_published                 528935
date_published_online           71973
date_issued                    528935
date_indexed                   528935
date_updated                        0
date_registered                     0
date_first_seen                528935
publication_year_first_seen    528935
ssrn_id                        528935
date_posted_ssrn_doi                0
date_posted_ssrn_url                0
date_posted_ssrn_id                 0
publication_year_ssrn               0
dtype: int64

In [37]:
# First-seen year distribution of the unmatched SSRN rows.
date_posted_ssrn_yearna_ssrn['publication_year_first_seen'].value_counts()

publication_year_first_seen
2025    260498
2024    185286
2023     15434
2022     15030
2012      5847
2013      4786
2021      4659
2011      4496
2010      3472
2020      3226
2014      2963
2009      2791
2015      2704
2019      2589
2017      2462
2018      2391
2016      2360
2008      2189
2007      1606
2006      1155
2005      1035
2004       522
2003       493
2002       286
2001       199
2000       188
1998        75
1999        70
1997        29
1996        24
1991        11
1995        10
1992         9
1993         9
1994         5
1987         4
1990         4
1983         2
1981         2
1984         2
1971         2
1978         2
1976         2
1982         1
1967         1
1961         1
1972         1
1969         1
1988         1
Name: count, dtype: Int64

In [38]:
# The 60 most frequent `date_created` values among the unmatched SSRN rows.
date_posted_ssrn_yearna_ssrn['date_created'].value_counts().head(60)

date_created
2011-12-28    8357
2012-01-05    5608
2025-05-06    3244
2025-08-12    2666
2025-06-13    2629
2025-05-07    2609
2024-08-19    2401
2025-06-20    2218
2025-02-24    2100
2025-06-19    1928
2025-07-01    1909
2025-12-12    1807
2025-07-24    1780
2025-09-17    1745
2025-10-29    1677
2025-11-17    1632
2025-10-08    1626
2024-12-13    1549
2025-09-08    1518
2024-09-18    1470
2025-05-28    1452
2025-05-14    1439
2025-01-08    1427
2025-02-25    1425
2025-05-01    1400
2025-05-19    1391
2025-06-23    1372
2025-06-03    1348
2025-06-16    1340
2024-08-22    1329
2025-09-11    1320
2024-12-17    1287
2025-06-24    1287
2025-06-12    1279
2025-12-17    1277
2025-12-23    1276
2025-08-28    1273
2025-03-03    1266
2025-12-18    1260
2025-12-03    1260
2025-07-30    1257
2025-10-16    1254
2024-05-07    1254
2025-11-18    1232
2025-12-22    1220
2025-02-04    1219
2025-01-14    1217
2024-10-07    1208
2024-05-30    1207
2025-12-10    1201
2024-06-04    1189
2025-10-22    1185

## Replace first seen date/year for SSRN records using SSRN posted date

In [39]:
import pandas as pd

def replace_ssrn_first_seen_dates(
    df,
    server_col="server_name",
    server_value="SSRN",
    date_first_seen_col="date_first_seen",
    year_first_seen_col="publication_year_first_seen",
    ssrn_date_col="date_posted_ssrn_doi",
    ssrn_year_col="publication_year_ssrn",
):
    """
    Override the first-seen date/year of SSRN rows with SSRN's own posted date.

    The function works on a copy; ``df`` itself is left untouched. In the
    returned frame the two date columns are coerced with ``pd.to_datetime``
    and the two year columns with ``pd.to_numeric`` (both using
    ``errors="coerce"``, so unparsable values become NaT/NaN). This coercion
    applies to every row, not only to SSRN rows.

    For rows whose ``server_col`` matches ``server_value`` (case-insensitive):

    * the first-seen date is replaced by the SSRN date when that date is valid;
    * the first-seen year is replaced by the SSRN year when that year is valid;
      if the SSRN year is missing but the SSRN date is valid, the year is taken
      from the SSRN date so that date and year never disagree;
    * rows with neither a valid SSRN date nor a valid SSRN year keep their
      existing values (the "safe fallback").

    Rows of other servers are never overridden.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table containing all columns named by the arguments below.
    server_col : str
        Column holding the server name.
    server_value : str
        Server name whose rows are overridden (compared case-insensitively).
    date_first_seen_col, year_first_seen_col : str
        Columns to update.
    ssrn_date_col, ssrn_year_col : str
        Columns providing the replacement date and year.

    Returns
    -------
    pandas.DataFrame
        A modified copy of ``df``.

    Raises
    ------
    KeyError
        If any of the named columns is absent from ``df``.
    """
    needed = [
        server_col,
        date_first_seen_col,
        year_first_seen_col,
        ssrn_date_col,
        ssrn_year_col,
    ]
    absent = [col for col in needed if col not in df.columns]
    if absent:
        raise KeyError(f"Missing required column(s): {absent}")

    out = df.copy()

    # Ensure datetime
    out[date_first_seen_col] = pd.to_datetime(out[date_first_seen_col], errors="coerce")
    out[ssrn_date_col] = pd.to_datetime(out[ssrn_date_col], errors="coerce")

    # Ensure numeric year
    out[year_first_seen_col] = pd.to_numeric(out[year_first_seen_col], errors="coerce")
    out[ssrn_year_col] = pd.to_numeric(out[ssrn_year_col], errors="coerce")

    # SSRN rows; missing server names stringify to "nan" and never match
    is_ssrn = out[server_col].astype(str).str.upper() == server_value.upper()

    # SSRN rows that actually carry a usable replacement value
    has_date = is_ssrn & out[ssrn_date_col].notna()
    has_year = is_ssrn & out[ssrn_year_col].notna()

    # Replace date
    out.loc[has_date, date_first_seen_col] = out.loc[has_date, ssrn_date_col]

    # Replace year. First derive it from the replacement date, so a row whose
    # date was overridden never keeps a stale year ...
    out.loc[has_date, year_first_seen_col] = out.loc[has_date, ssrn_date_col].dt.year
    # ... then let an explicit SSRN year take precedence wherever one exists.
    out.loc[has_year, year_first_seen_col] = out.loc[has_year, ssrn_year_col]

    return out

In [40]:
# Apply the SSRN override, then rebuild the year column from the (possibly
# updated) first-seen date for every row, so the year always matches the date.
data_date_first_seen = replace_ssrn_first_seen_dates(data_date_first_seen)
data_date_first_seen["publication_year_first_seen"] = pd.to_datetime(
    data_date_first_seen["date_first_seen"],
    errors="coerce",
).dt.year

In [41]:
# Spot-check: first 20 SSRN rows, first-seen date next to the SSRN posted date.
(
    data_date_first_seen
    .loc[
        lambda frame: frame["server_name"] == "SSRN",
        ["date_first_seen", "date_posted_ssrn_doi"],
    ]
    .head(20)
)

Unnamed: 0,date_first_seen,date_posted_ssrn_doi
4376023,1998-09-25,1998-09-25
4376024,1997-12-22,1997-12-22
4376025,2005-01-01,NaT
4376026,2003-01-01,NaT
4376027,1999-01-01,NaT
4376028,1998-09-04,1998-09-04
4376029,2005-01-01,NaT
4376030,1999-11-10,1999-11-10
4376031,1998-01-01,NaT
4376032,1999-07-23,1999-07-23


In [42]:
# Number of SSRN rows that have a non-missing SSRN posted date.
data_date_first_seen.loc[
    data_date_first_seen["server_name"] == "SSRN",
    "date_posted_ssrn_doi",
].notna().sum()

np.int64(730023)

# Save the first-seen dates (CSV and pickle)

In [43]:
# Persist the slim first-seen table twice: as CSV (without the index) and as a
# pickle, which keeps the pandas dtypes.
# NOTE(review): only load the pickle back from a trusted location.
first_seen_export_cols = ['record_id', 'server_name', 'date_first_seen', 'publication_year_first_seen']
first_seen_out_dir = Path("outputs")

# to_csv / to_pickle raise if the target folder is missing (e.g. on a fresh
# checkout), so make sure it exists before writing.
first_seen_out_dir.mkdir(parents=True, exist_ok=True)

# Select the export columns once and reuse the selection for both writers.
first_seen_export = data_date_first_seen[first_seen_export_cols]
first_seen_export.to_csv(first_seen_out_dir / "date_first_seen.csv", index=False)
first_seen_export.to_pickle(first_seen_out_dir / "date_first_seen.pkl")

In [44]:
# Show the identifier together with its first-seen date and year.
data_date_first_seen.loc[
    :,
    ['record_id', 'date_first_seen', 'publication_year_first_seen'],
]

Unnamed: 0,record_id,date_first_seen,publication_year_first_seen
0,crossref::10.21467/preprints.48,2020-05-03,2020
1,crossref::10.21467/preprints.43,2020-04-25,2020
2,crossref::10.21467/preprints.39,2020-04-16,2020
3,crossref::10.21467/preprints.38,2020-04-15,2020
4,crossref::10.21467/preprints.36,2020-04-15,2020
...,...,...,...
9486888,openalex::W999325625,2014-09-01,2014
9486889,openalex::W999460032,2011-12-01,2011
9486890,openalex::W99967155,2014-06-01,2014
9486891,openalex::W999790414,2013-06-01,2013


In [45]:
# Rows per first-seen year, listed chronologically (earliest year first).
(
    data_date_first_seen['publication_year_first_seen']
    .value_counts()
    .reset_index()
    .sort_values(by='publication_year_first_seen')
)

Unnamed: 0,publication_year_first_seen,count
46,1961,1
43,1967,1
47,1969,1
39,1971,2
45,1972,1
41,1976,2
42,1978,2
38,1981,3
44,1982,1
40,1983,2


In [46]:
# Full table ordered by first-seen year (ascending) to inspect both extremes.
data_date_first_seen.sort_values(
    by='publication_year_first_seen',
    ascending=True,
)

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,publication_year,date_created,date_posted,date_deposited,date_published,date_published_online,date_issued,date_indexed,date_updated,date_registered,date_first_seen,publication_year_first_seen,ssrn_id,date_posted_ssrn_doi,date_posted_ssrn_url,date_posted_ssrn_id,publication_year_ssrn
4401398,crossref::10.2139/ssrn.1681218,SSRN,crossref,10.2139/ssrn.1681218,https://doi.org/10.2139/ssrn.1681218,http://www.ssrn.com/abstract=1681218,1961.0,2013-09-04,,2013-09-04,1961-01-01,,1961-01-01,2022-03-31,,,1961-01-01,1961,1681218,NaT,NaT,NaT,
4389769,crossref::10.2139/ssrn.1678783,SSRN,crossref,10.2139/ssrn.1678783,https://doi.org/10.2139/ssrn.1678783,http://www.ssrn.com/abstract=1678783,1967.0,2012-01-05,,2012-07-06,1967-01-01,,1967-01-01,2022-04-05,,,1967-01-01,1967,1678783,NaT,NaT,NaT,
4401918,crossref::10.2139/ssrn.2338938,SSRN,crossref,10.2139/ssrn.2338938,https://doi.org/10.2139/ssrn.2338938,http://www.ssrn.com/abstract=2338938,1969.0,2013-10-16,,2013-10-16,1969-01-01,,1969-01-01,2022-04-03,,,1969-01-01,1969,2338938,NaT,NaT,NaT,
4401414,crossref::10.2139/ssrn.1774822,SSRN,crossref,10.2139/ssrn.1774822,https://doi.org/10.2139/ssrn.1774822,http://www.ssrn.com/abstract=1774822,1971.0,2013-09-04,,2013-09-04,1971-01-01,,1971-01-01,2022-03-31,,,1971-01-01,1971,1774822,NaT,NaT,NaT,
4390827,crossref::10.2139/ssrn.1773179,SSRN,crossref,10.2139/ssrn.1773179,https://doi.org/10.2139/ssrn.1773179,http://www.ssrn.com/abstract=1773179,1971.0,2012-01-05,,2012-07-06,1971-01-01,,1971-01-01,2022-04-04,,,1971-01-01,1971,1773179,NaT,NaT,NaT,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5895279,datacite::10.5281/zenodo.18112001,Zenodo,datacite,10.5281/zenodo.18112001,https://doi.org/10.5281/zenodo.18112001,https://zenodo.org/doi/10.5281/zenodo.18112001,2025.0,2025-12-31,,,,,,,2025-12-31,2025-12-31,2025-12-31,2025,,NaT,NaT,NaT,
5895304,datacite::10.5281/zenodo.18112126,Zenodo,datacite,10.5281/zenodo.18112126,https://doi.org/10.5281/zenodo.18112126,https://zenodo.org/doi/10.5281/zenodo.18112126,2025.0,2025-12-31,,,,,,,2025-12-31,2025-12-31,2025-12-31,2025,,NaT,NaT,NaT,
5895305,datacite::10.5281/zenodo.18112114,Zenodo,datacite,10.5281/zenodo.18112114,https://doi.org/10.5281/zenodo.18112114,https://zenodo.org/doi/10.5281/zenodo.18112114,2026.0,2025-12-31,,,,,,,2026-01-01,2025-12-31,2025-12-31,2025,,NaT,NaT,NaT,
5895306,datacite::10.5281/zenodo.18112115,Zenodo,datacite,10.5281/zenodo.18112115,https://doi.org/10.5281/zenodo.18112115,https://zenodo.org/doi/10.5281/zenodo.18112115,2026.0,2025-12-31,,,,,,,2026-01-01,2025-12-31,2025-12-31,2025,,NaT,NaT,NaT,
