In [1]:
import pandas as pd
# Show every column when a DataFrame is rendered (None removes the column cap).
pd.options.display.max_columns = None


# Functions to update the Google Sheet file

In [2]:
from __future__ import annotations

# =========================
# 0) Imports
# =========================
import numpy as np
import pandas as pd
import duckdb
from typing import Iterable, Dict, Any, Optional, List, Tuple

import gspread
from google.oauth2.service_account import Credentials


# =========================
# 1) DuckDB metrics builder
# =========================
def _quote_ident(name: str) -> str:
    """Return ``name`` as a double-quoted SQL identifier, doubling embedded quotes."""
    return '"' + str(name).replace('"', '""') + '"'


def _sheet_metrics_sql(
    *,
    server: str,
    backend: str,
    group_id: str,
    record_id: str,
    date: str,
    year: str,
    has_published: str,
    hierarchy: str,
    order_dir: str,
    by_backend: bool,
) -> str:
    """Render the metrics query, grouped by server or by (server, backend).

    All column arguments must already be quoted identifiers. When
    ``by_backend`` is true, rows with a NULL/blank backend are also dropped
    and every aggregate is keyed on (server_name, backend).
    """
    keys = ("server_name", "backend") if by_backend else ("server_name",)
    key_list = ", ".join(keys)
    a_keys = ", ".join(f"a.{k}" for k in keys)
    join_on = " AND ".join(f"a.{k} = y.{k}" for k in keys)

    backend_filter = ""
    if by_backend:
        backend_filter = (
            f"\n        AND {backend} IS NOT NULL"
            f"\n        AND TRIM(CAST({backend} AS VARCHAR)) <> ''"
        )

    return f"""
    WITH base AS (
      SELECT
        CAST({server} AS VARCHAR) AS server_name,
        CAST({backend} AS VARCHAR) AS backend,
        CAST({group_id} AS VARCHAR) AS dup_group_id,
        CAST({record_id} AS VARCHAR) AS record_id,
        TRY_CAST({date} AS TIMESTAMP) AS date_first_seen,
        TRY_CAST({year} AS INTEGER) AS year_first_seen,
        CASE
          WHEN {has_published} IS NULL THEN FALSE
          WHEN CAST({has_published} AS VARCHAR) IN ('true','True','1') THEN TRUE
          WHEN CAST({has_published} AS VARCHAR) IN ('false','False','0') THEN FALSE
          ELSE COALESCE(TRY_CAST({has_published} AS BOOLEAN), FALSE)
        END AS has_published_version,
        LOWER(TRIM(COALESCE(CAST({hierarchy} AS VARCHAR), ''))) AS records_hierarchy
      FROM df
      WHERE {server} IS NOT NULL
        AND {group_id} IS NOT NULL
        AND TRIM(CAST({server} AS VARCHAR)) <> ''
        AND TRIM(CAST({group_id} AS VARCHAR)) <> ''{backend_filter}
    ),
    clusters AS (
      SELECT
        dup_group_id,
        COUNT(*) AS cluster_size,
        BOOL_OR(has_published_version) AS cluster_has_published
      FROM base
      GROUP BY dup_group_id
    ),
    parents AS (
      -- exactly one representative row per duplicate group
      SELECT
        b.dup_group_id,
        b.year_first_seen AS parent_year_first_seen
      FROM base b
      QUALIFY ROW_NUMBER() OVER (
        PARTITION BY b.dup_group_id
        ORDER BY
          CASE WHEN b.records_hierarchy = 'parent' THEN 0 ELSE 1 END,
          b.date_first_seen {order_dir} NULLS LAST,
          b.record_id {order_dir}
      ) = 1
    ),
    joined AS (
      SELECT
        b.server_name,
        b.backend,
        b.dup_group_id,
        c.cluster_size,
        c.cluster_has_published,
        p.parent_year_first_seen
      FROM base b
      JOIN clusters c USING (dup_group_id)
      JOIN parents  p USING (dup_group_id)
    ),
    agg AS (
      SELECT
        {key_list},
        COUNT(*) AS n_records,
        COUNT(DISTINCT dup_group_id) AS n_unique,
        COUNT(DISTINCT CASE WHEN cluster_has_published THEN dup_group_id END) AS n_published,
        COUNT(DISTINCT CASE WHEN cluster_size > 1 THEN dup_group_id END) AS n_is_version_of
      FROM joined
      GROUP BY {key_list}
    ),
    yearly AS (
      SELECT
        {key_list},
        SUM(CASE WHEN parent_year_first_seen = 2024 THEN 1 ELSE 0 END) AS count_2024,
        SUM(CASE WHEN parent_year_first_seen = 2025 THEN 1 ELSE 0 END) AS count_2025
      FROM (
        SELECT DISTINCT {key_list}, dup_group_id, parent_year_first_seen
        FROM joined
      )
      GROUP BY {key_list}
    )
    SELECT
      {a_keys},
      a.n_records,
      a.n_unique,
      a.n_published,
      CASE WHEN a.n_unique > 0 THEN (a.n_published::DOUBLE / a.n_unique) * 100 ELSE NULL END AS pct_published,
      a.n_is_version_of,
      CASE WHEN a.n_unique > 0 THEN (a.n_is_version_of::DOUBLE / a.n_unique) * 100 ELSE NULL END AS pct_has_versions,
      COALESCE(y.count_2024, 0) AS count_2024,
      COALESCE(y.count_2025, 0) AS count_2025
    FROM agg a
    LEFT JOIN yearly y
      ON {join_on}
    ORDER BY {a_keys};
    """


def build_sheet_metrics_duckdb(
    df: pd.DataFrame,
    *,
    server_col: str = "server_name",
    backend_col: str = "backend",
    group_id_col: str = "dup_group_id",
    date_col: str = "date_first_seen",
    year_col: str = "publication_year_first_seen",
    has_published_col: str = "has_published_version",
    hierarchy_col: str = "records_hierarchy",
    record_id_col: str = "record_id",
    choose_parent: str = "oldest",   # "oldest" | "newest"
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Compute per-server and per-(server, backend) preprint metrics with DuckDB.

    Rows whose server or duplicate-group id is NULL/blank are ignored; the
    backend-level query additionally ignores rows with a NULL/blank backend.
    Cluster size and the "has a published version" flag are computed per
    duplicate group over all remaining rows, regardless of server.

    Each duplicate group gets one representative ("parent") row: a row whose
    hierarchy value is 'parent' is preferred, then the earliest (or latest,
    per ``choose_parent``) ``date_col``, then ``record_id_col`` as tie-break.
    The parent's year drives ``count_2024`` / ``count_2025``.

    Args:
      df: long dataframe with one row per record.
      server_col, backend_col, group_id_col, date_col, year_col,
      has_published_col, hierarchy_col, record_id_col: names of the input
        columns; they are quoted before being placed in the SQL text.
      choose_parent: "oldest" or "newest" - date direction used when picking
        the representative row of a group.

    Returns:
      overall_df:  one row per server_name (ALL backends combined)
      backend_df:  one row per (server_name, backend)

    Output columns (overall_df & backend_df):
      server_name, [backend], n_records, n_unique, n_published, pct_published,
      n_is_version_of, pct_has_versions, count_2024, count_2025

    Raises:
      ValueError: if ``choose_parent`` is not "oldest" or "newest".
    """
    if choose_parent not in ("oldest", "newest"):
        raise ValueError("choose_parent must be 'oldest' or 'newest'")

    order_dir = "ASC" if choose_parent == "oldest" else "DESC"

    # Identifiers cannot be bound as query parameters, so quote them instead.
    quoted = dict(
        server=_quote_ident(server_col),
        backend=_quote_ident(backend_col),
        group_id=_quote_ident(group_id_col),
        record_id=_quote_ident(record_id_col),
        date=_quote_ident(date_col),
        year=_quote_ident(year_col),
        has_published=_quote_ident(has_published_col),
        hierarchy=_quote_ident(hierarchy_col),
    )

    con = duckdb.connect(database=":memory:")
    try:
        con.register("df", df)
        overall_df = con.sql(
            _sheet_metrics_sql(**quoted, order_dir=order_dir, by_backend=False)
        ).df()
        backend_df = con.sql(
            _sheet_metrics_sql(**quoted, order_dir=order_dir, by_backend=True)
        ).df()
    finally:
        # Release the in-memory database even when a query fails.
        con.close()

    return overall_df, backend_df


# ============================================
# 2) Read sheet -> DataFrame (gspread)
# ============================================
def read_sheet_as_df(
    *,
    sheet_id: str,
    worksheet_gid: int,
    service_account_json_path: str,
) -> pd.DataFrame:
    """
    Load one worksheet of a Google Sheet into a DataFrame.

    The first row of the worksheet becomes the column header and the remaining
    rows become the data. An empty worksheet yields an empty DataFrame.

    Args:
      sheet_id: spreadsheet key passed to ``open_by_key``.
      worksheet_gid: id of the tab to read; an int or its string form is accepted.
      service_account_json_path: path to the service-account credentials file.

    Raises:
      ValueError: if no worksheet with the given gid exists.
    """
    scopes = ["https://www.googleapis.com/auth/spreadsheets"]
    creds = Credentials.from_service_account_file(service_account_json_path, scopes=scopes)
    gc = gspread.authorize(creds)
    sh = gc.open_by_key(sheet_id)

    # Compare as text so a gid given as "123" still matches a worksheet id of 123.
    wanted_gid = str(worksheet_gid).strip()
    ws = next((w for w in sh.worksheets() if str(w.id) == wanted_gid), None)
    if ws is None:
        raise ValueError(f"Worksheet gid={worksheet_gid} not found.")

    values = ws.get_all_values()
    if not values:
        return pd.DataFrame()

    header, *rows = values
    return pd.DataFrame(rows, columns=header)


# ============================================
# 3) Build payload_df (only columns to update)
# ============================================
def build_update_payload(
    sheet_df: pd.DataFrame,
    overall_df: pd.DataFrame,
    backend_df: pd.DataFrame,
    target_cols: List[str],
    *,
    key_col: str = "source_id",
    default_collection_date: str = "2025/12/31",
    per_server_date: Optional[Dict[str, str]] = None,
) -> pd.DataFrame:
    """
    Assemble the values to write back to the sheet, one row per sheet row.

    The result holds ``key_col`` followed by every column in ``target_cols``
    (names must match the sheet header exactly, trailing spaces included).
    Metric columns are looked up by matching the sheet key against
    ``server_name`` in ``overall_df`` (global block) and in ``backend_df``
    filtered to crossref / datacite / openalex (backend blocks). Target
    columns this function does not know, and lookups with no match, are left
    as empty strings.

    Args:
      sheet_df: current sheet contents; must contain ``key_col``.
      overall_df: per-server metrics, indexed here by ``server_name``.
      backend_df: per-(server, backend) metrics; backend names are compared
        lower-cased and stripped.
      target_cols: sheet columns to populate.
      key_col: sheet column holding the server key.
      default_collection_date: value for all collection-date columns.
      per_server_date: optional key -> date overrides for the collection-date
        columns; keys not listed fall back to ``default_collection_date``.

    Raises:
      ValueError: if ``key_col`` is not a column of ``sheet_df``.
    """
    if key_col not in sheet_df.columns:
        raise ValueError(f"Sheet dataframe must contain '{key_col}'.")

    out = pd.DataFrame()
    out[key_col] = sheet_df[key_col].astype(str)

    # Pre-create every requested column as blank so unknown names still exist.
    for blank_col in target_cols:
        out[blank_col] = ""

    overall_by_server = overall_df.set_index("server_name")
    sid = out[key_col].astype(str)

    def assign(column: str, values) -> None:
        # Only columns the caller asked for are ever filled.
        if column in out.columns:
            out[column] = values

    # ---- Global block (red columns); note the trailing spaces in two headers
    assign("Collection Date used for Red Columns", default_collection_date)
    global_layout = (
        ("number of unique preprints", "n_unique"),
        ("number of unique preprints that are published", "n_published"),
        ("% of unique that are published ", "pct_published"),
        ("Number of unique preprints with more than 1 version", "n_is_version_of"),
        ("% of unique that have more than one version ", "pct_has_versions"),
    )
    for column, metric in global_layout:
        assign(column, sid.map(overall_by_server[metric]))

    # ---- Backend blocks: (backend key, presence flag, date column, metric columns)
    backend_layout = (
        (
            "crossref",
            "Is the Server in Crossref",
            "Crossref collection_date",
            (
                ("Number of works in Crossref", "n_records"),
                ("Number of unique preprints in Crossref", "n_unique"),
                ("Number of unique preprints that are published in Crossref", "n_published"),
                ("%_of unique that are published in Crossref", "pct_published"),
                ("Number of unique preprints with more than 1 version in Crossref", "n_is_version_of"),
                ("% of unique that have more than one version  in Crossref", "pct_has_versions"),
                ("Works_in_2024 in Crossref", "count_2024"),
                ("Works_in_2025 in Crossref", "count_2025"),
            ),
        ),
        (
            "datacite",
            "Is the server in DataCite",
            "DataCite Collection Date",
            (
                ("Number of works in DataCite", "n_records"),
                ("number_of_preprint_unique in DataCite", "n_unique"),
                ("number of unique preprints that are published in Datacite", "n_published"),
                ("%_of unique that are published in DataCite", "pct_published"),
                ("Number of unique preprints with more than 1 version in DataCite", "n_is_version_of"),
                ("% of unique that have more than one version  in DataCite", "pct_has_versions"),
                ("Works_in_2024 in DataCite", "count_2024"),
                ("Works_in_2025 in DataCite", "count_2025"),
            ),
        ),
        (
            "openalex",
            "Is the server in OpenAlex",
            "OpenAlex Data Collection Date",
            (
                ("Number of works in OpenAlex", "n_records"),
                ("number_of_preprint_unique in OpenAlex", "n_unique"),
                ("number of unique preprints that are published in OpenAlex", "n_published"),
                ("%_of unique that are published in OpenAlex", "pct_published"),
                ("Number of unique preprints with more than 1 version in OpenAlex", "n_is_version_of"),
                ("% of unique preprints with more than 1 version in OpenAlex", "pct_has_versions"),
                ("Works_in_2024 in OpenAlex", "count_2024"),
                ("Works_in_2025 in OpenAlex", "count_2025"),
            ),
        ),
    )

    normalized = backend_df.copy()
    normalized["backend"] = normalized["backend"].astype(str).str.lower().str.strip()
    normalized["server_name"] = normalized["server_name"].astype(str)

    for backend_key, flag_col, collection_col, metric_cols in backend_layout:
        per_server = normalized[normalized["backend"] == backend_key].set_index("server_name")
        assign(flag_col, sid.isin(per_server.index))
        assign(collection_col, default_collection_date)
        for column, metric in metric_cols:
            assign(column, sid.map(per_server[metric]))

    # ---- Optional per-server overrides, applied to the collection-date columns only
    if per_server_date:
        collection_date_cols = (
            "Collection Date used for Red Columns",
            "Crossref collection_date",
            "DataCite Collection Date",
            "OpenAlex Data Collection Date",
        )
        for collection_col in collection_date_cols:
            if collection_col in out.columns:
                out[collection_col] = [
                    per_server_date.get(server, default_collection_date)
                    for server in sid.tolist()
                ]

    # Unmatched lookups come back as NaN; the writer expects blanks instead.
    for column in target_cols:
        if column in out.columns:
            out[column] = out[column].replace({np.nan: ""})

    return out[[key_col] + list(target_cols)]


# ============================================
# 4) Update ONLY selected columns in the sheet
# ============================================
def update_google_sheet_only_columns(
    *,
    sheet_id: str,
    worksheet_gid: int,
    service_account_json_path: str,
    payload_df: pd.DataFrame,
    key_col: str,
    columns_to_update: Iterable[str],
    chunk_size: int = 500,
    overwrite: bool = True,
    verbose: bool = True,
) -> Dict[str, Any]:
    """
    Write selected columns of ``payload_df`` into a worksheet, matching rows by key.

    Sheet rows are located by the value in ``key_col`` (header assumed to be in
    row 1); payload rows whose key is not in the sheet are skipped and reported.
    Only the cells of ``columns_to_update`` are touched, in batches of
    ``chunk_size`` payload rows.

    Args:
      sheet_id: spreadsheet key passed to ``open_by_key``.
      worksheet_gid: id of the tab to update; an int or its string form is accepted.
      service_account_json_path: path to the service-account credentials file.
      payload_df: values to write; must contain ``key_col``. Columns missing
        from the payload are written as blanks.
      key_col: column used to match payload rows to sheet rows.
      columns_to_update: sheet header names to write; all must exist in the sheet.
      chunk_size: number of payload rows per ``update_cells`` call (>= 1).
      overwrite: when False, only cells that are currently blank are written.
      verbose: print one progress line per chunk.

    Returns:
      A report dict with the keys sheet_id, worksheet_gid, rows_payload,
      rows_matched, missing_keys (first 20), n_missing_keys, columns_updated,
      overwrite, cells_written and chunks.

    Raises:
      ValueError: if ``key_col`` is missing from the payload or the sheet
        header, if a requested column is not in the sheet header, if the
        worksheet is not found, or if ``chunk_size`` is not positive.
    """
    cols = list(columns_to_update)
    if key_col not in payload_df.columns:
        raise ValueError(f"payload_df must contain '{key_col}'.")
    # Fail before any network call: 0 breaks range() and a negative step writes nothing.
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer.")

    scopes = ["https://www.googleapis.com/auth/spreadsheets"]
    creds = Credentials.from_service_account_file(service_account_json_path, scopes=scopes)
    gc = gspread.authorize(creds)
    sh = gc.open_by_key(sheet_id)

    # Compare as text so a gid given as "123" still matches a worksheet id of 123.
    wanted_gid = str(worksheet_gid).strip()
    ws = next((w for w in sh.worksheets() if str(w.id) == wanted_gid), None)
    if ws is None:
        raise ValueError(f"Worksheet gid={worksheet_gid} not found.")

    header = ws.row_values(1)
    header_map = {h: (i + 1) for i, h in enumerate(header)}  # 1-based

    missing_cols = [c for c in cols if c not in header_map]
    if missing_cols:
        raise ValueError(f"These columns are not in the sheet header: {missing_cols}")

    if key_col not in header_map:
        raise ValueError(f"Sheet must contain '{key_col}' in header row.")

    key_col_idx = header_map[key_col]
    all_keys = ws.col_values(key_col_idx)
    # Data starts on sheet row 2; blank key cells are not addressable.
    key_to_row = {str(k): r for r, k in enumerate(all_keys[1:], start=2) if str(k).strip() != ""}

    p = payload_df.copy()
    p[key_col] = p[key_col].astype(str)

    in_sheet = p[key_col].isin(set(key_to_row))
    p_in = p[in_sheet].copy()
    missing_keys = p.loc[~in_sheet, key_col].unique().tolist()

    # Snapshot current cell values only when blanks-only mode needs them.
    existing = None
    if not overwrite:
        existing = {}
        for c in cols:
            cidx = header_map[c]
            vals = ws.col_values(cidx)
            for r, val in enumerate(vals[1:], start=2):
                existing[(r, cidx)] = val

    def norm(v):
        """Turn a payload value into the text sent to the sheet ('' for missing)."""
        if v is None or v is pd.NA or v is pd.NaT:
            return ""
        if isinstance(v, (float, np.floating)) and np.isnan(v):
            return ""
        # keep numbers readable
        if isinstance(v, np.integer):
            return str(int(v))
        if isinstance(v, np.floating):
            return str(float(v))
        return str(v)

    cells_written = 0
    chunks = 0

    for start in range(0, len(p_in), chunk_size):
        chunk = p_in.iloc[start:start + chunk_size]
        cell_list = []

        for _, row in chunk.iterrows():
            sheet_row = key_to_row.get(str(row[key_col]))
            if sheet_row is None:
                continue

            for c in cols:
                cidx = header_map[c]

                # In blanks-only mode leave any cell that already has content.
                if not overwrite and str(existing.get((sheet_row, cidx), "")).strip() != "":
                    continue

                cell_list.append(gspread.Cell(sheet_row, cidx, norm(row.get(c, ""))))

        if cell_list:
            ws.update_cells(cell_list, value_input_option="USER_ENTERED")
            cells_written += len(cell_list)

        chunks += 1
        if verbose:
            print(f"Chunk {chunks}: scanned {len(chunk)} rows, wrote {len(cell_list)} cells")

    return {
        "sheet_id": sheet_id,
        "worksheet_gid": worksheet_gid,
        "rows_payload": int(len(payload_df)),
        "rows_matched": int(len(p_in)),
        "missing_keys": missing_keys[:20],
        "n_missing_keys": int(len(missing_keys)),
        "columns_updated": cols,
        "overwrite": overwrite,
        "cells_written": int(cells_written),
        "chunks": int(chunks),
    }


# ============================================
# 5) RUN PIPELINE (edit these 4 things)
# ============================================

# SHEET_ID = "10_7FdcpZjntqFsEHIii7bAM72uF__of_iUohSD5w8w4"
# WORKSHEET_GID = 881157301

# # 1) put your service account json path here
# SERVICE_ACCOUNT_JSON = "/path/to/service_account.json"

# # 2) your target columns exactly as in the sheet (including trailing spaces)
# TARGET_COLS = [
#     'Collection Date used for Red Columns',
#     'number of unique preprints',
#     'number of unique preprints that are published',
#     '% of unique that are published ',
#     'Number of unique preprints with more than 1 version',
#     '% of unique that have more than one version ',
#     'Is the Server in Crossref',
#     'Crossref collection_date',
#     'Number of works in Crossref',
#     'Number of unique preprints in Crossref',
#     'Number of unique preprints that are published in Crossref',
#     '%_of unique that are published in Crossref',
#     'Number of unique preprints with more than 1 version in Crossref',
#     '% of unique that have more than one version  in Crossref',
#     'Works_in_2024 in Crossref',
#     'Works_in_2025 in Crossref',
#     'Is the server in DataCite',
#     'DataCite Collection Date',
#     'Number of works in DataCite',
#     'number_of_preprint_unique in DataCite',
#     'number of unique preprints that are published in Datacite',
#     '%_of unique that are published in DataCite',
#     'Number of unique preprints with more than 1 version in DataCite',
#     '% of unique that have more than one version  in DataCite',
#     'Works_in_2024 in DataCite',
#     'Works_in_2025 in DataCite',
#     'Is the server in OpenAlex',
#     'OpenAlex Data Collection Date',
#     'Number of works in OpenAlex',
#     'number_of_preprint_unique in OpenAlex',
#     'number of unique preprints that are published in OpenAlex',
#     '%_of unique that are published in OpenAlex',
#     'Number of unique preprints with more than 1 version in OpenAlex',
#     '% of unique preprints with more than 1 version in OpenAlex',
#     'Works_in_2024 in OpenAlex',
#     'Works_in_2025 in OpenAlex',
# ]

# # 3) choose the collection date you want for all servers
# DEFAULT_COLLECTION_DATE = "2025/12/31"

# # 4) optional later: per server overrides (keep None for now)
# PER_SERVER_DATE = None
# # Example:
# # PER_SERVER_DATE = {"bioRxiv": "2026/01/15", "medRxiv": "2026/01/15"}


# # =========================
# # Step A: compute metrics
# # =========================
# # IMPORTANT: 'data' must be your dedupe long dataframe in memory
# overall_df, backend_df = build_sheet_metrics_duckdb(data, choose_parent="oldest")

# # =========================
# # Step B: read the sheet
# # =========================
# sheet_df = read_sheet_as_df(
#     sheet_id=SHEET_ID,
#     worksheet_gid=WORKSHEET_GID,
#     service_account_json_path=SERVICE_ACCOUNT_JSON,
# )

# # =========================
# # Step C: build payload (only key + target cols)
# # =========================
# payload_df = build_update_payload(
#     sheet_df=sheet_df,
#     overall_df=overall_df,
#     backend_df=backend_df,
#     target_cols=TARGET_COLS,
#     key_col="source_id",
#     default_collection_date=DEFAULT_COLLECTION_DATE,
#     per_server_date=PER_SERVER_DATE,
# )

# print(payload_df.head())

# # =========================
# # Step D: update only those columns
# # =========================
# report = update_google_sheet_only_columns(
#     sheet_id=SHEET_ID,
#     worksheet_gid=WORKSHEET_GID,
#     service_account_json_path=SERVICE_ACCOUNT_JSON,
#     payload_df=payload_df,
#     key_col="source_id",
#     columns_to_update=TARGET_COLS,
#     chunk_size=500,
#     overwrite=True,   # set False to fill blanks only
#     verbose=True,
# )

# report


# import data

In [3]:
# Alternative input kept for reference (parquet export of the long clusters table):
# lon_path = "outputs_new/parent/dedupe_clusters_long_full.parquet"
# data = pd.read_parquet(lon_path)

# Load the deduplicated records table.
# NOTE(review): read_pickle executes arbitrary code on load - only use with a file you produced yourself.
data = pd.read_pickle("outputs_new/dedupe_data_group_merge.pkl")
# Normalize group ids: strip whitespace, treat blank as missing, and fall back to the
# record's own id so that every row ends up with a dup_group_id.
data["dup_group_id"] = (
    data["dup_group_id"]
    .astype("string")
    .str.strip()
    .replace("", pd.NA)
    .fillna(data["record_id"])
)
# NOTE(review): this displays the full frame; consider data.head() to keep the output small.
data

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,institutions_flat,countries_flat,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,date_first_seen,publication_year_first_seen,records_hierarchy_backup,parent_record_id,dup_group_id,authors_fp_tokenbag,title_clean_v2,authors_fp_last_initial,authors_fp_last,records_hierarchy_dedupe_backup,parent_record_id_dedupe_backup,dup_group_id_dedupe_backup,records_hierarchy_unify_version_backup,parent_record_id_unify_version_backup,dup_group_id_unify_version_backup
0,crossref::10.21467/preprints.48,AIJR Preprints,crossref,10.21467/preprints.48,https://doi.org/10.21467/preprints.48,https://preprints.aijr.org/index.php/ap/prepri...,"Bird’s Eye View on the Diagnosis, Treatment, &...","Panchalingala, Sai Bhargavi",,,,,,,,,false,,,,,parent,2020-05-03,2020,parent,,crossref::10.21467/preprints.48,,,,,parent,,,parent,,
1,crossref::10.21467/preprints.43,AIJR Preprints,crossref,10.21467/preprints.43,https://doi.org/10.21467/preprints.43,https://preprints.aijr.org/index.php/ap/prepri...,Doxycycline and Minocycline Drugs as a Treatme...,"Mostafa, Mohamed",,,,,,,,,false,,,,,parent,2020-04-25,2020,parent,,crossref::10.21467/preprints.43,mohamed_mostafa,doxycycline and minocycline drugs as a treatme...,mostafa|m,,parent,,,parent,,
2,crossref::10.21467/preprints.39,AIJR Preprints,crossref,10.21467/preprints.39,https://doi.org/10.21467/preprints.39,https://preprints.aijr.org/index.php/ap/prepri...,A Genetic Perspective of 2019-nCoV in Relation...,"Dasgupta, Rimjhim",,,,,,,,,false,,,,,parent,2020-04-16,2020,parent,,crossref::10.21467/preprints.39,dasgupta_rimjhim,a genetic perspective of 2019 ncov in relation...,dasgupta|r,,parent,,,parent,,
3,crossref::10.21467/preprints.38,AIJR Preprints,crossref,10.21467/preprints.38,https://doi.org/10.21467/preprints.38,https://preprints.aijr.org/index.php/ap/prepri...,Marine Algae as a Natural Source for Antiviral...,"Musale, Amar S; G., Raja Krishna Kumar; Sapre,...",,,,,,,,,false,,,,,parent,2020-04-15,2020,parent,,crossref::10.21467/preprints.38,,,,,parent,,,parent,,
4,crossref::10.21467/preprints.36,AIJR Preprints,crossref,10.21467/preprints.36,https://doi.org/10.21467/preprints.36,https://preprints.aijr.org/index.php/ap/prepri...,Possible Prevention of COVID 19 by Using Linol...,"Subhash, Venkata; G, Raja Krishna Kumar; Sapre...",,,,,,,,,false,,,,,parent,2020-04-15,2020,parent,,crossref::10.21467/preprints.36,,,,,parent,,,parent,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8084855,openalex::W999325625,viXra,openalex,,,https://vixra.org/pdf/1409.0090v1.pdf,Three Objections to Modern Physics,Lubomir Vlcek,,,,,,,,,,,,,,parent,2014-09-01,2014,parent,,openalex::W999325625,lubomir_vlcek,three objections to modern physics,vlcek|l,,parent,,,parent,,
8084856,openalex::W999460032,viXra,openalex,,,https://vixra.org/abs/1112.0094,Particle Mass Ratios,DT Froedge,,,,,,,,,,,,,,parent,2011-12-01,2011,parent,,openalex::W999460032,dt_froedge,particle mass ratios,froedge|d,,parent,,,parent,,
8084857,openalex::W99967155,viXra,openalex,,,https://vixra.org/pdf/1406.0019v1.pdf,Quantum FFF Theory Proposals for Some Unsolved...,Leo Vuyk,,,,,,,,,,,,,,parent,2014-06-01,2014,parent,,openalex::W99967155,leo_vuyk,quantum fff theory proposals for some unsolved...,vuyk|l,,parent,,,parent,,
8084858,openalex::W999790414,viXra,openalex,,,https://vixra.org/pdf/1306.0105v3.pdf,Investigation of the Formalism of Particle Dyn...,Chi-Yi Chen,,,,,,,,,,,,,,parent,2013-06-01,2013,parent,,openalex::W999790414,chen_chi_yi,investigation of the formalism of particle dyn...,chen|c,,parent,,,parent,,


In [4]:
# Sanity check: list any rows that still lack a dup_group_id after the fill above.
data.loc[data["dup_group_id"].isna()]

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,institutions_flat,countries_flat,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,date_first_seen,publication_year_first_seen,records_hierarchy_backup,parent_record_id,dup_group_id,authors_fp_tokenbag,title_clean_v2,authors_fp_last_initial,authors_fp_last,records_hierarchy_dedupe_backup,parent_record_id_dedupe_backup,dup_group_id_dedupe_backup,records_hierarchy_unify_version_backup,parent_record_id_unify_version_backup,dup_group_id_unify_version_backup


# execution

In [5]:
# Spreadsheet key and the gid of the tab to read (gid is kept as a string here
# because it is only interpolated into the URL below).
SHEET_ID = "10_7FdcpZjntqFsEHIii7bAM72uF__of_iUohSD5w8w4"
GID = "582500270"   # your tab gid

# CSV export URL for that single tab.
# NOTE(review): read without credentials, so presumably the sheet must be link-shared - confirm.
sheet_url = f"https://docs.google.com/spreadsheets/d/{SHEET_ID}/export?format=csv&gid={GID}"

sheet_df = pd.read_csv(sheet_url)

# Preview the first rows only.
sheet_df.head()


Unnamed: 0,source_id,open_alex_source_id,InternalOrder,SourceOrig,Server Name,Server Name Rule File,Notes,Registration Agencies,Where to get metadata ?,Example URLs/DOI,synonym,Electronic\nISSN / ISBN,Source to obtain Red Columns,Column 134,Column 135,download rule,Collection Date used for Red Columns,number of unique preprints,number of unique preprints that are published,% of unique that are published,Number of unique preprints with more than 1 version,% of unique that have more than one version,Is the Server in Crossref,Example DOI in Crossref,API Query in Crossref,Crossref collection_date,Number of works in Crossref,Number of unique preprints in Crossref,Remarks in Crossref,Number of unique preprints that are published in Crossref,%_of unique that are published in Crossref,Number of unique preprints with more than 1 version in Crossref,% of unique that have more than one version in Crossref,Works_in_2024 in Crossref,Works_in_2025 in Crossref,Notes for Crossref,Is the server in DataCite,ExampleDOI in DataCite,API Query for DataCite,DataCite Collection Date,Number of works in DataCite,number_of_preprint_unique in DataCite,Remarks DataCite,number of unique preprints that are published in Datacite,%_of unique that are published in DataCite,Number of unique preprints with more than 1 version in DataCite,% of unique that have more than one version in DataCite,Works_in_2024 in DataCite,Works_in_2025 in DataCite,Notes for DataCite,Is the server in OpenAlex,Example DOI in OpenAlex,API Querry in OpenAlex,OpenAlex Data Collection Date,Number of works in OpenAlex,number_of_preprint_unique in OpenAlex,Remarks OpenAlex,number of unique preprints that are published in OpenAlex,%_of unique that are published in OpenAlex,Number of unique preprints with more than 1 version in OpenAlex,% of unique preprints with more than 1 version in OpenAlex,Works_in_2024 in OpenAlex,Works_in_2025 in OpenAlex,OpenAlex notes,ExampleDOI on server page,API Querry on Server Page,ServerPage Data 
Collection Date,Number of Works on Server Page,number_of_preprint_unique on Server Page,Remarks Server Page,number of unique preprints that are published on Server Page,%_of unique that are published on Server Page,Number of unique preprints with more than 1 version on Server Page,%of unique preprints with more than 1 version on Server Page,Works_in_2024 on Server Page,Works_in_2025 on Server Page,Active (in 2025),API Query,API LINK,total count from sever API link,Dimensions,Dimensions Query,Dimensions Query note,DOI registration Agency,OAI-PMH URL,Server Main Page Link,Active (in 2026),MarioDateCheck,Who can submit text,Researcher can submit,Accepts Preprints ONLY,Type of documents accepted,Publish - Review - Curate models,Date of Check,Definition of server/platform on the main page (Y/N),Text of the Definition of the server/platform on the main page,SelfDefinitiononMainPage,Definition on non-main page,Non-main source and notes,Definition of preprint on main page (Y/N),Text of Preprint definition,"Classified as 1- preprint, 2-eprint, 3-working, 4--",Any Warning message on main page,Text of message,Link of preprint checked,Any Warning on a single preprint page,Text of single page warning,Any warning on the text of COVID 19 page,Text of COVID page warning,Do they have a general warning label,Notes on warning label,Is there a separate warning label for medical preprints,Types,can identify peer reviewed version,Review Type,Publications can be submitted elsewhere,File types,Who has access,contact,"Does a server allow posting of protocols, or other resaeerch parts?",Is there a separete metadata for protocols?,Preprint Tag for Search or metadata,Versioning,Publication Status,Charges for Services,is statistics available ?,metrics to track their preprints,Mario List,Wikipedia List,Dimensions List,ASAP BIO List,Directory of Open Access Preprint Repositories List,State University of Campinas,Disciplinary scope,Ownership type,Screening processes,Additional notes on screening 
process,External content indexing,Permanence of content,Commenting
0,Advance,,1,1Original list,Advance: a SAGE Preprints Community,Advance,,Crossref,Crossref,['https://doi.org/10.31124/advance.172059085.5...,Advance,,Crossref,https://api.crossref.org/works?filter=type:pos...,ok,ok,31/12/2025,2440,162.0,6.639344,1788.0,73.278689,True,['https://doi.org/10.31124/advance.172059085.5...,https://api.crossref.org/works/10.31124/advanc...,31/12/2025,4400.0,2440.0,ok,162.0,6.639344,1788.0,73.278689,414.0,346.0,,False,,,2025/12/31,,,,,,,,,,,False,https://doi.org/10.31124/advance.13491510.v1,https://api.openalex.org/w4212857168,31/12/2025,,,,,,,,,,,https://doi.org/10.31124/advance.176303470.009...,,15/12/2025,2384,no types filter,,,,,,,,yes,,-,/,Yes,https://app.dimensions.ai/discover/publication...,,Crossref,,https://advance.sagepub.com/,No,01/15/26,Not Active,Not Active,Yes,Preprints only,No,08/30/25,Y,Advance: a Sage preprints community allows res...,preprints community,,,Y,A preprint is a version of a scholarly piece o...,1.0,Y,Papers posted to Advance or transportRxiv shou...,https://advance.sagepub.com/doi/full/10.31124/...,Y,Preprints are early versions of research artic...,,,Preprints are early versions of research artic...,Papers posted to Advance or transportRxiv shou...,,Primary submission preprint platforms,,,,,,,,,yes,,,,,,yes,,yes,yes,yes,,Humanities and Social Sciences,Publisher,"Content within scope, scholarly nature, manusc...",Preprints must include a research question or ...,"Google Scholar, CrossRef",Permanent with some removal options in excepti...,Commenting
1,AfricArXiv,,2,1Original list,AfricArxiv,AfricArXiv,,Crossref/DataCite,Crossref,"['https://doi.org/10.31730/osf.io/7n258', 'htt...",,,Crossref/Datacite,https://api.datacite.org/dois?client-id=eqhh.h...,ok,ok,31/12/2025,483,56.0,11.594203,68.0,14.078675,True,"['https://doi.org/10.31730/osf.io/7n258', 'htt...",https://api.crossref.org/works/10.31730/osf.io...,31/12/2025,501.0,483.0,ok,56.0,11.594203,68.0,14.078675,16.0,0.0,,False,10.60763/africarxiv/10352,https://api.datacite.org/dois?client-id=eqhh.h...,31/12/2025,,,,,,,,,,,False,10.60763/africarxiv/357,https://api.openalex.org/works?page=1&filter=p...,31/12/2025,,,,,,,,,,,https://doi.org/10.60763/africarxiv/10286,https://api.osf.io/v2/providers/preprints/afri...,15/12/2025,7944,no types filter,,,,,,,,yes,https://api.osf.io/v2/providers/preprints/afri...,https://api.osf.io/v2/providers/preprints/afri...,480,Yes,https://app.dimensions.ai/discover/publication...,,DataCite,,https://africarxiv.ubuntunet.net/home,Yes,01/15/26,Researcher (specific),Yes,No,"working papers, preprints, accepted manuscript...",No,08/30/25,Y,AfricArxiv is a community-led digital archive ...,community-led digital archive,,Very inconsistent termonology across their var...,N,A preprint is a version of a scholarly or scie...,,N,,https://africarxiv.ubuntunet.net/items/e85a94c...,N,,,,No,,No,,,,,"Text documents (preprints, postprints, VoR):\n...",,,,,yes,"We also provide options to link data and code,...",,,,,yes,yes,yes,yes,yes,yes,All scientific fields,Academic community group,,No information on screening on the new website...,"Google Scholar, SHARE, Microsoft Academic, Unp...",Permanent with some removal options in excepti...,no (on UbuntuNet)
2,AgEcon Search,,3,1Original list,AgEcon Search,"AgEcon Search (University of Minnesota, USA)",no preprint group on main website or on dataci...,DataCite,DataCite,https://openalex.org/works?page=1&filter=prima...,,2332-3140,DataCite,https://api.datacite.org/dois?client-id=tind.a...,ok,ok,31/12/2025,172831,53.0,0.030666,27568.0,15.950842,False,,,2025/12/31,,,,,,,,,,,True,10.22004/ag.econ.151297,https://api.datacite.org/dois?client-id=tind.a...,31/12/2025,188173.0,172831.0,,53.0,0.030666,27568.0,15.950842,8072.0,9054.0,,False,https://doi.org/10.22004/ag.econ.273723,https://api.openalex.org/works?page=1&filter=p...,31/12/2025,,,,,,,,,,,https://doi.org/10.22004/ag.econ.382416,https://ageconsearch.umn.edu/oai2d,15/12/2025,211648,no types filter,,,,,,,,no,https://ageconsearch.umn.edu/oai2d,https://ageconsearch.umn.edu/oai2d?verb=ListRe...,209243,no,,,DataCite,,https://ageconsearch.umn.edu/?ln=en,No,01/15/26,"Organizations, not individual researchers",No,No,"articles, conference papers, working or discus...",No,08/30/25,N,,,web repository,"From their FAQ: "".. is free-to-user Web site ...",N,,,N,,https://ageconsearch.umn.edu/record/358968?v=pdf,N,,,,No,,,repository,,,,Conference papers and posters\n Working...,,,,,yes,,,There are no costs for posting papers for grou...,,,,,,,yes,,Relating to agricultural & applied economics,Academic institution,,,,Permanent with some removal options in special...,no
3,AgriRxiv,,4,1Original list,AgriRxiv (formerly AgriXiv),AgriRxiv,Preprints for Agriculture and Allied Sciences ...,Crossref,Crossref,"['https://doi.org/10.31220/osf.io/87k3q', 'htt...",AgriRxiv,2791-1969,Crossref,https://api.crossref.org/works?filter=type:pos...,ok,ok,31/12/2025,804,24.0,2.985075,132.0,16.41791,True,"['https://doi.org/10.31220/osf.io/87k3q', 'htt...",https://api.crossref.org/works/10.31220/osf.io...,31/12/2025,818.0,804.0,ok,24.0,2.985075,132.0,16.41791,69.0,93.0,,False,,,2025/12/31,,,,,,,,,,,False,https://doi.org/10.31220/agrirxiv.2024.00228,https://openalex.org/works?page=1&filter=prima...,31/12/2025,,,,,,,,,,,https://doi.org/10.31220/agriRxiv.2020.00005,https://api.osf.io/v2/providers/preprints/arab...,15/12/2025,464,462,,,,,,,,yes,https://api.osf.io/v2/providers/preprints/arab...,-,397,Yes,https://app.dimensions.ai/discover/publication...,https://app.dimensions.ai/discover/publication...,Crossref,,https://www.cabidigitallibrary.org/journal/agr...,Yes,01/15/26,Researcher,Yes,Yes,Preprints only,No,08/30/25,Y,agriRxiv (pronounced ‘agri-archive’) is a free...,source of unpublished preprints,,,N,,,N,,https://www.cabidigitallibrary.org/doi/10.3122...,N,,,,No,,,Primary submission preprint platforms,,,,The following types of article may be posted:\...,,,,,yes,,,There is no fee to submit articles to agriRxiv,yes,,yes,yes,yes,yes,yes,yes,"Relating to agriculture and allied sciences, i...",Academic community group,"Content within scope, no inappropriate materia...",,"Google Scholar, Crossref, Researcher-app; Prep...",Permanent with withdrawal marking; articles ca...,no
4,,,5,1Original list,ai.vixra.org,ai.vixra.org,,no doi,API website,https://ai.vixra.org/hep/,,,Website,,ok,/,10/12/2025,966,,,,,No,,,,,,,,,,,,,,No,,,,,,,,,,,,,,no,no doi,/,12/12/2025,/,/,,,,,,,,,no doi,,15/12/2025,988,no types filter,,,,,,,,yes,,ai.vixra.org,567,no,,,no doi,,https://ai.vixra.org/,Yes,01/15/26,Researcher,Yes,Yes,Preprints only,No,08/30/25,Y,ai.viXra.org is an e-print archive of AI assis...,e-print archive,,,N,,,Y,ai.viXra.org has no association with arXiv.org...,https://ai.vixra.org/abs/2508.0021,Y,ai.Vixra.org is a AI assisted e-print reposito...,,,yes,ai.Vixra.org is a AI assisted e-print reposito...,no,Primary submission preprint platforms,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [6]:
# ============================================
# 5) RUN PIPELINE (edit the settings below)
# ============================================
# This cell computes per-server metrics from the in-memory `data` frame,
# reads the target Google Sheet, builds an update payload restricted to the
# key column plus TARGET_COLS, and writes only those columns back.

SHEET_ID = "10_7FdcpZjntqFsEHIii7bAM72uF__of_iUohSD5w8w4"
WORKSHEET_GID = 881157301 # test copy database sheet 582500270

# 1) put your service account json path here (relative to the notebook)
SERVICE_ACCOUNT_JSON = "sheet_writer_key/preprint-tracker-credential.json"

# Column used to match payload rows to sheet rows; defined once so the
# payload builder (Step C) and the sheet writer (Step D) always agree.
KEY_COL = "source_id"

# 2) your target columns exactly as in the sheet (including trailing spaces)
#    NOTE: some names intentionally contain trailing or doubled spaces;
#    do not "clean" them here.
TARGET_COLS = [
    'Collection Date used for Red Columns',
    'number of unique preprints',
    'number of unique preprints that are published',
    '% of unique that are published ',
    'Number of unique preprints with more than 1 version',
    '% of unique that have more than one version ',
    'Is the Server in Crossref',
    'Crossref collection_date',
    'Number of works in Crossref',
    'Number of unique preprints in Crossref',
    'Number of unique preprints that are published in Crossref',
    '%_of unique that are published in Crossref',
    'Number of unique preprints with more than 1 version in Crossref',
    '% of unique that have more than one version  in Crossref',
    'Works_in_2024 in Crossref',
    'Works_in_2025 in Crossref',
    'Is the server in DataCite',
    'DataCite Collection Date',
    'Number of works in DataCite',
    'number_of_preprint_unique in DataCite',
    'number of unique preprints that are published in Datacite',
    '%_of unique that are published in DataCite',
    'Number of unique preprints with more than 1 version in DataCite',
    '% of unique that have more than one version  in DataCite',
    'Works_in_2024 in DataCite',
    'Works_in_2025 in DataCite',
    'Is the server in OpenAlex',
    'OpenAlex Data Collection Date',
    'Number of works in OpenAlex',
    'number_of_preprint_unique in OpenAlex',
    'number of unique preprints that are published in OpenAlex',
    '%_of unique that are published in OpenAlex',
    'Number of unique preprints with more than 1 version in OpenAlex',
    '% of unique preprints with more than 1 version in OpenAlex',
    'Works_in_2024 in OpenAlex',
    'Works_in_2025 in OpenAlex',
]

# 3) choose the collection date you want for all servers
DEFAULT_COLLECTION_DATE = "2025/12/31"

# 4) optional later: per server overrides (keep None for now)
PER_SERVER_DATE = None
# Example:
# PER_SERVER_DATE = {"bioRxiv": "2026/01/15", "medRxiv": "2026/01/15"}


# =========================
# Step A: compute metrics
# =========================
# IMPORTANT: 'data' must be your dedupe long dataframe in memory
# (it is created by an earlier cell; this cell fails with NameError otherwise).
# overall_df: one row per server_name; backend_df: one row per (server_name, backend).
overall_df, backend_df = build_sheet_metrics_duckdb(data, choose_parent="oldest")

# =========================
# Step B: read the sheet
# =========================
sheet_df = read_sheet_as_df(
    sheet_id=SHEET_ID,
    worksheet_gid=WORKSHEET_GID,
    service_account_json_path=SERVICE_ACCOUNT_JSON,
)

# =========================
# Step C: build payload (only key + target cols)
# =========================
payload_df = build_update_payload(
    sheet_df=sheet_df,
    overall_df=overall_df,
    backend_df=backend_df,
    target_cols=TARGET_COLS,
    key_col=KEY_COL,
    default_collection_date=DEFAULT_COLLECTION_DATE,
    per_server_date=PER_SERVER_DATE,
)

# A bare expression in the middle of a cell is discarded by Jupyter, so the
# preview must be rendered explicitly (`display` is provided by IPython).
display(payload_df.head())

# =========================
# Step D: update only those columns
# =========================
report = update_google_sheet_only_columns(
    sheet_id=SHEET_ID,
    worksheet_gid=WORKSHEET_GID,
    service_account_json_path=SERVICE_ACCOUNT_JSON,
    payload_df=payload_df,
    key_col=KEY_COL,
    columns_to_update=TARGET_COLS,
    chunk_size=500,
    overwrite=True,   # set False to fill blanks only
    verbose=True,
)

# Last expression of the cell: shows the report returned by the update call.
report


Chunk 1: scanned 388 rows, wrote 13968 cells


{'sheet_id': '10_7FdcpZjntqFsEHIii7bAM72uF__of_iUohSD5w8w4',
 'worksheet_gid': 881157301,
 'rows_payload': 751,
 'rows_matched': 388,
 'missing_keys': [''],
 'n_missing_keys': 1,
 'columns_updated': ['Collection Date used for Red Columns',
  'number of unique preprints',
  'number of unique preprints that are published',
  '% of unique that are published ',
  'Number of unique preprints with more than 1 version',
  '% of unique that have more than one version ',
  'Is the Server in Crossref',
  'Crossref collection_date',
  'Number of works in Crossref',
  'Number of unique preprints in Crossref',
  'Number of unique preprints that are published in Crossref',
  '%_of unique that are published in Crossref',
  'Number of unique preprints with more than 1 version in Crossref',
  '% of unique that have more than one version  in Crossref',
  'Works_in_2024 in Crossref',
  'Works_in_2025 in Crossref',
  'Is the server in DataCite',
  'DataCite Collection Date',
  'Number of works in DataCite

# exploration

In [7]:
# Inspect the update payload built in the pipeline cell (key column + target columns).
payload_df

Unnamed: 0,source_id,Collection Date used for Red Columns,number of unique preprints,number of unique preprints that are published,% of unique that are published,Number of unique preprints with more than 1 version,% of unique that have more than one version,Is the Server in Crossref,Crossref collection_date,Number of works in Crossref,Number of unique preprints in Crossref,Number of unique preprints that are published in Crossref,%_of unique that are published in Crossref,Number of unique preprints with more than 1 version in Crossref,% of unique that have more than one version in Crossref,Works_in_2024 in Crossref,Works_in_2025 in Crossref,Is the server in DataCite,DataCite Collection Date,Number of works in DataCite,number_of_preprint_unique in DataCite,number of unique preprints that are published in Datacite,%_of unique that are published in DataCite,Number of unique preprints with more than 1 version in DataCite,% of unique that have more than one version in DataCite,Works_in_2024 in DataCite,Works_in_2025 in DataCite,Is the server in OpenAlex,OpenAlex Data Collection Date,Number of works in OpenAlex,number_of_preprint_unique in OpenAlex,number of unique preprints that are published in OpenAlex,%_of unique that are published in OpenAlex,Number of unique preprints with more than 1 version in OpenAlex,% of unique preprints with more than 1 version in OpenAlex,Works_in_2024 in OpenAlex,Works_in_2025 in OpenAlex
0,Advance,2025/12/31,2440.0,162.0,6.639344,1788.0,73.278689,True,2025/12/31,4400.0,2440.0,162.0,6.639344,1788.0,73.278689,414.0,346.0,False,2025/12/31,,,,,,,,,False,2025/12/31,,,,,,,,
1,AfricArXiv,2025/12/31,483.0,56.0,11.594203,68.0,14.078675,True,2025/12/31,501.0,483.0,56.0,11.594203,68.0,14.078675,16.0,0.0,False,2025/12/31,,,,,,,,,False,2025/12/31,,,,,,,,
2,AgEcon Search,2025/12/31,172831.0,53.0,0.030666,27568.0,15.950842,False,2025/12/31,,,,,,,,,True,2025/12/31,188173.0,172831.0,53.0,0.030666,27568.0,15.950842,8072.0,9054.0,False,2025/12/31,,,,,,,,
3,AgriRxiv,2025/12/31,804.0,24.0,2.985075,132.0,16.41791,True,2025/12/31,818.0,804.0,24.0,2.985075,132.0,16.41791,69.0,93.0,False,2025/12/31,,,,,,,,,False,2025/12/31,,,,,,,,
4,,2025/12/31,,,,,,False,2025/12/31,,,,,,,,,False,2025/12/31,,,,,,,,,False,2025/12/31,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
746,cngb.cga,2025/12/31,,,,,,False,2025/12/31,,,,,,,,,False,2025/12/31,,,,,,,,,False,2025/12/31,,,,,,,,
747,cos.osf,2025/12/31,,,,,,False,2025/12/31,,,,,,,,,False,2025/12/31,,,,,,,,,False,2025/12/31,,,,,,,,
748,,2025/12/31,,,,,,False,2025/12/31,,,,,,,,,False,2025/12/31,,,,,,,,,False,2025/12/31,,,,,,,,
749,,2025/12/31,,,,,,False,2025/12/31,,,,,,,,,False,2025/12/31,,,,,,,,,False,2025/12/31,,,,,,,,


In [8]:
# First 60 rows of the per-server metrics (all backends combined).
overall_df.iloc[:60]

Unnamed: 0,server_name,n_records,n_unique,n_published,pct_published,n_is_version_of,pct_has_versions,count_2024,count_2025
0,AIJR Preprints,143,143,1,0.699301,9,6.293706,7.0,4.0
1,AMRC Open Research,38,27,0,0.0,11,40.740741,0.0,0.0
2,APSA Preprints,1470,1171,164,14.005124,263,22.459436,140.0,214.0
3,ARPHA Preprints,890,869,315,36.248562,39,4.487917,184.0,211.0
4,ART-Dok,9653,9532,3,0.031473,106,1.112044,562.0,485.0
5,Advance,4400,2440,162,6.639344,1788,73.278689,414.0,346.0
6,AfricArXiv,501,483,56,11.594203,68,14.078675,16.0,0.0
7,AgEcon Search,188173,172831,53,0.030666,27568,15.950842,8072.0,9054.0
8,AgriRxiv,818,804,24,2.985075,132,16.41791,69.0,93.0
9,Arabixiv,502,491,217,44.195519,113,23.014257,0.0,0.0


In [9]:
# Last 60 rows of the per-server metrics (all backends combined).
overall_df.iloc[-60:]

Unnamed: 0,server_name,n_records,n_unique,n_published,pct_published,n_is_version_of,pct_has_versions,count_2024,count_2025
52,LIS Scholarship Archive,397,353,69,19.546742,152,43.05949,7.0,13.0
53,LSE Research Online Documents on Economics,119,119,1,0.840336,33,27.731092,0.0,0.0
54,LatArXiv,125,123,7,5.691057,11,8.943089,44.0,79.0
55,Law Archive,1808,1600,109,6.8125,965,60.3125,35.0,128.0
56,MNI Open Research,19,13,0,0.0,6,46.153846,0.0,0.0
57,MarXiv,508,489,253,51.738241,159,32.515337,2.0,2.0
58,MediArXiv,309,298,90,30.201342,38,12.751678,20.0,34.0
59,MetaArXiv,880,770,153,19.87013,224,29.090909,78.0,111.0
60,MindRxiv,335,292,108,36.986301,94,32.191781,1.0,0.0
61,Munich Personal RePEc Archive,68692,68585,99,0.144346,3783,5.515783,42.0,3.0


In [10]:
# First 60 rows of the per-(server, backend) metrics.
backend_df.iloc[:60]

Unnamed: 0,server_name,backend,n_records,n_unique,n_published,pct_published,n_is_version_of,pct_has_versions,count_2024,count_2025
0,AIJR Preprints,crossref,143,143,1,0.699301,9,6.293706,7.0,4.0
1,AMRC Open Research,crossref,38,27,0,0.0,11,40.740741,0.0,0.0
2,APSA Preprints,crossref,1470,1171,164,14.005124,263,22.459436,140.0,214.0
3,ARPHA Preprints,crossref,890,869,315,36.248562,39,4.487917,184.0,211.0
4,ART-Dok,datacite,9653,9532,3,0.031473,106,1.112044,562.0,485.0
5,Advance,crossref,4400,2440,162,6.639344,1788,73.278689,414.0,346.0
6,AfricArXiv,crossref,501,483,56,11.594203,68,14.078675,16.0,0.0
7,AgEcon Search,datacite,188173,172831,53,0.030666,27568,15.950842,8072.0,9054.0
8,AgriRxiv,crossref,818,804,24,2.985075,132,16.41791,69.0,93.0
9,Arabixiv,crossref,502,491,217,44.195519,113,23.014257,0.0,0.0


In [11]:
# Last 60 rows of the per-(server, backend) metrics.
backend_df.iloc[-60:]

Unnamed: 0,server_name,backend,n_records,n_unique,n_published,pct_published,n_is_version_of,pct_has_versions,count_2024,count_2025
53,LSE Research Online Documents on Economics,openalex,119,119,1,0.840336,33,27.731092,0.0,0.0
54,LatArXiv,crossref,125,123,7,5.691057,11,8.943089,44.0,79.0
55,Law Archive,crossref,1808,1600,109,6.8125,965,60.3125,35.0,128.0
56,MNI Open Research,crossref,19,13,0,0.0,6,46.153846,0.0,0.0
57,MarXiv,crossref,508,489,253,51.738241,159,32.515337,2.0,2.0
58,MediArXiv,crossref,309,298,90,30.201342,38,12.751678,20.0,34.0
59,MetaArXiv,crossref,880,770,153,19.87013,224,29.090909,78.0,111.0
60,MindRxiv,crossref,335,292,108,36.986301,94,32.191781,1.0,0.0
61,Munich Personal RePEc Archive,openalex,68692,68585,99,0.144346,3783,5.515783,42.0,3.0
62,National Bureau of Economic Research,openalex,1856,1831,0,0.0,679,37.083561,0.0,0.0
