In [2]:
import pandas as pd
# Set option to display all columns
pd.set_option('display.max_columns', None)


# import data

In [3]:
data = pd.read_pickle("outputs/dedupe_data_group_merge.pkl")
data

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,institutions_flat,countries_flat,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,date_first_seen,publication_year_first_seen,parent_record_id,records_hierarchy_copy,dup_group_id,authors_fp_tokenbag,title_clean_v2,authors_fp_last_initial,authors_fp_last
0,crossref::10.21467/preprints.48,AIJR Preprints,crossref,10.21467/preprints.48,https://doi.org/10.21467/preprints.48,https://preprints.aijr.org/index.php/ap/prepri...,"Bird’s Eye View on the Diagnosis, Treatment, &...","Panchalingala, Sai Bhargavi",,,,,,,,,false,,,,,parent,2020-05-03,2020,,parent,crossref::10.21467/preprints.48,,,,
1,crossref::10.21467/preprints.43,AIJR Preprints,crossref,10.21467/preprints.43,https://doi.org/10.21467/preprints.43,https://preprints.aijr.org/index.php/ap/prepri...,Doxycycline and Minocycline Drugs as a Treatme...,"Mostafa, Mohamed",,,,,,,,,false,,,,,parent,2020-04-25,2020,,parent,crossref::10.21467/preprints.43,mohamed_mostafa,doxycycline and minocycline drugs as a treatme...,mostafa|m,
2,crossref::10.21467/preprints.39,AIJR Preprints,crossref,10.21467/preprints.39,https://doi.org/10.21467/preprints.39,https://preprints.aijr.org/index.php/ap/prepri...,A Genetic Perspective of 2019-nCoV in Relation...,"Dasgupta, Rimjhim",,,,,,,,,false,,,,,parent,2020-04-16,2020,,parent,crossref::10.21467/preprints.39,dasgupta_rimjhim,a genetic perspective of 2019 ncov in relation...,dasgupta|r,
3,crossref::10.21467/preprints.38,AIJR Preprints,crossref,10.21467/preprints.38,https://doi.org/10.21467/preprints.38,https://preprints.aijr.org/index.php/ap/prepri...,Marine Algae as a Natural Source for Antiviral...,"Musale, Amar S; G., Raja Krishna Kumar; Sapre,...",,,,,,,,,false,,,,,parent,2020-04-15,2020,,parent,crossref::10.21467/preprints.38,,,,
4,crossref::10.21467/preprints.36,AIJR Preprints,crossref,10.21467/preprints.36,https://doi.org/10.21467/preprints.36,https://preprints.aijr.org/index.php/ap/prepri...,Possible Prevention of COVID 19 by Using Linol...,"Subhash, Venkata; G, Raja Krishna Kumar; Sapre...",,,,,,,,,false,,,,,parent,2020-04-15,2020,,parent,crossref::10.21467/preprints.36,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8410089,openalex::W999325625,viXra,openalex,,,https://vixra.org/pdf/1409.0090v1.pdf,Three Objections to Modern Physics,Lubomir Vlcek,,,,,,,,,,,,,,parent,2014-09-01,2014,,parent,openalex::W999325625,lubomir_vlcek,three objections to modern physics,vlcek|l,
8410090,openalex::W999460032,viXra,openalex,,,https://vixra.org/abs/1112.0094,Particle Mass Ratios,DT Froedge,,,,,,,,,,,,,,parent,2011-12-01,2011,,parent,openalex::W999460032,dt_froedge,particle mass ratios,froedge|d,
8410091,openalex::W99967155,viXra,openalex,,,https://vixra.org/pdf/1406.0019v1.pdf,Quantum FFF Theory Proposals for Some Unsolved...,Leo Vuyk,,,,,,,,,,,,,,parent,2014-06-01,2014,,parent,openalex::W99967155,leo_vuyk,quantum fff theory proposals for some unsolved...,vuyk|l,
8410092,openalex::W999790414,viXra,openalex,,,https://vixra.org/pdf/1306.0105v3.pdf,Investigation of the Formalism of Particle Dyn...,Chi-Yi Chen,,,,,,,,,,,,,,parent,2013-06-01,2013,,parent,openalex::W999790414,chen_chi_yi,investigation of the formalism of particle dyn...,chen|c,


In [5]:
from __future__ import annotations

from pathlib import Path
from typing import Optional, Tuple, Dict
import pandas as pd
import numpy as np


def generate_tracker_files_from_dedupe(
    df: pd.DataFrame,
    out_dir: str | Path = "outputs/tracker_data",
    *,
    # core cols
    server_col: str = "server_name",
    record_id_col: str = "record_id",
    group_id_col: str = "dup_group_id",
    hierarchy_col: str = "records_hierarchy",
    date_col: str = "date_first_seen",
    year_col: str = "publication_year_first_seen",
    has_published_col: str = "has_published_version",
    source_col: Optional[str] = None,  # if you have it; else will be omitted

    # outputs
    write_csv: bool = True,
    write_xlsx: bool = False,
    return_frames: bool = True,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, Dict]:
    """
    Build the 3 core files your Streamlit tracker can consume, from a deduped long table:
      1) summary.csv   (one row per server_name)
      2) yearly.csv    (wide or long? -> we export wide by default, plus long)
      3) clusters_summary.csv (one row per dup_group_id)

    Important modeling choices:
      - n_records: record-level count (raw harvested rows)
      - n_unique:  cluster-level count (nunique dup_group_id)
      - versions:  cluster-level count where cluster_size > 1
      - published: cluster-level count where any record in cluster has has_published_version=True
      - yearly:    computed using *parent* rows only (avoid double counting versions)

    Returns:
      summary_df, yearly_wide_df, clusters_summary_df, metrics
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    required = [server_col, record_id_col, group_id_col]
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    work = df.copy()

    # ---- clean basics
    work[server_col] = work[server_col].astype("string").fillna("").str.strip()
    work[group_id_col] = work[group_id_col].astype("string").fillna("").str.strip()
    work[record_id_col] = work[record_id_col].astype("string").fillna("").str.strip()

    # drop rows with empty server or group id (cannot aggregate reliably)
    work = work[(work[server_col] != "") & (work[group_id_col] != "")].copy()

    # ---- normalize has_published to bool (if exists)
    if has_published_col in work.columns:
        hp = work[has_published_col]
        # accept True/False, 1/0, "true"/"false"
        work[has_published_col] = (
            hp.astype("string")
              .str.lower()
              .map({"true": True, "false": False, "1": True, "0": False})
              .where(~hp.isna(), other=np.nan)
        )
        # if original was already boolean/numeric, above may produce NaN; fallback:
        work[has_published_col] = pd.to_numeric(hp, errors="ignore") if work[has_published_col].isna().all() else work[has_published_col]
        # final coercion
        work[has_published_col] = work[has_published_col].fillna(False).astype(bool)
    else:
        work[has_published_col] = False

    # ---- parse date/year
    if date_col in work.columns:
        work[date_col] = pd.to_datetime(work[date_col], errors="coerce")
    else:
        work[date_col] = pd.NaT

    if year_col in work.columns:
        y = pd.to_numeric(work[year_col], errors="coerce")
        work[year_col] = y.where((y >= 1000) & (y <= 3000)).round().astype("Int64")
    else:
        work[year_col] = pd.NA

    # =========================================================
    # 1) clusters_summary (one row per dup_group_id)
    # =========================================================
    # cluster_size
    cluster_size = work.groupby(group_id_col, as_index=True).size().rename("cluster_size")

    # number of servers per cluster
    n_servers = work.groupby(group_id_col)[server_col].nunique().rename("n_servers")

    # cluster has published (any record in cluster)
    cluster_has_published = work.groupby(group_id_col)[has_published_col].any().rename("cluster_has_published")

    # Determine a "cluster parent" row:
    # Prefer rows labeled 'parent' in records_hierarchy; if missing, fallback to earliest date then record_id.
    if hierarchy_col in work.columns:
        h = work[hierarchy_col].astype("string").fillna("").str.lower().str.strip()
        is_parent = h.eq("parent")
    else:
        is_parent = pd.Series(False, index=work.index)

    parents = work[is_parent].copy()
    if parents.empty:
        # fallback: earliest row per cluster
        tmp = work.sort_values([group_id_col, date_col, record_id_col], ascending=[True, True, True], na_position="last")
        parents = tmp.groupby(group_id_col, sort=False).head(1).copy()
    else:
        # if a cluster has multiple 'parent' rows (shouldn't, but can happen), pick earliest
        parents = parents.sort_values([group_id_col, date_col, record_id_col], ascending=[True, True, True], na_position="last")
        parents = parents.groupby(group_id_col, sort=False).head(1).copy()

    # Build clusters summary frame
    clusters_summary = (
        parents[[group_id_col, record_id_col, server_col, date_col, year_col]]
        .rename(columns={
            record_id_col: "parent_record_id",
            server_col: "parent_server_name",
            date_col: "parent_date_first_seen",
            year_col: "parent_year_first_seen",
        })
        .set_index(group_id_col)
        .join(cluster_size)
        .join(n_servers)
        .join(cluster_has_published)
        .reset_index()
    )

    clusters_summary["has_versions"] = clusters_summary["cluster_size"] > 1
    clusters_summary["is_cross_server"] = clusters_summary["n_servers"] > 1

    # =========================================================
    # 2) summary (one row per server)
    # =========================================================
    # record-level counts
    n_records = work.groupby(server_col)[record_id_col].count().rename("n_records")

    # cluster-level unique count by server (a cluster may touch many servers; this is "server participation")
    n_unique = work.groupby(server_col)[group_id_col].nunique().rename("n_unique")

    # cluster-level versions by server (clusters with cluster_size>1 that have at least one record in server)
    version_clusters = set(clusters_summary.loc[clusters_summary["has_versions"], group_id_col].astype(str))
    published_clusters = set(clusters_summary.loc[clusters_summary["cluster_has_published"], group_id_col].astype(str))

    n_is_version_of = (
        work[work[group_id_col].isin(version_clusters)]
        .groupby(server_col)[group_id_col].nunique()
        .rename("n_is_version_of")
    )

    n_published = (
        work[work[group_id_col].isin(published_clusters)]
        .groupby(server_col)[group_id_col].nunique()
        .rename("n_published")
    )

    summary = (
        pd.concat([n_records, n_unique, n_is_version_of, n_published], axis=1)
          .fillna(0)
          .reset_index()
          .rename(columns={server_col: "server_name"})
    )

    summary["pct_published"] = np.where(
        summary["n_unique"] > 0,
        (summary["n_published"] / summary["n_unique"]) * 100.0,
        np.nan,
    )

    # optional: add source column if provided
    if source_col and source_col in df.columns:
        # take the most frequent source per server
        tmp = df[[server_col, source_col]].copy()
        tmp[server_col] = tmp[server_col].astype("string").fillna("").str.strip()
        tmp[source_col] = tmp[source_col].astype("string").fillna("").str.strip()
        mode_source = (
            tmp.groupby(server_col)[source_col]
               .agg(lambda x: x.value_counts().index[0] if len(x.dropna()) else "")
               .rename("source")
               .reset_index()
               .rename(columns={server_col: "server_name"})
        )
        summary = summary.merge(mode_source, on="server_name", how="left")

    summary["collection_date"] = pd.Timestamp.utcnow().date().isoformat()

    # =========================================================
    # 3) yearly (use *parents* only to avoid double counting)
    # =========================================================
    # Use parent server as the server for yearly counts (recommended).
    parents_for_yearly = clusters_summary.copy()
    parents_for_yearly = parents_for_yearly.dropna(subset=["parent_year_first_seen"])
    parents_for_yearly["parent_year_first_seen"] = parents_for_yearly["parent_year_first_seen"].astype("Int64")

    yearly_long = (
        parents_for_yearly.groupby(["parent_server_name", "parent_year_first_seen"], as_index=False)[group_id_col]
        .nunique()
        .rename(columns={
            "parent_server_name": "server_name",
            "parent_year_first_seen": "year",
            group_id_col: "count",
        })
        .sort_values(["server_name", "year"])
    )

    # Wide format (one row per server, year columns) — matches your app’s cleaner
    yearly_wide = (
        yearly_long.pivot_table(index="server_name", columns="year", values="count", aggfunc="sum", fill_value=0)
        .reset_index()
    )
    yearly_wide.columns = [str(c) for c in yearly_wide.columns]

    # Also add count_2024 / count_2025 to summary if present
    if "2024" in yearly_wide.columns:
        summary = summary.merge(yearly_wide[["server_name", "2024"]].rename(columns={"2024": "count_2024"}), on="server_name", how="left")
    else:
        summary["count_2024"] = 0
    if "2025" in yearly_wide.columns:
        summary = summary.merge(yearly_wide[["server_name", "2025"]].rename(columns={"2025": "count_2025"}), on="server_name", how="left")
    else:
        summary["count_2025"] = 0

    # =========================================================
    # Metrics (quick QA)
    # =========================================================
    metrics = {
        "rows_in_input": int(len(df)),
        "rows_used": int(len(work)),
        "n_servers": int(work[server_col].nunique()),
        "n_clusters": int(clusters_summary[group_id_col].nunique()),
        "clusters_with_versions": int(clusters_summary["has_versions"].sum()),
        "clusters_with_published": int(clusters_summary["cluster_has_published"].sum()),
        "cross_server_clusters": int(clusters_summary["is_cross_server"].sum()),
        "year_min": int(yearly_long["year"].min()) if len(yearly_long) else None,
        "year_max": int(yearly_long["year"].max()) if len(yearly_long) else None,
        "out_dir": str(out_dir),
    }

    # =========================================================
    # Write files
    # =========================================================
    if write_csv:
        summary.to_csv(out_dir / "summary.csv", index=False)
        yearly_wide.to_csv(out_dir / "yearly.csv", index=False)       # app expects wide-ish
        yearly_long.to_csv(out_dir / "yearly_long.csv", index=False) # useful for QA
        clusters_summary.to_csv(out_dir / "clusters_summary.csv", index=False)

        pd.DataFrame([metrics]).to_json(out_dir / "build_metrics.json", orient="records", indent=2)

    if write_xlsx:
        with pd.ExcelWriter(out_dir / "tracker_outputs.xlsx", engine="openpyxl") as xw:
            summary.to_excel(xw, sheet_name="summary", index=False)
            yearly_wide.to_excel(xw, sheet_name="yearly_wide", index=False)
            yearly_long.to_excel(xw, sheet_name="yearly_long", index=False)
            clusters_summary.to_excel(xw, sheet_name="clusters_summary", index=False)
            pd.DataFrame([metrics]).to_excel(xw, sheet_name="metrics", index=False)

    if return_frames:
        return summary, yearly_wide, clusters_summary, metrics

    return summary, yearly_wide, clusters_summary, metrics  # keep signature stable


# -------------------------
# Example usage
# -------------------------
# summary_df, yearly_df, clusters_df, build_metrics = generate_tracker_files_from_dedupe(
#     dedupe_clusters_long,
#     out_dir="outputs/tracker_data",
#     write_csv=True,
#     write_xlsx=False,
# )
# print(build_metrics)


In [6]:
summary_df, yearly_df, clusters_df, build_metrics = generate_tracker_files_from_dedupe(
    data,
    out_dir="outputs/tracker_data",
    write_csv=True,
    write_xlsx=False,
)
print(build_metrics)

  work[has_published_col] = work[has_published_col].fillna(False).astype(bool)


{'rows_in_input': 8084860, 'rows_used': 8084860, 'n_servers': 112, 'n_clusters': 7436474, 'clusters_with_versions': 486863, 'clusters_with_published': 1752699, 'cross_server_clusters': 193282, 'year_min': 1954, 'year_max': 2025, 'out_dir': 'outputs/tracker_data'}
