In [1]:
import pandas as pd
# Show every column when a DataFrame is rendered (None lifts pandas' column limit)
pd.set_option('display.max_columns', None)


# Import data

In [2]:
import duckdb
from pathlib import Path

# 1) Resolve the project root first, so that every path below (parquet data and the
#    DuckDB spill directory alike) is the same whether the kernel was started in the
#    project root or in notebooks/.
ROOT = Path.cwd()
if ROOT.name == "notebooks":
    ROOT = ROOT.parent

BASE = ROOT / "data" / "by_server"
TMP_DIR = ROOT / "data" / "tmp_duckdb"


def _sql_str(value) -> str:
    """Render ``value`` as a single-quoted SQL string literal, doubling embedded quotes."""
    return "'" + str(value).replace("'", "''") + "'"


con = duckdb.connect()

# 2) Low-memory settings
con.execute("PRAGMA threads=1;")
con.execute("PRAGMA preserve_insertion_order=false;")
con.execute("PRAGMA enable_object_cache=false;")
con.execute("PRAGMA memory_limit='2GB';")           # try 1GB if still unstable
# Spill to a ROOT-anchored path: a relative 'data/tmp_duckdb' is resolved against the
# kernel's working directory and would end up under notebooks/ when started from there.
con.execute(f"PRAGMA temp_directory={_sql_str(TMP_DIR.as_posix())};")

# 3) Glob over the parquet files: one sub-directory level under data/by_server/,
#    read with hive partitioning enabled (presumably <key>=<value>/ folders — confirm on disk).
all_backends = (BASE / "*" / "*.parquet").as_posix()

con.execute(f"""
CREATE OR REPLACE VIEW all_backends AS
SELECT * FROM read_parquet({_sql_str(all_backends)}, hive_partitioning=true, union_by_name=true);
""")

# A unified "all_rows" view
con.execute("""
CREATE OR REPLACE VIEW all_rows AS
SELECT * FROM all_backends
""")

print(con.execute("SHOW TABLES").fetchall())


[('all_backends',), ('all_rows',)]


In [3]:
# Thin projection of all_backends: identifiers, title/author fields and the relationship
# columns, every one cast to VARCHAR so the pandas frame has uniform text columns.
# Plain (non-f) string on purpose: nothing is interpolated, and an f-string would
# misinterpret any '{' / '}' that later ends up in the SQL.
con.execute("""
CREATE OR REPLACE VIEW server_thin AS
SELECT
  CAST(record_id AS VARCHAR)           AS record_id,
  CAST(server_name AS VARCHAR)         AS server_name,
  CAST(backend AS VARCHAR)             AS backend,

  CAST(doi AS VARCHAR)                 AS doi,
  CAST(doi_url AS VARCHAR)             AS doi_url,
  CAST(landing_page_url AS VARCHAR)    AS landing_page_url,

  CAST(title AS VARCHAR) AS title,
  -- CAST(abstract_text AS VARCHAR)      AS abstract_text,
  CAST(authors_flat AS VARCHAR)      AS authors_flat,
  CAST(institutions_flat AS VARCHAR)      AS institutions_flat,
  CAST(countries_flat AS VARCHAR)      AS countries_flat,

  -- Dates (helpful for temporal patterns)
  -- CAST(publication_year AS VARCHAR)    AS publication_year,
  -- CAST(date_created AS VARCHAR)        AS date_created,
  -- CAST(date_posted AS VARCHAR)         AS date_posted,
  -- CAST(date_deposited AS VARCHAR)      AS date_deposited,
  -- CAST(date_published AS VARCHAR)      AS date_published,
  -- CAST(date_published_online AS VARCHAR)      AS date_published_online,
  -- CAST(date_issued AS VARCHAR)         AS date_issued,
  -- CAST(date_indexed AS VARCHAR)        AS date_indexed,
  -- CAST(date_updated AS VARCHAR)        AS date_updated,
  -- CAST(date_registered AS VARCHAR)     AS date_registered,

  -- Relationships (keep these for true version links)
  CAST(relations_json AS VARCHAR)       AS relations_json,
  CAST(version_label AS VARCHAR)       AS version_label,
  CAST(is_version_of AS VARCHAR)       AS is_version_of,      -- keep as text; we’ll interpret later
  CAST(is_preprint_of AS VARCHAR)      AS is_preprint_of,
  CAST(has_preprint AS VARCHAR)      AS has_preprint,
  CAST(has_review AS VARCHAR)      AS has_review,
  CAST(has_published_version AS VARCHAR)      AS has_published_version,
  CAST(published_version_ids_json AS VARCHAR) AS published_version_ids_json,
  CAST(version_of_ids_json AS VARCHAR) AS version_of_ids_json,
  CAST(update_to_json AS VARCHAR)      AS update_to_json,
  CAST(raw_relationships_json AS VARCHAR)       AS raw_relationships_json,
FROM all_backends
""")

# Row count of the view, shown as a one-row DataFrame
con.execute("SELECT COUNT(*) AS n FROM server_thin").df()


Unnamed: 0,n
0,9486893


In [4]:
# Materialise the thin view in pandas and drop rows that repeat an earlier row in *every*
# column. Key-level de-duplication on (record_id, server_name) is done later, after the
# frame has been filtered to the master pairs.
data = con.execute("SELECT * FROM server_thin").df().drop_duplicates()
data

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,institutions_flat,countries_flat,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json
0,crossref::10.21467/preprints.48,AIJR Preprints,crossref,10.21467/preprints.48,https://doi.org/10.21467/preprints.48,https://preprints.aijr.org/index.php/ap/prepri...,"Bird’s Eye View on the Diagnosis, Treatment, &...","Panchalingala, Sai Bhargavi",,,,,,,,,false,,,,
1,crossref::10.21467/preprints.43,AIJR Preprints,crossref,10.21467/preprints.43,https://doi.org/10.21467/preprints.43,https://preprints.aijr.org/index.php/ap/prepri...,Doxycycline and Minocycline Drugs as a Treatme...,"Mostafa, Mohamed",,,,,,,,,false,,,,
2,crossref::10.21467/preprints.39,AIJR Preprints,crossref,10.21467/preprints.39,https://doi.org/10.21467/preprints.39,https://preprints.aijr.org/index.php/ap/prepri...,A Genetic Perspective of 2019-nCoV in Relation...,"Dasgupta, Rimjhim",,,,,,,,,false,,,,
3,crossref::10.21467/preprints.38,AIJR Preprints,crossref,10.21467/preprints.38,https://doi.org/10.21467/preprints.38,https://preprints.aijr.org/index.php/ap/prepri...,Marine Algae as a Natural Source for Antiviral...,"Musale, Amar S; G., Raja Krishna Kumar; Sapre,...",,,,,,,,,false,,,,
4,crossref::10.21467/preprints.36,AIJR Preprints,crossref,10.21467/preprints.36,https://doi.org/10.21467/preprints.36,https://preprints.aijr.org/index.php/ap/prepri...,Possible Prevention of COVID 19 by Using Linol...,"Subhash, Venkata; G, Raja Krishna Kumar; Sapre...",,,,,,,,,false,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9486888,openalex::W999325625,viXra,openalex,,,https://vixra.org/pdf/1409.0090v1.pdf,Three Objections to Modern Physics,Lubomir Vlcek,,,,,,,,,,,,,
9486889,openalex::W999460032,viXra,openalex,,,https://vixra.org/abs/1112.0094,Particle Mass Ratios,DT Froedge,,,,,,,,,,,,,
9486890,openalex::W99967155,viXra,openalex,,,https://vixra.org/pdf/1406.0019v1.pdf,Quantum FFF Theory Proposals for Some Unsolved...,Leo Vuyk,,,,,,,,,,,,,
9486891,openalex::W999790414,viXra,openalex,,,https://vixra.org/pdf/1306.0105v3.pdf,Investigation of the Formalism of Particle Dyn...,Chi-Yi Chen,,,,,,,,,,,,,


In [5]:
import pandas as pd

# ================================
# 1) Load saved artifacts
# ================================
# NOTE(review): both paths are relative to the kernel's working directory, unlike the
# ROOT-anchored parquet paths built in the DuckDB cell — confirm where outputs/ lives.
# Unpickling can execute arbitrary code: only load pickles produced by this project.
records_hierarchy_df = pd.read_pickle("outputs/records_hierarchy_df.pkl")
date_first_seen_df   = pd.read_pickle("outputs/date_first_seen.pkl")

# ================================
# 2) Normalize keys
# ================================
def norm_key(s):
    """Return ``s`` cast to ``str`` with leading/trailing whitespace stripped from each value."""
    as_text = s.astype(str)
    return as_text.str.strip()

# Apply the same key normalisation to both join columns of all three frames
# (the two loaded artifacts and the raw data), writing the result back in place.
for frame in (records_hierarchy_df, date_first_seen_df, data):
    for key_col in ("record_id", "server_name"):
        frame[key_col] = norm_key(frame[key_col])

# ================================
# 3) Ensure each RHS table is unique on (record_id, server_name)
# ================================
# Keep the first row for every (record_id, server_name) pair in each right-hand table
records_hierarchy_df = records_hierarchy_df.drop_duplicates(
    subset=["record_id", "server_name"], keep="first"
)
date_first_seen_df = date_first_seen_df.drop_duplicates(
    subset=["record_id", "server_name"], keep="first"
)

# ================================
# 4) Build MASTER pairs from records_hierarchy_df
#    (this is what you said you want to keep)
# ================================
# Set of plain (record_id, server_name) tuples, used for membership tests below
master_pairs = set(
    records_hierarchy_df[["record_id", "server_name"]].itertuples(index=False, name=None)
)

# Filter helpers
def filter_to_master_pairs(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of the rows of ``df`` whose (record_id, server_name) pair is in the
    module-level ``master_pairs`` set. Row order and index labels are preserved.
    """
    row_pairs = pd.Series(
        list(zip(df["record_id"], df["server_name"])),
        index=df.index,
    )
    in_master = row_pairs.isin(master_pairs)
    return df.loc[in_master].copy()

# ================================
# 5) Filter data and date_first_seen to MASTER pairs
# ================================
join_keys = ["record_id", "server_name"]

# Restrict both frames to the master pairs; data is additionally reduced to one row per
# key (first occurrence kept) so the one-to-one validation below can hold.
data_master = filter_to_master_pairs(data).drop_duplicates(subset=join_keys, keep="first")
date_first_seen_master = filter_to_master_pairs(date_first_seen_df)

# ================================
# 6) Merge (left join from MASTER DATA)
# ================================
# Two chained left joins on the key pair; validate="one_to_one" makes pandas raise
# if either side of a join has a repeated key.
data_clean_hierarchy = pd.merge(
    pd.merge(
        data_master,
        records_hierarchy_df,
        on=join_keys,
        how="left",
        validate="one_to_one",
    ),
    date_first_seen_master,
    on=join_keys,
    how="left",
    validate="one_to_one",
)

# ================================
# 7) Sanity checks (correct ones)
# ================================
# Sizes of the master table and of each frame before/after filtering
print("Master pairs (records_hierarchy_df rows):", records_hierarchy_df.shape[0])
print("Master unique record_id:", records_hierarchy_df["record_id"].nunique())

print("Rows in raw data:", data.shape[0])
print("Rows in data after filtering to master pairs:", data_master.shape[0])

print("Rows in date_first_seen after filtering to master pairs:", date_first_seen_master.shape[0])
print("Final rows in data_clean_hierarchy:", data_clean_hierarchy.shape[0])

# Left-join misses show up as nulls in the columns brought in from the right-hand tables
print("\nMissing records_hierarchy:", data_clean_hierarchy["records_hierarchy"].isna().sum())
print("Missing date_first_seen:", data_clean_hierarchy["date_first_seen"].isna().sum())
print("Missing publication_year_first_seen:", data_clean_hierarchy["publication_year_first_seen"].isna().sum())

# Rows whose key pair repeats an earlier row in the merged frame
print("\nDuplicates on (record_id, server_name) in final:",
      data_clean_hierarchy.duplicated(subset=join_keys).sum())

# Label distribution (nulls included), 30 most frequent values
print("\nHierarchy counts:")
print(data_clean_hierarchy["records_hierarchy"].value_counts(dropna=False).head(30))


Master pairs (records_hierarchy_df rows): 8410094
Master unique record_id: 8410094
Rows in raw data: 8413526
Rows in data after filtering to master pairs: 8410094
Rows in date_first_seen after filtering to master pairs: 8410094
Final rows in data_clean_hierarchy: 8410094

Missing records_hierarchy: 0
Missing date_first_seen: 0
Missing publication_year_first_seen: 0

Duplicates on (record_id, server_name) in final: 0

Hierarchy counts:
records_hierarchy
parent                              7950093
review                               175443
part_of                              149454
version                              105977
publish_version                        9351
mirror (arXiv)                         8286
mirror (AgEcon Search)                 6702
child                                  2028
mirror (ResearchGate)                  1633
correction                              355
comment                                 325
mirror (Zenodo)                         297
mirror (SSRN)  

In [6]:
# 1. Create the backup copy in a new column — but only once. If this cell is re-run after
#    step 2, "records_hierarchy" holds nothing but 'parent', and copying it again would
#    silently overwrite the original labels stored in the backup.
if "records_hierarchy_backup" not in data_clean_hierarchy.columns:
    data_clean_hierarchy["records_hierarchy_backup"] = data_clean_hierarchy["records_hierarchy"].copy()

# 2. Overwrite all cells in the original column with the string 'parent'
#    (presumably so every row is eligible for the dedupe stage's default
#    overwrite_mode="parent_only" — confirm with the pipeline call).
data_clean_hierarchy["records_hierarchy"] = 'parent'


In [7]:
# After the reset above, the only label left in this column should be 'parent'.
data_clean_hierarchy["records_hierarchy"].value_counts(dropna=False).head(60)

records_hierarchy
parent    8410094
Name: count, dtype: int64

In [8]:
# The backup column should still show the label distribution from before the reset.
data_clean_hierarchy["records_hierarchy_backup"].value_counts(dropna=False).head(60)

records_hierarchy_backup
parent                              7950093
review                               175443
part_of                              149454
version                              105977
publish_version                        9351
mirror (arXiv)                         8286
mirror (AgEcon Search)                 6702
child                                  2028
mirror (ResearchGate)                  1633
correction                              355
comment                                 325
mirror (Zenodo)                         297
mirror (SSRN)                            36
mirror (Open Science Framework)          31
mirror (bioRxiv)                         29
mirror (Humanities Commons CORE)         24
others                                   12
parent_duplicate                          3
mirror (eLife)                            3
mirror (Research Square)                  2
mirror (CERN document server)             2
mirror (TechRxiv)                         1
mirror 

In [9]:
# Number of non-null values in each column
data_clean_hierarchy.count()

record_id                      8410094
server_name                    8410094
backend                        8410094
doi                            6784859
doi_url                        6784859
landing_page_url               8330648
title                          8410069
authors_flat                   8194686
institutions_flat              1865338
countries_flat                  867847
relations_json                 4253292
version_label                  2974496
is_version_of                  6707562
is_preprint_of                 6707562
has_preprint                   6707562
has_review                     6707562
has_published_version          6707562
published_version_ids_json           0
version_of_ids_json                  0
update_to_json                    8899
raw_relationships_json         3520391
records_hierarchy              8410094
date_first_seen                8410094
publication_year_first_seen    8410094
records_hierarchy_backup       8410094
dtype: int64

# Dedupe on title + authors (+ optional year)

In [10]:
"""
Reproducible 2-pass dedupe pipeline (Exact pass -> Fuzzy pass) with:
- Strong-but-cheap title normalization (cached)
- 3 author signatures: tokenbag | last_initial | last
- Stage A strict (title + authors_fp) exact
- Stage B relaxed (shared authors overlap) within exact-title groups (optional per stage)
- Optional fuzzy title fallback (token containment) BLOCKED by authors_fp (+ optional year)
- Prefilter modes:
    * title_dup  : keep rows where cleaned title repeats (fast exact stages)
    * author_dup : keep rows where authors_fp repeats (enables fuzzy stages when titles differ)
    * none       : keep all eligible (debug)

Includes:
- Metrics counters per stage
- Summary printing + early stop
- Deterministic labeling
- Designed for speed + low false positives (especially with last_initial)

USAGE:
1) Define STAGES_EXACT and STAGES_FUZZY
2) Run:
   df_out, metrics = run_dedupe_pipeline_two_passes(
       df,
       stages_exact=STAGES_EXACT,
       stages_fuzzy=STAGES_FUZZY,
       early_stop_if_new_labels_lt=500,
       print_summary=True,
       return_all_metrics=True,
       servers=None,
       across_servers=True,
       use_year=False,
       choose_parent="oldest",
       prefilter=True,
       date_candidates=('date_first_seen',),
       hierarchy_col="records_hierarchy",
       parent_id_col="parent_record_id",
       group_id_col="dup_group_id",
       add_authors_fingerprint_col=True,
       add_title_clean_col=True,
   )
"""

import pandas as pd
import numpy as np
import re
import time
import unicodedata
from typing import Iterable, Optional, Dict, Any, Tuple, List

# ============================================================
# 0) Regex + NA helpers
# ============================================================
# Any run of one or more whitespace characters (used to collapse runs into a single space)
_WS = re.compile(r"\s+")
# Any single character that is neither a word character nor whitespace.
# Note: "_" counts as a word character for \w, so underscores are NOT matched here.
_PUNCT_ALL = re.compile(r"[^\w\s]", re.UNICODE)  # remove everything except word chars + spaces
# Placeholder strings treated as "missing"; all lowercase, so the helpers in this cell
# lowercase a value before testing membership.
NA_LIKE = {"", "none", "null", "nan", "n/a", "[]", "{}", "na"}


# ============================================================
# 1) Utility: pick a date column + record_id numeric fallback
# ============================================================
def _pick_first_existing(df: pd.DataFrame, candidates: Iterable[str]) -> Optional[str]:
    for c in candidates:
        if c in df.columns:
            return c
    return None


def _record_id_key(s: pd.Series) -> pd.Series:
    """Fast numeric key from record_id (extract first digits)."""
    digits = s.astype("string").str.extract(r"(\d+)")[0]
    return pd.to_numeric(digits, errors="coerce")


# ============================================================
# 2) Title normalization (cheap, high ROI) + token containment
# ============================================================
def _strip_accents_text(x: str) -> str:
    return "".join(
        c for c in unicodedata.normalize("NFKD", x) if not unicodedata.combining(c)
    )


def _clean_title_series_v2(s: pd.Series) -> pd.Series:
    """
    Normalise a Series of titles so that near-identical spellings compare equal.

    Steps, in order:
      1. cast to string, replace missing values with "", trim, lowercase
      2. blank out values that are NA-like placeholders (members of NA_LIKE)
      3. strip accents
      4. replace every character matched by _PUNCT_ALL with a space
      5. collapse whitespace runs to a single space and trim
    """
    lowered = s.astype("string").fillna("").str.strip().str.lower()
    without_placeholders = lowered.mask(lowered.isin(list(NA_LIKE)), "")
    unaccented = without_placeholders.apply(_strip_accents_text)
    depunctuated = unaccented.str.replace(_PUNCT_ALL, " ", regex=True)
    return depunctuated.str.replace(_WS, " ", regex=True).str.strip()


def _title_tokens_from_clean(title_clean: str) -> List[str]:
    """Tokenize already-clean title into tokens; drop very short tokens (len < 2)."""
    if not title_clean:
        return []
    return [t for t in title_clean.split(" ") if len(t) >= 2]


def _containment_score(a_tokens: List[str], b_tokens: List[str]) -> float:
    """
    Containment score:
        |A ∩ B| / min(|A|, |B|)
    Good for small title differences when tokens still mostly match.
    """
    if not a_tokens or not b_tokens:
        return 0.0
    A, B = set(a_tokens), set(b_tokens)
    denom = min(len(A), len(B))
    if denom <= 0:
        return 0.0
    return len(A & B) / denom


# ============================================================
# 3) Author canonicalization (3 modes)
# ============================================================
def _strip_accents(s: str) -> str:
    return "".join(
        c for c in unicodedata.normalize("NFKD", s) if not unicodedata.combining(c)
    )


def _normalize_one_author_tokenbag(author: str) -> str:
    """
    Order-insensitive signature for one author name.

    The name is accent-stripped and lowercased, punctuation becomes spaces, and the
    resulting tokens are sorted and joined with "_". Empty or NA-like names, and names
    that contain no tokens after cleaning, give "".
    """
    if not author:
        return ""
    name = _strip_accents(str(author)).lower().strip()
    if name == "" or name in NA_LIKE:
        return ""
    name = _WS.sub(" ", _PUNCT_ALL.sub(" ", name)).strip()
    # filter(None, ...) drops empty pieces; an all-punctuation name leaves no tokens,
    # and joining an empty list yields "".
    tokens = sorted(filter(None, name.split(" ")))
    return "_".join(tokens)


def _normalize_one_author_last_initial(author: str) -> str:
    """
    Signature of the form "last|first_initial" for one author name.

    - With a comma ("Last, First ..."): last = first token before the first comma,
      initial = first letter of the first token after it.
    - Without a comma ("First ... Last"): last = final token, initial = first letter
      of the first token; at least two tokens are required.
    - Whenever a surname or an initial cannot be found the result is "", which keeps
      under-specified names from matching each other.
    """
    if not author:
        return ""
    raw = _strip_accents(str(author)).lower().strip()
    if not raw or raw in NA_LIKE:
        return ""

    def _tokens(text: str) -> List[str]:
        # punctuation -> space, collapse whitespace, split into non-empty tokens
        cleaned = _WS.sub(" ", _PUNCT_ALL.sub(" ", text)).strip()
        return [tok for tok in cleaned.split(" ") if tok]

    if "," in raw:
        surname_part, given_part = raw.split(",", 1)
        surname_tokens = _tokens(surname_part)
        given_tokens = _tokens(given_part)
        if not surname_tokens or not given_tokens:
            return ""
        # multi-token surnames are represented by their first token only
        surname = surname_tokens[0]
        initial = given_tokens[0][:1]
    else:
        tokens = _tokens(raw)
        if len(tokens) < 2:
            return ""
        initial = tokens[0][:1]
        surname = tokens[-1]

    if initial and surname:
        return f"{surname}|{initial}"
    return ""


def _normalize_one_author_last(author: str) -> str:
    """
    Surname-only signature for one author name (broadest matching of the three modes).

    With a comma, the surname is the first token of the text before the first comma;
    without one, it is the final token of the whole name. Returns "" when the name is
    empty, NA-like, or has no tokens after cleaning.
    """
    if not author:
        return ""
    raw = _strip_accents(str(author)).lower().strip()
    if not raw or raw in NA_LIKE:
        return ""

    if "," in raw:
        # "Last, First": look only at the part before the comma, take its first token
        segment, pick = raw.split(",", 1)[0].strip(), 0
    else:
        # "First ... Last": take the last token of the full name
        segment, pick = raw, -1

    segment = _WS.sub(" ", _PUNCT_ALL.sub(" ", segment)).strip()
    tokens = [tok for tok in segment.split(" ") if tok]
    return tokens[pick] if tokens else ""


def build_authors_fingerprint_series(authors_flat: pd.Series, mode: str) -> pd.Series:
    """
    Turn a Series of ';'-separated author lists into one fingerprint string per row.

    For each row: split on ';', normalise every author with the normaliser selected by
    ``mode``, discard empty signatures, de-duplicate, sort, and join with ';'.
    Missing or NA-like cells produce "".

    Raises:
        ValueError: if ``mode`` is not one of tokenbag | last_initial | last.
    """
    normalizers = {
        "tokenbag": _normalize_one_author_tokenbag,
        "last_initial": _normalize_one_author_last_initial,
        "last": _normalize_one_author_last,
    }
    if mode not in normalizers:
        raise ValueError("mode must be tokenbag | last_initial | last")
    normalize = normalizers[mode]

    cells = authors_flat.astype("string").fillna("").str.strip()
    cells = cells.mask(cells.str.lower().isin(list(NA_LIKE)), "")

    def _fingerprint(cell: str) -> str:
        if not cell:
            return ""
        signatures = {
            normalize(part.strip())
            for part in str(cell).split(";")
            if part.strip()
        }
        signatures.discard("")  # authors that could not be normalised
        return ";".join(sorted(signatures))

    return cells.apply(_fingerprint)


def _author_tokens_from_fp(fp: str) -> List[str]:
    if not fp:
        return []
    return [t for t in fp.split(";") if t]


def _overlap_count(a_tokens: List[str], b_tokens: List[str]) -> int:
    if not a_tokens or not b_tokens:
        return 0
    return len(set(a_tokens) & set(b_tokens))


# ============================================================
# 4) Single-stage dedupe:
#    - Prefilter (title_dup/author_dup/none)
#    - Stage A strict (title + authors_fp) exact match
#    - Optional fuzzy title fallback (within same authors_fp)
#    - Optional Stage B relaxed (shared authors overlap) within exact-title groups
# ============================================================
def dedupe_title_authors_stage(
    df: pd.DataFrame,
    *,
    # stage config
    stage_name: str = "stage",
    authors_fp_mode: str = "tokenbag",         # tokenbag | last_initial | last

    # fuzzy config (title containment), executed only if enabled
    title_fuzzy_fallback: bool = False,
    min_title_tokens: int = 6,
    min_title_containment: float = 0.70,
    fuzzy_compare_strategy: str = "parent_only",  # parent_only | all_pairs_small (parent_only is safest/fastest)

    # relaxed (shared authors overlap) config (exact title only)
    relaxed_shared_authors: bool = True,
    min_authors_required: int = 2,
    min_shared_authors: int = 2,

    # prefilter strategy (important!)
    prefilter_mode: str = "title_dup",         # title_dup | author_dup | none
    prefilter: bool = True,                    # if False, skip ">=2" group filter (slower)

    # global options
    servers=None,
    across_servers: bool = True,
    use_year: bool = False,
    choose_parent: str = "oldest",             # oldest | most_recent
    overwrite_mode: str = "parent_only",       # any | parent_only | unlabeled_only

    # columns
    server_col: str = "server_name",
    record_id_col: str = "record_id",
    title_col: str = "title",
    authors_col: str = "authors_flat",
    year_col: str = "publication_year_first_seen",
    date_candidates: Tuple[str, ...] = ("date_first_seen",),

    hierarchy_col: str = "records_hierarchy",
    parent_id_col: str = "parent_record_id",
    group_id_col: str = "dup_group_id",

    # caching/debug columns
    add_authors_fingerprint_col: bool = True,
    authors_fingerprint_col: str = "authors_fp",
    add_title_clean_col: bool = True,
    title_clean_col: str = "title_clean_v2",

    return_metrics: bool = False,
) -> pd.DataFrame | Tuple[pd.DataFrame, Dict[str, Any]]:
    """
    One dedupe stage. Designed to be composed into a multi-stage pipeline.
    """

    t0 = time.perf_counter()

    metrics: Dict[str, Any] = {
        "stage_name": stage_name,
        "n_rows_df": int(len(df)),
        "n_candidates_initial": 0,
        "prefilter_mode": prefilter_mode,
        "prefilter_rows": 0,
        "prefilter_groups": 0,
        "work_rows_after_keys": 0,

        "stageA_groups": 0,
        "stageA_children_labeled": 0,

        "fuzzy_enabled": bool(title_fuzzy_fallback),
        "fuzzy_groups": 0,
        "fuzzy_pairs_checked": 0,
        "fuzzy_children_labeled": 0,

        "stageB_enabled": bool(relaxed_shared_authors),
        "stageB_title_groups": 0,
        "stageB_clusters": 0,
        "stageB_children_labeled": 0,

        "time_s": 0.0,
    }

    # ------------------------------------------------------
    # Ensure output cols exist
    # ------------------------------------------------------
    for c in (hierarchy_col, parent_id_col, group_id_col):
        if c not in df.columns:
            df[c] = pd.NA

    if add_authors_fingerprint_col and authors_fingerprint_col not in df.columns:
        df[authors_fingerprint_col] = pd.NA
    if add_title_clean_col and title_clean_col not in df.columns:
        df[title_clean_col] = pd.NA

    # ------------------------------------------------------
    # Eligibility
    # ------------------------------------------------------
    h = df[hierarchy_col]
    if overwrite_mode == "any":
        eligible = pd.Series(True, index=df.index)
    elif overwrite_mode == "parent_only":
        eligible = h.astype("string").str.lower().str.strip().eq("parent")
    elif overwrite_mode == "unlabeled_only":
        eligible = h.isna()
    else:
        raise ValueError("overwrite_mode must be any | parent_only | unlabeled_only")

    # server filter
    if servers is None:
        server_mask = pd.Series(True, index=df.index)
    elif isinstance(servers, str):
        server_mask = df[server_col].eq(servers)
    else:
        server_mask = df[server_col].isin(list(servers))

    m = eligible & server_mask
    metrics["n_candidates_initial"] = int(m.sum())
    if not m.any():
        metrics["time_s"] = time.perf_counter() - t0
        return (df, metrics) if return_metrics else df

    # ------------------------------------------------------
    # Prefilter: decide which indices to consider in this stage
    # ------------------------------------------------------
    if prefilter_mode == "title_dup":
        # title-based prefilter (fast for exact title stages)
        t_clean = df.loc[m, title_clean_col] if (add_title_clean_col and title_clean_col in df.columns and df.loc[m, title_clean_col].notna().any()) else None
        if t_clean is None:
            t_clean = _clean_title_series_v2(df.loc[m, title_col])
        vc = t_clean.value_counts()
        keep_idx = t_clean[t_clean.isin(vc[vc >= 2].index)].index
        metrics["prefilter_groups"] = int((vc >= 2).sum())

    elif prefilter_mode == "author_dup":
        # author-fp based prefilter (crucial for fuzzy pass; titles may differ)
        # compute fp only for m rows
        a_fp = build_authors_fingerprint_series(df.loc[m, authors_col], mode=authors_fp_mode)
        vc = a_fp.value_counts()
        keep_idx = a_fp[a_fp.isin(vc[vc >= 2].index)].index
        metrics["prefilter_groups"] = int((vc >= 2).sum())

    elif prefilter_mode == "none":
        keep_idx = df.index[m]
        metrics["prefilter_groups"] = 0

    else:
        raise ValueError("prefilter_mode must be title_dup | author_dup | none")

    metrics["prefilter_rows"] = int(len(keep_idx))
    if len(keep_idx) == 0:
        metrics["time_s"] = time.perf_counter() - t0
        return (df, metrics) if return_metrics else df

    # ------------------------------------------------------
    # Work subset + compute/attach cached normalization keys
    # ------------------------------------------------------
    cols_needed = [server_col, record_id_col, title_col, authors_col]
    if use_year:
        cols_needed.append(year_col)
    date_col = _pick_first_existing(df, date_candidates)
    if date_col:
        cols_needed.append(date_col)

    work = df.loc[keep_idx, cols_needed].copy()

    # Title clean (cache to df if asked)
    if add_title_clean_col:
        # compute for missing only (cheap)
        t_missing = df.loc[work.index, title_clean_col].isna()
        if t_missing.any():
            df.loc[work.index[t_missing], title_clean_col] = _clean_title_series_v2(df.loc[work.index[t_missing], title_col]).values
        work["_t"] = df.loc[work.index, title_clean_col].astype("string").fillna("")
    else:
        work["_t"] = _clean_title_series_v2(work[title_col])

    # Authors fp (mode-specific; cache into df column if asked)
    work["_a_fp"] = build_authors_fingerprint_series(work[authors_col], mode=authors_fp_mode)
    if add_authors_fingerprint_col:
        df.loc[work.index, authors_fingerprint_col] = work["_a_fp"].values

    # Year (optional)
    if use_year:
        y = pd.to_numeric(work[year_col], errors="coerce")
        y = y.where((y >= 1000) & (y <= 3000)).round().astype("Int64")
        work["_y"] = y
    else:
        work["_y"] = pd.NA

    # require non-empty keys
    if use_year:
        work = work[(work["_t"] != "") & (work["_a_fp"] != "") & work["_y"].notna()].copy()
    else:
        work = work[(work["_t"] != "") & (work["_a_fp"] != "")].copy()

    metrics["work_rows_after_keys"] = int(len(work))
    if work.empty:
        metrics["time_s"] = time.perf_counter() - t0
        return (df, metrics) if return_metrics else df

    # ------------------------------------------------------
    # Stage A STRICT: exact match on (title_clean + authors_fp [+year] [+server scope])
    # ------------------------------------------------------
    if use_year:
        strict_base = work["_t"] + "||" + work["_a_fp"] + "||" + work["_y"].astype("string")
    else:
        strict_base = work["_t"] + "||" + work["_a_fp"]

    if across_servers:
        work["_grp_strict"] = strict_base
    else:
        work["_grp_strict"] = work[server_col].astype("string") + "||" + strict_base

    strict = work
    if prefilter:
        vcg = work["_grp_strict"].value_counts()
        dup_keys = vcg[vcg >= 2].index
        strict = work[work["_grp_strict"].isin(dup_keys)].copy()

    metrics["stageA_groups"] = int(strict["_grp_strict"].nunique()) if not strict.empty else 0

    # sort keys for parent choice
    if date_col and date_col in strict.columns:
        strict["_dt"] = pd.to_datetime(strict[date_col], errors="coerce")
    else:
        strict["_dt"] = pd.NaT
    strict["_rid"] = _record_id_key(strict[record_id_col])

    if not strict.empty:
        if choose_parent == "oldest":
            strict = strict.sort_values(
                by=["_grp_strict", "_dt", "_rid"],
                ascending=[True, True, True],
                na_position="last",
            )
        elif choose_parent == "most_recent":
            strict = strict.sort_values(
                by=["_grp_strict", "_dt", "_rid"],
                ascending=[True, False, False],
                na_position="last",
            )
        else:
            raise ValueError("choose_parent must be oldest | most_recent")

        parents = strict.groupby("_grp_strict", sort=False).head(1)
        parent_rid_map = parents.set_index("_grp_strict")[record_id_col]
        parent_srv_map = parents.set_index("_grp_strict")[server_col]

        strict["_parent_rid"] = strict["_grp_strict"].map(parent_rid_map)
        strict["_parent_srv"] = strict["_grp_strict"].map(parent_srv_map)

        is_parent = strict[record_id_col].eq(strict["_parent_rid"])
        parent_idx = strict.index[is_parent]
        child_idx = strict.index[~is_parent]

        metrics["stageA_children_labeled"] = int(len(child_idx))

        df.loc[parent_idx, hierarchy_col] = "parent"
        df.loc[parent_idx, parent_id_col] = pd.NA
        df.loc[child_idx, hierarchy_col] = (
            "parent - duplicate (" + strict.loc[child_idx, "_parent_srv"].astype("string") + ")"
        )
        df.loc[child_idx, parent_id_col] = strict.loc[child_idx, "_parent_rid"].values

        # deterministic group id
        df.loc[strict.index, group_id_col] = (
            pd.util.hash_pandas_object(strict["_grp_strict"], index=False)
            .astype("uint64")
            .astype(str)
            .values
        )

    # ------------------------------------------------------
    # Fuzzy title fallback (BLOCKED by authors_fp [+year], only remaining eligible)
    # Important: this can find near-duplicate titles because we do NOT rely on title_dup.
    # ------------------------------------------------------
    if title_fuzzy_fallback:
        # remaining eligible after Stage A
        h2 = df[hierarchy_col]
        if overwrite_mode == "parent_only":
            eligible2 = h2.astype("string").str.lower().str.strip().eq("parent")
        elif overwrite_mode == "unlabeled_only":
            eligible2 = h2.isna()
        else:
            eligible2 = pd.Series(True, index=df.index)

        remain_idx = work.index.intersection(df.index[eligible2])
        wF = work.loc[remain_idx].copy()

        if not wF.empty:
            # block by authors_fp (+year) because authors are "more trustworthy"
            if use_year:
                wF["_grp_auth"] = wF["_a_fp"] + "||" + wF["_y"].astype("string")
            else:
                wF["_grp_auth"] = wF["_a_fp"]

            # keep only blocks with >=2 rows
            vc_auth = wF["_grp_auth"].value_counts()
            keep_auth = vc_auth[vc_auth >= 2].index
            wF = wF[wF["_grp_auth"].isin(keep_auth)].copy()

            metrics["fuzzy_groups"] = int(wF["_grp_auth"].nunique()) if not wF.empty else 0

            if not wF.empty:
                # date/rid for parent selection
                if date_col and date_col in wF.columns:
                    wF["_dt"] = pd.to_datetime(wF[date_col], errors="coerce")
                else:
                    wF["_dt"] = pd.NaT
                wF["_rid"] = _record_id_key(wF[record_id_col])

                # tokens cache per row (within this stage)
                tokens_map = {idx: _title_tokens_from_clean(wF.loc[idx, "_t"]) for idx in wF.index}

                for grp, g in wF.groupby("_grp_auth", sort=False):
                    if len(g) < 2:
                        continue

                    # gate: ignore titles with too few tokens
                    idxs = [idx for idx in g.index if len(tokens_map.get(idx, [])) >= min_title_tokens]
                    if len(idxs) < 2:
                        continue

                    gg = g.loc[idxs].copy()
                    if choose_parent == "oldest":
                        gg = gg.sort_values(by=["_dt", "_rid"], ascending=[True, True], na_position="last")
                    else:
                        gg = gg.sort_values(by=["_dt", "_rid"], ascending=[False, False], na_position="last")

                    if fuzzy_compare_strategy == "parent_only":
                        parent_idx = gg.index[0]
                        parent_tokens = tokens_map[parent_idx]
                        parent_rid = gg.loc[parent_idx, record_id_col]
                        parent_srv = gg.loc[parent_idx, server_col]

                        # ensure parent labeled
                        df.loc[parent_idx, hierarchy_col] = "parent"
                        df.loc[parent_idx, parent_id_col] = pd.NA

                        for idx in gg.index[1:]:
                            metrics["fuzzy_pairs_checked"] += 1
                            sc = _containment_score(parent_tokens, tokens_map[idx])
                            if sc >= min_title_containment:
                                df.loc[idx, hierarchy_col] = f"parent - duplicate ({parent_srv})"
                                df.loc[idx, parent_id_col] = parent_rid
                                df.loc[idx, group_id_col] = f"fuzzy::{authors_fp_mode}::{grp}"
                                metrics["fuzzy_children_labeled"] += 1

                    elif fuzzy_compare_strategy == "all_pairs_small":
                        # safer than global all-pairs; still can be heavy if blocks are large.
                        # We'll cluster by greedy expansion (bounded within block).
                        idxs2 = gg.index.tolist()
                        used = set()
                        for i in idxs2:
                            if i in used:
                                continue
                            used.add(i)
                            cluster = [i]
                            for j in idxs2:
                                if j in used:
                                    continue
                                metrics["fuzzy_pairs_checked"] += 1
                                sc = _containment_score(tokens_map[i], tokens_map[j])
                                if sc >= min_title_containment:
                                    used.add(j)
                                    cluster.append(j)

                            if len(cluster) >= 2:
                                # choose parent (oldest/most recent) within cluster
                                cldf = gg.loc[cluster].copy()
                                if choose_parent == "oldest":
                                    cldf = cldf.sort_values(by=["_dt", "_rid"], ascending=[True, True], na_position="last")
                                else:
                                    cldf = cldf.sort_values(by=["_dt", "_rid"], ascending=[False, False], na_position="last")

                                p_idx = cldf.index[0]
                                p_rid = cldf.loc[p_idx, record_id_col]
                                p_srv = cldf.loc[p_idx, server_col]
                                df.loc[p_idx, hierarchy_col] = "parent"
                                df.loc[p_idx, parent_id_col] = pd.NA
                                for cidx in cldf.index[1:]:
                                    df.loc[cidx, hierarchy_col] = f"parent - duplicate ({p_srv})"
                                    df.loc[cidx, parent_id_col] = p_rid
                                    df.loc[cidx, group_id_col] = f"fuzzy::{authors_fp_mode}::{grp}"
                                    metrics["fuzzy_children_labeled"] += 1
                    else:
                        raise ValueError("fuzzy_compare_strategy must be parent_only | all_pairs_small")

    # ------------------------------------------------------
    # Stage B RELAXED (shared authors overlap) within exact title groups
    # ------------------------------------------------------
    if relaxed_shared_authors:
        h3 = df[hierarchy_col]
        if overwrite_mode == "parent_only":
            eligible3 = h3.astype("string").str.lower().str.strip().eq("parent")
        elif overwrite_mode == "unlabeled_only":
            eligible3 = h3.isna()
        else:
            eligible3 = pd.Series(True, index=df.index)

        remain_idx = work.index.intersection(df.index[eligible3])
        w2 = work.loc[remain_idx].copy()
        if not w2.empty:
            if use_year:
                relaxed_base = w2["_t"] + "||" + w2["_y"].astype("string")
            else:
                relaxed_base = w2["_t"]

            if across_servers:
                w2["_grp_title"] = relaxed_base
            else:
                w2["_grp_title"] = w2[server_col].astype("string") + "||" + relaxed_base

            # keep only repeated titles
            vc2 = w2["_grp_title"].value_counts()
            keep_groups = vc2[vc2 >= 2].index
            w2 = w2[w2["_grp_title"].isin(keep_groups)].copy()

            metrics["stageB_title_groups"] = int(w2["_grp_title"].nunique()) if not w2.empty else 0

            if not w2.empty:
                w2["_a_tokens"] = w2["_a_fp"].apply(_author_tokens_from_fp)
                w2["_a_n"] = w2["_a_tokens"].apply(len)

                if date_col and date_col in w2.columns:
                    w2["_dt"] = pd.to_datetime(w2[date_col], errors="coerce")
                else:
                    w2["_dt"] = pd.NaT
                w2["_rid"] = _record_id_key(w2[record_id_col])

                group_counter = 0
                children_total = 0

                for grp, g in w2.groupby("_grp_title", sort=False):
                    if len(g) < 2:
                        continue

                    g = g[g["_a_n"] >= min_authors_required].copy()
                    if len(g) < 2:
                        continue

                    idxs = g.index.tolist()
                    used = set()
                    clusters = []

                    # simple greedy clustering based on author overlap
                    for i in idxs:
                        if i in used:
                            continue
                        used.add(i)
                        cl = [i]
                        for j in idxs:
                            if j in used:
                                continue
                            if _overlap_count(g.loc[i, "_a_tokens"], g.loc[j, "_a_tokens"]) >= min_shared_authors:
                                used.add(j)
                                cl.append(j)
                        if len(cl) >= 2:
                            clusters.append(cl)

                    for cl in clusters:
                        group_counter += 1
                        cldf = g.loc[cl].copy()
                        if choose_parent == "oldest":
                            cldf = cldf.sort_values(by=["_dt", "_rid"], ascending=[True, True], na_position="last")
                        else:
                            cldf = cldf.sort_values(by=["_dt", "_rid"], ascending=[False, False], na_position="last")

                        parent_idx = cldf.index[0]
                        parent_rid = cldf.loc[parent_idx, record_id_col]
                        parent_srv = cldf.loc[parent_idx, server_col]

                        df.loc[parent_idx, hierarchy_col] = "parent"
                        df.loc[parent_idx, parent_id_col] = pd.NA
                        df.loc[parent_idx, group_id_col] = f"relaxed::{stage_name}::{group_counter}"

                        child_idxs = [x for x in cldf.index if x != parent_idx]
                        children_total += len(child_idxs)

                        df.loc[child_idxs, hierarchy_col] = f"parent - duplicate ({parent_srv})"
                        df.loc[child_idxs, parent_id_col] = parent_rid
                        df.loc[child_idxs, group_id_col] = f"relaxed::{stage_name}::{group_counter}"

                metrics["stageB_clusters"] = int(group_counter)
                metrics["stageB_children_labeled"] = int(children_total)

    metrics["time_s"] = time.perf_counter() - t0
    return (df, metrics) if return_metrics else df


# ============================================================
# 5) Stage runner with summary + early stop
# ============================================================
def _count_children_labels(series: pd.Series) -> int:
    s = series.astype("string").fillna("")
    return int(s.str.startswith("parent - duplicate").sum())


def run_dedupe_stages(
    df: pd.DataFrame,
    *,
    stages: List[Dict[str, Any]],
    early_stop_if_new_labels_lt: int = 100,
    print_summary: bool = True,
    return_all_metrics: bool = True,
    # common kwargs passed to every stage
    **common_kwargs,
) -> Tuple[pd.DataFrame, List[Dict[str, Any]]] | pd.DataFrame:
    """
    Run a list of dedupe stages sequentially, feeding each stage's output
    frame into the next stage.

    For every executed stage two entries are added to its metrics dict:
      - ``new_children_added``: change in the number of hierarchy labels that
        start with "parent - duplicate" (counted by ``_count_children_labels``)
      - ``stage_runtime_s``: time spent in the stage call plus the recount

    Parameters
    ----------
    df
        Frame handed to the first stage.
    stages
        One dict per stage. ``name`` (or ``stage_name``) labels the stage;
        every other key is forwarded to ``dedupe_title_authors_stage``.
    early_stop_if_new_labels_lt
        Stop after the first stage whose ``new_children_added`` is below this
        value; later stages are not run.
    print_summary
        Print a one-line summary per stage and a note when stopping early.
    return_all_metrics
        If True return ``(frame, metrics_list)``, otherwise only the frame.
    **common_kwargs
        Keyword arguments shared by all stages. When a key also appears in a
        stage dict, the stage's value wins. ``stage_name`` and
        ``return_metrics`` are always set by this runner.

    Returns
    -------
    ``(df_out, metrics_all)`` or ``df_out`` depending on ``return_all_metrics``.
    """
    reserved_keys = {"name", "stage_name"}
    default_hierarchy_col = "records_hierarchy"

    df_out = df
    metrics_all: List[Dict[str, Any]] = []

    # Baseline count on the shared hierarchy column.
    counted_col = common_kwargs.get("hierarchy_col", default_hierarchy_col)
    prev_children = _count_children_labels(df_out[counted_col])

    for stage in stages:
        name = stage.get("name", stage.get("stage_name", "stage"))

        # Build ONE kwargs dict. Unpacking common_kwargs and the stage dict
        # separately in the call raises TypeError ("got multiple values for
        # keyword argument") as soon as both define the same key; merging
        # lets the stage override the shared value instead. The runner's own
        # keys come last so they can never be clobbered.
        stage_kwargs = {
            **common_kwargs,
            **{k: v for k, v in stage.items() if k not in reserved_keys},
            "stage_name": name,
            "return_metrics": True,
        }

        # If this stage labels a different column than the one counted so far,
        # re-baseline so the delta compares like with like.
        stage_col = stage_kwargs.get("hierarchy_col", default_hierarchy_col)
        if stage_col != counted_col:
            prev_children = _count_children_labels(df_out[stage_col])
            counted_col = stage_col

        t0 = time.perf_counter()
        df_out, m = dedupe_title_authors_stage(df_out, **stage_kwargs)

        now_children = _count_children_labels(df_out[stage_col])
        delta = now_children - prev_children
        prev_children = now_children

        m["stage_runtime_s"] = time.perf_counter() - t0
        m["new_children_added"] = int(delta)
        metrics_all.append(m)

        if print_summary:
            print(
                f"[{name}] new_children={delta} | "
                f"cand={m['n_candidates_initial']} | "
                f"prefilter_rows={m['prefilter_rows']} | "
                f"A_children={m['stageA_children_labeled']} | "
                f"fuzzy_children={m['fuzzy_children_labeled']} | "
                f"B_children={m['stageB_children_labeled']} | "
                f"time={m['stage_runtime_s']:.2f}s"
            )

        # Diminishing returns: skip the remaining stages of this list.
        if delta < early_stop_if_new_labels_lt:
            if print_summary:
                print(f"Early stop after {name}: delta {delta} < {early_stop_if_new_labels_lt}")
            break

    return (df_out, metrics_all) if return_all_metrics else df_out


# ============================================================
# 6) Two-pass pipeline: Exact pass -> Fuzzy pass on remaining parents
# ============================================================
def run_dedupe_pipeline_two_passes(
    df: pd.DataFrame,
    *,
    stages_exact: List[Dict[str, Any]],
    stages_fuzzy: List[Dict[str, Any]],
    early_stop_if_new_labels_lt: int = 100,
    print_summary: bool = True,
    return_all_metrics: bool = True,
    **common_kwargs,
) -> Tuple[pd.DataFrame, List[Dict[str, Any]]] | pd.DataFrame:
    """
    Chain two calls of ``run_dedupe_stages``: first ``stages_exact`` (pass A),
    then ``stages_fuzzy`` (pass B) on the frame produced by pass A.

    Both passes receive the same ``early_stop_if_new_labels_lt``,
    ``print_summary`` and ``**common_kwargs``. Early stop applies within each
    pass separately: pass B always runs, even if pass A stopped early.

    Configuration notes:
      - Pass A typically has no fuzzy matching and prefilter_mode="title_dup";
        overwrite_mode="any" for its first stage and "parent_only" afterwards
        is a normal setup.
      - Pass B typically enables fuzzy matching with prefilter_mode="author_dup".
        This function does not filter rows itself; use
        overwrite_mode="parent_only" in the fuzzy stage dicts so that only
        unresolved parents are touched.

    Returns ``(frame, metrics_of_all_stages)`` when ``return_all_metrics`` is
    True, otherwise just the frame.
    """
    passes = (
        ("\n=== PASS A: EXACT ===", stages_exact),
        ("\n=== PASS B: FUZZY (remaining parents) ===", stages_fuzzy),
    )

    current = df
    collected: List[Dict[str, Any]] = []

    for banner, pass_stages in passes:
        if print_summary:
            print(banner)

        # Metrics are always requested here so they can be accumulated;
        # whether the caller receives them is decided at the very end.
        current, pass_metrics = run_dedupe_stages(
            current,
            stages=pass_stages,
            early_stop_if_new_labels_lt=early_stop_if_new_labels_lt,
            print_summary=print_summary,
            return_all_metrics=True,
            **common_kwargs,
        )
        collected.extend(pass_metrics)

    if return_all_metrics:
        return current, collected
    return current


# ============================================================
# 7) Default stage configs (recommended)
# ============================================================

# PASS A (EXACT) — fast + high precision
STAGES_EXACT = [
    # A1: tokenbag fingerprint, relaxed shared-author matching enabled.
    {
        "name": "A1_tokenbag_exact",
        "authors_fp_mode": "tokenbag",
        "prefilter_mode": "title_dup",
        "title_fuzzy_fallback": False,
        "relaxed_shared_authors": True,
        "min_authors_required": 1,
        "min_shared_authors": 1,
        "overwrite_mode": "parent_only",
        "authors_fingerprint_col": "authors_fp_tokenbag",
    },
    # A2: same settings as A1, but on the last-name + initial fingerprint.
    {
        "name": "A2_last_initial_exact",
        "authors_fp_mode": "last_initial",
        "prefilter_mode": "title_dup",
        "title_fuzzy_fallback": False,
        "relaxed_shared_authors": True,
        "min_authors_required": 1,
        "min_shared_authors": 1,
        "overwrite_mode": "parent_only",
        "authors_fingerprint_col": "authors_fp_last_initial",
    },
    # A3: last-name-only fingerprint; already high recall, so the relaxed
    # shared-author step stays off (strict matching only).
    {
        "name": "A3_last_exact_strict",
        "authors_fp_mode": "last",
        "prefilter_mode": "title_dup",
        "title_fuzzy_fallback": False,
        "relaxed_shared_authors": False,
        "overwrite_mode": "parent_only",
        "authors_fingerprint_col": "authors_fp_last",
    },
]

# PASS B (FUZZY) — only remaining parents; block by authors_fp repetition
# Note: relaxed_shared_authors is usually OFF here to keep false positives down.
STAGES_FUZZY = [
    # B1: tokenbag fingerprint blocks, titles compared against the block parent only.
    {
        "name": "B1_tokenbag_fuzzy",
        "authors_fp_mode": "tokenbag",
        "prefilter_mode": "author_dup",
        "title_fuzzy_fallback": True,
        "min_title_tokens": 6,
        # start conservative; lower = more recall, more risk
        "min_title_containment": 0.75,
        "fuzzy_compare_strategy": "parent_only",
        "relaxed_shared_authors": False,
        # "min_authors_required": 1,
        # "min_shared_authors": 1,
        "overwrite_mode": "parent_only",
        "authors_fingerprint_col": "authors_fp_tokenbag",
    },
    # B2: last-name + initial fingerprint blocks with a stricter containment threshold.
    {
        "name": "B2_last_initial_fuzzy",
        "authors_fp_mode": "last_initial",
        "prefilter_mode": "author_dup",
        "title_fuzzy_fallback": True,
        "min_title_tokens": 6,
        "min_title_containment": 0.80,
        "fuzzy_compare_strategy": "parent_only",
        "relaxed_shared_authors": False,
        # "min_authors_required": 1,
        # "min_shared_authors": 1,
        "overwrite_mode": "parent_only",
        "authors_fingerprint_col": "authors_fp_last_initial",
    },
]

# ============================================================
# 8) Example usage
# ============================================================
# df_out, metrics = run_dedupe_pipeline_two_passes(
#     df,
#     stages_exact=STAGES_EXACT,
#     stages_fuzzy=STAGES_FUZZY,
#     early_stop_if_new_labels_lt=500,
#     print_summary=True,
#     return_all_metrics=True,
#     servers=None,
#     across_servers=True,
#     use_year=False,
#     choose_parent="oldest",
#     prefilter=True,
#     date_candidates=('date_first_seen',),
#     hierarchy_col="records_hierarchy",
#     parent_id_col="parent_record_id",
#     group_id_col="dup_group_id",
#     add_authors_fingerprint_col=True,
#     add_title_clean_col=True,
#     title_clean_col="title_clean_v2",
# )
#
# print(metrics[-1])
# print(df_out["records_hierarchy"].value_counts(dropna=False).head(60))


In [11]:
# Run the exact pass followed by the fuzzy pass on the full frame.
# early_stop_if_new_labels_lt=1: a pass is cut short only when a stage yields
# fewer than one net new "parent - duplicate" label (delta < 1).
# All keyword arguments after return_all_metrics are forwarded unchanged to
# every stage of both passes.
data_out, metrics = run_dedupe_pipeline_two_passes(
    data_clean_hierarchy,
    stages_exact=STAGES_EXACT,
    stages_fuzzy=STAGES_FUZZY,
    early_stop_if_new_labels_lt=1,
    print_summary=True,
    return_all_metrics=True,
    servers=None,
    across_servers=True,
    use_year=False,
    choose_parent="oldest",
    prefilter=True,
    date_candidates=('date_first_seen',),
    hierarchy_col="records_hierarchy",
    parent_id_col="parent_record_id",
    group_id_col="dup_group_id",
    add_authors_fingerprint_col=True,
    add_title_clean_col=True,
    title_clean_col="title_clean_v2",
)

# Metrics dict of the last executed stage, then the 60 most frequent
# hierarchy labels (missing values are counted as well).
print(metrics[-1])
print(data_out["records_hierarchy"].value_counts(dropna=False).head(60))



=== PASS A: EXACT ===
[A1_tokenbag_exact] new_children=480047 | cand=8410094 | prefilter_rows=1073601 | A_children=406251 | fuzzy_children=0 | B_children=73796 | time=497.49s
[A2_last_initial_exact] new_children=16338 | cand=7930047 | prefilter_rows=224930 | A_children=13750 | fuzzy_children=0 | B_children=2588 | time=240.06s
[A3_last_exact_strict] new_children=4202 | cand=7913709 | prefilter_rows=194179 | A_children=4202 | fuzzy_children=0 | B_children=0 | time=5.49s

=== PASS B: FUZZY (remaining parents) ===
[B1_tokenbag_fuzzy] new_children=134485 | cand=7909507 | prefilter_rows=3431714 | A_children=0 | fuzzy_children=134485 | B_children=0 | time=813.40s
[B2_last_initial_fuzzy] new_children=16318 | cand=7775022 | prefilter_rows=3586018 | A_children=0 | fuzzy_children=16318 | B_children=0 | time=708.43s
{'stage_name': 'B2_last_initial_fuzzy', 'n_rows_df': 8410094, 'n_candidates_initial': 7775022, 'prefilter_mode': 'author_dup', 'prefilter_rows': 3586018, 'prefilter_groups': 714875, '

In [12]:
from pathlib import Path
import json
import pandas as pd

# Target folder for everything written by this cell (created if missing).
OUT_DIR = Path("outputs_new")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Save metrics: the list of per-stage dicts as JSON, and flattened to a CSV table
with open(OUT_DIR / "dedupe_metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)

pd.DataFrame(metrics).to_csv(OUT_DIR / "dedupe_metrics.csv", index=False)

# Save dataset
data_out.to_parquet(OUT_DIR / "dedupe_data_out.parquet", index=False)

# Report the folder actually used (the message previously said "outputs/",
# which is not where the files go).
print(f"✅ All files saved to {OUT_DIR}/")


✅ All files saved to outputs/


In [13]:
from pathlib import Path

OUT_DIR = Path("outputs_new")
OUT_DIR.mkdir(exist_ok=True)

# Columns exported alongside record_id.
# (Alternative kept for reference: derive this list automatically as the
# columns of data_out that are absent from the original input frame.)
new_cols = [
    "records_hierarchy_backup",
    "records_hierarchy",
    "parent_record_id",
    "dup_group_id",
    "authors_fp_tokenbag",
    "authors_fp_last_initial",
    "authors_fp_last",
    "title_clean_v2",
]

cols_to_save = ["record_id", *new_cols]

# Same column subset written twice: parquet and CSV, both without the index.
data_out.loc[:, cols_to_save].to_parquet(OUT_DIR / "dedupe_data_out_new_cols.parquet", index=False)
data_out.loc[:, cols_to_save].to_csv(OUT_DIR / "dedupe_data_out_new_cols.csv", index=False)

print("✅ Saved:", cols_to_save)



✅ Saved: ['record_id', 'records_hierarchy_backup', 'records_hierarchy', 'parent_record_id', 'dup_group_id', 'authors_fp_tokenbag', 'authors_fp_last_initial', 'authors_fp_last', 'title_clean_v2']


In [14]:
# Show the list of per-stage metrics dicts returned by the pipeline run.
metrics

[{'stage_name': 'A1_tokenbag_exact',
  'n_rows_df': 8410094,
  'n_candidates_initial': 8410094,
  'prefilter_mode': 'title_dup',
  'prefilter_rows': 1073601,
  'prefilter_groups': 421611,
  'work_rows_after_keys': 998989,
  'stageA_groups': 320136,
  'stageA_children_labeled': 406251,
  'fuzzy_enabled': False,
  'fuzzy_groups': 0,
  'fuzzy_pairs_checked': 0,
  'fuzzy_children_labeled': 0,
  'stageB_enabled': True,
  'stageB_title_groups': 105969,
  'stageB_clusters': 71232,
  'stageB_children_labeled': 73796,
  'time_s': 495.854478934,
  'stage_runtime_s': 497.49351292999995,
  'new_children_added': 480047},
 {'stage_name': 'A2_last_initial_exact',
  'n_rows_df': 8410094,
  'n_candidates_initial': 7930047,
  'prefilter_mode': 'title_dup',
  'prefilter_rows': 224930,
  'prefilter_groups': 52987,
  'work_rows_after_keys': 146423,
  'stageA_groups': 12908,
  'stageA_children_labeled': 13750,
  'fuzzy_enabled': False,
  'fuzzy_groups': 0,
  'fuzzy_pairs_checked': 0,
  'fuzzy_children_label

In [15]:
# Display the labelled frame returned by the pipeline run.
data_out

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,institutions_flat,countries_flat,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,date_first_seen,publication_year_first_seen,records_hierarchy_backup,parent_record_id,dup_group_id,authors_fp_tokenbag,title_clean_v2,authors_fp_last_initial,authors_fp_last
0,crossref::10.21467/preprints.48,AIJR Preprints,crossref,10.21467/preprints.48,https://doi.org/10.21467/preprints.48,https://preprints.aijr.org/index.php/ap/prepri...,"Bird’s Eye View on the Diagnosis, Treatment, &...","Panchalingala, Sai Bhargavi",,,,,,,,,false,,,,,parent,2020-05-03,2020,parent,,,,,,
1,crossref::10.21467/preprints.43,AIJR Preprints,crossref,10.21467/preprints.43,https://doi.org/10.21467/preprints.43,https://preprints.aijr.org/index.php/ap/prepri...,Doxycycline and Minocycline Drugs as a Treatme...,"Mostafa, Mohamed",,,,,,,,,false,,,,,parent,2020-04-25,2020,parent,,,mohamed_mostafa,doxycycline and minocycline drugs as a treatme...,mostafa|m,
2,crossref::10.21467/preprints.39,AIJR Preprints,crossref,10.21467/preprints.39,https://doi.org/10.21467/preprints.39,https://preprints.aijr.org/index.php/ap/prepri...,A Genetic Perspective of 2019-nCoV in Relation...,"Dasgupta, Rimjhim",,,,,,,,,false,,,,,parent,2020-04-16,2020,parent,,,dasgupta_rimjhim,a genetic perspective of 2019 ncov in relation...,dasgupta|r,
3,crossref::10.21467/preprints.38,AIJR Preprints,crossref,10.21467/preprints.38,https://doi.org/10.21467/preprints.38,https://preprints.aijr.org/index.php/ap/prepri...,Marine Algae as a Natural Source for Antiviral...,"Musale, Amar S; G., Raja Krishna Kumar; Sapre,...",,,,,,,,,false,,,,,parent,2020-04-15,2020,parent,,,,,,
4,crossref::10.21467/preprints.36,AIJR Preprints,crossref,10.21467/preprints.36,https://doi.org/10.21467/preprints.36,https://preprints.aijr.org/index.php/ap/prepri...,Possible Prevention of COVID 19 by Using Linol...,"Subhash, Venkata; G, Raja Krishna Kumar; Sapre...",,,,,,,,,false,,,,,parent,2020-04-15,2020,parent,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8410089,openalex::W999325625,viXra,openalex,,,https://vixra.org/pdf/1409.0090v1.pdf,Three Objections to Modern Physics,Lubomir Vlcek,,,,,,,,,,,,,,parent,2014-09-01,2014,parent,,,lubomir_vlcek,three objections to modern physics,vlcek|l,
8410090,openalex::W999460032,viXra,openalex,,,https://vixra.org/abs/1112.0094,Particle Mass Ratios,DT Froedge,,,,,,,,,,,,,,parent,2011-12-01,2011,parent,,,dt_froedge,particle mass ratios,froedge|d,
8410091,openalex::W99967155,viXra,openalex,,,https://vixra.org/pdf/1406.0019v1.pdf,Quantum FFF Theory Proposals for Some Unsolved...,Leo Vuyk,,,,,,,,,,,,,,parent,2014-06-01,2014,parent,,,leo_vuyk,quantum fff theory proposals for some unsolved...,vuyk|l,
8410092,openalex::W999790414,viXra,openalex,,,https://vixra.org/pdf/1306.0105v3.pdf,Investigation of the Formalism of Particle Dyn...,Chi-Yi Chen,,,,,,,,,,,,,,parent,2013-06-01,2013,parent,,,chen_chi_yi,investigation of the formalism of particle dyn...,chen|c,


In [16]:
# Frequency of each value in records_hierarchy_backup (missing values
# included), limited to the 60 most frequent.
print(data_out["records_hierarchy_backup"].value_counts(dropna=False).head(60))

records_hierarchy_backup
parent                              7950093
review                               175443
part_of                              149454
version                              105977
publish_version                        9351
mirror (arXiv)                         8286
mirror (AgEcon Search)                 6702
child                                  2028
mirror (ResearchGate)                  1633
correction                              355
comment                                 325
mirror (Zenodo)                         297
mirror (SSRN)                            36
mirror (Open Science Framework)          31
mirror (bioRxiv)                         29
mirror (Humanities Commons CORE)         24
others                                   12
parent_duplicate                          3
mirror (eLife)                            3
mirror (Research Square)                  2
mirror (CERN document server)             2
mirror (TechRxiv)                         1
mirror 

In [17]:
# Number of rows pointing at each parent_record_id (rows without a parent id
# are counted too), limited to the 60 most frequent.
print(data_out["parent_record_id"].value_counts(dropna=False).head(60))

parent_record_id
<NA>                                         7758704
datacite::10.5281/zenodo.14526038                663
datacite::10.5281/zenodo.15609432                570
datacite::10.22004/ag.econ.138684                449
datacite::10.22004/ag.econ.136112                301
datacite::10.22004/ag.econ.286599                300
datacite::10.5281/zenodo.15832876                285
datacite::10.5281/zenodo.15161515                260
crossref::10.31219/osf.io/ed2a9                  209
crossref::10.32388/174914                        183
datacite::10.5281/zenodo.6784019                 181
datacite::10.5281/zenodo.15631517                166
crossref::10.2139/ssrn.2721313                   163
datacite::10.5281/zenodo.15690627                158
datacite::10.5281/zenodo.1297654                 140
datacite::10.5281/zenodo.17172763                127
datacite::10.5281/zenodo.16684574                118
datacite::10.22004/ag.econ.286899                117
crossref::10.21055/preprints-

In [18]:
# Literal (non-regex) substring of dup_group_id. Fuzzy group ids have the form
# "fuzzy::<authors_fp_mode>::<block>", so this matches "fuzzy::last_initial::..."
# ids (and would also match "fuzzy::last::..." ids if any existed).
pattern = "fuzzy::las"

# Rows without a group id are treated as non-matching.
mask = data_out["dup_group_id"].str.contains(pattern, na=False, regex=False)
result = data_out.loc[mask]
print(result.shape[0])
result

16318


Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,institutions_flat,countries_flat,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,date_first_seen,publication_year_first_seen,records_hierarchy_backup,parent_record_id,dup_group_id,authors_fp_tokenbag,title_clean_v2,authors_fp_last_initial,authors_fp_last
1210,crossref::10.33774/apsa-2024-xzkc5,APSA Preprints,crossref,10.33774/apsa-2024-xzkc5,https://doi.org/10.33774/apsa-2024-xzkc5,https://preprints.apsanet.org/engage/apsa/arti...,The Impact of Job Growth and Inflation on Pres...,"Doti, James; Campbell, Tom",Chapman University,,,,,,,,false,,,,,parent - duplicate (SSRN),2024-05-29,2024,parent,crossref::10.2139/ssrn.4657592,fuzzy::last_initial::campbell|t;doti|j,campbell_tom;doti_james,the impact of job growth and inflation on pres...,campbell|t;doti|j,
1695,crossref::10.3897/arphapreprints.e86933,ARPHA Preprints,crossref,10.3897/arphapreprints.e86933,https://doi.org/10.3897/arphapreprints.e86933,https://preprints.arphahub.com/article/86933/,Evidence of plant-soil feedback in South Texas...,"Bowman, Elizabeth; Plowes, Robert; Gilbert, La...",,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,10.3897/neobiota.81.86672,,,true,,,,,parent - duplicate (Research Square),2022-06-01,2022,parent,crossref::10.21203/rs.3.rs-668160/v1,fuzzy::last_initial::bowman|e;gilbert|l;plowes|r,,evidence of plant soil feedback in south texas...,bowman|e;gilbert|l;plowes|r,
1750,crossref::10.3897/arphapreprints.e61912,ARPHA Preprints,crossref,10.3897/arphapreprints.e61912,https://doi.org/10.3897/arphapreprints.e61912,https://preprints.arphahub.com/article/61912/,EcoBank: A flexible database platform for shar...,"Kim, Hyun Woo; Yoon, Sungsoo; Kim, Mokyoung; S...",,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,10.3897/bdj.9.e61866,,,true,,,,,parent - duplicate (Authorea Inc.),2020-12-10,2020,parent,crossref::10.22541/au.160490531.17626170/v1,fuzzy::last_initial::kim|h;kim|k;kim|m;shin|m;...,,ecobank a flexible database platform for shari...,kim|h;kim|k;kim|m;shin|m;yoon|h;yoon|s,
1964,crossref::10.3897/arphapreprints.e101357,ARPHA Preprints,crossref,10.3897/arphapreprints.e101357,https://doi.org/10.3897/arphapreprints.e101357,https://preprints.arphahub.com/article/101357/,Two new species of the bamboo-feeding planthop...,"Li, Hongxing; Yang, Lin; Chen, Xiang-Sheng",,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,10.3897/zookeys.1183.101123,,,true,,,,,parent - duplicate (ARPHA Preprints),2023-02-06,2023,parent,crossref::10.3897/arphapreprints.e100712,fuzzy::last_initial::chen|x;li|h;yang|l,,two new species of the bamboo feeding planthop...,chen|x;li|h;yang|l,
2144,crossref::10.3897/arphapreprints.e68669,ARPHA Preprints,crossref,10.3897/arphapreprints.e68669,https://doi.org/10.3897/arphapreprints.e68669,https://preprints.arphahub.com/article/68669/,"Phrynarachne birudis&amp;nbsp;sp. nov., a new ...","Im, Jae Seong; Kim, Seung Tae; Lee, Sue Yeon",,,,,,,,,false,,,,,parent - duplicate (ARPHA Preprints),2021-05-20,2021,parent,crossref::10.3897/arphapreprints.e67978,fuzzy::last_initial::im|j;kim|s;lee|s,,phrynarachne birudis amp nbsp sp nov a new cra...,im|j;kim|s;lee|s,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8398180,openalex::W2888432452,viXra,openalex,,,https://vixra.org/pdf/1808.0479v1.pdf,Optimization of WEDM Parameters for SUPER Ni-7...,Y. Rameswara Reddy; B. Chandra Mohana Reddy,,,,,,,,,,,,,,parent - duplicate (viXra),2018-08-01,2018,parent,openalex::W2888102254,fuzzy::last_initial::reddy|b;reddy|y,,optimization of wedm parameters for super ni 7...,reddy|b;reddy|y,
8398232,openalex::W2889056834,viXra,openalex,,,https://vixra.org/pdf/1808.0565v1.pdf,The Accelerated Expansion of the Universe is E...,Elkin Igor,,,,,,,,,,,,,,parent - duplicate (viXra),2018-08-01,2018,parent,openalex::W2888326560,fuzzy::last_initial::igor|e,elkin_igor,the accelerated expansion of the universe is e...,igor|e,
8398923,openalex::W2902708990,viXra,openalex,,,https://vixra.org/pdf/1811.0418v1.pdf,The Thermodynamic Properties of Zero Oscillati...,Miheeev Sergey Vladimirovich,,,,,,,,,,,,,,parent - duplicate (viXra),2018-11-01,2018,parent,openalex::W2900995111,fuzzy::last_initial::vladimirovich|m,,the thermodynamic properties of zero oscillati...,vladimirovich|m,
8403090,openalex::W2981112927,viXra,openalex,,,https://vixra.org/pdf/1910.0241v1.pdf,Energy and Matter Creation - The Poynting Vortex,Richard Lawrence Norman; Jeremy Dunning-Davies,,,,,,,,,,,,,,parent - duplicate (ResearchGate),2019-10-01,2019,parent,datacite::10.13140/rg.2.2.21002.16329,fuzzy::last_initial::davies|j;norman|r,davies_dunning_jeremy;lawrence_norman_richard,energy and matter creation the poynting vortex,davies|j;norman|r,


In [19]:
# Show the raw relations JSON string stored for the row with index label 281751.
result.loc[281751, 'relations_json']

'{"is-preprint-of": [{"asserted-by": "subject", "id": "10.1021/acs.chemmater.1c03349", "id-type": "doi"}]}'

In [20]:
# Show every row of data_out whose record_id equals this ChemRxiv identifier.
data_out.loc[data_out["record_id"].eq('crossref::10.26434/chemrxiv-2021-cj17s')]

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,institutions_flat,countries_flat,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,date_first_seen,publication_year_first_seen,records_hierarchy_backup,parent_record_id,dup_group_id,authors_fp_tokenbag,title_clean_v2,authors_fp_last_initial,authors_fp_last
281806,crossref::10.26434/chemrxiv-2021-cj17s,ChemRxiv,crossref,10.26434/chemrxiv-2021-cj17s,https://doi.org/10.26434/chemrxiv-2021-cj17s,https://chemrxiv.org/engage/chemrxiv/article-d...,First-Principles Plane-Wave-Based Exploration ...,"Ertural, Christina; Stoffel, Ralf; Müller, Pet...",RWTH Aachen University,,,,,,,,False,,,,,parent,2021-07-26,2021,parent,,,,first principles plane wave based exploration ...,dronskowski|r;ertural|c;muller|p;stoffel|r;vogt|c,


In [21]:
# Show every row of data_out whose parent_record_id equals this record id.
data_out.loc[data_out["parent_record_id"].eq('crossref::10.26434/chemrxiv-2022-fd190')]

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,institutions_flat,countries_flat,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,date_first_seen,publication_year_first_seen,records_hierarchy_backup,parent_record_id,dup_group_id,authors_fp_tokenbag,title_clean_v2,authors_fp_last_initial,authors_fp_last
282821,crossref::10.26434/chemrxiv-2022-fd190-v2,ChemRxiv,crossref,10.26434/chemrxiv-2022-fd190-v2,https://doi.org/10.26434/chemrxiv-2022-fd190-v2,https://chemrxiv.org/engage/chemrxiv/article-d...,Selectivity and Intrinsic Activity of Function...,"Christensen, Oliver; Zhao, Siqi; Sun, Zhaozong...",University of Copenhagen; Aarhus University,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.26434/chemrxiv-2022-fd190,,,,False,,,,,parent - duplicate (ChemRxiv),2022-04-25,2022,parent,crossref::10.26434/chemrxiv-2022-fd190,relaxed::A1_tokenbag_exact::8264,alexander_bagger;alonso_hernandez_rosas;christ...,selectivity and intrinsic activity of function...,,
282876,crossref::10.26434/chemrxiv-2022-fd190-v3,ChemRxiv,crossref,10.26434/chemrxiv-2022-fd190-v3,https://doi.org/10.26434/chemrxiv-2022-fd190-v3,https://chemrxiv.org/engage/chemrxiv/article-d...,Selectivity and Intrinsic Activity of Function...,"Christensen, Oliver; Zhao, Siqi; Sun, Zhaozong...",University of Copenhagen; Aarhus University,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.26434/chemrxiv-2022-fd190;10.26434/chemrxiv...,,,,False,,,,,parent - duplicate (ChemRxiv),2022-05-02,2022,parent,crossref::10.26434/chemrxiv-2022-fd190,8105542989974901890,alexander_bagger;christensen_oliver;daasbjerg_...,selectivity and intrinsic activity of function...,,
282883,crossref::10.26434/chemrxiv-2022-fd190-v4,ChemRxiv,crossref,10.26434/chemrxiv-2022-fd190-v4,https://doi.org/10.26434/chemrxiv-2022-fd190-v4,https://chemrxiv.org/engage/chemrxiv/article-d...,Selectivity and Intrinsic Activity of Function...,"Christensen, Oliver; Zhao, Siqi; Sun, Zhaozong...",University of Copenhagen; Aarhus University,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.26434/chemrxiv-2022-fd190;10.26434/chemrxiv...,,,,False,,,,,parent - duplicate (ChemRxiv),2022-05-04,2022,parent,crossref::10.26434/chemrxiv-2022-fd190,8105542989974901890,alexander_bagger;christensen_oliver;daasbjerg_...,selectivity and intrinsic activity of function...,,
283800,crossref::10.26434/chemrxiv-2022-fd190-v5,ChemRxiv,crossref,10.26434/chemrxiv-2022-fd190-v5,https://doi.org/10.26434/chemrxiv-2022-fd190-v5,https://chemrxiv.org/engage/chemrxiv/article-d...,Selectivity and Intrinsic Activity of Function...,"Christensen, Oliver; Zhao, Siqi; Sun, Zhaozong...",University of Copenhagen; Aarhus University,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.26434/chemrxiv-2022-fd190;10.26434/chemrxiv...,,,,False,,,,,parent - duplicate (ChemRxiv),2022-08-24,2022,parent,crossref::10.26434/chemrxiv-2022-fd190,8105542989974901890,alexander_bagger;christensen_oliver;daasbjerg_...,selectivity and intrinsic activity of function...,,
309786,crossref::10.26434/chemrxiv-2022-fd190-v6,ChemRxiv,crossref,10.26434/chemrxiv-2022-fd190-v6,https://doi.org/10.26434/chemrxiv-2022-fd190-v6,https://chemrxiv.org/engage/chemrxiv/article-d...,Can the CO2 Reduction Reaction be Improved on ...,"Christensen, Oliver; Zhao, Siqi; Sun, Zhaozong...",University of Copenhagen; Aarhus University,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,10.26434/chemrxiv-2022-fd190;10.26434/chemrxiv...,10.1021/acscatal.2c04200,,,True,,,,,parent - duplicate (ChemRxiv),2022-10-18,2022,parent,crossref::10.26434/chemrxiv-2022-fd190,fuzzy::tokenbag::alexander_bagger;christensen_...,alexander_bagger;christensen_oliver;daasbjerg_...,can the co2 reduction reaction be improved on ...,,


In [22]:
# Literal substring search (regex=False) on the DOI column; na=False treats
# rows with a missing DOI as non-matches instead of propagating NaN into the mask.
pattern = "chemrxiv.11846943"
mask = data_out['doi'].str.contains(pattern, na=False, regex=False)
result = data_out.loc[mask]

# Report the number of matching rows, then display them.
print(result.shape[0])
result

9


Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,institutions_flat,countries_flat,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,date_first_seen,publication_year_first_seen,records_hierarchy_backup,parent_record_id,dup_group_id,authors_fp_tokenbag,title_clean_v2,authors_fp_last_initial,authors_fp_last
276994,crossref::10.26434/chemrxiv.11846943.v1,ChemRxiv,crossref,10.26434/chemrxiv.11846943.v1,https://doi.org/10.26434/chemrxiv.11846943.v1,https://chemrxiv.org/engage/chemrxiv/article-d...,Analysis of Whole Genome Sequences and Homolog...,"Shanker, Arun; Bhanu, Divya; Alluri, Anajani",Central Research Institute for Dryland Agricul...,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.26434/chemrxiv.11846943.v2;10.26434/chemrxi...,,,,False,,,,,parent - duplicate (Open Science Framework),2020-02-13,2020,parent,crossref::10.31219/osf.io/2zuea,16256016634036771957,alluri_anajani;arun_shanker;bhanu_divya,analysis of whole genome sequences and homolog...,,
277030,crossref::10.26434/chemrxiv.11846943.v2,ChemRxiv,crossref,10.26434/chemrxiv.11846943.v2,https://doi.org/10.26434/chemrxiv.11846943.v2,https://chemrxiv.org/engage/chemrxiv/article-d...,Analysis of Whole Genome Sequences and Homolog...,"Shanker, Arun; Bhanu, Divya; Alluri, Anajani",Central Research Institute for Dryland Agricul...,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.26434/chemrxiv.11846943.v1;10.26434/chemrxi...,,,,False,,,,,parent - duplicate (Open Science Framework),2020-02-14,2020,parent,crossref::10.31219/osf.io/2zuea,fuzzy::tokenbag::alluri_anajani;arun_shanker;b...,alluri_anajani;arun_shanker;bhanu_divya,analysis of whole genome sequences and homolog...,,
277064,crossref::10.26434/chemrxiv.11846943.v3,ChemRxiv,crossref,10.26434/chemrxiv.11846943.v3,https://doi.org/10.26434/chemrxiv.11846943.v3,https://chemrxiv.org/engage/chemrxiv/article-d...,Analysis of Whole Genome Sequences and Homolog...,"Shanker, Arun; Alluri, Anjani; Bhanu, Divya",Central Research Institute for Dryland Agricul...,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.26434/chemrxiv.11846943.v1;10.26434/chemrxi...,,,,False,,,,,parent - duplicate (ChemRxiv),2020-02-17,2020,parent,crossref::10.26434/chemrxiv.11846943.v2,relaxed::A1_tokenbag_exact::8040,alluri_anjani;arun_shanker;bhanu_divya,analysis of whole genome sequences and homolog...,,
277093,crossref::10.26434/chemrxiv.11846943.v4,ChemRxiv,crossref,10.26434/chemrxiv.11846943.v4,https://doi.org/10.26434/chemrxiv.11846943.v4,https://chemrxiv.org/engage/chemrxiv/article-d...,Whole Genome Sequences Analysis and Homology M...,"Shanker, Arun; Alluri, Anjani; Bhanu, Divya",Central Research Institute for Dryland Agricul...,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.26434/chemrxiv.11846943.v1;10.26434/chemrxi...,,,,False,,,,,parent - duplicate (Open Science Framework),2020-02-21,2020,parent,crossref::10.31219/osf.io/2zuea,fuzzy::last_initial::alluri|a;bhanu|d;shanker|a,alluri_anjani;arun_shanker;bhanu_divya,whole genome sequences analysis and homology m...,alluri|a;bhanu|d;shanker|a,
277128,crossref::10.26434/chemrxiv.11846943.v5,ChemRxiv,crossref,10.26434/chemrxiv.11846943.v5,https://doi.org/10.26434/chemrxiv.11846943.v5,https://chemrxiv.org/engage/chemrxiv/article-d...,Whole Genome Sequence Analysis and Homology Mo...,"Shanker, Arun; Alluri, Anjani; Bhanu, Divya",Central Research Institute for Dryland Agricul...,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.26434/chemrxiv.11846943.v1;10.26434/chemrxi...,,,,False,,,,,parent - duplicate (ChemRxiv),2020-03-10,2020,parent,crossref::10.26434/chemrxiv.11846943.v4,fuzzy::tokenbag::alluri_anjani;arun_shanker;bh...,alluri_anjani;arun_shanker;bhanu_divya,whole genome sequence analysis and homology mo...,,
277157,crossref::10.26434/chemrxiv.11846943.v6,ChemRxiv,crossref,10.26434/chemrxiv.11846943.v6,https://doi.org/10.26434/chemrxiv.11846943.v6,https://chemrxiv.org/engage/chemrxiv/article-d...,Whole Genome Sequence Analysis and Homology Mo...,"Shanker, Arun; Bhanu, Divya; Alluri, Anjani",Central Research Institute for Dryland Agricul...,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.26434/chemrxiv.11846943.v1;10.26434/chemrxi...,,,,False,,,,,parent - duplicate (ChemRxiv),2020-03-30,2020,parent,crossref::10.26434/chemrxiv.11846943.v5,1890956516455513972,alluri_anjani;arun_shanker;bhanu_divya,whole genome sequence analysis and homology mo...,,
277180,crossref::10.26434/chemrxiv.11846943.v7,ChemRxiv,crossref,10.26434/chemrxiv.11846943.v7,https://doi.org/10.26434/chemrxiv.11846943.v7,https://chemrxiv.org/engage/chemrxiv/article-d...,Whole Genome Sequence Analysis and Homology Mo...,"Shanker, Arun; Bhanu, Divya; Alluri, Anjani",Central Research Institute for Dryland Agricul...,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.26434/chemrxiv.11846943.v1;10.26434/chemrxi...,,,,False,,,,,parent - duplicate (ChemRxiv),2020-04-03,2020,parent,crossref::10.26434/chemrxiv.11846943.v5,1890956516455513972,alluri_anjani;arun_shanker;bhanu_divya,whole genome sequence analysis and homology mo...,,
277210,crossref::10.26434/chemrxiv.11846943.v8,ChemRxiv,crossref,10.26434/chemrxiv.11846943.v8,https://doi.org/10.26434/chemrxiv.11846943.v8,https://chemrxiv.org/engage/chemrxiv/article-d...,Whole Genome Sequence Analysis and Homology Mo...,"Shanker, Arun; Bhanu, Divya; Alluri, Anjani",Central Research Institute for Dryland Agricul...,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.26434/chemrxiv.11846943.v1;10.26434/chemrxi...,,,,False,,,,,parent - duplicate (ChemRxiv),2020-04-24,2020,parent,crossref::10.26434/chemrxiv.11846943.v5,1890956516455513972,alluri_anjani;arun_shanker;bhanu_divya,whole genome sequence analysis and homology mo...,,
277240,crossref::10.26434/chemrxiv.11846943.v9,ChemRxiv,crossref,10.26434/chemrxiv.11846943.v9,https://doi.org/10.26434/chemrxiv.11846943.v9,https://chemrxiv.org/engage/chemrxiv/article-d...,Whole Genome Sequence Analysis and Homology Mo...,"Shanker, Arun; Bhanu, Divya; Alluri, Anjani; G...",Central Research Institute for Dryland Agricul...,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,10.26434/chemrxiv.11846943.v1;10.26434/chemrxi...,10.1039/d0nj00974a,,,True,,,,,parent - duplicate (ChemRxiv),2020-04-27,2020,parent,crossref::10.26434/chemrxiv.11846943.v5,relaxed::A1_tokenbag_exact::8044,alluri_anjani;arun_shanker;bhanu_divya;gupta_s...,whole genome sequence analysis and homology mo...,,


In [23]:
# Count how many rows of the current `result` carry each records_hierarchy label.
result.loc[:, 'records_hierarchy'].value_counts()

records_hierarchy
parent - duplicate (ChemRxiv)                  6
parent - duplicate (Open Science Framework)    3
Name: count, dtype: int64

In [24]:
# Count the distinct title strings among the rows of the current `result`.
result.loc[:, 'title'].value_counts()

title
Whole Genome Sequence Analysis and Homology Modelling of a 3C Like Peptidase and a Non-Structural Protein 3 of the SARS-CoV-2 Shows Protein Ligand Interaction with an Aza-Peptide and a Noncovalent Lead Inhibitor with Possible Antiviral Properties                       5
Analysis of Whole Genome Sequences and Homology Modelling of a 3-C Like Peptidase and a Non-Structural Protein of the Novel Coronavirus COVID-19 Shows Protein Ligand Interaction with an Aza-Peptide and a Noncovalent Lead Inhibitor with Possible Antiviral Properties    2
Analysis of Whole Genome Sequences and Homology Modelling of a 3C Like Peptidase and a Non-Structural Protein of the Novel Coronavirus COVID-19 Shows Protein Ligand Interaction with an Aza-Peptide and a Noncovalent Lead Inhibitor with Possible Antiviral Properties     1
Whole Genome Sequences Analysis and Homology Modelling of a 3C Like Peptidase and a Non-Structural Protein 3 of the SARS-CoV-2 Shows Protein Ligand Interaction with an Aza-Peptide a

In [25]:
# Count the distinct authors_flat strings among the rows of the current `result`.
result.loc[:, 'authors_flat'].value_counts()

authors_flat
Shanker, Arun; Alluri, Anjani; Bhanu, Divya                      3
Shanker, Arun; Bhanu, Divya; Alluri, Anjani                      3
Shanker, Arun; Bhanu, Divya; Alluri, Anajani                     2
Shanker, Arun; Bhanu, Divya; Alluri, Anjani; Gupta, Samriddhi    1
Name: count, dtype: int64

In [26]:
# Literal substring search (regex=False) on the DOI column of data_out;
# na=False treats rows with a missing DOI as non-matches.
pattern = "10.26434/chemrxiv-2022-fd190"
mask = data_out['doi'].str.contains(pattern, na=False, regex=False)
result = data_out.loc[mask]

# Report the number of matching rows, then display them.
print(result.shape[0])
result

6


Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,institutions_flat,countries_flat,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,date_first_seen,publication_year_first_seen,records_hierarchy_backup,parent_record_id,dup_group_id,authors_fp_tokenbag,title_clean_v2,authors_fp_last_initial,authors_fp_last
282550,crossref::10.26434/chemrxiv-2022-fd190,ChemRxiv,crossref,10.26434/chemrxiv-2022-fd190,https://doi.org/10.26434/chemrxiv-2022-fd190,https://chemrxiv.org/engage/chemrxiv/article-d...,Selectivity and Intrinsic Activity of Function...,"Christensen, Oliver; Zhao, Siqi; Sun, Zhaozong...",University of Copenhagen; Aarhus University,,,,,,,,False,,,,,parent,2022-03-18,2022,parent,,relaxed::A1_tokenbag_exact::8264,alexander_bagger;christensen_oliver;daasbjerg_...,selectivity and intrinsic activity of function...,,
282821,crossref::10.26434/chemrxiv-2022-fd190-v2,ChemRxiv,crossref,10.26434/chemrxiv-2022-fd190-v2,https://doi.org/10.26434/chemrxiv-2022-fd190-v2,https://chemrxiv.org/engage/chemrxiv/article-d...,Selectivity and Intrinsic Activity of Function...,"Christensen, Oliver; Zhao, Siqi; Sun, Zhaozong...",University of Copenhagen; Aarhus University,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.26434/chemrxiv-2022-fd190,,,,False,,,,,parent - duplicate (ChemRxiv),2022-04-25,2022,parent,crossref::10.26434/chemrxiv-2022-fd190,relaxed::A1_tokenbag_exact::8264,alexander_bagger;alonso_hernandez_rosas;christ...,selectivity and intrinsic activity of function...,,
282876,crossref::10.26434/chemrxiv-2022-fd190-v3,ChemRxiv,crossref,10.26434/chemrxiv-2022-fd190-v3,https://doi.org/10.26434/chemrxiv-2022-fd190-v3,https://chemrxiv.org/engage/chemrxiv/article-d...,Selectivity and Intrinsic Activity of Function...,"Christensen, Oliver; Zhao, Siqi; Sun, Zhaozong...",University of Copenhagen; Aarhus University,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.26434/chemrxiv-2022-fd190;10.26434/chemrxiv...,,,,False,,,,,parent - duplicate (ChemRxiv),2022-05-02,2022,parent,crossref::10.26434/chemrxiv-2022-fd190,8105542989974901890,alexander_bagger;christensen_oliver;daasbjerg_...,selectivity and intrinsic activity of function...,,
282883,crossref::10.26434/chemrxiv-2022-fd190-v4,ChemRxiv,crossref,10.26434/chemrxiv-2022-fd190-v4,https://doi.org/10.26434/chemrxiv-2022-fd190-v4,https://chemrxiv.org/engage/chemrxiv/article-d...,Selectivity and Intrinsic Activity of Function...,"Christensen, Oliver; Zhao, Siqi; Sun, Zhaozong...",University of Copenhagen; Aarhus University,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.26434/chemrxiv-2022-fd190;10.26434/chemrxiv...,,,,False,,,,,parent - duplicate (ChemRxiv),2022-05-04,2022,parent,crossref::10.26434/chemrxiv-2022-fd190,8105542989974901890,alexander_bagger;christensen_oliver;daasbjerg_...,selectivity and intrinsic activity of function...,,
283800,crossref::10.26434/chemrxiv-2022-fd190-v5,ChemRxiv,crossref,10.26434/chemrxiv-2022-fd190-v5,https://doi.org/10.26434/chemrxiv-2022-fd190-v5,https://chemrxiv.org/engage/chemrxiv/article-d...,Selectivity and Intrinsic Activity of Function...,"Christensen, Oliver; Zhao, Siqi; Sun, Zhaozong...",University of Copenhagen; Aarhus University,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.26434/chemrxiv-2022-fd190;10.26434/chemrxiv...,,,,False,,,,,parent - duplicate (ChemRxiv),2022-08-24,2022,parent,crossref::10.26434/chemrxiv-2022-fd190,8105542989974901890,alexander_bagger;christensen_oliver;daasbjerg_...,selectivity and intrinsic activity of function...,,
309786,crossref::10.26434/chemrxiv-2022-fd190-v6,ChemRxiv,crossref,10.26434/chemrxiv-2022-fd190-v6,https://doi.org/10.26434/chemrxiv-2022-fd190-v6,https://chemrxiv.org/engage/chemrxiv/article-d...,Can the CO2 Reduction Reaction be Improved on ...,"Christensen, Oliver; Zhao, Siqi; Sun, Zhaozong...",University of Copenhagen; Aarhus University,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,10.26434/chemrxiv-2022-fd190;10.26434/chemrxiv...,10.1021/acscatal.2c04200,,,True,,,,,parent - duplicate (ChemRxiv),2022-10-18,2022,parent,crossref::10.26434/chemrxiv-2022-fd190,fuzzy::tokenbag::alexander_bagger;christensen_...,alexander_bagger;christensen_oliver;daasbjerg_...,can the co2 reduction reaction be improved on ...,,


In [27]:
# Literal substring search (regex=False) on the DOI column of `data`
# (not `data_out`); na=False treats rows with a missing DOI as non-matches.
pattern = "10.26434/chemrxiv-2022-fd190"
mask = data['doi'].str.contains(pattern, na=False, regex=False)
result = data.loc[mask]

# Report the number of matching rows, then display them.
print(result.shape[0])
result

6


Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,institutions_flat,countries_flat,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json
287671,crossref::10.26434/chemrxiv-2022-fd190,ChemRxiv,crossref,10.26434/chemrxiv-2022-fd190,https://doi.org/10.26434/chemrxiv-2022-fd190,https://chemrxiv.org/engage/chemrxiv/article-d...,Selectivity and Intrinsic Activity of Function...,"Christensen, Oliver; Zhao, Siqi; Sun, Zhaozong...",University of Copenhagen; Aarhus University,,,,,,,,False,,,,
287942,crossref::10.26434/chemrxiv-2022-fd190-v2,ChemRxiv,crossref,10.26434/chemrxiv-2022-fd190-v2,https://doi.org/10.26434/chemrxiv-2022-fd190-v2,https://chemrxiv.org/engage/chemrxiv/article-d...,Selectivity and Intrinsic Activity of Function...,"Christensen, Oliver; Zhao, Siqi; Sun, Zhaozong...",University of Copenhagen; Aarhus University,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.26434/chemrxiv-2022-fd190,,,,False,,,,
287997,crossref::10.26434/chemrxiv-2022-fd190-v3,ChemRxiv,crossref,10.26434/chemrxiv-2022-fd190-v3,https://doi.org/10.26434/chemrxiv-2022-fd190-v3,https://chemrxiv.org/engage/chemrxiv/article-d...,Selectivity and Intrinsic Activity of Function...,"Christensen, Oliver; Zhao, Siqi; Sun, Zhaozong...",University of Copenhagen; Aarhus University,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.26434/chemrxiv-2022-fd190;10.26434/chemrxiv...,,,,False,,,,
288004,crossref::10.26434/chemrxiv-2022-fd190-v4,ChemRxiv,crossref,10.26434/chemrxiv-2022-fd190-v4,https://doi.org/10.26434/chemrxiv-2022-fd190-v4,https://chemrxiv.org/engage/chemrxiv/article-d...,Selectivity and Intrinsic Activity of Function...,"Christensen, Oliver; Zhao, Siqi; Sun, Zhaozong...",University of Copenhagen; Aarhus University,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.26434/chemrxiv-2022-fd190;10.26434/chemrxiv...,,,,False,,,,
288921,crossref::10.26434/chemrxiv-2022-fd190-v5,ChemRxiv,crossref,10.26434/chemrxiv-2022-fd190-v5,https://doi.org/10.26434/chemrxiv-2022-fd190-v5,https://chemrxiv.org/engage/chemrxiv/article-d...,Selectivity and Intrinsic Activity of Function...,"Christensen, Oliver; Zhao, Siqi; Sun, Zhaozong...",University of Copenhagen; Aarhus University,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.26434/chemrxiv-2022-fd190;10.26434/chemrxiv...,,,,False,,,,
314907,crossref::10.26434/chemrxiv-2022-fd190-v6,ChemRxiv,crossref,10.26434/chemrxiv-2022-fd190-v6,https://doi.org/10.26434/chemrxiv-2022-fd190-v6,https://chemrxiv.org/engage/chemrxiv/article-d...,Can the CO2 Reduction Reaction be Improved on ...,"Christensen, Oliver; Zhao, Siqi; Sun, Zhaozong...",University of Copenhagen; Aarhus University,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,10.26434/chemrxiv-2022-fd190;10.26434/chemrxiv...,10.1021/acscatal.2c04200,,,True,,,,


In [28]:
# Count the distinct title strings among the rows of the current `result`.
result.loc[:, 'title'].value_counts()

title
Selectivity and Intrinsic Activity of Functionalized Cu Surfaces: Can the CO2 Reduction Reaction be Improved on Cu?    5
Can the CO2 Reduction Reaction be Improved on Cu: Selectivity and Intrinsic Activity of Functionalized Cu Surfaces     1
Name: count, dtype: int64

In [29]:
# Count the distinct authors_flat strings among the rows of the current `result`.
result.loc[:, 'authors_flat'].value_counts()

authors_flat
Christensen, Oliver; Zhao, Siqi; Sun, Zhaozong; Bagger, Alexander; Vang Lauritsen, Jeppe; Uttrup Pedersen, Steen; Daasbjerg, Kim; Rossmeisl, Jan                             5
Christensen, Oliver; Zhao, Siqi; Sun, Zhaozong; Bagger, Alexander; Rosas-Hernández, Alonso; Vang Lauritsen, Jeppe; Uttrup Pedersen, Steen; Daasbjerg, Kim; Rossmeisl, Jan    1
Name: count, dtype: int64

In [30]:
# Spot-check: draw 5 random titles, then list every row of data_out that shares
# one of those titles, so records with an identical title appear together.
# A fixed random_state keeps the sample (and therefore this cell's output)
# stable across Restart & Run All; change the seed to inspect a different draw.
sample_titles = data_out['title'].sample(5, random_state=42)

# Single .loc call selects matching rows and the columns of interest at once,
# avoiding the intermediate full-width copy made by chained [mask][cols] indexing.
data_out.loc[
    data_out['title'].isin(sample_titles),
    ['title', 'authors_flat', 'records_hierarchy', 'date_first_seen'],
]

Unnamed: 0,title,authors_flat,records_hierarchy,date_first_seen
1399962,Jacques Derrida on Recognition,Isabelle Aubert,parent,2018-01-01
4737852,Using Predictive Analytics to Optimize Digital...,"Aindri, Jaiswal",parent - duplicate (SSRN),2025-08-25
4737853,Using Predictive Analytics to Optimize Digital...,"Aindri, Jaiswal",parent - duplicate (Zenodo),2025-08-25
6950072,Enhancement and suppression of heat transfer b...,"Lazarian, A.",parent,2022-03-21
7203389,A Log-domain Interior Point Method for Convex ...,"Liu, Bingqi; Liao-McPherson, Dominic",parent,2024-03-21
7370026,Video Summarisation with Incident and Context ...,"De Silva, Ulindu; Fernando, Leon; Bandara, Kal...",parent,2025-01-10


In [31]:
# Literal substring search (regex=False) on the DOI column of data_out;
# na=False treats rows with a missing DOI as non-matches.
pattern = "10.26434/chemrxiv.13102877"
mask = data_out['doi'].str.contains(pattern, na=False, regex=False)
result = data_out.loc[mask]

# Report the number of matching rows, then display them.
print(result.shape[0])
result

32


Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,institutions_flat,countries_flat,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,date_first_seen,publication_year_first_seen,records_hierarchy_backup,parent_record_id,dup_group_id,authors_fp_tokenbag,title_clean_v2,authors_fp_last_initial,authors_fp_last
279942,crossref::10.26434/chemrxiv.13102877.v1,ChemRxiv,crossref,10.26434/chemrxiv.13102877.v1,https://doi.org/10.26434/chemrxiv.13102877.v1,https://chemrxiv.org/engage/chemrxiv/article-d...,Aptamers for Detection and Diagnostics (ADD): ...,"Datta, Shoumen",MIT,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.26434/chemrxiv.13102877.v2;10.26434/chemrxi...,,,,False,,,,,parent,2020-10-23,2020,parent,,16854196709595633026,datta_shoumen,aptamers for detection and diagnostics add pro...,datta|s,
279972,crossref::10.26434/chemrxiv.13102877.v2,ChemRxiv,crossref,10.26434/chemrxiv.13102877.v2,https://doi.org/10.26434/chemrxiv.13102877.v2,https://chemrxiv.org/engage/chemrxiv/article-d...,Aptamers for Detection and Diagnostics (ADD): ...,"Datta, Shoumen",MIT,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.26434/chemrxiv.13102877.v1;10.26434/chemrxi...,,,,False,,,,,parent - duplicate (ChemRxiv),2020-10-27,2020,parent,crossref::10.26434/chemrxiv.13102877.v1,16854196709595633026,datta_shoumen,aptamers for detection and diagnostics add pro...,,
279999,crossref::10.26434/chemrxiv.13102877.v3,ChemRxiv,crossref,10.26434/chemrxiv.13102877.v3,https://doi.org/10.26434/chemrxiv.13102877.v3,https://chemrxiv.org/engage/chemrxiv/article-d...,Aptamers for Detection and Diagnostics (ADD) i...,"Datta, Shoumen",MIT,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.26434/chemrxiv.13102877.v1;10.26434/chemrxi...,,,,False,,,,,parent,2020-11-06,2020,parent,,15378799223749550050,datta_shoumen,aptamers for detection and diagnostics add is ...,datta|s,
280030,crossref::10.26434/chemrxiv.13102877.v4,ChemRxiv,crossref,10.26434/chemrxiv.13102877.v4,https://doi.org/10.26434/chemrxiv.13102877.v4,https://chemrxiv.org/engage/chemrxiv/article-d...,Aptamers for Detection and Diagnostics (ADD) i...,"Datta, Shoumen",MIT,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.26434/chemrxiv.13102877.v1;10.26434/chemrxi...,,,,False,,,,,parent - duplicate (ChemRxiv),2020-11-09,2020,parent,crossref::10.26434/chemrxiv.13102877.v3,15378799223749550050,datta_shoumen,aptamers for detection and diagnostics add is ...,,
280070,crossref::10.26434/chemrxiv.13102877.v5,ChemRxiv,crossref,10.26434/chemrxiv.13102877.v5,https://doi.org/10.26434/chemrxiv.13102877.v5,https://chemrxiv.org/engage/chemrxiv/article-d...,Aptamers for Detection and Diagnostics (ADD) i...,"Datta, Shoumen",MIT,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.26434/chemrxiv.13102877.v1;10.26434/chemrxi...,,,,False,,,,,parent - duplicate (ChemRxiv),2020-11-11,2020,parent,crossref::10.26434/chemrxiv.13102877.v3,15378799223749550050,datta_shoumen,aptamers for detection and diagnostics add is ...,,
280100,crossref::10.26434/chemrxiv.13102877.v6,ChemRxiv,crossref,10.26434/chemrxiv.13102877.v6,https://doi.org/10.26434/chemrxiv.13102877.v6,https://chemrxiv.org/engage/chemrxiv/article-d...,Aptamers for Detection and Diagnostics (ADD) i...,"Datta, Shoumen",MIT,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.26434/chemrxiv.13102877.v1;10.26434/chemrxi...,,,,False,,,,,parent - duplicate (ChemRxiv),2020-11-16,2020,parent,crossref::10.26434/chemrxiv.13102877.v3,15378799223749550050,datta_shoumen,aptamers for detection and diagnostics add is ...,,
280155,crossref::10.26434/chemrxiv.13102877.v7,ChemRxiv,crossref,10.26434/chemrxiv.13102877.v7,https://doi.org/10.26434/chemrxiv.13102877.v7,https://chemrxiv.org/engage/chemrxiv/article-d...,Aptamers for Detection and Diagnostics (ADD) i...,"Datta, Shoumen",MIT,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.26434/chemrxiv.13102877.v1;10.26434/chemrxi...,,,,False,,,,,parent - duplicate (ChemRxiv),2020-11-23,2020,parent,crossref::10.26434/chemrxiv.13102877.v3,15378799223749550050,datta_shoumen,aptamers for detection and diagnostics add is ...,,
280205,crossref::10.26434/chemrxiv.13102877.v8,ChemRxiv,crossref,10.26434/chemrxiv.13102877.v8,https://doi.org/10.26434/chemrxiv.13102877.v8,https://chemrxiv.org/engage/chemrxiv/article-d...,Aptamers for Detection and Diagnostics (ADD) i...,"Datta, Shoumen",MIT,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.26434/chemrxiv.13102877.v1;10.26434/chemrxi...,,,,False,,,,,parent - duplicate (ChemRxiv),2020-11-24,2020,parent,crossref::10.26434/chemrxiv.13102877.v3,15378799223749550050,datta_shoumen,aptamers for detection and diagnostics add is ...,,
280247,crossref::10.26434/chemrxiv.13102877.v9,ChemRxiv,crossref,10.26434/chemrxiv.13102877.v9,https://doi.org/10.26434/chemrxiv.13102877.v9,https://chemrxiv.org/engage/chemrxiv/article-d...,Aptamers for Detection and Diagnostics (ADD) i...,"Datta, Shoumen",MIT,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.26434/chemrxiv.13102877.v1;10.26434/chemrxi...,,,,False,,,,,parent - duplicate (ChemRxiv),2020-11-30,2020,parent,crossref::10.26434/chemrxiv.13102877.v3,15378799223749550050,datta_shoumen,aptamers for detection and diagnostics add is ...,,
280295,crossref::10.26434/chemrxiv.13102877.v10,ChemRxiv,crossref,10.26434/chemrxiv.13102877.v10,https://doi.org/10.26434/chemrxiv.13102877.v10,https://chemrxiv.org/engage/chemrxiv/article-d...,Aptamers for Detection and Diagnostics (ADD) i...,"Datta, Shoumen",MIT,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.26434/chemrxiv.13102877.v1;10.26434/chemrxi...,,,,False,,,,,parent - duplicate (ChemRxiv),2020-12-03,2020,parent,crossref::10.26434/chemrxiv.13102877.v3,15378799223749550050,datta_shoumen,aptamers for detection and diagnostics add is ...,,
