In [1]:
import pandas as pd
# Show every column when a DataFrame is rendered (no column truncation)
pd.options.display.max_columns = None


# A) Counts

## import data

In [2]:
import duckdb
from pathlib import Path

con = duckdb.connect()

# 1) Resolve the project root first, so that every path used below is the same
#    whether the kernel was started in the project root or in notebooks/.
ROOT = Path.cwd()
if ROOT.name == "notebooks":
    ROOT = ROOT.parent

BASE = ROOT / "data" / "by_server"
TMP_DIR = ROOT / "data" / "tmp_duckdb"


def _sql_str(value) -> str:
    """Render *value* as a single-quoted SQL string literal (embedded quotes doubled)."""
    return "'" + str(value).replace("'", "''") + "'"


# 2) Low-memory settings
con.execute("PRAGMA threads=1;")
con.execute("PRAGMA preserve_insertion_order=false;")
con.execute("PRAGMA enable_object_cache=false;")
con.execute("PRAGMA memory_limit='2GB';")           # try 1GB if still unstable
# Spill directory is anchored to ROOT (it used to be relative to the CWD, which
# put it under notebooks/ when the kernel was started there).
con.execute(f"PRAGMA temp_directory={_sql_str(TMP_DIR.as_posix())};")

# 3) Parquet files sit exactly one directory level below BASE
#    (BASE/<partition dir>/<file>.parquet). hive_partitioning=true is passed so
#    DuckDB can expose the partition directory as a column;
#    union_by_name=true aligns files whose column sets differ.
all_backends = (BASE / "*" / "*.parquet").as_posix()

con.execute(f"""
CREATE OR REPLACE VIEW all_backends AS
SELECT * FROM read_parquet({_sql_str(all_backends)}, hive_partitioning=true, union_by_name=true);
""")

# A unified "all_rows" view (currently a plain alias of all_backends)
con.execute("""
CREATE OR REPLACE VIEW all_rows AS
SELECT * FROM all_backends
""")

print(con.execute("SHOW TABLES").fetchall())


[('all_backends',), ('all_rows',)]


In [3]:
# Per-backend row count next to the number of distinct record_id values.
# ORDER BY keeps the displayed row order reproducible: GROUP BY on its own gives
# no ordering guarantee, and preserve_insertion_order is disabled on this connection.
con.execute("""
SELECT
    backend,
    COUNT(*)                  AS total,
    COUNT(DISTINCT record_id) AS unique_record_id
FROM all_backends
GROUP BY backend
ORDER BY backend
""").df()


Unnamed: 0,backend,total,unique_record_id
0,crossref,3612833,3187171
1,datacite,3523368,3520391
2,openalex,2350692,1702532


## total count

In [4]:
# Number of distinct record_id values across all backends combined.
total_dedup_sql = """
SELECT COUNT(DISTINCT record_id) AS total_dedup_records
FROM (
    SELECT record_id FROM all_backends
)
"""
con.execute(total_dedup_sql).df()


Unnamed: 0,total_dedup_records
0,8410094


# B) Import and merge data: data, records_hierarchy_df, date_first_seen_df

## import data

In [5]:
# Narrow projection of all_backends: identifiers, descriptive metadata and the
# relationship fields, every column cast to VARCHAR. The abstract and the date
# columns are deliberately left commented out inside the SQL.
server_thin_sql = """
CREATE OR REPLACE VIEW server_thin AS
SELECT
  CAST(record_id AS VARCHAR)           AS record_id,
  CAST(server_name AS VARCHAR)         AS server_name,
  CAST(backend AS VARCHAR)             AS backend,

  CAST(doi AS VARCHAR)                 AS doi,
  CAST(doi_url AS VARCHAR)             AS doi_url,
  CAST(landing_page_url AS VARCHAR)    AS landing_page_url,

  CAST(title AS VARCHAR) AS title,
  -- CAST(abstract_text AS VARCHAR)      AS abstract_text,
  CAST(authors_flat AS VARCHAR)      AS authors_flat,
  CAST(institutions_flat AS VARCHAR)      AS institutions_flat,
  CAST(countries_flat AS VARCHAR)      AS countries_flat,
  
  -- Dates (helpful for temporal patterns)
  -- CAST(publication_year AS VARCHAR)    AS publication_year,
  -- CAST(date_created AS VARCHAR)        AS date_created,
  -- CAST(date_posted AS VARCHAR)         AS date_posted,
  -- CAST(date_deposited AS VARCHAR)      AS date_deposited,
  -- CAST(date_published AS VARCHAR)      AS date_published,
  -- CAST(date_published_online AS VARCHAR)      AS date_published_online,
  -- CAST(date_issued AS VARCHAR)         AS date_issued,
  -- CAST(date_indexed AS VARCHAR)        AS date_indexed,
  -- CAST(date_updated AS VARCHAR)        AS date_updated,
  -- CAST(date_registered AS VARCHAR)     AS date_registered,

  -- Relationships (keep these for true version links)
  CAST(relations_json AS VARCHAR)       AS relations_json,
  CAST(version_label AS VARCHAR)       AS version_label,
  CAST(is_version_of AS VARCHAR)       AS is_version_of,      -- keep as text; we’ll interpret later
  CAST(is_preprint_of AS VARCHAR)      AS is_preprint_of,
  CAST(has_preprint AS VARCHAR)      AS has_preprint,
  CAST(has_review AS VARCHAR)      AS has_review,
  CAST(has_published_version AS VARCHAR)      AS has_published_version,
  CAST(published_version_ids_json AS VARCHAR) AS published_version_ids_json,
  CAST(version_of_ids_json AS VARCHAR) AS version_of_ids_json,
  CAST(update_to_json AS VARCHAR)      AS update_to_json,
  CAST(raw_relationships_json AS VARCHAR)       AS raw_relationships_json,
FROM all_backends
"""
con.execute(server_thin_sql)

# Row count of the view, as a quick check that it resolves.
con.execute("SELECT COUNT(*) AS n FROM server_thin").df()


Unnamed: 0,n
0,9486893


In [6]:
# Pull the thin view into pandas and drop rows that are identical in every column.
# The original index labels are kept, so gaps appear where duplicates were removed.
data = (
    con.execute("SELECT * FROM server_thin")
    .df()
    .drop_duplicates()
)
data

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,institutions_flat,countries_flat,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json
0,crossref::10.21467/preprints.48,AIJR Preprints,crossref,10.21467/preprints.48,https://doi.org/10.21467/preprints.48,https://preprints.aijr.org/index.php/ap/prepri...,"Bird’s Eye View on the Diagnosis, Treatment, &...","Panchalingala, Sai Bhargavi",,,,,,,,,false,,,,
1,crossref::10.21467/preprints.43,AIJR Preprints,crossref,10.21467/preprints.43,https://doi.org/10.21467/preprints.43,https://preprints.aijr.org/index.php/ap/prepri...,Doxycycline and Minocycline Drugs as a Treatme...,"Mostafa, Mohamed",,,,,,,,,false,,,,
2,crossref::10.21467/preprints.39,AIJR Preprints,crossref,10.21467/preprints.39,https://doi.org/10.21467/preprints.39,https://preprints.aijr.org/index.php/ap/prepri...,A Genetic Perspective of 2019-nCoV in Relation...,"Dasgupta, Rimjhim",,,,,,,,,false,,,,
3,crossref::10.21467/preprints.38,AIJR Preprints,crossref,10.21467/preprints.38,https://doi.org/10.21467/preprints.38,https://preprints.aijr.org/index.php/ap/prepri...,Marine Algae as a Natural Source for Antiviral...,"Musale, Amar S; G., Raja Krishna Kumar; Sapre,...",,,,,,,,,false,,,,
4,crossref::10.21467/preprints.36,AIJR Preprints,crossref,10.21467/preprints.36,https://doi.org/10.21467/preprints.36,https://preprints.aijr.org/index.php/ap/prepri...,Possible Prevention of COVID 19 by Using Linol...,"Subhash, Venkata; G, Raja Krishna Kumar; Sapre...",,,,,,,,,false,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9486888,openalex::W999325625,viXra,openalex,,,https://vixra.org/pdf/1409.0090v1.pdf,Three Objections to Modern Physics,Lubomir Vlcek,,,,,,,,,,,,,
9486889,openalex::W999460032,viXra,openalex,,,https://vixra.org/abs/1112.0094,Particle Mass Ratios,DT Froedge,,,,,,,,,,,,,
9486890,openalex::W99967155,viXra,openalex,,,https://vixra.org/pdf/1406.0019v1.pdf,Quantum FFF Theory Proposals for Some Unsolved...,Leo Vuyk,,,,,,,,,,,,,
9486891,openalex::W999790414,viXra,openalex,,,https://vixra.org/pdf/1306.0105v3.pdf,Investigation of the Formalism of Particle Dyn...,Chi-Yi Chen,,,,,,,,,,,,,


## merging

In [7]:
import pandas as pd

# ================================
# 1) Load the previously saved artifacts
# ================================
# NOTE(review): unpickling can execute code embedded in the file — only load
# pickles that this project wrote itself.
records_hierarchy_df, date_first_seen_df = (
    pd.read_pickle(artifact)
    for artifact in (
        "outputs/records_hierarchy_df.pkl",
        "outputs/date_first_seen.pkl",
    )
)

# ================================
# 2) Normalize keys
# ================================
def norm_key(s):
    """Normalise a key column: cast every present value to ``str`` and strip it.

    Missing values (None / NaN / NA) stay missing. A plain ``astype(str)``
    would turn them into the literal strings "None" / "nan", which would then
    be treated as real keys by the dedupe and merge steps.

    Parameters
    ----------
    s : pd.Series
        Raw key column.

    Returns
    -------
    pd.Series
        Same index as ``s``; stripped strings where ``s`` had a value,
        missing where it did not.
    """
    stripped = s.astype(str).str.strip()
    # where() keeps the cleaned value only at positions that were not missing.
    return stripped.where(s.notna())

key_columns = ["record_id", "server_name"]

# Clean both key columns in the two lookup tables, then in the main frame.
for df_ in (records_hierarchy_df, date_first_seen_df):
    for key_col in key_columns:
        df_[key_col] = norm_key(df_[key_col])

for key_col in key_columns:
    data[key_col] = norm_key(data[key_col])

# ================================
# 3) Make each lookup table unique on (record_id, server_name)
# ================================
records_hierarchy_df = records_hierarchy_df.drop_duplicates(subset=key_columns)
date_first_seen_df   = date_first_seen_df.drop_duplicates(subset=key_columns)

# ================================
# 4) MASTER pairs: the (record_id, server_name) tuples present in
#    records_hierarchy_df; everything else is filtered against this set
# ================================
master_pairs = set(zip(*(records_hierarchy_df[key_col] for key_col in key_columns)))

# Filter helpers
def filter_to_master_pairs(df: pd.DataFrame, pairs=None) -> pd.DataFrame:
    """Keep only the rows of ``df`` whose (record_id, server_name) is in ``pairs``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``record_id`` and ``server_name`` columns.
    pairs : set or list-like of (record_id, server_name) tuples, optional
        Allowed key pairs. When omitted, the notebook-level ``master_pairs``
        set is used, which is the original behaviour.

    Returns
    -------
    pd.DataFrame
        Independent copy of the matching rows, original index labels preserved.
    """
    if pairs is None:
        pairs = master_pairs  # notebook-level default, looked up at call time
    # dtype=object keeps the tuples intact and gives an empty frame a stable dtype.
    row_keys = pd.Series(
        list(zip(df["record_id"], df["server_name"])),
        index=df.index,
        dtype=object,
    )
    return df.loc[row_keys.isin(pairs)].copy()

# ================================
# 5) Restrict data and date_first_seen to the MASTER pairs
# ================================
join_keys = ["record_id", "server_name"]

# After filtering, keep the first row of any (record_id, server_name) group
# that is still duplicated in data.
data_master = filter_to_master_pairs(data).drop_duplicates(join_keys, keep="first")
date_first_seen_master = filter_to_master_pairs(date_first_seen_df)

# ================================
# 6) Left-join both lookup tables onto the filtered data
#    (validate="one_to_one" raises if either side repeats a key)
# ================================
data_clean_hierarchy = data_master
for lookup in (records_hierarchy_df, date_first_seen_master):
    data_clean_hierarchy = data_clean_hierarchy.merge(
        lookup, on=join_keys, how="left", validate="one_to_one"
    )

# ================================
# 7) Sanity checks: row counts at each stage, missing values, key duplicates
# ================================
master_rows = len(records_hierarchy_df)
master_unique_ids = records_hierarchy_df["record_id"].nunique()
print("Master pairs (records_hierarchy_df rows):", master_rows)
print("Master unique record_id:", master_unique_ids)

print("Rows in raw data:", len(data))
print("Rows in data after filtering to master pairs:", len(data_master))

print("Rows in date_first_seen after filtering to master pairs:", len(date_first_seen_master))
print("Final rows in data_clean_hierarchy:", len(data_clean_hierarchy))

# Null counts of the three columns contributed by the two lookup tables.
missing = data_clean_hierarchy[
    ["records_hierarchy", "date_first_seen", "publication_year_first_seen"]
].isna().sum()
print("\nMissing records_hierarchy:", missing["records_hierarchy"])
print("Missing date_first_seen:", missing["date_first_seen"])
print("Missing publication_year_first_seen:", missing["publication_year_first_seen"])

duplicate_keys = data_clean_hierarchy.duplicated(["record_id","server_name"]).sum()
print("\nDuplicates on (record_id, server_name) in final:",
      duplicate_keys)

hierarchy_counts = data_clean_hierarchy["records_hierarchy"].value_counts(dropna=False)
print("\nHierarchy counts:")
print(hierarchy_counts.head(30))


Master pairs (records_hierarchy_df rows): 8410094
Master unique record_id: 8410094
Rows in raw data: 8413526
Rows in data after filtering to master pairs: 8410094
Rows in date_first_seen after filtering to master pairs: 8410094
Final rows in data_clean_hierarchy: 8410094

Missing records_hierarchy: 0
Missing date_first_seen: 0
Missing publication_year_first_seen: 0

Duplicates on (record_id, server_name) in final: 0

Hierarchy counts:
records_hierarchy
parent                              7950093
review                               175443
part_of                              149454
version                              105977
publish_version                        9351
mirror (arXiv)                         8286
mirror (AgEcon Search)                 6702
child                                  2028
mirror (ResearchGate)                  1633
correction                              355
comment                                 325
mirror (Zenodo)                         297
mirror (SSRN)  

## Exploration

In [8]:
# Preview the merged frame (data columns plus records_hierarchy, date_first_seen, publication_year_first_seen).
data_clean_hierarchy

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,institutions_flat,countries_flat,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,date_first_seen,publication_year_first_seen
0,crossref::10.21467/preprints.48,AIJR Preprints,crossref,10.21467/preprints.48,https://doi.org/10.21467/preprints.48,https://preprints.aijr.org/index.php/ap/prepri...,"Bird’s Eye View on the Diagnosis, Treatment, &...","Panchalingala, Sai Bhargavi",,,,,,,,,false,,,,,parent,2020-05-03,2020
1,crossref::10.21467/preprints.43,AIJR Preprints,crossref,10.21467/preprints.43,https://doi.org/10.21467/preprints.43,https://preprints.aijr.org/index.php/ap/prepri...,Doxycycline and Minocycline Drugs as a Treatme...,"Mostafa, Mohamed",,,,,,,,,false,,,,,parent,2020-04-25,2020
2,crossref::10.21467/preprints.39,AIJR Preprints,crossref,10.21467/preprints.39,https://doi.org/10.21467/preprints.39,https://preprints.aijr.org/index.php/ap/prepri...,A Genetic Perspective of 2019-nCoV in Relation...,"Dasgupta, Rimjhim",,,,,,,,,false,,,,,parent,2020-04-16,2020
3,crossref::10.21467/preprints.38,AIJR Preprints,crossref,10.21467/preprints.38,https://doi.org/10.21467/preprints.38,https://preprints.aijr.org/index.php/ap/prepri...,Marine Algae as a Natural Source for Antiviral...,"Musale, Amar S; G., Raja Krishna Kumar; Sapre,...",,,,,,,,,false,,,,,parent,2020-04-15,2020
4,crossref::10.21467/preprints.36,AIJR Preprints,crossref,10.21467/preprints.36,https://doi.org/10.21467/preprints.36,https://preprints.aijr.org/index.php/ap/prepri...,Possible Prevention of COVID 19 by Using Linol...,"Subhash, Venkata; G, Raja Krishna Kumar; Sapre...",,,,,,,,,false,,,,,parent,2020-04-15,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8410089,openalex::W999325625,viXra,openalex,,,https://vixra.org/pdf/1409.0090v1.pdf,Three Objections to Modern Physics,Lubomir Vlcek,,,,,,,,,,,,,,parent,2014-09-01,2014
8410090,openalex::W999460032,viXra,openalex,,,https://vixra.org/abs/1112.0094,Particle Mass Ratios,DT Froedge,,,,,,,,,,,,,,parent,2011-12-01,2011
8410091,openalex::W99967155,viXra,openalex,,,https://vixra.org/pdf/1406.0019v1.pdf,Quantum FFF Theory Proposals for Some Unsolved...,Leo Vuyk,,,,,,,,,,,,,,parent,2014-06-01,2014
8410092,openalex::W999790414,viXra,openalex,,,https://vixra.org/pdf/1306.0105v3.pdf,Investigation of the Formalism of Particle Dyn...,Chi-Yi Chen,,,,,,,,,,,,,,parent,2013-06-01,2013


In [9]:
# Rows whose relations_json text contains the literal substring "is-preprint-of".
# regex=False makes this a plain substring test; na=False treats missing JSON as no match.
pattern = "is-preprint-of"
relations_text = data_clean_hierarchy["relations_json"]
mask = relations_text.str.contains(pattern, regex=False, na=False)
result = data_clean_hierarchy.loc[mask]
result

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,institutions_flat,countries_flat,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,date_first_seen,publication_year_first_seen
189,crossref::10.33774/apsa-2019-if2he-v2,APSA Preprints,crossref,10.33774/apsa-2019-if2he-v2,https://doi.org/10.33774/apsa-2019-if2he-v2,https://preprints.apsanet.org/engage/apsa/arti...,Transparency in Practice in Qualitative Research,"Kapiszewski, Diana; Karcher, Sebastian",Georgetown University; Qualitative Data Reposi...,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,10.33774/apsa-2019-if2he,10.1017/s1049096520000955,,,true,,,,,version,2019-10-02,2019
194,crossref::10.33774/apsa-2019-wlx66-v2,APSA Preprints,crossref,10.33774/apsa-2019-wlx66-v2,https://doi.org/10.33774/apsa-2019-wlx66-v2,https://preprints.apsanet.org/engage/apsa/arti...,Interrogating the Appropriation of Spaces of C...,"Hussain, Ghulam",Quaid-i-Azam University,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,10.33774/apsa-2019-wlx66,10.3390/rel10110627,,,true,,,,,version,2019-10-24,2019
218,crossref::10.33774/apsa-2019-wjva3-v6,APSA Preprints,crossref,10.33774/apsa-2019-wjva3-v6,https://doi.org/10.33774/apsa-2019-wjva3-v6,https://preprints.apsanet.org/engage/apsa/arti...,Who Will Defend Democracy? Evaluating Tradeoff...,"Nyhan, Brendan; Helmke, Gretchen; Sanders, Mit...",Dartmouth College; University of Rochester; Br...,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,10.33774/apsa-2019-wjva3;10.33774/apsa-2019-wj...,10.1080/17457289.2020.1790577,,,true,,,,,version,2020-04-09,2020
242,crossref::10.33774/apsa-2020-r718r,APSA Preprints,crossref,10.33774/apsa-2020-r718r,https://doi.org/10.33774/apsa-2020-r718r,https://preprints.apsanet.org/engage/apsa/arti...,The Rhetoric of Inaction: Failing to Fail Forw...,"Emmons, Cassandra; Pavone, Tommaso",Harvard University; University of Oslo,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,10.33774/apsa-2020-r718r-v2,10.1080/13501763.2021.1954065,,,true,,,,,version,2020-09-08,2020
244,crossref::10.33774/apsa-2020-tvgh9,APSA Preprints,crossref,10.33774/apsa-2020-tvgh9,https://doi.org/10.33774/apsa-2020-tvgh9,https://preprints.apsanet.org/engage/apsa/arti...,"Lawyers, Judges, and the Obstinate State: The ...","Pavone, Tommaso",University of Oslo,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,10.33774/apsa-2020-tvgh9-v2,10.1057/s41253-020-00136-4,,,true,,,,,version,2020-09-08,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8381923,crossref::10.1101/2025.02.17.25322370,medRxiv,crossref,10.1101/2025.02.17.25322370,https://doi.org/10.1101/2025.02.17.25322370,http://medrxiv.org/lookup/doi/10.1101/2025.02....,Clinical properties of the Short Mood and Feel...,"Jobim, Gabriele dos Santos; do Amaral, João Vi...",,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,10.1016/j.jpsychires.2025.08.025,,,true,,,,,parent,2025-02-23,2025
8381924,crossref::10.1101/2025.02.14.25322283,medRxiv,crossref,10.1101/2025.02.14.25322283,https://doi.org/10.1101/2025.02.14.25322283,http://medrxiv.org/lookup/doi/10.1101/2025.02....,Characterizing resting-state EEG oscillatory a...,"Jaramillo-Jimenez, Alberto; Mantilla-Ramos, Yo...",,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,10.1016/j.compbiomed.2025.111080,,,true,,,,,parent,2025-02-18,2025
8381925,crossref::10.1101/2025.02.14.25322306,medRxiv,crossref,10.1101/2025.02.14.25322306,https://doi.org/10.1101/2025.02.14.25322306,http://medrxiv.org/lookup/doi/10.1101/2025.02....,Exposotypes in Psychotic Disorders,"Yassin, Walid; Kromenacker, Bryan; Green, Jame...",,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,10.1038/s41598-025-14438-6,,,true,,,,,parent,2025-02-18,2025
8381926,crossref::10.1101/2025.02.11.25322055,medRxiv,crossref,10.1101/2025.02.11.25322055,https://doi.org/10.1101/2025.02.11.25322055,http://medrxiv.org/lookup/doi/10.1101/2025.02....,Determinants of Modern Contraceptive Use among...,"Um, Samnang; Chamroen, Pall; Sieng, Chantrea; ...",,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,10.1371/journal.pgph.0004346,,,true,,,,,parent,2025-02-13,2025


In [10]:
# Hierarchy labels among the rows selected by the preceding cell (relations_json contains "is-preprint-of").
# `result` is reassigned by later cells, so re-run that filter cell before this one.
result["records_hierarchy"].value_counts(dropna=False)

records_hierarchy
parent              417022
version              36974
child                  696
publish_version         27
parent_duplicate         3
Name: count, dtype: int64

In [11]:
# Preprint servers among the same `result` rows (relations_json contains "is-preprint-of").
result["server_name"].value_counts(dropna=False)

server_name
Research Square                         155422
bioRxiv                                 110451
Preprints.org                            67841
medRxiv                                  33477
JMIR Preprints                           24999
ChemRxiv                                 18207
PsyArXiv                                  8458
Open Science Framework                    7522
EGUsphere                                 7074
SocArXiv                                  5503
TechRxiv                                  4632
Earth and Space Science Open Archive      3069
INA-Rxiv                                  1612
EarthArXiv                                1365
Thesis Commons                             983
engrXiv                                    487
Beilstein Archives                         461
ARPHA Preprints                            312
Advance                                    309
EdArXiv                                    279
MarXiv                                     257
S

In [12]:
# Rows whose is_preprint_of text contains a "/" (plain substring test; missing values do not match).
# Note: this overwrites `pattern`, `mask` and `result` from the earlier filter cell.
pattern = "/"
mask = data_clean_hierarchy["is_preprint_of"].str.contains(
    pattern,
    regex=False,
    na=False,
)
result = data_clean_hierarchy.loc[mask]
result

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,institutions_flat,countries_flat,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,date_first_seen,publication_year_first_seen
189,crossref::10.33774/apsa-2019-if2he-v2,APSA Preprints,crossref,10.33774/apsa-2019-if2he-v2,https://doi.org/10.33774/apsa-2019-if2he-v2,https://preprints.apsanet.org/engage/apsa/arti...,Transparency in Practice in Qualitative Research,"Kapiszewski, Diana; Karcher, Sebastian",Georgetown University; Qualitative Data Reposi...,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,10.33774/apsa-2019-if2he,10.1017/s1049096520000955,,,true,,,,,version,2019-10-02,2019
194,crossref::10.33774/apsa-2019-wlx66-v2,APSA Preprints,crossref,10.33774/apsa-2019-wlx66-v2,https://doi.org/10.33774/apsa-2019-wlx66-v2,https://preprints.apsanet.org/engage/apsa/arti...,Interrogating the Appropriation of Spaces of C...,"Hussain, Ghulam",Quaid-i-Azam University,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,10.33774/apsa-2019-wlx66,10.3390/rel10110627,,,true,,,,,version,2019-10-24,2019
218,crossref::10.33774/apsa-2019-wjva3-v6,APSA Preprints,crossref,10.33774/apsa-2019-wjva3-v6,https://doi.org/10.33774/apsa-2019-wjva3-v6,https://preprints.apsanet.org/engage/apsa/arti...,Who Will Defend Democracy? Evaluating Tradeoff...,"Nyhan, Brendan; Helmke, Gretchen; Sanders, Mit...",Dartmouth College; University of Rochester; Br...,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,10.33774/apsa-2019-wjva3;10.33774/apsa-2019-wj...,10.1080/17457289.2020.1790577,,,true,,,,,version,2020-04-09,2020
242,crossref::10.33774/apsa-2020-r718r,APSA Preprints,crossref,10.33774/apsa-2020-r718r,https://doi.org/10.33774/apsa-2020-r718r,https://preprints.apsanet.org/engage/apsa/arti...,The Rhetoric of Inaction: Failing to Fail Forw...,"Emmons, Cassandra; Pavone, Tommaso",Harvard University; University of Oslo,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,10.33774/apsa-2020-r718r-v2,10.1080/13501763.2021.1954065,,,true,,,,,version,2020-09-08,2020
244,crossref::10.33774/apsa-2020-tvgh9,APSA Preprints,crossref,10.33774/apsa-2020-tvgh9,https://doi.org/10.33774/apsa-2020-tvgh9,https://preprints.apsanet.org/engage/apsa/arti...,"Lawyers, Judges, and the Obstinate State: The ...","Pavone, Tommaso",University of Oslo,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,10.33774/apsa-2020-tvgh9-v2,10.1057/s41253-020-00136-4,,,true,,,,,version,2020-09-08,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8381923,crossref::10.1101/2025.02.17.25322370,medRxiv,crossref,10.1101/2025.02.17.25322370,https://doi.org/10.1101/2025.02.17.25322370,http://medrxiv.org/lookup/doi/10.1101/2025.02....,Clinical properties of the Short Mood and Feel...,"Jobim, Gabriele dos Santos; do Amaral, João Vi...",,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,10.1016/j.jpsychires.2025.08.025,,,true,,,,,parent,2025-02-23,2025
8381924,crossref::10.1101/2025.02.14.25322283,medRxiv,crossref,10.1101/2025.02.14.25322283,https://doi.org/10.1101/2025.02.14.25322283,http://medrxiv.org/lookup/doi/10.1101/2025.02....,Characterizing resting-state EEG oscillatory a...,"Jaramillo-Jimenez, Alberto; Mantilla-Ramos, Yo...",,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,10.1016/j.compbiomed.2025.111080,,,true,,,,,parent,2025-02-18,2025
8381925,crossref::10.1101/2025.02.14.25322306,medRxiv,crossref,10.1101/2025.02.14.25322306,https://doi.org/10.1101/2025.02.14.25322306,http://medrxiv.org/lookup/doi/10.1101/2025.02....,Exposotypes in Psychotic Disorders,"Yassin, Walid; Kromenacker, Bryan; Green, Jame...",,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,10.1038/s41598-025-14438-6,,,true,,,,,parent,2025-02-18,2025
8381926,crossref::10.1101/2025.02.11.25322055,medRxiv,crossref,10.1101/2025.02.11.25322055,https://doi.org/10.1101/2025.02.11.25322055,http://medrxiv.org/lookup/doi/10.1101/2025.02....,Determinants of Modern Contraceptive Use among...,"Um, Samnang; Chamroen, Pall; Sieng, Chantrea; ...",,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,10.1371/journal.pgph.0004346,,,true,,,,,parent,2025-02-13,2025


In [13]:
# Preprint servers among the rows selected by the preceding cell (is_preprint_of contains "/").
result["server_name"].value_counts(dropna=False)

server_name
arXiv                                   1282633
Research Square                          155422
bioRxiv                                  110451
Zenodo                                    99455
Preprints.org                             67831
medRxiv                                   33477
JMIR Preprints                            24998
ChemRxiv                                  18207
Humanities Commons CORE                    8577
PsyArXiv                                   8458
Open Science Framework                     7752
EGUsphere                                  7074
SocArXiv                                   5503
TechRxiv                                   4632
Earth and Space Science Open Archive       3069
INA-Rxiv                                   1612
EarthArXiv                                 1362
Thesis Commons                              983
CERN document server                        530
engrXiv                                     487
Beilstein Archives          

In [14]:
# Hierarchy labels among the same `result` rows (is_preprint_of contains "/").
result["records_hierarchy"].value_counts(dropna=False)

records_hierarchy
parent              1808372
version               37027
child                   696
publish_version          27
comment                   7
parent_duplicate          3
Name: count, dtype: int64

In [15]:
# Rows flagged as having a published version.
# The flag is stored as text and its casing is not uniform (both 'false' and
# 'False' appear in this notebook's outputs), so an exact == 'true' can miss
# rows spelled 'True'. Compare case-insensitively; missing flags count as not true.
has_published = (
    data_clean_hierarchy['has_published_version']
    .str.strip()
    .str.lower()
    .eq('true')
)
data_clean_hierarchy[has_published]

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,institutions_flat,countries_flat,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,date_first_seen,publication_year_first_seen
189,crossref::10.33774/apsa-2019-if2he-v2,APSA Preprints,crossref,10.33774/apsa-2019-if2he-v2,https://doi.org/10.33774/apsa-2019-if2he-v2,https://preprints.apsanet.org/engage/apsa/arti...,Transparency in Practice in Qualitative Research,"Kapiszewski, Diana; Karcher, Sebastian",Georgetown University; Qualitative Data Reposi...,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,10.33774/apsa-2019-if2he,10.1017/s1049096520000955,,,true,,,,,version,2019-10-02,2019
194,crossref::10.33774/apsa-2019-wlx66-v2,APSA Preprints,crossref,10.33774/apsa-2019-wlx66-v2,https://doi.org/10.33774/apsa-2019-wlx66-v2,https://preprints.apsanet.org/engage/apsa/arti...,Interrogating the Appropriation of Spaces of C...,"Hussain, Ghulam",Quaid-i-Azam University,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,10.33774/apsa-2019-wlx66,10.3390/rel10110627,,,true,,,,,version,2019-10-24,2019
218,crossref::10.33774/apsa-2019-wjva3-v6,APSA Preprints,crossref,10.33774/apsa-2019-wjva3-v6,https://doi.org/10.33774/apsa-2019-wjva3-v6,https://preprints.apsanet.org/engage/apsa/arti...,Who Will Defend Democracy? Evaluating Tradeoff...,"Nyhan, Brendan; Helmke, Gretchen; Sanders, Mit...",Dartmouth College; University of Rochester; Br...,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,10.33774/apsa-2019-wjva3;10.33774/apsa-2019-wj...,10.1080/17457289.2020.1790577,,,true,,,,,version,2020-04-09,2020
242,crossref::10.33774/apsa-2020-r718r,APSA Preprints,crossref,10.33774/apsa-2020-r718r,https://doi.org/10.33774/apsa-2020-r718r,https://preprints.apsanet.org/engage/apsa/arti...,The Rhetoric of Inaction: Failing to Fail Forw...,"Emmons, Cassandra; Pavone, Tommaso",Harvard University; University of Oslo,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,10.33774/apsa-2020-r718r-v2,10.1080/13501763.2021.1954065,,,true,,,,,version,2020-09-08,2020
244,crossref::10.33774/apsa-2020-tvgh9,APSA Preprints,crossref,10.33774/apsa-2020-tvgh9,https://doi.org/10.33774/apsa-2020-tvgh9,https://preprints.apsanet.org/engage/apsa/arti...,"Lawyers, Judges, and the Obstinate State: The ...","Pavone, Tommaso",University of Oslo,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,10.33774/apsa-2020-tvgh9-v2,10.1057/s41253-020-00136-4,,,true,,,,,version,2020-09-08,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8381923,crossref::10.1101/2025.02.17.25322370,medRxiv,crossref,10.1101/2025.02.17.25322370,https://doi.org/10.1101/2025.02.17.25322370,http://medrxiv.org/lookup/doi/10.1101/2025.02....,Clinical properties of the Short Mood and Feel...,"Jobim, Gabriele dos Santos; do Amaral, João Vi...",,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,10.1016/j.jpsychires.2025.08.025,,,true,,,,,parent,2025-02-23,2025
8381924,crossref::10.1101/2025.02.14.25322283,medRxiv,crossref,10.1101/2025.02.14.25322283,https://doi.org/10.1101/2025.02.14.25322283,http://medrxiv.org/lookup/doi/10.1101/2025.02....,Characterizing resting-state EEG oscillatory a...,"Jaramillo-Jimenez, Alberto; Mantilla-Ramos, Yo...",,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,10.1016/j.compbiomed.2025.111080,,,true,,,,,parent,2025-02-18,2025
8381925,crossref::10.1101/2025.02.14.25322306,medRxiv,crossref,10.1101/2025.02.14.25322306,https://doi.org/10.1101/2025.02.14.25322306,http://medrxiv.org/lookup/doi/10.1101/2025.02....,Exposotypes in Psychotic Disorders,"Yassin, Walid; Kromenacker, Bryan; Green, Jame...",,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,10.1038/s41598-025-14438-6,,,true,,,,,parent,2025-02-18,2025
8381926,crossref::10.1101/2025.02.11.25322055,medRxiv,crossref,10.1101/2025.02.11.25322055,https://doi.org/10.1101/2025.02.11.25322055,http://medrxiv.org/lookup/doi/10.1101/2025.02....,Determinants of Modern Contraceptive Use among...,"Um, Samnang; Chamroen, Pall; Sieng, Chantrea; ...",,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,10.1371/journal.pgph.0004346,,,true,,,,,parent,2025-02-13,2025


In [16]:
# Spot-check the raw relations_json of one row, addressed by its index label.
data_clean_hierarchy.loc[6707554, 'relations_json']

'[{"relatedIdentifier": "10.1016/j.nuclphysb.2006.01.034", "relatedIdentifierType": "DOI", "relationType": "IsVersionOf"}]'

In [17]:
# Look up rows whose DOI contains this Zenodo identifier.
# This is a substring test, so a longer DOI that starts with the same text would match too.
pattern = "10.5281/zenodo.899002"
doi_text = data_clean_hierarchy["doi"]
mask = doi_text.str.contains(pattern, regex=False, na=False)
result = data_clean_hierarchy.loc[mask]
result

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,institutions_flat,countries_flat,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,date_first_seen,publication_year_first_seen
4820937,datacite::10.5281/zenodo.899002,Zenodo,datacite,10.5281/zenodo.899002,https://doi.org/10.5281/zenodo.899002,https://zenodo.org/record/899002,Biobutanol production from apple pomace: The i...,"Hijosa-Valsero, María; Paniagua-García, Ana Is...",Instituto Tecnológico Agrario de Castilla y Le...,,"[{""relatedIdentifier"": ""10.1007/s00253-017-852...",,,,,,False,,,,"{""client"": {""data"": {""id"": ""cern.zenodo"", ""typ...",parent,2017-09-21,2017


In [18]:
# Rows whose relations_json text contains the case-sensitive substring "Cites"
# (missing values do not match).
pattern = "Cites"
mask = data_clean_hierarchy["relations_json"].str.contains(
    pattern,
    regex=False,
    na=False,
)
result = data_clean_hierarchy.loc[mask]
result

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,institutions_flat,countries_flat,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,date_first_seen,publication_year_first_seen
270136,datacite::10.17181/f7yev-pd160,CERN document server,datacite,10.17181/f7yev-pd160,https://doi.org/10.17181/f7yev-pd160,https://repository.cern/doi/10.17181/f7yev-pd160,Phase-II Tracking in ATLAS and search for ttHH...,"Falda Ulhoa Coelho, Luis Felipe",Laboratory of Instrumentation and Experimental...,,"[{""relatedIdentifier"": ""arXiv:2412.15090"", ""re...",,,,,,false,,,,"{""client"": {""data"": {""id"": ""cern.cds"", ""type"":...",parent,2025-07-13,2025
1614928,datacite::10.17613/wdx2-4z21,Humanities Commons CORE,datacite,10.17613/wdx2-4z21,https://doi.org/10.17613/wdx2-4z21,https://works.hcommons.org/doi/10.17613/wdx2-4z21,"The Self, Emptiness, and Awareness","Janew, Claus",,,"[{""relatedIdentifier"": ""10.17613/6avkm-v3b70"",...",,,10.17613/8r723-4m824,,,true,,,,"{""client"": {""data"": {""id"": ""msu.core"", ""type"":...",parent,2024-05-02,2024
1623907,datacite::10.17613/8r723-4m824,Humanities Commons CORE,datacite,10.17613/8r723-4m824,https://doi.org/10.17613/8r723-4m824,https://works.hcommons.org/doi/10.17613/8r723-...,"The Self, Emptiness, and Awareness","Janew, Claus",,,"[{""relatedIdentifier"": ""10.17613/6avkm-v3b70"",...",,,,,,false,,,,"{""client"": {""data"": {""id"": ""msu.core"", ""type"":...",parent,2024-07-20,2024
1626992,datacite::10.17613/8ecgy-ra479,Humanities Commons CORE,datacite,10.17613/8ecgy-ra479,https://doi.org/10.17613/8ecgy-ra479,https://works.hcommons.org/doi/10.17613/8ecgy-...,Digital Challenges: Queens in the Crónica de ...,Marija Blašković,Pompeu Fabra University; University Pompeu Fabra,,"[{""relatedIdentifier"": ""10.4000/books.esb.63"",...",,,10.17613/p3qe4-az689,,,true,,,,"{""client"": {""data"": {""id"": ""msu.core"", ""type"":...",parent,2025-01-16,2025
1626993,datacite::10.17613/p3qe4-az689,Humanities Commons CORE,datacite,10.17613/p3qe4-az689,https://doi.org/10.17613/p3qe4-az689,https://works.hcommons.org/doi/10.17613/p3qe4-...,Digital Challenges: Queens in the Crónica de ...,Marija Blašković,Pompeu Fabra University; University Pompeu Fabra,,"[{""relatedIdentifier"": ""10.4000/books.esb.63"",...",,,,,,false,,,,"{""client"": {""data"": {""id"": ""msu.core"", ""type"":...",parent,2025-01-16,2025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4827894,datacite::10.5281/zenodo.5033245,Zenodo,datacite,10.5281/zenodo.5033245,https://doi.org/10.5281/zenodo.5033245,https://zenodo.org/record/5033245,Early Treatment of the Inflammatory Stage of C...,"Bastian, Elizabeth; Karrow, Niel A.; Halgas, O...","Family Practice Oncology Network, British Colu...",,"[{""relatedIdentifier"": ""10.31083/j.rcm.2020.04...",,,,,,false,,,,"{""client"": {""data"": {""id"": ""cern.zenodo"", ""typ...",parent,2021-06-25,2021
4828011,datacite::10.5281/zenodo.5091840,Zenodo,datacite,10.5281/zenodo.5091840,https://doi.org/10.5281/zenodo.5091840,https://zenodo.org/record/5091840,Mannitol bioproduction from surplus grape must...,"Hijosa-Valsero, María; Garita-Cambronero, Jers...",,,"[{""relatedIdentifier"": ""10.1016/j.lwt.2021.112...",,,10.5281/zenodo.5091839,,,true,,,,"{""client"": {""data"": {""id"": ""cern.zenodo"", ""typ...",parent,2021-07-12,2021
4828012,datacite::10.5281/zenodo.5091839,Zenodo,datacite,10.5281/zenodo.5091839,https://doi.org/10.5281/zenodo.5091839,https://zenodo.org/record/5091839,Mannitol bioproduction from surplus grape must...,"Hijosa-Valsero, María; Garita-Cambronero, Jers...",,,"[{""relatedIdentifier"": ""10.1016/j.lwt.2021.112...",,,,,,false,,,,"{""client"": {""data"": {""id"": ""cern.zenodo"", ""typ...",parent,2021-07-12,2021
4828121,datacite::10.5281/zenodo.5142604,Zenodo,datacite,10.5281/zenodo.5142604,https://doi.org/10.5281/zenodo.5142604,https://zenodo.org/record/5142604,Rebuttal of Taylor and Barrón-Ortiz 2021 Rethi...,"Outram, Alan K.; Bendrey, Robin; Evershed, Ric...",University of Exeter; University of Edinburgh;...,,"[{""relatedIdentifier"": ""10.1126/science.116859...",,,10.5281/zenodo.5142603,,10.1038/s41598-021-86832-9,true,,,,"{""client"": {""data"": {""id"": ""cern.zenodo"", ""typ...",parent,2021-07-28,2021


In [19]:
# Show the relations_json value stored under key 3385298 of the column
# (presumably an index label rather than a position -- TODO confirm the index type).
data_clean_hierarchy['relations_json'][3385298]

In [20]:
# NOTE(review): this cell repeats the exact lookup of the preceding cell
# (same column, same key 3385298); it looks like a leftover duplicate.
data_clean_hierarchy['relations_json'][3385298]

In [21]:
# Keep the rows whose is_preprint_of is not NaN/None.
# NOTE(review): the rendered output of this cell lists rows whose is_preprint_of
# cell is blank, so the column presumably holds empty strings (which .notna()
# treats as present) -- verify before using this filter as "has a value".
data_clean_hierarchy[data_clean_hierarchy['is_preprint_of'].notna()]

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,institutions_flat,countries_flat,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,date_first_seen,publication_year_first_seen
0,crossref::10.21467/preprints.48,AIJR Preprints,crossref,10.21467/preprints.48,https://doi.org/10.21467/preprints.48,https://preprints.aijr.org/index.php/ap/prepri...,"Bird’s Eye View on the Diagnosis, Treatment, &...","Panchalingala, Sai Bhargavi",,,,,,,,,false,,,,,parent,2020-05-03,2020
1,crossref::10.21467/preprints.43,AIJR Preprints,crossref,10.21467/preprints.43,https://doi.org/10.21467/preprints.43,https://preprints.aijr.org/index.php/ap/prepri...,Doxycycline and Minocycline Drugs as a Treatme...,"Mostafa, Mohamed",,,,,,,,,false,,,,,parent,2020-04-25,2020
2,crossref::10.21467/preprints.39,AIJR Preprints,crossref,10.21467/preprints.39,https://doi.org/10.21467/preprints.39,https://preprints.aijr.org/index.php/ap/prepri...,A Genetic Perspective of 2019-nCoV in Relation...,"Dasgupta, Rimjhim",,,,,,,,,false,,,,,parent,2020-04-16,2020
3,crossref::10.21467/preprints.38,AIJR Preprints,crossref,10.21467/preprints.38,https://doi.org/10.21467/preprints.38,https://preprints.aijr.org/index.php/ap/prepri...,Marine Algae as a Natural Source for Antiviral...,"Musale, Amar S; G., Raja Krishna Kumar; Sapre,...",,,,,,,,,false,,,,,parent,2020-04-15,2020
4,crossref::10.21467/preprints.36,AIJR Preprints,crossref,10.21467/preprints.36,https://doi.org/10.21467/preprints.36,https://preprints.aijr.org/index.php/ap/prepri...,Possible Prevention of COVID 19 by Using Linol...,"Subhash, Venkata; G, Raja Krishna Kumar; Sapre...",,,,,,,,,false,,,,,parent,2020-04-15,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8384519,crossref::10.36375/prepare_u.iiche.a417,prepare@u,crossref,10.36375/prepare_u.iiche.a417,https://doi.org/10.36375/prepare_u.iiche.a417,https://preprint.prepare.org.in/index.php/iich...,Effects of Indoor Plants on Occupants' Percept...,"BUDANIYA, MUKESH; DASGUPTA, MANI SANKAR",,,,,,,,,false,,,,,parent,2024-08-21,2024
8384520,crossref::10.36375/prepare_u.iiche.a418,prepare@u,crossref,10.36375/prepare_u.iiche.a418,https://doi.org/10.36375/prepare_u.iiche.a418,https://preprint.prepare.org.in/index.php/iich...,Replacement of coal by RDF (Refused Derived Fuel),"KUILA, SUNIL BARAN; DAS, SOUMYADIP; THAKUR, RI...",,,,,,,,,false,,,,,parent,2023-12-30,2023
8384521,crossref::10.36375/prepare_u.a419,prepare@u,crossref,10.36375/prepare_u.a419,https://doi.org/10.36375/prepare_u.a419,https://preprint.prepare.org.in/index.php/prep...,Box Type Solar Cooker Components,"Sarma, Kota Anjaneya",,,,,,,,,false,,,,,parent,2025-02-25,2025
8384522,crossref::10.1079/searchrxiv.2022.00012,searchRxiv,crossref,10.1079/searchrxiv.2022.00012,https://doi.org/10.1079/searchrxiv.2022.00012,http://www.cabidigitallibrary.org/doi/10.1079/...,Withdrawal Statement,"Publishing, CABI",,,,,,,,,false,,,,,parent,2022-08-09,2022


# Resolve duplicates across OSF + OSF communities using the OSF id (osf.io/<id>)

## Quick proof (diagnostic)

In [22]:
import pandas as pd
import re

def extract_osf_id(x):
    """
    Return the identifier that follows 'osf.io/' in a DOI or URL.

    The value is stringified and lower-cased, then the first run of
    [a-z0-9] characters after 'osf.io/' is returned. Missing input, or
    input without such a run, yields pd.NA.

    Caveat: only the first path segment is captured, so a URL such as
    'osf.io/preprints/<provider>/<id>' yields 'preprints', not '<id>'.
    """
    if pd.isna(x):
        return pd.NA
    found = re.search(r"osf\.io/([a-z0-9]+)", str(x).lower())
    if found is None:
        return pd.NA
    return found.group(1)

# Work on a copy so the diagnostic helper columns never leak into data_clean_hierarchy.
tmp = data_clean_hierarchy.copy()

# Extract the OSF id from each source column separately (pd.NA when the column is absent).
tmp["osf_id_from_doi"] = tmp["doi"].map(extract_osf_id) if "doi" in tmp.columns else pd.NA
tmp["osf_id_from_lp"]  = tmp["landing_page_url"].map(extract_osf_id) if "landing_page_url" in tmp.columns else pd.NA

# The DOI-derived id wins; the landing-page id is used only where the DOI gave nothing.
tmp["osf_id"] = tmp["osf_id_from_doi"].fillna(tmp["osf_id_from_lp"])

print("Rows with OSF id:", tmp["osf_id"].notna().sum())
# (rows carrying an id) - (distinct ids) = rows beyond the first occurrence of each id.
print("Duplicated OSF id rows:", tmp["osf_id"].notna().sum() - tmp["osf_id"].dropna().nunique())
# Top 20 servers among the rows that carry an OSF id.
print(tmp.loc[tmp["osf_id"].notna(), "server_name"].value_counts().head(20))


Rows with OSF id: 234869
Duplicated OSF id rows: 27858
server_name
Open Science Framework     119481
PsyArXiv                    56866
SocArXiv                    21541
INA-Rxiv                    17837
Thesis Commons               3959
EdArXiv                      2547
engrXiv                      2196
EarthArXiv                   1844
Law Archive                  1808
EcoEvoRxiv                    938
MetaArXiv                     880
MarXiv                        508
Arabixiv                      502
AfricArXiv                    501
AgriRxiv                      438
LIS Scholarship Archive       397
SportRxiv                     386
MindRxiv                      335
ECSarXiv                      314
MediArXiv                     309
Name: count, dtype: int64


In [23]:
# Render the diagnostic copy, including its osf_id_from_doi / osf_id_from_lp / osf_id columns.
tmp

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,institutions_flat,countries_flat,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,date_first_seen,publication_year_first_seen,osf_id_from_doi,osf_id_from_lp,osf_id
0,crossref::10.21467/preprints.48,AIJR Preprints,crossref,10.21467/preprints.48,https://doi.org/10.21467/preprints.48,https://preprints.aijr.org/index.php/ap/prepri...,"Bird’s Eye View on the Diagnosis, Treatment, &...","Panchalingala, Sai Bhargavi",,,,,,,,,false,,,,,parent,2020-05-03,2020,,,
1,crossref::10.21467/preprints.43,AIJR Preprints,crossref,10.21467/preprints.43,https://doi.org/10.21467/preprints.43,https://preprints.aijr.org/index.php/ap/prepri...,Doxycycline and Minocycline Drugs as a Treatme...,"Mostafa, Mohamed",,,,,,,,,false,,,,,parent,2020-04-25,2020,,,
2,crossref::10.21467/preprints.39,AIJR Preprints,crossref,10.21467/preprints.39,https://doi.org/10.21467/preprints.39,https://preprints.aijr.org/index.php/ap/prepri...,A Genetic Perspective of 2019-nCoV in Relation...,"Dasgupta, Rimjhim",,,,,,,,,false,,,,,parent,2020-04-16,2020,,,
3,crossref::10.21467/preprints.38,AIJR Preprints,crossref,10.21467/preprints.38,https://doi.org/10.21467/preprints.38,https://preprints.aijr.org/index.php/ap/prepri...,Marine Algae as a Natural Source for Antiviral...,"Musale, Amar S; G., Raja Krishna Kumar; Sapre,...",,,,,,,,,false,,,,,parent,2020-04-15,2020,,,
4,crossref::10.21467/preprints.36,AIJR Preprints,crossref,10.21467/preprints.36,https://doi.org/10.21467/preprints.36,https://preprints.aijr.org/index.php/ap/prepri...,Possible Prevention of COVID 19 by Using Linol...,"Subhash, Venkata; G, Raja Krishna Kumar; Sapre...",,,,,,,,,false,,,,,parent,2020-04-15,2020,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8410089,openalex::W999325625,viXra,openalex,,,https://vixra.org/pdf/1409.0090v1.pdf,Three Objections to Modern Physics,Lubomir Vlcek,,,,,,,,,,,,,,parent,2014-09-01,2014,,,
8410090,openalex::W999460032,viXra,openalex,,,https://vixra.org/abs/1112.0094,Particle Mass Ratios,DT Froedge,,,,,,,,,,,,,,parent,2011-12-01,2011,,,
8410091,openalex::W99967155,viXra,openalex,,,https://vixra.org/pdf/1406.0019v1.pdf,Quantum FFF Theory Proposals for Some Unsolved...,Leo Vuyk,,,,,,,,,,,,,,parent,2014-06-01,2014,,,
8410092,openalex::W999790414,viXra,openalex,,,https://vixra.org/pdf/1306.0105v3.pdf,Investigation of the Formalism of Particle Dyn...,Chi-Yi Chen,,,,,,,,,,,,,,parent,2013-06-01,2013,,,


## function: resolve OSF duplicates by OSF ID (works across different DOI prefixes)

In [24]:
import pandas as pd
import numpy as np
import re

def resolve_osf_duplicates_fast(
    df: pd.DataFrame,
    preferred_parent_servers: set,
    prefer_backend: str = "crossref",
    choose_parent: str = "oldest",           # "oldest" or "most_recent"
    date_col: str = "date_created",          # if missing -> record_id fallback
    overwrite_mode: str = "parent_only",     # "any" | "parent_only" | "unlabeled_only"
    # columns
    server_col: str = "server_name",
    backend_col: str = "backend",
    record_id_col: str = "record_id",
    doi_col: str = "doi",
    landing_col: str = "landing_page_url",
    hierarchy_col: str = "records_hierarchy",
    parent_id_col: str = "parent_record_id",
    # perf knobs
    coarse_filter: str = "osf.io/",          # cheap contains() filter before regex
    min_group_size: int = 2
) -> pd.DataFrame:
    """
    Fast OSF duplicate resolver for huge frames (millions of rows).

    Key idea:
      - Avoid regex over the full dataframe.
      - First, cheaply filter rows that contain `coarse_filter` in doi or landing_page_url.
      - Only then extract the OSF id and group to resolve duplicates.
      - Finally, write results back only for touched rows.

    Parent selection priority (within each OSF id group):
      1) server listed in `preferred_parent_servers`
      2) backend == prefer_backend (case-insensitive)
      3) date (oldest/most_recent; unparseable or missing dates sort last)
      4) numeric key = first digit run of record_id
      5) full record_id string (final tie-break, so the chosen parent does not
         depend on the input row order)

    Parameters
    ----------
    df : input frame; it is copied, never modified in place.
    preferred_parent_servers : server names that should win the parent role.
    prefer_backend : backend value preferred as parent.
    choose_parent : "oldest" or "most_recent" (direction of the date key).
    date_col : date column used for the date key; if absent the key is NaT.
    overwrite_mode : which rows may be relabelled:
        "any" (all rows), "parent_only" (hierarchy currently "parent"),
        "unlabeled_only" (hierarchy currently missing).
    *_col : column names; `hierarchy_col` / `parent_id_col` are created
        (filled with pd.NA) when missing.
    coarse_filter : literal substring used for the cheap pre-filter.
    min_group_size : minimum number of rows sharing an OSF id for the group
        to be resolved.

    Returns
    -------
    A copy of `df` where, for every resolved group, exactly one row is labelled
    "parent" (parent id set to pd.NA) and every other row is labelled
    "mirror (<parent server>)" with `parent_id_col` set to the parent's record id.

    Raises
    ------
    ValueError for an unknown `overwrite_mode`, or for an unknown
    `choose_parent` once candidate OSF rows have been found.

    Notes
    -----
    - Only the first [a-z0-9]+ run after "osf.io/" is used as the id, so a
      landing page such as "osf.io/preprints/<provider>/<id>" yields
      "preprints". NOTE(review): confirm such URLs cannot reach this function
      without an OSF DOI, otherwise unrelated rows would share one group.
    - Write-back is label based; assumes `df` has a unique index -- TODO confirm.
    """

    out = df.copy()

    # Ensure output cols exist
    if hierarchy_col not in out.columns:
        out[hierarchy_col] = pd.NA
    if parent_id_col not in out.columns:
        out[parent_id_col] = pd.NA

    # Eligibility mask (run only where you allow overwriting)
    h = out[hierarchy_col]
    if overwrite_mode == "any":
        eligible = pd.Series(True, index=out.index)
    elif overwrite_mode == "parent_only":
        eligible = h.astype(str).str.strip().str.lower().eq("parent")
    elif overwrite_mode == "unlabeled_only":
        eligible = h.isna()
    else:
        raise ValueError("overwrite_mode must be: any | parent_only | unlabeled_only")

    # ---- 1) Coarse filter: only rows likely to be OSF-related (very fast)
    # fillna("") keeps the contains() result strictly boolean.
    m_osf = pd.Series(False, index=out.index)

    if doi_col in out.columns:
        m_osf |= out[doi_col].fillna("").astype(str).str.contains(coarse_filter, case=False, regex=False)
    if landing_col in out.columns:
        m_osf |= out[landing_col].fillna("").astype(str).str.contains(coarse_filter, case=False, regex=False)

    m = eligible & m_osf
    if not m.any():
        return out

    # Work on subset only
    sub = out.loc[m, [server_col, backend_col, record_id_col]].copy()

    # ---- 2) Extract OSF id from the subset only (regex never touches the full frame)
    # Plain pattern string: the input is lower-cased first, so no IGNORECASE flag is needed.
    osf_id_pattern = r"osf\.io/([a-z0-9]+)"

    def extract_osf_id_series(s: pd.Series) -> pd.Series:
        return s.fillna("").astype(str).str.lower().str.extract(osf_id_pattern, expand=False)

    osf_id = pd.Series(pd.NA, index=sub.index, dtype="object")
    if doi_col in out.columns:
        osf_id = extract_osf_id_series(out.loc[m, doi_col])
    if landing_col in out.columns:
        # DOI-derived id wins; landing page only fills the gaps.
        osf_id = osf_id.fillna(extract_osf_id_series(out.loc[m, landing_col]))

    # drop rows with no extracted id (coarse filter can include a few false positives)
    sub["_osf_id"] = osf_id
    sub = sub[sub["_osf_id"].notna()].copy()
    if sub.empty:
        return out

    # ---- 3) Prepare sort keys on subset
    sub["_is_pref_server"] = sub[server_col].isin(preferred_parent_servers)

    # backend preference (vectorized)
    sub["_is_pref_backend"] = (
        sub[backend_col].fillna("").astype(str).str.lower().eq(str(prefer_backend).lower())
    )

    # date key
    if date_col in out.columns:
        sub["_dt"] = pd.to_datetime(out.loc[sub.index, date_col], errors="coerce")
    else:
        sub["_dt"] = pd.NaT

    # record_id numeric key (first digit run)
    rid_digits = sub[record_id_col].astype(str).str.extract(r"(\d+)")[0]
    sub["_rid_key"] = pd.to_numeric(rid_digits, errors="coerce")
    # The numeric key is identical for ids that start with the same digits
    # (e.g. every "...::10.xxxx/..." DOI gives 10), so add the full id as the
    # last key to make the choice independent of the input row order.
    sub["_rid_str"] = sub[record_id_col].astype(str)

    if choose_parent not in {"oldest", "most_recent"}:
        raise ValueError("choose_parent must be: oldest | most_recent")
    date_asc = (choose_parent == "oldest")

    # ---- 4) Only keep OSF ids that actually have duplicates (saves time)
    dup_ids = sub["_osf_id"].value_counts()
    dup_ids = dup_ids[dup_ids >= min_group_size].index
    sub = sub[sub["_osf_id"].isin(dup_ids)].copy()
    if sub.empty:
        return out

    # ---- 5) Resolve per OSF id
    # Sort once, then the first row of every group is its parent.
    sub_sorted = sub.sort_values(
        by=["_osf_id", "_is_pref_server", "_is_pref_backend", "_dt", "_rid_key", "_rid_str"],
        ascending=[True, False, False, date_asc, True, True],
        na_position="last"
    )

    # Identify the parent by POSITION (first row per group), not by comparing
    # record ids: a missing record_id would otherwise leave a group without any
    # parent, and a repeated record_id would yield several parents.
    is_parent_row = ~sub_sorted["_osf_id"].duplicated(keep="first")
    parents = sub_sorted.loc[is_parent_row]
    children = sub_sorted.loc[~is_parent_row]

    parent_map_rid = parents.set_index("_osf_id")[record_id_col]
    parent_map_srv = parents.set_index("_osf_id")[server_col]

    # Write back to OUT
    out.loc[parents.index, hierarchy_col] = "parent"
    out.loc[parents.index, parent_id_col] = pd.NA

    if not children.empty:
        child_parent_srv = children["_osf_id"].map(parent_map_srv)
        child_parent_rid = children["_osf_id"].map(parent_map_rid)
        # Positional arrays: same order as children.index.
        out.loc[children.index, hierarchy_col] = (
            "mirror (" + child_parent_srv.astype(str) + ")"
        ).to_numpy()
        out.loc[children.index, parent_id_col] = child_parent_rid.to_numpy()

    return out


In [25]:
# Server names that should win the "parent" role when several rows share one OSF id
# (passed below as preferred_parent_servers).
PREFERRED_OSF_PARENT = {
    "SocArXiv","Law Archive","PsyArXiv","EdArXiv","EarthArXiv","Thesis Commons",
    "LIS Scholarship Archive","SportRxiv","INA-Rxiv","Arabixiv","engrXiv","MetaArXiv",
    "MindRxiv","MarXiv","AgriRxiv","NutriXiv","ECSarXiv","FocUS Archive","Frenxiv",
    "EcoEvoRxiv","IndiaRxiv","PaleorXiv","AfricArXiv","BioHackrXiv","MediArXiv"
}

# NOTE: this rebinds data_clean_hierarchy to the resolver's output, so later cells
# see the relabelled frame. overwrite_mode="parent_only" restricts relabelling to
# rows whose records_hierarchy is currently "parent".
data_clean_hierarchy = resolve_osf_duplicates_fast(
    df=data_clean_hierarchy,
    preferred_parent_servers=PREFERRED_OSF_PARENT,
    prefer_backend="crossref",
    choose_parent="oldest",
    date_col="date_first_seen",
    overwrite_mode="parent_only",
)

# Label distribution after the OSF pass (top 30, missing labels included).
print(data_clean_hierarchy["records_hierarchy"].value_counts(dropna=False).head(30))


records_hierarchy
parent                              7931373
review                               175443
part_of                              149454
version                              105977
publish_version                        9351
mirror (arXiv)                         8286
mirror (AgEcon Search)                 6702
mirror (Open Science Framework)        4356
mirror (PsyArXiv)                      4304
mirror (INA-Rxiv)                      3768
mirror (SocArXiv)                      3395
child                                  2028
mirror (ResearchGate)                  1633
mirror (Law Archive)                    935
mirror (EarthArXiv)                     410
correction                              355
mirror (Thesis Commons)                 346
comment                                 325
mirror (Zenodo)                         297
mirror (engrXiv)                        256
mirror (MarXiv)                         194
mirror (LIS Scholarship Archive)        137
mirror (MindRx

# dedupe on title+authors (+ optional year)

In [26]:
"""
Reproducible 2-pass dedupe pipeline (Exact pass -> Fuzzy pass) with:
- Strong-but-cheap title normalization (cached)
- 3 author signatures: tokenbag | last_initial | last
- Stage A strict (title + authors_fp) exact
- Stage B relaxed (shared authors overlap) within exact-title groups (optional per stage)
- Optional fuzzy title fallback (token containment) BLOCKED by authors_fp (+ optional year)
- Prefilter modes:
    * title_dup  : keep rows where cleaned title repeats (fast exact stages)
    * author_dup : keep rows where authors_fp repeats (enables fuzzy stages when titles differ)
    * none       : keep all eligible (debug)

Includes:
- Metrics counters per stage
- Summary printing + early stop
- Deterministic labeling
- Designed for speed + low false positives (especially with last_initial)

USAGE:
1) Define STAGES_EXACT and STAGES_FUZZY
2) Run:
   df_out, metrics = run_dedupe_pipeline_two_passes(
       df,
       stages_exact=STAGES_EXACT,
       stages_fuzzy=STAGES_FUZZY,
       early_stop_if_new_labels_lt=500,
       print_summary=True,
       return_all_metrics=True,
       servers=None,
       across_servers=True,
       use_year=False,
       choose_parent="oldest",
       prefilter=True,
       date_candidates=('date_first_seen',),
       hierarchy_col="records_hierarchy",
       parent_id_col="parent_record_id",
       group_id_col="dup_group_id",
       add_authors_fingerprint_col=True,
       add_title_clean_col=True,
   )
"""

import pandas as pd
import numpy as np
import re
import time
import unicodedata
from typing import Iterable, Optional, Dict, Any, Tuple, List

# ============================================================
# 0) Regex + NA helpers
# ============================================================
# One or more whitespace characters (used to collapse runs into a single space).
_WS = re.compile(r"\s+")
# Anything that is neither a word character nor whitespace. Note that "_" is a
# word character for \w, so underscores are NOT matched by this pattern.
_PUNCT_ALL = re.compile(r"[^\w\s]", re.UNICODE)  # remove everything except word chars + spaces
# Lower-case string values treated as "missing" by the normalizers in this cell.
NA_LIKE = {"", "none", "null", "nan", "n/a", "[]", "{}", "na"}


# ============================================================
# 1) Utility: pick a date column + record_id numeric fallback
# ============================================================
def _pick_first_existing(df: pd.DataFrame, candidates: Iterable[str]) -> Optional[str]:
    for c in candidates:
        if c in df.columns:
            return c
    return None


def _record_id_key(s: pd.Series) -> pd.Series:
    """Fast numeric key from record_id (extract first digits)."""
    digits = s.astype("string").str.extract(r"(\d+)")[0]
    return pd.to_numeric(digits, errors="coerce")


# ============================================================
# 2) Title normalization (cheap, high ROI) + token containment
# ============================================================
def _strip_accents_text(x: str) -> str:
    return "".join(
        c for c in unicodedata.normalize("NFKD", x) if not unicodedata.combining(c)
    )


def _clean_title_series_v2(s: pd.Series) -> pd.Series:
    """
    Normalize a Series of titles for exact matching.

    Steps, in order:
      1. cast to string, replace missing values by "", trim, lowercase
      2. blank out values listed in NA_LIKE
      3. strip accents
      4. replace every character matched by _PUNCT_ALL with a space
      5. collapse whitespace runs to one space and trim
    """
    lowered = s.astype("string").fillna("").str.strip().str.lower()
    is_placeholder = lowered.isin(list(NA_LIKE))
    blanked = lowered.mask(is_placeholder, "")
    unaccented = blanked.apply(_strip_accents_text)
    spaced = unaccented.str.replace(_PUNCT_ALL, " ", regex=True)
    collapsed = spaced.str.replace(_WS, " ", regex=True)
    return collapsed.str.strip()


def _title_tokens_from_clean(title_clean: str) -> List[str]:
    """Tokenize already-clean title into tokens; drop very short tokens (len < 2)."""
    if not title_clean:
        return []
    return [t for t in title_clean.split(" ") if len(t) >= 2]


def _containment_score(a_tokens: List[str], b_tokens: List[str]) -> float:
    """
    Containment score:
        |A ∩ B| / min(|A|, |B|)
    Good for small title differences when tokens still mostly match.
    """
    if not a_tokens or not b_tokens:
        return 0.0
    A, B = set(a_tokens), set(b_tokens)
    denom = min(len(A), len(B))
    if denom <= 0:
        return 0.0
    return len(A & B) / denom


# ============================================================
# 3) Author canonicalization (3 modes)
# ============================================================
def _strip_accents(s: str) -> str:
    return "".join(
        c for c in unicodedata.normalize("NFKD", s) if not unicodedata.combining(c)
    )


def _normalize_one_author_tokenbag(author: str) -> str:
    """
    Order-insensitive signature for one author name.

    The name is accent-stripped, lower-cased and trimmed; characters matched by
    _PUNCT_ALL become spaces; the remaining tokens are sorted and joined with "_".
    Returns "" for falsy input, NA_LIKE placeholders, or names with no token left.
    """
    if not author:
        return ""
    folded = _strip_accents(str(author)).lower().strip()
    if folded == "" or folded in NA_LIKE:
        return ""
    collapsed = _WS.sub(" ", _PUNCT_ALL.sub(" ", folded)).strip()
    # An empty `collapsed` splits to [""], which the filter reduces to [] -> "".
    pieces = sorted(filter(None, collapsed.split(" ")))
    return "_".join(pieces)


def _normalize_one_author_last_initial(author: str) -> str:
    """
    Signature "last|first_initial" for one author name.

    - With a comma ("Last, First ..."): last = first token left of the first
      comma, initial = first letter of the first token right of it.
    - Without a comma ("First ... Last"): last = final token, initial = first
      letter of the first token; at least two tokens are required.
    - Returns "" whenever the last name or the initial cannot be determined,
      and for falsy input or NA_LIKE placeholders.
    """
    if not author:
        return ""
    folded = _strip_accents(str(author)).lower().strip()
    if not folded or folded in NA_LIKE:
        return ""

    def _tokens(fragment: str) -> list:
        # punctuation -> spaces, collapse whitespace, split into non-empty tokens
        squeezed = _WS.sub(" ", _PUNCT_ALL.sub(" ", fragment)).strip()
        return [tok for tok in squeezed.split(" ") if tok]

    surname_part, comma, given_part = folded.partition(",")
    if comma:
        surname_tokens = _tokens(surname_part)
        given_tokens = _tokens(given_part)
        if not surname_tokens or not given_tokens:
            return ""  # missing side -> no signature (avoids false positives)
        surname = surname_tokens[0]  # multi-token surname: only its first token is kept
        initial = given_tokens[0][:1]
        return f"{surname}|{initial}" if initial else ""

    tokens = _tokens(folded)
    if len(tokens) < 2:
        return ""
    initial, surname = tokens[0][:1], tokens[-1]
    return f"{surname}|{initial}" if (initial and surname) else ""


def _normalize_one_author_last(author: str) -> str:
    """
    Last-name-only signature (high recall, more false positives).

    With a comma, the first token left of the first comma is returned;
    without one, the final token of the name. Returns "" for falsy input,
    NA_LIKE placeholders, or when no token remains after cleaning.
    """
    if not author:
        return ""
    folded = _strip_accents(str(author)).lower().strip()
    if not folded or folded in NA_LIKE:
        return ""

    head, comma, _rest = folded.partition(",")
    segment = head if comma else folded
    cleaned = _WS.sub(" ", _PUNCT_ALL.sub(" ", segment)).strip()
    parts = [p for p in cleaned.split(" ") if p]
    if not parts:
        return ""
    return parts[0] if comma else parts[-1]


def build_authors_fingerprint_series(authors_flat: pd.Series, mode: str) -> pd.Series:
    """
    Turn each row's ';'-separated author list into a canonical fingerprint.

    Per row: split on ';', normalize every author with the normalizer selected
    by `mode`, drop empty results, de-duplicate, sort, and join with ';'.
    Rows that are missing or equal (case-insensitively) to an NA_LIKE
    placeholder produce "".

    Raises ValueError when `mode` is not tokenbag, last_initial or last.
    """
    normalizers = {
        "tokenbag": _normalize_one_author_tokenbag,
        "last_initial": _normalize_one_author_last_initial,
        "last": _normalize_one_author_last,
    }
    if mode not in normalizers:
        raise ValueError("mode must be tokenbag | last_initial | last")
    norm_fn = normalizers[mode]

    cleaned = authors_flat.astype("string").fillna("").str.strip()
    is_placeholder = cleaned.str.lower().isin(list(NA_LIKE))
    cleaned = cleaned.mask(is_placeholder, "")

    def row_to_fp(x: str) -> str:
        if not x:
            return ""
        names = (chunk.strip() for chunk in str(x).split(";"))
        signatures = {norm_fn(name) for name in names if name}
        signatures.discard("")
        return ";".join(sorted(signatures))

    return cleaned.apply(row_to_fp)


def _author_tokens_from_fp(fp: str) -> List[str]:
    if not fp:
        return []
    return [t for t in fp.split(";") if t]


def _overlap_count(a_tokens: List[str], b_tokens: List[str]) -> int:
    if not a_tokens or not b_tokens:
        return 0
    return len(set(a_tokens) & set(b_tokens))


# ============================================================
# 4) Single-stage dedupe:
#    - Prefilter (title_dup/author_dup/none)
#    - Stage A strict (title + authors_fp) exact match
#    - Optional fuzzy title fallback (within same authors_fp)
#    - Optional Stage B relaxed (shared authors overlap) within exact-title groups
# ============================================================
def dedupe_title_authors_stage(
    df: pd.DataFrame,
    *,
    # stage config
    stage_name: str = "stage",
    authors_fp_mode: str = "tokenbag",         # tokenbag | last_initial | last

    # fuzzy config (title containment), executed only if enabled
    title_fuzzy_fallback: bool = False,
    min_title_tokens: int = 6,
    min_title_containment: float = 0.70,
    fuzzy_compare_strategy: str = "parent_only",  # parent_only | all_pairs_small (parent_only is safest/fastest)

    # relaxed (shared authors overlap) config (exact title only)
    relaxed_shared_authors: bool = True,
    min_authors_required: int = 2,
    min_shared_authors: int = 2,

    # prefilter strategy (important!)
    prefilter_mode: str = "title_dup",         # title_dup | author_dup | none
    prefilter: bool = True,                    # if False, skip ">=2" group filter (slower)

    # global options
    servers=None,
    across_servers: bool = True,
    use_year: bool = False,
    choose_parent: str = "oldest",             # oldest | most_recent
    overwrite_mode: str = "parent_only",       # any | parent_only | unlabeled_only

    # columns
    server_col: str = "server_name",
    record_id_col: str = "record_id",
    title_col: str = "title",
    authors_col: str = "authors_flat",
    year_col: str = "publication_year_first_seen",
    date_candidates: Tuple[str, ...] = ("date_first_seen",),

    hierarchy_col: str = "records_hierarchy",
    parent_id_col: str = "parent_record_id",
    group_id_col: str = "dup_group_id",

    # caching/debug columns
    add_authors_fingerprint_col: bool = True,
    authors_fingerprint_col: str = "authors_fp",
    add_title_clean_col: bool = True,
    title_clean_col: str = "title_clean_v2",

    return_metrics: bool = False,
) -> pd.DataFrame | Tuple[pd.DataFrame, Dict[str, Any]]:
    """
    One dedupe stage. Designed to be composed into a multi-stage pipeline.
    """

    t0 = time.perf_counter()

    metrics: Dict[str, Any] = {
        "stage_name": stage_name,
        "n_rows_df": int(len(df)),
        "n_candidates_initial": 0,
        "prefilter_mode": prefilter_mode,
        "prefilter_rows": 0,
        "prefilter_groups": 0,
        "work_rows_after_keys": 0,

        "stageA_groups": 0,
        "stageA_children_labeled": 0,

        "fuzzy_enabled": bool(title_fuzzy_fallback),
        "fuzzy_groups": 0,
        "fuzzy_pairs_checked": 0,
        "fuzzy_children_labeled": 0,

        "stageB_enabled": bool(relaxed_shared_authors),
        "stageB_title_groups": 0,
        "stageB_clusters": 0,
        "stageB_children_labeled": 0,

        "time_s": 0.0,
    }

    # ------------------------------------------------------
    # Ensure output cols exist
    # ------------------------------------------------------
    for c in (hierarchy_col, parent_id_col, group_id_col):
        if c not in df.columns:
            df[c] = pd.NA

    if add_authors_fingerprint_col and authors_fingerprint_col not in df.columns:
        df[authors_fingerprint_col] = pd.NA
    if add_title_clean_col and title_clean_col not in df.columns:
        df[title_clean_col] = pd.NA

    # ------------------------------------------------------
    # Eligibility
    # ------------------------------------------------------
    h = df[hierarchy_col]
    if overwrite_mode == "any":
        eligible = pd.Series(True, index=df.index)
    elif overwrite_mode == "parent_only":
        eligible = h.astype("string").str.lower().str.strip().eq("parent")
    elif overwrite_mode == "unlabeled_only":
        eligible = h.isna()
    else:
        raise ValueError("overwrite_mode must be any | parent_only | unlabeled_only")

    # server filter
    if servers is None:
        server_mask = pd.Series(True, index=df.index)
    elif isinstance(servers, str):
        server_mask = df[server_col].eq(servers)
    else:
        server_mask = df[server_col].isin(list(servers))

    m = eligible & server_mask
    metrics["n_candidates_initial"] = int(m.sum())
    if not m.any():
        metrics["time_s"] = time.perf_counter() - t0
        return (df, metrics) if return_metrics else df

    # ------------------------------------------------------
    # Prefilter: decide which indices to consider in this stage
    # ------------------------------------------------------
    if prefilter_mode == "title_dup":
        # title-based prefilter (fast for exact title stages)
        t_clean = df.loc[m, title_clean_col] if (add_title_clean_col and title_clean_col in df.columns and df.loc[m, title_clean_col].notna().any()) else None
        if t_clean is None:
            t_clean = _clean_title_series_v2(df.loc[m, title_col])
        vc = t_clean.value_counts()
        keep_idx = t_clean[t_clean.isin(vc[vc >= 2].index)].index
        metrics["prefilter_groups"] = int((vc >= 2).sum())

    elif prefilter_mode == "author_dup":
        # author-fp based prefilter (crucial for fuzzy pass; titles may differ)
        # compute fp only for m rows
        a_fp = build_authors_fingerprint_series(df.loc[m, authors_col], mode=authors_fp_mode)
        vc = a_fp.value_counts()
        keep_idx = a_fp[a_fp.isin(vc[vc >= 2].index)].index
        metrics["prefilter_groups"] = int((vc >= 2).sum())

    elif prefilter_mode == "none":
        keep_idx = df.index[m]
        metrics["prefilter_groups"] = 0

    else:
        raise ValueError("prefilter_mode must be title_dup | author_dup | none")

    metrics["prefilter_rows"] = int(len(keep_idx))
    if len(keep_idx) == 0:
        metrics["time_s"] = time.perf_counter() - t0
        return (df, metrics) if return_metrics else df

    # ------------------------------------------------------
    # Work subset + compute/attach cached normalization keys
    # ------------------------------------------------------
    cols_needed = [server_col, record_id_col, title_col, authors_col]
    if use_year:
        cols_needed.append(year_col)
    date_col = _pick_first_existing(df, date_candidates)
    if date_col:
        cols_needed.append(date_col)

    work = df.loc[keep_idx, cols_needed].copy()

    # Title clean (cache to df if asked)
    if add_title_clean_col:
        # compute for missing only (cheap)
        t_missing = df.loc[work.index, title_clean_col].isna()
        if t_missing.any():
            df.loc[work.index[t_missing], title_clean_col] = _clean_title_series_v2(df.loc[work.index[t_missing], title_col]).values
        work["_t"] = df.loc[work.index, title_clean_col].astype("string").fillna("")
    else:
        work["_t"] = _clean_title_series_v2(work[title_col])

    # Authors fp (mode-specific; cache into df column if asked)
    work["_a_fp"] = build_authors_fingerprint_series(work[authors_col], mode=authors_fp_mode)
    if add_authors_fingerprint_col:
        df.loc[work.index, authors_fingerprint_col] = work["_a_fp"].values

    # Year (optional)
    if use_year:
        y = pd.to_numeric(work[year_col], errors="coerce")
        y = y.where((y >= 1000) & (y <= 3000)).round().astype("Int64")
        work["_y"] = y
    else:
        work["_y"] = pd.NA

    # require non-empty keys
    if use_year:
        work = work[(work["_t"] != "") & (work["_a_fp"] != "") & work["_y"].notna()].copy()
    else:
        work = work[(work["_t"] != "") & (work["_a_fp"] != "")].copy()

    metrics["work_rows_after_keys"] = int(len(work))
    if work.empty:
        metrics["time_s"] = time.perf_counter() - t0
        return (df, metrics) if return_metrics else df

    # ------------------------------------------------------
    # Stage A STRICT: exact match on (title_clean + authors_fp [+year] [+server scope])
    # ------------------------------------------------------
    if use_year:
        strict_base = work["_t"] + "||" + work["_a_fp"] + "||" + work["_y"].astype("string")
    else:
        strict_base = work["_t"] + "||" + work["_a_fp"]

    if across_servers:
        work["_grp_strict"] = strict_base
    else:
        work["_grp_strict"] = work[server_col].astype("string") + "||" + strict_base

    strict = work
    if prefilter:
        vcg = work["_grp_strict"].value_counts()
        dup_keys = vcg[vcg >= 2].index
        strict = work[work["_grp_strict"].isin(dup_keys)].copy()

    metrics["stageA_groups"] = int(strict["_grp_strict"].nunique()) if not strict.empty else 0

    # sort keys for parent choice
    if date_col and date_col in strict.columns:
        strict["_dt"] = pd.to_datetime(strict[date_col], errors="coerce")
    else:
        strict["_dt"] = pd.NaT
    strict["_rid"] = _record_id_key(strict[record_id_col])

    if not strict.empty:
        if choose_parent == "oldest":
            strict = strict.sort_values(
                by=["_grp_strict", "_dt", "_rid"],
                ascending=[True, True, True],
                na_position="last",
            )
        elif choose_parent == "most_recent":
            strict = strict.sort_values(
                by=["_grp_strict", "_dt", "_rid"],
                ascending=[True, False, False],
                na_position="last",
            )
        else:
            raise ValueError("choose_parent must be oldest | most_recent")

        parents = strict.groupby("_grp_strict", sort=False).head(1)
        parent_rid_map = parents.set_index("_grp_strict")[record_id_col]
        parent_srv_map = parents.set_index("_grp_strict")[server_col]

        strict["_parent_rid"] = strict["_grp_strict"].map(parent_rid_map)
        strict["_parent_srv"] = strict["_grp_strict"].map(parent_srv_map)

        is_parent = strict[record_id_col].eq(strict["_parent_rid"])
        parent_idx = strict.index[is_parent]
        child_idx = strict.index[~is_parent]

        metrics["stageA_children_labeled"] = int(len(child_idx))

        df.loc[parent_idx, hierarchy_col] = "parent"
        df.loc[parent_idx, parent_id_col] = pd.NA
        df.loc[child_idx, hierarchy_col] = (
            "parent - duplicate (" + strict.loc[child_idx, "_parent_srv"].astype("string") + ")"
        )
        df.loc[child_idx, parent_id_col] = strict.loc[child_idx, "_parent_rid"].values

        # deterministic group id
        df.loc[strict.index, group_id_col] = (
            pd.util.hash_pandas_object(strict["_grp_strict"], index=False)
            .astype("uint64")
            .astype(str)
            .values
        )

    # ------------------------------------------------------
    # Fuzzy title fallback (BLOCKED by authors_fp [+year], only remaining eligible)
    # Important: this can find near-duplicate titles because we do NOT rely on title_dup.
    # ------------------------------------------------------
    if title_fuzzy_fallback:
        # remaining eligible after Stage A
        h2 = df[hierarchy_col]
        if overwrite_mode == "parent_only":
            eligible2 = h2.astype("string").str.lower().str.strip().eq("parent")
        elif overwrite_mode == "unlabeled_only":
            eligible2 = h2.isna()
        else:
            eligible2 = pd.Series(True, index=df.index)

        remain_idx = work.index.intersection(df.index[eligible2])
        wF = work.loc[remain_idx].copy()

        if not wF.empty:
            # block by authors_fp (+year) because authors are "more trustworthy"
            if use_year:
                wF["_grp_auth"] = wF["_a_fp"] + "||" + wF["_y"].astype("string")
            else:
                wF["_grp_auth"] = wF["_a_fp"]

            # keep only blocks with >=2 rows
            vc_auth = wF["_grp_auth"].value_counts()
            keep_auth = vc_auth[vc_auth >= 2].index
            wF = wF[wF["_grp_auth"].isin(keep_auth)].copy()

            metrics["fuzzy_groups"] = int(wF["_grp_auth"].nunique()) if not wF.empty else 0

            if not wF.empty:
                # date/rid for parent selection
                if date_col and date_col in wF.columns:
                    wF["_dt"] = pd.to_datetime(wF[date_col], errors="coerce")
                else:
                    wF["_dt"] = pd.NaT
                wF["_rid"] = _record_id_key(wF[record_id_col])

                # tokens cache per row (within this stage)
                tokens_map = {idx: _title_tokens_from_clean(wF.loc[idx, "_t"]) for idx in wF.index}

                for grp, g in wF.groupby("_grp_auth", sort=False):
                    if len(g) < 2:
                        continue

                    # gate: ignore titles with too few tokens
                    idxs = [idx for idx in g.index if len(tokens_map.get(idx, [])) >= min_title_tokens]
                    if len(idxs) < 2:
                        continue

                    gg = g.loc[idxs].copy()
                    if choose_parent == "oldest":
                        gg = gg.sort_values(by=["_dt", "_rid"], ascending=[True, True], na_position="last")
                    else:
                        gg = gg.sort_values(by=["_dt", "_rid"], ascending=[False, False], na_position="last")

                    if fuzzy_compare_strategy == "parent_only":
                        parent_idx = gg.index[0]
                        parent_tokens = tokens_map[parent_idx]
                        parent_rid = gg.loc[parent_idx, record_id_col]
                        parent_srv = gg.loc[parent_idx, server_col]

                        # ensure parent labeled
                        df.loc[parent_idx, hierarchy_col] = "parent"
                        df.loc[parent_idx, parent_id_col] = pd.NA

                        for idx in gg.index[1:]:
                            metrics["fuzzy_pairs_checked"] += 1
                            sc = _containment_score(parent_tokens, tokens_map[idx])
                            if sc >= min_title_containment:
                                df.loc[idx, hierarchy_col] = f"parent - duplicate ({parent_srv})"
                                df.loc[idx, parent_id_col] = parent_rid
                                df.loc[idx, group_id_col] = f"fuzzy::{authors_fp_mode}::{grp}"
                                metrics["fuzzy_children_labeled"] += 1

                    elif fuzzy_compare_strategy == "all_pairs_small":
                        # safer than global all-pairs; still can be heavy if blocks are large.
                        # We'll cluster by greedy expansion (bounded within block).
                        idxs2 = gg.index.tolist()
                        used = set()
                        for i in idxs2:
                            if i in used:
                                continue
                            used.add(i)
                            cluster = [i]
                            for j in idxs2:
                                if j in used:
                                    continue
                                metrics["fuzzy_pairs_checked"] += 1
                                sc = _containment_score(tokens_map[i], tokens_map[j])
                                if sc >= min_title_containment:
                                    used.add(j)
                                    cluster.append(j)

                            if len(cluster) >= 2:
                                # choose parent (oldest/most recent) within cluster
                                cldf = gg.loc[cluster].copy()
                                if choose_parent == "oldest":
                                    cldf = cldf.sort_values(by=["_dt", "_rid"], ascending=[True, True], na_position="last")
                                else:
                                    cldf = cldf.sort_values(by=["_dt", "_rid"], ascending=[False, False], na_position="last")

                                p_idx = cldf.index[0]
                                p_rid = cldf.loc[p_idx, record_id_col]
                                p_srv = cldf.loc[p_idx, server_col]
                                df.loc[p_idx, hierarchy_col] = "parent"
                                df.loc[p_idx, parent_id_col] = pd.NA
                                for cidx in cldf.index[1:]:
                                    df.loc[cidx, hierarchy_col] = f"parent - duplicate ({p_srv})"
                                    df.loc[cidx, parent_id_col] = p_rid
                                    df.loc[cidx, group_id_col] = f"fuzzy::{authors_fp_mode}::{grp}"
                                    metrics["fuzzy_children_labeled"] += 1
                    else:
                        raise ValueError("fuzzy_compare_strategy must be parent_only | all_pairs_small")

    # ------------------------------------------------------
    # Stage B RELAXED (shared authors overlap) within exact title groups
    # ------------------------------------------------------
    if relaxed_shared_authors:
        h3 = df[hierarchy_col]
        if overwrite_mode == "parent_only":
            eligible3 = h3.astype("string").str.lower().str.strip().eq("parent")
        elif overwrite_mode == "unlabeled_only":
            eligible3 = h3.isna()
        else:
            eligible3 = pd.Series(True, index=df.index)

        remain_idx = work.index.intersection(df.index[eligible3])
        w2 = work.loc[remain_idx].copy()
        if not w2.empty:
            if use_year:
                relaxed_base = w2["_t"] + "||" + w2["_y"].astype("string")
            else:
                relaxed_base = w2["_t"]

            if across_servers:
                w2["_grp_title"] = relaxed_base
            else:
                w2["_grp_title"] = w2[server_col].astype("string") + "||" + relaxed_base

            # keep only repeated titles
            vc2 = w2["_grp_title"].value_counts()
            keep_groups = vc2[vc2 >= 2].index
            w2 = w2[w2["_grp_title"].isin(keep_groups)].copy()

            metrics["stageB_title_groups"] = int(w2["_grp_title"].nunique()) if not w2.empty else 0

            if not w2.empty:
                w2["_a_tokens"] = w2["_a_fp"].apply(_author_tokens_from_fp)
                w2["_a_n"] = w2["_a_tokens"].apply(len)

                if date_col and date_col in w2.columns:
                    w2["_dt"] = pd.to_datetime(w2[date_col], errors="coerce")
                else:
                    w2["_dt"] = pd.NaT
                w2["_rid"] = _record_id_key(w2[record_id_col])

                group_counter = 0
                children_total = 0

                for grp, g in w2.groupby("_grp_title", sort=False):
                    if len(g) < 2:
                        continue

                    g = g[g["_a_n"] >= min_authors_required].copy()
                    if len(g) < 2:
                        continue

                    idxs = g.index.tolist()
                    used = set()
                    clusters = []

                    # simple greedy clustering based on author overlap
                    for i in idxs:
                        if i in used:
                            continue
                        used.add(i)
                        cl = [i]
                        for j in idxs:
                            if j in used:
                                continue
                            if _overlap_count(g.loc[i, "_a_tokens"], g.loc[j, "_a_tokens"]) >= min_shared_authors:
                                used.add(j)
                                cl.append(j)
                        if len(cl) >= 2:
                            clusters.append(cl)

                    for cl in clusters:
                        group_counter += 1
                        cldf = g.loc[cl].copy()
                        if choose_parent == "oldest":
                            cldf = cldf.sort_values(by=["_dt", "_rid"], ascending=[True, True], na_position="last")
                        else:
                            cldf = cldf.sort_values(by=["_dt", "_rid"], ascending=[False, False], na_position="last")

                        parent_idx = cldf.index[0]
                        parent_rid = cldf.loc[parent_idx, record_id_col]
                        parent_srv = cldf.loc[parent_idx, server_col]

                        df.loc[parent_idx, hierarchy_col] = "parent"
                        df.loc[parent_idx, parent_id_col] = pd.NA
                        df.loc[parent_idx, group_id_col] = f"relaxed::{stage_name}::{group_counter}"

                        child_idxs = [x for x in cldf.index if x != parent_idx]
                        children_total += len(child_idxs)

                        df.loc[child_idxs, hierarchy_col] = f"parent - duplicate ({parent_srv})"
                        df.loc[child_idxs, parent_id_col] = parent_rid
                        df.loc[child_idxs, group_id_col] = f"relaxed::{stage_name}::{group_counter}"

                metrics["stageB_clusters"] = int(group_counter)
                metrics["stageB_children_labeled"] = int(children_total)

    metrics["time_s"] = time.perf_counter() - t0
    return (df, metrics) if return_metrics else df


# ============================================================
# 5) Stage runner with summary + early stop
# ============================================================
def _count_children_labels(series: pd.Series) -> int:
    s = series.astype("string").fillna("")
    return int(s.str.startswith("parent - duplicate").sum())


def run_dedupe_stages(
    df: pd.DataFrame,
    *,
    stages: List[Dict[str, Any]],
    early_stop_if_new_labels_lt: int = 100,
    print_summary: bool = True,
    return_all_metrics: bool = True,
    # common kwargs passed to every stage
    **common_kwargs,
) -> Tuple[pd.DataFrame, List[Dict[str, Any]]] | pd.DataFrame:
    """
    Runs a list of stages sequentially with:
      - delta duplicates added per stage
      - early stop
    """
    df_out = df
    metrics_all: List[Dict[str, Any]] = []

    prev_children = _count_children_labels(df_out[common_kwargs.get("hierarchy_col", "records_hierarchy")])

    for stage in stages:
        name = stage.get("name", stage.get("stage_name", "stage"))
        t0 = time.perf_counter()

        df_out, m = dedupe_title_authors_stage(
            df_out,
            return_metrics=True,
            stage_name=name,
            **common_kwargs,
            **{k: v for k, v in stage.items() if k not in {"name", "stage_name"}},
        )

        now_children = _count_children_labels(df_out[common_kwargs.get("hierarchy_col", "records_hierarchy")])
        delta = now_children - prev_children
        prev_children = now_children

        m["stage_runtime_s"] = time.perf_counter() - t0
        m["new_children_added"] = int(delta)
        metrics_all.append(m)

        if print_summary:
            print(
                f"[{name}] new_children={delta} | "
                f"cand={m['n_candidates_initial']} | "
                f"prefilter_rows={m['prefilter_rows']} | "
                f"A_children={m['stageA_children_labeled']} | "
                f"fuzzy_children={m['fuzzy_children_labeled']} | "
                f"B_children={m['stageB_children_labeled']} | "
                f"time={m['stage_runtime_s']:.2f}s"
            )

        if delta < early_stop_if_new_labels_lt:
            if print_summary:
                print(f"Early stop after {name}: delta {delta} < {early_stop_if_new_labels_lt}")
            break

    return (df_out, metrics_all) if return_all_metrics else df_out


# ============================================================
# 6) Two-pass pipeline: Exact pass -> Fuzzy pass on remaining parents
# ============================================================
def run_dedupe_pipeline_two_passes(
    df: pd.DataFrame,
    *,
    stages_exact: List[Dict[str, Any]],
    stages_fuzzy: List[Dict[str, Any]],
    early_stop_if_new_labels_lt: int = 100,
    print_summary: bool = True,
    return_all_metrics: bool = True,
    **common_kwargs,
) -> Tuple[pd.DataFrame, List[Dict[str, Any]]] | pd.DataFrame:
    """
    Run the exact stages, then the fuzzy stages, on the same frame.

    Pass A runs ``stages_exact`` (typically no fuzzy matching,
    prefilter_mode=title_dup). Pass B runs ``stages_fuzzy`` (fuzzy enabled,
    prefilter_mode=author_dup) on the result of pass A. Both passes go through
    ``run_dedupe_stages`` with the same early-stop threshold, and each pass
    applies that threshold independently. Pass B still runs when pass A stopped
    early.

    IMPORTANT:
      - For Pass A, it is normal to use overwrite_mode="any" for stage1, then "parent_only" for stage2-3.
      - For Pass B, use overwrite_mode="parent_only" so we only touch unresolved parents.

    Returns:
        ``(df, metrics)`` with the metrics of pass A followed by those of
        pass B, or only ``df`` when ``return_all_metrics`` is False.
    """
    passes = (
        ("\n=== PASS A: EXACT ===", stages_exact),
        ("\n=== PASS B: FUZZY (remaining parents) ===", stages_fuzzy),
    )

    collected: List[Dict[str, Any]] = []
    current = df

    for banner, stage_list in passes:
        if print_summary:
            print(banner)

        current, pass_metrics = run_dedupe_stages(
            current,
            stages=stage_list,
            early_stop_if_new_labels_lt=early_stop_if_new_labels_lt,
            print_summary=print_summary,
            return_all_metrics=True,
            **common_kwargs,
        )
        collected.extend(pass_metrics)

    if return_all_metrics:
        return current, collected
    return current


# ============================================================
# 7) Default stage configs (recommended)
# ============================================================

# PASS A (EXACT) — fast + high precision
# Exact-title stages, from the strictest authors fingerprint to the loosest.
# Only the first stage may relabel any row; the later ones only revisit rows
# that are still labelled "parent".
STAGES_EXACT = [
    {
        "name": "A1_tokenbag_exact",
        "authors_fp_mode": "tokenbag",
        "prefilter_mode": "title_dup",
        "title_fuzzy_fallback": False,
        "relaxed_shared_authors": True,
        "min_authors_required": 1,
        "min_shared_authors": 1,
        "overwrite_mode": "any",
        "authors_fingerprint_col": "authors_fp_tokenbag",
    },
    {
        "name": "A2_last_initial_exact",
        "authors_fp_mode": "last_initial",
        "prefilter_mode": "title_dup",
        "title_fuzzy_fallback": False,
        "relaxed_shared_authors": True,
        "min_authors_required": 1,
        "min_shared_authors": 1,
        "overwrite_mode": "parent_only",
        "authors_fingerprint_col": "authors_fp_last_initial",
    },
    {
        "name": "A3_last_exact_strict",
        "authors_fp_mode": "last",
        "prefilter_mode": "title_dup",
        "title_fuzzy_fallback": False,
        # last-only already high recall; keep strict
        "relaxed_shared_authors": False,
        "overwrite_mode": "parent_only",
        "authors_fingerprint_col": "authors_fp_last",
    },
]

# PASS B (FUZZY) — only remaining parents; block by authors_fp repetition
# Note: relaxed_shared_authors is usually OFF here to keep false positives down.
# Fuzzy-title stages: rows are blocked by a repeated authors fingerprint and
# compared by title-token containment. The relaxed author-overlap step is
# switched off, so min_authors_required / min_shared_authors are not set here.
STAGES_FUZZY = [
    {
        "name": "B1_tokenbag_fuzzy",
        "authors_fp_mode": "tokenbag",
        "prefilter_mode": "author_dup",
        "title_fuzzy_fallback": True,
        "min_title_tokens": 6,
        # start conservative; lower = more recall, more risk
        "min_title_containment": 0.70,
        "fuzzy_compare_strategy": "parent_only",
        "relaxed_shared_authors": False,
        "overwrite_mode": "parent_only",
        "authors_fingerprint_col": "authors_fp_tokenbag",
    },
    {
        "name": "B2_last_initial_fuzzy",
        "authors_fp_mode": "last_initial",
        "prefilter_mode": "author_dup",
        "title_fuzzy_fallback": True,
        "min_title_tokens": 6,
        "min_title_containment": 0.80,
        "fuzzy_compare_strategy": "parent_only",
        "relaxed_shared_authors": False,
        "overwrite_mode": "parent_only",
        "authors_fingerprint_col": "authors_fp_last_initial",
    },
]

# ============================================================
# 8) Example usage
# ============================================================
# df_out, metrics = run_dedupe_pipeline_two_passes(
#     df,
#     stages_exact=STAGES_EXACT,
#     stages_fuzzy=STAGES_FUZZY,
#     early_stop_if_new_labels_lt=500,
#     print_summary=True,
#     return_all_metrics=True,
#     servers=None,
#     across_servers=True,
#     use_year=False,
#     choose_parent="oldest",
#     prefilter=True,
#     date_candidates=('date_first_seen',),
#     hierarchy_col="records_hierarchy",
#     parent_id_col="parent_record_id",
#     group_id_col="dup_group_id",
#     add_authors_fingerprint_col=True,
#     add_title_clean_col=True,
#     title_clean_col="title_clean_v2",
# )
#
# print(metrics[-1])
# print(df_out["records_hierarchy"].value_counts(dropna=False).head(60))


In [27]:
# Keep the current labels in a side column: the dedupe stages overwrite
# 'records_hierarchy' on the frame they are given, so this preserves the
# pre-dedupe values for later comparison.
data_clean_hierarchy['records_hierarchy_copy'] = data_clean_hierarchy['records_hierarchy']

In [28]:
# ChemRxiv-only slice: which hierarchy labels exist before the dedupe pipeline runs?
is_chemrxiv = data_clean_hierarchy["server_name"].isin(['ChemRxiv'])
df = data_clean_hierarchy[is_chemrxiv]

df["records_hierarchy"].value_counts(dropna=False).head(60)

records_hierarchy
parent    46475
Name: count, dtype: int64

In [None]:
# Run both passes (exact, then fuzzy) over the full frame.
# NOTE(review): the stages label the frame they receive in place and return that
# same object, so `data_out` is `data_clean_hierarchy` itself, not a copy.
# Re-running this cell therefore starts from already-labelled data.
data_out, metrics = run_dedupe_pipeline_two_passes(
    data_clean_hierarchy,
    stages_exact=STAGES_EXACT,
    stages_fuzzy=STAGES_FUZZY,
    early_stop_if_new_labels_lt=1,  # a pass stops after the first stage that adds no new child label
    print_summary=True,
    return_all_metrics=True,
    servers=None,  # no server filter
    across_servers=True,  # duplicates may be matched across servers
    use_year=False,
    choose_parent="oldest",
    prefilter=True,
    date_candidates=('date_first_seen',),
    hierarchy_col="records_hierarchy",
    parent_id_col="parent_record_id",
    group_id_col="dup_group_id",
    add_authors_fingerprint_col=True,
    add_title_clean_col=True,
    title_clean_col="title_clean_v2",
)

# Metrics of the last stage that ran, then the label distribution after dedupe.
print(metrics[-1])
print(data_out["records_hierarchy"].value_counts(dropna=False).head(60))



=== PASS A: EXACT ===


In [None]:
# Metric dicts collected by the pipeline, one per stage that ran (pass A first, then pass B).
metrics

In [None]:
# Display the frame returned by the pipeline.
data_out

In [None]:
# Label distribution saved in 'records_hierarchy_copy' (the labels as they were before the pipeline ran).
print(data_out["records_hierarchy_copy"].value_counts(dropna=False).head(60))

In [None]:
# How many rows point at each parent record; the missing-value bucket holds rows with no parent id.
print(data_out["parent_record_id"].value_counts(dropna=False).head(60))

In [None]:
# Rows whose group id contains "fuzzy::las", i.e. children labelled by the fuzzy
# fallback with a last_initial or last fingerprint (ids look like fuzzy::<mode>::<block>).
pattern = "fuzzy::las"


# regex=False: plain substring match; na=False: rows without a group id do not match.
mask = data_out['dup_group_id'].str.contains(pattern, regex=False, na=False)
result = data_out[mask]
print(len(result))
result

In [None]:
# Inspect the relations of one row of `result` (whatever the last lookup cell left in it).
# NOTE(review): 281751 is a hard-coded index key; this fails if that row is not in
# the current `result` — confirm the preceding cell before relying on it.
result['relations_json'][281751]

In [None]:
# Look up a single record by its record_id.
data_out[data_out["record_id"]=='crossref::10.26434/chemrxiv-2021-cj17s']

In [None]:
# Rows whose parent_record_id is this record, i.e. the rows labelled as its duplicates.
data_out[data_out["parent_record_id"]=='crossref::10.26434/chemrxiv-2022-fd190']

In [None]:
# All rows whose DOI contains this fragment (plain substring match, missing DOIs excluded).
pattern = "chemrxiv.11846943"


mask = data_out['doi'].str.contains(pattern, regex=False, na=False)
result = data_out[mask]
print(len(result))
result

In [None]:
# Hierarchy labels among the rows of the current `result`.
result['records_hierarchy'].value_counts()

In [None]:
# Distinct raw titles among the rows of the current `result`.
result['title'].value_counts()

In [None]:
# Distinct raw author strings among the rows of the current `result`.
result['authors_flat'].value_counts()

In [None]:
# All rows of data_out whose DOI contains this DOI (plain substring match, missing DOIs excluded).
pattern = "10.26434/chemrxiv-2022-fd190"


mask = data_out['doi'].str.contains(pattern, regex=False, na=False)
result = data_out[mask]
print(len(result))
result

In [None]:
# Same DOI lookup, this time against data_clean_hierarchy.
# NOTE(review): the pipeline returns the frame it was given, so if data_out came
# from the pipeline cell this is the same object and shows the same rows — it is
# not a "before dedupe" view. Confirm, or compare against 'records_hierarchy_copy'.
pattern = "10.26434/chemrxiv-2022-fd190"


mask = data_clean_hierarchy['doi'].str.contains(pattern, regex=False, na=False)
result = data_clean_hierarchy[mask]
print(len(result))
result

In [None]:
# Distinct raw titles among the rows of the current `result`.
result['title'].value_counts()

In [None]:
# Distinct raw author strings among the rows of the current `result`.
result['authors_flat'].value_counts()

In [None]:
# Spot check: draw a few titles and show every row sharing one of them, with its label.
# Seeded so that re-running the notebook shows the same sample; change the seed to draw another one.
sample_titles = data_out['title'].sample(5, random_state=42)
data_out[data_out.title.isin(sample_titles)][['title','authors_flat','records_hierarchy','date_first_seen']]

In [None]:
# All rows whose DOI contains this DOI (plain substring match, missing DOIs excluded).
pattern = "10.26434/chemrxiv.13102877"


mask = data_out['doi'].str.contains(pattern, regex=False, na=False)
result = data_out[mask]
print(len(result))
result