In [1]:
import pandas as pd
# Set option to display all columns
pd.set_option('display.max_columns', None)


# A) Counts

## import data

In [2]:
import duckdb
from pathlib import Path

con = duckdb.connect()

# Low-memory settings
con.execute("PRAGMA threads=1;")
con.execute("PRAGMA preserve_insertion_order=false;")
con.execute("PRAGMA enable_object_cache=false;")
con.execute("PRAGMA memory_limit='2GB';")           # try 1GB if still unstable
con.execute("PRAGMA temp_directory='data/tmp_duckdb';")

# 2) Build paths robustly from the notebook folder
ROOT = Path.cwd()
if ROOT.name == "notebooks":
    ROOT = ROOT.parent

BASE = ROOT / "data" / "by_server"

# IMPORTANT: your files are hive-partitioned like:
all_backends = (BASE / "*" / "*.parquet").as_posix()

con.execute(f"""
CREATE OR REPLACE VIEW all_backends AS
SELECT * FROM read_parquet('{all_backends}', hive_partitioning=true, union_by_name=true);
""")

# A unified "all_rows" view
con.execute("""
CREATE OR REPLACE VIEW all_rows AS
SELECT * FROM all_backends
""")

print(con.execute("SHOW TABLES").fetchall())


[('all_backends',), ('all_rows',)]


In [3]:
con.execute("""
SELECT backend, COUNT(*) AS total, COUNT(record_id) AS with_record_id
FROM (
    SELECT backend, record_id FROM all_backends
)
GROUP BY backend
""").df()


Unnamed: 0,backend,total,with_record_id
0,crossref,3612833,3612833
1,datacite,3523368,3523368
2,openalex,2350692,2350692


## total count

In [4]:
con.execute("""
SELECT COUNT(DISTINCT record_id) AS total_dedup_records
FROM (
    SELECT record_id FROM all_backends
)
""").df()


Unnamed: 0,total_dedup_records
0,8410094


## count per backend

In [5]:
con.execute("""
SELECT
    backend,
    COUNT(DISTINCT record_id) AS dedup_records
FROM (
    SELECT backend, record_id FROM all_backends
)
GROUP BY backend
ORDER BY dedup_records DESC
""").df()


Unnamed: 0,backend,dedup_records
0,datacite,3520391
1,crossref,3187171
2,openalex,1702532


## count per server

In [6]:
con.execute("""
SELECT
    backend,
    server_name,
    COUNT(DISTINCT record_id) AS dedup_records
FROM (
    SELECT backend, server_name, record_id FROM all_backends
)
GROUP BY backend, server_name
ORDER BY dedup_records DESC
""").df()


Unnamed: 0,backend,server_name,dedup_records
0,datacite,arXiv,2920797
1,crossref,SSRN,1258958
2,openalex,HAL,1056424
3,crossref,Research Square,450818
4,openalex,RePEc: Research Papers in Economics,389398
...,...,...,...
108,crossref,Prepublicaciones OpenCiencia,8
109,crossref,Therapoid,7
110,crossref,NewAddictionsX,7
111,openalex,Bepress Legal Repository,6


In [7]:
server_name_df = con.execute("""
SELECT server_name, COUNT(DISTINCT record_id) AS n
FROM all_backends
GROUP BY 1
ORDER BY n DESC
LIMIT 200;
""").df()


In [8]:
server_name_df.head(60)

Unnamed: 0,server_name,n
0,arXiv,2920797
1,SSRN,1258958
2,HAL,1056424
3,Research Square,450818
4,RePEc: Research Papers in Economics,389398
5,bioRxiv,306948
6,eLife,247558
7,AgEcon Search,188173
8,ResearchGate,181231
9,Qeios,172316


In [9]:
server_name_df.tail(52)

Unnamed: 0,server_name,n
60,CERN document server,973
61,ARPHA Preprints,890
62,MetaArXiv,880
63,SportRxiv,878
64,Gates Open Research,863
65,AgriRxiv,818
66,Beilstein Archives,697
67,Covid-19 Preprints,647
68,EasyChair preprint,620
69,MarXiv,508


# B) Explorations

In [10]:
# con.execute(f"""
# CREATE OR REPLACE VIEW server_thin AS
# SELECT
#   CAST(record_id AS VARCHAR)           AS record_id,
#   CAST(server_name AS VARCHAR)         AS server_name,
#   CAST(backend AS VARCHAR)             AS backend,

#   CAST(doi AS VARCHAR)                 AS doi,
#   CAST(doi_url AS VARCHAR)             AS doi_url,
#   CAST(landing_page_url AS VARCHAR)    AS landing_page_url,

#   CAST(version_label AS VARCHAR)       AS version_label,

#   -- Relationships (keep these for true version links)
#   CAST(relations_json AS VARCHAR)       AS relations_json,
#   CAST(raw_relationships_json AS VARCHAR)       AS raw_relationships_json,
#   CAST(is_version_of AS VARCHAR)       AS is_version_of,      -- keep as text; we’ll interpret later
#   CAST(version_of_ids_json AS VARCHAR) AS version_of_ids_json,
#   CAST(is_preprint_of AS VARCHAR)      AS is_preprint_of,
#   CAST(published_version_ids_json AS VARCHAR) AS published_version_ids_json,

#   -- Dates (helpful for temporal patterns)
#   CAST(date_posted AS VARCHAR)         AS date_posted,
#   CAST(date_published AS VARCHAR)      AS date_published,
#   CAST(date_published_online AS VARCHAR)      AS date_published_online,
#   CAST(date_issued AS VARCHAR)         AS date_issued,
#   CAST(date_deposited AS VARCHAR)      AS date_deposited,
#   CAST(date_indexed AS VARCHAR)        AS date_indexed,
#   CAST(date_created AS VARCHAR)        AS date_created,
#   CAST(date_registered AS VARCHAR)     AS date_registered,
#   CAST(date_updated AS VARCHAR)        AS date_updated,
#   CAST(publication_year AS VARCHAR)    AS publication_year
# FROM all_backends
# """)

# con.execute("SELECT COUNT(*) AS n FROM server_thin").df()


In [11]:
con.execute(f"""
CREATE OR REPLACE VIEW server_thin AS
SELECT
  CAST(record_id AS VARCHAR)           AS record_id,
  CAST(server_name AS VARCHAR)         AS server_name,
  CAST(backend AS VARCHAR)             AS backend,

  CAST(doi AS VARCHAR)                 AS doi,
  CAST(doi_url AS VARCHAR)             AS doi_url,
  CAST(landing_page_url AS VARCHAR)    AS landing_page_url,

  CAST(title AS VARCHAR) AS title,
  -- CAST(abstract_text AS VARCHAR)      AS abstract_text,
  CAST(authors_flat AS VARCHAR)      AS authors_flat,
  
  -- Dates (helpful for temporal patterns)
  CAST(publication_year AS VARCHAR)    AS publication_year,
  CAST(date_created AS VARCHAR)        AS date_created,
  -- CAST(date_posted AS VARCHAR)         AS date_posted,
  -- CAST(date_deposited AS VARCHAR)      AS date_deposited,

  -- Relationships (keep these for true version links)
  CAST(relations_json AS VARCHAR)       AS relations_json,
  CAST(version_label AS VARCHAR)       AS version_label,
  CAST(is_version_of AS VARCHAR)       AS is_version_of,      -- keep as text; we’ll interpret later
  CAST(is_preprint_of AS VARCHAR)      AS is_preprint_of,
  CAST(has_preprint AS VARCHAR)      AS has_preprint,
  CAST(has_review AS VARCHAR)      AS has_review,
  CAST(has_published_version AS VARCHAR)      AS has_published_version,
  CAST(published_version_ids_json AS VARCHAR) AS published_version_ids_json,
  CAST(version_of_ids_json AS VARCHAR) AS version_of_ids_json,
  CAST(update_to_json AS VARCHAR)      AS update_to_json,
  CAST(raw_relationships_json AS VARCHAR)       AS raw_relationships_json,
FROM all_backends
""")

con.execute("SELECT COUNT(*) AS n FROM server_thin").df()


Unnamed: 0,n
0,9486893


## Global Exploration

### Duplicates

In [12]:
data = con.execute("SELECT * FROM server_thin").df()
# data.drop_duplicates(subset=['record_id'], keep='first', inplace=False)

data = data.drop_duplicates()
data

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json
0,crossref::10.21467/preprints.48,AIJR Preprints,crossref,10.21467/preprints.48,https://doi.org/10.21467/preprints.48,https://preprints.aijr.org/index.php/ap/prepri...,"Bird’s Eye View on the Diagnosis, Treatment, &...","Panchalingala, Sai Bhargavi",2020.0,2020-09-15,,,,,,,false,,,,
1,crossref::10.21467/preprints.43,AIJR Preprints,crossref,10.21467/preprints.43,https://doi.org/10.21467/preprints.43,https://preprints.aijr.org/index.php/ap/prepri...,Doxycycline and Minocycline Drugs as a Treatme...,"Mostafa, Mohamed",2020.0,2020-09-15,,,,,,,false,,,,
2,crossref::10.21467/preprints.39,AIJR Preprints,crossref,10.21467/preprints.39,https://doi.org/10.21467/preprints.39,https://preprints.aijr.org/index.php/ap/prepri...,A Genetic Perspective of 2019-nCoV in Relation...,"Dasgupta, Rimjhim",2020.0,2020-09-15,,,,,,,false,,,,
3,crossref::10.21467/preprints.38,AIJR Preprints,crossref,10.21467/preprints.38,https://doi.org/10.21467/preprints.38,https://preprints.aijr.org/index.php/ap/prepri...,Marine Algae as a Natural Source for Antiviral...,"Musale, Amar S; G., Raja Krishna Kumar; Sapre,...",2020.0,2020-09-17,,,,,,,false,,,,
4,crossref::10.21467/preprints.36,AIJR Preprints,crossref,10.21467/preprints.36,https://doi.org/10.21467/preprints.36,https://preprints.aijr.org/index.php/ap/prepri...,Possible Prevention of COVID 19 by Using Linol...,"Subhash, Venkata; G, Raja Krishna Kumar; Sapre...",2020.0,2020-09-17,,,,,,,false,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9486888,openalex::W999325625,viXra,openalex,,,https://vixra.org/pdf/1409.0090v1.pdf,Three Objections to Modern Physics,Lubomir Vlcek,2014.0,2025-10-10T00:00:00,,,,,,,,,,,
9486889,openalex::W999460032,viXra,openalex,,,https://vixra.org/abs/1112.0094,Particle Mass Ratios,DT Froedge,2011.0,2025-10-10T00:00:00,,,,,,,,,,,
9486890,openalex::W99967155,viXra,openalex,,,https://vixra.org/pdf/1406.0019v1.pdf,Quantum FFF Theory Proposals for Some Unsolved...,Leo Vuyk,2014.0,2025-10-10T00:00:00,,,,,,,,,,,
9486891,openalex::W999790414,viXra,openalex,,,https://vixra.org/pdf/1306.0105v3.pdf,Investigation of the Formalism of Particle Dyn...,Chi-Yi Chen,2013.0,2025-10-10T00:00:00,,,,,,,,,,,


In [13]:
dupes = data[data.duplicated(subset=['record_id'], keep=False)]
dupes

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json
15570,crossref::10.31124/advance.24454624.v1,Advance,crossref,10.31124/advance.24454624.v1,https://doi.org/10.31124/advance.24454624.v1,https://advance.sagepub.com/doi/full/10.31124/...,Ancient Social Motive Simulation Hypothesis of...,"Thomas, Frederick",2024.0,2024-02-05,,,,,,,false,,,,
16016,crossref::10.31124/advance.170921771.12975902/v1,Advance,crossref,10.31124/advance.170921771.12975902/v1,https://doi.org/10.31124/advance.170921771.129...,https://advance.sagepub.com/users/748375/artic...,ScholarOne - The Colonial Origin of Population...,"Saijo, Harunobu; Xu, Crystal; Zhang, Anna",2024.0,2024-02-29,,,,,,,false,,,,
232722,crossref::10.22541/essoar.167016857.73309909.1,Authorea Inc.,crossref,10.22541/essoar.167016857.73309909.1,https://doi.org/10.22541/essoar.167016857.7330...,https://www.authorea.com/users/525275/articles...,Non-native hosts of an invasive seaweed holobi...,"Author, Albert",2022.0,2022-12-04,,,,,,,false,,,,
232752,crossref::10.22541/essoar.167024940.09387576/v1,Authorea Inc.,crossref,10.22541/essoar.167024940.09387576/v1,https://doi.org/10.22541/essoar.167024940.0938...,https://www.authorea.com/users/525275/articles...,(test) IL-36γ in Enthesitis related Juvenile I...,"Author, Albert",2022.0,2022-12-05,,,,,,,false,,,,
232822,crossref::10.22541/essoar.167045090.02735580/v1,Authorea Inc.,crossref,10.22541/essoar.167045090.02735580/v1,https://doi.org/10.22541/essoar.167045090.0273...,https://www.authorea.com/users/563627/articles...,LROCNet: Detecting Impact Ejecta and Older Cra...,"Dunkel, Emily; Lu, Steven; Grimes, Kevin; McAu...",2022.0,2022-12-07,,,,,,,false,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
446146,crossref::10.22541/essoar.170365313.34695479/v1,Earth and Space Science Open Archive,crossref,10.22541/essoar.170365313.34695479/v1,https://doi.org/10.22541/essoar.170365313.3469...,https://essopenarchive.org/users/531775/articl...,Analysis of the IGS contribution to ITRF2020,"Rebischung, Paul; Altamimi, Zuheir; Métivier, ...",2023.0,2023-12-27,,,,,,,false,,,,
446148,crossref::10.22541/essoar.170680200.06135599/v2,Earth and Space Science Open Archive,crossref,10.22541/essoar.170680200.06135599/v2,https://doi.org/10.22541/essoar.170680200.0613...,https://essopenarchive.org/users/672150/articl...,Long-term trends in the extremes and variabili...,"Zhai, Dongran; Beaulieu, Claudie; Kudela, Raphael",2024.0,2024-02-29,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.22541/essoar.170680200.06135599/v1,,,,false,,,,
446149,crossref::10.22541/essoar.170923255.57545328/v1,Earth and Space Science Open Archive,crossref,10.22541/essoar.170923255.57545328/v1,https://doi.org/10.22541/essoar.170923255.5754...,https://essopenarchive.org/users/528524/articl...,Data Drought in the Humid Tropics: How to Over...,"Frankenberg, Christian; Bar-On, Yinon Moise; Y...",2024.0,2024-02-29,,,,,,,false,,,,
446341,crossref::10.22541/essoar.170612447.73077522/v1,Earth and Space Science Open Archive,crossref,10.22541/essoar.170612447.73077522/v1,https://doi.org/10.22541/essoar.170612447.7307...,https://essopenarchive.org/doi/full/10.22541/e...,Towards Semi-Autonomous Robotic Arm Manipulati...,"Alharthi, Abdullah; Tokatli, Ozan; Lopez, Erwi...",2024.0,2024-01-24,,,,,,,false,,,,


In [14]:
dupes['server_name'].value_counts()

server_name
Authorea Inc.                           3432
Earth and Space Science Open Archive    3430
Advance                                    2
Name: count, dtype: int64

In [15]:
data[data['record_id']=='crossref::10.31124/advance.24454624.v1']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json
15570,crossref::10.31124/advance.24454624.v1,Advance,crossref,10.31124/advance.24454624.v1,https://doi.org/10.31124/advance.24454624.v1,https://advance.sagepub.com/doi/full/10.31124/...,Ancient Social Motive Simulation Hypothesis of...,"Thomas, Frederick",2024.0,2024-02-05,,,,,,,False,,,,
273723,crossref::10.31124/advance.24454624.v1,Authorea Inc.,crossref,10.31124/advance.24454624.v1,https://doi.org/10.31124/advance.24454624.v1,https://advance.sagepub.com/doi/full/10.31124/...,Ancient Social Motive Simulation Hypothesis of...,"Thomas, Frederick",2024.0,2024-02-05,,,,,,,False,,,,


In [16]:
data[data['record_id']=='crossref::10.22541/essoar.170923255.57545328/v1']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json
250777,crossref::10.22541/essoar.170923255.57545328/v1,Authorea Inc.,crossref,10.22541/essoar.170923255.57545328/v1,https://doi.org/10.22541/essoar.170923255.5754...,https://essopenarchive.org/users/528524/articl...,Data Drought in the Humid Tropics: How to Over...,"Frankenberg, Christian; Bar-On, Yinon Moise; Y...",2024.0,2024-02-29,,,,,,,False,,,,
446149,crossref::10.22541/essoar.170923255.57545328/v1,Earth and Space Science Open Archive,crossref,10.22541/essoar.170923255.57545328/v1,https://doi.org/10.22541/essoar.170923255.5754...,https://essopenarchive.org/users/528524/articl...,Data Drought in the Humid Tropics: How to Over...,"Frankenberg, Christian; Bar-On, Yinon Moise; Y...",2024.0,2024-02-29,,,,,,,False,,,,


#### Resolution

In [17]:
import pandas as pd

# ----------------------------------
# 1) Define server priority
#    Lower value = higher priority (kept first)
# ----------------------------------
server_priority = {
    "Earth and Space Science Open Archive": 1,
    "Advance": 2,
    "Authorea Inc.": 3
}

# Work on a copy to avoid side effects
data = data.copy()

# ----------------------------------
# 2) Add priority column
#    Unknown servers get lowest priority
# ----------------------------------
data['server_priority'] = (
    data['server_name']
    .map(server_priority)
    .fillna(99)
    .astype(int)
)

# ----------------------------------
# 3) Deduplicate STRICTLY on record_id
#    - Sort so preferred server comes first
#    - Keep only the best row per record_id
# ----------------------------------
data_clean = (
    data
    .sort_values(by=['record_id', 'server_priority'])
    .drop_duplicates(subset=['record_id'], keep='first')
    .drop(columns=['server_priority'])
)

# ----------------------------------
# 4) (Optional but recommended) Inspect removals
# ----------------------------------
removed = data.loc[~data.index.isin(data_clean.index)]

print("Removed rows by server_name:")
print(removed['server_name'].value_counts(dropna=False))

print("\nRows before:", len(data))
print("Rows after :", len(data_clean))
print("Rows removed:", len(removed))

# ----------------------------------
# data_clean is the final deduplicated dataframe
# ----------------------------------


Removed rows by server_name:
server_name
Authorea Inc.    3432
Name: count, dtype: int64

Rows before: 8413526
Rows after : 8410094
Rows removed: 3432


In [18]:
removed

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,server_priority
232722,crossref::10.22541/essoar.167016857.73309909.1,Authorea Inc.,crossref,10.22541/essoar.167016857.73309909.1,https://doi.org/10.22541/essoar.167016857.7330...,https://www.authorea.com/users/525275/articles...,Non-native hosts of an invasive seaweed holobi...,"Author, Albert",2022.0,2022-12-04,,,,,,,false,,,,,3
232752,crossref::10.22541/essoar.167024940.09387576/v1,Authorea Inc.,crossref,10.22541/essoar.167024940.09387576/v1,https://doi.org/10.22541/essoar.167024940.0938...,https://www.authorea.com/users/525275/articles...,(test) IL-36γ in Enthesitis related Juvenile I...,"Author, Albert",2022.0,2022-12-05,,,,,,,false,,,,,3
232822,crossref::10.22541/essoar.167045090.02735580/v1,Authorea Inc.,crossref,10.22541/essoar.167045090.02735580/v1,https://doi.org/10.22541/essoar.167045090.0273...,https://www.authorea.com/users/563627/articles...,LROCNet: Detecting Impact Ejecta and Older Cra...,"Dunkel, Emily; Lu, Steven; Grimes, Kevin; McAu...",2022.0,2022-12-07,,,,,,,false,,,,,3
232890,crossref::10.22541/essoar.167065826.69236575/v1,Authorea Inc.,crossref,10.22541/essoar.167065826.69236575/v1,https://doi.org/10.22541/essoar.167065826.6923...,https://www.authorea.com/users/565295/articles...,A simulation study for future geodetic satelli...,"Najder, Joanna Marta; Sośnica, Krzysztof Jakub...",2022.0,2022-12-10,,,,,,,false,,,,,3
232903,crossref::10.22541/essoar.167079456.68733298/v1,Authorea Inc.,crossref,10.22541/essoar.167079456.68733298/v1,https://doi.org/10.22541/essoar.167079456.6873...,https://www.authorea.com/users/525475/articles...,Test Document,"Hanson, R. Brooks",2022.0,2022-12-11,,,,,,,false,,,,,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273720,crossref::10.1002/essoar.10510558.3,Authorea Inc.,crossref,10.1002/essoar.10510558.3,https://doi.org/10.1002/essoar.10510558.3,https://essopenarchive.org/users/968711/articl...,Eddy covariance data reveal that a small fresh...,"Hounshell, Alexandria G; D'Acunha, Brenda M; B...",2022.0,2022-12-13,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.1002/essoar.10510558.1,,,,false,,,,,3
273721,crossref::10.1002/essoar.10505617.2,Authorea Inc.,crossref,10.1002/essoar.10505617.2,https://doi.org/10.1002/essoar.10505617.2,https://essopenarchive.org/users/545576/articl...,COVID-19 and Social Vulnerabilities in Virgini...,"Patel, Parthay; Patel, Bhaumik",2025.0,2025-10-23,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.1002/essoar.10505617.1,,,,false,,,,,3
273722,crossref::10.1002/essoar.10505617.3,Authorea Inc.,crossref,10.1002/essoar.10505617.3,https://doi.org/10.1002/essoar.10505617.3,https://essopenarchive.org/users/545576/articl...,COVID-19 and Social Vulnerabilities in Virgini...,"Patel, Parthay; Patel, Bhaumik",2025.0,2025-11-13,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.1002/essoar.10505617.1,,,,false,,,,,3
273723,crossref::10.31124/advance.24454624.v1,Authorea Inc.,crossref,10.31124/advance.24454624.v1,https://doi.org/10.31124/advance.24454624.v1,https://advance.sagepub.com/doi/full/10.31124/...,Ancient Social Motive Simulation Hypothesis of...,"Thomas, Frederick",2024.0,2024-02-05,,,,,,,false,,,,,3


In [19]:
removed['server_name'].value_counts()

server_name
Authorea Inc.    3432
Name: count, dtype: int64

In [20]:
data_clean.count()

record_id                     8410094
server_name                   8410094
backend                       8410094
doi                           6784859
doi_url                       6784859
landing_page_url              8330648
title                         8410069
authors_flat                  8194686
publication_year              8264048
date_created                  8410094
relations_json                4253292
version_label                 2974496
is_version_of                 6707562
is_preprint_of                6707562
has_preprint                  6707562
has_review                    6707562
has_published_version         6707562
published_version_ids_json          0
version_of_ids_json                 0
update_to_json                   8899
raw_relationships_json        3520391
dtype: int64

### Clean columns

In [21]:
data_clean["title"] = (
    data_clean["title"]
    .astype(str)
    .str.strip()
    .replace({"": pd.NA, "None": pd.NA, "null": pd.NA, "nan": pd.NA,
              "N/A": pd.NA, "[]": pd.NA, "{}": pd.NA})
)


In [22]:
data_clean["doi"] = (
    data_clean["doi"]
    .astype(str)
    .str.strip()
    .replace({"": pd.NA, "None": pd.NA, "null": pd.NA, "nan": pd.NA,
              "N/A": pd.NA, "[]": pd.NA, "{}": pd.NA})
)


In [23]:
data_clean["authors_flat"] = (
    data_clean["authors_flat"]
    .astype(str)
    .str.strip()
    .replace({"": pd.NA, "None": pd.NA, "null": pd.NA, "nan": pd.NA,
              "N/A": pd.NA, "[]": pd.NA, "{}": pd.NA})
)


In [24]:
data_clean["landing_page_url"] = (
    data_clean["landing_page_url"]
    .astype(str)
    .str.strip()
    .replace({"": pd.NA, "None": pd.NA, "null": pd.NA, "nan": pd.NA,
              "N/A": pd.NA, "[]": pd.NA, "{}": pd.NA})
)


In [25]:
data_clean.shape

(8410094, 21)

In [26]:
data_clean.count()

record_id                     8410094
server_name                   8410094
backend                       8410094
doi                           6784859
doi_url                       6784859
landing_page_url              8330648
title                         8408457
authors_flat                  8194673
publication_year              8264048
date_created                  8410094
relations_json                4253292
version_label                 2974496
is_version_of                 6707562
is_preprint_of                6707562
has_preprint                  6707562
has_review                    6707562
has_published_version         6707562
published_version_ids_json          0
version_of_ids_json                 0
update_to_json                   8899
raw_relationships_json        3520391
dtype: int64

### Summary

In [27]:
import pandas as pd
import numpy as np

data = data_clean.copy()

# -------------------------------------------------
# Configuration
# -------------------------------------------------
GROUP_COL = 'server_name'

MISSINGNESS_COLUMNS = [
    'doi',
    'landing_page_url'
]

COMPLETENESS_COLUMNS = [
    'version_label',
    'relations_json',
    'is_version_of',
    'is_preprint_of',
    'has_review',
    'has_preprint',
    'has_published_version',
    'version_of_ids_json',
    'update_to_json',
    'published_version_ids_json',
    'raw_relationships_json'
]

# -------------------------------------------------
# Helper functions
# -------------------------------------------------
def is_missing(series):
    """Strict missingness (NaN / None)"""
    return series.isna()

def is_incomplete(series):
    """
    Missing OR empty content:
    - NaN
    - empty string
    - empty list
    - empty dict
    """
    return (
        series.isna()
        | (series.astype(str).str.strip() == '')
        | (series.astype(str).isin(['[]', '{}']))
    )

# -------------------------------------------------
# Base counts per server
# -------------------------------------------------
base = (
    data
    .groupby(GROUP_COL, dropna=False)
    .size()
    .rename('total_records')
    .to_frame()
)

# -------------------------------------------------
# Missingness metrics
# -------------------------------------------------
for col in MISSINGNESS_COLUMNS:
    missing_count = (
        data.loc[is_missing(data[col])]
        .groupby(GROUP_COL, dropna=False)
        .size()
        .rename(f'{col}_missing_count')
    )

    base = base.join(missing_count, how='left').fillna(0)
    base[f'{col}_missing_count'] = base[f'{col}_missing_count'].astype(int)
    base[f'{col}_missing_percent'] = (
        base[f'{col}_missing_count'] / base['total_records'] * 100
    ).round(2)

# -------------------------------------------------
# Completeness metrics
# -------------------------------------------------
for col in COMPLETENESS_COLUMNS:
    incomplete_count = (
        data.loc[is_incomplete(data[col])]
        .groupby(GROUP_COL, dropna=False)
        .size()
        .rename(f'{col}_incomplete_count')
    )

    base = base.join(incomplete_count, how='left').fillna(0)
    base[f'{col}_incomplete_count'] = base[f'{col}_incomplete_count'].astype(int)
    base[f'{col}_incomplete_percent'] = (
        base[f'{col}_incomplete_count'] / base['total_records'] * 100
    ).round(2)

# -------------------------------------------------
# Final table
# -------------------------------------------------
summary = base.sort_values('total_records', ascending=False)

# summary


In [28]:
summary.head(60)

Unnamed: 0_level_0,total_records,doi_missing_count,doi_missing_percent,landing_page_url_missing_count,landing_page_url_missing_percent,version_label_incomplete_count,version_label_incomplete_percent,relations_json_incomplete_count,relations_json_incomplete_percent,is_version_of_incomplete_count,is_version_of_incomplete_percent,is_preprint_of_incomplete_count,is_preprint_of_incomplete_percent,has_review_incomplete_count,has_review_incomplete_percent,has_preprint_incomplete_count,has_preprint_incomplete_percent,has_published_version_incomplete_count,has_published_version_incomplete_percent,version_of_ids_json_incomplete_count,version_of_ids_json_incomplete_percent,update_to_json_incomplete_count,update_to_json_incomplete_percent,published_version_ids_json_incomplete_count,published_version_ids_json_incomplete_percent,raw_relationships_json_incomplete_count,raw_relationships_json_incomplete_percent
server_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
arXiv,2920797,0,0.0,0,0.0,0,0.0,1638047,56.08,2920797,100.0,1638164,56.09,2920797,100.0,2920797,100.0,0,0.0,2920797,100.0,2920797,100.0,2920797,100.0,0,0.0
SSRN,1258958,0,0.0,0,0.0,1258938,100.0,1258703,99.98,1258958,100.0,1258703,99.98,1258958,100.0,1258958,100.0,0,0.0,1258958,100.0,1258938,100.0,1258958,100.0,1258958,100.0
HAL,1056424,1024809,97.01,103,0.01,1056424,100.0,1056424,100.0,1056424,100.0,1056424,100.0,1056424,100.0,1056424,100.0,1056424,100.0,1056424,100.0,1056424,100.0,1056424,100.0,1056424,100.0
Research Square,450818,0,0.0,0,0.0,450813,100.0,283964,62.99,435544,96.61,295396,65.52,450818,100.0,450818,100.0,0,0.0,450818,100.0,450813,100.0,450818,100.0,450818,100.0
RePEc: Research Papers in Economics,389398,364045,93.49,39749,10.21,389398,100.0,389398,100.0,389398,100.0,389398,100.0,389398,100.0,389398,100.0,389398,100.0,389398,100.0,389398,100.0,389398,100.0,389398,100.0
bioRxiv,306948,0,0.0,0,0.0,306937,100.0,196496,64.02,306947,100.0,196497,64.02,306948,100.0,306948,100.0,0,0.0,306948,100.0,306937,100.0,306948,100.0,306948,100.0
eLife,247558,0,0.0,0,0.0,247192,99.85,151705,61.28,233206,94.2,247537,99.99,247558,100.0,238270,96.25,0,0.0,247558,100.0,247192,99.85,247558,100.0,247558,100.0
AgEcon Search,188173,0,0.0,0,0.0,188173,100.0,188173,100.0,188173,100.0,188173,100.0,188173,100.0,188173,100.0,0,0.0,188173,100.0,188173,100.0,188173,100.0,0,0.0
ResearchGate,181231,0,0.0,0,0.0,174949,96.53,174741,96.42,174741,96.42,181231,100.0,181231,100.0,181231,100.0,0,0.0,181231,100.0,181231,100.0,181231,100.0,0,0.0
Qeios,172316,0,0.0,0,0.0,172316,100.0,111907,64.94,167745,97.35,172316,100.0,172316,100.0,172316,100.0,0,0.0,172316,100.0,172316,100.0,172316,100.0,172316,100.0


In [29]:
summary.tail(52)

Unnamed: 0_level_0,total_records,doi_missing_count,doi_missing_percent,landing_page_url_missing_count,landing_page_url_missing_percent,version_label_incomplete_count,version_label_incomplete_percent,relations_json_incomplete_count,relations_json_incomplete_percent,is_version_of_incomplete_count,is_version_of_incomplete_percent,is_preprint_of_incomplete_count,is_preprint_of_incomplete_percent,has_review_incomplete_count,has_review_incomplete_percent,has_preprint_incomplete_count,has_preprint_incomplete_percent,has_published_version_incomplete_count,has_published_version_incomplete_percent,version_of_ids_json_incomplete_count,version_of_ids_json_incomplete_percent,update_to_json_incomplete_count,update_to_json_incomplete_percent,published_version_ids_json_incomplete_count,published_version_ids_json_incomplete_percent,raw_relationships_json_incomplete_count,raw_relationships_json_incomplete_percent
server_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
CERN document server,973,0,0.0,0,0.0,933,95.89,46,4.73,973,100.0,443,45.53,973,100.0,973,100.0,0,0.0,973,100.0,973,100.0,973,100.0,0,0.0
ARPHA Preprints,890,0,0.0,0,0.0,890,100.0,578,64.94,890,100.0,578,64.94,890,100.0,890,100.0,0,0.0,890,100.0,890,100.0,890,100.0,890,100.0
MetaArXiv,880,0,0.0,0,0.0,880,100.0,681,77.39,813,92.39,742,84.32,880,100.0,880,100.0,0,0.0,880,100.0,880,100.0,880,100.0,880,100.0
SportRxiv,878,0,0.0,0,0.0,878,100.0,794,90.43,877,99.89,795,90.55,878,100.0,878,100.0,0,0.0,878,100.0,878,100.0,878,100.0,878,100.0
Gates Open Research,863,0,0.0,0,0.0,539,62.46,83,9.62,799,92.58,863,100.0,128,14.83,835,96.76,0,0.0,863,100.0,539,62.46,863,100.0,863,100.0
AgriRxiv,818,0,0.0,0,0.0,818,100.0,805,98.41,818,100.0,805,98.41,818,100.0,818,100.0,0,0.0,818,100.0,818,100.0,818,100.0,818,100.0
Beilstein Archives,697,0,0.0,0,0.0,697,100.0,236,33.86,697,100.0,236,33.86,697,100.0,697,100.0,0,0.0,697,100.0,697,100.0,697,100.0,697,100.0
Covid-19 Preprints,647,0,0.0,0,0.0,647,100.0,647,100.0,647,100.0,647,100.0,647,100.0,647,100.0,0,0.0,647,100.0,647,100.0,647,100.0,647,100.0
EasyChair preprint,620,0,0.0,0,0.0,620,100.0,620,100.0,620,100.0,620,100.0,620,100.0,620,100.0,0,0.0,620,100.0,620,100.0,620,100.0,620,100.0
MarXiv,508,0,0.0,0,0.0,508,100.0,249,49.02,506,99.61,251,49.41,508,100.0,508,100.0,0,0.0,508,100.0,508,100.0,508,100.0,508,100.0


In [30]:
data_clean

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json
429739,crossref::10.1002/essoar.10500000.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500000.1,https://doi.org/10.1002/essoar.10500000.1,https://essopenarchive.org/doi/full/10.1002/es...,Long-term monitoring of land surface phenologi...,"Tsutsumida, Narumasa",2018.0,2019-11-13,,,,,,,false,,,,
429742,crossref::10.1002/essoar.10500002.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500002.1,https://doi.org/10.1002/essoar.10500002.1,https://essopenarchive.org/doi/full/10.1002/es...,Impact of spatial scale for phenological indic...,"Tsutsumida, Narumasa; Kaduk, Jörg",2018.0,2019-11-13,,,,,,,false,,,,
429740,crossref::10.1002/essoar.10500004.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500004.1,https://doi.org/10.1002/essoar.10500004.1,https://essopenarchive.org/doi/full/10.1002/es...,Observations of Low Latitude Red Aurora in Mex...,"Gonzalez-Esparza, J. Americo; Cuevas-Cardona, ...",2018.0,2019-11-13,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,10.1029/2017sw001789;10.1029/2018sw001995,,,true,,,,
429741,crossref::10.1002/essoar.10500007.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500007.1,https://doi.org/10.1002/essoar.10500007.1,https://essopenarchive.org/doi/full/10.1002/es...,Pipeline oil fire detection with MODIS active ...,"Ogungbuyi, Michael Gbenga; Martinez, Peter; Ec...",2018.0,2019-11-13,,,,,,,false,,,,
429746,crossref::10.1002/essoar.10500009.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500009.1,https://doi.org/10.1002/essoar.10500009.1,https://essopenarchive.org/doi/full/10.1002/es...,Land Product Validation of MODIS Derived FPAR ...,"Sharp, Iain; Sanchez-Azofeifa, Arturo; Musilek...",2018.0,2019-12-03,,,,,,,false,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2806404,openalex::W999921877,RePEc: Research Papers in Economics,openalex,,,https://steconomice.uoradea.ro/anale/volume/20...,IDE sous l'influence du degré de l'intégration...,Simona-Gabriela Serbu Masca,2008.0,2025-10-10T00:00:00,,,,,,,,,,,
1016201,openalex::W999947037,HAL,openalex,,,https://hal.science/hal-01922484,Building realistic potential patients queries ...,Lorraine Goeuriot; Wendy W. Chapman; Gareth Jf...,2014.0,2025-10-10T00:00:00,,,,,,,,,,,
2713111,openalex::W999974616,RePEc: Research Papers in Economics,openalex,,,https://EconPapers.repec.org/RePEc:cde:cdewps:19,Debt Financing with Limited Liability and Quan...,Krishnendu Ghosh Dastidar,1994.0,2025-10-10T00:00:00,,,,,,,,,,,
1015101,openalex::W999989114,HAL,openalex,,,https://hal.science/hal-01268467,Sustainable orchards' redesign: at the crossro...,Servane Penvern; Sylvaine Simon; Stéphane Bell...,2012.0,2025-10-10T00:00:00,,,,,,,,,,,


## Manage hierarchy

### check those who have are version

In [31]:
pattern = "ew version"

mask = data_clean['version_label'].str.contains(pattern, regex=False, na=False)
result = data_clean[mask]
result

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json
2121749,crossref::10.12688/aasopenres.12825.2,Open Research Africa,crossref,10.12688/aasopenres.12825.2,https://doi.org/10.12688/aasopenres.12825.2,https://aasopenresearch.org/articles/1-1/v2,Prevalence of chloroquine and antifolate drug ...,"Abugri, James; Ansah, Felix; Asante, Kwaku P.;...",2018.0,2018-12-03,"{""has-review"": [{""asserted-by"": ""subject"", ""id...",New version,,,,10.21956/aasopenres.13998.r26676,false,,,"[{""DOI"": ""10.12688/aasopenres.12825.1"", ""label...",
2121794,crossref::10.12688/aasopenres.12832.2,Open Research Africa,crossref,10.12688/aasopenres.12832.2,https://doi.org/10.12688/aasopenres.12832.2,https://aasopenresearch.org/articles/1-3/v2,The Collaborative African Genomics Network (CA...,"Mboowa, Gerald; Mwesigwa, Savannah; Katagirya,...",2018.0,2018-06-21,"{""has-review"": [{""asserted-by"": ""subject"", ""id...",New version,,,,10.21956/aasopenres.13951.r26487,false,,,"[{""DOI"": ""10.12688/aasopenres.12832.1"", ""label...",
2121813,crossref::10.12688/aasopenres.12837.2,Open Research Africa,crossref,10.12688/aasopenres.12837.2,https://doi.org/10.12688/aasopenres.12837.2,https://aasopenresearch.org/articles/1-12/v2,Microbiological assessment of sachet water “pu...,"Mosi, Lydia; Adadey, Samuel Mawuli; Sowah, San...",2019.0,2019-01-24,"{""has-review"": [{""asserted-by"": ""subject"", ""id...",New version,,,,10.21956/aasopenres.14017.r26741;10.21956/aaso...,false,,,"[{""DOI"": ""10.12688/aasopenres.12837.1"", ""label...",
2121910,crossref::10.12688/aasopenres.12841.2,Open Research Africa,crossref,10.12688/aasopenres.12841.2,https://doi.org/10.12688/aasopenres.12841.2,https://aasopenresearch.org/articles/1-2/v2,Ethical and scientific considerations on the e...,"Elliott, Alison M.; Roestenberg, Meta; Wajja, ...",2018.0,2018-08-06,"{""has-review"": [{""asserted-by"": ""subject"", ""id...",New version,,,,10.21956/aasopenres.13967.r26567,false,,,"[{""DOI"": ""10.12688/aasopenres.12841.1"", ""label...",
2121901,crossref::10.12688/aasopenres.12844.2,Open Research Africa,crossref,10.12688/aasopenres.12844.2,https://doi.org/10.12688/aasopenres.12844.2,https://aasopenresearch.org/articles/1-13/v2,Model framework for governance of genomic rese...,"Yakubu, Aminu; Tindana, Paulina; Matimba, Alic...",2018.0,2018-12-12,,New version,,,,,false,,,"[{""DOI"": ""10.12688/aasopenres.12844.1"", ""label...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
572348,crossref::10.3410/f1000research.1-6.v2,F1000Research,crossref,10.3410/f1000research.1-6.v2,https://doi.org/10.3410/f1000research.1-6.v2,http://f1000research.com/articles/1-6/v2,Sirenomelia in a Cameroonian woman: a case rep...,"Morfaw, Frederick LI; Nana, Philip N",2012.0,2015-06-25,,New version,,,,,false,,,"[{""DOI"": ""10.3410/f1000research.1-6.v1"", ""labe...",
5805088,datacite::10.5281/zenodo.16753537,Zenodo,datacite,10.5281/zenodo.16753537,https://doi.org/10.5281/zenodo.16753537,https://zenodo.org/doi/10.5281/zenodo.16753537,Temperature-driven tunability of a vanadium di...,"Francesco, Scotognella",2025.0,2025-08-06,"[{""relatedIdentifier"": ""10.5281/zenodo.1743480...",2 [New version of the manuscript uploaded on A...,,,,,false,,,,"{""client"": {""data"": {""id"": ""cern.zenodo"", ""typ..."
5840882,datacite::10.5281/zenodo.17434801,Zenodo,datacite,10.5281/zenodo.17434801,https://doi.org/10.5281/zenodo.17434801,https://zenodo.org/doi/10.5281/zenodo.17434801,Temperature-driven tunability of a vanadium di...,"Francesco, Scotognella",2025.0,2025-10-24,"[{""relatedIdentifier"": ""10.5281/zenodo.1675353...",2 [New version of the manuscript uploaded on A...,,10.5281/zenodo.16753537,,,true,,,,"{""client"": {""data"": {""id"": ""cern.zenodo"", ""typ..."
5883185,datacite::10.5281/zenodo.17980589,Zenodo,datacite,10.5281/zenodo.17980589,https://doi.org/10.5281/zenodo.17980589,https://zenodo.org/doi/10.5281/zenodo.17980589,Singularity-Free Planck-Scale Dual-Unified Ene...,"Nand, Gokula",2025.0,2025-12-18,"[{""relatedIdentifier"": ""10.5281/zenodo.1800041...",new version,,,,,false,,,,"{""client"": {""data"": {""id"": ""cern.zenodo"", ""typ..."


In [32]:
result['server_name'].value_counts()

server_name
F1000Research             5651
Wellcome Open Research    1299
Open Research Europe       698
HRB Open Research          361
Gates Open Research        324
Open Research Africa       100
AMRC Open Research          11
MNI Open Research            6
Zenodo                       4
VeriXiv                      2
Name: count, dtype: int64

In [33]:
df_version = data_clean.copy()

# Normalize version_label once
vl = df_version["version_label"].astype(str).str.strip().str.lower()

# CHILD rules
child_mask = (
    # mask_remain &
    vl.isin(["new version", 'new version; retraction'])
)
df_version.loc[child_mask, "records_hierarchy"] = "version"

# correction rules
correction_mask = (
    # mask_remain &
    vl.isin(["correction"])
)
df_version.loc[correction_mask, "records_hierarchy"] = "correction"

print(df_version["records_hierarchy"].value_counts(dropna=False))

records_hierarchy
NaN           8401285
version          8454
correction        355
Name: count, dtype: int64


In [34]:
df_label = df_version[df_version['records_hierarchy'].notna()]
df_label['server_name'].value_counts()

server_name
F1000Research              5651
Wellcome Open Research     1299
Open Research Europe        698
HRB Open Research           361
eLife                       354
Gates Open Research         324
Open Research Africa        100
AMRC Open Research           11
MNI Open Research             6
VeriXiv                       2
Zenodo                        2
WikiJournal of Medicine       1
Name: count, dtype: int64

In [35]:
pattern = "is-version-of"

mask = df_version['relations_json'].str.contains(pattern, regex=False, na=False)
result = df_version[mask]
result

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy
442472,crossref::10.1002/essoar.10503117.2,Earth and Space Science Open Archive,crossref,10.1002/essoar.10503117.2,https://doi.org/10.1002/essoar.10503117.2,https://essopenarchive.org/users/545221/articl...,Carbon Dioxide Removal Estimation Method to Re...,"Fiume, Shannon",2023.0,2023-09-20,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.1002/essoar.10503117.1,,,,false,,,,,
442443,crossref::10.1002/essoar.10503623.3,Earth and Space Science Open Archive,crossref,10.1002/essoar.10503623.3,https://doi.org/10.1002/essoar.10503623.3,https://essopenarchive.org/users/574499/articl...,A Bayesian model for quantifying errors in cit...,"Eisma, Jessica A; Schoups, Gerrit; Davids, Jef...",2023.0,2023-04-04,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.1002/essoar.10503623.1,,,,false,,,,,
442692,crossref::10.1002/essoar.10503738.2,Earth and Space Science Open Archive,crossref,10.1002/essoar.10503738.2,https://doi.org/10.1002/essoar.10503738.2,https://essopenarchive.org/users/530351/articl...,Global oceanic iron distribution estimated by ...,"Doi, Toshimasa; Osafune, Satoshi; Masuda, Shuh...",2024.0,2024-07-31,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.1002/essoar.10503738.1,,,,false,,,,,
442249,crossref::10.1002/essoar.10505190.2,Earth and Space Science Open Archive,crossref,10.1002/essoar.10505190.2,https://doi.org/10.1002/essoar.10505190.2,https://essopenarchive.org/users/544442/articl...,Soil profile stratigraphy detected by ground p...,"Wang, Ping; Li, Xinju; Min, Xiangyu; Xu, Shuo",2023.0,2023-02-01,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.1002/essoar.10505190.1,,,,false,,,,,
442462,crossref::10.1002/essoar.10505545.4,Earth and Space Science Open Archive,crossref,10.1002/essoar.10505545.4,https://doi.org/10.1002/essoar.10505545.4,https://essopenarchive.org/users/543175/articl...,Internal vs Forced Variability Metrics for Geo...,"Sane, Aakash; Fox-Kemper, Baylor; Ullman, David",2023.0,2023-08-10,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.1002/essoar.10505545.1,,,,false,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9330883,crossref::10.7554/elife.99997.3,eLife,crossref,10.7554/elife.99997.3,https://doi.org/10.7554/elife.99997.3,https://elifesciences.org/articles/99997,Tripartite organization of brain state dynamic...,"Liu, Lanfang; Jiang, Jiahao; Li, Hehui; Ding, ...",2025.0,2025-01-21,"{""has-preprint"": [{""asserted-by"": ""subject"", ""...",,10.7554/elife.99997.1;10.7554/elife.99997.2,,10.1101/2024.06.13.598625,,false,,,,,
9370551,crossref::10.7554/elife.99999,eLife,crossref,10.7554/elife.99999,https://doi.org/10.7554/elife.99999,https://elifesciences.org/articles/99999,Glia-mediated gut–brain cytokine signaling cou...,"Malita, Alina; Skakkebaek, Anne H; Kubrak, Olg...",2025.0,2024-09-18,"{""has-preprint"": [{""asserted-by"": ""subject"", ""...",,10.7554/elife.99999.1;10.7554/elife.99999.2,,10.1101/2024.06.25.600726,,false,,,,,
9351901,crossref::10.7554/elife.99999.1,eLife,crossref,10.7554/elife.99999.1,https://doi.org/10.7554/elife.99999.1,https://elifesciences.org/reviewed-preprints/9...,Glia-mediated gut-brain cytokine signaling cou...,"Malita, Alina; Kubrak, Olga; Chen, Xiaokang; K...",2024.0,2024-09-18,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.1101/2024.06.25.600726,,,,false,,,,,
9366338,crossref::10.7554/elife.99999.2,eLife,crossref,10.7554/elife.99999.2,https://doi.org/10.7554/elife.99999.2,https://elifesciences.org/reviewed-preprints/9...,Glia-mediated gut-brain cytokine signaling cou...,"Malita, Alina; Skakkebaek, Anne H; Kubrak, Olg...",2025.0,2025-06-26,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.1101/2024.06.25.600726;10.7554/elife.99999.1,,,,false,,,,,


In [36]:
result['server_name'].value_counts()

server_name
Research Square                         15274
eLife                                   14352
Preprints.org                           13462
ChemRxiv                                12927
PsyArXiv                                 5525
Qeios                                    4571
Open Science Framework                   2288
Authorea Inc.                            1931
CrimRxiv                                 1516
TechRxiv                                 1313
SocArXiv                                 1021
Cambridge Open Engage                     996
Earth and Space Science Open Archive      809
APSA Preprints                            367
EdArXiv                                   131
MetaArXiv                                  67
Thesis Commons                             42
Advance                                    42
PaleorXiv                                  17
EGUsphere                                  13
Encyclopedia                               11
LIS Scholarship Archiv

In [37]:
df_version = df_version.copy()

# Normalize relations_json once
relations_norm = df_version["relations_json"].astype(str).str.lower()

# CHILD rules
child_mask = (
    # mask_remain &
    relations_norm.str.contains("is-version-of", na=False)
)
df_version.loc[child_mask, "records_hierarchy"] = "version"

print(df_version["records_hierarchy"].value_counts(dropna=False))

records_hierarchy
NaN           8324572
version         85167
correction        355
Name: count, dtype: int64


In [38]:
df_label = df_version[df_version['records_hierarchy'].notna()]
df_label['server_name'].value_counts()

server_name
Research Square                         15274
eLife                                   14706
Preprints.org                           13462
ChemRxiv                                12927
F1000Research                            5651
PsyArXiv                                 5525
Qeios                                    4571
Open Science Framework                   2288
Authorea Inc.                            1931
CrimRxiv                                 1516
TechRxiv                                 1313
Wellcome Open Research                   1299
SocArXiv                                 1021
Cambridge Open Engage                     996
Earth and Space Science Open Archive      809
Open Research Europe                      698
APSA Preprints                            367
HRB Open Research                         361
Gates Open Research                       324
EdArXiv                                   131
Open Research Africa                      100
MetaArXiv             

### check those who are review

In [39]:
pattern = "is-review-of"

mask = df_version['relations_json'].str.contains(pattern, regex=False, na=False)
result = df_version[mask]
result

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy
5643709,crossref::10.14293/p2199-8442.1.sop-geo.pnx1a0...,ScienceOpen Preprints,crossref,10.14293/p2199-8442.1.sop-geo.pnx1a0.v1.rfmmoi,https://doi.org/10.14293/p2199-8442.1.sop-geo....,https://scienceopen.com/document/review?review...,"Review of ""Investigation of The Younger Dryas ...","Powell, James",2025.0,2025-06-05,"{""is-review-of"": [{""asserted-by"": ""subject"", ""...",,,,,,false,,,,,
5640221,crossref::10.14293/p2199-8442.1.sop-life.pptzs...,ScienceOpen Preprints,crossref,10.14293/p2199-8442.1.sop-life.pptzsif.v1.reapoa,https://doi.org/10.14293/p2199-8442.1.sop-life...,https://scienceopen.com/document/review?review...,"Review of ""MENTAL ILLNESS: AN INVISIBLE TRAUMA""","CASSINADANE, Dr. ANANDA VAYARAVEL",2021.0,2021-08-10,"{""is-review-of"": [{""asserted-by"": ""subject"", ""...",,,,,,false,,,,,
5642452,crossref::10.14293/p2199-8442.1.sop-matsci.p6s...,ScienceOpen Preprints,crossref,10.14293/p2199-8442.1.sop-matsci.p6ss0g.v1.rcnunj,https://doi.org/10.14293/p2199-8442.1.sop-mats...,https://scienceopen.com/document/review?review...,"Review of ""Varying the Infill Parameters of an...","TOUMIAT, Karima",2023.0,2023-08-22,"{""is-review-of"": [{""asserted-by"": ""subject"", ""...",,,,,,false,,,,,
5642292,crossref::10.14293/s2199-1006.1.sor-ag.a7cgne....,ScienceOpen Preprints,crossref,10.14293/s2199-1006.1.sor-ag.a7cgne.v1.rrwmgs,https://doi.org/10.14293/s2199-1006.1.sor-ag.a...,https://scienceopen.com/document/review?review...,"Review of ""&lt;b&gt;A review: CRISPR/Cas12-Med...","Iyer, Archana",2023.0,2023-06-06,"{""is-review-of"": [{""asserted-by"": ""subject"", ""...",,,,,,false,,,,,
5640628,crossref::10.14293/s2199-1006.1.sor-ag.afqaon....,ScienceOpen Preprints,crossref,10.14293/s2199-1006.1.sor-ag.afqaon.v1.rnzmdu,https://doi.org/10.14293/s2199-1006.1.sor-ag.a...,https://scienceopen.com/document/review?review...,"Review of ""An Overview of COVID-19 Pandemic: E...","Martí-Brenes, Prof.Dr.Mauricio",2023.0,2023-05-02,"{""is-review-of"": [{""asserted-by"": ""subject"", ""...",,,,,,false,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9366362,crossref::10.7554/elife.99999.2.sa1,eLife,crossref,10.7554/elife.99999.2.sa1,https://doi.org/10.7554/elife.99999.2.sa1,https://elifesciences.org/reviewed-preprints/9...,Joint Public Review: Glia-mediated gut-brain c...,,2025.0,2025-06-26,"{""is-review-of"": [{""asserted-by"": ""subject"", ""...",,,,,,false,,,,,
9366367,crossref::10.7554/elife.99999.2.sa2,eLife,crossref,10.7554/elife.99999.2.sa2,https://doi.org/10.7554/elife.99999.2.sa2,https://elifesciences.org/reviewed-preprints/9...,eLife Assessment: Glia-mediated gut-brain cyto...,"Ewer, John",2025.0,2025-06-26,"{""is-review-of"": [{""asserted-by"": ""subject"", ""...",,,,,,false,,,,,
9370553,crossref::10.7554/elife.99999.3.sa0,eLife,crossref,10.7554/elife.99999.3.sa0,https://doi.org/10.7554/elife.99999.3.sa0,https://elifesciences.org/articles/99999v1,eLife Assessment: Glia-mediated gut–brain cyto...,"Ewer, John",2025.0,2025-09-09,"{""is-review-of"": [{""asserted-by"": ""subject"", ""...",,,,,,false,,,,,
9370552,crossref::10.7554/elife.99999.3.sa1,eLife,crossref,10.7554/elife.99999.3.sa1,https://doi.org/10.7554/elife.99999.3.sa1,https://elifesciences.org/articles/99999v1/pee...,Joint Public Review: Glia-mediated gut–brain c...,,2025.0,2025-09-09,"{""is-review-of"": [{""asserted-by"": ""subject"", ""...",,,,,,false,,,,,


In [40]:
result['server_name'].value_counts()

server_name
eLife                    73369
Qeios                    55838
EGUsphere                45433
ScienceOpen Preprints      790
UCL Open Environment         7
CrimRxiv                     6
Name: count, dtype: int64

In [41]:
df_version_review = df_version.copy()

# Normalize relations_json once
relations_norm = df_version_review["relations_json"].astype(str).str.lower()

# Review signals
review_mask = (
    # mask_remain &
    (
        # relations_norm.str.contains("has-review", na=False) |
        relations_norm.str.contains("is-review-of", na=False) 
    )
)

df_version_review.loc[review_mask, "records_hierarchy"] = "review"
print(df_version_review["records_hierarchy"].value_counts(dropna=False))

records_hierarchy
NaN           8149129
review         175443
version         85167
correction        355
Name: count, dtype: int64


In [42]:
df_label = df_version_review[df_version_review['records_hierarchy'].notna()]
df_label['server_name'].value_counts()

server_name
eLife                                   88075
Qeios                                   60409
EGUsphere                               45446
Research Square                         15274
Preprints.org                           13462
ChemRxiv                                12927
F1000Research                            5651
PsyArXiv                                 5525
Open Science Framework                   2288
Authorea Inc.                            1931
CrimRxiv                                 1522
TechRxiv                                 1313
Wellcome Open Research                   1299
SocArXiv                                 1021
Cambridge Open Engage                     996
Earth and Space Science Open Archive      809
ScienceOpen Preprints                     790
Open Research Europe                      698
APSA Preprints                            367
HRB Open Research                         361
Gates Open Research                       324
EdArXiv               

### use regular expression to extract version and others

In [43]:
import re
import numpy as np
import pandas as pd

df = df_version_review.copy()

# ------------------------------------------------------------
# 0) Ensure target column exists
# ------------------------------------------------------------

# normalize strings once (safe)
df["landing_norm"] = df.get("landing_page_url", "").astype(str).str.lower()
df["doi_norm"] = df.get("doi", "").astype(str).str.lower()

# ------------------------------------------------------------
# 1) Your regex (good: avoids /v284p and .v50i7)
# ------------------------------------------------------------
VERSION_RX = re.compile(
    r'(?P<token>('
    r'/v\d+(?![a-z0-9])|'       # /v1 but not /v284p
    r'\.v\d+(?![a-z0-9])|'      # .v1 but not .v50i7
    r'_v\d+(?![a-z0-9])|'       # _v1
    r'-v\d+(?![a-z0-9])|'       # -v2 but not -v8018x
    r'-rc\d+(?![a-z0-9])|'      # -rc1
    r'-cc\d+(?![a-z0-9])|'      # -cc1
    r'-supplement|'             # -supplement
    r'\.sa\d+(?![a-z0-9])'      # .sa10
    r'))',
    re.IGNORECASE
)

# helpers: classify token family + get numeric v
VNUM_RX = re.compile(r'(?:^|[._/\-])v(\d+)$', re.IGNORECASE)  # matches v1 at end of token
def extract_token(text: str):
    if not isinstance(text, str) or text.strip() == "" or text.lower() == "nan":
        return None
    m = VERSION_RX.search(text)
    return m.group("token") if m else None

def token_kind(token: str):
    if not isinstance(token, str):
        return None
    t = token.lower()
    if t.startswith(("/v", ".v", "_v", "-v")):
        return "explicit_version"
    if t.startswith("-rc"):
        return "rc"
    if t.startswith("-cc"):
        return "cc"
    if t == "-supplement":
        return "supplement"
    if t.startswith(".sa"):
        return "supplementary_asset"
    return "other"

def token_vnum(token: str):
    if not isinstance(token, str):
        return np.nan
    t = token.lower()
    # extract trailing v number for explicit version tokens only
    m = re.search(r'v(\d+)$', t)
    return float(m.group(1)) if m else np.nan

# ------------------------------------------------------------
# 2) Apply extraction: landing_page_url first, then doi
# ------------------------------------------------------------
remain = df["records_hierarchy"].fillna("other").eq("other")

# --- landing page tokens ---
df.loc[remain, "version_token_lp"] = df.loc[remain, "landing_norm"].map(extract_token)
df.loc[remain, "token_kind_lp"] = df.loc[remain, "version_token_lp"].map(token_kind)
df.loc[remain, "vnum_lp"] = df.loc[remain, "version_token_lp"].map(token_vnum)

# Label rules (landing_page_url)
# - Only use explicit vN tokens for parent/child
# - parent if v0 or v1, child if v>=2
lp_parent = remain & (df["token_kind_lp"] == "explicit_version") & (df["vnum_lp"].isin([0.0, 1.0]))
lp_child  = remain & (df["token_kind_lp"] == "explicit_version") & (df["vnum_lp"] >= 2)

df.loc[lp_parent, "records_hierarchy"] = "parent"
df.loc[lp_child,  "records_hierarchy"] = "version"

# Everything else from landing tokens (rc/cc/supplement/.sa) keep as other
# If you prefer to label them separately, do it here:
df.loc[remain & df["token_kind_lp"].isin(["rc","cc"]), "records_hierarchy"] = "comment"
df.loc[remain & df["token_kind_lp"].isin(["supplement","supplementary_asset"]), "records_hierarchy"] = "part_of"

# --- doi tokens (only for still-unlabeled) ---
remain2 = df["records_hierarchy"].fillna("other").eq("other")

df.loc[remain2, "version_token_doi"] = df.loc[remain2, "doi_norm"].map(extract_token)
df.loc[remain2, "token_kind_doi"] = df.loc[remain2, "version_token_doi"].map(token_kind)
df.loc[remain2, "vnum_doi"] = df.loc[remain2, "version_token_doi"].map(token_vnum)

doi_parent = remain2 & (df["token_kind_doi"] == "explicit_version") & (df["vnum_doi"].isin([0.0, 1.0]))
doi_child  = remain2 & (df["token_kind_doi"] == "explicit_version") & (df["vnum_doi"] >= 2)

df.loc[doi_parent, "records_hierarchy"] = "parent"
df.loc[doi_child,  "records_hierarchy"] = "version"

# ------------------------------------------------------------
# 3) Optional: quick diagnostics
# ------------------------------------------------------------
# How many got labeled via landing vs doi?
print(df["records_hierarchy"].value_counts(dropna=False))
# print(df.loc[df["records_hierarchy"].isin(["parent","child"]),
#              ["server_name","landing_page_url","doi","version_token_lp","version_token_doi","records_hierarchy"]].head(20))


records_hierarchy
NaN           7324091
parent         782740
review         175443
version        122862
part_of          4583
correction        355
comment            20
Name: count, dtype: int64


In [44]:
df_label = df[df['records_hierarchy'].notna()]
df_label['server_name'].value_counts()

server_name
Research Square                                                         450818
Preprints.org                                                           115815
eLife                                                                    88075
Open Science Framework                                                   80649
Qeios                                                                    60414
                                                                         ...  
WikiJournal of Medicine                                                      1
CERN document server                                                         1
Prepublicaciones OpenCiencia                                                 1
Digital Access to Scholarship at Harvard (DASH) (Harvard University)         1
National Bureau of Economic Research                                         1
Name: count, Length: 79, dtype: int64

In [45]:
df = df.copy()

# Ensure column exists
# if "records_hierarchy" not in df.columns:
#     df["records_hierarchy"] = "other"

# Normalize relations_json once
relations_norm = df["landing_page_url"].astype(str).str.lower()

# Only touch rows not already parent/child
mask_remain = df["records_hierarchy"].isin(["other", None, np.nan])

# Part signals
part_mask = (
    mask_remain &
    (
        relations_norm.str.contains("#fig", na=False) |
        relations_norm.str.contains("#digest", na=False) |
        relations_norm.str.contains("#supp", na=False) |
        relations_norm.str.contains("#video", na=False) |
        relations_norm.str.contains("#media", na=False) |
        relations_norm.str.contains("#tbl", na=False) |
        relations_norm.str.contains("#table", na=False) |
        relations_norm.str.contains("#sd", na=False) |
        relations_norm.str.contains("#transrepform", na=False) |
        relations_norm.str.contains("/figures#", na=False) |
        relations_norm.str.contains("#box", na=False) |
        relations_norm.str.contains("#app", na=False) |
        relations_norm.str.contains("#resp", na=False) |
        relations_norm.str.contains("#a", na=False) |
        relations_norm.str.contains("#b", na=False) |
        relations_norm.str.contains("#c", na=False) |
        relations_norm.str.contains("#s", na=False) |
        relations_norm.str.contains("#atbl", na=False) |
        relations_norm.str.contains("#sa", na=False) |
        relations_norm.str.contains("#none", na=False) |
        relations_norm.str.contains("#desfig", na=False) |
        relations_norm.str.contains("#keyresource", na=False) |
        relations_norm.str.contains("#abstract", na=False) 
    )
)
df.loc[part_mask, "records_hierarchy"] = "part_of"
#
# Review signals
comment_mask = (
    mask_remain &
    (
        relations_norm.str.contains("#ac", na=False) |
        relations_norm.str.contains("#rc", na=False) |
        relations_norm.str.contains("#cc", na=False) |
        relations_norm.str.contains("#ec", na=False) |
        relations_norm.str.contains("/peer-reviews", na=False) |
        relations_norm.str.contains("#decision-letter", na=False) |
        relations_norm.str.contains("#cec", na=False) 
    )
)

df.loc[comment_mask, "records_hierarchy"] = "comment"


print(df["records_hierarchy"].value_counts(dropna=False))

records_hierarchy
NaN           7178907
parent         782740
review         175443
part_of        149462
version        122862
correction        355
comment           325
Name: count, dtype: int64


In [46]:
df_label = df[df['records_hierarchy'].notna()]
df_label['server_name'].value_counts()

server_name
Research Square                                                         450818
eLife                                                                   231745
Preprints.org                                                           115815
Open Science Framework                                                   80649
Qeios                                                                    60414
                                                                         ...  
WikiJournal of Science                                                       1
CERN document server                                                         1
Prepublicaciones OpenCiencia                                                 1
Digital Access to Scholarship at Harvard (DASH) (Harvard University)         1
National Bureau of Economic Research                                         1
Name: count, Length: 81, dtype: int64

### identify publish versions

In [47]:
pattern = "has-preprint"

mask = df['relations_json'].str.contains(pattern, regex=False, na=False)
result = df[mask]
result

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
429693,crossref::10.1002/essoar.10502762.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10502762.1,https://doi.org/10.1002/essoar.10502762.1,https://essopenarchive.org/doi/full/10.1002/es...,Extant mat world analog microbes synchronize m...,"Biddanda, Bopaiah A; Weinke, Anthony D",2020.0,2020-04-28,"{""has-preprint"": [{""asserted-by"": ""subject"", ""...",,,10.1002/essoar.10502762.1,10.1002/essoar.10502762.1,,true,,,,,,https://essopenarchive.org/doi/full/10.1002/es...,10.1002/essoar.10502762.1,,,,,,
429702,crossref::10.1002/essoar.10503378.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10503378.1,https://doi.org/10.1002/essoar.10503378.1,https://essopenarchive.org/doi/full/10.1002/es...,Adjoint Slip Inversion under a Constrained Opt...,"Tago, Josué; Cruz-Atienza, Víctor M.; Villafue...",2020.0,2020-06-18,"{""has-preprint"": [{""asserted-by"": ""subject"", ""...",,,10.1002/essoar.10503378.1;10.1093/gji/ggab165,10.1002/essoar.10503378.1,,true,,,,,,https://essopenarchive.org/doi/full/10.1002/es...,10.1002/essoar.10503378.1,,,,,,
429703,crossref::10.1002/essoar.10503397.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10503397.1,https://doi.org/10.1002/essoar.10503397.1,https://essopenarchive.org/doi/full/10.1002/es...,Restoring pre-industrial CO2 levels while achi...,"Capron, Mark E; Stewart, Jim R.; de Ramon N'Ye...",2020.0,2020-06-18,"{""has-preprint"": [{""asserted-by"": ""subject"", ""...",,,10.1002/essoar.10503397.1;10.3390/en13184972,10.1002/essoar.10503397.1,,true,,,,,,https://essopenarchive.org/doi/full/10.1002/es...,10.1002/essoar.10503397.1,,,,,,
429708,crossref::10.1002/essoar.10503526.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10503526.1,https://doi.org/10.1002/essoar.10503526.1,https://essopenarchive.org/doi/full/10.1002/es...,Determining the Isotopic Composition of Surfac...,"Hu, Yongbo; Xiao, Wei; Wei, Zhongwang; Welp, L...",2020.0,2020-07-05,"{""has-preprint"": [{""asserted-by"": ""subject"", ""...",,,10.1002/essoar.10503526.1;10.1029/2020ea001304,10.1002/essoar.10503526.1,,true,,,,,,https://essopenarchive.org/doi/full/10.1002/es...,10.1002/essoar.10503526.1,,,,,,
429713,crossref::10.1002/essoar.10503764.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10503764.1,https://doi.org/10.1002/essoar.10503764.1,https://essopenarchive.org/doi/full/10.1002/es...,Observational evidence of interaction between ...,"Figueiredo, Cosme Alexandre O. B.; Wrasse, Cri...",2020.0,2020-07-30,"{""has-preprint"": [{""asserted-by"": ""subject"", ""...",,,,10.1002/essoar.10503437.1,,false,,,,,,https://essopenarchive.org/doi/full/10.1002/es...,10.1002/essoar.10503764.1,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9373415,crossref::10.7554/elife.99989.3,eLife,crossref,10.7554/elife.99989.3,https://doi.org/10.7554/elife.99989.3,https://elifesciences.org/articles/99989,Tonotopy is not preserved in a descending stag...,"Gu, Miaoqing; Liang, Shanshan; Zhu, Jiahui; Li...",2025.0,2025-10-14,"{""has-preprint"": [{""asserted-by"": ""subject"", ""...",,10.7554/elife.99989.1;10.7554/elife.99989.2,,10.1101/2024.05.25.595883,,false,,,,,version,https://elifesciences.org/articles/99989,10.7554/elife.99989.3,,,,,,
9330914,crossref::10.7554/elife.99997,eLife,crossref,10.7554/elife.99997,https://doi.org/10.7554/elife.99997,https://elifesciences.org/articles/99997,Tripartite organization of brain state dynamic...,"Liu, Lanfang; Jiang, Jiahao; Li, Hehui; Ding, ...",2025.0,2024-10-08,"{""has-preprint"": [{""asserted-by"": ""subject"", ""...",,10.7554/elife.99997.1;10.7554/elife.99997.2,,10.1101/2024.06.13.598625,,false,,,,,version,https://elifesciences.org/articles/99997,10.7554/elife.99997,,,,,,
9330883,crossref::10.7554/elife.99997.3,eLife,crossref,10.7554/elife.99997.3,https://doi.org/10.7554/elife.99997.3,https://elifesciences.org/articles/99997,Tripartite organization of brain state dynamic...,"Liu, Lanfang; Jiang, Jiahao; Li, Hehui; Ding, ...",2025.0,2025-01-21,"{""has-preprint"": [{""asserted-by"": ""subject"", ""...",,10.7554/elife.99997.1;10.7554/elife.99997.2,,10.1101/2024.06.13.598625,,false,,,,,version,https://elifesciences.org/articles/99997,10.7554/elife.99997.3,,,,,,
9370551,crossref::10.7554/elife.99999,eLife,crossref,10.7554/elife.99999,https://doi.org/10.7554/elife.99999,https://elifesciences.org/articles/99999,Glia-mediated gut–brain cytokine signaling cou...,"Malita, Alina; Skakkebaek, Anne H; Kubrak, Olg...",2025.0,2024-09-18,"{""has-preprint"": [{""asserted-by"": ""subject"", ""...",,10.7554/elife.99999.1;10.7554/elife.99999.2,,10.1101/2024.06.25.600726,,false,,,,,version,https://elifesciences.org/articles/99999,10.7554/elife.99999,,,,,,


In [48]:
result['server_name'].value_counts()

server_name
eLife                                   9294
Earth and Space Science Open Archive      29
Gates Open Research                       28
Name: count, dtype: int64

In [49]:
df = df.copy()

# Normalize relations_json once
relations_norm = df["relations_json"].astype(str).str.lower()

# Review signals
review_mask = (
    (
        relations_norm.str.contains("has-preprint", na=False) 
    )
)

df.loc[review_mask, "records_hierarchy"] = "publish_version"
print(df["records_hierarchy"].value_counts(dropna=False))

records_hierarchy
NaN                7175014
parent              782712
review              175443
part_of             149462
version             117432
publish_version       9351
correction             355
comment                325
Name: count, dtype: int64


In [50]:
df_label = df[df['records_hierarchy'].notna()]
df_label['server_name'].value_counts()

server_name
Research Square                                                         450818
eLife                                                                   235609
Preprints.org                                                           115815
Open Science Framework                                                   80649
Qeios                                                                    60414
                                                                         ...  
WikiJournal of Science                                                       1
CERN document server                                                         1
Prepublicaciones OpenCiencia                                                 1
Digital Access to Scholarship at Harvard (DASH) (Harvard University)         1
National Bureau of Economic Research                                         1
Name: count, Length: 81, dtype: int64

#### remaining

In [51]:
df_remain = df[df['records_hierarchy'].isna()]
df_remain['server_name'].value_counts()

server_name
arXiv                                  2920797
SSRN                                   1258958
HAL                                    1056216
RePEc: Research Papers in Economics     388814
bioRxiv                                 306948
                                        ...   
searchRxiv                                   2
Gates Open Research                          1
MNI Open Research                            1
Open Research Africa                         1
AMRC Open Research                           1
Name: count, Length: 103, dtype: int64

### Label server who assingn unique doi for all version as parent

In [52]:
import pandas as pd
import numpy as np
import re

# ============================================================
# 1) Read the Google Sheet (rules tab) as CSV
#    - Works for public / shared-to-anyone sheets
# ============================================================

SHEET_ID = "10_7FdcpZjntqFsEHIii7bAM72uF__of_iUohSD5w8w4"
GID = "1230415212"  # the gid you shared for the 'rules' tab

rules_csv_url = f"https://docs.google.com/spreadsheets/d/{SHEET_ID}/export?format=csv&gid={GID}"
rules = pd.read_csv(rules_csv_url)

# Normalize column names (strip spaces/newlines)
rules.columns = (
    rules.columns.astype(str)
    .str.replace(r"\s+", " ", regex=True)  # collapse whitespace/newlines
    .str.strip()
)

# ============================================================
# 2) Extract servers where "versionning - doi" == "unique"
# ============================================================

# These are the column names seen in your screenshot:
SERVER_COL = "Field_server_name"
VERS_COL = "versionning - doi"  # after normalization it should match like this

# Safety: show close matches if something is off
if SERVER_COL not in rules.columns or VERS_COL not in rules.columns:
    print("Columns available:", rules.columns.tolist())
    raise KeyError(f"Expected columns not found. Need: {SERVER_COL!r} and {VERS_COL!r}")

# Build "unique versioning" server list
unique_servers = (
    rules.loc[
        rules[VERS_COL].astype(str).str.strip().str.lower().eq("unique"),
        SERVER_COL
    ]
    .dropna()
    .astype(str)
    .str.strip()
    .unique()
    .tolist()
)

print(f"Unique-versioning servers found: {len(unique_servers)}")
print(unique_servers[:30])  # preview

# ============================================================
# 3) Apply mapping to your dataframe df -> records_hierarchy = "parent"
# ============================================================

df = df.copy()

# Normalize server_name in df for matching
df["server_name_norm"] = df["server_name"].astype(str).str.strip()

# Normalize unique list too (strip)
unique_servers_norm = [s.strip() for s in unique_servers]

# Create / overwrite records_hierarchy
# Default: keep existing if present; else "other"
# df["records_hierarchy"] = df["records_hierarchy"] if "records_hierarchy" in df.columns else "other"

mask_unique = df["server_name_norm"].isin(unique_servers_norm)
# df.loc[mask_unique, "records_hierarchy"] = "parent"
# only fill missing values (uncomment instead of overwrite)
df.loc[mask_unique & df["records_hierarchy"].isna(), "records_hierarchy"] = "parent"

# Optional: drop helper col
df.drop(columns=["server_name_norm"], inplace=True)

# Quick check
print(df["records_hierarchy"].value_counts(dropna=False).head(10))
# print(df.loc[df["server_name"].isin(unique_servers_norm), ["server_name","records_hierarchy"]].drop_duplicates().head(20))


Unique-versioning servers found: 60
['arXiv', 'HAL', 'SSRN', 'RePEc: Research Papers in Economics', 'bioRxiv', 'Zenodo', 'medRxiv', 'Munich Personal RePEc Archive', 'JMIR Preprints', 'AgEcon Search', 'viXra', 'IACR Cryptology ePrint Archive', 'Qeios', 'Nature Precedings', 'SciELO Preprints', 'EcoEvoRxiv', 'National Bureau of Economic Research', 'PREPRINTS.RU', 'ARPHA Preprints', 'SportRxiv', 'Covid-19 Preprints', 'DSpace@MIT', 'Organic Eprints', 'Digital Access to Scholarship at Harvard (DASH) (Harvard University)', 'PhilSci-Archive', 'prepare@u', 'UCL Open Environment', 'EnerarXiv', 'AIJR Preprints', 'IndiaRxiv']
records_hierarchy
parent             7841901
review              175443
part_of             149462
version             117432
NaN                 115825
publish_version       9351
correction             355
comment                325
Name: count, dtype: int64


#### remaining

In [53]:
df_remain = df[df['records_hierarchy'].isna()]
df_remain['server_name'].value_counts()

server_name
ChemRxiv                                26513
Earth and Space Science Open Archive    13089
eLife                                   11949
EGUsphere                               10209
TechRxiv                                 8320
PsyArXiv                                 6632
PeerJ Preprints                          6446
EarthArXiv                               6425
Authorea Inc.                            6092
engrXiv                                  4925
SocArXiv                                 3072
INA-Rxiv                                 2830
Cambridge Open Engage                    2089
Advance                                  1717
Thesis Commons                           1507
APSA Preprints                           1101
AgriRxiv                                  787
EdArXiv                                   550
Law Archive                               287
Arabixiv                                  209
MetaArXiv                                 196
ScienceOpen Preprints 

### manage server-by-server

#### eLife

In [54]:
df_remain[df_remain['server_name']=='eLife']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
9130897,crossref::10.7554/elife,eLife,crossref,10.7554/elife,https://doi.org/10.7554/elife,https://elifesciences.org/,eLife,,,2017-07-25,,,,,,,false,,,,,,https://elifesciences.org/,10.7554/elife,,,,,,
9274104,crossref::10.7554/elife.00003,eLife,crossref,10.7554/elife.00003,https://doi.org/10.7554/elife.00003,https://elifesciences.org/articles/00003,A novel role for lipid droplets in the organis...,"Anand, Preetha; Cermelli, Silvia; Li, Zhihuan;...",2012.0,2012-11-13,,,,,,,false,,,,,,https://elifesciences.org/articles/00003,10.7554/elife.00003,,,,,,
9274103,crossref::10.7554/elife.00005,eLife,crossref,10.7554/elife.00005,https://doi.org/10.7554/elife.00005,https://elifesciences.org/articles/00005,Molecular architecture of human polycomb repre...,"Ciferri, Claudio; Lander, Gabriel C; Maiolica,...",2012.0,2012-10-30,,,,,,,false,,,,,,https://elifesciences.org/articles/00005,10.7554/elife.00005,,,,,,
9274102,crossref::10.7554/elife.00007,eLife,crossref,10.7554/elife.00007,https://doi.org/10.7554/elife.00007,https://elifesciences.org/articles/00007,Herbivory-induced volatiles function as defens...,"Schuman, Meredith C; Barthel, Kathleen; Baldwi...",2012.0,2012-10-15,"{""is-supplemented-by"": [{""asserted-by"": ""subje...",,,,,,false,,,,,,https://elifesciences.org/articles/00007,10.7554/elife.00007,,,,,,
9274105,crossref::10.7554/elife.00011,eLife,crossref,10.7554/elife.00011,https://doi.org/10.7554/elife.00011,https://elifesciences.org/articles/00011,Nascent-Seq reveals novel features of mouse ci...,"Menet, Jerome S; Rodriguez, Joseph; Abruzzi, K...",2012.0,2012-11-13,"{""is-supplemented-by"": [{""asserted-by"": ""subje...",,,,,,false,,,,,,https://elifesciences.org/articles/00011,10.7554/elife.00011,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9324934,crossref::10.7554/elife.99560,eLife,crossref,10.7554/elife.99560,https://doi.org/10.7554/elife.99560,https://elifesciences.org/articles/99560,Paying attention,"Poth, Christian H",2024.0,2024-06-10,,,,,,,false,,,,,,https://elifesciences.org/articles/99560,10.7554/elife.99560,,,,,,
9325271,crossref::10.7554/elife.99765,eLife,crossref,10.7554/elife.99765,https://doi.org/10.7554/elife.99765,https://elifesciences.org/articles/99765,Redox takes control,"Plaza-Menacho, Iván",2024.0,2024-06-20,,,,,,,false,,,,,,https://elifesciences.org/articles/99765,10.7554/elife.99765,,,,,,
9327321,crossref::10.7554/elife.99770,eLife,crossref,10.7554/elife.99770,https://doi.org/10.7554/elife.99770,https://elifesciences.org/articles/99770,Exploring protein structural ensembles: Integr...,"Belyaeva, Julia; Elgeti, Matthias",2024.0,2024-09-16,,,,,,,false,,,,,,https://elifesciences.org/articles/99770,10.7554/elife.99770,,,,,,
9326155,crossref::10.7554/elife.99804,eLife,crossref,10.7554/elife.99804,https://doi.org/10.7554/elife.99804,https://elifesciences.org/articles/99804,Mapping the fly nerve cord,"Seggewisse, Anna; Winding, Michael",2024.0,2024-07-09,,,,,,,false,,,,,,https://elifesciences.org/articles/99804,10.7554/elife.99804,,,,,,


In [55]:
pattern = "10.7554/elife.12523"

mask = df['doi'].str.contains(pattern, regex=False, na=False)
result = df[mask]
result

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
9287585,crossref::10.7554/elife.12523,eLife,crossref,10.7554/elife.12523,https://doi.org/10.7554/elife.12523,https://elifesciences.org/articles/12523,Hold your breath!,"Lust, Katharina; Wittbrodt, Joachim",2015.0,2015-12-11,,,,,,,False,,,,,,https://elifesciences.org/articles/12523,10.7554/elife.12523,,,,,,
9130880,crossref::10.7554/elife.12523.001,eLife,crossref,10.7554/elife.12523.001,https://doi.org/10.7554/elife.12523.001,http://elifesciences.org/lookup/doi/10.7554/eL...,Figure 1. The reactive oxygen species (ROS) th...,,,2015-12-11,,,,,,,False,,,,,,http://elifesciences.org/lookup/doi/10.7554/el...,10.7554/elife.12523.001,,,,,,


In [56]:
pattern = "10.7554/elife.100000"

mask = df['doi'].str.contains(pattern, regex=False, na=False)
result = df[mask]
result

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
9366162,crossref::10.7554/elife.100000,eLife,crossref,10.7554/elife.100000,https://doi.org/10.7554/elife.100000,https://elifesciences.org/articles/100000,Group identification drives brain integration ...,"Xie, Enhui; Zha, Shuyi; Xu, Yiyang; Li, Xianchun",2025.0,2024-10-14,"{""has-preprint"": [{""asserted-by"": ""subject"", ""...",,10.7554/elife.100000.1;10.7554/elife.100000.2;...,,10.1101/2024.06.03.597223,,False,,,,,publish_version,https://elifesciences.org/articles/100000,10.7554/elife.100000,,,,,,
9352818,crossref::10.7554/elife.100000.1,eLife,crossref,10.7554/elife.100000.1,https://doi.org/10.7554/elife.100000.1,https://elifesciences.org/reviewed-preprints/1...,Group identification drives brain integration ...,"Xie, Enhui; Zha, Shuyi; Xu, Yiyang; Li, Xianchun",2024.0,2024-10-14,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.1101/2024.06.03.597223,,,,False,,,,,version,https://elifesciences.org/reviewed-preprints/1...,10.7554/elife.100000.1,,,,,,
9353978,crossref::10.7554/elife.100000.1.sa0,eLife,crossref,10.7554/elife.100000.1.sa0,https://doi.org/10.7554/elife.100000.1.sa0,https://elifesciences.org/reviewed-preprints/1...,Author response: Group identification drives b...,"Xie, Enhui; Zha, Shuyi; Xu, Yiyang; Li, Xianchun",2024.0,2024-10-14,"{""is-review-of"": [{""asserted-by"": ""subject"", ""...",,,,,,False,,,,,review,https://elifesciences.org/reviewed-preprints/1...,10.7554/elife.100000.1.sa0,,,,,,
9353967,crossref::10.7554/elife.100000.1.sa1,eLife,crossref,10.7554/elife.100000.1.sa1,https://doi.org/10.7554/elife.100000.1.sa1,https://elifesciences.org/reviewed-preprints/1...,Reviewer #2 (Public review): Group identificat...,,2024.0,2024-10-14,"{""is-review-of"": [{""asserted-by"": ""subject"", ""...",,,,,,False,,,,,review,https://elifesciences.org/reviewed-preprints/1...,10.7554/elife.100000.1.sa1,,,,,,
9353972,crossref::10.7554/elife.100000.1.sa2,eLife,crossref,10.7554/elife.100000.1.sa2,https://doi.org/10.7554/elife.100000.1.sa2,https://elifesciences.org/reviewed-preprints/1...,Reviewer #1 (Public review): Group identificat...,,2024.0,2024-10-14,"{""is-review-of"": [{""asserted-by"": ""subject"", ""...",,,,,,False,,,,,review,https://elifesciences.org/reviewed-preprints/1...,10.7554/elife.100000.1.sa2,,,,,,
9353963,crossref::10.7554/elife.100000.1.sa3,eLife,crossref,10.7554/elife.100000.1.sa3,https://doi.org/10.7554/elife.100000.1.sa3,https://elifesciences.org/reviewed-preprints/1...,eLife Assessment: Group identification drives ...,"Fornito, Alex",2024.0,2024-10-14,"{""is-review-of"": [{""asserted-by"": ""subject"", ""...",,,,,,False,,,,,review,https://elifesciences.org/reviewed-preprints/1...,10.7554/elife.100000.1.sa3,,,,,,
9360443,crossref::10.7554/elife.100000.2,eLife,crossref,10.7554/elife.100000.2,https://doi.org/10.7554/elife.100000.2,https://elifesciences.org/reviewed-preprints/1...,Group identification drives brain integration ...,"Xie, Enhui; Zha, Shuyi; Xu, Yiyang; Li, Xianchun",2025.0,2025-04-01,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.1101/2024.06.03.597223;10.7554/elife.100000.1,,,,False,,,,,version,https://elifesciences.org/reviewed-preprints/1...,10.7554/elife.100000.2,,,,,,
9361880,crossref::10.7554/elife.100000.2.sa0,eLife,crossref,10.7554/elife.100000.2.sa0,https://doi.org/10.7554/elife.100000.2.sa0,https://elifesciences.org/reviewed-preprints/1...,Author response: Group identification drives b...,"Xie, Enhui; Zha, Shuyi; Xu, Yiyang; Li, Xianchun",2025.0,2025-04-02,"{""is-review-of"": [{""asserted-by"": ""subject"", ""...",,,,,,False,,,,,review,https://elifesciences.org/reviewed-preprints/1...,10.7554/elife.100000.2.sa0,,,,,,
9361870,crossref::10.7554/elife.100000.2.sa1,eLife,crossref,10.7554/elife.100000.2.sa1,https://doi.org/10.7554/elife.100000.2.sa1,https://elifesciences.org/reviewed-preprints/1...,Reviewer #1 (Public review): Group identificat...,,2025.0,2025-04-02,"{""is-review-of"": [{""asserted-by"": ""subject"", ""...",,,,,,False,,,,,review,https://elifesciences.org/reviewed-preprints/1...,10.7554/elife.100000.2.sa1,,,,,,
9361866,crossref::10.7554/elife.100000.2.sa2,eLife,crossref,10.7554/elife.100000.2.sa2,https://doi.org/10.7554/elife.100000.2.sa2,https://elifesciences.org/reviewed-preprints/1...,eLife Assessment: Group identification drives ...,"Fornito, Alex",2025.0,2025-04-02,"{""is-review-of"": [{""asserted-by"": ""subject"", ""...",,,,,,False,,,,,review,https://elifesciences.org/reviewed-preprints/1...,10.7554/elife.100000.2.sa2,,,,,,


In [57]:
pattern = ".1"

mask = df[df['server_name']=='eLife']['doi'].str.contains(pattern, regex=False, na=False)
result = df[df['server_name']=='eLife'][mask]
result

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
9366162,crossref::10.7554/elife.100000,eLife,crossref,10.7554/elife.100000,https://doi.org/10.7554/elife.100000,https://elifesciences.org/articles/100000,Group identification drives brain integration ...,"Xie, Enhui; Zha, Shuyi; Xu, Yiyang; Li, Xianchun",2025.0,2024-10-14,"{""has-preprint"": [{""asserted-by"": ""subject"", ""...",,10.7554/elife.100000.1;10.7554/elife.100000.2;...,,10.1101/2024.06.03.597223,,false,,,,,publish_version,https://elifesciences.org/articles/100000,10.7554/elife.100000,,,,,,
9352818,crossref::10.7554/elife.100000.1,eLife,crossref,10.7554/elife.100000.1,https://doi.org/10.7554/elife.100000.1,https://elifesciences.org/reviewed-preprints/1...,Group identification drives brain integration ...,"Xie, Enhui; Zha, Shuyi; Xu, Yiyang; Li, Xianchun",2024.0,2024-10-14,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.1101/2024.06.03.597223,,,,false,,,,,version,https://elifesciences.org/reviewed-preprints/1...,10.7554/elife.100000.1,,,,,,
9353978,crossref::10.7554/elife.100000.1.sa0,eLife,crossref,10.7554/elife.100000.1.sa0,https://doi.org/10.7554/elife.100000.1.sa0,https://elifesciences.org/reviewed-preprints/1...,Author response: Group identification drives b...,"Xie, Enhui; Zha, Shuyi; Xu, Yiyang; Li, Xianchun",2024.0,2024-10-14,"{""is-review-of"": [{""asserted-by"": ""subject"", ""...",,,,,,false,,,,,review,https://elifesciences.org/reviewed-preprints/1...,10.7554/elife.100000.1.sa0,,,,,,
9353967,crossref::10.7554/elife.100000.1.sa1,eLife,crossref,10.7554/elife.100000.1.sa1,https://doi.org/10.7554/elife.100000.1.sa1,https://elifesciences.org/reviewed-preprints/1...,Reviewer #2 (Public review): Group identificat...,,2024.0,2024-10-14,"{""is-review-of"": [{""asserted-by"": ""subject"", ""...",,,,,,false,,,,,review,https://elifesciences.org/reviewed-preprints/1...,10.7554/elife.100000.1.sa1,,,,,,
9353972,crossref::10.7554/elife.100000.1.sa2,eLife,crossref,10.7554/elife.100000.1.sa2,https://doi.org/10.7554/elife.100000.1.sa2,https://elifesciences.org/reviewed-preprints/1...,Reviewer #1 (Public review): Group identificat...,,2024.0,2024-10-14,"{""is-review-of"": [{""asserted-by"": ""subject"", ""...",,,,,,false,,,,,review,https://elifesciences.org/reviewed-preprints/1...,10.7554/elife.100000.1.sa2,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9353740,crossref::10.7554/elife.99997.1.sa2,eLife,crossref,10.7554/elife.99997.1.sa2,https://doi.org/10.7554/elife.99997.1.sa2,https://elifesciences.org/reviewed-preprints/9...,Reviewer #1 (Public review): Tripartite organi...,,2024.0,2024-10-08,"{""is-review-of"": [{""asserted-by"": ""subject"", ""...",,,,,,false,,,,,review,https://elifesciences.org/reviewed-preprints/9...,10.7554/elife.99997.1.sa2,,,,,,
9353736,crossref::10.7554/elife.99997.1.sa3,eLife,crossref,10.7554/elife.99997.1.sa3,https://doi.org/10.7554/elife.99997.1.sa3,https://elifesciences.org/reviewed-preprints/9...,eLife Assessment: Tripartite organization of b...,"Martin, Andrea E",2024.0,2024-10-08,"{""is-review-of"": [{""asserted-by"": ""subject"", ""...",,,,,,false,,,,,review,https://elifesciences.org/reviewed-preprints/9...,10.7554/elife.99997.1.sa3,,,,,,
9351901,crossref::10.7554/elife.99999.1,eLife,crossref,10.7554/elife.99999.1,https://doi.org/10.7554/elife.99999.1,https://elifesciences.org/reviewed-preprints/9...,Glia-mediated gut-brain cytokine signaling cou...,"Malita, Alina; Kubrak, Olga; Chen, Xiaokang; K...",2024.0,2024-09-18,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.1101/2024.06.25.600726,,,,false,,,,,version,https://elifesciences.org/reviewed-preprints/9...,10.7554/elife.99999.1,,,,,,
9352368,crossref::10.7554/elife.99999.1.sa0,eLife,crossref,10.7554/elife.99999.1.sa0,https://doi.org/10.7554/elife.99999.1.sa0,https://elifesciences.org/reviewed-preprints/9...,Joint Public Review: Glia-mediated gut-brain c...,,2024.0,2024-09-18,"{""is-review-of"": [{""asserted-by"": ""subject"", ""...",,,,,,false,,,,,review,https://elifesciences.org/reviewed-preprints/9...,10.7554/elife.99999.1.sa0,,,,,,


In [58]:
df[df['server_name']=='eLife']['doi']

9130897                10.7554/elife
9274104          10.7554/elife.00003
9131097      10.7554/elife.00003.001
9131098      10.7554/elife.00003.002
9131100      10.7554/elife.00003.003
                     ...            
9366367    10.7554/elife.99999.2.sa2
9370550        10.7554/elife.99999.3
9370553    10.7554/elife.99999.3.sa0
9370552    10.7554/elife.99999.3.sa1
9370554    10.7554/elife.99999.3.sa2
Name: doi, Length: 247558, dtype: object

In [59]:
import re
import numpy as np
import pandas as pd

df = df.copy()

# Ensure column exists
# if "records_hierarchy" not in df.columns:
#     df["records_hierarchy"] = np.nan

# Work ONLY on remaining rows
remaining = df["records_hierarchy"].isna()

is_elife = df["server_name"].astype(str).str.lower().eq("elife")
doi_l = df["doi"].astype(str).str.lower()

# ---------- Regex patterns ----------

# Base parent: 10.7554/elife.12523
RX_PARENT_BASE = re.compile(    r"^10\.7554/elife\.\d+$",    re.IGNORECASE)

# Explicit parent v1 (NO padding)
RX_PARENT_V1 = re.compile(    r"^10\.7554/elife\.\d+\.1$",    re.IGNORECASE)

# Any dotted numeric suffix (captures padding too)
RX_ANY_SUFFIX = re.compile(r"^10\.7554/elife\.\d+\.(\d+)$", re.IGNORECASE)

# ---------- Apply rules ----------

# Parent: base DOI
mask_parent_base = remaining & is_elife & doi_l.str.match(RX_PARENT_BASE, na=False)
df.loc[mask_parent_base, "records_hierarchy"] = "parent"

# Parent: explicit ".1" ONLY
mask_parent_v1 = remaining & is_elife & doi_l.str.match(RX_PARENT_V1, na=False)
df.loc[mask_parent_v1, "records_hierarchy"] = "parent_duplicate"

# Child: any numeric suffix EXCEPT exact ".1"
suffix = doi_l.str.extract(RX_ANY_SUFFIX)[0]

mask_child = (
    remaining
    & is_elife
    & suffix.notna()
    & (suffix != "1")   # excludes .1 but keeps .001, .002, .2, etc.
)

df.loc[mask_child, "records_hierarchy"] = "version"
print(df["records_hierarchy"].value_counts(dropna=False))

records_hierarchy
parent              7853725
review               175443
part_of              149462
version              117553
NaN                  103877
publish_version        9351
correction              355
comment                 325
parent_duplicate          3
Name: count, dtype: int64


In [60]:
df_remain = df[df['records_hierarchy'].isna()]
df_remain['server_name'].value_counts()

server_name
ChemRxiv                                26513
Earth and Space Science Open Archive    13089
EGUsphere                               10209
TechRxiv                                 8320
PsyArXiv                                 6632
PeerJ Preprints                          6446
EarthArXiv                               6425
Authorea Inc.                            6092
engrXiv                                  4925
SocArXiv                                 3072
INA-Rxiv                                 2830
Cambridge Open Engage                    2089
Advance                                  1717
Thesis Commons                           1507
APSA Preprints                           1101
AgriRxiv                                  787
EdArXiv                                   550
Law Archive                               287
Arabixiv                                  209
MetaArXiv                                 196
ScienceOpen Preprints                     151
LIS Scholarship Archiv

In [61]:
df_remain[df_remain['server_name']=='eLife']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
9130897,crossref::10.7554/elife,eLife,crossref,10.7554/elife,https://doi.org/10.7554/elife,https://elifesciences.org/,eLife,,,2017-07-25,,,,,,,False,,,,,,https://elifesciences.org/,10.7554/elife,,,,,,


In [62]:
# pattern = "10.7554/elife.99997"

# mask = df['doi'].str.contains(pattern, regex=False, na=False)
# result = df[mask]
# result

# df = df[df.records_hiearchy.isin(['publish_version', 'version'])]



#### ChemRxiv

In [63]:
df_remain[df_remain['server_name']=='ChemRxiv']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
286137,crossref::10.26434/chemrxiv-2021-00kkd,ChemRxiv,crossref,10.26434/chemrxiv-2021-00kkd,https://doi.org/10.26434/chemrxiv-2021-00kkd,https://chemrxiv.org/engage/chemrxiv/article-d...,Understanding MOF nucleation from solution wit...,"Kollias, Loukas; Rousseau, Roger; Glezakou, Va...",2021.0,2021-12-23,,,,,,,false,,,,,,https://chemrxiv.org/engage/chemrxiv/article-d...,10.26434/chemrxiv-2021-00kkd,,,,,,
312423,crossref::10.26434/chemrxiv-2021-00rj4,ChemRxiv,crossref,10.26434/chemrxiv-2021-00rj4,https://doi.org/10.26434/chemrxiv-2021-00rj4,https://chemrxiv.org/engage/chemrxiv/article-d...,Electrochemical Ozone Generation Using Compact...,"Wood, Georgia; Terrero Rodriguez, Irina; Tully...",2021.0,2022-01-25,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,10.1149/1945-7111/ac3ff4,,,true,,,,,,https://chemrxiv.org/engage/chemrxiv/article-d...,10.26434/chemrxiv-2021-00rj4,,,,,,
312367,crossref::10.26434/chemrxiv-2021-012c7,ChemRxiv,crossref,10.26434/chemrxiv-2021-012c7,https://doi.org/10.26434/chemrxiv-2021-012c7,https://chemrxiv.org/engage/chemrxiv/article-d...,Surface modification of carbon dots with tetra...,"Sviridova, Elizaveta; Barras, Alexandre; Plotn...",2021.0,2022-01-25,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,10.1016/j.msec.2022.112697,,,true,,,,,,https://chemrxiv.org/engage/chemrxiv/article-d...,10.26434/chemrxiv-2021-012c7,,,,,,
286132,crossref::10.26434/chemrxiv-2021-01dfq,ChemRxiv,crossref,10.26434/chemrxiv-2021-01dfq,https://doi.org/10.26434/chemrxiv-2021-01dfq,https://chemrxiv.org/engage/chemrxiv/article-d...,On the Role of Solvent in the Formation of Vac...,"Marinova, Veselina; Wood, Geoffrey P. F.; Marz...",2021.0,2021-12-23,,,,,,,false,,,,,,https://chemrxiv.org/engage/chemrxiv/article-d...,10.26434/chemrxiv-2021-01dfq,,,,,,
312472,crossref::10.26434/chemrxiv-2021-01hrg,ChemRxiv,crossref,10.26434/chemrxiv-2021-01hrg,https://doi.org/10.26434/chemrxiv-2021-01hrg,https://chemrxiv.org/engage/chemrxiv/article-d...,Visible Light-driven Metal-free C–H Functional...,"Kersting, Lena; Kuhn, Leah; Anokhin, Maksim; S...",2021.0,2021-12-21,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,10.1002/cptc.202200109,,,true,,,,,,https://chemrxiv.org/engage/chemrxiv/article-d...,10.26434/chemrxiv-2021-01hrg,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
285987,crossref::10.26434/chemrxiv.5917351,ChemRxiv,crossref,10.26434/chemrxiv.5917351,https://doi.org/10.26434/chemrxiv.5917351,https://chemrxiv.org/articles/CO2_Activation_o...,CO2 Activation on Heterostructures of Bi2O3-Na...,"Nolan, Michael",2018.0,2018-02-23,,,,,,,false,,,,,,https://chemrxiv.org/articles/co2_activation_o...,10.26434/chemrxiv.5917351,,,,,,
287405,crossref::10.26434/chemrxiv.6483989,ChemRxiv,crossref,10.26434/chemrxiv.6483989,https://doi.org/10.26434/chemrxiv.6483989,https://chemrxiv.org/engage/chemrxiv/article-d...,Highly sensitive gating in pH-responsive nanoc...,"Lopez, Luis G.; Nap, Rikkert J.",2018.0,2018-06-12,,,,,,,false,,,,,,https://chemrxiv.org/engage/chemrxiv/article-d...,10.26434/chemrxiv.6483989,,,,,,
287403,crossref::10.26434/chemrxiv.8021102,ChemRxiv,crossref,10.26434/chemrxiv.8021102,https://doi.org/10.26434/chemrxiv.8021102,https://chemrxiv.org/engage/chemrxiv/article-d...,Potassium Speciation and Distribution for the ...,"Knutsson, Pavleta; Maric, Jelena; Knutsson, Je...",2019.0,2019-04-23,,,,,,,false,,,,,,https://chemrxiv.org/engage/chemrxiv/article-d...,10.26434/chemrxiv.8021102,,,,,,
287404,crossref::10.26434/chemrxiv.8378342,ChemRxiv,crossref,10.26434/chemrxiv.8378342,https://doi.org/10.26434/chemrxiv.8378342,https://chemrxiv.org/engage/chemrxiv/article-d...,Outlier Modeling in Gear Bearing Using Autoenc...,"Singh, Sunny; Shiv, Praneet; Ahmed, Atif",2019.0,2019-07-01,,,,,,,false,,,,,,https://chemrxiv.org/engage/chemrxiv/article-d...,10.26434/chemrxiv.8378342,,,,,,


In [64]:
pattern = "v"

mask = df_remain[df_remain['server_name']=='ChemRxiv']['doi'].str.contains(pattern, regex=False, na=False)
result = df_remain[df_remain['server_name']=='ChemRxiv'][mask]
result

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
286137,crossref::10.26434/chemrxiv-2021-00kkd,ChemRxiv,crossref,10.26434/chemrxiv-2021-00kkd,https://doi.org/10.26434/chemrxiv-2021-00kkd,https://chemrxiv.org/engage/chemrxiv/article-d...,Understanding MOF nucleation from solution wit...,"Kollias, Loukas; Rousseau, Roger; Glezakou, Va...",2021.0,2021-12-23,,,,,,,false,,,,,,https://chemrxiv.org/engage/chemrxiv/article-d...,10.26434/chemrxiv-2021-00kkd,,,,,,
312423,crossref::10.26434/chemrxiv-2021-00rj4,ChemRxiv,crossref,10.26434/chemrxiv-2021-00rj4,https://doi.org/10.26434/chemrxiv-2021-00rj4,https://chemrxiv.org/engage/chemrxiv/article-d...,Electrochemical Ozone Generation Using Compact...,"Wood, Georgia; Terrero Rodriguez, Irina; Tully...",2021.0,2022-01-25,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,10.1149/1945-7111/ac3ff4,,,true,,,,,,https://chemrxiv.org/engage/chemrxiv/article-d...,10.26434/chemrxiv-2021-00rj4,,,,,,
312367,crossref::10.26434/chemrxiv-2021-012c7,ChemRxiv,crossref,10.26434/chemrxiv-2021-012c7,https://doi.org/10.26434/chemrxiv-2021-012c7,https://chemrxiv.org/engage/chemrxiv/article-d...,Surface modification of carbon dots with tetra...,"Sviridova, Elizaveta; Barras, Alexandre; Plotn...",2021.0,2022-01-25,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,10.1016/j.msec.2022.112697,,,true,,,,,,https://chemrxiv.org/engage/chemrxiv/article-d...,10.26434/chemrxiv-2021-012c7,,,,,,
286132,crossref::10.26434/chemrxiv-2021-01dfq,ChemRxiv,crossref,10.26434/chemrxiv-2021-01dfq,https://doi.org/10.26434/chemrxiv-2021-01dfq,https://chemrxiv.org/engage/chemrxiv/article-d...,On the Role of Solvent in the Formation of Vac...,"Marinova, Veselina; Wood, Geoffrey P. F.; Marz...",2021.0,2021-12-23,,,,,,,false,,,,,,https://chemrxiv.org/engage/chemrxiv/article-d...,10.26434/chemrxiv-2021-01dfq,,,,,,
312472,crossref::10.26434/chemrxiv-2021-01hrg,ChemRxiv,crossref,10.26434/chemrxiv-2021-01hrg,https://doi.org/10.26434/chemrxiv-2021-01hrg,https://chemrxiv.org/engage/chemrxiv/article-d...,Visible Light-driven Metal-free C–H Functional...,"Kersting, Lena; Kuhn, Leah; Anokhin, Maksim; S...",2021.0,2021-12-21,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,10.1002/cptc.202200109,,,true,,,,,,https://chemrxiv.org/engage/chemrxiv/article-d...,10.26434/chemrxiv-2021-01hrg,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
285987,crossref::10.26434/chemrxiv.5917351,ChemRxiv,crossref,10.26434/chemrxiv.5917351,https://doi.org/10.26434/chemrxiv.5917351,https://chemrxiv.org/articles/CO2_Activation_o...,CO2 Activation on Heterostructures of Bi2O3-Na...,"Nolan, Michael",2018.0,2018-02-23,,,,,,,false,,,,,,https://chemrxiv.org/articles/co2_activation_o...,10.26434/chemrxiv.5917351,,,,,,
287405,crossref::10.26434/chemrxiv.6483989,ChemRxiv,crossref,10.26434/chemrxiv.6483989,https://doi.org/10.26434/chemrxiv.6483989,https://chemrxiv.org/engage/chemrxiv/article-d...,Highly sensitive gating in pH-responsive nanoc...,"Lopez, Luis G.; Nap, Rikkert J.",2018.0,2018-06-12,,,,,,,false,,,,,,https://chemrxiv.org/engage/chemrxiv/article-d...,10.26434/chemrxiv.6483989,,,,,,
287403,crossref::10.26434/chemrxiv.8021102,ChemRxiv,crossref,10.26434/chemrxiv.8021102,https://doi.org/10.26434/chemrxiv.8021102,https://chemrxiv.org/engage/chemrxiv/article-d...,Potassium Speciation and Distribution for the ...,"Knutsson, Pavleta; Maric, Jelena; Knutsson, Je...",2019.0,2019-04-23,,,,,,,false,,,,,,https://chemrxiv.org/engage/chemrxiv/article-d...,10.26434/chemrxiv.8021102,,,,,,
287404,crossref::10.26434/chemrxiv.8378342,ChemRxiv,crossref,10.26434/chemrxiv.8378342,https://doi.org/10.26434/chemrxiv.8378342,https://chemrxiv.org/engage/chemrxiv/article-d...,Outlier Modeling in Gear Bearing Using Autoenc...,"Singh, Sunny; Shiv, Praneet; Ahmed, Atif",2019.0,2019-07-01,,,,,,,false,,,,,,https://chemrxiv.org/engage/chemrxiv/article-d...,10.26434/chemrxiv.8378342,,,,,,


In [65]:
df = df.copy()

# Work only on remaining (not already forced to parent)
mask_remain = df["records_hierarchy"] != "parent"

# Normalize version_label once
server_name = df["server_name"].astype(str).str.strip()#.str.lower()

# CHILD rules
child_mask = (
    mask_remain &
    server_name.isin(["ChemRxiv"])
)
df.loc[child_mask, "records_hierarchy"] = "parent"

print(df["records_hierarchy"].value_counts(dropna=False))

records_hierarchy
parent              7893182
review               175443
part_of              149462
version              104609
NaN                   77364
publish_version        9351
correction              355
comment                 325
parent_duplicate          3
Name: count, dtype: int64


In [66]:
df_remain = df[df['records_hierarchy'].isna()]
df_remain['server_name'].value_counts()

server_name
Earth and Space Science Open Archive    13089
EGUsphere                               10209
TechRxiv                                 8320
PsyArXiv                                 6632
PeerJ Preprints                          6446
EarthArXiv                               6425
Authorea Inc.                            6092
engrXiv                                  4925
SocArXiv                                 3072
INA-Rxiv                                 2830
Cambridge Open Engage                    2089
Advance                                  1717
Thesis Commons                           1507
APSA Preprints                           1101
AgriRxiv                                  787
EdArXiv                                   550
Law Archive                               287
Arabixiv                                  209
MetaArXiv                                 196
ScienceOpen Preprints                     151
LIS Scholarship Archive                    85
PoolText              

#### OSF-based servers

In [67]:
df_remain[df_remain['server_name']=='TechRxiv']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
5693929,crossref::10.36227/techrxiv.10002041,TechRxiv,crossref,10.36227/techrxiv.10002041,https://doi.org/10.36227/techrxiv.10002041,https://www.techrxiv.org/articles/Comment_on_C...,Comment on ‘ Comment on ‘Enhancing the securit...,"Ding, Yuan",2019.0,2019-10-18,,,,,,,false,,,,,,https://www.techrxiv.org/articles/comment_on_c...,10.36227/techrxiv.10002041,,,,,,
5693935,crossref::10.36227/techrxiv.10002782,TechRxiv,crossref,10.36227/techrxiv.10002782,https://doi.org/10.36227/techrxiv.10002782,https://www.techrxiv.org/articles/Novel_Energi...,Novel Energisation Method for Offshore Wind Fa...,"Saborío-Romano, Oscar; Bidadfar, Ali; Sakamuri...",2019.0,2019-10-20,,,,,,,false,,,,,,https://www.techrxiv.org/articles/novel_energi...,10.36227/techrxiv.10002782,,,,,,
5693930,crossref::10.36227/techrxiv.10005770,TechRxiv,crossref,10.36227/techrxiv.10005770,https://doi.org/10.36227/techrxiv.10005770,https://www.techrxiv.org/articles/Modified_SHE...,Modified SHE for Grid Connection.pdf,"Santra, Subhendu Bikash",2019.0,2019-10-20,,,,,,,false,,,,,,https://www.techrxiv.org/articles/modified_she...,10.36227/techrxiv.10005770,,,,,,
5693933,crossref::10.36227/techrxiv.10007051,TechRxiv,crossref,10.36227/techrxiv.10007051,https://doi.org/10.36227/techrxiv.10007051,https://www.techrxiv.org/articles/Can_Frequenc...,Can Frequency Diverse Array Prevent Wireless E...,"Ding, Yuan; Narbudowicz, Adam",2019.0,2019-10-21,,,,,,,false,,,,,,https://www.techrxiv.org/articles/can_frequenc...,10.36227/techrxiv.10007051,,,,,,
5693932,crossref::10.36227/techrxiv.10008968,TechRxiv,crossref,10.36227/techrxiv.10008968,https://doi.org/10.36227/techrxiv.10008968,https://www.techrxiv.org/articles/Online_param...,Online parameter identification of synchronous...,"Alves, Erick; Noeland, Jonas; Marafioti, Gianc...",2019.0,2019-10-21,,,,,,,false,,,,,,https://www.techrxiv.org/articles/online_param...,10.36227/techrxiv.10008968,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5702124,crossref::10.36227/techrxiv.24750039,TechRxiv,crossref,10.36227/techrxiv.24750039,https://doi.org/10.36227/techrxiv.24750039,https://www.techrxiv.org/articles/preprint/Opt...,Optimizing Complex CPQ Software Systems for Qu...,"Alexander, Thomson",2023.0,2023-12-07,,,,,,,false,,,,,,https://www.techrxiv.org/articles/preprint/opt...,10.36227/techrxiv.24750039,,,,,,
5702125,crossref::10.36227/techrxiv.24751587,TechRxiv,crossref,10.36227/techrxiv.24751587,https://doi.org/10.36227/techrxiv.24751587,https://www.techrxiv.org/articles/preprint/Fre...,Frequency Diverse Array With Discrete Fourier ...,"Wang, Kai; Yu, Zichuan; Jin, Zhiyuan; Zhong, F...",2023.0,2023-12-07,,,,,,,false,,,,,,https://www.techrxiv.org/articles/preprint/fre...,10.36227/techrxiv.24751587,,,,,,
5702126,crossref::10.36227/techrxiv.24751989,TechRxiv,crossref,10.36227/techrxiv.24751989,https://doi.org/10.36227/techrxiv.24751989,https://www.techrxiv.org/articles/preprint/Qua...,Quantum Computing based Channel and Signa...,"Farouk, Ahmed; AbuAli, Najah Abed; Mumtaz, Shahid",2023.0,2023-12-07,,,,,,,false,,,,,,https://www.techrxiv.org/articles/preprint/qua...,10.36227/techrxiv.24751989,,,,,,
5717544,crossref::10.36227/techrxiv.28424396,TechRxiv,crossref,10.36227/techrxiv.28424396,https://doi.org/10.36227/techrxiv.28424396,https://techrxiv.figshare.com/articles/preprin...,Lightweight Online Real-time ASR,"Kalhoro, Meer; Nadir, Masab",2025.0,2025-02-17,,,,,,,false,,,,,,https://techrxiv.figshare.com/articles/preprin...,10.36227/techrxiv.28424396,,,,,,


In [68]:
pattern = "10.31234/osf.io/zypk9"

mask = df['doi'].str.contains(pattern, regex=False, na=False)
result = df[mask]
result

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
2391992,crossref::10.31234/osf.io/zypk9,PsyArXiv,crossref,10.31234/osf.io/zypk9,https://doi.org/10.31234/osf.io/zypk9,https://osf.io/zypk9,The Opposition of Surprisal and Semantic Simil...,"Sun, Kun; Nixon, Jessie S.",2020.0,2020-12-10,,,,,,,False,,,,,,https://osf.io/zypk9,10.31234/osf.io/zypk9,,,,,,
2398751,crossref::10.31234/osf.io/zypk9_v1,PsyArXiv,crossref,10.31234/osf.io/zypk9_v1,https://doi.org/10.31234/osf.io/zypk9_v1,https://osf.io/zypk9_v1,WITHDRAWN,,2020.0,2025-05-19,,,,,,,False,,,,,parent,https://osf.io/zypk9_v1,10.31234/osf.io/zypk9_v1,_v1,explicit_version,1.0,,,


In [69]:
pattern = "10.36227/techrxiv.24750039"

mask = df['doi'].str.contains(pattern, regex=False, na=False)
result = df[mask]
result

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
5702124,crossref::10.36227/techrxiv.24750039,TechRxiv,crossref,10.36227/techrxiv.24750039,https://doi.org/10.36227/techrxiv.24750039,https://www.techrxiv.org/articles/preprint/Opt...,Optimizing Complex CPQ Software Systems for Qu...,"Alexander, Thomson",2023.0,2023-12-07,,,,,,,False,,,,,,https://www.techrxiv.org/articles/preprint/opt...,10.36227/techrxiv.24750039,,,,,,
5709343,crossref::10.36227/techrxiv.24750039.v1,TechRxiv,crossref,10.36227/techrxiv.24750039.v1,https://doi.org/10.36227/techrxiv.24750039.v1,https://www.techrxiv.org/doi/full/10.36227/tec...,Optimizing Complex CPQ Software Systems for Qu...,"Alexander, Thomson",2023.0,2023-12-07,,,,,,,False,,,,,parent,https://www.techrxiv.org/doi/full/10.36227/tec...,10.36227/techrxiv.24750039.v1,.v1,explicit_version,1.0,,,


In [70]:
import re
import numpy as np

# Ensure column exists
if 'records_hierarchy' not in df.columns:
    df['records_hierarchy'] = pd.NA

# Work only on remaining (unlabeled)
remain = df['records_hierarchy'].isna()

# ----------------------------
# 1) OSF-based servers: parent if DOI is exactly osf.io/<5chars>
#    Examples:
#      10.31234/osf.io/zypk9  -> parent
#      10.31234/osf.io/zypk9_v1  -> NOT parent by this rule
# ----------------------------
OSF_SERVERS = {
    'PsyArXiv',
    'Thesis Commons',
    'SocArXiv',
    'OSF Preprints',
    'Open Science Framework',
    'MindRxiv',
    'MetaArXiv',
    'SportRxiv',
    'LawArXiv',
    'EarthArXiv',
    'EngrXiv',
    'MarXiv',
    'INA-Rxiv',
    'AfricArXiv',
    'AgriXiv',
    'Arabixiv',
    # add/remove servers you want to include
}

# matches "...osf.io/ABCDE" at end of DOI string
OSF_PARENT_RX = re.compile(r'osf\.io/[a-z0-9]{5}$', re.IGNORECASE)

mask_osf_parent = (
    remain
    # & df['server_name'].isin(OSF_SERVERS)
    & df['doi'].astype(str).str.lower().str.contains('osf.io/', na=False)
    & df['doi'].astype(str).str.match(r'.*osf\.io/[a-z0-9]{5}$', na=False)
)

df.loc[mask_osf_parent, 'records_hierarchy'] = 'parent'


# ----------------------------
# 2) TechRxiv: parent if DOI is exactly "techrxiv.<digits>"
#    Examples:
#      10.36227/techrxiv.24750039 -> parent
#      10.36227/techrxiv.24750039.v1 -> NOT parent by this rule
# ----------------------------
TECHRXIV_PARENT_RX = re.compile(r'techrxiv\.\d+$', re.IGNORECASE)

mask_techrxiv_parent = (
    df['records_hierarchy'].isna()
    & (df['server_name'] == 'TechRxiv')
    & df['doi'].astype(str).str.match(r'.*techrxiv\.\d+$', na=False)
)

df.loc[mask_techrxiv_parent, 'records_hierarchy'] = 'parent'


# ----------------------------
# Optional: if you want to label obvious "child" for OSF when suffix exists
# (only do this if you are confident the suffix means versioning)
# Example: osf.io/xxxxx_v2 or osf.io/xxxxx-v2 or osf.io/xxxxx.v2
# ----------------------------
OSF_CHILD_RX = re.compile(r'osf\.io/[a-z0-9]{5}([._-]?v\d+)$', re.IGNORECASE)

mask_osf_child = (
    df['records_hierarchy'].isna()
    & df['server_name'].isin(OSF_SERVERS)
    & df['doi'].astype(str).str.match(r'.*osf\.io/[a-z0-9]{5}([._-]?v\d+)$', na=False)
)

df.loc[mask_osf_child, 'records_hierarchy'] = 'version'


# ----------------------------
# Optional: TechRxiv child rule if you see explicit versioning later
# Example: techrxiv.24750039.v2 or techrxiv.24750039-v2
# ----------------------------
mask_techrxiv_child = (
    df['records_hierarchy'].isna()
    & (df['server_name'] == 'TechRxiv')
    & df['doi'].astype(str).str.match(r'.*techrxiv\.\d+([._-]?v\d+)$', na=False)
)

df.loc[mask_techrxiv_child, 'records_hierarchy'] = 'version'


# Quick check
print(df['records_hierarchy'].value_counts(dropna=False))


records_hierarchy
parent              7921678
review               175443
part_of              149462
version              104609
NaN                   48868
publish_version        9351
correction              355
comment                 325
parent_duplicate          3
Name: count, dtype: int64


In [71]:
df_remain = df[df['records_hierarchy'].isna()]
df_remain['server_name'].value_counts()

server_name
Earth and Space Science Open Archive    13089
EGUsphere                               10209
PeerJ Preprints                          6446
Authorea Inc.                            6092
EarthArXiv                               4693
engrXiv                                  2733
Cambridge Open Engage                    2089
Advance                                  1717
APSA Preprints                           1101
AgriRxiv                                  380
ScienceOpen Preprints                     151
PoolText                                   79
Oroboros Instruments                       70
F1000Research                              14
Open Research Africa                        1
AMRC Open Research                          1
MNI Open Research                           1
Gates Open Research                         1
eLife                                       1
Name: count, dtype: int64

In [72]:
pattern = "10.31234/osf.io/zypk9"

mask = df['doi'].str.contains(pattern, regex=False, na=False)
result = df[mask]
result

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
2391992,crossref::10.31234/osf.io/zypk9,PsyArXiv,crossref,10.31234/osf.io/zypk9,https://doi.org/10.31234/osf.io/zypk9,https://osf.io/zypk9,The Opposition of Surprisal and Semantic Simil...,"Sun, Kun; Nixon, Jessie S.",2020.0,2020-12-10,,,,,,,False,,,,,parent,https://osf.io/zypk9,10.31234/osf.io/zypk9,,,,,,
2398751,crossref::10.31234/osf.io/zypk9_v1,PsyArXiv,crossref,10.31234/osf.io/zypk9_v1,https://doi.org/10.31234/osf.io/zypk9_v1,https://osf.io/zypk9_v1,WITHDRAWN,,2020.0,2025-05-19,,,,,,,False,,,,,parent,https://osf.io/zypk9_v1,10.31234/osf.io/zypk9_v1,_v1,explicit_version,1.0,,,


In [73]:
pattern = "10.36227/techrxiv.24750039"

mask = df['doi'].str.contains(pattern, regex=False, na=False)
result = df[mask]
result

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
5702124,crossref::10.36227/techrxiv.24750039,TechRxiv,crossref,10.36227/techrxiv.24750039,https://doi.org/10.36227/techrxiv.24750039,https://www.techrxiv.org/articles/preprint/Opt...,Optimizing Complex CPQ Software Systems for Qu...,"Alexander, Thomson",2023.0,2023-12-07,,,,,,,False,,,,,parent,https://www.techrxiv.org/articles/preprint/opt...,10.36227/techrxiv.24750039,,,,,,
5709343,crossref::10.36227/techrxiv.24750039.v1,TechRxiv,crossref,10.36227/techrxiv.24750039.v1,https://doi.org/10.36227/techrxiv.24750039.v1,https://www.techrxiv.org/doi/full/10.36227/tec...,Optimizing Complex CPQ Software Systems for Qu...,"Alexander, Thomson",2023.0,2023-12-07,,,,,,,False,,,,,parent,https://www.techrxiv.org/doi/full/10.36227/tec...,10.36227/techrxiv.24750039.v1,.v1,explicit_version,1.0,,,


#### EarthArXiv, Authorea Inc., Cambridge Open Engage

In [74]:
import re
import pandas as pd

# Ensure target column exists
# if 'records_hierarchy' not in df.columns:
#     df['records_hierarchy'] = pd.NA

remain = df['records_hierarchy'].isna()

doi_s = df['doi'].astype(str).str.strip().str.lower()

# -------------------------
# EarthArXiv: 10.15697/<token>
# Examples: 10.15697/fk20022, 10.15697/fk2v59g
# -------------------------
mask_eartharxiv_parent = (
    remain
    & (df['server_name'] == 'EarthArXiv')
    & doi_s.str.match(r'^10\.15697/[a-z0-9]+$', na=False)
)
df.loc[mask_eartharxiv_parent, 'records_hierarchy'] = 'parent'

# -------------------------
# Authorea: 10.22541/au.<digits>.<digits>
# Examples: 10.22541/au.148928287.78038962
# -------------------------
mask_authorea_parent = (
    df['records_hierarchy'].isna()
    & (df['server_name'] == 'Authorea Inc.')
    & doi_s.str.match(r'^10\.22541/au\.\d+\.\d+$', na=False)
)
df.loc[mask_authorea_parent, 'records_hierarchy'] = 'parent'

# -------------------------
# The Winnower: 10.15200/winn.<digits>.<digits>
# Examples: 10.15200/winn.143644.45920
# -------------------------
mask_winn_parent = (
    df['records_hierarchy'].isna()
    & doi_s.str.match(r'^10\.15200/winn\.\d+\.\d+$', na=False)
)
df.loc[mask_winn_parent, 'records_hierarchy'] = 'parent'

# -------------------------
# engrXiv: 10.31224/<digits>
# Examples: 10.31224/2109
# -------------------------
mask_engrxiv_parent = (
    df['records_hierarchy'].isna()
    & (df['server_name'] == 'engrXiv')
    & doi_s.str.match(r'^10\.31224/\d+$', na=False)
)
df.loc[mask_engrxiv_parent, 'records_hierarchy'] = 'parent'

# -------------------------
# Cambridge Open Engage: 10.33774/coe-<year or token>-<token>
# Examples: 10.33774/coe-2025-zz7tp, 10.33774/coe-2020-03prm, 10.33774/coe-xxxx-xxxxx
# -------------------------
mask_coe_parent = (
    df['records_hierarchy'].isna()
    & (df['server_name'] == 'Cambridge Open Engage')
    & doi_s.str.match(r'^10\.33774/coe-[a-z0-9]{4}-[a-z0-9]{4,}$', na=False)
)
df.loc[mask_coe_parent, 'records_hierarchy'] = 'parent'

# Optional quick check
print(df['records_hierarchy'].value_counts(dropna=False))


records_hierarchy
parent              7932594
review               175443
part_of              149462
version              104609
NaN                   37952
publish_version        9351
correction              355
comment                 325
parent_duplicate          3
Name: count, dtype: int64


#### Advance, APSA Preprints, AgriRxiv, EGUsphere

In [75]:
import pandas as pd

# Ensure column exists
# if 'records_hierarchy' not in df.columns:
#     df['records_hierarchy'] = pd.NA

remain = df['records_hierarchy'].isna()
doi_s = df['doi'].astype(str).str.strip().str.lower()

# -------------------------
# Advance: 10.31124/advance.<digits>
# Examples: 10.31124/advance.10005662, 10.31124/advance.9978302
# -------------------------
mask_advance_parent = (
    remain
    & (df['server_name'] == 'Advance')
    & doi_s.str.match(r'^10\.31124/advance\.\d+$', na=False)
)
df.loc[mask_advance_parent, 'records_hierarchy'] = 'parent'

# -------------------------
# APSA Preprints: 10.33774/apsa-<yyyy>-<token>
# Examples: 10.33774/apsa-2025-zqggn, 10.33774/apsa-2019-0b2dl
# -------------------------
mask_apsa_parent = (
    df['records_hierarchy'].isna()
    & (df['server_name'] == 'APSA Preprints')
    & doi_s.str.match(r'^10\.33774/apsa-\d{4}-[a-z0-9]+$', na=False)
)
df.loc[mask_apsa_parent, 'records_hierarchy'] = 'parent'

# -------------------------
# AgriRxiv: 10.31220/agrirxiv.<yyyy>.<5digits>
# Examples: 10.31220/agrirxiv.2020.00001, 10.31220/agrirxiv.2025.00384
# -------------------------
mask_agrirxiv_parent = (
    df['records_hierarchy'].isna()
    & (df['server_name'] == 'AgriRxiv')
    & doi_s.str.match(r'^10\.31220/agrirxiv\.\d{4}\.\d{5}$', na=False)
)
df.loc[mask_agrirxiv_parent, 'records_hierarchy'] = 'parent'

# -------------------------
# EGUsphere (and EGUsphere-style journal DOIs): 10.5194/<slug>-<yyyy>-<number>
# Examples:
#   10.5194/egusphere-2022-1
#   10.5194/egusphere-2022-1000
#   10.5194/amt-2022-295
#   10.5194/hess-2024-3989
# -------------------------
mask_egusphere_parent = (
    df['records_hierarchy'].isna()
    & (df['server_name'] == 'EGUsphere')
    & doi_s.str.match(r'^10\.5194/[a-z0-9]+-\d{4}-\d+$', na=False)
)
df.loc[mask_egusphere_parent, 'records_hierarchy'] = 'parent'

# Optional quick check
print(df['records_hierarchy'].value_counts(dropna=False))

records_hierarchy
parent              7946001
review               175443
part_of              149462
version              104609
NaN                   24545
publish_version        9351
correction              355
comment                 325
parent_duplicate          3
Name: count, dtype: int64


#### Oroboros Instruments, PoolText, ScienceOpen Preprints, EarthArXiv

In [76]:
import pandas as pd

# Ensure column exists
# if 'records_hierarchy' not in df.columns:
#     df['records_hierarchy'] = pd.NA

remain = df['records_hierarchy'].isna()
doi_s = df['doi'].astype(str).str.strip().str.lower()

# ------------------------------------------------------------
# Oroboros Instruments (examples)
# - 10.26124/bec.2024-0007
# - 10.26124/mitofit:ea19.mipschool.0005
# Parent if EXACT pattern with nothing after.
# ------------------------------------------------------------
mask_oroboros_parent = (
    remain
    & (df['server_name'] == 'Oroboros Instruments')
    & doi_s.str.match(
        r'^10\.26124/('
        r'bec\.\d{4}-\d{4}'                       # bec.2024-0007
        r'|mitofit:[a-z0-9]+\.[a-z0-9]+\.\d{4}'   # mitofit:ea19.mipschool.0005
        r')$',
        na=False
    )
)
df.loc[mask_oroboros_parent, 'records_hierarchy'] = 'parent'

# ------------------------------------------------------------
# Oroboros Instruments — Parent DOI patterns
# Examples (parents):
# - 10.26124/bec.2025-0005ar
# - 10.26124/bec.2025-0005it
# - 10.26124/bec:2024-0001
# - 10.26124/becprep.2025-0006.ed2
# - 10.26124/becprep.2025-0005
# - 10.26124/mitofit:190001
# - 10.26124/mitofit:2021-0005
# ------------------------------------------------------------
OROBOROS_PARENT_RX = r'^10\.26124/(' \
    r'bec[.:]\d{4}-\d{4}[a-z]{0,3}' \
    r'|' \
    r'becprep\.\d{4}-\d{4}(?:\.[a-z0-9]{1,6})?' \
    r'|' \
    r'mitofit:\d{6}' \
    r'|' \
    r'mitofit:\d{4}-\d{4}' \
    r')$'

mask_oroboros_parent = (
    remain
    & (df['server_name'] == 'Oroboros Instruments')
    & doi_s.str.match(OROBOROS_PARENT_RX, na=False)
)

df.loc[mask_oroboros_parent, 'records_hierarchy'] = 'parent'

# ------------------------------------------------------------
# PoolText (examples)
# - 10.31923/5547-4288-0095
# - 10.31923/pooltext-preprint-0067-3907-0053
# Parent if EXACT pattern with nothing after.
# ------------------------------------------------------------
mask_pooltext_parent = (
    df['records_hierarchy'].isna()
    & (df['server_name'] == 'PoolText')
    & doi_s.str.match(
        r'^10\.31923/('
        r'\d{4}-\d{4}-\d{4}'                                  # 5547-4288-0095
        r'|pooltext-preprint-\d{4}-\d{4}-\d{4}'               # pooltext-preprint-0067-3907-0053
        r')$',
        na=False
    )
)
df.loc[mask_pooltext_parent, 'records_hierarchy'] = 'parent'


# ------------------------------------------------------------
# ScienceOpen Preprints — Parent DOI patterns (STRICT / exact)
# Parent if EXACT pattern with nothing after.
# ------------------------------------------------------------

SCIENCEOPEN_PARENT_RX = (
    r'^10\.14293/('
    # 1) Old pattern like: s2199-1006.1.sor-.sdg.01
    r's2199-1006\.1\.sor-\.[a-z0-9]+\.\d{2}'
    r'|'
    # 2) New s2199-r2om patterns:
    #    - s2199-r2om-0001
    #    - s2199-r2om-abs-0003
    r's2199-r2om-(?:abs-)?\d{4}'
    r'|'
    # 3) New "rexpo" compact pattern:
    #    - s2199-rexpo22011v1
    r's2199-rexpo\d{5}v\d+'
    r'|'
    # 4) New ssp-am patterns:
    #    - s2199-ssp-am22-0001
    #    - s2199-ssp-am23-01001
    #    - s2199-ssp-am25-01015
    r's2199-ssp-am\d{2}-\d{4,5}'
    r'|'
    # 5) sblunisa patterns:
    #    - sblunisa.2023a024.mm (old)
    #    - sblunisa.2023a002.vnm (new)
    #    - sblunisa.2023a017.ojjt (new)
    r'sblunisa\.\d{4}a\d{3}\.[a-z0-9]{2,4}'
    r')$'
)

mask_scienceopen_parent = (
    df['records_hierarchy'].isna()
    & (df['server_name'] == 'ScienceOpen Preprints')
    & doi_s.str.match(SCIENCEOPEN_PARENT_RX, na=False)
)

df.loc[mask_scienceopen_parent, 'records_hierarchy'] = 'parent'


# ------------------------------------------------------------
# EarthArXiv
# Examples:
# - 10.31223/x50025  (5)
# - 10.31223/x5003j  (6)
# - 10.31223/x5zr0p  (6)
# Parent if EXACT pattern with nothing after.
# ------------------------------------------------------------
mask_eartharxiv_parent = (
    df['records_hierarchy'].isna()
    & (df['server_name'] == 'EarthArXiv')
    & doi_s.str.match(r'^10\.31223/[a-z0-9]{5,6}$', na=False)   # 5–6 chars after slash
)
df.loc[mask_eartharxiv_parent, 'records_hierarchy'] = 'parent'


# Optional quick check
print(df['records_hierarchy'].value_counts(dropna=False))

records_hierarchy
parent              7950992
review               175443
part_of              149462
version              104609
NaN                   19554
publish_version        9351
correction              355
comment                 325
parent_duplicate          3
Name: count, dtype: int64


#### PeerJ Preprints

In [77]:
import re

# ------------------------------------------------------------
# PeerJ Preprints
# Parent:
#  - no trailing vN: 10.7287/peerj.preprints.1001
#  - trailing v1:    10.7287/peerj.preprints.1001v1
# Child:
#  - trailing v2+:   10.7287/peerj.preprints.1001v2, v3, ...
# ------------------------------------------------------------

doi_s = df['doi'].astype(str).str.strip().str.lower()

mask_peerj = (
    df['records_hierarchy'].isna()
    & (df['server_name'] == 'PeerJ Preprints')
    & doi_s.str.startswith('10.7287/peerj.preprints.', na=False)
)

# Extract trailing version number if present (v1, v2, ...)
peerj_v_str = doi_s.where(mask_peerj).str.extract(r'v(?P<v>\d+)$')['v']

# Convert safely to numeric (NaN stays NaN)
peerj_v_num = pd.to_numeric(peerj_v_str, errors='coerce')

# Parent: no version suffix OR v1
mask_peerj_parent = mask_peerj & (peerj_v_num.isna() | (peerj_v_num == 1))
df.loc[mask_peerj_parent, 'records_hierarchy'] = 'parent'

# Child: v2+
mask_peerj_child = mask_peerj & (peerj_v_num >= 2)
df.loc[mask_peerj_child, 'records_hierarchy'] = 'version'

# Optional quick check
print(df['records_hierarchy'].value_counts(dropna=False))

records_hierarchy
parent              7956063
review               175443
part_of              149462
version              105984
NaN                   13108
publish_version        9351
correction              355
comment                 325
parent_duplicate          3
Name: count, dtype: int64


#### Earth and Space Science Open Archive

In [78]:
import pandas as pd

# -------------------------------------------------------------------
# Normalize DOI strings:
# - ensure everything is string
# - remove leading/trailing spaces
# - lowercase for consistent matching
# -------------------------------------------------------------------
doi_s = df['doi'].astype(str).str.strip().str.lower()

# -------------------------------------------------------------------
# Identify records to process:
# - records_hierarchy not yet assigned
# - server is Earth and Space Science Open Archive (ESSOAr)
# -------------------------------------------------------------------
mask_essoar = (
    df['records_hierarchy'].isna()
    & (df['server_name'] == 'Earth and Space Science Open Archive')
)

# -------------------------------------------------------------------
# Extract the final numeric version from the DOI
#
# Examples:
#   10.1002/essoar.10500074.1  → n = 1
#   10.1002/essoar.10500061.2  → n = 2
#   10.22541/essoar.xxx.xxx.3  → n = 3
#
# Regex explanation:
#   \.        → literal dot
#   (?P<n>)  → capture group named "n"
#   \d+      → one or more digits
#   $        → end of string (must be the last segment)
# -------------------------------------------------------------------
essoar_n_str = (
    doi_s
    .where(mask_essoar)                # only evaluate ESSOAr rows
    .str.extract(r'\.(?P<n>\d+)$')['n']  # extract trailing version number
)

# Convert extracted version to numeric:
# - invalid or missing values become NaN (safe for comparisons)
essoar_n = pd.to_numeric(essoar_n_str, errors='coerce')

# -------------------------------------------------------------------
# Label parent records:
# - ESSOAr records
# - version suffix == ".1"
# -------------------------------------------------------------------
mask_essoar_parent = mask_essoar & (essoar_n == 1)
df.loc[mask_essoar_parent, 'records_hierarchy'] = 'parent'

# -------------------------------------------------------------------
# Label child records:
# - ESSOAr records
# - version suffix >= ".2"
# -------------------------------------------------------------------
mask_essoar_child = mask_essoar & (essoar_n >= 2)
df.loc[mask_essoar_child, 'records_hierarchy'] = 'child'

# -------------------------------------------------------------------
# Optional sanity check:
# Show distribution of hierarchy labels
# -------------------------------------------------------------------
print(df['records_hierarchy'].value_counts(dropna=False))


records_hierarchy
parent              7967124
review               175443
part_of              149462
version              105984
publish_version        9351
child                  2028
correction              355
comment                 325
NaN                      19
parent_duplicate          3
Name: count, dtype: int64


In [79]:
df.loc[df['server_name'].eq('Earth and Space Science Open Archive'), 'records_hierarchy'].value_counts(dropna=False)

records_hierarchy
parent             20047
child               2028
version              824
publish_version       29
part_of                4
Name: count, dtype: int64

#### F1000Research

In [80]:
df.loc[df['server_name'].eq('F1000Research'), 'records_hierarchy'].value_counts(dropna=False)


records_hierarchy
parent     11155
version     5704
NaN           14
Name: count, dtype: int64

In [81]:
df_remain = df[df['records_hierarchy'].isna()]
df_remain['server_name'].value_counts()

server_name
F1000Research           14
Open Research Africa     1
AMRC Open Research       1
Gates Open Research      1
MNI Open Research        1
eLife                    1
Name: count, dtype: int64

In [82]:
df_remain[df_remain['server_name']=='F1000Research']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
587381,crossref::10.12688/f1000research,F1000Research,crossref,10.12688/f1000research,https://doi.org/10.12688/f1000research,http://www.f1000research.com,F1000Research,,,2025-07-24,,,,,,,False,,,,,,http://www.f1000research.com,10.12688/f1000research,,,,,,
588652,crossref::10.12688/f1000research.11198.1,F1000Research,crossref,10.12688/f1000research.11198.1,https://doi.org/10.12688/f1000research.11198.1,https://f1000research.com/articles/6-1014,New perspectives on the regulation of type II ...,"Becerra-Diaz, Mireya",2017.0,2025-11-28,,,,,,,False,,,,,,https://f1000research.com/articles/6-1014,10.12688/f1000research.11198.1,,,,,,
580538,crossref::10.12688/f1000research.124059.1,F1000Research,crossref,10.12688/f1000research.124059.1,https://doi.org/10.12688/f1000research.124059.1,https://f1000research.com/articles/11-1230/,LiftoffTools: a toolkit for comparing gene ann...,"Shumate, Alaina; Salzberg, Steven",2022.0,2022-10-28,,,,,,,False,,,,,,https://f1000research.com/articles/11-1230/,10.12688/f1000research.124059.1,,,,,,
588651,crossref::10.12688/f1000research.14416.1,F1000Research,crossref,10.12688/f1000research.14416.1,https://doi.org/10.12688/f1000research.14416.1,https://f1000research.com/articles/7-1362,Recent advances in the understanding and manag...,"Hay, Ashley",2018.0,2025-11-28,,,,,,,False,,,,,,https://f1000research.com/articles/7-1362,10.12688/f1000research.14416.1,,,,,,
588733,crossref::10.12688/f1000research.163729.1,F1000Research,crossref,10.12688/f1000research.163729.1,https://doi.org/10.12688/f1000research.163729.1,https://f1000research.com/articles/14-656,Autoimmune Patient Health Through a Flourishin...,"Pasca Rina, Amherstia",2025.0,2025-12-01,,,,,,,False,,,,,,https://f1000research.com/articles/14-656,10.12688/f1000research.163729.1,,,,,,
588738,crossref::10.12688/f1000research.166247.1,F1000Research,crossref,10.12688/f1000research.166247.1,https://doi.org/10.12688/f1000research.166247.1,https://f1000research.com/articles/14-598,Anticipated Growth in Healthcare Spending: The...,"Rugchatjaroen, Krish",2025.0,2025-12-01,,,,,,,False,,,,,,https://f1000research.com/articles/14-598,10.12688/f1000research.166247.1,,,,,,
588744,crossref::10.12688/f1000research.168206.1,F1000Research,crossref,10.12688/f1000research.168206.1,https://doi.org/10.12688/f1000research.168206.1,https://f1000research.com/articles/14-975,Scientific Productivity and Population Health:...,"Altamimi, Omar",2025.0,2025-12-01,,,,,,,False,,,,,,https://f1000research.com/articles/14-975,10.12688/f1000research.168206.1,,,,,,
572211,crossref::10.3410/10.3410/f1000devtestarticley,F1000Research,crossref,10.3410/10.3410/f1000devtestarticley,https://doi.org/10.3410/10.3410/f1000devtestar...,http://www.xy.net/article,someTitle,"abcde, XXXX",2012.0,2012-06-11,,,,,,,False,,,,,,http://www.xy.net/article,10.3410/10.3410/f1000devtestarticley,,,,,,
572499,crossref::10.3410/123.430,F1000Research,crossref,10.3410/123.430,https://doi.org/10.3410/123.430,http://www.someURl.com,someTitle,"someName, someName",2009.0,2012-09-13,,,,,,,False,,,,,,http://www.someurl.com,10.3410/123.430,,,,,,
572210,crossref::10.3410/f1000devtestarticlez,F1000Research,crossref,10.3410/f1000devtestarticlez,https://doi.org/10.3410/f1000devtestarticlez,http://researchdev.f1000.com/articles/F1000Dev...,F1000DevTestArticleZ,"LNameZ, firstnameZ",2012.0,2012-06-07,,,,,,,False,,,,,,http://researchdev.f1000.com/articles/f1000dev...,10.3410/f1000devtestarticlez,,,,,,


In [83]:
df_remain[df_remain['server_name']=='Open Research Africa']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
2121895,crossref::10.12688/aasopenres,Open Research Africa,crossref,10.12688/aasopenres,https://doi.org/10.12688/aasopenres,http://www.aasopenresearch.org,AAS Open Research,,,2022-03-25,,,,,,,False,,,,,,http://www.aasopenresearch.org,10.12688/aasopenres,,,,,,


In [84]:
df_remain[df_remain['server_name']=='AMRC Open Research']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
178,crossref::10.12688/amrcopenres,AMRC Open Research,crossref,10.12688/amrcopenres,https://doi.org/10.12688/amrcopenres,http://www.amrcopenresearch.org,AMRC Open Research,,,2019-02-19,,,,,,,False,,,,,,http://www.amrcopenresearch.org,10.12688/amrcopenres,,,,,,


In [85]:
df_remain[df_remain['server_name']=='MNI Open Research']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
2013305,crossref::10.12688/mniopenres,MNI Open Research,crossref,10.12688/mniopenres,https://doi.org/10.12688/mniopenres,http://www.mniopenresearch.org,MNI Open Research,,,2021-06-21,,,,,,,False,,,,,,http://www.mniopenresearch.org,10.12688/mniopenres,,,,,,


In [86]:
df_remain[df_remain['server_name']=='Gates Open Research']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
590205,crossref::10.12688/gatesopenres,Gates Open Research,crossref,10.12688/gatesopenres,https://doi.org/10.12688/gatesopenres,http://www.gatesopenresearch.org,Gates Open Research,,,2017-11-06,,,,,,,False,,,,,,http://www.gatesopenresearch.org,10.12688/gatesopenres,,,,,,


In [87]:
df_remain[df_remain['server_name']=='eLife']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
9130897,crossref::10.7554/elife,eLife,crossref,10.7554/elife,https://doi.org/10.7554/elife,https://elifesciences.org/,eLife,,,2017-07-25,,,,,,,False,,,,,,https://elifesciences.org/,10.7554/elife,,,,,,


In [88]:
import pandas as pd

# Normalize DOI strings:
# - lower case
# - strip spaces
# - collapse accidental double slashes after the prefix (10.3410// -> 10.3410/)
doi_s = (
    df["doi"]
    .astype(str)
    .str.strip()
    .str.lower()
    .str.replace(r"^10\.3410//", "10.3410/", regex=True)
)

# ------------------------------------------------------------
# F1000Research
# We support TWO DOI "families" that exist in your data:
#
# A) 10.12688/f1000research[.<id>.<N>]
#    - root: 10.12688/f1000research            -> parent
#    - versioned: 10.12688/f1000research.11198.1 -> parent
#                 10.12688/f1000research.11198.2 -> child
#
# B) 10.3410/f1000research.<something>.vN
#    - 10.3410/f1000research.1-29.v1 -> parent
#    - 10.3410/f1000research.1-29.v2 -> child
#    - 10.3410/f1000research.2-72.v1 -> parent
# ------------------------------------------------------------
mask_f1000 = (
    df["records_hierarchy"].isna()
    & (df["server_name"] == "F1000Research")
)

# -------------------------
# Case A: 10.12688 root DOI
# -------------------------
mask_f1000_root = mask_f1000 & doi_s.eq("10.12688/f1000research")
df.loc[mask_f1000_root, "records_hierarchy"] = "parent"

# -----------------------------------------------
# Case A2: 10.12688/f1000research.<digits>.<N>
# -----------------------------------------------
f1000_v12688_str = (
    doi_s.where(mask_f1000)
         .str.extract(r"^10\.12688/f1000research\.\d+\.(?P<v>\d+)$")["v"]
)
f1000_v12688 = pd.to_numeric(f1000_v12688_str, errors="coerce")

mask_f1000_12688_parent = mask_f1000 & (f1000_v12688 == 1)
df.loc[mask_f1000_12688_parent, "records_hierarchy"] = "parent"

mask_f1000_12688_child = mask_f1000 & (f1000_v12688 >= 2)
df.loc[mask_f1000_12688_child, "records_hierarchy"] = "child"

# -----------------------------------------
# Case B: 10.3410/f1000research.<...>.vN
# - accept things like: 1-29, 2-72, 1-50, etc.
# - also accept possible extra dots inside the middle part
# -----------------------------------------
f1000_v3410_str = (
    doi_s.where(mask_f1000)
         .str.extract(r"^10\.3410/f1000research\.[a-z0-9.\-]+\.v(?P<v>\d+)$")["v"]
)
f1000_v3410 = pd.to_numeric(f1000_v3410_str, errors="coerce")

mask_f1000_3410_parent = mask_f1000 & (f1000_v3410 == 1)
df.loc[mask_f1000_3410_parent, "records_hierarchy"] = "parent"

mask_f1000_3410_child = mask_f1000 & (f1000_v3410 >= 2)
df.loc[mask_f1000_3410_child, "records_hierarchy"] = "child"

# -------------------------
# Optional: remaining -> others
# -------------------------
mask_f1000_left = mask_f1000 & df["records_hierarchy"].isna()
df.loc[mask_f1000_left, "records_hierarchy"] = "others"

# Quick check
print(df.loc[df["server_name"] == "F1000Research", "records_hierarchy"].value_counts(dropna=False))


records_hierarchy
parent     11162
version     5704
others         7
Name: count, dtype: int64


In [89]:
# Normalize DOI (same style you already use)
doi_s = df["doi"].astype(str).str.strip().str.lower()

# Only touch rows not yet labeled
mask_unlabeled = df["records_hierarchy"].isna()

# Root DOIs you want to classify as "others"
ROOT_OTHERS_DOIS = {
    "10.7554/elife",
    "10.12688/gatesopenres",
    "10.12688/mniopenres",
    "10.12688/amrcopenres",
    "10.12688/aasopenres",
}

mask_root_others = mask_unlabeled & doi_s.isin(ROOT_OTHERS_DOIS)
df.loc[mask_root_others, "records_hierarchy"] = "others"


In [90]:

# -------------------------------------------------------------------
# Optional sanity check:
# Show distribution of hierarchy labels
# -------------------------------------------------------------------
print(df['records_hierarchy'].value_counts(dropna=False))

records_hierarchy
parent              7967131
review               175443
part_of              149462
version              105984
publish_version        9351
child                  2028
correction              355
comment                 325
others                   12
parent_duplicate          3
Name: count, dtype: int64


In [91]:
pattern = "others"


mask = df['records_hierarchy'].str.contains(pattern, regex=False, na=False)
result = df[mask]
result

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
2121895,crossref::10.12688/aasopenres,Open Research Africa,crossref,10.12688/aasopenres,https://doi.org/10.12688/aasopenres,http://www.aasopenresearch.org,AAS Open Research,,,2022-03-25,,,,,,,False,,,,,others,http://www.aasopenresearch.org,10.12688/aasopenres,,,,,,
178,crossref::10.12688/amrcopenres,AMRC Open Research,crossref,10.12688/amrcopenres,https://doi.org/10.12688/amrcopenres,http://www.amrcopenresearch.org,AMRC Open Research,,,2019-02-19,,,,,,,False,,,,,others,http://www.amrcopenresearch.org,10.12688/amrcopenres,,,,,,
590205,crossref::10.12688/gatesopenres,Gates Open Research,crossref,10.12688/gatesopenres,https://doi.org/10.12688/gatesopenres,http://www.gatesopenresearch.org,Gates Open Research,,,2017-11-06,,,,,,,False,,,,,others,http://www.gatesopenresearch.org,10.12688/gatesopenres,,,,,,
2013305,crossref::10.12688/mniopenres,MNI Open Research,crossref,10.12688/mniopenres,https://doi.org/10.12688/mniopenres,http://www.mniopenresearch.org,MNI Open Research,,,2021-06-21,,,,,,,False,,,,,others,http://www.mniopenresearch.org,10.12688/mniopenres,,,,,,
572211,crossref::10.3410/10.3410/f1000devtestarticley,F1000Research,crossref,10.3410/10.3410/f1000devtestarticley,https://doi.org/10.3410/10.3410/f1000devtestar...,http://www.xy.net/article,someTitle,"abcde, XXXX",2012.0,2012-06-11,,,,,,,False,,,,,others,http://www.xy.net/article,10.3410/10.3410/f1000devtestarticley,,,,,,
572499,crossref::10.3410/123.430,F1000Research,crossref,10.3410/123.430,https://doi.org/10.3410/123.430,http://www.someURl.com,someTitle,"someName, someName",2009.0,2012-09-13,,,,,,,False,,,,,others,http://www.someurl.com,10.3410/123.430,,,,,,
572210,crossref::10.3410/f1000devtestarticlez,F1000Research,crossref,10.3410/f1000devtestarticlez,https://doi.org/10.3410/f1000devtestarticlez,http://researchdev.f1000.com/articles/F1000Dev...,F1000DevTestArticleZ,"LNameZ, firstnameZ",2012.0,2012-06-07,,,,,,,False,,,,,others,http://researchdev.f1000.com/articles/f1000dev...,10.3410/f1000devtestarticlez,,,,,,
572209,crossref::10.3410/f1000devtestcrossmarkpolicy,F1000Research,crossref,10.3410/f1000devtestcrossmarkpolicy,https://doi.org/10.3410/f1000devtestcrossmarkp...,resource,F1000DevTestCrossMarkPolicy,"Manager, Policy",2012.0,2012-05-31,,,,,,,False,,,,,others,resource,10.3410/f1000devtestcrossmarkpolicy,,,,,,
572212,crossref::10.3410/f1000res,F1000Research,crossref,10.3410/f1000res,https://doi.org/10.3410/f1000res,2046-1402,F1000 Research,,,2013-03-05,,,,,,,False,,,,,others,2046-1402,10.3410/f1000res,,,,,,
572214,crossref::10.3410/f1000research,F1000Research,crossref,10.3410/f1000research,https://doi.org/10.3410/f1000research,http://www.f1000research.com/,F1000Research,,,2013-05-09,,,,,,,False,,,,,others,http://www.f1000research.com/,10.3410/f1000research,,,,,,


In [92]:
df_remain = df[df['records_hierarchy'].isna()]
df_remain['server_name'].value_counts()

Series([], Name: count, dtype: int64)

In [93]:
# df_remain[df_remain['server_name']=='AgriRxiv']['landing_page_url'][207416]

In [94]:
# pattern = "others"


# mask = df[df['server_name']=='F1000Research']['records_hierarchy'].str.contains(pattern, regex=False, na=False)
# result = df[df['server_name']=='F1000Research'][mask]
# result

In [95]:
# df_remain['landing_page_url'][286254]

In [96]:
# pattern = ".1"


# mask = df[df['server_name']=='eLife']['doi'].str.contains(pattern, regex=False, na=False)
# result = df[df['server_name']=='eLife'][mask]
# result

In [97]:
# result['landing_page_url'][9366338]

In [98]:
# result['landing_page_url'][9297794] 

In [99]:
# pattern = "10.7287/peerj.preprints.999"
# #video #media 

# mask = df['doi'].str.contains(pattern, regex=False, na=False)
# result = df[mask]
# result

In [100]:
# df_remain['relations_json'].value_counts()

In [101]:
# df_remain[df_remain['relations_json']=='{"is-preprint-of": [{"asserted-by": "subject", "id": "10.31237/osf.io/yr86k", "id-type": "doi"}]}']

## Check duplicates in secondary preprint repositories

### get list of accros server

In [102]:
import pandas as pd

# ============================================================
# Goal
# ============================================================
# Read the "rules" Google Sheet and extract a list of servers
# that are marked as primary sources.
#
# In the sheet:
# - Column "Field_server_name" contains the server name
# - Column "primary_source" contains "yes" for primary servers
#
# Output:
# - unique_servers: Python list of server names marked as primary
# ============================================================


# ============================================================
# 1) Read the Google Sheet tab as CSV
# ============================================================
# Google Sheets can be exported as CSV if the sheet is public
# (or shared with link access).
# You need:
# - SHEET_ID: the spreadsheet ID
# - GID: the tab id (worksheet id)
SHEET_ID = "10_7FdcpZjntqFsEHIii7bAM72uF__of_iUohSD5w8w4"
GID = "1230415212"  # tab gid for the rules sheet

# Build the CSV export URL
rules_csv_url = f"https://docs.google.com/spreadsheets/d/{SHEET_ID}/export?format=csv&gid={GID}"

# Read the rules table into a DataFrame
rules = pd.read_csv(rules_csv_url)


# ============================================================
# 2) Normalize column names
# ============================================================
# Sheets sometimes contain extra spaces/newlines in column headers.
# This normalizes them to reduce "column not found" errors.
rules.columns = (
    rules.columns.astype(str)
    .str.replace(r"\s+", " ", regex=True)  # collapse multiple spaces/newlines into 1 space
    .str.strip()                           # remove leading/trailing spaces
)


# ============================================================
# 3) Define which columns we rely on
# ============================================================
# SERVER_COL: server name column
# FLAG_COL: column marking primary servers ("yes")
SERVER_COL = "Field_server_name"
FLAG_COL = "primary_source"


# ============================================================
# 4) Safety check: make sure expected columns exist
# ============================================================
# If the sheet changes (renamed columns, etc.), we fail early with a helpful message.
if SERVER_COL not in rules.columns or FLAG_COL not in rules.columns:
    print("Columns available in the sheet:", rules.columns.tolist())
    raise KeyError(f"Expected columns not found. Need: {SERVER_COL!r} and {FLAG_COL!r}")


# ============================================================
# 5) Extract primary servers (where primary_source == "yes")
# ============================================================
# Steps:
# - normalize the flag column to text
# - strip spaces, lowercase
# - keep rows where value == "yes"
# - take the server names
# - drop missing names
# - strip spaces
# - keep unique values
primary_servers = (
    rules.loc[
        rules[FLAG_COL].astype(str).str.strip().str.lower().eq("yes"),
        SERVER_COL
    ]
    .dropna()                 # remove missing server names
    .astype(str)
    .str.strip()              # normalize server name text
    .unique()                 # keep distinct values only
    .tolist()                 # convert numpy array to normal Python list
)


# ============================================================
# 6) Quick preview
# ============================================================
print(f"Primary servers found: {len(primary_servers)}")
print(primary_servers[:30])  # show first 30 as a preview


Primary servers found: 73
['arXiv', 'SSRN', 'Research Square', 'bioRxiv', 'Preprints.org', 'Open Science Framework', 'Zenodo', 'medRxiv', 'Authorea Inc.', 'PsyArXiv', 'ChemRxiv', 'JMIR Preprints', 'AgEcon Search', 'TechRxiv', 'SocArXiv', 'Earth and Space Science Open Archive', 'INA-Rxiv', 'eLife', 'EarthArXiv', 'PeerJ Preprints', 'Nature Precedings', 'engrXiv', 'Advance', 'Thesis Commons', 'Cambridge Open Engage', 'EcoEvoRxiv', 'EdArXiv', 'ScienceOpen Preprints', 'Law Archive', 'APSA Preprints']


In [103]:
set(primary_servers[:5])

{'Preprints.org', 'Research Square', 'SSRN', 'arXiv', 'bioRxiv'}

### doi

In [104]:
df_mirror = df.copy()

In [105]:
df_touse = df_mirror[df_mirror['doi'].notna()].copy()
df_touse

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
429739,crossref::10.1002/essoar.10500000.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500000.1,https://doi.org/10.1002/essoar.10500000.1,https://essopenarchive.org/doi/full/10.1002/es...,Long-term monitoring of land surface phenologi...,"Tsutsumida, Narumasa",2018.0,2019-11-13,,,,,,,false,,,,,parent,https://essopenarchive.org/doi/full/10.1002/es...,10.1002/essoar.10500000.1,,,,,,
429742,crossref::10.1002/essoar.10500002.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500002.1,https://doi.org/10.1002/essoar.10500002.1,https://essopenarchive.org/doi/full/10.1002/es...,Impact of spatial scale for phenological indic...,"Tsutsumida, Narumasa; Kaduk, Jörg",2018.0,2019-11-13,,,,,,,false,,,,,parent,https://essopenarchive.org/doi/full/10.1002/es...,10.1002/essoar.10500002.1,,,,,,
429740,crossref::10.1002/essoar.10500004.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500004.1,https://doi.org/10.1002/essoar.10500004.1,https://essopenarchive.org/doi/full/10.1002/es...,Observations of Low Latitude Red Aurora in Mex...,"Gonzalez-Esparza, J. Americo; Cuevas-Cardona, ...",2018.0,2019-11-13,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,10.1029/2017sw001789;10.1029/2018sw001995,,,true,,,,,parent,https://essopenarchive.org/doi/full/10.1002/es...,10.1002/essoar.10500004.1,,,,,,
429741,crossref::10.1002/essoar.10500007.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500007.1,https://doi.org/10.1002/essoar.10500007.1,https://essopenarchive.org/doi/full/10.1002/es...,Pipeline oil fire detection with MODIS active ...,"Ogungbuyi, Michael Gbenga; Martinez, Peter; Ec...",2018.0,2019-11-13,,,,,,,false,,,,,parent,https://essopenarchive.org/doi/full/10.1002/es...,10.1002/essoar.10500007.1,,,,,,
429746,crossref::10.1002/essoar.10500009.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500009.1,https://doi.org/10.1002/essoar.10500009.1,https://essopenarchive.org/doi/full/10.1002/es...,Land Product Validation of MODIS Derived FPAR ...,"Sharp, Iain; Sanchez-Azofeifa, Arturo; Musilek...",2018.0,2019-12-03,,,,,,,false,,,,,parent,https://essopenarchive.org/doi/full/10.1002/es...,10.1002/essoar.10500009.1,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5667066,openalex::W999063011,Social Science Open Access Repository,openalex,10.15464/isi.42.2009.1-5,https://doi.org/10.15464/isi.42.2009.1-5,http://www.ssoar.info/ssoar/handle/document/21392,Jeder fünfte Erwerbstätige ist aus beruflichen...,Silvia Ruppenthal; Detlev Lück,2009.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,http://www.ssoar.info/ssoar/handle/document/21392,10.15464/isi.42.2009.1-5,,,,,,
2245001,openalex::W99945665,Organic Eprints,openalex,10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,Earthworm abundance and species richness: Cont...,Anna-Sophie Köhler; Julia Huber; Sebastian Wol...,2014.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://doi.org/10.3220/rep_20_1_2014,10.3220/rep_20_1_2014,,,,,,
5667173,openalex::W999508398,Social Science Open Access Repository,openalex,10.15464/isi.38.2007.1-5,https://doi.org/10.15464/isi.38.2007.1-5,http://www.ssoar.info/ssoar/handle/document/21635,Berufstätigkeit von Müttern bleibt kontrovers:...,Angelika Scheuer; Jörg Dittmann,2007.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,http://www.ssoar.info/ssoar/handle/document/21635,10.15464/isi.38.2007.1-5,,,,,,
5673122,openalex::W999509321,Social Science Open Access Repository,openalex,10.23668/psycharchives.13440,https://doi.org/10.23668/psycharchives.13440,http://www.ssoar.info/ssoar/handle/document/3368,"Rezension: Edward Timms (Hg.), Freud und das K...",Wolfgang Hegener,1997.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,http://www.ssoar.info/ssoar/handle/document/3368,10.23668/psycharchives.13440,,,,,,


In [106]:
dupes = df_touse[df_touse.duplicated(subset=['doi'], keep=False)]
dupes

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
8976641,crossref::10.1101/103937,bioRxiv,crossref,10.1101/103937,https://doi.org/10.1101/103937,http://biorxiv.org/lookup/doi/10.1101/103937,LET-99-dependent spatial restriction of active...,"Bouvrais, H.; Chesneau, L.; Pastezeur, S.; Del...",2017.0,2017-01-29,,,,,,,false,,,,,parent,http://biorxiv.org/lookup/doi/10.1101/103937,10.1101/103937,,,,,,
9073833,crossref::10.1101/2019.12.23.887166,bioRxiv,crossref,10.1101/2019.12.23.887166,https://doi.org/10.1101/2019.12.23.887166,http://biorxiv.org/lookup/doi/10.1101/2019.12....,Model balancing: in search of consistent metab...,"Liebermeister, Wolfram; Noor, Elad",2019.0,2019-12-24,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,10.3390/metabo11110749,,,true,,,,,parent,http://biorxiv.org/lookup/doi/10.1101/2019.12....,10.1101/2019.12.23.887166,,,,,,
8986959,crossref::10.1101/2020.01.22.915215,bioRxiv,crossref,10.1101/2020.01.22.915215,https://doi.org/10.1101/2020.01.22.915215,http://biorxiv.org/lookup/doi/10.1101/2020.01....,Not only compulsivity: The SAPAP3-KO mouse rec...,"Lamothe, H; Schreiweis, C; Lavielle, O; Mallet...",2020.0,2020-01-23,,,,,,,false,,,,,parent,http://biorxiv.org/lookup/doi/10.1101/2020.01....,10.1101/2020.01.22.915215,,,,,,
8981770,crossref::10.1101/2020.02.03.919597,bioRxiv,crossref,10.1101/2020.02.03.919597,https://doi.org/10.1101/2020.02.03.919597,http://biorxiv.org/lookup/doi/10.1101/2020.02....,Germline inherited small RNAs clear untranslat...,"Quarato, Piergiuseppe; Singh, Meetali; Cornes,...",2020.0,2020-02-04,,,,,,,false,,,,,parent,http://biorxiv.org/lookup/doi/10.1101/2020.02....,10.1101/2020.02.03.919597,,,,,,
8891397,crossref::10.1101/2020.06.08.118984,bioRxiv,crossref,10.1101/2020.06.08.118984,https://doi.org/10.1101/2020.06.08.118984,http://biorxiv.org/lookup/doi/10.1101/2020.06....,TBPL2/TFIIA complex establishes the maternal t...,"Yu, Changwei; Cvetesic, Nevena; Hisler, Vincen...",2020.0,2020-06-09,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,10.1038/s41467-020-20239-4,,,true,,,,,parent,http://biorxiv.org/lookup/doi/10.1101/2020.06....,10.1101/2020.06.08.118984,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9486788,openalex::W983039221,viXra,openalex,10.13140/rg.2.2.29266.79041,https://doi.org/10.13140/rg.2.2.29266.79041,http://www.vixra.org/pdf/1503.0004v1.pdf,Ionization chamber noise fluctuations during l...,A. D. Skorbun; M. V. Efetov; G. I. Odinokin; O...,2020.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,http://www.vixra.org/pdf/1503.0004v1.pdf,10.13140/rg.2.2.29266.79041,,,,,,
2806374,openalex::W985302891,RePEc: Research Papers in Economics,openalex,10.22004/ag.econ.133088,https://doi.org/10.22004/ag.econ.133088,https://ageconsearch.umn.edu/record/133088/fil...,Farmland price bubbles: wavelet-based evidence,Gabriel J. Power; Calum G. Turvey,2006.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://ageconsearch.umn.edu/record/133088/fil...,10.22004/ag.econ.133088,,,,,,
2817765,openalex::W986721525,RePEc: Research Papers in Economics,openalex,10.22004/ag.econ.196816,https://doi.org/10.22004/ag.econ.196816,https://ageconsearch.umn.edu/record/196816/fil...,Assessing the Value of Broadband Connectivity ...,Tyler Mark; Brian E. Whitacre; Terry Griffin,2015.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://ageconsearch.umn.edu/record/196816/fil...,10.22004/ag.econ.196816,,,,,,
9486875,openalex::W998010466,viXra,openalex,10.13140/rg.2.2.36331.69924,https://doi.org/10.13140/rg.2.2.36331.69924,https://vixra.org/pdf/1409.0185v1.pdf,"TGD view about homeopathy, water memory, and e...",Matti Pitkänen,2014.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://vixra.org/pdf/1409.0185v1.pdf,10.13140/rg.2.2.36331.69924,,,,,,


In [107]:
dupes['server_name'].value_counts()

server_name
RePEc: Research Papers in Economics                                     10494
arXiv                                                                    8236
AgEcon Search                                                            6540
HAL                                                                      4041
ResearchGate                                                             1633
DSpace@MIT                                                               1082
viXra                                                                     826
EconStor Preprints                                                        559
Munich Personal RePEc Archive                                             443
Zenodo                                                                    297
PhilSci-Archive                                                           161
IACR Cryptology ePrint Archive                                             91
Digital Access to Scholarship at Harvard (DASH) (Har

In [108]:
dupes[dupes['server_name']=='ResearchGate']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
3322080,datacite::10.13140/2.1.2328.2569,ResearchGate,datacite,10.13140/2.1.2328.2569,https://doi.org/10.13140/2.1.2328.2569,https://www.researchgate.net/doi/10.13140/2.1....,Dirac-like equation and the excited states of ...,"Silva, Paulo Roberto",2014.0,2014-08-19,[],,,,,,false,,,,"{""client"": {""data"": {""id"": ""rg.rg"", ""type"": ""c...",parent,https://www.researchgate.net/doi/10.13140/2.1....,10.13140/2.1.2328.2569,,,,,,
3322083,datacite::10.13140/2.1.2910.4001,ResearchGate,datacite,10.13140/2.1.2910.4001,https://doi.org/10.13140/2.1.2910.4001,https://www.researchgate.net/doi/10.13140/2.1....,"Polymer physics, the quantum harmonic oscillat...",P R Silva,2014.0,2014-08-27,[],,,,,,false,,,,"{""client"": {""data"": {""id"": ""rg.rg"", ""type"": ""c...",parent,https://www.researchgate.net/doi/10.13140/2.1....,10.13140/2.1.2910.4001,,,,,,
3322151,datacite::10.13140/rg.2.1.1007.9848,ResearchGate,datacite,10.13140/rg.2.1.1007.9848,https://doi.org/10.13140/rg.2.1.1007.9848,https://www.researchgate.net/doi/10.13140/RG.2...,classEx - an online software for classroom exp...,"Giamattei, Marcus; Lambsdorff, Johann Graf",2015.0,2015-07-20,[],,,,,,false,,,,"{""client"": {""data"": {""id"": ""rg.rg"", ""type"": ""c...",parent,https://www.researchgate.net/doi/10.13140/rg.2...,10.13140/rg.2.1.1007.9848,,,,,,
3322312,datacite::10.13140/rg.2.1.1103.1281,ResearchGate,datacite,10.13140/rg.2.1.1103.1281,https://doi.org/10.13140/rg.2.1.1103.1281,https://www.researchgate.net/doi/10.13140/RG.2...,A TOPOLOGICAL BRAIN ELUCIDATES SYNTACTIC AND S...,"Tozzi, Arturo; Peters, James F",2016.0,2016-08-09,[],,,,,,false,,,,"{""client"": {""data"": {""id"": ""rg.rg"", ""type"": ""c...",parent,https://www.researchgate.net/doi/10.13140/rg.2...,10.13140/rg.2.1.1103.1281,,,,,,
3322200,datacite::10.13140/rg.2.1.1103.9445,ResearchGate,datacite,10.13140/rg.2.1.1103.9445,https://doi.org/10.13140/rg.2.1.1103.9445,https://www.researchgate.net/doi/10.13140/RG.2...,TIME AS A GAUGE FIELD,"Tozzi, Arturo; Chafin, Clifford; Falco, Domeni...",2015.0,2015-10-13,[],,,,,,false,,,,"{""client"": {""data"": {""id"": ""rg.rg"", ""type"": ""c...",parent,https://www.researchgate.net/doi/10.13140/rg.2...,10.13140/rg.2.1.1103.9445,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3354289,datacite::10.13140/rg.2.2.36811.13607,ResearchGate,datacite,10.13140/rg.2.2.36811.13607,https://doi.org/10.13140/rg.2.2.36811.13607,https://www.researchgate.net/doi/10.13140/RG.2...,Concurrent Sourcing in Local Public Services: ...,"Porcher, Simon",2019.0,2019-12-13,[],,,,,,false,,,,"{""client"": {""data"": {""id"": ""rg.rg"", ""type"": ""c...",parent,https://www.researchgate.net/doi/10.13140/rg.2...,10.13140/rg.2.2.36811.13607,,,,,,
3332704,datacite::10.13140/rg.2.2.36825.57442,ResearchGate,datacite,10.13140/rg.2.2.36825.57442,https://doi.org/10.13140/rg.2.2.36825.57442,https://www.researchgate.net/doi/10.13140/RG.2...,The Effects of Education on Health: An Interge...,"Huebener, Mathias",2018.0,2018-09-18,[],,,,,,false,,,,"{""client"": {""data"": {""id"": ""rg.rg"", ""type"": ""c...",parent,https://www.researchgate.net/doi/10.13140/rg.2...,10.13140/rg.2.2.36825.57442,,,,,,
3350078,datacite::10.13140/rg.2.2.36829.77284,ResearchGate,datacite,10.13140/rg.2.2.36829.77284,https://doi.org/10.13140/rg.2.2.36829.77284,https://www.researchgate.net/doi/10.13140/RG.2...,Non-Euclidean metric using Geometric Algebra,"Sánchez, Jesús",2019.0,2019-09-20,[],,,,,,false,,,,"{""client"": {""data"": {""id"": ""rg.rg"", ""type"": ""c...",parent,https://www.researchgate.net/doi/10.13140/rg.2...,10.13140/rg.2.2.36829.77284,,,,,,
3463359,datacite::10.13140/rg.2.2.36831.37285,ResearchGate,datacite,10.13140/rg.2.2.36831.37285,https://doi.org/10.13140/rg.2.2.36831.37285,https://www.researchgate.net/doi/10.13140/RG.2...,"FERTILITY DYNAMICS IN SWEDEN, SPAIN, THE CZECH...","Soldan, Elena; Gagauz, Olga",2018.0,2025-01-08,[],,,,,,false,,,,"{""client"": {""data"": {""id"": ""rg.rg"", ""type"": ""c...",parent,https://www.researchgate.net/doi/10.13140/rg.2...,10.13140/rg.2.2.36831.37285,,,,,,


In [109]:
df[df['doi']=='10.13140/rg.2.2.36331.69924']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
3349993,datacite::10.13140/rg.2.2.36331.69924,ResearchGate,datacite,10.13140/rg.2.2.36331.69924,https://doi.org/10.13140/rg.2.2.36331.69924,https://www.researchgate.net/doi/10.13140/RG.2...,"TGD view about homeopathy, water memory, and e...",M Pitkänen,2014.0,2019-09-19,[],,,,,,False,,,,"{""client"": {""data"": {""id"": ""rg.rg"", ""type"": ""c...",parent,https://www.researchgate.net/doi/10.13140/rg.2...,10.13140/rg.2.2.36331.69924,,,,,,
9486875,openalex::W998010466,viXra,openalex,10.13140/rg.2.2.36331.69924,https://doi.org/10.13140/rg.2.2.36331.69924,https://vixra.org/pdf/1409.0185v1.pdf,"TGD view about homeopathy, water memory, and e...",Matti Pitkänen,2014.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://vixra.org/pdf/1409.0185v1.pdf,10.13140/rg.2.2.36331.69924,,,,,,


In [110]:
df[df['doi']=='10.22004/ag.econ.133088']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
102633,datacite::10.22004/ag.econ.133088,AgEcon Search,datacite,10.22004/ag.econ.133088,https://doi.org/10.22004/ag.econ.133088,https://ageconsearch.umn.edu/record/133088,Farmland price bubbles: wavelet-based evidence,"Power, Gabriel J.; Turvey, Calum G.",2006.0,2019-08-30,[],,,,,,False,,,,"{""client"": {""data"": {""id"": ""tind.agecon"", ""typ...",parent,https://ageconsearch.umn.edu/record/133088,10.22004/ag.econ.133088,,,,,,
2806374,openalex::W985302891,RePEc: Research Papers in Economics,openalex,10.22004/ag.econ.133088,https://doi.org/10.22004/ag.econ.133088,https://ageconsearch.umn.edu/record/133088/fil...,Farmland price bubbles: wavelet-based evidence,Gabriel J. Power; Calum G. Turvey,2006.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://ageconsearch.umn.edu/record/133088/fil...,10.22004/ag.econ.133088,,,,,,


In [111]:
df[df['doi']=='10.1101/2019.12.23.887166']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
9073833,crossref::10.1101/2019.12.23.887166,bioRxiv,crossref,10.1101/2019.12.23.887166,https://doi.org/10.1101/2019.12.23.887166,http://biorxiv.org/lookup/doi/10.1101/2019.12....,Model balancing: in search of consistent metab...,"Liebermeister, Wolfram; Noor, Elad",2019.0,2019-12-24,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,10.3390/metabo11110749,,,True,,,,,parent,http://biorxiv.org/lookup/doi/10.1101/2019.12....,10.1101/2019.12.23.887166,,,,,,
1555009,openalex::W2995005865,HAL,openalex,10.1101/2019.12.23.887166,https://doi.org/10.1101/2019.12.23.887166,https://hal.science/hal-02437604,Model balancing: consistent in-vivo kinetic co...,Wolfram Liebermeister,2019.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://hal.science/hal-02437604,10.1101/2019.12.23.887166,,,,,,


In [112]:
df[df['doi']=='10.1101/2020.02.03.919597']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
8981770,crossref::10.1101/2020.02.03.919597,bioRxiv,crossref,10.1101/2020.02.03.919597,https://doi.org/10.1101/2020.02.03.919597,http://biorxiv.org/lookup/doi/10.1101/2020.02....,Germline inherited small RNAs clear untranslat...,"Quarato, Piergiuseppe; Singh, Meetali; Cornes,...",2020.0,2020-02-04,,,,,,,False,,,,,parent,http://biorxiv.org/lookup/doi/10.1101/2020.02....,10.1101/2020.02.03.919597,,,,,,
1731345,openalex::W3004251285,HAL,openalex,10.1101/2020.02.03.919597,https://doi.org/10.1101/2020.02.03.919597,https://pasteur.hal.science/pasteur-02626442,Argonaute catalytic activity is required for m...,Piergiuseppe Quarato; Meetali Singh; Eric Corn...,2020.0,2020-02-07T00:00:00,,,,,,,,,,,,parent,https://pasteur.hal.science/pasteur-02626442,10.1101/2020.02.03.919597,,,,,,


In [113]:
df[df['doi']=='10.3220/rep_20_1_2014']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
2246907,openalex::W102989963,Organic Eprints,openalex,10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,PLANT BASED-DIVERSITY PRACTICES IN CONVENTIONA...,Marion Casagrande; Lionel Alletto; Christophe ...,2014.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://doi.org/10.3220/rep_20_1_2014,10.3220/rep_20_1_2014,,,,,,
2245787,openalex::W106941238,Organic Eprints,openalex,10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,ANIMAL MANURE – REDUCED QUALITY BY ANAEROBIC D...,Anne-Kristin Løes; Anders Johansen; Reidun Pom...,2014.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://doi.org/10.3220/rep_20_1_2014,10.3220/rep_20_1_2014,,,,,,
2247124,openalex::W111413393,Organic Eprints,openalex,10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,Do you like organic wine? Preferences of organ...,Meike Janßen; Katrin Zander,2014.0,2016-06-24T00:00:00,,,,,,,,,,,,parent,https://doi.org/10.3220/rep_20_1_2014,10.3220/rep_20_1_2014,,,,,,
2245890,openalex::W113337753,Organic Eprints,openalex,10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,Innovations in low input and organic dairy sup...,Fabio Tittarelli; Hanne Lakkenborg Kristensen;...,2014.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://doi.org/10.3220/rep_20_1_2014,10.3220/rep_20_1_2014,,,,,,
2251234,openalex::W25719276,Organic Eprints,openalex,10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,Soil quality changes in field trials comparing...,Andreas Fließbach; Verena Hammerl; Daniele Ant...,2014.0,2016-06-24T00:00:00,,,,,,,,,,,,parent,https://doi.org/10.3220/rep_20_1_2014,10.3220/rep_20_1_2014,,,,,,
2255867,openalex::W46898626,Organic Eprints,openalex,10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,THE PESTICIDAL POTENTIAL OF Alternanthera bran...,Olajumoke Oke Fayinminnu; Olawale Oreoluwa Shiro,2014.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://doi.org/10.3220/rep_20_1_2014,10.3220/rep_20_1_2014,,,,,,
2255967,openalex::W49926529,Organic Eprints,openalex,10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,Producers Continuing Versus Exiting From Organ...,Sonja Brodt; Karen Klonsky; Ron Strochlic; Lui...,2014.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://doi.org/10.3220/rep_20_1_2014,10.3220/rep_20_1_2014,,,,,,
2244068,openalex::W50328423,Organic Eprints,openalex,10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,A prototype tool for multicriteria assessments...,Hugo Fjelsted Alrøe; Egon Noe,2014.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://doi.org/10.3220/rep_20_1_2014,10.3220/rep_20_1_2014,,,,,,
2256029,openalex::W53160016,Organic Eprints,openalex,10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,Organic Eprints – helping research results go ...,Ilse A. Rasmussen; Allan Leck Jensen; Helga Wi...,2014.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://doi.org/10.3220/rep_20_1_2014,10.3220/rep_20_1_2014,,,,,,
2244962,openalex::W60459218,Organic Eprints,openalex,10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,PARTICIPATORY COTTON BREEDING AND CULTIVAR EVA...,Monika Messmer; Yogendra Shivas; Rajeev Verma;...,2014.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://doi.org/10.3220/rep_20_1_2014,10.3220/rep_20_1_2014,,,,,,


In [114]:
df[df['doi']=='10.48550/arxiv.0704.0324']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
6936795,datacite::10.48550/arxiv.0704.0324,arXiv,datacite,10.48550/arxiv.0704.0324,https://doi.org/10.48550/arxiv.0704.0324,https://arxiv.org/abs/0704.0324,On the pseudospectrum of elliptic quadratic di...,"Pravda-Starov, Karel",2007.0,2022-03-16,[],1.0,,,,,False,,,,"{""client"": {""data"": {""id"": ""arxiv.content"", ""t...",parent,https://arxiv.org/abs/0704.0324,10.48550/arxiv.0704.0324,,,,,,
640498,openalex::W2951243300,HAL,openalex,10.48550/arxiv.0704.0324,https://doi.org/10.48550/arxiv.0704.0324,https://hal.science/hal-00139490,On the pseudospectrum of elliptic quadratic di...,Karel Pravda‐Starov,2007.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://hal.science/hal-00139490,10.48550/arxiv.0704.0324,,,,,,


In [115]:
df[df['doi']=='10.13140/2.1.2910.4001']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
3322083,datacite::10.13140/2.1.2910.4001,ResearchGate,datacite,10.13140/2.1.2910.4001,https://doi.org/10.13140/2.1.2910.4001,https://www.researchgate.net/doi/10.13140/2.1....,"Polymer physics, the quantum harmonic oscillat...",P R Silva,2014.0,2014-08-27,[],,,,,,False,,,,"{""client"": {""data"": {""id"": ""rg.rg"", ""type"": ""c...",parent,https://www.researchgate.net/doi/10.13140/2.1....,10.13140/2.1.2910.4001,,,,,,
9465428,openalex::W135550700,viXra,openalex,10.13140/2.1.2910.4001,https://doi.org/10.13140/2.1.2910.4001,https://vixra.org/pdf/1402.0082v1.pdf,"Polymer physics, the quantum harmonic oscillat...",P. R. Silva,2014.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://vixra.org/pdf/1402.0082v1.pdf,10.13140/2.1.2910.4001,,,,,,


In [116]:
df['records_hierarchy'].value_counts()

records_hierarchy
parent              7967131
review               175443
part_of              149462
version              105984
publish_version        9351
child                  2028
correction              355
comment                 325
others                   12
parent_duplicate          3
Name: count, dtype: int64

#### function

In [117]:
import pandas as pd

# -----------------------------
# 0) Define primary servers
# -----------------------------
# PRIMARY_SERVERS = {
#     "arXiv",
#     "bioRxiv",
#     "medRxiv",
#     "SSRN",
#     "TechRxiv",
#     "Wellcome Open Research",
#     "AgriRxiv",
#     "EarthArXiv",
#     "Law Archive",
#     "SocArXiv",
#     "Thesis Commons",
#     "Research Square",
#     "Open Research Europe",
#     "Oroboros Instruments",
#     "ResearchGate",
#     "AgEcon Search",
#     "Zenodo",
#     "Open Science Framework",
#     "Humanities Commons CORE",
#     "CERN document server",
#     "eLife",
# }

PRIMARY_SERVERS = set(primary_servers)  
# -----------------------------
# 1) Normalize DOI
# -----------------------------
doi_norm = (
    df_mirror["doi"]
    .astype("string")
    .str.strip()
    .str.lower()
    .replace({"": pd.NA, "none": pd.NA, "nan": pd.NA, "null": pd.NA})
)

# -----------------------------
# 2) Flags
# -----------------------------
is_primary = df_mirror["server_name"].isin(PRIMARY_SERVERS)
is_dup = doi_norm.notna() & doi_norm.duplicated(keep=False)

# -----------------------------
# 3) Identify DOI → primary server mapping (ONLY ONCE)
# -----------------------------
primary_by_doi = (
    df_mirror.loc[is_primary & is_dup, ["server_name"]]
    .assign(doi=doi_norm[is_primary & is_dup])
    .dropna(subset=["doi"])
    .groupby("doi")["server_name"]
    .first()   # take first primary (fast & deterministic)
)

# -----------------------------
# 4) Mark mirror rows
# -----------------------------
mask_mirror = is_dup & ~is_primary & doi_norm.isin(primary_by_doi.index)

df_mirror.loc[mask_mirror, "records_hierarchy"] = (
    "mirror (" + doi_norm[mask_mirror].map(primary_by_doi) + ")"
)


# primary_origin = doi_norm[mask_mirror].map(primary_by_doi).fillna("unknown").astype(str)

# df_mirror.loc[mask_mirror, "records_hierarchy"] = "mirror (" + primary_origin + ")"

# -----------------------------
# 5) Sanity check
# -----------------------------
print("Mirror rows:", mask_mirror.sum())
print(df_mirror["records_hierarchy"].value_counts(dropna=False).head(20))


Mirror rows: 16840
records_hierarchy
parent                              7950306
review                               175443
part_of                              149454
version                              105977
publish_version                        9351
mirror (arXiv)                         8240
mirror (AgEcon Search)                 6541
child                                  2028
mirror (ResearchGate)                  1633
correction                              355
comment                                 325
mirror (Zenodo)                         297
mirror (Open Science Framework)          31
mirror (SSRN)                            30
mirror (bioRxiv)                         29
mirror (Humanities Commons CORE)         24
others                                   12
parent_duplicate                          3
mirror (eLife)                            3
mirror (CERN document server)             2
Name: count, dtype: int64


In [118]:
df_mirror['records_hierarchy'].value_counts()

records_hierarchy
parent                              7950306
review                               175443
part_of                              149454
version                              105977
publish_version                        9351
mirror (arXiv)                         8240
mirror (AgEcon Search)                 6541
child                                  2028
mirror (ResearchGate)                  1633
correction                              355
comment                                 325
mirror (Zenodo)                         297
mirror (Open Science Framework)          31
mirror (SSRN)                            30
mirror (bioRxiv)                         29
mirror (Humanities Commons CORE)         24
others                                   12
parent_duplicate                          3
mirror (eLife)                            3
mirror (CERN document server)             2
mirror (Research Square)                  2
mirror (AgriRxiv)                         1
mirror (EarthA

In [119]:
dupes[dupes['server_name']=='EconStor Preprints']#.tail(60)

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
495549,openalex::W107243416,EconStor Preprints,openalex,10.1007/s10273-008-0850-2,https://doi.org/10.1007/s10273-008-0850-2,http://hdl.handle.net/10419/42949,Am Vorabend des Gesundheitsfonds: Kaum ein Pro...,Klaus Jacobs,2008.0,2016-06-24T00:00:00,,,,,,,,,,,,parent,http://hdl.handle.net/10419/42949,10.1007/s10273-008-0850-2,,,,,,
495567,openalex::W1122477516,EconStor Preprints,openalex,10.1007/s10273-012-1319-x,https://doi.org/10.1007/s10273-012-1319-x,http://hdl.handle.net/10419/68395,Zähe Liberalisierung: Blue Card,Max Friedrich Steinhardt,2012.0,2016-06-24T00:00:00,,,,,,,,,,,,parent,http://hdl.handle.net/10419/68395,10.1007/s10273-012-1319-x,,,,,,
495592,openalex::W1150235247,EconStor Preprints,openalex,10.1007/s10273-014-1758-7,https://doi.org/10.1007/s10273-014-1758-7,http://hdl.handle.net/10419/106724,Deutsche Parallelveranstaltung? EU-Emissionsha...,Andreas Löschel,2014.0,2016-06-24T00:00:00,,,,,,,,,,,,parent,http://hdl.handle.net/10419/106724,10.1007/s10273-014-1758-7,,,,,,
495606,openalex::W1166988194,EconStor Preprints,openalex,10.1007/s10272-006-0174-7,https://doi.org/10.1007/s10272-006-0174-7,http://hdl.handle.net/10419/41893,Negotiating future climate policy: The role of...,Preety M. Bhandari,2006.0,2016-06-24T00:00:00,,,,,,,,,,,,parent,http://hdl.handle.net/10419/41893,10.1007/s10272-006-0174-7,,,,,,
488791,openalex::W1198321170,EconStor Preprints,openalex,10.1007/s10273-008-0758-x,https://doi.org/10.1007/s10273-008-0758-x,http://hdl.handle.net/10419/42977,Weiterentwicklung des SGB II: Flexible Lösunge...,Susanne Koch; Ulrich Walwei,2008.0,2016-06-24T00:00:00,,,,,,,,,,,,parent,http://hdl.handle.net/10419/42977,10.1007/s10273-008-0758-x,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
527697,openalex::W822245089,EconStor Preprints,openalex,10.13140/rg.2.2.31319.42409,https://doi.org/10.13140/rg.2.2.31319.42409,http://hdl.handle.net/10419/95827,Zur Industriepolitk in Europa,Georg Erber; Harald Hagemann; Stephan Seiter,1996.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,http://hdl.handle.net/10419/95827,10.13140/rg.2.2.31319.42409,,,,,,
457459,openalex::W854544018,EconStor Preprints,openalex,10.1007/s10272-005-0163-2,https://doi.org/10.1007/s10272-005-0163-2,http://hdl.handle.net/10419/41835,Some tough love on 'aid for trade',Simon J. Evenett,2005.0,2016-06-24T00:00:00,,,,,,,,,,,,parent,http://hdl.handle.net/10419/41835,10.1007/s10272-005-0163-2,,,,,,
488736,openalex::W872556750,EconStor Preprints,openalex,10.1007/s10273-008-0758-x,https://doi.org/10.1007/s10273-008-0758-x,http://hdl.handle.net/10419/42941,Das Hartz-IV-Experiment,Hugh Mosley,2008.0,2016-06-24T00:00:00,,,,,,,,,,,,parent,http://hdl.handle.net/10419/42941,10.1007/s10273-008-0758-x,,,,,,
527752,openalex::W930520461,EconStor Preprints,openalex,10.1007/s10273-008-0872-9,https://doi.org/10.1007/s10273-008-0872-9,http://hdl.handle.net/10419/43019,Schutz der geistigen Eigentumsrechte: Ein Pläd...,Cornelia Tausch,2008.0,2016-06-24T00:00:00,,,,,,,,,,,,parent,http://hdl.handle.net/10419/43019,10.1007/s10273-008-0872-9,,,,,,


In [120]:
df_mirror[df_mirror['doi']=='10.1007/s10273-008-0850-2']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
495549,openalex::W107243416,EconStor Preprints,openalex,10.1007/s10273-008-0850-2,https://doi.org/10.1007/s10273-008-0850-2,http://hdl.handle.net/10419/42949,Am Vorabend des Gesundheitsfonds: Kaum ein Pro...,Klaus Jacobs,2008.0,2016-06-24T00:00:00,,,,,,,,,,,,parent,http://hdl.handle.net/10419/42949,10.1007/s10273-008-0850-2,,,,,,
466923,openalex::W218899851,EconStor Preprints,openalex,10.1007/s10273-008-0850-2,https://doi.org/10.1007/s10273-008-0850-2,http://hdl.handle.net/10419/42931,Reform der ambulanten ärztlichen Vergütung: Me...,Jürgen Wasem; Anke Walendzik,2008.0,2016-06-24T00:00:00,,,,,,,,,,,,parent,http://hdl.handle.net/10419/42931,10.1007/s10273-008-0850-2,,,,,,
504836,openalex::W2230538860,EconStor Preprints,openalex,10.1007/s10273-008-0850-2,https://doi.org/10.1007/s10273-008-0850-2,http://hdl.handle.net/10419/42921,Widerspruchsvolle und ambivalente Strukturwand...,Frank Schulz‐Nieswandt,2008.0,2016-06-24T00:00:00,,,,,,,,,,,,parent,http://hdl.handle.net/10419/42921,10.1007/s10273-008-0850-2,,,,,,


In [121]:
# pattern = "osf.io"


# mask = ~dupes[dupes['server_name']=='Open Science Framework']['doi'].str.contains(pattern, regex=False, na=False)
# result = dupes[dupes['server_name']=='Open Science Framework'][mask]
# result

In [122]:
df_mirror[df_mirror['doi']=='10.1101/2019.12.23.887166']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
9073833,crossref::10.1101/2019.12.23.887166,bioRxiv,crossref,10.1101/2019.12.23.887166,https://doi.org/10.1101/2019.12.23.887166,http://biorxiv.org/lookup/doi/10.1101/2019.12....,Model balancing: in search of consistent metab...,"Liebermeister, Wolfram; Noor, Elad",2019.0,2019-12-24,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,10.3390/metabo11110749,,,True,,,,,parent,http://biorxiv.org/lookup/doi/10.1101/2019.12....,10.1101/2019.12.23.887166,,,,,,
1555009,openalex::W2995005865,HAL,openalex,10.1101/2019.12.23.887166,https://doi.org/10.1101/2019.12.23.887166,https://hal.science/hal-02437604,Model balancing: consistent in-vivo kinetic co...,Wolfram Liebermeister,2019.0,2025-10-10T00:00:00,,,,,,,,,,,,mirror (bioRxiv),https://hal.science/hal-02437604,10.1101/2019.12.23.887166,,,,,,


In [123]:
pattern = "mirror"


mask = df_mirror[df_mirror['server_name']=='F1000Research']['records_hierarchy'].str.contains(pattern, regex=False, na=False)
result = df_mirror[df_mirror['server_name']=='F1000Research'][mask]
result

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi


In [124]:
pattern = "mirror"


mask = df_mirror['records_hierarchy'].str.contains(pattern, regex=False, na=False)
result = df_mirror[mask]
result

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
9464662,openalex::W1003383974,viXra,openalex,10.13140/rg.2.2.14101.88805,https://doi.org/10.13140/rg.2.2.14101.88805,https://vixra.org/pdf/1501.0014v1.pdf,How are the visual percepts constructed?,Minna Pitkänen,2014.0,2025-10-10T00:00:00,,,,,,,,,,,,mirror (ResearchGate),https://vixra.org/pdf/1501.0014v1.pdf,10.13140/rg.2.2.14101.88805,,,,,,
9461657,openalex::W10037114,viXra,openalex,10.13140/rg.2.1.4248.2403,https://doi.org/10.13140/rg.2.1.4248.2403,https://vixra.org/pdf/1407.0122v1.pdf,A Reformulation of Classical Mechanics,Alejandro A. Torassa,2015.0,2025-10-10T00:00:00,,,,,,,,,,,,mirror (ResearchGate),https://vixra.org/pdf/1407.0122v1.pdf,10.13140/rg.2.1.4248.2403,,,,,,
2713165,openalex::W100680786,RePEc: Research Papers in Economics,openalex,10.22004/ag.econ.50567,https://doi.org/10.22004/ag.econ.50567,https://ageconsearch.umn.edu/record/50567,The Impact of Oil Prices on the U.S. and Mexic...,Owen Wagner,2009.0,2025-10-10T00:00:00,,,,,,,,,,,,mirror (AgEcon Search),https://ageconsearch.umn.edu/record/50567,10.22004/ag.econ.50567,,,,,,
9464726,openalex::W101331951,viXra,openalex,10.13140/rg.2.2.17278.64326,https://doi.org/10.13140/rg.2.2.17278.64326,https://vixra.org/pdf/1306.0071v1.pdf,The Optical Breakdown Density Limit Gamma givi...,Peter A Jackson,2013.0,2025-10-10T00:00:00,,,,,,,,,,,,mirror (ResearchGate),https://vixra.org/pdf/1306.0071v1.pdf,10.13140/rg.2.2.17278.64326,,,,,,
2713230,openalex::W101709879,RePEc: Research Papers in Economics,openalex,10.22004/ag.econ.131335,https://doi.org/10.22004/ag.econ.131335,https://ageconsearch.umn.edu/record/131335/fil...,Explaining Farmland Price Dynamics,Madhab R. Khoju; Bruce L. Ahrendsen,1993.0,2025-10-10T00:00:00,,,,,,,,,,,,mirror (AgEcon Search),https://ageconsearch.umn.edu/record/131335/fil...,10.22004/ag.econ.131335,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2810559,openalex::W98271585,RePEc: Research Papers in Economics,openalex,10.22004/ag.econ.149739,https://doi.org/10.22004/ag.econ.149739,https://ageconsearch.umn.edu/record/149739/,Impacts of Climate Change on Corn and Soybean ...,Shuai Chen; Xiaoguang Chen; Jintao Xu,2013.0,2025-10-10T00:00:00,,,,,,,,,,,,mirror (AgEcon Search),https://ageconsearch.umn.edu/record/149739/,10.22004/ag.econ.149739,,,,,,
9486788,openalex::W983039221,viXra,openalex,10.13140/rg.2.2.29266.79041,https://doi.org/10.13140/rg.2.2.29266.79041,http://www.vixra.org/pdf/1503.0004v1.pdf,Ionization chamber noise fluctuations during l...,A. D. Skorbun; M. V. Efetov; G. I. Odinokin; O...,2020.0,2025-10-10T00:00:00,,,,,,,,,,,,mirror (ResearchGate),http://www.vixra.org/pdf/1503.0004v1.pdf,10.13140/rg.2.2.29266.79041,,,,,,
2806374,openalex::W985302891,RePEc: Research Papers in Economics,openalex,10.22004/ag.econ.133088,https://doi.org/10.22004/ag.econ.133088,https://ageconsearch.umn.edu/record/133088/fil...,Farmland price bubbles: wavelet-based evidence,Gabriel J. Power; Calum G. Turvey,2006.0,2025-10-10T00:00:00,,,,,,,,,,,,mirror (AgEcon Search),https://ageconsearch.umn.edu/record/133088/fil...,10.22004/ag.econ.133088,,,,,,
2817765,openalex::W986721525,RePEc: Research Papers in Economics,openalex,10.22004/ag.econ.196816,https://doi.org/10.22004/ag.econ.196816,https://ageconsearch.umn.edu/record/196816/fil...,Assessing the Value of Broadband Connectivity ...,Tyler Mark; Brian E. Whitacre; Terry Griffin,2015.0,2025-10-10T00:00:00,,,,,,,,,,,,mirror (AgEcon Search),https://ageconsearch.umn.edu/record/196816/fil...,10.22004/ag.econ.196816,,,,,,


In [125]:
result['server_name'].value_counts()

server_name
RePEc: Research Papers in Economics                                     9994
HAL                                                                     3923
DSpace@MIT                                                              1082
viXra                                                                    826
Munich Personal RePEc Archive                                            423
EconStor Preprints                                                       197
PhilSci-Archive                                                          161
IACR Cryptology ePrint Archive                                            91
Digital Access to Scholarship at Harvard (DASH) (Harvard University)      67
E-LIS Repository                                                          28
Social Science Open Access Repository                                     19
Organic Eprints                                                           14
CogPrints                                                       

In [126]:
dupes_mirror_df = df_mirror[df_mirror['doi'].notna()]
dupes_mirror_df

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
429739,crossref::10.1002/essoar.10500000.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500000.1,https://doi.org/10.1002/essoar.10500000.1,https://essopenarchive.org/doi/full/10.1002/es...,Long-term monitoring of land surface phenologi...,"Tsutsumida, Narumasa",2018.0,2019-11-13,,,,,,,false,,,,,parent,https://essopenarchive.org/doi/full/10.1002/es...,10.1002/essoar.10500000.1,,,,,,
429742,crossref::10.1002/essoar.10500002.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500002.1,https://doi.org/10.1002/essoar.10500002.1,https://essopenarchive.org/doi/full/10.1002/es...,Impact of spatial scale for phenological indic...,"Tsutsumida, Narumasa; Kaduk, Jörg",2018.0,2019-11-13,,,,,,,false,,,,,parent,https://essopenarchive.org/doi/full/10.1002/es...,10.1002/essoar.10500002.1,,,,,,
429740,crossref::10.1002/essoar.10500004.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500004.1,https://doi.org/10.1002/essoar.10500004.1,https://essopenarchive.org/doi/full/10.1002/es...,Observations of Low Latitude Red Aurora in Mex...,"Gonzalez-Esparza, J. Americo; Cuevas-Cardona, ...",2018.0,2019-11-13,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,10.1029/2017sw001789;10.1029/2018sw001995,,,true,,,,,parent,https://essopenarchive.org/doi/full/10.1002/es...,10.1002/essoar.10500004.1,,,,,,
429741,crossref::10.1002/essoar.10500007.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500007.1,https://doi.org/10.1002/essoar.10500007.1,https://essopenarchive.org/doi/full/10.1002/es...,Pipeline oil fire detection with MODIS active ...,"Ogungbuyi, Michael Gbenga; Martinez, Peter; Ec...",2018.0,2019-11-13,,,,,,,false,,,,,parent,https://essopenarchive.org/doi/full/10.1002/es...,10.1002/essoar.10500007.1,,,,,,
429746,crossref::10.1002/essoar.10500009.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500009.1,https://doi.org/10.1002/essoar.10500009.1,https://essopenarchive.org/doi/full/10.1002/es...,Land Product Validation of MODIS Derived FPAR ...,"Sharp, Iain; Sanchez-Azofeifa, Arturo; Musilek...",2018.0,2019-12-03,,,,,,,false,,,,,parent,https://essopenarchive.org/doi/full/10.1002/es...,10.1002/essoar.10500009.1,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5667066,openalex::W999063011,Social Science Open Access Repository,openalex,10.15464/isi.42.2009.1-5,https://doi.org/10.15464/isi.42.2009.1-5,http://www.ssoar.info/ssoar/handle/document/21392,Jeder fünfte Erwerbstätige ist aus beruflichen...,Silvia Ruppenthal; Detlev Lück,2009.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,http://www.ssoar.info/ssoar/handle/document/21392,10.15464/isi.42.2009.1-5,,,,,,
2245001,openalex::W99945665,Organic Eprints,openalex,10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,Earthworm abundance and species richness: Cont...,Anna-Sophie Köhler; Julia Huber; Sebastian Wol...,2014.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://doi.org/10.3220/rep_20_1_2014,10.3220/rep_20_1_2014,,,,,,
5667173,openalex::W999508398,Social Science Open Access Repository,openalex,10.15464/isi.38.2007.1-5,https://doi.org/10.15464/isi.38.2007.1-5,http://www.ssoar.info/ssoar/handle/document/21635,Berufstätigkeit von Müttern bleibt kontrovers:...,Angelika Scheuer; Jörg Dittmann,2007.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,http://www.ssoar.info/ssoar/handle/document/21635,10.15464/isi.38.2007.1-5,,,,,,
5673122,openalex::W999509321,Social Science Open Access Repository,openalex,10.23668/psycharchives.13440,https://doi.org/10.23668/psycharchives.13440,http://www.ssoar.info/ssoar/handle/document/3368,"Rezension: Edward Timms (Hg.), Freud und das K...",Wolfgang Hegener,1997.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,http://www.ssoar.info/ssoar/handle/document/3368,10.23668/psycharchives.13440,,,,,,


In [127]:
dupes_mirror = dupes_mirror_df[dupes_mirror_df.duplicated(subset=['doi'], keep=False)]
dupes_mirror

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
8976641,crossref::10.1101/103937,bioRxiv,crossref,10.1101/103937,https://doi.org/10.1101/103937,http://biorxiv.org/lookup/doi/10.1101/103937,LET-99-dependent spatial restriction of active...,"Bouvrais, H.; Chesneau, L.; Pastezeur, S.; Del...",2017.0,2017-01-29,,,,,,,false,,,,,parent,http://biorxiv.org/lookup/doi/10.1101/103937,10.1101/103937,,,,,,
9073833,crossref::10.1101/2019.12.23.887166,bioRxiv,crossref,10.1101/2019.12.23.887166,https://doi.org/10.1101/2019.12.23.887166,http://biorxiv.org/lookup/doi/10.1101/2019.12....,Model balancing: in search of consistent metab...,"Liebermeister, Wolfram; Noor, Elad",2019.0,2019-12-24,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,10.3390/metabo11110749,,,true,,,,,parent,http://biorxiv.org/lookup/doi/10.1101/2019.12....,10.1101/2019.12.23.887166,,,,,,
8986959,crossref::10.1101/2020.01.22.915215,bioRxiv,crossref,10.1101/2020.01.22.915215,https://doi.org/10.1101/2020.01.22.915215,http://biorxiv.org/lookup/doi/10.1101/2020.01....,Not only compulsivity: The SAPAP3-KO mouse rec...,"Lamothe, H; Schreiweis, C; Lavielle, O; Mallet...",2020.0,2020-01-23,,,,,,,false,,,,,parent,http://biorxiv.org/lookup/doi/10.1101/2020.01....,10.1101/2020.01.22.915215,,,,,,
8981770,crossref::10.1101/2020.02.03.919597,bioRxiv,crossref,10.1101/2020.02.03.919597,https://doi.org/10.1101/2020.02.03.919597,http://biorxiv.org/lookup/doi/10.1101/2020.02....,Germline inherited small RNAs clear untranslat...,"Quarato, Piergiuseppe; Singh, Meetali; Cornes,...",2020.0,2020-02-04,,,,,,,false,,,,,parent,http://biorxiv.org/lookup/doi/10.1101/2020.02....,10.1101/2020.02.03.919597,,,,,,
8891397,crossref::10.1101/2020.06.08.118984,bioRxiv,crossref,10.1101/2020.06.08.118984,https://doi.org/10.1101/2020.06.08.118984,http://biorxiv.org/lookup/doi/10.1101/2020.06....,TBPL2/TFIIA complex establishes the maternal t...,"Yu, Changwei; Cvetesic, Nevena; Hisler, Vincen...",2020.0,2020-06-09,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,10.1038/s41467-020-20239-4,,,true,,,,,parent,http://biorxiv.org/lookup/doi/10.1101/2020.06....,10.1101/2020.06.08.118984,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9486788,openalex::W983039221,viXra,openalex,10.13140/rg.2.2.29266.79041,https://doi.org/10.13140/rg.2.2.29266.79041,http://www.vixra.org/pdf/1503.0004v1.pdf,Ionization chamber noise fluctuations during l...,A. D. Skorbun; M. V. Efetov; G. I. Odinokin; O...,2020.0,2025-10-10T00:00:00,,,,,,,,,,,,mirror (ResearchGate),http://www.vixra.org/pdf/1503.0004v1.pdf,10.13140/rg.2.2.29266.79041,,,,,,
2806374,openalex::W985302891,RePEc: Research Papers in Economics,openalex,10.22004/ag.econ.133088,https://doi.org/10.22004/ag.econ.133088,https://ageconsearch.umn.edu/record/133088/fil...,Farmland price bubbles: wavelet-based evidence,Gabriel J. Power; Calum G. Turvey,2006.0,2025-10-10T00:00:00,,,,,,,,,,,,mirror (AgEcon Search),https://ageconsearch.umn.edu/record/133088/fil...,10.22004/ag.econ.133088,,,,,,
2817765,openalex::W986721525,RePEc: Research Papers in Economics,openalex,10.22004/ag.econ.196816,https://doi.org/10.22004/ag.econ.196816,https://ageconsearch.umn.edu/record/196816/fil...,Assessing the Value of Broadband Connectivity ...,Tyler Mark; Brian E. Whitacre; Terry Griffin,2015.0,2025-10-10T00:00:00,,,,,,,,,,,,mirror (AgEcon Search),https://ageconsearch.umn.edu/record/196816/fil...,10.22004/ag.econ.196816,,,,,,
9486875,openalex::W998010466,viXra,openalex,10.13140/rg.2.2.36331.69924,https://doi.org/10.13140/rg.2.2.36331.69924,https://vixra.org/pdf/1409.0185v1.pdf,"TGD view about homeopathy, water memory, and e...",Matti Pitkänen,2014.0,2025-10-10T00:00:00,,,,,,,,,,,,mirror (ResearchGate),https://vixra.org/pdf/1409.0185v1.pdf,10.13140/rg.2.2.36331.69924,,,,,,


In [128]:
dupes_mirror['server_name'].value_counts()

server_name
RePEc: Research Papers in Economics                                     10494
arXiv                                                                    8236
AgEcon Search                                                            6540
HAL                                                                      4041
ResearchGate                                                             1633
DSpace@MIT                                                               1082
viXra                                                                     826
EconStor Preprints                                                        559
Munich Personal RePEc Archive                                             443
Zenodo                                                                    297
PhilSci-Archive                                                           161
IACR Cryptology ePrint Archive                                             91
Digital Access to Scholarship at Harvard (DASH) (Har

In [129]:
dupes_mirror['records_hierarchy'].value_counts()

records_hierarchy
parent                              17869
mirror (arXiv)                       8240
mirror (AgEcon Search)               6541
mirror (ResearchGate)                1633
mirror (Zenodo)                       297
mirror (Open Science Framework)        31
mirror (SSRN)                          30
mirror (bioRxiv)                       29
mirror (Humanities Commons CORE)       24
version                                 3
mirror (eLife)                          3
part_of                                 2
mirror (CERN document server)           2
mirror (Research Square)                2
mirror (AgriRxiv)                       1
mirror (EarthArXiv)                     1
mirror (SocArXiv)                       1
mirror (Law Archive)                    1
mirror (Thesis Commons)                 1
mirror (Oroboros Instruments)           1
mirror (TechRxiv)                       1
mirror (medRxiv)                        1
Name: count, dtype: int64

In [130]:
df_mirror['records_hierarchy'].value_counts()

records_hierarchy
parent                              7950306
review                               175443
part_of                              149454
version                              105977
publish_version                        9351
mirror (arXiv)                         8240
mirror (AgEcon Search)                 6541
child                                  2028
mirror (ResearchGate)                  1633
correction                              355
comment                                 325
mirror (Zenodo)                         297
mirror (Open Science Framework)          31
mirror (SSRN)                            30
mirror (bioRxiv)                         29
mirror (Humanities Commons CORE)         24
others                                   12
parent_duplicate                          3
mirror (eLife)                            3
mirror (CERN document server)             2
mirror (Research Square)                  2
mirror (AgriRxiv)                         1
mirror (EarthA

In [131]:
data_clean['server_name'].value_counts().head(60)

server_name
arXiv                                                                   2920797
SSRN                                                                    1258958
HAL                                                                     1056424
Research Square                                                          450818
RePEc: Research Papers in Economics                                      389398
bioRxiv                                                                  306948
eLife                                                                    247558
AgEcon Search                                                            188173
ResearchGate                                                             181231
Qeios                                                                    172316
Zenodo                                                                   166786
Open Science Framework                                                   119481
Preprints.org               

In [132]:
df[df['records_hierarchy']=='parent']['server_name'].value_counts().head(60)

server_name
arXiv                                                                   2920797
SSRN                                                                    1258958
HAL                                                                     1056329
Research Square                                                          401982
RePEc: Research Papers in Economics                                      389112
bioRxiv                                                                  306948
AgEcon Search                                                            188173
ResearchGate                                                             181231
Zenodo                                                                   166784
Open Science Framework                                                   117187
Qeios                                                                    111902
Preprints.org                                                            102231
medRxiv                     

In [133]:
df_mirror[df_mirror['records_hierarchy']=='parent']['server_name'].value_counts().head(60)

server_name
arXiv                                                                   2920797
SSRN                                                                    1258958
HAL                                                                     1052407
Research Square                                                          401982
RePEc: Research Papers in Economics                                      379124
bioRxiv                                                                  306948
AgEcon Search                                                            188173
ResearchGate                                                             181231
Zenodo                                                                   166784
Open Science Framework                                                   117187
Qeios                                                                    111902
Preprints.org                                                            102231
medRxiv                     

### landing_page_url

In [134]:
df_landing_page_url = df_mirror[df_mirror['landing_page_url'].notna()]
df_landing_page_url = df_landing_page_url[df_landing_page_url['records_hierarchy']=='parent']

In [135]:
dupes_landing_page_url = df_landing_page_url[df_landing_page_url.duplicated(subset=['landing_page_url'], keep=False)]
dupes_landing_page_url

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
572299,crossref::10.12688/f1000research.1-10.v1,F1000Research,crossref,10.12688/f1000research.1-10.v1,https://doi.org/10.12688/f1000research.1-10.v1,http://f1000research.com/articles/1-10/v1,Murine Tim-1 is excluded from the immunologica...,"Lin, Jean; Chen, Leo; Kane, Lawrence P",2012.0,2013-05-09,,,,,,,false,,,,,parent,http://f1000research.com/articles/1-10/v1,10.12688/f1000research.1-10.v1,/v1,explicit_version,1.0,,,
572275,crossref::10.12688/f1000research.1-12.v1,F1000Research,crossref,10.12688/f1000research.1-12.v1,https://doi.org/10.12688/f1000research.1-12.v1,http://f1000research.com/articles/1-12/v1,Diversion at the ER: How Plasmodium falciparum...,"Römisch, Karin",2012.0,2013-05-09,,,,,,,false,,,,,parent,http://f1000research.com/articles/1-12/v1,10.12688/f1000research.1-12.v1,/v1,explicit_version,1.0,,,
572276,crossref::10.12688/f1000research.1-2.v1,F1000Research,crossref,10.12688/f1000research.1-2.v1,https://doi.org/10.12688/f1000research.1-2.v1,http://f1000research.com/articles/1-2/v1,Considerations for clinical read alignment and...,"Oliver, Gavin R",2012.0,2013-05-09,,,,,,,false,,,,,parent,http://f1000research.com/articles/1-2/v1,10.12688/f1000research.1-2.v1,/v1,explicit_version,1.0,,,
572363,crossref::10.12688/f1000research.1-23.v1,F1000Research,crossref,10.12688/f1000research.1-23.v1,https://doi.org/10.12688/f1000research.1-23.v1,http://f1000research.com/articles/1-23/v1,Female circumcision: Limiting the harm,"Kandil, Mohamed",2012.0,2013-05-09,,,,,,,false,,,,,parent,http://f1000research.com/articles/1-23/v1,10.12688/f1000research.1-23.v1,/v1,explicit_version,1.0,,,
572496,crossref::10.12688/f1000research.1-36.v1,F1000Research,crossref,10.12688/f1000research.1-36.v1,https://doi.org/10.12688/f1000research.1-36.v1,http://f1000research.com/articles/1-36/v1,Termination of mid-trimester pregnancies: miso...,"Shabana, Ayman; Salah, Hesham; Kandil, Mohamed...",2012.0,2013-05-09,,,,,,,false,,,,,parent,http://f1000research.com/articles/1-36/v1,10.12688/f1000research.1-36.v1,/v1,explicit_version,1.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
922077,openalex::W975457731,HAL,openalex,,,https://hal.inrae.fr/hal-02774168,Ivorian and Malaysian cocoa supply : a compara...,Françoise Jarrige,1993.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://hal.inrae.fr/hal-02774168,<na>,,,,,,
3148996,openalex::W985669677,RePEc: Research Papers in Economics,openalex,,,https://foresight.forecasters.org/shop/,Nate SilverÕs The Signal and the Noise: Why So...,David Orrell,2013.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://foresight.forecasters.org/shop/,<na>,,,,,,
2840713,openalex::W988701864,RePEc: Research Papers in Economics,openalex,,,https://www.econstor.eu/bitstream/10419/105793...,Mehr Vertrauen in Marktprozesse. Jahresgutacht...,Sachverständigenrat zur Begutachtung der Gesam...,2014.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://www.econstor.eu/bitstream/10419/105793...,<na>,,,,,,
3149018,openalex::W991373209,RePEc: Research Papers in Economics,openalex,,,https://foresight.forecasters.org/shop/,The Bet between Paul Ehrlich and Julian Simon ...,Ira Sohn,2014.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://foresight.forecasters.org/shop/,<na>,,,,,,


In [136]:
dupes_landing_page_url['server_name'].value_counts()

server_name
HAL                                                                     3602
RePEc: Research Papers in Economics                                     1589
Qeios                                                                    191
ResearchHub                                                              171
Munich Personal RePEc Archive                                            169
AgEcon Search                                                            122
PsyArXiv                                                                 119
Earth and Space Science Open Archive                                     116
ScienceOpen Preprints                                                    111
TechRxiv                                                                  65
F1000Research                                                             62
arXiv                                                                     46
DSpace@MIT                                                      

In [137]:
dupes_landing_page_url[dupes_landing_page_url['server_name']=='eLife']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi


In [138]:
dupes_landing_page_url[dupes_landing_page_url['server_name']=='HAL'].sort_values(by='landing_page_url', ascending=False)


Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
1898157,openalex::W4391556995,HAL,openalex,,,https://uphf.hal.science/hal-04427814,Recherche &amp; Conception Centrées sur l’Huma...,Bako Rajaonah,2024.0,2024-02-06T00:00:00,,,,,,,,,,,,parent,https://uphf.hal.science/hal-04427814,<na>,,,,,,
1899437,openalex::W4402466855,HAL,openalex,,,https://uphf.hal.science/hal-04427814,Recherche et Conception Centrées sur l’Humain ...,Bako Rajaonah,2024.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://uphf.hal.science/hal-04427814,<na>,,,,,,
1898156,openalex::W4391556994,HAL,openalex,,,https://uphf.hal.science/hal-04427807,Human-Centred Research &amp; Design for Inclus...,Bako Rajaonah,2024.0,2024-02-06T00:00:00,,,,,,,,,,,,parent,https://uphf.hal.science/hal-04427807,<na>,,,,,,
1899435,openalex::W4402466852,HAL,openalex,,,https://uphf.hal.science/hal-04427807,Human-Centred Research and Design for Inclusiv...,Bako Rajaonah,2024.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://uphf.hal.science/hal-04427807,<na>,,,,,,
1497927,openalex::W4300758384,HAL,openalex,,,https://uphf.hal.science/hal-03402279,L’aide à la décision comme cadre de gouvernanc...,Igor Crévits; Laurence Bonnafous; Saïd Hanafi,2015.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://uphf.hal.science/hal-03402279,<na>,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612448,openalex::W2267354779,HAL,openalex,,,http://www.utc.fr/,HNLS : une approche constructiviste de connais...,Nasreddine Bouhaï; Fabien Morvan,2004.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,http://www.utc.fr/,<na>,,,,,,
1559497,openalex::W2766068608,HAL,openalex,,,http://hal.univ-reunion.fr/hal-01620054,A multi-physics optimization problem in natura...,Delphine Ramalingom; Pierre-Henri Cocquet; Rez...,2017.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,http://hal.univ-reunion.fr/hal-01620054,<na>,,,,,,
1724449,openalex::W4394975358,HAL,openalex,,,http://hal.univ-reunion.fr/hal-01620054,A multi-objective optimization problem in natu...,Delphine Ramalingom; Pierre-Henri Cocquet; Rez...,2018.0,2024-04-21T00:00:00,,,,,,,,,,,,parent,http://hal.univ-reunion.fr/hal-01620054,<na>,,,,,,
830106,openalex::W4298862399,HAL,openalex,,,http://atief.org,Un site web pour l'enseignement interdisciplin...,Sandrine Charles; Michel Ney; Dominique Mouchi...,2003.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,http://atief.org,<na>,,,,,,


In [139]:
dupes_landing_page_url[dupes_landing_page_url['server_name']=='RePEc: Research Papers in Economics'].sort_values(by='landing_page_url', ascending=False)

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
2678434,openalex::W5759233,RePEc: Research Papers in Economics,openalex,,,https://www.rimisp.org/wp-content/files_mf/137...,Caracterización de los actores de Chiloé Central,Eduardo Ramı́rez; Félix Modrego; Julie Claire ...,2009.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://www.rimisp.org/wp-content/files_mf/137...,<na>,,,,,,
2765475,openalex::W2396705024,RePEc: Research Papers in Economics,openalex,,,https://www.rimisp.org/wp-content/files_mf/137...,Caracterización de los actores de Chiloé Central,C Revaz; Aude Favier du Noyer,2009.0,2016-06-24T00:00:00,,,,,,,,,,,,parent,https://www.rimisp.org/wp-content/files_mf/137...,<na>,,,,,,
2766088,openalex::W25120768,RePEc: Research Papers in Economics,openalex,,,https://www.rba.gov.au/publications/confs/2006...,Wrap-up Discussion,Gary Burtless; James K. Glassman; Adair Turner,2006.0,2016-06-24T00:00:00,,,,,,,,,,,,parent,https://www.rba.gov.au/publications/confs/2006...,<na>,,,,,,
3150826,openalex::W1440470170,RePEc: Research Papers in Economics,openalex,,,https://www.rba.gov.au/publications/confs/2006...,Overexpression of MAGE-D4 in colorectal cancer...,Qingmei Zhang; Shu-Jia He; Ning Shen; Bin Luo;...,2014.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://www.rba.gov.au/publications/confs/2006...,<na>,,,,,,
2668133,openalex::W2418493038,RePEc: Research Papers in Economics,openalex,,,https://www.nber.org/chapters/c9003.pdf,"Introduction to ""Concentrated Corporate Owners...",Felix Sahm,2000.0,2016-06-24T00:00:00,,,,,,,,,,,,parent,https://www.nber.org/chapters/c9003.pdf,<na>,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3195781,openalex::W96404234,RePEc: Research Papers in Economics,openalex,,,http://asers.eu/journals/jemt/jemt-issues.html,DEMAND OF REGIONAL TOURISTS VISITING LAO PEOPL...,Sakkarin Nonthapot; Thanet Wattanakul,2016.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,http://asers.eu/journals/jemt/jemt-issues.html,<na>,,,,,,
3152394,openalex::W1533821926,RePEc: Research Papers in Economics,openalex,,,http://asers.eu/journals/jemt/jemt-issues.html,"NATURE BASED TOURISM, SEASONAL VARIATION AND I...",Utpal Kumar De; Amrita Devi,2016.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,http://asers.eu/journals/jemt/jemt-issues.html,<na>,,,,,,
3169286,openalex::W1538117148,RePEc: Research Papers in Economics,openalex,,,http://asers.eu/journals/jemt/jemt-issues.html,SEEKING SUSTAINABILITY IN AN AGE OF COMPLEXITY...,Laura Ungureanu,2010.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,http://asers.eu/journals/jemt/jemt-issues.html,<na>,,,,,,
3157650,openalex::W113954667,RePEc: Research Papers in Economics,openalex,,,http://asers.eu/journals/jasf/jasf-issues.html,EFFICIENCY ANALYSIS OF TURKISH BANKING SYSTEM,Ayşe Altıok Yilmaz,2013.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,http://asers.eu/journals/jasf/jasf-issues.html,<na>,,,,,,


In [140]:
dupes_landing_page_url[dupes_landing_page_url['server_name']=='arXiv'].sort_values(by='landing_page_url', ascending=False)

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
6438978,datacite::10.48550/arxiv.1911.02734,arXiv,datacite,10.48550/arxiv.1911.02734,https://doi.org/10.48550/arxiv.1911.02734,https://arxiv.org/abs/1911.02734,Dipolar condensed atomic mixtures and miscibil...,"Tomio, Lauro; Kumar, Ramavarmaraja Kishor; Gam...",2019.0,2022-02-27,[],2,,,,,False,,,,"{""client"": {""data"": {""id"": ""arxiv.content"", ""t...",parent,https://arxiv.org/abs/1911.02734,10.48550/arxiv.1911.02734,,,,,,
6064798,datacite::10.48550/arxiv.1911.01396,arXiv,datacite,10.48550/arxiv.1911.01396,https://doi.org/10.48550/arxiv.1911.01396,https://arxiv.org/abs/1911.01396,D meson sensitivity to a system size scan at LHC,"Katz, Roland; Noronha-Hostler, Jacquelyn; Prad...",2019.0,2022-02-26,"[{""relatedIdentifier"": ""10.1007/978-3-030-5344...",1,,10.1007/978-3-030-53448-6_12,,,True,,,,"{""client"": {""data"": {""id"": ""arxiv.content"", ""t...",parent,https://arxiv.org/abs/1911.01396,10.48550/arxiv.1911.01396,,,,,,
6473812,datacite::10.48550/arxiv.1905.11483,arXiv,datacite,10.48550/arxiv.1905.11483,https://doi.org/10.48550/arxiv.1905.11483,https://arxiv.org/abs/1905.11483,Scaling properties of firearm homicides in Bra...,"Deppman, Airton",2019.0,2022-02-28,[],1,,,,,False,,,,"{""client"": {""data"": {""id"": ""arxiv.content"", ""t...",parent,https://arxiv.org/abs/1905.11483,10.48550/arxiv.1905.11483,,,,,,
6472098,datacite::10.48550/arxiv.1903.00148,arXiv,datacite,10.48550/arxiv.1903.00148,https://doi.org/10.48550/arxiv.1903.00148,https://arxiv.org/abs/1903.00148,Comment on Daya Bay's definition and use of De...,"Parke, Stephen J.; Funchal, Renata Zukanovich",2019.0,2022-02-28,[],1,,,,,False,,,,"{""client"": {""data"": {""id"": ""arxiv.content"", ""t...",parent,https://arxiv.org/abs/1903.00148,10.48550/arxiv.1903.00148,,,,,,
6496334,datacite::10.48550/arxiv.1901.02401,arXiv,datacite,10.48550/arxiv.1901.02401,https://doi.org/10.48550/arxiv.1901.02401,https://arxiv.org/abs/1901.02401,The Buzzard Flock: Dark Energy Survey Syntheti...,"DeRose, Joseph; Wechsler, Risa H.; Becker, Mat...",2019.0,2022-03-01,[],1,,,,,False,,,,"{""client"": {""data"": {""id"": ""arxiv.content"", ""t...",parent,https://arxiv.org/abs/1901.02401,10.48550/arxiv.1901.02401,,,,,,
6505988,datacite::10.48550/arxiv.1812.08127,arXiv,datacite,10.48550/arxiv.1812.08127,https://doi.org/10.48550/arxiv.1812.08127,https://arxiv.org/abs/1812.08127,Nuclear Dependence of Transverse Single-Spin A...,"Pate, Stephen",2018.0,2022-03-01,[],1,,,,,False,,,,"{""client"": {""data"": {""id"": ""arxiv.content"", ""t...",parent,https://arxiv.org/abs/1812.08127,10.48550/arxiv.1812.08127,,,,,,
6505699,datacite::10.48550/arxiv.1812.07964,arXiv,datacite,10.48550/arxiv.1812.07964,https://doi.org/10.48550/arxiv.1812.07964,https://arxiv.org/abs/1812.07964,Searches for Higgs bosons with dark matter at ...,"Gallinaro, Michele",2018.0,2022-03-01,[],1,,,,,False,,,,"{""client"": {""data"": {""id"": ""arxiv.content"", ""t...",parent,https://arxiv.org/abs/1812.07964,10.48550/arxiv.1812.07964,,,,,,
6502106,datacite::10.48550/arxiv.1812.05819,arXiv,datacite,10.48550/arxiv.1812.05819,https://doi.org/10.48550/arxiv.1812.05819,https://arxiv.org/abs/1812.05819,Top quark properties,"Van Mulders, Petra",2018.0,2022-03-01,[],1,,,,,False,,,,"{""client"": {""data"": {""id"": ""arxiv.content"", ""t...",parent,https://arxiv.org/abs/1812.05819,10.48550/arxiv.1812.05819,,,,,,
6509970,datacite::10.48550/arxiv.1811.10215,arXiv,datacite,10.48550/arxiv.1811.10215,https://doi.org/10.48550/arxiv.1811.10215,https://arxiv.org/abs/1811.10215,Higgs boson measurements at the LHC,"Unal, Guillaume",2018.0,2022-03-01,[],1,,,,,False,,,,"{""client"": {""data"": {""id"": ""arxiv.content"", ""t...",parent,https://arxiv.org/abs/1811.10215,10.48550/arxiv.1811.10215,,,,,,
6503045,datacite::10.48550/arxiv.1810.05321,arXiv,datacite,10.48550/arxiv.1810.05321,https://doi.org/10.48550/arxiv.1810.05321,https://arxiv.org/abs/1810.05321,PHENIX results on collectivity in small systems,"Morrow, Sylvia",2018.0,2022-03-01,[],1,,,,,False,,,,"{""client"": {""data"": {""id"": ""arxiv.content"", ""t...",parent,https://arxiv.org/abs/1810.05321,10.48550/arxiv.1810.05321,,,,,,


In [141]:
df_mirror[df_mirror['landing_page_url']=='https://arxiv.org/abs/1703.02360']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
6614991,datacite::10.48550/arxiv.1703.02360,arXiv,datacite,10.48550/arxiv.1703.02360,https://doi.org/10.48550/arxiv.1703.02360,https://arxiv.org/abs/1703.02360,Single-top quark cross-section measurements in...,"Hirschbuehl, Dominic",2017.0,2022-03-04,[],1.0,,,,,False,,,,"{""client"": {""data"": {""id"": ""arxiv.content"", ""t...",parent,https://arxiv.org/abs/1703.02360,10.48550/arxiv.1703.02360,,,,,,
2050450,openalex::W4300531380,Munich Personal RePEc Archive,openalex,,,https://arxiv.org/abs/1703.02360,Single-top quark cross-section measurements in...,D. Hirschbuehl,2017.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://arxiv.org/abs/1703.02360,<na>,,,,,,


In [142]:
df_mirror[df_mirror['landing_page_url']=='https://www.nber.org/chapters/c9003.pdf']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
2664750,openalex::W13745406,RePEc: Research Papers in Economics,openalex,,,https://www.nber.org/chapters/c9003.pdf,"Introduction to ""Concentrated Corporate Owners...",Randall Mørck,2000.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://www.nber.org/chapters/c9003.pdf,<na>,,,,,,
2668133,openalex::W2418493038,RePEc: Research Papers in Economics,openalex,,,https://www.nber.org/chapters/c9003.pdf,"Introduction to ""Concentrated Corporate Owners...",Felix Sahm,2000.0,2016-06-24T00:00:00,,,,,,,,,,,,parent,https://www.nber.org/chapters/c9003.pdf,<na>,,,,,,


In [143]:
dupes_landing_page_url[dupes_landing_page_url['server_name']=='Qeios'].sort_values(by='landing_page_url', ascending=False)

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
2571958,crossref::10.32388/045113,Qeios,crossref,10.32388/045113,https://doi.org/10.32388/045113,https://www.qeios.com/withdrawn-work,WITHDRAWN,"Doe, Jane",2020.0,2019-09-06,,,,,,,false,,,,,parent,https://www.qeios.com/withdrawn-work,10.32388/045113,,,,,,
2572398,crossref::10.32388/fopnva,Qeios,crossref,10.32388/fopnva,https://doi.org/10.32388/fopnva,https://www.qeios.com/withdrawn-work,WITHDRAWN,"Doe, Jane",2020.0,2022-07-07,,,,,,,false,,,,,parent,https://www.qeios.com/withdrawn-work,10.32388/fopnva,,,,,,
2572387,crossref::10.32388/d8ebqn.3,Qeios,crossref,10.32388/d8ebqn.3,https://doi.org/10.32388/d8ebqn.3,https://www.qeios.com/withdrawn-work,WITHDRAWN,"Doe, Jane",2020.0,2021-04-29,,,,,,,false,,,,,parent,https://www.qeios.com/withdrawn-work,10.32388/d8ebqn.3,,,,,,
2572459,crossref::10.32388/d8ebqn.4,Qeios,crossref,10.32388/d8ebqn.4,https://doi.org/10.32388/d8ebqn.4,https://www.qeios.com/withdrawn-work,WITHDRAWN,"Doe, Jane",2020.0,2021-05-01,,,,,,,false,,,,,parent,https://www.qeios.com/withdrawn-work,10.32388/d8ebqn.4,,,,,,
2567916,crossref::10.32388/dtdoz7,Qeios,crossref,10.32388/dtdoz7,https://doi.org/10.32388/dtdoz7,https://www.qeios.com/withdrawn-work,WITHDRAWN,"Doe, Jane",2020.0,2022-10-06,,,,,,,false,,,,,parent,https://www.qeios.com/withdrawn-work,10.32388/dtdoz7,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2618389,crossref::10.32388/subst.test.1,Qeios,crossref,10.32388/subst.test.1,https://doi.org/10.32388/subst.test.1,https://www.qeios.com/tmp-test-subst-1,The History of Chocolate: From Ancient Beginni...,"Marinello, Gabriele",2025.0,2025-08-05,,,,,,,false,,,,,parent,https://www.qeios.com/tmp-test-subst-1,10.32388/subst.test.1,,,,,,
2618460,crossref::10.32388/subst.lyons.modern_cinema_s...,Qeios,crossref,10.32388/subst.lyons.modern_cinema_snails.2025,https://doi.org/10.32388/subst.lyons.modern_ci...,https://qeios.com/read/substack-lyons-snails-i...,Snails in Modern Cinema: From Speed Dreams to ...,"Lyons, Mansel",2025.0,2025-08-13,,,,,,,false,,,,,parent,https://qeios.com/read/substack-lyons-snails-i...,10.32388/subst.lyons.modern_cinema_snails.2025,,,,,,
2618575,crossref::10.32388/subst.lyons.modern_cinema.2025,Qeios,crossref,10.32388/subst.lyons.modern_cinema.2025,https://doi.org/10.32388/subst.lyons.modern_ci...,https://qeios.com/read/substack-lyons-snails-i...,Snails in Modern Cinema: From Speed Dreams to ...,"Lyons, Mansel",2025.0,2025-08-29,,,,,,,false,,,,,parent,https://qeios.com/read/substack-lyons-snails-i...,10.32388/subst.lyons.modern_cinema.2025,,,,,,
2618459,crossref::10.32388/subst.lyons.knights_vs_snai...,Qeios,crossref,10.32388/subst.lyons.knights_vs_snails.2025,https://doi.org/10.32388/subst.lyons.knights_v...,https://qeios.com/read/substack-lyons-knights-...,"Knights vs. Snails: A History of a Tiny, Tenac...","Lyons, Mansel",2025.0,2025-08-13,,,,,,,false,,,,,parent,https://qeios.com/read/substack-lyons-knights-...,10.32388/subst.lyons.knights_vs_snails.2025,,,,,,


In [144]:
dupes_landing_page_url[dupes_landing_page_url['server_name']=='ResearchHub'].sort_values(by='landing_page_url', ascending=False)

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
3503777,crossref::10.55277/researchhub.8i2kymwh,ResearchHub,crossref,10.55277/researchhub.8i2kymwh,https://doi.org/10.55277/researchhub.8i2kymwh,https://www.staging.researchhub.com/post/321/r...,02.26 Preregistration template,"Tytarenko, Mykola",2025.0,2025-02-26,,,,,,,false,,,,,parent,https://www.staging.researchhub.com/post/321/r...,10.55277/researchhub.8i2kymwh,,,,,,
3503779,crossref::10.55277/researchhub.7ewvh0rf,ResearchHub,crossref,10.55277/researchhub.7ewvh0rf,https://doi.org/10.55277/researchhub.7ewvh0rf,https://www.staging.researchhub.com/post/321/r...,02.26 Preregistration template v 3,"Tytarenko, Mykola",2025.0,2025-02-26,,,,,,,false,,,,,parent,https://www.staging.researchhub.com/post/321/r...,10.55277/researchhub.7ewvh0rf,,,,,,
3503778,crossref::10.55277/researchhub.2ihlexrg,ResearchHub,crossref,10.55277/researchhub.2ihlexrg,https://doi.org/10.55277/researchhub.2ihlexrg,https://www.staging.researchhub.com/post/321/r...,02.26 Preregistration template,"Tytarenko, Mykola",2025.0,2025-02-26,,,,,,,false,,,,,parent,https://www.staging.researchhub.com/post/321/r...,10.55277/researchhub.2ihlexrg,,,,,,
3503776,crossref::10.55277/researchhub.lrtyw6ah,ResearchHub,crossref,10.55277/researchhub.lrtyw6ah,https://doi.org/10.55277/researchhub.lrtyw6ah,https://www.staging.researchhub.com/post/321/r...,Research Article Title 2,"Tytarenko, Mykola",2025.0,2025-02-26,,,,,,,false,,,,,parent,https://www.staging.researchhub.com/post/321/r...,10.55277/researchhub.lrtyw6ah,,,,,,
3503976,crossref::10.55277/rhj.8ksztm1x,ResearchHub,crossref,10.55277/rhj.8ksztm1x,https://doi.org/10.55277/rhj.8ksztm1x,https://www.staging.researchhub.com/paper/3236...,In the test journal,"K, Taki",2025.0,2025-05-29,,,,,,,false,,,,,parent,https://www.staging.researchhub.com/paper/3236...,10.55277/rhj.8ksztm1x,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3503708,crossref::10.55277/researchhub.s485ohw7.1,ResearchHub,crossref,10.55277/researchhub.s485ohw7.1,https://doi.org/10.55277/researchhub.s485ohw7.1,https://www.researchhub.com/paper/8437957/fini...,Finite Density Black Holes in a Quantum Gravit...,"Brown, Jesse Daniel; Smith, McCade",2025.0,2025-01-09,,,,,,,false,,,,,parent,https://www.researchhub.com/paper/8437957/fini...,10.55277/researchhub.s485ohw7.1,,,,,,
3503644,crossref::10.55277/researchhub.71mugn7l,ResearchHub,crossref,10.55277/researchhub.71mugn7l,https://doi.org/10.55277/researchhub.71mugn7l,https://www.researchhub.com/paper/8420046/alte...,Alternate EoS for Finite Density Black Holes i...,"Brown, Jesse Daniel; Smith, McCade",2024.0,2024-11-21,,,,,,,false,,,,,parent,https://www.researchhub.com/paper/8420046/alte...,10.55277/researchhub.71mugn7l,,,,,,
3503703,crossref::10.55277/researchhub.71mugn7l.1,ResearchHub,crossref,10.55277/researchhub.71mugn7l.1,https://doi.org/10.55277/researchhub.71mugn7l.1,https://www.researchhub.com/paper/8420046/alte...,Alternate EoS for Finite Density Black Holes i...,"Brown, Jesse Daniel; Smith, McCade",2025.0,2025-01-09,,,,,,,false,,,,,parent,https://www.researchhub.com/paper/8420046/alte...,10.55277/researchhub.71mugn7l.1,,,,,,
3503710,crossref::10.55277/researchhub.hmhp8upd.1,ResearchHub,crossref,10.55277/researchhub.hmhp8upd.1,https://doi.org/10.55277/researchhub.hmhp8upd.1,https://www.researchhub.com/paper/8420045/fini...,Finite Density Black Holes in a Quantum Gravit...,"Smith, McCade",2025.0,2025-01-09,,,,,,,false,,,,,parent,https://www.researchhub.com/paper/8420045/fini...,10.55277/researchhub.hmhp8upd.1,,,,,,


In [145]:
dupes_landing_page_url[dupes_landing_page_url['server_name']=='Munich Personal RePEc Archive'].sort_values(by='landing_page_url', ascending=False)

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
2028824,openalex::W2912141608,Munich Personal RePEc Archive,openalex,10.1111/twec.2014.37.issue-12,https://doi.org/10.1111/twec.2014.37.issue-12,https://doi.org/10.1111/twec.2014.37.issue-12,The paradox of “preferences”: regional trade a...,Sébastien Miroudot; Ben Shepherd,2012.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://doi.org/10.1111/twec.2014.37.issue-12,10.1111/twec.2014.37.issue-12,,,,,,
2044972,openalex::W2981721423,Munich Personal RePEc Archive,openalex,10.1111/twec.2014.37.issue-12,https://doi.org/10.1111/twec.2014.37.issue-12,https://doi.org/10.1111/twec.2014.37.issue-12,United Arab Emirates FDI Outlook,Wasseem Mina,2013.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://doi.org/10.1111/twec.2014.37.issue-12,10.1111/twec.2014.37.issue-12,,,,,,
2045632,openalex::W3020944286,Munich Personal RePEc Archive,openalex,10.1111/ecca.2015.82.issue-328,https://doi.org/10.1111/ecca.2015.82.issue-328,https://doi.org/10.1111/ecca.2015.82.issue-328,Does cigarette smoking affect body weight? cau...,Luca Pieroni; Luca Salmasi,2012.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://doi.org/10.1111/ecca.2015.82.issue-328,10.1111/ecca.2015.82.issue-328,,,,,,
2016916,openalex::W2767144409,Munich Personal RePEc Archive,openalex,10.1111/(issn)1542-4774/issues,https://doi.org/10.1111/(issn)1542-4774/issues,https://doi.org/10.1111/(issn)1542-4774/issues,Inattentive consumers and product quality,Mark Armstrong; Yongmin Chen,2009.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://doi.org/10.1111/(issn)1542-4774/issues,10.1111/(issn)1542-4774/issues,,,,,,
2050299,openalex::W4288027439,Munich Personal RePEc Archive,openalex,,,https://arxiv.org/abs/1911.02734,Dipolar condensed atomic mixtures and miscibil...,Lauro Tomio; R. Kishor Kumar; A. Gammal,2019.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://arxiv.org/abs/1911.02734,<na>,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2034783,openalex::W82458165,Munich Personal RePEc Archive,openalex,,,http://www.theses.fr/2010INPT0096/document,"Gestion autonomique de performance, d'énergie ...",Rémi Sharrock,2010.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,http://www.theses.fr/2010inpt0096/document,<na>,,,,,,
2089086,openalex::W4395677280,Munich Personal RePEc Archive,openalex,,,http://www.theses.fr/2010INPT0029/document,Architectures innovantes de systèmes de comman...,Manel Sghairi Haouati,2010.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,http://www.theses.fr/2010inpt0029/document,<na>,,,,,,
2062907,openalex::W4392339766,Munich Personal RePEc Archive,openalex,,,http://www.theses.fr/2010INPT0029/document,Innovative Architectures of Flight Control Sys...,Manel Sghairi Haouati,2010.0,2024-03-05T00:00:00,,,,,,,,,,,,parent,http://www.theses.fr/2010inpt0029/document,<na>,,,,,,
2038452,openalex::W20541881,Munich Personal RePEc Archive,openalex,,,http://www.theses.fr/2010CLF22073/document,"Synthèse d'aminocyclitols, inhibiteurs potenti...",Flora Camps Bres,2010.0,2016-06-24T00:00:00,,,,,,,,,,,,parent,http://www.theses.fr/2010clf22073/document,<na>,,,,,,


In [146]:
dupes_landing_page_url[dupes_landing_page_url['server_name']=='AgEcon Search'].sort_values(by='landing_page_url', ascending=False)

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
57448,datacite::10.22004/ag.econ.98631,AgEcon Search,datacite,10.22004/ag.econ.98631,https://doi.org/10.22004/ag.econ.98631,https://ageconsearch.umn.edu/record/98631,What do Haitians need after the earthquake?,"Andre, Rock; Lusk, Jayson L.",2011.0,2019-08-24,[],,,,,,false,,,,"{""client"": {""data"": {""id"": ""tind.agecon"", ""typ...",parent,https://ageconsearch.umn.edu/record/98631,10.22004/ag.econ.98631,,,,,,
56302,datacite::10.22004/ag.econ.95334,AgEcon Search,datacite,10.22004/ag.econ.95334,https://doi.org/10.22004/ag.econ.95334,https://ageconsearch.umn.edu/record/95334,Análisis de la ampliación de los recursos loca...,"Marin-Sanchez, Maria Del Mar",2010.0,2019-08-24,[],,,,,,false,,,,"{""client"": {""data"": {""id"": ""tind.agecon"", ""typ...",parent,https://ageconsearch.umn.edu/record/95334,10.22004/ag.econ.95334,,,,,,
56299,datacite::10.22004/ag.econ.95331,AgEcon Search,datacite,10.22004/ag.econ.95331,https://doi.org/10.22004/ag.econ.95331,https://ageconsearch.umn.edu/record/95331,LAS VIAS PECUARIAS Y LA PLANIFICACION TERRITORIAL,"Guaita Pradas, Inmaculada; Barrachina Martinez...",2010.0,2019-08-24,[],,,,,,false,,,,"{""client"": {""data"": {""id"": ""tind.agecon"", ""typ...",parent,https://ageconsearch.umn.edu/record/95331,10.22004/ag.econ.95331,,,,,,
56236,datacite::10.22004/ag.econ.95214,AgEcon Search,datacite,10.22004/ag.econ.95214,https://doi.org/10.22004/ag.econ.95214,https://ageconsearch.umn.edu/record/95214,Los cambios en la división internacional del t...,"Pensado Leglise, Mario Del Roble",2010.0,2019-08-24,[],,,,,,false,,,,"{""client"": {""data"": {""id"": ""tind.agecon"", ""typ...",parent,https://ageconsearch.umn.edu/record/95214,10.22004/ag.econ.95214,,,,,,
17846,datacite::10.22004/ag.econ.6543,AgEcon Search,datacite,10.22004/ag.econ.6543,https://doi.org/10.22004/ag.econ.6543,https://ageconsearch.umn.edu/record/6543,School District and Municipal Reorganization: ...,"Scorsone, Eric",2007.0,2019-08-23,[],,,,,,false,,,,"{""client"": {""data"": {""id"": ""tind.agecon"", ""typ...",parent,https://ageconsearch.umn.edu/record/6543,10.22004/ag.econ.6543,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58451,datacite::10.22004/ag.econ.103153,AgEcon Search,datacite,10.22004/ag.econ.103153,https://doi.org/10.22004/ag.econ.103153,https://ageconsearch.umn.edu/record/103153,"PRODUÇÃO DE FLORES EM UMUARAMA, NOROESTE DO PA...","Morita, Daniela Alves Dos Santos; Dias-Arieira...",2008.0,2019-08-24,[],,,,,,false,,,,"{""client"": {""data"": {""id"": ""tind.agecon"", ""typ...",parent,https://ageconsearch.umn.edu/record/103153,10.22004/ag.econ.103153,,,,,,
58441,datacite::10.22004/ag.econ.103117,AgEcon Search,datacite,10.22004/ag.econ.103117,https://doi.org/10.22004/ag.econ.103117,https://ageconsearch.umn.edu/record/103117,A Trajetória das Pesquisas com Práticas Agríco...,"Borges Filho, Epaminondas Luiz",2008.0,2019-08-24,[],,,,,,false,,,,"{""client"": {""data"": {""id"": ""tind.agecon"", ""typ...",parent,https://ageconsearch.umn.edu/record/103117,10.22004/ag.econ.103117,,,,,,
58405,datacite::10.22004/ag.econ.102769,AgEcon Search,datacite,10.22004/ag.econ.102769,https://doi.org/10.22004/ag.econ.102769,https://ageconsearch.umn.edu/record/102769,COOPERATIVAS COMO FORMA DE ORGANIZAÇÃO DA AGRI...,"Rosa, Nadir Paula Da; Lago, Adriano; Santos Ju...",2008.0,2019-08-24,[],,,,,,false,,,,"{""client"": {""data"": {""id"": ""tind.agecon"", ""typ...",parent,https://ageconsearch.umn.edu/record/102769,10.22004/ag.econ.102769,,,,,,
58393,datacite::10.22004/ag.econ.102727,AgEcon Search,datacite,10.22004/ag.econ.102727,https://doi.org/10.22004/ag.econ.102727,https://ageconsearch.umn.edu/record/102727,The Value of EU Agricultural Landscape,"Pavel, Ciaian; Gomez Y Paloma, Sergio",2011.0,2019-08-24,[],,,,,,false,,,,"{""client"": {""data"": {""id"": ""tind.agecon"", ""typ...",parent,https://ageconsearch.umn.edu/record/102727,10.22004/ag.econ.102727,,,,,,


In [147]:
dupes_landing_page_url[dupes_landing_page_url['server_name']=='PsyArXiv'].sort_values(by='landing_page_url', ascending=False)

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
2398812,crossref::10.31234/osf.io/zq3fa_v1,PsyArXiv,crossref,10.31234/osf.io/zq3fa_v1,https://doi.org/10.31234/osf.io/zq3fa_v1,https://osf.io/zq3fa_v1,Raising the Bar: Improving Methodological Rigo...,"Pennington, Charlotte Rebecca; Jones, Andrew; ...",2021.0,2025-05-26,,,,,,,false,,,,,parent,https://osf.io/zq3fa_v1,10.31234/osf.io/zq3fa_v1,_v1,explicit_version,1.0,,,
2425581,crossref::10.31234/osf.io/zq3fa,PsyArXiv,crossref,10.31234/osf.io/zq3fa,https://doi.org/10.31234/osf.io/zq3fa,https://osf.io/zq3fa_v1,Raising the Bar: Improving Methodological Rigo...,"Pennington, Charlotte Rebecca; Jones, Andrew; ...",2021.0,2021-06-29,,,,,,,false,,,,,parent,https://osf.io/zq3fa_v1,10.31234/osf.io/zq3fa,_v1,explicit_version,1.0,,,
2399047,crossref::10.31234/osf.io/zphx9_v1,PsyArXiv,crossref,10.31234/osf.io/zphx9_v1,https://doi.org/10.31234/osf.io/zphx9_v1,https://osf.io/zphx9_v1,The adventure of running experiments with teen...,"Alfonso, Antonio; Branas-Garza, Pablo; Jorrat,...",2022.0,2025-06-26,,,,,,,false,,,,,parent,https://osf.io/zphx9_v1,10.31234/osf.io/zphx9_v1,_v1,explicit_version,1.0,,,
2434853,crossref::10.31234/osf.io/zphx9,PsyArXiv,crossref,10.31234/osf.io/zphx9,https://doi.org/10.31234/osf.io/zphx9,https://osf.io/zphx9_v1,The adventure of running experiments with teen...,"Alfonso, Antonio; Branas-Garza, Pablo; Jorrat,...",2022.0,2022-11-14,,,,,,,false,,,,,parent,https://osf.io/zphx9_v1,10.31234/osf.io/zphx9,_v1,explicit_version,1.0,,,
2398868,crossref::10.31234/osf.io/y39xu_v1,PsyArXiv,crossref,10.31234/osf.io/y39xu_v1,https://doi.org/10.31234/osf.io/y39xu_v1,https://osf.io/y39xu_v1,The Effects of Patients’ Expectations on Surge...,"Laferton, Johannes Andreas Christoph; Oeltjen,...",2020.0,2025-06-04,,,,,,,false,,,,,parent,https://osf.io/y39xu_v1,10.31234/osf.io/y39xu_v1,_v1,explicit_version,1.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2399054,crossref::10.31234/osf.io/37a9q_v1,PsyArXiv,crossref,10.31234/osf.io/37a9q_v1,https://doi.org/10.31234/osf.io/37a9q_v1,https://osf.io/37a9q_v1,Access to meaning from visual input: Object an...,"Gregorova, Klara; Turini Volonghi, Jacopo; Gag...",2021.0,2025-06-26,,,,,,,false,,,,,parent,https://osf.io/37a9q_v1,10.31234/osf.io/37a9q_v1,_v1,explicit_version,1.0,,,
2437167,crossref::10.31234/osf.io/2cvk7,PsyArXiv,crossref,10.31234/osf.io/2cvk7,https://doi.org/10.31234/osf.io/2cvk7,https://osf.io/2cvk7_v1,No Evidence that Working Memory Modulates the ...,"Meyers, Elke; Alves, Maryna; Teugels, Anouk; T...",2023.0,2023-01-27,,,,,,,false,,,,,parent,https://osf.io/2cvk7_v1,10.31234/osf.io/2cvk7,_v1,explicit_version,1.0,,,
2399033,crossref::10.31234/osf.io/2cvk7_v1,PsyArXiv,crossref,10.31234/osf.io/2cvk7_v1,https://doi.org/10.31234/osf.io/2cvk7_v1,https://osf.io/2cvk7_v1,No Evidence that Working Memory Modulates the ...,"Meyers, Elke; alves, maryna; Teugels, Anouk; T...",2023.0,2025-06-24,,,,,,,false,,,,,parent,https://osf.io/2cvk7_v1,10.31234/osf.io/2cvk7_v1,_v1,explicit_version,1.0,,,
2398974,crossref::10.31234/osf.io/25qsu_v1,PsyArXiv,crossref,10.31234/osf.io/25qsu_v1,https://doi.org/10.31234/osf.io/25qsu_v1,https://osf.io/25qsu_v1,What Ergodicity Means for You,"Hunter, Michael D.; Fisher, Zachary; Geier, Ch...",2023.0,2025-06-16,,,,,,,false,,,,,parent,https://osf.io/25qsu_v1,10.31234/osf.io/25qsu_v1,_v1,explicit_version,1.0,,,


In [148]:
dupes_landing_page_url[dupes_landing_page_url['server_name']=='Earth and Space Science Open Archive'].sort_values(by='landing_page_url', ascending=False)

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
447059,crossref::10.22541/essoar.171909800.07100465/v1,Earth and Space Science Open Archive,crossref,10.22541/essoar.171909800.07100465/v1,https://doi.org/10.22541/essoar.171909800.0710...,https://essopenarchive.org/users/795535/articl...,Abrupt Changes in the Timing and Magnitude of ...,"Kelly, Stephen John; Popova, Ekaterina E.; Yoo...",2024.0,2024-06-22,,,,,,,false,,,,,parent,https://essopenarchive.org/users/795535/articl...,10.22541/essoar.171909800.07100465/v1,,,,/v1,explicit_version,1.0
447028,crossref::10.22541/essoar.171865213.30368497/v1,Earth and Space Science Open Archive,crossref,10.22541/essoar.171865213.30368497/v1,https://doi.org/10.22541/essoar.171865213.3036...,https://essopenarchive.org/users/795535/articl...,Abrupt Changes in the Timing and Magnitude of ...,"Kelly, Stephen John; Popova, Ekaterina E.; Yoo...",2024.0,2024-06-17,,,,,,,false,,,,,parent,https://essopenarchive.org/users/795535/articl...,10.22541/essoar.171865213.30368497/v1,,,,/v1,explicit_version,1.0
447065,crossref::10.22541/essoar.171926236.62536302/v1,Earth and Space Science Open Archive,crossref,10.22541/essoar.171926236.62536302/v1,https://doi.org/10.22541/essoar.171926236.6253...,https://essopenarchive.org/users/795502/articl...,Reducing Flood Insurance Costs by Employing Ge...,"Cui, Shibo; Wang, Jiaqing; Zhao, Jianshi",2024.0,2024-06-24,,,,,,,false,,,,,parent,https://essopenarchive.org/users/795502/articl...,10.22541/essoar.171926236.62536302/v1,,,,/v1,explicit_version,1.0
447045,crossref::10.22541/essoar.171867280.07560342/v1,Earth and Space Science Open Archive,crossref,10.22541/essoar.171867280.07560342/v1,https://doi.org/10.22541/essoar.171867280.0756...,https://essopenarchive.org/users/795502/articl...,Reducing Flood Insurance Costs by Employing Ge...,"Cui, Shibo; Wang, Jiaqing; Zhao, Jianshi",2024.0,2024-06-18,,,,,,,false,,,,,parent,https://essopenarchive.org/users/795502/articl...,10.22541/essoar.171867280.07560342/v1,,,,/v1,explicit_version,1.0
447066,crossref::10.22541/essoar.171926256.62956465/v1,Earth and Space Science Open Archive,crossref,10.22541/essoar.171926256.62956465/v1,https://doi.org/10.22541/essoar.171926256.6295...,https://essopenarchive.org/users/795498/articl...,Assessment of Forced Evaporation as a Solution...,"Den, Walter; Walla, Travis",2024.0,2024-06-24,,,,,,,false,,,,,parent,https://essopenarchive.org/users/795498/articl...,10.22541/essoar.171926256.62956465/v1,,,,/v1,explicit_version,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
444907,crossref::10.22541/essoar.169711700.09344058/v1,Earth and Space Science Open Archive,crossref,10.22541/essoar.169711700.09344058/v1,https://doi.org/10.22541/essoar.169711700.0934...,https://essopenarchive.org/users/530200/articl...,The 2022 Starlink geomagnetic storms: global t...,"Billett, Daniel D; Sartipzadeh, Kian; Ivarsen,...",2023.0,2023-10-12,,,,,,,false,,,,,parent,https://essopenarchive.org/users/530200/articl...,10.22541/essoar.169711700.09344058/v1,,,,/v1,explicit_version,1.0
445007,crossref::10.22541/essoar.169755311.15838743/v1,Earth and Space Science Open Archive,crossref,10.22541/essoar.169755311.15838743/v1,https://doi.org/10.22541/essoar.169755311.1583...,https://essopenarchive.org/users/529502/articl...,The influence of climate feedbacks on regional...,"Bonan, David; Feldl, Nicole; Siler, Nicholas; ...",2023.0,2023-10-17,,,,,,,false,,,,,parent,https://essopenarchive.org/users/529502/articl...,10.22541/essoar.169755311.15838743/v1,,,,/v1,explicit_version,1.0
444918,crossref::10.22541/essoar.169711716.62687942/v1,Earth and Space Science Open Archive,crossref,10.22541/essoar.169711716.62687942/v1,https://doi.org/10.22541/essoar.169711716.6268...,https://essopenarchive.org/users/529502/articl...,The influence of climate feedbacks on regional...,"Bonan, David; Feldl, Nicole; Siler, Nicholas; ...",2023.0,2023-10-12,,,,,,,false,,,,,parent,https://essopenarchive.org/users/529502/articl...,10.22541/essoar.169711716.62687942/v1,,,,/v1,explicit_version,1.0
447070,crossref::10.22541/essoar.171926407.71074494/v1,Earth and Space Science Open Archive,crossref,10.22541/essoar.171926407.71074494/v1,https://doi.org/10.22541/essoar.171926407.7107...,https://essopenarchive.org/users/527213/articl...,GOLD observations of the thermospheric respons...,"Evans, Joseph Scott; Correira, John; Lumpe, Je...",2024.0,2024-06-24,,,,,,,false,,,,,parent,https://essopenarchive.org/users/527213/articl...,10.22541/essoar.171926407.71074494/v1,,,,/v1,explicit_version,1.0


In [149]:
dupes_landing_page_url[dupes_landing_page_url['server_name']=='ScienceOpen Preprints'].sort_values(by='landing_page_url', ascending=False)

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
5639665,crossref::10.14293/s2199-1006.1.sor-med.cljwhk...,ScienceOpen Preprints,crossref,10.14293/s2199-1006.1.sor-med.cljwhk7.v1,https://doi.org/10.14293/s2199-1006.1.sor-med....,https://www.scienceopen.com/collection/tmresearch,Traditional Medicine Research,"Tóth-Czifra, Erzsébet",2017.0,2017-09-05,,,,,,,false,,,,,parent,https://www.scienceopen.com/collection/tmresearch,10.14293/s2199-1006.1.sor-med.cljwhk7.v1,,,,.v1,explicit_version,1.0
5639684,crossref::10.14293/s2199-1006.1.sor-med.clhzql...,ScienceOpen Preprints,crossref,10.14293/s2199-1006.1.sor-med.clhzql4.v1,https://doi.org/10.14293/s2199-1006.1.sor-med....,https://www.scienceopen.com/collection/tmresearch,Traditional Medicine Research,"Tóth-Czifra, Erzsébet",2017.0,2017-09-05,,,,,,,false,,,,,parent,https://www.scienceopen.com/collection/tmresearch,10.14293/s2199-1006.1.sor-med.clhzql4.v1,,,,.v1,explicit_version,1.0
5639343,crossref::10.14293/s2199-1006.1.sor-life.clcgo...,ScienceOpen Preprints,crossref,10.14293/s2199-1006.1.sor-life.clcgoo7.v1,https://doi.org/10.14293/s2199-1006.1.sor-life...,https://www.scienceopen.com/collection/ticksan...,Ticks and tick-borne pathogens,"Estrada Peña, Agustín",2017.0,2017-04-25,,,,,,,false,,,,,parent,https://www.scienceopen.com/collection/ticksan...,10.14293/s2199-1006.1.sor-life.clcgoo7.v1,,,,.v1,explicit_version,1.0
5639342,crossref::10.14293/s2199-1006.1.sor-life.clefm...,ScienceOpen Preprints,crossref,10.14293/s2199-1006.1.sor-life.clefm4a.v1,https://doi.org/10.14293/s2199-1006.1.sor-life...,https://www.scienceopen.com/collection/ticksan...,Ticks and tick-borne pathogens,"Estrada Peña, Agustín",2017.0,2017-04-24,,,,,,,false,,,,,parent,https://www.scienceopen.com/collection/ticksan...,10.14293/s2199-1006.1.sor-life.clefm4a.v1,,,,.v1,explicit_version,1.0
5639676,crossref::10.14293/s2199-1006.1.sor-lit.clwycr...,ScienceOpen Preprints,crossref,10.14293/s2199-1006.1.sor-lit.clwycrh.v1,https://doi.org/10.14293/s2199-1006.1.sor-lit....,https://www.scienceopen.com/collection/sylvia-...,The Journal of the Sylvia Townsend Warner Society,"Admin, ScienceOpen",2017.0,2017-09-11,,,,,,,false,,,,,parent,https://www.scienceopen.com/collection/sylvia-...,10.14293/s2199-1006.1.sor-lit.clwycrh.v1,,,,.v1,explicit_version,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5639827,crossref::10.14293/s2199-1006.1.sor-law.clcgnl...,ScienceOpen Preprints,crossref,10.14293/s2199-1006.1.sor-law.clcgnlz.v1,https://doi.org/10.14293/s2199-1006.1.sor-law....,https://www.scienceopen.com/collection/CarlGro...,Carl Grossmann Books,"Admin, ScienceOpen",2019.0,2019-10-17,,,,,,,false,,,,,parent,https://www.scienceopen.com/collection/carlgro...,10.14293/s2199-1006.1.sor-law.clcgnlz.v1,,,,.v1,explicit_version,1.0
5639823,crossref::10.14293/s2199-1006.1.sor-socsci.clg...,ScienceOpen Preprints,crossref,10.14293/s2199-1006.1.sor-socsci.clgkrdl.v1,https://doi.org/10.14293/s2199-1006.1.sor-socs...,https://www.scienceopen.com/collection/AKJourn...,Psychology by AKJournals,"Admin, ScienceOpen",2019.0,2019-10-09,,,,,,,false,,,,,parent,https://www.scienceopen.com/collection/akjourn...,10.14293/s2199-1006.1.sor-socsci.clgkrdl.v1,,,,.v1,explicit_version,1.0
5639821,crossref::10.14293/s2199-1006.1.sor-socsci.clf...,ScienceOpen Preprints,crossref,10.14293/s2199-1006.1.sor-socsci.clfttbi.v1,https://doi.org/10.14293/s2199-1006.1.sor-socs...,https://www.scienceopen.com/collection/AKJourn...,Psychology by AKJournals,"Admin, ScienceOpen",2019.0,2019-10-09,,,,,,,false,,,,,parent,https://www.scienceopen.com/collection/akjourn...,10.14293/s2199-1006.1.sor-socsci.clfttbi.v1,,,,.v1,explicit_version,1.0
5642552,crossref::10.14293/s2199-1006.1.sor-hist.cl4ob...,ScienceOpen Preprints,crossref,10.14293/s2199-1006.1.sor-hist.cl4obbv.v1,https://doi.org/10.14293/s2199-1006.1.sor-hist...,https://scienceopen.com/collection/e5717391-c6...,Leuven University Press Books,"Admin, ScienceOpen",2023.0,2023-10-23,,,,,,,false,,,,,parent,https://scienceopen.com/collection/e5717391-c6...,10.14293/s2199-1006.1.sor-hist.cl4obbv.v1,,,,.v1,explicit_version,1.0


In [150]:
dupes_landing_page_url[dupes_landing_page_url['server_name']=='TechRxiv'].sort_values(by='landing_page_url', ascending=False)

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
5718712,crossref::10.36227/techrxiv.174613156.61394999/v1,TechRxiv,crossref,10.36227/techrxiv.174613156.61394999/v1,https://doi.org/10.36227/techrxiv.174613156.61...,https://www.techrxiv.org/users/918312/articles...,Spatio-Temporal Gaze Analysis in VR: Comparing...,"Sun, Xiaoxiao; Shi, Xinran; Koorathota, Sharat...",2025.0,2025-05-01,,,,,,,false,,,,,parent,https://www.techrxiv.org/users/918312/articles...,10.36227/techrxiv.174613156.61394999/v1,,,,/v1,explicit_version,1.0
5718638,crossref::10.36227/techrxiv.174594289.98518288/v1,TechRxiv,crossref,10.36227/techrxiv.174594289.98518288/v1,https://doi.org/10.36227/techrxiv.174594289.98...,https://www.techrxiv.org/users/918312/articles...,Spatio-Temporal Gaze Analysis in VR: Comparing...,"Sun, Xiaoxiao; Shi, Xinran; Koorathota, Sharat...",2025.0,2025-04-29,,,,,,,false,,,,,parent,https://www.techrxiv.org/users/918312/articles...,10.36227/techrxiv.174594289.98518288/v1,,,,/v1,explicit_version,1.0
5718635,crossref::10.36227/techrxiv.174593999.98386877/v1,TechRxiv,crossref,10.36227/techrxiv.174593999.98386877/v1,https://doi.org/10.36227/techrxiv.174593999.98...,https://www.techrxiv.org/users/917216/articles...,Free-wheeling offline and online identificatio...,"Pfeifer, Bernd; Hackl, Christoph M.",2025.0,2025-04-29,,,,,,,false,,,,,parent,https://www.techrxiv.org/users/917216/articles...,10.36227/techrxiv.174593999.98386877/v1,,,,/v1,explicit_version,1.0
5718710,crossref::10.36227/techrxiv.174613113.39438286/v1,TechRxiv,crossref,10.36227/techrxiv.174613113.39438286/v1,https://doi.org/10.36227/techrxiv.174613113.39...,https://www.techrxiv.org/users/917216/articles...,Free-wheeling offline and online identificatio...,"Pfeifer, Bernd; Hackl, Christoph M.",2025.0,2025-05-01,,,,,,,false,,,,,parent,https://www.techrxiv.org/users/917216/articles...,10.36227/techrxiv.174613113.39438286/v1,,,,/v1,explicit_version,1.0
5718709,crossref::10.36227/techrxiv.174613072.24623819/v1,TechRxiv,crossref,10.36227/techrxiv.174613072.24623819/v1,https://doi.org/10.36227/techrxiv.174613072.24...,https://www.techrxiv.org/users/916644/articles...,Artificial Intelligence Applied to Risk Manage...,"Uehara, Marcelo Sousa",2025.0,2025-05-01,,,,,,,false,,,,,parent,https://www.techrxiv.org/users/916644/articles...,10.36227/techrxiv.174613072.24623819/v1,,,,/v1,explicit_version,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5714424,crossref::10.36227/techrxiv.171925125.59769255/v1,TechRxiv,crossref,10.36227/techrxiv.171925125.59769255/v1,https://doi.org/10.36227/techrxiv.171925125.59...,https://www.techrxiv.org/users/681556/articles...,Electrodynamics for Non-Relativistic Point Cha...,"Kühn, Steffen",2024.0,2024-07-03,,,,,,,false,,,,,parent,https://www.techrxiv.org/users/681556/articles...,10.36227/techrxiv.171925125.59769255/v1,,,,/v1,explicit_version,1.0
5714354,crossref::10.36227/techrxiv.171863936.68554917/v1,TechRxiv,crossref,10.36227/techrxiv.171863936.68554917/v1,https://doi.org/10.36227/techrxiv.171863936.68...,https://www.techrxiv.org/users/681556/articles...,Electrodynamics for Non-Relativistic Point Cha...,"Kühn, Steffen",2024.0,2024-06-17,,,,,,,false,,,,,parent,https://www.techrxiv.org/users/681556/articles...,10.36227/techrxiv.171863936.68554917/v1,,,,/v1,explicit_version,1.0
5714355,crossref::10.36227/techrxiv.171863948.81280772/v1,TechRxiv,crossref,10.36227/techrxiv.171863948.81280772/v1,https://doi.org/10.36227/techrxiv.171863948.81...,https://www.techrxiv.org/users/623064/articles...,A Study of Instagram Dependency on Indian Yout...,"Dhiman, Dr. Bharat",2024.0,2024-06-17,,,,,,,false,,,,,parent,https://www.techrxiv.org/users/623064/articles...,10.36227/techrxiv.171863948.81280772/v1,,,,/v1,explicit_version,1.0
5714425,crossref::10.36227/techrxiv.171925132.27898180/v1,TechRxiv,crossref,10.36227/techrxiv.171925132.27898180/v1,https://doi.org/10.36227/techrxiv.171925132.27...,https://www.techrxiv.org/users/623064/articles...,A Study of Instagram Dependency on Indian Yout...,"Dhiman, Dr. Bharat",2024.0,2024-07-03,,,,,,,false,,,,,parent,https://www.techrxiv.org/users/623064/articles...,10.36227/techrxiv.171925132.27898180/v1,,,,/v1,explicit_version,1.0


In [151]:
dupes_landing_page_url[dupes_landing_page_url['server_name']=='F1000Research'].sort_values(by='landing_page_url', ascending=False)

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
572326,crossref::10.12688/f1000research.3979.1,F1000Research,crossref,10.12688/f1000research.3979.1,https://doi.org/10.12688/f1000research.3979.1,http://f1000research.com/articles/3-94/v1,Data publication consensus and controversies,"Kratz, John; Strasser, Carly",2014.0,2014-06-17,,,,,,,false,,,,,parent,http://f1000research.com/articles/3-94/v1,10.12688/f1000research.3979.1,/v1,explicit_version,1.0,,,
572328,crossref::10.12688/f1000research.4264,F1000Research,crossref,10.12688/f1000research.4264,https://doi.org/10.12688/f1000research.4264,http://f1000research.com/articles/3-94/v1,Data publication consensus and controversies,"Kratz, John; Strasser, Carly",2014.0,2014-05-08,,,,,,,false,,,,,parent,http://f1000research.com/articles/3-94/v1,10.12688/f1000research.4264,/v1,explicit_version,1.0,,,
572462,crossref::10.12688/f1000research.4162,F1000Research,crossref,10.12688/f1000research.4162,https://doi.org/10.12688/f1000research.4162,http://f1000research.com/articles/3-91/v1,Ten things to get right for marine conservatio...,"Weeks, Rebecca; Pressey, Robert L.; Wilson, Jo...",2014.0,2014-05-12,,,,,,,false,,,,,parent,http://f1000research.com/articles/3-91/v1,10.12688/f1000research.4162,/v1,explicit_version,1.0,,,
572461,crossref::10.12688/f1000research.3886.1,F1000Research,crossref,10.12688/f1000research.3886.1,https://doi.org/10.12688/f1000research.3886.1,http://f1000research.com/articles/3-91/v1,Ten things to get right for marine conservatio...,"Weeks, Rebecca; Pressey, Robert L.; Wilson, Jo...",2014.0,2014-06-17,,,,,,,false,,,,,parent,http://f1000research.com/articles/3-91/v1,10.12688/f1000research.3886.1,/v1,explicit_version,1.0,,,
572231,crossref::10.12688/f1000research.4019,F1000Research,crossref,10.12688/f1000research.4019,https://doi.org/10.12688/f1000research.4019,http://f1000research.com/articles/3-83/v1,Recommendations to enable drug development for...,"Sames, Lori; Moore, Allison; Arnold, Renee; Ek...",2014.0,2014-04-03,,,,,,,false,,,,,parent,http://f1000research.com/articles/3-83/v1,10.12688/f1000research.4019,/v1,explicit_version,1.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
572276,crossref::10.12688/f1000research.1-2.v1,F1000Research,crossref,10.12688/f1000research.1-2.v1,https://doi.org/10.12688/f1000research.1-2.v1,http://f1000research.com/articles/1-2/v1,Considerations for clinical read alignment and...,"Oliver, Gavin R",2012.0,2013-05-09,,,,,,,false,,,,,parent,http://f1000research.com/articles/1-2/v1,10.12688/f1000research.1-2.v1,/v1,explicit_version,1.0,,,
572286,crossref::10.3410/f1000research.1-12.v1,F1000Research,crossref,10.3410/f1000research.1-12.v1,https://doi.org/10.3410/f1000research.1-12.v1,http://f1000research.com/articles/1-12/v1,Diversion at the ER: How Plasmodium falciparum...,"Römisch, Karin",2012.0,2015-06-24,,,,,,,false,,,,,parent,http://f1000research.com/articles/1-12/v1,10.3410/f1000research.1-12.v1,/v1,explicit_version,1.0,,,
572275,crossref::10.12688/f1000research.1-12.v1,F1000Research,crossref,10.12688/f1000research.1-12.v1,https://doi.org/10.12688/f1000research.1-12.v1,http://f1000research.com/articles/1-12/v1,Diversion at the ER: How Plasmodium falciparum...,"Römisch, Karin",2012.0,2013-05-09,,,,,,,false,,,,,parent,http://f1000research.com/articles/1-12/v1,10.12688/f1000research.1-12.v1,/v1,explicit_version,1.0,,,
572314,crossref::10.3410/f1000research.1-10.v1,F1000Research,crossref,10.3410/f1000research.1-10.v1,https://doi.org/10.3410/f1000research.1-10.v1,http://f1000research.com/articles/1-10/v1,Murine Tim-1 is excluded from the immunologica...,"Lin, Jean; Chen, Leo; Kane, Lawrence P",2012.0,2015-06-24,,,,,,,false,,,,,parent,http://f1000research.com/articles/1-10/v1,10.3410/f1000research.1-10.v1,/v1,explicit_version,1.0,,,


In [152]:
dupes_landing_page_url[dupes_landing_page_url['server_name']=='DSpace@MIT'].sort_values(by='landing_page_url', ascending=False)

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
335214,openalex::W4297798537,DSpace@MIT,openalex,,,https://arxiv.org/abs/1802.04480,RoboChain: A Secure Data-Sharing Framework for...,Eduardo Castelló Ferrer; Ognjen Rudovic; Thoma...,2018.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://arxiv.org/abs/1802.04480,<na>,,,,,,
335088,openalex::W3102404986,DSpace@MIT,openalex,,,http://hdl.handle.net/1721.1/77925,Coherency Strain and the Kinetics of Phase Sep...,Daniel A. Cogswell; Martin Z. Bazant,2013.0,2020-11-23T00:00:00,,,,,,,,,,,,parent,http://hdl.handle.net/1721.1/77925,<na>,,,,,,
332101,openalex::W1499052255,DSpace@MIT,openalex,,,http://hdl.handle.net/1721.1/77925,Coherency Strain and the Kinetics of Phase Sep...,Daniel A. Cogswell; Martin Z. Bazant,2012.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,http://hdl.handle.net/1721.1/77925,<na>,,,,,,
334098,openalex::W3024898576,DSpace@MIT,openalex,,,http://hdl.handle.net/1721.1/60550,Spin and Valence States of Iron in Mg0.8Fe0.2S...,Brent Grocholski; Seung‐Bo Shim; Jie Zhao; W. ...,2009.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,http://hdl.handle.net/1721.1/60550,<na>,,,,,,
336939,openalex::W1586102975,DSpace@MIT,openalex,,,http://hdl.handle.net/1721.1/60550,Spin and valence states of iron in (Mg[subscri...,Brent Grocholski; Sang-Heon Dan Shim; W. Sturh...,2009.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,http://hdl.handle.net/1721.1/60550,<na>,,,,,,
336006,openalex::W2905992706,DSpace@MIT,openalex,,,http://hdl.handle.net/1721.1/110743,Kinetic isotope effects of 12CH3D + OH and 13...,L. M. T. Joelsson; Johan A. Schmidt; Elna J. K...,2016.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,http://hdl.handle.net/1721.1/110743,<na>,,,,,,
340005,openalex::W2737630491,DSpace@MIT,openalex,,,http://hdl.handle.net/1721.1/110743,Kinetic isotope effects of [superscript 12]CH[...,L. M. T. Joelsson; Johan A. Schmidt; Elna J. K...,2016.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,http://hdl.handle.net/1721.1/110743,<na>,,,,,,
339951,openalex::W2734455347,DSpace@MIT,openalex,,,http://hdl.handle.net/1721.1/110608,The rotation-vibration structure of the SO[sub...,Jun Jiang; George Barratt Park; Robert W. Field,2016.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,http://hdl.handle.net/1721.1/110608,<na>,,,,,,
343384,openalex::W3215857213,DSpace@MIT,openalex,,,http://hdl.handle.net/1721.1/110608,The rotation-vibration structure of the SO2 C ...,Jun Jiang; George Park; Robert W. Field,2016.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,http://hdl.handle.net/1721.1/110608,<na>,,,,,,
340031,openalex::W2739208627,DSpace@MIT,openalex,,,http://hdl.handle.net/1721.1/110227,Observation of B[subscript c][superscript +]→J...,R. Aaij; B. Adeva; M. Adinolfi; Z. Ajaltouni; ...,2017.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,http://hdl.handle.net/1721.1/110227,<na>,,,,,,


In [153]:
dupes_landing_page_url[dupes_landing_page_url['server_name']=='Authorea Inc.'].sort_values(by='landing_page_url', ascending=False)

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
266149,crossref::10.22541/au.166013641.15972664/v1,Authorea Inc.,crossref,10.22541/au.166013641.15972664/v1,https://doi.org/10.22541/au.166013641.15972664/v1,https://www.techrxiv.org/doi/full/10.36227/tec...,"Human-Centered Artificial Intelligence, a review","Domfeh, Emmanuel Adjei; Weyori, Benjamin; APPI...",2022.0,2022-08-10,,,,,,,False,,,,,parent,https://www.techrxiv.org/doi/full/10.36227/tec...,10.22541/au.166013641.15972664/v1,.v1,explicit_version,1.0,,,
263047,crossref::10.22541/au.174111107.77253989/v1,Authorea Inc.,crossref,10.22541/au.174111107.77253989/v1,https://doi.org/10.22541/au.174111107.77253989/v1,https://www.authorea.com/users/898379/articles...,Inhibition of CaN/FoxO1/FABP4 pathway prevents...,"Zhu, Beibei; Luo, Shuangxue; Su, Hang; Zhang, ...",2025.0,2025-03-04,,,,,,,False,,,,,parent,https://www.authorea.com/users/898379/articles...,10.22541/au.174111107.77253989/v1,,,,/v1,explicit_version,1.0
263038,crossref::10.22541/au.174110059.99710888/v1,Authorea Inc.,crossref,10.22541/au.174110059.99710888/v1,https://doi.org/10.22541/au.174110059.99710888/v1,https://www.authorea.com/users/898379/articles...,Inhibition of CaN/FoxO1/FABP4 pathway prevents...,"Zhu, Beibei; Luo, Shuangxue; Su, Hang; Zhang, ...",2025.0,2025-03-04,,,,,,,False,,,,,parent,https://www.authorea.com/users/898379/articles...,10.22541/au.174110059.99710888/v1,,,,/v1,explicit_version,1.0
243970,crossref::10.22541/au.169754906.69651455/v1,Authorea Inc.,crossref,10.22541/au.169754906.69651455/v1,https://doi.org/10.22541/au.169754906.69651455/v1,https://www.authorea.com/users/672717/articles...,"Quality of randomised controlled trials, syste...","Jiang, Wilson; Wang, Bill; Sperandei, Sandro; ...",2023.0,2023-10-17,,,,,,,False,,,,,parent,https://www.authorea.com/users/672717/articles...,10.22541/au.169754906.69651455/v1,,,,/v1,explicit_version,1.0
243763,crossref::10.22541/au.169710870.00119117/v1,Authorea Inc.,crossref,10.22541/au.169710870.00119117/v1,https://doi.org/10.22541/au.169710870.00119117/v1,https://www.authorea.com/users/672717/articles...,"Quality of randomised controlled trials, syste...","Jiang, Wilson; Wang, Bill; Sperandei, Sandro; ...",2023.0,2023-10-12,,,,,,,False,,,,,parent,https://www.authorea.com/users/672717/articles...,10.22541/au.169710870.00119117/v1,,,,/v1,explicit_version,1.0
243767,crossref::10.22541/au.169710892.20614312/v1,Authorea Inc.,crossref,10.22541/au.169710892.20614312/v1,https://doi.org/10.22541/au.169710892.20614312/v1,https://www.authorea.com/users/672139/articles...,Solus: An end-to-end AI software developer,"Blumenfeld, Adam",2023.0,2023-10-12,,,,,,,False,,,,,parent,https://www.authorea.com/users/672139/articles...,10.22541/au.169710892.20614312/v1,,,,/v1,explicit_version,1.0
243981,crossref::10.22541/au.169754941.18437909/v1,Authorea Inc.,crossref,10.22541/au.169754941.18437909/v1,https://doi.org/10.22541/au.169754941.18437909/v1,https://www.authorea.com/users/672139/articles...,Solus: An end-to-end AI software developer,"Blumenfeld, Adam",2023.0,2023-10-17,,,,,,,False,,,,,parent,https://www.authorea.com/users/672139/articles...,10.22541/au.169754941.18437909/v1,,,,/v1,explicit_version,1.0
242953,crossref::10.22541/au.169516707.74001061/v1,Authorea Inc.,crossref,10.22541/au.169516707.74001061/v1,https://doi.org/10.22541/au.169516707.74001061/v1,https://www.authorea.com/users/666294/articles...,The Effect Of Complete Decongestıve Therapy On...,"Kavak, Songül Keskin; Kavak, Engin Eren",2023.0,2023-09-19,,,,,,,False,,,,,parent,https://www.authorea.com/users/666294/articles...,10.22541/au.169516707.74001061/v1,,,,/v1,explicit_version,1.0
242935,crossref::10.22541/au.169513747.79521287/v1,Authorea Inc.,crossref,10.22541/au.169513747.79521287/v1,https://doi.org/10.22541/au.169513747.79521287/v1,https://www.authorea.com/users/666294/articles...,The Effect Of Complete Decongestıve Therapy On...,"Kavak, Songül Keskin; Kavak, Engin Eren",2023.0,2023-09-19,,,,,,,False,,,,,parent,https://www.authorea.com/users/666294/articles...,10.22541/au.169513747.79521287/v1,,,,/v1,explicit_version,1.0
242952,crossref::10.22541/au.169516707.71218203/v1,Authorea Inc.,crossref,10.22541/au.169516707.71218203/v1,https://doi.org/10.22541/au.169516707.71218203/v1,https://www.authorea.com/users/666262/articles...,Regulation of Non-Canonical Proteins Encoded b...,"PERIASAMY, PARTHIBAN; Joseph, Craig; Campos, A...",2023.0,2023-09-19,,,,,,,False,,,,,parent,https://www.authorea.com/users/666262/articles...,10.22541/au.169516707.71218203/v1,,,,/v1,explicit_version,1.0


In [154]:
dupes_landing_page_url[dupes_landing_page_url['server_name']=='Digital Access to Scholarship at Harvard (DASH) (Harvard University)'].sort_values(by='landing_page_url', ascending=False)

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
349916,openalex::W7112397115,Digital Access to Scholarship at Harvard (DASH...,openalex,,,http://nrs.harvard.edu/urn-3:hul.eresource:Cla...,What GN owes OMD,"Nagy, Gregory",2018.0,2025-12-10T00:00:00,,,,,,,,,,,,parent,http://nrs.harvard.edu/urn-3:hul.eresource:cla...,<na>,,,,,,
351402,openalex::W7113012407,Digital Access to Scholarship at Harvard (DASH...,openalex,,,http://nrs.harvard.edu/urn-3:hul.eresource:Cla...,How are the epic verses of the Hesiodic Suitor...,"Nagy, Gregory",2021.0,2025-12-10T00:00:00,,,,,,,,,,,,parent,http://nrs.harvard.edu/urn-3:hul.eresource:cla...,<na>,,,,,,
350917,openalex::W7112816552,Digital Access to Scholarship at Harvard (DASH...,openalex,,,http://nrs.harvard.edu/urn-3:hul.eresource:Cla...,"What on earth did Helen ever see in Ajax, her ...","Nagy, Gregory",2021.0,2025-12-10T00:00:00,,,,,,,,,,,,parent,http://nrs.harvard.edu/urn-3:hul.eresource:cla...,<na>,,,,,,
350846,openalex::W7112788517,Digital Access to Scholarship at Harvard (DASH...,openalex,,,http://nrs.harvard.edu/urn-3:hul.eresource:Cla...,A question of “reception”: how could Homer eve...,"Nagy, Gregory",2021.0,2025-12-10T00:00:00,,,,,,,,,,,,parent,http://nrs.harvard.edu/urn-3:hul.eresource:cla...,<na>,,,,,,
351422,openalex::W7113021182,Digital Access to Scholarship at Harvard (DASH...,openalex,,,http://nrs.harvard.edu/urn-3:hul.eresource:Cla...,Death of an Amazon,"Nagy, Gregory",2020.0,2025-12-10T00:00:00,,,,,,,,,,,,parent,http://nrs.harvard.edu/urn-3:hul.eresource:cla...,<na>,,,,,,
350563,openalex::W7112670930,Digital Access to Scholarship at Harvard (DASH...,openalex,,,http://nrs.harvard.edu/urn-3:hul.eresource:Cla...,Can Sappho be freed from receivership? Part Two,"Nagy, Gregory",2021.0,2025-12-10T00:00:00,,,,,,,,,,,,parent,http://nrs.harvard.edu/urn-3:hul.eresource:cla...,<na>,,,,,,
350530,openalex::W7112658728,Digital Access to Scholarship at Harvard (DASH...,openalex,,,http://nrs.harvard.edu/urn-3:hul.eresource:Cla...,Mages and Ionians,"Nagy, Gregory",2017.0,2025-12-10T00:00:00,,,,,,,,,,,,parent,http://nrs.harvard.edu/urn-3:hul.eresource:cla...,<na>,,,,,,
350078,openalex::W7112462899,Digital Access to Scholarship at Harvard (DASH...,openalex,,,http://nrs.harvard.edu/urn-3:hul.eresource:Cla...,"Sappho’s Aphrodite, the goddess Chryse, and a ...","Nagy, Gregory",2021.0,2025-12-10T00:00:00,,,,,,,,,,,,parent,http://nrs.harvard.edu/urn-3:hul.eresource:cla...,<na>,,,,,,
349980,openalex::W7112425060,Digital Access to Scholarship at Harvard (DASH...,openalex,,,http://nrs.harvard.edu/urn-3:hul.eresource:Cla...,On Visualizing Heavenly Origins for Particular...,"Nagy, Gregory",2021.0,2025-12-10T00:00:00,,,,,,,,,,,,parent,http://nrs.harvard.edu/urn-3:hul.eresource:cla...,<na>,,,,,,
349954,openalex::W7112414740,Digital Access to Scholarship at Harvard (DASH...,openalex,,,http://nrs.harvard.edu/urn-3:hul.eresource:Cla...,A bathtub in Pylos,"Nagy, Gregory",2017.0,2025-12-10T00:00:00,,,,,,,,,,,,parent,http://nrs.harvard.edu/urn-3:hul.eresource:cla...,<na>,,,,,,


In [155]:
dupes_landing_page_url[dupes_landing_page_url['server_name']=='EarthArXiv'].sort_values(by='landing_page_url', ascending=False)

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
428821,crossref::10.31223/x5kt7m,EarthArXiv,crossref,10.31223/x5kt7m,https://doi.org/10.31223/x5kt7m,https://eartharxiv.org/repository/view/6898/,Can spinodal decomposition occur during decomp...,"Nishiwaki, Mizuki",2025.0,2025-08-13,,,,,,,False,,,,,parent,https://eartharxiv.org/repository/view/6898/,10.31223/x5kt7m,,,,,,
429050,crossref::10.31223/x58h6b,EarthArXiv,crossref,10.31223/x58h6b,https://doi.org/10.31223/x58h6b,https://eartharxiv.org/repository/view/6898/,Can spinodal decomposition occur during decomp...,"Nishiwaki, Mizuki",2025.0,2024-03-28,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,10.1016/j.epsl.2025.119655,,,True,,,,,parent,https://eartharxiv.org/repository/view/6898/,10.31223/x58h6b,,,,,,
427419,crossref::10.31223/x5fd5g,EarthArXiv,crossref,10.31223/x5fd5g,https://doi.org/10.31223/x5fd5g,https://eartharxiv.org/repository/view/6821/,Moving graphs: Predicting barchan dune migrati...,"Beelen, Daan",2024.0,2024-03-07,,,,,,,False,,,,,parent,https://eartharxiv.org/repository/view/6821/,10.31223/x5fd5g,,,,,,
427420,crossref::10.31223/x52t1p,EarthArXiv,crossref,10.31223/x52t1p,https://doi.org/10.31223/x52t1p,https://eartharxiv.org/repository/view/6821/,Moving graphs: Predicting barchan dune migrati...,"Beelen, Daan",2024.0,2023-10-17,,,,,,,False,,,,,parent,https://eartharxiv.org/repository/view/6821/,10.31223/x52t1p,,,,,,
426685,crossref::10.31223/x54s90,EarthArXiv,crossref,10.31223/x54s90,https://doi.org/10.31223/x54s90,https://eartharxiv.org/repository/view/5038/,Statistical precursor signals for Dansgaard-Oe...,"Mitsui, Takahito; Boers, Niklas",2023.0,2023-02-14,,,,,,,False,,,,,parent,https://eartharxiv.org/repository/view/5038/,10.31223/x54s90,,,,,,
426406,crossref::10.31223/x58m1g,EarthArXiv,crossref,10.31223/x58m1g,https://doi.org/10.31223/x58m1g,https://eartharxiv.org/repository/view/5038/,Predictability of abrupt northern-hemisphere c...,"Mitsui, Takahito; Boers, Niklas",2023.0,2023-02-14,,,,,,,False,,,,,parent,https://eartharxiv.org/repository/view/5038/,10.31223/x58m1g,,,,,,
425234,crossref::10.31223/x59922,EarthArXiv,crossref,10.31223/x59922,https://doi.org/10.31223/x59922,http://eartharxiv.org/repository/view/2855/,The Influence of Grain Shape and Size on the R...,"Payton, Ryan; Chiarella, Domenico; Kingdon, An...",2021.0,2021-11-12,,,,,,,False,,,,,parent,http://eartharxiv.org/repository/view/2855/,10.31223/x59922,,,,,,
425241,crossref::10.31223/x5gs6z,EarthArXiv,crossref,10.31223/x5gs6z,https://doi.org/10.31223/x5gs6z,http://eartharxiv.org/repository/view/2855/,The Influence of Grain Shape and Size on the R...,"Payton, Ryan; Chiarella, Domenico; Kingdon, An...",2021.0,2021-11-18,,,,,,,False,,,,,parent,http://eartharxiv.org/repository/view/2855/,10.31223/x5gs6z,,,,,,
425697,crossref::10.31223/x5fk9q,EarthArXiv,crossref,10.31223/x5fk9q,https://doi.org/10.31223/x5fk9q,http://eartharxiv.org/repository/view/2730/,Evaluating the Evolution of ECMWF Precipitatio...,"Ghajarnia, Navid; Akbari, Mahdi; Saemian, Peym...",2022.0,2022-03-16,,,,,,,False,,,,,parent,http://eartharxiv.org/repository/view/2730/,10.31223/x5fk9q,,,,,,
425698,crossref::10.31223/x59w6f,EarthArXiv,crossref,10.31223/x59w6f,https://doi.org/10.31223/x59w6f,http://eartharxiv.org/repository/view/2730/,Evaluating the Evolution of ECMWF Precipitatio...,"Ghajarnia, Navid; Akbari, Mahdi; Saemian, Peym...",2022.0,2022-03-16,,,,,,,False,,,,,parent,http://eartharxiv.org/repository/view/2730/,10.31223/x59w6f,,,,,,


In [156]:
dupes_landing_page_url[dupes_landing_page_url['server_name']=='IACR Cryptology ePrint Archive'].sort_values(by='landing_page_url', ascending=False)

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
1953806,openalex::W3207917365,IACR Cryptology ePrint Archive,openalex,,,https://eprint.iacr.org/2016/161.pdf,Revisiting Structure Graphs: Applications to C...,Ashwin Jha; Mridul Nandi,2016.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://eprint.iacr.org/2016/161.pdf,<na>,,,,,,
1946913,openalex::W2952233364,IACR Cryptology ePrint Archive,openalex,,,https://eprint.iacr.org/2016/161.pdf,Revisiting Structure Graph and Its Application...,Ashwin Jha; Mridul Nandi,2016.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://eprint.iacr.org/2016/161.pdf,<na>,,,,,,
1949116,openalex::W3217563045,IACR Cryptology ePrint Archive,openalex,,,https://eprint.iacr.org/2016/008.pdf,cMix: Mixing with Minimal Real-Time Asymmetric...,David Chaum; Debajyoti Das; Farid Javani; Anik...,2016.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://eprint.iacr.org/2016/008.pdf,<na>,,,,,,
1943071,openalex::W2398691649,IACR Cryptology ePrint Archive,openalex,,,https://eprint.iacr.org/2016/008.pdf,cMix: Anonymization byHigh-Performance Scalabl...,David Chaum; Farid Javani; Aniket Kate; Anna K...,2016.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://eprint.iacr.org/2016/008.pdf,<na>,,,,,,
1949071,openalex::W3207746821,IACR Cryptology ePrint Archive,openalex,,,https://eprint.iacr.org/2015/971.pdf,Attacks on the Search-RLWE problem with small ...,Hao Chen; Kristin Lauter; Katherine E. Stange,2015.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://eprint.iacr.org/2015/971.pdf,<na>,,,,,,
1942923,openalex::W2403700287,IACR Cryptology ePrint Archive,openalex,,,https://eprint.iacr.org/2015/971.pdf,Attacks on Search RLWE.,Hao Chen; Kristin Lauter; Katherine E. Stange,2015.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://eprint.iacr.org/2015/971.pdf,<na>,,,,,,
1946601,openalex::W2791898560,IACR Cryptology ePrint Archive,openalex,,,https://eprint.iacr.org/2015/942.pdf,Secrecy and independence for election schemes.,Ben Smyth,2015.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://eprint.iacr.org/2015/942.pdf,<na>,,,,,,
1945645,openalex::W3210407932,IACR Cryptology ePrint Archive,openalex,,,https://eprint.iacr.org/2015/942.pdf,"Ballot secrecy: Security definition, sufficien...",Ben Smyth,2015.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://eprint.iacr.org/2015/942.pdf,<na>,,,,,,
1953716,openalex::W3204639541,IACR Cryptology ePrint Archive,openalex,,,https://eprint.iacr.org/2015/806.pdf,Fault Space Transformation: A Generic Approach...,Sikhar Patranabis; Abhishek Chakraborty; Debde...,2015.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://eprint.iacr.org/2015/806.pdf,<na>,,,,,,
1943516,openalex::W2402265787,IACR Cryptology ePrint Archive,openalex,,,https://eprint.iacr.org/2015/806.pdf,Using State Space Encoding To Counter Biased F...,Sikhar Patranabis; Abhishek Chakraborty; Debde...,2015.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://eprint.iacr.org/2015/806.pdf,<na>,,,,,,


In [157]:
dupes_landing_page_url[dupes_landing_page_url['server_name']=='Organic Eprints'].sort_values(by='landing_page_url', ascending=False)

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
2246907,openalex::W102989963,Organic Eprints,openalex,10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,PLANT BASED-DIVERSITY PRACTICES IN CONVENTIONA...,Marion Casagrande; Lionel Alletto; Christophe ...,2014.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://doi.org/10.3220/rep_20_1_2014,10.3220/rep_20_1_2014,,,,,,
2245698,openalex::W70345073,Organic Eprints,openalex,10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,"Soil arthropod diversity in organic, integrate...",Vasileios Gkisakis; Dimitrios Kollaros; Paolo ...,2014.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://doi.org/10.3220/rep_20_1_2014,10.3220/rep_20_1_2014,,,,,,
2257146,openalex::W92931842,Organic Eprints,openalex,10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,Management of biomass resources within the cro...,Tora Råberg; Georg Carlsson; Erik Steen Jensen,2014.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://doi.org/10.3220/rep_20_1_2014,10.3220/rep_20_1_2014,,,,,,
2244074,openalex::W91005601,Organic Eprints,openalex,10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,Need for phosphorus input in Austrian organic ...,Jürgen K. Friedel; Martina Kasper; Harald Schm...,2014.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://doi.org/10.3220/rep_20_1_2014,10.3220/rep_20_1_2014,,,,,,
2257092,openalex::W90975811,Organic Eprints,openalex,10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,SILAGE IN DIETS FOR ORGANIC SOWS IN GESTATION,P. Bikker; G.P. Binnendijk; H.M. Vermeer; C.M....,2014.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://doi.org/10.3220/rep_20_1_2014,10.3220/rep_20_1_2014,,,,,,
2257068,openalex::W89980577,Organic Eprints,openalex,10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,"KNOWING, CHARACTERIZING AND ASSESSING SYSTEMS ...",L. Fontaine; Laetitia Fourrié; Josette Garnier...,2014.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://doi.org/10.3220/rep_20_1_2014,10.3220/rep_20_1_2014,,,,,,
2257049,openalex::W89340078,Organic Eprints,openalex,10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,An agronomic approach to yield comparisons bet...,Daniel Neuhoff,2014.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://doi.org/10.3220/rep_20_1_2014,10.3220/rep_20_1_2014,,,,,,
2244981,openalex::W82371600,Organic Eprints,openalex,10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,Preliminary Results Of The Global Comparative ...,Hervé Bouagnimbeck; Roberto Ugás; Jannet Villa...,2014.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://doi.org/10.3220/rep_20_1_2014,10.3220/rep_20_1_2014,,,,,,
2244017,openalex::W81918905,Organic Eprints,openalex,10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,INFLUENCE OF NUTRIENT SOURCES AND INCLUSION OF...,Dinesh Kumar,2014.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://doi.org/10.3220/rep_20_1_2014,10.3220/rep_20_1_2014,,,,,,
2256831,openalex::W81372898,Organic Eprints,openalex,10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,LIVESTOCK IN ORGANIC FARMING – HOW IMPORTANT I...,Franz Schulz; Christopher Brock; Günter Leithold,2014.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://doi.org/10.3220/rep_20_1_2014,10.3220/rep_20_1_2014,,,,,,


In [158]:
dupes_landing_page_url[dupes_landing_page_url['server_name']=='Open Science Framework'].sort_values(by='landing_page_url', ascending=False)

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
2156227,crossref::10.31219/osf.io/wcks6_v1,Open Science Framework,crossref,10.31219/osf.io/wcks6_v1,https://doi.org/10.31219/osf.io/wcks6_v1,https://osf.io/wcks6_v1,Value-based decision-making in regular alcohol...,"Copeland, Amber; Stafford, Tom; Field, Matt",2023.0,2025-05-26,,,,,,,False,,,,,parent,https://osf.io/wcks6_v1,10.31219/osf.io/wcks6_v1,_v1,explicit_version,1.0,,,
2227441,crossref::10.31219/osf.io/wcks6,Open Science Framework,crossref,10.31219/osf.io/wcks6,https://doi.org/10.31219/osf.io/wcks6,https://osf.io/wcks6_v1,Value-based decision-making in regular alcohol...,"Copeland, Amber; Stafford, Tom; Field, Matt",2023.0,2023-08-24,,,,,,,False,,,,,parent,https://osf.io/wcks6_v1,10.31219/osf.io/wcks6,_v1,explicit_version,1.0,,,
2123947,crossref::10.31219/osf.io/w8kue,Open Science Framework,crossref,10.31219/osf.io/w8kue,https://doi.org/10.31219/osf.io/w8kue,https://osf.io/w8kue,Quality-of-life in dementia: Assessment in low...,"Chua, Kia-Chong; Böhnke, Jan R.; Prince, Marti...",2017.0,2018-07-02,,,,,,,False,,,,,parent,https://osf.io/w8kue,10.31219/osf.io/w8kue,,,,,,
2133659,crossref::10.31219/osf.io/w4mhc,Open Science Framework,crossref,10.31219/osf.io/w4mhc,https://doi.org/10.31219/osf.io/w4mhc,https://osf.io/w4mhc,Chinese Immersion Teachers in the U.S: Percept...,"Chen, Mengyao; Li, Jiahang; Gorke, Yongling Z",2022.0,2022-03-07,,,,,,,False,,,,,parent,https://osf.io/w4mhc,10.31219/osf.io/w4mhc,,,,,,
2157098,crossref::10.31219/osf.io/sdzfq_v1,Open Science Framework,crossref,10.31219/osf.io/sdzfq_v1,https://doi.org/10.31219/osf.io/sdzfq_v1,https://osf.io/sdzfq_v1,Exploring the Design Space of BioFabric Visual...,"Fuchs, Johannes; Dennig, Frederik L.; Heinle, ...",2024.0,2025-06-30,,,,,,,False,,,,,parent,https://osf.io/sdzfq_v1,10.31219/osf.io/sdzfq_v1,_v1,explicit_version,1.0,,,
2226663,crossref::10.31219/osf.io/sdzfq,Open Science Framework,crossref,10.31219/osf.io/sdzfq,https://doi.org/10.31219/osf.io/sdzfq,https://osf.io/sdzfq_v1,Exploring the Design Space of BioFabric Visual...,"Fuchs, Johannes; Dennig, Frederik L.; Heinle, ...",2024.0,2024-03-21,,,,,,,False,,,,,parent,https://osf.io/sdzfq_v1,10.31219/osf.io/sdzfq,_v1,explicit_version,1.0,,,
2145107,crossref::10.31219/osf.io/rfvy5,Open Science Framework,crossref,10.31219/osf.io/rfvy5,https://doi.org/10.31219/osf.io/rfvy5,https://osf.io/rfvy5,Assessing the Psychometric Properties and Vali...,"Haddox, Dawson; Mackin, Daniel; Griffin, Tess;...",2024.0,2024-11-04,,,,,,,False,,,,,parent,https://osf.io/rfvy5,10.31219/osf.io/rfvy5,,,,,,
2227472,crossref::10.31219/osf.io/p5gm4,Open Science Framework,crossref,10.31219/osf.io/p5gm4,https://doi.org/10.31219/osf.io/p5gm4,https://osf.io/p5gm4_v1,Young children's screen time during the first ...,"Bergmann, Christina; Dimitrova, Nevena; Alasla...",2021.0,2021-05-31,,,,,,,False,,,,,parent,https://osf.io/p5gm4_v1,10.31219/osf.io/p5gm4,_v1,explicit_version,1.0,,,
2156392,crossref::10.31219/osf.io/p5gm4_v1,Open Science Framework,crossref,10.31219/osf.io/p5gm4_v1,https://doi.org/10.31219/osf.io/p5gm4_v1,https://osf.io/p5gm4_v1,Young children's screen time during the first ...,"Bergmann, Christina; Dimitrova, Nevena; Alasla...",2021.0,2025-05-30,,,,,,,False,,,,,parent,https://osf.io/p5gm4_v1,10.31219/osf.io/p5gm4_v1,_v1,explicit_version,1.0,,,
2156403,crossref::10.31219/osf.io/mztwg_v1,Open Science Framework,crossref,10.31219/osf.io/mztwg_v1,https://doi.org/10.31219/osf.io/mztwg_v1,https://osf.io/mztwg_v1,Social comparison selection in individuals wit...,"Schlechter, Pascal; Meyer, Thomas; Morina, Nex...",2024.0,2025-06-01,,,,,,,False,,,,,parent,https://osf.io/mztwg_v1,10.31219/osf.io/mztwg_v1,_v1,explicit_version,1.0,,,


In [159]:
dupes_landing_page_url[dupes_landing_page_url['server_name']=='SciELO Preprints'].sort_values(by='landing_page_url', ascending=False)

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
5637898,crossref::10.1590/scielopreprints9984,SciELO Preprints,crossref,10.1590/scielopreprints9984,https://doi.org/10.1590/scielopreprints9984,https://preprints.scielo.org/index.php/scielo/...,ESTUDANTES COTISTAS NO CURSO DE MEDICINA DA UF...,"Vilar Bonaldi, Eduardo; Viricimo, Luan",2024.0,2024-09-26,,,,,,,False,,,,,parent,https://preprints.scielo.org/index.php/scielo/...,10.1590/scielopreprints9984,,,,,,
5637921,crossref::10.1590/scielopreprints.9984,SciELO Preprints,crossref,10.1590/scielopreprints.9984,https://doi.org/10.1590/scielopreprints.9984,https://preprints.scielo.org/index.php/scielo/...,ESTUDANTES COTISTAS NO CURSO DE MEDICINA DA UF...,"Vilar Bonaldi, Eduardo; Viricimo, Luan",2024.0,2024-09-14,,,,,,,False,,,,,parent,https://preprints.scielo.org/index.php/scielo/...,10.1590/scielopreprints.9984,,,,,,
5635263,crossref::10.1590/scielopreprints.2690,SciELO Preprints,crossref,10.1590/scielopreprints.2690,https://doi.org/10.1590/scielopreprints.2690,https://preprints.scielo.org/index.php/scielo/...,Prácticas de evaluación en entornos virtuales ...,"Picón, Gerardo Armando; Rodríguez, Nimia; Oliv...",2021.0,2021-07-26,,,,,,,False,,,,,parent,https://preprints.scielo.org/index.php/scielo/...,10.1590/scielopreprints.2690,,,,,,
5634999,crossref::10.1590/scielopreprints.1690,SciELO Preprints,crossref,10.1590/scielopreprints.1690,https://doi.org/10.1590/scielopreprints.1690,https://preprints.scielo.org/index.php/scielo/...,Prácticas de evaluación en entornos virtuales ...,"Picón, Gerardo Armando; Rodríguez, Nimia; Oliv...",2021.0,2021-07-26,,,,,,,False,,,,,parent,https://preprints.scielo.org/index.php/scielo/...,10.1590/scielopreprints.1690,,,,,,
5638844,crossref::10.1590/2596-304x202527e20251386,SciELO Preprints,crossref,10.1590/2596-304x202527e20251386,https://doi.org/10.1590/2596-304x202527e20251386,https://preprints.scielo.org/index.php/scielo/...,Surrealism and architecture: the transatlantic...,"Naumann Machado, Nara Helena; Ponge, Robert",2025.0,2025-11-06,,,,,,,False,,,,,parent,https://preprints.scielo.org/index.php/scielo/...,10.1590/2596-304x202527e20251386,,,,,,
5638827,crossref::10.1590/scielopreprints.13933,SciELO Preprints,crossref,10.1590/scielopreprints.13933,https://doi.org/10.1590/scielopreprints.13933,https://preprints.scielo.org/index.php/scielo/...,Surrealism and architecture: the transatlantic...,"Naumann Machado, Nara Helena; Ponge, Robert",2025.0,2025-10-30,,,,,,,False,,,,,parent,https://preprints.scielo.org/index.php/scielo/...,10.1590/scielopreprints.13933,,,,,,
5638845,crossref::10.1590/2596-304x202527e20251278,SciELO Preprints,crossref,10.1590/2596-304x202527e20251278,https://doi.org/10.1590/2596-304x202527e20251278,https://preprints.scielo.org/index.php/scielo/...,"Orfeu Negro e Emicida AmarElo: canto, comunida...","Silva Menezes, Roniere",2025.0,2025-11-06,,,,,,,False,,,,,parent,https://preprints.scielo.org/index.php/scielo/...,10.1590/2596-304x202527e20251278,,,,,,
5638826,crossref::10.1590/scielopreprints.13919,SciELO Preprints,crossref,10.1590/scielopreprints.13919,https://doi.org/10.1590/scielopreprints.13919,https://preprints.scielo.org/index.php/scielo/...,"Orfeu Negro e Emicida AmarElo: canto, comunida...","Silva Menezes, Roniere",2025.0,2025-10-29,,,,,,,False,,,,,parent,https://preprints.scielo.org/index.php/scielo/...,10.1590/scielopreprints.13919,,,,,,
5638846,crossref::10.1590/2596-304x202527e20251120,SciELO Preprints,crossref,10.1590/2596-304x202527e20251120,https://doi.org/10.1590/2596-304x202527e20251120,https://preprints.scielo.org/index.php/scielo/...,Marginal e antropófago: a poética do corpo em ...,"Pivetta de Oliveira, Rejane",2025.0,2025-11-06,,,,,,,False,,,,,parent,https://preprints.scielo.org/index.php/scielo/...,10.1590/2596-304x202527e20251120,,,,,,
5638824,crossref::10.1590/scielopreprints.13907,SciELO Preprints,crossref,10.1590/scielopreprints.13907,https://doi.org/10.1590/scielopreprints.13907,https://preprints.scielo.org/index.php/scielo/...,Marginal e antropófago: a poética do corpo em ...,"Pivetta de Oliveira, Rejane",2025.0,2025-10-29,,,,,,,False,,,,,parent,https://preprints.scielo.org/index.php/scielo/...,10.1590/scielopreprints.13907,,,,,,


In [160]:
dupes_landing_page_url[dupes_landing_page_url['server_name']=='EcoEvoRxiv'].sort_values(by='landing_page_url', ascending=False)

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
455996,crossref::10.31219/osf.io/wu5vz,EcoEvoRxiv,crossref,10.31219/osf.io/wu5vz,https://doi.org/10.31219/osf.io/wu5vz,https://osf.io/wu5vz,Test,"Rosenblatt, Rebecca P.",2018.0,2018-11-09,,,,,,,False,,,,,parent,https://osf.io/wu5vz,10.31219/osf.io/wu5vz,,,,,,
456000,crossref::10.32942/osf.io/wu5vz,EcoEvoRxiv,crossref,10.32942/osf.io/wu5vz,https://doi.org/10.32942/osf.io/wu5vz,https://osf.io/wu5vz,,,2018.0,2018-11-09,,,,,,,False,,,,,parent,https://osf.io/wu5vz,10.32942/osf.io/wu5vz,,,,,,
455999,crossref::10.32942/osf.io/k85eq,EcoEvoRxiv,crossref,10.32942/osf.io/k85eq,https://doi.org/10.32942/osf.io/k85eq,https://osf.io/k85eq,,,2018.0,2018-11-09,,,,,,,False,,,,,parent,https://osf.io/k85eq,10.32942/osf.io/k85eq,,,,,,
455997,crossref::10.31219/osf.io/k85eq,EcoEvoRxiv,crossref,10.31219/osf.io/k85eq,https://doi.org/10.31219/osf.io/k85eq,https://osf.io/k85eq,testtest,"Rosenblatt, Rebecca P.",2018.0,2018-11-09,,,,,,,False,,,,,parent,https://osf.io/k85eq,10.31219/osf.io/k85eq,,,,,,
455998,crossref::10.31219/osf.io/gzunx,EcoEvoRxiv,crossref,10.31219/osf.io/gzunx,https://doi.org/10.31219/osf.io/gzunx,https://osf.io/gzunx,testtesttest,"Rosenblatt, Rebecca P.",2018.0,2018-11-09,,,,,,,False,,,,,parent,https://osf.io/gzunx,10.31219/osf.io/gzunx,,,,,,
456001,crossref::10.32942/osf.io/gzunx,EcoEvoRxiv,crossref,10.32942/osf.io/gzunx,https://doi.org/10.32942/osf.io/gzunx,https://osf.io/gzunx,,,2018.0,2018-11-09,,,,,,,False,,,,,parent,https://osf.io/gzunx,10.32942/osf.io/gzunx,,,,,,
454497,crossref::10.32942/x2qc8z,EcoEvoRxiv,crossref,10.32942/x2qc8z,https://doi.org/10.32942/x2qc8z,https://ecoevorxiv.org/repository/view/6403/,The trade-offs of honest and dishonest signals,"Zachar, István; Penn, Dustin",2023.0,2023-12-13,,,,,,,False,,,,,parent,https://ecoevorxiv.org/repository/view/6403/,10.32942/x2qc8z,,,,,,
454505,crossref::10.32942/x2pc91,EcoEvoRxiv,crossref,10.32942/x2pc91,https://doi.org/10.32942/x2pc91,https://ecoevorxiv.org/repository/view/6403/,The trade-offs of honest and dishonest signals,"Számadó, Szabolcs; Zachar, István; Penn, Dustin",2023.0,2023-12-14,,,,,,,False,,,,,parent,https://ecoevorxiv.org/repository/view/6403/,10.32942/x2pc91,,,,,,
454463,crossref::10.32942/x23s40,EcoEvoRxiv,crossref,10.32942/x23s40,https://doi.org/10.32942/x23s40,https://ecoevorxiv.org/repository/view/6292/,Amazonian soundscapes: unravelling the secrets...,"Do Nascimento, Leandro; Pérez-Granados, Cristi...",2023.0,2023-11-28,,,,,,,False,,,,,parent,https://ecoevorxiv.org/repository/view/6292/,10.32942/x23s40,,,,,,
454456,crossref::10.32942/x2102p,EcoEvoRxiv,crossref,10.32942/x2102p,https://doi.org/10.32942/x2102p,https://ecoevorxiv.org/repository/view/6292/,Amazonian soundscapes: unravelling the secrets...,"Do Nascimento, Leandro; Pérez-Granados, Cristi...",2023.0,2023-11-22,,,,,,,False,,,,,parent,https://ecoevorxiv.org/repository/view/6292/,10.32942/x2102p,,,,,,


In [161]:
dupes_landing_page_url[dupes_landing_page_url['server_name']=='UCL Open Environment'].sort_values(by='landing_page_url', ascending=False)

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
5729973,crossref::10.14324/111.444/ucloe.000068,UCL Open Environment,crossref,10.14324/111.444/ucloe.000068,https://doi.org/10.14324/111.444/ucloe.000068,https://journals.uclpress.co.uk/ucloe/article/...,Use of evidence and expertise in UK climate go...,"Willis, Rebecca",2024.0,2024-02-08,,,,,,,False,,,,,parent,https://journals.uclpress.co.uk/ucloe/article/...,10.14324/111.444/ucloe.000068,,,,,,
5729972,crossref::10.14324/ucloe.1982,UCL Open Environment,crossref,10.14324/ucloe.1982,https://doi.org/10.14324/ucloe.1982,https://journals.uclpress.co.uk/ucloe/article/...,Use of evidence and expertise in UK climate go...,"Willis, Rebecca",2024.0,2024-01-30,,,,,,,False,,,,,parent,https://journals.uclpress.co.uk/ucloe/article/...,10.14324/ucloe.1982,,,,,,
5730003,crossref::10.5334/ai.0311,UCL Open Environment,crossref,10.5334/ai.0311,https://doi.org/10.5334/ai.0311,https://journals.uclpress.co.uk/ai/article/id/...,"Hittites and ""barbarians"" in the Late Bronze A...","Matthews, Roger",2012.0,2024-02-27,,,,,,,False,,,,,parent,https://journals.uclpress.co.uk/ai/article/id/...,10.5334/ai.0311,,,,,,
5727433,crossref::10.5334/ai.3011,UCL Open Environment,crossref,10.5334/ai.3011,https://doi.org/10.5334/ai.3011,https://journals.uclpress.co.uk/ai/article/id/...,"Hittites and ""barbarians"" in the Late Bronze A...","Matthews, Roger",1999.0,2013-05-03,,,,,,,False,,,,,parent,https://journals.uclpress.co.uk/ai/article/id/...,10.5334/ai.3011,,,,,,


In [162]:
dupes_landing_page_url[dupes_landing_page_url['server_name']=='Humanities Commons CORE'].sort_values(by='landing_page_url', ascending=False)

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
1925010,datacite::10.17613/bah4-vw03,Humanities Commons CORE,datacite,10.17613/bah4-vw03,https://doi.org/10.17613/bah4-vw03,https://hcommons.org/deposits/removed/,,"N/A, N/A",2026.0,2024-05-03,[],,,,,,False,,,,"{""client"": {""data"": {""id"": ""msu.core"", ""type"":...",parent,https://hcommons.org/deposits/removed/,10.17613/bah4-vw03,,,,,,
1918000,datacite::10.17613/hs98-7t33,Humanities Commons CORE,datacite,10.17613/hs98-7t33,https://doi.org/10.17613/hs98-7t33,https://hcommons.org/deposits/removed/,,,2020.0,2021-11-30,[],,,,,,False,,,,"{""client"": {""data"": {""id"": ""msu.core"", ""type"":...",parent,https://hcommons.org/deposits/removed/,10.17613/hs98-7t33,,,,,,
1933841,datacite::10.17613/sbns-7n85,Humanities Commons CORE,datacite,10.17613/sbns-7n85,https://doi.org/10.17613/sbns-7n85,https://hcommons.org/deposits/removed/,,"N/A, N/A",2024.0,2024-07-20,[],,,,,,False,,,,"{""client"": {""data"": {""id"": ""msu.core"", ""type"":...",parent,https://hcommons.org/deposits/removed/,10.17613/sbns-7n85,,,,,,
1918005,datacite::10.17613/w01d-y281,Humanities Commons CORE,datacite,10.17613/w01d-y281,https://doi.org/10.17613/w01d-y281,https://hcommons.org/deposits/removed/,,,2021.0,2021-12-01,[],,,,,,False,,,,"{""client"": {""data"": {""id"": ""msu.core"", ""type"":...",parent,https://hcommons.org/deposits/removed/,10.17613/w01d-y281,,,,,,
1918008,datacite::10.17613/x5jz-ym79,Humanities Commons CORE,datacite,10.17613/x5jz-ym79,https://doi.org/10.17613/x5jz-ym79,https://hcommons.org/deposits/removed/,,,2022.0,2021-12-02,[],,,,,,False,,,,"{""client"": {""data"": {""id"": ""msu.core"", ""type"":...",parent,https://hcommons.org/deposits/removed/,10.17613/x5jz-ym79,,,,,,
1915118,datacite::10.17613/0smm-ms52,Humanities Commons CORE,datacite,10.17613/0smm-ms52,https://doi.org/10.17613/0smm-ms52,https://hcommons.org/deposits/removed,,"N/A, N/A",2019.0,2019-09-15,[],,,,,,False,,,,"{""client"": {""data"": {""id"": ""msu.core"", ""type"":...",parent,https://hcommons.org/deposits/removed,10.17613/0smm-ms52,,,,,,
1913978,datacite::10.17613/m67w67553,Humanities Commons CORE,datacite,10.17613/m67w67553,https://doi.org/10.17613/m67w67553,https://hcommons.org/deposits/removed,,"N/A, N/A",2017.0,2018-10-14,[],,,,,,False,,,,"{""client"": {""data"": {""id"": ""msu.core"", ""type"":...",parent,https://hcommons.org/deposits/removed,10.17613/m67w67553,,,,,,
1914556,datacite::10.17613/mxjy-1d38,Humanities Commons CORE,datacite,10.17613/mxjy-1d38,https://doi.org/10.17613/mxjy-1d38,https://hcommons.org/deposits/removed,,"N/A, N/A",2015.0,2019-04-12,[],,,,,,False,,,,"{""client"": {""data"": {""id"": ""msu.core"", ""type"":...",parent,https://hcommons.org/deposits/removed,10.17613/mxjy-1d38,,,,,,
1915211,datacite::10.17613/pyew-7h37,Humanities Commons CORE,datacite,10.17613/pyew-7h37,https://doi.org/10.17613/pyew-7h37,https://hcommons.org/deposits/removed,,"N/A, N/A",2019.0,2019-10-19,[],,,,,,False,,,,"{""client"": {""data"": {""id"": ""msu.core"", ""type"":...",parent,https://hcommons.org/deposits/removed,10.17613/pyew-7h37,,,,,,
1924561,datacite::10.17613/zb50-ft02,Humanities Commons CORE,datacite,10.17613/zb50-ft02,https://doi.org/10.17613/zb50-ft02,https://hcommons.org/deposits/removed,,"N/A, N/A",2016.0,2024-03-11,[],,,,,,False,,,,"{""client"": {""data"": {""id"": ""msu.core"", ""type"":...",parent,https://hcommons.org/deposits/removed,10.17613/zb50-ft02,,,,,,


In [163]:
dupes_landing_page_url[dupes_landing_page_url['server_name']=='SocArXiv'].sort_values(by='landing_page_url', ascending=False)

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
5645994,crossref::10.31235/osf.io/w4mhc,SocArXiv,crossref,10.31235/osf.io/w4mhc,https://doi.org/10.31235/osf.io/w4mhc,https://osf.io/w4mhc,Chinese Immersion Teachers in the U.S: Percept...,"Chen, Mengyao; Li, Jiahang; Gorke, Yongling Z",2022.0,2022-03-14,,,,,,,False,,,,,parent,https://osf.io/w4mhc,10.31235/osf.io/w4mhc,,,,,,
5664641,crossref::10.31235/osf.io/8zejr,SocArXiv,crossref,10.31235/osf.io/8zejr,https://doi.org/10.31235/osf.io/8zejr,https://osf.io/8zejr_v1,The Financial Geography of Sustainability Data...,"Dimmelmeier, Andreas",2023.0,2023-12-16,,,,,,,False,,,,,parent,https://osf.io/8zejr_v1,10.31235/osf.io/8zejr,_v1,explicit_version,1.0,,,
5649227,crossref::10.31235/osf.io/8zejr_v1,SocArXiv,crossref,10.31235/osf.io/8zejr_v1,https://doi.org/10.31235/osf.io/8zejr_v1,https://osf.io/8zejr_v1,The Financial Geography of Sustainability Data...,"Dimmelmeier, Andreas",2023.0,2025-06-12,,,,,,,False,,,,,parent,https://osf.io/8zejr_v1,10.31235/osf.io/8zejr_v1,_v1,explicit_version,1.0,,,
5658710,crossref::10.31235/osf.io/8hvm6,SocArXiv,crossref,10.31235/osf.io/8hvm6,https://doi.org/10.31235/osf.io/8hvm6,https://osf.io/8hvm6_v1,Mahatma Gandhi and Basic Education,"Jena, Pravat Kumar",2021.0,2021-01-07,,,,,,,False,,,,,parent,https://osf.io/8hvm6_v1,10.31235/osf.io/8hvm6,_v1,explicit_version,1.0,,,
5649266,crossref::10.31235/osf.io/8hvm6_v1,SocArXiv,crossref,10.31235/osf.io/8hvm6_v1,https://doi.org/10.31235/osf.io/8hvm6_v1,https://osf.io/8hvm6_v1,Mahatma Gandhi and Basic Education,"Jena, Pravat Kumar",2021.0,2025-06-17,,,,,,,False,,,,,parent,https://osf.io/8hvm6_v1,10.31235/osf.io/8hvm6_v1,_v1,explicit_version,1.0,,,
5660039,crossref::10.31235/osf.io/4xpza,SocArXiv,crossref,10.31235/osf.io/4xpza,https://doi.org/10.31235/osf.io/4xpza,https://osf.io/4xpza_v1,Cognitive biases in strategic decision-making,"Midtgård, Kenneth; Selart, Marcus",2024.0,2024-03-02,,,,,,,False,,,,,parent,https://osf.io/4xpza_v1,10.31235/osf.io/4xpza,_v1,explicit_version,1.0,,,
5649248,crossref::10.31235/osf.io/4xpza_v1,SocArXiv,crossref,10.31235/osf.io/4xpza_v1,https://doi.org/10.31235/osf.io/4xpza_v1,https://osf.io/4xpza_v1,Cognitive biases in strategic decision-making,"Midtgård, Kenneth; Selart, Marcus",2024.0,2025-06-13,,,,,,,False,,,,,parent,https://osf.io/4xpza_v1,10.31235/osf.io/4xpza_v1,_v1,explicit_version,1.0,,,
5664484,crossref::10.31235/osf.io/3bzex,SocArXiv,crossref,10.31235/osf.io/3bzex,https://doi.org/10.31235/osf.io/3bzex,https://osf.io/3bzex_v1,Dark Money and Politician Learning,"Schnakenberg, Keith; Turner, Ian R",2023.0,2023-01-07,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,10.1086/734548,,,True,,,,,parent,https://osf.io/3bzex_v1,10.31235/osf.io/3bzex,_v1,explicit_version,1.0,,,
5649343,crossref::10.31235/osf.io/3bzex_v1,SocArXiv,crossref,10.31235/osf.io/3bzex_v1,https://doi.org/10.31235/osf.io/3bzex_v1,https://osf.io/3bzex_v1,Dark Money and Politician Learning,"Schnakenberg, Keith; Turner, Ian R",2023.0,2025-06-24,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,10.1086/734548,,,True,,,,,parent,https://osf.io/3bzex_v1,10.31235/osf.io/3bzex_v1,_v1,explicit_version,1.0,,,


In [164]:
dupes_landing_page_url[dupes_landing_page_url['server_name']=='ART-Dok'].sort_values(by='landing_page_url', ascending=False)

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
5612,datacite::10.11588/artdok.00002463,ART-Dok,datacite,10.11588/artdok.00002463,https://doi.org/10.11588/artdok.00002463,https://www.ub.uni-heidelberg.de/helios/digi/r...,"Sprachtäter, Ausschließensmechanismus, Reine\r...","Riahi, Azam; Zafani Rad, Nika",2014.0,2017-02-15,[],,,,,,False,,,,"{""client"": {""data"": {""id"": ""gesis.ubhd"", ""type...",parent,https://www.ub.uni-heidelberg.de/helios/digi/r...,10.11588/artdok.00002463,,,,,,
5613,datacite::10.11588/artdok.00002464,ART-Dok,datacite,10.11588/artdok.00002464,https://doi.org/10.11588/artdok.00002464,https://www.ub.uni-heidelberg.de/helios/digi/r...,Die nicht zum Ausdruck gekommene Moderne und e...,"Riahi, Azam",2014.0,2017-02-15,[],,,,,,False,,,,"{""client"": {""data"": {""id"": ""gesis.ubhd"", ""type...",parent,https://www.ub.uni-heidelberg.de/helios/digi/r...,10.11588/artdok.00002464,,,,,,
10433,datacite::10.11588/artdok.00008068,ART-Dok,datacite,10.11588/artdok.00008068,https://doi.org/10.11588/artdok.00008068,https://archiv.ub.uni-heidelberg.de/artdok/id/...,Malarstwo witrażowe,"Labuda, Adam S. [Hrsg.]; Secomska, Krystyna [H...",2023.0,2022-12-07,[],,,,,,False,,,,"{""client"": {""data"": {""id"": ""gesis.ubhd"", ""type...",parent,https://archiv.ub.uni-heidelberg.de/artdok/id/...,10.11588/artdok.00008068,,,,,,
10521,datacite::10.11588/artdok.00008143,ART-Dok,datacite,10.11588/artdok.00008143,https://doi.org/10.11588/artdok.00008143,https://archiv.ub.uni-heidelberg.de/artdok/id/...,Malarstwo witrażowe,"Labuda, Adam S. [Hrsg.]; Secomska, Krystyna [H...",2023.0,2023-02-08,[],,,,,,False,,,,"{""client"": {""data"": {""id"": ""gesis.ubhd"", ""type...",parent,https://archiv.ub.uni-heidelberg.de/artdok/id/...,10.11588/artdok.00008143,,,,,,
5910,datacite::10.11588/artdok.00002777,ART-Dok,datacite,10.11588/artdok.00002777,https://doi.org/10.11588/artdok.00002777,https://archiv.ub.uni-heidelberg.de/artdok/id/...,Visuelle Topoi um 1600. Annibale Carracci zwis...,"Dickhut, Wolfgang [Hrsg.]; Manns, Stefan [Hrsg...",2022.0,2017-02-15,[],,,,,,False,,,,"{""client"": {""data"": {""id"": ""gesis.ubhd"", ""type...",parent,https://archiv.ub.uni-heidelberg.de/artdok/id/...,10.11588/artdok.00002777,,,,,,
10130,datacite::10.11588/artdok.00007786,ART-Dok,datacite,10.11588/artdok.00007786,https://doi.org/10.11588/artdok.00007786,https://archiv.ub.uni-heidelberg.de/artdok/id/...,Visuelle Topoi um 1600. Annibale Carracci zwis...,"Dickhut, Wolfgang [Hrsg.]; Manns, Stefan [Hrsg...",2022.0,2022-04-25,[],,,,,,False,,,,"{""client"": {""data"": {""id"": ""gesis.ubhd"", ""type...",parent,https://archiv.ub.uni-heidelberg.de/artdok/id/...,10.11588/artdok.00007786,,,,,,
8987,datacite::10.11588/artdok.00006572,ART-Dok,datacite,10.11588/artdok.00006572,https://doi.org/10.11588/artdok.00006572,http://archiv.ub.uni-heidelberg.de/artdok/id/e...,Una versione sconosciuta della tela di Giovann...,"Kienlechner, Susanne",2019.0,2019-09-17,[],,,,,,False,,,,"{""client"": {""data"": {""id"": ""gesis.ubhd"", ""type...",parent,http://archiv.ub.uni-heidelberg.de/artdok/id/e...,10.11588/artdok.00006572,,,,,,
9039,datacite::10.11588/artdok.00006627,ART-Dok,datacite,10.11588/artdok.00006627,https://doi.org/10.11588/artdok.00006627,http://archiv.ub.uni-heidelberg.de/artdok/id/e...,Una versione sconosciuta della tela di Giovann...,"Kienlechner, Susanne",2019.0,2019-10-18,[],,,,,,False,,,,"{""client"": {""data"": {""id"": ""gesis.ubhd"", ""type...",parent,http://archiv.ub.uni-heidelberg.de/artdok/id/e...,10.11588/artdok.00006627,,,,,,


In [165]:
dupes_landing_page_url[dupes_landing_page_url['server_name']=='EconStor Preprints'].sort_values(by='landing_page_url', ascending=False)

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
505270,openalex::W2241195487,EconStor Preprints,openalex,,,https://econpapers.repec.org/RePEc:zbw:espost:...,Electric Vehicles in Imperfect Electricity Mar...,Wolf-Peter Schill,2011.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://econpapers.repec.org/repec:zbw:espost:...,<na>,,,,,,
470304,openalex::W2888303629,EconStor Preprints,openalex,,,https://econpapers.repec.org/RePEc:zbw:espost:...,Electric Vehicles in Imperfect Electricity Mar...,Wolf-Peter Schill,2011.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://econpapers.repec.org/repec:zbw:espost:...,<na>,,,,,,
518111,openalex::W3049474313,EconStor Preprints,openalex,,,http://hdl.handle.net/10419/218808,Industrie dämpft die konjunkturelle Erholung,Schmidt Torsten; György Barabás; Boris Blagov;...,2019.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,http://hdl.handle.net/10419/218808,<na>,,,,,,
526939,openalex::W7112520020,EconStor Preprints,openalex,,,http://hdl.handle.net/10419/218808,Industrie dämpft die konjunkturelle Erholung,"Schmidt Torsten; Barabás, György; Blagov, Bori...",2019.0,2025-12-10T00:00:00,,,,,,,,,,,,parent,http://hdl.handle.net/10419/218808,<na>,,,,,,
513062,openalex::W2756173876,EconStor Preprints,openalex,,,http://hdl.handle.net/10419/172735,"Neue Ordnung, neues Glück? Ordnungs- und fiska...",Markus Breuer; Luca Rebeggiani,2017.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,http://hdl.handle.net/10419/172735,<na>,,,,,,
513356,openalex::W2774257793,EconStor Preprints,openalex,,,http://hdl.handle.net/10419/172735,"Neue Ordnung, neues Glück?New Rules, New Luck?...",Luca Rebeggiani; Markus Breuer,2017.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,http://hdl.handle.net/10419/172735,<na>,,,,,,
469269,openalex::W2265687887,EconStor Preprints,openalex,,,http://hdl.handle.net/10419/124252,Technological Progress and Economic Geography_...,Jacques Thisse; Takatoshi Tabuchi; Xiwei Zhu,2014.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,http://hdl.handle.net/10419/124252,<na>,,,,,,
526731,openalex::W7112003676,EconStor Preprints,openalex,,,http://hdl.handle.net/10419/124252,Technological Progress and Economic Geography,Thisse Jacques; Tabuchi Takatoshi; Zhu Xiwei,2014.0,2025-12-10T00:00:00,,,,,,,,,,,,parent,http://hdl.handle.net/10419/124252,<na>,,,,,,


In [166]:
dupes_landing_page_url[dupes_landing_page_url['server_name']=='viXra'].sort_values(by='landing_page_url', ascending=False)

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
9461323,openalex::W2419579416,viXra,openalex,,,https://vixra.org/pdf/1405.0153v1.pdf,Pregnancy hormones in cardiovascular disease.,Denise Hilfiker‐Kleiner,2015.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://vixra.org/pdf/1405.0153v1.pdf,<na>,,,,,,
9471642,openalex::W26151073,viXra,openalex,,,https://vixra.org/pdf/1405.0153v1.pdf,Structure of Chromatic Polynomials on Quasi - ...,R.V.N. SrinivasaRao; J. VenkateswaraRao; T. Na...,2014.0,2016-06-24T00:00:00,,,,,,,,,,,,parent,https://vixra.org/pdf/1405.0153v1.pdf,<na>,,,,,,
9461459,openalex::W18990831,viXra,openalex,,,https://vixra.org/pdf/1405.0117v1.pdf,Optimization of Green Sand Casting Process Par...,Sanjay S. Jamkar; M. J. Deshmukh; N.A. Vidhate,2014.0,2016-06-24T00:00:00,,,,,,,,,,,,parent,https://vixra.org/pdf/1405.0117v1.pdf,<na>,,,,,,
9480335,openalex::W2994514029,viXra,openalex,,,https://vixra.org/pdf/1405.0117v1.pdf,Optimization of Green Sand Casting Process Par...,John Casillas,2014.0,2019-12-13T00:00:00,,,,,,,,,,,,parent,https://vixra.org/pdf/1405.0117v1.pdf,<na>,,,,,,
9461995,openalex::W2412976284,viXra,openalex,,,http://article.aascit.org/file/pdf/9090755.pdf,Ether-medium and a new constant on photons rad...,Edward F. Donnelly; T N Chase,2014.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,http://article.aascit.org/file/pdf/9090755.pdf,<na>,,,,,,
9485177,openalex::W4706860,viXra,openalex,,,http://article.aascit.org/file/pdf/9090755.pdf,Ether-medium and a new constant on photons rad...,Jian Ding; HU Xiuqin,2014.0,2016-06-24T00:00:00,,,,,,,,,,,,parent,http://article.aascit.org/file/pdf/9090755.pdf,<na>,,,,,,


In [167]:
dupes_landing_page_url[dupes_landing_page_url['server_name']=='PeerJ Preprints'].sort_values(by='landing_page_url', ascending=False)

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
2259283,crossref::10.7287/peerj.preprints.3516,PeerJ Preprints,crossref,10.7287/peerj.preprints.3516,https://doi.org/10.7287/peerj.preprints.3516,https://peerj.com/preprints/3516,Factors affecting silk production in Japanese ...,"Zuko, Yeti; Maeda, Kazuo",2018.0,2018-01-09,,,,,,,False,,,,,parent,https://peerj.com/preprints/3516,10.7287/peerj.preprints.3516,,,,,,
2259284,crossref::10.7287/peerj.preprints.3516v1,PeerJ Preprints,crossref,10.7287/peerj.preprints.3516v1,https://doi.org/10.7287/peerj.preprints.3516v1,https://peerj.com/preprints/3516,Factors affecting silk production in Japanese ...,"Zuko, Yeti; Maeda, Kazuo",2018.0,2018-01-09,,,,,,,False,,,,,parent,https://peerj.com/preprints/3516,10.7287/peerj.preprints.3516v1,,,,,,
2259289,crossref::10.7287/peerj.preprints.26897,PeerJ Preprints,crossref,10.7287/peerj.preprints.26897,https://doi.org/10.7287/peerj.preprints.26897,https://peerj.com/preprints/26897,The effect of intestinal <i>Bifidobacterium</i...,"Zuko, Yeti; Maeda, Kazuo",2018.0,2018-04-26,,,,,,,False,,,,,parent,https://peerj.com/preprints/26897,10.7287/peerj.preprints.26897,,,,,,
2259288,crossref::10.7287/peerj.preprints.26897v1,PeerJ Preprints,crossref,10.7287/peerj.preprints.26897v1,https://doi.org/10.7287/peerj.preprints.26897v1,https://peerj.com/preprints/26897,The effect of intestinal <i>Bifidobacterium</i...,"Zuko, Yeti; Maeda, Kazuo",2018.0,2018-04-26,,,,,,,False,,,,,parent,https://peerj.com/preprints/26897,10.7287/peerj.preprints.26897v1,,,,,,
2259286,crossref::10.7287/peerj.preprints.26769,PeerJ Preprints,crossref,10.7287/peerj.preprints.26769,https://doi.org/10.7287/peerj.preprints.26769,https://peerj.com/preprints/26769,The effect of cyclical stimulation on the prim...,"Zuko, Yeti; Maeda, Kazuo",2018.0,2018-03-25,,,,,,,False,,,,,parent,https://peerj.com/preprints/26769,10.7287/peerj.preprints.26769,,,,,,
2259285,crossref::10.7287/peerj.preprints.26769v1,PeerJ Preprints,crossref,10.7287/peerj.preprints.26769v1,https://doi.org/10.7287/peerj.preprints.26769v1,https://peerj.com/preprints/26769,The effect of cyclical stimulation on the prim...,"Zuko, Yeti; Maeda, Kazuo",2018.0,2018-03-25,,,,,,,False,,,,,parent,https://peerj.com/preprints/26769,10.7287/peerj.preprints.26769v1,,,,,,


In [168]:
dupes_landing_page_url[dupes_landing_page_url['server_name']=='MetaArXiv'].sort_values(by='landing_page_url', ascending=False)

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
2014772,crossref::10.31222/osf.io/uxf39,MetaArXiv,crossref,10.31222/osf.io/uxf39,https://doi.org/10.31222/osf.io/uxf39,https://osf.io/uxf39_v1,Do Pre-Registration and Pre-analysis Plans Red...,"Brodeur, Abel; Cook, Nikolai; Hartley, Jonatha...",2022.0,2022-08-11,,,,,,,False,,,,,parent,https://osf.io/uxf39_v1,10.31222/osf.io/uxf39,_v1,explicit_version,1.0,,,
2014395,crossref::10.31222/osf.io/uxf39_v1,MetaArXiv,crossref,10.31222/osf.io/uxf39_v1,https://doi.org/10.31222/osf.io/uxf39_v1,https://osf.io/uxf39_v1,Do Pre-Registration and Pre-analysis Plans Red...,"Brodeur, Abel; Cook, Nikolai; Hartley, Jonatha...",2022.0,2025-06-23,,,,,,,False,,,,,parent,https://osf.io/uxf39_v1,10.31222/osf.io/uxf39_v1,_v1,explicit_version,1.0,,,
2014770,crossref::10.31222/osf.io/a9vhr,MetaArXiv,crossref,10.31222/osf.io/a9vhr,https://doi.org/10.31222/osf.io/a9vhr,https://osf.io/a9vhr_v1,We Need to Talk about Mechanical Turk: What 22...,"Brodeur, Abel; Cook, Nikolai; Heyes, Anthony",2022.0,2022-08-11,,,,,,,False,,,,,parent,https://osf.io/a9vhr_v1,10.31222/osf.io/a9vhr,_v1,explicit_version,1.0,,,
2014396,crossref::10.31222/osf.io/a9vhr_v1,MetaArXiv,crossref,10.31222/osf.io/a9vhr_v1,https://doi.org/10.31222/osf.io/a9vhr_v1,https://osf.io/a9vhr_v1,We Need to Talk about Mechanical Turk: What 22...,"Brodeur, Abel; Cook, Nikolai; Heyes, Anthony",2022.0,2025-06-23,,,,,,,False,,,,,parent,https://osf.io/a9vhr_v1,10.31222/osf.io/a9vhr_v1,_v1,explicit_version,1.0,,,
2014776,crossref::10.31222/osf.io/8ya3m,MetaArXiv,crossref,10.31222/osf.io/8ya3m,https://doi.org/10.31222/osf.io/8ya3m,https://osf.io/8ya3m_v1,The influence of journal submission guidelines...,"Giofrè, David; Boedker, Ingrid; Cumming, Geoff...",2022.0,2022-03-07,,,,,,,False,,,,,parent,https://osf.io/8ya3m_v1,10.31222/osf.io/8ya3m,_v1,explicit_version,1.0,,,
2014401,crossref::10.31222/osf.io/8ya3m_v1,MetaArXiv,crossref,10.31222/osf.io/8ya3m_v1,https://doi.org/10.31222/osf.io/8ya3m_v1,https://osf.io/8ya3m_v1,The influence of journal submission guidelines...,"Giofrè, David; Boedker, Ingrid; Cumming, Geoff...",2022.0,2025-06-25,,,,,,,False,,,,,parent,https://osf.io/8ya3m_v1,10.31222/osf.io/8ya3m_v1,_v1,explicit_version,1.0,,,


In [169]:
dupes_landing_page_url[dupes_landing_page_url['server_name']=='WikiJournal of Humanities'].sort_values(by='landing_page_url', ascending=False)

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
5736146,crossref::10.15347/wjh,WikiJournal of Humanities,crossref,10.15347/wjh,https://doi.org/10.15347/wjh,https://en.wikiversity.org/wiki/WikiJournal_of...,WikiJournal of Humanities,,,2017-10-25,,,,,,,False,,,,,parent,https://en.wikiversity.org/wiki/wikijournal_of...,10.15347/wjh,,,,,,
5736152,crossref::10.15347/wjh/2023.001,WikiJournal of Humanities,crossref,10.15347/wjh/2023.001,https://doi.org/10.15347/wjh/2023.001,https://en.wikiversity.org/wiki/WikiJournal_of...,"Loveday, 1458",,2023.0,2023-06-19,,,,,,,False,,,,,parent,https://en.wikiversity.org/wiki/wikijournal_of...,10.15347/wjh/2023.001,,,,,,
5736145,crossref::10.15347/wjh/2019.004,WikiJournal of Humanities,crossref,10.15347/wjh/2019.004,https://doi.org/10.15347/wjh/2019.004,https://en.wikiversity.org/wiki/WikiJournal_of...,Hilda Rix Nicholas,"Holland, Hannah",2019.0,2019-12-04,,,,,,,False,,,,,parent,https://en.wikiversity.org/wiki/wikijournal_of...,10.15347/wjh/2019.004,,,,,,
5736143,crossref::10.15347/wjx/2019.004,WikiJournal of Humanities,crossref,10.15347/wjx/2019.004,https://doi.org/10.15347/wjx/2019.004,https://en.wikiversity.org/wiki/WikiJournal_of...,Hilda Rix Nicholas,"Holland, Hannah",2019.0,2019-12-04,,,,,,,False,,,,,parent,https://en.wikiversity.org/wiki/wikijournal_of...,10.15347/wjx/2019.004,,,,,,


In [170]:
dupes_landing_page_url[dupes_landing_page_url['server_name']=='WikiJournal of Medicine'].sort_values(by='landing_page_url', ascending=False)

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
5736159,crossref::10.15347/wjm/2019.009,WikiJournal of Medicine,crossref,10.15347/wjm/2019.009,https://doi.org/10.15347/wjm/2019.009,https://en.wikiversity.org/wiki/WikiJournal_of...,Epidemiology of the Hepatitis D virus,"University of Torino, Italy; Rizzetto, Mario",2020.0,2020-03-29,,,,,,,False,,,,,parent,https://en.wikiversity.org/wiki/wikijournal_of...,10.15347/wjm/2019.009,,,,,,
5736172,crossref::10.15347/wjm/2020.001.2,WikiJournal of Medicine,crossref,10.15347/wjm/2020.001.2,https://doi.org/10.15347/wjm/2020.001.2,https://en.wikiversity.org/wiki/WikiJournal_of...,Epidemiology of the Hepatitis D virus,"Rizzetto, Mario",2020.0,2020-04-06,,,,,,,False,,,,,parent,https://en.wikiversity.org/wiki/wikijournal_of...,10.15347/wjm/2020.001.2,,,,,,
5736210,crossref::10.15347/wjm,WikiJournal of Medicine,crossref,10.15347/wjm,https://doi.org/10.15347/wjm,https://en.wikiversity.org/wiki/WikiJournal_Pr...,WikiJournal of Medicine,,,2024-05-01,,,,,,,False,,,,,parent,https://en.wikiversity.org/wiki/wikijournal_pr...,10.15347/wjm,,,,,,
5736211,crossref::10.15347/wjm/2024.001,WikiJournal of Medicine,crossref,10.15347/wjm/2024.001,https://doi.org/10.15347/wjm/2024.001,https://en.wikiversity.org/wiki/WikiJournal_Pr...,Body image disturbance in eating disorders,"Artoni, Paolo",2024.0,2024-05-01,,,,,,,False,,,,,parent,https://en.wikiversity.org/wiki/wikijournal_pr...,10.15347/wjm/2024.001,,,,,,


In [171]:
dupes_landing_page_url[dupes_landing_page_url['server_name']=='Bepress Legal Repository'].sort_values(by='landing_page_url', ascending=False)

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
274423,openalex::W1562651827,Bepress Legal Repository,openalex,10.1017/cbo9780511921476,https://doi.org/10.1017/cbo9780511921476,https://chicagounbound.uchicago.edu/cgi/viewco...,Building global democracy?: civil society and ...,Jan Aart Scholte,2011.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://chicagounbound.uchicago.edu/cgi/viewco...,10.1017/cbo9780511921476,,,,,,
274426,openalex::W1672512695,Bepress Legal Repository,openalex,,,https://chicagounbound.uchicago.edu/cgi/viewco...,Building Global Democracy?: List of abbreviations,Jan Aart Scholte,2011.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://chicagounbound.uchicago.edu/cgi/viewco...,<na>,,,,,,
274427,openalex::W784155097,Bepress Legal Repository,openalex,,,https://chicagounbound.uchicago.edu/cgi/viewco...,Building Global Democracy?: Editor's acknowled...,Jan Aart Scholte,2011.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://chicagounbound.uchicago.edu/cgi/viewco...,<na>,,,,,,
274428,openalex::W79364033,Bepress Legal Repository,openalex,,,https://chicagounbound.uchicago.edu/cgi/viewco...,Building Global Democracy?: List of contributors,Jan Aart Scholte,2011.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://chicagounbound.uchicago.edu/cgi/viewco...,<na>,,,,,,


In [172]:
dupes_landing_page_url[dupes_landing_page_url['server_name']=='Social Science Open Access Repository'].sort_values(by='landing_page_url', ascending=False)

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
5689205,openalex::W2948778104,Social Science Open Access Repository,openalex,10.5167/uzh-160317,https://doi.org/10.5167/uzh-160317,https://www.ssoar.info/ssoar/handle/document/6...,"Kommentar zu Meiser, T. et al. (2018). Positio...",Natalie Nagowski; Peter Kirsch; Andrea Kübler;...,2018.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://www.ssoar.info/ssoar/handle/document/6...,10.5167/uzh-160317,,,,,,
5691773,openalex::W3198948865,Social Science Open Access Repository,openalex,,,https://www.ssoar.info/ssoar/handle/document/6...,Fachgruppe Gesundheitspsychologie: Methoden si...,Petra Warschburger; Gudrun Sproesser; Daniela ...,2018.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://www.ssoar.info/ssoar/handle/document/6...,<na>,,,,,,
5669183,openalex::W2748208804,Social Science Open Access Repository,openalex,,,https://www.ssoar.info/ssoar/handle/document/6...,Deutscher Alterssurvey (DEAS): Kurzbeschreibun...,Heribert Engstler; Nicole Hameister,2019.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://www.ssoar.info/ssoar/handle/document/6...,<na>,,,,,,
5691957,openalex::W3208177612,Social Science Open Access Repository,openalex,,,https://www.ssoar.info/ssoar/handle/document/6...,Deutscher Alterssurvey (DEAS): Kurzbeschreibun...,Heribert Engstler; Nicole Hameister,2021.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://www.ssoar.info/ssoar/handle/document/6...,<na>,,,,,,


In [173]:
dupes_landing_page_url[dupes_landing_page_url['server_name']=='Advance'].sort_values(by='landing_page_url', ascending=False)

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
16252,crossref::10.31124/advance.171863606.69452326/v1,Advance,crossref,10.31124/advance.171863606.69452326/v1,https://doi.org/10.31124/advance.171863606.694...,https://advance.sagepub.com/users/719316/artic...,CLUSTER DEVELOPMENT ECONOMIC POLICY,"Lomsadze, Tinatin",2024.0,2024-06-17,,,,,,,False,,,,,parent,https://advance.sagepub.com/users/719316/artic...,10.31124/advance.171863606.69452326/v1,,,,/v1,explicit_version,1.0
16256,crossref::10.31124/advance.171897777.74136883/v1,Advance,crossref,10.31124/advance.171897777.74136883/v1,https://doi.org/10.31124/advance.171897777.741...,https://advance.sagepub.com/users/719316/artic...,CLUSTER DEVELOPMENT ECONOMIC POLICY,"Lomsadze, Tinatin",2024.0,2024-07-03,,,,,,,False,,,,,parent,https://advance.sagepub.com/users/719316/artic...,10.31124/advance.171897777.74136883/v1,,,,/v1,explicit_version,1.0


In [174]:
dupes_landing_page_url[dupes_landing_page_url['server_name']=='Preprints.org'].sort_values(by='landing_page_url', ascending=False)

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
2271995,crossref::10.20944/preprints201904.0246.v1,Preprints.org,crossref,10.20944/preprints201904.0246.v1,https://doi.org/10.20944/preprints201904.0246.v1,http://www.preprints.org/manuscript/201904.024...,Mathematical Models for Possible Roles of Oxyt...,"Gottlieb, Mark",2019.0,2019-04-22,,,,,,,False,,,,,parent,http://www.preprints.org/manuscript/201904.024...,10.20944/preprints201904.0246.v1,/v1,explicit_version,1.0,,,
2272001,crossref::10.20944/preprints201904.0246.v2,Preprints.org,crossref,10.20944/preprints201904.0246.v2,https://doi.org/10.20944/preprints201904.0246.v2,http://www.preprints.org/manuscript/201904.024...,Mathematical Models for Possible Roles of Oxyt...,"Gottlieb, Mark",2019.0,2019-04-24,,,,,,,False,,,,,parent,http://www.preprints.org/manuscript/201904.024...,10.20944/preprints201904.0246.v2,/v1,explicit_version,1.0,,,


In [175]:
dupes_landing_page_url[dupes_landing_page_url['server_name']=='JMIR Preprints'].sort_values(by='landing_page_url', ascending=False)

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
1972015,crossref::10.2196/iproc.8586,JMIR Preprints,crossref,10.2196/iproc.8586,https://doi.org/10.2196/iproc.8586,http://preprints.jmir.org/preprint/8989,Barriers and facilitators to patient portal im...,"Kooij, Laura; Groen, Wim G; van Harten, Wim H",2017.0,2017-09-25,,,,,,,False,,,,,parent,http://preprints.jmir.org/preprint/8989,10.2196/iproc.8586,,,,,,
1972952,crossref::10.2196/preprints.8989,JMIR Preprints,crossref,10.2196/preprints.8989,https://doi.org/10.2196/preprints.8989,http://preprints.jmir.org/preprint/8989,Barriers and Facilitators Affecting Patient Po...,"Kooij, Laura; Groen, Wim G; van Harten, Wim H",2017.0,2017-09-26,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,10.2196/jmir.8989,,,True,,,,,parent,http://preprints.jmir.org/preprint/8989,10.2196/preprints.8989,,,,,,


In [176]:
dupes_landing_page_url[dupes_landing_page_url['server_name']=='National Bureau of Economic Research'].sort_values(by='landing_page_url', ascending=False)

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
2116112,openalex::W2567960855,National Bureau of Economic Research,openalex,,,https://eric.ed.gov/?id=ED524978,"Creating ""No Excuses"" (Traditional) Public Sch...",Roland G. Fryer,2011.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://eric.ed.gov/?id=ed524978,<na>,,,,,,
2116118,openalex::W2913912915,National Bureau of Economic Research,openalex,,,https://eric.ed.gov/?id=ED524978,Injecting Successful Charter School Strategies...,Roland G. Fryer,2011.0,2025-10-10T00:00:00,,,,,,,,,,,,parent,https://eric.ed.gov/?id=ed524978,<na>,,,,,,


#### function

In [177]:
import pandas as pd
import re

# -----------------------------
# 0) Define primary servers
# -----------------------------
# PRIMARY_SERVERS = {
#     "arXiv",
#     "bioRxiv",
#     "medRxiv",
#     "SSRN",
#     "TechRxiv",
#     "Wellcome Open Research",
#     "AgriRxiv",
#     "EarthArXiv",
#     "Law Archive",
#     "SocArXiv",
#     "Thesis Commons",
#     "Research Square",
#     "Open Research Europe",
#     "Oroboros Instruments",
#     "ResearchGate",
#     "AgEcon Search",
#     "Zenodo",
#     "Open Science Framework",
#     "Humanities Commons CORE",
#     "CERN document server",
#     "eLife",
# }

PRIMARY_SERVERS = set(primary_servers) 
# -----------------------------
# 1) Normalize landing_page_url
# -----------------------------
url_norm = (
    df_mirror["landing_page_url"]
    .astype('string')
    .str.strip()
    .str.lower()
    .replace({"": pd.NA, "none": pd.NA, "nan": pd.NA, "null": pd.NA})
    # .replace({"": pd.NA, "none": pd.NA})
    # remove query strings and fragments
    .str.replace(r"[?#].*$", "", regex=True)
    # remove trailing slash
    .str.rstrip("/")
)


# -----------------------------
# 2) Flags
# -----------------------------
is_primary = df_mirror["server_name"].isin(PRIMARY_SERVERS)

# duplicated URL groups
is_dup = url_norm.notna() & url_norm.duplicated(keep=False)

# -----------------------------
# 3) Build URL → primary server mapping (FAST)
#    Only once, only for duplicated URLs
# -----------------------------
primary_by_url = (
    df_mirror.loc[is_primary & is_dup, ["server_name"]]
    .assign(url=url_norm[is_primary & is_dup])
    .dropna(subset=["url"])
    .groupby("url")["server_name"]
    .first()   # deterministic + fast
)

# -----------------------------
# 4) Label mirrors
#    - non-primary
#    - duplicated URL
#    - duplicated with at least one primary
# -----------------------------
mask_mirror = (
    is_dup
    & ~is_primary
    & url_norm.isin(primary_by_url.index)
)

df_mirror.loc[mask_mirror, "records_hierarchy"] = (
    "mirror (" + url_norm[mask_mirror].map(primary_by_url) + ")"
)

# -----------------------------
# 5) Sanity checks
# -----------------------------
print("Mirror rows set:", mask_mirror.sum())
print(df_mirror["records_hierarchy"].value_counts(dropna=False))


Mirror rows set: 2905
records_hierarchy
parent                              7950093
review                               175443
part_of                              149454
version                              105977
publish_version                        9351
mirror (arXiv)                         8286
mirror (AgEcon Search)                 6702
child                                  2028
mirror (ResearchGate)                  1633
correction                              355
comment                                 325
mirror (Zenodo)                         297
mirror (SSRN)                            36
mirror (Open Science Framework)          31
mirror (bioRxiv)                         29
mirror (Humanities Commons CORE)         24
others                                   12
parent_duplicate                          3
mirror (eLife)                            3
mirror (CERN document server)             2
mirror (Research Square)                  2
mirror (AgriRxiv)                   

# Resolve duplicates across OSF + OSF communities using the OSF id (osf.io/<id>)

In [178]:
## Quick proof (diagnostic)

In [179]:
# import pandas as pd
# import re

# def extract_osf_id(x):
#     if pd.isna(x):
#         return pd.NA
#     m = re.search(r"osf\.io/([a-z0-9]+)", str(x).lower())
#     return m.group(1) if m else pd.NA

# tmp = df_mirror.copy()

# tmp["osf_id_from_doi"] = tmp["doi"].map(extract_osf_id) if "doi" in tmp.columns else pd.NA
# tmp["osf_id_from_lp"]  = tmp["landing_page_url"].map(extract_osf_id) if "landing_page_url" in tmp.columns else pd.NA

# tmp["osf_id"] = tmp["osf_id_from_doi"].fillna(tmp["osf_id_from_lp"])

# print("Rows with OSF id:", tmp["osf_id"].notna().sum())
# print("Duplicated OSF id rows:", tmp["osf_id"].notna().sum() - tmp["osf_id"].dropna().nunique())
# print(tmp.loc[tmp["osf_id"].notna(), "server_name"].value_counts().head(20))


In [180]:
## function: resolve OSF duplicates by OSF ID (works across different DOI prefixes)

In [181]:
# import pandas as pd
# import numpy as np
# import re

# def resolve_osf_duplicates_fast(
#     df: pd.DataFrame,
#     preferred_parent_servers: set,
#     prefer_backend: str = "crossref",
#     choose_parent: str = "oldest",           # "oldest" or "most_recent"
#     date_col: str = "date_created",          # if missing -> record_id fallback
#     overwrite_mode: str = "parent_only",     # "any" | "parent_only" | "unlabeled_only"
#     # columns
#     server_col: str = "server_name",
#     backend_col: str = "backend",
#     record_id_col: str = "record_id",
#     doi_col: str = "doi",
#     landing_col: str = "landing_page_url",
#     hierarchy_col: str = "records_hierarchy",
#     parent_id_col: str = "parent_record_id",
#     # perf knobs
#     coarse_filter: str = "osf.io/",          # cheap contains() filter before regex
#     min_group_size: int = 2
# ) -> pd.DataFrame:
#     """
#     Fast OSF duplicate resolver for huge frames (millions of rows).

#     Key idea:
#       - Avoid regex over the full dataframe.
#       - First, cheaply filter rows that likely contain 'osf.io/' in doi or landing_page_url.
#       - Only then extract OSF id and group to resolve duplicates.
#       - Finally, write results back only for touched rows.

#     Parent selection priority:
#       1) preferred community servers first (SocArXiv, PsyArXiv, etc.)
#       2) prefer backend == prefer_backend (crossref)
#       3) date (oldest/most_recent)
#       4) record_id numeric key (tie-break)
#     """

#     out = df.copy()

#     # Ensure output cols exist
#     if hierarchy_col not in out.columns:
#         out[hierarchy_col] = pd.NA
#     if parent_id_col not in out.columns:
#         out[parent_id_col] = pd.NA

#     # Eligibility mask (run only where you allow overwriting)
#     h = out[hierarchy_col]
#     if overwrite_mode == "any":
#         eligible = pd.Series(True, index=out.index)
#     elif overwrite_mode == "parent_only":
#         eligible = h.astype(str).str.strip().str.lower().eq("parent")
#     elif overwrite_mode == "unlabeled_only":
#         eligible = h.isna()
#     else:
#         raise ValueError("overwrite_mode must be: any | parent_only | unlabeled_only")

#     # ---- 1) Coarse filter: only rows likely to be OSF-related (very fast)
#     # Use fillna("") so .str.contains doesn't create object issues
#     m_osf = pd.Series(False, index=out.index)

#     if doi_col in out.columns:
#         m_osf |= out[doi_col].fillna("").astype(str).str.contains(coarse_filter, case=False, regex=False)
#     if landing_col in out.columns:
#         m_osf |= out[landing_col].fillna("").astype(str).str.contains(coarse_filter, case=False, regex=False)

#     m = eligible & m_osf
#     if not m.any():
#         return out

#     # Work on subset only
#     sub = out.loc[m, [server_col, backend_col, record_id_col]].copy()

#     # ---- 2) Extract OSF id from subset (regex only on ~235k rows, not 8M)
#     pat = re.compile(r"osf\.io/([a-z0-9]+)", re.I)

#     def extract_osf_id_series(s: pd.Series) -> pd.Series:
#         return s.fillna("").astype(str).str.lower().str.extract(pat, expand=False)

#     osf_id = pd.Series(pd.NA, index=sub.index, dtype="object")
#     if doi_col in out.columns:
#         osf_id = extract_osf_id_series(out.loc[m, doi_col])
#     if landing_col in out.columns:
#         osf_id = osf_id.fillna(extract_osf_id_series(out.loc[m, landing_col]))

#     # drop rows with no extracted id (coarse filter can include a few false positives)
#     sub["_osf_id"] = osf_id
#     sub = sub[sub["_osf_id"].notna()].copy()
#     if sub.empty:
#         return out

#     # ---- 3) Prepare sort keys on subset
#     sub["_is_pref_server"] = sub[server_col].isin(preferred_parent_servers)

#     # backend preference (vectorized)
#     sub["_is_pref_backend"] = (
#         sub[backend_col].fillna("").astype(str).str.lower().eq(str(prefer_backend).lower())
#     )

#     # date key
#     if date_col in out.columns:
#         sub["_dt"] = pd.to_datetime(out.loc[sub.index, date_col], errors="coerce")
#     else:
#         sub["_dt"] = pd.NaT

#     # record_id numeric key (extract digits)
#     rid_digits = sub[record_id_col].astype(str).str.extract(r"(\d+)")[0]
#     sub["_rid_key"] = pd.to_numeric(rid_digits, errors="coerce")

#     if choose_parent not in {"oldest", "most_recent"}:
#         raise ValueError("choose_parent must be: oldest | most_recent")
#     date_asc = (choose_parent == "oldest")

#     # ---- 4) Only group IDs that actually have duplicates (saves time)
#     # value_counts on 235k rows is cheap
#     dup_ids = sub["_osf_id"].value_counts()
#     dup_ids = dup_ids[dup_ids >= min_group_size].index
#     sub = sub[sub["_osf_id"].isin(dup_ids)].copy()
#     if sub.empty:
#         return out

#     # ---- 5) Resolve per OSF id
#     # Sorting once, then picking first per group is faster than looping all groups with Python
#     sub_sorted = sub.sort_values(
#         by=["_osf_id", "_is_pref_server", "_is_pref_backend", "_dt", "_rid_key"],
#         ascending=[True, False, False, date_asc, True],
#         na_position="last"
#     )

#     # Parent idx per osf_id = first row after sorting
#     parent_idx_by_id = sub_sorted.groupby("_osf_id", sort=False).head(1)
#     parent_map_rid = parent_idx_by_id.set_index("_osf_id")[record_id_col]
#     parent_map_srv = parent_idx_by_id.set_index("_osf_id")[server_col]

#     # For all rows in sub_sorted, map parent rid & parent server
#     sub_sorted["_parent_rid"] = sub_sorted["_osf_id"].map(parent_map_rid)
#     sub_sorted["_parent_srv"] = sub_sorted["_osf_id"].map(parent_map_srv)

#     # Identify which rows are parent vs child
#     is_parent_row = sub_sorted[record_id_col].eq(sub_sorted["_parent_rid"])

#     # Write back to OUT
#     parent_rows = sub_sorted.index[is_parent_row]
#     child_rows  = sub_sorted.index[~is_parent_row]

#     out.loc[parent_rows, hierarchy_col] = "parent"
#     out.loc[parent_rows, parent_id_col] = pd.NA

#     out.loc[child_rows, hierarchy_col] = "mirror (" + sub_sorted.loc[child_rows, "_parent_srv"].astype(str) + ")"
#     out.loc[child_rows, parent_id_col] = sub_sorted.loc[child_rows, "_parent_rid"].values

#     return out


In [182]:
# PREFERRED_OSF_PARENT = {
#     "SocArXiv","Law Archive","PsyArXiv","EdArXiv","EarthArXiv","Thesis Commons",
#     "LIS Scholarship Archive","SportRxiv","INA-Rxiv","Arabixiv","engrXiv","MetaArXiv",
#     "MindRxiv","MarXiv","AgriRxiv","NutriXiv","ECSarXiv","FocUS Archive","Frenxiv",
#     "EcoEvoRxiv","IndiaRxiv","PaleorXiv","AfricArXiv","BioHackrXiv","MediArXiv"
# }

# df_mirror = resolve_osf_duplicates_fast(
#     df=df_mirror,
#     preferred_parent_servers=PREFERRED_OSF_PARENT,
#     prefer_backend="crossref",
#     choose_parent="oldest",
#     date_col="date_created",
#     overwrite_mode="parent_only",
# )

# print(df_mirror["records_hierarchy"].value_counts(dropna=False).head(30))


# save

In [183]:
records_hierarchy_df = df_mirror[['record_id','server_name','records_hierarchy']]
records_hierarchy_df

Unnamed: 0,record_id,server_name,records_hierarchy
429739,crossref::10.1002/essoar.10500000.1,Earth and Space Science Open Archive,parent
429742,crossref::10.1002/essoar.10500002.1,Earth and Space Science Open Archive,parent
429740,crossref::10.1002/essoar.10500004.1,Earth and Space Science Open Archive,parent
429741,crossref::10.1002/essoar.10500007.1,Earth and Space Science Open Archive,parent
429746,crossref::10.1002/essoar.10500009.1,Earth and Space Science Open Archive,parent
...,...,...,...
2806404,openalex::W999921877,RePEc: Research Papers in Economics,parent
1016201,openalex::W999947037,HAL,parent
2713111,openalex::W999974616,RePEc: Research Papers in Economics,parent
1015101,openalex::W999989114,HAL,parent


In [184]:
records_hierarchy_df.to_csv("outputs/records_hierarchy_df.csv", index=False)
records_hierarchy_df.to_pickle("outputs/records_hierarchy_df.pkl")

In [185]:
ccc

NameError: name 'ccc' is not defined

In [None]:
some paper have same title and author infos, but different may have different doi or landing page url in the same servers. 
    we need to clean title first or decide that the title have to be equal at 90% per example to overcome some little typo difference
    could it be possibe to write a code that will get one version of rows, 
        the version choose will be label parent and others childs (duplicate),
        we need also to work server by server and a way for each server to decide if we get the most recent or the old on as parent. 



how to have a function, that, we can add as input, server, or list of server, the columns one or multiples columns we need to use to find duplicates, wich records id to choose, the most recent or old one by sorting

# VeriXiv and Gate

In [186]:
data[data['server_name']=='VeriXiv']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json
5731134,crossref::10.12688/verixiv.1003.1,VeriXiv,crossref,10.12688/verixiv.1003.1,https://doi.org/10.12688/verixiv.1003.1,https://verixiv.org/articles/2-72/v1,"Estimation of Seasonal, Additive and Non-addit...","Akech, Violet; Bayo, Stanley; Bengtsson, Theré...",2025.0,2025-04-23,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,10.1007/s10681-025-03598-1,,,true,,,,
5730998,crossref::10.12688/verixiv.1007.1,VeriXiv,crossref,10.12688/verixiv.1007.1,https://doi.org/10.12688/verixiv.1007.1,https://verixiv.org/articles/2-106/v1,Human-Centered Design in Public Health: A Case...,"Olarewaju, Olajumoke; Poddar, Manvi; Stephens,...",2025.0,2025-05-27,,,,,,,false,,,,
5730985,crossref::10.12688/verixiv.1008.1,VeriXiv,crossref,10.12688/verixiv.1008.1,https://doi.org/10.12688/verixiv.1008.1,https://verixiv.org/articles/2-86/v1,What it takes to sustain: A realist evaluation...,"Elaraby, Sarah; Yahner, Melanie; Krishna Sarke...",2025.0,2025-05-06,,,,,,,false,,,,
5730991,crossref::10.12688/verixiv.1022.1,VeriXiv,crossref,10.12688/verixiv.1022.1,https://doi.org/10.12688/verixiv.1022.1,https://verixiv.org/articles/2-97/v1,Exploring the role of small and medium food en...,"Fitawek, Wegayehu; Karuaihe, Selma Tuemumunu; ...",2025.0,2025-05-20,,,,,,,false,,,,
5731326,crossref::10.12688/verixiv.1025.1,VeriXiv,crossref,10.12688/verixiv.1025.1,https://doi.org/10.12688/verixiv.1025.1,https://verixiv.org/articles/2-189/v1,Small and medium-sized type of food business: ...,"Owusu-Sekyere, Enoch; Karuaihe, Selma Tuemumun...",2025.0,2025-07-21,,,,,,,false,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5730990,crossref::10.12688/verixiv.982.1,VeriXiv,crossref,10.12688/verixiv.982.1,https://doi.org/10.12688/verixiv.982.1,https://verixiv.org/articles/2-96/v1,Comparative technical and operational assessme...,"Zhu, Changcheng; Roa, Nerie; Neathery, Estelle...",2025.0,2025-05-20,,,,,,,false,,,,
5730986,crossref::10.12688/verixiv.986.1,VeriXiv,crossref,10.12688/verixiv.986.1,https://doi.org/10.12688/verixiv.986.1,https://verixiv.org/articles/2-92/v1,Evaluation of private sector engagement for TB...,"Shukla, Anupam; Borhade, Priyanka; Dholakia, Y...",2025.0,2025-05-12,,,,,,,false,,,,
5731118,crossref::10.12688/verixiv.989.1,VeriXiv,crossref,10.12688/verixiv.989.1,https://doi.org/10.12688/verixiv.989.1,https://verixiv.org/articles/2-99/v1,traveltime: an R package to calculate travel t...,"Ryan, Gerard E.; Tierney, Nicholas; Golding, N...",2025.0,2025-05-21,"{""has-review"": [{""asserted-by"": ""subject"", ""id...",,,,,10.21956/verixiv.1007.r1274;10.21956/verixiv.1...,false,,,,
5731107,crossref::10.12688/verixiv.994.1,VeriXiv,crossref,10.12688/verixiv.994.1,https://doi.org/10.12688/verixiv.994.1,https://verixiv.org/articles/2-136/v1,"Mass Balance, Pharmacokinetics, Metabolism and...","Gillon, Jean-Yves; Simon, François; Sidhu, Sha...",2025.0,2025-08-03,,,,,,,false,,,,


In [187]:
gate_data = data[data['server_name']=='Gates Open Research']
gate_data

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json
590205,crossref::10.12688/gatesopenres,Gates Open Research,crossref,10.12688/gatesopenres,https://doi.org/10.12688/gatesopenres,http://www.gatesopenresearch.org,Gates Open Research,,,2017-11-06,,,,,,,false,,,,
589471,crossref::10.12688/gatesopenres.12749.1,Gates Open Research,crossref,10.12688/gatesopenres.12749.1,https://doi.org/10.12688/gatesopenres.12749.1,https://gatesopenresearch.org/articles/1-7/v1,A superhydrophobic cone to facilitate the xeno...,"Cook, Darren A.N.; Pilotte, Nils; Minetti, Cor...",2017.0,2017-11-06,"{""has-review"": [{""asserted-by"": ""subject"", ""id...",,,,,10.21956/gatesopenres.13809.r26053;10.21956/ga...,false,,,,
589358,crossref::10.12688/gatesopenres.12749.2,Gates Open Research,crossref,10.12688/gatesopenres.12749.2,https://doi.org/10.12688/gatesopenres.12749.2,https://gatesopenresearch.org/articles/1-7/v2,A superhydrophobic cone to facilitate the xeno...,"Cook, Darren A.N.; Pilotte, Nils; Minetti, Cor...",2018.0,2018-04-27,,New version,,,,,false,,,"[{""DOI"": ""10.12688/gatesopenres.12749.1"", ""lab...",
589685,crossref::10.12688/gatesopenres.12750.1,Gates Open Research,crossref,10.12688/gatesopenres.12750.1,https://doi.org/10.12688/gatesopenres.12750.1,https://gatesopenresearch.org/articles/1-9/v1,Using biomarkers to predict TB treatment durat...,"Chen, Ray Y.; Via, Laura E.; Dodd, Lori E.; Wa...",2017.0,2017-11-06,"{""has-review"": [{""asserted-by"": ""subject"", ""id...",,,,,10.21956/gatesopenres.13810.r26114;10.21956/ga...,false,,,,
589470,crossref::10.12688/gatesopenres.12751.1,Gates Open Research,crossref,10.12688/gatesopenres.12751.1,https://doi.org/10.12688/gatesopenres.12751.1,https://gatesopenresearch.org/articles/1-8/v1,Point-of-contact Interactive Record Linkage (P...,"Rentsch, Christopher T.; Kabudula, Chodziwadzi...",2017.0,2017-11-06,"{""has-review"": [{""asserted-by"": ""subject"", ""id...",,,,,10.21956/gatesopenres.13811.r26151;10.21956/ga...,false,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590198,crossref::10.12688/gatesopenres.16368.1,Gates Open Research,crossref,10.12688/gatesopenres.16368.1,https://doi.org/10.12688/gatesopenres.16368.1,https://gatesopenresearch.org/articles/9-92/v1,Addressing Research Readiness Challenges in Af...,"Amboka, Patrick; Kariuki, H; Orobaton, Nosa; N...",2025.0,2025-11-21,"{""has-preprint"": [{""asserted-by"": ""subject"", ""...",,,,10.12688/verixiv.558.3,,false,,,,
590200,crossref::10.12688/gatesopenres.16369.1,Gates Open Research,crossref,10.12688/gatesopenres.16369.1,https://doi.org/10.12688/gatesopenres.16369.1,https://gatesopenresearch.org/articles/9-91/v1,Evaluation of an ultra-portable pocket-sized d...,"Fuentes, Segundo; Ogero, Kwame; Perez, Ana; Kr...",2025.0,2025-11-21,"{""has-preprint"": [{""asserted-by"": ""subject"", ""...",,,,10.12688/verixiv.1161.1,,false,,,,
590193,crossref::10.12688/gatesopenres.16370.1,Gates Open Research,crossref,10.12688/gatesopenres.16370.1,https://doi.org/10.12688/gatesopenres.16370.1,https://gatesopenresearch.org/articles/9-101/v1,CYP2D6 Pharmacogenetics in Nigerian Sickle Cel...,"Adeagbo, Babatunde; Olarewaju, Olusola; Orherh...",2025.0,2025-11-10,"{""has-preprint"": [{""asserted-by"": ""subject"", ""...",,,,10.12688/verixiv.1323.2,,false,,,,
590197,crossref::10.12688/gatesopenres.16371.1,Gates Open Research,crossref,10.12688/gatesopenres.16371.1,https://doi.org/10.12688/gatesopenres.16371.1,https://gatesopenresearch.org/articles/9-100/v1,Potential for Family Planning Program Sustaina...,"Hardee, Karen; Rosenberg, Rebecca; Zosa-Ferani...",2025.0,2025-11-21,"{""has-preprint"": [{""asserted-by"": ""subject"", ""...",,,,10.12688/verixiv.888.2,,false,,,,


In [188]:
pattern = "10.12688/verixiv.244"

mask = data['doi'].str.contains(pattern, regex=False, na=False)
result = data[mask]
result

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json
5730930,crossref::10.12688/verixiv.244.1,VeriXiv,crossref,10.12688/verixiv.244.1,https://doi.org/10.12688/verixiv.244.1,https://verixiv.org/articles/1-17/v1,Discoverability of African Journals by Google ...,"Amboka, Patrick; Kirimi Sindi, Julius; Wamukoy...",2024.0,2024-11-01,"{""has-review"": [{""asserted-by"": ""subject"", ""id...",,,,,10.21956/verixiv.248.r101,False,,,,
5731336,crossref::10.12688/verixiv.244.2,VeriXiv,crossref,10.12688/verixiv.244.2,https://doi.org/10.12688/verixiv.244.2,https://verixiv.org/articles/1-17/v2,Discoverability of African Journals by Google ...,"Amboka, Patrick; Kirimi Sindi, Julius; Wamukoy...",2025.0,2025-08-05,"{""has-version"": [{""asserted-by"": ""subject"", ""i...",,10.12688/verixiv.244.1,,,,False,,,,
5731262,crossref::10.12688/verixiv.244.3,VeriXiv,crossref,10.12688/verixiv.244.3,https://doi.org/10.12688/verixiv.244.3,https://verixiv.org/articles/1-17/v3,Discoverability of African Journals by Google ...,"Amboka, Patrick; Kirimi Sindi, Julius; Wamukoy...",2025.0,2025-11-04,"{""has-review"": [{""asserted-by"": ""subject"", ""id...",,10.12688/verixiv.244.2;10.12688/verixiv.244.1,,,10.21956/verixiv.1894.r3555;10.21956/verixiv.1...,False,,,,
5731377,crossref::10.12688/verixiv.2443.1,VeriXiv,crossref,10.12688/verixiv.2443.1,https://doi.org/10.12688/verixiv.2443.1,https://verixiv.org/articles/2-412/v1,DHIS2 Integrated App usability and Challenges ...,"Mesfin, Araya; Getachew, Redet; Biruk, Ephrem;...",2025.0,2025-12-09,,,,,,,False,,,,


In [189]:
pattern = "10.12688/gatesopenres.16372"

mask = data['doi'].str.contains(pattern, regex=False, na=False)
result = data[mask]
result

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,title,authors_flat,publication_year,date_created,relations_json,version_label,is_version_of,is_preprint_of,has_preprint,has_review,has_published_version,published_version_ids_json,version_of_ids_json,update_to_json,raw_relationships_json
590196,crossref::10.12688/gatesopenres.16372.1,Gates Open Research,crossref,10.12688/gatesopenres.16372.1,https://doi.org/10.12688/gatesopenres.16372.1,https://gatesopenresearch.org/articles/9-103/v1,Discoverability of African Journals by Google ...,"Amboka, Patrick; Kirimi Sindi, Julius; Wamukoy...",2025.0,2025-11-21,"{""has-preprint"": [{""asserted-by"": ""subject"", ""...",,,,10.12688/verixiv.244.3,,False,,,,


In [None]:
result['relations_json'][590196]

In [None]:
pattern = "10.12688/verixiv.244.3"

mask = data['doi'].str.contains(pattern, regex=False, na=False)
result = data[mask]
result

In [None]:
gate_data

In [None]:
pattern = "10.12688/verixiv."

mask = gate_data['relations_json'].str.contains(pattern, regex=False, na=False)
result = gate_data[mask]
result

In [None]:
result.shape

In [None]:
pattern = "has-preprint"

mask = gate_data['relations_json'].str.contains(pattern, regex=False, na=False)
result = gate_data[mask]
print(result.shape)
result

In [None]:
gate_data2025 = gate_data[gate_data['publication_year'] == '2025.0']
print(gate_data2025.shape)
gate_data2025

In [None]:
gate_data2025first = gate_data2025[gate_data2025['is_version_of']=='']
print(gate_data2025first.shape)
gate_data2025first

In [None]:
pattern = "has-preprint"

mask = ~gate_data2025['relations_json'].str.contains(pattern, regex=False, na=False)
result = gate_data2025[mask]
print(result.shape)
result

In [None]:
pattern = "10.12688/gatesopenres.15431.1"

mask = data['doi'].str.contains(pattern, regex=False, na=False)
result = data[mask]
print(result.shape)
result

## Keldysh Institute Preprints

In [None]:
data[data['server_name']=='Keldysh Institute Preprints']