In [1]:
import pandas as pd
# Set option to display all columns
pd.set_option('display.max_columns', None)


# A) Counts

## import data

In [2]:
import duckdb
from pathlib import Path

con = duckdb.connect()

# Low-memory settings
con.execute("PRAGMA threads=1;")
con.execute("PRAGMA preserve_insertion_order=false;")
con.execute("PRAGMA enable_object_cache=false;")
con.execute("PRAGMA memory_limit='2GB';")           # try 1GB if still unstable
con.execute("PRAGMA temp_directory='data/tmp_duckdb';")

# 2) Build paths robustly from the notebook folder
ROOT = Path.cwd()
if ROOT.name == "notebooks":
    ROOT = ROOT.parent

BASE = ROOT / "data" / "by_server"

# IMPORTANT: your files are hive-partitioned like:
all_backends = (BASE / "*" / "*.parquet").as_posix()

con.execute(f"""
CREATE OR REPLACE VIEW all_backends AS
SELECT * FROM read_parquet('{all_backends}', hive_partitioning=true, union_by_name=true);
""")

# A unified "all_rows" view
con.execute("""
CREATE OR REPLACE VIEW all_rows AS
SELECT * FROM all_backends
""")

print(con.execute("SHOW TABLES").fetchall())


[('all_backends',), ('all_rows',)]


In [3]:
con.execute("""
SELECT backend, COUNT(*) AS total, COUNT(record_id) AS with_record_id
FROM (
    SELECT backend, record_id FROM all_backends
)
GROUP BY backend
""").df()


Unnamed: 0,backend,total,with_record_id
0,crossref,3612833,3612833
1,datacite,3523368,3523368
2,openalex,2350692,2350692


## total count

In [4]:
con.execute("""
SELECT COUNT(DISTINCT record_id) AS total_dedup_records
FROM (
    SELECT record_id FROM all_backends
)
""").df()


Unnamed: 0,total_dedup_records
0,8410094


## count per backend

In [5]:
con.execute("""
SELECT
    backend,
    COUNT(DISTINCT record_id) AS dedup_records
FROM (
    SELECT backend, record_id FROM all_backends
)
GROUP BY backend
ORDER BY dedup_records DESC
""").df()


Unnamed: 0,backend,dedup_records
0,datacite,3520391
1,crossref,3187171
2,openalex,1702532


## count per server

In [6]:
con.execute("""
SELECT
    backend,
    server_name,
    COUNT(DISTINCT record_id) AS dedup_records
FROM (
    SELECT backend, server_name, record_id FROM all_backends
)
GROUP BY backend, server_name
ORDER BY dedup_records DESC
""").df()


Unnamed: 0,backend,server_name,dedup_records
0,datacite,arXiv,2920797
1,crossref,SSRN,1258958
2,openalex,HAL,1056424
3,crossref,Research Square,450818
4,openalex,RePEc: Research Papers in Economics,389398
...,...,...,...
108,crossref,Prepublicaciones OpenCiencia,8
109,crossref,Therapoid,7
110,crossref,NewAddictionsX,7
111,openalex,Bepress Legal Repository,6


In [7]:
server_name_df = con.execute("""
SELECT server_name, COUNT(DISTINCT record_id) AS n
FROM all_backends
GROUP BY 1
ORDER BY n DESC
LIMIT 200;
""").df()


In [8]:
server_name_df.head(60)

Unnamed: 0,server_name,n
0,arXiv,2920797
1,SSRN,1258958
2,HAL,1056424
3,Research Square,450818
4,RePEc: Research Papers in Economics,389398
5,bioRxiv,306948
6,eLife,247558
7,AgEcon Search,188173
8,ResearchGate,181231
9,Qeios,172316


In [9]:
server_name_df.tail(52)

Unnamed: 0,server_name,n
60,CERN document server,973
61,ARPHA Preprints,890
62,MetaArXiv,880
63,SportRxiv,878
64,Gates Open Research,863
65,AgriRxiv,818
66,Beilstein Archives,697
67,Covid-19 Preprints,647
68,EasyChair preprint,620
69,MarXiv,508


# B) Explorations

In [10]:
con.execute(f"""
CREATE OR REPLACE VIEW server_thin AS
SELECT
  CAST(record_id AS VARCHAR)           AS record_id,
  CAST(server_name AS VARCHAR)         AS server_name,
  CAST(backend AS VARCHAR)             AS backend,

  CAST(doi AS VARCHAR)                 AS doi,
  CAST(doi_url AS VARCHAR)             AS doi_url,
  CAST(landing_page_url AS VARCHAR)    AS landing_page_url,

  CAST(version_label AS VARCHAR)       AS version_label,

  -- Relationships (keep these for true version links)
  CAST(relations_json AS VARCHAR)       AS relations_json,
  CAST(raw_relationships_json AS VARCHAR)       AS raw_relationships_json,
  CAST(is_version_of AS VARCHAR)       AS is_version_of,      -- keep as text; we’ll interpret later
  CAST(version_of_ids_json AS VARCHAR) AS version_of_ids_json,
  CAST(is_preprint_of AS VARCHAR)      AS is_preprint_of,
  CAST(published_version_ids_json AS VARCHAR) AS published_version_ids_json,

  -- Dates (helpful for temporal patterns)
  CAST(date_posted AS VARCHAR)         AS date_posted,
  CAST(date_published AS VARCHAR)      AS date_published,
  CAST(date_published_online AS VARCHAR)      AS date_published_online,
  CAST(date_issued AS VARCHAR)         AS date_issued,
  CAST(date_deposited AS VARCHAR)      AS date_deposited,
  CAST(date_indexed AS VARCHAR)        AS date_indexed,
  CAST(date_created AS VARCHAR)        AS date_created,
  CAST(date_registered AS VARCHAR)     AS date_registered,
  CAST(date_updated AS VARCHAR)        AS date_updated,
  CAST(publication_year AS VARCHAR)    AS publication_year
FROM all_backends
""")

con.execute("SELECT COUNT(*) AS n FROM server_thin").df()


Unnamed: 0,n
0,9486893


## Global Exploration

### Duplicates

In [11]:
data = con.execute("SELECT * FROM server_thin").df()
# data.drop_duplicates(subset=['record_id'], keep='first', inplace=False)

data = data.drop_duplicates()
data

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year
0,crossref::10.21467/preprints.48,AIJR Preprints,crossref,10.21467/preprints.48,https://doi.org/10.21467/preprints.48,https://preprints.aijr.org/index.php/ap/prepri...,,,,,,,,2020-05-03,2020-05-03,,2020-05-03,2020-09-15,2025-05-14,2020-09-15,,,2020.0
1,crossref::10.21467/preprints.43,AIJR Preprints,crossref,10.21467/preprints.43,https://doi.org/10.21467/preprints.43,https://preprints.aijr.org/index.php/ap/prepri...,,,,,,,,2020-04-25,2020-04-25,,2020-04-25,2020-09-15,2025-05-14,2020-09-15,,,2020.0
2,crossref::10.21467/preprints.39,AIJR Preprints,crossref,10.21467/preprints.39,https://doi.org/10.21467/preprints.39,https://preprints.aijr.org/index.php/ap/prepri...,,,,,,,,2020-04-16,2020-04-16,,2020-04-16,2020-09-15,2025-05-14,2020-09-15,,,2020.0
3,crossref::10.21467/preprints.38,AIJR Preprints,crossref,10.21467/preprints.38,https://doi.org/10.21467/preprints.38,https://preprints.aijr.org/index.php/ap/prepri...,,,,,,,,2020-04-15,2020-04-15,,2020-04-15,2020-09-17,2022-12-13,2020-09-17,,,2020.0
4,crossref::10.21467/preprints.36,AIJR Preprints,crossref,10.21467/preprints.36,https://doi.org/10.21467/preprints.36,https://preprints.aijr.org/index.php/ap/prepri...,,,,,,,,2020-04-15,2020-04-15,,2020-04-15,2020-09-17,2024-08-11,2020-09-17,,,2020.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9486888,openalex::W999325625,viXra,openalex,,,https://vixra.org/pdf/1409.0090v1.pdf,,,,,,,,,2014-09-01,,,,,2025-10-10T00:00:00,,2025-10-10T17:16:08.811792,2014.0
9486889,openalex::W999460032,viXra,openalex,,,https://vixra.org/abs/1112.0094,,,,,,,,,2011-12-01,,,,,2025-10-10T00:00:00,,2025-10-10T17:16:08.811792,2011.0
9486890,openalex::W99967155,viXra,openalex,,,https://vixra.org/pdf/1406.0019v1.pdf,,,,,,,,,2014-06-01,,,,,2025-10-10T00:00:00,,2025-10-10T17:16:08.811792,2014.0
9486891,openalex::W999790414,viXra,openalex,,,https://vixra.org/pdf/1306.0105v3.pdf,,,,,,,,,2013-06-01,,,,,2025-10-10T00:00:00,,2025-10-10T17:16:08.811792,2013.0


In [12]:
dupes = data[data.duplicated(subset=['record_id'], keep=False)]
dupes

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year
15570,crossref::10.31124/advance.24454624.v1,Advance,crossref,10.31124/advance.24454624.v1,https://doi.org/10.31124/advance.24454624.v1,https://advance.sagepub.com/doi/full/10.31124/...,,,,,,,,2024-02-16,2024-02-16,,2024-02-16,2024-02-22,2024-02-23,2024-02-05,,,2024.0
16016,crossref::10.31124/advance.170921771.12975902/v1,Advance,crossref,10.31124/advance.170921771.12975902/v1,https://doi.org/10.31124/advance.170921771.129...,https://advance.sagepub.com/users/748375/artic...,,,,,,,,2024-02-29,2024-02-29,,2024-02-29,2024-02-29,2024-03-01,2024-02-29,,,2024.0
232722,crossref::10.22541/essoar.167016857.73309909.1,Authorea Inc.,crossref,10.22541/essoar.167016857.73309909.1,https://doi.org/10.22541/essoar.167016857.7330...,https://www.authorea.com/users/525275/articles...,,,,,,,,2022-12-04,2022-12-04,,2022-12-04,2022-12-04,2025-06-03,2022-12-04,,,2022.0
232752,crossref::10.22541/essoar.167024940.09387576/v1,Authorea Inc.,crossref,10.22541/essoar.167024940.09387576/v1,https://doi.org/10.22541/essoar.167024940.0938...,https://www.authorea.com/users/525275/articles...,,,,,,,,2022-12-05,2022-12-05,,2022-12-05,2022-12-05,2025-06-03,2022-12-05,,,2022.0
232822,crossref::10.22541/essoar.167045090.02735580/v1,Authorea Inc.,crossref,10.22541/essoar.167045090.02735580/v1,https://doi.org/10.22541/essoar.167045090.0273...,https://www.authorea.com/users/563627/articles...,,,,,,,,2022-12-07,2022-12-07,,2022-12-07,2022-12-07,2025-06-03,2022-12-07,,,2022.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
446146,crossref::10.22541/essoar.170365313.34695479/v1,Earth and Space Science Open Archive,crossref,10.22541/essoar.170365313.34695479/v1,https://doi.org/10.22541/essoar.170365313.3469...,https://essopenarchive.org/users/531775/articl...,,,,,,,,2023-12-27,2023-12-27,,2023-12-27,2024-02-29,2025-06-03,2023-12-27,,,2023.0
446148,crossref::10.22541/essoar.170680200.06135599/v2,Earth and Space Science Open Archive,crossref,10.22541/essoar.170680200.06135599/v2,https://doi.org/10.22541/essoar.170680200.0613...,https://essopenarchive.org/users/672150/articl...,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.22541/essoar.170680200.06135599/v1,,,,2024-02-29,2024-02-29,,2024-02-29,2024-02-29,2025-06-03,2024-02-29,,,2024.0
446149,crossref::10.22541/essoar.170923255.57545328/v1,Earth and Space Science Open Archive,crossref,10.22541/essoar.170923255.57545328/v1,https://doi.org/10.22541/essoar.170923255.5754...,https://essopenarchive.org/users/528524/articl...,,,,,,,,2024-02-29,2024-02-29,,2024-02-29,2024-02-29,2025-06-03,2024-02-29,,,2024.0
446341,crossref::10.22541/essoar.170612447.73077522/v1,Earth and Space Science Open Archive,crossref,10.22541/essoar.170612447.73077522/v1,https://doi.org/10.22541/essoar.170612447.7307...,https://essopenarchive.org/doi/full/10.22541/e...,,,,,,,,2024-01-24,2024-01-24,,2024-01-24,2024-03-22,2025-11-23,2024-01-24,,,2024.0


In [13]:
dupes['server_name'].value_counts()

server_name
Authorea Inc.                           3432
Earth and Space Science Open Archive    3430
Advance                                    2
Name: count, dtype: int64

In [14]:
data[data['record_id']=='crossref::10.31124/advance.24454624.v1']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year
15570,crossref::10.31124/advance.24454624.v1,Advance,crossref,10.31124/advance.24454624.v1,https://doi.org/10.31124/advance.24454624.v1,https://advance.sagepub.com/doi/full/10.31124/...,,,,,,,,2024-02-16,2024-02-16,,2024-02-16,2024-02-22,2024-02-23,2024-02-05,,,2024.0
273723,crossref::10.31124/advance.24454624.v1,Authorea Inc.,crossref,10.31124/advance.24454624.v1,https://doi.org/10.31124/advance.24454624.v1,https://advance.sagepub.com/doi/full/10.31124/...,,,,,,,,2024-02-16,2024-02-16,,2024-02-16,2024-02-22,2024-02-23,2024-02-05,,,2024.0


In [15]:
data[data['record_id']=='crossref::10.22541/essoar.170923255.57545328/v1']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year
250777,crossref::10.22541/essoar.170923255.57545328/v1,Authorea Inc.,crossref,10.22541/essoar.170923255.57545328/v1,https://doi.org/10.22541/essoar.170923255.5754...,https://essopenarchive.org/users/528524/articl...,,,,,,,,2024-02-29,2024-02-29,,2024-02-29,2024-02-29,2025-06-03,2024-02-29,,,2024.0
446149,crossref::10.22541/essoar.170923255.57545328/v1,Earth and Space Science Open Archive,crossref,10.22541/essoar.170923255.57545328/v1,https://doi.org/10.22541/essoar.170923255.5754...,https://essopenarchive.org/users/528524/articl...,,,,,,,,2024-02-29,2024-02-29,,2024-02-29,2024-02-29,2025-06-03,2024-02-29,,,2024.0


#### Resolution

In [16]:
import pandas as pd

# ----------------------------------
# 1) Define server priority
#    Lower value = higher priority (kept first)
# ----------------------------------
server_priority = {
    "Earth and Space Science Open Archive": 1,
    "Advance": 2,
    "Authorea Inc.": 3
}

# Work on a copy to avoid side effects
data = data.copy()

# ----------------------------------
# 2) Add priority column
#    Unknown servers get lowest priority
# ----------------------------------
data['server_priority'] = (
    data['server_name']
    .map(server_priority)
    .fillna(99)
    .astype(int)
)

# ----------------------------------
# 3) Deduplicate STRICTLY on record_id
#    - Sort so preferred server comes first
#    - Keep only the best row per record_id
# ----------------------------------
data_clean = (
    data
    .sort_values(by=['record_id', 'server_priority'])
    .drop_duplicates(subset=['record_id'], keep='first')
    .drop(columns=['server_priority'])
)

# ----------------------------------
# 4) (Optional but recommended) Inspect removals
# ----------------------------------
removed = data.loc[~data.index.isin(data_clean.index)]

print("Removed rows by server_name:")
print(removed['server_name'].value_counts(dropna=False))

print("\nRows before:", len(data))
print("Rows after :", len(data_clean))
print("Rows removed:", len(removed))

# ----------------------------------
# data_clean is the final deduplicated dataframe
# ----------------------------------


Removed rows by server_name:
server_name
Authorea Inc.    3432
Name: count, dtype: int64

Rows before: 8413526
Rows after : 8410094
Rows removed: 3432


In [17]:
data_clean.shape

(8410094, 23)

### Columns

In [18]:
data_clean.count()

record_id                     8410094
server_name                   8410094
backend                       8410094
doi                           6784859
doi_url                       6784859
landing_page_url              8330648
version_label                 2974496
relations_json                4253292
raw_relationships_json        3520391
is_version_of                 6707562
version_of_ids_json                 0
is_preprint_of                6707562
published_version_ids_json          0
date_posted                   1947932
date_published                4743663
date_published_online          288705
date_issued                   3041131
date_deposited                3187171
date_indexed                  3187171
date_created                  8410094
date_registered               3520391
date_updated                  5222915
publication_year              8264048
dtype: int64

### Summary

In [19]:
import pandas as pd
import numpy as np

data = data_clean.copy()

# -------------------------------------------------
# Configuration
# -------------------------------------------------
GROUP_COL = 'server_name'

MISSINGNESS_COLUMNS = [
    'doi',
    'landing_page_url'
]

COMPLETENESS_COLUMNS = [
    'version_label',
    'relations_json',
    'is_version_of',
    'is_preprint_of',
    'raw_relationships_json'
]

# -------------------------------------------------
# Helper functions
# -------------------------------------------------
def is_missing(series):
    """Strict missingness (NaN / None)"""
    return series.isna()

def is_incomplete(series):
    """
    Missing OR empty content:
    - NaN
    - empty string
    - empty list
    - empty dict
    """
    return (
        series.isna()
        | (series.astype(str).str.strip() == '')
        | (series.astype(str).isin(['[]', '{}']))
    )

# -------------------------------------------------
# Base counts per server
# -------------------------------------------------
base = (
    data
    .groupby(GROUP_COL, dropna=False)
    .size()
    .rename('total_records')
    .to_frame()
)

# -------------------------------------------------
# Missingness metrics
# -------------------------------------------------
for col in MISSINGNESS_COLUMNS:
    missing_count = (
        data.loc[is_missing(data[col])]
        .groupby(GROUP_COL, dropna=False)
        .size()
        .rename(f'{col}_missing_count')
    )

    base = base.join(missing_count, how='left').fillna(0)
    base[f'{col}_missing_count'] = base[f'{col}_missing_count'].astype(int)
    base[f'{col}_missing_percent'] = (
        base[f'{col}_missing_count'] / base['total_records'] * 100
    ).round(2)

# -------------------------------------------------
# Completeness metrics
# -------------------------------------------------
for col in COMPLETENESS_COLUMNS:
    incomplete_count = (
        data.loc[is_incomplete(data[col])]
        .groupby(GROUP_COL, dropna=False)
        .size()
        .rename(f'{col}_incomplete_count')
    )

    base = base.join(incomplete_count, how='left').fillna(0)
    base[f'{col}_incomplete_count'] = base[f'{col}_incomplete_count'].astype(int)
    base[f'{col}_incomplete_percent'] = (
        base[f'{col}_incomplete_count'] / base['total_records'] * 100
    ).round(2)

# -------------------------------------------------
# Final table
# -------------------------------------------------
summary = base.sort_values('total_records', ascending=False)

# summary


In [20]:
summary.head(60)

Unnamed: 0_level_0,total_records,doi_missing_count,doi_missing_percent,landing_page_url_missing_count,landing_page_url_missing_percent,version_label_incomplete_count,version_label_incomplete_percent,relations_json_incomplete_count,relations_json_incomplete_percent,is_version_of_incomplete_count,is_version_of_incomplete_percent,is_preprint_of_incomplete_count,is_preprint_of_incomplete_percent,raw_relationships_json_incomplete_count,raw_relationships_json_incomplete_percent
server_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
arXiv,2920797,0,0.0,0,0.0,0,0.0,1638047,56.08,2920797,100.0,1638164,56.09,0,0.0
SSRN,1258958,0,0.0,0,0.0,1258938,100.0,1258703,99.98,1258958,100.0,1258703,99.98,1258958,100.0
HAL,1056424,1024809,97.01,103,0.01,1056424,100.0,1056424,100.0,1056424,100.0,1056424,100.0,1056424,100.0
Research Square,450818,0,0.0,0,0.0,450813,100.0,283964,62.99,435544,96.61,295396,65.52,450818,100.0
RePEc: Research Papers in Economics,389398,364045,93.49,39749,10.21,389398,100.0,389398,100.0,389398,100.0,389398,100.0,389398,100.0
bioRxiv,306948,0,0.0,0,0.0,306937,100.0,196496,64.02,306947,100.0,196497,64.02,306948,100.0
eLife,247558,0,0.0,0,0.0,247192,99.85,151705,61.28,233206,94.2,247537,99.99,247558,100.0
AgEcon Search,188173,0,0.0,0,0.0,188173,100.0,188173,100.0,188173,100.0,188173,100.0,0,0.0
ResearchGate,181231,0,0.0,0,0.0,174949,96.53,174741,96.42,174741,96.42,181231,100.0,0,0.0
Qeios,172316,0,0.0,0,0.0,172316,100.0,111907,64.94,167745,97.35,172316,100.0,172316,100.0


In [21]:
summary.tail(52)

Unnamed: 0_level_0,total_records,doi_missing_count,doi_missing_percent,landing_page_url_missing_count,landing_page_url_missing_percent,version_label_incomplete_count,version_label_incomplete_percent,relations_json_incomplete_count,relations_json_incomplete_percent,is_version_of_incomplete_count,is_version_of_incomplete_percent,is_preprint_of_incomplete_count,is_preprint_of_incomplete_percent,raw_relationships_json_incomplete_count,raw_relationships_json_incomplete_percent
server_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
CERN document server,973,0,0.0,0,0.0,933,95.89,46,4.73,973,100.0,443,45.53,0,0.0
ARPHA Preprints,890,0,0.0,0,0.0,890,100.0,578,64.94,890,100.0,578,64.94,890,100.0
MetaArXiv,880,0,0.0,0,0.0,880,100.0,681,77.39,813,92.39,742,84.32,880,100.0
SportRxiv,878,0,0.0,0,0.0,878,100.0,794,90.43,877,99.89,795,90.55,878,100.0
Gates Open Research,863,0,0.0,0,0.0,539,62.46,83,9.62,799,92.58,863,100.0,863,100.0
AgriRxiv,818,0,0.0,0,0.0,818,100.0,805,98.41,818,100.0,805,98.41,818,100.0
Beilstein Archives,697,0,0.0,0,0.0,697,100.0,236,33.86,697,100.0,236,33.86,697,100.0
Covid-19 Preprints,647,0,0.0,0,0.0,647,100.0,647,100.0,647,100.0,647,100.0,647,100.0
EasyChair preprint,620,0,0.0,0,0.0,620,100.0,620,100.0,620,100.0,620,100.0,620,100.0
MarXiv,508,0,0.0,0,0.0,508,100.0,249,49.02,506,99.61,251,49.41,508,100.0


In [22]:
# data_clean

## Manage hierarchy

In [23]:
import pandas as pd
import numpy as np
import re

# ============================================================
# 1) Read the Google Sheet (rules tab) as CSV
#    - Works for public / shared-to-anyone sheets
# ============================================================

SHEET_ID = "10_7FdcpZjntqFsEHIii7bAM72uF__of_iUohSD5w8w4"
GID = "1230415212"  # the gid you shared for the 'rules' tab

rules_csv_url = f"https://docs.google.com/spreadsheets/d/{SHEET_ID}/export?format=csv&gid={GID}"
rules = pd.read_csv(rules_csv_url)

# Normalize column names (strip spaces/newlines)
rules.columns = (
    rules.columns.astype(str)
    .str.replace(r"\s+", " ", regex=True)  # collapse whitespace/newlines
    .str.strip()
)

# ============================================================
# 2) Extract servers where "versionning - doi" == "unique"
# ============================================================

# These are the column names seen in your screenshot:
SERVER_COL = "Field_server_name"
VERS_COL = "versionning - doi"  # after normalization it should match like this

# Safety: show close matches if something is off
if SERVER_COL not in rules.columns or VERS_COL not in rules.columns:
    print("Columns available:", rules.columns.tolist())
    raise KeyError(f"Expected columns not found. Need: {SERVER_COL!r} and {VERS_COL!r}")

# Build "unique versioning" server list
unique_servers = (
    rules.loc[
        rules[VERS_COL].astype(str).str.strip().str.lower().eq("unique"),
        SERVER_COL
    ]
    .dropna()
    .astype(str)
    .str.strip()
    .unique()
    .tolist()
)

print(f"Unique-versioning servers found: {len(unique_servers)}")
print(unique_servers[:30])  # preview

# ============================================================
# 3) Apply mapping to your dataframe df -> records_hierarchy = "parent"
# ============================================================

df = data_clean.copy()

# Normalize server_name in df for matching
df["server_name_norm"] = df["server_name"].astype(str).str.strip()

# Normalize unique list too (strip)
unique_servers_norm = [s.strip() for s in unique_servers]

# Create / overwrite records_hierarchy
# Default: keep existing if present; else "other"
# df["records_hierarchy"] = df["records_hierarchy"] if "records_hierarchy" in df.columns else "other"

mask_unique = df["server_name_norm"].isin(unique_servers_norm)
df.loc[mask_unique, "records_hierarchy"] = "parent"

# Optional: drop helper col
df.drop(columns=["server_name_norm"], inplace=True)

# Quick check
print(df["records_hierarchy"].value_counts(dropna=False).head(10))
print(df.loc[df["server_name"].isin(unique_servers_norm), ["server_name","records_hierarchy"]].drop_duplicates().head(20))


Unique-versioning servers found: 60
['AgEcon Search', 'AIJR Preprints', 'ARPHA Preprints', 'ART-Dok', 'arXiv', 'Bepress Legal Repository', 'bioRxiv', 'CERN document server', 'CogPrints', 'Covid-19 Preprints', 'CrimRxiv', 'CrossAsia-Repository', 'Digital Access to Scholarship at Harvard (DASH) (Harvard University)', 'DSpace@MIT', 'E-LIS Repository', 'EasyChair preprint', 'EcoEvoRxiv', 'EconStor Preprints', 'Electron Colloquium Comput Complex', 'ELPUB (Universitat Wuppertal)', 'EmeRI', 'EnerarXiv', 'HAL', 'HANS Publication PrePrints', 'Humanities Commons CORE', 'IACR Cryptology ePrint Archive', 'IndiaRxiv', 'JMIR Preprints', 'Keldysh Institute Preprints', 'LatArXiv']
records_hierarchy
parent    7205232
NaN       1204862
Name: count, dtype: int64
                         server_name records_hierarchy
2119783            Nature Precedings            parent
9461322                   searchRxiv            parent
5729276         UCL Open Environment            parent
8967487                   

In [24]:
df_remain = df[df['records_hierarchy'].isna()]
df_remain['server_name'].value_counts()

server_name
Research Square                         450818
eLife                                   247558
Preprints.org                           115815
Authorea Inc.                            62018
EGUsphere                                60396
PsyArXiv                                 56866
ChemRxiv                                 46475
TechRxiv                                 29418
Earth and Space Science Open Archive     22932
SocArXiv                                 21541
INA-Rxiv                                 17837
F1000Research                            16873
EarthArXiv                                6537
PeerJ Preprints                           6446
ScienceOpen Preprints                     5087
engrXiv                                   4929
Wellcome Open Research                    4727
Advance                                   4401
Thesis Commons                            3959
Cambridge Open Engage                     3090
EdArXiv                                   2547
O

In [25]:
df = df.copy()

# Work only on remaining (not already forced to parent)
mask_remain = df["records_hierarchy"] != "parent"

# Normalize version_label once
vl = df["version_label"].astype(str).str.strip().str.lower()

# CHILD rules
child_mask = (
    mask_remain &
    vl.isin(["new version", 'New version; Retraction'])
)
df.loc[child_mask, "records_hierarchy"] = "child"

# correction rules
correction_mask = (
    mask_remain &
    vl.isin(["correction"])
)
df.loc[correction_mask, "records_hierarchy"] = "correction"

print(df["records_hierarchy"].value_counts(dropna=False))

records_hierarchy
parent        7205232
NaN           1196060
child            8448
correction        354
Name: count, dtype: int64


In [26]:
df_remain = df[df['records_hierarchy'].isna()]
df_remain['server_name'].value_counts()

server_name
Research Square                         450818
eLife                                   247204
Preprints.org                           115815
Authorea Inc.                            62018
EGUsphere                                60396
PsyArXiv                                 56866
ChemRxiv                                 46475
TechRxiv                                 29418
Earth and Space Science Open Archive     22932
SocArXiv                                 21541
INA-Rxiv                                 17837
F1000Research                            11226
EarthArXiv                                6537
PeerJ Preprints                           6446
ScienceOpen Preprints                     5087
engrXiv                                   4929
Advance                                   4401
Thesis Commons                            3959
Wellcome Open Research                    3428
Cambridge Open Engage                     3090
EdArXiv                                   2547
L

In [27]:
df = df.copy()

# Ensure column exists
# if "records_hierarchy" not in df.columns:
#     df["records_hierarchy"] = "other"

# Normalize relations_json once
relations_norm = df["relations_json"].astype(str).str.lower()

# Only touch rows not already parent/child
mask_remain = df["records_hierarchy"].isin(["other", None, np.nan])

# Review signals
review_mask = (
    mask_remain &
    (
        # relations_norm.str.contains("has-review", na=False) |
        relations_norm.str.contains("is-review-of", na=False) 
    )
)

df.loc[review_mask, "records_hierarchy"] = "review"
print(df["records_hierarchy"].value_counts(dropna=False))

records_hierarchy
parent        7205232
NaN           1076468
review         119592
child            8448
correction        354
Name: count, dtype: int64


In [28]:
df_remain = df[df['records_hierarchy'].isna()]
df_remain['server_name'].value_counts()

server_name
Research Square                         450818
eLife                                   173835
Preprints.org                           115815
Authorea Inc.                            62018
PsyArXiv                                 56866
ChemRxiv                                 46475
TechRxiv                                 29418
Earth and Space Science Open Archive     22932
SocArXiv                                 21541
INA-Rxiv                                 17837
EGUsphere                                14963
F1000Research                            11226
EarthArXiv                                6537
PeerJ Preprints                           6446
engrXiv                                   4929
Advance                                   4401
ScienceOpen Preprints                     4297
Thesis Commons                            3959
Wellcome Open Research                    3428
Cambridge Open Engage                     3090
EdArXiv                                   2547
L

In [29]:
import re
import numpy as np
import pandas as pd

df = df.copy()

# ------------------------------------------------------------
# 0) Ensure target column exists
# ------------------------------------------------------------

# normalize strings once (safe)
df["landing_norm"] = df.get("landing_page_url", "").astype(str).str.lower()
df["doi_norm"] = df.get("doi", "").astype(str).str.lower()

# ------------------------------------------------------------
# 1) Your regex (good: avoids /v284p and .v50i7)
# ------------------------------------------------------------
VERSION_RX = re.compile(
    r'(?P<token>('
    r'/v\d+(?![a-z0-9])|'       # /v1 but not /v284p
    r'\.v\d+(?![a-z0-9])|'      # .v1 but not .v50i7
    r'_v\d+(?![a-z0-9])|'       # _v1
    r'-v\d+(?![a-z0-9])|'       # -v2 but not -v8018x
    r'-rc\d+(?![a-z0-9])|'      # -rc1
    r'-cc\d+(?![a-z0-9])|'      # -cc1
    r'-supplement|'             # -supplement
    r'\.sa\d+(?![a-z0-9])'      # .sa10
    r'))',
    re.IGNORECASE
)

# helpers: classify token family + get numeric v
VNUM_RX = re.compile(r'(?:^|[._/\-])v(\d+)$', re.IGNORECASE)  # matches v1 at end of token
def extract_token(text: str):
    if not isinstance(text, str) or text.strip() == "" or text.lower() == "nan":
        return None
    m = VERSION_RX.search(text)
    return m.group("token") if m else None

def token_kind(token: str):
    if not isinstance(token, str):
        return None
    t = token.lower()
    if t.startswith(("/v", ".v", "_v", "-v")):
        return "explicit_version"
    if t.startswith("-rc"):
        return "rc"
    if t.startswith("-cc"):
        return "cc"
    if t == "-supplement":
        return "supplement"
    if t.startswith(".sa"):
        return "supplementary_asset"
    return "other"

def token_vnum(token: str):
    if not isinstance(token, str):
        return np.nan
    t = token.lower()
    # extract trailing v number for explicit version tokens only
    m = re.search(r'v(\d+)$', t)
    return float(m.group(1)) if m else np.nan

# ------------------------------------------------------------
# 2) Apply extraction: landing_page_url first, then doi
# ------------------------------------------------------------
remain = df["records_hierarchy"].fillna("other").eq("other")

# --- landing page tokens ---
df.loc[remain, "version_token_lp"] = df.loc[remain, "landing_norm"].map(extract_token)
df.loc[remain, "token_kind_lp"] = df.loc[remain, "version_token_lp"].map(token_kind)
df.loc[remain, "vnum_lp"] = df.loc[remain, "version_token_lp"].map(token_vnum)

# Label rules (landing_page_url)
# - Only use explicit vN tokens for parent/child
# - parent if v0 or v1, child if v>=2
lp_parent = remain & (df["token_kind_lp"] == "explicit_version") & (df["vnum_lp"].isin([0.0, 1.0]))
lp_child  = remain & (df["token_kind_lp"] == "explicit_version") & (df["vnum_lp"] >= 2)

df.loc[lp_parent, "records_hierarchy"] = "parent"
df.loc[lp_child,  "records_hierarchy"] = "child"

# Everything else from landing tokens (rc/cc/supplement/.sa) keep as other
# If you prefer to label them separately, do it here:
df.loc[remain & df["token_kind_lp"].isin(["rc","cc"]), "records_hierarchy"] = "comment"
df.loc[remain & df["token_kind_lp"].isin(["supplement","supplementary_asset"]), "records_hierarchy"] = "part_of"

# --- doi tokens (only for still-unlabeled) ---
remain2 = df["records_hierarchy"].fillna("other").eq("other")

df.loc[remain2, "version_token_doi"] = df.loc[remain2, "doi_norm"].map(extract_token)
df.loc[remain2, "token_kind_doi"] = df.loc[remain2, "version_token_doi"].map(token_kind)
df.loc[remain2, "vnum_doi"] = df.loc[remain2, "version_token_doi"].map(token_vnum)

doi_parent = remain2 & (df["token_kind_doi"] == "explicit_version") & (df["vnum_doi"].isin([0.0, 1.0]))
doi_child  = remain2 & (df["token_kind_doi"] == "explicit_version") & (df["vnum_doi"] >= 2)

df.loc[doi_parent, "records_hierarchy"] = "parent"
df.loc[doi_child,  "records_hierarchy"] = "child"

# ------------------------------------------------------------
# 3) Optional: quick diagnostics
# ------------------------------------------------------------
# How many got labeled via landing vs doi?
print(df["records_hierarchy"].value_counts(dropna=False))
# print(df.loc[df["records_hierarchy"].isin(["parent","child"]),
#              ["server_name","landing_page_url","doi","version_token_lp","version_token_doi","records_hierarchy"]].head(20))


records_hierarchy
parent        7922000
NaN            279301
review         119592
child           84259
part_of          4584
correction        354
comment             4
Name: count, dtype: int64


In [30]:
df_remain = df[df['records_hierarchy'].isna()]
df_remain['server_name'].value_counts()

server_name
eLife                                   173835
ChemRxiv                                 27562
Earth and Space Science Open Archive     13220
EGUsphere                                10457
TechRxiv                                  8320
PsyArXiv                                  6632
PeerJ Preprints                           6446
EarthArXiv                                6425
Authorea Inc.                             6092
engrXiv                                   4925
SocArXiv                                  3072
INA-Rxiv                                  2830
Cambridge Open Engage                     2176
Advance                                   1717
Thesis Commons                            1507
APSA Preprints                            1176
AgriRxiv                                   787
EdArXiv                                    550
Law Archive                                287
Arabixiv                                   209
MetaArXiv                                  196
S

In [31]:
df = df.copy()

# Ensure column exists
# if "records_hierarchy" not in df.columns:
#     df["records_hierarchy"] = "other"

# Normalize relations_json once
relations_norm = df["landing_page_url"].astype(str).str.lower()

# Only touch rows not already parent/child
mask_remain = df["records_hierarchy"].isin(["other", None, np.nan])

# Review signals
review_mask = (
    mask_remain &
    (
        relations_norm.str.contains("#fig", na=False) |
        relations_norm.str.contains("#digest", na=False) |
        relations_norm.str.contains("#supp", na=False) |
        relations_norm.str.contains("#video", na=False) |
        relations_norm.str.contains("#media", na=False) |
        relations_norm.str.contains("#tbl", na=False) |
        relations_norm.str.contains("#table", na=False) |
        relations_norm.str.contains("#sd", na=False) |
        relations_norm.str.contains("#transrepform", na=False) |
        relations_norm.str.contains("/figures#", na=False) |
        relations_norm.str.contains("#box", na=False) |
        relations_norm.str.contains("#app", na=False) |
        relations_norm.str.contains("#resp", na=False) |
        relations_norm.str.contains("#a", na=False) |
        relations_norm.str.contains("#b", na=False) |
        relations_norm.str.contains("#c", na=False) |
        relations_norm.str.contains("#s", na=False) |
        relations_norm.str.contains("#atbl", na=False) |
        relations_norm.str.contains("#sa", na=False) |
        relations_norm.str.contains("#none", na=False) |
        relations_norm.str.contains("#desfig", na=False) |
        relations_norm.str.contains("#keyresource", na=False) |
        relations_norm.str.contains("#abstract", na=False) 
    )
)
df.loc[review_mask, "records_hierarchy"] = "part_of"
#
# Review signals
comment_mask = (
    mask_remain &
    (
        relations_norm.str.contains("#ac", na=False) |
        relations_norm.str.contains("#rc", na=False) |
        relations_norm.str.contains("#cc", na=False) |
        relations_norm.str.contains("#ec", na=False) |
        relations_norm.str.contains("/peer-reviews", na=False) |
        relations_norm.str.contains("#decision-letter", na=False) |
        relations_norm.str.contains("#cec", na=False) 
    )
)

df.loc[comment_mask, "records_hierarchy"] = "comment"


print(df["records_hierarchy"].value_counts(dropna=False))

records_hierarchy
parent        7922000
part_of        148204
NaN            135383
review         119592
child           84259
correction        354
comment           302
Name: count, dtype: int64


In [32]:
df_remain = df[df['records_hierarchy'].isna()]
df_remain['server_name'].value_counts()

server_name
eLife                                   30165
ChemRxiv                                27562
Earth and Space Science Open Archive    13220
EGUsphere                               10209
TechRxiv                                 8320
PsyArXiv                                 6632
PeerJ Preprints                          6446
EarthArXiv                               6425
Authorea Inc.                            6092
engrXiv                                  4925
SocArXiv                                 3072
INA-Rxiv                                 2830
Cambridge Open Engage                    2176
Advance                                  1717
Thesis Commons                           1507
APSA Preprints                           1176
AgriRxiv                                  787
EdArXiv                                   550
Law Archive                               287
Arabixiv                                  209
MetaArXiv                                 196
ScienceOpen Preprints 

In [33]:
pattern = "10.7554/elife.99997"

mask = df['doi'].str.contains(pattern, regex=False, na=False)
result = df[mask]
result

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
9330914,crossref::10.7554/elife.99997,eLife,crossref,10.7554/elife.99997,https://doi.org/10.7554/elife.99997,https://elifesciences.org/articles/99997,,"{""has-preprint"": [{""asserted-by"": ""subject"", ""...",,10.7554/elife.99997.1;10.7554/elife.99997.2,,,,,2025-01-21,2025-01-21,2025-01-21,2025-01-21,2026-01-01,2024-10-08,,,2025.0,,https://elifesciences.org/articles/99997,10.7554/elife.99997,,,,,,
9352764,crossref::10.7554/elife.99997.1,eLife,crossref,10.7554/elife.99997.1,https://doi.org/10.7554/elife.99997.1,https://elifesciences.org/reviewed-preprints/9...,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.1101/2024.06.13.598625,,,,2024-10-08,2024-10-08,,2024-10-08,2025-05-22,2025-11-23,2024-10-08,,,2024.0,,https://elifesciences.org/reviewed-preprints/9...,10.7554/elife.99997.1,,,,,,
9353744,crossref::10.7554/elife.99997.1.sa0,eLife,crossref,10.7554/elife.99997.1.sa0,https://doi.org/10.7554/elife.99997.1.sa0,https://elifesciences.org/reviewed-preprints/9...,,"{""is-review-of"": [{""asserted-by"": ""subject"", ""...",,,,,,,2024-10-08,,2024-10-08,2025-05-22,2025-05-23,2024-10-08,,,2024.0,review,https://elifesciences.org/reviewed-preprints/9...,10.7554/elife.99997.1.sa0,,,,,,
9353735,crossref::10.7554/elife.99997.1.sa1,eLife,crossref,10.7554/elife.99997.1.sa1,https://doi.org/10.7554/elife.99997.1.sa1,https://elifesciences.org/reviewed-preprints/9...,,"{""is-review-of"": [{""asserted-by"": ""subject"", ""...",,,,,,,2024-10-08,,2024-10-08,2025-05-22,2025-05-23,2024-10-08,,,2024.0,review,https://elifesciences.org/reviewed-preprints/9...,10.7554/elife.99997.1.sa1,,,,,,
9353740,crossref::10.7554/elife.99997.1.sa2,eLife,crossref,10.7554/elife.99997.1.sa2,https://doi.org/10.7554/elife.99997.1.sa2,https://elifesciences.org/reviewed-preprints/9...,,"{""is-review-of"": [{""asserted-by"": ""subject"", ""...",,,,,,,2024-10-08,,2024-10-08,2025-05-22,2025-05-23,2024-10-08,,,2024.0,review,https://elifesciences.org/reviewed-preprints/9...,10.7554/elife.99997.1.sa2,,,,,,
9353736,crossref::10.7554/elife.99997.1.sa3,eLife,crossref,10.7554/elife.99997.1.sa3,https://doi.org/10.7554/elife.99997.1.sa3,https://elifesciences.org/reviewed-preprints/9...,,"{""is-review-of"": [{""asserted-by"": ""subject"", ""...",,,,,,,2024-10-08,,2024-10-08,2025-05-22,2025-05-23,2024-10-08,,,2024.0,review,https://elifesciences.org/reviewed-preprints/9...,10.7554/elife.99997.1.sa3,,,,,,
9356050,crossref::10.7554/elife.99997.2,eLife,crossref,10.7554/elife.99997.2,https://doi.org/10.7554/elife.99997.2,https://elifesciences.org/reviewed-preprints/9...,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.1101/2024.06.13.598625;10.7554/elife.99997.1,,,,2025-01-06,2025-01-06,,2025-01-06,2025-05-23,2025-11-23,2025-01-06,,,2025.0,,https://elifesciences.org/reviewed-preprints/9...,10.7554/elife.99997.2,,,,,,
9357562,crossref::10.7554/elife.99997.2.sa0,eLife,crossref,10.7554/elife.99997.2.sa0,https://doi.org/10.7554/elife.99997.2.sa0,https://elifesciences.org/reviewed-preprints/9...,,"{""is-review-of"": [{""asserted-by"": ""subject"", ""...",,,,,,,2025-01-06,,2025-01-06,2025-05-23,2025-05-24,2025-01-06,,,2025.0,review,https://elifesciences.org/reviewed-preprints/9...,10.7554/elife.99997.2.sa0,,,,,,
9357558,crossref::10.7554/elife.99997.2.sa1,eLife,crossref,10.7554/elife.99997.2.sa1,https://doi.org/10.7554/elife.99997.2.sa1,https://elifesciences.org/reviewed-preprints/9...,,"{""is-review-of"": [{""asserted-by"": ""subject"", ""...",,,,,,,2025-01-06,,2025-01-06,2025-05-23,2025-05-24,2025-01-06,,,2025.0,review,https://elifesciences.org/reviewed-preprints/9...,10.7554/elife.99997.2.sa1,,,,,,
9357554,crossref::10.7554/elife.99997.2.sa2,eLife,crossref,10.7554/elife.99997.2.sa2,https://doi.org/10.7554/elife.99997.2.sa2,https://elifesciences.org/reviewed-preprints/9...,,"{""is-review-of"": [{""asserted-by"": ""subject"", ""...",,,,,,,2025-01-06,,2025-01-06,2025-05-23,2025-05-24,2025-01-06,,,2025.0,review,https://elifesciences.org/reviewed-preprints/9...,10.7554/elife.99997.2.sa2,,,,,,


In [34]:
import re
import numpy as np
import pandas as pd

df = df.copy()

# Ensure column exists
# if "records_hierarchy" not in df.columns:
#     df["records_hierarchy"] = np.nan

# Work ONLY on remaining rows
remaining = df["records_hierarchy"].isna()

is_elife = df["server_name"].astype(str).str.lower().eq("elife")
doi_l = df["doi"].astype(str).str.lower()

# ---------- Regex patterns ----------

# Base parent: 10.7554/elife.12523
RX_PARENT_BASE = re.compile(    r"^10\.7554/elife\.\d+$",    re.IGNORECASE)

# Explicit parent v1 (NO padding)
RX_PARENT_V1 = re.compile(    r"^10\.7554/elife\.\d+\.1$",    re.IGNORECASE)

# Any dotted numeric suffix (captures padding too)
RX_ANY_SUFFIX = re.compile(r"^10\.7554/elife\.\d+\.(\d+)$", re.IGNORECASE)

# ---------- Apply rules ----------

# Parent: base DOI
mask_parent_base = remaining & is_elife & doi_l.str.match(RX_PARENT_BASE, na=False)
df.loc[mask_parent_base, "records_hierarchy"] = "parent"

# Parent: explicit ".1" ONLY
mask_parent_v1 = remaining & is_elife & doi_l.str.match(RX_PARENT_V1, na=False)
df.loc[mask_parent_v1, "records_hierarchy"] = "child"

# Child: any numeric suffix EXCEPT exact ".1"
suffix = doi_l.str.extract(RX_ANY_SUFFIX)[0]

mask_child = (
    remaining
    & is_elife
    & suffix.notna()
    & (suffix != "1")   # excludes .1 but keeps .001, .002, .2, etc.
)

df.loc[mask_child, "records_hierarchy"] = "child"


In [35]:
df_remain = df[df['records_hierarchy'].isna()]
df_remain['server_name'].value_counts()

server_name
ChemRxiv                                27562
Earth and Space Science Open Archive    13220
EGUsphere                               10209
TechRxiv                                 8320
PsyArXiv                                 6632
PeerJ Preprints                          6446
EarthArXiv                               6425
Authorea Inc.                            6092
engrXiv                                  4925
SocArXiv                                 3072
INA-Rxiv                                 2830
Cambridge Open Engage                    2176
Advance                                  1717
Thesis Commons                           1507
APSA Preprints                           1176
AgriRxiv                                  787
EdArXiv                                   550
Law Archive                               287
Arabixiv                                  209
MetaArXiv                                 196
ScienceOpen Preprints                     151
LIS Scholarship Archiv

In [36]:
df_remain[df_remain['server_name']=='eLife']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
9130897,crossref::10.7554/elife,eLife,crossref,10.7554/elife,https://doi.org/10.7554/elife,https://elifesciences.org/,,,,,,,,,,,,2017-07-25,2025-12-30,2017-07-25,,,,,https://elifesciences.org/,10.7554/elife,,,,,,


In [37]:
pattern = "10.7554/elife.99997"

mask = df['doi'].str.contains(pattern, regex=False, na=False)
result = df[mask]
result

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
9330914,crossref::10.7554/elife.99997,eLife,crossref,10.7554/elife.99997,https://doi.org/10.7554/elife.99997,https://elifesciences.org/articles/99997,,"{""has-preprint"": [{""asserted-by"": ""subject"", ""...",,10.7554/elife.99997.1;10.7554/elife.99997.2,,,,,2025-01-21,2025-01-21,2025-01-21,2025-01-21,2026-01-01,2024-10-08,,,2025.0,parent,https://elifesciences.org/articles/99997,10.7554/elife.99997,,,,,,
9352764,crossref::10.7554/elife.99997.1,eLife,crossref,10.7554/elife.99997.1,https://doi.org/10.7554/elife.99997.1,https://elifesciences.org/reviewed-preprints/9...,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.1101/2024.06.13.598625,,,,2024-10-08,2024-10-08,,2024-10-08,2025-05-22,2025-11-23,2024-10-08,,,2024.0,child,https://elifesciences.org/reviewed-preprints/9...,10.7554/elife.99997.1,,,,,,
9353744,crossref::10.7554/elife.99997.1.sa0,eLife,crossref,10.7554/elife.99997.1.sa0,https://doi.org/10.7554/elife.99997.1.sa0,https://elifesciences.org/reviewed-preprints/9...,,"{""is-review-of"": [{""asserted-by"": ""subject"", ""...",,,,,,,2024-10-08,,2024-10-08,2025-05-22,2025-05-23,2024-10-08,,,2024.0,review,https://elifesciences.org/reviewed-preprints/9...,10.7554/elife.99997.1.sa0,,,,,,
9353735,crossref::10.7554/elife.99997.1.sa1,eLife,crossref,10.7554/elife.99997.1.sa1,https://doi.org/10.7554/elife.99997.1.sa1,https://elifesciences.org/reviewed-preprints/9...,,"{""is-review-of"": [{""asserted-by"": ""subject"", ""...",,,,,,,2024-10-08,,2024-10-08,2025-05-22,2025-05-23,2024-10-08,,,2024.0,review,https://elifesciences.org/reviewed-preprints/9...,10.7554/elife.99997.1.sa1,,,,,,
9353740,crossref::10.7554/elife.99997.1.sa2,eLife,crossref,10.7554/elife.99997.1.sa2,https://doi.org/10.7554/elife.99997.1.sa2,https://elifesciences.org/reviewed-preprints/9...,,"{""is-review-of"": [{""asserted-by"": ""subject"", ""...",,,,,,,2024-10-08,,2024-10-08,2025-05-22,2025-05-23,2024-10-08,,,2024.0,review,https://elifesciences.org/reviewed-preprints/9...,10.7554/elife.99997.1.sa2,,,,,,
9353736,crossref::10.7554/elife.99997.1.sa3,eLife,crossref,10.7554/elife.99997.1.sa3,https://doi.org/10.7554/elife.99997.1.sa3,https://elifesciences.org/reviewed-preprints/9...,,"{""is-review-of"": [{""asserted-by"": ""subject"", ""...",,,,,,,2024-10-08,,2024-10-08,2025-05-22,2025-05-23,2024-10-08,,,2024.0,review,https://elifesciences.org/reviewed-preprints/9...,10.7554/elife.99997.1.sa3,,,,,,
9356050,crossref::10.7554/elife.99997.2,eLife,crossref,10.7554/elife.99997.2,https://doi.org/10.7554/elife.99997.2,https://elifesciences.org/reviewed-preprints/9...,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.1101/2024.06.13.598625;10.7554/elife.99997.1,,,,2025-01-06,2025-01-06,,2025-01-06,2025-05-23,2025-11-23,2025-01-06,,,2025.0,child,https://elifesciences.org/reviewed-preprints/9...,10.7554/elife.99997.2,,,,,,
9357562,crossref::10.7554/elife.99997.2.sa0,eLife,crossref,10.7554/elife.99997.2.sa0,https://doi.org/10.7554/elife.99997.2.sa0,https://elifesciences.org/reviewed-preprints/9...,,"{""is-review-of"": [{""asserted-by"": ""subject"", ""...",,,,,,,2025-01-06,,2025-01-06,2025-05-23,2025-05-24,2025-01-06,,,2025.0,review,https://elifesciences.org/reviewed-preprints/9...,10.7554/elife.99997.2.sa0,,,,,,
9357558,crossref::10.7554/elife.99997.2.sa1,eLife,crossref,10.7554/elife.99997.2.sa1,https://doi.org/10.7554/elife.99997.2.sa1,https://elifesciences.org/reviewed-preprints/9...,,"{""is-review-of"": [{""asserted-by"": ""subject"", ""...",,,,,,,2025-01-06,,2025-01-06,2025-05-23,2025-05-24,2025-01-06,,,2025.0,review,https://elifesciences.org/reviewed-preprints/9...,10.7554/elife.99997.2.sa1,,,,,,
9357554,crossref::10.7554/elife.99997.2.sa2,eLife,crossref,10.7554/elife.99997.2.sa2,https://doi.org/10.7554/elife.99997.2.sa2,https://elifesciences.org/reviewed-preprints/9...,,"{""is-review-of"": [{""asserted-by"": ""subject"", ""...",,,,,,,2025-01-06,,2025-01-06,2025-05-23,2025-05-24,2025-01-06,,,2025.0,review,https://elifesciences.org/reviewed-preprints/9...,10.7554/elife.99997.2.sa2,,,,,,


In [38]:
df_remain[df_remain['server_name']=='ChemRxiv']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
286137,crossref::10.26434/chemrxiv-2021-00kkd,ChemRxiv,crossref,10.26434/chemrxiv-2021-00kkd,https://doi.org/10.26434/chemrxiv-2021-00kkd,https://chemrxiv.org/engage/chemrxiv/article-d...,,,,,,,,2021-12-23,2021-12-23,,2021-12-23,2021-12-23,2025-11-23,2021-12-23,,,2021.0,,https://chemrxiv.org/engage/chemrxiv/article-d...,10.26434/chemrxiv-2021-00kkd,,,,,,
312423,crossref::10.26434/chemrxiv-2021-00rj4,ChemRxiv,crossref,10.26434/chemrxiv-2021-00rj4,https://doi.org/10.26434/chemrxiv-2021-00rj4,https://chemrxiv.org/engage/chemrxiv/article-d...,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,,10.1149/1945-7111/ac3ff4,,2021-10-05,2021-10-05,,2021-10-05,2025-04-16,2025-04-16,2022-01-25,,,2021.0,,https://chemrxiv.org/engage/chemrxiv/article-d...,10.26434/chemrxiv-2021-00rj4,,,,,,
312367,crossref::10.26434/chemrxiv-2021-012c7,ChemRxiv,crossref,10.26434/chemrxiv-2021-012c7,https://doi.org/10.26434/chemrxiv-2021-012c7,https://chemrxiv.org/engage/chemrxiv/article-d...,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,,10.1016/j.msec.2022.112697,,2021-10-29,2021-10-29,,2021-10-29,2025-04-16,2025-04-16,2022-01-25,,,2021.0,,https://chemrxiv.org/engage/chemrxiv/article-d...,10.26434/chemrxiv-2021-012c7,,,,,,
286254,crossref::10.26434/chemrxiv-2021-013gn,ChemRxiv,crossref,10.26434/chemrxiv-2021-013gn,https://doi.org/10.26434/chemrxiv-2021-013gn,https://chemrxiv.org/engage/chemrxiv/article-d...,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.26434/chemrxiv-2021-013gn-v2,,,,2021-07-15,2021-07-15,,2021-07-15,2022-01-25,2025-11-23,2022-01-25,,,2021.0,,https://chemrxiv.org/engage/chemrxiv/article-d...,10.26434/chemrxiv-2021-013gn,,,,,,
286132,crossref::10.26434/chemrxiv-2021-01dfq,ChemRxiv,crossref,10.26434/chemrxiv-2021-01dfq,https://doi.org/10.26434/chemrxiv-2021-01dfq,https://chemrxiv.org/engage/chemrxiv/article-d...,,,,,,,,2021-12-23,2021-12-23,,2021-12-23,2021-12-23,2025-11-23,2021-12-23,,,2021.0,,https://chemrxiv.org/engage/chemrxiv/article-d...,10.26434/chemrxiv-2021-01dfq,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
285987,crossref::10.26434/chemrxiv.5917351,ChemRxiv,crossref,10.26434/chemrxiv.5917351,https://doi.org/10.26434/chemrxiv.5917351,https://chemrxiv.org/articles/CO2_Activation_o...,,,,,,,,2018-01-01,2018-01-01,,2018-01-01,2021-11-17,2022-04-01,2018-02-23,,,2018.0,,https://chemrxiv.org/articles/co2_activation_o...,10.26434/chemrxiv.5917351,,,,,,
287405,crossref::10.26434/chemrxiv.6483989,ChemRxiv,crossref,10.26434/chemrxiv.6483989,https://doi.org/10.26434/chemrxiv.6483989,https://chemrxiv.org/engage/chemrxiv/article-d...,,,,,,,,2018-06-12,2018-06-12,,2018-06-12,2022-02-15,2025-05-14,2018-06-12,,,2018.0,,https://chemrxiv.org/engage/chemrxiv/article-d...,10.26434/chemrxiv.6483989,,,,,,
287403,crossref::10.26434/chemrxiv.8021102,ChemRxiv,crossref,10.26434/chemrxiv.8021102,https://doi.org/10.26434/chemrxiv.8021102,https://chemrxiv.org/engage/chemrxiv/article-d...,,,,,,,,2019-04-23,2019-04-23,,2019-04-23,2022-02-15,2025-05-14,2019-04-23,,,2019.0,,https://chemrxiv.org/engage/chemrxiv/article-d...,10.26434/chemrxiv.8021102,,,,,,
287404,crossref::10.26434/chemrxiv.8378342,ChemRxiv,crossref,10.26434/chemrxiv.8378342,https://doi.org/10.26434/chemrxiv.8378342,https://chemrxiv.org/engage/chemrxiv/article-d...,,,,,,,,2019-07-01,2019-07-01,,2019-07-01,2022-02-15,2025-05-14,2019-07-01,,,2019.0,,https://chemrxiv.org/engage/chemrxiv/article-d...,10.26434/chemrxiv.8378342,,,,,,


In [39]:
pattern = "v"

mask = df_remain[df_remain['server_name']=='ChemRxiv']['doi'].str.contains(pattern, regex=False, na=False)
result = df_remain[df_remain['server_name']=='ChemRxiv'][mask]
result

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
286137,crossref::10.26434/chemrxiv-2021-00kkd,ChemRxiv,crossref,10.26434/chemrxiv-2021-00kkd,https://doi.org/10.26434/chemrxiv-2021-00kkd,https://chemrxiv.org/engage/chemrxiv/article-d...,,,,,,,,2021-12-23,2021-12-23,,2021-12-23,2021-12-23,2025-11-23,2021-12-23,,,2021.0,,https://chemrxiv.org/engage/chemrxiv/article-d...,10.26434/chemrxiv-2021-00kkd,,,,,,
312423,crossref::10.26434/chemrxiv-2021-00rj4,ChemRxiv,crossref,10.26434/chemrxiv-2021-00rj4,https://doi.org/10.26434/chemrxiv-2021-00rj4,https://chemrxiv.org/engage/chemrxiv/article-d...,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,,10.1149/1945-7111/ac3ff4,,2021-10-05,2021-10-05,,2021-10-05,2025-04-16,2025-04-16,2022-01-25,,,2021.0,,https://chemrxiv.org/engage/chemrxiv/article-d...,10.26434/chemrxiv-2021-00rj4,,,,,,
312367,crossref::10.26434/chemrxiv-2021-012c7,ChemRxiv,crossref,10.26434/chemrxiv-2021-012c7,https://doi.org/10.26434/chemrxiv-2021-012c7,https://chemrxiv.org/engage/chemrxiv/article-d...,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,,10.1016/j.msec.2022.112697,,2021-10-29,2021-10-29,,2021-10-29,2025-04-16,2025-04-16,2022-01-25,,,2021.0,,https://chemrxiv.org/engage/chemrxiv/article-d...,10.26434/chemrxiv-2021-012c7,,,,,,
286254,crossref::10.26434/chemrxiv-2021-013gn,ChemRxiv,crossref,10.26434/chemrxiv-2021-013gn,https://doi.org/10.26434/chemrxiv-2021-013gn,https://chemrxiv.org/engage/chemrxiv/article-d...,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.26434/chemrxiv-2021-013gn-v2,,,,2021-07-15,2021-07-15,,2021-07-15,2022-01-25,2025-11-23,2022-01-25,,,2021.0,,https://chemrxiv.org/engage/chemrxiv/article-d...,10.26434/chemrxiv-2021-013gn,,,,,,
286132,crossref::10.26434/chemrxiv-2021-01dfq,ChemRxiv,crossref,10.26434/chemrxiv-2021-01dfq,https://doi.org/10.26434/chemrxiv-2021-01dfq,https://chemrxiv.org/engage/chemrxiv/article-d...,,,,,,,,2021-12-23,2021-12-23,,2021-12-23,2021-12-23,2025-11-23,2021-12-23,,,2021.0,,https://chemrxiv.org/engage/chemrxiv/article-d...,10.26434/chemrxiv-2021-01dfq,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
285987,crossref::10.26434/chemrxiv.5917351,ChemRxiv,crossref,10.26434/chemrxiv.5917351,https://doi.org/10.26434/chemrxiv.5917351,https://chemrxiv.org/articles/CO2_Activation_o...,,,,,,,,2018-01-01,2018-01-01,,2018-01-01,2021-11-17,2022-04-01,2018-02-23,,,2018.0,,https://chemrxiv.org/articles/co2_activation_o...,10.26434/chemrxiv.5917351,,,,,,
287405,crossref::10.26434/chemrxiv.6483989,ChemRxiv,crossref,10.26434/chemrxiv.6483989,https://doi.org/10.26434/chemrxiv.6483989,https://chemrxiv.org/engage/chemrxiv/article-d...,,,,,,,,2018-06-12,2018-06-12,,2018-06-12,2022-02-15,2025-05-14,2018-06-12,,,2018.0,,https://chemrxiv.org/engage/chemrxiv/article-d...,10.26434/chemrxiv.6483989,,,,,,
287403,crossref::10.26434/chemrxiv.8021102,ChemRxiv,crossref,10.26434/chemrxiv.8021102,https://doi.org/10.26434/chemrxiv.8021102,https://chemrxiv.org/engage/chemrxiv/article-d...,,,,,,,,2019-04-23,2019-04-23,,2019-04-23,2022-02-15,2025-05-14,2019-04-23,,,2019.0,,https://chemrxiv.org/engage/chemrxiv/article-d...,10.26434/chemrxiv.8021102,,,,,,
287404,crossref::10.26434/chemrxiv.8378342,ChemRxiv,crossref,10.26434/chemrxiv.8378342,https://doi.org/10.26434/chemrxiv.8378342,https://chemrxiv.org/engage/chemrxiv/article-d...,,,,,,,,2019-07-01,2019-07-01,,2019-07-01,2022-02-15,2025-05-14,2019-07-01,,,2019.0,,https://chemrxiv.org/engage/chemrxiv/article-d...,10.26434/chemrxiv.8378342,,,,,,


In [40]:
df = df.copy()

# Work only on remaining (not already forced to parent)
mask_remain = df["records_hierarchy"] != "parent"

# Normalize version_label once
server_name = df["server_name"].astype(str).str.strip()#.str.lower()

# CHILD rules
child_mask = (
    mask_remain &
    server_name.isin(["ChemRxiv"])
)
df.loc[child_mask, "records_hierarchy"] = "parent"

# # correction rules
# correction_mask = (
#     mask_remain &
#     vl.isin(["correction"])
# )
# df.loc[correction_mask, "records_hierarchy"] = "correction"

print(df["records_hierarchy"].value_counts(dropna=False))

records_hierarchy
parent        7979191
part_of        148204
review         119592
child           84794
NaN             77657
correction        354
comment           302
Name: count, dtype: int64


In [41]:
df_remain = df[df['records_hierarchy'].isna()]
df_remain['server_name'].value_counts()

server_name
Earth and Space Science Open Archive    13220
EGUsphere                               10209
TechRxiv                                 8320
PsyArXiv                                 6632
PeerJ Preprints                          6446
EarthArXiv                               6425
Authorea Inc.                            6092
engrXiv                                  4925
SocArXiv                                 3072
INA-Rxiv                                 2830
Cambridge Open Engage                    2176
Advance                                  1717
Thesis Commons                           1507
APSA Preprints                           1176
AgriRxiv                                  787
EdArXiv                                   550
Law Archive                               287
Arabixiv                                  209
MetaArXiv                                 196
ScienceOpen Preprints                     151
LIS Scholarship Archive                    85
PoolText              

In [42]:
import re
import numpy as np

# Ensure column exists
if 'records_hierarchy' not in df.columns:
    df['records_hierarchy'] = pd.NA

# Work only on remaining (unlabeled)
remain = df['records_hierarchy'].isna()

# ----------------------------
# 1) OSF-based servers: parent if DOI is exactly osf.io/<5chars>
#    Examples:
#      10.31234/osf.io/zypk9  -> parent
#      10.31234/osf.io/zypk9_v1  -> NOT parent by this rule
# ----------------------------
OSF_SERVERS = {
    'PsyArXiv',
    'Thesis Commons',
    'SocArXiv',
    'OSF Preprints',
    'Open Science Framework',
    'MindRxiv',
    'MetaArXiv',
    'SportRxiv',
    'LawArXiv',
    'EarthArXiv',
    'EngrXiv',
    'MarXiv',
    'INA-Rxiv',
    'AfricArXiv',
    'AgriXiv',
    'Arabixiv',
    # add/remove servers you want to include
}

# matches "...osf.io/ABCDE" at end of DOI string
OSF_PARENT_RX = re.compile(r'osf\.io/[a-z0-9]{5}$', re.IGNORECASE)

mask_osf_parent = (
    remain
    # & df['server_name'].isin(OSF_SERVERS)
    & df['doi'].astype(str).str.lower().str.contains('osf.io/', na=False)
    & df['doi'].astype(str).str.match(r'.*osf\.io/[a-z0-9]{5}$', na=False)
)

df.loc[mask_osf_parent, 'records_hierarchy'] = 'parent'


# ----------------------------
# 2) TechRxiv: parent if DOI is exactly "techrxiv.<digits>"
#    Examples:
#      10.36227/techrxiv.24750039 -> parent
#      10.36227/techrxiv.24750039.v1 -> NOT parent by this rule
# ----------------------------
TECHRXIV_PARENT_RX = re.compile(r'techrxiv\.\d+$', re.IGNORECASE)

mask_techrxiv_parent = (
    df['records_hierarchy'].isna()
    & (df['server_name'] == 'TechRxiv')
    & df['doi'].astype(str).str.match(r'.*techrxiv\.\d+$', na=False)
)

df.loc[mask_techrxiv_parent, 'records_hierarchy'] = 'parent'


# ----------------------------
# Optional: if you want to label obvious "child" for OSF when suffix exists
# (only do this if you are confident the suffix means versioning)
# Example: osf.io/xxxxx_v2 or osf.io/xxxxx-v2 or osf.io/xxxxx.v2
# ----------------------------
OSF_CHILD_RX = re.compile(r'osf\.io/[a-z0-9]{5}([._-]?v\d+)$', re.IGNORECASE)

mask_osf_child = (
    df['records_hierarchy'].isna()
    & df['server_name'].isin(OSF_SERVERS)
    & df['doi'].astype(str).str.match(r'.*osf\.io/[a-z0-9]{5}([._-]?v\d+)$', na=False)
)

df.loc[mask_osf_child, 'records_hierarchy'] = 'child'


# ----------------------------
# Optional: TechRxiv child rule if you see explicit versioning later
# Example: techrxiv.24750039.v2 or techrxiv.24750039-v2
# ----------------------------
mask_techrxiv_child = (
    df['records_hierarchy'].isna()
    & (df['server_name'] == 'TechRxiv')
    & df['doi'].astype(str).str.match(r'.*techrxiv\.\d+([._-]?v\d+)$', na=False)
)

df.loc[mask_techrxiv_child, 'records_hierarchy'] = 'child'


# Quick check
print(df['records_hierarchy'].value_counts(dropna=False))


records_hierarchy
parent        8007687
part_of        148204
review         119592
child           84794
NaN             49161
correction        354
comment           302
Name: count, dtype: int64


In [43]:
df_remain = df[df['records_hierarchy'].isna()]
df_remain['server_name'].value_counts()

server_name
Earth and Space Science Open Archive    13220
EGUsphere                               10209
PeerJ Preprints                          6446
Authorea Inc.                            6092
EarthArXiv                               4693
engrXiv                                  2733
Cambridge Open Engage                    2176
Advance                                  1717
APSA Preprints                           1176
AgriRxiv                                  380
ScienceOpen Preprints                     151
PoolText                                   79
Oroboros Instruments                       70
F1000Research                              14
Open Research Africa                        1
AMRC Open Research                          1
MNI Open Research                           1
Gates Open Research                         1
eLife                                       1
Name: count, dtype: int64

In [44]:
import re
import pandas as pd

# Ensure target column exists
# if 'records_hierarchy' not in df.columns:
#     df['records_hierarchy'] = pd.NA

remain = df['records_hierarchy'].isna()

doi_s = df['doi'].astype(str).str.strip().str.lower()

# -------------------------
# EarthArXiv: 10.15697/<token>
# Examples: 10.15697/fk20022, 10.15697/fk2v59g
# -------------------------
mask_eartharxiv_parent = (
    remain
    & (df['server_name'] == 'EarthArXiv')
    & doi_s.str.match(r'^10\.15697/[a-z0-9]+$', na=False)
)
df.loc[mask_eartharxiv_parent, 'records_hierarchy'] = 'parent'

# -------------------------
# Authorea: 10.22541/au.<digits>.<digits>
# Examples: 10.22541/au.148928287.78038962
# -------------------------
mask_authorea_parent = (
    df['records_hierarchy'].isna()
    & (df['server_name'] == 'Authorea Inc.')
    & doi_s.str.match(r'^10\.22541/au\.\d+\.\d+$', na=False)
)
df.loc[mask_authorea_parent, 'records_hierarchy'] = 'parent'

# -------------------------
# The Winnower: 10.15200/winn.<digits>.<digits>
# Examples: 10.15200/winn.143644.45920
# -------------------------
mask_winn_parent = (
    df['records_hierarchy'].isna()
    & doi_s.str.match(r'^10\.15200/winn\.\d+\.\d+$', na=False)
)
df.loc[mask_winn_parent, 'records_hierarchy'] = 'parent'

# -------------------------
# engrXiv: 10.31224/<digits>
# Examples: 10.31224/2109
# -------------------------
mask_engrxiv_parent = (
    df['records_hierarchy'].isna()
    & (df['server_name'] == 'engrXiv')
    & doi_s.str.match(r'^10\.31224/\d+$', na=False)
)
df.loc[mask_engrxiv_parent, 'records_hierarchy'] = 'parent'

# -------------------------
# Cambridge Open Engage: 10.33774/coe-<year or token>-<token>
# Examples: 10.33774/coe-2025-zz7tp, 10.33774/coe-2020-03prm, 10.33774/coe-xxxx-xxxxx
# -------------------------
mask_coe_parent = (
    df['records_hierarchy'].isna()
    & (df['server_name'] == 'Cambridge Open Engage')
    & doi_s.str.match(r'^10\.33774/coe-[a-z0-9]{4}-[a-z0-9]{4,}$', na=False)
)
df.loc[mask_coe_parent, 'records_hierarchy'] = 'parent'

# Optional quick check
print(df['records_hierarchy'].value_counts(dropna=False))


records_hierarchy
parent        8018690
part_of        148204
review         119592
child           84794
NaN             38158
correction        354
comment           302
Name: count, dtype: int64


In [45]:
import pandas as pd

# Ensure column exists
# if 'records_hierarchy' not in df.columns:
#     df['records_hierarchy'] = pd.NA

remain = df['records_hierarchy'].isna()
doi_s = df['doi'].astype(str).str.strip().str.lower()

# -------------------------
# Advance: 10.31124/advance.<digits>
# Examples: 10.31124/advance.10005662, 10.31124/advance.9978302
# -------------------------
mask_advance_parent = (
    remain
    & (df['server_name'] == 'Advance')
    & doi_s.str.match(r'^10\.31124/advance\.\d+$', na=False)
)
df.loc[mask_advance_parent, 'records_hierarchy'] = 'parent'

# -------------------------
# APSA Preprints: 10.33774/apsa-<yyyy>-<token>
# Examples: 10.33774/apsa-2025-zqggn, 10.33774/apsa-2019-0b2dl
# -------------------------
mask_apsa_parent = (
    df['records_hierarchy'].isna()
    & (df['server_name'] == 'APSA Preprints')
    & doi_s.str.match(r'^10\.33774/apsa-\d{4}-[a-z0-9]+$', na=False)
)
df.loc[mask_apsa_parent, 'records_hierarchy'] = 'parent'

# -------------------------
# AgriRxiv: 10.31220/agrirxiv.<yyyy>.<5digits>
# Examples: 10.31220/agrirxiv.2020.00001, 10.31220/agrirxiv.2025.00384
# -------------------------
mask_agrirxiv_parent = (
    df['records_hierarchy'].isna()
    & (df['server_name'] == 'AgriRxiv')
    & doi_s.str.match(r'^10\.31220/agrirxiv\.\d{4}\.\d{5}$', na=False)
)
df.loc[mask_agrirxiv_parent, 'records_hierarchy'] = 'parent'

# -------------------------
# EGUsphere (and EGUsphere-style journal DOIs): 10.5194/<slug>-<yyyy>-<number>
# Examples:
#   10.5194/egusphere-2022-1
#   10.5194/egusphere-2022-1000
#   10.5194/amt-2022-295
#   10.5194/hess-2024-3989
# -------------------------
mask_egusphere_parent = (
    df['records_hierarchy'].isna()
    & (df['server_name'] == 'EGUsphere')
    & doi_s.str.match(r'^10\.5194/[a-z0-9]+-\d{4}-\d+$', na=False)
)
df.loc[mask_egusphere_parent, 'records_hierarchy'] = 'parent'

# Optional quick check
print(df['records_hierarchy'].value_counts(dropna=False))

records_hierarchy
parent        8032172
part_of        148204
review         119592
child           84794
NaN             24676
correction        354
comment           302
Name: count, dtype: int64


In [46]:
import pandas as pd

# Ensure column exists
# if 'records_hierarchy' not in df.columns:
#     df['records_hierarchy'] = pd.NA

remain = df['records_hierarchy'].isna()
doi_s = df['doi'].astype(str).str.strip().str.lower()

# ------------------------------------------------------------
# Oroboros Instruments (examples)
# - 10.26124/bec.2024-0007
# - 10.26124/mitofit:ea19.mipschool.0005
# Parent if EXACT pattern with nothing after.
# ------------------------------------------------------------
mask_oroboros_parent = (
    remain
    & (df['server_name'] == 'Oroboros Instruments')
    & doi_s.str.match(
        r'^10\.26124/('
        r'bec\.\d{4}-\d{4}'                       # bec.2024-0007
        r'|mitofit:[a-z0-9]+\.[a-z0-9]+\.\d{4}'   # mitofit:ea19.mipschool.0005
        r')$',
        na=False
    )
)
df.loc[mask_oroboros_parent, 'records_hierarchy'] = 'parent'

# ------------------------------------------------------------
# Oroboros Instruments — Parent DOI patterns
# Examples (parents):
# - 10.26124/bec.2025-0005ar
# - 10.26124/bec.2025-0005it
# - 10.26124/bec:2024-0001
# - 10.26124/becprep.2025-0006.ed2
# - 10.26124/becprep.2025-0005
# - 10.26124/mitofit:190001
# - 10.26124/mitofit:2021-0005
# ------------------------------------------------------------
OROBOROS_PARENT_RX = r'^10\.26124/(' \
    r'bec[.:]\d{4}-\d{4}[a-z]{0,3}' \
    r'|' \
    r'becprep\.\d{4}-\d{4}(?:\.[a-z0-9]{1,6})?' \
    r'|' \
    r'mitofit:\d{6}' \
    r'|' \
    r'mitofit:\d{4}-\d{4}' \
    r')$'

mask_oroboros_parent = (
    remain
    & (df['server_name'] == 'Oroboros Instruments')
    & doi_s.str.match(OROBOROS_PARENT_RX, na=False)
)

df.loc[mask_oroboros_parent, 'records_hierarchy'] = 'parent'

# ------------------------------------------------------------
# PoolText (examples)
# - 10.31923/5547-4288-0095
# - 10.31923/pooltext-preprint-0067-3907-0053
# Parent if EXACT pattern with nothing after.
# ------------------------------------------------------------
mask_pooltext_parent = (
    df['records_hierarchy'].isna()
    & (df['server_name'] == 'PoolText')
    & doi_s.str.match(
        r'^10\.31923/('
        r'\d{4}-\d{4}-\d{4}'                                  # 5547-4288-0095
        r'|pooltext-preprint-\d{4}-\d{4}-\d{4}'               # pooltext-preprint-0067-3907-0053
        r')$',
        na=False
    )
)
df.loc[mask_pooltext_parent, 'records_hierarchy'] = 'parent'


# ------------------------------------------------------------
# ScienceOpen Preprints — Parent DOI patterns (STRICT / exact)
# Parent if EXACT pattern with nothing after.
# ------------------------------------------------------------

SCIENCEOPEN_PARENT_RX = (
    r'^10\.14293/('
    # 1) Old pattern like: s2199-1006.1.sor-.sdg.01
    r's2199-1006\.1\.sor-\.[a-z0-9]+\.\d{2}'
    r'|'
    # 2) New s2199-r2om patterns:
    #    - s2199-r2om-0001
    #    - s2199-r2om-abs-0003
    r's2199-r2om-(?:abs-)?\d{4}'
    r'|'
    # 3) New "rexpo" compact pattern:
    #    - s2199-rexpo22011v1
    r's2199-rexpo\d{5}v\d+'
    r'|'
    # 4) New ssp-am patterns:
    #    - s2199-ssp-am22-0001
    #    - s2199-ssp-am23-01001
    #    - s2199-ssp-am25-01015
    r's2199-ssp-am\d{2}-\d{4,5}'
    r'|'
    # 5) sblunisa patterns:
    #    - sblunisa.2023a024.mm (old)
    #    - sblunisa.2023a002.vnm (new)
    #    - sblunisa.2023a017.ojjt (new)
    r'sblunisa\.\d{4}a\d{3}\.[a-z0-9]{2,4}'
    r')$'
)

mask_scienceopen_parent = (
    df['records_hierarchy'].isna()
    & (df['server_name'] == 'ScienceOpen Preprints')
    & doi_s.str.match(SCIENCEOPEN_PARENT_RX, na=False)
)

df.loc[mask_scienceopen_parent, 'records_hierarchy'] = 'parent'


# ------------------------------------------------------------
# EarthArXiv
# Examples:
# - 10.31223/x50025  (5)
# - 10.31223/x5003j  (6)
# - 10.31223/x5zr0p  (6)
# Parent if EXACT pattern with nothing after.
# ------------------------------------------------------------
mask_eartharxiv_parent = (
    df['records_hierarchy'].isna()
    & (df['server_name'] == 'EarthArXiv')
    & doi_s.str.match(r'^10\.31223/[a-z0-9]{5,6}$', na=False)   # 5–6 chars after slash
)
df.loc[mask_eartharxiv_parent, 'records_hierarchy'] = 'parent'


# Optional quick check
print(df['records_hierarchy'].value_counts(dropna=False))

records_hierarchy
parent        8037163
part_of        148204
review         119592
child           84794
NaN             19685
correction        354
comment           302
Name: count, dtype: int64


In [47]:
import re

# ------------------------------------------------------------
# PeerJ Preprints
# Parent:
#  - no trailing vN: 10.7287/peerj.preprints.1001
#  - trailing v1:    10.7287/peerj.preprints.1001v1
# Child:
#  - trailing v2+:   10.7287/peerj.preprints.1001v2, v3, ...
# ------------------------------------------------------------

doi_s = df['doi'].astype(str).str.strip().str.lower()

mask_peerj = (
    df['records_hierarchy'].isna()
    & (df['server_name'] == 'PeerJ Preprints')
    & doi_s.str.startswith('10.7287/peerj.preprints.', na=False)
)

# Extract trailing version number if present (v1, v2, ...)
peerj_v_str = doi_s.where(mask_peerj).str.extract(r'v(?P<v>\d+)$')['v']

# Convert safely to numeric (NaN stays NaN)
peerj_v_num = pd.to_numeric(peerj_v_str, errors='coerce')

# Parent: no version suffix OR v1
mask_peerj_parent = mask_peerj & (peerj_v_num.isna() | (peerj_v_num == 1))
df.loc[mask_peerj_parent, 'records_hierarchy'] = 'parent'

# Child: v2+
mask_peerj_child = mask_peerj & (peerj_v_num >= 2)
df.loc[mask_peerj_child, 'records_hierarchy'] = 'child'

# Optional quick check
print(df['records_hierarchy'].value_counts(dropna=False))

records_hierarchy
parent        8042234
part_of        148204
review         119592
child           86169
NaN             13239
correction        354
comment           302
Name: count, dtype: int64


In [48]:
import pandas as pd

# -------------------------------------------------------------------
# Normalize DOI strings:
# - ensure everything is string
# - remove leading/trailing spaces
# - lowercase for consistent matching
# -------------------------------------------------------------------
doi_s = df['doi'].astype(str).str.strip().str.lower()

# -------------------------------------------------------------------
# Identify records to process:
# - records_hierarchy not yet assigned
# - server is Earth and Space Science Open Archive (ESSOAr)
# -------------------------------------------------------------------
mask_essoar = (
    df['records_hierarchy'].isna()
    & (df['server_name'] == 'Earth and Space Science Open Archive')
)

# -------------------------------------------------------------------
# Extract the final numeric version from the DOI
#
# Examples:
#   10.1002/essoar.10500074.1  → n = 1
#   10.1002/essoar.10500061.2  → n = 2
#   10.22541/essoar.xxx.xxx.3  → n = 3
#
# Regex explanation:
#   \.        → literal dot
#   (?P<n>)  → capture group named "n"
#   \d+      → one or more digits
#   $        → end of string (must be the last segment)
# -------------------------------------------------------------------
essoar_n_str = (
    doi_s
    .where(mask_essoar)                # only evaluate ESSOAr rows
    .str.extract(r'\.(?P<n>\d+)$')['n']  # extract trailing version number
)

# Convert extracted version to numeric:
# - invalid or missing values become NaN (safe for comparisons)
essoar_n = pd.to_numeric(essoar_n_str, errors='coerce')

# -------------------------------------------------------------------
# Label parent records:
# - ESSOAr records
# - version suffix == ".1"
# -------------------------------------------------------------------
mask_essoar_parent = mask_essoar & (essoar_n == 1)
df.loc[mask_essoar_parent, 'records_hierarchy'] = 'parent'

# -------------------------------------------------------------------
# Label child records:
# - ESSOAr records
# - version suffix >= ".2"
# -------------------------------------------------------------------
mask_essoar_child = mask_essoar & (essoar_n >= 2)
df.loc[mask_essoar_child, 'records_hierarchy'] = 'child'

# -------------------------------------------------------------------
# Optional sanity check:
# Show distribution of hierarchy labels
# -------------------------------------------------------------------
print(df['records_hierarchy'].value_counts(dropna=False))


records_hierarchy
parent        8053322
part_of        148204
review         119592
child           88301
correction        354
comment           302
NaN                19
Name: count, dtype: int64


In [49]:
df.loc[df['server_name'].eq('Earth and Space Science Open Archive'), 'records_hierarchy'].value_counts(dropna=False)

records_hierarchy
parent     20076
child       2850
part_of        6
Name: count, dtype: int64

In [50]:
df.loc[df['server_name'].eq('F1000Research'), 'records_hierarchy'].value_counts(dropna=False)


records_hierarchy
parent    11155
child      5704
NaN          14
Name: count, dtype: int64

In [51]:
df_remain = df[df['records_hierarchy'].isna()]
df_remain['server_name'].value_counts()

server_name
F1000Research           14
Open Research Africa     1
AMRC Open Research       1
Gates Open Research      1
MNI Open Research        1
eLife                    1
Name: count, dtype: int64

In [52]:
df_remain[df_remain['server_name']=='F1000Research']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
587381,crossref::10.12688/f1000research,F1000Research,crossref,10.12688/f1000research,https://doi.org/10.12688/f1000research,http://www.f1000research.com,,,,,,,,,,,,2025-07-24,2025-12-23,2025-07-24,,,,,http://www.f1000research.com,10.12688/f1000research,,,,,,
588652,crossref::10.12688/f1000research.11198.1,F1000Research,crossref,10.12688/f1000research.11198.1,https://doi.org/10.12688/f1000research.11198.1,https://f1000research.com/articles/6-1014,,,,,,,,,2017-06-28,2025-01-01,2017-06-28,2025-11-28,2025-11-28,2025-11-28,,,2017.0,,https://f1000research.com/articles/6-1014,10.12688/f1000research.11198.1,,,,,,
580538,crossref::10.12688/f1000research.124059.1,F1000Research,crossref,10.12688/f1000research.124059.1,https://doi.org/10.12688/f1000research.124059.1,https://f1000research.com/articles/11-1230/,,,,,,,,,2022-10-28,2022-10-28,2022-10-28,2022-10-28,2025-12-05,2022-10-28,,,2022.0,,https://f1000research.com/articles/11-1230/,10.12688/f1000research.124059.1,,,,,,
588651,crossref::10.12688/f1000research.14416.1,F1000Research,crossref,10.12688/f1000research.14416.1,https://doi.org/10.12688/f1000research.14416.1,https://f1000research.com/articles/7-1362,,,,,,,,,2018-08-30,2025-01-01,2018-08-30,2025-11-28,2025-11-28,2025-11-28,,,2018.0,,https://f1000research.com/articles/7-1362,10.12688/f1000research.14416.1,,,,,,
588733,crossref::10.12688/f1000research.163729.1,F1000Research,crossref,10.12688/f1000research.163729.1,https://doi.org/10.12688/f1000research.163729.1,https://f1000research.com/articles/14-656,,,,,,,,,2025-01-01,2025-01-01,2025-01-01,2025-12-01,2025-12-01,2025-12-01,,,2025.0,,https://f1000research.com/articles/14-656,10.12688/f1000research.163729.1,,,,,,
588738,crossref::10.12688/f1000research.166247.1,F1000Research,crossref,10.12688/f1000research.166247.1,https://doi.org/10.12688/f1000research.166247.1,https://f1000research.com/articles/14-598,,,,,,,,,2025-01-01,,2025-01-01,2025-12-01,2025-12-01,2025-12-01,,,2025.0,,https://f1000research.com/articles/14-598,10.12688/f1000research.166247.1,,,,,,
588744,crossref::10.12688/f1000research.168206.1,F1000Research,crossref,10.12688/f1000research.168206.1,https://doi.org/10.12688/f1000research.168206.1,https://f1000research.com/articles/14-975,,,,,,,,,2025-01-01,,2025-01-01,2025-12-01,2025-12-01,2025-12-01,,,2025.0,,https://f1000research.com/articles/14-975,10.12688/f1000research.168206.1,,,,,,
572211,crossref::10.3410/10.3410/f1000devtestarticley,F1000Research,crossref,10.3410/10.3410/f1000devtestarticley,https://doi.org/10.3410/10.3410/f1000devtestar...,http://www.xy.net/article,,,,,,,,,2012-06-11,2012-06-11,2012-06-11,2012-06-11,2022-03-29,2012-06-11,,,2012.0,,http://www.xy.net/article,10.3410/10.3410/f1000devtestarticley,,,,,,
572499,crossref::10.3410/123.430,F1000Research,crossref,10.3410/123.430,https://doi.org/10.3410/123.430,http://www.someURl.com,,,,,,,,,2009-12-01,2009-12-01,2009-12-01,2015-06-26,2022-03-31,2012-09-13,,,2009.0,,http://www.someurl.com,10.3410/123.430,,,,,,
572210,crossref::10.3410/f1000devtestarticlez,F1000Research,crossref,10.3410/f1000devtestarticlez,https://doi.org/10.3410/f1000devtestarticlez,http://researchdev.f1000.com/articles/F1000Dev...,,,,,,,,,2012-01-01,2012-01-01,2012-01-01,2012-06-07,2022-03-29,2012-06-07,,,2012.0,,http://researchdev.f1000.com/articles/f1000dev...,10.3410/f1000devtestarticlez,,,,,,


In [53]:
df_remain[df_remain['server_name']=='Open Research Africa']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
2121895,crossref::10.12688/aasopenres,Open Research Africa,crossref,10.12688/aasopenres,https://doi.org/10.12688/aasopenres,http://www.aasopenresearch.org,,,,,,,,,,,,2022-03-25,2025-09-19,2022-03-25,,,,,http://www.aasopenresearch.org,10.12688/aasopenres,,,,,,


In [54]:
df_remain[df_remain['server_name']=='AMRC Open Research']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
178,crossref::10.12688/amrcopenres,AMRC Open Research,crossref,10.12688/amrcopenres,https://doi.org/10.12688/amrcopenres,http://www.amrcopenresearch.org,,,,,,,,,,,,2022-11-18,2022-11-19,2019-02-19,,,,,http://www.amrcopenresearch.org,10.12688/amrcopenres,,,,,,


In [55]:
df_remain[df_remain['server_name']=='MNI Open Research']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
2013305,crossref::10.12688/mniopenres,MNI Open Research,crossref,10.12688/mniopenres,https://doi.org/10.12688/mniopenres,http://www.mniopenresearch.org,,,,,,,,,,,,2021-06-21,2022-09-22,2021-06-21,,,,,http://www.mniopenresearch.org,10.12688/mniopenres,,,,,,


In [56]:
df_remain[df_remain['server_name']=='Gates Open Research']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
590205,crossref::10.12688/gatesopenres,Gates Open Research,crossref,10.12688/gatesopenres,https://doi.org/10.12688/gatesopenres,http://www.gatesopenresearch.org,,,,,,,,,,,,2025-12-17,2025-12-17,2017-11-06,,,,,http://www.gatesopenresearch.org,10.12688/gatesopenres,,,,,,


In [57]:
df_remain[df_remain['server_name']=='eLife']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
9130897,crossref::10.7554/elife,eLife,crossref,10.7554/elife,https://doi.org/10.7554/elife,https://elifesciences.org/,,,,,,,,,,,,2017-07-25,2025-12-30,2017-07-25,,,,,https://elifesciences.org/,10.7554/elife,,,,,,


In [58]:
import pandas as pd

# Normalize DOI strings:
# - lower case
# - strip spaces
# - collapse accidental double slashes after the prefix (10.3410// -> 10.3410/)
doi_s = (
    df["doi"]
    .astype(str)
    .str.strip()
    .str.lower()
    .str.replace(r"^10\.3410//", "10.3410/", regex=True)
)

# ------------------------------------------------------------
# F1000Research
# We support TWO DOI "families" that exist in your data:
#
# A) 10.12688/f1000research[.<id>.<N>]
#    - root: 10.12688/f1000research            -> parent
#    - versioned: 10.12688/f1000research.11198.1 -> parent
#                 10.12688/f1000research.11198.2 -> child
#
# B) 10.3410/f1000research.<something>.vN
#    - 10.3410/f1000research.1-29.v1 -> parent
#    - 10.3410/f1000research.1-29.v2 -> child
#    - 10.3410/f1000research.2-72.v1 -> parent
# ------------------------------------------------------------
mask_f1000 = (
    df["records_hierarchy"].isna()
    & (df["server_name"] == "F1000Research")
)

# -------------------------
# Case A: 10.12688 root DOI
# -------------------------
mask_f1000_root = mask_f1000 & doi_s.eq("10.12688/f1000research")
df.loc[mask_f1000_root, "records_hierarchy"] = "parent"

# -----------------------------------------------
# Case A2: 10.12688/f1000research.<digits>.<N>
# -----------------------------------------------
f1000_v12688_str = (
    doi_s.where(mask_f1000)
         .str.extract(r"^10\.12688/f1000research\.\d+\.(?P<v>\d+)$")["v"]
)
f1000_v12688 = pd.to_numeric(f1000_v12688_str, errors="coerce")

mask_f1000_12688_parent = mask_f1000 & (f1000_v12688 == 1)
df.loc[mask_f1000_12688_parent, "records_hierarchy"] = "parent"

mask_f1000_12688_child = mask_f1000 & (f1000_v12688 >= 2)
df.loc[mask_f1000_12688_child, "records_hierarchy"] = "child"

# -----------------------------------------
# Case B: 10.3410/f1000research.<...>.vN
# - accept things like: 1-29, 2-72, 1-50, etc.
# - also accept possible extra dots inside the middle part
# -----------------------------------------
f1000_v3410_str = (
    doi_s.where(mask_f1000)
         .str.extract(r"^10\.3410/f1000research\.[a-z0-9.\-]+\.v(?P<v>\d+)$")["v"]
)
f1000_v3410 = pd.to_numeric(f1000_v3410_str, errors="coerce")

mask_f1000_3410_parent = mask_f1000 & (f1000_v3410 == 1)
df.loc[mask_f1000_3410_parent, "records_hierarchy"] = "parent"

mask_f1000_3410_child = mask_f1000 & (f1000_v3410 >= 2)
df.loc[mask_f1000_3410_child, "records_hierarchy"] = "child"

# -------------------------
# Optional: remaining -> others
# -------------------------
mask_f1000_left = mask_f1000 & df["records_hierarchy"].isna()
df.loc[mask_f1000_left, "records_hierarchy"] = "others"

# Quick check
print(df.loc[df["server_name"] == "F1000Research", "records_hierarchy"].value_counts(dropna=False))


records_hierarchy
parent    11162
child      5704
others        7
Name: count, dtype: int64


In [59]:
# Normalize DOI (same style you already use)
doi_s = df["doi"].astype(str).str.strip().str.lower()

# Only touch rows not yet labeled
mask_unlabeled = df["records_hierarchy"].isna()

# Root DOIs you want to classify as "others"
ROOT_OTHERS_DOIS = {
    "10.7554/elife",
    "10.12688/gatesopenres",
    "10.12688/mniopenres",
    "10.12688/amrcopenres",
    "10.12688/aasopenres",
}

mask_root_others = mask_unlabeled & doi_s.isin(ROOT_OTHERS_DOIS)
df.loc[mask_root_others, "records_hierarchy"] = "others"


In [60]:

# -------------------------------------------------------------------
# Optional sanity check:
# Show distribution of hierarchy labels
# -------------------------------------------------------------------
print(df['records_hierarchy'].value_counts(dropna=False))

records_hierarchy
parent        8053329
part_of        148204
review         119592
child           88301
correction        354
comment           302
others             12
Name: count, dtype: int64


In [61]:
pattern = "others"


mask = df['records_hierarchy'].str.contains(pattern, regex=False, na=False)
result = df[mask]
result

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
2121895,crossref::10.12688/aasopenres,Open Research Africa,crossref,10.12688/aasopenres,https://doi.org/10.12688/aasopenres,http://www.aasopenresearch.org,,,,,,,,,,,,2022-03-25,2025-09-19,2022-03-25,,,,others,http://www.aasopenresearch.org,10.12688/aasopenres,,,,,,
178,crossref::10.12688/amrcopenres,AMRC Open Research,crossref,10.12688/amrcopenres,https://doi.org/10.12688/amrcopenres,http://www.amrcopenresearch.org,,,,,,,,,,,,2022-11-18,2022-11-19,2019-02-19,,,,others,http://www.amrcopenresearch.org,10.12688/amrcopenres,,,,,,
590205,crossref::10.12688/gatesopenres,Gates Open Research,crossref,10.12688/gatesopenres,https://doi.org/10.12688/gatesopenres,http://www.gatesopenresearch.org,,,,,,,,,,,,2025-12-17,2025-12-17,2017-11-06,,,,others,http://www.gatesopenresearch.org,10.12688/gatesopenres,,,,,,
2013305,crossref::10.12688/mniopenres,MNI Open Research,crossref,10.12688/mniopenres,https://doi.org/10.12688/mniopenres,http://www.mniopenresearch.org,,,,,,,,,,,,2021-06-21,2022-09-22,2021-06-21,,,,others,http://www.mniopenresearch.org,10.12688/mniopenres,,,,,,
572211,crossref::10.3410/10.3410/f1000devtestarticley,F1000Research,crossref,10.3410/10.3410/f1000devtestarticley,https://doi.org/10.3410/10.3410/f1000devtestar...,http://www.xy.net/article,,,,,,,,,2012-06-11,2012-06-11,2012-06-11,2012-06-11,2022-03-29,2012-06-11,,,2012.0,others,http://www.xy.net/article,10.3410/10.3410/f1000devtestarticley,,,,,,
572499,crossref::10.3410/123.430,F1000Research,crossref,10.3410/123.430,https://doi.org/10.3410/123.430,http://www.someURl.com,,,,,,,,,2009-12-01,2009-12-01,2009-12-01,2015-06-26,2022-03-31,2012-09-13,,,2009.0,others,http://www.someurl.com,10.3410/123.430,,,,,,
572210,crossref::10.3410/f1000devtestarticlez,F1000Research,crossref,10.3410/f1000devtestarticlez,https://doi.org/10.3410/f1000devtestarticlez,http://researchdev.f1000.com/articles/F1000Dev...,,,,,,,,,2012-01-01,2012-01-01,2012-01-01,2012-06-07,2022-03-29,2012-06-07,,,2012.0,others,http://researchdev.f1000.com/articles/f1000dev...,10.3410/f1000devtestarticlez,,,,,,
572209,crossref::10.3410/f1000devtestcrossmarkpolicy,F1000Research,crossref,10.3410/f1000devtestcrossmarkpolicy,https://doi.org/10.3410/f1000devtestcrossmarkp...,resource,,,,,,,,,2012-01-01,2012-01-01,2012-01-01,2012-05-31,2022-04-03,2012-05-31,,,2012.0,others,resource,10.3410/f1000devtestcrossmarkpolicy,,,,,,
572212,crossref::10.3410/f1000res,F1000Research,crossref,10.3410/f1000res,https://doi.org/10.3410/f1000res,2046-1402,,,,,,,,,,,,2013-03-05,2022-03-31,2013-03-05,,,,others,2046-1402,10.3410/f1000res,,,,,,
572214,crossref::10.3410/f1000research,F1000Research,crossref,10.3410/f1000research,https://doi.org/10.3410/f1000research,http://www.f1000research.com/,,,,,,,,,,,,2013-05-09,2022-04-05,2013-05-09,,,,others,http://www.f1000research.com/,10.3410/f1000research,,,,,,


In [62]:
df_remain = df[df['records_hierarchy'].isna()]
df_remain['server_name'].value_counts()

Series([], Name: count, dtype: int64)

In [63]:
# df_remain[df_remain['server_name']=='AgriRxiv']['landing_page_url'][207416]

In [64]:
# pattern = "others"


# mask = df[df['server_name']=='F1000Research']['records_hierarchy'].str.contains(pattern, regex=False, na=False)
# result = df[df['server_name']=='F1000Research'][mask]
# result

In [65]:
# df_remain['landing_page_url'][286254]

In [66]:
# pattern = ".1"


# mask = df[df['server_name']=='eLife']['doi'].str.contains(pattern, regex=False, na=False)
# result = df[df['server_name']=='eLife'][mask]
# result

In [67]:
# result['landing_page_url'][9366338]

In [68]:
# result['landing_page_url'][9297794] 

In [69]:
# pattern = "10.7287/peerj.preprints.999"
# #video #media 

# mask = df['doi'].str.contains(pattern, regex=False, na=False)
# result = df[mask]
# result

In [70]:
# df_remain['relations_json'].value_counts()

In [71]:
# df_remain[df_remain['relations_json']=='{"is-preprint-of": [{"asserted-by": "subject", "id": "10.31237/osf.io/yr86k", "id-type": "doi"}]}']

## Check duplicates in secondary preprint repositories

In [72]:
df_mirror = df.copy()

In [73]:
df_touse = df_mirror[df_mirror['doi'].notna()].copy()
df_touse

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
429739,crossref::10.1002/essoar.10500000.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500000.1,https://doi.org/10.1002/essoar.10500000.1,https://essopenarchive.org/doi/full/10.1002/es...,,,,,,,,2018-05-26,2018-05-26,,2018-05-26,2022-12-05,2025-02-21,2019-11-13,,,2018.0,parent,https://essopenarchive.org/doi/full/10.1002/es...,10.1002/essoar.10500000.1,,,,,,
429742,crossref::10.1002/essoar.10500002.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500002.1,https://doi.org/10.1002/essoar.10500002.1,https://essopenarchive.org/doi/full/10.1002/es...,,,,,,,,2018-05-31,2018-05-31,,2018-05-31,2022-12-05,2025-02-21,2019-11-13,,,2018.0,parent,https://essopenarchive.org/doi/full/10.1002/es...,10.1002/essoar.10500002.1,,,,,,
429740,crossref::10.1002/essoar.10500004.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500004.1,https://doi.org/10.1002/essoar.10500004.1,https://essopenarchive.org/doi/full/10.1002/es...,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,,10.1029/2017sw001789;10.1029/2018sw001995,,2018-05-30,2018-05-30,,2018-05-30,2022-12-05,2025-02-21,2019-11-13,,,2018.0,parent,https://essopenarchive.org/doi/full/10.1002/es...,10.1002/essoar.10500004.1,,,,,,
429741,crossref::10.1002/essoar.10500007.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500007.1,https://doi.org/10.1002/essoar.10500007.1,https://essopenarchive.org/doi/full/10.1002/es...,,,,,,,,2018-08-15,2018-08-15,,2018-08-15,2022-12-05,2025-02-21,2019-11-13,,,2018.0,parent,https://essopenarchive.org/doi/full/10.1002/es...,10.1002/essoar.10500007.1,,,,,,
429746,crossref::10.1002/essoar.10500009.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500009.1,https://doi.org/10.1002/essoar.10500009.1,https://essopenarchive.org/doi/full/10.1002/es...,,,,,,,,2018-08-15,2018-08-15,,2018-08-15,2022-12-05,2025-09-29,2019-12-03,,,2018.0,parent,https://essopenarchive.org/doi/full/10.1002/es...,10.1002/essoar.10500009.1,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5667066,openalex::W999063011,Social Science Open Access Repository,openalex,10.15464/isi.42.2009.1-5,https://doi.org/10.15464/isi.42.2009.1-5,http://www.ssoar.info/ssoar/handle/document/21392,,,,,,,,,2009-01-01,,,,,2025-10-10T00:00:00,,2025-11-06T06:51:31.235846,2009.0,parent,http://www.ssoar.info/ssoar/handle/document/21392,10.15464/isi.42.2009.1-5,,,,,,
2245001,openalex::W99945665,Organic Eprints,openalex,10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,,,,,,,,,2014-10-01,,,,,2025-10-10T00:00:00,,2025-11-06T04:12:42.849631,2014.0,parent,https://doi.org/10.3220/rep_20_1_2014,10.3220/rep_20_1_2014,,,,,,
5667173,openalex::W999508398,Social Science Open Access Repository,openalex,10.15464/isi.38.2007.1-5,https://doi.org/10.15464/isi.38.2007.1-5,http://www.ssoar.info/ssoar/handle/document/21635,,,,,,,,,2007-01-01,,,,,2025-10-10T00:00:00,,2025-11-06T06:51:31.235846,2007.0,parent,http://www.ssoar.info/ssoar/handle/document/21635,10.15464/isi.38.2007.1-5,,,,,,
5673122,openalex::W999509321,Social Science Open Access Repository,openalex,10.23668/psycharchives.13440,https://doi.org/10.23668/psycharchives.13440,http://www.ssoar.info/ssoar/handle/document/3368,,,,,,,,,1997-01-01,,,,,2025-10-10T00:00:00,,2025-11-06T06:51:31.235846,1997.0,parent,http://www.ssoar.info/ssoar/handle/document/3368,10.23668/psycharchives.13440,,,,,,


In [74]:
dupes = df_touse[df_touse.duplicated(subset=['doi'], keep=False)]
dupes

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
8976641,crossref::10.1101/103937,bioRxiv,crossref,10.1101/103937,https://doi.org/10.1101/103937,http://biorxiv.org/lookup/doi/10.1101/103937,,,,,,,,2017-01-28,2017-01-28,,2017-01-28,2024-08-03,2025-02-21,2017-01-29,,,2017.0,parent,http://biorxiv.org/lookup/doi/10.1101/103937,10.1101/103937,,,,,,
9073833,crossref::10.1101/2019.12.23.887166,bioRxiv,crossref,10.1101/2019.12.23.887166,https://doi.org/10.1101/2019.12.23.887166,http://biorxiv.org/lookup/doi/10.1101/2019.12....,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,,10.3390/metabo11110749,,2019-12-23,2019-12-23,,2019-12-23,2025-06-03,2025-06-03,2019-12-24,,,2019.0,parent,http://biorxiv.org/lookup/doi/10.1101/2019.12....,10.1101/2019.12.23.887166,,,,,,
8986959,crossref::10.1101/2020.01.22.915215,bioRxiv,crossref,10.1101/2020.01.22.915215,https://doi.org/10.1101/2020.01.22.915215,http://biorxiv.org/lookup/doi/10.1101/2020.01....,,,,,,,,2020-01-23,2020-01-23,,2020-01-23,2024-08-04,2025-02-21,2020-01-23,,,2020.0,parent,http://biorxiv.org/lookup/doi/10.1101/2020.01....,10.1101/2020.01.22.915215,,,,,,
8981770,crossref::10.1101/2020.02.03.919597,bioRxiv,crossref,10.1101/2020.02.03.919597,https://doi.org/10.1101/2020.02.03.919597,http://biorxiv.org/lookup/doi/10.1101/2020.02....,,,,,,,,2020-02-03,2020-02-03,,2020-02-03,2024-08-04,2025-02-21,2020-02-04,,,2020.0,parent,http://biorxiv.org/lookup/doi/10.1101/2020.02....,10.1101/2020.02.03.919597,,,,,,
8891397,crossref::10.1101/2020.06.08.118984,bioRxiv,crossref,10.1101/2020.06.08.118984,https://doi.org/10.1101/2020.06.08.118984,http://biorxiv.org/lookup/doi/10.1101/2020.06....,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,,10.1038/s41467-020-20239-4,,2020-06-09,2020-06-09,,2020-06-09,2022-10-27,2025-02-21,2020-06-09,,,2020.0,parent,http://biorxiv.org/lookup/doi/10.1101/2020.06....,10.1101/2020.06.08.118984,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9486788,openalex::W983039221,viXra,openalex,10.13140/rg.2.2.29266.79041,https://doi.org/10.13140/rg.2.2.29266.79041,http://www.vixra.org/pdf/1503.0004v1.pdf,,,,,,,,,2020-01-01,,,,,2025-10-10T00:00:00,,2025-11-06T06:51:31.235846,2020.0,parent,http://www.vixra.org/pdf/1503.0004v1.pdf,10.13140/rg.2.2.29266.79041,,,,,,
2806374,openalex::W985302891,RePEc: Research Papers in Economics,openalex,10.22004/ag.econ.133088,https://doi.org/10.22004/ag.econ.133088,https://ageconsearch.umn.edu/record/133088/fil...,,,,,,,,,2006-01-01,,,,,2025-10-10T00:00:00,,2025-11-06T06:51:31.235846,2006.0,parent,https://ageconsearch.umn.edu/record/133088/fil...,10.22004/ag.econ.133088,,,,,,
2817765,openalex::W986721525,RePEc: Research Papers in Economics,openalex,10.22004/ag.econ.196816,https://doi.org/10.22004/ag.econ.196816,https://ageconsearch.umn.edu/record/196816/fil...,,,,,,,,,2015-01-01,,,,,2025-10-10T00:00:00,,2025-11-06T06:51:31.235846,2015.0,parent,https://ageconsearch.umn.edu/record/196816/fil...,10.22004/ag.econ.196816,,,,,,
9486875,openalex::W998010466,viXra,openalex,10.13140/rg.2.2.36331.69924,https://doi.org/10.13140/rg.2.2.36331.69924,https://vixra.org/pdf/1409.0185v1.pdf,,,,,,,,,2014-01-01,,,,,2025-10-10T00:00:00,,2025-11-06T06:51:31.235846,2014.0,parent,https://vixra.org/pdf/1409.0185v1.pdf,10.13140/rg.2.2.36331.69924,,,,,,


In [75]:
dupes[dupes['server_name']=='ResearchGate']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
3322080,datacite::10.13140/2.1.2328.2569,ResearchGate,datacite,10.13140/2.1.2328.2569,https://doi.org/10.13140/2.1.2328.2569,https://www.researchgate.net/doi/10.13140/2.1....,,[],"{""client"": {""data"": {""id"": ""rg.rg"", ""type"": ""c...",,,,,,,,,,,2014-08-19,2014-08-19,2025-10-02,2014.0,parent,https://www.researchgate.net/doi/10.13140/2.1....,10.13140/2.1.2328.2569,,,,,,
3322083,datacite::10.13140/2.1.2910.4001,ResearchGate,datacite,10.13140/2.1.2910.4001,https://doi.org/10.13140/2.1.2910.4001,https://www.researchgate.net/doi/10.13140/2.1....,,[],"{""client"": {""data"": {""id"": ""rg.rg"", ""type"": ""c...",,,,,,,,,,,2014-08-27,2014-08-27,2025-10-02,2014.0,parent,https://www.researchgate.net/doi/10.13140/2.1....,10.13140/2.1.2910.4001,,,,,,
3322151,datacite::10.13140/rg.2.1.1007.9848,ResearchGate,datacite,10.13140/rg.2.1.1007.9848,https://doi.org/10.13140/rg.2.1.1007.9848,https://www.researchgate.net/doi/10.13140/RG.2...,,[],"{""client"": {""data"": {""id"": ""rg.rg"", ""type"": ""c...",,,,,,,,,,,2015-07-20,2015-07-20,2025-10-08,2015.0,parent,https://www.researchgate.net/doi/10.13140/rg.2...,10.13140/rg.2.1.1007.9848,,,,,,
3322312,datacite::10.13140/rg.2.1.1103.1281,ResearchGate,datacite,10.13140/rg.2.1.1103.1281,https://doi.org/10.13140/rg.2.1.1103.1281,https://www.researchgate.net/doi/10.13140/RG.2...,,[],"{""client"": {""data"": {""id"": ""rg.rg"", ""type"": ""c...",,,,,,,,,,,2016-08-09,2016-08-09,2025-10-09,2016.0,parent,https://www.researchgate.net/doi/10.13140/rg.2...,10.13140/rg.2.1.1103.1281,,,,,,
3322200,datacite::10.13140/rg.2.1.1103.9445,ResearchGate,datacite,10.13140/rg.2.1.1103.9445,https://doi.org/10.13140/rg.2.1.1103.9445,https://www.researchgate.net/doi/10.13140/RG.2...,,[],"{""client"": {""data"": {""id"": ""rg.rg"", ""type"": ""c...",,,,,,,,,,,2015-10-13,2015-10-13,2025-10-08,2015.0,parent,https://www.researchgate.net/doi/10.13140/rg.2...,10.13140/rg.2.1.1103.9445,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3354289,datacite::10.13140/rg.2.2.36811.13607,ResearchGate,datacite,10.13140/rg.2.2.36811.13607,https://doi.org/10.13140/rg.2.2.36811.13607,https://www.researchgate.net/doi/10.13140/RG.2...,,[],"{""client"": {""data"": {""id"": ""rg.rg"", ""type"": ""c...",,,,,,,,,,,2019-12-13,2019-12-13,2025-10-17,2019.0,parent,https://www.researchgate.net/doi/10.13140/rg.2...,10.13140/rg.2.2.36811.13607,,,,,,
3332704,datacite::10.13140/rg.2.2.36825.57442,ResearchGate,datacite,10.13140/rg.2.2.36825.57442,https://doi.org/10.13140/rg.2.2.36825.57442,https://www.researchgate.net/doi/10.13140/RG.2...,,[],"{""client"": {""data"": {""id"": ""rg.rg"", ""type"": ""c...",,,,,,,,,,,2018-09-18,2018-09-18,2025-10-16,2018.0,parent,https://www.researchgate.net/doi/10.13140/rg.2...,10.13140/rg.2.2.36825.57442,,,,,,
3350078,datacite::10.13140/rg.2.2.36829.77284,ResearchGate,datacite,10.13140/rg.2.2.36829.77284,https://doi.org/10.13140/rg.2.2.36829.77284,https://www.researchgate.net/doi/10.13140/RG.2...,,[],"{""client"": {""data"": {""id"": ""rg.rg"", ""type"": ""c...",,,,,,,,,,,2019-09-20,2019-09-20,2025-10-17,2019.0,parent,https://www.researchgate.net/doi/10.13140/rg.2...,10.13140/rg.2.2.36829.77284,,,,,,
3463359,datacite::10.13140/rg.2.2.36831.37285,ResearchGate,datacite,10.13140/rg.2.2.36831.37285,https://doi.org/10.13140/rg.2.2.36831.37285,https://www.researchgate.net/doi/10.13140/RG.2...,,[],"{""client"": {""data"": {""id"": ""rg.rg"", ""type"": ""c...",,,,,,,,,,,2025-01-08,2025-01-08,2025-10-22,2018.0,parent,https://www.researchgate.net/doi/10.13140/rg.2...,10.13140/rg.2.2.36831.37285,,,,,,


In [76]:
dupes['server_name'].value_counts()

server_name
RePEc: Research Papers in Economics                                     10494
arXiv                                                                    8236
AgEcon Search                                                            6540
HAL                                                                      4041
ResearchGate                                                             1633
DSpace@MIT                                                               1082
viXra                                                                     826
EconStor Preprints                                                        559
Munich Personal RePEc Archive                                             443
Zenodo                                                                    297
PhilSci-Archive                                                           161
IACR Cryptology ePrint Archive                                             91
Digital Access to Scholarship at Harvard (DASH) (Har

In [77]:
df[df['doi']=='10.13140/rg.2.2.36331.69924']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
3349993,datacite::10.13140/rg.2.2.36331.69924,ResearchGate,datacite,10.13140/rg.2.2.36331.69924,https://doi.org/10.13140/rg.2.2.36331.69924,https://www.researchgate.net/doi/10.13140/RG.2...,,[],"{""client"": {""data"": {""id"": ""rg.rg"", ""type"": ""c...",,,,,,,,,,,2019-09-19,2019-09-19,2025-10-17,2014.0,parent,https://www.researchgate.net/doi/10.13140/rg.2...,10.13140/rg.2.2.36331.69924,,,,,,
9486875,openalex::W998010466,viXra,openalex,10.13140/rg.2.2.36331.69924,https://doi.org/10.13140/rg.2.2.36331.69924,https://vixra.org/pdf/1409.0185v1.pdf,,,,,,,,,2014-01-01,,,,,2025-10-10T00:00:00,,2025-11-06T06:51:31.235846,2014.0,parent,https://vixra.org/pdf/1409.0185v1.pdf,10.13140/rg.2.2.36331.69924,,,,,,


In [78]:
df[df['doi']=='10.22004/ag.econ.133088']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
102633,datacite::10.22004/ag.econ.133088,AgEcon Search,datacite,10.22004/ag.econ.133088,https://doi.org/10.22004/ag.econ.133088,https://ageconsearch.umn.edu/record/133088,,[],"{""client"": {""data"": {""id"": ""tind.agecon"", ""typ...",,,,,,,,,,,2019-08-30,2019-08-30,2020-07-29,2006.0,parent,https://ageconsearch.umn.edu/record/133088,10.22004/ag.econ.133088,,,,,,
2806374,openalex::W985302891,RePEc: Research Papers in Economics,openalex,10.22004/ag.econ.133088,https://doi.org/10.22004/ag.econ.133088,https://ageconsearch.umn.edu/record/133088/fil...,,,,,,,,,2006-01-01,,,,,2025-10-10T00:00:00,,2025-11-06T06:51:31.235846,2006.0,parent,https://ageconsearch.umn.edu/record/133088/fil...,10.22004/ag.econ.133088,,,,,,


In [79]:
df[df['doi']=='10.1101/2019.12.23.887166']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
9073833,crossref::10.1101/2019.12.23.887166,bioRxiv,crossref,10.1101/2019.12.23.887166,https://doi.org/10.1101/2019.12.23.887166,http://biorxiv.org/lookup/doi/10.1101/2019.12....,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,,10.3390/metabo11110749,,2019-12-23,2019-12-23,,2019-12-23,2025-06-03,2025-06-03,2019-12-24,,,2019.0,parent,http://biorxiv.org/lookup/doi/10.1101/2019.12....,10.1101/2019.12.23.887166,,,,,,
1555009,openalex::W2995005865,HAL,openalex,10.1101/2019.12.23.887166,https://doi.org/10.1101/2019.12.23.887166,https://hal.science/hal-02437604,,,,,,,,,2019-01-01,,,,,2025-10-10T00:00:00,,2025-11-06T04:12:42.849631,2019.0,parent,https://hal.science/hal-02437604,10.1101/2019.12.23.887166,,,,,,


In [80]:
df[df['doi']=='10.1101/2020.02.03.919597']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
8981770,crossref::10.1101/2020.02.03.919597,bioRxiv,crossref,10.1101/2020.02.03.919597,https://doi.org/10.1101/2020.02.03.919597,http://biorxiv.org/lookup/doi/10.1101/2020.02....,,,,,,,,2020-02-03,2020-02-03,,2020-02-03,2024-08-04,2025-02-21,2020-02-04,,,2020.0,parent,http://biorxiv.org/lookup/doi/10.1101/2020.02....,10.1101/2020.02.03.919597,,,,,,
1731345,openalex::W3004251285,HAL,openalex,10.1101/2020.02.03.919597,https://doi.org/10.1101/2020.02.03.919597,https://pasteur.hal.science/pasteur-02626442,,,,,,,,,2020-05-26,,,,,2020-02-07T00:00:00,,2025-11-15T23:13:30.683059,2020.0,parent,https://pasteur.hal.science/pasteur-02626442,10.1101/2020.02.03.919597,,,,,,


In [81]:
df[df['doi']=='10.3220/rep_20_1_2014']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
2246907,openalex::W102989963,Organic Eprints,openalex,10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,,,,,,,,,2014-10-01,,,,,2025-10-10T00:00:00,,2025-11-06T03:46:38.306776,2014.0,parent,https://doi.org/10.3220/rep_20_1_2014,10.3220/rep_20_1_2014,,,,,,
2245787,openalex::W106941238,Organic Eprints,openalex,10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,,,,,,,,,2014-10-01,,,,,2025-10-10T00:00:00,,2025-11-06T04:12:42.849631,2014.0,parent,https://doi.org/10.3220/rep_20_1_2014,10.3220/rep_20_1_2014,,,,,,
2247124,openalex::W111413393,Organic Eprints,openalex,10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,,,,,,,,,2014-10-01,,,,,2016-06-24T00:00:00,,2025-11-06T04:12:42.849631,2014.0,parent,https://doi.org/10.3220/rep_20_1_2014,10.3220/rep_20_1_2014,,,,,,
2245890,openalex::W113337753,Organic Eprints,openalex,10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,,,,,,,,,2014-10-01,,,,,2025-10-10T00:00:00,,2025-11-06T03:46:38.306776,2014.0,parent,https://doi.org/10.3220/rep_20_1_2014,10.3220/rep_20_1_2014,,,,,,
2251234,openalex::W25719276,Organic Eprints,openalex,10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,,,,,,,,,2014-10-01,,,,,2016-06-24T00:00:00,,2025-11-06T04:12:42.849631,2014.0,parent,https://doi.org/10.3220/rep_20_1_2014,10.3220/rep_20_1_2014,,,,,,
2255867,openalex::W46898626,Organic Eprints,openalex,10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,,,,,,,,,2014-10-01,,,,,2025-10-10T00:00:00,,2025-11-06T04:12:42.849631,2014.0,parent,https://doi.org/10.3220/rep_20_1_2014,10.3220/rep_20_1_2014,,,,,,
2255967,openalex::W49926529,Organic Eprints,openalex,10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,,,,,,,,,2014-10-01,,,,,2025-10-10T00:00:00,,2025-11-06T04:12:42.849631,2014.0,parent,https://doi.org/10.3220/rep_20_1_2014,10.3220/rep_20_1_2014,,,,,,
2244068,openalex::W50328423,Organic Eprints,openalex,10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,,,,,,,,,2014-10-01,,,,,2025-10-10T00:00:00,,2025-11-06T04:12:42.849631,2014.0,parent,https://doi.org/10.3220/rep_20_1_2014,10.3220/rep_20_1_2014,,,,,,
2256029,openalex::W53160016,Organic Eprints,openalex,10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,,,,,,,,,2014-10-01,,,,,2025-10-10T00:00:00,,2025-11-06T04:12:42.849631,2014.0,parent,https://doi.org/10.3220/rep_20_1_2014,10.3220/rep_20_1_2014,,,,,,
2244962,openalex::W60459218,Organic Eprints,openalex,10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,,,,,,,,,2014-10-01,,,,,2025-10-10T00:00:00,,2025-11-06T04:12:42.849631,2014.0,parent,https://doi.org/10.3220/rep_20_1_2014,10.3220/rep_20_1_2014,,,,,,


In [82]:
df[df['doi']=='10.48550/arxiv.0704.0324']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
6936795,datacite::10.48550/arxiv.0704.0324,arXiv,datacite,10.48550/arxiv.0704.0324,https://doi.org/10.48550/arxiv.0704.0324,https://arxiv.org/abs/0704.0324,1.0,[],"{""client"": {""data"": {""id"": ""arxiv.content"", ""t...",,,,,,,,,,,2022-03-16,2022-03-16,2025-05-28,2007.0,parent,https://arxiv.org/abs/0704.0324,10.48550/arxiv.0704.0324,,,,,,
640498,openalex::W2951243300,HAL,openalex,10.48550/arxiv.0704.0324,https://doi.org/10.48550/arxiv.0704.0324,https://hal.science/hal-00139490,,,,,,,,,2007-04-03,,,,,2025-10-10T00:00:00,,2025-11-06T06:51:31.235846,2007.0,parent,https://hal.science/hal-00139490,10.48550/arxiv.0704.0324,,,,,,


In [83]:
df[df['doi']=='10.13140/2.1.2910.4001']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
3322083,datacite::10.13140/2.1.2910.4001,ResearchGate,datacite,10.13140/2.1.2910.4001,https://doi.org/10.13140/2.1.2910.4001,https://www.researchgate.net/doi/10.13140/2.1....,,[],"{""client"": {""data"": {""id"": ""rg.rg"", ""type"": ""c...",,,,,,,,,,,2014-08-27,2014-08-27,2025-10-02,2014.0,parent,https://www.researchgate.net/doi/10.13140/2.1....,10.13140/2.1.2910.4001,,,,,,
9465428,openalex::W135550700,viXra,openalex,10.13140/2.1.2910.4001,https://doi.org/10.13140/2.1.2910.4001,https://vixra.org/pdf/1402.0082v1.pdf,,,,,,,,,2014-01-01,,,,,2025-10-10T00:00:00,,2025-11-06T06:51:31.235846,2014.0,parent,https://vixra.org/pdf/1402.0082v1.pdf,10.13140/2.1.2910.4001,,,,,,


In [84]:
# import pandas as pd

# # -----------------------------
# # 0) Define primary vs secondary
# # -----------------------------
# PRIMARY_SERVERS = {
#     "arXiv",
#     "bioRxiv",
#     "medRxiv",
#     "SSRN",
#     "TechRxiv",
#     "Wellcome Open Research",
#     "AgriRxiv",
#     "EarthArXiv",
#     "Law Archive",
#     "SocArXiv",
#     "Thesis Commons",
#     "Research Square",
#     "Open Research Europe",
#     "Oroboros Instruments",
#     "ResearchGate",
#     "AgEcon Search",
#     "Zenodo",
#     "Open Science Framework",
#     "Humanities Commons CORE",
#     "CERN document server",
#     "eLife",
#     # add/remove as needed
# }

# # Everything else is considered "non-primary" (i.e., eligible to become mirror)
# # If you prefer explicit list, replace mask_non_primary below with .isin(SECONDARY_SERVERS)

# # -----------------------------
# # 1) Normalize DOI + find duplicated DOIs
# # -----------------------------
# doi_s = df_mirror["doi"].astype(str).str.strip().str.lower()
# doi_norm = doi_s.mask(doi_s.eq("") | doi_s.eq("none"))

# # duplicated DOI flag (True for all rows where the DOI appears 2+ times)
# is_dup = doi_norm.notna() & doi_norm.duplicated(keep=False)

# # -----------------------------
# # 2) Determine which DOI groups contain a primary server record
# # -----------------------------
# is_primary = df_mirror["server_name"].isin(PRIMARY_SERVERS)

# # For each row: does its DOI group contain at least one primary record?
# has_primary_in_group = (
#     pd.Series(is_primary.values, index=df_mirror.index)
#       .groupby(doi_norm)
#       .transform("any")
# )

# # rows with NaN DOI should not be touched
# has_primary_in_group = has_primary_in_group.fillna(False)

# # -----------------------------
# # 3) Overwrite labels for non-primary rows to "mirror"
# #    ONLY when:
# #      - DOI duplicated
# #      - that DOI group has at least one primary record
# # -----------------------------
# mask_non_primary = ~is_primary

# mask_set_mirror = is_dup & has_primary_in_group & mask_non_primary

# # OVERWRITE parent/child/etc for these rows:
# df_mirror.loc[mask_set_mirror, "records_hierarchy"] = "mirror"

# # -----------------------------
# # 4) Optional: sanity checks
# # -----------------------------
# print("Mirror rows (set):", mask_set_mirror.sum())
# print(df_mirror["records_hierarchy"].value_counts(dropna=False).head(30))


In [86]:
import pandas as pd

# -----------------------------
# 0) Define primary servers
# -----------------------------
PRIMARY_SERVERS = {
    "arXiv",
    "bioRxiv",
    "medRxiv",
    "SSRN",
    "TechRxiv",
    "Wellcome Open Research",
    "AgriRxiv",
    "EarthArXiv",
    "Law Archive",
    "SocArXiv",
    "Thesis Commons",
    "Research Square",
    "Open Research Europe",
    "Oroboros Instruments",
    "ResearchGate",
    "AgEcon Search",
    "Zenodo",
    "Open Science Framework",
    "Humanities Commons CORE",
    "CERN document server",
    "eLife",
}

# -----------------------------
# 1) Normalize DOI
# -----------------------------
doi_norm = (
    df_mirror["doi"]
    .astype(str)
    .str.strip()
    .str.lower()
    .replace({"": pd.NA, "none": pd.NA})
)

# -----------------------------
# 2) Flags
# -----------------------------
is_primary = df_mirror["server_name"].isin(PRIMARY_SERVERS)
is_dup = doi_norm.notna() & doi_norm.duplicated(keep=False)

# -----------------------------
# 3) Identify DOI → primary server mapping (ONLY ONCE)
# -----------------------------
primary_by_doi = (
    df_mirror.loc[is_primary & is_dup, ["server_name"]]
    .assign(doi=doi_norm[is_primary & is_dup])
    .dropna(subset=["doi"])
    .groupby("doi")["server_name"]
    .first()   # take first primary (fast & deterministic)
)

# -----------------------------
# 4) Mark mirror rows
# -----------------------------
mask_mirror = is_dup & ~is_primary & doi_norm.isin(primary_by_doi.index)

df_mirror.loc[mask_mirror, "records_hierarchy"] = (
    "mirror (" + doi_norm[mask_mirror].map(primary_by_doi) + ")"
)

# -----------------------------
# 5) Sanity check
# -----------------------------
print("Mirror rows:", mask_mirror.sum())
print(df_mirror["records_hierarchy"].value_counts(dropna=False).head(20))


Mirror rows: 16842
records_hierarchy
parent                              8036487
part_of                              148204
review                               119592
child                                 88301
mirror (arXiv)                         8240
mirror (AgEcon Search)                 6541
mirror (ResearchGate)                  1633
correction                              354
comment                                 302
mirror (Zenodo)                         297
mirror (Open Science Framework)          31
mirror (SSRN)                            30
mirror (bioRxiv)                         29
mirror (Humanities Commons CORE)         24
others                                   12
mirror (eLife)                            3
mirror (CERN document server)             2
mirror (Research Square)                  2
mirror (AgriRxiv)                         1
mirror (SocArXiv)                         1
Name: count, dtype: int64


In [87]:
dupes[dupes['server_name']=='EconStor Preprints']#.tail(60)

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
495549,openalex::W107243416,EconStor Preprints,openalex,10.1007/s10273-008-0850-2,https://doi.org/10.1007/s10273-008-0850-2,http://hdl.handle.net/10419/42949,,,,,,,,,2008-01-01,,,,,2016-06-24T00:00:00,,2025-11-06T03:46:38.306776,2008.0,parent,http://hdl.handle.net/10419/42949,10.1007/s10273-008-0850-2,,,,,,
495567,openalex::W1122477516,EconStor Preprints,openalex,10.1007/s10273-012-1319-x,https://doi.org/10.1007/s10273-012-1319-x,http://hdl.handle.net/10419/68395,,,,,,,,,2012-01-01,,,,,2016-06-24T00:00:00,,2025-11-06T03:46:38.306776,2012.0,parent,http://hdl.handle.net/10419/68395,10.1007/s10273-012-1319-x,,,,,,
495592,openalex::W1150235247,EconStor Preprints,openalex,10.1007/s10273-014-1758-7,https://doi.org/10.1007/s10273-014-1758-7,http://hdl.handle.net/10419/106724,,,,,,,,,2014-01-01,,,,,2016-06-24T00:00:00,,2025-11-06T03:46:38.306776,2014.0,parent,http://hdl.handle.net/10419/106724,10.1007/s10273-014-1758-7,,,,,,
495606,openalex::W1166988194,EconStor Preprints,openalex,10.1007/s10272-006-0174-7,https://doi.org/10.1007/s10272-006-0174-7,http://hdl.handle.net/10419/41893,,,,,,,,,2006-01-01,,,,,2016-06-24T00:00:00,,2025-11-06T03:46:38.306776,2006.0,parent,http://hdl.handle.net/10419/41893,10.1007/s10272-006-0174-7,,,,,,
488791,openalex::W1198321170,EconStor Preprints,openalex,10.1007/s10273-008-0758-x,https://doi.org/10.1007/s10273-008-0758-x,http://hdl.handle.net/10419/42977,,,,,,,,,2008-01-01,,,,,2016-06-24T00:00:00,,2025-11-06T03:46:38.306776,2008.0,parent,http://hdl.handle.net/10419/42977,10.1007/s10273-008-0758-x,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
527697,openalex::W822245089,EconStor Preprints,openalex,10.13140/rg.2.2.31319.42409,https://doi.org/10.13140/rg.2.2.31319.42409,http://hdl.handle.net/10419/95827,,,,,,,,,1996-01-01,,,,,2025-10-10T00:00:00,,2025-11-06T06:51:31.235846,1996.0,parent,http://hdl.handle.net/10419/95827,10.13140/rg.2.2.31319.42409,,,,,,
457459,openalex::W854544018,EconStor Preprints,openalex,10.1007/s10272-005-0163-2,https://doi.org/10.1007/s10272-005-0163-2,http://hdl.handle.net/10419/41835,,,,,,,,,2005-01-01,,,,,2016-06-24T00:00:00,,2025-11-06T03:46:38.306776,2005.0,parent,http://hdl.handle.net/10419/41835,10.1007/s10272-005-0163-2,,,,,,
488736,openalex::W872556750,EconStor Preprints,openalex,10.1007/s10273-008-0758-x,https://doi.org/10.1007/s10273-008-0758-x,http://hdl.handle.net/10419/42941,,,,,,,,,2008-01-01,,,,,2016-06-24T00:00:00,,2025-11-06T03:46:38.306776,2008.0,parent,http://hdl.handle.net/10419/42941,10.1007/s10273-008-0758-x,,,,,,
527752,openalex::W930520461,EconStor Preprints,openalex,10.1007/s10273-008-0872-9,https://doi.org/10.1007/s10273-008-0872-9,http://hdl.handle.net/10419/43019,,,,,,,,,2008-01-01,,,,,2016-06-24T00:00:00,,2025-11-06T03:46:38.306776,2008.0,parent,http://hdl.handle.net/10419/43019,10.1007/s10273-008-0872-9,,,,,,


In [88]:
df_mirror[df_mirror['doi']=='10.1007/s10273-008-0850-2']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
495549,openalex::W107243416,EconStor Preprints,openalex,10.1007/s10273-008-0850-2,https://doi.org/10.1007/s10273-008-0850-2,http://hdl.handle.net/10419/42949,,,,,,,,,2008-01-01,,,,,2016-06-24T00:00:00,,2025-11-06T03:46:38.306776,2008.0,parent,http://hdl.handle.net/10419/42949,10.1007/s10273-008-0850-2,,,,,,
466923,openalex::W218899851,EconStor Preprints,openalex,10.1007/s10273-008-0850-2,https://doi.org/10.1007/s10273-008-0850-2,http://hdl.handle.net/10419/42931,,,,,,,,,2008-01-01,,,,,2016-06-24T00:00:00,,2025-11-06T03:46:38.306776,2008.0,parent,http://hdl.handle.net/10419/42931,10.1007/s10273-008-0850-2,,,,,,
504836,openalex::W2230538860,EconStor Preprints,openalex,10.1007/s10273-008-0850-2,https://doi.org/10.1007/s10273-008-0850-2,http://hdl.handle.net/10419/42921,,,,,,,,,2008-01-01,,,,,2016-06-24T00:00:00,,2025-11-06T03:46:38.306776,2008.0,parent,http://hdl.handle.net/10419/42921,10.1007/s10273-008-0850-2,,,,,,


In [89]:
# pattern = "osf.io"


# mask = ~dupes[dupes['server_name']=='Open Science Framework']['doi'].str.contains(pattern, regex=False, na=False)
# result = dupes[dupes['server_name']=='Open Science Framework'][mask]
# result

In [90]:
df_mirror[df_mirror['doi']=='10.1101/2019.12.23.887166']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
9073833,crossref::10.1101/2019.12.23.887166,bioRxiv,crossref,10.1101/2019.12.23.887166,https://doi.org/10.1101/2019.12.23.887166,http://biorxiv.org/lookup/doi/10.1101/2019.12....,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,,10.3390/metabo11110749,,2019-12-23,2019-12-23,,2019-12-23,2025-06-03,2025-06-03,2019-12-24,,,2019.0,parent,http://biorxiv.org/lookup/doi/10.1101/2019.12....,10.1101/2019.12.23.887166,,,,,,
1555009,openalex::W2995005865,HAL,openalex,10.1101/2019.12.23.887166,https://doi.org/10.1101/2019.12.23.887166,https://hal.science/hal-02437604,,,,,,,,,2019-01-01,,,,,2025-10-10T00:00:00,,2025-11-06T04:12:42.849631,2019.0,mirror (bioRxiv),https://hal.science/hal-02437604,10.1101/2019.12.23.887166,,,,,,


In [91]:
pattern = "mirror"


mask = df_mirror[df_mirror['server_name']=='F1000Research']['records_hierarchy'].str.contains(pattern, regex=False, na=False)
result = df_mirror[df_mirror['server_name']=='F1000Research'][mask]
result

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi


In [92]:
pattern = "mirror"


mask = df_mirror['records_hierarchy'].str.contains(pattern, regex=False, na=False)
result = df_mirror[mask]
result

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
9464662,openalex::W1003383974,viXra,openalex,10.13140/rg.2.2.14101.88805,https://doi.org/10.13140/rg.2.2.14101.88805,https://vixra.org/pdf/1501.0014v1.pdf,,,,,,,,,2014-01-01,,,,,2025-10-10T00:00:00,,2025-11-06T06:51:31.235846,2014.0,mirror (ResearchGate),https://vixra.org/pdf/1501.0014v1.pdf,10.13140/rg.2.2.14101.88805,,,,,,
9461657,openalex::W10037114,viXra,openalex,10.13140/rg.2.1.4248.2403,https://doi.org/10.13140/rg.2.1.4248.2403,https://vixra.org/pdf/1407.0122v1.pdf,,,,,,,,,2015-01-01,,,,,2025-10-10T00:00:00,,2025-11-06T06:51:31.235846,2015.0,mirror (ResearchGate),https://vixra.org/pdf/1407.0122v1.pdf,10.13140/rg.2.1.4248.2403,,,,,,
2713165,openalex::W100680786,RePEc: Research Papers in Economics,openalex,10.22004/ag.econ.50567,https://doi.org/10.22004/ag.econ.50567,https://ageconsearch.umn.edu/record/50567,,,,,,,,,2009-01-01,,,,,2025-10-10T00:00:00,,2025-11-06T06:51:31.235846,2009.0,mirror (AgEcon Search),https://ageconsearch.umn.edu/record/50567,10.22004/ag.econ.50567,,,,,,
9464726,openalex::W101331951,viXra,openalex,10.13140/rg.2.2.17278.64326,https://doi.org/10.13140/rg.2.2.17278.64326,https://vixra.org/pdf/1306.0071v1.pdf,,,,,,,,,2013-01-01,,,,,2025-10-10T00:00:00,,2025-11-06T06:51:31.235846,2013.0,mirror (ResearchGate),https://vixra.org/pdf/1306.0071v1.pdf,10.13140/rg.2.2.17278.64326,,,,,,
2713230,openalex::W101709879,RePEc: Research Papers in Economics,openalex,10.22004/ag.econ.131335,https://doi.org/10.22004/ag.econ.131335,https://ageconsearch.umn.edu/record/131335/fil...,,,,,,,,,1993-01-01,,,,,2025-10-10T00:00:00,,2025-11-06T06:51:31.235846,1993.0,mirror (AgEcon Search),https://ageconsearch.umn.edu/record/131335/fil...,10.22004/ag.econ.131335,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2810559,openalex::W98271585,RePEc: Research Papers in Economics,openalex,10.22004/ag.econ.149739,https://doi.org/10.22004/ag.econ.149739,https://ageconsearch.umn.edu/record/149739/,,,,,,,,,2013-01-01,,,,,2025-10-10T00:00:00,,2025-11-06T06:51:31.235846,2013.0,mirror (AgEcon Search),https://ageconsearch.umn.edu/record/149739/,10.22004/ag.econ.149739,,,,,,
9486788,openalex::W983039221,viXra,openalex,10.13140/rg.2.2.29266.79041,https://doi.org/10.13140/rg.2.2.29266.79041,http://www.vixra.org/pdf/1503.0004v1.pdf,,,,,,,,,2020-01-01,,,,,2025-10-10T00:00:00,,2025-11-06T06:51:31.235846,2020.0,mirror (ResearchGate),http://www.vixra.org/pdf/1503.0004v1.pdf,10.13140/rg.2.2.29266.79041,,,,,,
2806374,openalex::W985302891,RePEc: Research Papers in Economics,openalex,10.22004/ag.econ.133088,https://doi.org/10.22004/ag.econ.133088,https://ageconsearch.umn.edu/record/133088/fil...,,,,,,,,,2006-01-01,,,,,2025-10-10T00:00:00,,2025-11-06T06:51:31.235846,2006.0,mirror (AgEcon Search),https://ageconsearch.umn.edu/record/133088/fil...,10.22004/ag.econ.133088,,,,,,
2817765,openalex::W986721525,RePEc: Research Papers in Economics,openalex,10.22004/ag.econ.196816,https://doi.org/10.22004/ag.econ.196816,https://ageconsearch.umn.edu/record/196816/fil...,,,,,,,,,2015-01-01,,,,,2025-10-10T00:00:00,,2025-11-06T06:51:31.235846,2015.0,mirror (AgEcon Search),https://ageconsearch.umn.edu/record/196816/fil...,10.22004/ag.econ.196816,,,,,,


In [93]:
result['server_name'].value_counts()

server_name
RePEc: Research Papers in Economics                                     9995
HAL                                                                     3924
DSpace@MIT                                                              1082
viXra                                                                    826
Munich Personal RePEc Archive                                            423
EconStor Preprints                                                       197
PhilSci-Archive                                                          161
IACR Cryptology ePrint Archive                                            91
Digital Access to Scholarship at Harvard (DASH) (Harvard University)      67
E-LIS Repository                                                          28
Social Science Open Access Repository                                     19
Organic Eprints                                                           14
CogPrints                                                       

In [None]:
df_mirror

In [107]:
dupes_mirror_df = df_mirror[df_mirror['doi'].notna()]
dupes_mirror_df

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
429739,crossref::10.1002/essoar.10500000.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500000.1,https://doi.org/10.1002/essoar.10500000.1,https://essopenarchive.org/doi/full/10.1002/es...,,,,,,,,2018-05-26,2018-05-26,,2018-05-26,2022-12-05,2025-02-21,2019-11-13,,,2018.0,parent,https://essopenarchive.org/doi/full/10.1002/es...,10.1002/essoar.10500000.1,,,,,,
429742,crossref::10.1002/essoar.10500002.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500002.1,https://doi.org/10.1002/essoar.10500002.1,https://essopenarchive.org/doi/full/10.1002/es...,,,,,,,,2018-05-31,2018-05-31,,2018-05-31,2022-12-05,2025-02-21,2019-11-13,,,2018.0,parent,https://essopenarchive.org/doi/full/10.1002/es...,10.1002/essoar.10500002.1,,,,,,
429740,crossref::10.1002/essoar.10500004.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500004.1,https://doi.org/10.1002/essoar.10500004.1,https://essopenarchive.org/doi/full/10.1002/es...,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,,10.1029/2017sw001789;10.1029/2018sw001995,,2018-05-30,2018-05-30,,2018-05-30,2022-12-05,2025-02-21,2019-11-13,,,2018.0,parent,https://essopenarchive.org/doi/full/10.1002/es...,10.1002/essoar.10500004.1,,,,,,
429741,crossref::10.1002/essoar.10500007.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500007.1,https://doi.org/10.1002/essoar.10500007.1,https://essopenarchive.org/doi/full/10.1002/es...,,,,,,,,2018-08-15,2018-08-15,,2018-08-15,2022-12-05,2025-02-21,2019-11-13,,,2018.0,parent,https://essopenarchive.org/doi/full/10.1002/es...,10.1002/essoar.10500007.1,,,,,,
429746,crossref::10.1002/essoar.10500009.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500009.1,https://doi.org/10.1002/essoar.10500009.1,https://essopenarchive.org/doi/full/10.1002/es...,,,,,,,,2018-08-15,2018-08-15,,2018-08-15,2022-12-05,2025-09-29,2019-12-03,,,2018.0,parent,https://essopenarchive.org/doi/full/10.1002/es...,10.1002/essoar.10500009.1,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5667066,openalex::W999063011,Social Science Open Access Repository,openalex,10.15464/isi.42.2009.1-5,https://doi.org/10.15464/isi.42.2009.1-5,http://www.ssoar.info/ssoar/handle/document/21392,,,,,,,,,2009-01-01,,,,,2025-10-10T00:00:00,,2025-11-06T06:51:31.235846,2009.0,parent,http://www.ssoar.info/ssoar/handle/document/21392,10.15464/isi.42.2009.1-5,,,,,,
2245001,openalex::W99945665,Organic Eprints,openalex,10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,https://doi.org/10.3220/rep_20_1_2014,,,,,,,,,2014-10-01,,,,,2025-10-10T00:00:00,,2025-11-06T04:12:42.849631,2014.0,parent,https://doi.org/10.3220/rep_20_1_2014,10.3220/rep_20_1_2014,,,,,,
5667173,openalex::W999508398,Social Science Open Access Repository,openalex,10.15464/isi.38.2007.1-5,https://doi.org/10.15464/isi.38.2007.1-5,http://www.ssoar.info/ssoar/handle/document/21635,,,,,,,,,2007-01-01,,,,,2025-10-10T00:00:00,,2025-11-06T06:51:31.235846,2007.0,parent,http://www.ssoar.info/ssoar/handle/document/21635,10.15464/isi.38.2007.1-5,,,,,,
5673122,openalex::W999509321,Social Science Open Access Repository,openalex,10.23668/psycharchives.13440,https://doi.org/10.23668/psycharchives.13440,http://www.ssoar.info/ssoar/handle/document/3368,,,,,,,,,1997-01-01,,,,,2025-10-10T00:00:00,,2025-11-06T06:51:31.235846,1997.0,parent,http://www.ssoar.info/ssoar/handle/document/3368,10.23668/psycharchives.13440,,,,,,


In [108]:
dupes_mirror = dupes_mirror_df[dupes_mirror_df.duplicated(subset=['doi'], keep=False)]
dupes_mirror

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year,records_hierarchy,landing_norm,doi_norm,version_token_lp,token_kind_lp,vnum_lp,version_token_doi,token_kind_doi,vnum_doi
8976641,crossref::10.1101/103937,bioRxiv,crossref,10.1101/103937,https://doi.org/10.1101/103937,http://biorxiv.org/lookup/doi/10.1101/103937,,,,,,,,2017-01-28,2017-01-28,,2017-01-28,2024-08-03,2025-02-21,2017-01-29,,,2017.0,parent,http://biorxiv.org/lookup/doi/10.1101/103937,10.1101/103937,,,,,,
9073833,crossref::10.1101/2019.12.23.887166,bioRxiv,crossref,10.1101/2019.12.23.887166,https://doi.org/10.1101/2019.12.23.887166,http://biorxiv.org/lookup/doi/10.1101/2019.12....,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,,10.3390/metabo11110749,,2019-12-23,2019-12-23,,2019-12-23,2025-06-03,2025-06-03,2019-12-24,,,2019.0,parent,http://biorxiv.org/lookup/doi/10.1101/2019.12....,10.1101/2019.12.23.887166,,,,,,
8986959,crossref::10.1101/2020.01.22.915215,bioRxiv,crossref,10.1101/2020.01.22.915215,https://doi.org/10.1101/2020.01.22.915215,http://biorxiv.org/lookup/doi/10.1101/2020.01....,,,,,,,,2020-01-23,2020-01-23,,2020-01-23,2024-08-04,2025-02-21,2020-01-23,,,2020.0,parent,http://biorxiv.org/lookup/doi/10.1101/2020.01....,10.1101/2020.01.22.915215,,,,,,
8981770,crossref::10.1101/2020.02.03.919597,bioRxiv,crossref,10.1101/2020.02.03.919597,https://doi.org/10.1101/2020.02.03.919597,http://biorxiv.org/lookup/doi/10.1101/2020.02....,,,,,,,,2020-02-03,2020-02-03,,2020-02-03,2024-08-04,2025-02-21,2020-02-04,,,2020.0,parent,http://biorxiv.org/lookup/doi/10.1101/2020.02....,10.1101/2020.02.03.919597,,,,,,
8891397,crossref::10.1101/2020.06.08.118984,bioRxiv,crossref,10.1101/2020.06.08.118984,https://doi.org/10.1101/2020.06.08.118984,http://biorxiv.org/lookup/doi/10.1101/2020.06....,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,,10.1038/s41467-020-20239-4,,2020-06-09,2020-06-09,,2020-06-09,2022-10-27,2025-02-21,2020-06-09,,,2020.0,parent,http://biorxiv.org/lookup/doi/10.1101/2020.06....,10.1101/2020.06.08.118984,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9486788,openalex::W983039221,viXra,openalex,10.13140/rg.2.2.29266.79041,https://doi.org/10.13140/rg.2.2.29266.79041,http://www.vixra.org/pdf/1503.0004v1.pdf,,,,,,,,,2020-01-01,,,,,2025-10-10T00:00:00,,2025-11-06T06:51:31.235846,2020.0,mirror (ResearchGate),http://www.vixra.org/pdf/1503.0004v1.pdf,10.13140/rg.2.2.29266.79041,,,,,,
2806374,openalex::W985302891,RePEc: Research Papers in Economics,openalex,10.22004/ag.econ.133088,https://doi.org/10.22004/ag.econ.133088,https://ageconsearch.umn.edu/record/133088/fil...,,,,,,,,,2006-01-01,,,,,2025-10-10T00:00:00,,2025-11-06T06:51:31.235846,2006.0,mirror (AgEcon Search),https://ageconsearch.umn.edu/record/133088/fil...,10.22004/ag.econ.133088,,,,,,
2817765,openalex::W986721525,RePEc: Research Papers in Economics,openalex,10.22004/ag.econ.196816,https://doi.org/10.22004/ag.econ.196816,https://ageconsearch.umn.edu/record/196816/fil...,,,,,,,,,2015-01-01,,,,,2025-10-10T00:00:00,,2025-11-06T06:51:31.235846,2015.0,mirror (AgEcon Search),https://ageconsearch.umn.edu/record/196816/fil...,10.22004/ag.econ.196816,,,,,,
9486875,openalex::W998010466,viXra,openalex,10.13140/rg.2.2.36331.69924,https://doi.org/10.13140/rg.2.2.36331.69924,https://vixra.org/pdf/1409.0185v1.pdf,,,,,,,,,2014-01-01,,,,,2025-10-10T00:00:00,,2025-11-06T06:51:31.235846,2014.0,mirror (ResearchGate),https://vixra.org/pdf/1409.0185v1.pdf,10.13140/rg.2.2.36331.69924,,,,,,


In [109]:
dupes_mirror['server_name'].value_counts()

server_name
RePEc: Research Papers in Economics                                     10494
arXiv                                                                    8236
AgEcon Search                                                            6540
HAL                                                                      4041
ResearchGate                                                             1633
DSpace@MIT                                                               1082
viXra                                                                     826
EconStor Preprints                                                        559
Munich Personal RePEc Archive                                             443
Zenodo                                                                    297
PhilSci-Archive                                                           161
IACR Cryptology ePrint Archive                                             91
Digital Access to Scholarship at Harvard (DASH) (Har

In [110]:
dupes_mirror['records_hierarchy'].value_counts()

records_hierarchy
parent                              17868
mirror (arXiv)                       8240
mirror (AgEcon Search)               6541
mirror (ResearchGate)                1633
mirror (Zenodo)                       297
mirror (Open Science Framework)        31
mirror (SSRN)                          30
mirror (bioRxiv)                       29
mirror (Humanities Commons CORE)       24
mirror (eLife)                          3
child                                   2
part_of                                 2
mirror (CERN document server)           2
mirror (Research Square)                2
mirror (AgriRxiv)                       1
mirror (SocArXiv)                       1
mirror (EarthArXiv)                     1
mirror (Open Research Europe)           1
mirror (Law Archive)                    1
mirror (Wellcome Open Research)         1
mirror (Thesis Commons)                 1
mirror (Oroboros Instruments)           1
mirror (TechRxiv)                       1
mirror (medRxiv)

# ccc