In [1]:
import pandas as pd
# Set option to display all columns
pd.set_option('display.max_columns', None)


# A) Counts

## import data

In [2]:
import duckdb
from pathlib import Path

con = duckdb.connect()

# Low-memory settings
con.execute("PRAGMA threads=1;")
con.execute("PRAGMA preserve_insertion_order=false;")
con.execute("PRAGMA enable_object_cache=false;")
con.execute("PRAGMA memory_limit='2GB';")           # try 1GB if still unstable
con.execute("PRAGMA temp_directory='data/tmp_duckdb';")

# 2) Build paths robustly from the notebook folder
ROOT = Path.cwd()
if ROOT.name == "notebooks":
    ROOT = ROOT.parent

BASE = ROOT / "data" / "by_server"

# IMPORTANT: your files are hive-partitioned like:
all_backends = (BASE / "*" / "*.parquet").as_posix()

con.execute(f"""
CREATE OR REPLACE VIEW all_backends AS
SELECT * FROM read_parquet('{all_backends}', hive_partitioning=true, union_by_name=true);
""")

# A unified "all_rows" view
con.execute("""
CREATE OR REPLACE VIEW all_rows AS
SELECT * FROM all_backends
""")

print(con.execute("SHOW TABLES").fetchall())


[('all_backends',), ('all_rows',)]


In [3]:
con.execute("""
SELECT backend, COUNT(*) AS total, COUNT(record_id) AS with_record_id
FROM (
    SELECT backend, record_id FROM all_backends
)
GROUP BY backend
""").df()


Unnamed: 0,backend,total,with_record_id
0,crossref,3612833,3612833
1,datacite,3523368,3523368
2,openalex,2350692,2350692


## total count

In [4]:
con.execute("""
SELECT COUNT(DISTINCT record_id) AS total_dedup_records
FROM (
    SELECT record_id FROM all_backends
)
""").df()


Unnamed: 0,total_dedup_records
0,8410094


## count per backend

In [5]:
con.execute("""
SELECT
    backend,
    COUNT(DISTINCT record_id) AS dedup_records
FROM (
    SELECT backend, record_id FROM all_backends
)
GROUP BY backend
ORDER BY dedup_records DESC
""").df()


Unnamed: 0,backend,dedup_records
0,datacite,3520391
1,crossref,3187171
2,openalex,1702532


## count per server

In [6]:
con.execute("""
SELECT
    backend,
    server_name,
    COUNT(DISTINCT record_id) AS dedup_records
FROM (
    SELECT backend, server_name, record_id FROM all_backends
)
GROUP BY backend, server_name
ORDER BY dedup_records DESC
""").df()


Unnamed: 0,backend,server_name,dedup_records
0,datacite,arXiv,2920797
1,crossref,SSRN,1258958
2,openalex,HAL,1056424
3,crossref,Research Square,450818
4,openalex,RePEc: Research Papers in Economics,389398
...,...,...,...
108,crossref,Prepublicaciones OpenCiencia,8
109,crossref,Therapoid,7
110,crossref,NewAddictionsX,7
111,openalex,Bepress Legal Repository,6


In [7]:
server_name_df = con.execute("""
SELECT server_name, COUNT(DISTINCT record_id) AS n
FROM all_backends
GROUP BY 1
ORDER BY n DESC
LIMIT 200;
""").df()


In [8]:
server_name_df.head(60)

Unnamed: 0,server_name,n
0,arXiv,2920797
1,SSRN,1258958
2,HAL,1056424
3,Research Square,450818
4,RePEc: Research Papers in Economics,389398
5,bioRxiv,306948
6,eLife,247558
7,AgEcon Search,188173
8,ResearchGate,181231
9,Qeios,172316


In [9]:
server_name_df.tail(52)

Unnamed: 0,server_name,n
60,CERN document server,973
61,ARPHA Preprints,890
62,MetaArXiv,880
63,SportRxiv,878
64,Gates Open Research,863
65,AgriRxiv,818
66,Beilstein Archives,697
67,Covid-19 Preprints,647
68,EasyChair preprint,620
69,MarXiv,508


# B) Explorations

In [10]:
con.execute(f"""
CREATE OR REPLACE VIEW server_thin AS
SELECT
  CAST(record_id AS VARCHAR)           AS record_id,
  CAST(server_name AS VARCHAR)         AS server_name,
  CAST(backend AS VARCHAR)             AS backend,

  CAST(doi AS VARCHAR)                 AS doi,
  CAST(doi_url AS VARCHAR)             AS doi_url,
  CAST(landing_page_url AS VARCHAR)    AS landing_page_url,

  CAST(version_label AS VARCHAR)       AS version_label,

  -- Relationships (keep these for true version links)
  CAST(relations_json AS VARCHAR)       AS relations_json,
  CAST(raw_relationships_json AS VARCHAR)       AS raw_relationships_json,
  CAST(is_version_of AS VARCHAR)       AS is_version_of,      -- keep as text; we’ll interpret later
  CAST(version_of_ids_json AS VARCHAR) AS version_of_ids_json,
  CAST(is_preprint_of AS VARCHAR)      AS is_preprint_of,
  CAST(published_version_ids_json AS VARCHAR) AS published_version_ids_json,

  -- Dates (helpful for temporal patterns)
  CAST(date_posted AS VARCHAR)         AS date_posted,
  CAST(date_published AS VARCHAR)      AS date_published,
  CAST(date_published_online AS VARCHAR)      AS date_published_online,
  CAST(date_issued AS VARCHAR)         AS date_issued,
  CAST(date_deposited AS VARCHAR)      AS date_deposited,
  CAST(date_indexed AS VARCHAR)        AS date_indexed,
  CAST(date_created AS VARCHAR)        AS date_created,
  CAST(date_registered AS VARCHAR)     AS date_registered,
  CAST(date_updated AS VARCHAR)        AS date_updated,
  CAST(publication_year AS VARCHAR)    AS publication_year
FROM all_backends
""")

con.execute("SELECT COUNT(*) AS n FROM server_thin").df()


Unnamed: 0,n
0,9486893


## Global Exploration

### Duplicates

In [11]:
data = con.execute("SELECT * FROM server_thin").df()
# data.drop_duplicates(subset=['record_id'], keep='first', inplace=False)

data = data.drop_duplicates()
data

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year
0,crossref::10.21467/preprints.48,AIJR Preprints,crossref,10.21467/preprints.48,https://doi.org/10.21467/preprints.48,https://preprints.aijr.org/index.php/ap/prepri...,,,,,,,,2020-05-03,2020-05-03,,2020-05-03,2020-09-15,2025-05-14,2020-09-15,,,2020.0
1,crossref::10.21467/preprints.43,AIJR Preprints,crossref,10.21467/preprints.43,https://doi.org/10.21467/preprints.43,https://preprints.aijr.org/index.php/ap/prepri...,,,,,,,,2020-04-25,2020-04-25,,2020-04-25,2020-09-15,2025-05-14,2020-09-15,,,2020.0
2,crossref::10.21467/preprints.39,AIJR Preprints,crossref,10.21467/preprints.39,https://doi.org/10.21467/preprints.39,https://preprints.aijr.org/index.php/ap/prepri...,,,,,,,,2020-04-16,2020-04-16,,2020-04-16,2020-09-15,2025-05-14,2020-09-15,,,2020.0
3,crossref::10.21467/preprints.38,AIJR Preprints,crossref,10.21467/preprints.38,https://doi.org/10.21467/preprints.38,https://preprints.aijr.org/index.php/ap/prepri...,,,,,,,,2020-04-15,2020-04-15,,2020-04-15,2020-09-17,2022-12-13,2020-09-17,,,2020.0
4,crossref::10.21467/preprints.36,AIJR Preprints,crossref,10.21467/preprints.36,https://doi.org/10.21467/preprints.36,https://preprints.aijr.org/index.php/ap/prepri...,,,,,,,,2020-04-15,2020-04-15,,2020-04-15,2020-09-17,2024-08-11,2020-09-17,,,2020.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9486888,openalex::W999325625,viXra,openalex,,,https://vixra.org/pdf/1409.0090v1.pdf,,,,,,,,,2014-09-01,,,,,2025-10-10T00:00:00,,2025-10-10T17:16:08.811792,2014.0
9486889,openalex::W999460032,viXra,openalex,,,https://vixra.org/abs/1112.0094,,,,,,,,,2011-12-01,,,,,2025-10-10T00:00:00,,2025-10-10T17:16:08.811792,2011.0
9486890,openalex::W99967155,viXra,openalex,,,https://vixra.org/pdf/1406.0019v1.pdf,,,,,,,,,2014-06-01,,,,,2025-10-10T00:00:00,,2025-10-10T17:16:08.811792,2014.0
9486891,openalex::W999790414,viXra,openalex,,,https://vixra.org/pdf/1306.0105v3.pdf,,,,,,,,,2013-06-01,,,,,2025-10-10T00:00:00,,2025-10-10T17:16:08.811792,2013.0


In [12]:
dupes = data[data.duplicated(subset=['record_id'], keep=False)]
dupes

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year
15570,crossref::10.31124/advance.24454624.v1,Advance,crossref,10.31124/advance.24454624.v1,https://doi.org/10.31124/advance.24454624.v1,https://advance.sagepub.com/doi/full/10.31124/...,,,,,,,,2024-02-16,2024-02-16,,2024-02-16,2024-02-22,2024-02-23,2024-02-05,,,2024.0
16016,crossref::10.31124/advance.170921771.12975902/v1,Advance,crossref,10.31124/advance.170921771.12975902/v1,https://doi.org/10.31124/advance.170921771.129...,https://advance.sagepub.com/users/748375/artic...,,,,,,,,2024-02-29,2024-02-29,,2024-02-29,2024-02-29,2024-03-01,2024-02-29,,,2024.0
232722,crossref::10.22541/essoar.167016857.73309909.1,Authorea Inc.,crossref,10.22541/essoar.167016857.73309909.1,https://doi.org/10.22541/essoar.167016857.7330...,https://www.authorea.com/users/525275/articles...,,,,,,,,2022-12-04,2022-12-04,,2022-12-04,2022-12-04,2025-06-03,2022-12-04,,,2022.0
232752,crossref::10.22541/essoar.167024940.09387576/v1,Authorea Inc.,crossref,10.22541/essoar.167024940.09387576/v1,https://doi.org/10.22541/essoar.167024940.0938...,https://www.authorea.com/users/525275/articles...,,,,,,,,2022-12-05,2022-12-05,,2022-12-05,2022-12-05,2025-06-03,2022-12-05,,,2022.0
232822,crossref::10.22541/essoar.167045090.02735580/v1,Authorea Inc.,crossref,10.22541/essoar.167045090.02735580/v1,https://doi.org/10.22541/essoar.167045090.0273...,https://www.authorea.com/users/563627/articles...,,,,,,,,2022-12-07,2022-12-07,,2022-12-07,2022-12-07,2025-06-03,2022-12-07,,,2022.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
446146,crossref::10.22541/essoar.170365313.34695479/v1,Earth and Space Science Open Archive,crossref,10.22541/essoar.170365313.34695479/v1,https://doi.org/10.22541/essoar.170365313.3469...,https://essopenarchive.org/users/531775/articl...,,,,,,,,2023-12-27,2023-12-27,,2023-12-27,2024-02-29,2025-06-03,2023-12-27,,,2023.0
446148,crossref::10.22541/essoar.170680200.06135599/v2,Earth and Space Science Open Archive,crossref,10.22541/essoar.170680200.06135599/v2,https://doi.org/10.22541/essoar.170680200.0613...,https://essopenarchive.org/users/672150/articl...,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.22541/essoar.170680200.06135599/v1,,,,2024-02-29,2024-02-29,,2024-02-29,2024-02-29,2025-06-03,2024-02-29,,,2024.0
446149,crossref::10.22541/essoar.170923255.57545328/v1,Earth and Space Science Open Archive,crossref,10.22541/essoar.170923255.57545328/v1,https://doi.org/10.22541/essoar.170923255.5754...,https://essopenarchive.org/users/528524/articl...,,,,,,,,2024-02-29,2024-02-29,,2024-02-29,2024-02-29,2025-06-03,2024-02-29,,,2024.0
446341,crossref::10.22541/essoar.170612447.73077522/v1,Earth and Space Science Open Archive,crossref,10.22541/essoar.170612447.73077522/v1,https://doi.org/10.22541/essoar.170612447.7307...,https://essopenarchive.org/doi/full/10.22541/e...,,,,,,,,2024-01-24,2024-01-24,,2024-01-24,2024-03-22,2025-11-23,2024-01-24,,,2024.0


In [13]:
dupes['server_name'].value_counts()

server_name
Authorea Inc.                           3432
Earth and Space Science Open Archive    3430
Advance                                    2
Name: count, dtype: int64

In [14]:
data[data['record_id']=='crossref::10.31124/advance.24454624.v1']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year
15570,crossref::10.31124/advance.24454624.v1,Advance,crossref,10.31124/advance.24454624.v1,https://doi.org/10.31124/advance.24454624.v1,https://advance.sagepub.com/doi/full/10.31124/...,,,,,,,,2024-02-16,2024-02-16,,2024-02-16,2024-02-22,2024-02-23,2024-02-05,,,2024.0
273723,crossref::10.31124/advance.24454624.v1,Authorea Inc.,crossref,10.31124/advance.24454624.v1,https://doi.org/10.31124/advance.24454624.v1,https://advance.sagepub.com/doi/full/10.31124/...,,,,,,,,2024-02-16,2024-02-16,,2024-02-16,2024-02-22,2024-02-23,2024-02-05,,,2024.0


In [15]:
data[data['record_id']=='crossref::10.22541/essoar.170923255.57545328/v1']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year
250777,crossref::10.22541/essoar.170923255.57545328/v1,Authorea Inc.,crossref,10.22541/essoar.170923255.57545328/v1,https://doi.org/10.22541/essoar.170923255.5754...,https://essopenarchive.org/users/528524/articl...,,,,,,,,2024-02-29,2024-02-29,,2024-02-29,2024-02-29,2025-06-03,2024-02-29,,,2024.0
446149,crossref::10.22541/essoar.170923255.57545328/v1,Earth and Space Science Open Archive,crossref,10.22541/essoar.170923255.57545328/v1,https://doi.org/10.22541/essoar.170923255.5754...,https://essopenarchive.org/users/528524/articl...,,,,,,,,2024-02-29,2024-02-29,,2024-02-29,2024-02-29,2025-06-03,2024-02-29,,,2024.0


#### Resolution

In [16]:
import pandas as pd

# ----------------------------------
# 1) Define server priority
#    Lower value = higher priority (kept first)
# ----------------------------------
server_priority = {
    "Earth and Space Science Open Archive": 1,
    "Advance": 2,
    "Authorea Inc.": 3
}

# Work on a copy to avoid side effects
data = data.copy()

# ----------------------------------
# 2) Add priority column
#    Unknown servers get lowest priority
# ----------------------------------
data['server_priority'] = (
    data['server_name']
    .map(server_priority)
    .fillna(99)
    .astype(int)
)

# ----------------------------------
# 3) Deduplicate STRICTLY on record_id
#    - Sort so preferred server comes first
#    - Keep only the best row per record_id
# ----------------------------------
data_clean = (
    data
    .sort_values(by=['record_id', 'server_priority'])
    .drop_duplicates(subset=['record_id'], keep='first')
    .drop(columns=['server_priority'])
)

# ----------------------------------
# 4) (Optional but recommended) Inspect removals
# ----------------------------------
removed = data.loc[~data.index.isin(data_clean.index)]

print("Removed rows by server_name:")
print(removed['server_name'].value_counts(dropna=False))

print("\nRows before:", len(data))
print("Rows after :", len(data_clean))
print("Rows removed:", len(removed))

# ----------------------------------
# data_clean is the final deduplicated dataframe
# ----------------------------------


Removed rows by server_name:
server_name
Authorea Inc.    3432
Name: count, dtype: int64

Rows before: 8413526
Rows after : 8410094
Rows removed: 3432


In [17]:
data_clean.shape

(8410094, 23)

### Columns

In [18]:
data_clean.count()

record_id                     8410094
server_name                   8410094
backend                       8410094
doi                           6784859
doi_url                       6784859
landing_page_url              8330648
version_label                 2974496
relations_json                4253292
raw_relationships_json        3520391
is_version_of                 6707562
version_of_ids_json                 0
is_preprint_of                6707562
published_version_ids_json          0
date_posted                   1947932
date_published                4743663
date_published_online          288705
date_issued                   3041131
date_deposited                3187171
date_indexed                  3187171
date_created                  8410094
date_registered               3520391
date_updated                  5222915
publication_year              8264048
dtype: int64

In [19]:
import pandas as pd
import numpy as np

data = data_clean.copy()

# -------------------------------------------------
# Configuration
# -------------------------------------------------
GROUP_COL = 'server_name'

MISSINGNESS_COLUMNS = [
    'doi',
    'landing_page_url'
]

COMPLETENESS_COLUMNS = [
    'version_label',
    'relations_json',
    'is_version_of',
    'is_preprint_of',
    'raw_relationships_json'
]

# -------------------------------------------------
# Helper functions
# -------------------------------------------------
def is_missing(series):
    """Strict missingness (NaN / None)"""
    return series.isna()

def is_incomplete(series):
    """
    Missing OR empty content:
    - NaN
    - empty string
    - empty list
    - empty dict
    """
    return (
        series.isna()
        | (series.astype(str).str.strip() == '')
        | (series.astype(str).isin(['[]', '{}']))
    )

# -------------------------------------------------
# Base counts per server
# -------------------------------------------------
base = (
    data
    .groupby(GROUP_COL, dropna=False)
    .size()
    .rename('total_records')
    .to_frame()
)

# -------------------------------------------------
# Missingness metrics
# -------------------------------------------------
for col in MISSINGNESS_COLUMNS:
    missing_count = (
        data.loc[is_missing(data[col])]
        .groupby(GROUP_COL, dropna=False)
        .size()
        .rename(f'{col}_missing_count')
    )

    base = base.join(missing_count, how='left').fillna(0)
    base[f'{col}_missing_count'] = base[f'{col}_missing_count'].astype(int)
    base[f'{col}_missing_percent'] = (
        base[f'{col}_missing_count'] / base['total_records'] * 100
    ).round(2)

# -------------------------------------------------
# Completeness metrics
# -------------------------------------------------
for col in COMPLETENESS_COLUMNS:
    incomplete_count = (
        data.loc[is_incomplete(data[col])]
        .groupby(GROUP_COL, dropna=False)
        .size()
        .rename(f'{col}_incomplete_count')
    )

    base = base.join(incomplete_count, how='left').fillna(0)
    base[f'{col}_incomplete_count'] = base[f'{col}_incomplete_count'].astype(int)
    base[f'{col}_incomplete_percent'] = (
        base[f'{col}_incomplete_count'] / base['total_records'] * 100
    ).round(2)

# -------------------------------------------------
# Final table
# -------------------------------------------------
summary = base.sort_values('total_records', ascending=False)

# summary


In [20]:
summary.head(60)

Unnamed: 0_level_0,total_records,doi_missing_count,doi_missing_percent,landing_page_url_missing_count,landing_page_url_missing_percent,version_label_incomplete_count,version_label_incomplete_percent,relations_json_incomplete_count,relations_json_incomplete_percent,is_version_of_incomplete_count,is_version_of_incomplete_percent,is_preprint_of_incomplete_count,is_preprint_of_incomplete_percent,raw_relationships_json_incomplete_count,raw_relationships_json_incomplete_percent
server_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
arXiv,2920797,0,0.0,0,0.0,0,0.0,1638047,56.08,2920797,100.0,1638164,56.09,0,0.0
SSRN,1258958,0,0.0,0,0.0,1258938,100.0,1258703,99.98,1258958,100.0,1258703,99.98,1258958,100.0
HAL,1056424,1024809,97.01,103,0.01,1056424,100.0,1056424,100.0,1056424,100.0,1056424,100.0,1056424,100.0
Research Square,450818,0,0.0,0,0.0,450813,100.0,283964,62.99,435544,96.61,295396,65.52,450818,100.0
RePEc: Research Papers in Economics,389398,364045,93.49,39749,10.21,389398,100.0,389398,100.0,389398,100.0,389398,100.0,389398,100.0
bioRxiv,306948,0,0.0,0,0.0,306937,100.0,196496,64.02,306947,100.0,196497,64.02,306948,100.0
eLife,247558,0,0.0,0,0.0,247192,99.85,151705,61.28,233206,94.2,247537,99.99,247558,100.0
AgEcon Search,188173,0,0.0,0,0.0,188173,100.0,188173,100.0,188173,100.0,188173,100.0,0,0.0
ResearchGate,181231,0,0.0,0,0.0,174949,96.53,174741,96.42,174741,96.42,181231,100.0,0,0.0
Qeios,172316,0,0.0,0,0.0,172316,100.0,111907,64.94,167745,97.35,172316,100.0,172316,100.0


In [21]:
summary.tail(52)

Unnamed: 0_level_0,total_records,doi_missing_count,doi_missing_percent,landing_page_url_missing_count,landing_page_url_missing_percent,version_label_incomplete_count,version_label_incomplete_percent,relations_json_incomplete_count,relations_json_incomplete_percent,is_version_of_incomplete_count,is_version_of_incomplete_percent,is_preprint_of_incomplete_count,is_preprint_of_incomplete_percent,raw_relationships_json_incomplete_count,raw_relationships_json_incomplete_percent
server_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
CERN document server,973,0,0.0,0,0.0,933,95.89,46,4.73,973,100.0,443,45.53,0,0.0
ARPHA Preprints,890,0,0.0,0,0.0,890,100.0,578,64.94,890,100.0,578,64.94,890,100.0
MetaArXiv,880,0,0.0,0,0.0,880,100.0,681,77.39,813,92.39,742,84.32,880,100.0
SportRxiv,878,0,0.0,0,0.0,878,100.0,794,90.43,877,99.89,795,90.55,878,100.0
Gates Open Research,863,0,0.0,0,0.0,539,62.46,83,9.62,799,92.58,863,100.0,863,100.0
AgriRxiv,818,0,0.0,0,0.0,818,100.0,805,98.41,818,100.0,805,98.41,818,100.0
Beilstein Archives,697,0,0.0,0,0.0,697,100.0,236,33.86,697,100.0,236,33.86,697,100.0
Covid-19 Preprints,647,0,0.0,0,0.0,647,100.0,647,100.0,647,100.0,647,100.0,647,100.0
EasyChair preprint,620,0,0.0,0,0.0,620,100.0,620,100.0,620,100.0,620,100.0,620,100.0
MarXiv,508,0,0.0,0,0.0,508,100.0,249,49.02,506,99.61,251,49.41,508,100.0


### Systematic pattern extraction (exploration)

In [22]:
import re
import pandas as pd
import numpy as np

data = data.copy()

# ----------------------------
# Normalize fields
# ----------------------------
data['doi_norm'] = data['doi'].astype(str).str.lower()
data['url_norm'] = data['landing_page_url'].astype(str).str.lower()

# ----------------------------
# Threshold for version numbers
# ----------------------------
MAX_VERSION = 99

# ----------------------------
# Regex to extract version-like token
# Note: ".N" limited to .1–.99 to avoid .00000516, .8725, etc.
# ----------------------------
# VERSION_TOKEN_RX = re.compile(
#     r'(?P<token>('
#     r'/v\d+|'                 # /v1
#     r'\.v\d+|'                # .v1
#     r'_v\d+|'                 # _v1
#     r'-v\d+|'                 # -v2 (incl -v05)
#     r'\.(?:[1-9]?\d)$|'       # .1–.99 ONLY
#     r'-rc\d+|'                # -rc1
#     r'-cc\d+|'                # -cc1
#     r'-supplement|'           # -supplement
#     r'\.sa\d+'                # .sa0, .sa1, .sa10
#     r'))'
# )
# import re

VERSION_TOKEN_RX = re.compile(
    r'(?P<token>('
    r'/v\d+(?![a-z0-9])|'       # /v1 but not /v284p
    r'\.v\d+(?![a-z0-9])|'      # .v1 but not .v50i7
    r'_v\d+(?![a-z0-9])|'       # _v1
    r'-v\d+(?![a-z0-9])|'       # -v2 but not -v8018x (rare)
    r'\.(?:[1-9]?\d)$|'         # .1–.99 ONLY
    r'-rc\d+(?![a-z0-9])|'      # -rc1
    r'-cc\d+(?![a-z0-9])|'      # -cc1
    r'-supplement|'             # -supplement
    r'\.sa\d+(?![a-z0-9])'      # .sa10
    r'))'
)

DOTNUM_RX = re.compile(r'\.(?:[1-9]?\d)$')        # ".1" .. ".99"
NUM_RX = re.compile(r'(\d+)')

# Servers where ".N" must NOT be interpreted as a version signal
SERVERS_EXCLUDE_DOTNUM = {
    'arXiv', 'SSRN', 'Zenodo', 'HAL', 'bioRxiv', 'medRxiv',
    'Digital Access to Scholarship at Harvard (DASH) (Harvard University)',
    'E-LIS Repository', 'DSpace@MIT', 'AgEcon Search',
    'JMIR Preprints', 'RePEc: Research Papers in Economics',
    'AIJR Preprints', 'Nature Precedings', 'ResearchGate'
}

def extract_token(text, server):
    if not isinstance(text, str) or text.strip() == "":
        return None

    t = text.lower()
    m = VERSION_TOKEN_RX.search(t)
    if not m:
        return None

    token = m.group('token')

    # If token is ".N" but server is in exclude list, ignore it
    if server in SERVERS_EXCLUDE_DOTNUM and DOTNUM_RX.fullmatch(token):
        return None

    return token

def token_to_number(tok):
    """Extract numeric part of a token. Returns NaN if none."""
    if not isinstance(tok, str):
        return np.nan
    m = NUM_RX.search(tok)
    return float(m.group(1)) if m else np.nan

def is_valid_version_signal(tok, num):
    """
    Keep token only if it is a plausible version signal:
    - For v-patterns: require 1..MAX_VERSION
    - For ".N": already limited to .1..99; also require 1..MAX_VERSION
    - For rc/cc: treat as revision-like; require numeric 1..MAX_VERSION (optional but consistent)
    - For supplement/sa: keep as non-version auxiliary signals (you can change if desired)
    """
    if not isinstance(tok, str):
        return False

    # Always keep these as "auxiliary signals" (not versions)
    if tok == '-supplement' or tok.startswith('.sa'):
        return True

    # ".N" (already .1..99)
    if DOTNUM_RX.fullmatch(tok):
        return (not np.isnan(num)) and (0 <= num <= MAX_VERSION)

    # /vN, .vN, _vN, -vN
    if re.search(r'(^/v|\.v|_v|-v)', tok):
        return (not np.isnan(num)) and (0 <= num <= MAX_VERSION)

    # -rcN / -ccN (revision marker)
    if re.search(r'(rc|cc)\d+', tok):
        return (not np.isnan(num)) and (0 <= num <= MAX_VERSION)

    # If something else matched, keep it (rare)
    return True

# ----------------------------
# Extract DOI + URL tokens
# ----------------------------
data['doi_version_token'] = data.apply(lambda r: extract_token(r['doi_norm'], r['server_name']), axis=1)
data['url_version_token'] = data.apply(lambda r: extract_token(r['url_norm'], r['server_name']), axis=1)

# Prefer DOI token, fallback to URL token
data['version_token'] = data['doi_version_token'].combine_first(data['url_version_token'])

# Numeric version extracted from token when applicable
data['version_num'] = data['version_token'].apply(token_to_number)

# Validate version signals (filter out -v3840, -v8018, etc.)
data['is_valid_version_signal'] = data.apply(
    lambda r: is_valid_version_signal(r['version_token'], r['version_num']),
    axis=1
)

# Final "version signal" table (only rows with a valid signal)
version_signals = data.loc[
    data['version_token'].notna() & data['is_valid_version_signal'],
    ['server_name', 'backend', 'doi', 'landing_page_url', 'version_token', 'version_num']
]

version_signals


Unnamed: 0,server_name,backend,doi,landing_page_url,version_token,version_num
429739,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500000.1,https://essopenarchive.org/doi/full/10.1002/es...,.1,1.0
429742,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500002.1,https://essopenarchive.org/doi/full/10.1002/es...,.1,1.0
429740,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500004.1,https://essopenarchive.org/doi/full/10.1002/es...,.1,1.0
429741,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500007.1,https://essopenarchive.org/doi/full/10.1002/es...,.1,1.0
429746,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500009.1,https://essopenarchive.org/doi/full/10.1002/es...,.1,1.0
...,...,...,...,...,...,...
527518,EconStor Preprints,openalex,,https://hdl.handle.net/10419/269584.2,.2,2.0
527567,EconStor Preprints,openalex,,https://hdl.handle.net/10419/307791.2,.2,2.0
9486019,viXra,openalex,10.6084/m9.figshare.3486131.v1,https://vixra.org/pdf/1404.0231v1.pdf,.v1,1.0
9486738,viXra,openalex,10.6084/m9.figshare.6199046.v1,https://vixra.org/pdf/1010.0060v1.pdf,.v1,1.0


In [23]:
(
    version_signals
    .groupby(['version_token'])
    .size()
    .sort_values()
    .tail(60)
)


version_token
.v14               51
-v7                54
.14                54
.sa5               54
.v13               57
.7                 60
_v7                64
.13                66
.v12               68
.v11               74
-v6                77
.12                78
.11                89
.6                 90
.10               101
.v10              108
/v7               108
_v6               115
.v9               122
-cc3              125
.v8               138
-v5               143
.v7               177
-rc4              230
.5                239
_v5               241
.v6               267
/v6               282
-cc2              335
-v4               361
.v5               368
_v4               600
.v4               790
/v5               895
.4               1095
-cc1             1286
-v3              1411
-rc3             1641
_v3              1821
.v3              2285
.sa4             2817
/v4              3221
-supplement      4473
.3               5568
_v2              6

In [24]:
invalid = data.loc[data['version_token'].notna() & ~data['is_valid_version_signal'],
                   ['server_name', 'version_token', 'doi', 'landing_page_url']]
invalid.groupby(['server_name', 'version_token']).size().sort_values(ascending=False).head(50)


server_name                          version_token
HAL                                  .v2018           4
ChemRxiv                             -v4824           4
HAL                                  .v2019           3
                                     .v2020           3
RePEc: Research Papers in Economics  _v2014           2
Humanities Commons CORE              -v772            2
HAL                                  -v502            2
Cambridge Open Engage                -v7545           2
ChemRxiv                             -v4758           2
                                     -v5951           2
Humanities Commons CORE              -v560            2
ChemRxiv                             -v9841           2
RePEc: Research Papers in Economics  _v2012           2
PsyArXiv                             /v5743           2
Humanities Commons CORE              -v845            1
                                     -v850            1
                                     -v8541          

In [25]:
(
    invalid
    .groupby(['version_token'])
    .size()
    .sort_values()
    .tail(60)
)


version_token
/v471         1
/v4859        1
/v4986        1
/v05030       1
/v110219      1
/v112         1
/v120160      1
/v08067       1
/v15081628    1
/v2738        1
/v26250       1
/v20502       1
/v351         1
/v360         1
/v3645        1
/v2743        1
/v3007        1
/v3264        1
/v34199       1
/v349         1
/v391         1
/v3926        1
/v4105        1
/v3895        1
/v4406        1
/v46576       1
/v4538        1
/v447         1
/v9683        1
/v81905       1
/v8255        1
/v8947        1
/v7895        1
/v693         1
/v7620        1
/v5326        1
/v5347        1
/v546         1
/v6297        1
/v642         1
/v7239        1
/v7346        1
-v203         2
-v502         2
-v9841        2
-v686         2
-v560         2
-v5951        2
-v4758        2
-v806         2
-v772         2
-v7545        2
_v2014        2
/v6849        2
_v2012        2
/v5743        2
.v2020        3
.v2019        3
-v4824        4
.v2018        4
dtype: int64

In [26]:
import pandas as pd
import numpy as np
import re

# ----------------------------
# Inputs
# ----------------------------
df = data.copy()  # or data_clean.copy() if that's your base
GROUP_COL = 'server_name'

# Keep only rows with a valid version signal
sig = df.loc[df['version_token'].notna() & df['is_valid_version_signal']].copy()

# ----------------------------
# Helper: classify token type
# ----------------------------
DOTNUM_RX = re.compile(r'\.(?:[1-9]?\d)$')   # ".1" .. ".99"

def token_type(tok: str) -> str:
    if not isinstance(tok, str) or tok == "":
        return "none"
    tok = tok.lower()
    if tok.startswith("/v"):
        return "/v"
    if tok.startswith("-v"):
        return "-v"
    if tok.startswith(".v"):
        return ".v"
    if tok.startswith("_v"):
        return "_v"
    if DOTNUM_RX.fullmatch(tok):
        return ".N"
    if "rc" in tok:
        return "rc"
    if "cc" in tok:
        return "cc"
    if tok.startswith(".sa"):
        return "sa"
    if tok == "-supplement":
        return "supplement"
    return "other"

sig['token_type'] = sig['version_token'].apply(token_type)

# ----------------------------
# 1) Base counts per server
# ----------------------------
total = df.groupby(GROUP_COL, dropna=False).size().rename('total_records')
with_signal = sig.groupby(GROUP_COL, dropna=False).size().rename('records_with_valid_signal')

profile = pd.concat([total, with_signal], axis=1).fillna(0)
profile['records_with_valid_signal'] = profile['records_with_valid_signal'].astype(int)

profile['signal_coverage_percent'] = (
    profile['records_with_valid_signal'] / profile['total_records'] * 100
).round(2)

# ----------------------------
# 2) Token-type distribution (counts + % within server among signals)
# ----------------------------
type_counts = (
    sig.groupby([GROUP_COL, 'token_type'], dropna=False)
    .size()
    .rename('count')
    .reset_index()
)

type_counts['percent_of_signals'] = (
    type_counts['count'] / type_counts.groupby(GROUP_COL)['count'].transform('sum') * 100
).round(2)

# Pivot to wide for counts
type_counts_wide = (
    type_counts.pivot(index=GROUP_COL, columns='token_type', values='count')
    .fillna(0)
    .astype(int)
)

# Optional: also pivot to wide for percents (rename columns with suffix)
type_perc_wide = (
    type_counts.pivot(index=GROUP_COL, columns='token_type', values='percent_of_signals')
    .fillna(0)
)
type_perc_wide.columns = [f"{c}_pct_of_signals" for c in type_perc_wide.columns]

profile = profile.join(type_counts_wide, how='left').join(type_perc_wide, how='left').fillna(0)

# ----------------------------
# 3) Version number stats (only where numeric exists)
#    (supplement/sa may not have a meaningful numeric "version")
# ----------------------------
sig_num = sig.loc[sig['version_num'].notna()].copy()

num_stats = (
    sig_num.groupby(GROUP_COL)['version_num']
    .agg(
        max_version='max',
        median_version='median',
        p90_version=lambda x: np.nanpercentile(x, 90),
        n_numeric_versions='count'
    )
)

# Round nice
num_stats = num_stats.round({'max_version': 0, 'median_version': 1, 'p90_version': 0})
num_stats['max_version'] = num_stats['max_version'].astype('Int64')
num_stats['p90_version'] = num_stats['p90_version'].astype('Int64')
num_stats['n_numeric_versions'] = num_stats['n_numeric_versions'].astype(int)

profile = profile.join(num_stats, how='left')

# ----------------------------
# 4) Consistency indicators
#    - dominant token type + share among signals
# ----------------------------
dominant = (
    type_counts.sort_values(['server_name', 'count'], ascending=[True, False])
    .groupby(GROUP_COL, as_index=False)
    .first()[[GROUP_COL, 'token_type', 'percent_of_signals']]
    .rename(columns={'token_type': 'dominant_token_type', 'percent_of_signals': 'dominant_type_share_pct'})
)

dominant = dominant.set_index(GROUP_COL)
profile = profile.join(dominant, how='left')

# Fill missing dominant fields for servers with no signals
profile['dominant_token_type'] = profile['dominant_token_type'].fillna('none')
profile['dominant_type_share_pct'] = profile['dominant_type_share_pct'].fillna(0).round(2)

# ----------------------------
# 5) Final sorting
#    - show biggest servers first, or worst coverage first
# ----------------------------
profile_sorted = profile.sort_values(['total_records', 'signal_coverage_percent'], ascending=[False, True])

# Display
profile_sorted


Unnamed: 0_level_0,total_records,records_with_valid_signal,signal_coverage_percent,-v,.N,.v,/v,_v,cc,rc,sa,supplement,-v_pct_of_signals,.N_pct_of_signals,.v_pct_of_signals,/v_pct_of_signals,_v_pct_of_signals,cc_pct_of_signals,rc_pct_of_signals,sa_pct_of_signals,supplement_pct_of_signals,max_version,median_version,p90_version,n_numeric_versions,dominant_token_type,dominant_type_share_pct
server_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
arXiv,2920797,0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,,,,,none,0.00
SSRN,1258958,0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,,,,,none,0.00
HAL,1056424,178,0.02,11.0,0.0,150.0,11.0,2.0,0.0,2.0,1.0,1.0,6.18,0.0,84.27,6.18,1.12,0.00,1.12,0.56,0.56,69,1.0,20,177.0,.v,84.27
Research Square,450818,450818,100.00,0.0,0.0,0.0,450818.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,100.00,0.00,0.00,0.00,0.00,0.00,27,1.0,1,450818.0,/v,100.00
RePEc: Research Papers in Economics,389398,556,0.14,34.0,0.0,299.0,153.0,67.0,1.0,1.0,0.0,1.0,6.12,0.0,53.78,27.52,12.05,0.18,0.18,0.00,0.18,82,1.0,40,555.0,.v,53.78
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Prepublicaciones OpenCiencia,8,8,100.00,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,100.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,12,5.5,11,8.0,.N,100.00
Therapoid,7,0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,,,,,none,0.00
NewAddictionsX,7,5,71.43,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,100.00,0.00,0.00,0.00,0.00,3,2.0,3,5.0,_v,100.00
Bepress Legal Repository,6,0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,,,,,none,0.00


In [27]:
profile_sorted.loc[profile_sorted['dominant_type_share_pct'] < 70,
                   ['total_records','signal_coverage_percent','dominant_token_type','dominant_type_share_pct']].head(50)


Unnamed: 0_level_0,total_records,signal_coverage_percent,dominant_token_type,dominant_type_share_pct
server_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
arXiv,2920797,0.0,none,0.0
SSRN,1258958,0.0,none,0.0
RePEc: Research Papers in Economics,389398,0.14,.v,53.78
bioRxiv,306948,0.0,none,0.0
AgEcon Search,188173,0.0,none,0.0
ResearchGate,181231,0.0,none,0.0
Zenodo,166786,0.0,none,0.0
medRxiv,75743,0.0,none,0.0
Munich Personal RePEc Archive,68692,0.04,.v,68.0
ChemRxiv,46475,40.66,.v,60.49


In [28]:
server = "Humanities Commons CORE"
(sig.loc[sig['server_name'].eq(server)]
   .groupby(['token_type','version_token'])
   .size()
   .sort_values(ascending=False)
   .head(50)
)


token_type  version_token
rc          -rc07            3
cc          -cc46            2
-v          -v011            1
            -v015            1
            -v037            1
            -v044            1
            -v046            1
            -v049            1
            -v051            1
            -v075            1
            -v019            1
            -v020            1
            -v091            1
            -v088            1
/v          /v039            1
-v          -v095            1
cc          -cc06            1
            -cc063           1
            -cc12            1
-v          -v096            1
cc          -cc15            1
            -cc26            1
            -cc30            1
            -cc44            1
            -cc48            1
            -cc51            1
            -cc55            1
            -cc02            1
            -cc59            1
            -cc67            1
            -cc72            1
            -

In [29]:
server = "Open Science Framework"
(sig.loc[sig['server_name'].eq(server)]
   .groupby(['token_type','version_token'])
   .size()
   .sort_values(ascending=False)
   # .head(50)
)


token_type  version_token
_v          _v1              78355
            _v2               1451
            _v3                467
            _v4                174
            _v5                 63
            _v6                 33
            _v8                 20
            _v7                 18
            _v9                 10
            _v10                 6
            _v11                 5
            _v16                 4
            _v13                 4
            _v19                 3
            _v14                 3
            _v18                 3
            _v17                 3
            _v12                 3
            _v21                 3
            _v15                 2
            _v23                 2
            _v22                 2
            _v25                 2
            _v24                 2
            _v20                 2
            _v26                 1
            _v27                 1
            _v28             

In [30]:
PREFER_URL_SERVERS = {'Open Science Framework'}

data['version_token'] = np.where(
    data['server_name'].isin(PREFER_URL_SERVERS),
    data['url_version_token'],
    data['doi_version_token'].combine_first(data['url_version_token'])
)


In [31]:
server = "Open Science Framework"
(sig.loc[sig['server_name'].eq(server)]
   .groupby(['token_type','version_token'])
   .size()
   .sort_values(ascending=False)
   # .head(50)
)


token_type  version_token
_v          _v1              78355
            _v2               1451
            _v3                467
            _v4                174
            _v5                 63
            _v6                 33
            _v8                 20
            _v7                 18
            _v9                 10
            _v10                 6
            _v11                 5
            _v16                 4
            _v13                 4
            _v19                 3
            _v14                 3
            _v18                 3
            _v17                 3
            _v12                 3
            _v21                 3
            _v15                 2
            _v23                 2
            _v22                 2
            _v25                 2
            _v24                 2
            _v20                 2
            _v26                 1
            _v27                 1
            _v28             

In [32]:
doi = "10.31219/osf.io/v284p"
url = "https://osf.io/v284p_v1"

print(VERSION_TOKEN_RX.search(doi.lower()))  # should be None
print(VERSION_TOKEN_RX.search(url.lower()).group('token'))  # should be _v1


None
_v1


In [33]:
data

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year,doi_norm,url_norm,doi_version_token,url_version_token,version_token,version_num,is_valid_version_signal
429739,crossref::10.1002/essoar.10500000.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500000.1,https://doi.org/10.1002/essoar.10500000.1,https://essopenarchive.org/doi/full/10.1002/es...,,,,,,,,2018-05-26,2018-05-26,,2018-05-26,2022-12-05,2025-02-21,2019-11-13,,,2018.0,10.1002/essoar.10500000.1,https://essopenarchive.org/doi/full/10.1002/es...,.1,.1,.1,1.0,True
429742,crossref::10.1002/essoar.10500002.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500002.1,https://doi.org/10.1002/essoar.10500002.1,https://essopenarchive.org/doi/full/10.1002/es...,,,,,,,,2018-05-31,2018-05-31,,2018-05-31,2022-12-05,2025-02-21,2019-11-13,,,2018.0,10.1002/essoar.10500002.1,https://essopenarchive.org/doi/full/10.1002/es...,.1,.1,.1,1.0,True
429740,crossref::10.1002/essoar.10500004.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500004.1,https://doi.org/10.1002/essoar.10500004.1,https://essopenarchive.org/doi/full/10.1002/es...,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,,10.1029/2017sw001789;10.1029/2018sw001995,,2018-05-30,2018-05-30,,2018-05-30,2022-12-05,2025-02-21,2019-11-13,,,2018.0,10.1002/essoar.10500004.1,https://essopenarchive.org/doi/full/10.1002/es...,.1,.1,.1,1.0,True
429741,crossref::10.1002/essoar.10500007.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500007.1,https://doi.org/10.1002/essoar.10500007.1,https://essopenarchive.org/doi/full/10.1002/es...,,,,,,,,2018-08-15,2018-08-15,,2018-08-15,2022-12-05,2025-02-21,2019-11-13,,,2018.0,10.1002/essoar.10500007.1,https://essopenarchive.org/doi/full/10.1002/es...,.1,.1,.1,1.0,True
429746,crossref::10.1002/essoar.10500009.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500009.1,https://doi.org/10.1002/essoar.10500009.1,https://essopenarchive.org/doi/full/10.1002/es...,,,,,,,,2018-08-15,2018-08-15,,2018-08-15,2022-12-05,2025-09-29,2019-12-03,,,2018.0,10.1002/essoar.10500009.1,https://essopenarchive.org/doi/full/10.1002/es...,.1,.1,.1,1.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2806404,openalex::W999921877,RePEc: Research Papers in Economics,openalex,,,https://steconomice.uoradea.ro/anale/volume/20...,,,,,,,,,2008-01-01,,,,,2025-10-10T00:00:00,,2025-10-10T17:16:08.811792,2008.0,none,https://steconomice.uoradea.ro/anale/volume/20...,,/v1,/v1,1.0,True
1016201,openalex::W999947037,HAL,openalex,,,https://hal.science/hal-01922484,,,,,,,,,2014-01-01,,,,,2025-10-10T00:00:00,,2025-11-06T04:12:42.849631,2014.0,none,https://hal.science/hal-01922484,,,,,False
2713111,openalex::W999974616,RePEc: Research Papers in Economics,openalex,,,https://EconPapers.repec.org/RePEc:cde:cdewps:19,,,,,,,,,1994-08-01,,,,,2025-10-10T00:00:00,,2025-10-10T17:16:08.811792,1994.0,none,https://econpapers.repec.org/repec:cde:cdewps:19,,,,,False
1015101,openalex::W999989114,HAL,openalex,,,https://hal.science/hal-01268467,,,,,,,,,2012-07-01,,,,,2025-10-10T00:00:00,,2025-11-06T04:12:42.849631,2012.0,none,https://hal.science/hal-01268467,,,,,False


In [34]:
# pattern = "/v284"

# mask = data['landing_page_url'].str.contains(pattern, regex=False, na=False)
# result = data[mask]
# result

### Classify patterns (semantic meaning)

In [35]:
import re
import numpy as np
import pandas as pd

df = data.copy()   # full dataset, with record_id

DOTNUM_RX = re.compile(r'^\.(?:[1-9]?\d)$')          # .1..99
VN_RX     = re.compile(r'^(?:/v|_v|\.v|-v)(\d+)$')   # /v1, _v1, .v1, -v1
RC_RX     = re.compile(r'^-rc(\d+)$')
CC_RX     = re.compile(r'^-cc(\d+)$')
SA_RX     = re.compile(r'^\.(sa)(\d+)$')             # .sa0, .sa1...

def classify_token(tok: str):
    if not isinstance(tok, str) or tok == "":
        return ("none", np.nan)

    tok = tok.lower().strip()

    # Non-version components
    if tok == "-supplement":
        return ("non_version_component", np.nan)
    if SA_RX.match(tok):
        return ("non_version_component", np.nan)

    # Revision markers
    m = RC_RX.match(tok)
    if m:
        return ("revision_marker", float(m.group(1)))
    m = CC_RX.match(tok)
    if m:
        return ("revision_marker", float(m.group(1)))

    # Canonical version tokens
    m = VN_RX.match(tok)
    if m:
        return ("version", float(m.group(1)))

    # Dot numeric versions (server-dependent)
    if DOTNUM_RX.match(tok):
        return ("dot_numeric_version", float(tok[1:]))

    return ("other", np.nan)

# Ensure version_token exists (if not, use your earlier construction)
# If you already have version_token, keep it. Otherwise:
if 'version_token' not in df.columns:
    df['version_token'] = df.get('doi_version_token', pd.Series(index=df.index, dtype=object)).combine_first(
        df.get('url_version_token', pd.Series(index=df.index, dtype=object))
    )

tmp = df['version_token'].apply(classify_token)
df['signal_class'] = tmp.apply(lambda x: x[0])
df['signal_num']   = tmp.apply(lambda x: x[1])

# Conservative default: only canonical "version" tokens used for ordering
# (You can later add dot_numeric_version for specific servers)
# DOTNUM_ALLOWED_SERVERS = set()  # fill later if desired
# # DOTNUM_ALLOWED_SERVERS = {
# #     "Wellcome Open Research",
# #     "UCL Open Environment",
# #     # add others after inspection
# # }
# df['use_for_versioning'] = (
#     (df['signal_class'] == 'version') |
#     ((df['signal_class'] == 'dot_numeric_version') & df['server_name'].isin(DOTNUM_ALLOWED_SERVERS))
# )


In [36]:
data.shape

(8410094, 30)

In [37]:
df

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year,doi_norm,url_norm,doi_version_token,url_version_token,version_token,version_num,is_valid_version_signal,signal_class,signal_num
429739,crossref::10.1002/essoar.10500000.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500000.1,https://doi.org/10.1002/essoar.10500000.1,https://essopenarchive.org/doi/full/10.1002/es...,,,,,,,,2018-05-26,2018-05-26,,2018-05-26,2022-12-05,2025-02-21,2019-11-13,,,2018.0,10.1002/essoar.10500000.1,https://essopenarchive.org/doi/full/10.1002/es...,.1,.1,.1,1.0,True,dot_numeric_version,1.0
429742,crossref::10.1002/essoar.10500002.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500002.1,https://doi.org/10.1002/essoar.10500002.1,https://essopenarchive.org/doi/full/10.1002/es...,,,,,,,,2018-05-31,2018-05-31,,2018-05-31,2022-12-05,2025-02-21,2019-11-13,,,2018.0,10.1002/essoar.10500002.1,https://essopenarchive.org/doi/full/10.1002/es...,.1,.1,.1,1.0,True,dot_numeric_version,1.0
429740,crossref::10.1002/essoar.10500004.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500004.1,https://doi.org/10.1002/essoar.10500004.1,https://essopenarchive.org/doi/full/10.1002/es...,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,,10.1029/2017sw001789;10.1029/2018sw001995,,2018-05-30,2018-05-30,,2018-05-30,2022-12-05,2025-02-21,2019-11-13,,,2018.0,10.1002/essoar.10500004.1,https://essopenarchive.org/doi/full/10.1002/es...,.1,.1,.1,1.0,True,dot_numeric_version,1.0
429741,crossref::10.1002/essoar.10500007.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500007.1,https://doi.org/10.1002/essoar.10500007.1,https://essopenarchive.org/doi/full/10.1002/es...,,,,,,,,2018-08-15,2018-08-15,,2018-08-15,2022-12-05,2025-02-21,2019-11-13,,,2018.0,10.1002/essoar.10500007.1,https://essopenarchive.org/doi/full/10.1002/es...,.1,.1,.1,1.0,True,dot_numeric_version,1.0
429746,crossref::10.1002/essoar.10500009.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500009.1,https://doi.org/10.1002/essoar.10500009.1,https://essopenarchive.org/doi/full/10.1002/es...,,,,,,,,2018-08-15,2018-08-15,,2018-08-15,2022-12-05,2025-09-29,2019-12-03,,,2018.0,10.1002/essoar.10500009.1,https://essopenarchive.org/doi/full/10.1002/es...,.1,.1,.1,1.0,True,dot_numeric_version,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2806404,openalex::W999921877,RePEc: Research Papers in Economics,openalex,,,https://steconomice.uoradea.ro/anale/volume/20...,,,,,,,,,2008-01-01,,,,,2025-10-10T00:00:00,,2025-10-10T17:16:08.811792,2008.0,none,https://steconomice.uoradea.ro/anale/volume/20...,,/v1,/v1,1.0,True,version,1.0
1016201,openalex::W999947037,HAL,openalex,,,https://hal.science/hal-01922484,,,,,,,,,2014-01-01,,,,,2025-10-10T00:00:00,,2025-11-06T04:12:42.849631,2014.0,none,https://hal.science/hal-01922484,,,,,False,none,
2713111,openalex::W999974616,RePEc: Research Papers in Economics,openalex,,,https://EconPapers.repec.org/RePEc:cde:cdewps:19,,,,,,,,,1994-08-01,,,,,2025-10-10T00:00:00,,2025-10-10T17:16:08.811792,1994.0,none,https://econpapers.repec.org/repec:cde:cdewps:19,,,,,False,none,
1015101,openalex::W999989114,HAL,openalex,,,https://hal.science/hal-01268467,,,,,,,,,2012-07-01,,,,,2025-10-10T00:00:00,,2025-11-06T04:12:42.849631,2012.0,none,https://hal.science/hal-01268467,,,,,False,none,


In [38]:
out = (
    df.groupby(['server_name', 'signal_class'], dropna=False)
       .size()
       .rename('count')
       .reset_index()
)

out['pct'] = (
    out['count'] / out.groupby('server_name')['count'].transform('sum') * 100
).round(2)

out = out.sort_values(['count'], ascending=False)

out.head(60)


Unnamed: 0,server_name,signal_class,count,pct
191,arXiv,none,2920797,100.0
160,SSRN,none,1258958,100.0
79,HAL,none,1056212,99.98
154,Research Square,version,450818,100.0
151,RePEc: Research Papers in Economics,none,388823,99.85
192,bioRxiv,none,306948,100.0
11,AgEcon Search,none,188173,100.0
155,ResearchGate,none,181231,100.0
195,eLife,none,176806,71.42
148,Qeios,none,167713,97.33


### Parent vs child inference logic

In [39]:
import re
import numpy as np
import pandas as pd

df = df.copy()  # your working dataframe

# ------------------------------------------------------------
# 0) Preconditions:
# df should already have:
# - server_name
# - record_id
# - doi (may be missing)
# - landing_page_url (may be missing)
# - version_token, signal_class, signal_num, use_for_versioning
# If not, we can rebuild them, but assuming you already did.
# ------------------------------------------------------------

# ------------------------------------------------------------
# 1) Build "root" identifiers for grouping
#    - Remove version-like suffixes from DOI/URL to get a stable root.
#    - IMPORTANT: use your tightened patterns to avoid OSF/v50i7 false positives.
# ------------------------------------------------------------

# Remove common version suffixes (safe-ish)
DOI_ROOT_RX = re.compile(r'(/v\d+|\.v\d+|_v\d+|-v\d+|\.(?:[1-9]?\d))$', re.IGNORECASE)

def make_doi_root(doi):
    if not isinstance(doi, str) or doi.strip() == "" or doi.lower() == "nan":
        return None
    d = doi.strip().lower()
    # strip only ONE suffix at the end
    return DOI_ROOT_RX.sub('', d)

URL_ROOT_RX = re.compile(r'(/v\d+|\.v\d+|_v\d+|-v\d+|\.(?:[1-9]?\d))(?=($|[?#/]))', re.IGNORECASE)

def make_url_root(url):
    if not isinstance(url, str) or url.strip() == "" or url.lower() == "nan":
        return None
    u = url.strip().lower()
    # strip version-like chunk near the end (before query/hash)
    # then also remove query/hash for stability
    u = re.sub(r'[#?].*$', '', u)
    return URL_ROOT_RX.sub('', u)

df['doi_root_for_group'] = df['doi'].apply(make_doi_root)
df['url_root_for_group'] = df['landing_page_url'].apply(make_url_root)

# ------------------------------------------------------------
# 2) Define group_id (best available root)
# ------------------------------------------------------------
df['group_id'] = df['doi_root_for_group'].combine_first(df['url_root_for_group'])

# If still missing, fallback to record_id without backend prefix (optional)
# Example record_id: "crossref::10.31219/osf.io/v268w"
df['record_id_root'] = (
    df['record_id'].astype(str)
      .str.lower()
      .str.replace(r'^[a-z0-9_]+::', '', regex=True)  # drop "crossref::", "openalex::" etc
)

df['group_id'] = df['group_id'].fillna(df['record_id_root'])

# ------------------------------------------------------------
# 3) Decide which rows participate in version ordering
#    - use_for_versioning True means we trust signal_num to order versions
# ------------------------------------------------------------
# Ensure use_for_versioning exists; if not, create a conservative default:
# if 'use_for_versioning' not in df.columns:
df['use_for_versioning'] = df['signal_class'].isin([
    'version',
    'dot_numeric_version'
])

# Version number used for ordering (only when trusted)
df['version_order_num'] = np.where(
    df['use_for_versioning'],
    df['signal_num'],
    np.nan
)

# ------------------------------------------------------------
# 4) Rank within each (server_name, group_id)
#    - We include server_name because "roots" can collide across servers.
#    - Parent = smallest numeric version
# ------------------------------------------------------------
group_cols = ['server_name', 'group_id']

# Only rows with numeric version participate in ranking
has_num = df['version_order_num'].notna()

df['version_rank'] = np.nan
df.loc[has_num, 'version_rank'] = (
    df.loc[has_num]
      .groupby(group_cols)['version_order_num']
      .rank(method='dense', ascending=True)
)

# Parent/child flags (only meaningful when rank exists and group has >1 version)
df['group_size_with_num'] = (
    df.loc[has_num]
      .groupby(group_cols)['version_order_num']
      .transform('count')
)

df['is_parent_version'] = (df['version_rank'] == 1)
    # & (df['group_size_with_num'] > 1)
df['is_child_version']  = (df['version_rank'] > 1) & (df['group_size_with_num'] > 1)

# ------------------------------------------------------------
# 5) Optional: attach parent_record_id to each child
# ------------------------------------------------------------
parent_map = (
    df.loc[df['is_parent_version'], group_cols + ['record_id']]
      .drop_duplicates(subset=group_cols)
      .rename(columns={'record_id': 'parent_record_id'})
)

df = df.merge(parent_map, on=group_cols, how='left')

# parent_record_id is filled for children (and also for parent itself; you can blank it out if you want)
df.loc[df['is_parent_version'], 'parent_record_id'] = np.nan

# Done. Your updated dataframe:
data_with_parent_child = df


In [52]:
import pandas as pd
import numpy as np
import json

# ============================================================
# Parent/Child + children list preview (parents-only) + relatives counts
# Robust to re-running (drops old columns before merge to avoid _x/_y)
#
# REQUIRED columns in df:
#   - record_id
#   - server_name
#   - group_id
#   - version_rank  (1=parent, >1=child, NaN=not versioned)
#   - is_parent_version, is_child_version
# ============================================================

# ---------- choose the dataframe to work on ----------
df = data_with_parent_child.copy()   # <- use your output from the first stage

# ---------- Settings ----------
K_CHILD_PREVIEW = 20   # store only first K child ids per parent (set None to store all)
MAKE_JSON = True

group_cols = ['server_name', 'group_id']

# ---------- Safety: ensure required columns exist ----------
required = ['record_id', 'server_name', 'group_id', 'version_rank', 'is_parent_version', 'is_child_version']
missing = [c for c in required if c not in df.columns]
if missing:
    raise KeyError(f"Missing required columns: {missing}")

# ---------- IMPORTANT: drop old merged columns to avoid suffixes (_x/_y) ----------
cols_to_reset = [
    'parent_record_id',
    'child_record_ids_for_parent',
    'child_record_ids_json',
    'n_versioned_records',
    'n_children',
    'n_children_for_parent',
    'n_relatives',
]
df = df.drop(columns=[c for c in cols_to_reset if c in df.columns], errors='ignore')

# ---------- Build group-level info (only among versioned rows) ----------
# Parent per group: version_rank == 1
parent_per_group = (
    df.loc[df['version_rank'] == 1, group_cols + ['record_id']]
      .drop_duplicates(subset=group_cols)
      .rename(columns={'record_id': 'parent_record_id'})
)

# Children per group: version_rank > 1
children_per_group = (
    df.loc[df['version_rank'] > 1, group_cols + ['record_id']]
      .groupby(group_cols, dropna=False)['record_id']
      .apply(list)
      .rename('child_record_ids_full')
      .reset_index()
)

# Count versioned records per group: version_rank notna
counts_per_group = (
    df.loc[df['version_rank'].notna(), group_cols + ['record_id']]
      .groupby(group_cols, dropna=False)['record_id']
      .size()
      .rename('n_versioned_records')
      .reset_index()
)

group_info = (
    parent_per_group
      .merge(children_per_group, on=group_cols, how='left')
      .merge(counts_per_group, on=group_cols, how='left')
)

# Normalize
group_info['child_record_ids_full'] = group_info['child_record_ids_full'].apply(lambda x: x if isinstance(x, list) else [])
group_info['n_versioned_records'] = group_info['n_versioned_records'].fillna(0).astype(int)
group_info['n_children'] = group_info['child_record_ids_full'].apply(len).astype(int)

def preview_list(lst, k):
    if not isinstance(lst, list):
        return []
    if k is None:
        return lst
    return lst[:k]

group_info['child_record_ids_preview'] = group_info['child_record_ids_full'].apply(lambda x: preview_list(x, K_CHILD_PREVIEW))

# Keep only needed columns for merge (lighter)
group_info = group_info[group_cols + ['parent_record_id', 'n_versioned_records', 'n_children', 'child_record_ids_preview']]

# ---------- Merge back ----------
df = df.merge(group_info, on=group_cols, how='left')

# ---------- Parents-only children lists ----------
parent_mask = df['is_parent_version'].fillna(False)

df['child_record_ids_for_parent'] = None
df.loc[parent_mask, 'child_record_ids_for_parent'] = df.loc[parent_mask, 'child_record_ids_preview']

df['n_children_for_parent'] = 0
df.loc[parent_mask, 'n_children_for_parent'] = df.loc[parent_mask, 'n_children'].fillna(0).astype(int)

# ---------- Relatives count for everyone ----------
df['n_versioned_records'] = df['n_versioned_records'].fillna(0).astype(int)
df['n_relatives'] = (df['n_versioned_records'] - 1).clip(lower=0)

# ---------- Optional JSON (parents only) ----------
if MAKE_JSON:
    df['child_record_ids_json'] = None
    df.loc[parent_mask, 'child_record_ids_json'] = df.loc[parent_mask, 'child_record_ids_for_parent'].map(json.dumps)

# ---------- Optional: drop preview helper if you don't want it ----------
# df = df.drop(columns=['child_record_ids_preview'], errors='ignore')

df


Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year,doi_norm,url_norm,doi_version_token,url_version_token,version_token,version_num,is_valid_version_signal,signal_class,signal_num,doi_root_for_group,url_root_for_group,group_id,record_id_root,use_for_versioning,version_order_num,version_rank,group_size_with_num,is_parent_version,is_child_version,parent_record_id,n_versioned_records,n_children,child_record_ids_preview,child_record_ids_for_parent,n_children_for_parent,n_relatives,child_record_ids_json
0,crossref::10.1002/essoar.10500000.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500000.1,https://doi.org/10.1002/essoar.10500000.1,https://essopenarchive.org/doi/full/10.1002/es...,,,,,,,,2018-05-26,2018-05-26,,2018-05-26,2022-12-05,2025-02-21,2019-11-13,,,2018.0,10.1002/essoar.10500000.1,https://essopenarchive.org/doi/full/10.1002/es...,.1,.1,.1,1.0,True,dot_numeric_version,1.0,10.1002/essoar.10500000,https://essopenarchive.org/doi/full/10.1002/es...,10.1002/essoar.10500000,10.1002/essoar.10500000.1,True,1.0,1.0,1.0,True,False,crossref::10.1002/essoar.10500000.1,1,0.0,[],[],0,0,[]
1,crossref::10.1002/essoar.10500002.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500002.1,https://doi.org/10.1002/essoar.10500002.1,https://essopenarchive.org/doi/full/10.1002/es...,,,,,,,,2018-05-31,2018-05-31,,2018-05-31,2022-12-05,2025-02-21,2019-11-13,,,2018.0,10.1002/essoar.10500002.1,https://essopenarchive.org/doi/full/10.1002/es...,.1,.1,.1,1.0,True,dot_numeric_version,1.0,10.1002/essoar.10500002,https://essopenarchive.org/doi/full/10.1002/es...,10.1002/essoar.10500002,10.1002/essoar.10500002.1,True,1.0,1.0,1.0,True,False,crossref::10.1002/essoar.10500002.1,1,0.0,[],[],0,0,[]
2,crossref::10.1002/essoar.10500004.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500004.1,https://doi.org/10.1002/essoar.10500004.1,https://essopenarchive.org/doi/full/10.1002/es...,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,,10.1029/2017sw001789;10.1029/2018sw001995,,2018-05-30,2018-05-30,,2018-05-30,2022-12-05,2025-02-21,2019-11-13,,,2018.0,10.1002/essoar.10500004.1,https://essopenarchive.org/doi/full/10.1002/es...,.1,.1,.1,1.0,True,dot_numeric_version,1.0,10.1002/essoar.10500004,https://essopenarchive.org/doi/full/10.1002/es...,10.1002/essoar.10500004,10.1002/essoar.10500004.1,True,1.0,1.0,1.0,True,False,crossref::10.1002/essoar.10500004.1,1,0.0,[],[],0,0,[]
3,crossref::10.1002/essoar.10500007.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500007.1,https://doi.org/10.1002/essoar.10500007.1,https://essopenarchive.org/doi/full/10.1002/es...,,,,,,,,2018-08-15,2018-08-15,,2018-08-15,2022-12-05,2025-02-21,2019-11-13,,,2018.0,10.1002/essoar.10500007.1,https://essopenarchive.org/doi/full/10.1002/es...,.1,.1,.1,1.0,True,dot_numeric_version,1.0,10.1002/essoar.10500007,https://essopenarchive.org/doi/full/10.1002/es...,10.1002/essoar.10500007,10.1002/essoar.10500007.1,True,1.0,1.0,1.0,True,False,crossref::10.1002/essoar.10500007.1,1,0.0,[],[],0,0,[]
4,crossref::10.1002/essoar.10500009.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500009.1,https://doi.org/10.1002/essoar.10500009.1,https://essopenarchive.org/doi/full/10.1002/es...,,,,,,,,2018-08-15,2018-08-15,,2018-08-15,2022-12-05,2025-09-29,2019-12-03,,,2018.0,10.1002/essoar.10500009.1,https://essopenarchive.org/doi/full/10.1002/es...,.1,.1,.1,1.0,True,dot_numeric_version,1.0,10.1002/essoar.10500009,https://essopenarchive.org/doi/full/10.1002/es...,10.1002/essoar.10500009,10.1002/essoar.10500009.1,True,1.0,1.0,1.0,True,False,crossref::10.1002/essoar.10500009.1,1,0.0,[],[],0,0,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8410089,openalex::W999921877,RePEc: Research Papers in Economics,openalex,,,https://steconomice.uoradea.ro/anale/volume/20...,,,,,,,,,2008-01-01,,,,,2025-10-10T00:00:00,,2025-10-10T17:16:08.811792,2008.0,none,https://steconomice.uoradea.ro/anale/volume/20...,,/v1,/v1,1.0,True,version,1.0,,https://steconomice.uoradea.ro/anale/volume/20...,https://steconomice.uoradea.ro/anale/volume/20...,w999921877,True,1.0,1.0,1.0,True,False,openalex::W999921877,1,0.0,[],[],0,0,[]
8410090,openalex::W999947037,HAL,openalex,,,https://hal.science/hal-01922484,,,,,,,,,2014-01-01,,,,,2025-10-10T00:00:00,,2025-11-06T04:12:42.849631,2014.0,none,https://hal.science/hal-01922484,,,,,False,none,,,https://hal.science/hal-01922484,https://hal.science/hal-01922484,w999947037,False,,,,False,False,,0,,,,0,0,
8410091,openalex::W999974616,RePEc: Research Papers in Economics,openalex,,,https://EconPapers.repec.org/RePEc:cde:cdewps:19,,,,,,,,,1994-08-01,,,,,2025-10-10T00:00:00,,2025-10-10T17:16:08.811792,1994.0,none,https://econpapers.repec.org/repec:cde:cdewps:19,,,,,False,none,,,https://econpapers.repec.org/repec:cde:cdewps:19,https://econpapers.repec.org/repec:cde:cdewps:19,w999974616,False,,,,False,False,,0,,,,0,0,
8410092,openalex::W999989114,HAL,openalex,,,https://hal.science/hal-01268467,,,,,,,,,2012-07-01,,,,,2025-10-10T00:00:00,,2025-11-06T04:12:42.849631,2012.0,none,https://hal.science/hal-01268467,,,,,False,none,,,https://hal.science/hal-01268467,https://hal.science/hal-01268467,w999989114,False,,,,False,False,,0,,,,0,0,


In [40]:
data_with_parent_child

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year,doi_norm,url_norm,doi_version_token,url_version_token,version_token,version_num,is_valid_version_signal,signal_class,signal_num,doi_root_for_group,url_root_for_group,group_id,record_id_root,use_for_versioning,version_order_num,version_rank,group_size_with_num,is_parent_version,is_child_version,parent_record_id
0,crossref::10.1002/essoar.10500000.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500000.1,https://doi.org/10.1002/essoar.10500000.1,https://essopenarchive.org/doi/full/10.1002/es...,,,,,,,,2018-05-26,2018-05-26,,2018-05-26,2022-12-05,2025-02-21,2019-11-13,,,2018.0,10.1002/essoar.10500000.1,https://essopenarchive.org/doi/full/10.1002/es...,.1,.1,.1,1.0,True,dot_numeric_version,1.0,10.1002/essoar.10500000,https://essopenarchive.org/doi/full/10.1002/es...,10.1002/essoar.10500000,10.1002/essoar.10500000.1,True,1.0,1.0,1.0,True,False,
1,crossref::10.1002/essoar.10500002.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500002.1,https://doi.org/10.1002/essoar.10500002.1,https://essopenarchive.org/doi/full/10.1002/es...,,,,,,,,2018-05-31,2018-05-31,,2018-05-31,2022-12-05,2025-02-21,2019-11-13,,,2018.0,10.1002/essoar.10500002.1,https://essopenarchive.org/doi/full/10.1002/es...,.1,.1,.1,1.0,True,dot_numeric_version,1.0,10.1002/essoar.10500002,https://essopenarchive.org/doi/full/10.1002/es...,10.1002/essoar.10500002,10.1002/essoar.10500002.1,True,1.0,1.0,1.0,True,False,
2,crossref::10.1002/essoar.10500004.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500004.1,https://doi.org/10.1002/essoar.10500004.1,https://essopenarchive.org/doi/full/10.1002/es...,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,,10.1029/2017sw001789;10.1029/2018sw001995,,2018-05-30,2018-05-30,,2018-05-30,2022-12-05,2025-02-21,2019-11-13,,,2018.0,10.1002/essoar.10500004.1,https://essopenarchive.org/doi/full/10.1002/es...,.1,.1,.1,1.0,True,dot_numeric_version,1.0,10.1002/essoar.10500004,https://essopenarchive.org/doi/full/10.1002/es...,10.1002/essoar.10500004,10.1002/essoar.10500004.1,True,1.0,1.0,1.0,True,False,
3,crossref::10.1002/essoar.10500007.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500007.1,https://doi.org/10.1002/essoar.10500007.1,https://essopenarchive.org/doi/full/10.1002/es...,,,,,,,,2018-08-15,2018-08-15,,2018-08-15,2022-12-05,2025-02-21,2019-11-13,,,2018.0,10.1002/essoar.10500007.1,https://essopenarchive.org/doi/full/10.1002/es...,.1,.1,.1,1.0,True,dot_numeric_version,1.0,10.1002/essoar.10500007,https://essopenarchive.org/doi/full/10.1002/es...,10.1002/essoar.10500007,10.1002/essoar.10500007.1,True,1.0,1.0,1.0,True,False,
4,crossref::10.1002/essoar.10500009.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500009.1,https://doi.org/10.1002/essoar.10500009.1,https://essopenarchive.org/doi/full/10.1002/es...,,,,,,,,2018-08-15,2018-08-15,,2018-08-15,2022-12-05,2025-09-29,2019-12-03,,,2018.0,10.1002/essoar.10500009.1,https://essopenarchive.org/doi/full/10.1002/es...,.1,.1,.1,1.0,True,dot_numeric_version,1.0,10.1002/essoar.10500009,https://essopenarchive.org/doi/full/10.1002/es...,10.1002/essoar.10500009,10.1002/essoar.10500009.1,True,1.0,1.0,1.0,True,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8410089,openalex::W999921877,RePEc: Research Papers in Economics,openalex,,,https://steconomice.uoradea.ro/anale/volume/20...,,,,,,,,,2008-01-01,,,,,2025-10-10T00:00:00,,2025-10-10T17:16:08.811792,2008.0,none,https://steconomice.uoradea.ro/anale/volume/20...,,/v1,/v1,1.0,True,version,1.0,,https://steconomice.uoradea.ro/anale/volume/20...,https://steconomice.uoradea.ro/anale/volume/20...,w999921877,True,1.0,1.0,1.0,True,False,
8410090,openalex::W999947037,HAL,openalex,,,https://hal.science/hal-01922484,,,,,,,,,2014-01-01,,,,,2025-10-10T00:00:00,,2025-11-06T04:12:42.849631,2014.0,none,https://hal.science/hal-01922484,,,,,False,none,,,https://hal.science/hal-01922484,https://hal.science/hal-01922484,w999947037,False,,,,False,False,
8410091,openalex::W999974616,RePEc: Research Papers in Economics,openalex,,,https://EconPapers.repec.org/RePEc:cde:cdewps:19,,,,,,,,,1994-08-01,,,,,2025-10-10T00:00:00,,2025-10-10T17:16:08.811792,1994.0,none,https://econpapers.repec.org/repec:cde:cdewps:19,,,,,False,none,,,https://econpapers.repec.org/repec:cde:cdewps:19,https://econpapers.repec.org/repec:cde:cdewps:19,w999974616,False,,,,False,False,
8410092,openalex::W999989114,HAL,openalex,,,https://hal.science/hal-01268467,,,,,,,,,2012-07-01,,,,,2025-10-10T00:00:00,,2025-11-06T04:12:42.849631,2012.0,none,https://hal.science/hal-01268467,,,,,False,none,,,https://hal.science/hal-01268467,https://hal.science/hal-01268467,w999989114,False,,,,False,False,


In [43]:
data_with_parent_child['group_size_with_num'].value_counts()

group_size_with_num
1.0     763237
2.0      84872
3.0      38946
4.0      15300
5.0       4950
6.0       1962
7.0       1092
8.0        560
9.0        450
11.0       330
10.0       310
12.0       252
17.0       204
14.0       182
21.0       168
20.0       140
15.0       135
13.0       130
26.0       104
99.0        99
98.0        98
16.0        96
23.0        92
18.0        90
32.0        64
64.0        64
19.0        57
55.0        55
27.0        54
52.0        52
25.0        50
22.0        44
42.0        42
38.0        38
37.0        37
33.0        33
24.0        24
Name: count, dtype: int64

In [54]:
df[df['group_size_with_num']==98]

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year,doi_norm,url_norm,doi_version_token,url_version_token,version_token,version_num,is_valid_version_signal,signal_class,signal_num,doi_root_for_group,url_root_for_group,group_id,record_id_root,use_for_versioning,version_order_num,version_rank,group_size_with_num,is_parent_version,is_child_version,parent_record_id,n_versioned_records,n_children,child_record_ids_preview,child_record_ids_for_parent,n_children_for_parent,n_relatives,child_record_ids_json
475247,crossref::10.20944/preprints202302.0051.v1,Preprints.org,crossref,10.20944/preprints202302.0051.v1,https://doi.org/10.20944/preprints202302.0051.v1,https://www.preprints.org/manuscript/202302.00...,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.20944/preprints202302.0051.v2;10.20944/prep...,,,,2023-02-03,2023-02-03,,2023-02-03,2025-08-11,2025-11-23,2023-02-03,,,2023.0,10.20944/preprints202302.0051.v1,https://www.preprints.org/manuscript/202302.00...,.v1,/v1,.v1,1.0,True,version,1.0,10.20944/preprints202302.0051,https://www.preprints.org/manuscript/202302.0051,10.20944/preprints202302.0051,10.20944/preprints202302.0051.v1,True,1.0,1.0,98.0,True,False,crossref::10.20944/preprints202302.0051.v1,98,97.0,"[crossref::10.20944/preprints202302.0051.v10, ...","[crossref::10.20944/preprints202302.0051.v10, ...",97,97,"[""crossref::10.20944/preprints202302.0051.v10""..."
475248,crossref::10.20944/preprints202302.0051.v10,Preprints.org,crossref,10.20944/preprints202302.0051.v10,https://doi.org/10.20944/preprints202302.0051.v10,https://www.preprints.org/manuscript/202302.00...,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.20944/preprints202302.0051.v1;10.20944/prep...,,,,2023-04-10,2023-04-10,,2023-04-10,2025-08-11,2025-11-23,2023-04-10,,,2023.0,10.20944/preprints202302.0051.v10,https://www.preprints.org/manuscript/202302.00...,.v10,/v10,.v10,10.0,True,version,10.0,10.20944/preprints202302.0051,https://www.preprints.org/manuscript/202302.0051,10.20944/preprints202302.0051,10.20944/preprints202302.0051.v10,True,10.0,10.0,98.0,False,True,crossref::10.20944/preprints202302.0051.v1,98,97.0,"[crossref::10.20944/preprints202302.0051.v10, ...",,0,97,
475249,crossref::10.20944/preprints202302.0051.v11,Preprints.org,crossref,10.20944/preprints202302.0051.v11,https://doi.org/10.20944/preprints202302.0051.v11,https://www.preprints.org/manuscript/202302.00...,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.20944/preprints202302.0051.v1;10.20944/prep...,,,,2023-04-23,2023-04-23,,2023-04-23,2025-08-11,2025-11-23,2023-04-23,,,2023.0,10.20944/preprints202302.0051.v11,https://www.preprints.org/manuscript/202302.00...,.v11,/v11,.v11,11.0,True,version,11.0,10.20944/preprints202302.0051,https://www.preprints.org/manuscript/202302.0051,10.20944/preprints202302.0051,10.20944/preprints202302.0051.v11,True,11.0,11.0,98.0,False,True,crossref::10.20944/preprints202302.0051.v1,98,97.0,"[crossref::10.20944/preprints202302.0051.v10, ...",,0,97,
475250,crossref::10.20944/preprints202302.0051.v12,Preprints.org,crossref,10.20944/preprints202302.0051.v12,https://doi.org/10.20944/preprints202302.0051.v12,https://www.preprints.org/manuscript/202302.00...,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.20944/preprints202302.0051.v1;10.20944/prep...,,,,2023-04-29,2023-04-29,,2023-04-29,2025-08-11,2025-11-23,2023-05-04,,,2023.0,10.20944/preprints202302.0051.v12,https://www.preprints.org/manuscript/202302.00...,.v12,/v12,.v12,12.0,True,version,12.0,10.20944/preprints202302.0051,https://www.preprints.org/manuscript/202302.0051,10.20944/preprints202302.0051,10.20944/preprints202302.0051.v12,True,12.0,12.0,98.0,False,True,crossref::10.20944/preprints202302.0051.v1,98,97.0,"[crossref::10.20944/preprints202302.0051.v10, ...",,0,97,
475251,crossref::10.20944/preprints202302.0051.v13,Preprints.org,crossref,10.20944/preprints202302.0051.v13,https://doi.org/10.20944/preprints202302.0051.v13,https://www.preprints.org/manuscript/202302.00...,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.20944/preprints202302.0051.v1;10.20944/prep...,,,,2023-05-17,2023-05-17,,2023-05-17,2025-08-11,2025-11-23,2023-05-04,,,2023.0,10.20944/preprints202302.0051.v13,https://www.preprints.org/manuscript/202302.00...,.v13,/v13,.v13,13.0,True,version,13.0,10.20944/preprints202302.0051,https://www.preprints.org/manuscript/202302.0051,10.20944/preprints202302.0051,10.20944/preprints202302.0051.v13,True,13.0,13.0,98.0,False,True,crossref::10.20944/preprints202302.0051.v1,98,97.0,"[crossref::10.20944/preprints202302.0051.v10, ...",,0,97,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475340,crossref::10.20944/preprints202302.0051.v94,Preprints.org,crossref,10.20944/preprints202302.0051.v94,https://doi.org/10.20944/preprints202302.0051.v94,https://www.preprints.org/manuscript/202302.00...,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.20944/preprints202302.0051.v1;10.20944/prep...,,,,2024-08-23,2024-08-23,,2024-08-23,2025-08-11,2025-11-23,2024-08-25,,,2024.0,10.20944/preprints202302.0051.v94,https://www.preprints.org/manuscript/202302.00...,.v94,/v94,.v94,94.0,True,version,94.0,10.20944/preprints202302.0051,https://www.preprints.org/manuscript/202302.0051,10.20944/preprints202302.0051,10.20944/preprints202302.0051.v94,True,94.0,94.0,98.0,False,True,crossref::10.20944/preprints202302.0051.v1,98,97.0,"[crossref::10.20944/preprints202302.0051.v10, ...",,0,97,
475341,crossref::10.20944/preprints202302.0051.v95,Preprints.org,crossref,10.20944/preprints202302.0051.v95,https://doi.org/10.20944/preprints202302.0051.v95,https://www.preprints.org/manuscript/202302.00...,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.20944/preprints202302.0051.v1;10.20944/prep...,,,,2024-09-02,2024-09-02,,2024-09-02,2025-08-11,2025-11-23,2024-09-02,,,2024.0,10.20944/preprints202302.0051.v95,https://www.preprints.org/manuscript/202302.00...,.v95,/v95,.v95,95.0,True,version,95.0,10.20944/preprints202302.0051,https://www.preprints.org/manuscript/202302.0051,10.20944/preprints202302.0051,10.20944/preprints202302.0051.v95,True,95.0,95.0,98.0,False,True,crossref::10.20944/preprints202302.0051.v1,98,97.0,"[crossref::10.20944/preprints202302.0051.v10, ...",,0,97,
475342,crossref::10.20944/preprints202302.0051.v96,Preprints.org,crossref,10.20944/preprints202302.0051.v96,https://doi.org/10.20944/preprints202302.0051.v96,https://www.preprints.org/manuscript/202302.00...,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.20944/preprints202302.0051.v1;10.20944/prep...,,,,2024-12-09,2024-12-09,,2024-12-09,2025-08-11,2025-11-23,2024-12-09,,,2024.0,10.20944/preprints202302.0051.v96,https://www.preprints.org/manuscript/202302.00...,.v96,/v96,.v96,96.0,True,version,96.0,10.20944/preprints202302.0051,https://www.preprints.org/manuscript/202302.0051,10.20944/preprints202302.0051,10.20944/preprints202302.0051.v96,True,96.0,96.0,98.0,False,True,crossref::10.20944/preprints202302.0051.v1,98,97.0,"[crossref::10.20944/preprints202302.0051.v10, ...",,0,97,
475343,crossref::10.20944/preprints202302.0051.v97,Preprints.org,crossref,10.20944/preprints202302.0051.v97,https://doi.org/10.20944/preprints202302.0051.v97,https://www.preprints.org/manuscript/202302.00...,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.20944/preprints202302.0051.v1;10.20944/prep...,,,,2025-07-22,2025-07-22,,2025-07-22,2025-08-11,2025-11-23,2025-07-24,,,2025.0,10.20944/preprints202302.0051.v97,https://www.preprints.org/manuscript/202302.00...,.v97,/v97,.v97,97.0,True,version,97.0,10.20944/preprints202302.0051,https://www.preprints.org/manuscript/202302.0051,10.20944/preprints202302.0051,10.20944/preprints202302.0051.v97,True,97.0,97.0,98.0,False,True,crossref::10.20944/preprints202302.0051.v1,98,97.0,"[crossref::10.20944/preprints202302.0051.v10, ...",,0,97,


In [68]:
data_with_parent_child[data_with_parent_child['relations_json'].notna()]

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year,doi_norm,url_norm,doi_version_token,url_version_token,version_token,version_num,is_valid_version_signal,signal_class,signal_num,doi_root_for_group,url_root_for_group,group_id,record_id_root,use_for_versioning,version_order_num,version_rank,group_size_with_num,is_parent_version,is_child_version,parent_record_id
2,crossref::10.1002/essoar.10500004.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500004.1,https://doi.org/10.1002/essoar.10500004.1,https://essopenarchive.org/doi/full/10.1002/es...,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,,10.1029/2017sw001789;10.1029/2018sw001995,,2018-05-30,2018-05-30,,2018-05-30,2022-12-05,2025-02-21,2019-11-13,,,2018.0,10.1002/essoar.10500004.1,https://essopenarchive.org/doi/full/10.1002/es...,.1,.1,.1,1.0,True,dot_numeric_version,1.0,10.1002/essoar.10500004,https://essopenarchive.org/doi/full/10.1002/es...,10.1002/essoar.10500004,10.1002/essoar.10500004.1,True,1.0,1.0,1.0,True,False,
10,crossref::10.1002/essoar.10500015.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500015.1,https://doi.org/10.1002/essoar.10500015.1,https://essopenarchive.org/doi/full/10.1002/es...,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,,10.1016/j.gca.2019.03.006,,2018-07-16,2018-07-16,,2018-07-16,2022-12-05,2025-02-21,2019-11-13,,,2018.0,10.1002/essoar.10500015.1,https://essopenarchive.org/doi/full/10.1002/es...,.1,.1,.1,1.0,True,dot_numeric_version,1.0,10.1002/essoar.10500015,https://essopenarchive.org/doi/full/10.1002/es...,10.1002/essoar.10500015,10.1002/essoar.10500015.1,True,1.0,1.0,3.0,True,False,
11,crossref::10.1002/essoar.10500015.2,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500015.2,https://doi.org/10.1002/essoar.10500015.2,https://essopenarchive.org/doi/full/10.1002/es...,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,,10.1016/j.gca.2019.03.006,,2018-07-26,2018-07-26,,2018-07-26,2022-12-05,2025-02-21,2019-11-13,,,2018.0,10.1002/essoar.10500015.2,https://essopenarchive.org/doi/full/10.1002/es...,.2,.2,.2,2.0,True,dot_numeric_version,2.0,10.1002/essoar.10500015,https://essopenarchive.org/doi/full/10.1002/es...,10.1002/essoar.10500015,10.1002/essoar.10500015.2,True,2.0,2.0,3.0,False,True,crossref::10.1002/essoar.10500015.1
12,crossref::10.1002/essoar.10500015.3,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500015.3,https://doi.org/10.1002/essoar.10500015.3,https://essopenarchive.org/doi/full/10.1002/es...,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,,10.1016/j.gca.2019.03.006,,2019-02-26,2019-02-26,,2019-02-26,2022-12-05,2025-02-21,2019-02-26,,,2019.0,10.1002/essoar.10500015.3,https://essopenarchive.org/doi/full/10.1002/es...,.3,.3,.3,3.0,True,dot_numeric_version,3.0,10.1002/essoar.10500015,https://essopenarchive.org/doi/full/10.1002/es...,10.1002/essoar.10500015,10.1002/essoar.10500015.3,True,3.0,3.0,3.0,False,True,crossref::10.1002/essoar.10500015.1
16,crossref::10.1002/essoar.10500022.2,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500022.2,https://doi.org/10.1002/essoar.10500022.2,https://essopenarchive.org/doi/full/10.1002/es...,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,,10.1080/02786826.2019.1623863,,2018-11-12,2018-11-12,,2018-11-12,2022-12-05,2025-02-21,2018-11-12,,,2018.0,10.1002/essoar.10500022.2,https://essopenarchive.org/doi/full/10.1002/es...,.2,.2,.2,2.0,True,dot_numeric_version,2.0,10.1002/essoar.10500022,https://essopenarchive.org/doi/full/10.1002/es...,10.1002/essoar.10500022,10.1002/essoar.10500022.2,True,2.0,2.0,5.0,False,True,crossref::10.1002/essoar.10500022.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6707557,datacite::10.5281/zenodo.903488,Zenodo,datacite,10.5281/zenodo.903488,https://doi.org/10.5281/zenodo.903488,https://zenodo.org/record/903488,,"[{""relatedIdentifier"": ""10.5281/zenodo.903487""...","{""client"": {""data"": {""id"": ""cern.zenodo"", ""typ...",,,10.5281/zenodo.903487,,,,,,,,2017-09-21,2017-09-21,2020-09-19,2017.0,10.5281/zenodo.903488,https://zenodo.org/record/903488,,,,,False,none,,10.5281/zenodo.903488,https://zenodo.org/record/903488,10.5281/zenodo.903488,10.5281/zenodo.903488,False,,,,False,False,
6707558,datacite::10.5281/zenodo.949496,Zenodo,datacite,10.5281/zenodo.949496,https://doi.org/10.5281/zenodo.949496,https://zenodo.org/record/949496,,"[{""relatedIdentifier"": ""10.5281/zenodo.949497""...","{""client"": {""data"": {""id"": ""cern.zenodo"", ""typ...",,,,,,,,,,,2017-09-23,2017-09-23,2020-09-19,2017.0,10.5281/zenodo.949496,https://zenodo.org/record/949496,,,,,False,none,,10.5281/zenodo.949496,https://zenodo.org/record/949496,10.5281/zenodo.949496,10.5281/zenodo.949496,False,,,,False,False,
6707559,datacite::10.5281/zenodo.949497,Zenodo,datacite,10.5281/zenodo.949497,https://doi.org/10.5281/zenodo.949497,https://zenodo.org/record/949497,,"[{""relatedIdentifier"": ""10.5281/zenodo.949496""...","{""client"": {""data"": {""id"": ""cern.zenodo"", ""typ...",,,10.5281/zenodo.949496,,,,,,,,2017-09-23,2017-09-23,2020-09-19,2017.0,10.5281/zenodo.949497,https://zenodo.org/record/949497,,,,,False,none,,10.5281/zenodo.949497,https://zenodo.org/record/949497,10.5281/zenodo.949497,10.5281/zenodo.949497,False,,,,False,False,
6707560,datacite::10.5281/zenodo.996036,Zenodo,datacite,10.5281/zenodo.996036,https://doi.org/10.5281/zenodo.996036,https://zenodo.org/record/996036,,"[{""relatedIdentifier"": ""10.5281/zenodo.996037""...","{""client"": {""data"": {""id"": ""cern.zenodo"", ""typ...",,,,,,,,,,,2017-09-25,2017-09-25,2020-09-19,2017.0,10.5281/zenodo.996036,https://zenodo.org/record/996036,,,,,False,none,,10.5281/zenodo.996036,https://zenodo.org/record/996036,10.5281/zenodo.996036,10.5281/zenodo.996036,False,,,,False,False,


In [67]:
import pandas as pd
import numpy as np
import json
import re

def _as_text(x):
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return ""
    return str(x)

def parse_relations(rel):
    """
    Works for two common shapes:
    - Crossref: dict like {"is-preprint-of":[{"id":"10.xxx"...}], "has-preprint":[...]}
    - DataCite: list of dicts [{"relatedIdentifier":"10.xxx","relationType":"IsPreprintOf", ...}, ...]
    Returns list of (rel_type, identifier)
    """
    if rel is None or (isinstance(rel, float) and np.isnan(rel)):
        return []

    # If already list/dict
    obj = rel
    if isinstance(rel, str):
        s = rel.strip()
        if s == "" or s.lower() in {"none", "nan", "null", "[]", "{}"}:
            return []
        try:
            obj = json.loads(s)
        except Exception:
            return []

    out = []

    # DataCite-style list
    if isinstance(obj, list):
        for it in obj:
            if not isinstance(it, dict):
                continue
            rel_type = it.get("relationType") or it.get("relation_type")
            ident = it.get("relatedIdentifier") or it.get("relatedIdentifierValue") or it.get("id")
            if rel_type and ident:
                out.append((str(rel_type), str(ident)))
        return out

    # Crossref-style dict
    if isinstance(obj, dict):
        for rel_type, items in obj.items():
            if isinstance(items, list):
                for it in items:
                    if isinstance(it, dict):
                        ident = it.get("id") or it.get("DOI") or it.get("doi")
                        if ident:
                            out.append((str(rel_type), str(ident)))
                    elif isinstance(it, str):
                        out.append((str(rel_type), it))
            elif isinstance(items, dict):
                ident = items.get("id") or items.get("DOI") or items.get("doi")
                if ident:
                    out.append((str(rel_type), str(ident)))
        return out

    return []

def explore_server(df, server, n=50):
    d = df.loc[df["server_name"] == server].copy()

    # mark non-empty relations_json
    rel_nonempty = d["relations_json"].apply(lambda x: len(parse_relations(x)) > 0)
    d1 = d.loc[rel_nonempty].head(n)

    rows = []
    for _, r in d1.iterrows():
        rels = parse_relations(r["relations_json"])
        # keep only first 5 for display
        rels_short = rels[:5]
        rows.append({
            "record_id": r["record_id"],
            "doi": r.get("doi"),
            "landing_page_url": r.get("landing_page_url"),
            "version_label": r.get("version_label"),
            "is_version_of": r.get("is_version_of"),
            "is_preprint_of": r.get("is_preprint_of"),
            "published_version_ids_json": r.get("published_version_ids_json"),
            "rels_short": rels_short
        })

    return pd.DataFrame(rows)

# Example usage:
# explore_server(df, "arXiv", n=30)
# explore_server(df, "Research Square", n=30)
# explore_server(df, "eLife", n=30)


In [69]:
explore_server(df, "arXiv", n=30)

Unnamed: 0,record_id,doi,landing_page_url,version_label,is_version_of,is_preprint_of,published_version_ids_json,rels_short
0,datacite::10.48550/arxiv.0704.0001,10.48550/arxiv.0704.0001,https://arxiv.org/abs/0704.0001,2,,10.1103/physrevd.76.013009,,"[(IsVersionOf, 10.1103/physrevd.76.013009)]"
1,datacite::10.48550/arxiv.0704.0006,10.48550/arxiv.0704.0006,https://arxiv.org/abs/0704.0006,1,,10.1103/physreva.75.043613,,"[(IsVersionOf, 10.1103/physreva.75.043613)]"
2,datacite::10.48550/arxiv.0704.0007,10.48550/arxiv.0704.0007,https://arxiv.org/abs/0704.0007,2,,10.1103/physrevd.76.044016,,"[(IsVersionOf, 10.1103/physrevd.76.044016)]"
3,datacite::10.48550/arxiv.0704.0008,10.48550/arxiv.0704.0008,https://arxiv.org/abs/0704.0008,3,,10.1063/1.2975338,,"[(IsVersionOf, 10.1063/1.2975338)]"
4,datacite::10.48550/arxiv.0704.0009,10.48550/arxiv.0704.0009,https://arxiv.org/abs/0704.0009,1,,10.1086/518646,,"[(IsVersionOf, 10.1086/518646)]"
5,datacite::10.48550/arxiv.0704.0015,10.48550/arxiv.0704.0015,https://arxiv.org/abs/0704.0015,2,,10.1088/1126-6708/2007/05/034,,"[(IsVersionOf, 10.1088/1126-6708/2007/05/034)]"
6,datacite::10.48550/arxiv.0704.0016,10.48550/arxiv.0704.0016,https://arxiv.org/abs/0704.0016,1,,10.1088/0253-6102/49/4/38,,"[(IsVersionOf, 10.1088/0253-6102/49/4/38)]"
7,datacite::10.48550/arxiv.0704.0017,10.48550/arxiv.0704.0017,https://arxiv.org/abs/0704.0017,1,,10.1111/j.1365-2966.2007.11762.x,,"[(IsVersionOf, 10.1111/j.1365-2966.2007.11762.x)]"
8,datacite::10.48550/arxiv.0704.0020,10.48550/arxiv.0704.0020,https://arxiv.org/abs/0704.0020,1,,10.1103/physrevd.76.052005,,"[(IsVersionOf, 10.1103/physrevd.76.052005)]"
9,datacite::10.48550/arxiv.0704.0021,10.48550/arxiv.0704.0021,https://arxiv.org/abs/0704.0021,2,,10.1103/physrevlett.99.048301,,"[(IsVersionOf, 10.1103/physrevlett.99.048301)]"


In [70]:
explore_server(df, "Research Square", n=30)

Unnamed: 0,record_id,doi,landing_page_url,version_label,is_version_of,is_preprint_of,published_version_ids_json,rels_short
0,crossref::10.21203/rs.1.1/v1,10.21203/rs.1.1/v1,https://www.researchsquare.com/article/rs-2/v1,,,10.1186/s13063-019-3980-5,,"[(is-preprint-of, 10.1186/s13063-019-3980-5)]"
1,crossref::10.21203/rs.1.1/v2,10.21203/rs.1.1/v2,https://www.researchsquare.com/article/rs-2/v2,,,10.1186/s13063-019-3980-5,,"[(is-preprint-of, 10.1186/s13063-019-3980-5)]"
2,crossref::10.21203/rs.1.10/v1,10.21203/rs.1.10/v1,https://www.researchsquare.com/article/rs-11/v1,,,10.1186/s12886-019-1078-1,,"[(is-preprint-of, 10.1186/s12886-019-1078-1)]"
3,crossref::10.21203/rs.1.10/v2,10.21203/rs.1.10/v2,https://www.researchsquare.com/article/rs-11/v2,,,10.1186/s12886-019-1078-1,,"[(is-preprint-of, 10.1186/s12886-019-1078-1)]"
4,crossref::10.21203/rs.1.10/v3,10.21203/rs.1.10/v3,https://www.researchsquare.com/article/rs-11/v3,,,10.1186/s12886-019-1078-1,,"[(is-preprint-of, 10.1186/s12886-019-1078-1)]"
5,crossref::10.21203/rs.1.10/v4,10.21203/rs.1.10/v4,https://www.researchsquare.com/article/rs-11/v4,,,10.1186/s12886-019-1078-1,,"[(is-preprint-of, 10.1186/s12886-019-1078-1)]"
6,crossref::10.21203/rs.1.11/v1,10.21203/rs.1.11/v1,https://www.researchsquare.com/article/rs-18/v1,,,10.1186/s13063-019-3374-8,,"[(is-preprint-of, 10.1186/s13063-019-3374-8)]"
7,crossref::10.21203/rs.1.12/v1,10.21203/rs.1.12/v1,https://www.researchsquare.com/article/rs-17/v1,,,10.1155/2019/6392751,,"[(is-preprint-of, 10.1155/2019/6392751)]"
8,crossref::10.21203/rs.1.13/v1,10.21203/rs.1.13/v1,https://www.researchsquare.com/article/rs-15/v1,,,10.1186/s12871-018-0671-9,,"[(is-preprint-of, https://bmcanesthesiol.biome..."
9,crossref::10.21203/rs.1.13/v2,10.21203/rs.1.13/v2,https://www.researchsquare.com/article/rs-15/v2,,,10.1186/s12871-018-0671-9,,"[(is-preprint-of, https://bmcanesthesiol.biome..."


In [71]:
explore_server(df, "eLife", n=30)

Unnamed: 0,record_id,doi,landing_page_url,version_label,is_version_of,is_preprint_of,published_version_ids_json,rels_short
0,crossref::10.7554/elife.00003.012,10.7554/elife.00003.012,https://elifesciences.org/articles/00003v1/pee...,,,,,"[(is-review-of, 10.7554/eLife.00003)]"
1,crossref::10.7554/elife.00003.013,10.7554/elife.00003.013,https://elifesciences.org/articles/00003v1/pee...,,,,,"[(is-review-of, 10.7554/eLife.00003)]"
2,crossref::10.7554/elife.00005.021,10.7554/elife.00005.021,https://elifesciences.org/articles/00005v1/pee...,,,,,"[(is-review-of, 10.7554/eLife.00005)]"
3,crossref::10.7554/elife.00005.022,10.7554/elife.00005.022,https://elifesciences.org/articles/00005v1/pee...,,,,,"[(is-review-of, 10.7554/eLife.00005)]"
4,crossref::10.7554/elife.00007,10.7554/elife.00007,https://elifesciences.org/articles/00007,,,,,"[(is-supplemented-by, 10.5061/dryad.gs45f)]"
5,crossref::10.7554/elife.00007.021,10.7554/elife.00007.021,https://elifesciences.org/articles/00007v1/pee...,,,,,"[(is-review-of, 10.7554/eLife.00007)]"
6,crossref::10.7554/elife.00007.022,10.7554/elife.00007.022,https://elifesciences.org/articles/00007v1/pee...,,,,,"[(is-review-of, 10.7554/eLife.00007)]"
7,crossref::10.7554/elife.00011,10.7554/elife.00011,https://elifesciences.org/articles/00011,,,,,"[(is-supplemented-by, GSE36916)]"
8,crossref::10.7554/elife.00011.025,10.7554/elife.00011.025,https://elifesciences.org/articles/00011v1/pee...,,,,,"[(is-review-of, 10.7554/eLife.00011)]"
9,crossref::10.7554/elife.00011.026,10.7554/elife.00011.026,https://elifesciences.org/articles/00011v1/pee...,,,,,"[(is-review-of, 10.7554/eLife.00011)]"


In [88]:
explore_server(df, "SSRN", n=30)

Unnamed: 0,record_id,doi,landing_page_url,version_label,is_version_of,is_preprint_of,published_version_ids_json,rels_short
0,crossref::10.2139/ssrn.4363867,10.2139/ssrn.4363867,https://www.ssrn.com/abstract=4363867,,,10.1016/j.ssci.2023.106265,,"[(is-preprint-of, 10.1016/j.ssci.2023.106265)]"
1,crossref::10.2139/ssrn.4435137,10.2139/ssrn.4435137,https://www.ssrn.com/abstract=4435137,,,10.1016/j.geosus.2024.10.003,,"[(is-preprint-of, 10.1016/j.geosus.2024.10.003)]"
2,crossref::10.2139/ssrn.4629125,10.2139/ssrn.4629125,https://www.ssrn.com/abstract=4629125,,,10.1007/s10494-024-00565-z,,"[(is-preprint-of, 10.1007/s10494-024-00565-z)]"
3,crossref::10.2139/ssrn.4725544,10.2139/ssrn.4725544,https://www.ssrn.com/abstract=4725544,,,10.1016/j.jmrt.2024.05.118,,"[(is-preprint-of, 10.1016/j.jmrt.2024.05.118)]"
4,crossref::10.2139/ssrn.4729368,10.2139/ssrn.4729368,https://www.ssrn.com/abstract=4729368,,,10.1002/csr.2935,,"[(is-preprint-of, 10.1002/csr.2935)]"
5,crossref::10.2139/ssrn.4744613,10.2139/ssrn.4744613,https://www.ssrn.com/abstract=4744613,,,10.1016/j.aime.2025.100164,,"[(is-preprint-of, 10.1016/j.aime.2025.100164)]"
6,crossref::10.2139/ssrn.4747148,10.2139/ssrn.4747148,https://www.ssrn.com/abstract=4747148,,,10.1016/j.physa.2025.130401,,"[(is-preprint-of, 10.1016/j.physa.2025.130401)]"
7,crossref::10.2139/ssrn.4766742,10.2139/ssrn.4766742,https://www.ssrn.com/abstract=4766742,,,10.1016/j.apenergy.2025.125710,,"[(is-preprint-of, 10.1016/j.apenergy.2025.1257..."
8,crossref::10.2139/ssrn.4777572,10.2139/ssrn.4777572,https://www.ssrn.com/abstract=4777572,,,10.1016/j.bar.2025.101643,,"[(is-preprint-of, 10.1016/j.bar.2025.101643)]"
9,crossref::10.2139/ssrn.4845117,10.2139/ssrn.4845117,https://www.ssrn.com/abstract=4845117,,,10.1002/jsf2.70012,,"[(is-preprint-of, 10.1002/jsf2.70012)]"


In [73]:
explore_server(df, "bioRxiv", n=30)

Unnamed: 0,record_id,doi,landing_page_url,version_label,is_version_of,is_preprint_of,published_version_ids_json,rels_short
0,crossref::10.1101/000042,10.1101/000042,http://biorxiv.org/lookup/doi/10.1101/000042,,,10.1038/nrg3723,,"[(is-preprint-of, 10.1038/nrg3723)]"
1,crossref::10.1101/000125,10.1101/000125,http://biorxiv.org/lookup/doi/10.1101/000125,,,10.1016/j.ejca.2014.08.019,,"[(is-preprint-of, 10.1016/j.ejca.2014.08.019)]"
2,crossref::10.1101/000265,10.1101/000265,http://biorxiv.org/lookup/doi/10.1101/000265,,,10.1093/nar/gkw1148,,"[(is-preprint-of, 10.1093/nar/gkw1148)]"
3,crossref::10.1101/001685,10.1101/001685,http://biorxiv.org/lookup/doi/10.1101/001685,,,10.1186/s13227-015-0018-2,,"[(is-preprint-of, 10.1186/s13227-015-0018-2)]"
4,crossref::10.1101/001750,10.1101/001750,http://biorxiv.org/lookup/doi/10.1101/001750,,,10.7717/peerj.632,,"[(is-preprint-of, 10.7717/peerj.632)]"
5,crossref::10.1101/001933,10.1101/001933,http://biorxiv.org/lookup/doi/10.1101/001933,,,10.1111/mec.12755,,"[(is-preprint-of, 10.1111/mec.12755)]"
6,crossref::10.1101/001941,10.1101/001941,http://biorxiv.org/lookup/doi/10.1101/001941,,,10.1111/2041-210x.12206,,"[(is-preprint-of, 10.1111/2041-210X.12206)]"
7,crossref::10.1101/002634,10.1101/002634,http://biorxiv.org/lookup/doi/10.1101/002634,,,10.1111/ecog.00976,,"[(is-preprint-of, 10.1111/ecog.00976)]"
8,crossref::10.1101/002709,10.1101/002709,http://biorxiv.org/lookup/doi/10.1101/002709,,,10.1186/1471-2164-15-1031,,"[(is-preprint-of, 10.1186/1471-2164-15-1031)]"
9,crossref::10.1101/002832,10.1101/002832,http://biorxiv.org/lookup/doi/10.1101/002832,,,10.1186/s13059-014-0550-8,,"[(is-preprint-of, 10.1186/s13059-014-0550-8)]"


In [78]:
explore_server(data, "viXra", n=30)

In [79]:
explore_server(df, "ResearchGate", n=30)

Unnamed: 0,record_id,doi,landing_page_url,version_label,is_version_of,is_preprint_of,published_version_ids_json,rels_short
0,datacite::10.13140/rg.2.1.1154.5209/1,10.13140/rg.2.1.1154.5209/1,https://www.researchgate.net/doi/10.13140/RG.2...,1.0,10.13140/rg.2.1.1154.5209,,,"[(IsNewVersionOf, 10.13140/rg.2.1.1154.5209)]"
1,datacite::10.13140/rg.2.1.1205.3367/2,10.13140/rg.2.1.1205.3367/2,https://www.researchgate.net/doi/10.13140/RG.2...,,10.13140/rg.2.1.1205.3367/1,,,"[(IsNewVersionOf, 10.13140/rg.2.1.1205.3367/1)]"
2,datacite::10.13140/rg.2.1.1467.5608/2,10.13140/rg.2.1.1467.5608/2,https://www.researchgate.net/doi/10.13140/RG.2...,,10.13140/rg.2.1.1467.5608/1,,,"[(IsNewVersionOf, 10.13140/rg.2.1.1467.5608/1)]"
3,datacite::10.13140/rg.2.1.2301.9363/1,10.13140/rg.2.1.2301.9363/1,https://www.researchgate.net/doi/10.13140/RG.2...,1.0,10.13140/rg.2.1.2301.9363,,,"[(IsNewVersionOf, 10.13140/rg.2.1.2301.9363)]"
4,datacite::10.13140/rg.2.1.2516.1363/3,10.13140/rg.2.1.2516.1363/3,https://www.researchgate.net/doi/10.13140/RG.2...,,10.13140/rg.2.1.2516.1363/2,,,"[(IsNewVersionOf, 10.13140/rg.2.1.2516.1363/2)]"
5,datacite::10.13140/rg.2.1.2746.0242/2,10.13140/rg.2.1.2746.0242/2,https://www.researchgate.net/doi/10.13140/RG.2...,,10.13140/rg.2.1.2746.0242/1,,,"[(IsNewVersionOf, 10.13140/rg.2.1.2746.0242/1)]"
6,datacite::10.13140/rg.2.1.2907.1369/1,10.13140/rg.2.1.2907.1369/1,https://www.researchgate.net/doi/10.13140/RG.2...,1.0,10.13140/2.1.2907.1369,,,"[(IsNewVersionOf, 10.13140/2.1.2907.1369)]"
7,datacite::10.13140/rg.2.1.3019.5928/1,10.13140/rg.2.1.3019.5928/1,https://www.researchgate.net/doi/10.13140/RG.2...,1.0,10.13140/rg.2.1.3019.5928,,,"[(IsNewVersionOf, 10.13140/rg.2.1.3019.5928)]"
8,datacite::10.13140/rg.2.1.3115.5606/1,10.13140/rg.2.1.3115.5606/1,https://www.researchgate.net/doi/10.13140/RG.2...,1.0,10.13140/rg.2.1.3115.5606,,,"[(IsNewVersionOf, 10.13140/rg.2.1.3115.5606)]"
9,datacite::10.13140/rg.2.1.3203.8245/1,10.13140/rg.2.1.3203.8245/1,https://www.researchgate.net/doi/10.13140/RG.2...,,10.13140/rg.2.1.3203.8245,,,"[(IsNewVersionOf, 10.13140/rg.2.1.3203.8245)]"


In [80]:
explore_server(df, "AMRC Open Research", n=30)

Unnamed: 0,record_id,doi,landing_page_url,version_label,is_version_of,is_preprint_of,published_version_ids_json,rels_short
0,crossref::10.12688/amrcopenres.12855.1,10.12688/amrcopenres.12855.1,https://amrcopenresearch.org/articles/1-2/v1,,,,,"[(has-review, 10.21956/amrcopenres.13921.r2630..."
1,crossref::10.12688/amrcopenres.12860.1,10.12688/amrcopenres.12860.1,https://amrcopenresearch.org/articles/1-4/v1,,,,,"[(has-review, 10.21956/amrcopenres.13926.r2642..."
2,crossref::10.12688/amrcopenres.12860.2,10.12688/amrcopenres.12860.2,https://amrcopenresearch.org/articles/1-4/v2,New version,,,,"[(has-review, 10.21956/amrcopenres.13960.r26480)]"
3,crossref::10.12688/amrcopenres.12861.1,10.12688/amrcopenres.12861.1,https://amrcopenresearch.org/articles/1-1/v1,,,,,"[(has-review, 10.21956/amrcopenres.13927.r2630..."
4,crossref::10.12688/amrcopenres.12861.2,10.12688/amrcopenres.12861.2,https://amrcopenresearch.org/articles/1-1/v2,New version,,,,"[(has-review, 10.21956/amrcopenres.13946.r26443)]"
5,crossref::10.12688/amrcopenres.12862.1,10.12688/amrcopenres.12862.1,https://amrcopenresearch.org/articles/1-3/v1,,,,,"[(has-review, 10.21956/amrcopenres.13928.r2631..."
6,crossref::10.12688/amrcopenres.12862.2,10.12688/amrcopenres.12862.2,https://amrcopenresearch.org/articles/1-3/v2,New version,,,,"[(has-review, 10.21956/amrcopenres.13951.r2646..."
7,crossref::10.12688/amrcopenres.12873.1,10.12688/amrcopenres.12873.1,https://amrcopenresearch.org/articles/1-11/v1,,,,,"[(has-review, 10.21956/amrcopenres.13939.r2641..."
8,crossref::10.12688/amrcopenres.12875.1,10.12688/amrcopenres.12875.1,https://amrcopenresearch.org/articles/1-10/v1,,,,,"[(has-review, 10.21956/amrcopenres.13941.r2639..."
9,crossref::10.12688/amrcopenres.12882.1,10.12688/amrcopenres.12882.1,https://amrcopenresearch.org/articles/1-12/v1,,,,,"[(has-review, 10.21956/amrcopenres.13950.r2647..."


In [81]:
explore_server(df, "RePEc: Research Papers in Economics", n=30)

In [82]:
explore_server(df, "AgriRxiv", n=30)

Unnamed: 0,record_id,doi,landing_page_url,version_label,is_version_of,is_preprint_of,published_version_ids_json,rels_short
0,crossref::10.31220/osf.io/2nf7j,10.31220/osf.io/2nf7j,https://osf.io/2nf7j,,,10.1177/0262728019894753,,"[(is-preprint-of, 10.1177/0262728019894753)]"
1,crossref::10.31220/osf.io/5ecwq,10.31220/osf.io/5ecwq,https://osf.io/5ecwq,,,10.5281/zenodo.2528683,,"[(is-preprint-of, 10.5281/zenodo.2528683)]"
2,crossref::10.31220/osf.io/aw6es,10.31220/osf.io/aw6es,https://osf.io/aw6es,,,10.25518/1780-4507.17219,,"[(is-preprint-of, 10.25518/1780-4507.17219)]"
3,crossref::10.31220/osf.io/f8pma,10.31220/osf.io/f8pma,https://osf.io/f8pma,,,10.5772/intechopen.90033,,"[(is-preprint-of, 10.5772/intechopen.90033)]"
4,crossref::10.31220/osf.io/fn8sy,10.31220/osf.io/fn8sy,https://osf.io/fn8sy,,,10.20431/2454-6224.0508001,,"[(is-preprint-of, 10.20431/2454-6224.0508001)]"
5,crossref::10.31220/osf.io/g6fcz,10.31220/osf.io/g6fcz,https://osf.io/g6fcz,,,10.1007/s11250-019-02191-8,,"[(is-preprint-of, 10.1007/s11250-019-02191-8)]"
6,crossref::10.31220/osf.io/jp2vx,10.31220/osf.io/jp2vx,https://osf.io/jp2vx,,,10.1016/j.fm.2018.10.020,,"[(is-preprint-of, 10.1016/j.fm.2018.10.020)]"
7,crossref::10.31220/osf.io/k3szu,10.31220/osf.io/k3szu,https://osf.io/k3szu,,,10.15406/apar.2014.01.00029,,"[(is-preprint-of, 10.15406/apar.2014.01.00029)]"
8,crossref::10.31220/osf.io/ns49k,10.31220/osf.io/ns49k,https://osf.io/ns49k,,,10.14202/vetworld.2015.898-901,,"[(is-preprint-of, 10.14202/vetworld.2015.898-9..."
9,crossref::10.31220/osf.io/pg3v9,10.31220/osf.io/pg3v9,https://osf.io/pg3v9,,,10.29255/aksara.v31i2.364.251-268,,"[(is-preprint-of, 10.29255/aksara.v31i2.364.25..."


In [83]:
explore_server(df, "Authorea Inc.", n=30)

Unnamed: 0,record_id,doi,landing_page_url,version_label,is_version_of,is_preprint_of,published_version_ids_json,rels_short
0,crossref::10.22541/21docs.171448358.88743249/v2,10.22541/21docs.171448358.88743249/v2,https://advance.sagepub.com/users/487877/artic...,,10.22541/21docs.171448358.88743249/v1,,,"[(is-version-of, 10.22541/21docs.171448358.887..."
1,crossref::10.22541/au.148762927.73541341/v2,10.22541/au.148762927.73541341/v2,https://www.authorea.com/users/2/articles/1397...,,10.22541/au.148762927.73541341,,,"[(is-version-of, 10.22541/au.148762927.73541341)]"
2,crossref::10.22541/au.148769949.92783646/v2,10.22541/au.148769949.92783646/v2,https://www.authorea.com/users/23/articles/876...,,10.22541/au.148769949.92783646,,,"[(is-version-of, 10.22541/au.148769949.92783646)]"
3,crossref::10.22541/au.148787463.30762904/v2,10.22541/au.148787463.30762904/v2,https://www.authorea.com/users/4510/articles/1...,,10.22541/au.148787463.30762904,,,"[(is-version-of, 10.22541/au.148787463.30762904)]"
4,crossref::10.22541/au.148787463.30762904/v3,10.22541/au.148787463.30762904/v3,https://www.authorea.com/users/4510/articles/1...,,10.22541/au.148787463.30762904,,,"[(is-version-of, 10.22541/au.148787463.30762904)]"
5,crossref::10.22541/au.148807336.66911346/v2,10.22541/au.148807336.66911346/v2,https://www.authorea.com/users/6000/articles/1...,,10.22541/au.148807336.66911346,,,"[(is-version-of, 10.22541/au.148807336.66911346)]"
6,crossref::10.22541/au.149308896.61629534/v2,10.22541/au.149308896.61629534/v2,https://www.authorea.com/users/4510/articles/1...,,10.22541/au.149308896.61629534,,,"[(is-version-of, 10.22541/au.149308896.61629534)]"
7,crossref::10.22541/au.149308896.61629534/v3,10.22541/au.149308896.61629534/v3,https://www.authorea.com/users/4510/articles/1...,,10.22541/au.149308896.61629534,,,"[(is-version-of, 10.22541/au.149308896.61629534)]"
8,crossref::10.22541/au.149308896.61629534/v4,10.22541/au.149308896.61629534/v4,https://www.authorea.com/users/4510/articles/1...,,10.22541/au.149308896.61629534,,,"[(is-version-of, 10.22541/au.149308896.61629534)]"
9,crossref::10.22541/au.150188653.35495142/v2,10.22541/au.150188653.35495142/v2,https://www.authorea.com/users/4510/articles/1...,,10.22541/au.150188653.35495142,,,"[(is-version-of, 10.22541/au.150188653.35495142)]"


In [84]:
explore_server(df, "Cambridge Open Engage", n=30)

Unnamed: 0,record_id,doi,landing_page_url,version_label,is_version_of,is_preprint_of,published_version_ids_json,rels_short
0,crossref::10.33774/coe-2020-0x2pp,10.33774/coe-2020-0x2pp,https://www.cambridge.org/engage/coe/article-d...,,10.33774/coe-2020-0x2pp-v2;10.33774/coe-2020-0...,,,"[(is-version-of, 10.33774/coe-2020-0x2pp-v2), ..."
1,crossref::10.33774/coe-2020-0x2pp-v2,10.33774/coe-2020-0x2pp-v2,https://www.cambridge.org/engage/coe/article-d...,,10.33774/coe-2020-0x2pp;10.33774/coe-2020-0x2p...,,,"[(is-version-of, 10.33774/coe-2020-0x2pp), (is..."
2,crossref::10.33774/coe-2020-0x2pp-v3,10.33774/coe-2020-0x2pp-v3,https://www.cambridge.org/engage/coe/article-d...,,10.33774/coe-2020-0x2pp;10.33774/coe-2020-0x2p...,,,"[(is-version-of, 10.33774/coe-2020-0x2pp), (is..."
3,crossref::10.33774/coe-2020-1ccff,10.33774/coe-2020-1ccff,https://www.cambridge.org/engage/coe/article-d...,,,10.18203/2394-6040.ijcmph20204365,,"[(is-preprint-of, 10.18203/2394-6040.ijcmph202..."
4,crossref::10.33774/coe-2020-1d2hz,10.33774/coe-2020-1d2hz,https://www.cambridge.org/engage/coe/article-d...,,,10.1080/15472450.2022.2051169,,"[(is-preprint-of, 10.1080/15472450.2022.2051169)]"
5,crossref::10.33774/coe-2020-27j3q,10.33774/coe-2020-27j3q,https://www.cambridge.org/engage/coe/article-d...,,,10.1080/00150517.2020.12427602,,"[(is-preprint-of, 10.1080/00150517.2020.124276..."
6,crossref::10.33774/coe-2020-290p3,10.33774/coe-2020-290p3,https://www.cambridge.org/engage/coe/article-d...,,10.33774/coe-2020-290p3-v2,,,"[(is-version-of, 10.33774/coe-2020-290p3-v2)]"
7,crossref::10.33774/coe-2020-290p3-v2,10.33774/coe-2020-290p3-v2,https://www.cambridge.org/engage/coe/article-d...,,10.33774/coe-2020-290p3,,,"[(is-version-of, 10.33774/coe-2020-290p3)]"
8,crossref::10.33774/coe-2020-3q700,10.33774/coe-2020-3q700,https://www.cambridge.org/engage/coe/article-d...,,10.33774/coe-2020-3q700-v2,,,"[(is-version-of, 10.33774/coe-2020-3q700-v2)]"
9,crossref::10.33774/coe-2020-3q700-v2,10.33774/coe-2020-3q700-v2,https://www.cambridge.org/engage/coe/article-d...,,10.33774/coe-2020-3q700,,,"[(is-version-of, 10.33774/coe-2020-3q700)]"


In [85]:
explore_server(df, "Open Research Africa", n=30)

Unnamed: 0,record_id,doi,landing_page_url,version_label,is_version_of,is_preprint_of,published_version_ids_json,rels_short
0,crossref::10.12688/aasopenres.12822.1,10.12688/aasopenres.12822.1,https://aasopenresearch.org/articles/1-24/v1,,,,,"[(has-review, 10.21956/aasopenres.13888.r26691..."
1,crossref::10.12688/aasopenres.12825.1,10.12688/aasopenres.12825.1,https://aasopenresearch.org/articles/1-1/v1,,,,,"[(has-review, 10.21956/aasopenres.13891.r26335..."
2,crossref::10.12688/aasopenres.12825.2,10.12688/aasopenres.12825.2,https://aasopenresearch.org/articles/1-1/v2,New version,,,,"[(has-review, 10.21956/aasopenres.13998.r26676)]"
3,crossref::10.12688/aasopenres.12826.1,10.12688/aasopenres.12826.1,https://aasopenresearch.org/articles/1-14/v1,,,,,"[(has-review, 10.21956/aasopenres.13892.r26493..."
4,crossref::10.12688/aasopenres.12829.1,10.12688/aasopenres.12829.1,https://aasopenresearch.org/articles/1-4/v1,,,,,"[(has-review, 10.21956/aasopenres.13895.r26478..."
5,crossref::10.12688/aasopenres.12832.1,10.12688/aasopenres.12832.1,https://aasopenresearch.org/articles/1-3/v1,,,,,"[(has-review, 10.21956/aasopenres.13898.r26342..."
6,crossref::10.12688/aasopenres.12832.2,10.12688/aasopenres.12832.2,https://aasopenresearch.org/articles/1-3/v2,New version,,,,"[(has-review, 10.21956/aasopenres.13951.r26487)]"
7,crossref::10.12688/aasopenres.12837.1,10.12688/aasopenres.12837.1,https://aasopenresearch.org/articles/1-12/v1,,,,,"[(has-review, 10.21956/aasopenres.13903.r26351..."
8,crossref::10.12688/aasopenres.12837.2,10.12688/aasopenres.12837.2,https://aasopenresearch.org/articles/1-12/v2,New version,,,,"[(has-review, 10.21956/aasopenres.14017.r26741..."
9,crossref::10.12688/aasopenres.12839.1,10.12688/aasopenres.12839.1,https://aasopenresearch.org/articles/1-7/v1,,,,,"[(has-review, 10.21956/aasopenres.13905.r26384)]"


In [86]:
explore_server(df, "Preprints.org", n=30)

Unnamed: 0,record_id,doi,landing_page_url,version_label,is_version_of,is_preprint_of,published_version_ids_json,rels_short
0,crossref::10.20944/preprints201607.0003.v1,10.20944/preprints201607.0003.v1,http://www.preprints.org/manuscript/201607.000...,,10.20944/preprints201607.0003.v2;10.20944/prep...,,,"[(is-version-of, 10.20944/preprints201607.0003..."
1,crossref::10.20944/preprints201607.0003.v2,10.20944/preprints201607.0003.v2,http://www.preprints.org/manuscript/201607.000...,,10.20944/preprints201607.0003.v1;10.20944/prep...,,,"[(is-version-of, 10.20944/preprints201607.0003..."
2,crossref::10.20944/preprints201607.0003.v3,10.20944/preprints201607.0003.v3,http://www.preprints.org/manuscript/201607.000...,,10.20944/preprints201607.0003.v1;10.20944/prep...,,,"[(is-version-of, 10.20944/preprints201607.0003..."
3,crossref::10.20944/preprints201607.0003.v4,10.20944/preprints201607.0003.v4,http://www.preprints.org/manuscript/201607.000...,,10.20944/preprints201607.0003.v1;10.20944/prep...,,,"[(is-version-of, 10.20944/preprints201607.0003..."
4,crossref::10.20944/preprints201607.0003.v5,10.20944/preprints201607.0003.v5,http://www.preprints.org/manuscript/201607.000...,,10.20944/preprints201607.0003.v1;10.20944/prep...,,,"[(is-version-of, 10.20944/preprints201607.0003..."
5,crossref::10.20944/preprints201607.0003.v6,10.20944/preprints201607.0003.v6,http://www.preprints.org/manuscript/201607.000...,,10.20944/preprints201607.0003.v1;10.20944/prep...,,,"[(is-version-of, 10.20944/preprints201607.0003..."
6,crossref::10.20944/preprints201607.0008.v1,10.20944/preprints201607.0008.v1,https://www.preprints.org/manuscript/201607.00...,,,10.3390/galaxies4030022,,"[(is-preprint-of, 10.3390/galaxies4030022)]"
7,crossref::10.20944/preprints201607.0010.v1,10.20944/preprints201607.0010.v1,https://www.preprints.org/manuscript/201607.00...,,,10.3390/ijfs4040025,,"[(is-preprint-of, 10.3390/ijfs4040025)]"
8,crossref::10.20944/preprints201607.0023.v1,10.20944/preprints201607.0023.v1,https://www.preprints.org/manuscript/201607.00...,,,10.3390/galaxies4040054,,"[(is-preprint-of, 10.3390/galaxies4040054)]"
9,crossref::10.20944/preprints201607.0025.v1,10.20944/preprints201607.0025.v1,https://www.preprints.org/manuscript/201607.00...,,,10.3390/galaxies4040031,,"[(is-preprint-of, 10.3390/galaxies4040031)]"


In [87]:
explore_server(df, "Qeios", n=30)

Unnamed: 0,record_id,doi,landing_page_url,version_label,is_version_of,is_preprint_of,published_version_ids_json,rels_short
0,crossref::10.32388/0032qb,10.32388/0032qb,https://www.qeios.com/read/0032QB,,,,,"[(is-review-of, 10.32388/FH02NZ.2)]"
1,crossref::10.32388/0032qv,10.32388/0032qv,https://www.qeios.com/read/0032QV,,,,,"[(is-review-of, 10.32388/TUZ29Y)]"
2,crossref::10.32388/003mus,10.32388/003mus,https://www.qeios.com/read/003MUS,,,,,"[(is-review-of, 10.32388/SWMC6H)]"
3,crossref::10.32388/0044gp,10.32388/0044gp,https://www.qeios.com/read/0044GP,,,,,"[(is-review-of, 10.32388/1R2IRX.2)]"
4,crossref::10.32388/004dw3,10.32388/004dw3,https://www.qeios.com/read/004DW3,,,,,"[(is-review-of, 10.32388/4W9RXU.2)]"
5,crossref::10.32388/004ro2,10.32388/004ro2,https://www.qeios.com/read/004RO2,,,,,"[(is-review-of, 10.32388/H40OZW)]"
6,crossref::10.32388/005cul,10.32388/005cul,https://www.qeios.com/read/005CUL,,,,,"[(is-review-of, 10.32388/9GH618)]"
7,crossref::10.32388/005ew8,10.32388/005ew8,https://www.qeios.com/read/005EW8,,,,,"[(is-review-of, 10.32388/Z08IKJ)]"
8,crossref::10.32388/005l77,10.32388/005l77,https://www.qeios.com/read/005L77,,,,,"[(is-review-of, 10.32388/1Q3T6O)]"
9,crossref::10.32388/005lcz,10.32388/005lcz,https://www.qeios.com/read/005LCZ,,,,,"[(is-review-of, 10.32388/BLVVSV)]"


## Preprint / version / review relations normalization utilities

In [20]:
# ============================================================
# Preprint / version / review relations normalization utilities
# (Copy-paste ready)
# ============================================================
from __future__ import annotations

import re
import json
from typing import Any, Dict, List, Optional, Tuple, Iterable

import pandas as pd


# -----------------------------
# Small helpers
# -----------------------------
_DOI_RX = re.compile(r"(10\.\d{4,9}/[^\s<>\"]+)", re.IGNORECASE)

def _safe_str(x: Any) -> str:
    return "" if x is None else str(x)

def normalize_doi(x: Any) -> Optional[str]:
    """
    Normalize a DOI-like string:
      - lowercases
      - strips whitespace
      - removes leading DOI resolver prefixes (https://doi.org/, doi:)
      - extracts first DOI if embedded in a URL or text
    """
    s = _safe_str(x).strip()
    if not s:
        return None

    s = s.strip()
    s = re.sub(r"^https?://(dx\.)?doi\.org/", "", s, flags=re.IGNORECASE)
    s = re.sub(r"^doi:\s*", "", s, flags=re.IGNORECASE)

    m = _DOI_RX.search(s)
    if m:
        s = m.group(1)

    s = s.strip().rstrip(").,;")
    s = s.lower()
    return s or None

def split_multi_ids(x: Any) -> List[str]:
    """
    Split strings like:
      "10.x/abc;10.y/def" or "10.x/abc, 10.y/def"
    into normalized DOIs/IDs. Keeps non-DOI IDs too (e.g., GSE36916) when present.
    """
    if x is None:
        return []
    if isinstance(x, list):
        out: List[str] = []
        for item in x:
            out.extend(split_multi_ids(item))
        return out

    s = _safe_str(x).strip()
    if not s:
        return []

    parts = re.split(r"[;,]\s*|\s+\|\s+", s)
    out: List[str] = []
    for p in parts:
        p = p.strip()
        if not p:
            continue
        doi = normalize_doi(p)
        out.append(doi if doi else p)
    # de-dupe while preserving order
    seen = set()
    uniq = []
    for v in out:
        if v not in seen:
            uniq.append(v)
            seen.add(v)
    return uniq

def _dedup_list_keep_order(xs: Iterable[str]) -> List[str]:
    seen = set()
    out = []
    for x in xs:
        if x and x not in seen:
            out.append(x)
            seen.add(x)
    return out


# -----------------------------
# Version label detection
# -----------------------------
def detect_version_label(doi: Any, landing_page_url: Any) -> Tuple[Optional[str], List[str]]:
    """
    Returns:
      version_label (e.g. "1", "2", "v3", "New version") and list of reasons
    Works for patterns:
      - DOI suffix .1 .2 ... (F1000-style)
      - DOI suffix /v1 /v2 ...
      - URL contains /v1 /v2 ...
    """
    reasons: List[str] = []
    d = _safe_str(doi).lower().strip()
    u = _safe_str(landing_page_url).lower().strip()

    # pattern: 10.x/yyy.1
    m = re.search(r"\.(\d+)$", d)
    if m:
        reasons.append("doi_suffix_dot_number")
        return m.group(1), reasons

    # pattern: /v1 or v1 at end of doi token
    m = re.search(r"(?:/|\.|-)v(\d+)$", d)
    if m:
        reasons.append("doi_suffix_v_number")
        return f"v{m.group(1)}", reasons

    # pattern in URL: /v1 /v2 etc
    m = re.search(r"/v(\d+)(?:/|$)", u)
    if m:
        reasons.append("url_contains_v_number")
        return f"v{m.group(1)}", reasons

    return None, reasons


# -----------------------------
# Crossref relation extraction
# -----------------------------
_CROSSREF_MAP = {
    "is-preprint-of": "is_preprint_of",
    "has-preprint": "has_preprint",
    "is-version-of": "is_version_of",
    "has-version": "has_version",
    "is-review-of": "is_review_of",
    "has-review": "has_review",
    "is-supplemented-by": "is_supplemented_by",
    "is-supplement-to": "is_supplement_to",
    "references": "references",
}

def extract_relations_crossref(relation_obj: Any) -> Dict[str, Any]:
    """
    Crossref 'relation' object is typically a dict like:
      { "is-preprint-of": [ {"id":"10.x/..", ...}, ...], "has-review":[...], ... }
    Returns a dict with normalized lists and a compact rels_short list.
    """
    out: Dict[str, Any] = {
        "is_preprint_of": [],
        "has_preprint": [],
        "is_version_of": [],
        "has_version": [],
        "is_review_of": [],
        "has_review": [],
        "is_supplemented_by": [],
        "is_supplement_to": [],
        "references": [],
        "rels_short": [],          # list[(rel, target)]
        "relation_signal_reasons": []
    }

    if not relation_obj or not isinstance(relation_obj, dict):
        return out

    for rel_key, items in relation_obj.items():
        rel_key_l = _safe_str(rel_key).strip().lower()
        col = _CROSSREF_MAP.get(rel_key_l)
        if not col:
            continue

        if not isinstance(items, list):
            items = [items]

        for it in items:
            if isinstance(it, dict):
                target = it.get("id") or it.get("identifier") or it.get("URL") or it.get("url")
            else:
                target = it

            # normalize DOI if possible, otherwise keep raw
            target_norm = normalize_doi(target) or _safe_str(target).strip()
            if target_norm:
                out[col].append(target_norm)
                out["rels_short"].append((rel_key_l, target_norm))
                out["relation_signal_reasons"].append(f"crossref:{rel_key_l}")

    # dedup lists
    for k in ["is_preprint_of","has_preprint","is_version_of","has_version",
              "is_review_of","has_review","is_supplemented_by","is_supplement_to","references"]:
        out[k] = _dedup_list_keep_order(out[k])
    out["relation_signal_reasons"] = _dedup_list_keep_order(out["relation_signal_reasons"])
    return out


# -----------------------------
# DataCite relation extraction
# -----------------------------
_DATACITE_PREPRINT_RELTYPES = {"isversionof"}  # arXiv typically uses IsVersionOf => published article
_DATACITE_VERSION_CHAIN_RELTYPES = {"isnewversionof", "ispreviousversionof", "ispartof", "haspart"}  # conservative
_DATACITE_REVIEW_RELTYPES = {"isreviewof", "hasreview"}
_DATACITE_OTHER_RELTYPES = {"issupplementt o", "issupplementto", "issupplementedby", "issupplemented-by"}

def extract_relations_datacite(related_identifiers: Any, *,
                               record_doi: Optional[str],
                               landing_page_url: Optional[str],
                               server_name: Optional[str]) -> Dict[str, Any]:
    """
    DataCite JSON typically has attributes.relatedIdentifiers:
      [
        {"relationType":"IsVersionOf", "relatedIdentifier":"10.x/...", "relatedIdentifierType":"DOI"},
        ...
      ]

    Heuristic:
      - If the record looks like arXiv (doi contains 10.48550/arxiv OR url has arxiv OR server_name == 'arXiv'),
        then relationType IsVersionOf => is_preprint_of (published version).
      - Otherwise, IsVersionOf / IsNewVersionOf are treated as version-chain (is_version_of).
    """
    out: Dict[str, Any] = {
        "is_preprint_of": [],
        "is_version_of": [],
        "has_review": [],
        "is_review_of": [],
        "rels_short": [],
        "relation_signal_reasons": [],
    }

    if not related_identifiers:
        return out

    if isinstance(related_identifiers, dict):
        # sometimes wrapped
        related_identifiers = related_identifiers.get("relatedIdentifiers") or related_identifiers.get("related_identifiers")

    if not isinstance(related_identifiers, list):
        return out

    doi_l = (record_doi or "").lower()
    url_l = (landing_page_url or "").lower()
    srv_l = (server_name or "").lower()

    looks_arxiv = ("10.48550/arxiv" in doi_l) or ("arxiv.org" in url_l) or (srv_l == "arxiv")

    for rel in related_identifiers:
        if not isinstance(rel, dict):
            continue

        rel_type = _safe_str(rel.get("relationType")).strip()
        rel_type_l = rel_type.lower()

        target_raw = rel.get("relatedIdentifier") or rel.get("relatedIdentifierValue") or rel.get("id") or rel.get("identifier")
        target = normalize_doi(target_raw) or _safe_str(target_raw).strip()
        if not target:
            continue

        # map
        if rel_type_l in _DATACITE_PREPRINT_RELTYPES and looks_arxiv:
            out["is_preprint_of"].append(target)
            out["rels_short"].append((rel_type, target))
            out["relation_signal_reasons"].append(f"datacite:{rel_type}")

        elif rel_type_l in (_DATACITE_PREPRINT_RELTYPES | _DATACITE_VERSION_CHAIN_RELTYPES):
            out["is_version_of"].append(target)
            out["rels_short"].append((rel_type, target))
            out["relation_signal_reasons"].append(f"datacite:{rel_type}")

        elif rel_type_l in _DATACITE_REVIEW_RELTYPES:
            if rel_type_l == "isreviewof":
                out["is_review_of"].append(target)
            else:
                out["has_review"].append(target)
            out["rels_short"].append((rel_type, target))
            out["relation_signal_reasons"].append(f"datacite:{rel_type}")

        else:
            # still keep in rels_short (useful for debugging)
            out["rels_short"].append((rel_type, target))

    # dedup
    for k in ["is_preprint_of","is_version_of","has_review","is_review_of"]:
        out[k] = _dedup_list_keep_order(out[k])
    out["relation_signal_reasons"] = _dedup_list_keep_order(out["relation_signal_reasons"])
    return out


# -----------------------------
# Unified row enricher
# -----------------------------
def enrich_row(row: pd.Series) -> Dict[str, Any]:
    """
    Expects (as available):
      backend, server_name, record_id, doi, landing_page_url
      - Crossref relation dict in: relation OR relations OR relation_obj
      - DataCite relatedIdentifiers in: relatedIdentifiers OR related_identifiers OR datacite_relatedIdentifiers
    Produces:
      version_label, is_version_of, is_preprint_of, published_version_ids_json, rels_short, version_signal_reasons
    """
    backend = _safe_str(row.get("backend")).strip().lower() or None
    server_name = _safe_str(row.get("server_name")).strip() or None

    doi = normalize_doi(row.get("doi"))
    landing_page_url = _safe_str(row.get("landing_page_url")).strip() or None

    # 1) version label (+ reasons)
    version_label, version_reasons = detect_version_label(doi, landing_page_url)

    # 2) relations
    rels_short: List[Tuple[str, str]] = []
    is_preprint_of: List[str] = []
    has_preprint: List[str] = []
    is_version_of: List[str] = []
    has_review: List[str] = []
    is_review_of: List[str] = []
    relation_reasons: List[str] = []

    if backend == "crossref":
        rel_obj = row.get("relation") if "relation" in row else None
        if rel_obj is None and "relations" in row:
            rel_obj = row.get("relations")
        if rel_obj is None and "relation_obj" in row:
            rel_obj = row.get("relation_obj")

        # if stored as JSON string, parse it
        if isinstance(rel_obj, str):
            try:
                rel_obj = json.loads(rel_obj)
            except Exception:
                rel_obj = None

        r = extract_relations_crossref(rel_obj)
        is_preprint_of = r["is_preprint_of"]
        has_preprint = r["has_preprint"]
        is_version_of = r["is_version_of"]
        has_review = r["has_review"]
        is_review_of = r["is_review_of"]
        rels_short = r["rels_short"]
        relation_reasons = r["relation_signal_reasons"]

    elif backend == "datacite":
        rel_obj = None
        for k in ["relatedIdentifiers", "related_identifiers", "datacite_relatedIdentifiers"]:
            if k in row and row.get(k) is not None:
                rel_obj = row.get(k)
                break

        if isinstance(rel_obj, str):
            try:
                rel_obj = json.loads(rel_obj)
            except Exception:
                rel_obj = None

        r = extract_relations_datacite(
            rel_obj,
            record_doi=doi,
            landing_page_url=landing_page_url,
            server_name=server_name,
        )
        is_preprint_of = r["is_preprint_of"]
        is_version_of = r["is_version_of"]
        has_review = r["has_review"]
        is_review_of = r["is_review_of"]
        rels_short = r["rels_short"]
        relation_reasons = r["relation_signal_reasons"]

    else:
        # unknown backend: do nothing but keep version label
        pass

    # 3) published_version_ids_json
    #    Rule:
    #      - If we have is_preprint_of -> that's the published version(s)
    #      - Else if (some ecosystems) only have is_version_of but it's clearly a preprint DOI pattern -> treat as published versions
    published_version_ids: List[str] = []

    if is_preprint_of:
        published_version_ids = list(is_preprint_of)
    else:
        # heuristic: if DOI looks like a preprint DOI and it points to something else via is_version_of
        doi_l = (doi or "").lower()
        url_l = (landing_page_url or "").lower()
        srv_l = (server_name or "").lower()

        looks_preprint = any([
            "arxiv" in doi_l or "arxiv.org" in url_l,
            "ssrn" in doi_l or "ssrn.com" in url_l,
            "researchsquare" in url_l,
            "biorxiv" in url_l or "medrxiv" in url_l,
            srv_l in {"arxiv", "ssrn", "research square", "biorxiv", "medrxiv"},
        ])
        if looks_preprint and is_version_of:
            published_version_ids = list(is_version_of)

    published_version_ids = _dedup_list_keep_order(published_version_ids)
    published_version_ids_json = json.dumps(published_version_ids, ensure_ascii=False) if published_version_ids else None

    # 4) rels_short pretty + full reasons
    #    You asked to "replace it with the full version_signal_reasons"
    version_signal_reasons = _dedup_list_keep_order(version_reasons + relation_reasons)
    version_signal_reasons_json = json.dumps(version_signal_reasons, ensure_ascii=False) if version_signal_reasons else "[]"

    # also keep rels_short as JSON string for easier storage
    rels_short_json = json.dumps(rels_short, ensure_ascii=False) if rels_short else "[]"

    # 5) pick single representative strings (optional)
    #    If you prefer semicolon-joined for display:
    is_preprint_of_s = ";".join(is_preprint_of) if is_preprint_of else None
    is_version_of_s = ";".join(is_version_of) if is_version_of else None

    return {
        "doi_norm": doi,
        "version_label": version_label,
        "is_preprint_of": is_preprint_of_s,
        "has_preprint": ";".join(has_preprint) if has_preprint else None,
        "is_version_of": is_version_of_s,
        "has_review": ";".join(has_review) if has_review else None,
        "is_review_of": ";".join(is_review_of) if is_review_of else None,
        "published_version_ids_json": published_version_ids_json,
        "rels_short": rels_short_json,
        "version_signal_reasons": version_signal_reasons_json,
    }


def enrich_versions_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds normalized columns to your existing dataframe.
    Safe to run on a huge DF, but if memory is tight, use explore_server_sql below.
    """
    add = df.apply(enrich_row, axis=1, result_type="expand")
    # keep your existing doi too; add normalized
    out = pd.concat([df, add], axis=1)

    # Prefer doi_norm if doi is missing/bad (optional)
    if "doi" in out.columns:
        out["doi"] = out["doi"].where(out["doi"].notna() & (out["doi"].astype(str).str.strip() != ""), out["doi_norm"])
    return out


# -----------------------------
# Exploration helpers (your ask: explore server-by-server)
# -----------------------------
def explore_server(df: pd.DataFrame, server: str, n: int = 30) -> pd.DataFrame:
    """
    Lightweight: filter an already-loaded pandas DF.
    """
    if "server_name" not in df.columns:
        raise ValueError("df must have a 'server_name' column")
    mask = df["server_name"].astype(str).str.lower().eq(str(server).lower())
    cols = [c for c in [
        "record_id","doi","landing_page_url","version_label",
        "is_version_of","is_preprint_of","published_version_ids_json","rels_short","version_signal_reasons"
    ] if c in df.columns]
    return df.loc[mask, cols].head(n)


def explore_server_sql(con, table: str, server: str, n: int = 30) -> pd.DataFrame:
    """
    Memory-friendly: query only one server from DuckDB (or any DB with .execute().fetchdf()).
    Assumes the table already contains the enriched columns.
    """
    q = f"""
    SELECT
      record_id,
      doi,
      landing_page_url,
      version_label,
      is_version_of,
      is_preprint_of,
      published_version_ids_json,
      rels_short,
      version_signal_reasons
    FROM {table}
    WHERE lower(CAST(server_name AS VARCHAR)) = lower(?)
    LIMIT {int(n)}
    """
    return con.execute(q, [server]).fetchdf()


# ============================================================
# OPTIONAL: DuckDB table creation with safe VARCHAR casting
# (this addresses your "convert all these columns to varchar before" issue)
# ============================================================
DUCKDB_CREATE_VERSION_DETECTION_SQL = r"""
-- Replace raw_table_name with your table/view name that has:
--   backend, server_name, record_id, doi, landing_page_url,
--   relation (Crossref) OR relatedIdentifiers (DataCite) as JSON text/struct
CREATE OR REPLACE TABLE version_detection AS
SELECT
  CAST(backend AS VARCHAR)            AS backend,
  CAST(server_name AS VARCHAR)        AS server_name,
  CAST(record_id AS VARCHAR)          AS record_id,
  CAST(doi AS VARCHAR)               AS doi,
  CAST(landing_page_url AS VARCHAR)  AS landing_page_url,

  -- store raw relation blobs as VARCHAR so pandas/json parsing is predictable
  CAST(relation AS VARCHAR)           AS relation,
  CAST(relatedIdentifiers AS VARCHAR) AS relatedIdentifiers

FROM raw_table_name;
"""


# ============================================================
# Example usage
# ============================================================
# 1) If you already have a pandas df:
# df = enrich_versions_df(df)

# 2) Explore per server (pandas):
# explore_server(df, "Research Square", n=30)

# 3) If you're using DuckDB and want to avoid loading everything:
# con.execute(DUCKDB_CREATE_VERSION_DETECTION_SQL.replace("raw_table_name", "my_raw_table"))
# tmp = con.execute("SELECT * FROM version_detection WHERE lower(server_name)=lower('Research Square') LIMIT 5000").fetchdf()
# tmp = enrich_versions_df(tmp)
# con.register("tmp_enriched", tmp)
# con.execute("CREATE OR REPLACE TABLE version_detection_enriched AS SELECT * FROM tmp_enriched")
# explore_server_sql(con, "version_detection_enriched", "Research Square", n=30)


In [21]:
enrich_versions_dfdf = enrich_versions_df(data_clean)
enrich_versions_dfdf

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year,doi_norm,version_label.1,is_preprint_of.1,has_preprint,is_version_of.1,has_review,is_review_of,published_version_ids_json.1,rels_short,version_signal_reasons
429739,crossref::10.1002/essoar.10500000.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500000.1,https://doi.org/10.1002/essoar.10500000.1,https://essopenarchive.org/doi/full/10.1002/es...,,,,,,,,2018-05-26,2018-05-26,,2018-05-26,2022-12-05,2025-02-21,2019-11-13,,,2018.0,10.1002/essoar.10500000.1,1,,,,,,,[],"[""doi_suffix_dot_number""]"
429742,crossref::10.1002/essoar.10500002.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500002.1,https://doi.org/10.1002/essoar.10500002.1,https://essopenarchive.org/doi/full/10.1002/es...,,,,,,,,2018-05-31,2018-05-31,,2018-05-31,2022-12-05,2025-02-21,2019-11-13,,,2018.0,10.1002/essoar.10500002.1,1,,,,,,,[],"[""doi_suffix_dot_number""]"
429740,crossref::10.1002/essoar.10500004.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500004.1,https://doi.org/10.1002/essoar.10500004.1,https://essopenarchive.org/doi/full/10.1002/es...,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,,10.1029/2017sw001789;10.1029/2018sw001995,,2018-05-30,2018-05-30,,2018-05-30,2022-12-05,2025-02-21,2019-11-13,,,2018.0,10.1002/essoar.10500004.1,1,,,,,,,[],"[""doi_suffix_dot_number""]"
429741,crossref::10.1002/essoar.10500007.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500007.1,https://doi.org/10.1002/essoar.10500007.1,https://essopenarchive.org/doi/full/10.1002/es...,,,,,,,,2018-08-15,2018-08-15,,2018-08-15,2022-12-05,2025-02-21,2019-11-13,,,2018.0,10.1002/essoar.10500007.1,1,,,,,,,[],"[""doi_suffix_dot_number""]"
429746,crossref::10.1002/essoar.10500009.1,Earth and Space Science Open Archive,crossref,10.1002/essoar.10500009.1,https://doi.org/10.1002/essoar.10500009.1,https://essopenarchive.org/doi/full/10.1002/es...,,,,,,,,2018-08-15,2018-08-15,,2018-08-15,2022-12-05,2025-09-29,2019-12-03,,,2018.0,10.1002/essoar.10500009.1,1,,,,,,,[],"[""doi_suffix_dot_number""]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2806404,openalex::W999921877,RePEc: Research Papers in Economics,openalex,,,https://steconomice.uoradea.ro/anale/volume/20...,,,,,,,,,2008-01-01,,,,,2025-10-10T00:00:00,,2025-10-10T17:16:08.811792,2008.0,,,,,,,,,[],[]
1016201,openalex::W999947037,HAL,openalex,,,https://hal.science/hal-01922484,,,,,,,,,2014-01-01,,,,,2025-10-10T00:00:00,,2025-11-06T04:12:42.849631,2014.0,,,,,,,,,[],[]
2713111,openalex::W999974616,RePEc: Research Papers in Economics,openalex,,,https://EconPapers.repec.org/RePEc:cde:cdewps:19,,,,,,,,,1994-08-01,,,,,2025-10-10T00:00:00,,2025-10-10T17:16:08.811792,1994.0,,,,,,,,,[],[]
1015101,openalex::W999989114,HAL,openalex,,,https://hal.science/hal-01268467,,,,,,,,,2012-07-01,,,,,2025-10-10T00:00:00,,2025-11-06T04:12:42.849631,2012.0,,,,,,,,,[],[]


In [22]:
explore_server(enrich_versions_dfdf, "Research Square", n=30)

Unnamed: 0,record_id,doi,landing_page_url,version_label,version_label.1,is_version_of,is_version_of.1,is_preprint_of,is_preprint_of.1,published_version_ids_json,published_version_ids_json.1,rels_short,version_signal_reasons
3512005,crossref::10.21203/rs-42600/v2,10.21203/rs-42600/v2,https://www.researchsquare.com/article/rs-4260...,,v2,,,,,,,[],"[""doi_suffix_v_number""]"
3621362,crossref::10.21203/rs.1.1/v1,10.21203/rs.1.1/v1,https://www.researchsquare.com/article/rs-2/v1,,v1,,,10.1186/s13063-019-3980-5,,,,[],"[""doi_suffix_v_number""]"
3621363,crossref::10.21203/rs.1.1/v2,10.21203/rs.1.1/v2,https://www.researchsquare.com/article/rs-2/v2,,v2,,,10.1186/s13063-019-3980-5,,,,[],"[""doi_suffix_v_number""]"
3621165,crossref::10.21203/rs.1.10/v1,10.21203/rs.1.10/v1,https://www.researchsquare.com/article/rs-11/v1,,v1,,,10.1186/s12886-019-1078-1,,,,[],"[""doi_suffix_v_number""]"
3621168,crossref::10.21203/rs.1.10/v2,10.21203/rs.1.10/v2,https://www.researchsquare.com/article/rs-11/v2,,v2,,,10.1186/s12886-019-1078-1,,,,[],"[""doi_suffix_v_number""]"
3621169,crossref::10.21203/rs.1.10/v3,10.21203/rs.1.10/v3,https://www.researchsquare.com/article/rs-11/v3,,v3,,,10.1186/s12886-019-1078-1,,,,[],"[""doi_suffix_v_number""]"
3621172,crossref::10.21203/rs.1.10/v4,10.21203/rs.1.10/v4,https://www.researchsquare.com/article/rs-11/v4,,v4,,,10.1186/s12886-019-1078-1,,,,[],"[""doi_suffix_v_number""]"
3621316,crossref::10.21203/rs.1.11/v1,10.21203/rs.1.11/v1,https://www.researchsquare.com/article/rs-18/v1,,v1,,,10.1186/s13063-019-3374-8,,,,[],"[""doi_suffix_v_number""]"
3621288,crossref::10.21203/rs.1.12/v1,10.21203/rs.1.12/v1,https://www.researchsquare.com/article/rs-17/v1,,v1,,,10.1155/2019/6392751,,,,[],"[""doi_suffix_v_number""]"
3621264,crossref::10.21203/rs.1.13/v1,10.21203/rs.1.13/v1,https://www.researchsquare.com/article/rs-15/v1,,v1,,,10.1186/s12871-018-0671-9,,,,[],"[""doi_suffix_v_number""]"


In [41]:
vvvv

NameError: name 'vvvv' is not defined

## Per server exploration

In [55]:
data = data_clean.copy()
data.shape

(8410094, 23)

### arXiv

In [56]:
arXiv_df = data[data['server_name']=='arXiv']
arXiv_df

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year
7710375,datacite::10.48550/arxiv.0704.0001,arXiv,datacite,10.48550/arxiv.0704.0001,https://doi.org/10.48550/arxiv.0704.0001,https://arxiv.org/abs/0704.0001,2,"[{""relatedIdentifier"": ""10.1103/physrevd.76.01...","{""client"": {""data"": {""id"": ""arxiv.content"", ""t...",,,10.1103/physrevd.76.013009,,,,,,,,2022-03-16,2022-03-16,2022-03-16,2007.0
6936497,datacite::10.48550/arxiv.0704.0002,arXiv,datacite,10.48550/arxiv.0704.0002,https://doi.org/10.48550/arxiv.0704.0002,https://arxiv.org/abs/0704.0002,2,[],"{""client"": {""data"": {""id"": ""arxiv.content"", ""t...",,,,,,,,,,,2022-03-16,2022-03-16,2022-03-16,2007.0
6936500,datacite::10.48550/arxiv.0704.0003,arXiv,datacite,10.48550/arxiv.0704.0003,https://doi.org/10.48550/arxiv.0704.0003,https://arxiv.org/abs/0704.0003,3,[],"{""client"": {""data"": {""id"": ""arxiv.content"", ""t...",,,,,,,,,,,2022-03-16,2022-03-16,2022-03-16,2007.0
6936501,datacite::10.48550/arxiv.0704.0004,arXiv,datacite,10.48550/arxiv.0704.0004,https://doi.org/10.48550/arxiv.0704.0004,https://arxiv.org/abs/0704.0004,1,[],"{""client"": {""data"": {""id"": ""arxiv.content"", ""t...",,,,,,,,,,,2022-03-16,2022-03-16,2022-03-16,2007.0
7710844,datacite::10.48550/arxiv.0704.0005,arXiv,datacite,10.48550/arxiv.0704.0005,https://doi.org/10.48550/arxiv.0704.0005,https://arxiv.org/abs/0704.0005,1,[],"{""client"": {""data"": {""id"": ""arxiv.content"", ""t...",,,,,,,,,,,2022-03-16,2022-03-16,2022-03-16,2007.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7892185,datacite::10.48550/arxiv.supr-con/9608008,arXiv,datacite,10.48550/arxiv.supr-con/9608008,https://doi.org/10.48550/arxiv.supr-con/9608008,https://arxiv.org/abs/supr-con/9608008,1,"[{""relatedIdentifier"": ""10.1103/physrevb.54.15...","{""client"": {""data"": {""id"": ""arxiv.content"", ""t...",,,10.1103/physrevb.54.15530,,,,,,,,2022-03-19,2022-03-19,2022-03-19,1996.0
7892195,datacite::10.48550/arxiv.supr-con/9609001,arXiv,datacite,10.48550/arxiv.supr-con/9609001,https://doi.org/10.48550/arxiv.supr-con/9609001,https://arxiv.org/abs/supr-con/9609001,1,"[{""relatedIdentifier"": ""10.1109/77.620744"", ""r...","{""client"": {""data"": {""id"": ""arxiv.content"", ""t...",,,10.1109/77.620744,,,,,,,,2022-03-19,2022-03-19,2022-03-19,1996.0
7892200,datacite::10.48550/arxiv.supr-con/9609002,arXiv,datacite,10.48550/arxiv.supr-con/9609002,https://doi.org/10.48550/arxiv.supr-con/9609002,https://arxiv.org/abs/supr-con/9609002,1,"[{""relatedIdentifier"": ""10.1103/physrevb.56.11...","{""client"": {""data"": {""id"": ""arxiv.content"", ""t...",,,10.1103/physrevb.56.11989,,,,,,,,2022-03-19,2022-03-19,2022-03-19,1996.0
7892203,datacite::10.48550/arxiv.supr-con/9609003,arXiv,datacite,10.48550/arxiv.supr-con/9609003,https://doi.org/10.48550/arxiv.supr-con/9609003,https://arxiv.org/abs/supr-con/9609003,1,"[{""relatedIdentifier"": ""10.1143/jpsj.65.3131"",...","{""client"": {""data"": {""id"": ""arxiv.content"", ""t...",,,10.1143/jpsj.65.3131,,,,,,,,2022-03-19,2022-03-19,2022-03-19,1996.0


In [None]:
arXiv_df.count()

In [None]:
arXiv_df['version_label'].value_counts()

### SSRN

In [57]:
ssrn_df = data[data['server_name']=='SSRN']
ssrn_df

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year
4801551,crossref::10.2139/ssrn.0,SSRN,crossref,10.2139/ssrn.0,https://doi.org/10.2139/ssrn.0,https://www.ssrn.com/abstract=0,,,,,,,,,2021-01-01,,2021-01-01,2021-11-19,2022-04-04,2021-11-19,,,2021.0
4422416,crossref::10.2139/ssrn.100,SSRN,crossref,10.2139/ssrn.100,https://doi.org/10.2139/ssrn.100,http://www.ssrn.com/abstract=100,,,,,,,,,1997-01-01,,1997-01-01,2019-04-25,2025-10-05,2005-08-19,,,1997.0
4420573,crossref::10.2139/ssrn.1000,SSRN,crossref,10.2139/ssrn.1000,https://doi.org/10.2139/ssrn.1000,http://www.ssrn.com/abstract=1000,,,,,,,,,1997-01-01,,1997-01-01,2019-04-24,2023-09-08,2005-08-19,,,1997.0
4458608,crossref::10.2139/ssrn.1000038,SSRN,crossref,10.2139/ssrn.1000038,https://doi.org/10.2139/ssrn.1000038,http://www.ssrn.com/abstract=1000038,,,,,,,,,2007-01-01,,2007-01-01,2019-05-01,2022-04-03,2011-12-28,,,2007.0
4505212,crossref::10.2139/ssrn.1000075,SSRN,crossref,10.2139/ssrn.1000075,https://doi.org/10.2139/ssrn.1000075,http://www.ssrn.com/abstract=1000075,,,,,,,,,2007-01-01,,2007-01-01,2019-05-07,2022-04-01,2011-12-28,,,2007.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4568250,crossref::10.2139/ssrn.999982,SSRN,crossref,10.2139/ssrn.999982,https://doi.org/10.2139/ssrn.999982,http://www.ssrn.com/abstract=999982,,,,,,,,,2002-01-01,,2002-01-01,2019-05-14,2022-03-30,2011-12-28,,,2002.0
4566662,crossref::10.2139/ssrn.999985,SSRN,crossref,10.2139/ssrn.999985,https://doi.org/10.2139/ssrn.999985,http://www.ssrn.com/abstract=999985,,,,,,,,,2008-01-01,,2008-01-01,2019-05-13,2023-03-10,2011-12-28,,,2008.0
4486040,crossref::10.2139/ssrn.999986,SSRN,crossref,10.2139/ssrn.999986,https://doi.org/10.2139/ssrn.999986,http://www.ssrn.com/abstract=999986,,,,,,,,,2002-01-01,,2002-01-01,2019-05-06,2023-10-02,2011-12-28,,,2002.0
4485908,crossref::10.2139/ssrn.999988,SSRN,crossref,10.2139/ssrn.999988,https://doi.org/10.2139/ssrn.999988,http://www.ssrn.com/abstract=999988,,,,,,,,,2002-01-01,,2002-01-01,2019-05-06,2025-10-04,2011-12-28,,,2002.0


In [None]:
ssrn_df.count()

In [None]:
ssrn_df['version_label'].value_counts()

In [None]:
ssrn_df['is_version_of'].value_counts()

In [None]:
ssrn_df['is_preprint_of'].value_counts()

In [None]:
ssrn_df['relations_json'].value_counts()

In [None]:
ssrn_df[ssrn_df['relations_json'].notna()]

### HAL

In [None]:
HAL_df = data[data['server_name']=='HAL']
HAL_df

In [None]:
HAL_df.count()

In [None]:
HAL_df['version_label'].value_counts()

In [None]:
HAL_df['is_version_of'].value_counts()

In [None]:
HAL_df['is_preprint_of'].value_counts()

In [None]:
HAL_df[HAL_df['doi'].notna()]

### Research Square

In [58]:
Research_Square_df = data[data['server_name']=='Research Square']
Research_Square_df

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year
3512005,crossref::10.21203/rs-42600/v2,Research Square,crossref,10.21203/rs-42600/v2,https://doi.org/10.21203/rs-42600/v2,https://www.researchsquare.com/article/rs-4260...,,,,,,,,2020-07-30,2020-07-30,,2020-07-30,2020-08-03,2024-03-03,2020-08-03,,,2020.0
3621362,crossref::10.21203/rs.1.1/v1,Research Square,crossref,10.21203/rs.1.1/v1,https://doi.org/10.21203/rs.1.1/v1,https://www.researchsquare.com/article/rs-2/v1,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,,10.1186/s13063-019-3980-5,,2018-10-18,2018-10-18,,2018-10-18,2022-07-28,2025-10-21,2018-10-22,,,2018.0
3621363,crossref::10.21203/rs.1.1/v2,Research Square,crossref,10.21203/rs.1.1/v2,https://doi.org/10.21203/rs.1.1/v2,https://www.researchsquare.com/article/rs-2/v2,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,,10.1186/s13063-019-3980-5,,2019-10-09,2019-10-09,,2019-10-09,2022-07-28,2025-05-14,2019-10-09,,,2019.0
3621165,crossref::10.21203/rs.1.10/v1,Research Square,crossref,10.21203/rs.1.10/v1,https://doi.org/10.21203/rs.1.10/v1,https://www.researchsquare.com/article/rs-11/v1,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,,10.1186/s12886-019-1078-1,,2018-11-07,2018-11-07,,2018-11-07,2022-07-28,2024-03-03,2018-11-07,,,2018.0
3621168,crossref::10.21203/rs.1.10/v2,Research Square,crossref,10.21203/rs.1.10/v2,https://doi.org/10.21203/rs.1.10/v2,https://www.researchsquare.com/article/rs-11/v2,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,,10.1186/s12886-019-1078-1,,2019-01-21,2019-01-21,,2019-01-21,2022-07-28,2024-03-03,2019-01-21,,,2019.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3583109,crossref::10.21203/rs.3.rs-999957/v1,Research Square,crossref,10.21203/rs.3.rs-999957/v1,https://doi.org/10.21203/rs.3.rs-999957/v1,https://www.researchsquare.com/article/rs-9999...,,,,,,,,2021-11-30,2021-11-30,,2021-11-30,2021-12-03,2025-05-14,2021-11-30,,,2021.0
3681336,crossref::10.21203/rs.3.rs-999962/v1,Research Square,crossref,10.21203/rs.3.rs-999962/v1,https://doi.org/10.21203/rs.3.rs-999962/v1,https://www.researchsquare.com/article/rs-9999...,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,,10.1186/s13578-022-00762-1,,2021-10-26,2021-10-26,,2021-10-26,2022-07-29,2025-05-14,2021-10-26,,,2021.0
3580631,crossref::10.21203/rs.3.rs-999982/v1,Research Square,crossref,10.21203/rs.3.rs-999982/v1,https://doi.org/10.21203/rs.3.rs-999982/v1,https://www.researchsquare.com/article/rs-9999...,,,,,,,,2021-11-15,2021-11-15,,2021-11-15,2021-11-15,2025-05-15,2021-11-15,,,2021.0
3579310,crossref::10.21203/rs.3.rs-999985/v1,Research Square,crossref,10.21203/rs.3.rs-999985/v1,https://doi.org/10.21203/rs.3.rs-999985/v1,https://www.researchsquare.com/article/rs-9999...,,,,,,,,2021-11-05,2021-11-05,,2021-11-05,2021-11-05,2022-03-29,2021-11-05,,,2021.0


In [None]:
Research_Square_df.count()

In [None]:
Research_Square_df['version_label'].value_counts()

In [None]:
Research_Square_df['is_version_of'].unique()

In [None]:
pattern = "v2"

mask = data[data['server_name']=='Research Square']['is_version_of'].str.contains(pattern, regex=False, na=False)
result = data[data['server_name']=='Research Square'][mask]
result

In [None]:
Research_Square_df['relations_json'].value_counts()

### RePEc: Research Papers in Economics

In [None]:
data[data['server_name']=='RePEc: Research Papers in Economics'] #['landing_page_url'][9146744]

### bioRxiv

In [59]:
data[data['server_name']=='bioRxiv'] #['landing_page_url'][9146744]

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year
8967487,crossref::10.1101/000026,bioRxiv,crossref,10.1101/000026,https://doi.org/10.1101/000026,http://biorxiv.org/lookup/doi/10.1101/000026,,,,,,,,2013-11-07,2013-11-07,,2013-11-07,2024-08-03,2024-08-03,2013-11-30,,,2013.0
8843906,crossref::10.1101/000042,bioRxiv,crossref,10.1101/000042,https://doi.org/10.1101/000042,http://biorxiv.org/lookup/doi/10.1101/000042,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,,10.1038/nrg3723,,2013-11-07,2013-11-07,,2013-11-07,2020-04-20,2024-08-03,2013-11-30,,,2013.0
8967488,crossref::10.1101/000067,bioRxiv,crossref,10.1101/000067,https://doi.org/10.1101/000067,http://biorxiv.org/lookup/doi/10.1101/000067,,,,,,,,2013-11-07,2013-11-07,,2013-11-07,2024-08-03,2024-08-03,2013-11-30,,,2013.0
8967490,crossref::10.1101/000075,bioRxiv,crossref,10.1101/000075,https://doi.org/10.1101/000075,http://biorxiv.org/lookup/doi/10.1101/000075,,,,,,,,2013-11-07,2013-11-07,,2013-11-07,2024-08-03,2024-08-03,2013-11-30,,,2013.0
8967489,crossref::10.1101/000091,bioRxiv,crossref,10.1101/000091,https://doi.org/10.1101/000091,http://biorxiv.org/lookup/doi/10.1101/000091,,,,,,,,2013-11-07,2013-11-07,,2013-11-07,2024-08-03,2024-08-03,2013-11-30,,,2013.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8864960,crossref::10.1101/872853,bioRxiv,crossref,10.1101/872853,https://doi.org/10.1101/872853,http://biorxiv.org/lookup/doi/10.1101/872853,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,,10.1093/femsyr/foaa018,,2019-12-11,2019-12-11,,2019-12-11,2021-12-15,2025-02-21,2019-12-12,,,2019.0
8947267,crossref::10.1101/872879,bioRxiv,crossref,10.1101/872879,https://doi.org/10.1101/872879,http://biorxiv.org/lookup/doi/10.1101/872879,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,,10.1038/s41467-020-14898-6,,2019-12-11,2019-12-11,,2019-12-11,2024-04-29,2025-02-21,2019-12-11,,,2019.0
8938850,crossref::10.1101/872952,bioRxiv,crossref,10.1101/872952,https://doi.org/10.1101/872952,http://biorxiv.org/lookup/doi/10.1101/872952,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,,10.1083/jcb.202001160,,2019-12-11,2019-12-11,,2019-12-11,2024-04-23,2024-04-23,2019-12-12,,,2019.0
8890449,crossref::10.1101/873109,bioRxiv,crossref,10.1101/873109,https://doi.org/10.1101/873109,http://biorxiv.org/lookup/doi/10.1101/873109,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,,10.1126/scisignal.aay4353,,2019-12-11,2019-12-11,,2019-12-11,2022-10-08,2025-02-21,2019-12-12,,,2019.0


In [None]:
data[data['server_name']=='bioRxiv']['is_version_of'].value_counts()

In [None]:
data[data['server_name']=='bioRxiv']['relations_json'].value_counts()

In [None]:
pattern = "10.1101/022145"

mask = data['doi'].str.contains(pattern, regex=False, na=False)
result = data[mask]
result

In [None]:
pattern = "v2"

mask = data[data['server_name']=='bioRxiv']['landing_page_url'].str.contains(pattern, regex=False, na=False)
result = data[data['server_name']=='bioRxiv'][mask]
result

In [None]:
pattern = "v2"

mask = data[data['server_name']=='bioRxiv']['doi'].str.contains(pattern, regex=False, na=False)
result = data[data['server_name']=='bioRxiv'][mask]
result

In [None]:
pattern = "v2"

mask = data[data['server_name']=='bioRxiv']['is_version_of'].str.contains(pattern, regex=False, na=False)
result = data[data['server_name']=='bioRxiv'][mask]
result

### eLife

In [60]:
data[data['server_name']=='eLife'] #['landing_page_url'][9146744]

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year
9130897,crossref::10.7554/elife,eLife,crossref,10.7554/elife,https://doi.org/10.7554/elife,https://elifesciences.org/,,,,,,,,,,,,2017-07-25,2025-12-30,2017-07-25,,,
9274104,crossref::10.7554/elife.00003,eLife,crossref,10.7554/elife.00003,https://doi.org/10.7554/elife.00003,https://elifesciences.org/articles/00003,,,,,,,,,2012-11-13,2012-11-13,2012-11-13,2023-10-11,2025-12-23,2012-11-13,,,2012.0
9131097,crossref::10.7554/elife.00003.001,eLife,crossref,10.7554/elife.00003.001,https://doi.org/10.7554/elife.00003.001,https://elifesciences.org/articles/00003#abstract,,,,,,,,,,,,2018-08-23,2025-11-23,2012-11-13,,,
9131098,crossref::10.7554/elife.00003.002,eLife,crossref,10.7554/elife.00003.002,https://doi.org/10.7554/elife.00003.002,https://elifesciences.org/articles/00003#digest,,,,,,,,,,,,2018-08-23,2025-11-23,2012-11-13,,,
9131100,crossref::10.7554/elife.00003.003,eLife,crossref,10.7554/elife.00003.003,https://doi.org/10.7554/elife.00003.003,https://elifesciences.org/articles/00003#fig1,,,,,,,,,,,,2018-08-23,2025-11-23,2012-11-13,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9366367,crossref::10.7554/elife.99999.2.sa2,eLife,crossref,10.7554/elife.99999.2.sa2,https://doi.org/10.7554/elife.99999.2.sa2,https://elifesciences.org/reviewed-preprints/9...,,"{""is-review-of"": [{""asserted-by"": ""subject"", ""...",,,,,,,2025-06-26,,2025-06-26,2025-06-26,2025-06-26,2025-06-26,,,2025.0
9370550,crossref::10.7554/elife.99999.3,eLife,crossref,10.7554/elife.99999.3,https://doi.org/10.7554/elife.99999.3,https://elifesciences.org/articles/99999,,"{""has-preprint"": [{""asserted-by"": ""subject"", ""...",,10.7554/elife.99999.1;10.7554/elife.99999.2,,,,,2025-09-09,2025-09-09,2025-09-09,2025-09-09,2025-12-24,2025-09-09,,,2025.0
9370553,crossref::10.7554/elife.99999.3.sa0,eLife,crossref,10.7554/elife.99999.3.sa0,https://doi.org/10.7554/elife.99999.3.sa0,https://elifesciences.org/articles/99999v1,,"{""is-review-of"": [{""asserted-by"": ""subject"", ""...",,,,,,,2025-09-09,,2025-09-09,2025-09-09,2025-09-11,2025-09-09,,,2025.0
9370552,crossref::10.7554/elife.99999.3.sa1,eLife,crossref,10.7554/elife.99999.3.sa1,https://doi.org/10.7554/elife.99999.3.sa1,https://elifesciences.org/articles/99999v1/pee...,,"{""is-review-of"": [{""asserted-by"": ""subject"", ""...",,,,,,,2025-09-09,,2025-09-09,2025-09-09,2025-09-11,2025-09-09,,,2025.0


In [None]:
data[data['doi']=='10.7554/elife.101496']

In [None]:
data[data['doi']=='10.7554/elife.101496.sa0']

In [None]:
data[data['doi']=='10.7554/elife.101496.sa1']

In [None]:
data[data['doi']=='10.7554/elife.101496.sa2']

### AgEcon Search

In [None]:
data[data['server_name']=='AgEcon Search'] #['landing_page_url'][9146744]

### ResearchGate

In [None]:
data[data['server_name']=='ResearchGate']

In [None]:
data[data['doi']=='10.13140/rg.2.2.34292.82565/16']

In [None]:
data[data['doi']=='10.13140/rg.2.2.34292.82565']

### Qeios

In [None]:
data[data['server_name']=='Qeios']

### Zenodo

In [None]:
data[data['server_name']=='Zenodo']

### Open Science Framework

In [None]:
data[data['server_name']=='Open Science Framework']

In [None]:
data[
    (data['server_name'].eq('Open Science Framework')) &
    (data['backend'].eq('openalex'))
]


In [None]:
data[
    (data['server_name'].eq('Open Science Framework')) &
    (data['backend'].eq('datacite'))
]


In [None]:
data[
    (data['server_name'].eq('Open Science Framework')) &
    (data['backend'].eq('crossref'))
]


### Preprint.org

In [None]:
data[data['server_name']=='Preprints.org']

### viXra

In [None]:
data[data['server_name']=='viXra']

In [77]:
data[data['landing_page_url']=='https://vixra.org/pdf/1306.0105v3.pdf']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year
9486891,openalex::W999790414,viXra,openalex,,,https://vixra.org/pdf/1306.0105v3.pdf,,,,,,,,,2013-06-01,,,,,2025-10-10T00:00:00,,2025-10-10T17:16:08.811792,2013.0


In [None]:
data[data['landing_page_url']=='https://vixra.org/pdf/1306.0105v1.pdf']

### Advance

In [None]:
data[data['server_name']=='Advance']

In [None]:
pattern = "10.31124/advance.175630073.36585408"

mask = data['doi'].str.contains(pattern, regex=False, na=False)
result = data[mask]
result

In [None]:
result['landing_page_url']

In [None]:
### AfricArXiv

In [None]:
data[data['server_name']=='AfricArXiv']

### AgriRxiv

In [None]:
data[data['server_name']=='AgriRxiv']

### AIJR Preprints

In [None]:
data[data['server_name']=='AIJR Preprints']

In [None]:
data[data['server_name']=='AIJR Preprints']['landing_page_url']

### AMRC Open Research

In [61]:
data[data['server_name']=='AMRC Open Research']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year
178,crossref::10.12688/amrcopenres,AMRC Open Research,crossref,10.12688/amrcopenres,https://doi.org/10.12688/amrcopenres,http://www.amrcopenresearch.org,,,,,,,,,,,,2022-11-18,2022-11-19,2019-02-19,,,
145,crossref::10.12688/amrcopenres.12855.1,AMRC Open Research,crossref,10.12688/amrcopenres.12855.1,https://doi.org/10.12688/amrcopenres.12855.1,https://amrcopenresearch.org/articles/1-2/v1,,"{""has-review"": [{""asserted-by"": ""subject"", ""id...",,,,,,,2019-02-19,2019-02-19,2019-02-19,2019-09-21,2025-12-22,2019-02-19,,,2019.0
143,crossref::10.12688/amrcopenres.12860.1,AMRC Open Research,crossref,10.12688/amrcopenres.12860.1,https://doi.org/10.12688/amrcopenres.12860.1,https://amrcopenresearch.org/articles/1-4/v1,,"{""has-review"": [{""asserted-by"": ""subject"", ""id...",,,,,,,2019-02-19,2019-02-19,2019-02-19,2019-09-19,2025-02-21,2019-02-19,,,2019.0
151,crossref::10.12688/amrcopenres.12860.2,AMRC Open Research,crossref,10.12688/amrcopenres.12860.2,https://doi.org/10.12688/amrcopenres.12860.2,https://amrcopenresearch.org/articles/1-4/v2,New version,"{""has-review"": [{""asserted-by"": ""subject"", ""id...",,,,,,,2020-01-16,2020-01-16,2020-01-16,2020-01-20,2025-10-27,2020-01-16,,,2020.0
147,crossref::10.12688/amrcopenres.12861.1,AMRC Open Research,crossref,10.12688/amrcopenres.12861.1,https://doi.org/10.12688/amrcopenres.12861.1,https://amrcopenresearch.org/articles/1-1/v1,,"{""has-review"": [{""asserted-by"": ""subject"", ""id...",,,,,,,2019-02-19,2019-02-19,2019-02-19,2019-09-21,2025-02-21,2019-02-19,,,2019.0
144,crossref::10.12688/amrcopenres.12861.2,AMRC Open Research,crossref,10.12688/amrcopenres.12861.2,https://doi.org/10.12688/amrcopenres.12861.2,https://amrcopenresearch.org/articles/1-1/v2,New version,"{""has-review"": [{""asserted-by"": ""subject"", ""id...",,,,,,,2019-06-26,2019-06-26,2019-06-26,2019-09-21,2025-02-21,2019-06-26,,,2019.0
180,crossref::10.12688/amrcopenres.12862.1,AMRC Open Research,crossref,10.12688/amrcopenres.12862.1,https://doi.org/10.12688/amrcopenres.12862.1,https://amrcopenresearch.org/articles/1-3/v1,,"{""has-review"": [{""asserted-by"": ""subject"", ""id...",,,,,,,2019-02-19,2019-02-19,2019-02-19,2024-07-14,2025-02-21,2019-02-19,,,2019.0
181,crossref::10.12688/amrcopenres.12862.2,AMRC Open Research,crossref,10.12688/amrcopenres.12862.2,https://doi.org/10.12688/amrcopenres.12862.2,https://amrcopenresearch.org/articles/1-3/v2,New version,"{""has-review"": [{""asserted-by"": ""subject"", ""id...",,,,,,,2019-08-07,2019-08-07,2019-08-07,2024-07-21,2025-02-21,2019-08-07,,,2019.0
146,crossref::10.12688/amrcopenres.12873.1,AMRC Open Research,crossref,10.12688/amrcopenres.12873.1,https://doi.org/10.12688/amrcopenres.12873.1,https://amrcopenresearch.org/articles/1-11/v1,,"{""has-review"": [{""asserted-by"": ""subject"", ""id...",,,,,,,2019-05-21,2019-05-21,2019-05-21,2019-09-21,2025-02-21,2019-05-21,,,2019.0
149,crossref::10.12688/amrcopenres.12873.2,AMRC Open Research,crossref,10.12688/amrcopenres.12873.2,https://doi.org/10.12688/amrcopenres.12873.2,https://amrcopenresearch.org/articles/1-11/v2,New version,,,,,,,,2019-07-15,2019-07-15,2019-07-15,2019-09-23,2025-10-18,2019-07-15,,,2019.0


In [None]:
pattern = "10.12688/amrcopenres.12907"

mask = data['doi'].str.contains(pattern, regex=False, na=False)
result = data[mask]
result

### APSA Preprints

In [None]:
data[data['server_name']=='APSA Preprints']

In [None]:
pattern = "10.33774/apsa-2023-m7x2m"

mask = data['doi'].str.contains(pattern, regex=False, na=False)
result = data[mask]
result

### Arabixiv

In [None]:
Arabixiv_df= data[data['server_name']=='Arabixiv']
Arabixiv_df

In [None]:
pattern = "_v2"

mask = Arabixiv_df['landing_page_url'].str.contains(pattern, regex=False, na=False)
result = Arabixiv_df[mask]
result

In [None]:
pattern = "10.31221/osf.io/sy3mk"

mask = data['doi'].str.contains(pattern, regex=False, na=False)
result = data[mask]
result

### ARPHA Preprints

In [None]:
data[data['server_name']=='ARPHA Preprints']

### ART-Dok

In [None]:
data[data['server_name']=='ART-Dok']

In [None]:
data[data['server_name']=='ART-Dok']['version_label'].value_counts()

### Authorea Inc.

In [62]:
data[data['server_name']=='Authorea Inc.']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year
273725,crossref::10.15200/winn.143644.45920,Authorea Inc.,crossref,10.15200/winn.143644.45920,https://doi.org/10.15200/winn.143644.45920,https://www.authorea.com/users/607662/articles...,,,,,,,,2023-04-17,2023-04-17,,2023-04-17,2023-09-05,2025-06-03,2015-07-09,,,2023.0
231842,crossref::10.22541/21docs.166724538.84041815/v1,Authorea Inc.,crossref,10.22541/21docs.166724538.84041815/v1,https://doi.org/10.22541/21docs.166724538.8404...,https://www.authorea.com/users/517807/articles...,,,,,,,,2022-10-31,2022-10-31,,2022-10-31,2022-10-31,2025-06-03,2022-10-31,,,2022.0
237085,crossref::10.22541/21docs.168138558.87244492/v1,Authorea Inc.,crossref,10.22541/21docs.168138558.87244492/v1,https://doi.org/10.22541/21docs.168138558.8724...,https://essopenarchive.org/users/461179/articl...,,,,,,,,2023-04-13,2023-04-13,,2023-04-13,2023-04-13,2025-06-03,2023-04-13,,,2023.0
237262,crossref::10.22541/21docs.168174839.96052014/v1,Authorea Inc.,crossref,10.22541/21docs.168174839.96052014/v1,https://doi.org/10.22541/21docs.168174839.9605...,https://www.authorea.com/users/487877/articles...,,,,,,,,2023-04-17,2023-04-17,,2023-04-17,2023-04-17,2025-06-03,2023-04-17,,,2023.0
237263,crossref::10.22541/21docs.168174877.72543727/v1,Authorea Inc.,crossref,10.22541/21docs.168174877.72543727/v1,https://doi.org/10.22541/21docs.168174877.7254...,https://www.authorea.com/users/487877/articles...,,,,,,,,2023-04-17,2023-04-17,,2023-04-17,2023-04-17,2025-06-03,2023-04-17,,,2023.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273607,crossref::10.22541/au.176720962.27813585/v1,Authorea Inc.,crossref,10.22541/au.176720962.27813585/v1,https://doi.org/10.22541/au.176720962.27813585/v1,https://www.authorea.com/users/1014866/article...,,,,,,,,2025-12-31,2025-12-31,,2025-12-31,2025-12-31,2025-12-31,2025-12-31,,,2025.0
273608,crossref::10.22541/au.176722209.98153476/v1,Authorea Inc.,crossref,10.22541/au.176722209.98153476/v1,https://doi.org/10.22541/au.176722209.98153476/v1,https://www.authorea.com/users/1014885/article...,,,,,,,,2025-12-31,2025-12-31,,2025-12-31,2025-12-31,2025-12-31,2025-12-31,,,2025.0
273609,crossref::10.22541/au.176722210.05496848/v1,Authorea Inc.,crossref,10.22541/au.176722210.05496848/v1,https://doi.org/10.22541/au.176722210.05496848/v1,https://www.authorea.com/users/1014886/article...,,,,,,,,2025-12-31,2025-12-31,,2025-12-31,2025-12-31,2025-12-31,2025-12-31,,,2025.0
241980,crossref::10.22541/techrxiv.169288366.62411171/v1,Authorea Inc.,crossref,10.22541/techrxiv.169288366.62411171/v1,https://doi.org/10.22541/techrxiv.169288366.62...,https://essopenarchive.org/users/487877/articl...,,,,,,,,2023-08-24,2023-08-24,,2023-08-24,2023-08-24,2025-06-03,2023-08-24,,,2023.0


In [65]:
pattern = "10.22541/au.158826951.14028294"

mask = data['doi'].str.contains(pattern, regex=False, na=False)
result = data[mask]
result

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year
208276,crossref::10.22541/au.158826951.14028294,Authorea Inc.,crossref,10.22541/au.158826951.14028294,https://doi.org/10.22541/au.158826951.14028294,https://www.authorea.com/users/3/articles/4300...,,,,,,,,2020-04-30,2020-04-30,,2020-04-30,2020-04-30,2025-11-23,2020-04-30,,,2020.0
208277,crossref::10.22541/au.158826951.14028294/v2,Authorea Inc.,crossref,10.22541/au.158826951.14028294/v2,https://doi.org/10.22541/au.158826951.14028294/v2,https://www.authorea.com/users/3/articles/4300...,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.22541/au.158826951.14028294,,,,2020-04-30,2020-04-30,,2020-04-30,2020-04-30,2025-06-03,2020-04-30,,,2020.0
232602,crossref::10.22541/au.158826951.14028294/v3,Authorea Inc.,crossref,10.22541/au.158826951.14028294/v3,https://doi.org/10.22541/au.158826951.14028294/v3,https://www.authorea.com/users/3/articles/4300...,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.22541/au.158826951.14028294,,,,2022-11-29,2022-11-29,,2022-11-29,2022-11-29,2025-06-03,2022-11-29,,,2022.0


### Beilstein Archives

In [None]:
data[data['server_name']=='Beilstein Archives']

In [None]:
pattern = "10.3762/bxiv.2019.102"

mask = data['doi'].str.contains(pattern, regex=False, na=False)
result = data[mask]
result

In [None]:
pattern = ".v2"

mask = data[data['server_name']=='Beilstein Archives']['doi'].str.contains(pattern, regex=False, na=False)
result = data[data['server_name']=='Beilstein Archives'][mask]
result

### Bepress Legal Repository

In [None]:
data[data['server_name']=='Bepress Legal Repository']

### BioHackrXiv

In [None]:
data[data['server_name']=='BioHackrXiv']

In [None]:
pattern = "v2"

mask = data[data['server_name']=='BioHackrXiv']['doi'].str.contains(pattern, regex=False, na=False)
result = data[data['server_name']=='BioHackrXiv'][mask]
result

In [None]:
pattern = "10.37044/osf.io/5uhwz"

mask = data['doi'].str.contains(pattern, regex=False, na=False)
result = data[mask]
result

### Cambridge Open Engage

In [63]:
data[data['server_name']=='Cambridge Open Engage']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year
275889,crossref::10.33774/coe-2020-03prm,Cambridge Open Engage,crossref,10.33774/coe-2020-03prm,https://doi.org/10.33774/coe-2020-03prm,https://www.cambridge.org/engage/coe/article-d...,,,,,,,,2020-07-16,2020-07-16,,2020-07-16,2021-10-20,2025-05-14,2020-07-16,,,2020.0
275905,crossref::10.33774/coe-2020-05l1b,Cambridge Open Engage,crossref,10.33774/coe-2020-05l1b,https://doi.org/10.33774/coe-2020-05l1b,https://www.cambridge.org/engage/coe/article-d...,,,,,,,,2020-09-01,2020-09-01,,2020-09-01,2021-10-20,2025-11-11,2020-09-01,,,2020.0
275919,crossref::10.33774/coe-2020-05p95,Cambridge Open Engage,crossref,10.33774/coe-2020-05p95,https://doi.org/10.33774/coe-2020-05p95,https://www.cambridge.org/engage/coe/article-d...,,,,,,,,2020-10-02,2020-10-02,,2020-10-02,2021-10-20,2025-05-14,2020-10-02,,,2020.0
275838,crossref::10.33774/coe-2020-08ng6,Cambridge Open Engage,crossref,10.33774/coe-2020-08ng6,https://doi.org/10.33774/coe-2020-08ng6,https://www.cambridge.org/engage/coe/article-d...,,,,,,,,2020-04-17,2020-04-17,,2020-04-17,2021-10-20,2022-04-02,2020-04-17,,,2020.0
275935,crossref::10.33774/coe-2020-08r2r,Cambridge Open Engage,crossref,10.33774/coe-2020-08r2r,https://doi.org/10.33774/coe-2020-08r2r,https://www.cambridge.org/engage/coe/article-d...,,,,,,,,2020-11-06,2020-11-06,,2020-11-06,2021-10-20,2025-05-14,2020-11-06,,,2020.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278537,crossref::10.33774/coe-2025-zrw4b,Cambridge Open Engage,crossref,10.33774/coe-2025-zrw4b,https://doi.org/10.33774/coe-2025-zrw4b,https://www.cambridge.org/engage/coe/article-d...,,,,,,,,2025-10-17,2025-10-17,,2025-10-17,2025-10-17,2025-10-18,2025-10-17,,,2025.0
278616,crossref::10.33774/coe-2025-zsj9r,Cambridge Open Engage,crossref,10.33774/coe-2025-zsj9r,https://doi.org/10.33774/coe-2025-zsj9r,https://www.cambridge.org/engage/coe/article-d...,,,,,,,,2025-11-14,2025-11-14,,2025-11-14,2025-11-14,2025-11-14,2025-11-14,,,2025.0
277878,crossref::10.33774/coe-2025-zwj7p,Cambridge Open Engage,crossref,10.33774/coe-2025-zwj7p,https://doi.org/10.33774/coe-2025-zwj7p,https://www.cambridge.org/engage/coe/article-d...,,,,,,,,2025-01-31,2025-01-31,,2025-01-31,2025-01-31,2025-02-01,2025-01-31,,,2025.0
278729,crossref::10.33774/coe-2025-zwrxb,Cambridge Open Engage,crossref,10.33774/coe-2025-zwrxb,https://doi.org/10.33774/coe-2025-zwrxb,https://www.cambridge.org/engage/coe/article-d...,,,,,,,,2025-12-16,2025-12-16,,2025-12-16,2025-12-16,2025-12-16,2025-12-16,,,2025.0


In [None]:
pattern = "v4"

mask = data[data['server_name']=='Cambridge Open Engage']['doi'].str.contains(pattern, regex=False, na=False)
result = data[data['server_name']=='Cambridge Open Engage'][mask]
result

In [None]:
pattern = "0.33774/coe-2020-vntk7"

mask = data['doi'].str.contains(pattern, regex=False, na=False)
result = data[mask]
result

### CERN document server

In [None]:
data[data['server_name']=='CERN document server']

In [None]:
data[data['server_name']=='CERN document server']['version_label'].value_counts()

### ChemRxiv

In [64]:
data[data['server_name']=='ChemRxiv']

Unnamed: 0,record_id,server_name,backend,doi,doi_url,landing_page_url,version_label,relations_json,raw_relationships_json,is_version_of,version_of_ids_json,is_preprint_of,published_version_ids_json,date_posted,date_published,date_published_online,date_issued,date_deposited,date_indexed,date_created,date_registered,date_updated,publication_year
286137,crossref::10.26434/chemrxiv-2021-00kkd,ChemRxiv,crossref,10.26434/chemrxiv-2021-00kkd,https://doi.org/10.26434/chemrxiv-2021-00kkd,https://chemrxiv.org/engage/chemrxiv/article-d...,,,,,,,,2021-12-23,2021-12-23,,2021-12-23,2021-12-23,2025-11-23,2021-12-23,,,2021.0
312389,crossref::10.26434/chemrxiv-2021-00kkd-v2,ChemRxiv,crossref,10.26434/chemrxiv-2021-00kkd-v2,https://doi.org/10.26434/chemrxiv-2021-00kkd-v2,https://chemrxiv.org/engage/chemrxiv/article-d...,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,10.26434/chemrxiv-2021-00kkd,,10.1021/jacs.1c13508,,2022-03-07,2022-03-07,,2022-03-07,2025-04-16,2025-05-14,2022-03-07,,,2022.0
312423,crossref::10.26434/chemrxiv-2021-00rj4,ChemRxiv,crossref,10.26434/chemrxiv-2021-00rj4,https://doi.org/10.26434/chemrxiv-2021-00rj4,https://chemrxiv.org/engage/chemrxiv/article-d...,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,,10.1149/1945-7111/ac3ff4,,2021-10-05,2021-10-05,,2021-10-05,2025-04-16,2025-04-16,2022-01-25,,,2021.0
312367,crossref::10.26434/chemrxiv-2021-012c7,ChemRxiv,crossref,10.26434/chemrxiv-2021-012c7,https://doi.org/10.26434/chemrxiv-2021-012c7,https://chemrxiv.org/engage/chemrxiv/article-d...,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,,10.1016/j.msec.2022.112697,,2021-10-29,2021-10-29,,2021-10-29,2025-04-16,2025-04-16,2022-01-25,,,2021.0
286254,crossref::10.26434/chemrxiv-2021-013gn,ChemRxiv,crossref,10.26434/chemrxiv-2021-013gn,https://doi.org/10.26434/chemrxiv-2021-013gn,https://chemrxiv.org/engage/chemrxiv/article-d...,,"{""is-version-of"": [{""asserted-by"": ""subject"", ...",,10.26434/chemrxiv-2021-013gn-v2,,,,2021-07-15,2021-07-15,,2021-07-15,2022-01-25,2025-11-23,2022-01-25,,,2021.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281396,crossref::10.26434/chemrxiv.9999572.v1,ChemRxiv,crossref,10.26434/chemrxiv.9999572.v1,https://doi.org/10.26434/chemrxiv.9999572.v1,https://chemrxiv.org/engage/chemrxiv/article-d...,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,,10.1002/chem.201904798,,2019-10-21,2019-10-21,,2019-10-21,2021-10-21,2025-05-14,2019-10-21,,,2019.0
281397,crossref::10.26434/chemrxiv.9999656.v1,ChemRxiv,crossref,10.26434/chemrxiv.9999656.v1,https://doi.org/10.26434/chemrxiv.9999656.v1,https://chemrxiv.org/engage/chemrxiv/article-d...,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,,10.1002/jcc.26182,,2019-10-22,2019-10-22,,2019-10-22,2021-10-21,2025-05-14,2019-10-22,,,2019.0
288879,crossref::10.26434/chemrxiv.9999734.v1,ChemRxiv,crossref,10.26434/chemrxiv.9999734.v1,https://doi.org/10.26434/chemrxiv.9999734.v1,https://chemrxiv.org/engage/chemrxiv/article-d...,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,,10.1021/acs.jchemed.9b00975,,2019-10-22,2019-10-22,,2019-10-22,2022-08-18,2025-05-14,2019-10-22,,,2019.0
312387,crossref::10.26434/chemrxiv.9999848.v1,ChemRxiv,crossref,10.26434/chemrxiv.9999848.v1,https://doi.org/10.26434/chemrxiv.9999848.v1,https://chemrxiv.org/engage/chemrxiv/article-d...,,"{""is-preprint-of"": [{""asserted-by"": ""subject"",...",,,,10.1002/chem.202001365,,2019-10-22,2019-10-22,,2019-10-22,2025-04-16,2025-05-14,2019-10-22,,,2019.0


In [None]:
pattern = "v9"

mask = data[data['server_name']=='ChemRxiv']['doi'].str.contains(pattern, regex=False, na=False)
result = data[data['server_name']=='ChemRxiv'][mask]
result

In [None]:
pattern = "10.26434/chemrxiv-2025-202"

mask = data['doi'].str.contains(pattern, regex=False, na=False)
result = data[mask]
result

In [None]:
pattern = "10.26434/chemrxiv-2025-xc7vd"

mask = data['doi'].str.contains(pattern, regex=False, na=False)
result = data[mask]
result

### CogPrints

In [None]:
data[data['server_name']=='CogPrints']

In [None]:
data[data['server_name']=='CogPrints'].count()

### CoP

In [None]:
data[data['server_name']=='CoP']

In [None]:
pattern = "10.31219/osf.io/bg3sn"

mask = data['doi'].str.contains(pattern, regex=False, na=False)
result = data[mask]
result

### Covid-19 Preprints

In [None]:
data[data['server_name']=='Covid-19 Preprints']

In [None]:
data[data['server_name']=='Covid-19 Preprints'].count()

### CrimRxiv

In [None]:
data[data['server_name']=='CrimRxiv'].tail(60)

### CrossAsia-Repository

In [None]:
data[data['server_name']=='CrossAsia-Repository']

### Digital Access to Scholarship at Harvard (DASH) (Harvard University)

In [None]:
data[data['server_name']=='Digital Access to Scholarship at Harvard (DASH) (Harvard University)']

### DSpace@MIT

In [None]:
data[data['server_name']=='DSpace@MIT']

### E-LIS Repository

In [None]:
data[data['server_name']=='E-LIS Repository']

In [None]:
### Earth and Space Science Open Archive

In [None]:
data[data['server_name']=='Earth and Space Science Open Archive']

In [None]:
pattern = "10.1002/essoar.10502516"

mask = data['doi'].str.contains(pattern, regex=False, na=False)
result = data[mask]
result

### EarthArXiv

In [None]:
data[data['server_name']=='EarthArXiv']

In [None]:
pattern = "v2"

mask = data[data['server_name']=='EarthArXiv']['landing_page_url'].str.contains(pattern, regex=False, na=False)
result = data[data['server_name']=='EarthArXiv'][mask]
result

### EasyChair preprint

In [None]:
data[data['server_name']=='EasyChair preprint']

### EcoEvoRxiv

In [None]:
data[data['server_name']=='EcoEvoRxiv']

In [None]:
pattern = "_v1"

mask = data[data['server_name']=='EcoEvoRxiv']['landing_page_url'].str.contains(pattern, regex=False, na=False)
result = data[data['server_name']=='EcoEvoRxiv'][mask]
result

### EconStor Preprints

In [None]:
data[data['server_name']=='EconStor Preprints']

### EGUsphere

In [None]:
data[data['server_name']=='EGUsphere']

In [None]:
pattern = "10.5194/egusphere-2022-187"

mask = data['doi'].str.contains(pattern, regex=False, na=False)
result = data[mask]
result

### Electron Colloquium Comput Complex

In [None]:
data[data['server_name']=='Electron Colloquium Comput Complex']

In [None]:
data[data['server_name']=='Electron Colloquium Comput Complex']['landing_page_url'][571606]

### ELPUB (Universitat Wuppertal)

In [None]:
data[data['server_name']=='ELPUB (Universitat Wuppertal)']

In [None]:
data[data['server_name']=='ELPUB (Universitat Wuppertal)']['landing_page_url'][423028]

### EmeRI

In [None]:
data[data['server_name']=='EmeRI']

### Encyclopedia

In [None]:
data[data['server_name']=='Encyclopedia']

### EnerarXiv

In [None]:
data[data['server_name']=='EnerarXiv']

### engrXiv

In [None]:
data[data['server_name']=='engrXiv']

### F1000Research

In [None]:
data[data['server_name']=='F1000Research']

### FocUS Archive

In [None]:
data[data['server_name']=='FocUS Archive']

### Frenxiv

In [None]:
data[data['server_name']=='Frenxiv']

### Gates Open Research

In [None]:
data[data['server_name']=='engrXiv']

### HANS Publication PrePrints

In [None]:
data[data['server_name']=='HANS Publication PrePrints']

In [None]:
data[data['server_name']=='HANS Publication PrePrints']['is_version_of'].unique()

### HRB Open Research

In [None]:
data[data['server_name']=='HRB Open Research']

### Humanities Commons CORE

In [None]:
data[data['server_name']=='Humanities Commons CORE']

In [None]:
data[data['server_name']=='Humanities Commons CORE']['landing_page_url']

In [None]:
data[data['server_name']=='Humanities Commons CORE']['landing_page_url'][1940981]

### IACR Cryptology ePrint Archive

In [None]:
data[data['server_name']=='IACR Cryptology ePrint Archive']

### INA-Rxiv

In [None]:
data[data['server_name']=='INA-Rxiv']

### IndiaRxiv

In [None]:
data[data['server_name']=='IndiaRxiv']

In [None]:
data[data['server_name']=='IndiaRxiv']['is_version_of'].unique()

In [None]:
pattern = "v1"

mask = data[data['server_name']=='IndiaRxiv']['landing_page_url'].str.contains(pattern, regex=False, na=False)
result = data[data['server_name']=='IndiaRxiv'][mask]
result

### JMIR Preprints

In [None]:
data[data['server_name']=='JMIR Preprints']

### Keldysh Institute Preprints

In [None]:
data[data['server_name']=='Keldysh Institute Preprints']

### LatArXiv

In [None]:
data[data['server_name']=='LatArXiv']

In [None]:
pattern = "v1"

mask = data[data['server_name']=='LatArXiv']['landing_page_url'].str.contains(pattern, regex=False, na=False)
result = data[data['server_name']=='LatArXiv'][mask]
result

### Law Archive

In [None]:
data[data['server_name']=='Law Archive']

In [None]:
pattern = "v1"

mask = data[data['server_name']=='Law Archive']['landing_page_url'].str.contains(pattern, regex=False, na=False)
result = data[data['server_name']=='Law Archive'][mask]
result

### LIS Scholarship Archive

In [None]:
data[data['server_name']=='LIS Scholarship Archive']

In [None]:
pattern = "v1"

mask = data[data['server_name']=='LIS Scholarship Archive']['landing_page_url'].str.contains(pattern, regex=False, na=False)
result = data[data['server_name']=='LIS Scholarship Archive'][mask]
result

### LSE Research Online Documents on Economics

In [None]:
data[data['server_name']=='LSE Research Online Documents on Economics']

In [None]:
pattern = "v1"

mask = data[data['server_name']=='LSE Research Online Documents on Economics']['landing_page_url'].str.contains(pattern, regex=False, na=False)
result = data[data['server_name']=='LSE Research Online Documents on Economics'][mask]
result

### MarXiv

In [None]:
data[data['server_name']=='MarXiv']

In [None]:
pattern = "v1"

mask = data[data['server_name']=='MarXiv']['landing_page_url'].str.contains(pattern, regex=False, na=False)
result = data[data['server_name']=='MarXiv'][mask]
result

### MediArXiv

In [None]:
data[data['server_name']=='MediArXiv']

### medRxiv

In [None]:
data[data['server_name']=='medRxiv']

### MetaArXiv

In [None]:
data[data['server_name']=='MetaArXiv']

### MindRxiv

In [None]:
data[data['server_name']=='MindRxiv']

### MNI Open Research

In [None]:
data[data['server_name']=='MNI Open Research']

### Munich Personal RePEc Archive

In [None]:
data[data['server_name']=='Munich Personal RePEc Archive']

### National Bureau of Economic Research

In [None]:
data[data['server_name']=='National Bureau of Economic Research']

### Nature Precedings

In [None]:
data[data['server_name']=='Nature Precedings']

### NewAddictionsX

In [None]:
data[data['server_name']=='NewAddictionsX']

### NutriXiv

In [None]:
data[data['server_name']=='NutriXiv']

### Open Research Africa

In [None]:
data[data['server_name']=='Open Research Africa']

### Open Research Europe

In [None]:
data[data['server_name']=='Open Research Europe']

In [None]:
data[data['server_name']=='Open Research Europe']['landing_page_url'][2122034]

### Organic Eprints

In [None]:
data[data['server_name']=='Organic Eprints']

### Oroboros Instruments

In [None]:
data[data['server_name']=='Oroboros Instruments']

In [None]:
data[data['server_name']=='Oroboros Instruments']['landing_page_url'][2273436]

In [None]:
pattern = "10.26124/mitofit:190001"

mask = data['doi'].str.contains(pattern, regex=False, na=False)
result = data[mask]
result

### PaleorXiv

In [None]:
data[data['server_name']=='PaleorXiv']

### PeerJ Preprints

In [None]:
data[data['server_name']=='PeerJ Preprints']

In [None]:
pattern = "10.7287/peerj.preprints.28002"

mask = data['doi'].str.contains(pattern, regex=False, na=False)
result = data[mask]
result

In [None]:
data[data['server_name']=='PeerJ Preprints']['relations_json'][2281677]

### PhilSci-Archive

In [None]:
data[data['server_name']=='PhilSci-Archive']

### PMSE Preprints

In [None]:
data[data['server_name']=='PMSE Preprints']

### PoolText

In [None]:
data[data['server_name']=='PoolText']

In [None]:
data[data['server_name']=='PoolText']['landing_page_url'][2284121]

### prepare@u

In [None]:
data[data['server_name']=='prepare@u']

In [None]:
data[data['server_name']=='prepare@u']['landing_page_url'][9477269]

### Preprints.org

In [None]:
data[data['server_name']=='Preprints.org']

In [None]:
pattern = "10.20944/preprints202510.0903"

mask = data['doi'].str.contains(pattern, regex=False, na=False)
result = data[mask]
result

### PREPRINTS.RU

In [None]:
data[data['server_name']=='PREPRINTS.RU']

### Prepublicaciones OpenCiencia

In [None]:
data[data['server_name']=='Prepublicaciones OpenCiencia']

### PropylaeumDok

In [None]:
data[data['server_name']=='PropylaeumDok']

### PsyArXiv

In [None]:
data[data['server_name']=='PsyArXiv']

### Qeios

In [None]:
data[data['server_name']=='Qeios']

### RePEc: Research Papers in Economics

In [None]:
data[data['server_name']=='RePEc: Research Papers in Economics']

### ResearchHub

In [None]:
data[data['server_name']=='ResearchHub']

In [None]:
data[data['server_name']=='ResearchHub']['landing_page_url'][3520891]

### SAE Mobilus®

In [None]:
data[data['server_name']=='SAE Mobilus®']

In [None]:
data[data['server_name']=='SAE Mobilus®']['landing_page_url'][4391971]

### SciELO Preprints

In [None]:
data[data['server_name']=='SciELO Preprints']

In [None]:
data[data['server_name']=='SciELO Preprints']['landing_page_url'][5655071]

### ScienceOpen Preprints

In [None]:
data[data['server_name']=='ScienceOpen Preprints']

In [None]:
data[data['server_name']=='ScienceOpen Preprints']['landing_page_url'][5660159]

In [None]:
pattern = "10.14293/s2199-1006.1.sor-compsci.abltb1"

mask = data['doi'].str.contains(pattern, regex=False, na=False)
result = data[mask]
result

In [None]:
data[data['server_name']=='ScienceOpen Preprints']['doi'][5660161]

In [None]:
data[data['server_name']=='ScienceOpen Preprints']['landing_page_url'][5660161]

### Sciencepaper Online

In [None]:
data[data['server_name']=='Sciencepaper Online']

In [None]:
data[data['server_name']=='Sciencepaper Online']['landing_page_url'][5660256]

### searchRxiv

In [None]:
data[data['server_name']=='searchRxiv']

In [None]:
data[data['server_name']=='searchRxiv']['landing_page_url'][9477274]

### SocArXiv

In [None]:
data[data['server_name']=='SocArXiv']

### Social Science Open Access Repository

In [None]:
data[data['server_name']=='Social Science Open Access Repository']

### SportRxiv

In [None]:
data[data['server_name']=='SportRxiv']

In [None]:
data[data['server_name']=='SportRxiv']['landing_page_url'][5709876]

### TechRxiv

In [None]:
data[data['server_name']=='TechRxiv']

In [None]:
data[data['server_name']=='TechRxiv']['landing_page_url'][5739296]

### Therapoid

In [None]:
data[data['server_name']=='Therapoid']

In [None]:
data[data['server_name']=='Therapoid']['landing_page_url'][5739303]

### Thesis Commons

In [None]:
data[data['server_name']=='Thesis Commons']

### UCL Open Environment

In [None]:
data[data['server_name']=='UCL Open Environment']

In [None]:
data[data['server_name']=='UCL Open Environment']['landing_page_url'][5746734]

### UnisaRxiv

In [None]:
data[data['server_name']=='UnisaRxiv']

In [None]:
data[data['server_name']=='UnisaRxiv']['landing_page_url'][5746860]

### VeriXiv

In [None]:
data[data['server_name']=='VeriXiv']

### viXra

In [None]:
data[data['server_name']=='viXra']

In [None]:
data[data['server_name']=='ResearchHub']['landing_page_url'][3520891]

In [None]:
pattern = "https://vixra.org/pdf/1412.0197"

mask = data['landing_page_url'].str.contains(pattern, regex=False, na=False)
result = data[mask]
result

### Wellcome Open Research

In [None]:
data[data['server_name']=='Wellcome Open Research']

In [None]:
data[data['server_name']=='Wellcome Open Research']['landing_page_url'][5752092]

### WikiJournal of Humanities

In [None]:
data[data['server_name']=='WikiJournal of Humanities']

In [None]:
data[data['server_name']=='WikiJournal of Humanities']['landing_page_url'][5752103]

### WikiJournal of Medicine

In [None]:
data[data['server_name']=='WikiJournal of Medicine']

In [None]:
data[data['server_name']=='WikiJournal of Medicine']['landing_page_url'][5752165]

### WikiJournal of Science

In [None]:
data[data['server_name']=='WikiJournal of Science']

In [None]:
data[data['server_name']=='WikiJournal of Science']['landing_page_url'][5752210]

### Zenodo

In [None]:
data[data['server_name']=='Zenodo']

In [None]:
cccc

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

G = nx.Graph()

# Center
G.add_node("Informatik im Alltag", level=0)

# Main domains
domains = {
    "Theoretische Informatik": "orange",
    "Praktische Informatik": "green",
    "Technische Informatik": "red",
    "Angewandte Informatik": "blue",
    "Didaktik & Gesellschaft": "purple"
}

for d in domains:
    G.add_edge("Informatik im Alltag", d)

# Subtopics
G.add_edge("Theoretische Informatik", "Algorithmen")
G.add_edge("Theoretische Informatik", "Automatentheorie")
G.add_edge("Praktische Informatik", "Programmiersprachen")
G.add_edge("Praktische Informatik", "Softwaretechnik")

# Layout
pos = nx.nx_agraph.graphviz_layout(G, prog="twopi")

# Draw
nx.draw(
    G,
    pos,
    with_labels=True,
    node_size=2000,
    font_size=8
)

plt.show()


In [None]:
pip install pygraphviz

## 2) Choose the server you want to explore

In [None]:
SERVER = "arXiv"   # <-- change me (exact spelling must match server_name values)
BACKEND = None     # set to "crossref" or "datacite" or "openalex" if you want only one backend

## 3) Create a thin server view (casts to VARCHAR to avoid schema drift)

This avoids the COALESCE VARCHAR/BOOLEAN error and drops heavy JSON columns.

In [None]:
def make_server_view(server_name: str, backend: str | None = None):
    backend_filter = ""
    if backend:
        backend_filter = f"AND backend = '{backend}'"

    con.execute(f"""
    CREATE OR REPLACE VIEW server_thin AS
    SELECT
      CAST(record_id AS VARCHAR)           AS record_id,
      CAST(server_name AS VARCHAR)         AS server_name,
      CAST(backend AS VARCHAR)             AS backend,

      CAST(doi AS VARCHAR)                 AS doi,
      CAST(doi_url AS VARCHAR)             AS doi_url,
      CAST(landing_page_url AS VARCHAR)    AS landing_page_url,

      CAST(version_label AS VARCHAR)       AS version_label,

      -- Relationships (keep these for true version links)
      CAST(relations_json AS VARCHAR)       AS relations_json,
      CAST(raw_relationships_json AS VARCHAR)       AS raw_relationships_json,
      CAST(is_version_of AS VARCHAR)       AS is_version_of,      -- keep as text; we’ll interpret later
      CAST(version_of_ids_json AS VARCHAR) AS version_of_ids_json,
      CAST(is_preprint_of AS VARCHAR)      AS is_preprint_of,
      CAST(published_version_ids_json AS VARCHAR) AS published_version_ids_json,

      -- Dates (helpful for temporal patterns)
      CAST(date_posted AS VARCHAR)         AS date_posted,
      CAST(date_published AS VARCHAR)      AS date_published,
      CAST(date_published_online AS VARCHAR)      AS date_published_online,
      CAST(date_issued AS VARCHAR)         AS date_issued,
      CAST(date_deposited AS VARCHAR)      AS date_deposited,
      CAST(date_indexed AS VARCHAR)        AS date_indexed,
      CAST(date_created AS VARCHAR)        AS date_created,
      CAST(date_registered AS VARCHAR)     AS date_registered,
      CAST(date_updated AS VARCHAR)        AS date_updated,
      CAST(publication_year AS VARCHAR)    AS publication_year
    FROM all_rows
    WHERE server_name = '{server_name}'
    {backend_filter};
    """)

make_server_view(SERVER, BACKEND)
con.execute("SELECT COUNT(*) AS n FROM server_thin").df()


In [None]:
data = con.execute("SELECT * FROM server_thin").df()
data.head()

In [None]:
data.head(60)

In [None]:
data.count()

In [None]:
data[['doi','relations_json']]

In [None]:
data['relations_json'][1044404]

#### 

In [None]:
vvv

# B) Explore versioning signals for ONLY that server

## 1) Build a clean version detection table for that server

This outputs: record_id -> version_signal_reasons (pipe-separated reasons).

In [None]:
con.execute(r"""
CREATE OR REPLACE TABLE server_version_signals AS
WITH base AS (
  SELECT
    record_id,
    server_name,
    backend,

    lower(coalesce(doi, '')) AS doi_l,
    lower(coalesce(landing_page_url, '')) AS landing_l,
    lower(coalesce(version_label, '')) AS vlabel_l,

    -- interpret is_version_of safely from text
    CASE
      WHEN lower(coalesce(is_version_of,'')) IN ('true','t','1','yes') THEN TRUE
      ELSE FALSE
    END AS is_version_of_bool
  FROM server_thin
),
flags AS (
  SELECT
    *,
    regexp_matches(doi_l, '[-._]v[0-9]{1,3}([^0-9]|$)') AS doi_sep_vN,
    regexp_matches(doi_l, '[-._][0-9]{1,3}([^0-9]|$)')  AS doi_sep_N,

    regexp_matches(landing_l, '([?&](version|ver|v)=([0-9]{1,3}))') AS landing_param_version,
    regexp_matches(landing_l, '(/v[0-9]{1,3})(/|$)')                AS landing_path_vN,

    (vlabel_l <> '' AND vlabel_l <> 'null') AS version_label_present,

    (is_version_of_bool) AS metadata_is_version_of
  FROM base
),
reasons AS (
  SELECT
    record_id,
    server_name,
    backend,

    -- Build a pipe-separated reason string
    trim(both '|' from
      (CASE WHEN doi_sep_vN THEN '|doi_sep_vN' ELSE '' END) ||
      (CASE WHEN doi_sep_N THEN '|doi_sep_N' ELSE '' END) ||
      (CASE WHEN landing_param_version THEN '|landing_param_version' ELSE '' END) ||
      (CASE WHEN landing_path_vN THEN '|landing_path_vN' ELSE '' END) ||
      (CASE WHEN version_label_present THEN '|metadata_version_label' ELSE '' END) ||
      (CASE WHEN metadata_is_version_of THEN '|metadata_is_version_of' ELSE '' END)
    ) AS version_signal_reasons
  FROM flags
)
SELECT
  record_id, server_name, backend,
  CASE
    WHEN version_signal_reasons = '' THEN 'no_signal'
    ELSE version_signal_reasons
  END AS version_signal_reasons
FROM reasons;
""")

con.execute("""
SELECT version_signal_reasons, COUNT(*) AS n
FROM server_version_signals
GROUP BY 1
ORDER BY n DESC;
""").df()


## 2) Inspect records with any signal (sample)

In [None]:
con.execute("""
SELECT *
FROM server_version_signals
WHERE version_signal_reasons <> 'no_signal'
LIMIT 200;
""").df()


# C) Export only THIS server (safe, chunked)

If you want to move one server to Colab or work offline.

In [None]:
import math
from pathlib import Path

OUT = Path(f"data/all_backends/server_exports/{SERVER}")
OUT.mkdir(parents=True, exist_ok=True)

ROWS_PER_PART = 200_000

total = con.execute("SELECT COUNT(*) FROM server_thin").fetchone()[0]
n_parts = math.ceil(total / ROWS_PER_PART)
print("rows:", total, "parts:", n_parts)

for i in range(n_parts):
    offset = i * ROWS_PER_PART
    out_path = OUT / f"server_thin_part_{i:06d}.parquet"
    con.execute(f"""
        COPY (
            SELECT * FROM server_thin
            LIMIT {ROWS_PER_PART} OFFSET {offset}
        )
        TO '{out_path.as_posix()}'
        (FORMAT PARQUET);
    """)
print("done ->", OUT)
