<a href="https://colab.research.google.com/github/descartesmbogning/crossref_preprint_labeling/blob/main/2_crossref_preprint_labeling__part2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# import merged files

In [1]:
# Mount Google Drive at /content/drive (requires the google.colab runtime);
# later cells read the input parquet from a path under this mount point.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
import pandas as pd
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

## Load the filtered data


In [3]:
import os
import glob

# Directory on the mounted Drive that contains the parquet exports
parquet_dir = '/content/drive/MyDrive/ScholCommLab/DATA/CROSSREF/server_labeling_approach/'

# Collect every export whose file name starts with 'crossref_preprints_normalize'.
# os.path.join keeps the pattern valid whether or not parquet_dir ends with a slash.
list_of_files = glob.glob(os.path.join(parquet_dir, 'crossref_preprints_normalize*.parquet'))
if not list_of_files:
    raise FileNotFoundError(f"No parquet files containing 'crossref_preprints_normalize' found in {parquet_dir}")

# Pick the most recently written export. Modification time is used instead of
# getctime because on Unix ctime is the time of the last metadata change
# (rename, permission change, ...), not the creation time, so it does not
# reliably identify the newest data file.
latest_file = max(list_of_files, key=os.path.getmtime)

# Path consumed by the loading cell below
PARQUET = latest_file
print(f"Updated PARQUET variable to: {PARQUET}")

Updated PARQUET variable to: /content/drive/MyDrive/ScholCommLab/DATA/CROSSREF/server_labeling_approach/crossref_preprints_normalize_2025-10-31.parquet


In [4]:
# Load the selected parquet export and preview its first two rows.
df_all = pd.read_parquet(PARQUET)
display(df_all.head(2))

Unnamed: 0,doi,posted_date,url,primary_url,type,subtype,prefix,publisher,content_domain_json,container_title,short_container_title,institution_name,is_preprint_of,has_preprint,is_version_of,group_title,member,gold_server_name,doi_lc,prefix_lc,doi_prefix_from_text,doi_suffix,doi_prefix_first_token,doi_prefix_bucket_2d,primary_domain,primary_domain_extend,year
0,10.18785/fa.m017,2000-01-01,https://doi.org/10.18785/fa.m017,https://specialcollections.usm.edu/repositorie...,posted-content,preprint,10.18785,University of Southern Mississippi,"{""crossmark-restriction"": false, ""domain"": []}",,,,,,,Hardy (Robert B.) Papers,7980,Hardy (Robert B.) Papers,10.18785/fa.m017,10.18785,10.18785,fa.m017,10.18785/fa,10.18785/fa,specialcollections.usm.edu,specialcollections.usm.edu/repositories,2000
1,10.18785/fa.m083,2000-01-01,https://doi.org/10.18785/fa.m083,https://specialcollections.usm.edu/repositorie...,posted-content,preprint,10.18785,University of Southern Mississippi,"{""crossmark-restriction"": false, ""domain"": []}",,,,,,,"United States....Land Patent-Mississippi, 1841.",7980,"United States....Land Patent-Mississippi, 1841.",10.18785/fa.m083,10.18785,10.18785,fa.m083,10.18785/fa,10.18785/fa,specialcollections.usm.edu,specialcollections.usm.edu/repositories,2000


In [5]:
# Drop rows that are exact duplicates of an earlier row (all columns compared);
# df_all itself is left untouched.
df = df_all.drop_duplicates()

# Report the row counts before and after so removed duplicates are visible
print(f"Number of rows before dropping duplicates: {len(df_all)}")
print(f"Number of rows after dropping duplicates: {len(df)}")

Number of rows before dropping duplicates: 1946456
Number of rows after dropping duplicates: 1946456


In [6]:
# Make sure posted_date is datetime; unparseable values become NaT
df["posted_date"] = pd.to_datetime(df["posted_date"], errors="coerce")

# Get the most recent date. max() skips NaT, so the result is NaT only when
# no value could be parsed at all - fail with a clear message in that case
# instead of letting NaT.strftime raise an opaque error.
latest_ts = df["posted_date"].max()
if pd.isna(latest_ts):
    raise ValueError("posted_date contains no parseable dates; cannot determine the most recent posted_date")

# Keep latest_date as a 'YYYY-MM-DD' string, as before
latest_date = latest_ts.strftime('%Y-%m-%d')
print("Most recent posted_date:", latest_date)

Most recent posted_date: 2025-10-31


# match the server name

## Import the server-name mapping sheets

In [7]:
import pandas as pd

# CSV exports of the two lookup sheets:
#   prefix_server : DOI prefix + first token  -> server name
#   domain_server : primary landing domain    -> server name
prefix_server = 'https://docs.google.com/spreadsheets/d/10_7FdcpZjntqFsEHIii7bAM72uF__of_iUohSD5w8w4/export?format=csv&gid=174743897'
domain_server = 'https://docs.google.com/spreadsheets/d/10_7FdcpZjntqFsEHIii7bAM72uF__of_iUohSD5w8w4/export?format=csv&gid=143048761'

# Keep only rows that actually carry a merge key. A row with a server name but
# a blank key would survive dropna(how='all'); because pandas.merge matches
# NaN keys to each other, such a row would attach its name to every record
# whose own key is missing.
prefix_server_name = (
    pd.read_csv(prefix_server)[['prefix_server_name', 'Field_doi_prefix_first_token']]
    .dropna(subset=['Field_doi_prefix_first_token'])
    .drop_duplicates()
)
domain_server_name = (
    pd.read_csv(domain_server)[['domain_server_name', 'Field_primary_domain_ok']]
    .dropna(subset=['Field_primary_domain_ok'])
    .drop_duplicates()
)

# Size and preview of each lookup table
print(len(domain_server_name))
display(domain_server_name.head())
print('--------------------------------------------')
print(len(prefix_server_name))
display(prefix_server_name.head())

297


Unnamed: 0,domain_server_name,Field_primary_domain_ok
0,EmeRI,preprints.ibict.br
1,American Mathematical Society,ams.org
2,OSF communities,osf.io
3,SSRN,ssrn.com
4,Research Square,researchsquare.com


--------------------------------------------
6193


Unnamed: 0,prefix_server_name,Field_doi_prefix_first_token
0,SSRN,10.2139/ssrn
1,Research Square,10.21203/rs
2,bioRxiv/medRxiv,10.1101/20
3,Preprints.org,10.20944/preprints
4,OSF preprints,10.31219/osf


## 1 - Match manually defined domain and prefix server names

In [8]:
# Step 1: attach domain_server_name by matching the record's primary_domain
df_merged = df.merge(
    domain_server_name,
    left_on="primary_domain",             # column in df
    right_on="Field_primary_domain_ok",   # key column in domain_server_name
    how="left"                            # keep all rows in df
)

# Step 2: attach prefix_server_name by matching the record's DOI prefix token
df_merged = df_merged.merge(
    prefix_server_name,
    left_on="doi_prefix_first_token",          # column in df
    right_on="Field_doi_prefix_first_token",   # key column in prefix_server_name
    how="left"                                 # keep all rows in df
)

# A left join can only add rows, never remove them. If the count grew, some
# lookup key appears more than once in a mapping table and records have been
# silently duplicated - stop here rather than label inflated data.
assert len(df_merged) == len(df), (
    f"Lookup merge changed the row count from {len(df)} to {len(df_merged)}; "
    "check the mapping tables for duplicated keys"
)

# Step 3: drop the lookup key columns, keeping only the two server-name columns
df_domain = df_merged.drop(columns=["Field_primary_domain_ok","Field_doi_prefix_first_token"])

df_domain


Unnamed: 0,doi,posted_date,url,primary_url,type,subtype,prefix,publisher,content_domain_json,container_title,short_container_title,institution_name,is_preprint_of,has_preprint,is_version_of,group_title,member,gold_server_name,doi_lc,prefix_lc,doi_prefix_from_text,doi_suffix,doi_prefix_first_token,doi_prefix_bucket_2d,primary_domain,primary_domain_extend,year,domain_server_name,prefix_server_name
0,10.18785/fa.m017,2000-01-01,https://doi.org/10.18785/fa.m017,https://specialcollections.usm.edu/repositorie...,posted-content,preprint,10.18785,University of Southern Mississippi,"{""crossmark-restriction"": false, ""domain"": []}",,,,,,,Hardy (Robert B.) Papers,7980,Hardy (Robert B.) Papers,10.18785/fa.m017,10.18785,10.18785,fa.m017,10.18785/fa,10.18785/fa,specialcollections.usm.edu,specialcollections.usm.edu/repositories,2000,University of Southern Mississippi Libraries,University of Southern Mississippi Libraries
1,10.18785/fa.m083,2000-01-01,https://doi.org/10.18785/fa.m083,https://specialcollections.usm.edu/repositorie...,posted-content,preprint,10.18785,University of Southern Mississippi,"{""crossmark-restriction"": false, ""domain"": []}",,,,,,,"United States....Land Patent-Mississippi, 1841.",7980,"United States....Land Patent-Mississippi, 1841.",10.18785/fa.m083,10.18785,10.18785,fa.m083,10.18785/fa,10.18785/fa,specialcollections.usm.edu,specialcollections.usm.edu/repositories,2000,University of Southern Mississippi Libraries,University of Southern Mississippi Libraries
2,10.18785/fa.m181,2000-01-01,https://doi.org/10.18785/fa.m181,https://specialcollections.usm.edu/repositorie...,posted-content,preprint,10.18785,University of Southern Mississippi,"{""crossmark-restriction"": false, ""domain"": []}",,,,,,,"Mississippi (State of) vs. W. M. McDonald, Jr.",7980,"Mississippi (State of) vs. W. M. McDonald, Jr.",10.18785/fa.m181,10.18785,10.18785,fa.m181,10.18785/fa,10.18785/fa,specialcollections.usm.edu,specialcollections.usm.edu/repositories,2000,University of Southern Mississippi Libraries,University of Southern Mississippi Libraries
3,10.18785/fa.m186,2000-01-01,https://doi.org/10.18785/fa.m186,https://specialcollections.usm.edu/repositorie...,posted-content,preprint,10.18785,University of Southern Mississippi,"{""crossmark-restriction"": false, ""domain"": []}",,,,,,,Nydegger Family Papers,7980,Nydegger Family Papers,10.18785/fa.m186,10.18785,10.18785,fa.m186,10.18785/fa,10.18785/fa,specialcollections.usm.edu,specialcollections.usm.edu/repositories,2000,University of Southern Mississippi Libraries,University of Southern Mississippi Libraries
4,10.18785/fa.m199,2000-01-01,https://doi.org/10.18785/fa.m199,https://specialcollections.usm.edu/repositorie...,posted-content,preprint,10.18785,University of Southern Mississippi,"{""crossmark-restriction"": false, ""domain"": []}",,,,,,,Stevens (Rose Budd) Papers,7980,Stevens (Rose Budd) Papers,10.18785/fa.m199,10.18785,10.18785,fa.m199,10.18785/fa,10.18785/fa,specialcollections.usm.edu,specialcollections.usm.edu/repositories,2000,University of Southern Mississippi Libraries,University of Southern Mississippi Libraries
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1946451,10.22541/essoar.175443228.85241644/v2,2025-10-31,https://doi.org/10.22541/essoar.175443228.8524...,https://essopenarchive.org/users/834213/articl...,posted-content,preprint,10.22541,Wiley,"{""crossmark-restriction"": false, ""domain"": []}",,,ESS Open Archive,,,10.22541/essoar.175443228.85241644/v1,Preprints,311,ESS Open Archive,10.22541/essoar.175443228.85241644/v2,10.22541,10.22541,essoar.175443228.85241644/v2,10.22541/essoar,10.22541/essoar,essopenarchive.org,essopenarchive.org/users,2025,ESS Open Archive,ESS Open Archive
1946452,10.22541/essoar.174856814.44534698/v2,2025-10-31,https://doi.org/10.22541/essoar.174856814.4453...,https://essopenarchive.org/users/929040/articl...,posted-content,preprint,10.22541,Wiley,"{""crossmark-restriction"": false, ""domain"": []}",,,ESS Open Archive,,,10.22541/essoar.174856814.44534698/v1,Preprints,311,ESS Open Archive,10.22541/essoar.174856814.44534698/v2,10.22541,10.22541,essoar.174856814.44534698/v2,10.22541/essoar,10.22541/essoar,essopenarchive.org,essopenarchive.org/users,2025,ESS Open Archive,ESS Open Archive
1946453,10.31219/osf.io/2t6p8_v3,2025-10-31,https://doi.org/10.31219/osf.io/2t6p8_v3,https://osf.io/2t6p8_v3,posted-content,preprint,10.31219,Center for Open Science,"{""crossmark-restriction"": false, ""domain"": []}",,,,10.1016/j.physbeh.2025.115144,,10.31219/osf.io/2t6p8; 10.31219/osf.io/2t6p8_v2,Open Science Framework,15934,Open Science Framework,10.31219/osf.io/2t6p8_v3,10.31219,10.31219,osf.io/2t6p8_v3,10.31219/osf,10.31219/osf,osf.io,osf.io/2t6p8_v3,2025,OSF communities,OSF preprints
1946454,10.1101/2025.10.30.685584,2025-10-31,https://doi.org/10.1101/2025.10.30.685584,http://biorxiv.org/lookup/doi/10.1101/2025.10....,posted-content,preprint,10.1101,Cold Spring Harbor Laboratory,"{""crossmark-restriction"": false, ""domain"": []}",,,bioRxiv,,,,Cell Biology,246,bioRxiv,10.1101/2025.10.30.685584,10.1101,10.1101,2025.10.30.685584,10.1101/20,10.1101/20,biorxiv.org,biorxiv.org/lookup,2025,bioRxiv,bioRxiv/medRxiv


## 2- Validation steps

In [9]:
import pandas as pd
import unicodedata
import re

# ============================================================
# 0. CONFIGURATION CONSTANTS
#    (easier to document in methods + reports)
# ============================================================

# Domains where group_title is the best server label
OSF_AMS_DOMAINS = {"osf.io", "ams.org"}

# Domain where institution_name is the best server label
IBICT_DOMAIN = "preprints.ibict.br"

# Domains where prefix_server_name is the best label
PREFIX_OVERRIDE_DOMAINS = {
    "vimeo.com",
    "experience.arcgis.com",
    "researchcatalogue.net",
    "cambridge.org",
    "scholarcommons.usf.edu",
}

# Domains where domain_server_name is a reliable label
DOMAIN_OVERRIDE_DOMAINS = {
    "biorxiv.org",
    "engrxiv.org",
    "eartharxiv.org",
    "saemobilus.sae.org",
    "21docs.com",
    "ecoevorxiv.org",
    "datacite.org",
    "protocols.io",
    "jsr.org",
    "crossref.org",
    "ihp-wins-dev.geo-solutions.it",
    "techrxiv.org",
}

# DOI prefix+first-token patterns where prefix_server_name wins
DOI_PREFIX_OVERRIDE_TOKENS = {
    "10.25159/unisarxiv",
    "10.54120/jost",
    "10.22541/21docs",
    "10.5194/hess-",
    "10.5194/amt-",
    "10.14293/11",
    "10.14293/newpsychology",
    "10.35948/crusca",
    "10.47952/gro-publ-",
    "10.15763/11",
    "10.22541/techrxiv",
    "10.1590/scielopreprintstest",
    "10.5555/dspace",
}

PUBPUB_SUFFIX = ".pubpub.org"


# ============================================================
# 1. Normalization helper
# ============================================================

def normalize_name(s):
    """
    Normalize a server / text name for comparison.

    The value is converted to str, stripped, decomposed with NFKD, reduced to
    ASCII (accents and any other non-ASCII characters are dropped), lowercased,
    and every punctuation character - including the underscore - is replaced by
    a space before runs of whitespace are collapsed.

    Parameters
    ----------
    s : any scalar
        Raw name. Missing values (None, NaN, pd.NA) are accepted.

    Returns
    -------
    str or None
        The normalized name, or None when the input is missing or nothing
        remains after normalization.
    """
    if pd.isna(s):
        return None
    s = str(s).strip()
    if not s:
        return None

    # Unicode normalize, remove accents
    s = unicodedata.normalize("NFKD", s)
    s = s.encode("ascii", "ignore").decode("ascii")

    # Lowercase
    s = s.lower()

    # Remove punctuation, keep letters, digits and spaces.
    # "_" is listed explicitly because \w matches it, which would otherwise
    # leave names such as "OSF_Preprints" different from "OSF Preprints".
    s = re.sub(r"[^\w\s]|_", " ", s)

    # Collapse spaces
    s = re.sub(r"\s+", " ", s).strip()

    return s or None


# ============================================================
# 2. Row-wise match helpers
# ============================================================

def row_match_domain(row):
    """
    Report whether the row's normalized domain server name occurs as a
    substring of its normalized group title, institution name or publisher.

    Returns False when norm_domain is not a non-empty string; text fields
    that are not strings are ignored.
    """
    domain_name = row["norm_domain"]
    if not (isinstance(domain_name, str) and domain_name):
        return False

    return any(
        isinstance(row[field], str) and domain_name in row[field]
        for field in ("norm_group", "norm_inst", "norm_pub")
    )


def row_match_prefix(row):
    """
    Report whether the row's normalized prefix server name occurs as a
    substring of its normalized group title, institution name or publisher.

    Returns False when norm_prefix is not a non-empty string; text fields
    that are not strings are ignored.
    """
    prefix_name = row["norm_prefix"]
    if isinstance(prefix_name, str) and prefix_name:
        text_values = (row[field] for field in ("norm_group", "norm_inst", "norm_pub"))
        return any(
            isinstance(text, str) and prefix_name in text
            for text in text_values
        )
    return False


# ============================================================
# 3. Main pipeline function
# ============================================================

def validate_server_names(df: pd.DataFrame, inplace: bool = False):
    """
    Apply a 5-rule hybrid pipeline (generic + server-specific) to validate server names.

    GENERIC RULES (R1-R4)
    ---------------------
    R1 (MATCH_STRONG):
        - domain_server_name == prefix_server_name (after normalization)
        - AND text support in group_title / institution_name / publisher
        => validated_server_name = domain_server_name
        => confidence_score = 1.0

    R2 (MATCH_WEAK):
        - domain_server_name == prefix_server_name
        - AND no explicit text support
        => validated_server_name = domain_server_name
        => confidence_score = 0.8

    R3 (MATCH_DOMAIN / MATCH_PREFIX):
        - domain_server_name != prefix_server_name
        - exactly one of them has text support
        => choose the supported one
        => confidence_score = 0.9

    R4 (LOW_CONFIDENCE_MANUAL):
        - domain_server_name != prefix_server_name
        - neither has text support
        => flag for manual validation (validated_server_name stays NA)
        => confidence_score = 0.3

    SERVER-SPECIFIC RULES (R5a-R5f)
    -------------------------------
    R5a and R5b are applied to every matching row and overwrite whatever
    R1-R4 assigned. R5c-R5f only touch rows whose validated_server_name is
    still NA (this includes rows flagged by R4).

    R5a (MATCH_RULE5_GROUP_TITLE), confidence 0.98:
        - primary_domain in OSF_AMS_DOMAINS
        - group_title present
        => validated_server_name = group_title

    R5b (MATCH_RULE5_INSTITUTION), confidence 0.98:
        - primary_domain == IBICT_DOMAIN
        - institution_name present
        => validated_server_name = institution_name

    R5c (MATCH_RULE5_PUBPUB), confidence 0.98:
        - validated_server_name still NaN
        - primary_domain endswith '.pubpub.org'
        => if group_title present:  use group_title
           else:                    derive from subdomain
           (the label is title-cased and suffixed with ' (PubPub)')

    R5d (MATCH_RULE5_PREFIX_OVERRIDE), confidence 0.97:
        - validated_server_name still NaN
        - primary_domain in PREFIX_OVERRIDE_DOMAINS
        - prefix_server_name present
        => validated_server_name = prefix_server_name

    R5e (MATCH_RULE5_DOMAIN_OVERRIDE), confidence 0.97:
        - validated_server_name still NaN
        - primary_domain in DOMAIN_OVERRIDE_DOMAINS
        - domain_server_name present
        => validated_server_name = domain_server_name

    R5f (MATCH_RULE5_DOI_PREFIX_OVERRIDE), confidence 0.97:
        - validated_server_name still NaN
        - doi_prefix_first_token in DOI_PREFIX_OVERRIDE_TOKENS
        - prefix_server_name present
        => validated_server_name = prefix_server_name

    Rows matched by no rule (for example when only one of the two candidate
    names is present, or when the names differ and both have text support)
    keep missing values in all output columns.

    PARAMETERS
    ----------
    df : DataFrame
        Records to label. Any of the expected input columns that is missing
        is created and filled with NA.
    inplace : bool, default False
        When False the input is copied first; when True the columns are
        added to the passed frame itself.

    OUTPUT
    ------
    Returns:
        df_out : DataFrame
            Original df with:
                - norm_domain, norm_prefix, norm_group, norm_inst, norm_pub
                - match_dom_text, match_pref_text (bool)
                - validated_server_name, validation_status
                - confidence_score (float, NaN where no rule applied)
                - rule_id (explicit rule label, e.g. 'R1_MATCH_STRONG')
        manual_df : DataFrame
            Subset where validation_status == 'LOW_CONFIDENCE_MANUAL'.
    """
    if not inplace:
        df = df.copy()

    # --------------------------------------------------------
    # 3.1 Ensure expected columns exist (avoid KeyErrors)
    # --------------------------------------------------------
    required_cols = [
        "domain_server_name",
        "prefix_server_name",
        "group_title",
        "institution_name",
        "publisher",
        "primary_domain",
        "doi_prefix_first_token",
    ]
    for col in required_cols:
        if col not in df.columns:
            df[col] = pd.NA

    # --------------------------------------------------------
    # 3.2 Normalized versions (for comparisons)
    # --------------------------------------------------------
    df["norm_domain"] = df["domain_server_name"].apply(normalize_name)
    df["norm_prefix"] = df["prefix_server_name"].apply(normalize_name)
    df["norm_group"] = df["group_title"].apply(normalize_name)
    df["norm_inst"] = df["institution_name"].apply(normalize_name)
    df["norm_pub"] = df["publisher"].apply(normalize_name)

    # --------------------------------------------------------
    # 3.3 Initialize output columns
    # --------------------------------------------------------
    df["validated_server_name"] = pd.NA
    df["validation_status"] = pd.NA
    # Start from NaN so the column is numeric (float64) instead of object dtype
    df["confidence_score"] = float("nan")
    df["rule_id"] = pd.NA  # explicit rule label (R1-R5f)

    # --------------------------------------------------------
    # 3.4 Compute text-match flags (row-wise)
    # --------------------------------------------------------
    # One pass over just the five normalized columns, handing each record to
    # the row helpers as a plain dict. This avoids DataFrame.apply(axis=1),
    # which builds a full Series per row and returns a DataFrame (not a
    # Series) when the frame is empty.
    match_cols = ["norm_domain", "norm_prefix", "norm_group", "norm_inst", "norm_pub"]
    dom_flags = []
    pref_flags = []
    for values in zip(*(df[c] for c in match_cols)):
        light_row = dict(zip(match_cols, values))
        dom_flags.append(row_match_domain(light_row))
        pref_flags.append(row_match_prefix(light_row))

    df["match_dom_text"] = pd.Series(dom_flags, index=df.index, dtype=bool)
    df["match_pref_text"] = pd.Series(pref_flags, index=df.index, dtype=bool)

    dom = df["norm_domain"]
    pref = df["norm_prefix"]
    dom_text = df["match_dom_text"]
    pref_text = df["match_pref_text"]

    same_dom_pref = dom.notna() & pref.notna() & (dom == pref)
    diff_dom_pref = dom.notna() & pref.notna() & (dom != pref)

    # Helper mask for "still not validated"
    def unvalidated():
        return df["validated_server_name"].isna()

    # Helper that writes one rule's outcome to the rows selected by `mask`.
    # When name_col is None the validated name is left untouched (used by R4).
    def apply_rule(mask, status, score, rule_id, name_col=None):
        if name_col is not None:
            df.loc[mask, "validated_server_name"] = df.loc[mask, name_col]
        df.loc[mask, "validation_status"] = status
        df.loc[mask, "confidence_score"] = score
        df.loc[mask, "rule_id"] = rule_id

    # ========================================================
    # R1 - STRONG MATCH (D == P with text support)
    # ========================================================
    mask1 = unvalidated() & same_dom_pref & (dom_text | pref_text)
    apply_rule(mask1, "MATCH_STRONG", 1.0, "R1_MATCH_STRONG", "domain_server_name")

    # ========================================================
    # R2 - WEAK MATCH (D == P without text support)
    # ========================================================
    mask2 = unvalidated() & same_dom_pref & ~dom_text & ~pref_text
    apply_rule(mask2, "MATCH_WEAK", 0.8, "R2_MATCH_WEAK", "domain_server_name")

    # ========================================================
    # R3 - CONFLICT D != P but one is supported by text
    # ========================================================
    remaining = unvalidated() & diff_dom_pref

    # Both masks are built before either assignment so they share one snapshot
    mask3_dom = remaining & dom_text & ~pref_text
    mask3_pref = remaining & pref_text & ~dom_text

    # Domain wins
    apply_rule(mask3_dom, "MATCH_DOMAIN", 0.9, "R3_MATCH_DOMAIN", "domain_server_name")

    # Prefix wins
    apply_rule(mask3_pref, "MATCH_PREFIX", 0.9, "R3_MATCH_PREFIX", "prefix_server_name")

    # ========================================================
    # R4 - LOW CONFIDENCE (D != P and no text support)
    # ========================================================
    remaining = unvalidated() & diff_dom_pref

    mask4 = remaining & ~dom_text & ~pref_text
    apply_rule(mask4, "LOW_CONFIDENCE_MANUAL", 0.3, "R4_LOW_CONFIDENCE")

    # ========================================================
    # R5a - OSF / AMS (use group_title); overrides R1-R4
    # ========================================================
    mask5_group = (
        df["primary_domain"].isin(OSF_AMS_DOMAINS)
        & df["group_title"].notna()
    )
    apply_rule(mask5_group, "MATCH_RULE5_GROUP_TITLE", 0.98, "R5A_OSF_AMS_GROUP_TITLE", "group_title")

    # ========================================================
    # R5b - IBICT (use institution_name); overrides R1-R4
    # ========================================================
    mask5_ibict = (
        (df["primary_domain"] == IBICT_DOMAIN)
        & df["institution_name"].notna()
    )
    apply_rule(mask5_ibict, "MATCH_RULE5_INSTITUTION", 0.98, "R5B_IBICT_INSTITUTION", "institution_name")

    # ========================================================
    # R5c - PubPub (only unvalidated rows)
    # ========================================================
    mask_pubpub = (
        unvalidated()
        & df["primary_domain"].notna()
        & df["primary_domain"].astype(str).str.endswith(PUBPUB_SUFFIX)
    )

    def get_pubpub_label(row):
        """Derive PubPub community name from group_title or subdomain."""
        dom_val = row["primary_domain"]
        if not isinstance(dom_val, str):
            return None

        label = None
        # Prefer group_title if present
        gt = row.get("group_title")
        if isinstance(gt, str) and gt.strip():
            label = gt.strip()
        else:
            # Otherwise derive label from subdomain
            sub = dom_val.split(PUBPUB_SUFFIX)[0]  # text before '.pubpub.org'
            sub = sub.split(".")[0]                # keep left-most segment if multiple dots
            label = sub.replace("-", " ").replace("_", " ").strip()

        return f"{label.title()} (PubPub)" if label else None

    if mask_pubpub.any():
        pubpub_labels = df.loc[mask_pubpub].apply(get_pubpub_label, axis=1)
        has_pubpub_label = pubpub_labels.notna()
        idx_pubpub = df.loc[mask_pubpub].index[has_pubpub_label]

        df.loc[idx_pubpub, "validated_server_name"] = pubpub_labels[has_pubpub_label]
        df.loc[idx_pubpub, "validation_status"] = "MATCH_RULE5_PUBPUB"
        df.loc[idx_pubpub, "confidence_score"] = 0.98
        df.loc[idx_pubpub, "rule_id"] = "R5C_PUBPUB"

    # ========================================================
    # R5d - ResearchCatalogue / Cambridge / etc (prefix_server_name)
    # ========================================================
    mask5_prefix = (
        unvalidated()
        & df["primary_domain"].isin(PREFIX_OVERRIDE_DOMAINS)
        & df["prefix_server_name"].notna()
    )
    apply_rule(mask5_prefix, "MATCH_RULE5_PREFIX_OVERRIDE", 0.97, "R5D_PREFIX_OVERRIDE", "prefix_server_name")

    # ========================================================
    # R5e - domain-based override for specific domains
    # ========================================================
    mask5_domain = (
        unvalidated()
        & df["primary_domain"].isin(DOMAIN_OVERRIDE_DOMAINS)
        & df["domain_server_name"].notna()
    )
    apply_rule(mask5_domain, "MATCH_RULE5_DOMAIN_OVERRIDE", 0.97, "R5E_DOMAIN_OVERRIDE", "domain_server_name")

    # ========================================================
    # R5f - DOI-prefix-based override for specific prefixes
    # ========================================================
    mask5_prefix_doi = (
        unvalidated()
        & df["doi_prefix_first_token"].isin(DOI_PREFIX_OVERRIDE_TOKENS)
        & df["prefix_server_name"].notna()
    )
    apply_rule(mask5_prefix_doi, "MATCH_RULE5_DOI_PREFIX_OVERRIDE", 0.97, "R5F_DOI_PREFIX_OVERRIDE", "prefix_server_name")

    # --------------------------------------------------------
    # 3.5 Manual review subset (after ALL rules)
    # --------------------------------------------------------
    manual_df = df[df["validation_status"] == "LOW_CONFIDENCE_MANUAL"].copy()

    return df, manual_df

In [10]:
# Run the validation pipeline on the merged frame (the input is copied, not
# modified); manual_df holds the rows flagged for manual review.
df_valid, manual_df = validate_server_names(df_domain)
# Share of rows labelled by each rule; rows without a rule_id are not counted.
df_valid["rule_id"].value_counts(normalize=True)


Unnamed: 0_level_0,proportion
rule_id,Unnamed: 1_level_1
R1_MATCH_STRONG,0.5084731
R3_MATCH_DOMAIN,0.1955963
R2_MATCH_WEAK,0.1831446
R5A_OSF_AMS_GROUP_TITLE,0.1084807
R5D_PREFIX_OVERRIDE,0.001593196
R5E_DOMAIN_OVERRIDE,0.001482736
R3_MATCH_PREFIX,0.001074804
R5F_DOI_PREFIX_OVERRIDE,0.0001089189
R5C_PUBPUB,3.956017e-05
R5B_IBICT_INSTITUTION,5.651453e-06


In [11]:
# Absolute number of rows labelled by each rule
df_valid["rule_id"].value_counts()

Unnamed: 0_level_0,count
rule_id,Unnamed: 1_level_1
R1_MATCH_STRONG,989693
R3_MATCH_DOMAIN,380709
R2_MATCH_WEAK,356473
R5A_OSF_AMS_GROUP_TITLE,211147
R5D_PREFIX_OVERRIDE,3101
R5E_DOMAIN_OVERRIDE,2886
R3_MATCH_PREFIX,2092
R5F_DOI_PREFIX_OVERRIDE,212
R5C_PUBPUB,77
R5B_IBICT_INSTITUTION,11


In [12]:
# Number of rows per validation_status value
df_valid["validation_status"].value_counts()

Unnamed: 0_level_0,count
validation_status,Unnamed: 1_level_1
MATCH_STRONG,989693
MATCH_DOMAIN,380709
MATCH_WEAK,356473
MATCH_RULE5_GROUP_TITLE,211147
MATCH_RULE5_PREFIX_OVERRIDE,3101
MATCH_RULE5_DOMAIN_OVERRIDE,2886
MATCH_PREFIX,2092
MATCH_RULE5_DOI_PREFIX_OVERRIDE,212
MATCH_RULE5_PUBPUB,77
MATCH_RULE5_INSTITUTION,11


In [13]:
# Number of rows whose validation_status is LOW_CONFIDENCE_MANUAL
print("Manual review rows:", manual_df.shape[0])

Manual review rows: 1


In [14]:
# Number of records assigned to each validated server name, most frequent first
df_valid["validated_server_name"].value_counts()

Unnamed: 0_level_0,count
validated_server_name,Unnamed: 1_level_1
SSRN,443583
Research Square,436841
bioRxiv,303620
Preprints.org,110562
Open Science Framework,103422
...,...
MobilityRxiv,1
Underlay,1
TopQAD,1
KTH Royal Institute of Technology,1


In [15]:
# Distinct validated server names, ordered from most to least frequent.
# The names are read straight from the value_counts index; going through
# reset_index() and selecting by column name only returns the names when the
# index carries the column name (pandas >= 2.0 behaviour).
df_valid["validated_server_name"].value_counts().index.tolist()

['SSRN',
 'Research Square',
 'bioRxiv',
 'Preprints.org',
 'Open Science Framework',
 'medRxiv',
 'Authorea Inc.',
 'PsyArXiv',
 'ChemRxiv',
 'JMIR Preprints',
 'protocols.io',
 'TechRxiv',
 'SocArXiv',
 'ESS Open Archive',
 'INA-Rxiv',
 'Atmospheric Chemistry and Physics',
 'EGUsphere',
 'eLife',
 'EarthArXiv',
 'PeerJ Preprints',
 'Biogeosciences',
 'Hydrology and Earth System Sciences',
 'Qeios',
 'Atmospheric Measurement Techniques',
 'engrXiv',
 'UMSIDA Preprints Server',
 'Optica Open',
 'Advance',
 'Thesis Commons',
 'SciELO Preprints',
 'Geoscientific Model Development',
 'Energy Proceedings',
 'The Cryosphere',
 'Cambridge Open Engage',
 'EcoEvoRxiv',
 'Natural Hazards and Earth System Sciences',
 'Earth System Science Data',
 'EdArXiv',
 'Climate of the Past',
 'Verfassungsblog',
 'Research Archive of Rising Scholars',
 'ScienceOpen Preprints',
 'University of Southern Mississippi Libraries',
 'Even3',
 'LawArXiv',
 'APSA Preprints',
 'Federal Reserve Bank of Minneapolis Res

In [16]:
# Sanity check: list records that received no domain_server_name from the lookup merge
df_valid[df_valid['domain_server_name'].isna()]

Unnamed: 0,doi,posted_date,url,primary_url,type,subtype,prefix,publisher,content_domain_json,container_title,short_container_title,institution_name,is_preprint_of,has_preprint,is_version_of,group_title,member,gold_server_name,doi_lc,prefix_lc,doi_prefix_from_text,doi_suffix,doi_prefix_first_token,doi_prefix_bucket_2d,primary_domain,primary_domain_extend,year,domain_server_name,prefix_server_name,norm_domain,norm_prefix,norm_group,norm_inst,norm_pub,validated_server_name,validation_status,confidence_score,rule_id,match_dom_text,match_pref_text


In [17]:
# Spot-check the records whose validated server name is 'Life Sciences'
df_valid[df_valid['validated_server_name']=='Life Sciences']

Unnamed: 0,doi,posted_date,url,primary_url,type,subtype,prefix,publisher,content_domain_json,container_title,short_container_title,institution_name,is_preprint_of,has_preprint,is_version_of,group_title,member,gold_server_name,doi_lc,prefix_lc,doi_prefix_from_text,doi_suffix,doi_prefix_first_token,doi_prefix_bucket_2d,primary_domain,primary_domain_extend,year,domain_server_name,prefix_server_name,norm_domain,norm_prefix,norm_group,norm_inst,norm_pub,validated_server_name,validation_status,confidence_score,rule_id,match_dom_text,match_pref_text
446351,10.32942/osf.io/uqnvf,2021-03-07,https://doi.org/10.32942/osf.io/uqnvf,https://osf.io/uqnvf,posted-content,preprint,10.32942,California Digital Library (CDL),"{""crossmark-restriction"": false, ""domain"": []}",,,,,,,Life Sciences,29705,Life Sciences,10.32942/osf.io/uqnvf,10.32942,10.32942,osf.io/uqnvf,10.32942/osf,10.32942/osf,osf.io,osf.io/uqnvf,2021,OSF communities,OSF communities,osf communities,osf communities,life sciences,,california digital library cdl,Life Sciences,MATCH_RULE5_GROUP_TITLE,0.98,R5A_OSF_AMS_GROUP_TITLE,False,False
469448,10.32942/osf.io/p2syu,2021-04-15,https://doi.org/10.32942/osf.io/p2syu,https://osf.io/p2syu,posted-content,preprint,10.32942,California Digital Library (CDL),"{""crossmark-restriction"": false, ""domain"": []}",,,,,,,Life Sciences,29705,Life Sciences,10.32942/osf.io/p2syu,10.32942,10.32942,osf.io/p2syu,10.32942/osf,10.32942/osf,osf.io,osf.io/p2syu,2021,OSF communities,OSF communities,osf communities,osf communities,life sciences,,california digital library cdl,Life Sciences,MATCH_RULE5_GROUP_TITLE,0.98,R5A_OSF_AMS_GROUP_TITLE,False,False
530791,10.32942/osf.io/bpyvd,2021-08-03,https://doi.org/10.32942/osf.io/bpyvd,https://osf.io/bpyvd,posted-content,preprint,10.32942,California Digital Library (CDL),"{""crossmark-restriction"": false, ""domain"": []}",,,,,,,Life Sciences,29705,Life Sciences,10.32942/osf.io/bpyvd,10.32942,10.32942,osf.io/bpyvd,10.32942/osf,10.32942/osf,osf.io,osf.io/bpyvd,2021,OSF communities,OSF communities,osf communities,osf communities,life sciences,,california digital library cdl,Life Sciences,MATCH_RULE5_GROUP_TITLE,0.98,R5A_OSF_AMS_GROUP_TITLE,False,False
562086,10.32942/osf.io/4watk,2021-10-05,https://doi.org/10.32942/osf.io/4watk,https://osf.io/4watk,posted-content,preprint,10.32942,California Digital Library (CDL),"{""crossmark-restriction"": false, ""domain"": []}",,,,,,,Life Sciences,29705,Life Sciences,10.32942/osf.io/4watk,10.32942,10.32942,osf.io/4watk,10.32942/osf,10.32942/osf,osf.io,osf.io/4watk,2021,OSF communities,OSF communities,osf communities,osf communities,life sciences,,california digital library cdl,Life Sciences,MATCH_RULE5_GROUP_TITLE,0.98,R5A_OSF_AMS_GROUP_TITLE,False,False
580215,10.32942/osf.io/rmqy4,2021-11-08,https://doi.org/10.32942/osf.io/rmqy4,https://osf.io/rmqy4,posted-content,preprint,10.32942,California Digital Library (CDL),"{""crossmark-restriction"": false, ""domain"": []}",,,,,,,Life Sciences,29705,Life Sciences,10.32942/osf.io/rmqy4,10.32942,10.32942,osf.io/rmqy4,10.32942/osf,10.32942/osf,osf.io,osf.io/rmqy4,2021,OSF communities,OSF communities,osf communities,osf communities,life sciences,,california digital library cdl,Life Sciences,MATCH_RULE5_GROUP_TITLE,0.98,R5A_OSF_AMS_GROUP_TITLE,False,False
591302,10.32942/osf.io/jf2as,2021-11-30,https://doi.org/10.32942/osf.io/jf2as,https://osf.io/jf2as,posted-content,preprint,10.32942,California Digital Library (CDL),"{""crossmark-restriction"": false, ""domain"": []}",,,,,,,Life Sciences,29705,Life Sciences,10.32942/osf.io/jf2as,10.32942,10.32942,osf.io/jf2as,10.32942/osf,10.32942/osf,osf.io,osf.io/jf2as,2021,OSF communities,OSF communities,osf communities,osf communities,life sciences,,california digital library cdl,Life Sciences,MATCH_RULE5_GROUP_TITLE,0.98,R5A_OSF_AMS_GROUP_TITLE,False,False
673614,10.32942/osf.io/bfuw9,2022-04-26,https://doi.org/10.32942/osf.io/bfuw9,https://osf.io/bfuw9,posted-content,preprint,10.32942,California Digital Library (CDL),"{""crossmark-restriction"": false, ""domain"": []}",,,,,,,Life Sciences,29705,Life Sciences,10.32942/osf.io/bfuw9,10.32942,10.32942,osf.io/bfuw9,10.32942/osf,10.32942/osf,osf.io,osf.io/bfuw9,2022,OSF communities,OSF communities,osf communities,osf communities,life sciences,,california digital library cdl,Life Sciences,MATCH_RULE5_GROUP_TITLE,0.98,R5A_OSF_AMS_GROUP_TITLE,False,False
725845,10.32942/osf.io/fgwn9,2022-07-21,https://doi.org/10.32942/osf.io/fgwn9,https://osf.io/fgwn9,posted-content,preprint,10.32942,California Digital Library (CDL),"{""crossmark-restriction"": false, ""domain"": []}",,,,,,,Life Sciences,29705,Life Sciences,10.32942/osf.io/fgwn9,10.32942,10.32942,osf.io/fgwn9,10.32942/osf,10.32942/osf,osf.io,osf.io/fgwn9,2022,OSF communities,OSF communities,osf communities,osf communities,life sciences,,california digital library cdl,Life Sciences,MATCH_RULE5_GROUP_TITLE,0.98,R5A_OSF_AMS_GROUP_TITLE,False,False
743797,10.32942/osf.io/gvpm9,2022-08-24,https://doi.org/10.32942/osf.io/gvpm9,https://osf.io/gvpm9,posted-content,preprint,10.32942,California Digital Library (CDL),"{""crossmark-restriction"": false, ""domain"": []}",,,,,,,Life Sciences,29705,Life Sciences,10.32942/osf.io/gvpm9,10.32942,10.32942,osf.io/gvpm9,10.32942/osf,10.32942/osf,osf.io,osf.io/gvpm9,2022,OSF communities,OSF communities,osf communities,osf communities,life sciences,,california digital library cdl,Life Sciences,MATCH_RULE5_GROUP_TITLE,0.98,R5A_OSF_AMS_GROUP_TITLE,False,False


In [18]:
# Show every record whose validated server name is 'Physical Sciences and Mathematics'.
df_valid[df_valid['validated_server_name']=='Physical Sciences and Mathematics']

Unnamed: 0,doi,posted_date,url,primary_url,type,subtype,prefix,publisher,content_domain_json,container_title,short_container_title,institution_name,is_preprint_of,has_preprint,is_version_of,group_title,member,gold_server_name,doi_lc,prefix_lc,doi_prefix_from_text,doi_suffix,doi_prefix_first_token,doi_prefix_bucket_2d,primary_domain,primary_domain_extend,year,domain_server_name,prefix_server_name,norm_domain,norm_prefix,norm_group,norm_inst,norm_pub,validated_server_name,validation_status,confidence_score,rule_id,match_dom_text,match_pref_text
314762,10.32942/osf.io/kzdxr,2020-07-22,https://doi.org/10.32942/osf.io/kzdxr,https://osf.io/kzdxr,posted-content,preprint,10.32942,California Digital Library (CDL),"{""crossmark-restriction"": false, ""domain"": []}",,,,,,,Physical Sciences and Mathematics,29705,Physical Sciences and Mathematics,10.32942/osf.io/kzdxr,10.32942,10.32942,osf.io/kzdxr,10.32942/osf,10.32942/osf,osf.io,osf.io/kzdxr,2020,OSF communities,OSF communities,osf communities,osf communities,physical sciences and mathematics,,california digital library cdl,Physical Sciences and Mathematics,MATCH_RULE5_GROUP_TITLE,0.98,R5A_OSF_AMS_GROUP_TITLE,False,False
632955,10.32942/osf.io/e623t,2022-02-15,https://doi.org/10.32942/osf.io/e623t,https://osf.io/e623t,posted-content,preprint,10.32942,California Digital Library (CDL),"{""crossmark-restriction"": false, ""domain"": []}",,,,,,,Physical Sciences and Mathematics,29705,Physical Sciences and Mathematics,10.32942/osf.io/e623t,10.32942,10.32942,osf.io/e623t,10.32942/osf,10.32942/osf,osf.io,osf.io/e623t,2022,OSF communities,OSF communities,osf communities,osf communities,physical sciences and mathematics,,california digital library cdl,Physical Sciences and Mathematics,MATCH_RULE5_GROUP_TITLE,0.98,R5A_OSF_AMS_GROUP_TITLE,False,False


In [19]:
# Show every record whose validated server name is 'Social and Behavioral Sciences'.
df_valid[df_valid['validated_server_name']=='Social and Behavioral Sciences']

Unnamed: 0,doi,posted_date,url,primary_url,type,subtype,prefix,publisher,content_domain_json,container_title,short_container_title,institution_name,is_preprint_of,has_preprint,is_version_of,group_title,member,gold_server_name,doi_lc,prefix_lc,doi_prefix_from_text,doi_suffix,doi_prefix_first_token,doi_prefix_bucket_2d,primary_domain,primary_domain_extend,year,domain_server_name,prefix_server_name,norm_domain,norm_prefix,norm_group,norm_inst,norm_pub,validated_server_name,validation_status,confidence_score,rule_id,match_dom_text,match_pref_text
542210,10.32942/osf.io/vsmka,2021-08-27,https://doi.org/10.32942/osf.io/vsmka,https://osf.io/vsmka,posted-content,preprint,10.32942,California Digital Library (CDL),"{""crossmark-restriction"": false, ""domain"": []}",,,,,,,Social and Behavioral Sciences,29705,Social and Behavioral Sciences,10.32942/osf.io/vsmka,10.32942,10.32942,osf.io/vsmka,10.32942/osf,10.32942/osf,osf.io,osf.io/vsmka,2021,OSF communities,OSF communities,osf communities,osf communities,social and behavioral sciences,,california digital library cdl,Social and Behavioral Sciences,MATCH_RULE5_GROUP_TITLE,0.98,R5A_OSF_AMS_GROUP_TITLE,False,False
710225,10.32942/osf.io/z74xh,2022-06-24,https://doi.org/10.32942/osf.io/z74xh,https://osf.io/z74xh,posted-content,preprint,10.32942,California Digital Library (CDL),"{""crossmark-restriction"": false, ""domain"": []}",,,,,,,Social and Behavioral Sciences,29705,Social and Behavioral Sciences,10.32942/osf.io/z74xh,10.32942,10.32942,osf.io/z74xh,10.32942/osf,10.32942/osf,osf.io,osf.io/z74xh,2022,OSF communities,OSF communities,osf communities,osf communities,social and behavioral sciences,,california digital library cdl,Social and Behavioral Sciences,MATCH_RULE5_GROUP_TITLE,0.98,R5A_OSF_AMS_GROUP_TITLE,False,False


## 3-Fuzzy duplicate detection

In [21]:
!pip install fuzzywuzzy
from fuzzywuzzy import process
import pandas as pd

# ---------------------------------------------
# 1. Prepare clean list of candidate names
# ---------------------------------------------
# Keep only non-null, stripped, unique strings
server_names = (
    pd.Series(df_valid["validated_server_name"])
      .dropna()
      .astype(str)
      .str.strip()
      .loc[lambda s: s != ""]
      .unique()
      .tolist()
)

print(f"Number of unique validated_server_name values: {len(server_names)}")

# ---------------------------------------------
# 2. Fuzzy duplicate detection
# ---------------------------------------------
THRESHOLD = 88  # you can tune this
pairs = []
seen_pairs = set()  # to avoid (A,B) and (B,A) duplicates

for name in server_names:
    # All fuzzy matches of `name` against the full list
    matches = process.extract(name, server_names, limit=None)

    for match, score in matches:
        # Skip identical name and low scores
        if match == name or score <= THRESHOLD:
            continue

        # Create an unordered pair key to avoid symmetry
        key = tuple(sorted([name, match]))
        if key in seen_pairs:
            continue

        seen_pairs.add(key)
        pairs.append({
            "name_1": key[0],
            "name_2": key[1],
            "similarity": score,
        })

# ---------------------------------------------
# 3. Convert to DataFrame for analysis/reporting
# ---------------------------------------------
if pairs:
    dup_df = pd.DataFrame(pairs).sort_values("similarity", ascending=False)
    print(f"\nPotential duplicate server_names (threshold > {THRESHOLD}):")
    display(dup_df.head(50))  # or print, or export to CSV
else:
    print(f"No potential duplicate server_names found (threshold > {THRESHOLD}).")


Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0




Number of unique validated_server_name values: 304

Potential duplicate server_names (threshold > 88):


Unnamed: 0,name_1,name_2,similarity
12,Instituto Brasileiro de InformaÃ§Ã£o em CiÃªncia ...,Instituto Brasileiro de InformaÃ§Ã£o em CiÃªncia ...,99
0,Journal for Artistic Research,VIS â€“ Nordic Journal for Artistic Research,95
7,AgriXiv,agriRxiv,93
1,IUScholarWorks,ScholarWorks,92
2,ScholarWorks,Works,90
4,IUScholarWorks,Works,90
3,San JosÃ© State University ScholarWorks,ScholarWorks,90
6,Sage Bionetworks,Works,90
5,San JosÃ© State University ScholarWorks,Works,90
8,Authorea Inc.,Authorea Inc./ test/ link not working,90


In [22]:
# Corrections dictionary: maps a validated server name (exact match, after
# stripping) to the canonical name that should replace it.
server_name_corrections = {
    # Spelling / casing variants of the same server
    # (e.g. AgriXiv / agriRxiv were flagged by the fuzzy check above).
    "Techrxiv": "TechRxiv",
    "agriRxiv": "AgriRxiv",
    "AgriXiv": "AgriRxiv",
    "elife": "eLife",
    # Identity entry: has no effect on the data.
    "eLife": "eLife",
    # Alternative names mapped to one canonical name.
    "ESS Open Archive": "Earth and Space Science Open Archive",
    "LawArXiv": "Law Archive",
    "Instituto Brasileiro de InformaÃ§Ã£o em CiÃªncia e Tecnologia Ibict": "Instituto Brasileiro de InformaÃ§Ã£o em CiÃªncia e Tecnologia (Ibict)",
    "EMERI": "EmeRI",


    # Group-title labels (see the inspection cells above) folded into EcoEvoRxiv.
    # NOTE(review): the replacement is by label only, so any record carrying one
    # of these labels is relabelled regardless of its DOI prefix - confirm the
    # labels occur only on the intended records.
    "Life Sciences" : "EcoEvoRxiv",
    "Physical Sciences and Mathematics" : "EcoEvoRxiv",
    "Social and Behavioral Sciences" : "EcoEvoRxiv",

}



# Apply the corrections.
# Back up the uncorrected labels only once: re-running this cell must not
# overwrite the backup with names that were already corrected.
if "validated_server_name_old" not in df_valid.columns:
    df_valid["validated_server_name_old"] = df_valid["validated_server_name"]

# Always derive the corrected column from the backup so the cell is idempotent:
#   1. trim surrounding whitespace so exact-match keys in the dictionary hit,
#   2. replace whole values using the corrections dictionary,
#   3. collapse any internal run of whitespace to a single space.
df_valid["validated_server_name"] = (
    df_valid["validated_server_name_old"]
    .str.strip()
    .replace(server_name_corrections)
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)

In [23]:
# Number of records per corrected server name, most frequent first.
df_valid["validated_server_name"].value_counts()

Unnamed: 0_level_0,count
validated_server_name,Unnamed: 1_level_1
SSRN,443583
Research Square,436841
bioRxiv,303620
Preprints.org,110562
Open Science Framework,103422
...,...
Pbiunamin (PubPub),1
MobilityRxiv,1
MELBA journal,1
Datacite,1


# final file

In [24]:
import pandas as pd
from typing import Dict, List, Tuple


MISSING_TOKEN = "MISSING"


def _top_k_counts(s: pd.Series, k: int | None = None) -> List[str]:
    """
    Return a list like ['value (count)', ...] in descending count order.
    NaNs are represented consistently as 'MISSING'.
    """
    vc = s.fillna(MISSING_TOKEN).value_counts(dropna=False)
    if k is not None:
        vc = vc.head(k)
    # ensure names are str to avoid "nan" looking odd
    return [f"{str(name)} ({int(cnt)})" for name, cnt in vc.items()]


def _sample_unique(s: pd.Series, k: int = 10) -> List[str]:
    """
    Return up to k unique, non-null examples (stable order).
    """
    if s is None or s.empty:
        return []
    vals = s.dropna().unique()
    if len(vals) == 0:
        return []
    # Cast to str to guard against non-string types sneaking in
    return [str(v) for v in vals[:k]]


def _build_sharing_maps(df: pd.DataFrame) -> Dict[str, Dict[str, List[str]]]:
    """
    Precompute maps of:
      - prefix -> sorted unique server_names
      - member -> sorted unique server_names
      - primary_domain -> sorted unique server_names
    Only non-null keys are included in maps; values exclude NaN.
    """
    maps: Dict[str, Dict[str, List[str]]] = {}

    if "prefix" in df.columns and "validated_server_name" in df.columns:
        maps["prefix_to_server_names"] = (
            df.dropna(subset=["prefix"])
              .groupby("prefix")["validated_server_name"]
              .apply(lambda x: sorted(pd.Series(x.dropna().unique())))
              .to_dict()
        )
    else:
        maps["prefix_to_server_names"] = {}

    if "member" in df.columns and "validated_server_name" in df.columns:
        maps["member_to_server_names"] = (
            df.dropna(subset=["member"])
              .groupby("member")["validated_server_name"]
              .apply(lambda x: sorted(pd.Series(x.dropna().unique())))
              .to_dict()
        )
    else:
        maps["member_to_server_names"] = {}

    if "primary_domain" in df.columns and "validated_server_name" in df.columns:
        maps["domain_to_server_names"] = (
            df.dropna(subset=["primary_domain"])
              .groupby("primary_domain")["validated_server_name"]
              .apply(lambda x: sorted(pd.Series(x.dropna().unique())))
              .to_dict()
        )
    else:
        maps["domain_to_server_names"] = {}

    if "group_title" in df.columns and "validated_server_name" in df.columns:
        maps["group_title_to_server_names"] = (
            df.dropna(subset=["group_title"])
              .groupby("group_title")["validated_server_name"]
              .apply(lambda x: sorted(pd.Series(x.dropna().unique())))
              .to_dict()
        )
    else:
        maps["group_title_to_server_names"] = {}

    if "institution_name" in df.columns and "validated_server_name" in df.columns:
        maps["institution_name_to_server_names"] = (
            df.dropna(subset=["institution_name"])
              .groupby("institution_name")["validated_server_name"]
              .apply(lambda x: sorted(pd.Series(x.dropna().unique())))
              .to_dict()
        )
    else:
        maps["institution_name_to_server_names"] = {}

    if "doi_prefix_first_token" in df.columns and "validated_server_name" in df.columns:
        maps["doi_prefix_first_token_to_server_names"] = (
            df.dropna(subset=["doi_prefix_first_token"])
              .groupby("doi_prefix_first_token")["validated_server_name"]
              .apply(lambda x: sorted(pd.Series(x.dropna().unique())))
              .to_dict()
        )
    else:
        maps["doi_prefix_first_token_to_server_names"] = {}

    return maps


def _column_or_blank(frame: pd.DataFrame, column: str) -> pd.Series:
    """Return `frame[column]`, or an all-NaN Series on `frame.index` when the column is absent."""
    if column in frame.columns:
        return frame[column]
    return pd.Series(index=frame.index, dtype=object)


def _servers_sharing(group: pd.DataFrame, column: str, mapping: Dict) -> List[str]:
    """Sorted union of `mapping[key]` over the distinct non-null keys in `group[column]`."""
    if column not in group.columns:
        return []
    shared = set()
    for key in group[column].dropna().unique():
        shared.update(mapping.get(key, ()))
    return sorted(shared)


def summarize_by_field(
    df: pd.DataFrame,
    field: str,
    examples_k: int = 10,
    preprint_subtype_value: str = "preprint",
) -> pd.DataFrame:
    """
    Produce a summary DataFrame with one row per distinct value of `field`.

    Columns in the result, in order:
      - Field_<field>: the group key
      - Publishers / Prefixes / Members / institution_name / group_title /
        primary_domain / primary_domain_extend / doi_prefix_first_token /
        gold_server_name / validation_status / rule_id / year:
        'value (count)' lists for the corresponding source column
      - Associated with Institution: bool (any non-null institution_name in the group)
      - institution_name_count / group_title_count: number of distinct values,
        with missing values counted as one MISSING bucket (0 if the column is absent)
      - Example URLs / Example Primary URLs / Example DOIs: up to `examples_k`
        unique examples
      - Number of Preprint Works: rows where `subtype == preprint_subtype_value`
        (0 if there is no `subtype` column)
      - Server Sharing Prefix / Member / Primary Domain / Group Title's /
        Institution / DOI Prefix and Token, each followed by its
        '<name> Count' column: sorted union of the server names that share any of
        the group's prefixes / members / ... according to maps built over all of `df`

    Parameters:
      df: input records; not modified (the function works on a copy).
      field: name of the column to group by.
      examples_k: maximum number of examples per example column.
      preprint_subtype_value: `subtype` value that identifies a preprint.

    Returns:
      The summary, sorted by descending preprint count and then ascending
      group key (stable sort), with a fresh 0..n-1 index. If `df` yields no
      groups (e.g. it has no rows), an empty DataFrame with the same columns
      is returned.

    Raises:
      KeyError: if `field` is not a column of `df`.

    Notes:
    - NaNs in the grouping column are mapped to 'MISSING' as an explicit bucket.
    """
    if field not in df.columns:
        raise KeyError(f"Field {field!r} not in DataFrame")

    field_col = f"Field_{field}"

    # output column -> source column, for the 'value (count)' distributions
    distribution_columns = {
        "Publishers": "publisher",
        "Prefixes": "prefix",
        "Members": "member",
        "institution_name": "institution_name",
        "group_title": "group_title",
        "primary_domain": "primary_domain",
        "primary_domain_extend": "primary_domain_extend",
        "doi_prefix_first_token": "doi_prefix_first_token",
        "gold_server_name": "gold_server_name",
        "validation_status": "validation_status",
        "rule_id": "rule_id",
        "year": "year",
    }

    # output column -> source column, for the example lists
    example_columns = {
        "Example URLs": "url",
        "Example Primary URLs": "primary_url",
        "Example DOIs": "doi",
    }

    # Precompute the global key -> server-name maps once per call.
    maps = _build_sharing_maps(df)
    # output column -> (source column, global map for that column)
    sharing_sources = {
        "Server Sharing Prefix": ("prefix", maps["prefix_to_server_names"]),
        "Server Sharing Member": ("member", maps["member_to_server_names"]),
        "Server Sharing Primary Domain": ("primary_domain", maps["domain_to_server_names"]),
        "Server Sharing Group Title's": ("group_title", maps["group_title_to_server_names"]),
        "Server Sharing Institution": ("institution_name", maps["institution_name_to_server_names"]),
        "Server Sharing DOI Prefix and Token": (
            "doi_prefix_first_token",
            maps["doi_prefix_first_token_to_server_names"],
        ),
    }

    # Full, ordered column list; also used to shape the empty result.
    output_columns = [
        field_col,
        *distribution_columns,
        "Associated with Institution",
        "institution_name_count",
        "group_title_count",
        *example_columns,
        "Number of Preprint Works",
    ]
    for label in sharing_sources:
        output_columns += [label, f"{label} Count"]

    # Work on a copy so the caller's frame keeps its NaNs in `field`.
    work = df.copy()
    work[field] = work[field].fillna(MISSING_TOKEN)

    rows: List[Dict] = []
    for server_key, group in work.groupby(field, dropna=False):
        row: Dict = {field_col: server_key}

        # Distributions (as 'value (count)' lists)
        for out_name, column in distribution_columns.items():
            row[out_name] = _top_k_counts(_column_or_blank(group, column))

        # Boolean / distinct counts
        row["Associated with Institution"] = bool(
            _column_or_blank(group, "institution_name").notna().any()
        )
        for column in ("institution_name", "group_title"):
            if column in group.columns:
                row[f"{column}_count"] = int(group[column].fillna(MISSING_TOKEN).nunique())
            else:
                row[f"{column}_count"] = 0

        # Examples (unique, up to k); an absent column gives an empty list
        for out_name, column in example_columns.items():
            row[out_name] = _sample_unique(_column_or_blank(group, column), examples_k)

        # Within-group preprint count (0 when there is no subtype column)
        if "subtype" in group.columns:
            row["Number of Preprint Works"] = int(
                (group["subtype"] == preprint_subtype_value).sum()
            )
        else:
            row["Number of Preprint Works"] = 0

        # Server names sharing any of this group's prefixes/members/domains/...
        for label, (column, mapping) in sharing_sources.items():
            sharing = _servers_sharing(group, column, mapping)
            row[label] = sharing
            row[f"{label} Count"] = len(sharing)

        rows.append(row)

    # Without rows there would be no columns to sort on (KeyError), so return
    # an empty, correctly shaped frame instead.
    if not rows:
        return pd.DataFrame(columns=output_columns)

    summary = pd.DataFrame(rows, columns=output_columns)

    # Stable sort by largest preprint count then name
    summary = summary.sort_values(
        by=["Number of Preprint Works", field_col],
        ascending=[False, True],
        kind="mergesort"
    ).reset_index(drop=True)

    return summary


In [25]:
# -- Run for multiple fields --------------------------------------------------
fields = ["validated_server_name_old", "validated_server_name"]

# One summary table per grouping column, keyed by the column name.
summaries = {fld: summarize_by_field(df_valid, fld) for fld in fields}

# Save every summary as a CSV in the labeling output folder.
output_dir = "/content/drive/MyDrive/ScholCommLab/DATA/CROSSREF/server_labeling_approach"
os.makedirs(output_dir, exist_ok=True)

# NOTE(review): `latest_date` is not assigned in this cell; it is presumably set
# by an earlier cell - confirm it survives Restart & Run All. A previous version
# derived it here with pd.Timestamp.utcnow().date().isoformat().
for fld, sdf in summaries.items():
    csv_path = os.path.join(
        output_dir,
        f"final_summary_crossref_preprints_{fld}_{latest_date}.csv",
    )
    # utf-8-sig writes a BOM at the start of the file
    sdf.to_csv(csv_path, index=False, encoding="utf-8-sig")

In [26]:
# Summary grouped by the corrected server names.
display(summaries['validated_server_name'])

Unnamed: 0,Field_validated_server_name,Publishers,Prefixes,Members,institution_name,group_title,primary_domain,primary_domain_extend,doi_prefix_first_token,gold_server_name,validation_status,rule_id,year,Associated with Institution,institution_name_count,group_title_count,Example URLs,Example Primary URLs,Example DOIs,Number of Preprint Works,Server Sharing Prefix,Server Sharing Prefix Count,Server Sharing Member,Server Sharing Member Count,Server Sharing Primary Domain,Server Sharing Primary Domain Count,Server Sharing Group Title's,Server Sharing Group Title's Count,Server Sharing Institution,Server Sharing Institution Count,Server Sharing DOI Prefix and Token,Server Sharing DOI Prefix and Token Count
0,SSRN,[Elsevier BV (443583)],[10.2139 (443583)],[78 (443583)],[MISSING (443583)],[SSRN (443583)],[ssrn.com (443583)],[ssrn.com/abstract (443583)],[10.2139/ssrn (443583)],[SSRN (443583)],[MATCH_STRONG (443583)],[R1_MATCH_STRONG (443583)],"[2024 (177125), 2025 (149595), 2023 (114555), ...",False,1,1,"[https://doi.org/10.2139/ssrn.1601118, https:/...","[https://www.ssrn.com/abstract=1601118, https:...","[10.2139/ssrn.1601118, 10.2139/ssrn.2709350, 1...",443583,[SSRN],1,[SSRN],1,[SSRN],1,[SSRN],1,[],0,[SSRN],1
1,Research Square,[Springer Science and Business Media LLC (2647...,[10.21203 (436841)],"[297 (264738), 8761 (172103)]","[Research Square (415077), MISSING (21764)]","[In Review (436156), Protocol Exchange (683), ...","[researchsquare.com (436158), protocols.io (683)]","[researchsquare.com/article (436158), protocol...","[10.21203/rs (436840), 10.21203/rs- (1)]","[Research Square (415077), In Review (21762), ...","[MATCH_STRONG (421361), MATCH_WEAK (14797), MA...","[R1_MATCH_STRONG (421361), R2_MATCH_WEAK (1479...","[2023 (82465), 2022 (75681), 2021 (75168), 202...",True,2,3,"[https://doi.org/10.21203/rs.1.1/v1, https://d...",[https://www.researchsquare.com/article/rs-2/v...,"[10.21203/rs.1.1/v1, 10.21203/rs.1.2/v1, 10.21...",436841,"[Research Square, protocols.io]",2,"[Research Square, protocols.io]",2,"[Research Square, protocols.io]",2,"[Research Square, protocols.io]",2,[Research Square],1,"[Research Square, protocols.io]",2
2,bioRxiv,[Cold Spring Harbor Laboratory (303620)],[10.1101 (303620)],[246 (303620)],"[bioRxiv (303619), Cold Spring Harbor Laborato...","[Neuroscience (54496), Microbiology (28706), B...",[biorxiv.org (303620)],[biorxiv.org/lookup (303620)],"[10.1101/20 (237028), 10.1101/10 (828), 10.110...","[bioRxiv (303619), Cold Spring Harbor Laborato...","[MATCH_DOMAIN (303619), MATCH_RULE5_DOMAIN_OVE...","[R3_MATCH_DOMAIN (303619), R5E_DOMAIN_OVERRIDE...","[2024 (43585), 2025 (40812), 2023 (39130), 202...",True,2,30,"[https://doi.org/10.1101/000042, https://doi.o...","[http://biorxiv.org/lookup/doi/10.1101/000042,...","[10.1101/000042, 10.1101/000125, 10.1101/00002...",303620,"[bioRxiv, medRxiv]",2,"[bioRxiv, medRxiv]",2,[bioRxiv],1,"[Earth and Space Science Open Archive, EarthAr...",6,[bioRxiv],1,"[bioRxiv, medRxiv]",2
3,Preprints.org,[MDPI AG (110562)],[10.20944 (110562)],[1968 (110562)],[MISSING (110562)],"[Biology and Life Sciences (15001), Medicine a...",[preprints.org (110562)],[preprints.org/manuscript (110562)],[10.20944/preprints (110562)],"[Biology and Life Sciences (15001), Medicine a...",[MATCH_WEAK (110562)],[R2_MATCH_WEAK (110562)],"[2024 (28613), 2025 (26177), 2023 (21790), 202...",False,1,24,[https://doi.org/10.20944/preprints201605.0001...,[http://www.preprints.org/manuscript/201605.00...,"[10.20944/preprints201605.0001.v1, 10.20944/pr...",110562,[Preprints.org],1,"[Encyclopedia, Preprints.org]",2,[Preprints.org],1,"[Cambridge Open Engage, EarthArXiv, EcoEvoRxiv...",7,[],0,[Preprints.org],1
4,Open Science Framework,"[Center for Open Science (103243), California ...","[10.31219 (101368), 10.31227 (649), 10.31234 (...","[15934 (103243), 29705 (110), 242 (56), 33966 ...",[MISSING (103422)],[Open Science Framework (103422)],[osf.io (103422)],"[osf.io/895wn_v1 (2), osf.io/q6jyt_v1 (2), osf...","[10.31219/osf (101368), 10.31227/osf (649), 10...",[Open Science Framework (103422)],[MATCH_RULE5_GROUP_TITLE (103422)],[R5A_OSF_AMS_GROUP_TITLE (103422)],"[2022 (21849), 2023 (20952), 2021 (19909), 202...",False,1,1,"[https://doi.org/10.31219/osf.io/2twgy, https:...","[https://osf.io/2twgy, https://osf.io/w9623, h...","[10.31219/osf.io/2twgy, 10.31219/osf.io/w9623,...",103422,"[AfricArXiv, AgriRxiv, Arabixiv, BITSS, BodoAr...",27,"[AfricArXiv, AgriRxiv, Aquaculture Compendium,...",36,"[AfricArXiv, AgriRxiv, Arabixiv, BITSS, BioHac...",30,[Open Science Framework],1,[],0,"[AfricArXiv, AgriRxiv, Arabixiv, BITSS, BodoAr...",26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,Universidad de Lima,[Universidad de Lima (1)],[10.26439 (1)],[10978 (1)],[MISSING (1)],[Repositorio Universidad de Lima - Prepublicac...,[hdl.handle.net (1)],[hdl.handle.net/20.500.12724 (1)],[10.26439/ulima (1)],[Repositorio Universidad de Lima - Prepublicac...,[MATCH_PREFIX (1)],[R3_MATCH_PREFIX (1)],[2022 (1)],False,1,1,[https://doi.org/10.26439/ulima.prep.14933],[https://hdl.handle.net/20.500.12724/14933],[10.26439/ulima.prep.14933],1,[Universidad de Lima],1,[Universidad de Lima],1,"[DSpace/ not working, IUScholarWorks, SHAREOK ...",5,[Universidad de Lima],1,[],0,[Universidad de Lima],1
295,"University of Agder, Faculty of Fine Arts",[Society for Artistic Research (1)],[10.22501 (1)],[9794 (1)],[MISSING (1)],[MISSING (1)],[researchcatalogue.net (1)],[researchcatalogue.net/view (1)],[10.22501/uia (1)],[Society for Artistic Research (1)],[MATCH_RULE5_PREFIX_OVERRIDE (1)],[R5D_PREFIX_OVERRIDE (1)],[2024 (1)],False,1,1,[https://doi.org/10.22501/uia.2848768],[https://www.researchcatalogue.net/view/284876...,[10.22501/uia.2848768],1,"[HUB â€” Journal of Research in Art, Design and ...",10,"[HUB â€” Journal of Research in Art, Design and ...",10,"[HUB â€” Journal of Research in Art, Design and ...",10,[],0,[],0,"[University of Agder, Faculty of Fine Arts]",1
296,Unjournal (PubPub),[PubPub (1)],[10.21428 (1)],[9621 (1)],[MISSING (1)],[MISSING (1)],[unjournal.pubpub.org (1)],[unjournal.pubpub.org/pub (1)],[10.21428/d (1)],[PubPub (1)],[MATCH_RULE5_PUBPUB (1)],[R5C_PUBPUB (1)],[2023 (1)],False,1,1,[https://doi.org/10.21428/d28e8e57.b24c1b96],[https://unjournal.pubpub.org/pub/deletemeeval...,[10.21428/d28e8e57.b24c1b96],1,"[80Tahunaghfarid (PubPub), AYS Open Press, Acc...",59,"[80Tahunaghfarid (PubPub), AYS Open Press, Acc...",59,[Unjournal (PubPub)],1,[],0,[],0,"[Diskusi Dosen (PubPub), Dunia Dosen (PubPub),...",4
297,Wisatahalalmaros (PubPub),[PubPub (1)],[10.21428 (1)],[9621 (1)],[MISSING (1)],[MISSING (1)],[wisatahalalmaros.pubpub.org (1)],[wisatahalalmaros.pubpub.org/pub (1)],[10.21428/9d6214b0 (1)],[PubPub (1)],[MATCH_RULE5_PUBPUB (1)],[R5C_PUBPUB (1)],[2022 (1)],False,1,1,[https://doi.org/10.21428/9d6214b0.b4024c75],[https://wisatahalalmaros.pubpub.org/pub/0eg1z...,[10.21428/9d6214b0.b4024c75],1,"[80Tahunaghfarid (PubPub), AYS Open Press, Acc...",59,"[80Tahunaghfarid (PubPub), AYS Open Press, Acc...",59,[Wisatahalalmaros (PubPub)],1,[],0,[],0,[Wisatahalalmaros (PubPub)],1


In [27]:
# Summary grouped by the server names as they were before the corrections.
display(summaries['validated_server_name_old'])

Unnamed: 0,Field_validated_server_name_old,Publishers,Prefixes,Members,institution_name,group_title,primary_domain,primary_domain_extend,doi_prefix_first_token,gold_server_name,validation_status,rule_id,year,Associated with Institution,institution_name_count,group_title_count,Example URLs,Example Primary URLs,Example DOIs,Number of Preprint Works,Server Sharing Prefix,Server Sharing Prefix Count,Server Sharing Member,Server Sharing Member Count,Server Sharing Primary Domain,Server Sharing Primary Domain Count,Server Sharing Group Title's,Server Sharing Group Title's Count,Server Sharing Institution,Server Sharing Institution Count,Server Sharing DOI Prefix and Token,Server Sharing DOI Prefix and Token Count
0,SSRN,[Elsevier BV (443583)],[10.2139 (443583)],[78 (443583)],[MISSING (443583)],[SSRN (443583)],[ssrn.com (443583)],[ssrn.com/abstract (443583)],[10.2139/ssrn (443583)],[SSRN (443583)],[MATCH_STRONG (443583)],[R1_MATCH_STRONG (443583)],"[2024 (177125), 2025 (149595), 2023 (114555), ...",False,1,1,"[https://doi.org/10.2139/ssrn.1601118, https:/...","[https://www.ssrn.com/abstract=1601118, https:...","[10.2139/ssrn.1601118, 10.2139/ssrn.2709350, 1...",443583,[SSRN],1,[SSRN],1,[SSRN],1,[SSRN],1,[],0,[SSRN],1
1,Research Square,[Springer Science and Business Media LLC (2647...,[10.21203 (436841)],"[297 (264738), 8761 (172103)]","[Research Square (415077), MISSING (21764)]","[In Review (436156), Protocol Exchange (683), ...","[researchsquare.com (436158), protocols.io (683)]","[researchsquare.com/article (436158), protocol...","[10.21203/rs (436840), 10.21203/rs- (1)]","[Research Square (415077), In Review (21762), ...","[MATCH_STRONG (421361), MATCH_WEAK (14797), MA...","[R1_MATCH_STRONG (421361), R2_MATCH_WEAK (1479...","[2023 (82465), 2022 (75681), 2021 (75168), 202...",True,2,3,"[https://doi.org/10.21203/rs.1.1/v1, https://d...",[https://www.researchsquare.com/article/rs-2/v...,"[10.21203/rs.1.1/v1, 10.21203/rs.1.2/v1, 10.21...",436841,"[Research Square, protocols.io]",2,"[Research Square, protocols.io]",2,"[Research Square, protocols.io]",2,"[Research Square, protocols.io]",2,[Research Square],1,"[Research Square, protocols.io]",2
2,bioRxiv,[Cold Spring Harbor Laboratory (303620)],[10.1101 (303620)],[246 (303620)],"[bioRxiv (303619), Cold Spring Harbor Laborato...","[Neuroscience (54496), Microbiology (28706), B...",[biorxiv.org (303620)],[biorxiv.org/lookup (303620)],"[10.1101/20 (237028), 10.1101/10 (828), 10.110...","[bioRxiv (303619), Cold Spring Harbor Laborato...","[MATCH_DOMAIN (303619), MATCH_RULE5_DOMAIN_OVE...","[R3_MATCH_DOMAIN (303619), R5E_DOMAIN_OVERRIDE...","[2024 (43585), 2025 (40812), 2023 (39130), 202...",True,2,30,"[https://doi.org/10.1101/000042, https://doi.o...","[http://biorxiv.org/lookup/doi/10.1101/000042,...","[10.1101/000042, 10.1101/000125, 10.1101/00002...",303620,"[bioRxiv, medRxiv]",2,"[bioRxiv, medRxiv]",2,[bioRxiv],1,"[Earth and Space Science Open Archive, EarthAr...",6,[bioRxiv],1,"[bioRxiv, medRxiv]",2
3,Preprints.org,[MDPI AG (110562)],[10.20944 (110562)],[1968 (110562)],[MISSING (110562)],"[Biology and Life Sciences (15001), Medicine a...",[preprints.org (110562)],[preprints.org/manuscript (110562)],[10.20944/preprints (110562)],"[Biology and Life Sciences (15001), Medicine a...",[MATCH_WEAK (110562)],[R2_MATCH_WEAK (110562)],"[2024 (28613), 2025 (26177), 2023 (21790), 202...",False,1,24,[https://doi.org/10.20944/preprints201605.0001...,[http://www.preprints.org/manuscript/201605.00...,"[10.20944/preprints201605.0001.v1, 10.20944/pr...",110562,[Preprints.org],1,"[Encyclopedia, Preprints.org]",2,[Preprints.org],1,"[Cambridge Open Engage, EarthArXiv, EcoEvoRxiv...",7,[],0,[Preprints.org],1
4,Open Science Framework,"[Center for Open Science (103243), California ...","[10.31219 (101368), 10.31227 (649), 10.31234 (...","[15934 (103243), 29705 (110), 242 (56), 33966 ...",[MISSING (103422)],[Open Science Framework (103422)],[osf.io (103422)],"[osf.io/895wn_v1 (2), osf.io/q6jyt_v1 (2), osf...","[10.31219/osf (101368), 10.31227/osf (649), 10...",[Open Science Framework (103422)],[MATCH_RULE5_GROUP_TITLE (103422)],[R5A_OSF_AMS_GROUP_TITLE (103422)],"[2022 (21849), 2023 (20952), 2021 (19909), 202...",False,1,1,"[https://doi.org/10.31219/osf.io/2twgy, https:...","[https://osf.io/2twgy, https://osf.io/w9623, h...","[10.31219/osf.io/2twgy, 10.31219/osf.io/w9623,...",103422,"[AfricArXiv, AgriRxiv, Arabixiv, BITSS, BodoAr...",27,"[AfricArXiv, AgriRxiv, Aquaculture Compendium,...",36,"[AfricArXiv, AgriRxiv, Arabixiv, BITSS, BioHac...",30,[Open Science Framework],1,[],0,"[AfricArXiv, AgriRxiv, Arabixiv, BITSS, BodoAr...",26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300,Universidad de Lima,[Universidad de Lima (1)],[10.26439 (1)],[10978 (1)],[MISSING (1)],[Repositorio Universidad de Lima - Prepublicac...,[hdl.handle.net (1)],[hdl.handle.net/20.500.12724 (1)],[10.26439/ulima (1)],[Repositorio Universidad de Lima - Prepublicac...,[MATCH_PREFIX (1)],[R3_MATCH_PREFIX (1)],[2022 (1)],False,1,1,[https://doi.org/10.26439/ulima.prep.14933],[https://hdl.handle.net/20.500.12724/14933],[10.26439/ulima.prep.14933],1,[Universidad de Lima],1,[Universidad de Lima],1,"[DSpace/ not working, IUScholarWorks, SHAREOK ...",5,[Universidad de Lima],1,[],0,[Universidad de Lima],1
301,"University of Agder, Faculty of Fine Arts",[Society for Artistic Research (1)],[10.22501 (1)],[9794 (1)],[MISSING (1)],[MISSING (1)],[researchcatalogue.net (1)],[researchcatalogue.net/view (1)],[10.22501/uia (1)],[Society for Artistic Research (1)],[MATCH_RULE5_PREFIX_OVERRIDE (1)],[R5D_PREFIX_OVERRIDE (1)],[2024 (1)],False,1,1,[https://doi.org/10.22501/uia.2848768],[https://www.researchcatalogue.net/view/284876...,[10.22501/uia.2848768],1,"[HUB â€” Journal of Research in Art, Design and ...",10,"[HUB â€” Journal of Research in Art, Design and ...",10,"[HUB â€” Journal of Research in Art, Design and ...",10,[],0,[],0,"[University of Agder, Faculty of Fine Arts]",1
302,Unjournal (PubPub),[PubPub (1)],[10.21428 (1)],[9621 (1)],[MISSING (1)],[MISSING (1)],[unjournal.pubpub.org (1)],[unjournal.pubpub.org/pub (1)],[10.21428/d (1)],[PubPub (1)],[MATCH_RULE5_PUBPUB (1)],[R5C_PUBPUB (1)],[2023 (1)],False,1,1,[https://doi.org/10.21428/d28e8e57.b24c1b96],[https://unjournal.pubpub.org/pub/deletemeeval...,[10.21428/d28e8e57.b24c1b96],1,"[80Tahunaghfarid (PubPub), AYS Open Press, Acc...",59,"[80Tahunaghfarid (PubPub), AYS Open Press, Acc...",59,[Unjournal (PubPub)],1,[],0,[],0,"[Diskusi Dosen (PubPub), Dunia Dosen (PubPub),...",4
303,Wisatahalalmaros (PubPub),[PubPub (1)],[10.21428 (1)],[9621 (1)],[MISSING (1)],[MISSING (1)],[wisatahalalmaros.pubpub.org (1)],[wisatahalalmaros.pubpub.org/pub (1)],[10.21428/9d6214b0 (1)],[PubPub (1)],[MATCH_RULE5_PUBPUB (1)],[R5C_PUBPUB (1)],[2022 (1)],False,1,1,[https://doi.org/10.21428/9d6214b0.b4024c75],[https://wisatahalalmaros.pubpub.org/pub/0eg1z...,[10.21428/9d6214b0.b4024c75],1,"[80Tahunaghfarid (PubPub), AYS Open Press, Acc...",59,"[80Tahunaghfarid (PubPub), AYS Open Press, Acc...",59,[Wisatahalalmaros (PubPub)],1,[],0,[],0,[Wisatahalalmaros (PubPub)],1
