In [None]:
import pandas as pd
from difflib import SequenceMatcher
from itertools import combinations

def normalize_name(name):
    """Reduce a name to a lowercase, alphanumeric-only comparison key.

    Every character for which ``str.isalnum`` is false (spaces, hyphens,
    apostrophes, periods, ...) is dropped and the remainder is lowercased,
    so ``"O'Brien Jr."`` becomes ``"obrienjr"``.

    Anything that is not a ``str`` -- for example ``None`` or the float
    ``NaN`` that pandas produces for an empty CSV cell -- yields ``""``
    instead of raising ``TypeError``.
    """
    if not isinstance(name, str):
        return ""
    return "".join(ch.lower() for ch in name if ch.isalnum())


def sim(a, b):
    """Return the ``SequenceMatcher`` similarity ratio (0.0 to 1.0) of two names.

    Both names are passed through ``normalize_name`` before comparison.
    When either one normalizes to the empty string (missing value or no
    alphanumeric characters) the result is 0.0: ``SequenceMatcher`` scores
    two empty strings as a perfect 1.0, which would make every pair of
    blank names look identical.
    """
    key_a, key_b = normalize_name(a), normalize_name(b)
    if not key_a or not key_b:
        return 0.0
    return SequenceMatcher(None, key_a, key_b).ratio()


In [None]:
def compare_leagues(df_a, df_b, tag_a, tag_b, thresh):
    """Pair players from two leagues who share a birth date and a similar name.

    The two frames are inner-joined on ``birthYear`` / ``birthMonth`` /
    ``birthDay``; every other column present in both frames receives the
    suffix ``_{tag_a}`` or ``_{tag_b}``.  Each joined pair is scored with
    ``sim`` on first name and on last name, and a pair is kept when EITHER
    score reaches ``thresh``.

    Parameters
    ----------
    df_a, df_b : pandas.DataFrame
        Player tables.  The columns ``playerID``, ``nameFirst``,
        ``nameLast``, ``birthYear``, ``birthMonth`` and ``birthDay`` are
        read from both.
    tag_a, tag_b : str
        Labels used for the column suffixes and for ``pair_type``.
    thresh : float
        Minimum ``sim`` score (inclusive) on first or last name.

    Returns
    -------
    pandas.DataFrame
        One row per retained pair: the suffixed ID and name columns of both
        sides, the shared birth-date columns, ``first_sim``, ``last_sim``
        and ``pair_type`` (``"{tag_a}-{tag_b}"``).  Empty when nothing
        matches.
    """
    birth_keys = ["birthYear", "birthMonth", "birthDay"]

    # pandas.merge matches null keys against each other, so players whose
    # birth date is unknown in both leagues would all be paired together.
    # Rows with any missing birth component are therefore left out.
    merged = df_a.dropna(subset=birth_keys).merge(
        df_b.dropna(subset=birth_keys),
        on=birth_keys,
        how="inner",
        suffixes=(f"_{tag_a}", f"_{tag_b}"),
    )

    def pairwise_sim(field):
        # A plain comprehension instead of apply(axis=1): it also works when
        # the join is empty (row-wise apply on an empty frame can return a
        # DataFrame, which cannot be assigned to a single column) and avoids
        # building a Series object for every row.
        left = merged[f"{field}_{tag_a}"]
        right = merged[f"{field}_{tag_b}"]
        scores = [sim(x, y) for x, y in zip(left, right)]
        return pd.Series(scores, index=merged.index, dtype=float)

    merged["first_sim"] = pairwise_sim("nameFirst")
    merged["last_sim"] = pairwise_sim("nameLast")

    # Either name clearing the threshold is enough to keep the pair.
    mask = (merged["first_sim"] >= thresh) | (merged["last_sim"] >= thresh)
    candidates = merged.loc[mask].copy()
    candidates["pair_type"] = f"{tag_a}-{tag_b}"

    columns = [
        f"playerID_{tag_a}", f"nameFirst_{tag_a}", f"nameLast_{tag_a}",
        f"playerID_{tag_b}", f"nameFirst_{tag_b}", f"nameLast_{tag_b}",
        "birthYear", "birthMonth", "birthDay",
        "first_sim", "last_sim", "pair_type"
    ]
    return candidates[columns]


In [None]:
# Read each league's people table from its CSV (paths are relative to the
# notebook's working directory).
mlb, npb, kbo = (
    pd.read_csv(csv_path)
    for csv_path in ("mlb_people.csv", "npb_people.csv", "kbo_people.csv")
)

# League tag -> people table, in the order MLB, NPB, KBO.
datasets = dict(MLB=mlb, NPB=npb, KBO=kbo)

In [None]:
# Minimum first- or last-name similarity for a same-birthday pair to be kept.
# NOTE(review): 0.40 is a loose cutoff -- presumably the output is meant for
# manual review rather than automatic linking; confirm.
NAME_SIM_THRESHOLD = 0.40

# `datasets` (league tag -> people table) comes from the loading cell above;
# it is not rebuilt here.  Every unordered pair of leagues is compared once.
all_pairs = []
for (tag_a, df_a), (tag_b, df_b) in combinations(datasets.items(), 2):
    all_pairs.append(
        compare_leagues(df_a, df_b, tag_a, tag_b, thresh=NAME_SIM_THRESHOLD)
    )

# Each comparison returns its own tag-suffixed ID/name columns, so the
# concatenated frame holds the union of them, with NaN in the columns of the
# league that is not part of a given pair (see `pair_type`).
candidates = pd.concat(all_pairs, ignore_index=True)

candidates.to_csv("3league_name_candidates.csv", index=False)