In [1]:
import re
import unicodedata

import pandas as pd

def norm_name(s):
    """Reduce a name to a lowercase ASCII alphanumeric token for matching.

    Parameters
    ----------
    s : object
        The raw name. Anything that is not missing is passed through ``str``.

    Returns
    -------
    str
        The name lower-cased with everything outside ``[a-z0-9]`` removed.
        Missing values (``None``, ``NaN``, ``pd.NA``) yield ``""`` rather
        than the literal text ``"nan"``, so callers can detect an absent
        name by testing for the empty string.
    """
    if s is None or (pd.api.types.is_scalar(s) and pd.isna(s)):
        return ""
    # Decompose accented / full-width characters and drop the combining marks
    # so that e.g. "José" and "Jose" normalise to the same token instead of
    # the accented letter being deleted outright.
    decomposed = unicodedata.normalize("NFKD", str(s))
    folded = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    return re.sub(r"[^a-z0-9]", "", folded.lower())

In [2]:
def canon_id(x):
    """Render an ID as stripped text, dropping one trailing ".0".

    Example: 104799.0 → "104799". Values that do not end in ".0" are
    returned as their stripped string form unchanged.
    """
    text = str(x).strip()
    if text.endswith(".0"):
        # Remove exactly the two trailing characters ".0".
        return text[:-2]
    return text

In [3]:
def person_key(row):
    """Build the matching key for one people record.

    The key is the birth year, month and day followed by the normalised
    first and last name, joined with "-". If any of those five components
    is an empty string, the record's canonicalised ``playerID`` is appended
    as a sixth component.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``birthYear``, ``birthMonth``, ``birthDay``,
        ``nameFirst`` and ``nameLast``; ``playerID`` is read only when a
        component is empty.

    Returns
    -------
    str
    """
    birth = [
        "" if pd.isna(row[col]) else str(row[col])
        for col in ("birthYear", "birthMonth", "birthDay")
    ]
    names = [norm_name(row["nameFirst"]), norm_name(row["nameLast"])]
    fields = birth + names

    key = "-".join(fields)
    if "" in fields:
        # Incomplete identity: fall back to the source's own ID as a suffix.
        key = key + "-" + canon_id(row["playerID"])
    return key

In [4]:
def prep(path, tag):
    """Load one league's people CSV and reduce it to merge-ready columns.

    Parameters
    ----------
    path : str or path-like
        CSV file to read. It must contain ``playerID``, ``birthYear``,
        ``birthMonth``, ``birthDay`` and every column listed in ``base``.
    tag : str
        Prefix for the returned columns (``f"{tag}_{column}"``).

    Returns
    -------
    pandas.DataFrame
        ``person_key`` followed by the ``base`` columns renamed with the
        tag prefix.
    """
    df = pd.read_csv(path, dtype=str)
    # Everything is read as text; birth fields become nullable integers and
    # unparseable values are coerced to missing.
    for c in ["birthYear", "birthMonth", "birthDay"]:
        df[c] = pd.to_numeric(df[c], errors="coerce").astype("Int64")
    # na_action="ignore" keeps a missing ID missing. Without it canon_id
    # receives NaN and returns the string "nan", which later counts as a
    # non-null ID when rows are filtered on the *_playerID columns.
    df["playerID"] = df["playerID"].map(canon_id, na_action="ignore")
    df["person_key"] = df.apply(person_key, axis=1)

    base = ["playerID","birthCountry","weight","height","bats","throws", "nameFirst","nameLast"]
    ren = {c: f"{tag}_{c}" for c in base}
    # NOTE(review): only the playerID column lower-cases the tag; the other
    # columns use it verbatim. Identical for lower-case tags — confirm intent.
    ren["playerID"] = f"{tag.lower()}_playerID"
    return df[["person_key"] + base].rename(columns=ren)

In [5]:
# Load each league's people file into a frame keyed by person_key.
mlb = prep("mlb_people.csv", "mlb")
npb = prep("npb_people.csv", "npb")
kbo = prep("kbo_people.csv", "kbo")

# Outer-join the three frames on person_key so that a person present in any
# league is kept, with the other leagues' columns left missing.
# NOTE(review): nothing here checks that person_key is unique within a
# league; duplicate keys would multiply rows in these joins — confirm.
merged = pd.merge(mlb, npb, on="person_key", how="outer")
merged = pd.merge(merged, kbo, on="person_key", how="outer")

In [6]:
def take_first(*candidates):
    """Return the first usable candidate, title-cased; "" if none qualifies.

    A candidate is skipped when it is missing (``None``/``NaN``/``pd.NA``)
    or is the empty string.
    """
    for value in candidates:
        if pd.isna(value) or value == "":
            continue
        return value.title()
    return ""
# Display names: for each row take the first non-missing, non-empty value in
# MLB → NPB → KBO order (title-cased by take_first).
merged["nameFirst"] = merged.apply(
    lambda row: take_first(
        row.get("mlb_nameFirst"),
        row.get("npb_nameFirst"),
        row.get("kbo_nameFirst"),
    ),
    axis=1,
)
merged["nameLast"] = merged.apply(
    lambda row: take_first(
        row.get("mlb_nameLast"),
        row.get("npb_nameLast"),
        row.get("kbo_nameLast"),
    ),
    axis=1,
)

# Birth date: recovered from the first three "-"-separated pieces of
# person_key; pieces that are not numeric become missing.
parts = merged["person_key"].str.split("-", expand=True)
merged = merged.assign(
    **{
        column: pd.to_numeric(parts[position], errors="coerce").astype("Int64")
        for position, column in enumerate(["birthYear", "birthMonth", "birthDay"])
    }
)

In [7]:
# Final column layout: league IDs first, then the shared identity columns,
# then every remaining column in its existing order. person_key is left out
# of the result.
lead = ["mlb_playerID","kbo_playerID","npb_playerID"]
core = ["birthYear","birthMonth","birthDay","nameFirst","nameLast"]
others = [
    c for c in merged.columns
    if c != "person_key" and c not in lead and c not in core
]
merged = merged.loc[:, lead + core + others]

In [8]:
# Write the full merged table.
merged.to_csv("mnk_people5.csv", index=False)

# Rows with at least two non-missing league player IDs, i.e. people found in
# two or more leagues. Computed once and reused for the file and the count.
multi_league = merged.dropna(subset=lead, thresh=2)
multi_league.to_csv("merged_player_records3=5.csv", index=False)

print("merged rows :", len(merged))
print("≥2-league :", len(multi_league))

merged rows : 30532
≥2-league : 1492
