In [None]:
import pandas as pd, numpy as np

def detect_origin(r):
    """Classify a player record as "Korea", "Japan" or "Other" by birth country.

    The birth-country fields are inspected in the order KBO, NPB, MLB. The
    first non-missing value whose lower-cased text contains "korea" or
    "japan" decides the result; values that match neither keyword are
    skipped and the next field is tried.

    Parameters
    ----------
    r : mapping-like
        Any object with a ``.get(key)`` method (e.g. a ``pd.Series`` row or a
        dict) that may hold ``kbo_birthCountry``, ``npb_birthCountry`` and
        ``mlb_birthCountry``.

    Returns
    -------
    str
        "Korea", "Japan", or "Other" when no field matches.
    """
    # "korea" is tested before "japan" for each value.
    keyword_to_label = (("korea", "Korea"), ("japan", "Japan"))
    for column in ("kbo_birthCountry", "npb_birthCountry", "mlb_birthCountry"):
        raw = r.get(column)
        if not pd.notna(raw):
            continue
        text = str(raw).lower()
        for keyword, label in keyword_to_label:
            if keyword in text:
                return label
    return "Other"

def choose(r, field):
    """Pick the value of a per-league field, preferring the player's home league.

    The league priority depends on ``r["origin"]``:
    "Korea" -> KBO, NPB, MLB; "Japan" -> NPB, KBO, MLB; "Other" -> MLB, NPB, KBO.

    Parameters
    ----------
    r : mapping-like
        Record supporting ``r["origin"]`` and ``r.get(key)``.
    field : str
        Field suffix, combined with the league prefix (e.g. "height" ->
        "kbo_height").

    Returns
    -------
    The first non-missing value in priority order, or ``np.nan`` if all are
    missing.

    Raises
    ------
    KeyError
        If ``r["origin"]`` is not one of "Korea", "Japan", "Other".
    """
    prefix_priority = {
        "Korea": ("kbo_", "npb_", "mlb_"),
        "Japan": ("npb_", "kbo_", "mlb_"),
        "Other": ("mlb_", "npb_", "kbo_"),
    }
    ordered_prefixes = prefix_priority[r["origin"]]
    candidates = (r.get(prefix + field) for prefix in ordered_prefixes)
    return next((value for value in candidates if pd.notna(value)), np.nan)

def first_non_null(s):
    """Return the first non-missing element of a Series, or ``np.nan`` if none.

    Parameters
    ----------
    s : pd.Series
        Values to scan in their existing order.

    Returns
    -------
    The first element remaining after ``dropna()``, or ``np.nan`` when the
    Series is empty or entirely missing.
    """
    present = s.dropna()
    if present.empty:
        return np.nan
    return present.iloc[0]


In [None]:
# Per-league column groups, always in the order MLB, NPB, KBO.
_league_prefixes = ("mlb", "npb", "kbo")
weights = [prefix + "_weight" for prefix in _league_prefixes]
ht = [prefix + "_height" for prefix in _league_prefixes]
bats = [prefix + "_bats" for prefix in _league_prefixes]
throws = [prefix + "_throws" for prefix in _league_prefixes]

df = pd.read_csv("mnk_people.csv")
# A weight of 0 (numeric, or the string "0.0") is treated as missing.
df[weights] = df[weights].replace({0: np.nan, "0.0": np.nan})

# Aggregation spec used to collapse all rows sharing an mnkID into one row.
# Key order is kept: league IDs, birth/name fields, then per-league attributes.
agg = dict.fromkeys(["mlb_playerID", "npb_playerID", "kbo_playerID"], first_non_null)
agg.update(dict.fromkeys(
    ["birthYear", "birthMonth", "birthDay", "nameLast", "nameFirst"], "first"))
agg.update(dict.fromkeys(
    weights + ht + bats + throws +
    ["mlb_birthCountry", "npb_birthCountry", "kbo_birthCountry"],
    first_non_null))

df = df.groupby("mnkID", as_index=False).agg(agg)

# Players that have an ID in more than one league.
has_league_id = df[["mlb_playerID", "npb_playerID", "kbo_playerID"]].notna()
league_cnt = has_league_id.sum(axis=1)
multi = league_cnt > 1

# Representative weight per player: first column of the row-wise filled frame.
filled_weights = df[weights].bfill(axis=1).ffill(axis=1)
rep = filled_weights.iloc[:, 0]
# For multi-league players only, fill each missing league weight with it.
for weight_col in weights:
    needs_fill = multi & df[weight_col].isna()
    df.loc[needs_fill, weight_col] = rep[needs_fill]

# Collapse per-league height/bats/throws into single columns, preferring the
# league that matches the player's detected origin.
df["origin"] = df.apply(detect_origin, axis=1)
for attribute in ("height", "bats", "throws"):
    df[attribute] = df.apply(choose, axis=1, args=(attribute,))

# "First Last", tolerating a missing first or last name.
first_names = df["nameFirst"].fillna("").str.strip()
last_names = df["nameLast"].fillna("").str.strip()
df["nameFull"] = (first_names + " " + last_names).str.strip()

output_columns = [
    "playerID", "mlb_playerID", "npb_playerID", "kbo_playerID",
    "birthYear", "birthMonth", "birthDay",
    "nameLast", "nameFirst", "nameFull",
    "mlb_weight", "npb_weight", "kbo_weight",
    "height", "bats", "throws"
]
renamed = df.rename(columns={"mnkID": "playerID"})
out = renamed.loc[:, output_columns].sort_values("playerID")

out.to_csv("mnk_people2.csv", na_rep="", index=False)


In [6]:
# Batting
# Load the people table and build `lookup`, a dict mapping every
# league-specific player ID (MLB, NPB or KBO) to the unified player ID.
people = pd.read_csv("mnk_people.csv")
batting = pd.read_csv("mnk_batting.csv")

# League ID columns, in the order they are consulted when resolving a row.
leagues = ["mlb_playerID", "npb_playerID", "kbo_playerID"]

# The people-consolidation cell groups "mnk_people.csv" by "mnkID" and only
# renames that column to "playerID" in its output file. Accept either column
# name so this cell does not depend on which version of the file is on disk
# (previously a file without "playerID" raised KeyError here).
id_col = "playerID" if "playerID" in people.columns else "mnkID"

lookup = {}
# Row-major iteration keeps the original overwrite order when a league ID
# appears more than once: later rows win, and within a row KBO > NPB > MLB.
for unified_id, *league_ids in people[[id_col] + leagues].itertuples(index=False, name=None):
    for league_id in league_ids:
        if pd.notna(league_id):
            lookup[league_id] = unified_id

  batting = pd.read_csv("mnk_batting.csv")


In [None]:
def resolve_pid(row):
    """Return the unified player ID for a stats row.

    If the row already has a non-missing, non-empty ``playerID`` it is
    returned unchanged. Otherwise each column named in the module-level
    ``leagues`` list is tried in order, and the first league ID found in the
    module-level ``lookup`` dict is translated to its unified ID.

    Parameters
    ----------
    row : mapping-like
        Record supporting ``row["playerID"]`` and ``row.get(key)``.

    Returns
    -------
    The resolved player ID, or ``np.nan`` when nothing matches.
    """
    current = row["playerID"]
    if pd.notna(current) and current != "":
        return current

    league_keys = (row.get(column) for column in leagues)
    return next(
        (lookup[key] for key in league_keys if pd.notna(key) and key in lookup),
        np.nan,
    )

# Resolve the unified player ID for every batting row, then save the table.
resolved_batting_ids = batting.apply(resolve_pid, axis=1)
batting["playerID"] = resolved_batting_ids

batting.to_csv("mnk_batting2.csv", na_rep="", index=False)

In [None]:
# Pitching
# Load the pitching table, resolve the unified player ID per row, and save it.
pitching = pd.read_csv("mnk_pitching.csv")
resolved_pitching_ids = pitching.apply(resolve_pid, axis=1)
pitching["playerID"] = resolved_pitching_ids
pitching.to_csv("mnk_pitching2.csv", na_rep="", index=False)

  pitching = pd.read_csv("mnk_pitching.csv")


In [11]:
# Fielding
# Load the fielding table, resolve the unified player ID per row, and save it.
fielding = pd.read_csv("mnk_fielding.csv")
resolved_fielding_ids = fielding.apply(resolve_pid, axis=1)
fielding["playerID"] = resolved_fielding_ids
fielding.to_csv("mnk_fielding2.csv", na_rep="", index=False)

  fielding = pd.read_csv("mnk_fielding.csv")
