In [None]:
import pandas as pd
import numpy as np

# Counting stats expected in every league's raw batting file.
BASE_STATS = [
    "G", "AB", "R", "H", "2B", "3B", "HR",
    "RBI", "SB", "CS", "BB", "SO", "GIDP",
]
# Stats computed by harmonize() from the counting stats.
DERIVED_STATS = [
    "PA", "TB",
    "AVG", "SLG", "OBP", "OPS",
    "BB_pct", "K_pct",
]
# Output column order for the stat columns: counting stats, then derived.
STATS = [*BASE_STATS, *DERIVED_STATS]
# Columns needed only as inputs to the derived stats; not part of STATS.
HELPER = {"HBP", "SH", "SF"}
# League code -> raw batting CSV to read.
SOURCE_FILES = {
    "MLB": "mlb_batting.csv",
    "KBO": "kbo_batting.csv",
    "NPB": "npb_batting.csv",
}

def div(n, d):
    """Divide ``n`` by ``d`` element-wise, giving NaN where ``d`` is zero.

    Args:
        n: Numerator; a scalar or array-like (list, NumPy array, pandas
            Series). Values are converted to float.
        d: Denominator; same accepted types as ``n``. The two inputs are
            matched positionally and broadcast against each other.

    Returns:
        A float ``numpy.ndarray`` (0-d for scalar inputs) holding ``n / d``
        where ``d`` is non-zero and NaN elsewhere.
    """
    num = np.asarray(n, dtype=float)
    den = np.asarray(d, dtype=float)
    # Pre-fill with NaN so positions skipped by the masked divide stay NaN.
    out = np.full(np.broadcast(num, den).shape, np.nan)
    # Divide only where the denominator is non-zero: zero denominators then
    # neither raise ZeroDivisionError (plain Python scalars) nor emit
    # divide-by-zero RuntimeWarnings (NumPy arrays).
    np.divide(num, den, out=out, where=den != 0)
    return out

def keep_id_cols(cols):
    """Return the identifier-like column names from ``cols``, in input order.

    A name is kept when it ends with "id" (case-insensitive) or is one of
    the fixed names "playerID", "yearID", "lgID", "teamID", "stint".
    """
    always_keep = ("playerID", "yearID", "lgID", "teamID", "stint")
    selected = []
    for name in cols:
        if name.lower().endswith("id") or name in always_keep:
            selected.append(name)
    return selected

def harmonize(df):
    """Return a copy of ``df`` reduced to identifier columns plus STATS.

    Counting columns listed in BASE_STATS and the helper columns in HELPER
    are created with value 0 when absent. The derived columns (PA, TB, AVG,
    SLG, OBP, OPS, BB_pct, K_pct) are then computed and any existing columns
    of those names are overwritten. Rate stats are rounded to 3 decimals and
    are NaN where their denominator is zero.

    Args:
        df: Raw batting DataFrame; it is not modified.

    Returns:
        A DataFrame whose columns are those selected by ``keep_id_cols``
        followed by STATS, without duplicates.
    """
    df = df.copy()

    # An absent counting column is treated as all zeros.
    for col in BASE_STATS:
        if col not in df.columns:
            df[col] = 0

    # Helper columns only feed the sums below and are dropped from the
    # output. A blank cell is treated as 0, the same as an absent column,
    # so one missing HBP/SH/SF value no longer turns PA, OBP, BB_pct and
    # K_pct into NaN for that row.
    for col in sorted(HELPER):
        df[col] = df[col].fillna(0) if col in df.columns else 0

    df["PA"] = df["AB"] + df["BB"] + df["HBP"] + df["SH"] + df["SF"]
    singles = df["H"] - df["2B"] - df["3B"] - df["HR"]
    df["TB"] = singles + 2 * df["2B"] + 3 * df["3B"] + 4 * df["HR"]

    df["AVG"] = div(df["H"], df["AB"])
    df["SLG"] = div(df["TB"], df["AB"])
    df["OBP"] = div(
        df["H"] + df["BB"] + df["HBP"],
        df["AB"] + df["BB"] + df["HBP"] + df["SF"],
    )
    df["OPS"] = df["OBP"] + df["SLG"]
    df["BB_pct"] = div(df["BB"], df["PA"])
    df["K_pct"] = div(df["SO"], df["PA"])

    rate_stats = ["AVG", "SLG", "OBP", "OPS", "BB_pct", "K_pct"]
    df[rate_stats] = df[rate_stats].round(3)

    # dict.fromkeys de-duplicates while preserving first-seen order, in case
    # an identifier-like column name also appears in STATS.
    id_cols = keep_id_cols(df.columns)
    final_cols = list(dict.fromkeys(id_cols + STATS))
    return df[final_cols]

In [None]:
# Harmonize each league's raw batting file and write the result to
# "<league>_batting2.csv" (league code lower-cased, e.g. "mlb_batting2.csv").
for lg, path in SOURCE_FILES.items():
    raw = pd.read_csv(path)
    tidy = harmonize(raw)
    # na_rep="" writes NaN values (e.g. undefined rate stats) as empty cells.
    tidy.to_csv(f"{lg.lower()}_batting2.csv", index=False, na_rep="")
