### ============================================================
### AutoGluon Tabular 기반 end_x/end_y 회귀 파이프라인 (수정본)
1) GroupKFold(game_id) OOF 평가(유클리드)
2) object 결측치 처리 일관화 (학습/OOF/importance/최종/테스트)
3) 임시 predictor 폴더 try/finally 정리
4) (누수 완화) feature importance pruning을 CV-train에서만 누적
5) branching OOF: valid subset만 예측 + 안전장치
6) OOF 결과 디스크 캐시 (후보 비교 반복 학습 낭비 제거)
7) k_prev 탐색은 cheap preset(빠르게), 본 탐색은 good, 최종은 best
### ============================================================

In [1]:
# !pip install numpy pandas tqdm catboost scikit-learn
# !pip install -U autogluon
# !pip install -U torch torchvision --index-url https://download.pytorch.org/whl/cu130

import os, json, shutil, hashlib, pickle
import numpy as np
import pandas as pd

from sklearn.model_selection import GroupKFold
from autogluon.tabular import TabularPredictor

In [2]:
# -------------------------
# GPU check (environment sanity check)
# -------------------------
# Restrict CUDA-aware libraries loaded after this point to device index 0.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

def get_num_gpus():
    """
    Auto-detect whether a CUDA GPU can be used.

    Returns:
        1 when ``torch`` can be imported and reports CUDA as available,
        0 otherwise (including when importing or querying torch fails).
    """
    try:
        import torch
        cuda_ok = torch.cuda.is_available()
    except Exception:
        # Best-effort probe: any failure simply means "no GPU".
        return 0
    return 1 if cuda_ok else 0

# Detected GPU count (0 or 1), printed for a quick environment check.
NUM_GPUS = get_num_gpus()
print("NUM_GPUS =", NUM_GPUS)

# NOTE(review): this unconditionally replaces the detected value above, so every
# later `num_gpus=NUM_GPUS` default uses 1 even when detection returned 0.
# Presumably a deliberate manual override — confirm, or remove it to honor detection.
NUM_GPUS = 1


In [3]:
# -------------------------
# Directory layout
# -------------------------
ART_DIR = "artifacts"        # parquet files produced by the preprocessing step
MODEL_DIR = "models_ag"      # final predictors are stored here
TMP_DIR = "ag_tmp"           # temporary per-fold predictors
OOF_CACHE_DIR = "oof_cache"  # pickled OOF results

# Make sure every output folder exists before anything is written to it.
for _out_dir in (MODEL_DIR, TMP_DIR, OOF_CACHE_DIR):
    os.makedirs(_out_dir, exist_ok=True)

# -------------------------
# Pitch constants (used for coordinate clipping / post-processing)
# -------------------------
PITCH_X = 105.0
PITCH_Y = 68.0
GOAL_Y = 34.0

In [4]:
# -------------------------
# Global option: parallel/sequential
#   - "parallel_local": run AutoGluon's internal bagging/stacking folds in parallel
#   - "sequential_local": favor stability
#   - None: use the AutoGluon default
# -------------------------
FOLD_FITTING_STRATEGY = "sequential_local"  # switch to "parallel_local" (or None) if needed

### Utils: 평가/클리핑/데이터 로딩/전처리

In [5]:
def euclidean_mean_distance(y_true_xy: np.ndarray, y_pred_xy: np.ndarray) -> float:
    """
    Mean Euclidean distance between true and predicted (end_x, end_y) points.

    Column 0 is treated as x and column 1 as y. This is the final evaluation
    metric used for OOF comparison, model selection and post-process tuning.
    """
    delta = y_true_xy - y_pred_xy
    dx, dy = delta[:, 0], delta[:, 1]
    per_row = np.sqrt(dx * dx + dy * dy)
    return float(per_row.mean())

In [6]:
def clip_xy(px, py):
    """
    Clamp predicted coordinates to the pitch.

    - x is limited to [0, PITCH_X], y to [0, PITCH_Y]
    - applied to both OOF and test predictions to rule out off-pitch values

    Returns the pair (clipped_x, clipped_y).
    """
    clipped_x = np.clip(px, 0, PITCH_X)
    clipped_y = np.clip(py, 0, PITCH_Y)
    return clipped_x, clipped_y

In [7]:
def fill_object_missing(df: pd.DataFrame, cols=None, fill_value="MISSING") -> pd.DataFrame:
    """
    Return a copy of ``df`` whose object-dtype columns have NaN replaced.

    Args:
        df: input frame (never modified; a copy is returned).
        cols: columns to consider; names absent from ``df`` are ignored.
            ``None`` means every column of ``df``.
        fill_value: replacement for missing values in object columns.

    Only columns whose dtype is ``object`` are touched. The same helper is
    meant to be used for OOF, importance, final training and test data so
    that all of them are preprocessed identically.
    """
    out = df.copy()
    candidates = out.columns if cols is None else cols
    object_cols = [
        name for name in candidates
        if name in out.columns and out[name].dtype == "object"
    ]
    for name in object_cols:
        out[name] = out[name].fillna(fill_value)
    return out

In [8]:
def load_pack(k_prev: int):
    """
    Load the train/test features and train labels saved for a given k_prev.

    Reads, from ART_DIR:
      features_train_k{k}.parquet, labels_train_k{k}.parquet, features_test_k{k}.parquet

    Returns:
        (X_train, y_train, X_test) as DataFrames, in that order.
    """
    def _read(file_name):
        return pd.read_parquet(os.path.join(ART_DIR, file_name))

    return (
        _read(f"features_train_k{k_prev}.parquet"),
        _read(f"labels_train_k{k_prev}.parquet"),
        _read(f"features_test_k{k_prev}.parquet"),
    )

In [9]:
def prepare_data(X_train, y_train):
    """
    Build the training table plus everything needed for grouped CV.

    AutoGluon's fit(train_data=...) expects the label column inside the frame,
    so the merged frame is the central object.

    Returns:
        data: features inner-joined with labels on "game_episode"
        yx, yy: end_x / end_y label arrays (kept for reference/evaluation)
        groups: game_id values, so episodes of one game never span folds
        feat_cols: every column of ``data`` except the key and the two labels

    NOTE(review): "game_id" is not excluded, so it is part of feat_cols —
    confirm that using it as a model input is intended.
    """
    data = pd.merge(X_train, y_train, on="game_episode", how="inner")

    non_features = ("game_episode", "end_x", "end_y")
    feat_cols = [col for col in data.columns if col not in non_features]

    groups = data["game_id"].values
    yx, yy = data["end_x"].values, data["end_y"].values

    return data, yx, yy, groups, feat_cols

### 0-1) OOF 캐시 유틸 (후보 비교 반복 학습 제거)

In [10]:
def make_oof_key(k_prev, feat_cols, x_metric, y_metric, n_splits, tag="base"):
    """
    Build the file-name key for an OOF cache entry.

    - the (possibly long, order-sensitive) feature list is shortened to the
      first 10 hex characters of its md5 digest
    - ``tag`` separates purposes (e.g. 'cand', 'ksearch')

    NOTE: presets and time limits are not part of the key.
    """
    joined = "|".join(feat_cols)
    digest = hashlib.md5(joined.encode("utf-8")).hexdigest()
    short = digest[:10]
    return f"{tag}_k{k_prev}_f{short}_{x_metric}_{y_metric}_cv{n_splits}"

In [11]:
def load_oof_cache(key):
    """
    Return the cached object for ``key`` (normally ``(score, oof_pred)``).

    Returns None when there is no cache file, or when the file exists but
    cannot be unpickled (e.g. it was truncated by an interrupted run). In the
    latter case a message is printed and the caller simply recomputes.

    NOTE: pickle can execute arbitrary code on load; only use this with cache
    files written by this pipeline.
    """
    path = os.path.join(OOF_CACHE_DIR, f"{key}.pkl")
    try:
        with open(path, "rb") as f:
            return pickle.load(f)
    except FileNotFoundError:
        return None
    except (EOFError, pickle.UnpicklingError) as err:
        # A broken cache entry must not block the pipeline: treat it as a miss.
        print(f"[CACHE CORRUPT] {path} ({err!r}) -> ignoring cache")
        return None

In [12]:
def save_oof_cache(key, obj):
    """
    Persist ``obj`` (normally ``(score, oof_pred)``) under ``key``.

    The pickle is written to a temporary file first and then moved into place
    with os.replace, so an interrupted run never leaves a half-written
    ``{key}.pkl`` behind for a later load to trip over.
    """
    os.makedirs(OOF_CACHE_DIR, exist_ok=True)
    path = os.path.join(OOF_CACHE_DIR, f"{key}.pkl")
    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, "wb") as f:
            pickle.dump(obj, f)
        os.replace(tmp_path, path)
    finally:
        # Only present here if dumping failed before the rename.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

### 1) AutoGluon helper: fold 1개 학습 / OOF 생성 / OOF 점수 계산

In [13]:
def ag_fit_one_fold(
    train_df: pd.DataFrame,
    label: str,
    eval_metric="rmse",
    presets="good_quality",
    time_limit=None,
    path=None,
    num_gpus=NUM_GPUS,
    fold_fitting_strategy=None,   # "parallel_local" / "sequential_local" / None
    extra_fit_kwargs=None,        # optional dict of additional fit() kwargs
):
    """
    Train one AutoGluon TabularPredictor (regression) on a single training frame.

    Args:
        train_df: frame holding the features and the label column.
        label: target column name ("end_x" or "end_y").
        eval_metric: metric handed to TabularPredictor (e.g. rmse / mae).
        presets: AutoGluon presets name (quality/time trade-off).
        time_limit: total seconds allowed for this fit; None means no limit.
        path: predictor output folder; an existing folder is deleted first.
        num_gpus: forwarded as ``ag_args_fit={"num_gpus": ...}``.
        fold_fitting_strategy: when not None, forwarded inside ``ag_args_ensemble``.
        extra_fit_kwargs: merged last, so it can override any of the above.

    Returns:
        The fitted predictor.
    """
    # Always start from a clean output folder.
    if path is not None and os.path.exists(path):
        shutil.rmtree(path)

    fit_kwargs = {
        "train_data": train_df,
        "presets": presets,
        "time_limit": time_limit,
        "ag_args_fit": {"num_gpus": int(num_gpus)},
    }
    if fold_fitting_strategy is not None:
        fit_kwargs["ag_args_ensemble"] = {"fold_fitting_strategy": fold_fitting_strategy}
    if extra_fit_kwargs:
        fit_kwargs.update(extra_fit_kwargs)

    predictor = TabularPredictor(
        label=label,
        problem_type="regression",
        eval_metric=eval_metric,
        path=path,
    )
    return predictor.fit(**fit_kwargs)

In [14]:
def ag_oof_predict(
    df_all: pd.DataFrame,
    label: str,
    feat_cols: list,
    groups: np.ndarray,
    n_splits=5,
    eval_metric="rmse",
    presets="good_quality",
    time_limit=None,
    save_root=TMP_DIR,
    num_gpus=NUM_GPUS,
    fold_fitting_strategy=None,
):
    """
    Produce out-of-fold predictions for one label with GroupKFold.

    For every fold a predictor is trained on the training rows only and then
    used to predict the validation rows; those predictions fill the matching
    positions of the result.

    Returns:
        oof: float array of length len(df_all) with one prediction per row.
    """
    splitter = GroupKFold(n_splits=n_splits)
    oof = np.zeros(len(df_all), dtype=float)
    train_cols = feat_cols + [label]

    # split() is given the label as y; the fold assignment is driven by `groups`.
    fold_iter = splitter.split(df_all[feat_cols], df_all[label], groups=groups)

    for fold, (tr_idx, va_idx) in enumerate(fold_iter):
        # Training frame: features + label. Validation frame: features only.
        # Both get the same object-NaN handling.
        tr = fill_object_missing(df_all.iloc[tr_idx][train_cols].copy(), cols=feat_cols)
        va = fill_object_missing(df_all.iloc[va_idx][feat_cols].copy(), cols=feat_cols)

        # Temporary folder for this fold's predictor.
        path = os.path.join(save_root, f"{label}_fold{fold}")

        try:
            predictor = ag_fit_one_fold(
                train_df=tr,
                label=label,
                eval_metric=eval_metric,
                presets=presets,
                time_limit=time_limit,
                path=path,
                num_gpus=num_gpus,
                fold_fitting_strategy=fold_fitting_strategy,
            )
            fold_pred = predictor.predict(va)
            oof[va_idx] = fold_pred.values
        finally:
            # Remove the folder even when training or prediction failed.
            if os.path.exists(path):
                shutil.rmtree(path)

    return oof

In [15]:
def ag_cv_score_xy(
    data_merged: pd.DataFrame,
    feat_cols: list,
    groups: np.ndarray,
    x_metric="rmse",
    y_metric="rmse",
    n_splits=5,
    presets="good_quality",
    time_limit=None,
    num_gpus=NUM_GPUS,
    fold_fitting_strategy=None,
):
    """
    Build OOF predictions for end_x and end_y and score them jointly.

    Returns:
        score: mean Euclidean distance between the clipped OOF points and the labels
        oof_xy: array of shape (N, 2); column 0 = end_x OOF, column 1 = end_y OOF
    """
    # end_x first, then end_y; each target uses its own AutoGluon metric.
    oof_by_target = {}
    for target, metric in (("end_x", x_metric), ("end_y", y_metric)):
        oof_by_target[target] = ag_oof_predict(
            df_all=data_merged, label=target, feat_cols=feat_cols, groups=groups,
            n_splits=n_splits, eval_metric=metric, presets=presets,
            time_limit=time_limit, save_root=TMP_DIR, num_gpus=num_gpus,
            fold_fitting_strategy=fold_fitting_strategy,
        )

    # Predictions outside the pitch are clipped before evaluation.
    px, py = clip_xy(oof_by_target["end_x"], oof_by_target["end_y"])
    oof_xy = np.column_stack([px, py])

    # The evaluation criterion is always the Euclidean distance.
    y_true = data_merged[["end_x", "end_y"]].values
    score = euclidean_mean_distance(y_true, oof_xy)

    return score, oof_xy

### 2) pruning: feature importance top-N (누수 완화: CV-train에서만 importance 누적)

In [16]:
def ag_prune_features_cv(
    data_merged: pd.DataFrame,
    feat_cols: list,
    groups: np.ndarray,
    n_splits=5,
    top_n=200,
    eval_metric="rmse",
    presets="good_quality",
    time_limit=None,
    num_gpus=NUM_GPUS,
    save_root=TMP_DIR,
    fold_fitting_strategy=None,
):
    """
    Leakage-aware feature pruning based on CV-train feature importance.

    For every GroupKFold fold, an end_x and an end_y predictor are trained on
    the fold's training rows only, and their feature importances are computed
    on those same rows. Per fold the two importances are averaged; the fold
    results are then averaged over ``n_splits``.

    Args:
        data_merged: frame holding ``feat_cols`` plus "end_x" and "end_y".
        feat_cols: candidate feature columns.
        groups: group ids for GroupKFold.
        top_n: number of features to keep (capped at the number of features).
        eval_metric, presets, time_limit, num_gpus, fold_fitting_strategy:
            forwarded to ag_fit_one_fold.
        save_root: parent folder for the temporary predictors.

    Returns:
        top_features: names of the ``top_n`` highest-ranked features.
        imp_avg: Series with the averaged importance of every feature,
            sorted in descending order.
    """
    gkf = GroupKFold(n_splits=n_splits)
    imp_sum = pd.Series(0.0, index=feat_cols)

    for fold, (tr_idx, _va_idx) in enumerate(
        gkf.split(data_merged[feat_cols], data_merged["end_x"], groups=groups)
    ):
        # Same object-NaN handling as everywhere else in the pipeline.
        tr_all = fill_object_missing(data_merged.iloc[tr_idx].copy(), cols=feat_cols)

        # Temporary predictor folder per target for this fold.
        paths = {
            "end_x": os.path.join(save_root, f"imp_cv_endx_fold{fold}"),
            "end_y": os.path.join(save_root, f"imp_cv_endy_fold{fold}"),
        }

        try:
            imp_fold = pd.Series(0.0, index=feat_cols)
            for target in ("end_x", "end_y"):
                train_df = tr_all[feat_cols + [target]]
                predictor = ag_fit_one_fold(
                    train_df=train_df,
                    label=target,
                    eval_metric=eval_metric,
                    presets=presets,
                    time_limit=time_limit,
                    path=paths[target],
                    num_gpus=num_gpus,
                    fold_fitting_strategy=fold_fitting_strategy,
                )
                # feature_importance() needs the label column in `data`
                # (permutation importance is scored against it), so pass the
                # same features+label frame that was used for fitting.
                imp = predictor.feature_importance(train_df, silent=True)["importance"]
                # Features missing from the importance table count as 0.
                imp_fold = imp_fold.add(imp.reindex(feat_cols).fillna(0), fill_value=0.0)

            # Average of the end_x and end_y importances for this fold.
            imp_sum = imp_sum.add(imp_fold / 2.0, fill_value=0.0)

        finally:
            for p in paths.values():
                if os.path.exists(p):
                    shutil.rmtree(p)

    imp_avg = (imp_sum / n_splits).sort_values(ascending=False)
    top_features = imp_avg.head(min(top_n, len(imp_avg))).index.tolist()
    return top_features, imp_avg

### 3) branching CV: result_name으로 성공/실패 분기 학습 + 라우팅 예측

In [17]:
def ag_cv_score_xy_branch(
    data_merged: pd.DataFrame,
    feat_cols: list,
    groups: np.ndarray,
    x_metric="rmse",
    y_metric="rmse",
    result_col="result_name",
    n_splits=5,
    presets="good_quality",
    time_limit=None,
    min_side=50,
    num_gpus=NUM_GPUS,
    fold_fitting_strategy=None,
):
    """
    OOF evaluation of the "branching" strategy driven by ``result_col``.

    Per fold:
    - the training rows are split into "Successful" and everything else, and a
      separate end_x / end_y predictor pair is trained on each side
    - validation rows are routed to the pair matching their own ``result_col``;
      each pair only predicts its own subset
    - if either training side has fewer than ``min_side`` rows, a single pair
      trained on all training rows is used for the whole fold instead

    Returns:
        score: mean Euclidean distance of the clipped OOF points
        oof: array of shape (N, 2) with the clipped OOF (end_x, end_y)
    """
    def _fit(train_part, target, metric, folder):
        # One predictor for one target on the given training rows.
        return ag_fit_one_fold(
            train_df=train_part[feat_cols + [target]],
            label=target,
            eval_metric=metric,
            presets=presets,
            time_limit=time_limit,
            path=folder,
            num_gpus=num_gpus,
            fold_fitting_strategy=fold_fitting_strategy,
        )

    def _remove(folders):
        # Delete whichever temporary predictor folders exist.
        for folder in folders:
            if os.path.exists(folder):
                shutil.rmtree(folder)

    splitter = GroupKFold(n_splits=n_splits)
    oof = np.zeros((len(data_merged), 2), dtype=float)
    fold_iter = splitter.split(data_merged[feat_cols], data_merged["end_x"], groups=groups)

    for fold, (tr_idx, va_idx) in enumerate(fold_iter):
        tr_all = fill_object_missing(data_merged.iloc[tr_idx].copy(), cols=feat_cols)
        va_all = data_merged.iloc[va_idx].copy()
        va_feat = fill_object_missing(va_all[feat_cols].copy(), cols=feat_cols)

        # Training-side masks: successful vs. everything else.
        tr_s = tr_all[result_col].astype(str).eq("Successful")
        tr_u = ~tr_s
        too_small = tr_s.sum() < min_side or tr_u.sum() < min_side

        if too_small:
            # -------- fallback: one model pair on all training rows --------
            path_x = os.path.join(TMP_DIR, f"branch_all_endx_fold{fold}")
            path_y = os.path.join(TMP_DIR, f"branch_all_endy_fold{fold}")
            try:
                px_model = _fit(tr_all, "end_x", x_metric, path_x)
                py_model = _fit(tr_all, "end_y", y_metric, path_y)
                px = px_model.predict(va_feat).values
                py = py_model.predict(va_feat).values
            finally:
                _remove([path_x, path_y])
        else:
            # -------- branching: separate pairs for success / failure --------
            path_sx = os.path.join(TMP_DIR, f"branch_s_endx_fold{fold}")
            path_sy = os.path.join(TMP_DIR, f"branch_s_endy_fold{fold}")
            path_ux = os.path.join(TMP_DIR, f"branch_u_endx_fold{fold}")
            path_uy = os.path.join(TMP_DIR, f"branch_u_endy_fold{fold}")

            try:
                tr_success = tr_all.loc[tr_s]
                tr_failure = tr_all.loc[tr_u]

                px_s = _fit(tr_success, "end_x", x_metric, path_sx)
                py_s = _fit(tr_success, "end_y", y_metric, path_sy)
                px_u = _fit(tr_failure, "end_x", x_metric, path_ux)
                py_u = _fit(tr_failure, "end_y", y_metric, path_uy)

                # Routing mask for the validation rows.
                va_s_mask = va_all[result_col].astype(str).eq("Successful").values
                px = np.empty(len(va_all), dtype=float)
                py = np.empty(len(va_all), dtype=float)

                # Each pair predicts only the rows routed to it.
                idx_s = np.flatnonzero(va_s_mask)
                idx_u = np.flatnonzero(~va_s_mask)
                if len(idx_s):
                    px[idx_s] = px_s.predict(va_feat.iloc[idx_s]).values
                    py[idx_s] = py_s.predict(va_feat.iloc[idx_s]).values
                if len(idx_u):
                    px[idx_u] = px_u.predict(va_feat.iloc[idx_u]).values
                    py[idx_u] = py_u.predict(va_feat.iloc[idx_u]).values
            finally:
                _remove([path_sx, path_sy, path_ux, path_uy])

        px, py = clip_xy(px, py)
        oof[va_idx, 0] = px
        oof[va_idx, 1] = py

    y_true = data_merged[["end_x", "end_y"]].values
    score = euclidean_mean_distance(y_true, oof)
    return score, oof

### 4) 후처리(postprocess) 함수: 예측점을 start 기준으로 보정(shrink)

In [18]:
def apply_postprocess(df_feat: pd.DataFrame, pred_xy: np.ndarray,
                      forward_scale: float, lateral_shrink: float):
    """
    Adjust predicted end points relative to fixed references.

    - x: the displacement from ``start_x`` is multiplied by ``forward_scale``
    - y: the offset from the centre line (GOAL_Y) is multiplied by ``lateral_shrink``
    - the adjusted points are clipped to the pitch again

    ``df_feat`` must contain a "start_x" column aligned row-by-row with ``pred_xy``.

    Returns:
        array of shape (N, 2) with the adjusted (x, y).
    """
    start_x = df_feat["start_x"].values

    # x: scale the displacement measured from the start position.
    forward = pred_xy[:, 0] - start_x
    adj_x = start_x + forward * forward_scale

    # y: scale the offset measured from the centre line.
    lateral = pred_xy[:, 1] - GOAL_Y
    adj_y = GOAL_Y + lateral * lateral_shrink

    adj_x, adj_y = clip_xy(adj_x, adj_y)
    return np.column_stack([adj_x, adj_y])

### 5) 메인 실행(STEP 1 ~ 6)

#### 하이퍼/프리셋

In [19]:
# k_prev candidates to evaluate in STEP 1 and the number of GroupKFold splits.
K_LIST   = [3, 5, 7, 10]
N_SPLITS = 5

# Passed as `time_limit` to the CV helpers; None means no limit.
TIME_LIMIT_PER_FOLD = None

SEARCH_PRESETS_K = "medium_quality"  # k_prev search (cheap)
SEARCH_PRESETS   = "good_quality"    # candidate comparison / pruning
FINAL_PRESETS    = "best_quality"    # final training

#### STEP 1: k_prev(k개의 과거 정보 이용) 선택 (cheap preset + OOF 캐시)
**Pass가 시작되기 직전 몇 개(k개)의 과거 이벤트 정보를 피처로 사용할지(k_prev) 결정**

In [20]:
# Running best over all k_prev candidates (a lower OOF distance is better).
best_k = {"k_prev": None, "oof": float("inf"), "oof_pred": None}

# NOTE: once this loop ends, `k_prev` is bound to the LAST element of K_LIST;
# the selected value lives in best_k["k_prev"].
for k_prev in K_LIST:
    print(f"\n==== STEP1: Search k_prev={k_prev} (cheap preset) ====")

    # Load this k_prev's pack and build the merged table, groups and feature list.
    X_train, y_train, _ = load_pack(k_prev)
    data, yx, yy, groups, feat_cols = prepare_data(X_train, y_train)

    # The k search is re-run often, so its OOF result is cached as well.
    key = make_oof_key(k_prev, feat_cols, "rmse", "rmse", N_SPLITS, tag="ksearch")
    cached = load_oof_cache(key)

    if cached is None:
        score, oof_pred = ag_cv_score_xy(
            data_merged=data,
            feat_cols=feat_cols,
            groups=groups,
            x_metric="rmse",
            y_metric="rmse",
            n_splits=N_SPLITS,
            presets=SEARCH_PRESETS_K,
            time_limit=TIME_LIMIT_PER_FOLD,
            num_gpus=NUM_GPUS,
            fold_fitting_strategy=FOLD_FITTING_STRATEGY,
        )
        save_oof_cache(key, (score, oof_pred))
    else:
        score, oof_pred = cached
        print(f"[CACHE HIT] {key}")

    print(f"[k={k_prev}] OOF mean euclidean = {score:.6f}")
    if score < best_k["oof"]:
        best_k.update(k_prev=k_prev, oof=score, oof_pred=oof_pred)

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.4.0
Python Version:     3.10.11
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.26200
CPU Count:          12
Memory Avail:       17.67 GB / 31.59 GB (55.9%)
Disk Space Avail:   25.93 GB / 100.00 GB (25.9%)
Presets specified: ['medium_quality']
Using hyperparameters preset: hyperparameters='default'



==== STEP1: Search k_prev=3 (cheap preset) ====
[CACHE HIT] ksearch_k3_f7086118df8_rmse_rmse_cv5
[k=3] OOF mean euclidean = 14.169963

==== STEP1: Search k_prev=5 (cheap preset) ====


Beginning AutoGluon training ...
AutoGluon will save models to "d:\공모전\스포츠\ag_tmp\end_x_fold0"
Train Data Rows:    12324
Train Data Columns: 73
Label Column:       end_x
Problem Type:       regression
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    18095.02 MB
	Train Data (Original)  Memory Usage: 14.09 MB (0.1% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 3 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
		Fitting CategoryFeatureGenerator...
			Fitting CategoryMemoryMinimizeFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage 5 G

[k=5] OOF mean euclidean = 14.280500

==== STEP1: Search k_prev=7 (cheap preset) ====


Beginning AutoGluon training ...
AutoGluon will save models to "d:\공모전\스포츠\ag_tmp\end_x_fold0"
Train Data Rows:    12324
Train Data Columns: 93
Label Column:       end_x
Problem Type:       regression
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    17088.33 MB
	Train Data (Original)  Memory Usage: 18.61 MB (0.1% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 3 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
		Fitting CategoryFeatureGenerator...
			Fitting CategoryMemoryMinimizeFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage 5 G

[k=7] OOF mean euclidean = 14.351664

==== STEP1: Search k_prev=10 (cheap preset) ====


Beginning AutoGluon training ...
AutoGluon will save models to "d:\공모전\스포츠\ag_tmp\end_x_fold0"
Train Data Rows:    12324
Train Data Columns: 123
Label Column:       end_x
Problem Type:       regression
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    17209.80 MB
	Train Data (Original)  Memory Usage: 25.39 MB (0.1% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 3 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
		Fitting CategoryFeatureGenerator...
			Fitting CategoryMemoryMinimizeFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage 5 

[k=10] OOF mean euclidean = 14.394723


In [21]:
# Report the k_prev selected in STEP 1 together with its OOF mean Euclidean distance.
print("\nBEST k_prev:", best_k["k_prev"], "baseline OOF:", best_k["oof"])


BEST k_prev: 3 baseline OOF: 14.169962815260774


#### STEP 2: 선택된 k_prev에서 pruning (importance top TOP_N) : 피처 선택
**STEP 1에서 선택된 피처 셋에서 모델에 불필요하거나 노이즈가 많은 피처를 제거**

In [None]:
# Load the pack of the k_prev that STEP 1 actually selected.
# The STEP 1 `for k_prev in K_LIST` loop leaves `k_prev` bound to the last
# element of K_LIST, not to the best one, so rebind it explicitly. Everything
# below (cache keys, final config) then refers to the selected value.
k_prev = best_k["k_prev"]
if k_prev is None:
    raise RuntimeError("STEP 1 did not select any k_prev; run the k_prev search first.")

X_train, y_train, X_test = load_pack(k_prev)
data, yx, yy, groups, feat_cols = prepare_data(X_train, y_train)

# Keep at most 200 features (or all of them when fewer exist).
TOP_N = min(200, len(feat_cols))
print(f"\n==== STEP2: Pruning via CV-train importance (top {TOP_N}) ====")

pruned_features, imp_all = ag_prune_features_cv(
    data_merged=data,
    feat_cols=feat_cols,
    groups=groups,
    n_splits=N_SPLITS,
    top_n=TOP_N,
    eval_metric="rmse",
    presets=SEARCH_PRESETS,
    time_limit=TIME_LIMIT_PER_FOLD,
    num_gpus=NUM_GPUS,
    save_root=TMP_DIR,
    fold_fitting_strategy=FOLD_FITTING_STRATEGY,
)

In [None]:
# imp_all holds the averaged importance of every feature in descending order,
# so head(20) shows the 20 highest-ranked features.
print("Top 20 pruned features:")
print(imp_all.head(20))

#### STEP 3: metric(=loss 성격) 분리 비교 (baseline vs pruned) : “모델 후보” 평가
**피처 셋과 AutoGluon 내부 metric(end_x / end_y 각각 rmse vs mae 중 무엇이 더 유리한지)의 최적 조합**  
 후보: (x=rmse, y=rmse) vs (x=rmse, y=mae)
 - baseline feature + (rmse, rmse)
 - baseline feature + (rmse, mae)
 - pruned feature + (rmse, rmse)
 - pruned feature + (rmse, mae)  
 => 각각 OOF score를 구해서 최솟값을 best로 선택

In [None]:
print("\n==== STEP3: Candidates (no-branch yet) with OOF cache ====")

# One tuple per evaluated candidate:
# (tag_name, feature_cols, x_metric, y_metric, score, oof_pred)
candidates = []

def eval_candidate(tag_name, fcols, x_metric, y_metric):
    """
    Evaluate one candidate (feature set + metric pair) and record it.

    The OOF score and predictions come from the OOF cache when an entry exists;
    otherwise they are computed with ag_cv_score_xy and stored in the cache.
    The result is appended to the module-level ``candidates`` list as
    (tag_name, fcols, x_metric, y_metric, score, oof_pred).
    """
    key = make_oof_key(k_prev, fcols, x_metric, y_metric, N_SPLITS, tag="cand")
    cached = load_oof_cache(key)

    if cached is None:
        # Cache miss: run the full grouped CV and remember the outcome.
        score, oof_pred = ag_cv_score_xy(
            data_merged=data,
            feat_cols=fcols,
            groups=groups,
            x_metric=x_metric,
            y_metric=y_metric,
            n_splits=N_SPLITS,
            presets=SEARCH_PRESETS,
            time_limit=TIME_LIMIT_PER_FOLD,
            num_gpus=NUM_GPUS,
            fold_fitting_strategy=FOLD_FITTING_STRATEGY,
        )
        save_oof_cache(key, (score, oof_pred))
        print(f"[TRAINED]   {tag_name} -> {score:.6f}")
    else:
        score, oof_pred = cached
        print(f"[CACHE HIT] {tag_name} -> {score}")

    candidates.append((tag_name, fcols, x_metric, y_metric, score, oof_pred))

In [None]:
# baseline: all features, rmse for both end_x and end_y
eval_candidate("baseline_rmse_rmse", feat_cols,        "rmse", "rmse")

In [None]:
# baseline: all features, rmse for end_x and mae for end_y
eval_candidate("baseline_rmse_mae",  feat_cols,        "rmse", "mae")

In [None]:
# pruned: top-N features, rmse for both end_x and end_y
eval_candidate("pruned_rmse_rmse",   pruned_features,  "rmse", "rmse")

In [None]:
# pruned: top-N features, rmse for end_x and mae for end_y
eval_candidate("pruned_rmse_mae",    pruned_features,  "rmse", "mae")

In [None]:
# Pick the candidate with the lowest OOF score (element 4 of each tuple).
# NOTE: min() raises ValueError if no candidate has been evaluated yet.
best = min(candidates, key=lambda x: x[4])
best_name, best_fcols, best_xm, best_ym, base_score, base_oof = best
print("\nBest candidate (no-branch yet):", best_name, "score:", base_score)

#### STEP 4: result_name 분기 모델 비교 (best candidate 기준)
**Pass 결과(Successful vs Unsuccessful)별로 모델을 분리하여 학습하는 Branching 전략이 단일 모델보다 성능이 좋은지 확인**  
분기 모델이 더 좋으면 use_branch=True, 아니면 base 후보 유지

In [None]:
print("\n==== STEP4: Branching check ====")

# Defaults describe the "no branching" outcome.
use_branch = False
branch_score = float("inf")
branch_oof = None

# Branching is only attempted when the training table carries result_name.
if "result_name" not in data.columns:
    print("[INFO] data has no result_name -> skip branching")
else:
    # No OOF cache is used for the branching run; it is recomputed every time.
    branch_score, branch_oof = ag_cv_score_xy_branch(
        data_merged=data,
        feat_cols=best_fcols,
        groups=groups,
        x_metric=best_xm,
        y_metric=best_ym,
        result_col="result_name",
        n_splits=N_SPLITS,
        presets=SEARCH_PRESETS,
        time_limit=TIME_LIMIT_PER_FOLD,
        min_side=50,
        num_gpus=NUM_GPUS,
        fold_fitting_strategy=FOLD_FITTING_STRATEGY,
    )

    print("Base best score:", base_score)
    print("Branch score  :", branch_score)

    # Branching is adopted only when it strictly beats the best single-model candidate.
    use_branch = branch_score < base_score
    print("Use branching?", use_branch)

#### STEP 5: 후처리 튜닝 (OOF 기반 grid search)
**예측 좌표를 경기장의 특성에 맞게 보정하여 성능을 미세 조정**

In [None]:
print("\n==== STEP5: Postprocess tuning ====")

# Tune on the OOF predictions of whichever variant won (branch or base).
oof_use, base_ref = (branch_oof, branch_score) if use_branch else (base_oof, base_score)

# Search grid.
grid_forward = [0.90, 0.95, 1.00, 1.05]
grid_lateral = [0.80, 0.90, 1.00]

y_true = data[["end_x", "end_y"]].values

# Start from "no post-processing"; a combination must strictly improve on it.
best_pp = {"forward_scale": 1.0, "lateral_shrink": 1.0, "score": base_ref}

# Grid search: apply each combination to the OOF points and keep the lowest distance.
for fs in grid_forward:
    for ls in grid_lateral:
        adj = apply_postprocess(data, oof_use, fs, ls)
        s = euclidean_mean_distance(y_true, adj)
        if not (s < best_pp["score"]):
            continue
        best_pp = dict(forward_scale=fs, lateral_shrink=ls, score=s)

print("Best postprocess:", best_pp)

# Post-processing is used only when it actually lowers the OOF score.
use_postprocess = best_pp["score"] < base_ref
print("Use postprocess?", use_postprocess)

#### STEP 6: 최종 학습(전체 데이터) + 저장 + submission

In [None]:
# Summary of every decision taken in STEP 1-5; printed here and later written
# into the model meta file.
# NOTE(review): "k_prev" records whatever `k_prev` is bound to at this point —
# confirm it is the value whose pack was loaded for `data` / `X_test`.
final = {
    "k_prev": int(k_prev),
    "feature_set": best_name,
    "x_metric": best_xm,
    "y_metric": best_ym,
    "use_branch": bool(use_branch),
    "use_postprocess": bool(use_postprocess),
    # Identity parameters are stored when post-processing is not used.
    "postprocess": best_pp if use_postprocess else {"forward_scale": 1.0, "lateral_shrink": 1.0},
    "presets_k_search": SEARCH_PRESETS_K,
    "presets_search": SEARCH_PRESETS,
    "presets_final": FINAL_PRESETS,
    "fold_fitting_strategy": FOLD_FITTING_STRATEGY,
    "num_gpus": int(NUM_GPUS),
}
print("\nFINAL CONFIG:", final)

In [None]:
# (1) Final training tables: the selected features plus one target each,
# with the same object-NaN handling as in OOF / importance / test.
# fill_object_missing() works on its own copy of the input, so a single pass
# per table is sufficient (no extra .copy() and no second fill needed).
train_full_x = fill_object_missing(data[best_fcols + ["end_x"]], cols=best_fcols)
train_full_y = fill_object_missing(data[best_fcols + ["end_y"]], cols=best_fcols)

In [None]:
# (2) Test-set preparation (same object-NaN handling as for training)
# NOTE(review): this selection raises KeyError if any column of best_fcols is
# missing from X_test (e.g. result_name), i.e. before the guard below can run —
# confirm the test pack always carries every selected feature.
Xt = fill_object_missing(X_test[best_fcols].copy(), cols=best_fcols)

# Without result_name in the test set routing is impossible -> disable branching.
# NOTE(review): when branching is disabled here, best_pp / base_ref were still
# tuned on the branch OOF and are not recomputed.
if use_branch and ("result_name" not in X_test.columns):
    print("[WARN] X_test has no result_name. Disable branching.")
    use_branch = False
    final["use_branch"] = False

def fit_kwargs_final():
    """
    Build the keyword arguments for the final ``.fit()`` calls.

    Always contains ``presets`` and ``ag_args_fit``; ``ag_args_ensemble`` is
    only included when FOLD_FITTING_STRATEGY is not None, so no None value is
    ever forwarded for it.
    """
    ensemble_part = (
        {}
        if FOLD_FITTING_STRATEGY is None
        else {"ag_args_ensemble": {"fold_fitting_strategy": FOLD_FITTING_STRATEGY}}
    )
    return {
        "presets": FINAL_PRESETS,
        "ag_args_fit": {"num_gpus": int(NUM_GPUS)},
        **ensemble_part,
    }

In [None]:
# (3) Final training and prediction
def _fit_final(label, metric, folder, train_df):
    """Train one final regression predictor saved under MODEL_DIR/<folder>."""
    predictor = TabularPredictor(
        label=label, problem_type="regression", eval_metric=metric,
        path=os.path.join(MODEL_DIR, folder),
    )
    return predictor.fit(train_data=train_df, **fit_kwargs_final())


if use_branch:
    # Branching: success / failure each get an end_x and an end_y predictor (4 in total).
    is_s = data["result_name"].astype(str).eq("Successful")
    is_u = ~is_s

    if is_s.sum() < 50 or is_u.sum() < 50:
        # Safety net: one side is too small -> single model pair on all data.
        print("[WARN] One side too small -> fallback to single model")

        px_all = _fit_final("end_x", best_xm, "predictor_endx_all", train_full_x)
        py_all = _fit_final("end_y", best_ym, "predictor_endy_all", train_full_y)

        px = px_all.predict(Xt).values
        py = py_all.predict(Xt).values

    else:
        # Per-branch training tables, preprocessed like everything else.
        tr_s_x = fill_object_missing(data.loc[is_s, best_fcols + ["end_x"]].copy(), cols=best_fcols)
        tr_s_y = fill_object_missing(data.loc[is_s, best_fcols + ["end_y"]].copy(), cols=best_fcols)
        tr_u_x = fill_object_missing(data.loc[is_u, best_fcols + ["end_x"]].copy(), cols=best_fcols)
        tr_u_y = fill_object_missing(data.loc[is_u, best_fcols + ["end_y"]].copy(), cols=best_fcols)

        # Success models
        px_s = _fit_final("end_x", best_xm, "predictor_endx_success", tr_s_x)
        py_s = _fit_final("end_y", best_ym, "predictor_endy_success", tr_s_y)

        # Failure models
        px_u = _fit_final("end_x", best_xm, "predictor_endx_unsuccess", tr_u_x)
        py_u = _fit_final("end_y", best_ym, "predictor_endy_unsuccess", tr_u_y)

        # Test routing (the test set must carry result_name to get here).
        is_s_test = X_test["result_name"].astype(str).eq("Successful").values

        # Every model predicts the whole test set and the routing mask picks
        # the relevant value; kept simple because this runs only once.
        pred_s_x = px_s.predict(Xt).values
        pred_s_y = py_s.predict(Xt).values
        pred_u_x = px_u.predict(Xt).values
        pred_u_y = py_u.predict(Xt).values

        px = np.where(is_s_test, pred_s_x, pred_u_x)
        py = np.where(is_s_test, pred_s_y, pred_u_y)

else:
    # Single model (no branching): one end_x predictor and one end_y predictor.
    px_pred = _fit_final("end_x", best_xm, "predictor_endx", train_full_x)
    py_pred = _fit_final("end_y", best_ym, "predictor_endy", train_full_y)

    px = px_pred.predict(Xt).values
    py = py_pred.predict(Xt).values

In [None]:
# (4) Clip to the pitch, then optionally apply the tuned post-processing
px, py = clip_xy(px, py)
pred = np.column_stack([px, py])

if use_postprocess:
    # X_test supplies the start_x column that apply_postprocess needs.
    pred = apply_postprocess(
        X_test,
        pred,
        best_pp["forward_scale"],
        best_pp["lateral_shrink"],
    )

In [None]:
# (5) Write the submission file: one row per test episode
sub = pd.DataFrame({"game_episode": X_test["game_episode"].values})
sub["end_x"] = pred[:, 0]
sub["end_y"] = pred[:, 1]

sub.to_csv("submission.csv", index=False)
print("\nSaved submission.csv")
print(sub.head())

In [None]:
# (6) Save the run metadata next to the final predictors
meta_path = os.path.join(MODEL_DIR, "model_meta.json")
with open(meta_path, "w", encoding="utf-8") as f:
    # Final config plus the OOF scores before/after post-processing and the
    # exact feature list used by the final models.
    meta = dict(final)
    meta["oof_base_or_branch_score"] = float(base_ref)
    meta["oof_after_postprocess_score"] = (
        float(best_pp["score"]) if use_postprocess else float(base_ref)
    )
    meta["feature_cols"] = best_fcols
    json.dump(meta, f, ensure_ascii=False, indent=2)

print("Saved meta:", meta_path)