# preprocess.ipynb (multi-k)

이 노트북은 **train/test 이벤트 로드 → 마지막 Pass 샘플 생성 → 피처 생성 → 여러 k_prev 버전 저장**까지 수행합니다.

- 입력
  - `data/train.csv`
  - `data/test.csv` (index: game_id, game_episode, path)
  - `data/test/.../{game_episode}.csv`
- 출력(예: k=3,5,7,10)
  - `artifacts/features_train_k3.parquet`, `artifacts/labels_train_k3.parquet`, `artifacts/features_test_k3.parquet`
  - ...
  - `artifacts/test_index.csv`


In [1]:
import os
import numpy as np
import pandas as pd

DATA_DIR = "data"
ART_DIR  = "artifacts"
os.makedirs(ART_DIR, exist_ok=True)

PITCH_X, PITCH_Y = 105.0, 68.0
GOAL_X, GOAL_Y   = 105.0, 34.0

print("DATA_DIR:", os.path.abspath(DATA_DIR))
print("ART_DIR :", os.path.abspath(ART_DIR))


DATA_DIR: d:\공모전\스포츠\data
ART_DIR : d:\공모전\스포츠\artifacts


In [2]:
def load_test_events_from_index(test_index_path: str, base_dir: str = "."):
    idx = pd.read_csv(test_index_path)
    required = {"game_id", "game_episode", "path"}
    missing = required - set(idx.columns)
    if missing:
        raise ValueError(f"test index에 필요한 컬럼이 없습니다: {missing}")

    dfs = []
    for p in idx["path"].tolist():
        full_path = os.path.join(base_dir, p)
        if not os.path.exists(full_path):
            raise FileNotFoundError(f"파일이 없습니다: {full_path}")
        dfs.append(pd.read_csv(full_path))

    test_events = pd.concat(dfs, ignore_index=True)
    return test_events, idx

def add_spatial_from_start(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    dxg = (GOAL_X - df["start_x"]).clip(lower=1e-6)
    dyg = (df["start_y"] - GOAL_Y).abs()

    df["start_dist_to_goal"] = np.sqrt(dxg**2 + dyg**2)
    df["start_angle_to_goal"] = np.arctan2(dyg, dxg)
    df["start_dist_to_sideline"] = np.minimum(df["start_y"], PITCH_Y - df["start_y"])
    df["start_dist_to_endline"] = PITCH_X - df["start_x"]

    df["start_x_ratio"] = df["start_x"] / PITCH_X
    df["start_y_centered_abs"] = (df["start_y"] - GOAL_Y).abs()
    return df

def build_last_pass_dataset_safe(events: pd.DataFrame, k_prev: int = 5):
    df = events.copy()
    df = df.sort_values(["game_episode", "period_id", "time_seconds", "action_id"]).reset_index(drop=True)
    df = add_spatial_from_start(df)

    is_pass = (df["type_name"] == "Pass")
    if not is_pass.any():
        raise ValueError("type_name=='Pass'가 하나도 없습니다.")
    last_pass_idx = df[is_pass].groupby("game_episode", sort=False).tail(1).index
    df["is_target_pass"] = False
    df.loc[last_pass_idx, "is_target_pass"] = True

    g = df.groupby("game_episode", sort=False)

    base_cols = ["type_name", "result_name", "start_x", "start_y", "end_x", "end_y", "time_seconds"]
    for i in range(1, k_prev + 1):
        for c in base_cols:
            df[f"prev{i}_{c}"] = g[c].shift(i)
        df[f"prev{i}_dx"] = df[f"prev{i}_end_x"] - df[f"prev{i}_start_x"]
        df[f"prev{i}_dy"] = df[f"prev{i}_end_y"] - df[f"prev{i}_start_y"]
        df[f"prev{i}_dist_move"] = np.sqrt(df[f"prev{i}_dx"]**2 + df[f"prev{i}_dy"]**2)

    prev_dx_cols = [f"prev{i}_dx" for i in range(1, k_prev + 1)]
    prev_dy_cols = [f"prev{i}_dy" for i in range(1, k_prev + 1)]
    df["prevk_sum_dx"] = df[prev_dx_cols].sum(axis=1, skipna=True)
    df["prevk_sum_abs_dy"] = df[prev_dy_cols].abs().sum(axis=1, skipna=True)
    df["prevk_mean_dx"] = df["prevk_sum_dx"] / max(k_prev, 1)
    df["prevk_lateral_ratio"] = df["prevk_sum_abs_dy"] / (df["prevk_sum_dx"].abs() + 1e-6)

    df["dx_evt"] = df["end_x"] - df["start_x"]
    df["dy_evt"] = df["end_y"] - df["start_y"]
    not_target = ~df["is_target_pass"]

    def nansum_masked(s):
        arr = s.where(not_target.loc[s.index])
        return np.nansum(arr.values)

    df["ep_len_before"] = g["is_target_pass"].transform(lambda x: (~x).sum())
    df["ep_time_span"]  = g["time_seconds"].transform(lambda x: x.max() - x.min())
    df["ep_sum_dx_before"]     = g["dx_evt"].transform(nansum_masked)
    df["ep_sum_abs_dy_before"] = g["dy_evt"].transform(lambda s: nansum_masked(s.abs()))

    last_pass = df.loc[df["is_target_pass"]].copy()

    feature_cols = [
        "game_id", "period_id", "time_seconds", "team_id", "player_id", "is_home",
        "start_x", "start_y",
        "start_dist_to_goal", "start_angle_to_goal", "start_dist_to_sideline", "start_dist_to_endline",
        "start_x_ratio", "start_y_centered_abs",
        "ep_len_before", "ep_time_span", "ep_sum_dx_before", "ep_sum_abs_dy_before",
        "prevk_sum_dx", "prevk_sum_abs_dy", "prevk_mean_dx", "prevk_lateral_ratio",
        "result_name",
    ]
    for i in range(1, k_prev + 1):
        feature_cols += [
            f"prev{i}_type_name",
            f"prev{i}_result_name",
            f"prev{i}_start_x", f"prev{i}_start_y",
            f"prev{i}_end_x",   f"prev{i}_end_y",
            f"prev{i}_dx",      f"prev{i}_dy", f"prev{i}_dist_move",
            f"prev{i}_time_seconds",
        ]

    X = last_pass[["game_episode"] + feature_cols].copy()

    y = None
    if {"end_x", "end_y"}.issubset(last_pass.columns) and last_pass[["end_x", "end_y"]].notna().all().all():
        y = last_pass[["game_episode", "end_x", "end_y"]].copy()

    return X, y


In [3]:
# 1) Load events
train_events = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
test_events, test_index = load_test_events_from_index(os.path.join(DATA_DIR, "test.csv"), base_dir="data")

print("train_events:", train_events.shape)
print("test_events :", test_events.shape)
print("test_index  :", test_index.shape)

# 저장(제출 정렬 확인용)
test_index.to_csv(os.path.join(ART_DIR, "test_index.csv"), index=False)


train_events: (356721, 15)
test_events : (53110, 15)
test_index  : (2414, 3)


In [4]:
# 2) Build & save multiple k_prev versions
K_LIST = [3, 5, 7, 10]

for K_PREV in K_LIST:
    print(f"\n=== Building features for k_prev={K_PREV} ===")
    X_train, y_train = build_last_pass_dataset_safe(train_events, k_prev=K_PREV)
    X_test, _        = build_last_pass_dataset_safe(test_events,  k_prev=K_PREV)

    assert y_train is not None, f"y_train이 None입니다(k_prev={K_PREV}). train 라벨을 확인하세요."

    train_feat_path  = os.path.join(ART_DIR, f"features_train_k{K_PREV}.parquet")
    train_label_path = os.path.join(ART_DIR, f"labels_train_k{K_PREV}.parquet")
    test_feat_path   = os.path.join(ART_DIR, f"features_test_k{K_PREV}.parquet")

    X_train.to_parquet(train_feat_path, index=False)
    y_train.to_parquet(train_label_path, index=False)
    X_test.to_parquet(test_feat_path, index=False)

    print("Saved:")
    print(" -", train_feat_path)
    print(" -", train_label_path)
    print(" -", test_feat_path)



=== Building features for k_prev=3 ===
Saved:
 - artifacts\features_train_k3.parquet
 - artifacts\labels_train_k3.parquet
 - artifacts\features_test_k3.parquet

=== Building features for k_prev=5 ===
Saved:
 - artifacts\features_train_k5.parquet
 - artifacts\labels_train_k5.parquet
 - artifacts\features_test_k5.parquet

=== Building features for k_prev=7 ===
Saved:
 - artifacts\features_train_k7.parquet
 - artifacts\labels_train_k7.parquet
 - artifacts\features_test_k7.parquet

=== Building features for k_prev=10 ===


  df[f"prev{i}_dist_move"] = np.sqrt(df[f"prev{i}_dx"]**2 + df[f"prev{i}_dy"]**2)
  df[f"prev{i}_{c}"] = g[c].shift(i)
  df[f"prev{i}_{c}"] = g[c].shift(i)
  df[f"prev{i}_{c}"] = g[c].shift(i)
  df[f"prev{i}_{c}"] = g[c].shift(i)
  df[f"prev{i}_{c}"] = g[c].shift(i)
  df[f"prev{i}_{c}"] = g[c].shift(i)
  df[f"prev{i}_{c}"] = g[c].shift(i)
  df[f"prev{i}_dx"] = df[f"prev{i}_end_x"] - df[f"prev{i}_start_x"]
  df[f"prev{i}_dy"] = df[f"prev{i}_end_y"] - df[f"prev{i}_start_y"]
  df[f"prev{i}_dist_move"] = np.sqrt(df[f"prev{i}_dx"]**2 + df[f"prev{i}_dy"]**2)
  df["prevk_sum_dx"] = df[prev_dx_cols].sum(axis=1, skipna=True)
  df["prevk_sum_abs_dy"] = df[prev_dy_cols].abs().sum(axis=1, skipna=True)
  df["prevk_mean_dx"] = df["prevk_sum_dx"] / max(k_prev, 1)
  df["prevk_lateral_ratio"] = df["prevk_sum_abs_dy"] / (df["prevk_sum_dx"].abs() + 1e-6)
  df["dx_evt"] = df["end_x"] - df["start_x"]
  df["dy_evt"] = df["end_y"] - df["start_y"]
  df["ep_len_before"] = g["is_target_pass"].transform(lambda x

Saved:
 - artifacts\features_train_k10.parquet
 - artifacts\labels_train_k10.parquet
 - artifacts\features_test_k10.parquet
