## 訓練模型

In [69]:
import os
import numpy as np
import pandas as pd
import joblib
from scipy.stats import uniform, randint
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.utils.class_weight import compute_class_weight
from xgboost import XGBClassifier
import warnings

warnings.filterwarnings(action='ignore', category=UserWarning)

# 確保 models/ 資料夾存在
os.makedirs('models', exist_ok=True)

# 0. 讀取原始特徵
df = pd.read_csv(
    '/Users/yuchingchen/Documents/AI_CUP/feature_engineering/train_features.csv'
)

# 1. 按 player_id 拆 train/test（80% / 20%）
unique_players = df['player_id'].unique()
train_players, test_players = train_test_split(
    unique_players, test_size=0.2, random_state=42
)
train_idx = df['player_id'].isin(train_players)
test_idx  = df['player_id'].isin(test_players)

# 2. 特徵欄位 & scaler（只在訓練集上 fit）
feature_cols = [c for c in df.columns if c.startswith('f')]
X_all = df[feature_cols].values
scaler = StandardScaler().fit(X_all[train_idx])       # ← 僅在 train 上 fit
X_scaled_all = scaler.transform(X_all)
X_train       = X_scaled_all[train_idx]
X_test        = X_scaled_all[test_idx]

# 3. 內層 CV
inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# 4. 任務設定
tasks = {
    'gender':             ('gender',             'binary'),
    'hold_racket_handed': ('hold racket handed', 'binary'),
    'play_years':         ('play years',         'multi'),
    'level':              ('level',              'multi'),
}

# 5. 隨機搜尋空間
param_dist = {
    'max_depth':        randint(2, 8),
    'learning_rate':    uniform(0.01, 0.2),
    'n_estimators':     randint(100, 400),
    'subsample':        uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'reg_alpha':        uniform(0, 5),
    'reg_lambda':       uniform(0, 5),
}

def train_with_search(task_name, y_col, problem_type):
    # --- 標籤處理（只在 train labels 上 fit encoder） ---
    if problem_type == 'binary':
        # 正例 = 原始 == 1，其餘當反例
        y_all = (df[y_col].values == 1).astype(int)
        le = None
    else:
        le = LabelEncoder()
        y_train_raw = df.loc[train_idx, y_col].values
        le.fit(y_train_raw)  # ← 僅在 train 上 fit
        y_all = le.transform(df[y_col].values)

    y_train = y_all[train_idx]
    y_test  = y_all[test_idx]

    # --- 建立 base model & 處理不平衡 ---
    base = XGBClassifier(random_state=42, verbosity=0)
    fit_kwargs = {}

    if problem_type == 'binary':
        neg, pos = np.bincount(y_train)
        base.set_params(
            objective='binary:logistic',
            scale_pos_weight=neg/pos,
            eval_metric='logloss'
        )
        scoring = 'roc_auc'
    else:
        cw = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
        fit_kwargs['sample_weight'] = np.array([cw[y] for y in y_train])
        base.set_params(
            objective='multi:softprob',
            num_class=len(np.unique(y_all)),
            eval_metric='mlogloss'
        )
        scoring = 'roc_auc_ovr'

    # --- 超參數搜尋 ---
    search = RandomizedSearchCV(
        estimator=base,
        param_distributions=param_dist,
        n_iter=30,
        scoring=scoring,
        cv=inner_cv,
        random_state=42,
        n_jobs=-1,
        refit=True,
        verbose=1
    )
    search.fit(X_train, y_train, **fit_kwargs)

    # --- 測試集評估 ---
    best = search.best_estimator_
    prob_test = best.predict_proba(X_test)
    if problem_type == 'binary':
        auc = roc_auc_score(y_test, prob_test[:,1])
    else:
        auc = roc_auc_score(y_test, prob_test, multi_class='ovr', average='micro')
    print(f"[{task_name}] Test ROC-AUC = {auc:.4f}")

    # --- 儲存 model、scaler 和（必要時）encoder ---
    save_obj = {'model': best, 'scaler': scaler}
    if le is not None:
        save_obj['le'] = le

    fn = os.path.join('models', f"xgb_{task_name}.joblib")
    joblib.dump(save_obj, fn)
    print(f"Saved model to {fn}\n" + "="*60 + "\n")

# 6. 執行所有任務
for name, (col, ptype) in tasks.items():
    train_with_search(name, col, ptype)

Fitting 3 folds for each of 30 candidates, totalling 90 fits
[gender] Test ROC-AUC = 0.9795
Saved model to models/xgb_gender.joblib

Fitting 3 folds for each of 30 candidates, totalling 90 fits
[hold_racket_handed] Test ROC-AUC = 0.9998
Saved model to models/xgb_hold_racket_handed.joblib

Fitting 3 folds for each of 30 candidates, totalling 90 fits
[play_years] Test ROC-AUC = 0.7161
Saved model to models/xgb_play_years.joblib

Fitting 3 folds for each of 30 candidates, totalling 90 fits
[level] Test ROC-AUC = 0.8698
Saved model to models/xgb_level.joblib



## 將 Player_years 拉出來建模型

In [14]:
import os, warnings, joblib
import numpy as np
import pandas as pd
from scipy.stats import randint, uniform
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing  import StandardScaler, LabelEncoder
from sklearn.metrics        import roc_auc_score
from sklearn.utils.class_weight import compute_class_weight
from lightgbm               import LGBMClassifier

# ---- 全面靜音 ----
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", message=".*does not have valid feature names.*")

# -------------------------------------------------
# 0. 讀資料
# -------------------------------------------------
df = pd.read_csv("/Users/yuchingchen/Documents/AI_CUP/feature_engineering/train_features.csv")

# 1. 依 player_id 切外層 train / test（80 / 20）
players = df["player_id"].unique()
train_p, test_p = train_test_split(players, test_size=0.2, random_state=42)
train_idx = df["player_id"].isin(train_p)
test_idx  = df["player_id"].isin(test_p)

# 2. 特徵 + 標準化（保持 DataFrame 型態）
feature_cols = [c for c in df.columns if c.startswith("f")]
scaler = StandardScaler().fit(df.loc[train_idx, feature_cols])

X_scaled = pd.DataFrame(
    scaler.transform(df[feature_cols]),
    columns=feature_cols,
    index=df.index
)
X_train, X_test = X_scaled.loc[train_idx], X_scaled.loc[test_idx]

# 3. 標籤處理
y_raw = df["play years"].values
le    = LabelEncoder().fit(y_raw[train_idx])
y_all = le.transform(y_raw)
y_train, y_test = y_all[train_idx], y_all[test_idx]

# 4. 不平衡 sample_weight
cw = compute_class_weight("balanced", classes=np.unique(y_train), y=y_train)
sw_train = np.array([cw[y] for y in y_train])
sw_all   = np.array([cw[y] for y in y_all])

# -------------------------------------------------
# 5. LightGBM + RandomizedSearchCV（靜音）
# -------------------------------------------------
base = LGBMClassifier(
    objective="multiclass",
    num_class=len(le.classes_),
    random_state=42,
    n_jobs=-1,
    verbose=-1       # 關閉 LightGBM 自身列印
)

param_dist = {
    "n_estimators":      randint(200, 800),
    "learning_rate":     uniform(0.02, 0.18),
    "max_depth":         randint(3, 10),
    "num_leaves":        randint(16, 128),
    "subsample":         uniform(0.6, 0.4),
    "colsample_bytree":  uniform(0.6, 0.4),
    "reg_alpha":         uniform(0.0, 5.0),
    "reg_lambda":        uniform(0.0, 5.0)
}

inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

search = RandomizedSearchCV(
    estimator=base,
    param_distributions=param_dist,
    n_iter=30,
    scoring="roc_auc_ovr",
    cv=inner_cv,
    random_state=42,
    refit=True,
    verbose=0,       # 關閉搜尋進度
    n_jobs=-1
)

search.fit(X_train, y_train, sample_weight=sw_train)

# -------------------------------------------------
# 6. 外層 test 評估
# -------------------------------------------------
best = search.best_estimator_
proba_test = best.predict_proba(X_test)

auc_test = roc_auc_score(
    y_test,
    proba_test,
    multi_class="ovr",
    average="micro"
)
print(f"[play_years] Test ROC-AUC = {auc_test:.4f}")

# -------------------------------------------------
# 7. 全資料重訓 & 存檔
# -------------------------------------------------
best.fit(X_scaled, y_all, sample_weight=sw_all)

os.makedirs("models", exist_ok=True)
joblib.dump(
    {"model": best, "scaler": scaler, "le": le},
    "models/lgbm_play_years.joblib"
)
print("✅ 已儲存 LightGBM play_years 模型到 models/lgbm_play_years.joblib")

[play_years] Test ROC-AUC = 0.7141
✅ 已儲存 LightGBM play_years 模型到 models/lgbm_play_years.joblib


In [17]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection   import StratifiedKFold
from sklearn.preprocessing     import StandardScaler, LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from lightgbm                  import LGBMClassifier
from sklearn.metrics           import roc_auc_score
from sklearn.base              import clone
import joblib

# === 0. 載入資料 & 標籤編碼 ===
df    = pd.read_csv('/Users/yuchingchen/Documents/AI_CUP/feature_engineering/train_features.csv')
X     = df[[c for c in df.columns if c.startswith('f')]].values
y_raw = df['play years'].values  # 原始就是 0/1/2
le    = LabelEncoder().fit(y_raw)
y     = le.transform(y_raw)

# === 1. 計算 sample_weight ===
cw = compute_class_weight('balanced', classes=np.unique(y), y=y)
sw = np.array([cw[yi] for yi in y])

# === 2. 定義模型（不 pre-fit scaler）===
model = LGBMClassifier(
    objective='multiclass',
    num_class=len(le.classes_),
    is_unbalance=True,
    learning_rate=0.05,
    n_estimators=300,
    random_state=42
)

# === 3. 五折手動 CV，fold 內才做 scaler.fit ===
cv     = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = []

for train_idx, valid_idx in cv.split(X, y):
    # 切分
    X_tr, X_va = X[train_idx], X[valid_idx]
    y_tr, y_va = y[train_idx], y[valid_idx]
    sw_tr      = sw[train_idx]

    # fold 內標準化
    scaler = StandardScaler().fit(X_tr)
    X_tr_s  = scaler.transform(X_tr)
    X_va_s  = scaler.transform(X_va)

    # train & pred
    m = clone(model)
    m.fit(X_tr_s, y_tr, sample_weight=sw_tr)
    prob = m.predict_proba(X_va_s)

    # 評分
    scores.append(roc_auc_score(y_va, prob, multi_class='ovr'))

scores = np.array(scores)
print("LGBM 5-fold ROC-AUC:", np.round(scores,4), "→", np.round(scores.mean(),4))

# === 4. 用全資料 retrain 並儲存 ===
#    這裡也先 fit scaler 再 train model
scaler_full = StandardScaler().fit(X)
X_s_full    = scaler_full.transform(X)
model.fit(X_s_full, y, sample_weight=sw)

os.makedirs('models', exist_ok=True)
joblib.dump(
    {'model': model, 'scaler': scaler_full, 'le': le},
    'models/lgbm_play_years.joblib'
)
print("✅ 已儲存 LGBM play_years 模型到 models/lgbm_play_years.joblib")

LGBM 5-fold ROC-AUC: [0.9671 0.9864 0.9766 0.9701 0.9823] → 0.9765
✅ 已儲存 LGBM play_years 模型到 models/lgbm_play_years.joblib


## 把 level 也拉出來建模型

In [None]:
import os
import numpy as np
import pandas as pd
import joblib

from sklearn.model_selection       import GroupKFold
from sklearn.preprocessing         import StandardScaler, LabelEncoder
from sklearn.utils.class_weight    import compute_class_weight
from sklearn.metrics               import roc_auc_score
from sklearn.base                  import clone
from lightgbm                      import LGBMClassifier
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# === 0. 讀取資料 & label encoding ===
df    = pd.read_csv('/Users/yuchingchen/Documents/AI_CUP/feature_engineering/train_features.csv')
X     = df[[c for c in df.columns if c.startswith('f')]].values
y_raw = df['level'].values         # 原始就是 [2,3,4,5]
le    = LabelEncoder().fit(y_raw)  # encode to [0,1,2,3]
y     = le.transform(y_raw)
groups= df['player_id'].values     # 用來 GroupKFold

n_classes = len(le.classes_)       # 一定要告訴 ROC-AUC 有幾個類別

# === 1. 計算 sample_weight（class‐balanced） ===
cw = compute_class_weight('balanced', classes=np.unique(y), y=y)
sw = np.array([cw[yi] for yi in y])

# === 2. 建立 LGBM 多分類器 ===
model = LGBMClassifier(
    objective='multiclass',
    num_class=n_classes,
    is_unbalance=True,
    learning_rate=0.05,
    n_estimators=300,
    random_state=42
)

# === 3. 5-fold GroupKFold（以 player_id 分群） ===
gkf    = GroupKFold(n_splits=5)
scores = []

for tr_idx, va_idx in gkf.split(X, y, groups=groups):
    # 切分
    X_tr, X_va = X[tr_idx], X[va_idx]
    y_tr, y_va = y[tr_idx], y[va_idx]
    sw_tr      = sw[tr_idx]

    # fold 內標準化（防止洩漏）
    scaler = StandardScaler().fit(X_tr)
    X_tr_s  = scaler.transform(X_tr)
    X_va_s  = scaler.transform(X_va)

    # train & predict
    m = clone(model)
    m.fit(X_tr_s, y_tr, sample_weight=sw_tr)
    prob = m.predict_proba(X_va_s)

    # micro one-vs-rest ROC-AUC，指定全部的 labels
    score = roc_auc_score(
        y_va,
        prob,
        multi_class='ovr',
        average='micro',
        labels=np.arange(n_classes)
    )
    scores.append(score)

scores = np.array(scores)
print("LGBM level 5-fold GroupKFold ROC-AUC:", 
      np.round(scores, 4), "→", np.round(scores.mean(), 4))

# === 4. 全資料 retrain & 存檔 ===
scaler_full = StandardScaler().fit(X)
X_full_s    = scaler_full.transform(X)
model.fit(X_full_s, y, sample_weight=sw)

os.makedirs('models', exist_ok=True)
joblib.dump({
    'model': model,
    'scaler': scaler_full,
    'le':     le
}, 'models/lgbm_level.joblib')

print("✅ 已儲存 LGBM level 模型到 models/lgbm_level.joblib")

LGBM level 5-fold GroupKFold ROC-AUC: [0.6585 0.8641 0.652  0.8498 0.7076] → 0.7464


## 用測試集測試模型

In [7]:
import pandas as pd
df_test = pd.read_csv("/Users/yuchingchen/Documents/AI_CUP/feature_engineering/test_features.csv")
print(df_test.shape)

(1430, 1081)


In [90]:
import warnings
warnings.filterwarnings(
    "ignore",
    message="X does not have valid feature names.*",
    category=UserWarning,
)

import os
import numpy as np
import pandas as pd
from joblib import load

# === 0. 路徑設定（請依實際環境修改）===
TEST_FEAT_CSV = "/Users/yuchingchen/Documents/AI_CUP/feature_engineering/test_features.csv"
MODELS_DIR    = "/Users/yuchingchen/Documents/AI_CUP/model/models"
OUTPUT_CSV    = "/Users/yuchingchen/Documents/AI_CUP/model/sample_submission.csv"

# === 1. 讀取測試特徵 ===
df_test = pd.read_csv(TEST_FEAT_CSV, dtype={"unique_id": str})
uids    = df_test["unique_id"].values
X_raw   = df_test.drop(columns=["unique_id"]).values

# === 2. 定義任務及模型路徑 ===
tasks = {
    "gender": {
        "model_path": os.path.join(MODELS_DIR, "xgb_gender.joblib"),
        "is_binary":  True,
        "base_col":   "gender"
    },
    "hold_racket_handed": {
        "model_path": os.path.join(MODELS_DIR, "xgb_hold_racket_handed.joblib"),
        "is_binary":  True,
        "base_col":   "hold racket handed"
    },
    "play_years": {
        "model_path": os.path.join(MODELS_DIR, "lgbm_play_years.joblib"),
        "is_binary":  False,
        "base_col":   "play years"
    },
    "level": {
        "model_path": os.path.join(MODELS_DIR, "xgb_level.joblib"),
        "is_binary":  False,
        "base_col":   "level"
    },
}

# === 3. per‐segment 預測並收集所有切片機率，同時把 classes 記錄下來 ===
df_probs = pd.DataFrame({"unique_id": uids})

for tname, cfg in tasks.items():
    # 載入當初存的模型字典（含 model, le, scaler）
    obj     = load(cfg["model_path"])
    clf     = obj["model"]
    le      = obj.get("le", None)
    scaler  = obj["scaler"]

    # 標準化後預測
    X_scaled = scaler.transform(X_raw)
    proba    = clf.predict_proba(X_scaled)  # shape = (n_seg, n_class)

    # 還原原始標籤
    if cfg["is_binary"]:
        # binary: 正類一定對應到 label=1 的那一欄
        classes = np.array([0, 1])
    else:
        # multi: 用存下的 LabelEncoder 才有原始編號 (e.g. [2,3,4,5])
        classes = le.classes_

    cfg["classes"] = classes  # 留到後面使用
    base = cfg["base_col"]

    if cfg["is_binary"]:
        df_probs[base] = proba[:, 1]
    else:
        for idx, cls in enumerate(classes):
            df_probs[f"{base}_{cls}"] = proba[:, idx]

# === 4. group by unique_id，做平均→挑類別→挑最佳切片 ===
records = []
for uid, grp in df_probs.groupby("unique_id"):
    rec = {"unique_id": uid}

    for tname, cfg in tasks.items():
        base    = cfg["base_col"]
        classes = cfg["classes"]

        if cfg["is_binary"]:
            # 二分類：平均所有 segment 的正類 (label=1) 機率
            rec[base] = round(grp[base].mean(), 4)

        else:
            # 多分類：先算各 class 的平均機率 → 選最高平均的 class idx
            cls_cols = [c for c in grp.columns if c.startswith(base + "_")]
            avg      = grp[cls_cols].mean(axis=0).values
            chosen   = int(np.argmax(avg))

            # 再找該 class 在哪一個 segment 最強
            best_seg   = int(np.argmax(grp[cls_cols].values[:, chosen]))
            best_proba = grp[cls_cols].values[best_seg]  # C 維機率向量

            # 寫回該 segment 上所有 class 的機率
            for idx, cls in enumerate(classes):
                rec[f"{base}_{cls}"] = best_proba[idx]

    records.append(rec)

submission = pd.DataFrame(records)

# === 5. 多分類欄位 sum-to-1 + 四捨五入 ===
for cfg in tasks.values():
    if not cfg["is_binary"]:
        base     = cfg["base_col"]
        cls_cols = [c for c in submission.columns if c.startswith(base + "_")]
        mat      = submission[cls_cols].values
        mat      = mat / mat.sum(axis=1, keepdims=True)
        submission[cls_cols] = np.round(mat, 4)

# === 6. 重排欄位並存檔（四捨五入不採科學記號）===
cols = ["unique_id", "gender", "hold racket handed"]

# play_years_* 按照原始 class 排序
py_cols = sorted(
    [c for c in submission.columns if c.startswith("play years_")],
    key=lambda x: int(x.split("_")[1])
)
lv_cols = sorted(
    [c for c in submission.columns if c.startswith("level_")],
    key=lambda x: int(x.split("_")[1])
)
cols += py_cols + lv_cols

submission[cols].to_csv(
    OUTPUT_CSV,
    index=False,
    float_format="%.4f"   # 關閉科學記號，固定四位小數
)
print(f"✅ 已產生 submission：{OUTPUT_CSV}")

✅ 已產生 submission：/Users/yuchingchen/Documents/AI_CUP/model/sample_submission.csv
