## 使用 Catboost 訓練模型

In [21]:
import os
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier

# === 1. 讀取資料 ===
df = pd.read_csv("/Users/yuchingchen/Documents/AI_CUP/feature_engineering/train_features.csv")
X_base = df.drop(columns=['gender', 'hand', 'play_years', 'level'])
y_gender = df['gender'] - 1
y_hand = df['hand'] - 1
y_years = df['play_years']
y_level = df['level']

# === 2. 特徵標準化（僅儲存，用不到於 CatBoost）===
scaler = StandardScaler()
scaler.fit(X_base)  # 儲存用，不實際使用於 CatBoost

# === 3. 訓練集切分 ===
X_train_g, X_test_g, y_gender_train, y_gender_test = train_test_split(X_base, y_gender, test_size=0.2, random_state=42, stratify=y_gender)
X_train_h, X_test_h, y_hand_train, y_hand_test = train_test_split(X_base, y_hand, test_size=0.2, random_state=42, stratify=y_hand)
X_train_y, X_test_y, y_years_train, y_years_test = train_test_split(X_base, y_years, test_size=0.2, random_state=42, stratify=y_years)
X_train_l, X_test_l, y_level_train, y_level_test = train_test_split(X_base, y_level, test_size=0.2, random_state=42, stratify=y_level)

# === 4. 建立模型資料夾與 AUC 評分容器 ===
os.makedirs("catboost_saved_models", exist_ok=True)
auc_dict = {}

# === 5. gender 模型 ===
w_gender = compute_sample_weight(class_weight='balanced', y=y_gender_train)
model_gender = CatBoostClassifier(verbose=0, random_state=42)
model_gender.fit(X_train_g, y_gender_train, sample_weight=w_gender)
prob_gender = model_gender.predict_proba(X_test_g)[:, 1]
auc_dict["gender_auc"] = roc_auc_score(y_gender_test, prob_gender)
joblib.dump(model_gender, "catboost_saved_models/model_gender_cat.pkl")

# === 6. hand 模型 ===
w_hand = compute_sample_weight(class_weight='balanced', y=y_hand_train)
model_hand = CatBoostClassifier(verbose=0, random_state=42)
model_hand.fit(X_train_h, y_hand_train, sample_weight=w_hand)
prob_hand = model_hand.predict_proba(X_test_h)[:, 1]
auc_dict["hand_auc"] = roc_auc_score(y_hand_test, prob_hand)
joblib.dump(model_hand, "catboost_saved_models/model_hand_cat.pkl")

# === 7. play_years One-vs-Rest 模型 ===
classes_years = np.sort(y_years.unique())
probs_years = []
models_years = {}

for c in classes_years:
    y_bin = (y_years_train == c).astype(int)
    w = compute_sample_weight(class_weight='balanced', y=y_bin)
    model = CatBoostClassifier(verbose=0, random_state=42)
    model.fit(X_train_y, y_bin, sample_weight=w)
    p = model.predict_proba(X_test_y)[:, 1]
    probs_years.append(p)
    models_years[int(c)] = model

probs_years = np.vstack(probs_years).T
auc_dict["play_years_auc"] = roc_auc_score(pd.get_dummies(y_years_test), probs_years, average='micro')
joblib.dump({"models": models_years, "classes": classes_years}, "catboost_saved_models/model_years_cat.pkl")

# === 8. level One-vs-Rest + GridSearch 模型 ===
classes_level = np.sort(y_level.unique())
probs_level = []
models_level = {}
param_grid = {
    "learning_rate": [0.05, 0.1],
    "depth": [4, 5],
    "iterations": [200, 300]
}

for c in classes_level:
    y_bin = (y_level_train == c).astype(int)
    w = compute_sample_weight(class_weight='balanced', y=y_bin)
    best_auc = -1
    best_model = None

    for lr in param_grid['learning_rate']:
        for d in param_grid['depth']:
            for n in param_grid['iterations']:
                model = CatBoostClassifier(verbose=0, random_state=42, learning_rate=lr, depth=d, iterations=n)
                model.fit(X_train_l, y_bin, sample_weight=w)
                p = model.predict_proba(X_test_l)[:, 1]
                auc = roc_auc_score((y_level_test == c).astype(int), p)

                if auc > best_auc:
                    best_auc = auc
                    best_model = model

    p_best = best_model.predict_proba(X_test_l)[:, 1]
    probs_level.append(p_best)
    models_level[int(c)] = best_model

probs_level = np.vstack(probs_level).T
auc_dict["level_auc"] = roc_auc_score(pd.get_dummies(y_level_test), probs_level, average='micro')
joblib.dump({"models": models_level, "classes": classes_level}, "catboost_saved_models/model_level_cat_grid.pkl")

# === 9. 儲存 scaler、AUC 統計 ===
joblib.dump(scaler, "catboost_saved_models/scaler.pkl")
auc_dict["mean_auc"] = np.mean(list(auc_dict.values()))
print("✅ 模型訓練完成 AUCs:")
for k, v in auc_dict.items():
    print(f"{k:20s}: {v:.4f}")
print("✅ 所有模型與 scaler 已儲存")

✅ 模型訓練完成 AUCs:
gender_auc          : 0.9712
hand_auc            : 1.0000
play_years_auc      : 0.9832
level_auc           : 0.9922
mean_auc            : 0.9867
✅ 所有模型與 scaler 已儲存


## 用測試集測試模型

In [7]:
import pandas as pd
df_test = pd.read_csv("/Users/yuchingchen/Documents/AI_CUP/feature_engineering/test_features.csv")
print(df_test.shape)

(1430, 1081)


In [9]:
import pandas as pd
import numpy as np
import joblib

# === 路徑設定 ===
test_feature_path = "/Users/yuchingchen/Documents/AI_CUP/feature_engineering/test_features.csv"
submission_output_path = "/Users/yuchingchen/Documents/AI_CUP/model/sample_submission.csv"

# === 讀取測試資料 ===
df_test = pd.read_csv(test_feature_path)
X_test = df_test.drop(columns=['unique_id'])
unique_ids = df_test['unique_id']

# === 載入模型與 scaler、類別資訊 ===
scaler = joblib.load("catboost_saved_models/scaler.pkl")
model_gender = joblib.load("catboost_saved_models/model_gender_cat.pkl")
model_hand = joblib.load("catboost_saved_models/model_hand_cat.pkl")
model_years = joblib.load("catboost_saved_models/model_years_cat.pkl")
model_level = joblib.load("catboost_saved_models/model_level_cat_grid.pkl")

# === 標準化（只給 gender / hand 用）===
X_test_scaled = scaler.transform(X_test)

# === 二分類預測 ===
gender_probs = model_gender.predict_proba(X_test_scaled)[:, 1]
hand_probs = model_hand.predict_proba(X_test_scaled)[:, 1]

# === play_years 預測（One-vs-Rest）===
years_probs = []
for c in model_years['classes']:
    model = model_years['models'][c]
    p = model.predict_proba(X_test)[:, 1]
    years_probs.append(p)
years_probs = np.vstack(years_probs).T

# === level 預測（One-vs-Rest + GridSearch）===
level_probs = []
for c in model_level['classes']:
    model = model_level['models'][c]
    p = model.predict_proba(X_test)[:, 1]
    level_probs.append(p)
level_probs = np.vstack(level_probs).T

# === 建立 dataframe（含所有機率）===
df_probs = pd.DataFrame({
    "unique_id": unique_ids,
    "gender": gender_probs,
    "hold racket handed": hand_probs
})
for i, cls in enumerate(model_years['classes']):
    df_probs[f"play years_{cls}"] = years_probs[:, i]
for i, cls in enumerate(model_level['classes']):
    df_probs[f"level_{cls}"] = level_probs[:, i]

# === 找出每位 unique_id 的最有信心預測段 ===
def select_most_confident(group):
    uid = group.name  # 取得 groupby 的 key，也就是 unique_id
    prob_cols = group.columns
    group['confidence'] = group[prob_cols].max(axis=1)
    row = group.loc[group['confidence'].idxmax()].drop('confidence')
    row['unique_id'] = uid
    return row

submission = df_probs.groupby("unique_id").apply(
    select_most_confident, include_groups=False
).reset_index(drop=True)

# === 欄位順序排序 ===
final_cols = [
    "unique_id", "gender", "hold racket handed",
    "play years_0", "play years_1", "play years_2",
    "level_2", "level_3", "level_4", "level_5"
]
submission = submission[final_cols]

# === 四捨五入 + 將 id 轉成整數 ===
submission = submission.round(4)
submission["unique_id"] = submission["unique_id"].astype(int)

# === 輸出 CSV ===
submission.to_csv(submission_output_path, index=False, float_format="%.4f")
print(f"✅ 已輸出整合後提交檔：{submission_output_path}，共 {len(submission)} 筆")

✅ 已輸出整合後提交檔：/Users/yuchingchen/Documents/AI_CUP/model/sample_submission.csv，共 1430 筆
