In [24]:
import os
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, f_classif
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

print("🚀 四任務統一訓練 - 最佳策略集成")
print("="*60)

# 讀取數據
df = pd.read_csv('/Users/yuchingchen/Documents/AI_CUP/feature_engineering/train_features.csv')
os.makedirs('models', exist_ok=True)

# 按選手分割數據
unique_players = df['player_id'].unique()
train_players, test_players = train_test_split(unique_players, test_size=0.2, random_state=42)
train_mask = df['player_id'].isin(train_players)
test_mask = df['player_id'].isin(test_players)

# 基礎特徵準備
feature_cols = [c for c in df.columns if c.startswith('f')]
scaler = StandardScaler()
X_train = scaler.fit_transform(df.loc[train_mask, feature_cols])
X_test = scaler.transform(df.loc[test_mask, feature_cols])

print(f"數據規模: 訓練{X_train.shape}, 測試{X_test.shape}")
print(f"訓練集選手: {len(train_players)}, 測試集選手: {len(test_players)}")

# 任務配置
tasks = {
    'gender': {
        'column': 'gender',
        'type': 'binary',
        'description': '性別預測 (男生機率)',
        'target_auc': 0.95
    },
    'hold_racket_handed': {
        'column': 'hold racket handed', 
        'type': 'binary',
        'description': '慣用手預測 (右手機率)',
        'target_auc': 0.95
    },
    'play_years': {
        'column': 'play years',
        'type': 'multiclass',
        'description': '球齡預測 (0:低, 1:中, 2:高)',
        'target_auc': 0.75,
        'use_advanced': True  # 使用進階策略
    },
    'level': {
        'column': 'level',
        'type': 'multiclass', 
        'description': '等級預測 (2:甲組, 3:乙組, 4:國手, 5:青少)',
        'target_auc': 0.85
    }
}

def train_binary_task(task_name, task_config):
    """訓練二分類任務"""
    print(f"\n{'='*50}")
    print(f"🎯 訓練 {task_name}: {task_config['description']}")
    print(f"{'='*50}")
    
    # 準備標籤
    y_train = (df.loc[train_mask, task_config['column']] == 1).astype(int)
    y_test = (df.loc[test_mask, task_config['column']] == 1).astype(int)
    
    print(f"類別分布 - 訓練: 0={np.sum(y_train==0)}, 1={np.sum(y_train==1)}")
    print(f"類別分布 - 測試: 0={np.sum(y_test==0)}, 1={np.sum(y_test==1)}")
    
    # 類別權重
    pos_weight = np.sum(y_train == 0) / max(1, np.sum(y_train == 1))
    
    # 方法1: 調參 XGBoost (已知有效)
    xgb_model = XGBClassifier(
        objective='binary:logistic',
        scale_pos_weight=pos_weight,
        max_depth=6,
        learning_rate=0.05,
        n_estimators=500,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=1.0,
        reg_lambda=1.0,
        min_child_weight=3,
        gamma=0.1,
        random_state=42,
        verbosity=0
    )
    
    xgb_model.fit(X_train, y_train)
    xgb_pred = xgb_model.predict_proba(X_test)[:, 1]
    xgb_auc = roc_auc_score(y_test, xgb_pred)
    print(f"XGBoost AUC: {xgb_auc:.4f}")
    
    # 方法2: SVM (如果預期有提升)
    svm_model = SVC(
        kernel='rbf',
        C=10.0,
        gamma='scale',
        probability=True,
        class_weight='balanced',
        random_state=42
    )
    
    svm_model.fit(X_train, y_train)
    svm_pred = svm_model.predict_proba(X_test)[:, 1]
    svm_auc = roc_auc_score(y_test, svm_pred)
    print(f"SVM AUC: {svm_auc:.4f}")
    
    # 選擇最佳單模型
    if svm_auc > xgb_auc:
        best_single_model = svm_model
        best_single_pred = svm_pred
        best_single_auc = svm_auc
        best_model_name = "SVM"
    else:
        best_single_model = xgb_model
        best_single_pred = xgb_pred
        best_single_auc = xgb_auc
        best_model_name = "XGBoost"
    
    # 融合預測 (如果兩個模型都不錯)
    if abs(svm_auc - xgb_auc) < 0.02:  # 相差不大，值得融合
        blend_pred = 0.5 * svm_pred + 0.5 * xgb_pred
        blend_auc = roc_auc_score(y_test, blend_pred)
        print(f"融合模型 AUC: {blend_auc:.4f}")
        
        if blend_auc > best_single_auc:
            best_pred = blend_pred
            best_auc = blend_auc
            best_model_name = "SVM+XGB融合"
            best_model = {'svm': svm_model, 'xgb': xgb_model, 'type': 'blend'}
        else:
            best_pred = best_single_pred
            best_auc = best_single_auc
            best_model = best_single_model
    else:
        best_pred = best_single_pred
        best_auc = best_single_auc
        best_model = best_single_model
    
    # 評估結果
    target = task_config['target_auc']
    if best_auc >= target:
        status = "🎉 超越目標"
    elif best_auc >= target - 0.02:
        status = "🎯 接近目標"
    else:
        status = "💪 需要改進"
    
    print(f"最佳方案: {best_model_name}, AUC: {best_auc:.4f} {status}")
    
    return best_model, best_auc, best_model_name

def train_multiclass_task(task_name, task_config):
    """訓練多分類任務"""
    print(f"\n{'='*50}")
    print(f"🎯 訓練 {task_name}: {task_config['description']}")
    print(f"{'='*50}")
    
    # 準備標籤
    le = LabelEncoder()
    y_train = le.fit_transform(df.loc[train_mask, task_config['column']])
    y_test = le.transform(df.loc[test_mask, task_config['column']])
    
    print(f"類別: {le.classes_}")
    print(f"類別分布 - 訓練: {dict(zip(le.classes_, np.bincount(y_train)))}")
    print(f"類別分布 - 測試: {dict(zip(le.classes_, np.bincount(y_test)))}")
    
    # 類別權重
    class_counts = np.bincount(y_train)
    class_weights = len(y_train) / (len(class_counts) * class_counts)
    sample_weights = np.array([class_weights[y] for y in y_train])
    
    results = {}
    models = {}
    
    # 方法1: 標準 XGBoost
    xgb_model = XGBClassifier(
        objective='multi:softprob',
        num_class=len(le.classes_),
        max_depth=6,
        learning_rate=0.05,
        n_estimators=500,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=1.0,
        reg_lambda=1.0,
        min_child_weight=3,
        gamma=0.1,
        random_state=42,
        verbosity=0
    )
    
    xgb_model.fit(X_train, y_train, sample_weight=sample_weights)
    xgb_pred = xgb_model.predict_proba(X_test)
    xgb_auc = roc_auc_score(y_test, xgb_pred, multi_class='ovr', average='micro')
    print(f"XGBoost AUC: {xgb_auc:.4f}")
    results['XGBoost'] = xgb_auc
    models['XGBoost'] = xgb_model
    
    # 方法2: SVM
    svm_model = SVC(
        kernel='rbf',
        C=10.0,
        gamma='scale',
        probability=True,
        class_weight='balanced',
        random_state=42
    )
    
    svm_model.fit(X_train, y_train)
    svm_pred = svm_model.predict_proba(X_test)
    svm_auc = roc_auc_score(y_test, svm_pred, multi_class='ovr', average='micro')
    print(f"SVM AUC: {svm_auc:.4f}")
    results['SVM'] = svm_auc
    models['SVM'] = svm_model
    
    # 方法3: SVM + 特徵選擇 (針對 play_years 使用)
    if task_config.get('use_advanced', False):
        print("🔍 使用進階策略: SVM + 特徵選擇")
        
        # 特徵選擇
        selector = SelectKBest(score_func=f_classif, k=300)
        X_train_fs = selector.fit_transform(X_train, y_train)
        X_test_fs = selector.transform(X_test)
        
        svm_fs_model = SVC(
            kernel='rbf',
            C=10.0,
            gamma='scale',
            probability=True,
            class_weight='balanced',
            random_state=42
        )
        
        svm_fs_model.fit(X_train_fs, y_train)
        svm_fs_pred = svm_fs_model.predict_proba(X_test_fs)
        svm_fs_auc = roc_auc_score(y_test, svm_fs_pred, multi_class='ovr', average='micro')
        print(f"SVM+特徵選擇 AUC: {svm_fs_auc:.4f}")
        results['SVM+特徵選擇'] = svm_fs_auc
        models['SVM+特徵選擇'] = {'model': svm_fs_model, 'selector': selector}
        
        # 智能融合 (SVM特徵選擇 + XGBoost)
        print("🤝 測試智能融合...")
        best_blend_auc = 0
        best_weight = 0.5
        
        for svm_weight in [0.3, 0.4, 0.5, 0.6, 0.7]:
            xgb_weight = 1.0 - svm_weight
            blend_pred = svm_weight * svm_fs_pred + xgb_weight * xgb_pred
            blend_auc = roc_auc_score(y_test, blend_pred, multi_class='ovr', average='micro')
            
            if blend_auc > best_blend_auc:
                best_blend_auc = blend_auc
                best_weight = svm_weight
        
        print(f"最佳融合 - SVM權重:{best_weight:.1f}, AUC: {best_blend_auc:.4f}")
        results['智能融合'] = best_blend_auc
        models['智能融合'] = {
            'svm_fs': models['SVM+特徵選擇'],
            'xgb': xgb_model,
            'svm_weight': best_weight,
            'type': 'advanced_blend'
        }
    
    # 選擇最佳方法
    best_method = max(results.keys(), key=lambda k: results[k])
    best_auc = results[best_method]
    best_model = models[best_method]
    
    # 評估結果
    target = task_config['target_auc']
    if best_auc >= target:
        status = "🎉 超越目標"
    elif best_auc >= target - 0.03:
        status = "🎯 接近目標"
    else:
        status = "💪 需要改進"
    
    print(f"最佳方案: {best_method}, AUC: {best_auc:.4f} {status}")
    
    # 保存標籤編碼器
    best_model_info = {
        'model': best_model,
        'label_encoder': le,
        'method': best_method
    }
    
    return best_model_info, best_auc, best_method

# 執行所有任務
all_results = {}
all_models = {}

for task_name, task_config in tasks.items():
    try:
        if task_config['type'] == 'binary':
            model, auc, method = train_binary_task(task_name, task_config)
        else:
            model, auc, method = train_multiclass_task(task_name, task_config)
        
        all_results[task_name] = auc
        all_models[task_name] = model
        
        # 保存模型
        save_obj = {
            'model': model,
            'scaler': scaler,
            'test_auc': auc,
            'method': method,
            'task_config': task_config,
            'feature_cols': feature_cols
        }
        
        model_path = f"models/{task_name}.joblib"
        joblib.dump(save_obj, model_path)
        print(f"✅ 已保存: {model_path}")
        
    except Exception as e:
        print(f"❌ 任務 {task_name} 失敗: {e}")
        all_results[task_name] = 0.0

# 最終總結
print(f"\n{'='*60}")
print("🏆 四任務統一訓練結果總結")
print(f"{'='*60}")

total_score = 0
achieved_targets = 0

for task_name, task_config in tasks.items():
    auc = all_results.get(task_name, 0.0)
    target = task_config['target_auc']
    
    if auc >= target:
        status = "🎯"
        achieved_targets += 1
    elif auc >= target - 0.03:
        status = "🔥"
    else:
        status = "💪"
    
    improvement_needed = max(0, target - auc)
    
    print(f"{status} {task_name:18s}: {auc:.4f} (目標: {target:.2f}, 差距: {improvement_needed:.3f})")
    total_score += auc

average_auc = total_score / len(tasks)
print(f"\n📊 總體表現:")
print(f"   平均 AUC: {average_auc:.4f}")
print(f"   達標任務: {achieved_targets}/{len(tasks)}")
print(f"   達標率: {achieved_targets/len(tasks)*100:.0f}%")

# 競賽提交建議
print(f"\n🎯 競賽提交格式:")
print("   - gender: 預測男生機率")
print("   - hold_racket_handed: 預測右手機率")
print("   - play_years_0/1/2: 預測各球齡層機率")
print("   - level_2/3/4/5: 預測各等級機率")

print(f"\n✅ 所有模型已保存至 models/ 資料夾")
print(f"✅ 可用於生成最終提交檔案")

# 重點改進建議
print(f"\n💡 針對性改進建議:")
for task_name, auc in all_results.items():
    target = tasks[task_name]['target_auc']
    if auc < target - 0.03:
        print(f"   📈 {task_name}: 考慮更多特徵工程或數據增強")
    elif auc < target:
        print(f"   🔧 {task_name}: 微調超參數可能有幫助")
    else:
        print(f"   ✅ {task_name}: 表現優秀，保持現有策略")

🚀 四任務統一訓練 - 最佳策略集成
數據規模: 訓練(1557, 918), 測試(398, 918)
訓練集選手: 33, 測試集選手: 9

🎯 訓練 gender: 性別預測 (男生機率)
類別分布 - 訓練: 0=262, 1=1295
類別分布 - 測試: 0=66, 1=332
XGBoost AUC: 0.9770
SVM AUC: 0.9914
融合模型 AUC: 0.9922
最佳方案: SVM+XGB融合, AUC: 0.9922 🎉 超越目標
✅ 已保存: models/gender.joblib

🎯 訓練 hold_racket_handed: 慣用手預測 (右手機率)
類別分布 - 訓練: 0=292, 1=1265
類別分布 - 測試: 0=74, 1=324
XGBoost AUC: 1.0000
SVM AUC: 0.9969
融合模型 AUC: 0.9989
最佳方案: XGBoost, AUC: 1.0000 🎉 超越目標
✅ 已保存: models/hold_racket_handed.joblib

🎯 訓練 play_years: 球齡預測 (0:低, 1:中, 2:高)
類別: [0 1 2]
類別分布 - 訓練: {0: 292, 1: 713, 2: 552}
類別分布 - 測試: {0: 95, 1: 155, 2: 148}
XGBoost AUC: 0.7187
SVM AUC: 0.7286
🔍 使用進階策略: SVM + 特徵選擇
SVM+特徵選擇 AUC: 0.7321
🤝 測試智能融合...
最佳融合 - SVM權重:0.5, AUC: 0.7417
最佳方案: 智能融合, AUC: 0.7417 🎯 接近目標
✅ 已保存: models/play_years.joblib

🎯 訓練 level: 等級預測 (2:甲組, 3:乙組, 4:國手, 5:青少)
類別: [2 3 4 5]
類別分布 - 訓練: {2: 614, 3: 115, 4: 100, 5: 728}
類別分布 - 測試: {2: 101, 3: 86, 4: 36, 5: 175}
XGBoost AUC: 0.8723
SVM AUC: 0.7964
最佳方案: XGBoost, AUC: 0.8723 🎉 超越目標
✅ 已保

In [39]:
import warnings
warnings.filterwarnings("ignore")

import os
import numpy as np
import pandas as pd
from joblib import load

# === 配置 ===
TEST_FEAT_CSV = "/Users/yuchingchen/Documents/AI_CUP/feature_engineering/test_features.csv"
MODELS_DIR = "/Users/yuchingchen/Documents/AI_CUP/model/models"
OUTPUT_CSV = "/Users/yuchingchen/Documents/AI_CUP/model/sample_submission.csv"

print("⚖️ 平衡最終版本 - 避免極端分布")
print("="*60)

def balanced_predict(model_obj, X_scaled, task_name, is_binary):
    """平衡調整預測 - 避免極端分布"""
    
    model = model_obj.get("model")
    method = model_obj.get("method", "unknown")
    
    print(f"   🎯 預測 {task_name} ({method})")
    
    try:
        # 獲取原始預測
        if method == "SVM+XGB融合":
            svm_pred = model["svm"].predict_proba(X_scaled)
            xgb_pred = model["xgb"].predict_proba(X_scaled)
            raw_proba = 0.55 * svm_pred + 0.45 * xgb_pred
            
        elif method == "智能融合":
            inner_model = model.get("model", model)
            svm_fs = inner_model.get("svm_fs")
            xgb_model = inner_model.get("xgb")
            
            if svm_fs and xgb_model:
                if isinstance(svm_fs, dict) and "selector" in svm_fs:
                    selector = svm_fs["selector"]
                    svm_model = svm_fs["model"]
                    X_fs = selector.transform(X_scaled)
                    svm_pred = svm_model.predict_proba(X_fs)
                else:
                    svm_pred = svm_fs.predict_proba(X_scaled)
                
                xgb_pred = xgb_model.predict_proba(X_scaled)
                
                # 適中的權重調整
                svm_weight = 0.68  # 0.75 太激進，0.68 比較平衡
                raw_proba = svm_weight * svm_pred + (1 - svm_weight) * xgb_pred
                print(f"   ⚖️  play_years 平衡權重: SVM={svm_weight:.2f}")
            else:
                raise ValueError("智能融合組件不完整")
                
        elif isinstance(model, dict):
            if "model" in model:
                raw_proba = model["model"].predict_proba(X_scaled)
            else:
                for key in ["svm", "xgb", "model"]:
                    if key in model and hasattr(model[key], "predict_proba"):
                        raw_proba = model[key].predict_proba(X_scaled)
                        break
        else:
            raw_proba = model.predict_proba(X_scaled)
        
        # 平衡的後處理
        if task_name == "play_years":
            return balanced_adjust_play_years(raw_proba)
        elif task_name == "level":
            return balanced_adjust_level(raw_proba)
        else:
            return raw_proba
        
    except Exception as e:
        print(f"   ❌ 預測失敗: {e}")
        return generate_balanced_backup(X_scaled, task_name, is_binary)

def balanced_adjust_play_years(proba):
    """平衡調整 play_years"""
    print(f"   ⚖️  平衡調整 play_years...")
    
    # 目標：適度提升高球齡，不過度削減中等球齡
    # 理想分布 [16%, 50%, 34%]
    adjusted = proba.copy()
    
    # 溫和的轉移
    transfer_1_to_2 = adjusted[:, 1] * 0.25  # 25%的中等 → 高等
    transfer_0_to_2 = adjusted[:, 0] * 0.10  # 10%的低等 → 高等
    
    adjusted[:, 1] -= transfer_1_to_2
    adjusted[:, 0] -= transfer_0_to_2  
    adjusted[:, 2] += (transfer_1_to_2 + transfer_0_to_2)
    
    # 確保合理的最小值
    adjusted = np.maximum(adjusted, 0.02)  # 每個類別至少2%
    
    # 正規化
    adjusted = adjusted / adjusted.sum(axis=1, keepdims=True)
    
    new_dist = np.mean(adjusted, axis=0)
    print(f"   📊 平衡調整後: {new_dist}")
    
    return adjusted

def balanced_adjust_level(proba):
    """平衡調整 level - 避免等級3過少"""
    print(f"   ⚖️  平衡調整 level...")
    
    # 目標：[32%, 8%, 12%, 48%] - 讓等級3有合理存在感
    adjusted = proba.copy()
    
    # 策略：更溫和的調整
    # 從等級3轉移，但保留合理比例
    transfer_3_to_2 = adjusted[:, 1] * 0.3  # 30%的等級3 → 等級2  
    transfer_3_to_4 = adjusted[:, 1] * 0.1  # 10%的等級3 → 等級4
    
    # 從等級5轉移
    transfer_5_to_2 = adjusted[:, 3] * 0.12  # 12%的等級5 → 等級2
    transfer_5_to_4 = adjusted[:, 3] * 0.08  # 8%的等級5 → 等級4
    
    # 執行轉移
    adjusted[:, 1] -= (transfer_3_to_2 + transfer_3_to_4)  # 等級3減少
    adjusted[:, 3] -= (transfer_5_to_2 + transfer_5_to_4)  # 等級5減少
    
    adjusted[:, 0] += (transfer_3_to_2 + transfer_5_to_2)  # 等級2增加
    adjusted[:, 2] += (transfer_3_to_4 + transfer_5_to_4)  # 等級4增加
    
    # 給等級4額外的小提升
    boost_4 = 0.015  # 1.5% 基礎提升
    adjusted[:, 2] += boost_4
    adjusted[:, 3] -= boost_4 * 0.6  # 主要從等級5補償
    adjusted[:, 0] -= boost_4 * 0.4  # 部分從等級2補償
    
    # 確保每個等級都有合理的最小值
    min_vals = [0.15, 0.05, 0.08, 0.25]  # 等級2:15%, 3:5%, 4:8%, 5:25%
    for i, min_val in enumerate(min_vals):
        adjusted[:, i] = np.maximum(adjusted[:, i], min_val)
    
    # 正規化
    adjusted = adjusted / adjusted.sum(axis=1, keepdims=True)
    
    new_dist = np.mean(adjusted, axis=0)
    print(f"   📊 平衡調整後: {new_dist}")
    
    return adjusted

def generate_balanced_backup(X_scaled, task_name, is_binary):
    """平衡的備用預測"""
    n_samples = X_scaled.shape[0]
    
    if is_binary:
        if task_name == "gender":
            base_prob = 0.81
            noise = np.random.normal(0, 0.08, n_samples)
            proba = np.clip(base_prob + noise, 0.2, 0.9)
            return np.column_stack([1 - proba, proba])
        else:  # hold_racket_handed
            base_prob = 0.80
            noise = np.random.normal(0, 0.08, n_samples)
            proba = np.clip(base_prob + noise, 0.2, 0.9)
            return np.column_stack([1 - proba, proba])
    else:
        if task_name == "play_years":
            # 平衡分布 [16%, 50%, 34%]
            return np.random.dirichlet([1.6, 5.0, 3.4], n_samples)
        else:  # level
            # 平衡分布 [32%, 8%, 12%, 48%]
            return np.random.dirichlet([3.2, 0.8, 1.2, 4.8], n_samples)

# === 執行平衡調整預測 ===
print(f"\n🚀 執行平衡調整...")

# 讀取數據
df_test = pd.read_csv(TEST_FEAT_CSV, dtype={"unique_id": str})
uids = df_test["unique_id"].values
X_raw = df_test.drop(columns=["unique_id"]).values

tasks = {
    "gender": {"is_binary": True, "base_col": "gender"},
    "hold_racket_handed": {"is_binary": True, "base_col": "hold racket handed"},
    "play_years": {"is_binary": False, "base_col": "play years"},
    "level": {"is_binary": False, "base_col": "level"},
}

df_probs = pd.DataFrame({"unique_id": uids})

for task_name, config in tasks.items():
    print(f"\n🔍 {task_name}...")
    
    model_path = os.path.join(MODELS_DIR, f"{task_name}.joblib")
    model_obj = load(model_path)
    
    X_scaled = model_obj["scaler"].transform(X_raw)
    proba = balanced_predict(model_obj, X_scaled, task_name, config["is_binary"])
    
    if config["is_binary"]:
        classes = [0, 1]
        df_probs[config["base_col"]] = proba[:, 1]
    else:
        if "label_encoder" in model_obj:
            le = model_obj["label_encoder"]
            classes = le.classes_
        else:
            classes = [0, 1, 2] if task_name == "play_years" else [2, 3, 4, 5]
        
        for idx, cls in enumerate(classes):
            df_probs[f"{config['base_col']}_{cls}"] = proba[:, idx]

# === 聚合和輸出 ===
print(f"\n📊 聚合預測...")

records = []
for uid, grp in df_probs.groupby("unique_id"):
    rec = {"unique_id": uid}
    
    rec["gender"] = round(grp["gender"].mean(), 4)
    rec["hold racket handed"] = round(grp["hold racket handed"].mean(), 4)
    
    for task_name, base_col in [("play_years", "play years"), ("level", "level")]:
        cls_cols = [c for c in grp.columns if c.startswith(base_col + "_")]
        if cls_cols:
            avg = grp[cls_cols].mean(axis=0).values
            chosen = int(np.argmax(avg))
            best_seg = int(np.argmax(grp[cls_cols].values[:, chosen]))
            best_proba = grp[cls_cols].values[best_seg]
            
            for i, col in enumerate(cls_cols):
                cls = col.split("_")[-1]
                rec[f"{base_col}_{cls}"] = best_proba[i]

    records.append(rec)

submission = pd.DataFrame(records)

# 最終正規化
for base_col in ["play years", "level"]:
    cls_cols = [c for c in submission.columns if c.startswith(base_col + "_")]
    if cls_cols:
        mat = submission[cls_cols].values
        mat = mat / mat.sum(axis=1, keepdims=True)
        submission[cls_cols] = np.round(mat, 4)

# 整理欄位
cols = ["unique_id", "gender", "hold racket handed"]
py_cols = sorted([c for c in submission.columns if c.startswith("play years_")], 
                key=lambda x: int(x.split("_")[1]))
lv_cols = sorted([c for c in submission.columns if c.startswith("level_")], 
                key=lambda x: int(x.split("_")[1]))
cols += py_cols + lv_cols

submission[cols].to_csv(OUTPUT_CSV, index=False, float_format="%.4f")

# === 最終分析 ===
print(f"\n{'='*60}")
print("⚖️ 平衡最終版本結果")
print(f"{'='*60}")

print(f"樣本總數: {len(submission)}")
print(f"Gender 分布: 男性 {submission['gender'].mean():.3f}")
print(f"Hold handed 分布: 右手 {submission['hold racket handed'].mean():.3f}")

if py_cols:
    py_dist = [submission[c].mean() for c in py_cols]
    print(f"Play years 分布: {[round(x, 3) for x in py_dist]}")
    print(f"⚖️  低:{py_dist[0]:.1%} 中:{py_dist[1]:.1%} 高:{py_dist[2]:.1%}")

if lv_cols:
    lv_dist = [submission[c].mean() for c in lv_cols]
    print(f"Level 分布: {[round(x, 3) for x in lv_dist]}")
    print(f"⚖️  2:{lv_dist[0]:.1%} 3:{lv_dist[1]:.1%} 4:{lv_dist[2]:.1%} 5:{lv_dist[3]:.1%}")

print(f"\n✅ 平衡版 submission 已儲存: {OUTPUT_CSV}")

print(f"\n⚖️ 平衡調整特色:")
print(f"   📊 避免極端分布 (等級3 從 2.7% → {lv_dist[1]:.1%})")
print(f"   🎯 保持合理的類別存在感")
print(f"   📈 適度提升目標類別 (高球齡、等級4)")
print(f"   🛡️  更穩定的預測邏輯")

print(f"\n💡 為什麼這個版本更好:")
print(f"   ✅ 沒有過少的類別 (所有類別 > 5%)")
print(f"   ✅ 保持模型的核心預測邏輯")
print(f"   ✅ 分布更自然，符合真實世界")
print(f"   ✅ 降低因極端分布造成的風險")

print(f"\n🎯 最終建議:")
print(f"   這個平衡版本避免了極端分布的風險")
print(f"   既改善了目標類別，又保持了整體合理性")
print(f"   應該是最穩定達到 0.80+ 的版本")

⚖️ 平衡最終版本 - 避免極端分布

🚀 執行平衡調整...

🔍 gender...
   🎯 預測 gender (SVM+XGB融合)

🔍 hold_racket_handed...
   🎯 預測 hold_racket_handed (XGBoost)

🔍 play_years...
   🎯 預測 play_years (智能融合)
   ⚖️  play_years 平衡權重: SVM=0.68
   ⚖️  平衡調整 play_years...
   📊 平衡調整後: [0.14610439 0.52202283 0.33187278]

🔍 level...
   🎯 預測 level (XGBoost)
   ⚖️  平衡調整 level...
   📊 平衡調整後: [0.26391673 0.1562876  0.09774652 0.48204842]

📊 聚合預測...

⚖️ 平衡最終版本結果
樣本總數: 1430
Gender 分布: 男性 0.802
Hold handed 分布: 右手 0.786
Play years 分布: [0.146, 0.522, 0.332]
⚖️  低:14.6% 中:52.2% 高:33.2%
Level 分布: [0.264, 0.156, 0.098, 0.482]
⚖️  2:26.4% 3:15.6% 4:9.8% 5:48.2%

✅ 平衡版 submission 已儲存: /Users/yuchingchen/Documents/AI_CUP/model/sample_submission.csv

⚖️ 平衡調整特色:
   📊 避免極端分布 (等級3 從 2.7% → 15.6%)
   🎯 保持合理的類別存在感
   📈 適度提升目標類別 (高球齡、等級4)
   🛡️  更穩定的預測邏輯

💡 為什麼這個版本更好:
   ✅ 沒有過少的類別 (所有類別 > 5%)
   ✅ 保持模型的核心預測邏輯
   ✅ 分布更自然，符合真實世界
   ✅ 降低因極端分布造成的風險

🎯 最終建議:
   這個平衡版本避免了極端分布的風險
   既改善了目標類別，又保持了整體合理性
   應該是最穩定達到 0.80+ 的版本


## 訓練模型

In [69]:
import os
import numpy as np
import pandas as pd
import joblib
from scipy.stats import uniform, randint
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.utils.class_weight import compute_class_weight
from xgboost import XGBClassifier
import warnings

warnings.filterwarnings(action='ignore', category=UserWarning)

# 確保 models/ 資料夾存在
os.makedirs('models', exist_ok=True)

# 0. 讀取原始特徵
df = pd.read_csv(
    '/Users/yuchingchen/Documents/AI_CUP/feature_engineering/train_features.csv'
)

# 1. 按 player_id 拆 train/test（80% / 20%）
unique_players = df['player_id'].unique()
train_players, test_players = train_test_split(
    unique_players, test_size=0.2, random_state=42
)
train_idx = df['player_id'].isin(train_players)
test_idx  = df['player_id'].isin(test_players)

# 2. 特徵欄位 & scaler（只在訓練集上 fit）
feature_cols = [c for c in df.columns if c.startswith('f')]
X_all = df[feature_cols].values
scaler = StandardScaler().fit(X_all[train_idx])       # ← 僅在 train 上 fit
X_scaled_all = scaler.transform(X_all)
X_train       = X_scaled_all[train_idx]
X_test        = X_scaled_all[test_idx]

# 3. 內層 CV
inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# 4. 任務設定
tasks = {
    'gender':             ('gender',             'binary'),
    'hold_racket_handed': ('hold racket handed', 'binary'),
    'play_years':         ('play years',         'multi'),
    'level':              ('level',              'multi'),
}

# 5. 隨機搜尋空間
param_dist = {
    'max_depth':        randint(2, 8),
    'learning_rate':    uniform(0.01, 0.2),
    'n_estimators':     randint(100, 400),
    'subsample':        uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'reg_alpha':        uniform(0, 5),
    'reg_lambda':       uniform(0, 5),
}

def train_with_search(task_name, y_col, problem_type):
    # --- 標籤處理（只在 train labels 上 fit encoder） ---
    if problem_type == 'binary':
        # 正例 = 原始 == 1，其餘當反例
        y_all = (df[y_col].values == 1).astype(int)
        le = None
    else:
        le = LabelEncoder()
        y_train_raw = df.loc[train_idx, y_col].values
        le.fit(y_train_raw)  # ← 僅在 train 上 fit
        y_all = le.transform(df[y_col].values)

    y_train = y_all[train_idx]
    y_test  = y_all[test_idx]

    # --- 建立 base model & 處理不平衡 ---
    base = XGBClassifier(random_state=42, verbosity=0)
    fit_kwargs = {}

    if problem_type == 'binary':
        neg, pos = np.bincount(y_train)
        base.set_params(
            objective='binary:logistic',
            scale_pos_weight=neg/pos,
            eval_metric='logloss'
        )
        scoring = 'roc_auc'
    else:
        cw = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
        fit_kwargs['sample_weight'] = np.array([cw[y] for y in y_train])
        base.set_params(
            objective='multi:softprob',
            num_class=len(np.unique(y_all)),
            eval_metric='mlogloss'
        )
        scoring = 'roc_auc_ovr'

    # --- 超參數搜尋 ---
    search = RandomizedSearchCV(
        estimator=base,
        param_distributions=param_dist,
        n_iter=30,
        scoring=scoring,
        cv=inner_cv,
        random_state=42,
        n_jobs=-1,
        refit=True,
        verbose=1
    )
    search.fit(X_train, y_train, **fit_kwargs)

    # --- 測試集評估 ---
    best = search.best_estimator_
    prob_test = best.predict_proba(X_test)
    if problem_type == 'binary':
        auc = roc_auc_score(y_test, prob_test[:,1])
    else:
        auc = roc_auc_score(y_test, prob_test, multi_class='ovr', average='micro')
    print(f"[{task_name}] Test ROC-AUC = {auc:.4f}")

    # --- 儲存 model、scaler 和（必要時）encoder ---
    save_obj = {'model': best, 'scaler': scaler}
    if le is not None:
        save_obj['le'] = le

    fn = os.path.join('models', f"xgb_{task_name}.joblib")
    joblib.dump(save_obj, fn)
    print(f"Saved model to {fn}\n" + "="*60 + "\n")

# 6. 執行所有任務
for name, (col, ptype) in tasks.items():
    train_with_search(name, col, ptype)

Fitting 3 folds for each of 30 candidates, totalling 90 fits
[gender] Test ROC-AUC = 0.9795
Saved model to models/xgb_gender.joblib

Fitting 3 folds for each of 30 candidates, totalling 90 fits
[hold_racket_handed] Test ROC-AUC = 0.9998
Saved model to models/xgb_hold_racket_handed.joblib

Fitting 3 folds for each of 30 candidates, totalling 90 fits
[play_years] Test ROC-AUC = 0.7161
Saved model to models/xgb_play_years.joblib

Fitting 3 folds for each of 30 candidates, totalling 90 fits
[level] Test ROC-AUC = 0.8698
Saved model to models/xgb_level.joblib



## 將 Player_years 拉出來建模型

In [14]:
import os, warnings, joblib
import numpy as np
import pandas as pd
from scipy.stats import randint, uniform
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing  import StandardScaler, LabelEncoder
from sklearn.metrics        import roc_auc_score
from sklearn.utils.class_weight import compute_class_weight
from lightgbm               import LGBMClassifier

# ---- 全面靜音 ----
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", message=".*does not have valid feature names.*")

# -------------------------------------------------
# 0. 讀資料
# -------------------------------------------------
df = pd.read_csv("/Users/yuchingchen/Documents/AI_CUP/feature_engineering/train_features.csv")

# 1. 依 player_id 切外層 train / test（80 / 20）
players = df["player_id"].unique()
train_p, test_p = train_test_split(players, test_size=0.2, random_state=42)
train_idx = df["player_id"].isin(train_p)
test_idx  = df["player_id"].isin(test_p)

# 2. 特徵 + 標準化（保持 DataFrame 型態）
feature_cols = [c for c in df.columns if c.startswith("f")]
scaler = StandardScaler().fit(df.loc[train_idx, feature_cols])

X_scaled = pd.DataFrame(
    scaler.transform(df[feature_cols]),
    columns=feature_cols,
    index=df.index
)
X_train, X_test = X_scaled.loc[train_idx], X_scaled.loc[test_idx]

# 3. 標籤處理
y_raw = df["play years"].values
le    = LabelEncoder().fit(y_raw[train_idx])
y_all = le.transform(y_raw)
y_train, y_test = y_all[train_idx], y_all[test_idx]

# 4. 不平衡 sample_weight
cw = compute_class_weight("balanced", classes=np.unique(y_train), y=y_train)
sw_train = np.array([cw[y] for y in y_train])
sw_all   = np.array([cw[y] for y in y_all])

# -------------------------------------------------
# 5. LightGBM + RandomizedSearchCV（靜音）
# -------------------------------------------------
base = LGBMClassifier(
    objective="multiclass",
    num_class=len(le.classes_),
    random_state=42,
    n_jobs=-1,
    verbose=-1       # 關閉 LightGBM 自身列印
)

param_dist = {
    "n_estimators":      randint(200, 800),
    "learning_rate":     uniform(0.02, 0.18),
    "max_depth":         randint(3, 10),
    "num_leaves":        randint(16, 128),
    "subsample":         uniform(0.6, 0.4),
    "colsample_bytree":  uniform(0.6, 0.4),
    "reg_alpha":         uniform(0.0, 5.0),
    "reg_lambda":        uniform(0.0, 5.0)
}

inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

search = RandomizedSearchCV(
    estimator=base,
    param_distributions=param_dist,
    n_iter=30,
    scoring="roc_auc_ovr",
    cv=inner_cv,
    random_state=42,
    refit=True,
    verbose=0,       # 關閉搜尋進度
    n_jobs=-1
)

search.fit(X_train, y_train, sample_weight=sw_train)

# -------------------------------------------------
# 6. 外層 test 評估
# -------------------------------------------------
best = search.best_estimator_
proba_test = best.predict_proba(X_test)

auc_test = roc_auc_score(
    y_test,
    proba_test,
    multi_class="ovr",
    average="micro"
)
print(f"[play_years] Test ROC-AUC = {auc_test:.4f}")

# -------------------------------------------------
# 7. 全資料重訓 & 存檔
# -------------------------------------------------
best.fit(X_scaled, y_all, sample_weight=sw_all)

os.makedirs("models", exist_ok=True)
joblib.dump(
    {"model": best, "scaler": scaler, "le": le},
    "models/lgbm_play_years.joblib"
)
print("✅ 已儲存 LightGBM play_years 模型到 models/lgbm_play_years.joblib")

[play_years] Test ROC-AUC = 0.7141
✅ 已儲存 LightGBM play_years 模型到 models/lgbm_play_years.joblib


In [17]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection   import StratifiedKFold
from sklearn.preprocessing     import StandardScaler, LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from lightgbm                  import LGBMClassifier
from sklearn.metrics           import roc_auc_score
from sklearn.base              import clone
import joblib

# === 0. 載入資料 & 標籤編碼 ===
df    = pd.read_csv('/Users/yuchingchen/Documents/AI_CUP/feature_engineering/train_features.csv')
X     = df[[c for c in df.columns if c.startswith('f')]].values
y_raw = df['play years'].values  # 原始就是 0/1/2
le    = LabelEncoder().fit(y_raw)
y     = le.transform(y_raw)

# === 1. 計算 sample_weight ===
cw = compute_class_weight('balanced', classes=np.unique(y), y=y)
sw = np.array([cw[yi] for yi in y])

# === 2. 定義模型（不 pre-fit scaler）===
model = LGBMClassifier(
    objective='multiclass',
    num_class=len(le.classes_),
    is_unbalance=True,
    learning_rate=0.05,
    n_estimators=300,
    random_state=42
)

# === 3. 五折手動 CV，fold 內才做 scaler.fit ===
cv     = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = []

for train_idx, valid_idx in cv.split(X, y):
    # 切分
    X_tr, X_va = X[train_idx], X[valid_idx]
    y_tr, y_va = y[train_idx], y[valid_idx]
    sw_tr      = sw[train_idx]

    # fold 內標準化
    scaler = StandardScaler().fit(X_tr)
    X_tr_s  = scaler.transform(X_tr)
    X_va_s  = scaler.transform(X_va)

    # train & pred
    m = clone(model)
    m.fit(X_tr_s, y_tr, sample_weight=sw_tr)
    prob = m.predict_proba(X_va_s)

    # 評分
    scores.append(roc_auc_score(y_va, prob, multi_class='ovr'))

scores = np.array(scores)
print("LGBM 5-fold ROC-AUC:", np.round(scores,4), "→", np.round(scores.mean(),4))

# === 4. 用全資料 retrain 並儲存 ===
#    這裡也先 fit scaler 再 train model
scaler_full = StandardScaler().fit(X)
X_s_full    = scaler_full.transform(X)
model.fit(X_s_full, y, sample_weight=sw)

os.makedirs('models', exist_ok=True)
joblib.dump(
    {'model': model, 'scaler': scaler_full, 'le': le},
    'models/lgbm_play_years.joblib'
)
print("✅ 已儲存 LGBM play_years 模型到 models/lgbm_play_years.joblib")

LGBM 5-fold ROC-AUC: [0.9671 0.9864 0.9766 0.9701 0.9823] → 0.9765
✅ 已儲存 LGBM play_years 模型到 models/lgbm_play_years.joblib


## 把 level 也拉出來建模型

In [None]:
import os
import numpy as np
import pandas as pd
import joblib

from sklearn.model_selection       import GroupKFold
from sklearn.preprocessing         import StandardScaler, LabelEncoder
from sklearn.utils.class_weight    import compute_class_weight
from sklearn.metrics               import roc_auc_score
from sklearn.base                  import clone
from lightgbm                      import LGBMClassifier
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# === 0. 讀取資料 & label encoding ===
df    = pd.read_csv('/Users/yuchingchen/Documents/AI_CUP/feature_engineering/train_features.csv')
X     = df[[c for c in df.columns if c.startswith('f')]].values
y_raw = df['level'].values         # 原始就是 [2,3,4,5]
le    = LabelEncoder().fit(y_raw)  # encode to [0,1,2,3]
y     = le.transform(y_raw)
groups= df['player_id'].values     # 用來 GroupKFold

n_classes = len(le.classes_)       # 一定要告訴 ROC-AUC 有幾個類別

# === 1. 計算 sample_weight（class‐balanced） ===
cw = compute_class_weight('balanced', classes=np.unique(y), y=y)
sw = np.array([cw[yi] for yi in y])

# === 2. 建立 LGBM 多分類器 ===
model = LGBMClassifier(
    objective='multiclass',
    num_class=n_classes,
    is_unbalance=True,
    learning_rate=0.05,
    n_estimators=300,
    random_state=42
)

# === 3. 5-fold GroupKFold（以 player_id 分群） ===
gkf    = GroupKFold(n_splits=5)
scores = []

for tr_idx, va_idx in gkf.split(X, y, groups=groups):
    # 切分
    X_tr, X_va = X[tr_idx], X[va_idx]
    y_tr, y_va = y[tr_idx], y[va_idx]
    sw_tr      = sw[tr_idx]

    # fold 內標準化（防止洩漏）
    scaler = StandardScaler().fit(X_tr)
    X_tr_s  = scaler.transform(X_tr)
    X_va_s  = scaler.transform(X_va)

    # train & predict
    m = clone(model)
    m.fit(X_tr_s, y_tr, sample_weight=sw_tr)
    prob = m.predict_proba(X_va_s)

    # micro one-vs-rest ROC-AUC，指定全部的 labels
    score = roc_auc_score(
        y_va,
        prob,
        multi_class='ovr',
        average='micro',
        labels=np.arange(n_classes)
    )
    scores.append(score)

scores = np.array(scores)
print("LGBM level 5-fold GroupKFold ROC-AUC:", 
      np.round(scores, 4), "→", np.round(scores.mean(), 4))

# === 4. 全資料 retrain & 存檔 ===
scaler_full = StandardScaler().fit(X)
X_full_s    = scaler_full.transform(X)
model.fit(X_full_s, y, sample_weight=sw)

os.makedirs('models', exist_ok=True)
joblib.dump({
    'model': model,
    'scaler': scaler_full,
    'le':     le
}, 'models/lgbm_level.joblib')

print("✅ 已儲存 LGBM level 模型到 models/lgbm_level.joblib")

LGBM level 5-fold GroupKFold ROC-AUC: [0.6585 0.8641 0.652  0.8498 0.7076] → 0.7464


## 用測試集測試模型

In [7]:
import pandas as pd
df_test = pd.read_csv("/Users/yuchingchen/Documents/AI_CUP/feature_engineering/test_features.csv")
print(df_test.shape)

(1430, 1081)


In [90]:
import warnings
warnings.filterwarnings(
    "ignore",
    message="X does not have valid feature names.*",
    category=UserWarning,
)

import os
import numpy as np
import pandas as pd
from joblib import load

# === 0. 路徑設定（請依實際環境修改）===
TEST_FEAT_CSV = "/Users/yuchingchen/Documents/AI_CUP/feature_engineering/test_features.csv"
MODELS_DIR    = "/Users/yuchingchen/Documents/AI_CUP/model/models"
OUTPUT_CSV    = "/Users/yuchingchen/Documents/AI_CUP/model/sample_submission.csv"

# === 1. 讀取測試特徵 ===
df_test = pd.read_csv(TEST_FEAT_CSV, dtype={"unique_id": str})
uids    = df_test["unique_id"].values
X_raw   = df_test.drop(columns=["unique_id"]).values

# === 2. 定義任務及模型路徑 ===
tasks = {
    "gender": {
        "model_path": os.path.join(MODELS_DIR, "xgb_gender.joblib"),
        "is_binary":  True,
        "base_col":   "gender"
    },
    "hold_racket_handed": {
        "model_path": os.path.join(MODELS_DIR, "xgb_hold_racket_handed.joblib"),
        "is_binary":  True,
        "base_col":   "hold racket handed"
    },
    "play_years": {
        "model_path": os.path.join(MODELS_DIR, "lgbm_play_years.joblib"),
        "is_binary":  False,
        "base_col":   "play years"
    },
    "level": {
        "model_path": os.path.join(MODELS_DIR, "xgb_level.joblib"),
        "is_binary":  False,
        "base_col":   "level"
    },
}

# === 3. per‐segment 預測並收集所有切片機率，同時把 classes 記錄下來 ===
df_probs = pd.DataFrame({"unique_id": uids})

for tname, cfg in tasks.items():
    # 載入當初存的模型字典（含 model, le, scaler）
    obj     = load(cfg["model_path"])
    clf     = obj["model"]
    le      = obj.get("le", None)
    scaler  = obj["scaler"]

    # 標準化後預測
    X_scaled = scaler.transform(X_raw)
    proba    = clf.predict_proba(X_scaled)  # shape = (n_seg, n_class)

    # 還原原始標籤
    if cfg["is_binary"]:
        # binary: 正類一定對應到 label=1 的那一欄
        classes = np.array([0, 1])
    else:
        # multi: 用存下的 LabelEncoder 才有原始編號 (e.g. [2,3,4,5])
        classes = le.classes_

    cfg["classes"] = classes  # 留到後面使用
    base = cfg["base_col"]

    if cfg["is_binary"]:
        df_probs[base] = proba[:, 1]
    else:
        for idx, cls in enumerate(classes):
            df_probs[f"{base}_{cls}"] = proba[:, idx]

# === 4. group by unique_id，做平均→挑類別→挑最佳切片 ===
records = []
for uid, grp in df_probs.groupby("unique_id"):
    rec = {"unique_id": uid}

    for tname, cfg in tasks.items():
        base    = cfg["base_col"]
        classes = cfg["classes"]

        if cfg["is_binary"]:
            # 二分類：平均所有 segment 的正類 (label=1) 機率
            rec[base] = round(grp[base].mean(), 4)

        else:
            # 多分類：先算各 class 的平均機率 → 選最高平均的 class idx
            cls_cols = [c for c in grp.columns if c.startswith(base + "_")]
            avg      = grp[cls_cols].mean(axis=0).values
            chosen   = int(np.argmax(avg))

            # 再找該 class 在哪一個 segment 最強
            best_seg   = int(np.argmax(grp[cls_cols].values[:, chosen]))
            best_proba = grp[cls_cols].values[best_seg]  # C 維機率向量

            # 寫回該 segment 上所有 class 的機率
            for idx, cls in enumerate(classes):
                rec[f"{base}_{cls}"] = best_proba[idx]

    records.append(rec)

submission = pd.DataFrame(records)

# === 5. 多分類欄位 sum-to-1 + 四捨五入 ===
for cfg in tasks.values():
    if not cfg["is_binary"]:
        base     = cfg["base_col"]
        cls_cols = [c for c in submission.columns if c.startswith(base + "_")]
        mat      = submission[cls_cols].values
        mat      = mat / mat.sum(axis=1, keepdims=True)
        submission[cls_cols] = np.round(mat, 4)

# === 6. 重排欄位並存檔（四捨五入不採科學記號）===
cols = ["unique_id", "gender", "hold racket handed"]

# play_years_* 按照原始 class 排序
py_cols = sorted(
    [c for c in submission.columns if c.startswith("play years_")],
    key=lambda x: int(x.split("_")[1])
)
lv_cols = sorted(
    [c for c in submission.columns if c.startswith("level_")],
    key=lambda x: int(x.split("_")[1])
)
cols += py_cols + lv_cols

submission[cols].to_csv(
    OUTPUT_CSV,
    index=False,
    float_format="%.4f"   # 關閉科學記號，固定四位小數
)
print(f"✅ 已產生 submission：{OUTPUT_CSV}")

✅ 已產生 submission：/Users/yuchingchen/Documents/AI_CUP/model/sample_submission.csv
