In [20]:
from pathlib import Path
import numpy as np
import pandas as pd
import math
import csv
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
from catboost.utils import get_gpu_device_count
# from lightgbm import LGBMClassifier
# from lightgbm import early_stopping, log_evaluation
import joblib
import os
from datetime import datetime

### 特徵工程

In [21]:
from utils.fft import data_generate
# Input/output directories for the feature-generation step.
# The data_generate(...) calls are intentionally left commented out, so this
# cell only (re)binds `datapath` and `tar_dir`; uncomment a call to rebuild the
# corresponding tabular features (presumably raw data -> tabular CSVs; confirm
# in utils/fft.py).

# Test split
datapath, tar_dir = (
    './dataset/39_Test_Dataset/test_data',
    './dataset/39_Test_Dataset/tabular_data_test',
)
# data_generate(datapath, tar_dir)

# Training split (these are the values left bound after the cell runs)
datapath, tar_dir = (
    './dataset/39_Training_Dataset/train_data',
    './dataset/39_Training_Dataset/tabular_data_train',
)
# data_generate(datapath, tar_dir)

### Training

In [22]:
from utils.model import model
def train(train_info_path: str, datapath: str, group_size: int = 27):
    """Train one classifier per target and save them together with the scaler.

    Players found in ``train_info_path`` are split 80/20 by ``player_id``
    (fixed ``random_state=42``), so all rows of one player land on the same
    side of the split. Every feature CSV under ``datapath`` is expected to be
    named ``<unique_id>.csv``; each of its rows receives the labels of the
    matching ``unique_id`` row in the info table. Files whose id is missing
    from the info table, and files without any rows, are skipped.

    Args:
        train_info_path: CSV containing at least the columns ``unique_id``,
            ``player_id``, ``gender``, ``hold racket handed``, ``play years``
            and ``level``.
        datapath: Directory searched recursively for feature CSV files.
        group_size: Forwarded unchanged to ``utils.model.model``. Presumably
            the number of feature rows per recording (the original notes
            mention 27 rows per file) -- confirm in utils/model.py.

    Raises:
        ValueError: If no feature rows end up in the training partition or in
            the held-out partition. scikit-learn may also raise, e.g. when the
            held-out partition contains a label never seen during fitting.

    Side effects:
        Creates ``models/`` if needed and writes ``scaler.pkl``,
        ``gender_model.pkl``, ``hold_model.pkl``, ``years_model.pkl`` and
        ``level_model.pkl`` into it.
    """
    # Split by player, not by row, so one player never appears on both sides.
    info = pd.read_csv(train_info_path)
    unique_players = info['player_id'].unique()
    train_players, test_players = train_test_split(unique_players, test_size=0.2, random_state=42)
    # Sets give O(1) lookups and avoid NumPy's element-wise `in` semantics.
    train_players = set(train_players)
    test_players = set(test_players)

    target_mask = ['gender', 'hold racket handed', 'play years', 'level']

    # Collect per-file pieces and concatenate once after the loop; growing a
    # DataFrame with pd.concat inside the loop copies all previous rows each time.
    x_train_parts, y_train_parts = [], []
    x_test_parts, y_test_parts = [], []

    # Sorted so the row order (and therefore training) does not depend on the
    # filesystem's directory enumeration order.
    for file in sorted(Path(datapath).glob('**/*.csv')):
        unique_id = int(file.stem)  # file name (without extension) is the unique_id
        row = info[info['unique_id'] == unique_id]
        if row.empty:
            continue
        data = pd.read_csv(file)
        if data.empty:
            # Nothing to learn from, and pd.concat([]) below would raise.
            continue
        player_id = row['player_id'].iloc[0]
        # Repeat the (first) matching label row once per feature row so that
        # features and labels stay aligned row-for-row.
        target = row[target_mask].iloc[[0]]
        target_repeated = pd.concat([target] * len(data), ignore_index=True)
        if player_id in train_players:
            x_train_parts.append(data)
            y_train_parts.append(target_repeated)
        elif player_id in test_players:
            x_test_parts.append(data)
            y_test_parts.append(target_repeated)

    if not x_train_parts or not x_test_parts:
        raise ValueError(
            f"No usable feature rows for the training and/or held-out split under {datapath!r}"
        )

    x_train = pd.concat(x_train_parts, ignore_index=True)
    y_train = pd.concat(y_train_parts, ignore_index=True)
    x_test = pd.concat(x_test_parts, ignore_index=True)
    y_test = pd.concat(y_test_parts, ignore_index=True)

    # Scale every feature to [0, 1]; the scaler is fitted on training rows only.
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(x_train)
    X_test_scaled = scaler.transform(x_test)

    model_list = ["random_forest", "catboost"]
    model_str = model_list[1]  # CatBoost is the model currently in use

    # (label column, mode passed to utils.model.model, output path)
    targets = [
        ('gender', "binary", "models/gender_model.pkl"),
        ('hold racket handed', "binary", "models/hold_model.pkl"),
        ('play years', "multiary", "models/years_model.pkl"),
        ('level', "multiary", "models/level_model.pkl"),
    ]

    # Train and evaluate one classifier per target. Labels are re-encoded to
    # 0..k-1 in sorted order; the encoders are not saved, so downstream code
    # has to rely on that sorted class order.
    classifiers = []
    for column, mode, out_path in targets:
        le = LabelEncoder()
        y_train_le = le.fit_transform(y_train[column])
        y_test_le = le.transform(y_test[column])
        clf = model(
            X_train_scaled, y_train_le, X_test_scaled, y_test_le,
            mode=mode, model_str=model_str, group_size=group_size,
        )
        classifiers.append((clf, out_path))

    # Persist only after every model trained successfully.
    os.makedirs("models", exist_ok=True)
    joblib.dump(scaler, "models/scaler.pkl")
    for clf, out_path in classifiers:
        joblib.dump(clf, out_path)

In [23]:
# Run training on the tabular training features; train() writes its fitted
# scaler and models into ./models.
datapath = './dataset/39_Training_Dataset/tabular_data_train'
train_info_path = './dataset/39_Training_Dataset/train_info.csv'
train(train_info_path=train_info_path, datapath=datapath)

0:	test: 0.6811155	best: 0.6811155 (0)	total: 34.6ms	remaining: 5m 45s
100:	test: 0.8983613	best: 0.8985631 (23)	total: 1.65s	remaining: 2m 41s
200:	test: 0.8927701	best: 0.9010476 (102)	total: 3.3s	remaining: 2m 41s
300:	test: 0.8767101	best: 0.9010476 (102)	total: 5.14s	remaining: 2m 45s
400:	test: 0.8621541	best: 0.9010476 (102)	total: 6.9s	remaining: 2m 45s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 0.9010476374
bestIteration = 102

Shrink model to first 103 iterations.
Binary AUC of Catboost: 0.9210478276743337 

0:	test: 0.9740716	best: 0.9740716 (0)	total: 29.3ms	remaining: 4m 53s
100:	test: 0.9997638	best: 0.9997657 (99)	total: 1.75s	remaining: 2m 51s
200:	test: 0.9998157	best: 0.9998158 (199)	total: 3.82s	remaining: 3m 6s
300:	test: 0.9998455	best: 0.9998455 (300)	total: 5.8s	remaining: 3m 6s
400:	test: 0.9998552	best: 0.9998554 (396)	total: 7.48s	remaining: 2m 59s
500:	test: 0.9998658	best: 0.9998658 (500)	total: 9.18s	remaining: 2m 53s
600:	test: 0.99

### Inference

In [24]:
def inference(model_dir: str, test_data_dir: str, submission_path: str, group_size: int = 27):
    """Predict all four targets for the test features and write a submission CSV.

    Every CSV found recursively under ``test_data_dir`` is expected to be named
    ``<unique_id>.csv``. Its rows are cut into consecutive groups of
    ``group_size`` rows and each group yields one submission row carrying the
    file's ``unique_id``. Rows beyond the last complete group of a file are
    dropped so that groups never straddle two files; files with fewer than
    ``group_size`` rows contribute nothing.

    Args:
        model_dir: Directory holding ``scaler.pkl``, ``gender_model.pkl``,
            ``hold_model.pkl``, ``years_model.pkl`` and ``level_model.pkl``.
            These are loaded with joblib (pickle-based), so only point this at
            artifacts you produced yourself.
        test_data_dir: Directory searched recursively for feature CSV files.
        submission_path: Output CSV path; parent directories are created.
        group_size: Number of consecutive feature rows aggregated into one
            prediction. Defaults to 27.

    Raises:
        ValueError: If ``group_size`` is not positive, or if no test file
            provides at least one complete group.

    Side effects:
        Writes the submission CSV and prints its path.
    """
    if group_size <= 0:
        raise ValueError(f"group_size must be positive, got {group_size}")

    model_dir = Path(model_dir)
    test_data_dir = Path(test_data_dir)

    Path(submission_path).parent.mkdir(parents=True, exist_ok=True)

    # Load every test file, keeping only whole groups so that the i-th block of
    # `group_size` rows always belongs to unique_ids[i].
    frames = []
    unique_ids = []
    for file in test_data_dir.glob('**/*.csv'):
        df = pd.read_csv(file)
        n_groups = len(df) // group_size
        if n_groups == 0:
            continue
        frames.append(df.iloc[:n_groups * group_size])
        unique_ids.extend([int(file.stem)] * n_groups)

    if not frames:
        raise ValueError(
            f"No test file under {test_data_dir} has at least {group_size} rows"
        )

    # Single concat instead of growing a DataFrame inside the loop.
    x_test = pd.concat(frames, ignore_index=True)

    scaler = joblib.load(model_dir / "scaler.pkl")
    X_test_scaled = scaler.transform(x_test)
    num_groups = len(unique_ids)

    # Load the per-target models.
    model_gender = joblib.load(model_dir / "gender_model.pkl")
    model_hold = joblib.load(model_dir / "hold_model.pkl")
    model_years = joblib.load(model_dir / "years_model.pkl")
    model_level = joblib.load(model_dir / "level_model.pkl")

    def _grouped_proba(model, X):
        """Return predict_proba output shaped (num_groups, group_size, n_classes)."""
        pred_proba = np.asarray(model.predict_proba(X))
        return pred_proba.reshape(num_groups, group_size, -1)

    def predict_groupwise_proba(model, X):
        """Average the class probabilities over the rows of each group."""
        return _grouped_proba(model, X).mean(axis=1)

    def predict_multiary_groupwise(model, X):
        """Per group, return the probability vector of its most confident row.

        The winning class is the one with the largest summed probability over
        the group; the row returned is the one most confident in that class.
        """
        results = []
        for group in _grouped_proba(model, X):
            chosen_class = np.argmax(group.sum(axis=0))
            best_instance = np.argmax(group[:, chosen_class])
            results.append(group[best_instance])
        return np.array(results)

    # Run the four prediction tasks.
    gender_probs = predict_groupwise_proba(model_gender, X_test_scaled)
    hold_probs   = predict_groupwise_proba(model_hold, X_test_scaled)
    years_probs  = predict_multiary_groupwise(model_years, X_test_scaled)
    level_probs  = predict_multiary_groupwise(model_level, X_test_scaled)

    # Build the submission frame. Column k of each probability matrix is the
    # k-th encoded class; the column names below assume the models were trained
    # on sorted labels (play years 0/1/2, level 2/3/4/5) -- TODO confirm against
    # the training labels. For the binary targets the probability of encoded
    # class 0 is reported.
    submission = pd.DataFrame({
        'unique_id': unique_ids,
        'gender': np.round(gender_probs[:, 0], 6),
        'hold racket handed': np.round(hold_probs[:, 0], 6),
        'play years_0': np.round(years_probs[:, 0], 6),
        'play years_1': np.round(years_probs[:, 1], 6),
        'play years_2': np.round(years_probs[:, 2], 6),
        'level_2': np.round(level_probs[:, 0], 6),
        'level_3': np.round(level_probs[:, 1], 6),
        'level_4': np.round(level_probs[:, 2], 6),
        'level_5': np.round(level_probs[:, 3], 6)
    })
    # Stable sort keeps multiple groups of the same file in their original order.
    submission = submission.sort_values(by="unique_id", kind="stable").reset_index(drop=True)
    submission.to_csv(submission_path, index=False, encoding='utf-8', lineterminator='\n', float_format='%.6f')
    print(f"Submission saved to {submission_path}")

In [25]:
# Score the tabular test features with the saved models. The submission file
# name carries the current month and day (MMDD).
model_dir = "models"
test_data_dir = "./dataset/39_Test_Dataset/tabular_data_test"
today = datetime.now()
submission_path = f"./result/submission{today.strftime('%m%d')}.csv"
inference(
    model_dir=model_dir,
    test_data_dir=test_data_dir,
    submission_path=submission_path,
)

Submission saved to ./result/submission0612.csv
