In [1]:
# import zipfile
# from pathlib import Path

# # function to extract zip file
# def extract_zip(zip_path: str, extract_to: str):
#     """Extracts a zip file to the specified directory."""
#     zip_path = Path(zip_path)
#     extract_to = Path(extract_to)

#     # make sure target folder exists
#     extract_to.mkdir(parents=True, exist_ok=True)

#     with zipfile.ZipFile(zip_path, "r") as z:
#         z.extractall(extract_to)

#     print(f"Extracted {zip_path.name} → {extract_to}/")

# extract_zip("./dataset/39_Training_Dataset.zip", "./dataset")
# extract_zip("./dataset/39_Test_Dataset.zip", "./dataset")


In [4]:
from pathlib import Path
import numpy as np
import pandas as pd
import math
import csv
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import roc_auc_score
import joblib
import os

### 特徵工程

In [3]:
# from utils.fft import data_generate

# datapath = './dataset/39_Test_Dataset/test_data'
# tar_dir = './dataset/39_Test_Dataset/tabular_data_test'
# data_generate(datapath, tar_dir)
# datapath = './dataset/39_Training_Dataset/train_data'
# tar_dir = './dataset/39_Training_Dataset/tabular_data_train'
# data_generate(datapath, tar_dir)

### inference

In [9]:
def inference(model_dir: str, test_data_dir: str, submission_path: str):
    model_dir = Path(model_dir)
    test_data_dir = Path(test_data_dir)

    # 收集所有測試檔案（每個選手一份）
    datalist = list(test_data_dir.glob('**/*.csv'))
    unique_ids = [int(f.stem) for f in datalist]


    # 載入所有測試資料
    x_test = pd.DataFrame()
    unique_ids = []
    group_size = 27
    for file in datalist:
        df = pd.read_csv(file)
        x_test = pd.concat([x_test, df], ignore_index=True)
        n_groups = len(df) // group_size
        unique_ids.extend([int(file.stem)] * n_groups)

    scaler = joblib.load(model_dir / "scaler.pkl")
    X_test_scaled = scaler.transform(x_test)
    num_groups = len(X_test_scaled) // group_size

    # 每位選手 27 筆資料
    group_size = 27
    num_groups = len(X_test_scaled) // group_size

    # 載入模型
    model_gender = joblib.load(model_dir / "gender_model.pkl")
    model_hold = joblib.load(model_dir / "hold_model.pkl")
    model_years = joblib.load(model_dir / "years_model.pkl")
    model_level = joblib.load(model_dir / "level_model.pkl")

    # 預測 group-wise 機率平均
    def predict_groupwise_proba(model, X):
        pred_proba = model.predict_proba(X)
        results = []
        for i in range(num_groups):
            group = pred_proba[i*group_size:(i+1)*group_size]
            avg_proba = np.mean(group, axis=0)
            results.append(avg_proba)
        return np.array(results)
    def predict_binary_groupwise(model, X):
        pred_proba = model.predict_proba(X)[:, 0]  # 類別 0 的概率
        results = []
        if np.mean(pred_proba[:group_size]) > 0.5:
            results = [max(pred_proba[i*group_size:(i+1)*group_size]) for i in range(num_groups)]
        else:
            results = [min(pred_proba[i*group_size:(i+1)*group_size]) for i in range(num_groups)]
        return np.array([1 - x for x in results])  # 反轉概率
    def predict_multiary_groupwise(model, X):
        pred_proba = model.predict_proba(X)
        results = []
        for i in range(num_groups):
            group = pred_proba[i*group_size:(i+1)*group_size]
            class_sums = np.sum(group, axis=0)
            chosen_class = np.argmax(class_sums)
            candidate_probs = group[:, chosen_class]
            best_instance = np.argmax(candidate_probs)
            results.append(group[best_instance])
        return np.array(results)

    # 執行四個任務預測
    gender_probs = predict_groupwise_proba(model_gender, X_test_scaled)
    print(gender_probs)
    hold_probs   = predict_groupwise_proba(model_hold, X_test_scaled)
    print(hold_probs)
    years_probs  = predict_multiary_groupwise(model_years, X_test_scaled)
    level_probs  = predict_multiary_groupwise(model_level, X_test_scaled)

    # 建立 submission dataframe
    submission = pd.DataFrame({
        'unique_id': unique_ids,
        'gender': np.round(gender_probs[:, 0], 6),
        'hold racket handed': np.round(hold_probs[:, 0], 6),
        'play years_0': np.round(years_probs[:, 0], 6),
        'play years_1': np.round(years_probs[:, 1], 6),
        'play years_2': np.round(years_probs[:, 2], 6),
        'level_2': np.round(level_probs[:, 0], 6),
        'level_3': np.round(level_probs[:, 1], 6),
        'level_4': np.round(level_probs[:, 2], 6),
        'level_5': np.round(level_probs[:, 3], 6)
    })
    submission = submission.sort_values(by="unique_id").reset_index(drop=True)
    submission.to_csv(submission_path, index=False, encoding='utf-8', lineterminator='\n', float_format='%.6f')
    print(f"Submission saved to {submission_path}")

In [11]:
model_dir = "models"
test_data_dir = "./dataset/39_Test_Dataset/tabular_data_test"
submission_path = "submission0424.csv"
inference(model_dir, test_data_dir, submission_path)

[[0.94916098 0.05083902]
 [0.96376635 0.03623365]
 [0.94139196 0.05860804]
 ...
 [0.9573419  0.0426581 ]
 [0.96415107 0.03584893]
 [0.73803748 0.26196252]]
[[9.99752083e-01 2.47917269e-04]
 [9.99631004e-01 3.68995618e-04]
 [9.97101875e-01 2.89812521e-03]
 ...
 [9.99411091e-01 5.88908517e-04]
 [2.11619855e-02 9.78838014e-01]
 [9.98873492e-01 1.12650776e-03]]
Submission saved to submission0424.csv


In [None]:
le_gender = joblib.load("./models/label_encoder_gender.pkl")
print(le_gender.classes_)  # 確認 ['female', 'male'] 或其他

FileNotFoundError: [Errno 2] No such file or directory: './models/label_encoder_gender.pkl'