In [1]:
# exp_04 – CatBoost GPU
# セル1：ライブラリインポート・設定

import os
import sys
from datetime import datetime
import pandas as pd
import numpy as np
import mlflow
import optuna
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import accuracy_score

# Reproducibility: seed NumPy's global RNG. SEED is also reused further down
# for the CV splitter, the CatBoost models and the Optuna sampler.
SEED = 42
np.random.seed(SEED)

# MLflow experiment setup.
# NOTE(review): this enables the scikit-learn autologging flavour; confirm it
# actually captures the CatBoost runs trained later in this notebook.
mlflow.set_experiment("spaceship_titanic")
mlflow.sklearn.autolog()

# Output directory for submissions, one per run. The timestamp has minute
# resolution, so two runs started within the same minute share a directory
# (exist_ok=True makes that a no-op rather than an error).
timestamp = datetime.now().strftime("%Y%m%d%H%M")
OUTPUT_DIR = f"../outputs/submissions/{timestamp}"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Runtime message (Japanese): "Cell 1 finished: libraries loaded, settings OK".
print("セル1 実行完了：ライブラリ読み込み・設定OK")


  from .autonotebook import tqdm as notebook_tqdm


セル1 実行完了：ライブラリ読み込み・設定OK


In [2]:
# exp_04 – セル2：データ読み込み＆前処理パイプライン

import pandas as pd
import numpy as np

# 1) 前処理関数をセル内定義
def preprocess_df(df):
    """Return a feature-engineered copy of a raw passenger frame.

    The input frame is not modified. The following columns are appended, in
    this order: Deck, CabinNum, Side, GroupSize, TotalSpending,
    LogTotalSpending, SpendingPerAge, SpendingPerGroup, AgeBand, PlanetRoute.
    Missing values in the categorical columns are replaced with "Unknown".

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain PassengerId, Cabin, Age, HomePlanet, Destination and the
        five spending columns (RoomService, FoodCourt, ShoppingMall, Spa,
        VRDeck).

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns plus the engineered ones.

    Notes
    -----
    AgeBand edges are quantiles of the frame passed in, so two different
    frames generally receive different bin edges and labels.
    """
    out = df.copy()

    # Cabin, e.g. "B/123/P" -> Deck="B", CabinNum=123, Side="P".
    # A missing cabin is encoded as the placeholder "Z/0/Z".
    cabin_parts = out["Cabin"].fillna("Z/0/Z").str.split("/", expand=True)
    out["Deck"] = cabin_parts[0]
    out["CabinNum"] = cabin_parts[1].astype(int)
    out["Side"] = cabin_parts[2]

    # PassengerId "0001_01" -> group key "0001"; GroupSize is the number of
    # rows in this frame that share the key.
    group_key = out["PassengerId"].str.split("_", expand=True)[0]
    out["GroupSize"] = group_key.map(group_key.value_counts())

    # Total spending (missing amounts are skipped by sum) and its log1p.
    spend_cols = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
    total_spend = out[spend_cols].sum(axis=1)
    out["TotalSpending"] = total_spend
    out["LogTotalSpending"] = np.log1p(total_spend)

    # Relative spending: per year of age (epsilon avoids division by zero;
    # a missing Age yields NaN) and per group member.
    out["SpendingPerAge"] = total_spend / (out["Age"] + 1e-6)
    out["SpendingPerGroup"] = total_spend / out["GroupSize"]

    # Age quintiles; missing ages are filled with -1 and therefore fall into
    # the lowest band.
    age_filled = out["Age"].fillna(-1)
    out["AgeBand"] = pd.qcut(age_filled, q=5, duplicates="drop").astype(str)

    # Home planet / destination combination.
    home = out["HomePlanet"].fillna("Unknown")
    destination = out["Destination"].fillna("Unknown")
    out["PlanetRoute"] = home + "_" + destination

    # Fill any remaining gaps in the categorical columns.
    categorical_cols = ["HomePlanet", "Destination", "Deck", "Side", "AgeBand", "PlanetRoute"]
    out = out.fillna({col: "Unknown" for col in categorical_cols})

    return out

# 2) Load the raw competition data
train = pd.read_csv("../data/train.csv")
test  = pd.read_csv("../data/test.csv")

# 3) Apply feature engineering to each split
train_fe = preprocess_df(train)
test_fe  = preprocess_df(test)

# preprocess_df derives AgeBand from the quantiles of whichever frame it is
# given, so train and test would otherwise get different bin edges (and thus
# different category labels). Re-bin the test ages with the edges learned on
# train so the categorical feature means the same thing at fit and predict time.
_, age_edges = pd.qcut(train["Age"].fillna(-1), q=5, duplicates="drop", retbins=True)
# Clip so that test ages outside the train range land in the outermost bands
# instead of becoming NaN.
test_age = test["Age"].fillna(-1).clip(lower=age_edges[0], upper=age_edges[-1])
test_fe["AgeBand"] = pd.cut(test_age, bins=age_edges, include_lowest=True).astype(str)
unseen_bands = set(test_fe["AgeBand"]) - set(train_fe["AgeBand"])
assert not unseen_bands, f"AgeBand labels missing from train: {unseen_bands}"

# 4) Simple target encoding, fitted on the full training set.
# NOTE(review): each category mean includes the row being encoded, so the *_TE
# features carry target information into the validation folds used later.
# Out-of-fold encoding would give a less optimistic CV score.
te_cols = ["HomePlanet", "Destination", "PlanetRoute"]
global_mean = train_fe["Transported"].mean()
for col in te_cols:
    mapping = train_fe.groupby(col)["Transported"].mean()
    # Categories never seen in train fall back to the global positive rate.
    train_fe[f"{col}_TE"] = train_fe[col].map(mapping).fillna(global_mean)
    test_fe[f"{col}_TE"] = test_fe[col].map(mapping).fillna(global_mean)

# Drop the encoded source columns and the raw Cabin string (already split
# into Deck / CabinNum / Side).
cols_to_drop = te_cols + ["Cabin"]
train_fe = train_fe.drop(columns=cols_to_drop)
test_fe = test_fe.drop(columns=cols_to_drop)

# Train and test must expose the same columns, apart from the target.
assert set(train_fe.columns) - set(test_fe.columns) == {"Transported"}
assert set(test_fe.columns) <= set(train_fe.columns)

# 5) Sanity check
print("train:", train_fe.shape)
print("test: ", test_fe.shape)
display(train_fe.head())


train: (8693, 23)
test:  (4277, 22)


Unnamed: 0,PassengerId,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,...,Side,GroupSize,TotalSpending,LogTotalSpending,SpendingPerAge,SpendingPerGroup,AgeBand,HomePlanet_TE,Destination_TE,PlanetRoute_TE
0,0001_01,False,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,...,P,1,0.0,0.0,0.0,0.0,"(30.0, 40.0]",0.658846,0.471175,0.634987
1,0002_01,False,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,...,S,1,736.0,6.602588,30.666665,736.0,"(23.0, 30.0]",0.423946,0.471175,0.389229
2,0003_01,False,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,...,S,2,10383.0,9.248021,179.017238,5191.5,"(40.0, 79.0]",0.658846,0.471175,0.634987
3,0003_02,False,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,...,S,2,5176.0,8.551981,156.84848,2588.0,"(30.0, 40.0]",0.658846,0.471175,0.634987
4,0004_01,False,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,...,S,1,1091.0,6.995766,68.187496,1091.0,"(-1.001, 18.0]",0.423946,0.471175,0.389229


In [3]:
# exp_04 – セル3：StratifiedGroupKFold 定義（修正版）

from sklearn.model_selection import StratifiedGroupKFold

# --- Drop string columns & convert types ---
# Name is not used as a feature. errors="ignore" keeps this cell re-runnable:
# a second execution no longer fails with KeyError because Name is already gone.
train_fe = train_fe.drop(columns=["Name"], errors="ignore")
test_fe = test_fe.drop(columns=["Name"], errors="ignore")

# CryoSleep / VIP -> 0/1 with missing values treated as False.
# .eq(True) maps True -> True and both False and NaN -> False without going
# through fillna on an object column (which emits a downcasting FutureWarning),
# and it is a no-op on columns that are already 0/1.
for col in ["CryoSleep", "VIP"]:
    train_fe[col] = train_fe[col].eq(True).astype(int)
    test_fe[col] = test_fe[col].eq(True).astype(int)

# --- Group column ---
# PassengerId "gggg_pp" -> travel group "gggg"; used to keep a group's members
# inside a single fold.
train_fe["Group"] = train_fe["PassengerId"].str.split("_").str[0]

# --- Features, target and groups ---
features = [
    c for c in train_fe.columns
    if c not in ["PassengerId", "Transported", "Group", "fold"]
]
X      = train_fe[features]
y      = train_fe["Transported"].astype(int)
groups = train_fe["Group"]

# --- Fold preparation & assignment ---
skf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=SEED)
train_fe["fold"] = -1
fold_col = train_fe.columns.get_loc("fold")

for fold, (tr_idx, val_idx) in enumerate(skf.split(X, y, groups)):
    # split() yields positional indices, so assign by position rather than by
    # label; this stays correct even if the index is not a default RangeIndex.
    train_fe.iloc[val_idx, fold_col] = fold
    print(f"Fold {fold}: train {len(tr_idx)} rows, valid {len(val_idx)} rows")

# Every row must have landed in exactly one validation fold.
assert (train_fe["fold"] >= 0).all(), "some rows were not assigned to a fold"

# --- Positive rate per fold ---
# Runtime header (Japanese): "Distribution per fold (Transported rate)".
print("\n各Foldの分布（Transported 比率）")
for fold in range(skf.get_n_splits()):
    sub = train_fe[train_fe["fold"] == fold]
    print(f"Fold {fold} - Positive rate: {sub['Transported'].mean():.4f}")


  train_fe[col] = train_fe[col].fillna(False).astype(int)
  test_fe [col] = test_fe [col].fillna(False).astype(int)
  train_fe[col] = train_fe[col].fillna(False).astype(int)
  test_fe [col] = test_fe [col].fillna(False).astype(int)


Fold 0: train 6996 rows, valid 1697 rows
Fold 1: train 6939 rows, valid 1754 rows
Fold 2: train 6942 rows, valid 1751 rows
Fold 3: train 6992 rows, valid 1701 rows
Fold 4: train 6903 rows, valid 1790 rows

各Foldの分布（Transported 比率）
Fold 0 - Positive rate: 0.4944
Fold 1 - Positive rate: 0.5017
Fold 2 - Positive rate: 0.5226
Fold 3 - Positive rate: 0.4909
Fold 4 - Positive rate: 0.5078


In [4]:
# exp_04 – 型確認用デバッグコード

# List the columns that are still object-typed (string-like); any of these
# that end up in the feature matrix must be handled as categorical features.
obj_cols = list(train_fe.select_dtypes(include="object").columns)
print("Object 型の列一覧:", obj_cols)


Object 型の列一覧: ['PassengerId', 'Deck', 'Side', 'AgeBand', 'Group']


In [5]:
# exp_04 – セル4：Optuna によるハイパーパラメータ探索

import optuna
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

# 目的関数定義
def objective(trial):
    """Optuna objective: mean 5-fold CV accuracy of a GPU CatBoost classifier.

    Samples depth, learning_rate and iterations from the trial, trains one
    model per fold using the module-level X, y and train_fe["fold"], and
    scores each model on its held-out fold.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean accuracy over the five folds (higher is better).

    Notes
    -----
    The held-out fold is passed as eval_set for early stopping and is also the
    fold the accuracy is computed on.
    """
    # Fixed settings first, then the tuned ones. The suggest_* calls run in the
    # order depth -> learning_rate -> iterations.
    params = dict(
        loss_function="Logloss",
        task_type="GPU",
        random_seed=SEED,
        verbose=False,
        depth=trial.suggest_int("depth", 6, 10),
        learning_rate=trial.suggest_float("learning_rate", 0.03, 0.2),
        iterations=trial.suggest_int("iterations", 500, 1000),
    )

    # String-valued columns that CatBoost should treat as categorical.
    cat_feats = ["Deck", "Side", "AgeBand"]

    fold_accuracies = []
    for fold in range(5):
        # Rows whose assigned fold matches are validation; the rest are training.
        in_valid = train_fe["fold"] == fold
        X_tr, y_tr = X.loc[~in_valid], y.loc[~in_valid]
        X_va, y_va = X.loc[in_valid], y.loc[in_valid]

        model = CatBoostClassifier(**params)
        model.fit(
            X_tr,
            y_tr,
            eval_set=(X_va, y_va),
            cat_features=cat_feats,
            early_stopping_rounds=50,
        )

        fold_accuracies.append(accuracy_score(y_va, model.predict(X_va)))

    # Average accuracy across folds is the value Optuna maximizes.
    return np.mean(fold_accuracies)

# Create the study (seeded TPE sampler) and run the search.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(
    objective,
    n_trials=20,
    timeout=None,
)

# Report the best trial.
print("Best CV score:", study.best_value)
print("Best parameters:", study.best_params)


[I 2025-07-15 23:22:13,710] A new study created in memory with name: no-name-a35110c8-523e-4271-a9d4-f13bfc56fbc2
[I 2025-07-15 23:23:08,005] Trial 0 finished with value: 0.8123111371585543 and parameters: {'depth': 7, 'learning_rate': 0.19162143208968577, 'iterations': 866}. Best is trial 0 with value: 0.8123111371585543.
[I 2025-07-15 23:26:06,279] Trial 1 finished with value: 0.8105368738271663 and parameters: {'depth': 8, 'learning_rate': 0.056523168875214205, 'iterations': 578}. Best is trial 0 with value: 0.8123111371585543.
[I 2025-07-15 23:26:46,279] Trial 2 finished with value: 0.8115294326495255 and parameters: {'depth': 6, 'learning_rate': 0.177249944781739, 'iterations': 801}. Best is trial 0 with value: 0.8123111371585543.
[I 2025-07-15 23:36:12,262] Trial 3 finished with value: 0.8113709644639713 and parameters: {'depth': 9, 'learning_rate': 0.033499364030286416, 'iterations': 985}. Best is trial 0 with value: 0.8123111371585543.
[I 2025-07-15 23:47:28,230] Trial 4 finish

Best CV score: 0.8160579237629315
Best parameters: {'depth': 10, 'learning_rate': 0.06609764881530694, 'iterations': 591}
