In [1]:
# 1
# ライブラリ読み込み＆定数定義
import pandas as pd
import numpy as np
import xgboost as xgb
import optuna
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import log_loss, accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from IPython.display import display, HTML

TRAIN_PATH = '../data/train.csv'
TEST_PATH  = '../data/test.csv'
TARGET     = 'Transported'
ID_COL     = 'PassengerId'
GROUP_COL  = 'Group'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 2
# データ読み込み＆基本情報表示
train = pd.read_csv(TRAIN_PATH)
test  = pd.read_csv(TEST_PATH)
print("Train shape:", train.shape)
print("Test  shape:", test.shape)
display(train.head())
print(train.dtypes)
print(train.isnull().sum())


Train shape: (8693, 14)
Test  shape: (4277, 13)


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object
PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64


In [3]:
# 3
# 目的変数分布＆グループ列作成
print(train[TARGET].value_counts(normalize=True))
if GROUP_COL not in train.columns:
    train['LastName'] = train['Name'].str.split().str[0]
    train[GROUP_COL] = train['LastName'].factorize()[0]
    display(train[[GROUP_COL, 'Name']].head())

Transported
True     0.503624
False    0.496376
Name: proportion, dtype: float64


Unnamed: 0,Group,Name
0,0,Maham Ofracculy
1,1,Juanna Vines
2,2,Altark Susent
3,3,Solam Susent
4,4,Willy Santantines


In [4]:
# 4
# 表示設定＆特徴量リスト作成
pd.options.display.max_columns = None
pd.options.display.max_rows    = None
num_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
cat_features = ['HomePlanet','CryoSleep','Destination','VIP']
info_df = pd.DataFrame({
    'feature': num_features + cat_features,
    'dtype': [train[f].dtype for f in num_features + cat_features],
    'missing': [train[f].isna().sum() for f in num_features + cat_features]
})
display(HTML(info_df.to_html(index=False)))

feature,dtype,missing
Age,float64,179
RoomService,float64,181
FoodCourt,float64,183
ShoppingMall,float64,208
Spa,float64,183
VRDeck,float64,188
HomePlanet,object,201
CryoSleep,object,217
Destination,object,182
VIP,object,203


In [5]:
# 5
# 欠損補完＆エンコーディング
num_imputer = SimpleImputer(strategy='median')
train_num = pd.DataFrame(num_imputer.fit_transform(train[num_features]), columns=num_features)


# Cabin を除いた cat_features で imputer を fit
cat_imputer = SimpleImputer(strategy='most_frequent')
train_cat   = pd.DataFrame(
    cat_imputer.fit_transform(train[cat_features]),
    columns=cat_features
)
# ↓ LabelEncoder を保存する辞書を作成
le_dict = {}

# 各カテゴリ特徴量ごとにエンコード＆保存
for col in cat_features:
    le = LabelEncoder().fit(train_cat[col])
    train_cat[col] = le.transform(train_cat[col])
    le_dict[col] = le

X_processed = pd.concat([train_num, train_cat], axis=1)
display(HTML(X_processed.head().to_html(index=False)))
display(HTML(pd.DataFrame(X_processed.dtypes, columns=['dtype']).to_html()))

Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet,CryoSleep,Destination,VIP
39.0,0.0,0.0,0.0,0.0,0.0,1,0,2,0
24.0,109.0,9.0,25.0,549.0,44.0,0,0,2,0
58.0,43.0,3576.0,0.0,6715.0,49.0,1,0,2,1
33.0,0.0,1283.0,371.0,3329.0,193.0,1,0,2,0
16.0,303.0,70.0,151.0,565.0,2.0,0,0,2,0


Unnamed: 0,dtype
Age,float64
RoomService,float64
FoodCourt,float64
ShoppingMall,float64
Spa,float64
VRDeck,float64
HomePlanet,int64
CryoSleep,int64
Destination,int64
VIP,int64


In [6]:
# 5.1
# 特徴量エンジニアリング：Cabin分解＆Soundex対応
import re

# Deck/Num/Side 分解
train['Deck']    = train['Cabin'].fillna('Unknown/0/Unknown')\
                        .str.split('/', expand=True)[0]
train['CabinNo'] = train['Cabin'].fillna('Unknown/0/Unknown')\
                        .str.split('/', expand=True)[1].astype(float)
train['Side']    = train['Cabin'].fillna('Unknown/0/Unknown')\
                        .str.split('/', expand=True)[2]

# ↓ Deck と Side を同じ Encoder で変換＆保存
for col in ['Deck','Side']:
    le = LabelEncoder().fit(train[col])
    train[col] = le.transform(train[col])
    le_dict[col] = le
# Soundex作成（NaN対応）
def soundex(name):
    if not isinstance(name, str) or len(name) == 0:
        return '0000'
    name = name.upper()
    mapping = {'BFPV':'1','CGJKQSXZ':'2','DT':'3','L':'4','MN':'5','R':'6'}
    first = name[0]
    tail = name[1:]
    nums = ''.join(mapping.get(ch, '') for ch in tail)
    nums = re.sub(r"(\d)\1+", r"\1", nums)
    return first + (nums + '000')[:3]

# ↓ LastName 列を作成
train['LastName'] = train['Name'].str.split().str[0]
# ↓ Soundex 列を作成
train['Soundex']  = train['LastName'].apply(soundex)

# ↓ Soundex 用 LabelEncoder を fit & 保存
le_sdx = LabelEncoder().fit(train['Soundex'])
train['Soundex'] = le_sdx.transform(train['Soundex'])
# ↓ 辞書にも格納
le_dict['Soundex'] = le_sdx

# 特徴量リストへ追加
num_features += ['CabinNo']
cat_features += ['Deck','Side','Soundex']

# CabinNo, Deck, Side, Soundex を train_num, train_cat に結合
X_processed = pd.concat(
    [ train_num,
      train_cat,
      train[['CabinNo','Deck','Side','Soundex']] ],
    axis=1
)
# 以降セル6〜で X = X_processed を使います
X = X_processed

y      = train[TARGET].astype(int)
groups = train[GROUP_COL]
sgkf   = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

In [7]:
# 6
# Optuna目的関数定義＆最適化
def objective(trial):
    params = {
        # ← GPU を使う設定に変更
        'tree_method':      'hist',
        'device':           'cuda',
        'objective':        'binary:logistic',
        'eval_metric':      'logloss',
        'max_depth':        trial.suggest_int('max_depth', 4, 10),
        'eta':              trial.suggest_loguniform('eta', 0.01, 0.3),
        'subsample':        trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1.0, 10.0)
    }
    oof = np.zeros(len(y))
    for tr_idx, va_idx in sgkf.split(X, y, groups):
        dtr = xgb.DMatrix(X.iloc[tr_idx], label=y.iloc[tr_idx])
        dva = xgb.DMatrix(X.iloc[va_idx], label=y.iloc[va_idx])
        bst = xgb.train(
            params,
            dtr,
            num_boost_round=10000,
            evals=[(dva, 'valid')],
            early_stopping_rounds=100,
            verbose_eval=False
        )
        oof[va_idx] = bst.predict(dva)
    return log_loss(y, oof)

# Optuna 実行もそのまま
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=25)


[I 2025-07-22 00:35:26,000] A new study created in memory with name: no-name-f6eca257-4142-43ad-b5b3-e9e061724747
  'eta':              trial.suggest_loguniform('eta', 0.01, 0.3),
[I 2025-07-22 00:35:32,561] Trial 0 finished with value: 0.5119306664760742 and parameters: {'max_depth': 9, 'eta': 0.12563900326308428, 'subsample': 0.9054564784867869, 'colsample_bytree': 0.689515522097887, 'scale_pos_weight': 7.91193389533692}. Best is trial 0 with value: 0.5119306664760742.
  'eta':              trial.suggest_loguniform('eta', 0.01, 0.3),
[I 2025-07-22 00:35:37,340] Trial 1 finished with value: 0.4191175186834842 and parameters: {'max_depth': 7, 'eta': 0.0921396205943922, 'subsample': 0.9295792934402977, 'colsample_bytree': 0.7722176237090728, 'scale_pos_weight': 2.154698855545007}. Best is trial 1 with value: 0.4191175186834842.
  'eta':              trial.suggest_loguniform('eta', 0.01, 0.3),
[I 2025-07-22 00:36:10,961] Trial 2 finished with value: 0.5233587031066225 and parameters: {'m

In [8]:
# 7
# 最適化結果表示
output = f"Best logloss: {study.best_value}\nBest params:\n"
for k,v in study.best_params.items(): output+= f"{k}: {v}\n"
output+= f"Trial#: {study.best_trial.number}"
display(HTML(f"<pre>{output}</pre>"))

In [13]:
# 8
# ベストパラメータでCV評価
# ベストパラメータの取得
params = study.best_params.copy()
# 必要パラメータ
params.update({
    'tree_method': 'hist',
    'device':      'cuda',
    'objective':        'binary:logistic',
    'eval_metric':      'logloss',
})

# CV評価の実行
oof = np.zeros(len(y))
scores = []
for tr, va in sgkf.split(X, y, groups):
    dtr = xgb.DMatrix(X.iloc[tr], label=y.iloc[tr])
    dva = xgb.DMatrix(X.iloc[va], label=y.iloc[va])
    bst = xgb.train(
        params,
        dtr,
        num_boost_round=10000,
        evals=[(dva, 'valid')],
        early_stopping_rounds=100,
        verbose_eval=False
    )
    pred = bst.predict(dva)
    scores.append(log_loss(y.iloc[va], pred))
    oof[va] = pred

# 結果表示
display(HTML(f"<pre>CV logloss: {np.mean(scores):.6f} ± {np.std(scores):.6f}</pre>"))


In [14]:
# 9
# ベストパラメータでCV評価 (accuracy)
pred_acc_scores=[]
for tr, va in sgkf.split(X,y,groups):
    dtr=xgb.DMatrix(X.iloc[tr],label=y.iloc[tr]); dva=xgb.DMatrix(X.iloc[va],label=y.iloc[va])
    bst=xgb.train(params,dtr,num_boost_round=10000,evals=[(dva,'valid')],early_stopping_rounds=100,verbose_eval=False)
    preds=bst.predict(dva); labels=(preds>0.5).astype(int)
    pred_acc_scores.append(accuracy_score(y.iloc[va], labels))
display(HTML(f"<pre>CV Accuracy: {np.mean(pred_acc_scores):.5f} ± {np.std(pred_acc_scores):.5f}</pre>"))

In [18]:
# 10
# キャリブレーション＆閾値再最適化
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_curve

# params はすでに 'eval_metric' を含むので、キーワード引数側からは削除
# （もし別の指標を使いたければ params 側を書き換える）
clf = xgb.XGBClassifier(
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    gpu_id=0,
    use_label_encoder=False,
)

calib = CalibratedClassifierCV(clf, cv=5, method='sigmoid')

# モデル全体に対してキャリブレーション
calib.fit(X, y)

# OOF キャリブレーション確率
oof_calib = np.zeros(len(y))
for tr, va in sgkf.split(X, y, groups):
    calib.fit(X.iloc[tr], y.iloc[tr])
    oof_calib[va] = calib.predict_proba(X.iloc[va])[:, 1]

# Youden の J 指数で閾値を決定
fpr, tpr, th = roc_curve(y, oof_calib)
thr = th[np.argmax(tpr - fpr)]
acc_calib = accuracy_score(y, (oof_calib > thr).astype(int))

print(f"Calibrated Youden Thr: {thr:.3f} → Accuracy: {acc_calib:.5f}")


# 全データ DMatrix 作成
dtrain_all = xgb.DMatrix(X, label=y)

# 全データで CV 実行し、最適ラウンド数を取得
cv_results = xgb.cv(
    params,
    dtrain_all,
    num_boost_round=10000,
    nfold=5,
    early_stopping_rounds=100,
    metrics="logloss",
    as_pandas=True
)
best_rounds = len(cv_results)

# 最適ラウンドで全データ再学習（以降の pseudo‑label 用の base モデル）
bst = xgb.train(params, dtrain_all, num_boost_round=best_rounds)
print(f"Final model with {best_rounds} rounds")


    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda

Calibrated Youden Thr: 0.509 → Accuracy: 0.80755
Final model with 465 rounds


In [19]:
# 11: Pseudo‑Labeling
# 1) test に派生列を先に作成＆エンコード
test['Deck']     = test['Cabin'].fillna('Unknown/0/Unknown')\
                          .str.split('/', expand=True)[0]
test['CabinNo']  = test['Cabin'].fillna('Unknown/0/Unknown')\
                          .str.split('/', expand=True)[1].astype(float)
test['Side']     = test['Cabin'].fillna('Unknown/0/Unknown')\
                          .str.split('/', expand=True)[2]
test['LastName'] = test['Name'].str.split().str[0]
test['Soundex']  = test['LastName'].apply(soundex)

test['Deck']    = le_dict['Deck'].transform(test['Deck'])
test['Side']    = le_dict['Side'].transform(test['Side'])
test['Soundex'] = le_sdx.transform(test['Soundex'])

# 2) 数値特徴量 6 列の欠損補完
orig_num = ['Age','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']
test_num = pd.DataFrame(
    num_imputer.transform(test[orig_num]),
    columns=orig_num
)

# 3) カテゴリ特徴量 4 列の欠損補完＆エンコーディング
orig_cat = ['HomePlanet','CryoSleep','Destination','VIP']
test_cat = pd.DataFrame(
    cat_imputer.transform(test[orig_cat]),
    columns=orig_cat
)
for col in orig_cat:
    test_cat[col] = le_dict[col].transform(test_cat[col])

# 4) 最終特徴量セット結合
X_test = pd.concat(
    [test_num, test_cat, test[['CabinNo','Deck','Side','Soundex']]],
    axis=1
)
X_test = X_test[X_processed.columns]

# --- 5) 予測実行 & Pseudo‑Labeling 続行 ---
test_proba = bst.predict(xgb.DMatrix(X_test))
pseudo_idx = np.where((test_proba > 0.90)|(test_proba < 0.10))[0]
pseudo_X   = X_test.iloc[pseudo_idx]
pseudo_y   = (test_proba[pseudo_idx] > 0.5).astype(int)

X_pl = pd.concat([X, pseudo_X], axis=0)
y_pl = np.concatenate([y, pseudo_y], axis=0)
dtr_pl = xgb.DMatrix(X_pl, label=y_pl)

# ここだけ学習
bst_pl = xgb.train(params, dtr_pl, num_boost_round=best_rounds)
pl_oof = bst_pl.predict(xgb.DMatrix(X))
print("PL Accuracy:", accuracy_score(y, (pl_oof>0.5).astype(int)))


PL Accuracy: 0.8817439318992293


In [25]:
# 12
# Submission用CSV作成
import os
# テストデータ予測
preds_test = bst_pl.predict(xgb.DMatrix(X_test))
# 二値化（0.5閾値）
labels_test = (preds_test > 0.5)
# DataFrame生成
submission = pd.DataFrame({
    ID_COL: test[ID_COL],
    TARGET: labels_test
})
# 保存ディレクトリ作成
output_dir = '/mnt/c/dev/AI/kaggle/spaceship-titanic/outputs/submissions/ver4'

os.makedirs(output_dir, exist_ok=True)
submission_path = os.path.join(output_dir, 'soya_model4.csv')
submission.to_csv(submission_path, index=False)
print(f"Saved submission to {submission_path}")


Saved submission to /mnt/c/dev/AI/kaggle/spaceship-titanic/outputs/submissions/ver4/soya_model4.csv
