In [None]:
# 1
# Load libraries & define constants
import pandas as pd
import numpy as np
import xgboost as xgb
import optuna
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import log_loss, accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from IPython.display import display, HTML

# CSV locations (relative paths) passed to pd.read_csv in the data-loading cell.
TRAIN_PATH = '../data/train.csv'
TEST_PATH  = '../data/test.csv'
# Name of the label column; it is cast to int and used as y for training.
TARGET     = 'Transported'
# Presumably the row identifier column — it is not referenced by the cells shown here.
ID_COL     = 'PassengerId'
# Column holding the group id handed to StratifiedGroupKFold; created from Name
# when the training frame does not already contain it.
GROUP_COL  = 'Group'

In [None]:
# 2
# Load the data & show basic information
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

# Shapes of both frames, then a preview, dtypes and per-column null counts of train.
print("Train shape:", train.shape)
print("Test  shape:", test.shape)
display(train.head())
train_dtypes = train.dtypes
train_null_counts = train.isna().sum()
print(train_dtypes)
print(train_null_counts)


In [None]:
# 3
# Target distribution & group column creation
target_share = train[TARGET].value_counts(normalize=True)
print(target_share)

# Only derive the group id when the frame does not already provide one.
if GROUP_COL not in train.columns:
    # NOTE(review): str[0] selects the FIRST whitespace-separated token of Name,
    # although the column is called 'LastName' — confirm which token holds the
    # family name in this dataset.
    name_tokens = train['Name'].str.split()
    train['LastName'] = name_tokens.str[0]
    # factorize assigns one integer code per distinct value (missing values get -1).
    group_codes = train['LastName'].factorize()[0]
    train[GROUP_COL] = group_codes
    display(train[[GROUP_COL, 'Name']].head())

In [None]:
# 4
# Display settings & feature list creation
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

num_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
cat_features = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']

# One summary row per feature: its dtype and number of missing values in train.
all_features = num_features + cat_features
selected = train[all_features]
info_df = pd.DataFrame({
    'feature': all_features,
    'dtype': selected.dtypes.tolist(),
    'missing': selected.isna().sum().tolist(),
})
display(HTML(info_df.to_html(index=False)))

In [None]:
# 5
# Missing-value imputation & encoding
# Numeric columns: fill gaps with the column median.
num_imputer = SimpleImputer(strategy='median')
num_filled = num_imputer.fit_transform(train[num_features])
train_num = pd.DataFrame(num_filled, columns=num_features)

# Categorical columns: fill gaps with the most frequent value.
cat_imputer = SimpleImputer(strategy='most_frequent')
cat_filled = cat_imputer.fit_transform(train[cat_features])
train_cat = pd.DataFrame(cat_filled, columns=cat_features)

# Fit one LabelEncoder per categorical column and keep it in le_dict so the
# same encoding can be re-applied later; then replace each column by its codes.
le_dict = {col: LabelEncoder().fit(train_cat[col]) for col in cat_features}
for col, le in le_dict.items():
    train_cat[col] = le.transform(train_cat[col])

# Model matrix: imputed numeric columns followed by encoded categorical columns.
X_processed = pd.concat([train_num, train_cat], axis=1)
display(HTML(X_processed.head().to_html(index=False)))
dtype_table = pd.DataFrame(X_processed.dtypes, columns=['dtype'])
display(HTML(dtype_table.to_html()))

In [None]:
# 5.1
# Feature engineering: Cabin decomposition & Soundex support
import re
# Split Cabin ("Deck/Num/Side") once; missing cabins become 'Unknown/0/Unknown'.
cabin_parts = train['Cabin'].fillna('Unknown/0/Unknown').str.split('/', expand=True)
train['Deck'] = cabin_parts[0]
train['CabinNo'] = cabin_parts[1].astype(float)
train['Side'] = cabin_parts[2]
# Label-encode Deck and Side, keeping each fitted encoder in le_dict.
for col in ('Deck', 'Side'):
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col])
    le_dict[col] = le
# Soundex code generation (safe for NaN / non-string input)
def soundex(name):
    """Return the 4-character Soundex code (first letter + three digits) of ``name``.

    Parameters
    ----------
    name : object
        The name to encode. Anything that is not a non-empty ``str``
        (e.g. ``None`` or ``NaN``) is treated as missing.

    Returns
    -------
    str
        ``'0000'`` for missing input; otherwise the upper-cased first character
        followed by exactly three digits (zero-padded or truncated).
    """
    if not isinstance(name, str) or len(name) == 0:
        return '0000'

    letter_groups = {'BFPV': '1', 'CGJKQSXZ': '2', 'DT': '3', 'L': '4', 'MN': '5', 'R': '6'}
    # Expand the grouped table to one entry per letter. Looking single characters
    # up in the group-keyed dict only ever matched the one-letter keys 'L' and 'R'.
    code_of = {letter: digit for letters, digit in letter_groups.items() for letter in letters}

    name = name.upper()
    first = name[0]
    # The first letter's own code suppresses an immediately following duplicate.
    previous = code_of.get(first, '')
    digits = []
    for ch in name[1:]:
        if ch in 'HW':
            # H and W are transparent: they do not separate equal codes.
            continue
        code = code_of.get(ch, '')
        if code and code != previous:
            digits.append(code)
        # Vowels and other uncoded characters reset `previous`, so the same
        # code may be emitted again after them.
        previous = code
    return first + (''.join(digits) + '000')[:3]
# Build the Soundex feature. The original cell encoded train['Soundex'] without
# ever creating it (KeyError). It is derived from the first whitespace-separated
# token of Name, the same derivation the test-set cell applies.
train['Soundex'] = train['Name'].str.split().str[0].apply(soundex)

# Fit ONE encoder on the string codes and keep it. Re-fitting a second encoder
# on the already-encoded integers would leave le_sdx unable to transform the
# string Soundex codes of the test set.
le_sdx = LabelEncoder().fit(train['Soundex'])
train['Soundex'] = le_sdx.transform(train['Soundex'])
le_dict['Soundex'] = le_sdx

# Register the engineered columns in the feature lists. The membership check
# keeps the cell idempotent: re-running it must not append duplicates.
for col in ['CabinNo']:
    if col not in num_features:
        num_features.append(col)
for col in ['Deck', 'Side', 'Soundex']:
    if col not in cat_features:
        cat_features.append(col)

# Attach the engineered columns to the model matrix so downstream cells
# (which train on X_processed) actually use them. assign() overwrites existing
# columns of the same name, so this is also safe to re-run; to_numpy() copies
# values positionally and avoids any index alignment.
engineered_cols = ['CabinNo', 'Deck', 'Side', 'Soundex']
X_processed = X_processed.assign(**{col: train[col].to_numpy() for col in engineered_cols})

In [None]:
# 6
# Optuna objective definition & optimization
# Fixed random_state: every call to sgkf.split yields the same 5 folds.
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
groups = train[GROUP_COL]
y = train[TARGET].astype(int)
X = X_processed
def objective(trial):
    """Optuna objective: out-of-fold log loss of XGBoost for one sampled configuration.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample max_depth, eta, subsample, colsample_bytree and
        scale_pos_weight.

    Returns
    -------
    float
        Log loss between ``y`` and the out-of-fold predictions collected over
        the folds produced by ``sgkf`` (lower is better).
    """
    params = {
        'tree_method': 'hist',
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'eta': trial.suggest_float('eta', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1.0, 10.0),
    }
    oof = np.zeros(len(y))
    for tr, va in sgkf.split(X, y, groups):
        dtr = xgb.DMatrix(X.iloc[tr], label=y.iloc[tr])
        dva = xgb.DMatrix(X.iloc[va], label=y.iloc[va])
        bst = xgb.train(
            params,
            dtr,
            num_boost_round=10000,
            evals=[(dva, 'valid')],
            early_stopping_rounds=100,
            verbose_eval=False,
        )
        # Score with the trees up to the early-stopped best iteration only, so
        # the rounds trained after the optimum never leak into the prediction.
        oof[va] = bst.predict(dva, iteration_range=(0, bst.best_iteration + 1))
    return log_loss(y, oof)
# Seed the sampler so the hyper-parameter search is reproducible on a fresh
# kernel, in line with the fixed random_state of the CV splitter.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=25)

In [None]:
# 7
# Show the optimization result
# Collect the report line by line, then render it as one preformatted block.
report_lines = [f"Best logloss: {study.best_value}", "Best params:"]
for k, v in study.best_params.items():
    report_lines.append(f"{k}: {v}")
report_lines.append(f"Trial#: {study.best_trial.number}")
output = "\n".join(report_lines)
display(HTML(f"<pre>{output}</pre>"))

In [None]:
# 8
# CV evaluation with the best parameters (log loss)
# Start from the tuned hyper-parameters ...
params = study.best_params.copy()
# ... and add the fixed settings that are not part of the search space.
params.update({
    'tree_method': 'hist',
    'objective': 'binary:logistic',
    'eval_metric': 'logloss'
})

# Run the cross-validation; `oof` keeps the out-of-fold probability per row.
oof = np.zeros(len(y))
scores = []
for tr, va in sgkf.split(X, y, groups):
    dtr = xgb.DMatrix(X.iloc[tr], label=y.iloc[tr])
    dva = xgb.DMatrix(X.iloc[va], label=y.iloc[va])
    bst = xgb.train(
        params,
        dtr,
        num_boost_round=10000,
        evals=[(dva, 'valid')],
        early_stopping_rounds=100,
        verbose_eval=False
    )
    # Predict with the trees up to the early-stopped best iteration only;
    # best_iteration is a 0-based index, hence the +1 for the half-open range.
    pred = bst.predict(dva, iteration_range=(0, bst.best_iteration + 1))
    scores.append(log_loss(y.iloc[va], pred))
    oof[va] = pred

# Show mean ± std of the per-fold log loss.
display(HTML(f"<pre>CV logloss: {np.mean(scores):.6f} ± {np.std(scores):.6f}</pre>"))


In [None]:
# 9
# CV evaluation with the best parameters (accuracy)
# Reuse the out-of-fold probabilities (`oof`) produced by the log-loss CV cell
# instead of retraining every fold with the same params and the same splits:
# sgkf has a fixed random_state, so split() yields the identical folds again.
# NOTE(review): assumes the log-loss CV cell ran immediately before this one.
pred_acc_scores = []
for tr, va in sgkf.split(X, y, groups):
    labels = (oof[va] > 0.5).astype(int)
    pred_acc_scores.append(accuracy_score(y.iloc[va], labels))
display(HTML(f"<pre>CV Accuracy: {np.mean(pred_acc_scores):.5f} ± {np.std(pred_acc_scores):.5f}</pre>"))

In [None]:
# 10
# Calibration & threshold re-optimization
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_curve

# params already carries 'eval_metric', so it is not passed again as a keyword.
# use_label_encoder is obsolete in current XGBoost releases and is not passed.
clf = xgb.XGBClassifier(**params)
calib = CalibratedClassifierCV(clf, cv=5, method='sigmoid')

# Out-of-fold calibrated probabilities: fit on each training split, predict
# the held-out split.
oof_calib = np.zeros(len(y))
for tr, va in sgkf.split(X, y, groups):
    calib.fit(X.iloc[tr], y.iloc[tr])
    oof_calib[va] = calib.predict_proba(X.iloc[va])[:, 1]

# Pick the threshold maximizing Youden's J (tpr - fpr).
fpr, tpr, th = roc_curve(y, oof_calib)
thr = th[np.argmax(tpr - fpr)]
# roc_curve counts a sample as positive when score >= threshold, so apply the
# threshold with >= ; a strict > would drop the sample sitting exactly on it.
# NOTE: the threshold is chosen and scored on the same OOF predictions, so this
# accuracy is an optimistic estimate.
acc_calib = accuracy_score(y, (oof_calib >= thr).astype(int))

# Fit the final calibrated model on all data AFTER the fold loop, so `calib`
# is left trained on the full training set rather than on the last fold.
calib.fit(X, y)

print(f"Calibrated Youden Thr: {thr:.3f} → Accuracy: {acc_calib:.5f}")


In [None]:
# 11
# Pseudo-labeling
def encode_with_fitted(values, encoder):
    """Encode ``values`` with the classes of an already fitted LabelEncoder.

    Labels the encoder has not seen map to -1, whereas
    ``LabelEncoder.transform`` would raise on them.

    Parameters
    ----------
    values : pandas.Series
        Raw labels to encode.
    encoder : LabelEncoder
        Fitted encoder; the position of a label in ``classes_`` is its code.

    Returns
    -------
    pandas.Series
        Integer codes, with -1 for unseen labels.
    """
    code_of = {label: code for code, label in enumerate(encoder.classes_)}
    return values.map(code_of).fillna(-1).astype(int)


# Impute with exactly the columns the imputers were fitted on. The feature
# lists may have been extended with engineered columns since then, so take the
# column sets from the fitted training frames instead.
base_num = list(train_num.columns)
base_cat = list(train_cat.columns)

# Numeric features: fill missing values
test_num = pd.DataFrame(
    num_imputer.transform(test[base_num]),
    columns=base_num,
    index=test.index
)

# Categorical features: fill missing values
test_cat = pd.DataFrame(
    cat_imputer.transform(test[base_cat]),
    columns=base_cat,
    index=test.index
)

# Encode categorical columns with the encoders fitted on train
for col in base_cat:
    test_cat[col] = encode_with_fitted(test_cat[col], le_dict[col])

# Cabin decomposition (split once), Deck/Side encoded with the train encoders
cabin_parts = test['Cabin'].fillna('Unknown/0/Unknown').str.split('/', expand=True)
test['Deck']    = encode_with_fitted(cabin_parts[0], le_dict['Deck'])
test['CabinNo'] = cabin_parts[1].astype(float)
test['Side']    = encode_with_fitted(cabin_parts[2], le_dict['Side'])

# Soundex creation & encoding
test['LastName'] = test['Name'].str.split().str[0]
test['Soundex'] = encode_with_fitted(test['LastName'].apply(soundex), le_sdx)

# Final test feature set: same columns, in the same order, as the training
# matrix X, so the booster sees the features it was trained on.
test_features = pd.concat(
    [test_num, test_cat, test[['CabinNo', 'Deck', 'Side', 'Soundex']]],
    axis=1
)
X_test = test_features[list(X.columns)]

# NOTE(review): `bst` is the booster left over from the last fold of the
# preceding CV cell, not a model trained on all data — confirm this is intended.
# best_iteration is a 0-based index, so the number of useful rounds is +1.
best_rounds = bst.best_iteration + 1
test_proba = bst.predict(xgb.DMatrix(X_test), iteration_range=(0, best_rounds))

# Keep only confidently predicted test rows as pseudo-labelled samples. They are
# taken from the processed X_test (not the raw test frame) so they match X.
pseudo_idx = np.where((test_proba>0.90)|(test_proba<0.10))[0]
pseudo_X = X_test.iloc[pseudo_idx]
pseudo_y = (test_proba[pseudo_idx]>0.5).astype(int)
X_pl = pd.concat([X,pseudo_X],axis=0)
y_pl = np.concatenate([y,pseudo_y],axis=0)
dtr_pl = xgb.DMatrix(X_pl,label=y_pl)
bst_pl = xgb.train(params,dtr_pl,num_boost_round=best_rounds)
# NOTE: despite its name, pl_oof is an in-sample prediction on the training
# rows the model was fitted on, so the accuracy below is optimistic.
pl_oof = bst_pl.predict(xgb.DMatrix(X))
print("PL Accuracy:",accuracy_score(y,(pl_oof>0.5).astype(int)))