In [1]:
# ── セル1: ライブラリのインポートと表示設定 ──

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 表示設定
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 100)

# グラフをノートブック内に表示
%matplotlib inline

# ランダムシード
RANDOM_STATE = 42

print("セル1 実行完了: ライブラリ読み込みと設定を行いました。")


セル1 実行完了: ライブラリ読み込みと設定を行いました。


In [2]:
# ── セル2: データ読み込み＆型確認 ──

# 相対パスで data フォルダから読み込み
train = pd.read_csv('../data/train.csv')
test  = pd.read_csv('../data/test.csv')

# 形状確認
print(f"train: {train.shape}, test: {test.shape}")

# 各列のデータ型を表示
display(train.dtypes)

# train データの先頭 5 行を確認
display(train.head())


train: (8693, 14), test: (4277, 13)


PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [3]:
# ── セル3: Cabin 分解 ──

# Cabin を「Deck/番号/Side」に分割し、新しい列を作成
for df in [train, test]:
    # 分割
    cabin_split = df['Cabin'].str.split('/', expand=True)
    df['Deck']     = cabin_split[0]  # 例: 'B'
    df['CabinNum'] = pd.to_numeric(cabin_split[1], errors='coerce')  # 例: 0 → 数値
    df['Side']     = cabin_split[2]  # 例: 'P' or 'S'

    # 欠損値処理
    df['Deck']     = df['Deck'].fillna('Missing')
    df['Side']     = df['Side'].fillna('Missing')

# 結果確認
display(train[['Cabin','Deck','CabinNum','Side']].head())
display(test[['Cabin','Deck','CabinNum','Side']].head())


Unnamed: 0,Cabin,Deck,CabinNum,Side
0,B/0/P,B,0.0,P
1,F/0/S,F,0.0,S
2,A/0/S,A,0.0,S
3,A/0/S,A,0.0,S
4,F/1/S,F,1.0,S


Unnamed: 0,Cabin,Deck,CabinNum,Side
0,G/3/S,G,3.0,S
1,F/4/S,F,4.0,S
2,C/0/S,C,0.0,S
3,C/1/S,C,1.0,S
4,F/5/S,F,5.0,S


In [4]:
# ── セル4: グループサイズ (GroupSize) の作成 ──

for df in [train, test]:
    # PassengerId のプレフィックスをグループIDとして抽出
    df['Group'] = df['PassengerId'].str.split('_').str[0]
    # グループごとの人数を算出
    df['GroupSize'] = df.groupby('Group')['PassengerId'].transform('count')

# 結果確認
display(train[['PassengerId','Group','GroupSize']].head())
display(test[['PassengerId','Group','GroupSize']].head())


Unnamed: 0,PassengerId,Group,GroupSize
0,0001_01,1,1
1,0002_01,2,1
2,0003_01,3,2
3,0003_02,3,2
4,0004_01,4,1


Unnamed: 0,PassengerId,Group,GroupSize
0,0013_01,13,1
1,0018_01,18,1
2,0019_01,19,1
3,0021_01,21,1
4,0023_01,23,1


In [5]:
# ── セル5: 支出合計 & 対数変換 ──

for df in [train, test]:
    # 支出項目の合計を計算
    df['TotalSpending'] = df[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)
    # log1p 変換でスケールを調整
    df['LogTotalSpending'] = np.log1p(df['TotalSpending'])

# 結果確認
display(train[['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck',
               'TotalSpending','LogTotalSpending']].head())
display(test[['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck',
              'TotalSpending','LogTotalSpending']].head())


Unnamed: 0,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,TotalSpending,LogTotalSpending
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,109.0,9.0,25.0,549.0,44.0,736.0,6.602588
2,43.0,3576.0,0.0,6715.0,49.0,10383.0,9.248021
3,0.0,1283.0,371.0,3329.0,193.0,5176.0,8.551981
4,303.0,70.0,151.0,565.0,2.0,1091.0,6.995766


Unnamed: 0,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,TotalSpending,LogTotalSpending
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,9.0,0.0,2823.0,0.0,2832.0,7.949091
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,6652.0,0.0,181.0,585.0,7418.0,8.9118
4,10.0,0.0,635.0,0.0,0.0,645.0,6.4708


In [6]:
# ── セル6: 追加特徴量作成（年齢帯・HomePlanet×Destination・支出比率） ──

for df in [train, test]:
    # 年齢帯 (AgeBand) の作成
    df['AgeBand'] = pd.cut(
        df['Age'],
        bins=[0, 12, 18, 30, 50, 70, df['Age'].max()+1],
        labels=['Child','Teen','YoungAdult','Adult','Senior','Elder']
    )

    # HomePlanet と Destination の組み合わせ特徴 (PlanetRoute)
    df['HomePlanet']    = df['HomePlanet'].fillna('Missing')
    df['Destination']   = df['Destination'].fillna('Missing')
    df['PlanetRoute']   = df['HomePlanet'] + '_' + df['Destination']

    # 支出／年齢、支出／グループサイズ の比率
    # Age=0 を避けるため +1、GroupSize=0 はありえない想定
    df['SpendingPerAge']   = df['TotalSpending'] / (df['Age'] + 1)
    df['SpendingPerGroup'] = df['TotalSpending'] / df['GroupSize']

# 確認
display(train[['Age','AgeBand','HomePlanet','Destination','PlanetRoute',
               'SpendingPerAge','SpendingPerGroup']].head())
display(test[['Age','AgeBand','HomePlanet','Destination','PlanetRoute',
              'SpendingPerAge','SpendingPerGroup']].head())


Unnamed: 0,Age,AgeBand,HomePlanet,Destination,PlanetRoute,SpendingPerAge,SpendingPerGroup
0,39.0,Adult,Europa,TRAPPIST-1e,Europa_TRAPPIST-1e,0.0,0.0
1,24.0,YoungAdult,Earth,TRAPPIST-1e,Earth_TRAPPIST-1e,29.44,736.0
2,58.0,Senior,Europa,TRAPPIST-1e,Europa_TRAPPIST-1e,175.983051,5191.5
3,33.0,Adult,Europa,TRAPPIST-1e,Europa_TRAPPIST-1e,152.235294,2588.0
4,16.0,Teen,Earth,TRAPPIST-1e,Earth_TRAPPIST-1e,64.176471,1091.0


Unnamed: 0,Age,AgeBand,HomePlanet,Destination,PlanetRoute,SpendingPerAge,SpendingPerGroup
0,27.0,YoungAdult,Earth,TRAPPIST-1e,Earth_TRAPPIST-1e,0.0,0.0
1,19.0,YoungAdult,Earth,TRAPPIST-1e,Earth_TRAPPIST-1e,141.6,2832.0
2,31.0,Adult,Europa,55 Cancri e,Europa_55 Cancri e,0.0,0.0
3,38.0,Adult,Europa,TRAPPIST-1e,Europa_TRAPPIST-1e,190.205128,7418.0
4,20.0,YoungAdult,Earth,TRAPPIST-1e,Earth_TRAPPIST-1e,30.714286,645.0


In [9]:
# ── セル7 : 前処理パイプライン定義 & モデル学習（Baseline with new features） ──

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold, cross_val_score
from lightgbm import LGBMClassifier

# 1) 特徴量リスト
num_feats = [
    'CryoSleep','Age','VIP',
    'RoomService','FoodCourt','ShoppingMall','Spa','VRDeck',
    'CabinNum','GroupSize','TotalSpending','LogTotalSpending',
    'SpendingPerAge','SpendingPerGroup'
]
cat_feats = ['Deck','Side','AgeBand','HomePlanet','Destination','PlanetRoute']

# 2) 説明変数・目的変数
X = train[num_feats + cat_feats]
y = train['Transported'].astype(int)

# 3) 前処理パイプライン
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean'))
])
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])
preprocessor = ColumnTransformer([
    ('num', num_pipe, num_feats),
    ('cat', cat_pipe, cat_feats)
])

# 4) モデルパイプライン
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LGBMClassifier(
        n_estimators=100,
        learning_rate=0.1,
        random_state=RANDOM_STATE
    ))
])

# 5) CV 評価
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')

print("FoldごとのCV Accuracy:", scores)
print("平均CV Accuracy:", scores.mean())


[LightGBM] [Info] Number of positive: 3502, number of negative: 3452
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001818 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2722
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 53
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503595 -> initscore=0.014380
[LightGBM] [Info] Start training from score 0.014380
[LightGBM] [Info] Number of positive: 3502, number of negative: 3452
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000713 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2720
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 52
[LightGBM] [Info] [binary:

In [11]:
# ── セル8: OptunaによるLightGBMハイパーパラメータ最適化 ──

import optuna
from sklearn.model_selection import StratifiedKFold, cross_val_score
from lightgbm import LGBMClassifier

# 目的関数定義
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'random_state': RANDOM_STATE,
        'n_jobs': -1,
        'verbosity': -1
    }
    # パイプライン再定義
    clf = LGBMClassifier(**params)
    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', clf)
    ])
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
    # CVでAccuracyを評価
    score = cross_val_score(pipe, X, y, cv=cv, scoring='accuracy').mean()
    return score

# スタディ実行
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

# 最良結果の表示
print("Best CV Accuracy:", study.best_value)
print("Best Params:")
for key, val in study.best_params.items():
    print(f"  {key}: {val}")


[I 2025-07-14 18:08:24,533] A new study created in memory with name: no-name-78bf2281-7b34-4748-869d-914a6dc8c669
[I 2025-07-14 18:08:26,996] Trial 0 finished with value: 0.7950073154220745 and parameters: {'n_estimators': 222, 'learning_rate': 0.01961140697746414, 'max_depth': 3, 'num_leaves': 51, 'subsample': 0.5715477049435773, 'colsample_bytree': 0.5676355193690834}. Best is trial 0 with value: 0.7950073154220745.
[I 2025-07-14 18:08:41,042] Trial 1 finished with value: 0.8086962534848343 and parameters: {'n_estimators': 314, 'learning_rate': 0.0190753784487074, 'max_depth': 11, 'num_leaves': 88, 'subsample': 0.6105187988327343, 'colsample_bytree': 0.7134091601839578}. Best is trial 1 with value: 0.8086962534848343.
[I 2025-07-14 18:08:47,411] Trial 2 finished with value: 0.7982284833617987 and parameters: {'n_estimators': 176, 'learning_rate': 0.130862197829453, 'max_depth': 9, 'num_leaves': 119, 'subsample': 0.5933286944540312, 'colsample_bytree': 0.9183815760921377}. Best is tri

Best CV Accuracy: 0.8113422459503795
Best Params:
  n_estimators: 63
  learning_rate: 0.23319964855802278
  max_depth: 4
  num_leaves: 147
  subsample: 0.8666528971766383
  colsample_bytree: 0.7828660774195799


In [12]:
# ── セル9: チューニングモデルで予測 & 提出ファイル作成 ──

import os
from datetime import datetime
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier

# 1) 最良パラメータで再学習
best_params = study.best_params
# Optunaで探索したパラメータに乱数シードを追加
best_params.update({'random_state': RANDOM_STATE})
best_clf = LGBMClassifier(**best_params)
best_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', best_clf)
])
best_pipe.fit(X, y)

# 2) テストデータで予測
X_test = test[num_feats + cat_feats]
preds = best_pipe.predict(X_test)

# 3) サンプル提出ファイルに結果をセット
submission = pd.read_csv('../data/sample_submission.csv')
submission['Transported'] = preds.astype(bool)

# 4) 出力ディレクトリの作成
timestamp = datetime.now().strftime('%Y%m%d%H%M')
output_dir = f'../outputs/submissions/{timestamp}'
os.makedirs(output_dir, exist_ok=True)

# 5) CSV出力
output_path = os.path.join(output_dir, f'sub_tuned_{timestamp}.csv')
submission.to_csv(output_path, index=False)

print(f'提出ファイルを保存しました: {output_path}')


提出ファイルを保存しました: ../outputs/submissions/202507141818\sub_tuned_202507141818.csv
