# 2024/02/14更新
- 書籍発売後にライブラリのバージョンアップが生じたため、書籍のコードが一部動作しなくなりました
- このため、書籍のコードが動作するようにコードを一部変更しました
    - 変更を最小化するため、基本的には書籍に合わせてライブラリをダウングレード
    - 名称変更となったライブラリは最新のライブラリ名に変更
- なお、一部ですが、2024/02/14時点の最新ライブラリでも動作するコードをコメントアウトで同じセルに参考までに掲載しました。

In [1]:
# 引数が大きく変更されているため、ダウングレードで対応
# 最初に実行してください。
!pip install pandas==1.3.5
!pip install lightgbm==3.3.1
!pip install scikit-learn==1.0.2

# なお、LightGBMの最新版ではCallbackが使われており、過去バージョンから書き方が大きく変化しています。最新版の使い方を知りたい場合は公式ページを参照してください。
# https://lightgbm.readthedocs.io/en/latest/index.html

Collecting pandas==1.3.5
  Downloading pandas-1.3.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.5/11.5 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 2.2.0
    Uninstalling pandas-2.2.0:
      Successfully uninstalled pandas-2.2.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
arviz 0.17.0 requires pandas>=1.4.0, but you have pandas 1.3.5 which is incompatible.
beatrix-jupyterlab 2023.128.151533 requires jupyterlab~=3.6.0, but you have jupyterlab 4.0.11 which is incompatible.
esda 2.5.1 requires pandas>1.4, but you have pandas 1.3.5 which is incompatible.
featuretools 1.28.0 requires pandas>=1.5.0, but you have pandas 1.3.5 which is incompatible.

# Kaggleで磨く 機械学習の実践力
# 第6章 モデルチューニング

# 6.1 LightGBMのハイパーパラメータのチューニング
## 6.1.2 ハイパーパラメータの自動チューニング

#### スクリプト: ライブラリのインポート (スクリプト4-1の再掲)

In [2]:
import numpy as np
import pandas as pd
import os
import pickle
import gc

# 分布確認
# import pandas_profiling as pdp
import ydata_profiling as pdp # ライブラリ名称が変更になったため

# 可視化
import matplotlib.pyplot as plt

# 前処理
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder

# バリデーション
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, GroupKFold

# 評価指標
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

# モデリング: lightgbm
import lightgbm as lgb

import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

#### スクリプト: ファイルの読み込み (スクリプト4-2の再掲)

In [3]:
# Load the Titanic training CSV from the relative input directory.
df_train = pd.read_csv("../input/titanic/train.csv")

#### スクリプト: データセット作成 (スクリプト4-8の再掲)

In [4]:
# Split the raw frame into features, target and row identifiers.
# Double brackets keep every piece a 2-D DataFrame rather than a Series.
x_train = df_train[["Pclass", "Fare"]]
y_train = df_train[["Survived"]]
id_train = df_train[["PassengerId"]]
print(x_train.shape, y_train.shape, id_train.shape)

(891, 2) (891, 1) (891, 1)


#### スクリプト6-1: optunaのインポート

In [5]:
import optuna

#### スクリプト6-2: 目的関数の定義

In [6]:
# Hyperparameters that are held fixed (excluded from the search).
params_base = dict(
    boosting_type="gbdt",
    objective="binary",
    metric="auc",
    learning_rate=0.02,
    n_estimators=100000,
    bagging_freq=1,
    seed=123,
)

def objective(trial):
    """Return the mean 5-fold validation accuracy for one Optuna trial.

    The hyperparameters suggested by ``trial`` are merged with the fixed
    ``params_base`` settings, a LightGBM classifier is fitted on each
    stratified fold, and the accuracy of the validation predictions
    (positive-class probability thresholded at 0.5) is averaged over folds.

    Reads the notebook-level ``x_train``, ``y_train`` and ``params_base``.

    Args:
        trial: Optuna trial object used to sample the hyperparameters.

    Returns:
        Mean validation accuracy across the 5 folds.
    """
    # Hyperparameters explored by the search
    params_tuning = {
        "num_leaves": trial.suggest_int("num_leaves", 8, 256),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 5, 200),
        "min_sum_hessian_in_leaf": trial.suggest_float("min_sum_hessian_in_leaf", 1e-5, 1e-2, log=True),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-2, 1e2, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-2, 1e2, log=True),
    }
    # Fixed settings are applied last, so they win on any key collision.
    params_tuning.update(params_base)

    # Model training and evaluation
    list_metrics = []
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
    for idx_tr, idx_va in cv.split(x_train, y_train):
        # split() yields positional row indices, so rows are taken with .iloc;
        # .loc would only be correct while the index is the default 0..n-1.
        x_tr, y_tr = x_train.iloc[idx_tr, :], y_train.iloc[idx_tr, :]
        x_va, y_va = x_train.iloc[idx_va, :], y_train.iloc[idx_va, :]
        model = lgb.LGBMClassifier(**params_tuning)
        model.fit(x_tr,
                  y_tr,
                  eval_set=[(x_tr, y_tr), (x_va, y_va)],
                  early_stopping_rounds=100,
                  verbose=0,
                 )
#         # To run on the 2024/02/14 (latest-library) environment, use this call instead.
#         model.fit(x_tr,
#                   y_tr,
#                   eval_set=[(x_tr,y_tr), (x_va,y_va)],
#                   callbacks=[
#                       lgb.early_stopping(stopping_rounds=100, verbose=True),
#                       lgb.log_evaluation(0),
#                   ],
#                  )

        # Column 1 of predict_proba is taken as the positive-class probability.
        y_va_pred = model.predict_proba(x_va)[:, 1]
        metric_va = accuracy_score(y_va, np.where(y_va_pred >= 0.5, 1, 0))
        list_metrics.append(metric_va)

    # Aggregate the per-fold scores into the value Optuna optimizes
    metrics = np.mean(list_metrics)

    return metrics

#### スクリプト6-3: 最適化処理（探索の実行）

In [7]:
# Fix the TPE sampler seed (intended to make the suggested values repeatable).
sampler = optuna.samplers.TPESampler(seed=123)
# The objective returns an accuracy, so the study searches for its maximum.
study = optuna.create_study(sampler=sampler, direction="maximize")
# Evaluate 30 hyperparameter combinations.
study.optimize(objective, n_trials=30)

[I 2024-02-14 12:56:15,523] A new study created in memory with name: no-name-7006dc18-ca31-4eff-b0a4-2e2b586d8ce3




[I 2024-02-14 12:56:18,499] Trial 0 finished with value: 0.664478061640826 and parameters: {'num_leaves': 181, 'min_data_in_leaf': 61, 'min_sum_hessian_in_leaf': 4.792414358623587e-05, 'feature_fraction': 0.7756573845414456, 'bagging_fraction': 0.8597344848927815, 'lambda_l1': 0.492522233779106, 'lambda_l2': 83.76388146302445}. Best is trial 0 with value: 0.664478061640826.




[I 2024-02-14 12:56:20,010] Trial 1 finished with value: 0.6712196346745339 and parameters: {'num_leaves': 178, 'min_data_in_leaf': 99, 'min_sum_hessian_in_leaf': 0.00015009027543233888, 'feature_fraction': 0.6715890080754348, 'bagging_fraction': 0.8645248536920208, 'lambda_l1': 0.567922374174008, 'lambda_l2': 0.01732652966363563}. Best is trial 1 with value: 0.6712196346745339.




[I 2024-02-14 12:56:21,622] Trial 2 finished with value: 0.65762350134957 and parameters: {'num_leaves': 107, 'min_data_in_leaf': 149, 'min_sum_hessian_in_leaf': 3.52756635172055e-05, 'feature_fraction': 0.5877258780737462, 'bagging_fraction': 0.7657756869209191, 'lambda_l1': 1.3406343673102123, 'lambda_l2': 3.4482904089131434}. Best is trial 1 with value: 0.6712196346745339.




[I 2024-02-14 12:56:23,360] Trial 3 finished with value: 0.6722302429226037 and parameters: {'num_leaves': 219, 'min_data_in_leaf': 146, 'min_sum_hessian_in_leaf': 0.0006808799287054756, 'feature_fraction': 0.8612216912851107, 'bagging_fraction': 0.6614794569265892, 'lambda_l1': 0.2799978022399009, 'lambda_l2': 0.08185645330667264}. Best is trial 3 with value: 0.6722302429226037.




[I 2024-02-14 12:56:24,575] Trial 4 finished with value: 0.668972443663298 and parameters: {'num_leaves': 81, 'min_data_in_leaf': 128, 'min_sum_hessian_in_leaf': 1.889360449174926e-05, 'feature_fraction': 0.7168505863397641, 'bagging_fraction': 0.7154313816648219, 'lambda_l1': 0.9434967110751797, 'lambda_l2': 0.5050346330980694}. Best is trial 3 with value: 0.6722302429226037.




[I 2024-02-14 12:56:26,092] Trial 5 finished with value: 0.6587847592743706 and parameters: {'num_leaves': 85, 'min_data_in_leaf': 88, 'min_sum_hessian_in_leaf': 0.004788147156768277, 'feature_fraction': 0.9720800091019398, 'bagging_fraction': 0.7509183379421682, 'lambda_l1': 3.1319282717196035, 'lambda_l2': 0.029005047452739414}. Best is trial 3 with value: 0.6722302429226037.




[I 2024-02-14 12:56:26,459] Trial 6 finished with value: 0.6161634548992531 and parameters: {'num_leaves': 87, 'min_data_in_leaf': 86, 'min_sum_hessian_in_leaf': 0.003971252247766701, 'feature_fraction': 0.6252276826982534, 'bagging_fraction': 0.7415171321313522, 'lambda_l1': 87.54657140659076, 'lambda_l2': 1.1965765212602313}. Best is trial 3 with value: 0.6722302429226037.




[I 2024-02-14 12:56:29,989] Trial 7 finished with value: 0.6992530286862093 and parameters: {'num_leaves': 160, 'min_data_in_leaf': 28, 'min_sum_hessian_in_leaf': 0.0030131614432849746, 'feature_fraction': 0.8015300642054637, 'bagging_fraction': 0.7725340032332324, 'lambda_l1': 0.23499322154972468, 'lambda_l2': 0.1646202117975735}. Best is trial 7 with value: 0.6992530286862093.




[I 2024-02-14 12:56:31,907] Trial 8 finished with value: 0.6823363254033017 and parameters: {'num_leaves': 111, 'min_data_in_leaf': 138, 'min_sum_hessian_in_leaf': 0.00423029374725911, 'feature_fraction': 0.7552111687390055, 'bagging_fraction': 0.8346568914811361, 'lambda_l1': 2.206714812711709, 'lambda_l2': 3.1594683442464033}. Best is trial 7 with value: 0.6992530286862093.




[I 2024-02-14 12:56:32,912] Trial 9 finished with value: 0.6362751867428285 and parameters: {'num_leaves': 175, 'min_data_in_leaf': 170, 'min_sum_hessian_in_leaf': 1.7765808030254076e-05, 'feature_fraction': 0.8818414207216692, 'bagging_fraction': 0.6218331872684371, 'lambda_l1': 0.05982625838323253, 'lambda_l2': 1.9490717640641542}. Best is trial 7 with value: 0.6992530286862093.




[I 2024-02-14 12:56:36,185] Trial 10 finished with value: 0.673435440336451 and parameters: {'num_leaves': 32, 'min_data_in_leaf': 6, 'min_sum_hessian_in_leaf': 0.0009194171614722974, 'feature_fraction': 0.5040305717020102, 'bagging_fraction': 0.9940542446575642, 'lambda_l1': 0.010612397212799423, 'lambda_l2': 0.1661409929489422}. Best is trial 7 with value: 0.6992530286862093.




[I 2024-02-14 12:56:36,957] Trial 11 finished with value: 0.6161634548992531 and parameters: {'num_leaves': 141, 'min_data_in_leaf': 198, 'min_sum_hessian_in_leaf': 0.009951069387483545, 'feature_fraction': 0.7991399603154743, 'bagging_fraction': 0.8761275059380933, 'lambda_l1': 8.895512707730266, 'lambda_l2': 11.692356850069807}. Best is trial 7 with value: 0.6992530286862093.




[I 2024-02-14 12:56:42,259] Trial 12 finished with value: 0.6802083987194777 and parameters: {'num_leaves': 255, 'min_data_in_leaf': 18, 'min_sum_hessian_in_leaf': 0.001634914743632515, 'feature_fraction': 0.8476730378212194, 'bagging_fraction': 0.5595408581248553, 'lambda_l1': 0.09349295720311095, 'lambda_l2': 0.2669531355707319}. Best is trial 7 with value: 0.6992530286862093.




[I 2024-02-14 12:56:43,660] Trial 13 finished with value: 0.6712196346745339 and parameters: {'num_leaves': 140, 'min_data_in_leaf': 43, 'min_sum_hessian_in_leaf': 0.0021756690901938718, 'feature_fraction': 0.9479314162009256, 'bagging_fraction': 0.9474999290561824, 'lambda_l1': 15.027486795162927, 'lambda_l2': 16.04887249986447}. Best is trial 7 with value: 0.6992530286862093.




[I 2024-02-14 12:56:45,818] Trial 14 finished with value: 0.6846400100433118 and parameters: {'num_leaves': 31, 'min_data_in_leaf': 60, 'min_sum_hessian_in_leaf': 0.0002511161117887837, 'feature_fraction': 0.7214624501496751, 'bagging_fraction': 0.8148189817022143, 'lambda_l1': 0.10302449045855197, 'lambda_l2': 7.1467516807077525}. Best is trial 7 with value: 0.6992530286862093.




[I 2024-02-14 12:56:47,626] Trial 15 finished with value: 0.6745904211913878 and parameters: {'num_leaves': 9, 'min_data_in_leaf': 46, 'min_sum_hessian_in_leaf': 0.00023305225408823253, 'feature_fraction': 0.690460596426745, 'bagging_fraction': 0.8032054077767327, 'lambda_l1': 0.026008451540619953, 'lambda_l2': 12.21210843043782}. Best is trial 7 with value: 0.6992530286862093.




[I 2024-02-14 12:56:49,397] Trial 16 finished with value: 0.6778858828698764 and parameters: {'num_leaves': 41, 'min_data_in_leaf': 65, 'min_sum_hessian_in_leaf': 0.00014054556930505904, 'feature_fraction': 0.816763159679514, 'bagging_fraction': 0.6677912738306708, 'lambda_l1': 0.14515159340667338, 'lambda_l2': 34.66806840700916}. Best is trial 7 with value: 0.6992530286862093.




[I 2024-02-14 12:56:51,808] Trial 17 finished with value: 0.6801393509509761 and parameters: {'num_leaves': 48, 'min_data_in_leaf': 28, 'min_sum_hessian_in_leaf': 0.00041942600526778174, 'feature_fraction': 0.6231218216909848, 'bagging_fraction': 0.5002172961009613, 'lambda_l1': 0.03426707576896973, 'lambda_l2': 0.04837506886369723}. Best is trial 7 with value: 0.6992530286862093.




[I 2024-02-14 12:56:55,658] Trial 18 finished with value: 0.6790848032138597 and parameters: {'num_leaves': 218, 'min_data_in_leaf': 61, 'min_sum_hessian_in_leaf': 7.90220458942919e-05, 'feature_fraction': 0.9236171148088437, 'bagging_fraction': 0.9432358972978486, 'lambda_l1': 0.18409793634935437, 'lambda_l2': 0.42463135597338925}. Best is trial 7 with value: 0.6992530286862093.




[I 2024-02-14 12:56:56,914] Trial 19 finished with value: 0.6622308706295901 and parameters: {'num_leaves': 160, 'min_data_in_leaf': 113, 'min_sum_hessian_in_leaf': 0.0004165592806968668, 'feature_fraction': 0.7302924887036465, 'bagging_fraction': 0.805029885015916, 'lambda_l1': 0.010045321756357375, 'lambda_l2': 0.0995890378098838}. Best is trial 7 with value: 0.6992530286862093.




[I 2024-02-14 12:56:59,513] Trial 20 finished with value: 0.6745904211913879 and parameters: {'num_leaves': 209, 'min_data_in_leaf': 35, 'min_sum_hessian_in_leaf': 0.0013845801360137025, 'feature_fraction': 0.5306298908707103, 'bagging_fraction': 0.6900582768491921, 'lambda_l1': 0.3216819410872765, 'lambda_l2': 0.9453451423419853}. Best is trial 7 with value: 0.6992530286862093.




[I 2024-02-14 12:57:02,054] Trial 21 finished with value: 0.6667252526520621 and parameters: {'num_leaves': 106, 'min_data_in_leaf': 73, 'min_sum_hessian_in_leaf': 0.008748025832898368, 'feature_fraction': 0.7668244440376193, 'bagging_fraction': 0.8140984986812078, 'lambda_l1': 3.445630241563508, 'lambda_l2': 4.156916351584709}. Best is trial 7 with value: 0.6992530286862093.




[I 2024-02-14 12:57:03,676] Trial 22 finished with value: 0.6644152909421882 and parameters: {'num_leaves': 62, 'min_data_in_leaf': 129, 'min_sum_hessian_in_leaf': 0.004061668550970804, 'feature_fraction': 0.7482688842343571, 'bagging_fraction': 0.8886750178544316, 'lambda_l1': 2.1127374904866487, 'lambda_l2': 4.554403222246632}. Best is trial 7 with value: 0.6992530286862093.




[I 2024-02-14 12:57:04,731] Trial 23 finished with value: 0.6453141673466826 and parameters: {'num_leaves': 111, 'min_data_in_leaf': 6, 'min_sum_hessian_in_leaf': 0.0026312807570427793, 'feature_fraction': 0.6773917561139398, 'bagging_fraction': 0.8221198194153576, 'lambda_l1': 8.59982035475244, 'lambda_l2': 0.7542908028826634}. Best is trial 7 with value: 0.6992530286862093.




[I 2024-02-14 12:57:06,632] Trial 24 finished with value: 0.6823488795430293 and parameters: {'num_leaves': 9, 'min_data_in_leaf': 164, 'min_sum_hessian_in_leaf': 0.0007958826711101101, 'feature_fraction': 0.8193477007279062, 'bagging_fraction': 0.9137850613244668, 'lambda_l1': 0.054062737213373784, 'lambda_l2': 7.254429610183551}. Best is trial 7 with value: 0.6992530286862093.




[I 2024-02-14 12:57:08,197] Trial 25 finished with value: 0.6599836796183542 and parameters: {'num_leaves': 21, 'min_data_in_leaf': 176, 'min_sum_hessian_in_leaf': 0.0007220208410542904, 'feature_fraction': 0.897993229060577, 'bagging_fraction': 0.9096697647298414, 'lambda_l1': 0.04052688745892243, 'lambda_l2': 33.56741330161014}. Best is trial 7 with value: 0.6992530286862093.




[I 2024-02-14 12:57:11,787] Trial 26 finished with value: 0.6757140166970059 and parameters: {'num_leaves': 63, 'min_data_in_leaf': 49, 'min_sum_hessian_in_leaf': 0.00019985545118893227, 'feature_fraction': 0.8168240033302105, 'bagging_fraction': 0.777469997490065, 'lambda_l1': 0.07242256998354961, 'lambda_l2': 6.818961945625027}. Best is trial 7 with value: 0.6992530286862093.




[I 2024-02-14 12:57:13,910] Trial 27 finished with value: 0.6700207143305505 and parameters: {'num_leaves': 9, 'min_data_in_leaf': 110, 'min_sum_hessian_in_leaf': 0.0011015177430549305, 'feature_fraction': 0.832859028154117, 'bagging_fraction': 0.9245894599408239, 'lambda_l1': 0.13213697905758195, 'lambda_l2': 1.9898087013583565}. Best is trial 7 with value: 0.6992530286862093.




[I 2024-02-14 12:57:14,740] Trial 28 finished with value: 0.6161634548992531 and parameters: {'num_leaves': 54, 'min_data_in_leaf': 169, 'min_sum_hessian_in_leaf': 0.0004463850369701868, 'feature_fraction': 0.7818076921940365, 'bagging_fraction': 0.7239926057056618, 'lambda_l1': 0.26894380562875153, 'lambda_l2': 82.43363076639685}. Best is trial 7 with value: 0.6992530286862093.




[I 2024-02-14 12:57:16,166] Trial 29 finished with value: 0.6587470968551881 and parameters: {'num_leaves': 161, 'min_data_in_leaf': 196, 'min_sum_hessian_in_leaf': 8.505644215173895e-05, 'feature_fraction': 0.7116385045852216, 'bagging_fraction': 0.9850169158486759, 'lambda_l1': 0.02276315503480073, 'lambda_l2': 32.318206202200265}. Best is trial 7 with value: 0.6992530286862093.


#### スクリプト6-4: 探索結果の確認

In [8]:
# Show the best trial's score and the hyperparameters that produced it.
trial = study.best_trial
print("acc(best)={:.4f}".format(trial.value))
display(trial.params)

acc(best)=0.6993


{'num_leaves': 160,
 'min_data_in_leaf': 28,
 'min_sum_hessian_in_leaf': 0.0030131614432849746,
 'feature_fraction': 0.8015300642054637,
 'bagging_fraction': 0.7725340032332324,
 'lambda_l1': 0.23499322154972468,
 'lambda_l2': 0.1646202117975735}

#### スクリプト6-5: ベストなハイパーパラメータの取得

In [9]:
# Merge the tuned values with the fixed settings into a NEW dict, so the
# params stored on the trial object itself are not modified as a side effect.
# Key order (tuned first, fixed second) matches the original update() result.
params_best = {**trial.params, **params_base}
display(params_best)

{'num_leaves': 160,
 'min_data_in_leaf': 28,
 'min_sum_hessian_in_leaf': 0.0030131614432849746,
 'feature_fraction': 0.8015300642054637,
 'bagging_fraction': 0.7725340032332324,
 'lambda_l1': 0.23499322154972468,
 'lambda_l2': 0.1646202117975735,
 'boosting_type': 'gbdt',
 'objective': 'binary',
 'metric': 'auc',
 'learning_rate': 0.02,
 'n_estimators': 100000,
 'bagging_freq': 1,
 'seed': 123}

# 6.2 LightGBM以外のモデル利用
## 6.2.1 scikit-learnの各種モデル

### Titanicデータを用いた例：ロジスティック回帰
#### スクリプト6-6: ファイル読み込みとデータセット作成

In [10]:
# Load the file
df_train = pd.read_csv("../input/titanic/train.csv")

# Build the dataset.
# .copy() makes x_train an independent frame, so the column assignments in
# the following cells do not operate on a slice of df_train
# (which raises SettingWithCopyWarning).
x_train = df_train[["Pclass", "Age", "Embarked"]].copy()
y_train = df_train[["Survived"]]

In [11]:
# Count the missing values in each feature column
x_train.isnull().sum()

Pclass        0
Age         177
Embarked      2
dtype: int64

#### スクリプト6-7: 欠損値の補間

In [12]:
# Missing-value imputation (numeric): fill Age with the column mean
x_train["Age"] = x_train["Age"].fillna(x_train["Age"].mean())

# Missing-value imputation (categorical): fill Embarked with its most frequent value
x_train["Embarked"] = x_train["Embarked"].fillna(x_train["Embarked"].mode()[0])

#### スクリプト6-8: カテゴリ変数の数値化（one-hot-encoding）

In [13]:
# One-hot encode Embarked; the new columns are named "Embarked_<category>".
ohe = OneHotEncoder()
ohe.fit(x_train[["Embarked"]])
df_embarked = pd.DataFrame(
    ohe.transform(x_train[["Embarked"]]).toarray(),
    columns=["Embarked_{}".format(col) for col in ohe.categories_[0]],
    # Reuse x_train's index: concat(axis=1) aligns rows by index label, so a
    # fresh 0..n-1 index would misalign rows if x_train's index ever differed.
    index=x_train.index,
)

# Swap the original categorical column for its encoded columns.
x_train = pd.concat([x_train, df_embarked], axis=1)
x_train = x_train.drop(columns=["Embarked"])

#### スクリプト6-9: 数値データの正規化

In [14]:
# Min-max scale each numeric column: (value - min) / (max - min).
for col in ["Pclass", "Age"]:
    col_min = x_train[col].min()
    col_max = x_train[col].max()
    x_train[col] = (x_train[col] - col_min) / (col_max - col_min)

#### スクリプト6-10: 学習データと検証データの分割（ホールドアウト検証）

In [15]:
# Hold out 20% of the rows for validation, stratified on the target;
# random_state pins the shuffle.
x_tr, x_va, y_tr, y_va = train_test_split(x_train, y_train, test_size=0.2, stratify=y_train, random_state=123)
print(x_tr.shape, x_va.shape, y_tr.shape, y_va.shape)

(712, 5) (179, 5) (712, 1) (179, 1)


#### スクリプト6-11: LogisticRegression

In [16]:
# Model definition
from sklearn.linear_model import LogisticRegression
model_logis = LogisticRegression()

# Training
# y_tr is a single-column DataFrame; flatten it to the 1-D target shape that
# scikit-learn expects (a column vector triggers DataConversionWarning).
model_logis.fit(x_tr, y_tr.values.ravel())

# Prediction
y_va_pred = model_logis.predict(x_va)
print("accuracy:{:.4f}".format(accuracy_score(y_va, y_va_pred)))
print(y_va_pred[:5])

accuracy:0.7263
[0 1 0 1 0]


#### スクリプト6-12: 確率値の取得

In [17]:
# Per-class probabilities: one row per validation sample, one column per class
y_va_pred_proba = model_logis.predict_proba(x_va)
print(y_va_pred_proba[:5, :])

[[0.83621285 0.16378715]
 [0.23058311 0.76941689]
 [0.83244141 0.16755859]
 [0.32227072 0.67772928]
 [0.62569522 0.37430478]]


### Titanicデータを用いた例：SVM
#### スクリプト6-13: SVM

In [18]:
# Model definition
from sklearn.svm import SVC
# probability=True is needed so that predict_proba is available below.
model_svm = SVC(C=1.0, random_state=123, probability=True)

# Training
# y_tr is a single-column DataFrame; flatten it to the 1-D target shape that
# scikit-learn expects (a column vector triggers DataConversionWarning).
model_svm.fit(x_tr, y_tr.values.ravel())

# Prediction
y_va_pred = model_svm.predict(x_va)
print("accuracy:{:.4f}".format(accuracy_score(y_va, y_va_pred)))
print(y_va_pred[:5])

# Class probabilities
y_va_pred_proba = model_svm.predict_proba(x_va)
print(y_va_pred_proba[:5, :])

accuracy:0.7151
[0 1 0 1 0]
[[0.73985924 0.26014076]
 [0.28242534 0.71757466]
 [0.73986177 0.26013823]
 [0.26828214 0.73171786]
 [0.58950192 0.41049808]]


## 6.2.2 ニューラルネットワーク


### ニューラルネットワークの適用例：①全結合層のみのネットワークモデル
#### スクリプト6-14: ライブラリのインポート

In [19]:
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization
from tensorflow.keras.layers import Embedding, Flatten, Concatenate
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, LearningRateScheduler
from tensorflow.keras.optimizers import Adam, SGD

2024-02-14 12:57:21.602077: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-14 12:57:21.602277: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-14 12:57:21.794351: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


#### スクリプト6-15: tensorflowの再現性のためのシード指定

In [20]:
def seed_everything(seed):
    """Seed the Python, NumPy and TensorFlow RNGs and use a 1-thread TF session.

    Args:
        seed: Value applied to every random number source below.
    """
    import random

    # Seed each random number source in turn.
    random.seed(seed)
    # NOTE(review): presumably this only affects child processes, since the
    # running interpreter reads PYTHONHASHSEED at startup - confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

    # Build a session limited to one thread for both intra-op and inter-op
    # parallelism and register it as the Keras backend session.
    single_thread_conf = tf.compat.v1.ConfigProto(
        intra_op_parallelism_threads=1,
        inter_op_parallelism_threads=1
    )
    tf.compat.v1.keras.backend.set_session(
        tf.compat.v1.Session(
            graph=tf.compat.v1.get_default_graph(),
            config=single_thread_conf,
        )
    )

#### スクリプト6-16: ファイルの読み込みとデータセット作成

In [21]:
# Load the file
df_train = pd.read_csv("../input/titanic/train.csv")

# Build the dataset.
# .copy() makes x_train an independent frame, so the column assignments in
# the following cells do not operate on a slice of df_train
# (which raises SettingWithCopyWarning).
x_train = df_train[["Pclass", "Age", "Embarked"]].copy()
y_train = df_train[["Survived"]]

#### スクリプト6-17: 数値データの前処理

In [22]:
# Missing-value imputation: fill Age with the column mean
x_train["Age"] = x_train["Age"].fillna(x_train["Age"].mean())

# Normalization: min-max scale each numeric column, (value - min) / (max - min)
for col in ["Pclass", "Age"]:
    value_min = x_train[col].min()
    value_max = x_train[col].max()
    x_train[col] = (x_train[col] - value_min) / (value_max - value_min)

#### スクリプト6-18: カテゴリ変数の前処理

In [23]:
# Missing-value imputation: fill Embarked with its most frequent value
x_train["Embarked"] = x_train["Embarked"].fillna(x_train["Embarked"].mode()[0])

# One-hot encoding; the new columns are named "Embarked_<category>".
ohe = OneHotEncoder()
ohe.fit(x_train[["Embarked"]])
df_embarked = pd.DataFrame(
    ohe.transform(x_train[["Embarked"]]).toarray(),
    columns=["Embarked_{}".format(col) for col in ohe.categories_[0]],
    # Reuse x_train's index: concat(axis=1) aligns rows by index label, so a
    # fresh 0..n-1 index would misalign rows if x_train's index ever differed.
    index=x_train.index,
)
x_train = pd.concat([x_train.drop(columns=["Embarked"]),
                     df_embarked], axis=1)

#### スクリプト6-19: 学習データと検証データの分割

In [24]:
# Hold out 20% of the rows for validation, stratified on the target;
# random_state pins the shuffle.
x_tr, x_va, y_tr, y_va = train_test_split(x_train, y_train, test_size=0.2, stratify=y_train, random_state=123)
print(x_tr.shape, x_va.shape, y_tr.shape, y_va.shape)

(712, 5) (179, 5) (712, 1) (179, 1)


#### スクリプト6-20: モデル定義

In [25]:
def create_model():
    """Build and compile the fully connected binary classifier.

    The network takes 5 input features, runs them through three
    Dense(relu) -> BatchNormalization -> Dropout stages (10, 10 and 5 units
    with dropout rates 0.3, 0.2 and 0.1) and ends in one sigmoid unit.

    Returns:
        The compiled Keras ``Model`` (Adam optimizer, binary cross-entropy
        as both the loss and the reported metric).
    """
    inputs = Input(shape=(5,))

    # (units, dropout rate) of each hidden stage, in order.
    hidden = inputs
    for units, drop_rate in ((10, 0.3), (10, 0.2), (5, 0.1)):
        hidden = Dense(units, activation="relu")(hidden)
        hidden = BatchNormalization()(hidden)
        hidden = Dropout(drop_rate)(hidden)

    outputs = Dense(1, activation="sigmoid")(hidden)

    network = Model(inputs=inputs, outputs=outputs)
    network.compile(
        optimizer="Adam",
        loss="binary_crossentropy",
        metrics=["binary_crossentropy"],
    )
    return network

# Instantiate the network and print its layer-by-layer summary.
model = create_model()
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 5)]               0         
                                                                 
 dense (Dense)               (None, 10)                60        
                                                                 
 batch_normalization (Batch  (None, 10)                40        
 Normalization)                                                  
                                                                 
 dropout (Dropout)           (None, 10)                0         
                                                                 
 dense_1 (Dense)             (None, 10)                110       
                                                                 
 batch_normalization_1 (Bat  (None, 10)                40        
 chNormalization)                                            

#### スクリプト6-21: モデル学習

In [26]:
# Re-seed and rebuild the model before training (intended for reproducibility).
seed_everything(seed=123)
model = create_model()
model.fit(x=x_tr,
          y=y_tr,
          validation_data=(x_va, y_va),
          batch_size=8,
          # Large upper bound; training is meant to be cut short by EarlyStopping.
          epochs=10000,
          callbacks=[
              # Save the weights to model_keras.h5 whenever val_loss reaches a new minimum.
              ModelCheckpoint(filepath="model_keras.h5", monitor="val_loss", mode="min", verbose=1, save_best_only=True, save_weights_only=True),
              # Stop after 10 epochs without val_loss improvement and restore the best weights.
              EarlyStopping(monitor="val_loss", mode="min", min_delta=0, patience=10, verbose=1, restore_best_weights=True),
              # Scale the learning rate by 0.1 after 5 epochs without val_loss improvement.
              ReduceLROnPlateau(monitor="val_loss", mode="min", factor=0.1, patience=5, verbose=1),
          ],
          verbose=1,
         )

Epoch 1/10000
Epoch 1: val_loss improved from inf to 0.68176, saving model to model_keras.h5
Epoch 2/10000
Epoch 2: val_loss improved from 0.68176 to 0.66805, saving model to model_keras.h5
Epoch 3/10000
Epoch 3: val_loss improved from 0.66805 to 0.65480, saving model to model_keras.h5
Epoch 4/10000
Epoch 4: val_loss improved from 0.65480 to 0.63951, saving model to model_keras.h5
Epoch 5/10000
Epoch 5: val_loss improved from 0.63951 to 0.62343, saving model to model_keras.h5
Epoch 6/10000
Epoch 6: val_loss improved from 0.62343 to 0.61811, saving model to model_keras.h5
Epoch 7/10000
Epoch 7: val_loss improved from 0.61811 to 0.61267, saving model to model_keras.h5
Epoch 8/10000
Epoch 8: val_loss did not improve from 0.61267
Epoch 9/10000
Epoch 9: val_loss improved from 0.61267 to 0.61192, saving model to model_keras.h5
Epoch 10/10000
Epoch 10: val_loss improved from 0.61192 to 0.60599, saving model to model_keras.h5
Epoch 11/10000
Epoch 11: val_loss improved from 0.60599 to 0.60516, 

<keras.src.callbacks.History at 0x7d520da7d0f0>

#### スクリプト6-22: モデルの評価

In [27]:
# Predict on the validation set; the model output is thresholded at 0.5
# to obtain class labels for the accuracy computation.
y_va_pred = model.predict(x_va, batch_size=8, verbose=1)
print("accuracy: {:.4f}".format(accuracy_score(y_va, np.where(y_va_pred>=0.5,1,0))))

accuracy: 0.7095


### ニューラルネットワークの適用例：②埋め込み層ありのネットワークモデル
#### スクリプト6-23: ファイルの読み込みとデータセット作成

In [28]:
# Load the file
df_train = pd.read_csv("../input/titanic/train.csv")

# Build the dataset.
# .copy() makes x_train an independent frame, so the column assignments in
# the following cells do not operate on a slice of df_train
# (which raises SettingWithCopyWarning).
x_train = df_train[["Pclass", "Age", "Cabin"]].copy()
y_train = df_train[["Survived"]]

#### スクリプト6-24: 数値データの前処理

In [29]:
# Missing-value imputation: fill Age with the column mean
x_train["Age"] = x_train["Age"].fillna(x_train["Age"].mean())

# Normalization: min-max scale each numeric column, (value - min) / (max - min)
for col in ["Pclass", "Age"]:
    value_min = x_train[col].min()
    value_max = x_train[col].max()
    x_train[col] = (x_train[col] - value_min) / (value_max - value_min)

#### スクリプト6-25: カテゴリ変数の前処理

In [30]:
# Missing-value imputation: treat a missing Cabin as its own "None" category
x_train["Cabin"] = x_train["Cabin"].fillna("None")

# Label encoding
# LabelEncoder expects 1-D input, so it is fitted on the Series (single
# brackets), the same form that is transformed; fitting on a one-column
# DataFrame triggers DataConversionWarning.
le = LabelEncoder()
x_train["Cabin"] = le.fit_transform(x_train["Cabin"])

# The class count is the vocabulary size the embedding layer must cover.
print(le.classes_)
print("count:", len(le.classes_))

['A10' 'A14' 'A16' 'A19' 'A20' 'A23' 'A24' 'A26' 'A31' 'A32' 'A34' 'A36'
 'A5' 'A6' 'A7' 'B101' 'B102' 'B18' 'B19' 'B20' 'B22' 'B28' 'B3' 'B30'
 'B35' 'B37' 'B38' 'B39' 'B4' 'B41' 'B42' 'B49' 'B5' 'B50' 'B51 B53 B55'
 'B57 B59 B63 B66' 'B58 B60' 'B69' 'B71' 'B73' 'B77' 'B78' 'B79' 'B80'
 'B82 B84' 'B86' 'B94' 'B96 B98' 'C101' 'C103' 'C104' 'C106' 'C110' 'C111'
 'C118' 'C123' 'C124' 'C125' 'C126' 'C128' 'C148' 'C2' 'C22 C26'
 'C23 C25 C27' 'C30' 'C32' 'C45' 'C46' 'C47' 'C49' 'C50' 'C52' 'C54'
 'C62 C64' 'C65' 'C68' 'C7' 'C70' 'C78' 'C82' 'C83' 'C85' 'C86' 'C87'
 'C90' 'C91' 'C92' 'C93' 'C95' 'C99' 'D' 'D10 D12' 'D11' 'D15' 'D17' 'D19'
 'D20' 'D21' 'D26' 'D28' 'D30' 'D33' 'D35' 'D36' 'D37' 'D45' 'D46' 'D47'
 'D48' 'D49' 'D50' 'D56' 'D6' 'D7' 'D9' 'E10' 'E101' 'E12' 'E121' 'E17'
 'E24' 'E25' 'E31' 'E33' 'E34' 'E36' 'E38' 'E40' 'E44' 'E46' 'E49' 'E50'
 'E58' 'E63' 'E67' 'E68' 'E77' 'E8' 'F E69' 'F G63' 'F G73' 'F2' 'F33'
 'F38' 'F4' 'G6' 'None' 'T']
count: 148


#### スクリプト6-26: 学習データと検証データの分離

In [31]:
# Separate the numeric and categorical features.
x_train_num = x_train[["Pclass", "Age"]]
x_train_cat = x_train[["Cabin"]]

# A single call splits all three frames with the same row assignment.
(x_num_tr, x_num_va,
 x_cat_tr, x_cat_va,
 y_tr, y_va) = train_test_split(
    x_train_num, x_train_cat, y_train,
    test_size=0.2, stratify=y_train, random_state=123,
)
print(x_num_tr.shape, x_num_va.shape, x_cat_tr.shape, x_cat_va.shape, y_tr.shape, y_va.shape)

(712, 2) (179, 2) (712, 1) (179, 1) (712, 1) (179, 1)


#### スクリプト6-27: モデル定義

In [32]:
def create_model_embedding(n_num_features=2, n_categories=148, embedding_dim=74):
    """Build and compile a two-input binary classifier with an embedding branch.

    The numeric input goes through a small dense stack, the categorical input
    (one integer id per row) goes through an embedding, and both branches are
    concatenated and fed to a dense head with a sigmoid output.

    Args:
        n_num_features: Width of the numeric input vector.
        n_categories: Vocabulary size of the embedding. It must be larger
            than the largest category id fed to the model. The default of 148
            matches the number of label-encoded ``Cabin`` classes.
        embedding_dim: Size of the vector learned for each category.

    Returns:
        A compiled Keras ``Model`` taking ``[numeric, categorical]`` inputs,
        trained with Adam on binary cross-entropy.
    """
    # ---- numeric branch
    input_num = Input(shape=(n_num_features,))
    layer_num = Dense(10, activation="relu")(input_num)
    layer_num = BatchNormalization()(layer_num)
    layer_num = Dropout(0.2)(layer_num)
    layer_num = Dense(10, activation="relu")(layer_num)

    # ---- categorical branch
    input_cat = Input(shape=(1,))
    # Drop the trailing axis so the embedding receives one id per sample.
    layer_cat = input_cat[:, 0]
    layer_cat = Embedding(input_dim=n_categories, output_dim=embedding_dim)(layer_cat)
    layer_cat = Dropout(0.2)(layer_cat)
    layer_cat = Flatten()(layer_cat)

    # ---- merge both branches and classify
    hidden_layer = Concatenate()([layer_num, layer_cat])
    hidden_layer = Dense(50, activation="relu")(hidden_layer)
    hidden_layer = BatchNormalization()(hidden_layer)
    hidden_layer = Dropout(0.1)(hidden_layer)
    hidden_layer = Dense(20, activation="relu")(hidden_layer)
    hidden_layer = BatchNormalization()(hidden_layer)
    hidden_layer = Dropout(0.1)(hidden_layer)
    output_layer = Dense(1, activation="sigmoid")(hidden_layer)

    model = Model(inputs=[input_num, input_cat],
                  outputs=output_layer,
                 )

    model.compile(
        optimizer="Adam",
        loss="binary_crossentropy",
        metrics=["binary_crossentropy"],
    )

    return model

model = create_model_embedding()
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_3 (InputLayer)        [(None, 2)]                  0         []                            
                                                                                                  
 input_4 (InputLayer)        [(None, 1)]                  0         []                            
                                                                                                  
 dense_8 (Dense)             (None, 10)                   30        ['input_3[0][0]']             
                                                                                                  
 tf.__operators__.getitem (  (None,)                      0         ['input_4[0][0]']             
 SlicingOpLambda)                                                                           

#### スクリプト6-28: モデルの学習

In [33]:
# Fix the seeds and rebuild the model before training.
seed_everything(seed=123)
model = create_model_embedding()

# All callbacks watch the validation loss.
training_callbacks = [
    # Save the weights whenever the validation loss improves.
    ModelCheckpoint(
        filepath="model_keras_embedding.h5",
        monitor="val_loss",
        mode="min",
        verbose=1,
        save_best_only=True,
        save_weights_only=True,
    ),
    # Stop after 10 epochs without improvement and restore the best weights.
    EarlyStopping(
        monitor="val_loss",
        mode="min",
        min_delta=0,
        patience=10,
        verbose=1,
        restore_best_weights=True,
    ),
    # Multiply the learning rate by 0.1 after 5 epochs without improvement.
    ReduceLROnPlateau(monitor="val_loss", mode="min", factor=0.1, patience=5, verbose=1),
]

# The epoch count is only an upper bound; early stopping ends the run.
model.fit(
    x=[x_num_tr, x_cat_tr],
    y=y_tr,
    validation_data=([x_num_va, x_cat_va], y_va),
    batch_size=8,
    epochs=10000,
    callbacks=training_callbacks,
    verbose=1,
)

Epoch 1/10000
Epoch 1: val_loss improved from inf to 0.65861, saving model to model_keras_embedding.h5
Epoch 2/10000
Epoch 2: val_loss improved from 0.65861 to 0.65290, saving model to model_keras_embedding.h5
Epoch 3/10000
Epoch 3: val_loss improved from 0.65290 to 0.64176, saving model to model_keras_embedding.h5
Epoch 4/10000
Epoch 4: val_loss improved from 0.64176 to 0.61593, saving model to model_keras_embedding.h5
Epoch 5/10000
Epoch 5: val_loss did not improve from 0.61593
Epoch 6/10000
Epoch 6: val_loss improved from 0.61593 to 0.58922, saving model to model_keras_embedding.h5
Epoch 7/10000
Epoch 7: val_loss did not improve from 0.58922
Epoch 8/10000
Epoch 8: val_loss did not improve from 0.58922
Epoch 9/10000
Epoch 9: val_loss did not improve from 0.58922
Epoch 10/10000
Epoch 10: val_loss did not improve from 0.58922
Epoch 11/10000
Epoch 11: val_loss did not improve from 0.58922

Epoch 11: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 12/10000
Epoch

<keras.src.callbacks.History at 0x7d5206fb28f0>

#### スクリプト6-29: モデル評価

In [34]:
# Predict validation probabilities, threshold them at 0.5 and report accuracy.
y_va_pred = model.predict([x_num_va, x_cat_va], batch_size=8, verbose=1)
y_va_label = np.where(y_va_pred >= 0.5, 1, 0)
val_accuracy = accuracy_score(y_va, y_va_label)
print("accuracy: {:.4f}".format(val_accuracy))

accuracy: 0.7151


# 6.3 アンサンブル
## 6.3.1 単純平均

#### スクリプト6-30: 3モデルの予測値を持つデータフレームを乱数で作成

In [35]:
np.random.seed(123)

# Each synthetic model score is an increasing ramp plus uniform noise;
# pred1 gets the largest noise scale and pred3 the smallest.
# The columns are generated in this order so the random draws are reproducible.
noise_scales = {"pred1": 1200, "pred2": 1000, "pred3": 800}
df = pd.DataFrame({
    "true": [0]*700 + [1]*300,
    **{name: np.arange(1000) + np.random.rand(1000)*scale
       for name, scale in noise_scales.items()},
})

# Rescale every score by its maximum and clip into [0, 1].
for col in noise_scales:
    df[col] = np.clip(df[col] / df[col].max(), 0, 1)

# Stratified split (20% train / 80% test), with a fresh 0..n-1 index on each part.
df_train, df_test = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=0.8, stratify=df["true"], random_state=123)
)
df_train.head()

Unnamed: 0,true,pred1,pred2,pred3
0,1,0.683821,0.874443,0.859939
1,0,0.540691,0.113419,0.197144
2,0,0.310541,0.334798,0.599304
3,0,0.043486,0.170622,0.378528
4,0,0.550847,0.354703,0.59886


#### スクリプト6-31: 単純平均によるアンサンブル

In [36]:
# Simple-average ensemble: unweighted mean of the three model scores.
pred_sum_train = df_train["pred1"] + df_train["pred2"] + df_train["pred3"]
df_train["pred_ensemble1"] = pred_sum_train / 3
df_train.head()

Unnamed: 0,true,pred1,pred2,pred3,pred_ensemble1
0,1,0.683821,0.874443,0.859939,0.806068
1,0,0.540691,0.113419,0.197144,0.283752
2,0,0.310541,0.334798,0.599304,0.414881
3,0,0.043486,0.170622,0.378528,0.197545
4,0,0.550847,0.354703,0.59886,0.50147


#### スクリプト6-32: アンサンブル用の精度評価関数と、精度評価

In [37]:
def evaluate_ensemble(input_df, col_pred):
    """Print the ROC AUC of the three base models and of one ensemble column.

    Args:
        input_df: Frame holding the label column ``true``, the base model
            scores ``pred1``..``pred3`` and the column named by ``col_pred``.
        col_pred: Name of the ensemble prediction column to score.
    """
    y_true = input_df["true"]
    score_columns = ("pred1", "pred2", "pred3", col_pred)
    auc_values = [roc_auc_score(y_true, input_df[name]) for name in score_columns]
    print("[auc] model1:{:.4f}, model2:{:.4f}, model3:{:.4f} -> ensemble:{:.4f}".format(*auc_values))

evaluate_ensemble(df_train, col_pred="pred_ensemble1")

[auc] model1:0.8342, model2:0.8671, model3:0.9050 -> ensemble:0.9585


#### スクリプト6-33: 推論時のアンサンブル処理と、精度評価

In [38]:
# Apply the same unweighted average to the test split and score it.
pred_sum_test = df_test["pred1"] + df_test["pred2"] + df_test["pred3"]
df_test["pred_ensemble1"] = pred_sum_test / 3
evaluate_ensemble(df_test, col_pred="pred_ensemble1")

[auc] model1:0.8086, model2:0.8398, model3:0.8973 -> ensemble:0.9396


## 6.3.2 重み付き平均

#### スクリプト6-34: 重み付き平均によるアンサンブル

In [39]:
# Per-model weights, normalised so that they sum to 1.
weight = np.array([0.3, 0.3, 0.4])
weight = weight / weight.sum()
print(weight)

# Weighted-average ensemble of the three model scores.
df_train["pred_ensemble2"] = (
    df_train["pred1"] * weight[0]
    + df_train["pred2"] * weight[1]
    + df_train["pred3"] * weight[2]
)
df_train[["true", "pred1", "pred2", "pred3", "pred_ensemble2"]].head()

[0.3 0.3 0.4]


Unnamed: 0,true,pred1,pred2,pred3,pred_ensemble2
0,1,0.683821,0.874443,0.859939,0.811455
1,0,0.540691,0.113419,0.197144,0.275091
2,0,0.310541,0.334798,0.599304,0.433324
3,0,0.043486,0.170622,0.378528,0.215643
4,0,0.550847,0.354703,0.59886,0.511209


#### スクリプト6-35: アンサンブルの精度評価

In [40]:
# Print the AUC of each base model and of the weighted-average ensemble on df_train.
evaluate_ensemble(df_train, col_pred="pred_ensemble2")

[auc] model1:0.8342, model2:0.8671, model3:0.9050 -> ensemble:0.9614


#### スクリプト6-36: 推論時のアンサンブル処理と、精度評価

In [41]:
# Reuse the weights from the training step on the test split, then score it.
df_test["pred_ensemble2"] = (
    df_test["pred1"] * weight[0]
    + df_test["pred2"] * weight[1]
    + df_test["pred3"] * weight[2]
)
evaluate_ensemble(df_test, col_pred="pred_ensemble2")

[auc] model1:0.8086, model2:0.8398, model3:0.8973 -> ensemble:0.9420


## 6.3.3 スタッキング

#### スクリプト6-37: スタッキングによるアンサンブル

In [42]:
from sklearn.linear_model import Lasso

# Stacking: the base model scores are the features of a second-level model.
x, y = df_train[["pred1", "pred2", "pred3"]], df_train[["true"]]
oof = np.zeros(len(x))  # out-of-fold prediction for every training row
models = []             # one fitted meta-model per fold, reused at inference

cv = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=123).split(x, y))
for idx_tr, idx_va in cv:
    # Split into the fold's training and validation rows.
    # split() yields positional indices, so select by position with .iloc;
    # .loc would only be correct while the index is exactly 0..n-1.
    x_tr, y_tr = x.iloc[idx_tr], y.iloc[idx_tr]
    x_va, y_va = x.iloc[idx_va], y.iloc[idx_va]

    # Fit the meta-model on this fold's training rows.
    model = Lasso(alpha=0.01)
    model.fit(x_tr, y_tr)
    models.append(model)

    # Store predictions for the rows this fold's model did not see.
    y_va_pred = model.predict(x_va)
    oof[idx_va] = y_va_pred

# Lasso is an unbounded regressor, so clip the scores into [0, 1].
df_train["pred_ensemble3"] = oof
df_train["pred_ensemble3"] = df_train["pred_ensemble3"].clip(lower=0, upper=1)
df_train[["true","pred1","pred2","pred3","pred_ensemble3"]].head()

Unnamed: 0,true,pred1,pred2,pred3,pred_ensemble3
0,1,0.683821,0.874443,0.859939,0.74502
1,0,0.540691,0.113419,0.197144,0.0
2,0,0.310541,0.334798,0.599304,0.206734
3,0,0.043486,0.170622,0.378528,0.0
4,0,0.550847,0.354703,0.59886,0.303498


#### スクリプト6-38: アンサンブルの精度評価

In [43]:
# Print the AUC of each base model and of the out-of-fold stacking predictions on df_train.
evaluate_ensemble(df_train, col_pred="pred_ensemble3")

[auc] model1:0.8342, model2:0.8671, model3:0.9050 -> ensemble:0.9577


#### スクリプト6-39: 推論時のアンサンブル処理と、精度評価

In [44]:
# Stacking inference: average the predictions of the per-fold meta-models.
stack_features_test = df_test[["pred1", "pred2", "pred3"]]
n_fold_models = len(models)

df_test["pred_ensemble3"] = 0
for fold_model in models:
    df_test["pred_ensemble3"] += fold_model.predict(stack_features_test) / n_fold_models

# Clip into [0, 1] exactly as was done for the training predictions.
df_test["pred_ensemble3"] = df_test["pred_ensemble3"].clip(lower=0, upper=1)
evaluate_ensemble(df_test, col_pred="pred_ensemble3")

[auc] model1:0.8086, model2:0.8398, model3:0.8973 -> ensemble:0.9437
