# Optunaを用いてパラメータチューニング

In [1]:
import os
import sys

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import KFold

import optuna
import lightgbm as lgb

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))
from conf.config import DIRS, PARAMS, CFG
from utils import DataLoader, DataProcesser, DataEnginner, cross_val_train, model_cv, regular_tuning, new_tuning

# 従来の手法

### データの読み込み

In [2]:
# Load the raw train / test tables
dataloader = DataLoader()
train_df, test_df = dataloader.load_data()

# Preprocess, then pull the training features out of the returned bundle
dataprocesser = DataProcesser(train_df, test_df)
trains, X_test = dataprocesser.preprocess()
X_train = trains["X_train"]

# Feature engineering on both the train and test feature tables
data_enginner = DataEnginner(X_train, X_test)
X_train_pp, X_test_pp = data_enginner.execute()

# Log-transform the target so that RMSE on the transformed values
# corresponds to RMSLE on the original scale
y_train = np.log1p(trains["y_train"])

train dataの中に欠損値はありませんでした。
test dataの中に欠損値はありませんでした。
Train shape: (90615, 8)   test shape: (60411, 8)


Unnamed: 0_level_0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,F,0.550,0.430,0.150,0.7715,0.3285,0.1465,0.2400
1,F,0.630,0.490,0.145,1.1300,0.4580,0.2765,0.3200
2,I,0.160,0.110,0.025,0.0210,0.0055,0.0030,0.0050
3,M,0.595,0.475,0.150,0.9145,0.3755,0.2055,0.2500
4,I,0.555,0.425,0.130,0.7820,0.3695,0.1600,0.1975
...,...,...,...,...,...,...,...,...
90610,M,0.335,0.235,0.075,0.1585,0.0685,0.0370,0.0450
90611,M,0.555,0.425,0.150,0.8790,0.3865,0.1815,0.2400
90612,I,0.435,0.330,0.095,0.3215,0.1510,0.0785,0.0815
90613,I,0.345,0.270,0.075,0.2000,0.0980,0.0490,0.0700


## パラメータ探索

In [None]:
# Seed the sampler so the hyperparameter search is reproducible when the
# notebook is re-run from a fresh kernel (TPE is Optuna's default sampler,
# so only the seed changes here).
sampler = optuna.samplers.TPESampler(seed=PARAMS["seed"])
study = optuna.create_study(
    study_name="optimization_lgb",
    direction="minimize",
    sampler=sampler,
)

# Each trial evaluates one candidate parameter set via regular_tuning
study.optimize(lambda trial: regular_tuning(trial, X_train_pp, y_train), n_trials=100)

print(f"Number of finished trials: {len(study.trials)}")
print(f"Best Trial: {study.best_trial.params}")

In [4]:
# Display the best hyperparameters found by the study
study.best_params

{'learning_rate': 0.025258393107978006,
 'max_depth': 6,
 'num_leaves': 34,
 'min_data_in_leaf': 28,
 'bagging_fraction': 0.5239212357004639,
 'feature_fraction': 0.5592185917717165}

In [5]:
# LightGBM parameters: fixed settings followed by the values found by the
# Optuna search above (hard-coded so the search need not be re-run).
optimized_param = dict(
    objective="regression",
    metric="rmse",
    verbosity=-1,
    random_state=PARAMS["seed"],
    learning_rate=0.025258393107978006,
    max_depth=6,
    num_leaves=34,
    min_data_in_leaf=28,
    bagging_fraction=0.5239212357004639,
    feature_fraction=0.5592185917717165,
)

In [7]:
# Use the feature-engineered test set (X_test_pp) so that test predictions
# are made on the same features the model was trained on (X_train_pp).
val_scores, val_preds, test_preds = cross_val_train(X_train_pp, y_train, X_test_pp, optimized_param)

Training until validation scores don't improve for 100 rounds
[500]	valid_0's rmse: 0.151649
[1000]	valid_0's rmse: 0.15055
[1500]	valid_0's rmse: 0.150173
[2000]	valid_0's rmse: 0.149929
Early stopping, best iteration is:
[2326]	valid_0's rmse: 0.149856
Fold: 0   Train RMSLE: 0.1351  Val RMSLE: 0.1499
--------------------------------------------------
Training until validation scores don't improve for 100 rounds
[500]	valid_0's rmse: 0.150605
[1000]	valid_0's rmse: 0.149824
[1500]	valid_0's rmse: 0.149582
Early stopping, best iteration is:
[1821]	valid_0's rmse: 0.149517
Fold: 1   Train RMSLE: 0.1372  Val RMSLE: 0.1495
--------------------------------------------------
Training until validation scores don't improve for 100 rounds
[500]	valid_0's rmse: 0.149673
[1000]	valid_0's rmse: 0.148862
[1500]	valid_0's rmse: 0.148378
[2000]	valid_0's rmse: 0.148217
Early stopping, best iteration is:
[1987]	valid_0's rmse: 0.148213
Fold: 2   Train RMSLE: 0.1370  Val RMSLE: 0.1482
----------------

In [8]:
# Average the per-fold validation scores
mean_rmsle = np.mean(val_scores)
print(f"Mean RMSLE: {mean_rmsle:.5f}")

Mean RMSLE: 0.14831


## submitの作成

In [9]:
# Extract the ids for the submission file
sub = test_df.reset_index()[["id"]]

# The target was transformed with np.log1p, so invert it with np.expm1
# (the exact inverse, and more accurate than np.exp(x) - 1 near zero).
sub["Rings"] = np.expm1(test_preds)
sub.to_csv(os.path.join(DIRS["OUTPUT"], "submission_ropt.csv"), index=False)

スコアは0.14643であった。\
チューニング前と比較すると、少し向上した。

# 新しいチューニング手法

In [None]:
# Try the new tuning method starting from the already-tuned parameters
tuned_values = {
    "learning_rate": 0.025258393107978006,
    "max_depth": 6,
    "num_leaves": 34,
    "min_data_in_leaf": 28,
    "bagging_fraction": 0.5239212357004639,
    "feature_fraction": 0.5592185917717165,
}
params = {
    "objective": "regression",
    "metric": "rmse",
    "verbosity": -1,
    "random_state": PARAMS["seed"],
    **tuned_values,
}

model = new_tuning(X_train_pp, y_train, params=params)

In [5]:
# Parameters after re-tuning
new_opt_params = model.params
print("Best params: \n" + str(new_opt_params))

Best params: 
{'objective': 'regression', 'metric': 'rmse', 'verbosity': -1, 'random_state': 42, 'learning_rate': 0.025258393107978006, 'max_depth': 6, 'num_leaves': 34, 'bagging_fraction': 0.973535237083564, 'feature_fraction': 0.4, 'feature_pre_filter': False, 'min_child_samples': 28, 'lambda_l1': 0.4582719833044848, 'lambda_l2': 0.008518290813950797, 'bagging_freq': 1, 'num_iterations': 3000}


In [None]:
# Cross-validate with the re-tuned parameters and report the average fold score
cv_result = cross_val_train(X_train_pp, y_train, X_test_pp, new_opt_params)
val_scores, val_preds, test_preds = cv_result
mean_rmsle = np.mean(val_scores)
print(f"Mean RMSLE: {mean_rmsle:.5f}")

## Submitの作成

In [7]:
# Extract the ids for the submission file
sub = test_df.reset_index()[["id"]]

# The target was transformed with np.log1p, so invert it with np.expm1
# (the exact inverse, and more accurate than np.exp(x) - 1 near zero).
sub["Rings"] = np.expm1(test_preds)
sub.to_csv(os.path.join(DIRS["OUTPUT"], "submission_new_opt.csv"), index=False)

スコアは0.14622であった。\
従来手法と比較して少し向上した。

submit同士の平均をとって提出してみる

In [10]:
# Load the two submissions produced earlier in this notebook
path_regular = os.path.join(DIRS["OUTPUT"], "submission_ropt.csv")
path_new = os.path.join(DIRS["OUTPUT"], "submission_new_opt.csv")
sub_1 = pd.read_csv(path_regular)
sub_2 = pd.read_csv(path_new)

# Row-wise mean of the two predicted "Rings" columns, kept alongside the ids
ids = sub_1[["id"]]
ring_mean = sub_1[["Rings"]].add(sub_2[["Rings"]]).div(2)

sub_fin = pd.concat([ids, ring_mean], axis=1)

In [13]:
# Save the averaged submission
mean_submission_path = os.path.join(DIRS["OUTPUT"], "submission_opt_mean.csv")
sub_fin.to_csv(mean_submission_path, index=False)

スコアは0.14629であり、新しいチューニング手法単体(0.14622)からの向上は見られなかった。