In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from dataset import make_dataset
from os.path import join


train_path = join('jeju_data', 'train_new.parquet')
test_path = join('jeju_data', 'test_new.parquet')

x_train, y_train, test = make_dataset(train_path, test_path)

시작시간: 2022-10-17 17:08:17.103751
Train dataset success !
Test dataset success !
끝난 시간:  0:01:06.058098


In [4]:
X = x_train.copy()
y = y_train.copy()

# x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, shuffle=True, random_state=1103)

In [6]:
import optuna
from optuna import Trial
from optuna.samplers import TPESampler


def objective_xgb(trial: Trial, x, y):
    params = {
        "n_estimators": trial.suggest_int('n_estimators', 500, 4000),
        'max_depth': trial.suggest_int('max_depth', 8, 16),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        'gamma': trial.suggest_int('gamma', 1, 3),
        'learning_rate': 0.01,
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
        'subsample': trial.suggest_categorical('subsample', [0.6, 0.7, 0.8, 1.0]),
        'random_state': 42
    }

    x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2)

    model = XGBRegressor(**params, tree_method='gpu_hist', gpu_id=0)
    xgb_model = model.fit(x_train, y_train, verbose=False, eval_set=[(x_val, y_val)])
    y_pred = xgb_model.predict(x_val)
    score = mean_absolute_error(y_val, y_pred)

    return score


study = optuna.create_study(direction='minimize', sampler=TPESampler())
study.optimize(lambda trial: objective_xgb(trial, X, y), n_trials=100)
print('Best trial: score {},\nparams {}'.format(study.best_trial.value, study.best_trial.params))

params = study.best_trial.params
xgb_model = XGBRegressor(**params, tree_method='gpu_hist', gpu_id=0).fit(x_train, y_train)
y_pred = xgb_model.predict(test)

import pandas as pd

sample_submission = pd.read_csv('./jeju_data/sample_submission.csv')
sample_submission['target'] = y_pred
sample_submission.to_csv("./submit_xgb_optuna.csv", index=False)

  from .autonotebook import tqdm as notebook_tqdm
[32m[I 2022-10-14 18:57:48,506][0m A new study created in memory with name: no-name-f6691f71-2cb6-4215-8744-ff0e7a33eb83[0m
  'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
  'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
[32m[I 2022-10-14 19:09:55,549][0m Trial 0 finished with value: 3.105232119599799 and parameters: {'n_estimators': 2228, 'max_depth': 14, 'min_child_weight': 106, 'gamma': 3, 'colsample_bytree': 0.511775658530421, 'lambda': 0.38923883880902777, 'alpha': 0.7762555149231181, 'subsample': 0.7}. Best is trial 0 with value: 3.105232119599799.[0m
  'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
  'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
[32m[I 2022-10-14 19:26:22,433][0m Trial 1 finished with value: 3.0931925798815416 and parameters: {'n_estimators': 3479, 'max_depth': 14, 'min_child_weight': 147, 'gamma': 3, 'colsample_bytree': 0.714324195058753, 'lambda': 0.007888321601

Best trial: score 3.0240065260133813,
params {'n_estimators': 3097, 'max_depth': 16, 'min_child_weight': 13, 'gamma': 2, 'colsample_bytree': 0.7505550592952616, 'lambda': 0.0032786114572041485, 'alpha': 0.00425353975186883, 'subsample': 1.0}
