#### test score = 3.19666

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from dataset import make_dataset
from os.path import join


train_path = join('jeju_data', 'train_new.parquet')
test_path = join('jeju_data', 'test_new.parquet')

x_train, y_train, test = make_dataset(train_path, test_path)

Train dataset success !
Test dataset success !


In [5]:
X = x_train.copy()
y = y_train.copy()

x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, shuffle=True, random_state=1103)

In [7]:
model = XGBRegressor(tree_method='gpu_hist', gpu_id=0).fit(x_train, y_train)
y_pred = model.predict(x_val)
mean_absolute_error(y_val, y_pred)

3.8581919912014215

In [12]:
x_train.columns

Index(['day_of_week', 'lane_count', 'road_rating', 'multi_linked',
       'connect_code', 'maximum_speed_limit', 'weight_restricted',
       'height_restricted', 'road_type', 'start_latitude', 'start_longitude',
       'start_turn_restricted', 'end_latitude', 'end_longitude',
       'end_turn_restricted', 'distance', 'week', 'over_max_speed', 'time',
       'sin_time', 'cos_time', 'month'],
      dtype='object')

In [10]:
import optuna
from optuna import Trial
from optuna.samplers import TPESampler


def objective_xgb(trial: Trial, x, y):
    params = {
        "n_estimators": trial.suggest_int('n_estimators', 500, 4000),
        'max_depth': trial.suggest_int('max_depth', 8, 16),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        'gamma': trial.suggest_int('gamma', 1, 3),
        'learning_rate': 0.01,
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
        'subsample': trial.suggest_categorical('subsample', [0.6, 0.7, 0.8, 1.0]),
        'random_state': 42
    }

    x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2)

    model = XGBRegressor(**params, tree_method='gpu_hist', gpu_id=0)
    xgb_model = model.fit(x_train, y_train, verbose=False, eval_set=[(x_val, y_val)])
    y_pred = xgb_model.predict(x_val)
    score = mean_absolute_error(y_val, y_pred)

    return score


study = optuna.create_study(direction='minimize', sampler=TPESampler())
study.optimize(lambda trial: objective_xgb(trial, X, y), n_trials=10)
print('Best trial: score {},\nparams {}'.format(study.best_trial.value, study.best_trial.params))

params = study.best_trial.params
xgb_model = XGBRegressor(**params, tree_method='gpu_hist', gpu_id=0).fit(x_train, y_train)
y_pred = xgb_model.predict(test)

import pandas as pd

sample_submission = pd.read_csv('./jeju_data/sample_submission.csv')
sample_submission['target'] = y_pred
sample_submission.to_csv("./submit_xgb_optuna.csv", index=False)

[32m[I 2022-10-14 16:11:35,278][0m A new study created in memory with name: no-name-02b87cd9-9745-40a8-8a07-fa8fdfb8579e[0m
  'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
  'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
[32m[I 2022-10-14 16:24:15,728][0m Trial 0 finished with value: 3.1168892183071804 and parameters: {'n_estimators': 2886, 'max_depth': 15, 'min_child_weight': 139, 'gamma': 2, 'colsample_bytree': 0.5835912825501288, 'lambda': 0.018776374001998226, 'alpha': 0.24458911485627424, 'subsample': 1.0}. Best is trial 0 with value: 3.1168892183071804.[0m
  'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
  'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
[32m[I 2022-10-14 16:25:34,566][0m Trial 1 finished with value: 3.481380082423414 and parameters: {'n_estimators': 1072, 'max_depth': 10, 'min_child_weight': 198, 'gamma': 3, 'colsample_bytree': 0.7191773655711338, 'lambda': 9.86334333395884, 'alpha': 0.020138259161696922, 'subsample

Best trial: score 3.1168892183071804,
params {'n_estimators': 2886, 'max_depth': 15, 'min_child_weight': 139, 'gamma': 2, 'colsample_bytree': 0.5835912825501288, 'lambda': 0.018776374001998226, 'alpha': 0.24458911485627424, 'subsample': 1.0}
