In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from dataset_test import make_dataset
from os.path import join


train_path = join('jeju_data', 'train_new.parquet')
test_path = join('jeju_data', 'test_new.parquet')

x_train, y_train, test = make_dataset(train_path, test_path)

Start time:  2022-10-22 14:39:18.596475
Train dataset success !
Test dataset success !
End time: 2022-10-22 14:40:30.984163
Play time:  0:01:12.387688


In [2]:
X = x_train.copy()
y = y_train.copy()

# x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, shuffle=True, random_state=1103)

In [5]:
import optuna
from optuna import Trial
from optuna.samplers import TPESampler


def objective_xgb(trial: Trial, x, y):
    params = {
        "n_estimators": trial.suggest_int('n_estimators', 500, 5000),
        'max_depth': trial.suggest_int('max_depth', 8, 16),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        'gamma': trial.suggest_int('gamma', 1, 3),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008,0.01,0.012,0.014,0.016,0.018, 0.02]),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
        'subsample': trial.suggest_categorical('subsample', [0.6, 0.7, 0.8, 1.0]),
        'random_state': 42
    }

    x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.3)

    model = XGBRegressor(**params, tree_method='gpu_hist', gpu_id=0)
    xgb_model = model.fit(x_train, y_train, verbose=False, eval_set=[(x_val, y_val)], early_stopping_rounds=50)
    y_pred = xgb_model.predict(x_val)
    score = mean_absolute_error(y_val, y_pred)

    return score


study = optuna.create_study(direction='minimize', sampler=TPESampler())
study.optimize(lambda trial: objective_xgb(trial, X, y), n_trials=30)
print('Best trial: score {},\nparams {}'.format(study.best_trial.value, study.best_trial.params))

params = study.best_trial.params
xgb_model = XGBRegressor(**params, tree_method='gpu_hist', gpu_id=0).fit(x_train, y_train)
y_pred = xgb_model.predict(test)

import pandas as pd

sample_submission = pd.read_csv('./jeju_data/sample_submission.csv')
sample_submission['target'] = y_pred
sample_submission.to_csv("./submit_xgb_optunadelout.csv", index=False)

[32m[I 2022-10-22 14:41:07,918][0m A new study created in memory with name: no-name-e23f4d61-58b8-4338-bb16-276b3523f20d[0m
  'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
  'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
[32m[I 2022-10-22 14:47:00,932][0m Trial 0 finished with value: 3.145041949676306 and parameters: {'n_estimators': 977, 'max_depth': 14, 'min_child_weight': 287, 'gamma': 3, 'learning_rate': 0.014, 'colsample_bytree': 0.6196607960754024, 'lambda': 0.008556536487634988, 'alpha': 0.0019315291967633202, 'subsample': 0.7}. Best is trial 0 with value: 3.145041949676306.[0m
  'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
  'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
[32m[I 2022-10-22 14:51:22,286][0m Trial 1 finished with value: 3.1310896651926843 and parameters: {'n_estimators': 3563, 'max_depth': 11, 'min_child_weight': 260, 'gamma': 1, 'learning_rate': 0.018, 'colsample_bytree': 0.7089070438190315, 'lambda': 5.947818902

Best trial: score 3.0168951916659785,
params {'n_estimators': 4519, 'max_depth': 13, 'min_child_weight': 22, 'gamma': 3, 'learning_rate': 0.016, 'colsample_bytree': 0.7242144738048932, 'lambda': 0.007923992135983554, 'alpha': 0.13203244419302776, 'subsample': 0.8}


params {'n_estimators': 4519, 'max_depth': 13, 'min_child_weight': 22, 'gamma': 3, 'learning_rate': 0.016, 'colsample_bytree': 0.7242144738048932, 'lambda': 0.007923992135983554, 'alpha': 0.13203244419302776, 'subsample': 0.8}


In [19]:
study.best_trial.params

{'n_estimators': 4519,
 'max_depth': 13,
 'min_child_weight': 22,
 'gamma': 3,
 'learning_rate': 0.016,
 'colsample_bytree': 0.7242144738048932,
 'lambda': 0.007923992135983554,
 'alpha': 0.13203244419302776,
 'subsample': 0.8}

In [6]:
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error


param = study.best_trial.params

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

folds = []

for train_idx, val_idx in skf.split(X, y):
    folds.append((train_idx, val_idx))

XGB_model= {}

for f in range(5):
      print(f'===================================={f+1}============================================')
      train_idx, val_idx = folds[f]
      
      x_train, x_val, y_train, y_val = X.iloc[train_idx], X.iloc[val_idx], y.iloc[train_idx], y.iloc[val_idx]
      
      XGB = XGBRegressor(**param, tree_method='gpu_hist', gpu_id=0)
      XGB.fit(x_train, y_train)
      
      y_pred = XGB.predict(x_val)
      mae = mean_absolute_error(y_val, y_pred)
      print(f"{f + 1} Fold MAE = {mae}")
      XGB_model[f] = XGB
      print(f'================================================================================\n\n')
              


for fold in range(5):
    sample_submission['target'] += XGB_model[fold].predict(test)/5    
    




1 Fold MAE = 3.021009554121474


2 Fold MAE = 3.019629712531273


3 Fold MAE = 3.016938838577998


4 Fold MAE = 3.0173574349745227


5 Fold MAE = 3.016860395132144




In [21]:
df_imp = pd.DataFrame({'imp':XGB.feature_importances_}, index = XGB.feature_names_in_)
df_imp = df_imp[df_imp.imp > 0].sort_values('imp').copy()
df_imp

Unnamed: 0,imp
day_of_week,0.00074
post_holiday,0.000754
pre_holiday,0.000923
vacation,0.001018
holiday,0.001093
week,0.001368
multi_linked,0.001387
month,0.001574
season,0.003486
sin_time,0.003757


In [None]:
sample_submission.to_csv("./submit_xgb_fold.csv", index=False)