In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import optuna

In [2]:
df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if col.startswith("cat")]
df_test = df_test[useful_features]

for col in object_cols:
    temp_df = []
    temp_test_feat = None
    for fold in range(5):
        xtrain =  df[df.kfold != fold].reset_index(drop=True)
        xvalid = df[df.kfold == fold].reset_index(drop=True)
        feat = xtrain.groupby(col)["target"].agg("mean")
        feat = feat.to_dict()
        xvalid.loc[:, f"tar_enc_{col}"] = xvalid[col].map(feat)
        temp_df.append(xvalid)
        if temp_test_feat is None:
            temp_test_feat = df_test[col].map(feat)
        else:
            temp_test_feat += df_test[col].map(feat)
    
    temp_test_feat /= 5
    df_test.loc[:, f"tar_enc_{col}"] = temp_test_feat
    df = pd.concat(temp_df)
    

useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if col.startswith("cat")]
df_test = df_test[useful_features]

In [3]:
def run(trial):
    fold = 0
    learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.25, log=True)
    reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
    reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
    subsample = trial.suggest_float("subsample", 0.1, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    max_depth = trial.suggest_int("max_depth", 1, 7)

    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])

    model = XGBRegressor(
        random_state=42,
        tree_method="gpu_hist",
        gpu_id=1,
        predictor="gpu_predictor",
        n_estimators=7000,
        learning_rate=learning_rate,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        max_depth=max_depth,
    )
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)
    preds_valid = model.predict(xvalid)
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    return rmse

In [4]:
study = optuna.create_study(direction="minimize")
study.optimize(run, n_trials=5)

[32m[I 2021-08-23 14:26:27,800][0m A new study created in memory with name: no-name-971dd5da-8fa7-4b97-a0e0-770b215ef836[0m


[0]	validation_0-rmse:6.48157
[1000]	validation_0-rmse:0.72711
[2000]	validation_0-rmse:0.72463
[3000]	validation_0-rmse:0.72318
[4000]	validation_0-rmse:0.72215
[5000]	validation_0-rmse:0.72137
[6000]	validation_0-rmse:0.72082
[6999]	validation_0-rmse:0.72033


[32m[I 2021-08-23 14:26:45,249][0m Trial 0 finished with value: 0.7203216783964389 and parameters: {'learning_rate': 0.16863562371470991, 'reg_lambda': 0.00213735442160522, 'reg_alpha': 0.005877169105900977, 'subsample': 0.7075822416620647, 'colsample_bytree': 0.8376750199672234, 'max_depth': 1}. Best is trial 0 with value: 0.7203216783964389.[0m


[0]	validation_0-rmse:7.58039
[1000]	validation_0-rmse:0.73385
[2000]	validation_0-rmse:0.73106
[3000]	validation_0-rmse:0.72965
[4000]	validation_0-rmse:0.72864
[5000]	validation_0-rmse:0.72784
[6000]	validation_0-rmse:0.72717
[6999]	validation_0-rmse:0.72659


[32m[I 2021-08-23 14:27:02,019][0m Trial 1 finished with value: 0.7265843315303601 and parameters: {'learning_rate': 0.025883586001082638, 'reg_lambda': 0.00011166766563913344, 'reg_alpha': 0.00011063517935014173, 'subsample': 0.6044479881702076, 'colsample_bytree': 0.9033246491964274, 'max_depth': 1}. Best is trial 0 with value: 0.7203216783964389.[0m


[0]	validation_0-rmse:6.66636
[791]	validation_0-rmse:0.72084


[32m[I 2021-08-23 14:27:07,271][0m Trial 2 finished with value: 0.7203879162948018 and parameters: {'learning_rate': 0.144607750993663, 'reg_lambda': 4.317964505286956e-08, 'reg_alpha': 1.4990047247773775e-05, 'subsample': 0.5198688561483064, 'colsample_bytree': 0.19934165695029948, 'max_depth': 4}. Best is trial 0 with value: 0.7203216783964389.[0m


[0]	validation_0-rmse:7.19977
[904]	validation_0-rmse:0.71977


[32m[I 2021-08-23 14:27:15,423][0m Trial 3 finished with value: 0.7194712240560249 and parameters: {'learning_rate': 0.07530206262581056, 'reg_lambda': 1.132014403606756e-05, 'reg_alpha': 4.146131556492896e-05, 'subsample': 0.6671061536867464, 'colsample_bytree': 0.10778806404406721, 'max_depth': 6}. Best is trial 3 with value: 0.7194712240560249.[0m


[0]	validation_0-rmse:6.84466
[1000]	validation_0-rmse:0.72783
[2000]	validation_0-rmse:0.72523
[3000]	validation_0-rmse:0.72371
[4000]	validation_0-rmse:0.72272
[5000]	validation_0-rmse:0.72191
[6000]	validation_0-rmse:0.72133
[6999]	validation_0-rmse:0.72096


[32m[I 2021-08-23 14:27:31,623][0m Trial 4 finished with value: 0.7208955305592295 and parameters: {'learning_rate': 0.1214317144598447, 'reg_lambda': 1.486660387264361e-06, 'reg_alpha': 2.6503957160366253e-08, 'subsample': 0.5526699691712402, 'colsample_bytree': 0.9002282109115242, 'max_depth': 1}. Best is trial 3 with value: 0.7194712240560249.[0m


In [5]:
study.best_params

{'learning_rate': 0.07530206262581056,
 'reg_lambda': 1.132014403606756e-05,
 'reg_alpha': 4.146131556492896e-05,
 'subsample': 0.6671061536867464,
 'colsample_bytree': 0.10778806404406721,
 'max_depth': 6}