In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import optuna

In [2]:
train = pd.read_csv('/kaggle/input/playground-series-s4e12/train.csv',index_col = "id")
test = pd.read_csv('/kaggle/input/playground-series-s4e12/test.csv',index_col = "id")
sample_submission = pd.read_csv('/kaggle/input/playground-series-s4e12/sample_submission.csv',index_col = "id").iloc[:1000]

train.columns = [col.replace(" ", "_") for col in train.columns]
test.columns = [col.replace(" ", "_") for col in test.columns]

def date_preprocessing(df, yearT = 360):
    df = df.copy()
    df["Policy_Start_Date"] = pd.to_datetime(df["Policy_Start_Date"])
    days_in_month = df["Policy_Start_Date"].dt.days_in_month
    
    df["Year"] = df["Policy_Start_Date"].dt.year
    df["Month"] = df["Policy_Start_Date"].dt.month
    df["Day"] = df["Policy_Start_Date"].dt.day
    
    df["Sin_Month"] = np.sin(df["Month"] * np.pi * 2 / 12)
    df["Cos_Month"] = np.cos(df["Month"] * np.pi * 2 / 12)
    

    df["Day_of_Year"] = df["Policy_Start_Date"].dt.dayofyear
    df["Sin_Year_Day"] = np.sin(df["Day_of_Year"] * np.pi * 2 / yearT)
    df["Cos_Year_Day"] = np.cos(df["Day_of_Year"] * np.pi * 2 / yearT)
    
    df["Sin_Day"] = np.sin(df["Day"] * np.pi * 2 / days_in_month)
    df["Cos_Day"] = np.cos(df["Day"] * np.pi * 2 / days_in_month)

    df.drop("Policy_Start_Date",axis = 1,inplace = True)

    return df


train = date_preprocessing(train)
test = date_preprocessing(test)

cat_features = [col for col in train.columns if train[col].dtype == 'object']

for col in cat_features:
    train[col] = train[col].fillna("None").astype("category")
    test[col] = test[col].fillna("None").astype("category")

In [3]:
X, y = train.drop("Premium_Amount", axis=1), train["Premium_Amount"]
kf = KFold(n_splits=3, random_state=42, shuffle=True)


def objective(trial):
    
    xgb_params = {
        'booster': "gbtree",
        'verbosity': 0,
        'tree_method': "hist",
        'objective': 'reg:squarederror',
        'eval_metric': "rmse",
        'seed': 42,
        'enable_categorical': True,
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "min_child_weight": trial.suggest_float("min_child_weight", 1e-3, 10, log=True),
        "gamma": trial.suggest_float("gamma", 1e-3, 10, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-4, 1e-1, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-4, 1e-1, log=True),
    }

    xgb_train_scores = []
    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        y_log = np.log1p(y)
        X_train, X_valid = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_valid = y_log.iloc[train_idx], y_log.iloc[val_idx]

        xgb_train = xgb.DMatrix(X_train, y_train, enable_categorical=True)
        xgb_valid = xgb.DMatrix(X_valid, y_valid, enable_categorical=True)

        model = xgb.train(
            xgb_params,
            xgb_train,
            evals=[(xgb_train, "train"), (xgb_valid, "valid")],
            verbose_eval=False,
            early_stopping_rounds=50,
            num_boost_round = 3000
        )

        y_pred = model.predict(xgb.DMatrix(X_valid, enable_categorical=True),
                               iteration_range=(0, model.best_iteration + 1))
        y_pred = np.maximum(0, y_pred)
        y_valid = np.maximum(0, y_valid)

        rmsle = mean_squared_error(y_valid, y_pred, squared=False)
        xgb_train_scores.append(rmsle)

    return np.mean(xgb_train_scores)

study = optuna.create_study(direction="minimize", sampler=optuna.samplers.TPESampler())
study.optimize(objective, timeout = 3600 * 3)


print("Best Parameters:", study.best_params)
print("Best RMSLE:", study.best_value)

[I 2024-12-02 14:34:21,264] A new study created in memory with name: no-name-ccd843a5-9360-4929-bf12-4dc10d8c3645
[I 2024-12-02 15:05:47,490] Trial 0 finished with value: 1.0474665543902593 and parameters: {'learning_rate': 0.005186447414713202, 'max_depth': 11, 'min_child_weight': 0.3594033461623969, 'gamma': 0.20057682792761367, 'subsample': 0.5859619765729993, 'colsample_bytree': 0.6244270994263428, 'reg_alpha': 0.005264193725502624, 'reg_lambda': 0.020627177088206744}. Best is trial 0 with value: 1.0474665543902593.
[I 2024-12-02 15:13:09,999] Trial 1 finished with value: 1.0480629278876166 and parameters: {'learning_rate': 0.0304296085105073, 'max_depth': 4, 'min_child_weight': 0.2940394479748326, 'gamma': 0.6664161460255262, 'subsample': 0.5809026494818321, 'colsample_bytree': 0.938211817842243, 'reg_alpha': 0.00045435432412761236, 'reg_lambda': 0.0012505635415755749}. Best is trial 0 with value: 1.0474665543902593.
[I 2024-12-02 15:36:43,002] Trial 2 finished with value: 1.04748

Best Parameters: {'learning_rate': 0.02171923819904943, 'max_depth': 9, 'min_child_weight': 0.0316557821270838, 'gamma': 1.4120959725363444, 'subsample': 0.6903485027200728, 'colsample_bytree': 0.8136345780495537, 'reg_alpha': 0.00015390129213025162, 'reg_lambda': 0.0014684563381986906}
Best RMSLE: 1.046269915123041
