In [1]:
!pip install optuna-integration[lightgbm]

import numpy as np
import pandas as pd
import lightgbm as lgb
import optuna
import optuna.integration.lightgbm as lgbm
from lightgbm import early_stopping,log_evaluation
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

train = pd.read_csv('/kaggle/input/playground-series-s4e12/train.csv',index_col = "id")
test = pd.read_csv('/kaggle/input/playground-series-s4e12/test.csv',index_col = "id")
sample_submission = pd.read_csv('/kaggle/input/playground-series-s4e12/sample_submission.csv',index_col = "id")

train.columns = [col.replace(" ", "_") for col in train.columns]
test.columns = [col.replace(" ", "_") for col in test.columns]

def date_preprocessing(df, yearT = 360):
    df = df.copy()
    df["Policy_Start_Date"] = pd.to_datetime(df["Policy_Start_Date"])
    days_in_month = df["Policy_Start_Date"].dt.days_in_month
    
    df["Year"] = df["Policy_Start_Date"].dt.year
    df["Month"] = df["Policy_Start_Date"].dt.month
    df["Day"] = df["Policy_Start_Date"].dt.day
    
    df["Sin_Month"] = np.sin(df["Month"] * np.pi * 2 / 12)
    df["Cos_Month"] = np.cos(df["Month"] * np.pi * 2 / 12)
    

    df["Day_of_Year"] = df["Policy_Start_Date"].dt.dayofyear
    df["Sin_Year_Day"] = np.sin(df["Day_of_Year"] * np.pi * 2 / yearT)
    df["Cos_Year_Day"] = np.cos(df["Day_of_Year"] * np.pi * 2 / yearT)
    
    df["Sin_Day"] = np.sin(df["Day"] * np.pi * 2 / days_in_month)
    df["Cos_Day"] = np.cos(df["Day"] * np.pi * 2 / days_in_month)

    df.drop("Policy_Start_Date",axis = 1,inplace = True)

    return df


train = date_preprocessing(train)
test = date_preprocessing(test)

cat_features = [col for col in train.columns if train[col].dtype == 'object']

for col in cat_features:
    train[col] = train[col].fillna("None").astype("category")
    test[col] = test[col].fillna("None").astype("category")

Collecting optuna-integration[lightgbm]
  Downloading optuna_integration-4.1.0-py3-none-any.whl.metadata (12 kB)
Downloading optuna_integration-4.1.0-py3-none-any.whl (97 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.4/97.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: optuna-integration
Successfully installed optuna-integration-4.1.0


In [2]:

def objective(trial):
    goss_params = {
                'random_state': 42,
                'objective': 'regression',
                'metric': 'rmse',
                'boosting_type': 'goss',
                'verbosity': -1,
                "verbose_eval": -1,
                'enable_categorical': True, 
                 "learning_rate": trial.suggest_float("learning_rate", 1e-4, 0.1, log=True),
                "num_leaves": trial.suggest_int("num_leaves", 20, 150),
                "max_depth": trial.suggest_int("max_depth", -1, 20), 
                "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 10, 100),
                "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
                "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
                "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
                "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0.0, 1.0)}

    X,y = train.drop("Premium_Amount",axis = 1).copy(),train["Premium_Amount"].copy()
    kf = KFold(n_splits = 3,random_state = 42,shuffle = True)
    lgb_train_scores = []
    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        y_log = np.log1p(y)
        X_train, X_valid = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_valid = y_log.iloc[train_idx], y_log.iloc[val_idx]
    
    
        lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=cat_features)
        lgb_valid = lgb.Dataset(X_valid, y_valid, categorical_feature=cat_features, reference=lgb_train)

        model = lgb.train(
            goss_params,
            lgb_train,
            valid_sets=[lgb_train,lgb_valid],
            valid_names=["Training","Validation"],
            num_boost_round = 3000,
            callbacks=[lgb.early_stopping(stopping_rounds=50)])
         
        y_pred = model.predict(X_valid, num_iteration=model.best_iteration)
    
        y_pred = np.maximum(0,y_pred)
        y_valid = np.maximum(0,y_valid)
        
        rmsle = mean_squared_error(y_valid, y_pred,squared=False)
        lgb_train_scores.append(rmsle)

    return np.mean(lgb_train_scores)

study = optuna.create_study(sampler = optuna.samplers.TPESampler(),direction = "minimize")
study.optimize(objective,timeout = 3600 * 3)
print(study.best_params)

[I 2024-12-02 13:04:56,167] A new study created in memory with name: no-name-bd9f3c9e-3e3b-4eda-9fe9-414f573f5a22


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[3000]	Training's rmse: 1.04883	Validation's rmse: 1.05343
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[3000]	Training's rmse: 1.04976	Validation's rmse: 1.05079
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[3000]	Training's rmse: 1.04974	Validation's rmse: 1.0519


[I 2024-12-02 13:24:17,408] Trial 0 finished with value: 1.0520406411045409 and parameters: {'learning_rate': 0.000975650712612266, 'num_leaves': 115, 'max_depth': 7, 'min_data_in_leaf': 84, 'feature_fraction': 0.6560268657630726, 'lambda_l1': 0.001055515495359659, 'lambda_l2': 0.1398323136718019, 'min_gain_to_split': 0.562399352061678}. Best is trial 0 with value: 1.0520406411045409.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[3000]	Training's rmse: 1.06529	Validation's rmse: 1.06721
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[3000]	Training's rmse: 1.06634	Validation's rmse: 1.06451
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[3000]	Training's rmse: 1.06606	Validation's rmse: 1.06612


[I 2024-12-02 13:31:09,648] Trial 1 finished with value: 1.0659500472107395 and parameters: {'learning_rate': 0.0013552196200012042, 'num_leaves': 45, 'max_depth': 2, 'min_data_in_leaf': 35, 'feature_fraction': 0.6318921741523428, 'lambda_l1': 2.717712769107199e-06, 'lambda_l2': 0.01127212730807297, 'min_gain_to_split': 0.2453836544899739}. Best is trial 0 with value: 1.0520406411045409.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[3000]	Training's rmse: 1.08008	Validation's rmse: 1.08192
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[3000]	Training's rmse: 1.0812	Validation's rmse: 1.07919
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[3000]	Training's rmse: 1.08069	Validation's rmse: 1.08108


[I 2024-12-02 13:45:11,723] Trial 2 finished with value: 1.0807304079400497 and parameters: {'learning_rate': 0.0001147007291964683, 'num_leaves': 90, 'max_depth': 5, 'min_data_in_leaf': 73, 'feature_fraction': 0.7717417455800069, 'lambda_l1': 5.079849034864797e-06, 'lambda_l2': 5.67571996977698e-07, 'min_gain_to_split': 0.053787858662044497}. Best is trial 0 with value: 1.0520406411045409.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[3000]	Training's rmse: 1.04242	Validation's rmse: 1.04974
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[2999]	Training's rmse: 1.04324	Validation's rmse: 1.04718
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[3000]	Training's rmse: 1.04333	Validation's rmse: 1.04795


[I 2024-12-02 13:59:50,277] Trial 3 finished with value: 1.0482927301648095 and parameters: {'learning_rate': 0.003425396687646092, 'num_leaves': 52, 'max_depth': 18, 'min_data_in_leaf': 46, 'feature_fraction': 0.5046459144740114, 'lambda_l1': 0.018840684289445495, 'lambda_l2': 1.0490048012421692e-06, 'min_gain_to_split': 0.5697426063694268}. Best is trial 3 with value: 1.0482927301648095.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[3000]	Training's rmse: 1.04975	Validation's rmse: 1.0524
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[3000]	Training's rmse: 1.05069	Validation's rmse: 1.04942
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[3000]	Training's rmse: 1.05078	Validation's rmse: 1.05064


[I 2024-12-02 14:13:32,627] Trial 4 finished with value: 1.0508194157862911 and parameters: {'learning_rate': 0.0007197411185695668, 'num_leaves': 101, 'max_depth': 5, 'min_data_in_leaf': 38, 'feature_fraction': 0.9723703684118694, 'lambda_l1': 0.004169330644532124, 'lambda_l2': 4.985596820004779e-08, 'min_gain_to_split': 0.6208492023857983}. Best is trial 3 with value: 1.0482927301648095.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[3000]	Training's rmse: 1.04233	Validation's rmse: 1.04758
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[2998]	Training's rmse: 1.04315	Validation's rmse: 1.04491
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[2999]	Training's rmse: 1.04326	Validation's rmse: 1.04561


[I 2024-12-02 14:29:18,055] Trial 5 finished with value: 1.046030621678699 and parameters: {'learning_rate': 0.0016122415986355863, 'num_leaves': 51, 'max_depth': 13, 'min_data_in_leaf': 50, 'feature_fraction': 0.9102663606182569, 'lambda_l1': 0.3431540884629721, 'lambda_l2': 0.0009095518208902767, 'min_gain_to_split': 0.46853279746593046}. Best is trial 5 with value: 1.046030621678699.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[567]	Training's rmse: 1.03739	Validation's rmse: 1.04766
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[558]	Training's rmse: 1.03823	Validation's rmse: 1.04505
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[656]	Training's rmse: 1.03714	Validation's rmse: 1.04565


[I 2024-12-02 14:33:11,464] Trial 6 finished with value: 1.0461180564004977 and parameters: {'learning_rate': 0.012815451869924515, 'num_leaves': 90, 'max_depth': 17, 'min_data_in_leaf': 74, 'feature_fraction': 0.7759741091446384, 'lambda_l1': 6.656541726017007e-08, 'lambda_l2': 2.2048646173173525, 'min_gain_to_split': 0.5744549637007111}. Best is trial 5 with value: 1.046030621678699.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1130]	Training's rmse: 1.03694	Validation's rmse: 1.04877
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1128]	Training's rmse: 1.03801	Validation's rmse: 1.04613
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1142]	Training's rmse: 1.0379	Validation's rmse: 1.04686


[I 2024-12-02 14:40:24,027] Trial 7 finished with value: 1.04725407717294 and parameters: {'learning_rate': 0.012186685764281148, 'num_leaves': 95, 'max_depth': 7, 'min_data_in_leaf': 93, 'feature_fraction': 0.582564274023877, 'lambda_l1': 0.003242728668476384, 'lambda_l2': 0.38560627548670257, 'min_gain_to_split': 0.29374352669617265}. Best is trial 5 with value: 1.046030621678699.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[3000]	Training's rmse: 1.04317	Validation's rmse: 1.04824
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[3000]	Training's rmse: 1.04397	Validation's rmse: 1.0456
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[3000]	Training's rmse: 1.04411	Validation's rmse: 1.0464


[I 2024-12-02 14:59:07,799] Trial 8 finished with value: 1.0467475207868482 and parameters: {'learning_rate': 0.000997883791851457, 'num_leaves': 75, 'max_depth': 16, 'min_data_in_leaf': 99, 'feature_fraction': 0.8711985371518296, 'lambda_l1': 1.1984810163997405e-06, 'lambda_l2': 4.9638256634862966e-08, 'min_gain_to_split': 0.07728312036446072}. Best is trial 5 with value: 1.046030621678699.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[3000]	Training's rmse: 1.0622	Validation's rmse: 1.06425
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[3000]	Training's rmse: 1.06318	Validation's rmse: 1.06147
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[3000]	Training's rmse: 1.06305	Validation's rmse: 1.063


[I 2024-12-02 15:07:55,720] Trial 9 finished with value: 1.0629072005846696 and parameters: {'learning_rate': 0.0009416832201990159, 'num_leaves': 146, 'max_depth': 3, 'min_data_in_leaf': 64, 'feature_fraction': 0.5701671304975583, 'lambda_l1': 0.3730131710181061, 'lambda_l2': 0.00012018582407097452, 'min_gain_to_split': 0.3616151267088912}. Best is trial 5 with value: 1.046030621678699.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[289]	Training's rmse: 1.0425	Validation's rmse: 1.04765
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[356]	Training's rmse: 1.04256	Validation's rmse: 1.04502
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[346]	Training's rmse: 1.04276	Validation's rmse: 1.04561


[I 2024-12-02 15:09:23,421] Trial 10 finished with value: 1.0460934618851663 and parameters: {'learning_rate': 0.03542657145132098, 'num_leaves': 26, 'max_depth': 12, 'min_data_in_leaf': 15, 'feature_fraction': 0.9800408004035949, 'lambda_l1': 0.4458660608216733, 'lambda_l2': 0.0002755840317661732, 'min_gain_to_split': 0.9591796645282334}. Best is trial 5 with value: 1.046030621678699.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[90]	Training's rmse: 1.04422	Validation's rmse: 1.04798
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[84]	Training's rmse: 1.04513	Validation's rmse: 1.0453
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[155]	Training's rmse: 1.04382	Validation's rmse: 1.04589


[I 2024-12-02 15:10:00,944] Trial 11 finished with value: 1.0463916199238248 and parameters: {'learning_rate': 0.09691277751855445, 'num_leaves': 20, 'max_depth': 12, 'min_data_in_leaf': 11, 'feature_fraction': 0.9740316099409866, 'lambda_l1': 9.488809070242187, 'lambda_l2': 0.00026096131684290324, 'min_gain_to_split': 0.9981127666069013}. Best is trial 5 with value: 1.046030621678699.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[493]	Training's rmse: 1.04228	Validation's rmse: 1.04795
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[348]	Training's rmse: 1.04422	Validation's rmse: 1.04532
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[499]	Training's rmse: 1.04307	Validation's rmse: 1.04593


[I 2024-12-02 15:11:45,369] Trial 12 finished with value: 1.046403159309197 and parameters: {'learning_rate': 0.03288321603952487, 'num_leaves': 20, 'max_depth': 12, 'min_data_in_leaf': 10, 'feature_fraction': 0.8786580910072568, 'lambda_l1': 0.4154610302843644, 'lambda_l2': 0.007992024457787272, 'min_gain_to_split': 0.8210727200847164}. Best is trial 5 with value: 1.046030621678699.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1210]	Training's rmse: 1.04036	Validation's rmse: 1.04749
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1317]	Training's rmse: 1.04072	Validation's rmse: 1.04482
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1351]	Training's rmse: 1.0407	Validation's rmse: 1.04541


[I 2024-12-02 15:18:23,668] Trial 13 finished with value: 1.0459058765680267 and parameters: {'learning_rate': 0.0074316004574371255, 'num_leaves': 48, 'max_depth': 13, 'min_data_in_leaf': 25, 'feature_fraction': 0.8859387413757838, 'lambda_l1': 7.427838968787608, 'lambda_l2': 2.9930367783188713e-05, 'min_gain_to_split': 0.7980368381910301}. Best is trial 13 with value: 1.0459058765680267.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2100]	Training's rmse: 1.03969	Validation's rmse: 1.04757
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1857]	Training's rmse: 1.04108	Validation's rmse: 1.04494
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2156]	Training's rmse: 1.04044	Validation's rmse: 1.04559


[I 2024-12-02 15:29:28,121] Trial 14 finished with value: 1.0460332388755076 and parameters: {'learning_rate': 0.003979697120474634, 'num_leaves': 59, 'max_depth': -1, 'min_data_in_leaf': 29, 'feature_fraction': 0.8377042010683275, 'lambda_l1': 8.190396291658363, 'lambda_l2': 1.0495644745540572e-05, 'min_gain_to_split': 0.7488356050493857}. Best is trial 13 with value: 1.0459058765680267.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[3000]	Training's rmse: 1.08475	Validation's rmse: 1.08676
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[3000]	Training's rmse: 1.08592	Validation's rmse: 1.08418
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[3000]	Training's rmse: 1.08521	Validation's rmse: 1.08595


[I 2024-12-02 15:48:23,523] Trial 15 finished with value: 1.0856280198413228 and parameters: {'learning_rate': 0.00018620952040628407, 'num_leaves': 63, 'max_depth': 20, 'min_data_in_leaf': 53, 'feature_fraction': 0.41315296956757797, 'lambda_l1': 0.0482166942976701, 'lambda_l2': 1.667737129938963e-05, 'min_gain_to_split': 0.4130842640870702}. Best is trial 13 with value: 1.0459058765680267.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1405]	Training's rmse: 1.04063	Validation's rmse: 1.04787
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1173]	Training's rmse: 1.04229	Validation's rmse: 1.04524
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1257]	Training's rmse: 1.04208	Validation's rmse: 1.04589


[I 2024-12-02 15:53:55,680] Trial 16 finished with value: 1.0463342177153396 and parameters: {'learning_rate': 0.007848805418326386, 'num_leaves': 40, 'max_depth': 14, 'min_data_in_leaf': 26, 'feature_fraction': 0.7661082183511994, 'lambda_l1': 7.05505307355849e-05, 'lambda_l2': 0.003963981978794984, 'min_gain_to_split': 0.7448560908725992}. Best is trial 13 with value: 1.0459058765680267.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[3000]	Training's rmse: 1.04291	Validation's rmse: 1.04765
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[3000]	Training's rmse: 1.04377	Validation's rmse: 1.04496
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[3000]	Training's rmse: 1.04386	Validation's rmse: 1.0456


[I 2024-12-02 16:08:07,385] Trial 17 finished with value: 1.0460696356037935 and parameters: {'learning_rate': 0.002139725571524349, 'num_leaves': 37, 'max_depth': 9, 'min_data_in_leaf': 48, 'feature_fraction': 0.9048540411600601, 'lambda_l1': 1.6999343673904734, 'lambda_l2': 2.618744607760069e-05, 'min_gain_to_split': 0.8601458666753529}. Best is trial 13 with value: 1.0459058765680267.


{'learning_rate': 0.0074316004574371255, 'num_leaves': 48, 'max_depth': 13, 'min_data_in_leaf': 25, 'feature_fraction': 0.8859387413757838, 'lambda_l1': 7.427838968787608, 'lambda_l2': 2.9930367783188713e-05, 'min_gain_to_split': 0.7980368381910301}
