In [1]:
!jupyter nbextension enable --py widgetsnbextension

Config option `kernel_spec_manager_class` not recognized by `EnableNBExtensionApp`.
Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: ok


In [2]:
import numpy as np
import optuna
import pandas as pd
from lightgbm import LGBMRegressor
from optuna.visualization import plot_optimization_history, plot_contour
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import KFold, train_test_split

In [3]:
abalone = pd.read_csv("data/abalone.csv")
abalone["Sex"] = abalone["Sex"].astype("category")
abalone

Unnamed: 0,Sex,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Log_age
0,F,0.550,0.430,0.150,0.7715,0.3285,0.1465,0.2400,2.525729
1,F,0.630,0.490,0.145,1.1300,0.4580,0.2765,0.3200,2.525729
2,I,0.160,0.110,0.025,0.0210,0.0055,0.0030,0.0050,2.014903
3,M,0.595,0.475,0.150,0.9145,0.3755,0.2055,0.2500,2.442347
4,I,0.555,0.425,0.130,0.7820,0.3695,0.1600,0.1975,2.351375
...,...,...,...,...,...,...,...,...,...
90610,M,0.335,0.235,0.075,0.1585,0.0685,0.0370,0.0450,2.014903
90611,M,0.555,0.425,0.150,0.8790,0.3865,0.1815,0.2400,2.351375
90612,I,0.435,0.330,0.095,0.3215,0.1510,0.0785,0.0815,2.014903
90613,I,0.345,0.270,0.075,0.2000,0.0980,0.0490,0.0700,2.014903


In [4]:
abalone.describe()

Unnamed: 0,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Log_age
count,90615.0,90615.0,90615.0,90615.0,90615.0,90615.0,90615.0,90615.0
mean,0.517098,0.401679,0.135464,0.789035,0.340778,0.169422,0.225898,2.378353
std,0.118217,0.098026,0.038008,0.457671,0.204428,0.100909,0.130203,0.271609
min,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,0.916291
25%,0.445,0.345,0.11,0.419,0.1775,0.0865,0.12,2.251292
50%,0.545,0.425,0.14,0.7995,0.33,0.166,0.225,2.351375
75%,0.6,0.47,0.16,1.0675,0.463,0.2325,0.305,2.525729
max,0.815,0.65,1.13,2.8255,1.488,0.76,1.005,3.417727


In [5]:
# Define constants
RANDOM_SEED = 42
TARGET = "Log_age"

In [6]:
X, y = abalone.copy().drop(columns=[TARGET]), abalone[TARGET].copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [7]:
def objective(trial):
    params = {
        'n_jobs':-1,
        "metric":'rmse',  
        "verbosity": -1,
        "bagging_freq": 1,
        "boosting_type": "gbdt",    
        "objective": 'regression', 
        'random_state': RANDOM_SEED,
        'max_depth': trial.suggest_int('max_depth', 3, 15),                        
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "n_estimators": trial.suggest_int('n_estimators', 400, 1000),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),               
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.01),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 60),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.3, 1.0),
    }

    cv = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
    scores = []

    for train_idx, val_idx in cv.split(X_train, y_train):
        X_train_cv, y_train_cv = X_train.iloc[train_idx], y_train.iloc[train_idx]
        X_val_cv, y_val_cv = X_train.iloc[val_idx], y_train.iloc[val_idx]

        model = LGBMRegressor(**params)
        model.fit(X_train_cv, y_train_cv, categorical_feature=["Sex"])

        y_pred_cv = model.predict(X_val_cv)
        scores.append(root_mean_squared_error(y_val_cv, y_pred_cv))
    
    return np.mean(scores)

In [8]:
study = optuna.create_study(direction="minimize", study_name="Abalone Optuna LGBM")
study.optimize(objective, n_trials=50)

for key, value in study.best_params.items():
    print(f"{key}: {value}")

[I 2024-05-28 17:32:58,519] A new study created in memory with name: Abalone Optuna LGBM
[I 2024-05-28 17:33:03,986] Trial 0 finished with value: 0.14375807045030603 and parameters: {'max_depth': 12, 'subsample': 0.7803014605564268, 'n_estimators': 566, 'min_data_in_leaf': 100, 'learning_rate': 0.008532877158587762, 'min_child_samples': 15, 'lambda_l1': 9.75460254662202e-07, 'lambda_l2': 1.0269431172985284e-06, 'colsample_bytree': 0.9207728026629094}. Best is trial 0 with value: 0.14375807045030603.
[I 2024-05-28 17:33:09,443] Trial 1 finished with value: 0.14809382882337802 and parameters: {'max_depth': 9, 'subsample': 0.5581220262864449, 'n_estimators': 585, 'min_data_in_leaf': 35, 'learning_rate': 0.005298220791781237, 'min_child_samples': 10, 'lambda_l1': 2.88943316515456e-08, 'lambda_l2': 1.0145007805795036e-06, 'colsample_bytree': 0.430267303616005}. Best is trial 0 with value: 0.14375807045030603.
[I 2024-05-28 17:33:12,823] Trial 2 finished with value: 0.14552264664278947 and p

max_depth: 15
subsample: 0.5239009969476628
n_estimators: 971
min_data_in_leaf: 58
learning_rate: 0.009722401752774184
min_child_samples: 17
lambda_l1: 0.0010976171918308747
lambda_l2: 0.0005851087234310306
colsample_bytree: 0.9629487762752005


In [9]:
plot_optimization_history(study)

In [10]:
plot_contour(study)

In [11]:
study.best_params

{'max_depth': 15,
 'subsample': 0.5239009969476628,
 'n_estimators': 971,
 'min_data_in_leaf': 58,
 'learning_rate': 0.009722401752774184,
 'min_child_samples': 17,
 'lambda_l1': 0.0010976171918308747,
 'lambda_l2': 0.0005851087234310306,
 'colsample_bytree': 0.9629487762752005}

In [12]:
lgbm_params = {
    'n_jobs':-1,
    "metric":'rmse',  
    "verbosity": -1,
    "bagging_freq": 1,
    "boosting_type": "gbdt",    
    "objective": 'regression', 
    'random_state': RANDOM_SEED,
    'max_depth': 15,
    'subsample': 0.5239009969476628,
    'n_estimators': 971,
    'min_data_in_leaf': 58,
    'learning_rate': 0.009722401752774184,
    'min_child_samples': 17,
    'lambda_l1': 0.0010976171918308747,
    'lambda_l2': 0.0005851087234310306,
    'colsample_bytree': 0.9629487762752005
}

model = LGBMRegressor(**lgbm_params)
model.fit(X_train, y_train)

In [13]:
y_pred = model.predict(X_test)
root_mean_squared_error(y_test, y_pred)

0.14105726380815609