In [1]:
import numpy as np

In [6]:
import xgboost as xgb
import numpy as np
from typing import Tuple


In [8]:
# First derivative of the squared log error with respect to the prediction
def gradient(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray:
    '''Return the element-wise gradient of the squared log error.

    For the loss 1/2 * (log1p(predt) - log1p(label))**2 the derivative with
    respect to predt is (log1p(predt) - log1p(label)) / (predt + 1).
    Labels are read from ``dtrain``. No clipping happens here, so the caller
    is responsible for keeping ``predt`` above -1.
    '''
    labels = dtrain.get_label()
    log_residual = np.log1p(predt) - np.log1p(labels)
    return log_residual / (predt + 1)

# Second derivative of the squared log error with respect to the prediction
def hessian(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray:
    '''Return the element-wise hessian of the squared log error.

    Evaluates (1 - log1p(predt) + log1p(label)) / (predt + 1)**2 against the
    labels stored in ``dtrain``. No clipping happens here, so the caller is
    responsible for keeping ``predt`` above -1.
    '''
    labels = dtrain.get_label()
    numerator = -np.log1p(predt) + np.log1p(labels) + 1
    denominator = np.power(predt + 1, 2)
    return numerator / denominator

# Custom objective function for squared log error
def c_squared_log(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[np.ndarray, np.ndarray]:
    '''Squared Log Error objective. A simplified version for RMSLE used as objective function.

    Args:
        predt: Current predictions. The input array is not modified.
        dtrain: Training matrix whose labels are read by ``gradient`` and
            ``hessian``.

    Returns:
        A ``(grad, hess)`` tuple of element-wise first and second derivatives
        of the loss with respect to the predictions.
    '''
    # log1p is -inf at -1 and undefined below it, and (predt + 1) is a divisor
    # in both derivatives. Flooring at -1 + 1e-6 therefore has to include the
    # boundary value -1 itself, which a strict `predt < -1` mask would miss.
    # np.maximum returns a new array, so the caller's buffer stays untouched.
    predt = np.maximum(predt, -1 + 1e-6)

    grad = gradient(predt, dtrain)
    hess = hessian(predt, dtrain)

    return grad, hess



In [9]:
# Create dummy data for demonstration (use your actual dataset here).
# A seeded generator keeps the demo identical across kernel restarts.
rng = np.random.default_rng(42)
X_train = rng.random((100, 10))  # 100 samples, 10 features
y_train = rng.random(100) * 10  # Target values between 0 and 10

# Convert data into DMatrix (XGBoost's internal data structure)
dtrain = xgb.DMatrix(X_train, label=y_train)

# Parameters for the XGBoost model.
# No 'objective' entry on purpose: that key only accepts the names of
# XGBoost's built-in objectives, and an unknown name such as "c_squared_log"
# makes training fail. The custom objective is a Python callable and is handed
# to xgb.train through its `obj=` argument instead.
params = {
    'max_depth': 3,
    'eta': 0.1,
}



In [2]:
! pip install optuna

Collecting optuna
[?25l  Downloading https://files.pythonhosted.org/packages/4e/41/2a2f5ed6c997367ab7055185cf66d536c228b15a12b8e112a274808f48b5/optuna-4.0.0-py3-none-any.whl (362kB)
[K     |████████████████████████████████| 368kB 1.7MB/s eta 0:00:01
[?25hCollecting colorlog (from optuna)
  Downloading https://files.pythonhosted.org/packages/f3/18/3e867ab37a24fdf073c1617b9c7830e06ec270b1ea4694a624038fc40a03/colorlog-6.8.2-py3-none-any.whl
Collecting alembic>=1.5.0 (from optuna)
[?25l  Downloading https://files.pythonhosted.org/packages/34/47/95d8f99c9f4a57079dfbcff5e023c5d81bde092d1c2354156340a56b3a1a/alembic-1.12.1-py3-none-any.whl (226kB)
[K     |████████████████████████████████| 235kB 2.1MB/s eta 0:00:01
Collecting Mako (from alembic>=1.5.0->optuna)
[?25l  Downloading https://files.pythonhosted.org/packages/03/3b/68690a035ba7347860f1b8c0cde853230ba69ff41df5884ea7d89fe68cd3/Mako-1.2.4-py3-none-any.whl (78kB)
[K     |████████████████████████████████| 81kB 3.0MB/s eta 0:00:01
Ins

In [None]:
# Run a single boosting round, handing the custom objective to XGBoost via `obj`
xgb.train(
    params,
    dtrain,
    num_boost_round=1,
    obj=c_squared_log,
)

In [6]:
import optuna
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Define the custom gradient and hessian for squared log error
def gradient(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray:
    '''Return the element-wise gradient of the squared log error.

    Computes (log1p(predt) - log1p(label)) / (predt + 1), with the labels
    taken from the ``dtrain`` DMatrix. ``predt`` is expected to be above -1;
    nothing is clipped here.
    '''
    targets = dtrain.get_label()  # labels stored on the DMatrix
    log_diff = np.log1p(predt) - np.log1p(targets)
    return log_diff / (predt + 1)

def hessian(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray:
    '''Return the element-wise hessian of the squared log error.

    Computes (1 - log1p(predt) + log1p(label)) / (predt + 1)**2, with the
    labels taken from the ``dtrain`` DMatrix. ``predt`` is expected to be
    above -1; nothing is clipped here.
    '''
    targets = dtrain.get_label()  # labels stored on the DMatrix
    top = -np.log1p(predt) + np.log1p(targets) + 1
    bottom = np.power(predt + 1, 2)
    return top / bottom

def squared_log(predt: np.ndarray, dtrain: xgb.DMatrix):
    '''Squared Log Error objective function for XGBoost.

    Args:
        predt: Current predictions. The input array is not modified.
        dtrain: Training matrix whose labels are read by ``gradient`` and
            ``hessian``.

    Returns:
        A ``(grad, hess)`` tuple of element-wise first and second derivatives
        of the loss with respect to the predictions.
    '''
    # Floor predictions at -1 + 1e-6 before taking log1p. Using np.maximum
    # (instead of masking only values strictly below -1) also catches the
    # boundary value -1, where log1p is -inf and (predt + 1) would be a zero
    # divisor. A new array is returned, so the caller's data is left as is.
    predt = np.maximum(predt, -1 + 1e-6)
    grad = gradient(predt, dtrain)
    hess = hessian(predt, dtrain)
    return grad, hess

# Create synthetic dataset (replace this with your actual dataset).
# The generator is seeded so features and targets are the same on every
# Restart & Run All; the split below is already deterministic via random_state.
rng = np.random.default_rng(42)
X = rng.random((400, 4))  # 400 samples, 4 features
y = rng.random(400) * 10  # Target values between 0 and 10

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert the training and validation sets to DMatrix format
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)



In [7]:
# Evaluate the objective with the training targets standing in as predictions
squared_log(predt=y_train, dtrain=dtrain)

(array([-5.48133273e-09,  8.51373898e-09, -6.04478033e-09, -4.50914799e-09,
        -2.19162715e-08, -4.82722279e-09,  5.80247690e-09, -8.58345292e-09,
         1.19275949e-08, -6.54205090e-09,  5.57265052e-10, -1.39370479e-09,
         1.48105424e-08, -1.32078136e-09, -1.03201216e-08,  6.45635101e-09,
        -4.33696680e-09, -8.48865702e-09, -7.40097025e-09,  7.82187345e-09,
        -9.22205574e-09,  3.55988228e-10, -8.58049863e-09,  3.07766280e-09,
         1.43896021e-08,  1.11258692e-08, -5.79200849e-09,  9.87625771e-09,
        -1.32469045e-08,  4.31646506e-09, -5.08854026e-09,  4.39375411e-09,
         8.26143827e-10, -1.27925430e-09,  3.32509736e-09, -2.32680091e-10,
        -9.91077440e-09,  2.88351512e-09,  1.09976692e-08,  1.23591854e-08,
         1.21807292e-09,  4.93638305e-10, -9.93804982e-09, -7.94159996e-09,
        -1.56263238e-08, -8.84101233e-09,  3.67424154e-09,  8.46316385e-10,
        -2.72349910e-09, -6.92264356e-10,  5.28808099e-09,  9.76287483e-09,
         5.3

In [None]:
# Define an objective function for Optuna to optimize
def objective(trial):
    '''Train one XGBoost model with trial-suggested settings and score it.

    Uses the notebook-level ``dtrain``, ``dval`` and ``y_val`` objects and the
    custom ``squared_log`` objective.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean absolute error of the model's predictions on the validation set.
    '''
    # Booster parameters. suggest_float replaces the deprecated
    # suggest_uniform / suggest_loguniform helpers. The custom objective is
    # deliberately NOT stored here: parameter values are passed to XGBoost as
    # strings, so a callable under 'objective' is rejected as an unknown
    # objective name.
    param = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'eval_metric': 'mae',      # Mean Absolute Error for evaluation
    }

    # 'n_estimators' belongs to the scikit-learn wrapper; in the native API the
    # number of trees is the num_boost_round argument, so the sampled value is
    # routed there instead of being silently ignored inside `param`.
    n_estimators = trial.suggest_int('n_estimators', 100, 300)

    # Train the model using DMatrix; the custom objective goes in through obj=
    model = xgb.train(param, dtrain, num_boost_round=n_estimators, evals=[(dval, "validation")],
                      obj=squared_log, early_stopping_rounds=1, verbose_eval=False)

    # Predict on the validation set
    preds = model.predict(dval)

    # Compute the Mean Absolute Error (MAE) on the validation set
    return mean_absolute_error(y_val, preds)

# Create a study object and run the optimization
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)

# Print the best parameters and the best score.
# Optuna's Study exposes `best_params` / `best_value`; there is no
# scikit-learn style trailing-underscore attribute.
print('Best parameters:', study.best_params)
print('Best score:', study.best_value)


[I 2024-09-29 00:34:34,315] A new study created in memory with name: no-name-40cfd897-3350-4aa0-9811-d708630c7308
