In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
import xgboost as xgb 
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Build an example dataset: two integer features and a target that is a
# linear function of them, with the rows shuffled reproducibly
n_rows = 1000
idx = range(n_rows)
frame = pd.DataFrame({
    'a': idx,
    'b': range(n_rows, 2 * n_rows),
    'y': [i / n_rows for i in idx],
})
data = frame.sample(frac=1, random_state=17).reset_index(drop=True)
data

Unnamed: 0,a,b,y
0,786,1786,0.786
1,499,1499,0.499
2,930,1930,0.930
3,217,1217,0.217
4,803,1803,0.803
...,...,...,...
995,406,1406,0.406
996,390,1390,0.390
997,143,1143,0.143
998,241,1241,0.241


In [3]:
# Separate the target column from the feature columns
y = data['y']
X = data.drop(columns='y')

In [4]:
# Hold out 20% of the rows as a test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=17,
)

In [5]:
# Standardize the features. The scaler is fitted on the training split only
# and the same fitted transform is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Hyperparameter Optimization for XGBoost with GridSearchCV

In [6]:
from sklearn.model_selection import GridSearchCV

In [7]:
# Base XGBoost regressor with default settings; it is handed to GridSearchCV
# below as the estimator whose hyperparameters are searched
model = xgb.XGBRegressor()

In [8]:
# Candidate values for the grid search: 2 boosters x 3 tree counts x
# 3 learning rates = 18 combinations
param_grid = dict(
    booster=["gbtree", "dart"],
    n_estimators=[500, 600, 700],
    learning_rate=[0.01, 0.0015, 0.001],
)

In [9]:
# Run the grid search on the training split: every combination in
# `param_grid` is scored with 5-fold cross-validation using negated MSE,
# with up to 6 parallel jobs
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=6,
)
search = grid.fit(X_train, y_train)

In [10]:
# Show the parameter combination selected by the grid search
best_grid_params = search.best_params_
print("The best hyperparameters are ", best_grid_params)

The best hyperparameters are  {'booster': 'gbtree', 'learning_rate': 0.01, 'n_estimators': 700}


In [11]:
# Keep the estimator selected by the grid search so it can be evaluated
# on the test split below
optimized_model = search.best_estimator_

In [12]:
# Evaluate the grid-search model on the held-out test split.
# Predict once and reuse the result for both metrics instead of running
# inference on X_test twice.
y_pred_grid = optimized_model.predict(X_test)
print("The mean squared error is ", mean_squared_error(y_test, y_pred_grid))
print("The R2 score is ", r2_score(y_test, y_pred_grid))

The mean squared error is  2.986394307644317e-06
The R2 score is  0.9999628649962704


# Hyperparameter Optimization for XGBoost with skopt

In [13]:
import numpy as np
from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args
from skopt import gp_minimize

In [14]:
# Compatibility shim: restore the `np.int` alias when the installed NumPy
# does not provide it. The attribute is checked directly instead of comparing
# `np.__version__` as a string, because string comparison is lexicographic
# (e.g. "1.9.0" > "1.20.0" is True) and does not reflect version order.
if not hasattr(np, "int"):
    np.int = int

In [15]:
# Search space for the optimizer, one named dimension per hyperparameter
n_estimators_dim = Integer(50, 200, name='n_estimators')
learning_rate_dim = Real(10**-5, 10**0, prior="log-uniform", name='learning_rate')
booster_dim = Categorical(['gbtree', 'gblinear', 'dart'], name='booster')

space = [n_estimators_dim, learning_rate_dim, booster_dim]

In [16]:
@use_named_args(space)
def objective(**params):
    """Score one hyperparameter configuration for the optimizer.

    The keyword arguments in ``params`` are applied to a fresh
    ``XGBRegressor``, which is then evaluated with 5-fold cross-validation
    on ``X_train`` / ``y_train``.

    Returns:
        The mean squared error averaged over the folds. It is positive,
        so a lower value means a better configuration.
    """
    # Start from a default regressor and apply the candidate parameters
    candidate = xgb.XGBRegressor()
    candidate.set_params(**params)

    fold_scores = cross_val_score(
        candidate,
        X_train,
        y_train,
        scoring="neg_mean_squared_error",
        cv=5,
        n_jobs=6,
    )

    # The scorer reports negated MSE per fold; negate the mean to minimize MSE
    return -np.mean(fold_scores)

In [17]:
# Minimize the objective over `space` with gp_minimize: 50 evaluations,
# fixed seed, progress printed for each iteration
res_gp = gp_minimize(
    objective,
    space,
    n_calls=50,
    random_state=17,
    verbose=1,
)

Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.0379
Function value obtained: 0.0000
Current minimum: 0.0000
Iteration No: 2 started. Evaluating function at random point.
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.6423
Function value obtained: 0.0578
Current minimum: 0.0000
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.0628
Function value obtained: 0.0085
Current minimum: 0.0000
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 0.0309
Function value obtained: 0.0000
Current minimum: 0.0000
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 0.0160
Function value obtained: 0.0819
Current minimum: 0.0000
Iteration No: 6 started. Evaluating

In [18]:
# `res_gp.fun` is the lowest objective value found, i.e. the best
# cross-validated mean squared error returned by `objective`
print("The best score is ", res_gp.fun)

The best score is  5.142654389704623e-07


In [19]:
# Build a fresh estimator from the best point found by the optimizer.
# The parameter names are read from `space` (paired positionally with
# `res_gp.x`) instead of being repeated by hand, so they cannot drift from
# the search-space definition. A new XGBRegressor is created rather than
# calling set_params on the shared `model` object from the grid-search
# section, which would mutate it and alias it to `optimized_model`.
best_params = {dim.name: value for dim, value in zip(space, res_gp.x)}
optimized_model = xgb.XGBRegressor(**best_params)

In [20]:
# Train the model configured with the optimizer's best parameters
# on the training split
optimized_model.fit(X_train, y_train)

In [21]:
# Predict on the held-out test features; the metrics are computed from
# `y_pred` in the next cell
y_pred = optimized_model.predict(X_test)

In [22]:
# Report test-set error and goodness of fit for the skopt-tuned model
mse_value = mean_squared_error(y_test, y_pred)
r2_value = r2_score(y_test, y_pred)
print("The mean squared error is ", mse_value)
print("The R2 score is ", r2_value)

The mean squared error is  4.0507027117730474e-07
The R2 score is  0.9999949630609755
