# Tune XGBoost Hyperparameters

In this notebook we show how to use the _tune_clf.py_ module to tune the hyperparameters of an XGBoost classifier.

Let's start with the boilerplate: import the required libraries and silence warnings.

In [1]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings
from tune_clf import fine_tune_clf
from xgboost import XGBClassifier
warnings.filterwarnings('ignore')

Load a dataset suitable for binary classification

In [2]:
# Fetch the breast-cancer dataset and assemble a single frame:
# one column per feature plus a 'target' column holding the class labels.
raw_data = load_breast_cancer()
data = pd.DataFrame(
    raw_data.data,
    columns=raw_data.feature_names,
).assign(target=raw_data.target)

Define the hyperparameter search space

In [3]:
# Candidate values for each XGBClassifier hyper-parameter; every entry is a list
# of options, and single-element lists pin a parameter to a fixed value.
# Keys use the scikit-learn wrapper's own argument names (reg_lambda, reg_alpha,
# random_state) instead of the native-booster aliases (lambda, alpha, seed), which
# only reach the model through the wrapper's **kwargs passthrough.
param_space = {
    'objective': ['binary:logistic'],
    'eval_metric': ['logloss'],
    'n_estimators': [100, 200, 500],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 6, 9],
    'min_child_weight': [1, 5, 10],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.9, 1.0],
    'gamma': [0, 0.1, 0.5],
    'reg_lambda': [1, 2, 3],   # L2 regularisation term on weights
    'reg_alpha': [0, 0.5, 1],  # L1 regularisation term on weights
    'random_state': [1234],    # fixed seed so each candidate model is repeatable
    'scale_pos_weight': [None, 1, 3, 5]  # Use if dataset is imbalanced
}

Find the best model

In [4]:
# Search the hyper-parameter space, tracking accuracy while also reporting
# f1 and balanced accuracy.
# random_search=10 -> random search of 10 combinations in the hyperparams space;
# set it to None for a complete search.
# NOTE(review): fine_tune_clf lives in tune_clf.py (not shown here); the meaning of
# the three returned values is taken from the names they are unpacked into.
best_params, best_score, best_results = fine_tune_clf(
    model_constructor=XGBClassifier,
    hyperparams_space=param_space,
    track_metric='accuracy',
    metrics=['f1', 'accuracy', 'balanced_accuracy'],
    data=data,
    y_col='target',
    random_search=10,
)

Performances improved! Iter 1/10, best accuracy=0.953
Performances improved! Iter 2/10, best accuracy=0.956
Performances improved! Iter 4/10, best accuracy=0.968
--------------------------------
FINISHED TRAINING!!!!
Best params performance
fit_time = 0.192
score_time = 0.005
test_f1 = 0.975
test_accuracy = 0.968
test_balanced_accuracy = 0.963


Test best model

In [5]:
# Hold out 20% of the rows for the demo; the fixed random_state keeps the split repeatable.
d_train, d_test = train_test_split(data, test_size=0.2, random_state=42)
x_train = d_train.drop(columns='target')
y_train = d_train['target']
x_test = d_test.drop(columns='target')
y_test = d_test['target']

# Refit a classifier with the tuned hyper-parameters on the training split only.
best_model = XGBClassifier(**best_params)
best_model.fit(x_train, y_train)

# Score the held-out rows.
# NOTE(review): the `data` frame handed to fine_tune_clf contains these test rows too,
# so this figure may be optimistic — confirm how tune_clf evaluates candidates.
y_pred = best_model.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9649122807017544
