## CatBoost Hyperparameter Tuning (F1-Score)

In [17]:
# Load Libraries

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import f1_score

from catboost import CatBoostClassifier

import optuna

In [19]:
# Load Data

# Read the bank dataset; the first CSV column becomes the DataFrame index.
df = pd.read_csv(filepath_or_buffer='bank_4.csv', index_col=0)

In [21]:
# Train / Test Split

# Target first, then the predictors. Besides the target itself, 'complain',
# 'umap_1' and 'umap_2' are excluded from the feature matrix
# (NOTE(review): presumably leakage-prone or derived columns -- confirm).
y = df['churn']
X = df.drop(['churn', 'complain', 'umap_1', 'umap_2'], axis=1)

# 80/20 hold-out split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [23]:
# Hyperparameter tuning

def objective(trial, threshold=0.29, n_splits=5, random_state=42):
    """Optuna objective: mean cross-validated F1-score of a CatBoost classifier.

    Samples one CatBoost configuration from ``trial``, evaluates it with
    stratified K-fold cross-validation on the notebook-level ``X_train`` /
    ``y_train``, and returns the mean F1-score over the folds.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.
    threshold : float, default 0.29
        Probability cut-off. Rows whose predicted positive-class probability
        is below it are labelled 0, all other rows are labelled 1.
        NOTE(review): 0.29 was hard-coded in the original cell; presumably it
        was chosen in an earlier analysis -- confirm.
    n_splits : int, default 5
        Number of stratified folds.
    random_state : int, default 42
        Seed used for both the fold shuffling and the CatBoost model.

    Returns
    -------
    float
        Mean F1-score across the validation folds (higher is better).

    Notes
    -----
    Optuna calls the objective with ``trial`` only, so the defaults apply
    unless the extra arguments are bound first, e.g.
    ``study.optimize(lambda t: objective(t, threshold=0.35), ...)``.
    """
    # Search space. The order of the suggest_* calls is kept stable on purpose.
    params = {
        'iterations': trial.suggest_int('iterations', 500, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'depth': trial.suggest_int('depth', 6, 16),
        'random_strength': trial.suggest_float('random_strength', 0.3, 1),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.1, 1.0),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 2.0, 10.0)
    }

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    scores = []

    for tr, te in skf.split(X_train, y_train):

        X_tr, X_te = X_train.iloc[tr], X_train.iloc[te]
        y_tr, y_te = y_train.iloc[tr], y_train.iloc[te]

        # Build a fresh estimator for every fold so that no fitted state can
        # carry over from one fold to the next.
        model = CatBoostClassifier(
            **params,
            verbose=0,
            random_state=random_state)
        model.fit(X_tr, y_tr)

        # Column 1 of predict_proba is the positive-class probability; apply
        # the custom cut-off rather than the library's default decision rule.
        prob = model.predict_proba(X_te)[:, 1]
        y_pred = np.where(prob < threshold, 0, 1)

        scores.append(f1_score(y_te, y_pred))

    return np.mean(scores)

In [25]:
# Seed the sampler so the sequence of suggested hyperparameters is repeatable
# after a kernel restart. TPESampler is the sampler Optuna uses by default for
# single-objective studies, so the search algorithm itself is unchanged.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100, show_progress_bar=True)

[I 2024-07-10 14:33:41,816] A new study created in memory with name: no-name-9f3095ba-6fab-4a08-a629-ff24527c0b96


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2024-07-10 14:34:17,055] Trial 0 finished with value: 0.5716025929605287 and parameters: {'iterations': 925, 'learning_rate': 0.08438353547200592, 'depth': 12, 'random_strength': 0.7133152767235692, 'bagging_temperature': 0.24822840631165138, 'l2_leaf_reg': 6.729859024994687}. Best is trial 0 with value: 0.5716025929605287.
[I 2024-07-10 14:34:29,803] Trial 1 finished with value: 0.5955473522615714 and parameters: {'iterations': 534, 'learning_rate': 0.03898494295959593, 'depth': 11, 'random_strength': 0.5957926033122601, 'bagging_temperature': 0.7061012049887373, 'l2_leaf_reg': 9.060986042155577}. Best is trial 1 with value: 0.5955473522615714.
[I 2024-07-10 14:34:43,780] Trial 2 finished with value: 0.5934840428867407 and parameters: {'iterations': 575, 'learning_rate': 0.04045206136619186, 'depth': 11, 'random_strength': 0.3927267449881347, 'bagging_temperature': 0.9007350729443333, 'l2_leaf_reg': 9.669516354045438}. Best is trial 1 with value: 0.5955473522615714.
[I 2024-07-10 1

In [29]:
# Report the winning trial record, then only its hyperparameter values.
best_trial = study.best_trial
print("Best trial:", best_trial)
print("Best hyperparameters:", best_trial.params)

Best trial: FrozenTrial(number=53, state=1, values=[0.6238698606232861], datetime_start=datetime.datetime(2024, 7, 10, 14, 52, 53, 492397), datetime_complete=datetime.datetime(2024, 7, 10, 14, 52, 58, 809779), params={'iterations': 545, 'learning_rate': 0.013560631530876574, 'depth': 6, 'random_strength': 0.3660344916841721, 'bagging_temperature': 0.2722303737601264, 'l2_leaf_reg': 2.548492629309027}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'iterations': IntDistribution(high=1000, log=False, low=500, step=1), 'learning_rate': FloatDistribution(high=0.1, log=False, low=0.01, step=None), 'depth': IntDistribution(high=16, log=False, low=6, step=1), 'random_strength': FloatDistribution(high=1.0, log=False, low=0.3, step=None), 'bagging_temperature': FloatDistribution(high=1.0, log=False, low=0.1, step=None), 'l2_leaf_reg': FloatDistribution(high=10.0, log=False, low=2.0, step=None)}, trial_id=53, value=None)
Best hyperparameters: {'iterations': 545, 'learning