## Extra Trees Hyperparameter Tuning (Recall)

In [57]:
# Load Libraries

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import recall_score

from sklearn.ensemble import ExtraTreesClassifier

import optuna

In [59]:
# Load Data

df = pd.read_csv('bank_4.csv', index_col=0)

In [61]:
# Train / Test Split

X = df.drop(columns=['churn', 'complain', 'umap_1', 'umap_2'])
y = df['churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [65]:
# Hyperparameter tuning

def objective(trial):
    
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
        'max_features': trial.suggest_int('max_features', 4, 53),
        'max_depth': trial.suggest_int('max_depth', 2, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 32),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 32)
    }

    model = ExtraTreesClassifier(
        **params,
        random_state=42,
        n_jobs=-1)
        
    threshold = 0.32
    
    skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
    scores = []
    
    for tr, te in skf.split(X_train, y_train):
        
        X_tr, X_te = X_train.iloc[tr], X_train.iloc[te]
        y_tr, y_te = y_train.iloc[tr], y_train.iloc[te]
        
        model.fit(X_tr, y_tr)
        prob = model.predict_proba(X_te)[:, 1]
        y_pred = np.where(prob < threshold, 0, 1)
        
        scores.append(recall_score(y_te, y_pred))
        
    return np.mean(scores)

In [67]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100, show_progress_bar=True)

[I 2024-07-08 18:08:23,968] A new study created in memory with name: no-name-c432d60f-4f9a-4a17-894c-9c01a9554790


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2024-07-08 18:08:31,942] Trial 0 finished with value: 0.6276073619631901 and parameters: {'n_estimators': 633, 'max_features': 50, 'max_depth': 42, 'min_samples_split': 32, 'min_samples_leaf': 9}. Best is trial 0 with value: 0.6276073619631901.
[I 2024-07-08 18:08:34,751] Trial 1 finished with value: 0.6337423312883435 and parameters: {'n_estimators': 219, 'max_features': 50, 'max_depth': 41, 'min_samples_split': 31, 'min_samples_leaf': 5}. Best is trial 1 with value: 0.6337423312883435.
[I 2024-07-08 18:08:39,550] Trial 2 finished with value: 0.5852760736196319 and parameters: {'n_estimators': 571, 'max_features': 5, 'max_depth': 21, 'min_samples_split': 24, 'min_samples_leaf': 24}. Best is trial 1 with value: 0.6337423312883435.
[I 2024-07-08 18:09:06,049] Trial 3 finished with value: 0.6294478527607362 and parameters: {'n_estimators': 1971, 'max_features': 33, 'max_depth': 17, 'min_samples_split': 11, 'min_samples_leaf': 18}. Best is trial 1 with value: 0.6337423312883435.
[I 202

In [77]:
print("Best trial:", study.best_trial)
print("Best hyperparameters:", study.best_params)

Best trial: FrozenTrial(number=17, state=1, values=[0.6361963190184049], datetime_start=datetime.datetime(2024, 7, 8, 18, 12, 44, 549935), datetime_complete=datetime.datetime(2024, 7, 8, 18, 13, 8, 51777), params={'n_estimators': 1567, 'max_features': 47, 'max_depth': 34, 'min_samples_split': 21, 'min_samples_leaf': 14}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'n_estimators': IntDistribution(high=2000, log=False, low=100, step=1), 'max_features': IntDistribution(high=53, log=False, low=4, step=1), 'max_depth': IntDistribution(high=50, log=False, low=2, step=1), 'min_samples_split': IntDistribution(high=32, log=False, low=2, step=1), 'min_samples_leaf': IntDistribution(high=32, log=False, low=1, step=1)}, trial_id=17, value=None)
Best hyperparameters: {'n_estimators': 1567, 'max_features': 47, 'max_depth': 34, 'min_samples_split': 21, 'min_samples_leaf': 14}
