In [1]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
import catboost as cb
from catboost import CatBoostClassifier
import time

from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, f1_score, roc_auc_score, average_precision_score, precision_score, recall_score, classification_report
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split

from cc2_preprocessor import Preprocessor

np.random.seed(42)

In [2]:
df = pd.read_csv('data/Training_TriGuard.csv')
df = df.dropna(subset=['subrogation'])

In [3]:
pre = Preprocessor(smoothing_factor=5, mode = 'catboost')

In [4]:
X = df.drop(columns=["subrogation"]).copy()
y = df["subrogation"].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=0)

In [5]:
y_train.value_counts(normalize=True)

subrogation
0.0    0.77141
1.0    0.22859
Name: proportion, dtype: float64

In [6]:
y_test.value_counts(normalize=True)

subrogation
0.0    0.771296
1.0    0.228704
Name: proportion, dtype: float64

In [7]:
pre.fit(X_train, y_train)

X_train_proc = pre.transform(X_train)
X_test_proc = pre.transform(X_test)

X_test_proc = X_test_proc.reindex(columns=X_train_proc.columns, fill_value=0)

Fitting Preprocessor in 'catboost' mode...
CatBoost mode: Skipping target encoding learning.
Fit complete.
Transforming data in 'catboost' mode...
CatBoost mode: Skipping target encoding application.
Transform complete.
Transforming data in 'catboost' mode...
CatBoost mode: Skipping target encoding application.
Transform complete.


## Vanilla CatBoost

In [8]:
cb_clf = cb.CatBoostClassifier(
    objective='Logloss',
    random_state=42,
    thread_count=-1
)

In [9]:
cat_feature_names = pre.cat_for_encoding_
cb_clf.fit(X_train_proc, y_train, 
           cat_features=cat_feature_names,
           verbose=False)

<catboost.core.CatBoostClassifier at 0x11a0d0590>

In [10]:
test_probabilities = cb_clf.predict_proba(X_test_proc)[:, 1]

test_classes = cb_clf.predict(X_test_proc)

print(f"Accuracy: {accuracy_score(y_test, test_classes)}")
print(f"F1 Score: {f1_score(y_test, test_classes)}")
print(f"ROC AUC Score: {roc_auc_score(y_test, test_probabilities)}") # Use probabilities
print(f"PR AUC (Average Precision): {average_precision_score(y_test, test_probabilities)}") # Use probabilities
print(f"Precision: {precision_score(y_test, test_classes)}")
print(f"Recall: {recall_score(y_test, test_classes)}")

Accuracy: 0.8112962962962963
F1 Score: 0.5108017282765243
ROC AUC Score: 0.8344394923961488
PR AUC (Average Precision): 0.6043784760678073
Precision: 0.6273584905660378
Recall: 0.4307692307692308


## Optuna

In [11]:
import optuna
from optuna.integration import CatBoostPruningCallback

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
CAT_FEATURES = pre.cat_for_encoding_
print(CAT_FEATURES)

['accident_site', 'accident_type', 'channel', 'vehicle_category', 'vehicle_color', 'living_status', 'claim_day_of_week', 'gender', 'in_network_bodyshop', 'season', 'witness_present_ind']


In [13]:
def objective(trial: optuna.trial.Trial) -> float:

    params = {
        'iterations': 1000,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'depth': trial.suggest_int('depth', 3, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 10.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0, step=0.1),
        'random_strength': trial.suggest_float('random_strength', 1e-8, 1.0, log=True), 
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1.0, 10.0),
        
        'eval_metric': 'F1',
        'task_type': 'CPU',
        'verbose': False,
        'early_stopping_rounds': 100
    }

    params['eval_metric'] = 'Logloss'
    
    model = CatBoostClassifier(**params)
    
    model.fit(
        X_train_proc, y_train,
        eval_set=(X_test_proc, y_test),
        cat_features=CAT_FEATURES,
        verbose=False
    )

    y_preds = model.predict(X_test_proc)
    
    manual_f1_score = f1_score(y_test, y_preds, pos_label=1)
    
    return manual_f1_score

In [14]:
print("\n2. Starting Optuna study...")

study = optuna.create_study(
    direction='maximize',
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=10)
)

study.optimize(
    objective, 
    n_trials=50, # Number of trials to run
    show_progress_bar=True
)

print("\n" + "="*50)
print("Optuna study finished.")
print(f"Number of finished trials: {len(study.trials)}")

print("\nBest trial:")
best_trial = study.best_trial
    
print(f"  Value (Max F1 Score): {best_trial.value:.4f}") # <-- CHANGED comment
    
print("  Best Hyperparameters:")
for key, value in best_trial.params.items():
    print(f"    {key}: {value}")

[I 2025-11-06 16:47:52,020] A new study created in memory with name: no-name-e38be2ca-1a82-43b5-8c1b-ac2e23caf0b4



2. Starting Optuna study...


Best trial: 0. Best value: 0.576561:   2%|▏         | 1/50 [00:05<04:07,  5.06s/it]

[I 2025-11-06 16:47:57,085] Trial 0 finished with value: 0.5765606595995288 and parameters: {'learning_rate': 0.0146154791390475, 'depth': 4, 'l2_leaf_reg': 0.1444278477876349, 'subsample': 0.7, 'random_strength': 5.662041462785675e-07, 'bagging_temperature': 0.20885984194780416, 'border_count': 132, 'scale_pos_weight': 3.551127359417357}. Best is trial 0 with value: 0.5765606595995288.


Best trial: 0. Best value: 0.576561:   4%|▍         | 2/50 [00:10<04:24,  5.52s/it]

[I 2025-11-06 16:48:02,927] Trial 1 finished with value: 0.55196366550895 and parameters: {'learning_rate': 0.03454517832442975, 'depth': 10, 'l2_leaf_reg': 0.7855735725005841, 'subsample': 0.6, 'random_strength': 0.0002893535004877552, 'bagging_temperature': 0.6091324029745874, 'border_count': 102, 'scale_pos_weight': 6.48414584786913}. Best is trial 0 with value: 0.5765606595995288.


Best trial: 2. Best value: 0.580336:   6%|▌         | 3/50 [00:14<03:33,  4.55s/it]

[I 2025-11-06 16:48:06,321] Trial 2 finished with value: 0.580335731414868 and parameters: {'learning_rate': 0.037681271073073834, 'depth': 3, 'l2_leaf_reg': 2.7206545963129534, 'subsample': 0.9, 'random_strength': 0.3160442215648532, 'bagging_temperature': 0.5337825329226593, 'border_count': 166, 'scale_pos_weight': 1.5465066170534172}. Best is trial 2 with value: 0.580335731414868.


Best trial: 2. Best value: 0.580336:   8%|▊         | 4/50 [00:16<02:50,  3.70s/it]

[I 2025-11-06 16:48:08,732] Trial 3 finished with value: 0.5175936435868331 and parameters: {'learning_rate': 0.0357688377197551, 'depth': 6, 'l2_leaf_reg': 7.945331228043654, 'subsample': 0.8, 'random_strength': 0.00018079134317486905, 'bagging_temperature': 0.11530625250100168, 'border_count': 88, 'scale_pos_weight': 9.765685330726484}. Best is trial 2 with value: 0.580335731414868.


Best trial: 2. Best value: 0.580336:  10%|█         | 5/50 [00:18<02:15,  3.01s/it]

[I 2025-11-06 16:48:10,511] Trial 4 finished with value: 0.5764425936942297 and parameters: {'learning_rate': 0.03649556646043176, 'depth': 4, 'l2_leaf_reg': 0.0013148952593284077, 'subsample': 0.6, 'random_strength': 0.00024137558361779328, 'bagging_temperature': 0.4312276987237089, 'border_count': 83, 'scale_pos_weight': 3.4241494544629503}. Best is trial 2 with value: 0.580335731414868.


Best trial: 2. Best value: 0.580336:  12%|█▏        | 6/50 [00:19<01:45,  2.39s/it]

[I 2025-11-06 16:48:11,697] Trial 5 finished with value: 0.5471414242728184 and parameters: {'learning_rate': 0.127402678301513, 'depth': 5, 'l2_leaf_reg': 2.772608730586713, 'subsample': 0.9, 'random_strength': 0.008041653106814185, 'bagging_temperature': 0.4749075356386333, 'border_count': 170, 'scale_pos_weight': 5.954622088011761}. Best is trial 2 with value: 0.580335731414868.


Best trial: 2. Best value: 0.580336:  14%|█▍        | 7/50 [00:21<01:34,  2.20s/it]

[I 2025-11-06 16:48:13,507] Trial 6 finished with value: 0.5674499564838991 and parameters: {'learning_rate': 0.1775617205319118, 'depth': 8, 'l2_leaf_reg': 0.0013397839075910316, 'subsample': 0.9, 'random_strength': 3.970303868125091e-06, 'bagging_temperature': 0.6643693489330315, 'border_count': 44, 'scale_pos_weight': 3.8771867231606434}. Best is trial 2 with value: 0.580335731414868.


Best trial: 2. Best value: 0.580336:  16%|█▌        | 8/50 [00:28<02:35,  3.69s/it]

[I 2025-11-06 16:48:20,390] Trial 7 finished with value: 0.5426629750818022 and parameters: {'learning_rate': 0.017502665027523016, 'depth': 9, 'l2_leaf_reg': 0.25542528908615025, 'subsample': 0.7, 'random_strength': 0.022712840239869993, 'bagging_temperature': 0.9551237503618937, 'border_count': 238, 'scale_pos_weight': 7.835524394218863}. Best is trial 2 with value: 0.580335731414868.


Best trial: 2. Best value: 0.580336:  18%|█▊        | 9/50 [00:37<03:40,  5.37s/it]

[I 2025-11-06 16:48:29,446] Trial 8 finished with value: 0.5759071117561684 and parameters: {'learning_rate': 0.018726943813224393, 'depth': 10, 'l2_leaf_reg': 5.33082055210284, 'subsample': 0.5, 'random_strength': 0.19314415018795356, 'bagging_temperature': 0.5658242495164131, 'border_count': 85, 'scale_pos_weight': 4.303653482823392}. Best is trial 2 with value: 0.580335731414868.


Best trial: 2. Best value: 0.580336:  20%|██        | 10/50 [00:43<03:39,  5.49s/it]

[I 2025-11-06 16:48:35,225] Trial 9 finished with value: 0.5341704902197537 and parameters: {'learning_rate': 0.014009873258243096, 'depth': 8, 'l2_leaf_reg': 0.9066184038145395, 'subsample': 0.9, 'random_strength': 3.4132957950352104e-05, 'bagging_temperature': 0.23913285523148808, 'border_count': 248, 'scale_pos_weight': 7.794053545643479}. Best is trial 2 with value: 0.580335731414868.


Best trial: 10. Best value: 0.584843:  22%|██▏       | 11/50 [00:44<02:50,  4.36s/it]

[I 2025-11-06 16:48:37,022] Trial 10 finished with value: 0.5848428835489834 and parameters: {'learning_rate': 0.06423508431423151, 'depth': 3, 'l2_leaf_reg': 0.01851641373757591, 'subsample': 1.0, 'random_strength': 1.2151177494645464e-08, 'bagging_temperature': 0.8630552964494784, 'border_count': 187, 'scale_pos_weight': 1.9075965326429385}. Best is trial 10 with value: 0.5848428835489834.


Best trial: 10. Best value: 0.584843:  24%|██▍       | 12/50 [00:46<02:11,  3.45s/it]

[I 2025-11-06 16:48:38,382] Trial 11 finished with value: 0.5374941779226828 and parameters: {'learning_rate': 0.08994449514858874, 'depth': 3, 'l2_leaf_reg': 0.017798091980106984, 'subsample': 1.0, 'random_strength': 1.351131214890611e-08, 'bagging_temperature': 0.9286515226124714, 'border_count': 189, 'scale_pos_weight': 1.1003736423143566}. Best is trial 10 with value: 0.5848428835489834.


Best trial: 10. Best value: 0.584843:  26%|██▌       | 13/50 [00:48<01:48,  2.92s/it]

[I 2025-11-06 16:48:40,082] Trial 12 finished with value: 0.5473588342440802 and parameters: {'learning_rate': 0.06802868151109469, 'depth': 3, 'l2_leaf_reg': 0.015239089663484534, 'subsample': 1.0, 'random_strength': 0.9137062506279684, 'bagging_temperature': 0.7877788426874383, 'border_count': 199, 'scale_pos_weight': 1.1274666965299684}. Best is trial 10 with value: 0.5848428835489834.


Best trial: 13. Best value: 0.587579:  28%|██▊       | 14/50 [00:49<01:32,  2.58s/it]

[I 2025-11-06 16:48:41,868] Trial 13 finished with value: 0.5875785066294487 and parameters: {'learning_rate': 0.05348134529220945, 'depth': 5, 'l2_leaf_reg': 0.03282693285836524, 'subsample': 1.0, 'random_strength': 1.369876338730249e-08, 'bagging_temperature': 0.7827603156340863, 'border_count': 150, 'scale_pos_weight': 2.273753199489144}. Best is trial 13 with value: 0.5875785066294487.


Best trial: 13. Best value: 0.587579:  30%|███       | 15/50 [00:51<01:20,  2.30s/it]

[I 2025-11-06 16:48:43,520] Trial 14 finished with value: 0.5855737704918033 and parameters: {'learning_rate': 0.06458023644942615, 'depth': 6, 'l2_leaf_reg': 0.02514189016073123, 'subsample': 1.0, 'random_strength': 1.4168429872932364e-08, 'bagging_temperature': 0.8279411755578788, 'border_count': 137, 'scale_pos_weight': 2.661783509231227}. Best is trial 13 with value: 0.5875785066294487.


Best trial: 13. Best value: 0.587579:  32%|███▏      | 16/50 [00:52<01:05,  1.93s/it]

[I 2025-11-06 16:48:44,594] Trial 15 finished with value: 0.5789640927867811 and parameters: {'learning_rate': 0.2886916133590396, 'depth': 6, 'l2_leaf_reg': 0.05637279879856131, 'subsample': 0.8, 'random_strength': 2.2615546589290939e-07, 'bagging_temperature': 0.7501794540584772, 'border_count': 130, 'scale_pos_weight': 2.738370590149065}. Best is trial 13 with value: 0.5875785066294487.


Best trial: 13. Best value: 0.587579:  34%|███▍      | 17/50 [00:54<01:00,  1.82s/it]

[I 2025-11-06 16:48:46,160] Trial 16 finished with value: 0.559652928416486 and parameters: {'learning_rate': 0.10852249945179104, 'depth': 7, 'l2_leaf_reg': 0.005342742992763863, 'subsample': 1.0, 'random_strength': 1.704348651878083e-07, 'bagging_temperature': 0.7499832735801842, 'border_count': 148, 'scale_pos_weight': 4.7766646171295495}. Best is trial 13 with value: 0.5875785066294487.


Best trial: 17. Best value: 0.590863:  36%|███▌      | 18/50 [00:57<01:08,  2.15s/it]

[I 2025-11-06 16:48:49,068] Trial 17 finished with value: 0.5908629441624366 and parameters: {'learning_rate': 0.02518678651017915, 'depth': 5, 'l2_leaf_reg': 0.05045475867042612, 'subsample': 0.8, 'random_strength': 4.101334797615207e-06, 'bagging_temperature': 0.3694466001962227, 'border_count': 116, 'scale_pos_weight': 2.474241678196821}. Best is trial 17 with value: 0.5908629441624366.


Best trial: 18. Best value: 0.593226:  38%|███▊      | 19/50 [01:00<01:20,  2.61s/it]

[I 2025-11-06 16:48:52,765] Trial 18 finished with value: 0.593226137529935 and parameters: {'learning_rate': 0.025103049273737132, 'depth': 5, 'l2_leaf_reg': 0.005278117430261129, 'subsample': 0.8, 'random_strength': 1.1142369126658653e-05, 'bagging_temperature': 0.3890812048222657, 'border_count': 46, 'scale_pos_weight': 2.429975358736442}. Best is trial 18 with value: 0.593226137529935.


Best trial: 18. Best value: 0.593226:  40%|████      | 20/50 [01:03<01:17,  2.58s/it]

[I 2025-11-06 16:48:55,262] Trial 19 finished with value: 0.5574025974025975 and parameters: {'learning_rate': 0.023184409995802017, 'depth': 5, 'l2_leaf_reg': 0.004607276275819062, 'subsample': 0.8, 'random_strength': 9.616786936962553e-06, 'bagging_temperature': 0.35638868767845727, 'border_count': 37, 'scale_pos_weight': 5.153651412793124}. Best is trial 18 with value: 0.593226137529935.


Best trial: 18. Best value: 0.593226:  42%|████▏     | 21/50 [01:08<01:34,  3.26s/it]

[I 2025-11-06 16:49:00,102] Trial 20 finished with value: 0.5849652558433355 and parameters: {'learning_rate': 0.010940242346576645, 'depth': 7, 'l2_leaf_reg': 0.004794009099210432, 'subsample': 0.7, 'random_strength': 2.0822221391377467e-06, 'bagging_temperature': 0.3178464259822792, 'border_count': 60, 'scale_pos_weight': 3.017964569265845}. Best is trial 18 with value: 0.593226137529935.


Best trial: 21. Best value: 0.594576:  44%|████▍     | 22/50 [01:10<01:28,  3.15s/it]

[I 2025-11-06 16:49:03,011] Trial 21 finished with value: 0.5945757997218358 and parameters: {'learning_rate': 0.028990278077965104, 'depth': 5, 'l2_leaf_reg': 0.046395270105008594, 'subsample': 0.8, 'random_strength': 0.0024402603250790666, 'bagging_temperature': 0.046374718040806784, 'border_count': 111, 'scale_pos_weight': 2.3200395172890085}. Best is trial 21 with value: 0.5945757997218358.


Best trial: 21. Best value: 0.594576:  46%|████▌     | 23/50 [01:14<01:27,  3.25s/it]

[I 2025-11-06 16:49:06,497] Trial 22 finished with value: 0.5904486251808972 and parameters: {'learning_rate': 0.02538760555145585, 'depth': 4, 'l2_leaf_reg': 0.10665033191369785, 'subsample': 0.8, 'random_strength': 0.0016754989218567308, 'bagging_temperature': 0.0047623709677798165, 'border_count': 111, 'scale_pos_weight': 2.016123488408126}. Best is trial 21 with value: 0.5945757997218358.


Best trial: 21. Best value: 0.594576:  48%|████▊     | 24/50 [01:17<01:20,  3.08s/it]

[I 2025-11-06 16:49:09,170] Trial 23 finished with value: 0.5695919508105086 and parameters: {'learning_rate': 0.02542971599630331, 'depth': 5, 'l2_leaf_reg': 0.009356975032008236, 'subsample': 0.8, 'random_strength': 3.8437797493704766e-05, 'bagging_temperature': 0.005337730414589237, 'border_count': 69, 'scale_pos_weight': 4.2200749700243225}. Best is trial 21 with value: 0.5945757997218358.


Best trial: 21. Best value: 0.594576:  50%|█████     | 25/50 [01:24<01:48,  4.33s/it]

[I 2025-11-06 16:49:16,407] Trial 24 finished with value: 0.5874125874125874 and parameters: {'learning_rate': 0.010028001733384506, 'depth': 5, 'l2_leaf_reg': 0.31328855252051435, 'subsample': 0.7, 'random_strength': 0.0019213013243579945, 'bagging_temperature': 0.15097232548046982, 'border_count': 113, 'scale_pos_weight': 2.931370951641401}. Best is trial 21 with value: 0.5945757997218358.


Best trial: 21. Best value: 0.594576:  52%|█████▏    | 26/50 [01:27<01:31,  3.83s/it]

[I 2025-11-06 16:49:19,076] Trial 25 finished with value: 0.592222618622904 and parameters: {'learning_rate': 0.04790386317598467, 'depth': 4, 'l2_leaf_reg': 0.04352412681712474, 'subsample': 0.6, 'random_strength': 1.5084856272192618e-05, 'bagging_temperature': 0.3737893406146734, 'border_count': 56, 'scale_pos_weight': 2.094430554923304}. Best is trial 21 with value: 0.5945757997218358.


Best trial: 21. Best value: 0.594576:  54%|█████▍    | 27/50 [01:29<01:18,  3.43s/it]

[I 2025-11-06 16:49:21,585] Trial 26 finished with value: 0.5794174757281554 and parameters: {'learning_rate': 0.041051291745300394, 'depth': 4, 'l2_leaf_reg': 0.06315264319188221, 'subsample': 0.6, 'random_strength': 0.0008145070634785748, 'bagging_temperature': 0.27604759589373284, 'border_count': 57, 'scale_pos_weight': 1.6740762108640974}. Best is trial 21 with value: 0.5945757997218358.


Best trial: 21. Best value: 0.594576:  56%|█████▌    | 28/50 [01:31<01:06,  3.04s/it]

[I 2025-11-06 16:49:23,716] Trial 27 finished with value: 0.5792349726775956 and parameters: {'learning_rate': 0.049314614988427576, 'depth': 4, 'l2_leaf_reg': 0.003097382047284874, 'subsample': 0.5, 'random_strength': 2.553785635929212e-05, 'bagging_temperature': 0.09760381523602846, 'border_count': 70, 'scale_pos_weight': 3.3202030238450133}. Best is trial 21 with value: 0.5945757997218358.


Best trial: 21. Best value: 0.594576:  58%|█████▊    | 29/50 [01:35<01:08,  3.25s/it]

[I 2025-11-06 16:49:27,438] Trial 28 finished with value: 0.5181950509461426 and parameters: {'learning_rate': 0.030568452775487656, 'depth': 6, 'l2_leaf_reg': 0.14869029837810527, 'subsample': 0.6, 'random_strength': 7.985819931385623e-05, 'bagging_temperature': 0.43856274046614574, 'border_count': 33, 'scale_pos_weight': 1.0241182329849945}. Best is trial 21 with value: 0.5945757997218358.


Best trial: 21. Best value: 0.594576:  60%|██████    | 30/50 [01:40<01:13,  3.68s/it]

[I 2025-11-06 16:49:32,138] Trial 29 finished with value: 0.5656565656565656 and parameters: {'learning_rate': 0.019093352303475027, 'depth': 4, 'l2_leaf_reg': 0.010794633610860617, 'subsample': 0.7, 'random_strength': 5.585195422449031e-07, 'bagging_temperature': 0.1877748967423788, 'border_count': 47, 'scale_pos_weight': 4.596814304158309}. Best is trial 21 with value: 0.5945757997218358.


Best trial: 21. Best value: 0.594576:  62%|██████▏   | 31/50 [01:44<01:14,  3.91s/it]

[I 2025-11-06 16:49:36,563] Trial 30 finished with value: 0.5778425655976677 and parameters: {'learning_rate': 0.014949047887158088, 'depth': 6, 'l2_leaf_reg': 0.21404008251631493, 'subsample': 0.5, 'random_strength': 0.014487545124838113, 'bagging_temperature': 0.6692680552920105, 'border_count': 95, 'scale_pos_weight': 3.710823592958274}. Best is trial 21 with value: 0.5945757997218358.


Best trial: 21. Best value: 0.594576:  64%|██████▍   | 32/50 [01:47<01:05,  3.63s/it]

[I 2025-11-06 16:49:39,545] Trial 31 finished with value: 0.590645271423694 and parameters: {'learning_rate': 0.028079863455360645, 'depth': 5, 'l2_leaf_reg': 0.037213207188075526, 'subsample': 0.8, 'random_strength': 3.0885369661692908e-06, 'bagging_temperature': 0.38529789167382616, 'border_count': 119, 'scale_pos_weight': 2.3878869244438805}. Best is trial 21 with value: 0.5945757997218358.


Best trial: 21. Best value: 0.594576:  66%|██████▌   | 33/50 [01:49<00:54,  3.22s/it]

[I 2025-11-06 16:49:41,799] Trial 32 finished with value: 0.5938043856595893 and parameters: {'learning_rate': 0.04398358319355123, 'depth': 5, 'l2_leaf_reg': 0.06392485396213228, 'subsample': 0.9, 'random_strength': 1.1653704056307705e-05, 'bagging_temperature': 0.3063928458217299, 'border_count': 74, 'scale_pos_weight': 2.2820124748112605}. Best is trial 21 with value: 0.5945757997218358.


Best trial: 21. Best value: 0.594576:  68%|██████▊   | 34/50 [01:52<00:47,  2.94s/it]

[I 2025-11-06 16:49:44,110] Trial 33 finished with value: 0.587504567044209 and parameters: {'learning_rate': 0.0448926997999366, 'depth': 4, 'l2_leaf_reg': 0.4596358452840999, 'subsample': 0.9, 'random_strength': 0.0006341679103008362, 'bagging_temperature': 0.2431829137149811, 'border_count': 74, 'scale_pos_weight': 1.9564500759646823}. Best is trial 21 with value: 0.5945757997218358.


Best trial: 21. Best value: 0.594576:  70%|███████   | 35/50 [01:55<00:46,  3.07s/it]

[I 2025-11-06 16:49:47,481] Trial 34 finished with value: 0.5691666666666667 and parameters: {'learning_rate': 0.033922875318036214, 'depth': 7, 'l2_leaf_reg': 0.14183737556389214, 'subsample': 0.9, 'random_strength': 1.0577149251322084e-06, 'bagging_temperature': 0.5025458702269321, 'border_count': 54, 'scale_pos_weight': 1.5317495709228115}. Best is trial 21 with value: 0.5945757997218358.


Best trial: 21. Best value: 0.594576:  72%|███████▏  | 36/50 [01:56<00:36,  2.57s/it]

[I 2025-11-06 16:49:48,885] Trial 35 finished with value: 0.5400926151596392 and parameters: {'learning_rate': 0.07979521212205075, 'depth': 3, 'l2_leaf_reg': 0.07078481816308262, 'subsample': 0.9, 'random_strength': 9.018879936018981e-06, 'bagging_temperature': 0.3091772322512647, 'border_count': 77, 'scale_pos_weight': 6.767268839023442}. Best is trial 21 with value: 0.5945757997218358.


Best trial: 21. Best value: 0.594576:  74%|███████▍  | 37/50 [02:00<00:35,  2.75s/it]

[I 2025-11-06 16:49:52,038] Trial 36 finished with value: 0.5850965961361545 and parameters: {'learning_rate': 0.04496115191228873, 'depth': 4, 'l2_leaf_reg': 0.5428692482817882, 'subsample': 0.7, 'random_strength': 8.610593918757926e-05, 'bagging_temperature': 0.06727796134426595, 'border_count': 63, 'scale_pos_weight': 3.287957284532202}. Best is trial 21 with value: 0.5945757997218358.


Best trial: 21. Best value: 0.594576:  76%|███████▌  | 38/50 [02:02<00:30,  2.55s/it]

[I 2025-11-06 16:49:54,132] Trial 37 finished with value: 0.5727324586423275 and parameters: {'learning_rate': 0.03223832773148143, 'depth': 6, 'l2_leaf_reg': 0.002614301628932866, 'subsample': 0.6, 'random_strength': 1.243528497786136e-05, 'bagging_temperature': 0.18431511718594581, 'border_count': 102, 'scale_pos_weight': 3.8948203229462437}. Best is trial 21 with value: 0.5945757997218358.


Best trial: 21. Best value: 0.594576:  78%|███████▊  | 39/50 [02:04<00:26,  2.37s/it]

[I 2025-11-06 16:49:56,090] Trial 38 finished with value: 0.5181653042688465 and parameters: {'learning_rate': 0.03806538384359154, 'depth': 5, 'l2_leaf_reg': 0.09115652279695192, 'subsample': 0.8, 'random_strength': 0.004061769088533989, 'bagging_temperature': 0.5483285471627994, 'border_count': 92, 'scale_pos_weight': 9.62305973664018}. Best is trial 21 with value: 0.5945757997218358.


Best trial: 21. Best value: 0.594576:  80%|████████  | 40/50 [02:07<00:26,  2.65s/it]

[I 2025-11-06 16:49:59,397] Trial 39 finished with value: 0.549078980570275 and parameters: {'learning_rate': 0.020872495075055383, 'depth': 4, 'l2_leaf_reg': 0.007376795699504658, 'subsample': 0.9, 'random_strength': 0.04462899161255009, 'bagging_temperature': 0.43188218841398435, 'border_count': 48, 'scale_pos_weight': 5.8702628472080605}. Best is trial 21 with value: 0.5945757997218358.


Best trial: 21. Best value: 0.594576:  82%|████████▏ | 41/50 [02:09<00:22,  2.45s/it]

[I 2025-11-06 16:50:01,385] Trial 40 finished with value: 0.5680373039423484 and parameters: {'learning_rate': 0.05466721199238995, 'depth': 5, 'l2_leaf_reg': 0.001807570866106271, 'subsample': 0.8, 'random_strength': 0.0003250537568732419, 'bagging_temperature': 0.6030398348739806, 'border_count': 103, 'scale_pos_weight': 1.394163606178667}. Best is trial 21 with value: 0.5945757997218358.


Best trial: 21. Best value: 0.594576:  84%|████████▍ | 42/50 [02:13<00:22,  2.84s/it]

[I 2025-11-06 16:50:05,139] Trial 41 finished with value: 0.5916266300617707 and parameters: {'learning_rate': 0.02201420818588919, 'depth': 5, 'l2_leaf_reg': 0.04180310179139076, 'subsample': 0.8, 'random_strength': 6.674579504842763e-06, 'bagging_temperature': 0.38334574645714853, 'border_count': 130, 'scale_pos_weight': 2.3729788830581326}. Best is trial 21 with value: 0.5945757997218358.


Best trial: 21. Best value: 0.594576:  86%|████████▌ | 43/50 [02:19<00:27,  3.89s/it]

[I 2025-11-06 16:50:11,479] Trial 42 finished with value: 0.5911401597676107 and parameters: {'learning_rate': 0.01530429223907212, 'depth': 5, 'l2_leaf_reg': 0.032104708351957924, 'subsample': 0.7, 'random_strength': 0.00014190331888513192, 'bagging_temperature': 0.46399368479935876, 'border_count': 124, 'scale_pos_weight': 2.0294666415543188}. Best is trial 21 with value: 0.5945757997218358.


Best trial: 21. Best value: 0.594576:  88%|████████▊ | 44/50 [02:21<00:20,  3.37s/it]

[I 2025-11-06 16:50:13,641] Trial 43 finished with value: 0.5774058577405857 and parameters: {'learning_rate': 0.02965747627533654, 'depth': 6, 'l2_leaf_reg': 0.012026868188147513, 'subsample': 0.9, 'random_strength': 1.629913647829436e-05, 'bagging_temperature': 0.38705700619012684, 'border_count': 80, 'scale_pos_weight': 3.398976956291902}. Best is trial 21 with value: 0.5945757997218358.


Best trial: 21. Best value: 0.594576:  90%|█████████ | 45/50 [02:26<00:18,  3.70s/it]

[I 2025-11-06 16:50:18,092] Trial 44 finished with value: 0.5868603916614024 and parameters: {'learning_rate': 0.021492452041734042, 'depth': 4, 'l2_leaf_reg': 1.6248198034342183, 'subsample': 0.8, 'random_strength': 6.138103776981157e-05, 'bagging_temperature': 0.31663867948254304, 'border_count': 43, 'scale_pos_weight': 2.9345139335459187}. Best is trial 21 with value: 0.5945757997218358.


Best trial: 21. Best value: 0.594576:  92%|█████████▏| 46/50 [02:28<00:13,  3.35s/it]

[I 2025-11-06 16:50:20,648] Trial 45 finished with value: 0.577238951896754 and parameters: {'learning_rate': 0.03899969267348936, 'depth': 3, 'l2_leaf_reg': 0.01874216489149506, 'subsample': 0.9, 'random_strength': 1.179167058068543e-06, 'bagging_temperature': 0.2529032558555214, 'border_count': 91, 'scale_pos_weight': 1.6489101944870606}. Best is trial 21 with value: 0.5945757997218358.


Best trial: 21. Best value: 0.594576:  94%|█████████▍| 47/50 [02:34<00:12,  4.16s/it]

[I 2025-11-06 16:50:26,696] Trial 46 finished with value: 0.5938472174213619 and parameters: {'learning_rate': 0.01255851774455187, 'depth': 6, 'l2_leaf_reg': 0.044561938085464554, 'subsample': 0.8, 'random_strength': 4.922786965410017e-06, 'bagging_temperature': 0.49983655547932637, 'border_count': 156, 'scale_pos_weight': 2.3682530869225715}. Best is trial 21 with value: 0.5945757997218358.


Best trial: 21. Best value: 0.594576:  96%|█████████▌| 48/50 [02:39<00:08,  4.41s/it]

[I 2025-11-06 16:50:31,692] Trial 47 finished with value: 0.5780380673499268 and parameters: {'learning_rate': 0.011595950942832214, 'depth': 8, 'l2_leaf_reg': 0.027325055485873037, 'subsample': 0.7, 'random_strength': 3.874100931892773e-07, 'bagging_temperature': 0.5047741010335989, 'border_count': 167, 'scale_pos_weight': 4.017586044443871}. Best is trial 21 with value: 0.5945757997218358.


Best trial: 21. Best value: 0.594576:  98%|█████████▊| 49/50 [02:45<00:04,  4.72s/it]

[I 2025-11-06 16:50:37,137] Trial 48 finished with value: 0.586093920640699 and parameters: {'learning_rate': 0.0135174129572402, 'depth': 7, 'l2_leaf_reg': 0.001031104676216177, 'subsample': 0.9, 'random_strength': 0.0002823271490907119, 'bagging_temperature': 0.1456290316067929, 'border_count': 224, 'scale_pos_weight': 2.1222076960335334}. Best is trial 21 with value: 0.5945757997218358.


Best trial: 21. Best value: 0.594576: 100%|██████████| 50/50 [02:49<00:00,  3.40s/it]

[I 2025-11-06 16:50:41,783] Trial 49 finished with value: 0.5865771812080537 and parameters: {'learning_rate': 0.017268833788154022, 'depth': 6, 'l2_leaf_reg': 0.16951070830968767, 'subsample': 0.8, 'random_strength': 0.08480734016786749, 'bagging_temperature': 0.6453535890663101, 'border_count': 180, 'scale_pos_weight': 2.5728435639534597}. Best is trial 21 with value: 0.5945757997218358.

Optuna study finished.
Number of finished trials: 50

Best trial:
  Value (Max F1 Score): 0.5946
  Best Hyperparameters:
    learning_rate: 0.028990278077965104
    depth: 5
    l2_leaf_reg: 0.046395270105008594
    subsample: 0.8
    random_strength: 0.0024402603250790666
    bagging_temperature: 0.046374718040806784
    border_count: 111
    scale_pos_weight: 2.3200395172890085





In [15]:
best_params = study.best_trial.params
print(best_params)

final_params = best_params.copy()
final_params.update({
    'iterations': 2000, # Use more iterations for the final model
    'eval_metric': 'Logloss', # Use Logloss for training/stopping
    'task_type': 'CPU',
    'early_stopping_rounds': 50 # Keep early stopping
})

best_model = CatBoostClassifier(**final_params)

best_model.fit(
    X_train_proc, y_train,
    eval_set=(X_test_proc, y_test),
    cat_features=CAT_FEATURES,
    verbose=False
)

print(f"\nFinal Model Score (from best Logloss iteration):")
y_preds_final = best_model.predict(X_test_proc)
final_f1 = f1_score(y_test, y_preds_final, pos_label=1)
print(f"  Manual F1:class=1 Score: {final_f1:.4f}")
        
print("\n  Full Classification Report:")
print(classification_report(y_test, y_preds_final, target_names=['Class 0.0', 'Class 1.0']))

{'learning_rate': 0.028990278077965104, 'depth': 5, 'l2_leaf_reg': 0.046395270105008594, 'subsample': 0.8, 'random_strength': 0.0024402603250790666, 'bagging_temperature': 0.046374718040806784, 'border_count': 111, 'scale_pos_weight': 2.3200395172890085}

Final Model Score (from best Logloss iteration):
  Manual F1:class=1 Score: 0.5946

  Full Classification Report:
              precision    recall  f1-score   support

   Class 0.0       0.90      0.81      0.85      4165
   Class 1.0       0.52      0.69      0.59      1235

    accuracy                           0.78      5400
   macro avg       0.71      0.75      0.72      5400
weighted avg       0.81      0.78      0.79      5400

