In [1]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
import catboost as cb
from catboost import CatBoostClassifier
import time

from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, f1_score, roc_auc_score, average_precision_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split

from preprocessor import Preprocessor

np.random.seed(42)

In [2]:
df = pd.read_csv('data/Training_TriGuard.csv')
df = df.dropna(subset=['subrogation'])

In [3]:
pre = Preprocessor()

In [4]:
X = df.drop(columns=["subrogation"]).copy()
y = df["subrogation"].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=0)

In [5]:
y_train.value_counts(normalize=True)

subrogation
0.0    0.77141
1.0    0.22859
Name: proportion, dtype: float64

In [6]:
y_test.value_counts(normalize=True)

subrogation
0.0    0.771296
1.0    0.228704
Name: proportion, dtype: float64

In [7]:
X_train_proc = pre.fit_transform(X_train)
X_test_proc = pre.transform(X_test)

X_test_proc = X_test_proc.reindex(columns=X_train_proc.columns, fill_value=0)

## Vanilla CatBoost Model (Default Parameters)

In [8]:
cb_clf = cb.CatBoostClassifier(
    objective='Logloss',
    random_state=42,
    thread_count=-1
)

In [9]:
cb_clf.fit(X_train_proc, y_train, verbose=False)

<catboost.core.CatBoostClassifier at 0x116692660>

In [10]:
test_probabilities = cb_clf.predict_proba(X_test_proc)[:, 1]

test_classes = cb_clf.predict(X_test_proc)

print(f"Accuracy: {accuracy_score(y_test, test_classes)}")
print(f"F1 Score: {f1_score(y_test, test_classes)}")
print(f"ROC AUC Score: {roc_auc_score(y_test, test_probabilities)}") # Use probabilities
print(f"PR AUC (Average Precision): {average_precision_score(y_test, test_probabilities)}") # Use probabilities
print(f"Precision: {precision_score(y_test, test_classes)}")
print(f"Recall: {recall_score(y_test, test_classes)}")

Accuracy: 0.8133333333333334
F1 Score: 0.5135135135135135
ROC AUC Score: 0.8343265403327322
PR AUC (Average Precision): 0.5999310575144581
Precision: 0.6356033452807647
Recall: 0.4307692307692308


## CatBoost with Randomized Search for Hyperparameter Tuning

In [17]:
print("Starting randomized search tuning...")
start_time = time.time()

# Define a larger parameter space
param_dist = {
    'depth': [2, 3, 4], 
    'learning_rate': [0.005, 0.01, 0.015], 
    'n_estimators': [2000, 2250, 2500],
    'subsample': [0.4, 0.5, 0.6],
    'rsm': [0.8, 0.9, 1.0],
    'l2_leaf_reg': [9, 11, 13]
}

# Use Randomized Search
random_search = RandomizedSearchCV(
    estimator=cb_clf,
    param_distributions=param_dist,
    n_iter=150,
    cv=3,
    scoring='roc_auc',
    random_state=42,
    n_jobs=-1,
    verbose=1
)

random_search.fit(X_train_proc, y_train, verbose=False)

random_search_time = time.time() - start_time
print(f"Randomized search complete. Time taken: {random_search_time:.2f} seconds")

# Output the best parameters
print("\nBest parameters:")
for param, value in random_search.best_params_.items():
    print(f"  {param}: {value}")

print(f"\nBest cross-validation accuracy: {random_search.best_score_:.4f}")

Starting randomized search tuning...
Fitting 3 folds for each of 150 candidates, totalling 450 fits
Randomized search complete. Time taken: 418.14 seconds

Best parameters:
  subsample: 0.5
  rsm: 1.0
  n_estimators: 2000
  learning_rate: 0.01
  l2_leaf_reg: 13
  depth: 2

Best cross-validation accuracy: 0.8410


In [18]:
best_model = random_search.best_estimator_
test_predictions_gs = best_model.predict(X_test_proc)
round_test_predictions_gs = [round(p) for p in test_predictions_gs]
print(f"Accuracy: {accuracy_score(y_test, round_test_predictions_gs)}")
print(f"F1 Score: {f1_score(y_test, round_test_predictions_gs)}")
print(f"ROC AUC Score: {roc_auc_score(y_test, test_predictions_gs)}") # Use probabilities
print(f"PR AUC (Average Precision): {average_precision_score(y_test, test_predictions_gs)}") # Use probabilities
print(f"Precision: {precision_score(y_test, round_test_predictions_gs)}")
print(f"Recall: {recall_score(y_test, round_test_predictions_gs)}")

Accuracy: 0.8194444444444444
F1 Score: 0.5222929936305732
ROC AUC Score: 0.6830163644405131
PR AUC (Average Precision): 0.41539898132427844
Precision: 0.6612903225806451
Recall: 0.43157894736842106
