In [1]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
import xgboost as xgb
import time

from sklearn.metrics import accuracy_score,confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split

from preprocessor import Preprocessor

In [2]:
# Read the training table and keep only the rows whose 'subrogation' value
# is present; that column is used as the label further down.
df = pd.read_csv('data/Training_TriGuard.csv').dropna(subset=['subrogation'])

In [3]:
# Project-local feature preprocessor (imported from preprocessor.py).
# It is fitted on the training split and re-applied to the test split below.
pre = Preprocessor()

In [4]:
# Label vector and feature frame, copied so later edits never touch `df`.
y = df["subrogation"].copy()
X = df.drop(columns=["subrogation"]).copy()

# 70/30 hold-out split; stratifying on y keeps the class ratio the same in
# both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=0,
)

In [5]:
# Class balance of the label in the training split, shown as fractions.
y_train.value_counts(normalize=True)

subrogation
0.0    0.77141
1.0    0.22859
Name: proportion, dtype: float64

In [6]:
# Class balance of the label in the held-out split, shown as fractions.
y_test.value_counts(normalize=True)

subrogation
0.0    0.771296
1.0    0.228704
Name: proportion, dtype: float64

In [7]:
# Fit the preprocessor on the training features only, then apply the fitted
# transform to the held-out features.
X_train_proc = pre.fit_transform(X_train)

# Force the test frame onto exactly the training columns (same order);
# columns that are absent from the transformed test frame are filled with 0.
X_test_proc = pre.transform(X_test).reindex(
    columns=X_train_proc.columns,
    fill_value=0,
)

In [8]:
# Wrap the processed splits in DMatrix objects for the native xgb.train API;
# categorical-dtype support is switched on for both.
dmatrix_train = xgb.DMatrix(X_train_proc, label=y_train, enable_categorical=True)
dmatrix_test = xgb.DMatrix(X_test_proc, label=y_test, enable_categorical=True)

In [9]:
# Baseline booster: only the objective is specified, every other training
# setting is left at the library default.
learning_objective = dict(objective='binary:logistic')
model = xgb.train(learning_objective, dmatrix_train)

In [10]:
# The booster was trained with objective 'binary:logistic', so predict()
# yields one positive-class probability per test row.
test_predictions = model.predict(dmatrix_test)

# Turn probabilities into 0/1 labels with an explicit, vectorised threshold.
# The strict '>' gives the same labels as the previous per-element round():
# Python rounds halves to even, so a probability of exactly 0.5 mapped to 0.
round_test_predictions = (test_predictions > 0.5).astype(int)

# Hold-out accuracy of the baseline booster (displayed as the cell output).
accuracy_score(y_test, round_test_predictions)

0.8096296296296296

In [11]:
# Base estimator passed to the hyperparameter search below; only the
# objective, the seed and n_jobs are pinned here.
xgb_clf = xgb.XGBClassifier(
    n_jobs=-1,
    random_state=42,
    objective='binary:logistic',
)

In [12]:
print("Starting randomized search tuning...")
# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be skewed by wall-clock adjustments during the search.
start_time = time.perf_counter()

# Candidate values per hyperparameter; the search samples combinations from
# these lists instead of exhaustively trying the full grid.
param_dist = {
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.005, 0.01, 0.02, 0.03, 0.05],
    'n_estimators': [700, 900, 1100, 1300, 1500],
    'subsample': [0.7, 0.75, 0.8, 0.85, 0.9],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8],
    'gamma': [0, 0.05, 0.1],
    'reg_alpha': [0, 0.05, 0.1, 0.2],
    'reg_lambda': [0.25, 0.5, 0.75, 1.0, 1.25]
}

# Randomized search scored by accuracy with 3-fold cross-validation.
random_search = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=param_dist,
    n_iter=150,               # Randomly sample 150 parameter sets
    cv=3,
    scoring='accuracy',
    random_state=42,          # fixed seed so the sampled candidates are repeatable
    n_jobs=-1,
    verbose=1
)

# Fit on the processed training split only; the hold-out set is not touched here.
random_search.fit(X_train_proc, y_train)

random_search_time = time.perf_counter() - start_time
print(f"Randomized search complete. Time taken: {random_search_time:.2f} seconds")

# Output the best parameters
print("\nBest parameters:")
for param, value in random_search.best_params_.items():
    print(f"  {param}: {value}")

print(f"\nBest cross-validation accuracy: {random_search.best_score_:.4f}")

Starting randomized search tuning...
Fitting 3 folds for each of 150 candidates, totalling 450 fits
Randomized search complete. Time taken: 79.75 seconds

Best parameters:
  subsample: 0.75
  reg_lambda: 0.5
  reg_alpha: 0.05
  n_estimators: 900
  max_depth: 3
  learning_rate: 0.02
  gamma: 0.05
  colsample_bytree: 0.6

Best cross-validation accuracy: 0.8174


In [13]:
# Estimator refitted by the search on the full training split with the best
# parameter combination found.
best_model = random_search.best_estimator_

# XGBClassifier.predict() already returns hard class labels (not
# probabilities), so the former element-wise round() loop was a no-op.
test_predictions_gs = best_model.predict(X_test_proc)
# Name kept so any later cell that refers to it keeps working.
round_test_predictions_gs = test_predictions_gs

# Hold-out accuracy of the tuned model (displayed as the cell output).
accuracy_score(y_test, round_test_predictions_gs)

0.8190740740740741

In [16]:
# Hand-picked configuration, fitted on the processed training split for
# comparison with the searched model.
# NOTE(review): eval_metric is set, but fit() below receives no eval_set —
# confirm whether an evaluation set was meant to be passed.
model_prior = xgb.XGBClassifier(
    max_depth=5,
    learning_rate=0.06,
    n_estimators=800,
    colsample_bytree=0.8,
    subsample=0.8,
    eval_metric="auc",
    random_state=42,
)
# fit() is the last expression so the fitted estimator is rendered as output.
model_prior.fit(X_train_proc, y_train)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [17]:
# XGBClassifier.predict() already returns hard class labels (not
# probabilities), so the former element-wise round() loop was a no-op.
test_predictions_prior = model_prior.predict(X_test_proc)
# Name kept so any later cell that refers to it keeps working.
round_test_predictions_prior = test_predictions_prior

# Hold-out accuracy of the hand-picked model (displayed as the cell output).
accuracy_score(y_test, round_test_predictions_prior)

0.8111111111111111