In [1]:
import numpy as np
from pathlib import Path
from catboost import CatBoostClassifier, Pool

# --- Load the pre-computed train / validation splits -------------------------
splits_dir = Path("../data/splits")

# NOTE(review): allow_pickle=True makes np.load unpickle object arrays, which
# can execute arbitrary code from a tampered file. Keep it only if these files
# are trusted and really contain object arrays -- TODO confirm.
# `train` / `val` stay bound to whatever np.load returns, as before.
train = np.load(splits_dir / "train.npz", allow_pickle=True)
val   = np.load(splits_dir / "val.npz",   allow_pickle=True)

# Features as float32, labels as int
X_train = train["X"].astype(np.float32)
y_train = train["y"].astype(int)

X_val   = val["X"].astype(np.float32)
y_val   = val["y"].astype(int)

feature_columns = np.load(
    splits_dir / "feature_columns.npy",
    allow_pickle=True
).tolist()

# Fail fast on inconsistent inputs instead of hitting an obscure error later,
# when the arrays are concatenated or handed to CatBoost.
assert X_train.ndim == 2 and X_val.ndim == 2, "X must be (n_samples, n_features)"
assert X_train.shape[0] == y_train.shape[0], "train: X / y row count mismatch"
assert X_val.shape[0] == y_val.shape[0], "val: X / y row count mismatch"
assert X_train.shape[1] == X_val.shape[1] == len(feature_columns), (
    "feature count differs between train, val and feature_columns"
)

# Tuning uses train + val together; the search below runs its own
# cross-validation on this combined set.
X_tune = np.concatenate([X_train, X_val], axis=0)
y_tune = np.concatenate([y_train, y_val], axis=0)

print("Tuning set:", X_tune.shape, y_tune.shape)


Tuning set: (412577, 773) (412577,)


In [2]:
# --- Draw a reproducible random subsample for the hyper-parameter search -----
SUBSAMPLE_SEED = 42
MAX_TUNE_SAMPLES = 20_000  # cap on rows used for tuning, to keep the search fast

rng = np.random.default_rng(SUBSAMPLE_SEED)
n_samples = min(MAX_TUNE_SAMPLES, X_tune.shape[0])

# Sample row indices without replacement so no row appears twice.
idx_sub = rng.choice(X_tune.shape[0], size=n_samples, replace=False)

X_sub = X_tune[idx_sub]
y_sub = y_tune[idx_sub]

# The draw is not stratified; make sure it still contains more than one class,
# otherwise AUC is undefined for the search.
assert np.unique(y_sub).size >= 2, "subsample contains a single class"

train_pool_sub = Pool(X_sub, y_sub, feature_names=feature_columns)
print("Subsampled tuning set:", X_sub.shape, y_sub.shape)


Subsampled tuning set: (20000, 773) (20000,)


In [3]:
# Candidate values for each hyper-parameter sampled by the randomized search.
# bagging_temperature is not searched; it is fixed on the estimator itself.
param_dist = dict(
    depth=[4, 6, 8, 10],
    learning_rate=[0.03, 0.05, 0.08],
    l2_leaf_reg=[1, 3, 5, 7, 10],
    random_strength=[0.5, 1.0, 2.0],
    border_count=[64, 128],
    iterations=[500],  # single value: the tree budget is the same for every candidate
)


In [4]:
# Base estimator. Settings given here are shared by every sampled candidate;
# the searched hyper-parameters come from `param_dist`.
model = CatBoostClassifier(
    task_type="CPU",
    loss_function="Logloss",
    eval_metric="AUC",
    bagging_temperature=1.0,
    verbose=False,
)

# Randomized hyper-parameter search on the subsampled pool.
search_result = model.randomized_search(
    X=train_pool_sub,
    param_distributions=param_dist,
    # Search budget: number of random parameter combinations to evaluate
    n_iter=25,
    # Evaluation scheme: 3-fold cross-validation rather than a single
    # train/test split, with shuffling and a fixed partition seed
    cv=3,
    search_by_train_test_split=False,
    shuffle=True,
    partition_random_seed=42,
    # Also return cross-validation statistics in the result
    calc_cv_statistics=True,
    # Afterwards, refit `model` on the whole pool with the best parameters
    refit=True,
)


Training on fold [0/3]

bestTest = 0.9595733975
bestIteration = 499

Training on fold [1/3]

bestTest = 0.9633131583
bestIteration = 495

Training on fold [2/3]

bestTest = 0.9662699985
bestIteration = 485

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
0:	loss: 0.9630219	best: 0.9630219 (0)	total: 2m 12s	remaining: 52m 59s
Training on fold [0/3]

bestTest = 0.964651778
bestIteration = 499

Training on fold [1/3]

bestTest = 0.9658940429
bestIteration = 474

Training on fold [2/3]

bestTest = 0.9677203239
bestIteration = 486

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
1:	loss: 0.9660043	best: 0.9660043 (1)	total: 4m 19s	remaining: 49m 40s
Training on fold [0/3]

bestTest = 0.9599302676
bestIteration = 499

Training on fold [1/3]

bestTest = 0.9623548431
bestIteration = 488

Training on fold [2/3]

bestTest = 0.

In [6]:
# Summarise the search: the winning parameter set and the highest mean test AUC
# found in the returned cross-validation results.
best_params = search_result["params"]
cv_auc_means = search_result["cv_results"]["test-AUC-mean"]

print("Best params from search:")
print(best_params)
print("Best CV AUC:", max(cv_auc_means))


Best params from search:
{'border_count': 64, 'random_strength': 2, 'depth': 8, 'learning_rate': 0.08, 'l2_leaf_reg': 5, 'iterations': 500}
Best CV AUC: 0.9682862288999282
