In [1]:
import numpy as np
from pathlib import Path

splits_dir = Path("../data/splits")

train = np.load(splits_dir / "train.npz", allow_pickle=True)
val   = np.load(splits_dir / "val.npz",   allow_pickle=True)

# Cast X to float32, y to int
X_train = train["X"].astype(np.float32)
y_train = train["y"].astype(int)

X_val   = val["X"].astype(np.float32)
y_val   = val["y"].astype(int)

print("X_train dtype:", X_train.dtype, "shape:", X_train.shape)
print("X_val   dtype:", X_val.dtype,   "shape:", X_val.shape)


# Feature names need allow_pickle=True
feature_columns = np.load(
    splits_dir / "feature_columns.npy",
    allow_pickle=True
).tolist()

print("Num features:", len(feature_columns))


X_train dtype: float32 shape: (350690, 773)
X_val   dtype: float32 shape: (61887, 773)
Num features: 773


In [2]:
# cols_to_remove = ["isTimelineWork", "isPublicDomain", "objectEndDate", "objectBeginDate", "accessionYear"]
cols_to_remove = ["isTimelineWork", "isPublicDomain", "accessionYear"]

# Get indices of columns we want to drop
drop_indices = [feature_columns.index(col) for col in cols_to_remove]

print("Dropping indices:", drop_indices)

# Sort descending so deleting works without shifting indices
drop_indices_sorted = sorted(drop_indices, reverse=True)

for idx in drop_indices_sorted:
    X_train = np.delete(X_train, idx, axis=1)
    X_val   = np.delete(X_val,   idx, axis=1)

    # Remove from feature column names
    feature_columns.pop(idx)


Dropping indices: [3, 4, 2]


In [3]:
from catboost import CatBoostClassifier, Pool

train_pool = Pool(
    X_train, y_train,
    feature_names=feature_columns
)

val_pool = Pool(
    X_val, y_val,
    feature_names=feature_columns
)


In [4]:
model = CatBoostClassifier(
    iterations=800,          # tuned
    depth=8,                 # tuned
    learning_rate=0.08,      # tuned
    l2_leaf_reg=5,           # tuned
    border_count=64,         # tuned
    random_strength=2,       # tuned
    
    loss_function="Logloss",
    auto_class_weights="Balanced",
    eval_metric="PRAUC",
    task_type="GPU",
    verbose=100
)

model.fit(train_pool, eval_set=val_pool)


Default metric period is 5 because PRAUC is/are not implemented for GPU
Metric PRAUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 0.8977075	test: 0.8980285	best: 0.8980285 (0)	total: 230ms	remaining: 3m 3s
100:	learn: 0.9725375	test: 0.9688477	best: 0.9688477 (100)	total: 13.7s	remaining: 1m 34s
200:	learn: 0.9808924	test: 0.9749948	best: 0.9749948 (200)	total: 26.7s	remaining: 1m 19s
300:	learn: 0.9852614	test: 0.9778355	best: 0.9778355 (300)	total: 39.1s	remaining: 1m 4s
400:	learn: 0.9881296	test: 0.9795880	best: 0.9795880 (400)	total: 51.3s	remaining: 51s
500:	learn: 0.9901007	test: 0.9807863	best: 0.9807863 (500)	total: 1m 3s	remaining: 37.7s
600:	learn: 0.9916965	test: 0.9817450	best: 0.9817450 (600)	total: 1m 15s	remaining: 24.9s
700:	learn: 0.9929101	test: 0.9825049	best: 0.9825049 (700)	total: 1m 26s	remaining: 12.3s
799:	learn: 0.9938765	test: 0.9831095	best: 0.9831095 (799)	total: 1m 38s	remaining: 0us
bestTest = 0.9831094698
bestIteration = 799


<catboost.core.CatBoostClassifier at 0x14f8e121390>

In [5]:
import pandas as pd

importances = model.get_feature_importance()
df_imp = pd.DataFrame({
    "feature": feature_columns,
    "importance": importances
}).sort_values("importance", ascending=False)

df_imp.head(20)

Unnamed: 0,feature,importance
1,objectEndDate,10.951909
0,objectBeginDate,2.71629
232,emb_230,2.66024
496,emb_494,1.371585
78,emb_76,1.271672
131,emb_129,1.103216
96,emb_94,1.00308
268,emb_266,0.935166
742,emb_740,0.734212
461,emb_459,0.715425


In [6]:
from pathlib import Path

models_dir = Path("models")
models_dir.mkdir(exist_ok=True)

model_path = models_dir / "catboost_model_optimized_parameters_PRAUC_1000iter.cbm"
model.save_model(str(model_path))

print("Saved model to:", model_path)


Saved model to: models\catboost_model_optimized_parameters_PRAUC_1000iter.cbm


In [7]:
y_val_proba = model.predict_proba(X_val)[:, 1]


In [8]:
from sklearn.metrics import f1_score

thresholds = np.linspace(0, 1, 501)

best_t = 0
best_f1 = 0

for t in thresholds:
    preds = (y_val_proba >= t).astype(int)
    f1 = f1_score(y_val, preds)
    if f1 > best_f1:
        best_f1 = f1
        best_t = t

print("Best threshold:", best_t)
print("Best F1:", best_f1)


Best threshold: 0.7000000000000001
Best F1: 0.8326105810928014


In [9]:
print(thresholds)

[0.    0.002 0.004 0.006 0.008 0.01  0.012 0.014 0.016 0.018 0.02  0.022
 0.024 0.026 0.028 0.03  0.032 0.034 0.036 0.038 0.04  0.042 0.044 0.046
 0.048 0.05  0.052 0.054 0.056 0.058 0.06  0.062 0.064 0.066 0.068 0.07
 0.072 0.074 0.076 0.078 0.08  0.082 0.084 0.086 0.088 0.09  0.092 0.094
 0.096 0.098 0.1   0.102 0.104 0.106 0.108 0.11  0.112 0.114 0.116 0.118
 0.12  0.122 0.124 0.126 0.128 0.13  0.132 0.134 0.136 0.138 0.14  0.142
 0.144 0.146 0.148 0.15  0.152 0.154 0.156 0.158 0.16  0.162 0.164 0.166
 0.168 0.17  0.172 0.174 0.176 0.178 0.18  0.182 0.184 0.186 0.188 0.19
 0.192 0.194 0.196 0.198 0.2   0.202 0.204 0.206 0.208 0.21  0.212 0.214
 0.216 0.218 0.22  0.222 0.224 0.226 0.228 0.23  0.232 0.234 0.236 0.238
 0.24  0.242 0.244 0.246 0.248 0.25  0.252 0.254 0.256 0.258 0.26  0.262
 0.264 0.266 0.268 0.27  0.272 0.274 0.276 0.278 0.28  0.282 0.284 0.286
 0.288 0.29  0.292 0.294 0.296 0.298 0.3   0.302 0.304 0.306 0.308 0.31
 0.312 0.314 0.316 0.318 0.32  0.322 0.324 0.326 0.328