In [1]:
import numpy as np
from pathlib import Path

splits_dir = Path("../data/splits")


def load_split(split_name):
    """Load one split archive and return its ``(X, y)`` arrays.

    Reads ``<splits_dir>/<split_name>.npz``, casts ``X`` to float32 and ``y``
    to int, and closes the archive before returning so that no file handle
    stays open for the lifetime of the kernel.

    Parameters
    ----------
    split_name : str
        Base name of the archive without the ``.npz`` suffix.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        The float32 feature array and the int label array.
    """
    # NOTE(review): allow_pickle=True lets np.load unpickle object arrays,
    # which can execute arbitrary code -- only load split files you trust.
    # If X and y are stored as plain numeric arrays the flag is presumably
    # unnecessary here; confirm against the code that writes the splits.
    with np.load(splits_dir / f"{split_name}.npz", allow_pickle=True) as archive:
        # astype() materialises new arrays, so they remain valid after the
        # archive is closed.
        features = archive["X"].astype(np.float32)
        labels = archive["y"].astype(int)
    return features, labels


# Cast X to float32, y to int
X_train, y_train = load_split("train")
X_val,   y_val   = load_split("val")
X_test,  y_test  = load_split("test")

print("X_train dtype:", X_train.dtype, "shape:", X_train.shape)
print("X_val   dtype:", X_val.dtype,   "shape:", X_val.shape)
print("X_test  dtype:", X_test.dtype,  "shape:", X_test.shape)


# Feature names need allow_pickle=True
feature_columns = np.load(
    splits_dir / "feature_columns.npy",
    allow_pickle=True
).tolist()

print("Num features:", len(feature_columns))

# Columns are later addressed by their position in feature_columns, so a
# mismatch between the name list and the matrices would silently select the
# wrong columns. Fail loudly here instead.
assert X_train.shape[1] == len(feature_columns), (
    "feature_columns length does not match the number of columns in X_train"
)
assert X_val.shape[1] == X_train.shape[1] == X_test.shape[1], (
    "train/val/test splits disagree on the number of feature columns"
)


X_train dtype: float32 shape: (350690, 773)
X_val   dtype: float32 shape: (61887, 773)
X_test  dtype: float32 shape: (72808, 773)
Num features: 773


In [2]:
cols_to_remove = ["isTimelineWork", "isPublicDomain", "objectEndDate", "objectBeginDate", "accessionYear"]

# Column positions are looked up in feature_columns, so the name list and the
# matrices must still be aligned before anything is deleted.
assert X_train.shape[1] == len(feature_columns), (
    "feature_columns is out of sync with X_train; re-run the loading cell"
)

# Names that are no longer present are skipped rather than raising, so
# re-running this cell after the columns have been dropped is a no-op.
missing_cols = [col for col in cols_to_remove if col not in feature_columns]
if missing_cols:
    print("Skipping columns not present (already removed or misspelled):", missing_cols)

# Get indices of columns we want to drop
drop_indices = [feature_columns.index(col) for col in cols_to_remove if col in feature_columns]

print("Dropping indices:", drop_indices)

if drop_indices:
    # np.delete accepts the whole index list at once, so each matrix is copied
    # a single time instead of once per removed column, and the order of the
    # indices does not matter.
    X_train = np.delete(X_train, drop_indices, axis=1)
    X_val   = np.delete(X_val,   drop_indices, axis=1)
    X_test  = np.delete(X_test,  drop_indices, axis=1)

    # Remove the same positions from the feature column names
    dropped_positions = set(drop_indices)
    feature_columns = [
        name for position, name in enumerate(feature_columns)
        if position not in dropped_positions
    ]


Dropping indices: [3, 4, 1, 0, 2]


In [3]:
from catboost import CatBoostClassifier, Pool

# Wrap the training and validation arrays in CatBoost Pools; both pools are
# labelled with the same feature-name list.
train_pool = Pool(data=X_train, label=y_train, feature_names=feature_columns)
val_pool = Pool(data=X_val, label=y_val, feature_names=feature_columns)


In [4]:
# Hyperparameters collected in one mapping so they can be read at a glance.
catboost_params = {
    "iterations": 600,
    "depth": 6,
    "learning_rate": 0.05,
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "GPU",
    "verbose": 100,
}
model = CatBoostClassifier(**catboost_params)

# The validation pool is supplied as the evaluation set for training.
model.fit(train_pool, eval_set=val_pool)


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8242583	best: 0.8242583 (0)	total: 200ms	remaining: 1m 59s
100:	test: 0.9487967	best: 0.9487967 (100)	total: 10.4s	remaining: 51.6s
200:	test: 0.9597311	best: 0.9597311 (200)	total: 20.2s	remaining: 40.1s
300:	test: 0.9651571	best: 0.9651571 (300)	total: 29.7s	remaining: 29.5s
400:	test: 0.9683352	best: 0.9683352 (400)	total: 39.2s	remaining: 19.4s
500:	test: 0.9705635	best: 0.9705635 (500)	total: 48.2s	remaining: 9.53s
599:	test: 0.9722587	best: 0.9722587 (599)	total: 57.1s	remaining: 0us
bestTest = 0.9722587168
bestIteration = 599


<catboost.core.CatBoostClassifier at 0x1869b450ad0>

In [5]:
import pandas as pd

# Pair every feature name with the importance score reported by the fitted
# model, then rank the table from most to least important.
importances = model.get_feature_importance()

df_imp = pd.DataFrame({"feature": feature_columns, "importance": importances})
df_imp = df_imp.sort_values(by="importance", ascending=False)

# Show the 20 highest-ranked features.
df_imp.head(20)

Unnamed: 0,feature,importance
230,emb_230,6.502275
494,emb_494,3.284797
176,emb_176,2.550755
76,emb_76,2.025252
95,emb_95,1.95462
129,emb_129,1.952849
318,emb_318,1.831387
744,emb_744,1.670633
740,emb_740,1.545812
362,emb_362,1.43647
