In [1]:
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.metrics import average_precision_score
import multiprocessing
from tqdm.auto import tqdm
import scipy.sparse as sp
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Training split: features come from a SciPy .npz sparse matrix, targets from
# a NumPy .npy array (later passed to xgb.DMatrix as the label vector).
X_train, Y_train = (
    sp.load_npz('/mnt/fastssd/belka_data/ready_data/train__m.npz'),
    np.load('/mnt/fastssd/belka_data/ready_data/train__t.npy'),
)


In [3]:
# "random" hold-out split: sparse feature matrix and matching target array.
# NOTE(review): presumably a random sample of the same distribution as the
# training data — confirm against the data-preparation step.
X_test_random, Y_test_random = (
    sp.load_npz('/mnt/fastssd/belka_data/ready_data/random_test__m.npz'),
    np.load('/mnt/fastssd/belka_data/ready_data/random_test__t.npy'),
)

In [4]:
# "unique" hold-out split: sparse feature matrix and matching target array.
# NOTE(review): presumably rows unseen in training (e.g. novel molecules) —
# confirm against the data-preparation step.
X_test_unique, Y_test_unique = (
    sp.load_npz('/mnt/fastssd/belka_data/ready_data/unique_test__m.npz'),
    np.load('/mnt/fastssd/belka_data/ready_data/unique_test__t.npy'),
)

In [5]:
def map_micro(preds, dtrain):
    """Custom XGBoost evaluation metric: micro-averaged average precision.

    Parameters
    ----------
    preds : array-like
        Model scores for the rows of ``dtrain``.
    dtrain : xgb.DMatrix
        The matrix being evaluated (despite the name, it is whichever eval
        set XGBoost is currently scoring); only its labels are read.

    Returns
    -------
    tuple
        ``('map_micro', score)`` — the metric name and its value.
    """
    return (
        'map_micro',
        average_precision_score(dtrain.get_label(), preds, average='micro'),
    )



In [6]:
# Wrap each split (sparse features + labels) in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(data=X_train, label=Y_train)
dtest_random = xgb.DMatrix(data=X_test_random, label=Y_test_random)
dtest_unique = xgb.DMatrix(data=X_test_unique, label=Y_test_unique)

In [None]:
# Booster configuration: binary logistic objective, histogram tree builder
# running on the GPU ('device': 'cuda').
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'device': 'cuda',
    'tree_method': 'hist',  # histogram-based split finding (on the CUDA device)
    'learning_rate': 0.003,
    'max_depth': 6,
    'n_jobs': multiprocessing.cpu_count(),
    'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'verbosity': 2,
}
evals_result = {}

# Early stopping in xgb.train is driven by the LAST metric of the LAST entry
# in `evals`. Here that is map_micro (the custom metric is reported after the
# built-in 'auc') on 'test_unique', so the order of this list is significant.
# num_boost_round is deliberately huge: early stopping is the real stop rule.
model = xgb.train(
    params,
    dtrain,
    evals=[
        (dtrain, 'train'),
        (dtest_random, 'test_random'),
        (dtest_unique, 'test_unique'),  # last => early-stopping target
    ],
    num_boost_round=1000000,
    early_stopping_rounds=10,
    evals_result=evals_result,
    custom_metric=map_micro,
    maximize=True,  # MAP: higher is better
)

# xgb.train returns the booster from the final boosting round, which includes
# the trailing non-improving rounds that triggered early stopping. Slice down
# to the best iteration so the saved file really holds the best model.
model[: model.best_iteration + 1].save_model('unique_best.xgb')

[15:39:27] INFO: /home/conda/feedstock_root/build_artifacts/xgboost-split_1713397827678/work/src/data/simple_dmatrix.cc:137: Generating new Ellpack page.
[0]	train-auc:0.75424	train-map_micro:0.52841	test_random-auc:0.76225	test_random-map_micro:0.53929	test_unique-auc:0.47696	test_unique-map_micro:0.02250
[1]	train-auc:0.76971	train-map_micro:0.55050	test_random-auc:0.77642	test_random-map_micro:0.55987	test_unique-auc:0.74844	test_unique-map_micro:0.13059
[2]	train-auc:0.76987	train-map_micro:0.55210	test_random-auc:0.77664	test_random-map_micro:0.56142	test_unique-auc:0.74891	test_unique-map_micro:0.13060
[3]	train-auc:0.77714	train-map_micro:0.56252	test_random-auc:0.78381	test_random-map_micro:0.57148	test_unique-auc:0.74891	test_unique-map_micro:0.13060
[4]	train-auc:0.78478	train-map_micro:0.57070	test_random-auc:0.79069	test_random-map_micro:0.57884	test_unique-auc:0.74891	test_unique-map_micro:0.13060
[5]	train-auc:0.78372	train-map_micro:0.57111	test_random-auc:0.78969	test_r

KeyboardInterrupt: 

In [None]:
# Booster configuration: binary logistic objective, histogram tree builder
# running on the GPU ('device': 'cuda'). Uses a larger learning rate (0.01).
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'device': 'cuda',
    'tree_method': 'hist',  # histogram-based split finding (on the CUDA device)
    'learning_rate': 0.01,
    'max_depth': 6,
    'n_jobs': multiprocessing.cpu_count(),
    'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'verbosity': 2,
}
evals_result = {}

# Early stopping in xgb.train is driven by the LAST metric of the LAST entry
# in `evals`. Here that is map_micro (the custom metric is reported after the
# built-in 'auc') on 'test_random', so the order of this list is significant.
# num_boost_round is deliberately huge: early stopping is the real stop rule.
model = xgb.train(
    params,
    dtrain,
    evals=[
        (dtrain, 'train'),
        (dtest_unique, 'test_unique'),
        (dtest_random, 'test_random'),  # last => early-stopping target
    ],
    num_boost_round=1000000,
    early_stopping_rounds=10,
    evals_result=evals_result,
    custom_metric=map_micro,
    maximize=True,  # MAP: higher is better
)

# xgb.train returns the booster from the final boosting round, which includes
# the trailing non-improving rounds that triggered early stopping. Slice down
# to the best iteration so the saved file holds the best-scoring model.
model[: model.best_iteration + 1].save_model('unique_random.xgb')