In [1]:
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.metrics import average_precision_score
import multiprocessing
from tqdm.auto import tqdm
import scipy.sparse as sp
import numpy as np
from wandb.integration.xgboost import WandbCallback
import os 
import re
import wandb

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Start the Weights & Biases run that the training callback logs to.
run = wandb.init(
    project='belka-xgb',
    tags=['xgboost'],
    config={"neg_sampling_ratio": 0.15},
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Currently logged in as: savsunenko-sasha. Use `wandb login --relogin` to force relogin


In [3]:
def extract_uuid(filename):
    """Return the first UUID found in ``filename``, or ``None`` when there is none.

    A UUID is recognised as five hyphen-separated groups of hexadecimal digits
    of lengths 8-4-4-4-12; both upper- and lower-case digits are accepted.
    """
    # re.search scans the whole string, so the UUID may sit anywhere in the name.
    found = re.search(
        r'[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}',
        filename,
    )
    return found.group() if found else None
    
class Iterator(xgb.DataIter):
  """Feed on-disk shards to XGBoost one at a time.

  Shards are discovered by listing ``directory`` for files whose names start with
  ``prefix`` and end with ``.npz``; each shard is identified by the UUID embedded in
  its file name.  For a UUID ``u`` the features are loaded from ``<prefix><u>.npz``
  (via ``scipy.sparse.load_npz``) and the labels from ``<prefix><u>.npl.npy``
  (via ``numpy.load``), both inside ``directory``.
  """

  def __init__(self, prefix, directory):
    """Collect the shard UUIDs and initialise the underlying ``xgb.DataIter``.

    Args:
      prefix: leading part of every shard file name.
      directory: folder that holds the feature and label files.
    """
    self.prefix = prefix
    self.directory = directory
    self._it = 0
    shard_files = [f for f in os.listdir(directory) if f.endswith('.npz') and f.startswith(prefix)]
    # os.listdir returns entries in arbitrary order, so sort to feed the shards in the
    # same order on every run.  Names without a UUID are dropped: they would yield
    # None here and make the path concatenation in next() fail with a TypeError.
    self.uuids = sorted(
      uuid for uuid in (extract_uuid(name) for name in shard_files) if uuid is not None
    )
    # XGBoost will generate some cache files under current directory with the prefix
    # "cache"
    super().__init__(cache_prefix=os.path.join(".", "cache"))

  def next(self, input_data):
    """Advance the iterator by 1 step and pass the data to XGBoost.  This function is
    called by XGBoost during the construction of ``DMatrix``

    Args:
      input_data: callback supplied by XGBoost; it is called here with ``data`` and
        ``label`` keyword arguments.

    Returns:
      0 when every shard has been consumed, otherwise 1.
    """
    if self._it == len(self.uuids):
      # return 0 to let XGBoost know this is the end of iteration
      return 0

    # Both files of a shard share the same "<prefix><uuid>" stem.
    stem = os.path.join(self.directory, self.prefix + self.uuids[self._it])
    X = sp.load_npz(stem + '.npz')
    y = np.load(stem + '.npl.npy')
    input_data(data=X, label=y)
    self._it += 1
    # Return 1 to let XGBoost know we haven't seen all the files yet.
    return 1

  def reset(self):
    """Reset the iterator to its beginning"""
    self._it = 0

In [4]:
def map_micro(preds, dtrain):
    """Custom evaluation metric: average precision with ``average='micro'``.

    Args:
        preds: predictions for the rows of ``dtrain``.
        dtrain: matrix being evaluated; its labels serve as the ground truth.

    Returns:
        A ``(name, value)`` tuple whose name is ``'map_micro'``.
    """
    y_true = dtrain.get_label()
    return 'map_micro', average_precision_score(y_true, preds, average='micro')


In [7]:
# Build the training matrix from the shard iterator instead of one in-memory array.
dtrain = xgb.DMatrix(
    Iterator(
        prefix="train_full_",
        directory='/mnt/fastssd/belka_data/train_full/',
    )
)

In [8]:
# Booster hyper-parameters.  No 'device' entry is active (the 'cuda' line below is
# commented out), so the 'hist' tree method runs on XGBoost's default device.
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    # Disabled options, kept for reference:
    # 'device': 'cuda',
    # 'subsample': wandb.config.subsample,
    # 'sampling_method': 'gradient_based',
    'tree_method': 'hist',
    'learning_rate': 0.25,
    'max_depth': 9,
    'n_jobs': multiprocessing.cpu_count(),  # one thread per reported CPU
    'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.7,
    'verbosity': 2,
}

# Filled in by xgb.train with the per-round metric values of the 'train' set.
evals_result_unique = {}

model = xgb.train(
    params,
    dtrain,
    num_boost_round=85,
    evals=[(dtrain, 'train')],
    evals_result=evals_result_unique,
    custom_metric=map_micro,
    maximize=True,  # average precision: higher is better
    callbacks=[WandbCallback(log_model=True)],
)

# Persist the trained booster to the working directory.
model.save_model('random_best.ubj')

[23:00:02] INFO: /home/conda/feedstock_root/build_artifacts/xgboost-split_1713397827678/work/src/data/./sparse_page_dmatrix.h:147: Make cache:./cache-0x55ccc4d8fe40.gradient_index.page

[23:00:02] INFO: /home/conda/feedstock_root/build_artifacts/xgboost-split_1713397827678/work/src/data/./sparse_page_dmatrix.h:147: Make cache:./cache-0x55ccc4d8fe40.gradient_index.page

[23:00:02] INFO: /home/conda/feedstock_root/build_artifacts/xgboost-split_1713397827678/work/src/data/sparse_page_dmatrix.cc:176: Generating new Gradient Index.
[23:01:02] INFO: /home/conda/feedstock_root/build_artifacts/xgboost-split_1713397827678/work/src/data/sparse_page_source.h:240: 42.303 MB written in 0.124 seconds.
[23:01:02] INFO: /home/conda/feedstock_root/build_artifacts/xgboost-split_1713397827678/work/src/data/sparse_page_source.h:240: 41.9109 MB written in 0.113088 seconds.
[23:01:02] INFO: /home/conda/feedstock_root/build_artifacts/xgboost-split_1713397827678/work/src/data/sparse_page_source.h:240: 44.1969