In [2]:
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.metrics import average_precision_score
import multiprocessing
from tqdm.auto import tqdm
import scipy.sparse as sp
import numpy as np
from wandb.integration.xgboost import WandbCallback
import os 
import re
import wandb

In [3]:
# Open a W&B run in the sweep's project, tagged 'xgboost', storing
# `neg_sampling_ratio` in the run config.
run = wandb.init(
    project='belka-sweep-unique',
    tags=['xgboost'],
    config={"neg_sampling_ratio": 0.05},
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33msavsunenko-sasha[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
# Sweep definition: Bayesian search that maximizes the logged "map" value over
# tree depth, learning rate and per-tree column subsampling.
sweep_configs = dict(
    method="bayes",
    metric=dict(name="map", goal="maximize"),
    parameters=dict(
        max_depth=dict(values=[5, 6, 7, 8, 9]),
        learning_rate=dict(distribution="uniform", min=0.03, max=0.3),
        colsample_bytree=dict(distribution="uniform", min=0.4, max=0.9),
    ),
)

In [5]:
def extract_uuid(filename):
    """Return the first UUID-shaped substring of ``filename``, or ``None``.

    A UUID here means five hyphen-separated groups of 8, 4, 4, 4 and 12
    hexadecimal characters (upper or lower case).

    Args:
        filename: String to scan.

    Returns:
        The matched UUID text, or ``None`` when the string contains none.
    """
    found = re.search(
        r'[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}',
        filename,
    )
    return found.group() if found else None
    
class Iterator(xgb.DataIter):
  """Feed feature/label shard files to XGBoost one shard at a time.

  Shards are discovered in ``directory`` as files named
  ``<prefix>...<uuid>....npz``.  For each discovered UUID, ``next`` loads the
  sparse feature matrix from ``<prefix><uuid>.npz`` and the labels from
  ``<prefix><uuid>.npl.npy``.

  Args:
    prefix: File-name prefix selecting which shards belong to this iterator.
    directory: Directory that holds the shard files.
  """

  def __init__(self, prefix, directory):
    self.prefix = prefix
    self.directory = directory
    self._it = 0
    files = [f for f in os.listdir(directory) if f.endswith('.npz') and f.startswith(prefix)]
    # os.listdir returns names in arbitrary order, so sort to make the shard
    # order reproducible between runs.  Names without a UUID are skipped
    # (extract_uuid returns None for them, which would otherwise break the
    # path concatenation in next()), and repeated UUIDs are collapsed so a
    # shard is never fed twice.
    found = (extract_uuid(name) for name in files)
    self.uuids = sorted({uuid for uuid in found if uuid is not None})
    # XGBoost will generate some cache files under current directory with the prefix
    # "cache"
    super().__init__(cache_prefix=os.path.join(".", "cache"))

  def next(self, input_data):
    """Advance the iterator by 1 step and pass the data to XGBoost.  This function is
    called by XGBoost during the construction of ``DMatrix``

    Args:
      input_data: Callback supplied by XGBoost; called here with ``data`` and
        ``label`` keyword arguments.

    Returns:
      0 when every shard has been consumed, otherwise 1.
    """
    if self._it == len(self.uuids):
      # return 0 to let XGBoost know this is the end of iteration
      return 0

    # Both files of a shard share the same "<prefix><uuid>" stem.
    stem = os.path.join(self.directory, self.prefix + self.uuids[self._it])
    X = sp.load_npz(stem + '.npz')
    y = np.load(stem + '.npl.npy')
    # input_data is a function passed in by XGBoost who has the exact same signature of
    # ``DMatrix``
    input_data(data=X, label=y)
    self._it += 1
    # Return 1 to let XGBoost know we haven't seen all the files yet.
    return 1

  def reset(self):
    """Reset the iterator to its beginning"""
    self._it = 0

In [6]:
def map_micro(preds, dtrain):
    """Custom evaluation metric: micro-averaged average precision.

    Args:
        preds: Predicted scores, one per label returned by ``dtrain``.
        dtrain: Object exposing ``get_label()``; it is passed to
            ``xgb.train`` as ``custom_metric`` in this notebook.

    Returns:
        The pair ``('map_micro', score)``.
    """
    y_true = dtrain.get_label()
    return 'map_micro', average_precision_score(y_true, preds, average='micro')


In [7]:
# The three splits are read from the same directory and differ only in their
# file-name prefix.
_SPLIT_DIR = '/mnt/fastssd/belka_data/train_split_testr3/'

dtrain = xgb.DMatrix(
    Iterator(prefix="train_", directory=_SPLIT_DIR)
)
dtest_random = xgb.DMatrix(
    Iterator(prefix="random_test_", directory=_SPLIT_DIR)
)
dtest_unique = xgb.DMatrix(
    Iterator(prefix="unique_test_", directory=_SPLIT_DIR)
)

In [8]:
def train_model():
    """Train one XGBoost booster for the current W&B run and log its score.

    Reads ``learning_rate``, ``max_depth`` and ``colsample_bytree`` from the
    run config, trains on the module-level ``dtrain`` while evaluating on
    ``dtrain``, ``dtest_unique`` and ``dtest_random``, logs the booster's
    ``best_score`` under the key ``"map"`` and saves the booster to
    ``unique_best.ubj``.
    """
    # Using the run as a context manager finishes it when the block exits,
    # including when training raises, instead of leaving it open.
    with wandb.init() as run:
        config = run.config

        # Parameters for XGBoost
        params = {
            'objective': 'binary:logistic',
            'eval_metric': 'auc',
            # 'device': 'cuda',
            # 'subsample': config.subsample,
            # 'sampling_method': 'gradient_based',
            # Histogram-based tree construction.  GPU is not enabled while the
            # 'device' line above stays commented out.
            'tree_method': 'hist',
            'learning_rate': config.learning_rate,
            'max_depth': config.max_depth,
            'n_jobs': multiprocessing.cpu_count(),
            'min_child_weight': 1,
            'subsample': 0.8,
            'colsample_bytree': config.colsample_bytree,
            'verbosity': 2,
        }

        # Early stopping (and therefore ``best_score``) follows the LAST entry
        # of this list and the last reported metric, i.e. test_random /
        # map_micro with the order below.
        # NOTE(review): the names in this cell ("unique") suggest test_unique
        # may have been the intended target -- confirm before reordering.
        watchlist = [
            (dtrain, 'train'),
            (dtest_unique, 'test_unique'),
            (dtest_random, 'test_random'),
        ]
        evals_result_unique = {}
        model = xgb.train(
            params,
            dtrain,
            evals=watchlist,
            num_boost_round=100000,
            early_stopping_rounds=30,
            evals_result=evals_result_unique,
            custom_metric=map_micro,
            maximize=True,  # Since MAP is higher the better
            callbacks=[WandbCallback(log_model=True)],
        )

        # Log model performance metrics to W&B ("map" is the sweep's metric name).
        run.log({"map": model.best_score})

        # NOTE(review): every call writes to this same path, so the file holds
        # the booster of whichever run finished most recently.
        model.save_model('unique_best.ubj')
    

In [9]:
# Register the sweep definition with W&B, then start an agent that runs
# train_model for that sweep.
sweep_id = wandb.sweep(
    sweep_configs,
    project="belka-sweep-unique",
)
wandb.agent(
    sweep_id=sweep_id,
    function=train_model,
)



Create sweep with ID: gcpy7242
Sweep URL: https://wandb.ai/savsunenko-sasha/belka-sweep-unique/sweeps/gcpy7242
<IPython.core.display.HTML object>
<IPython.core.display.HTML object>


[34m[1mwandb[0m: Agent Starting Run: 69g3ydg1 with config:
[34m[1mwandb[0m: 	colsample_bytree: 0.4422603497085114
[34m[1mwandb[0m: 	learning_rate: 0.047639091582221245
[34m[1mwandb[0m: 	max_depth: 5


[16:36:22] INFO: /home/conda/feedstock_root/build_artifacts/xgboost-split_1713397827678/work/src/data/./sparse_page_dmatrix.h:147: Make cache:./cache-0x616e98c30eb0.gradient_index.page

[16:36:22] INFO: /home/conda/feedstock_root/build_artifacts/xgboost-split_1713397827678/work/src/data/./sparse_page_dmatrix.h:147: Make cache:./cache-0x616e98c30eb0.gradient_index.page

[16:36:22] INFO: /home/conda/feedstock_root/build_artifacts/xgboost-split_1713397827678/work/src/data/sparse_page_dmatrix.cc:176: Generating new Gradient Index.
[16:37:31] INFO: /home/conda/feedstock_root/build_artifacts/xgboost-split_1713397827678/work/src/data/sparse_page_source.h:240: 53.7892 MB written in 0.0929625 seconds.
[16:37:31] INFO: /home/conda/feedstock_root/build_artifacts/xgboost-split_1713397827678/work/src/data/sparse_page_source.h:240: 59.4769 MB written in 0.10234 seconds.
[16:37:31] INFO: /home/conda/feedstock_root/build_artifacts/xgboost-split_1713397827678/work/src/data/sparse_page_source.h:240: 53.

[0]	train-auc:0.74860	train-map_micro:0.40451	test_unique-auc:0.50595	test_unique-map_micro:0.02242	test_random-auc:0.74804	test_random-map_micro:0.40384
[1]	train-auc:0.77541	train-map_micro:0.45091	test_unique-auc:0.49691	test_unique-map_micro:0.02209	test_random-auc:0.77485	test_random-map_micro:0.45038
[2]	train-auc:0.78044	train-map_micro:0.45455	test_unique-auc:0.51643	test_unique-map_micro:0.02306	test_random-auc:0.78046	test_random-map_micro:0.45523
[3]	train-auc:0.78163	train-map_micro:0.45802	test_unique-auc:0.51643	test_unique-map_micro:0.02306	test_random-auc:0.78154	test_random-map_micro:0.45852
[4]	train-auc:0.78283	train-map_micro:0.46233	test_unique-auc:0.51302	test_unique-map_micro:0.02267	test_random-auc:0.78274	test_random-map_micro:0.46299
[5]	train-auc:0.79134	train-map_micro:0.47382	test_unique-auc:0.51180	test_unique-map_micro:0.02260	test_random-auc:0.79109	test_random-map_micro:0.47438
[6]	train-auc:0.79099	train-map_micro:0.47102	test_unique-auc:0.51182	test_u