In [1]:
from ray import train, tune
from ray.tune import JupyterNotebookReporter
from ray.tune.integration.xgboost import TuneReportCheckpointCallback
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.optuna import OptunaSearch
from ray.tune.search import ConcurrencyLimiter
import xgboost as xgb
import numpy as np
import pandas as pd
import os
import pickle
from functools import partial

In [7]:
import sklearn.datasets
from sklearn.metrics import roc_auc_score
import os
from ray.tune.schedulers import ASHAScheduler
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.utils.class_weight import compute_class_weight

from ray import train, tune
from ray.tune.integration.xgboost import TuneReportCheckpointCallback


def train_commit(config: dict):
    """Ray Tune trainable: fit XGBoost on the OpenStack training pickle.

    The pickled dataset is accessed positionally: element 1 holds the value
    compared against the validation period, element 4 the feature rows and
    element 5 the labels. Rows whose period equals 3 become the validation
    set; every other row is used for training.

    Args:
        config: XGBoost parameter dict for this trial; passed unchanged to
            ``xgb.train``.

    Returns:
        None. Evaluation results on the validation set are handed to Tune by
        ``TuneReportCheckpointCallback``, which is configured to store the
        model under the file name ``model.xgb``.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only load files
    # from a trusted source.
    with open('C:/Users/masak/workspace/lab/thesis_data2/learning_process/resource/openstack_train.pkl', 'rb') as f_train:
        tr_dataset = pickle.load(f_train)

    periods = tr_dataset[1]
    features = tr_dataset[4]
    labels = tr_dataset[5]

    # Single hold-out period used for validation.
    va_period = 3
    tr_x, tr_y, va_x, va_y = [], [], [], []
    for row, period in enumerate(periods):
        if period == va_period:
            va_x.append(features[row])
            va_y.append(labels[row])
        else:
            tr_x.append(features[row])
            tr_y.append(labels[row])

    # Wrap both splits in XGBoost's input container.
    train_set = xgb.DMatrix(tr_x, label=tr_y)
    valid_set = xgb.DMatrix(va_x, label=va_y)

    # The callback forwards the "eval" metrics to Tune after each round.
    tune_callback = TuneReportCheckpointCallback(filename="model.xgb")
    xgb.train(
        config,
        train_set,
        evals=[(valid_set, "eval")],
        verbose_eval=False,
        callbacks=[tune_callback],
    )


def get_best_model_checkpoint(results):
    """Load the booster saved by the best trial and print its metrics.

    Args:
        results: The object returned by ``Tuner.fit()``. ``get_best_result()``
            is called without arguments, so the metric and mode are not
            chosen here.

    Returns:
        An ``xgb.Booster`` loaded from the ``model.xgb`` file inside the best
        trial's checkpoint directory.

    Raises:
        RuntimeError: If the best trial has no checkpoint attached, so there
            is no saved model to load.
    """
    best_result = results.get_best_result()

    # A result's checkpoint is optional. Without this guard a missing
    # checkpoint surfaces as an opaque AttributeError on None.
    best_checkpoint = best_result.checkpoint
    if best_checkpoint is None:
        raise RuntimeError(
            "The best trial has no checkpoint, so no model can be loaded. "
            "It may have been stopped before a checkpoint was written."
        )

    best_bst = xgb.Booster()
    # The checkpoint directory is only guaranteed to exist inside the
    # context manager, so the model is loaded before leaving it.
    with best_checkpoint.as_directory() as best_checkpoint_dir:
        best_bst.load_model(os.path.join(best_checkpoint_dir, "model.xgb"))

    # Lists the metric names available on the best result.
    print(best_result.metrics.keys())
    auc = best_result.metrics["eval-auc"]
    print(f"Best model parameters: {best_result.config}")
    print(f'Best model logloss: {best_result.metrics["eval-logloss"]:.4f}')
    print(f"Best model total auc: {auc:.4f}")
    return best_bst


def tune_xgboost(test=False):
    """Run the Ray Tune hyperparameter search for ``train_commit``.

    Only ``eta``, ``alpha`` and ``lambda`` are sampled; every other XGBoost
    parameter is held constant. Earlier, now removed, drafts of the search
    space also varied ``max_depth``, ``min_child_weight``, ``subsample``,
    ``colsample_bytree`` and ``gamma``.

    Args:
        test: When truthy, run a single sample instead of 10 (quick smoke
            run).

    Returns:
        The result object produced by ``Tuner.fit()``.
    """
    # Candidate strengths shared by the alpha and lambda regularisation terms.
    reg_strengths = (1e-5, 1e-2, 0.1, 1, 100)

    # Constants and search-space objects can be mixed in the same dict.
    search_space = {
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "auc"],
        "max_depth": 8,
        "min_child_weight": 5,
        "subsample": 0.85,
        "eta": tune.choice([0.1, 0.01, 0.001]),
        "colsample_bytree": 0.7,
        "gamma": 0.06,
        "alpha": tune.choice(list(reg_strengths)),
        "lambda": tune.choice(list(reg_strengths)),
    }

    # ASHA stops poorly performing trials early; a trial runs for at most
    # 10 training iterations.
    asha = ASHAScheduler(max_t=10, grace_period=1, reduction_factor=2)

    if test:
        n_trials = 1
    else:
        n_trials = 10

    tuning_setup = tune.TuneConfig(
        metric="eval-logloss",
        mode="min",
        scheduler=asha,
        num_samples=n_trials,
    )
    tuner = tune.Tuner(
        train_commit,
        tune_config=tuning_setup,
        param_space=search_space,
    )
    return tuner.fit()

# Run the hyperparameter search, then restore the booster of the best trial.
results = tune_xgboost()
best_bst = get_best_model_checkpoint(results)

# Held-out test data: element 4 is fed to the model as features and
# element 5 is used as the ground-truth labels for the AUC below.
# NOTE(review): pickle.load can execute arbitrary code; trusted files only.
with open('../resource/openstack_test.pkl', 'rb') as f_test:
    te_dataset = pickle.load(f_test)

test = xgb.DMatrix(np.array(te_dataset[4]))

# Score the test rows with the restored booster.
pred = best_bst.predict(test)

test_auc = roc_auc_score(np.array(te_dataset[5]), pred)
print(f'Test data auc {test_auc:.4f}')

0,1
Current time:,2023-11-19 14:37:41
Running for:,00:00:17.40
Memory:,11.7/15.7 GiB

Trial name,status,loc,alpha,eta,lambda,iter,total time (s),eval-logloss,eval-auc
train_commit_b7444_00000,TERMINATED,127.0.0.1:41516,1e-05,0.01,1e-05,1,12.793,0.498275,0.684298
train_commit_b7444_00001,TERMINATED,127.0.0.1:24916,0.1,0.001,100.0,1,12.6355,0.499096,0.654574
train_commit_b7444_00002,TERMINATED,127.0.0.1:42780,1e-05,0.1,0.1,10,12.7282,0.46791,0.711884
train_commit_b7444_00003,TERMINATED,127.0.0.1:30852,1.0,0.01,1.0,1,12.8298,0.49836,0.679953
train_commit_b7444_00004,TERMINATED,127.0.0.1:22808,100.0,0.1,100.0,1,12.6301,0.497219,0.644176
train_commit_b7444_00005,TERMINATED,127.0.0.1:29072,0.1,0.1,1.0,2,12.7392,0.485241,0.706261
train_commit_b7444_00006,TERMINATED,127.0.0.1:34128,100.0,0.1,0.01,1,12.7415,0.496166,0.654848
train_commit_b7444_00007,TERMINATED,127.0.0.1:40092,1.0,0.001,1.0,1,12.8867,0.499063,0.679953
train_commit_b7444_00008,TERMINATED,127.0.0.1:40496,1e-05,0.1,0.1,10,11.1195,0.46791,0.711884
train_commit_b7444_00009,TERMINATED,127.0.0.1:42196,100.0,0.001,1.0,1,12.1122,0.499109,0.654848


[36m(train_commit pid=40496)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/masak/ray_results/train_commit_2023-11-19_14-37-23/train_commit_b7444_00008_8_alpha=0.0000,eta=0.1000,lambda=0.1000_2023-11-19_14-37-23/checkpoint_000000)
2023-11-19 14:37:41,210	INFO tune.py:1047 -- Total run time: 17.45 seconds (17.27 seconds for the tuning loop).


odict_keys(['eval-logloss', 'eval-auc', 'timestamp', 'checkpoint_dir_name', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'time_this_iter_s', 'time_total_s', 'pid', 'hostname', 'node_ip', 'config', 'time_since_restore', 'iterations_since_restore', 'experiment_tag'])
Best model parameters: {'objective': 'binary:logistic', 'eval_metric': ['logloss', 'auc'], 'max_depth': 8, 'min_child_weight': 5, 'subsample': 0.85, 'eta': 0.1, 'colsample_bytree': 0.7, 'gamma': 0.06, 'alpha': 1e-05, 'lambda': 0.1}
Best model logloss: 0.4679
Best model total auc: 0.7119
Test data auc 0.7580
