<a href="https://colab.research.google.com/github/Base-R-Best-R/Auction/blob/main/Code/Models/Colab/XGboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Load (or reload) the rpy2 IPython extension; the later cells in this
# notebook rely on its %%R cell magic to run R code from Python.
%reload_ext rpy2.ipython

In [None]:
# Mount Google Drive at /content/drive so that the R cells can read and write
# files below the relative path "drive/MyDrive/...".
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# R
%%R

# install xgboost only when it is not available yet, so that re-running this
# cell does not download and rebuild the package again; then attach it
if (!requireNamespace("xgboost", quietly = TRUE)) {
  install.packages("xgboost")
}
library(xgboost)

In [None]:
# R
%%R

# read the stored list of data sets from the mounted drive
dat_aucs_eng <- readRDS("drive/MyDrive/Colab Transfer/Aucs_df_feateng_split.RDS")

# for every element: build the label vector, the feature matrix and the
# matching xgb.DMatrix
dat_aucs_mod <- lapply(dat_aucs_eng, function(split_df) {

  # label: winning bid divided by 1e3 (taken before the column is dropped)
  target <- split_df$Winning_Bid / 1e3

  # columns that must not enter the feature matrix
  for (col in c("Contract_ID", "MLOT", "EW_Diff", "Winning_Bid")) {
    split_df[[col]] <- NULL
  }

  # put the engineer estimate on the same 1e3 scale as the label
  split_df$Eng_Est <- split_df$Eng_Est / 1e3

  # model matrix of all remaining columns, without intercept
  feat_mat <- model.matrix(~ . + 0, data = split_df)

  # label, encoded features and the xgboost container built from both
  list(
    "Label" = target,
    "Features" = feat_mat,
    "XGB_Matrix" = xgb.DMatrix(data = feat_mat, label = target)
  )
})

# first tuning grid: full factorial combination of all listed values
XGgrid <- expand.grid(
  objective        = "reg:squarederror",
  booster          = "gbtree",
  eta              = seq(from = 0.05, to = 0.25, by = 0.1),
  gamma            = seq(from = 0.1, to = 9, by = 3),
  max_depth        = seq(from = 5, to = 100, by = 25),
  min_child_weight = seq(from = 3, to = 9, by = 3),
  subsample        = seq(from = 0.6, to = 1, by = 0.3),
  colsample_bytree = seq(from = 0.6, to = 1, by = 0.3),
  lambda           = seq(from = 0.1, to = 9, by = 3),
  alpha            = seq(from = 0.1, to = 9, by = 3)
)

In [None]:
# R
%%R

# Grid-search cross-validation for xgboost.
#
# Calls `xgb.cv()` once per row of `tuning_grid` and collects, for every row,
# the evaluation-log entry of the best iteration together with the
# hyper-parameters that produced it.
#
# Arguments:
#   nrounds, print_every_n, nfold, early_stopping_rounds, maximize, metrics,
#   verbosity: forwarded unchanged to `xgb.cv()`.
#   seed:        RNG seed that is set before every `xgb.cv()` call, so that
#                all candidates are evaluated on the same folds.
#   data:        forwarded as `data` to `xgb.cv()`.
#   tuning_grid: data.frame (e.g. from `expand.grid()`) with one row per
#                candidate; the column names are used as parameter names.
#
# Returns:
#   A matrix with one column per grid row and the rows "iter",
#   "test_<metric>_mean" (one per entry of `metrics`) followed by one row per
#   grid column. The matrix is of type character as soon as the grid holds a
#   non-numeric column (e.g. "objective").
xgb.cv_opt <- \(nrounds = 1e3, print_every_n = 2e3, nfold = 5, 
                early_stopping_rounds = 10, maximize = FALSE, metrics = c("mae", "rmse"), 
                verbosity = 0, seed = 33, data, tuning_grid){
  
  # typed view of the grid (used for the parameters handed to xgb.cv)
  grid_df <- if (is.data.frame(tuning_grid)) {
    tuning_grid
  } else {
    as.data.frame(tuning_grid, stringsAsFactors = FALSE)
  }
  
  # matrix view of the grid (used for the returned summary only); for a grid
  # with non-numeric columns this is a character matrix
  grid_mat <- as.matrix(tuning_grid)
  
  # nmodels
  nmod <- nrow(grid_df)
  
  # an empty grid has nothing to fit
  if (nmod < 1L) {
    stop("`tuning_grid` must contain at least one row.", call. = FALSE)
  }
  
  # metric subset vector
  sub_pst <- paste0("test_", metrics, "_mean")
  
  # print 
  cat(paste0(Sys.time(), ", starting CV: ", nmod, " models to fit!\n\n"))
  
  # over tuning grid rows (by index, so that column types are preserved and
  # no counter has to be modified from inside the worker)
  fits <- lapply(seq_len(nmod), \(i){
    
    # parameters to list: numeric values stay numeric, factors (the default
    # of expand.grid for strings) become plain character values
    param_lst <- lapply(grid_df, \(col){
      val <- col[[i]]
      if (is.factor(val)) as.character(val) else val
    })
    
    # seed (s.t. we may compare the different models across the same folds)
    set.seed(seed)
    
    # cv 
    cv_tmp <- xgb.cv(params = param_lst, nrounds = nrounds, 
                     nfold = nfold, print_every_n = print_every_n,
                     early_stopping_rounds = early_stopping_rounds,
                     maximize = maximize, metrics = metrics, verbosity = verbosity,
                     data = data)
    
    # print current State
    cat(paste0(i, "/", nmod, " fit!\n"))
    
    # extract best iteration
    best <- cv_tmp[["evaluation_log"]][cv_tmp[["best_iteration"]], c("iter", sub_pst), with = FALSE] |> 
      data.matrix()
    
    # best iteration + input parameters as one column of the result
    c(as.vector(best), grid_mat[i, ])
  })
  
  # one column per model
  tmp <- do.call(cbind, fits)
  
  # rownames (columns keep the row names of the grid, if it has any)
  dimnames(tmp) <- list(c("iter", sub_pst, colnames(tuning_grid)),
                        rownames(grid_mat))
  
  # return
  tmp
}

In [None]:
# R
%%R

# run 
# res <- xgb.cv_opt(data = dat_aucs_mod[["Train"]][["XGB_Matrix"]], tuning_grid = XGgrid)

# save 
# saveRDS(res, "drive/MyDrive/Master_Thesis/Models_MT/XGBoost_feateng.RDS")

In [None]:
# R
%%R

# second tuning grid: full factorial combination of all listed values
XGgrid_r2 <- expand.grid(
  objective        = "reg:squarederror",
  booster          = "gbtree",
  eta              = seq(from = 0.01, to = 0.05, by = 0.02),
  gamma            = c(0.05, 0.1, 0.5, 1),
  max_depth        = c(3, 5, 10),
  min_child_weight = seq(from = 1, to = 5, by = 2),
  subsample        = seq(from = 0.9, to = 1, by = 0.1),
  colsample_bytree = seq(from = 0.9, to = 1, by = 0.1),
  lambda           = seq(from = 5, to = 10, by = 2),
  alpha            = seq(from = 2, to = 4, by = 2)
)

# rerun CV
# res <- xgb.cv_opt(data = dat_aucs_mod[["Train"]][["XGB_Matrix"]], tuning_grid = XGgrid_r2)

# save 
# saveRDS(res, "drive/MyDrive/Master_Thesis/Models_MT/CV_XGBoost_feateng_r2.RDS")

In [None]:
# R
%%R

# third tuning grid: full factorial combination of all listed values
XGgrid_r3 <- expand.grid(
  objective        = "reg:squarederror",
  booster          = "gbtree",
  eta              = seq(from = 0.04, to = 0.06, by = 0.01),
  gamma            = c(0.03, 0.05, 0.1),
  max_depth        = c(4, 5, 6),
  min_child_weight = 1,
  subsample        = seq(from = 0.85, to = 0.95, by = 0.05),
  colsample_bytree = seq(from = 0.85, to = 0.95, by = 0.05),
  lambda           = seq(from = 6, to = 8, by = 1),
  alpha            = seq(from = 3, to = 7, by = 2)
)

# rerun CV
# res <- xgb.cv_opt(data = dat_aucs_mod[["Train"]][["XGB_Matrix"]], tuning_grid = XGgrid_r3)

# save 
# saveRDS(res, "drive/MyDrive/Master_Thesis/Models_MT/CV_XGBoost_feateng_r3.RDS")

In [None]:
# R
%%R

# fourth tuning grid: full factorial combination of all listed values
XGgrid_r4 <- expand.grid(
  objective        = "reg:squarederror",
  booster          = "gbtree",
  eta              = seq(from = 0.02, to = 0.04, by = 0.01),
  gamma            = c(0.03, 0.04, 0.01),  # three explicit values, not a seq()
  max_depth        = c(6, 7),
  min_child_weight = 1,
  subsample        = 0.9,
  colsample_bytree = 0.9,
  lambda           = seq(from = 8, to = 11, by = 1),
  alpha            = seq(from = 2, to = 4, by = 1)
)

# rerun CV
# res <- xgb.cv_opt(data = dat_aucs_mod[["Train"]][["XGB_Matrix"]], tuning_grid = XGgrid_r4)

# save 
# saveRDS(res, "drive/MyDrive/Master_Thesis/Models_MT/CV_XGBoost_feateng_r4.RDS")


In [None]:
# R
%%R

# final tuning grid: full factorial combination of all listed values
XGgrid_r5 <- expand.grid(
  objective        = "reg:squarederror",
  booster          = "gbtree",
  eta              = 0.04,
  gamma            = c(0.01, 0.02),
  max_depth        = seq(from = 7, to = 10, by = 1),
  min_child_weight = 1,
  subsample        = 0.9,
  colsample_bytree = 0.9,
  lambda           = seq(from = 8, to = 11, by = 1),
  alpha            = seq(from = 3, to = 4, by = 1)
)

# cross-validate every row of the final grid on the "Train" xgb.DMatrix
res <- xgb.cv_opt(
  tuning_grid = XGgrid_r5,
  data = dat_aucs_mod[["Train"]][["XGB_Matrix"]]
)

# store the CV summary on the mounted drive
saveRDS(
  object = res,
  file = "drive/MyDrive/Master_Thesis/Models_MT/CV_XGBoost_feateng_r5.RDS"
)

2022-07-07 11:37:45, starting CV: 64 models to fit!

[1]	train-mae:3441.544337+120.460723	train-rmse:6178.137871+669.856601	test-mae:3441.557388+502.598463	test-rmse:5825.646478+2172.454246 
Multiple eval metrics are present. Will use test_rmse for early stopping.
Will train until test_rmse hasn't improved in 10 rounds.

Stopping. Best iteration:
[338]	train-mae:67.832371+14.271790	train-rmse:476.367481+201.246882	test-mae:573.186898+369.805914	test-rmse:1568.485650+1893.405144

1/64 fit!
[1]	train-mae:3441.544337+120.460723	train-rmse:6178.137871+669.856601	test-mae:3441.557388+502.598463	test-rmse:5825.646478+2172.454246 
Multiple eval metrics are present. Will use test_rmse for early stopping.
Will train until test_rmse hasn't improved in 10 rounds.

Stopping. Best iteration:
[338]	train-mae:67.832371+14.271790	train-rmse:476.367481+201.246882	test-mae:573.186898+369.805914	test-rmse:1568.485650+1893.405144

2/64 fit!
[1]	train-mae:3441.544337+120.460723	train-rmse:6178.137871+669.8

In [None]:
# Flush pending writes and unmount the Google Drive mounted earlier in this
# notebook.
drive.flush_and_unmount()