<a href="https://colab.research.google.com/github/Base-R-Best-R/Auction/blob/main/Code/Models/XGboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# call R in Python
%reload_ext rpy2.ipython

In [3]:
# mount gdrive
# Mounts Google Drive at /content/drive. The R cells further down read and
# write relative "drive/MyDrive/..." paths -- presumably the working directory
# is /content so that those resolve to this mount point (TODO confirm).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# R
%%R

# install and load packages
install.packages(c("xgboost"))
library(xgboost)

In [5]:
# R
%%R

# read the stored object; the code below iterates over it as a list of
# data frames
dat_aucs_eng <- readRDS("drive/MyDrive/Colab Transfer/Aucs_df_feateng_split.RDS")

# build label, feature matrix and xgb.DMatrix for every element of the list
dat_aucs_mod <- lapply(dat_aucs_eng, \(split_df){

  # target: Winning_Bid divided by 1e3
  target <- split_df$Winning_Bid / 1e3

  # columns that must not enter the feature matrix (including the target)
  for (col in c("Contract_ID", "MLOT", "EW_Diff", "Winning_Bid")) {
    split_df[[col]] <- NULL
  }

  # Eng_Est is divided by 1e3 as well, matching the scaling of the target
  split_df$Eng_Est <- split_df$Eng_Est / 1e3

  # design matrix of all remaining columns without an intercept
  # NOTE(review): model.matrix() is evaluated per list element, so the
  # resulting columns depend on the factor levels of each element -- confirm
  # that all elements end up with the same columns
  design <- model.matrix(~ . + 0, data = split_df)

  # label, encoded features and the xgboost container built from both
  list(Label = target,
       Features = design,
       XGB_Matrix = xgb.DMatrix(data = design, label = target))
})

# tuning grid: full cartesian product of the candidate values below
# (3 * 3 * 4 * 3 * 2 * 2 * 3 * 3 = 3888 parameter combinations)
XGgrid <- expand.grid(
  objective = "reg:squarederror",
  booster = "gbtree",
  eta = seq(from = 0.05, to = 0.25, by = 0.1),           # 0.05, 0.15, 0.25
  gamma = seq(from = 0.1, to = 9, by = 3),               # 0.1, 3.1, 6.1
  max_depth = seq(from = 5, to = 100, by = 25),          # 5, 30, 55, 80
  min_child_weight = seq(from = 3, to = 9, by = 3),      # 3, 6, 9
  subsample = seq(from = 0.6, to = 1, by = 0.3),         # 0.6, 0.9 (1 is not reached)
  colsample_bytree = seq(from = 0.6, to = 1, by = 0.3),  # 0.6, 0.9 (1 is not reached)
  lambda = seq(from = 0.1, to = 9, by = 3),              # 0.1, 3.1, 6.1
  alpha = seq(from = 0.1, to = 9, by = 3)                # 0.1, 3.1, 6.1
)

In [6]:
# R
%%R

# parameter optimization CV
#
# Runs xgb.cv() once per row of `tuning_grid` and collects, for every row, the
# evaluation-log entry of the best iteration together with the parameters used.
#
# Arguments:
#   nrounds, print_every_n, nfold, early_stopping_rounds, maximize, metrics,
#   verbosity : forwarded unchanged to xgb.cv()
#   seed        : RNG seed, reset before every xgb.cv() call
#   data        : forwarded to xgb.cv() as `data`
#   tuning_grid : data frame (or matrix) with one parameter combination per
#                 row; column names are used as parameter names
#
# Value:
#   A matrix with one column per grid row. Row names are "iter", one
#   "test_<metric>_mean" entry per element of `metrics`, and the column names
#   of `tuning_grid`. If the grid contains non-numeric columns the whole
#   matrix is of mode character (use as.numeric() on the rows of interest).
xgb.cv_opt <- \(nrounds = 1e3, print_every_n = 2e3, nfold = 5, 
                early_stopping_rounds = 10, maximize = FALSE, metrics = c("mae", "rmse"), 
                verbosity = 0, seed = 33, data, tuning_grid){
  
  # nmodels
  nmod <- nrow(tuning_grid)
  
  # an empty (or dimensionless) grid leaves nothing to fit
  if (is.null(nmod) || nmod < 1L) {
    stop("`tuning_grid` must have at least one row.", call. = FALSE)
  }
  
  # columns of the evaluation log that are reported back
  sub_pst <- paste0("test_", metrics, "_mean")
  log_cols <- c("iter", sub_pst)
  
  # typed copy of the grid that is handed to xgb.cv(): apply() over a data
  # frame would push every row through as.matrix(), i.e. all parameters
  # (numeric ones included) would arrive as format()-ed, width-padded strings
  grid_df <- as.data.frame(tuning_grid, stringsAsFactors = FALSE)
  grid_df[] <- lapply(grid_df, \(col) if (is.factor(col)) as.character(col) else col)
  
  # matrix representation, used only to lay out the parameters in the result
  grid_mat <- as.matrix(tuning_grid)
  
  # print 
  cat(paste0(Sys.time(), ", starting CV: ", nmod, " models to fit!\n\n"))
  
  # over tuning grid rows
  fits <- lapply(seq_len(nmod), \(i){
    
    # parameters of row i as a named list that keeps the column types
    param_lst <- as.list(grid_df[i, , drop = FALSE])
    
    # seed (s.t. we may compare the different models across the same folds)
    set.seed(seed)
    
    # cv 
    cv_tmp <- xgb.cv(params = param_lst, nrounds = nrounds, 
                     nfold = nfold, print_every_n = print_every_n,
                     early_stopping_rounds = early_stopping_rounds,
                     maximize = maximize, metrics = metrics, verbosity = verbosity,
                     data = data)
    
    # print current State
    cat(paste0(i, "/", nmod, " fit!\n"))
    
    # without a best iteration there is no row of the log to report
    best_it <- cv_tmp[["best_iteration"]]
    if (length(best_it) != 1L || is.na(best_it)) {
      stop("xgb.cv() did not return a single `best_iteration` ",
           "(is early stopping enabled?).", call. = FALSE)
    }
    
    # extract best iteration + input parameters
    eval_log <- as.data.frame(cv_tmp[["evaluation_log"]])
    best_row <- unlist(eval_log[best_it, log_cols, drop = FALSE])
    c(best_row, grid_mat[i, ])
  })
  
  # one column per fitted model
  tmp <- do.call(cbind, fits)
  
  # row names as documented; column names follow the grid's row names, which
  # are NULL for automatic row names
  dimnames(tmp) <- list(c(log_cols, colnames(tuning_grid)), rownames(grid_mat))
  
  tmp
}

In [None]:
# R
%%R

# cross-validate every row of the tuning grid on the "Train" element
res <- xgb.cv_opt(
  data = dat_aucs_mod[["Train"]][["XGB_Matrix"]],
  tuning_grid = XGgrid
)

# write the collected results to the mounted drive
saveRDS(object = res, file = "drive/MyDrive/Master_Thesis/Models_MT/XGBoost_feateng.RDS")