<a href="https://colab.research.google.com/github/Base-R-Best-R/Auction/blob/main/Code/Models/Colab/Parallel_NestedCV_RF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Load (or reload) the rpy2 IPython extension; the %%R cells below rely on it
# to run R code from this Python notebook.
%reload_ext rpy2.ipython

In [3]:
# Mount Google Drive at /content/drive; the R cells below read their input
# from and write their results to paths under drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# R
%%R

# Install only the packages that are not yet available, then load them.
# stringr is listed because the CV function in this notebook calls
# stringr::str_detect(); it is used via `::`, so it needs to be installed
# but not attached.
pkgs <- c("ranger", "logisticPCA", "Metrics", "stringr")
to_install <- pkgs[!vapply(pkgs, requireNamespace, logical(1), quietly = TRUE)]
if (length(to_install) > 0) {
  install.packages(to_install)
}
library(ranger)
library(logisticPCA)
library(Metrics)

In [4]:
# R
%%R

# Load the stored data object from the mounted Drive folder. It is indexed by
# name below, so it is expected to be a list-like object with a "Train" element.
dat_aucs <- readRDS("drive/MyDrive/Colab Transfer/Aucs_df_feateng_split.RDS")

# Keep the training part; this is the data passed to the CV function later on.
dat_aucs_train <- dat_aucs[["Train"]]

In [5]:
# R
%%R

# Cross-validate a logistic-PCA + random-forest pipeline, in parallel over folds.
#
# For every fold (used once as test set) and every combination of PC counts in
# k_desc_CV x k_ven_CV x k_vendInt_CV, the three binary column groups
# (description, vendor, vendor interaction) are each compressed with logistic
# PCA (m chosen by logisticPCA::cv.lpca on the training part). The resulting
# PCs are appended to the leading base columns and a grid of ranger forests is
# fitted on the training part and scored on the test part with eval_fun.
#
# Arguments
#   data             data.frame. Contract_ID, MLOT and EW_Diff are dropped;
#                    Winning_Bid and Eng_Est are divided by 1e3. Columns whose
#                    name contains "Vend_" are vendor columns, those containing
#                    "_x_" are interaction columns. The first 7 columns (after
#                    the removal above) are carried over unchanged; they must
#                    include Winning_Bid, which is used as the response.
#   nfolds           number of CV folds.
#   m_nest_CV        candidate m values handed to logisticPCA::cv.lpca.
#   k_desc_CV, k_ven_CV, k_vendInt_CV
#                    candidate numbers of PCs for the three column groups.
#   splitrule_CV, min_node_size_CV, max_depth_CV, num_trees_CV
#                    candidate values for the corresponding ranger arguments.
#   mtry_incr_start, mtry_incr
#                    mtry candidates run from mtry_incr_start in steps of
#                    mtry_incr up to the number of predictors.
#   eval_fun         function(actual, predicted) returning the score.
#   ncore            number of worker processes; NULL means detectCores() - 1.
#                    Never more than nfolds and never fewer than 1.
#
# Value
#   A list named 1..nfolds. Each element is a list named PCA_<i> (one per row
#   of the PC grid) holding a data.frame with one column RF_<j> per row of the
#   forest grid. The rows are mtry, splitrule, min_node_size, max_depth,
#   num_trees, nPCA_Desc, nPCA_Vend, nPCA_VendInt and performance. Because the
#   forest grid mixes text and numbers, apply() hands its rows over as
#   character vectors, so all entries are stored as character strings.
#
# Note: folds are drawn with sample(); call set.seed() beforehand for
# reproducible fold assignment.
log_PCA_rf_CV_par <- \(data, nfolds = 5, m_nest_CV = seq(1, 13, 2), 
                   k_desc_CV = seq(2, 20, 2), k_ven_CV = seq(2, 20, 2), 
                   k_vendInt_CV = seq(2, 20, 2), splitrule_CV = "variance",
                   min_node_size_CV = 1:4, max_depth_CV = seq(5, 70, 5), mtry_incr = 5,
                   num_trees_CV = 1500, mtry_incr_start = 5, eval_fun = Metrics::rmse,
                   ncore = NULL){

  # number of leading columns that are kept as they are (not passed to PCA)
  n_base_cols <- 7

  # rm unwanted cols and rescale the monetary columns
  data <- within(data, {
    Contract_ID <- NULL
    MLOT <- NULL
    EW_Diff <- NULL
    Winning_Bid <- Winning_Bid / 1e3
    Eng_Est <- Eng_Est / 1e3
  })

  # column groups
  vend_cols_log <- stringr::str_detect(names(data), "Vend_")
  interact_cols <- stringr::str_detect(names(data), "_x_")
  descr_words <- which(!vend_cols_log)[-seq_len(n_base_cols)]
  Vend_not_int <- vend_cols_log & (!interact_cols)

  # generate CV folds: shuffled row indices are dealt out round-robin, so fold
  # sizes differ by at most one
  fold_id <- factor(rep_len(seq_len(nfolds), nrow(data)),
                    levels = seq_len(nfolds))
  folds <- split(sample(nrow(data), nrow(data), replace = FALSE), fold_id)

  # tuning grid init
  tgrid_PC <- expand.grid("nPCA_Desc" = k_desc_CV,
                          "nPCA_Vend" = k_ven_CV,
                          "nPCA_VendInt" = k_vendInt_CV)

  # default: all available cores but one
  if (is.null(ncore)) {
    ncore <- parallel::detectCores() - 1
  }

  # detectCores() may return NA or 1; a cluster needs at least one worker
  if (is.na(ncore) || ncore < 1) {
    ncore <- 1
  }

  # we parallelize over folds - more workers than folds would sit idle
  ncore <- min(ncore, nfolds)

  # set up cluster and make sure it is released even if a worker fails
  clust <- parallel::makeCluster(ncore, outfile = "")
  on.exit(parallel::stopCluster(clust), add = TRUE)

  # print cores that will be occupied
  cat(paste0(length(clust), " cores will be occupied by this process!"))

  # loop over folds each is used as test set once
  res <- parallel::parLapply(clust, names(folds), \(f_ind) {

    # test - bool
    test_bool <- names(folds) %in% f_ind

    # train and test
    train <- data[unlist(folds[!test_bool], use.names = FALSE), ]
    test <- data[unlist(folds[test_bool], use.names = FALSE), ]

    # separate into desc / Vend / Vend_int (drop = FALSE keeps single-column
    # groups as data frames)
    grps <- list(
      "Train" = list("Description" = train[, descr_words, drop = FALSE],
                     "Vendor" = train[, Vend_not_int, drop = FALSE],
                     "Vendor_Interaction" = train[, interact_cols, drop = FALSE]),
      "Test" = list("Description" = test[, descr_words, drop = FALSE],
                    "Vendor" = test[, Vend_not_int, drop = FALSE],
                    "Vendor_Interaction" = test[, interact_cols, drop = FALSE])
    )

    # nested CV: one pass per combination of PC counts
    pca_res <- apply(tgrid_PC, 1, \(x) {

      # Over all 3 binary subsets
      fitted_PCs <- Map(\(dat_train, dat_test, kk) {

        # CV for m of logistic PCA
        cv_PCA <- logisticPCA::cv.lpca(dat_train, ms = m_nest_CV, ks = kk)

        # fit with the m that minimises the CV criterion
        best_m <- as.numeric(colnames(cv_PCA)[which.min(cv_PCA)])
        fit_PCA <- logisticPCA::logisticPCA(dat_train, k = kk, m = best_m)

        # project the test rows onto the fitted PCs
        pred_PCA <- predict(fit_PCA, dat_test, type = "PCs")

        list("Fit_PCA" = fit_PCA,
             "Pred_PCA" = pred_PCA)

      }, grps[["Train"]], grps[["Test"]], x)

      # how to pull the PC scores out of a fitted group: training scores are
      # stored in the fit, test scores are the predictions
      get_scores <- list(
        "Train" = \(grp) as.data.frame(grp[["Fit_PCA"]][["PCs"]]),
        "Test" = \(grp) as.data.frame(grp[["Pred_PCA"]])
      )

      # assemble PCA datasets: base columns + PCs of all three groups
      PC_dfs <- Map(\(tt, extract) {
        PCs <- lapply(fitted_PCs, extract)
        cbind(tt[seq_len(n_base_cols)], do.call(cbind, PCs))
      }, list("Train" = train, "Test" = test), get_scores)

      # mtry depends on the number of PCs and is thus added now; the response
      # column does not count as a predictor, so mtry is capped at ncol - 1
      n_pred <- ncol(PC_dfs[["Train"]]) - 1
      mtry_CV <- seq(min(mtry_incr_start, n_pred), n_pred, mtry_incr)

      # RF tuning grid
      tgrid_RF <- expand.grid("mtry" = mtry_CV,
                              "splitrule" = splitrule_CV,
                              "min_node_size" = min_node_size_CV,
                              "max_depth" = max_depth_CV,
                              "num_trees" = num_trees_CV)

      ## Random forest ##
      # apply() passes each grid row as a character vector (the grid mixes
      # text and numbers), hence the as.numeric() conversions
      rf_res <- apply(tgrid_RF, 1, \(cv_inp) {

        # fit
        fit_rf <- ranger::ranger(Winning_Bid ~ .,
                                 mtry = as.numeric(cv_inp[1]),
                                 splitrule = cv_inp[2],
                                 min.node.size = as.numeric(cv_inp[3]),
                                 max.depth = as.numeric(cv_inp[4]),
                                 num.trees = as.numeric(cv_inp[5]),
                                 data = PC_dfs[["Train"]])

        # predict
        pred <- predict(fit_rf, PC_dfs[["Test"]])

        # score on the held-out fold
        eval_res <- eval_fun(actual = PC_dfs[["Test"]][, "Winning_Bid"],
                             predicted = pred[["predictions"]])

        # result and inputs
        c(cv_inp[1], # mtry
          cv_inp[2], # splitrule
          cv_inp[3], # min.node.size
          cv_inp[4], # max.depth
          cv_inp[5], # ntrees
          x[1], # nPCA Desc
          x[2], # nPCA Vend
          x[3], # nPCA VendInt
          "performance" = eval_res)

      })

      setNames(as.data.frame(rf_res), paste0("RF_", 1:nrow(tgrid_RF)))
    })

    setNames(pca_res, paste0("PCA_", 1:nrow(tgrid_PC)))
  })

  # return, one element per fold
  setNames(res, 1:nfolds)

}


In [None]:
# R
%%R

# First CV run: coarse grid over PC counts and forest settings, two workers
res <- log_PCA_rf_CV_par(
  dat_aucs_train,
  nfolds = 5,
  m_nest_CV = c(3, 8, 12),
  k_desc_CV = c(20, 30),
  k_ven_CV = c(20, 30),
  k_vendInt_CV = c(20, 30),
  splitrule_CV = "variance",
  min_node_size_CV = c(1, 3, 5),
  max_depth_CV = c(30, 50, 70, 110),
  mtry_incr = 5,
  ncore = 2
)

# saving of this run is currently disabled
# saveRDS(res, "drive/MyDrive/Master_Thesis/Models_MT/NestedCV_logPCA_rf_v2.RDS")

In [None]:
# R
%%R

# Second CV run with a different grid of PC counts and forest settings
res <- log_PCA_rf_CV_par(
  dat_aucs_train,
  nfolds = 5,
  m_nest_CV = c(5, 8, 12),
  k_desc_CV = c(15, 20, 25),
  k_ven_CV = c(5, 10, 15),
  k_vendInt_CV = c(15, 20, 25),
  splitrule_CV = "variance",
  min_node_size_CV = c(1, 3),
  max_depth_CV = c(30, 50, 70),
  mtry_incr = 5,
  ncore = 2
)

# store the result on the mounted Drive
saveRDS(res, "drive/MyDrive/Master_Thesis/Models_MT/NestedCV_logPCA_rf_r3.RDS")

2 cores will be occupied by this process!

In [None]:
# Flush pending writes and unmount Google Drive once the R cells have finished
drive.flush_and_unmount()