<a href="https://colab.research.google.com/github/Base-R-Best-R/Auction/blob/main/Code/Models/Colab/CV_elastic_net.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# call R in Python: (re)load the rpy2 IPython extension, which provides the
# %%R cell magic used by the R cells in this notebook
%reload_ext rpy2.ipython

In [None]:
# mount gdrive at /content/drive; the R cells read and write files via the
# relative path drive/MyDrive/...
# NOTE(review): this presumably relies on the working directory being
# /content - confirm
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# R
%%R

# Packages: installed on every run of this cell, then attached.
install.packages(c("glmnet", "glmnetUtils", "Metrics"))
library(glmnet)
library(glmnetUtils)
library(Metrics)

# Read the stored data object from the mounted Drive folder.
dat_aucs_eng <- readRDS("drive/MyDrive/Colab Transfer/Aucs_df_feateng_split.RDS")

# Build the modelling data: apply the same column transformations to every
# element of the loaded list and label the elements "Train" and "Test".
dat_aucs_mod <- setNames(
  lapply(dat_aucs_eng, function(df) {
    within(df, {
      # divide both columns by 1e3
      Eng_Est <- Eng_Est / 1e3
      Winning_Bid <- Winning_Bid / 1e3
      # assigning NULL inside within() removes the column
      EW_Diff <- NULL
      MLOT <- NULL
      Contract_ID <- NULL
    })
  }),
  c("Train", "Test")
)

In [4]:
#R
%%R

# CV glmnet: parallel cross-validation over families / link functions
#
# Runs glmnetUtils::cva.glmnet() once per element of `families` and
# distributes these runs over a parallel cluster (one task per family).
#
# Arguments:
#   formula       model formula handed to cva.glmnet()
#   data          data handed to cva.glmnet()
#   families      list of family objects; its names are reused for the result
#   type_measure  handed to cva.glmnet() as `type.measure`
#   nfolds        handed to cva.glmnet() as `nfolds`
#   alpha         vector of alpha values handed to cva.glmnet()
#   nlambda       handed to cva.glmnet() as `nlambda`
#   ncore         number of workers; NULL uses detectCores() - 1. The value is
#                 capped at length(families) and is never smaller than 1
#   seed          seed set on the worker right before each cva.glmnet() call
#
# Returns:
#   A list with one cva.glmnet() result per family, named like `families`
#   (an empty list if `families` is empty).
CV_disglmnet <- \(formula, data, families, type_measure = "mse", nfolds = 5, 
                  alpha = c(0.01, seq(0, 1, 0.25),0.99), nlambda = 250, 
                  ncore = NULL, seed = 33){
  
  # nothing to fit - return early instead of failing on a zero-worker cluster
  if(length(families) == 0L){
    
    return(setNames(list(), names(families)))
    
  }
  
  # evaluate all arguments now, so the values (and not unevaluated
  # expressions that refer to objects of the calling session) are what is
  # shipped to the workers together with the worker function
  force(formula)
  force(data)
  force(type_measure)
  force(nfolds)
  force(alpha)
  force(nlambda)
  force(seed)
  
  # default: use all available cores but one
  if(is.null(ncore)){
    
    ncore <- parallel::detectCores() - 1
    
  }
  
  # detectCores() may return NA - fall back to a single worker
  if(anyNA(ncore)){
    
    ncore <- 1
    
  }
  
  # we parallelize over families - more workers than families would idle,
  # fewer than one worker cannot run anything
  ncore <- max(1, min(ncore, length(families)))

  # set up cluster
  clust <- parallel::makeCluster(ncore, outfile = "")
  
  # release cores when the function exits - registered right after cluster
  # creation so the workers are also stopped if a fit below fails
  on.exit(parallel::stopCluster(clust), add = TRUE)

  # print cores that will be occupied
  cat(paste0(length(clust), " cores will be occupied by this process!"))
  
  # export `data` from this function's environment to the workers
  parallel::clusterExport(cl = clust,
                          varlist = c("data"),
                          envir = environment())
  
  # loop over families
  tmp <- parallel::parLapply(clust, families, \(fam){
    
    # seed - identical for every family
    set.seed(seed)
    
    # cross validation
    glmnetUtils::cva.glmnet(formula, data = data,
                            family = fam,
                            type.measure = type_measure, nfolds = nfolds,
                            alpha = alpha,
                            nlambda = nlambda)
      
  })
  
  # label results like the input families
  tmp <- setNames(tmp, names(families))
  
  # return
  return(tmp)
}

In [None]:
# R
%%R

# families: identity, log and inverse link for both the Gaussian and the
# Gamma family (the last entry previously repeated "Gam_ident", which
# duplicated a list name and fitted the Gamma identity model twice)
fams <- list("Gaus_ident" = gaussian(link = "identity"),
             "Gaus_log" = gaussian(link = "log"),
             "Gaus_inv" = gaussian(link = "inverse"),
             "Gam_ident" = Gamma(link = "identity"),
             "Gam_log" = Gamma(link = "log"),
             "Gam_inv" = Gamma(link = "inverse"))

# Run CV on the training split, one cva.glmnet() run per family
res <- CV_disglmnet(Winning_Bid ~., data = dat_aucs_mod[["Train"]], families = fams,
                    nlambda = 500, alpha = c(0.01, seq(0, 1, 0.05),0.99))

# write into gDrive
saveRDS(res, "drive/MyDrive/Master_Thesis/Models_MT/CV_elastic_net.RDS")

In [None]:
# unmount gdrive again; `drive` is the google.colab module imported in the
# mount cell (going by its name, the call also flushes pending writes first)
drive.flush_and_unmount()