# Simulation
With R kernel

#### Further info
Models usable with train() from caret: <br>
https://topepo.github.io/caret/train-models-by-tag.html#Model_Tree <br>

## Packages

In [None]:
# Set the library path
#.libPaths("/user/emma.foessing01/u11969/new_R_libs")
Sys.setenv("PKG_CXXFLAGS"="-std=c++14")

print(R.version.string)

# List of required packages
list_of_packages <- c(
  "synthpop", "jsonlite", "codetools", "insight", "party", "haven", "dplyr", "rpart", "rpart.plot",
  "randomForest", "pROC", "caret", "pracma", "here", "Hmisc", "purrr",
  "ranger", "bnlearn", "arulesCBA", "network", "igraph", "xgboost",
  "data.table", "doParallel", "parallel"
)

# Attach a package by name; report (instead of aborting on) a load failure.
#
# Args:
#   p: Package name as a single character string.
#
# Returns:
#   The value of library() when the package attaches; otherwise NULL
#   (invisibly, from message()) after the failure has been reported.
load_if_installed <- function(p) {
  tryCatch({
    library(p, character.only = TRUE)
  }, error = function(e) {
    # Include the underlying condition message: a package can be installed
    # and still fail to load, so "not installed" alone would be misleading.
    message(sprintf("Package '%s' could not be loaded: %s", p, conditionMessage(e)))
  })
}

# Load all required packages
lapply(list_of_packages, load_if_installed)

## Data

In [3]:
# Load the CPS data (creates `cpspop`) and move the columns income, race,
# marital and educ to the end while keeping the order of all other columns.
load(file = file.path(here(), "cpspop.RData"))
trailing_cols <- c("income", "race", "marital", "educ")
cpspop <- cpspop[, c(setdiff(names(cpspop), trailing_cols), trailing_cols)]

# Read the adult data; entries equal to "?" are treated as missing and every
# row that contains a missing value is dropped.
adult <- read.csv(file = file.path(here(), "adult_preprocessed.csv"))
adult[adult == "?"] <- NA
adult <- na.omit(adult)

# Convert the categorical columns to factors
categorical_cols <- c(
  "workclass", "education", "marital_status", "relationship", "race",
  "sex", "native_country", "income", "occupation"
)
for (col_name in categorical_cols) {
  adult[[col_name]] <- as.factor(adult[[col_name]])
}

# Fix the column order: numeric columns first, then income and the remaining
# categorical columns
adult_col_order <- c(
  "age", "fnlwgt", "capital_gain", "capital_loss", "hours_per_week",
  "income", "sex", "race", "relationship", "marital_status",
  "workclass", "occupation", "education", "native_country"
)
adult <- adult[, adult_col_order]

Einen Datensatz zu generieren (m = 1) ist ausreichend, da ich keine Varianzanalyse machen werde. Damit die Ergebnisse nicht von einem zufälligen Prozess abhängen, ist es sinnvoll, über mehrere Runs (50–100) Mittelwerte zu bilden.

## Parallelisation

In [None]:
# Detect the number of available cores
#num_cores <- detectCores() - 1  # Leave one core free for system operations

# Register the parallel backend
#cl <- makeCluster(num_cores)
#registerDoParallel(cl)

## Functions

### Evaluation Functions

In [None]:
## Regression metrics for a continuous target
# predictions: numeric vector of predicted values
# test_set:    object whose `income` element holds the observed values
# Returns a one-row data.frame with the columns MAE, MSE, RMSE, R_squared, MAPE.
evaluation_metrics_cont <- function(predictions, test_set){
    observed <- test_set$income
    errors <- predictions - observed
    squared_errors <- errors^2

    mean_squared_error <- mean(squared_errors)

    # R-squared is reported as NA when the observed target has no variance
    total_ss <- sum((observed - mean(observed))^2)
    r_squared <- if (isFALSE(total_ss == 0)) {
        1 - (sum(squared_errors) / total_ss)
    } else {
        NA
    }

    # MAPE is reported as NA as soon as one observed value equals zero
    # (division by zero)
    mape <- if (isFALSE(any(observed == 0))) {
        mean(abs(errors / observed)) * 100
    } else {
        NA
    }

    data.frame(
        MAE = mean(abs(errors)),
        MSE = mean_squared_error,
        RMSE = sqrt(mean_squared_error),
        R_squared = r_squared,
        MAPE = mape
    )
}

In [None]:
## Calculate evaluation metrics for factored targets
# predictions: predicted class labels (factor or coercible to factor)
# test_set:    object coercible to a data frame whose `income` column holds
#              the observed classes
# Returns a one-row data.frame with Accuracy, F1, Sensitivity and Specificity.
# For more than two classes the per-class values of F1, Sensitivity and
# Specificity are averaged (ignoring NA).
evaluation_metrics_factor <- function(predictions, test_set) {
    # Ensure test_set is a data frame
    test_set <- as.data.frame(test_set)

    reference <- as.factor(test_set$income)
    predictions <- as.factor(predictions)

    # Align both factors on a common level set BY LABEL. Assigning
    # `levels(predictions) <- levels(reference)` would rename the levels
    # positionally and silently relabel predictions whenever the two level
    # sets differ (e.g. a class that was never predicted).
    shared_levels <- union(levels(reference), levels(predictions))
    reference <- factor(as.character(reference), levels = shared_levels)
    predictions <- factor(as.character(predictions), levels = shared_levels)

    # Confusion matrix of predictions against the observed classes
    cm <- caret::confusionMatrix(predictions, reference, mode = "everything")

    accuracy <- cm$overall["Accuracy"]

    if (nlevels(reference) == 2) {
        # Binary classification: byClass is a named vector
        f1 <- cm$byClass["F1"]
        sens <- cm$byClass["Sensitivity"]
        spec <- cm$byClass["Specificity"]
    } else {
        # Multi-class classification: byClass is a matrix with one row per
        # class; report the mean over classes
        f1 <- mean(cm$byClass[, "F1"], na.rm = TRUE)
        sens <- mean(cm$byClass[, "Sensitivity"], na.rm = TRUE)
        spec <- mean(cm$byClass[, "Specificity"], na.rm = TRUE)
    }

    data.frame(
        Accuracy = accuracy,
        F1 = f1,
        Sensitivity = sens,
        Specificity = spec
    )
}

### Prediction Functions

##### CART

In [None]:
# Nested cross-validation for a CART (rpart) model fitted through caret.
#
# Args:
#   data:        data frame with the target column `income` (numeric or
#                factor) and the predictors.
#   outer_folds: number of outer CV folds used for performance estimation.
#   cp_steps:    number of log-spaced complexity-parameter values between
#                1e-4 and 1e-2 in the tuning grid.
#   inner_folds: number of inner CV folds used to tune `cp`.
#
# Returns:
#   One-row data frame with the evaluation metrics averaged over the outer
#   folds.
cart_pred <- function(data, outer_folds, cp_steps, inner_folds) {
    # Adjust evaluation metric to fit both numeric and factored targets
    summaryFunctionType <- if (is.numeric(data$income)) defaultSummary else multiClassSummary

    # Set control args for inner and outer loops
    outer_control <- trainControl(method = "cv", number = outer_folds,
                                  summaryFunction = summaryFunctionType,
                                  verboseIter = FALSE,
                                  allowParallel = TRUE)

    inner_control <- trainControl(method = "cv", number = inner_folds,
                                  summaryFunction = summaryFunctionType,
                                  verboseIter = FALSE,
                                  allowParallel = TRUE)

    # Define the grid for hyperparameter tuning
    complexity <- 10^seq(log10(0.0001), log10(0.01), length.out = cp_steps)
    tunegrid <- expand.grid(cp = complexity)

    # Create outer CV folds
    outer_cv_folds <- createFolds(data$income, k = outer_folds)

    # Setup parallel backend for outer folds. detectCores() may return NA or 1,
    # so always keep at least one worker.
    num_cores <- max(1L, parallel::detectCores() - 1L, na.rm = TRUE)
    cl <- parallel::makeCluster(num_cores)
    # Release the workers and restore the sequential backend even when a fold
    # fails; otherwise the cluster processes are leaked.
    on.exit({
        parallel::stopCluster(cl)
        foreach::registerDoSEQ()
    }, add = TRUE)
    doParallel::registerDoParallel(cl)

    # Use foreach to parallelize the outer loop
    outer_results <- foreach(i = seq_along(outer_cv_folds), .packages = c("caret", "dplyr", "rpart"),
                             .export = c("evaluation_metrics_cont", "evaluation_metrics_factor")) %dopar% {
        # Split data into outer folds
        outer_test_index <- outer_cv_folds[[i]]
        outer_testData <- data[outer_test_index, ]
        outer_trainData <- data[-outer_test_index, ]

        # Hyperparameter tuning using inner CV
        model <- caret::train(income ~ .,
                              data = outer_trainData,
                              method = "rpart",
                              tuneGrid = tunegrid,
                              trControl = inner_control,
                              control = rpart.control(maxsurrogate = 0, maxcompete = 1))

        # Get the best hyperparameters
        best_hyperparameters <- model$bestTune

        # Train the final model on the outer training set with the best hyperparameters
        # NOTE(review): this refit does not pass the rpart.control() settings
        # used during tuning — confirm whether that difference is intended.
        final_model <- caret::train(income ~ .,
                                    data = outer_trainData,
                                    method = "rpart",
                                    trControl = outer_control,
                                    tuneGrid = best_hyperparameters)

        # Testing the final model on the outer test set
        predictions <- predict(final_model, newdata = outer_testData)

        # Evaluate the predictions
        if (is.numeric(data$income)) {
            eval <- evaluation_metrics_cont(predictions, outer_testData)
        } else if (is.factor(data$income)) {
            eval <- evaluation_metrics_factor(predictions, outer_testData)
        } else {
            stop("The predicted target has to be numeric or factor.")
        }

        return(eval)
    }

    # Average the evaluation metrics over the outer folds. An explicit function
    # is used because passing `na.rm` through across()'s `...` is deprecated.
    eval_avg_outer_folds <- do.call(rbind, outer_results) %>% as.data.frame() %>%
        dplyr::summarise(dplyr::across(dplyr::everything(), function(x) mean(x, na.rm = TRUE)))

    return(eval_avg_outer_folds)
}

##### RF

In [None]:
# Nested cross-validation for a random forest (ranger) fitted through caret.
#
# Args:
#   data:        data frame with the target column `income` (numeric or
#                factor) and the predictors.
#   outer_folds: number of outer CV folds used for performance estimation.
#   mtry_steps:  number of `mtry` candidates spread between 1 and
#                ncol(data) - 1.
#   ntree_steps: currently unused; kept so existing callers keep working.
#   inner_folds: number of inner CV folds used for tuning.
#
# Returns:
#   One-row data frame with the evaluation metrics averaged over the outer
#   folds.
rf_pred <- function(data, outer_folds, mtry_steps, ntree_steps, inner_folds) {
    target_is_numeric <- is.numeric(data$income)

    # Adjust evaluation metric to fit both numeric and factored targets
    summaryFunctionType <- if (target_is_numeric) defaultSummary else multiClassSummary

    # Set control args for inner and outer loops
    outer_control <- trainControl(method = "cv", number = outer_folds,
                                  summaryFunction = summaryFunctionType,
                                  verboseIter = FALSE,
                                  allowParallel = TRUE)

    inner_control <- trainControl(method = "cv", number = inner_folds,
                                  summaryFunction = summaryFunctionType,
                                  verboseIter = FALSE,
                                  allowParallel = TRUE)

    # Define the parameter grid for tuning. `mtry` and `min.node.size` are
    # counts, so the evenly spaced candidates are rounded to whole numbers and
    # de-duplicated instead of handing fractional values to ranger.
    splitrule_value <- if (target_is_numeric) "variance" else "gini"
    mtry_grid <- unique(round(seq(1, ncol(data) - 1, length.out = mtry_steps)))
    node_size_grid <- if (target_is_numeric) {
        unique(round(seq(5, 15, length.out = 10)))
    } else {
        unique(round(seq(1, 10, length.out = 10)))
    }

    tunegrid <- expand.grid(mtry = mtry_grid,
                            splitrule = splitrule_value,
                            min.node.size = node_size_grid)

    # Create outer CV folds
    outer_cv_folds <- createFolds(data$income, k = outer_folds)

    # Setup parallel backend for outer folds. detectCores() may return NA or 1,
    # so always keep at least one worker.
    num_cores <- max(1L, parallel::detectCores() - 1L, na.rm = TRUE)
    cl <- parallel::makeCluster(num_cores)
    # Release the workers and restore the sequential backend even when a fold
    # fails; otherwise the cluster processes are leaked.
    on.exit({
        parallel::stopCluster(cl)
        foreach::registerDoSEQ()
    }, add = TRUE)
    doParallel::registerDoParallel(cl)

    # Parallelized outer loop: Cross-validation for model evaluation
    outer_results <- foreach(i = seq_along(outer_cv_folds), .combine = rbind, .packages = c("caret", "dplyr"),
                             .export = c("evaluation_metrics_cont", "evaluation_metrics_factor")) %dopar% {

        # Split data into outer folds
        outer_test_index <- outer_cv_folds[[i]]
        outer_testData <- data[outer_test_index, ]
        outer_trainData <- data[-outer_test_index, ]

        # Hyperparameter tuning using inner CV
        model <- caret::train(income ~ .,
                              data = outer_trainData,
                              method = "ranger",
                              tuneGrid = tunegrid,
                              trControl = inner_control)

        # Store the best hyperparameters
        best_hyperparameters <- model$bestTune

        # Train the final model on the outer training set with the best hyperparameters
        final_model <- caret::train(income ~ .,
                                    data = outer_trainData,
                                    method = "ranger",
                                    trControl = outer_control,
                                    tuneGrid = best_hyperparameters)

        # Test the final model on the outer test set
        predictions <- predict(final_model, newdata = outer_testData)

        # Evaluate the predictions
        if (is.numeric(data$income)) {
            eval <- evaluation_metrics_cont(predictions, outer_testData)
        } else if (is.factor(data$income)) {
            eval <- evaluation_metrics_factor(predictions, outer_testData)
        } else {
            stop("The predicted target has to be numeric or factor.")
        }

        # Return the evaluation metrics for this outer fold
        return(eval)
    }

    # Average the evaluation metrics over the outer folds. An explicit function
    # is used because passing `na.rm` through across()'s `...` is deprecated.
    eval_avg_outer_folds <- outer_results %>%
        dplyr::summarise(dplyr::across(dplyr::everything(), function(x) mean(x, na.rm = TRUE)))

    # Return the average evaluation metrics
    return(eval_avg_outer_folds)
}

##### XGB

xgboost's data-type handling was not compatible with caret, so this model is tuned with `xgb.cv` directly instead of `caret::train()`.

In [None]:
# Parallelized outer loop: Cross-validation for model evaluation
    

In [None]:
# Nested cross-validation for a gradient-boosted tree model (xgboost).
#
# The inner loop runs xgb.cv() for every row of a fixed hyperparameter grid and
# keeps the combination (and boosting round) with the lowest mean test value of
# the evaluation metric; the outer loop refits with that setting and evaluates
# on the held-out fold.
#
# Args:
#   data:        data frame with the target column `income` (numeric or
#                factor) and the predictors.
#   outer_folds: number of outer CV folds used for performance estimation.
#   inner_folds: number of folds passed to xgb.cv() for tuning.
#
# Returns:
#   One-row data frame with the evaluation metrics averaged over the outer
#   folds.
xgb_pred <- function(data, outer_folds, inner_folds) {
    target_is_factor <- is.factor(data$income)
    if (!target_is_factor && !is.numeric(data$income)) {
        stop("The predicted target has to be numeric or factor.")
    }

    # Save the original factor levels of the target variable (income) before conversion
    original_levels <- levels(data$income)
    # Class labels are derived from the factor codes, so the number of classes
    # is the number of levels, not the number of classes seen in one fold.
    n_classes <- length(original_levels)

    # Objective and evaluation metric depend only on the target type
    if (!target_is_factor) {
        objective <- "reg:squarederror"
        eval_metric <- "rmse"
    } else if (n_classes == 2) {
        objective <- "binary:logistic"
        eval_metric <- "logloss"
    } else {
        objective <- "multi:softprob"
        eval_metric <- "mlogloss"
    }
    # Column of xgb.cv()'s evaluation log holding the mean test metric. It must
    # follow the chosen metric: a hard-coded "test_rmse_mean" does not exist
    # for logloss/mlogloss, which would leave the best parameters unset.
    metric_col <- paste0("test_", eval_metric, "_mean")

    # Initialize parallel backend. detectCores() may return NA or 1, so always
    # keep at least one worker.
    num_cores <- max(1L, parallel::detectCores() - 1L, na.rm = TRUE)
    cl <- parallel::makeCluster(num_cores)
    # Release the workers and restore the sequential backend even when a fold
    # fails; otherwise the cluster processes are leaked.
    on.exit({
        parallel::stopCluster(cl)
        foreach::registerDoSEQ()
    }, add = TRUE)
    doParallel::registerDoParallel(cl)

    # Create outer folds once before the loop
    outer_cv_folds <- createFolds(data$income, k = outer_folds)

    # Define the grid of hyperparameters to tune
    tunegrid <- expand.grid(
        max_depth = c(3, 5, 7),
        eta = c(0.05, 0.1),
        gamma = c(0, 0.1),
        subsample = 0.8,
        colsample_bytree = 0.8
    )

    feature_cols <- setdiff(colnames(data), "income")

    # Outer cross-validation loop using foreach for parallel processing
    outer_results <- foreach(i = seq_along(outer_cv_folds), .packages = c('xgboost', 'caret', 'dplyr'), .export = c("evaluation_metrics_cont", "evaluation_metrics_factor")) %dopar% {
        # Split data into outer folds
        outer_test_index <- outer_cv_folds[[i]]
        outer_testData <- data[outer_test_index, ]
        outer_trainData <- data[-outer_test_index, ]

        # Prepare the training and validation matrices (drop = FALSE keeps a
        # matrix even when there is a single predictor)
        train_X <- data.matrix(outer_trainData[, feature_cols, drop = FALSE])
        val_X <- data.matrix(outer_testData[, feature_cols, drop = FALSE])
        train_y <- outer_trainData$income
        val_y <- outer_testData$income

        if (target_is_factor) {
            # Convert factor codes to 0-based integer labels
            train_y <- as.integer(train_y) - 1L
            val_y <- as.integer(val_y) - 1L
        }

        # Handle missing values: missing predictor entries are replaced by 0
        train_X[is.na(train_X)] <- 0
        val_X[is.na(val_X)] <- 0

        # Create the DMatrix required for xgboost
        dtrain <- xgb.DMatrix(data = train_X, label = train_y)
        dtest <- xgb.DMatrix(data = val_X, label = val_y)

        best_params <- NULL
        best_eval_metric <- Inf
        best_nrounds <- 200

        # Iterate over all combinations of hyperparameters in tunegrid
        for (j in seq_len(nrow(tunegrid))) {
            params <- list(
                max_depth = tunegrid$max_depth[j],
                eta = tunegrid$eta[j],
                gamma = tunegrid$gamma[j],
                subsample = tunegrid$subsample[j],
                colsample_bytree = tunegrid$colsample_bytree[j],
                objective = objective,
                eval_metric = eval_metric
            )

            # num_class is only accepted for multi-class objectives
            if (objective == "multi:softprob") {
                params$num_class <- n_classes
            }

            # Perform inner cross-validation using xgb.cv
            cv_model <- xgb.cv(
                params = params,
                data = dtrain,
                nrounds = 200,
                nfold = inner_folds,
                verbose = FALSE
            )

            # Mean test metric per boosting round (lower is better for rmse,
            # logloss and mlogloss)
            cv_scores <- cv_model$evaluation_log[[metric_col]]
            if (is.null(cv_scores)) {
                stop(sprintf("Column '%s' not found in the xgb.cv evaluation log.", metric_col))
            }
            current_eval_metric <- min(cv_scores)

            # If this model is better, update the best parameters
            if (current_eval_metric < best_eval_metric) {
                best_eval_metric <- current_eval_metric
                best_params <- params
                best_nrounds <- which.min(cv_scores)
            }
        }

        if (is.null(best_params)) {
            stop("No hyperparameter combination produced a usable cross-validation score.")
        }

        # Train the final model on the outer training set with the best
        # hyperparameters. xgb.train() is the low-level trainer that takes a
        # params list and an xgb.DMatrix directly.
        final_model <- xgb.train(
            params = best_params,
            data = dtrain,
            nrounds = best_nrounds,
            verbose = 0
        )

        # Predict on the outer test set
        predictions <- predict(final_model, dtest)

        if (objective == "multi:softprob") {
            # Multi-class: one probability per class and row. Depending on the
            # xgboost version predict() returns a matrix or a flat row-major
            # vector; handle both.
            class_prob <- if (is.matrix(predictions)) {
                predictions
            } else {
                matrix(predictions, ncol = n_classes, byrow = TRUE)
            }
            predicted_class_indices <- max.col(class_prob) - 1
            predictions <- factor(predicted_class_indices, levels = 0:(n_classes - 1), labels = original_levels)
        } else if (objective == "binary:logistic") {
            # Binary: threshold the predicted probability of the second level
            predicted_classes <- ifelse(predictions >= 0.5, 1, 0)
            predictions <- factor(predicted_classes, levels = 0:1, labels = original_levels)
        }

        # Evaluate the model
        if (target_is_factor) {
            eval <- evaluation_metrics_factor(predictions, outer_testData)
        } else {
            eval <- evaluation_metrics_cont(predictions, outer_testData)
        }

        return(eval)  # Return evaluation results for this fold
    }

    # Filter out NULL values from outer_results
    valid_results <- Filter(Negate(is.null), outer_results)

    # Summarize the evaluation metrics across folds. An explicit function is
    # used because passing `na.rm` through across()'s `...` is deprecated.
    eval_avg_outer_folds <- do.call(rbind, valid_results) %>%
        dplyr::summarise(dplyr::across(dplyr::everything(), function(x) mean(x, na.rm = TRUE)))

    return(eval_avg_outer_folds)
}

##### BN

Bayesian networks have no built-in method in `caret::train()`, so the nested cross-validation is implemented manually below.

In [None]:
# Discretize every non-factor column of a data frame into a factor.
#
# Numeric columns: exact zeros get their own level "0"; the remaining values
# are cut into equal-width bins over their range, labelled "(lower-upper]".
# The number of bins shrinks as the share of zeros grows. Missing values stay
# missing. Character and logical columns are converted to factors directly.
# Factor columns are left untouched.
#
# Args:
#   df:     data frame to discretize.
#   breaks: number of bins used for the non-zero values when at most 1/5 of
#           the non-missing values are zero (default 5).
#
# Returns:
#   `df` with every column stored as a factor.
discretize_df <- function(df, breaks = 5) {
  for (var in colnames(df)) {
    column <- df[[var]]

    if (is.factor(column)) {
      next
    }

    # Character/logical columns are already categorical; equal-width binning
    # is not defined for them.
    if (is.character(column) || is.logical(column)) {
      df[[var]] <- factor(column)
      next
    }

    observed <- !is.na(column)
    is_zero <- observed & column == 0
    is_non_zero <- observed & column != 0

    # Share of zeros among the non-missing values
    zero_proportion <- if (any(observed)) sum(is_zero) / sum(observed) else 0

    # Fewer bins for the non-zero part the more zeros the column has
    if (zero_proportion > 4/5) {
      new_breaks <- 1
    } else if (zero_proportion > 1/4) {
      new_breaks <- breaks - 2
    } else if (zero_proportion > 1/5) {
      new_breaks <- breaks - 1
    } else {
      new_breaks <- breaks
    }
    # Guard against zero or negative bin counts for small `breaks`
    new_breaks <- max(1, new_breaks)

    # Fill the result by position. (Combining zeros and bins with
    # ifelse(mask, "0", bins) would recycle the shorter `bins` vector and
    # assign bins to the wrong rows.)
    discretized <- rep(NA_character_, length(column))
    discretized[is_zero] <- "0"

    if (any(is_non_zero)) {
      non_zero_values <- column[is_non_zero]
      range_values <- range(non_zero_values)

      if (range_values[1] == range_values[2]) {
        # Only one distinct non-zero value: cut() needs unique breaks, so use
        # a single degenerate bin.
        discretized[is_non_zero] <- paste0("(", range_values[1], "-", range_values[2], "]")
      } else {
        breaks_values <- seq(range_values[1], range_values[2], length.out = new_breaks + 1)
        labels <- paste0("(", breaks_values[-length(breaks_values)], "-", breaks_values[-1], "]")
        discretized[is_non_zero] <- as.character(
          cut(non_zero_values, breaks = breaks_values, labels = labels, include.lowest = TRUE)
        )
      }
    }

    df[[var]] <- factor(discretized)
  }
  return(df)
}

In [None]:
# Inner cross-validation for Bayesian-network structure learning.
#
# For each candidate algorithm the network structure is learned and fitted on
# the inner training folds and scored by classification accuracy on the inner
# test folds. The algorithm with the highest mean accuracy is then refitted on
# ALL rows of `data`, and that fitted network is returned.
#
# Args:
#   data:       data frame passed to the bnlearn learner and to bn.fit().
#   target_var: name of the column to predict.
#   folds:      number of inner CV folds.
#   algorithms: character vector of bnlearn structure-learning function names.
#
# Returns:
#   The fitted network (result of bnlearn::bn.fit()) for the best algorithm.
inner_cv <- function(data, target_var, folds, algorithms) {
  # Create inner folds
  inner_folds <- createFolds(data[[target_var]], k = folds)

  best_performance <- -Inf
  best_algorithm <- NULL

  for (algorithm in algorithms) {
    cat("Trying algorithm:", algorithm, "\n")
    learn_structure <- get(algorithm, envir = asNamespace("bnlearn"))
    fold_results <- rep(NA_real_, length(inner_folds))

    for (i in seq_along(inner_folds)) {
      inner_test_index <- inner_folds[[i]]
      inner_trainData <- data[-inner_test_index, ]
      inner_testData <- data[inner_test_index, ]

      # Learn the structure, then fit the parameters on the training part
      bn_model <- learn_structure(inner_trainData)
      fitted_bn_model <- bnlearn::bn.fit(bn_model, inner_trainData)

      # Predict the target with likelihood weighting
      predictions <- predict(fitted_bn_model, node = target_var, data = inner_testData, method = "bayes-lw")

      # Handle missing levels in prediction
      predictions <- factor(predictions, levels = levels(inner_trainData[[target_var]]))

      fold_results[i] <- mean(predictions == inner_testData[[target_var]], na.rm = TRUE)
    }

    # Average performance for this algorithm
    avg_performance <- mean(fold_results, na.rm = TRUE)

    if (!is.na(avg_performance) && avg_performance > best_performance) {
      best_performance <- avg_performance
      best_algorithm <- algorithm
    }
  }

  if (is.null(best_algorithm)) {
    stop("No structure-learning algorithm produced a valid accuracy.")
  }

  cat("Best algorithm selected:", best_algorithm, "with accuracy:", best_performance, "\n")

  # Refit the winning algorithm on all supplied data. Returning the model from
  # the last inner fold would discard that fold's test rows from the fit.
  best_structure <- get(best_algorithm, envir = asNamespace("bnlearn"))(data)
  best_model <- bnlearn::bn.fit(best_structure, data)

  return(best_model)
}

In [None]:
# Nested cross-validation for a Bayesian-network classifier (bnlearn).
#
# All non-factor columns (including a numeric `income`) are discretized first,
# so the evaluation is always a classification evaluation.
#
# Args:
#   data:        data frame with the target column `income` and the predictors.
#   outer_folds: number of outer CV folds used for performance estimation.
#   inner_folds: number of inner CV folds passed to inner_cv().
#
# Returns:
#   One-row data frame with the classification metrics averaged over the
#   outer folds.
bn_pred <- function(data, outer_folds, inner_folds) {
  # Discretize the data
  data <- discretize_df(data)

  algorithms <- c("tabu") # without hc
  data$income <- factor(data$income, levels = unique(data$income))

  outer_cv_folds <- createFolds(data$income, k = outer_folds)

  # Setup parallel backend. detectCores() may return NA or 1, so always keep
  # at least one worker.
  num_cores <- max(1L, parallel::detectCores() - 1L, na.rm = TRUE)
  cl <- parallel::makeCluster(num_cores)
  # Release the workers and restore the sequential backend even when a fold
  # fails; otherwise the cluster processes are leaked.
  on.exit({
    parallel::stopCluster(cl)
    foreach::registerDoSEQ()
  }, add = TRUE)
  doParallel::registerDoParallel(cl)

  # Use foreach for parallelization of outer folds
  outer_results <- foreach(i = seq_along(outer_cv_folds),
                           .packages = c("caret", "dplyr", "bnlearn", "doParallel", "foreach", "parallel"),
                           .export = c("discretize_df", "inner_cv", "evaluation_metrics_factor")) %dopar% {
    outer_test_index <- outer_cv_folds[[i]]
    outer_testData <- data[outer_test_index, ]
    outer_trainData <- data[-outer_test_index, ]

    # Get the best fitted model from inner CV
    best_model <- inner_cv(outer_trainData, "income", inner_folds, algorithms)

    # Perform prediction using 'bayes-lw' method
    predictions <- predict(best_model, node = "income", data = outer_testData, method = "bayes-lw")

    # Ensure predictions use the same levels as the observed target
    predictions <- factor(predictions, levels = levels(outer_testData$income))

    evaluation_metrics_factor(predictions, outer_testData)
  }

  # Average the evaluation metrics over the outer folds. An explicit function
  # is used because passing `na.rm` through across()'s `...` is deprecated.
  eval_avg_outer_folds <- do.call(rbind, outer_results) %>%
    dplyr::summarise(dplyr::across(dplyr::everything(), function(x) mean(x, na.rm = TRUE)))

  return(eval_avg_outer_folds)
}


##### SVM

In [None]:
# Nested cross-validation for a radial-kernel SVM (caret method "svmRadial").
#
# Args:
#   data:        data frame with the target column `income` (numeric or
#                factor) and the predictors.
#   outer_folds: number of outer CV folds used for performance estimation.
#   cost_steps:  number of log-spaced cost values between 1e-3 and 1e2 in the
#                tuning grid (sigma is fixed at 0.1).
#   inner_folds: number of inner CV folds used to tune the cost.
#
# Returns:
#   One-row data frame with the evaluation metrics averaged over the outer
#   folds.
svm_pred <- function(data, outer_folds, cost_steps, inner_folds) {
    # Adjust evaluation metric to fit both numeric and factored targets
    summaryFunctionType <- if (is.numeric(data$income)) defaultSummary else multiClassSummary

    # Set control args
    outer_control <- trainControl(method = "cv", number = outer_folds,
                                  summaryFunction = summaryFunctionType,
                                  verboseIter = FALSE,
                                  allowParallel = TRUE)

    inner_control <- trainControl(method = "cv", number = inner_folds,
                                  summaryFunction = summaryFunctionType,
                                  verboseIter = FALSE,
                                  allowParallel = TRUE)

    # Define the grid for hyperparameter tuning
    cost_values <- 10^seq(log10(0.001), log10(100), length.out = cost_steps)  # Adjust the range as needed
    tunegrid <- expand.grid(C = cost_values, sigma = 0.1)  # sigma can also be tuned separately

    # Create outer CV folds
    outer_cv_folds <- createFolds(data$income, k = outer_folds)

    # Setup parallel backend for outer folds. detectCores() may return NA or 1,
    # so always keep at least one worker.
    num_cores <- max(1L, parallel::detectCores() - 1L, na.rm = TRUE)
    cl <- parallel::makeCluster(num_cores)
    # Release the workers and restore the sequential backend even when a fold
    # fails; otherwise the cluster processes are leaked.
    on.exit({
        parallel::stopCluster(cl)
        foreach::registerDoSEQ()
    }, add = TRUE)
    doParallel::registerDoParallel(cl)

    # Parallelized outer loop: Cross-validation for model evaluation
    outer_results <- foreach(i = seq_along(outer_cv_folds), .combine = rbind, .packages = c("caret", "dplyr"),
                             .export = c("evaluation_metrics_cont", "evaluation_metrics_factor")) %dopar% {

        # Split data into outer folds
        outer_test_index <- outer_cv_folds[[i]]
        outer_testData <- data[outer_test_index, ]
        outer_trainData <- data[-outer_test_index, ]

        # Hyperparameter tuning using inner CV
        model <- caret::train(income ~ .,
                              data = outer_trainData,
                              method = "svmRadial",
                              tuneGrid = tunegrid,
                              trControl = inner_control)

        # Store the best hyperparameters
        best_hyperparameters <- model$bestTune

        # Train the final model on the outer training set with the best hyperparameters
        final_model <- caret::train(income ~ .,
                                    data = outer_trainData,
                                    method = "svmRadial",
                                    trControl = outer_control,
                                    tuneGrid = best_hyperparameters)

        # Testing the final model on the outer test set
        predictions <- predict(final_model, newdata = outer_testData)

        if (is.numeric(data$income)) {
            eval <- evaluation_metrics_cont(predictions, outer_testData)
        } else if (is.factor(data$income)) {
            eval <- evaluation_metrics_factor(predictions, outer_testData)
        } else {
            stop("The predicted target has to be numeric or factor.")
        }

        # Return the evaluation metrics for this outer fold
        return(eval)
    }

    # Average the evaluation metrics over the outer folds. An explicit function
    # is used because passing `na.rm` through across()'s `...` is deprecated.
    eval_avg_outer_folds <- outer_results %>%
        dplyr::summarise(dplyr::across(dplyr::everything(), function(x) mean(x, na.rm = TRUE)))

    # Return the average evaluation metrics
    return(eval_avg_outer_folds)
}

## For CART synthesized data

In [None]:
# Synthesize `data` with synthpop and evaluate five prediction models
# (XGB, CART, RF, SVM, BN) on the synthetic data with nested cross-validation.
#
# Args:
#   data:        data frame with the target column `income` and a `sex`
#                column (visited first during synthesis).
#   outer_folds: number of outer CV folds for every model.
#   inner_folds: number of inner CV folds for every model.
#   cp_steps:    grid size for the CART complexity parameter.
#   mtry_steps, ntree_steps: grid-size arguments forwarded to rf_pred().
#   nrounds_steps, max_depth_steps, eta_steps, gamma_steps,
#   colsample_bytree_steps, min_child_weight_steps, subsample_steps:
#                accepted for compatibility; not used by this function.
#   cost_steps:  grid size for the SVM cost parameter.
#   seed:        seed for set.seed() and synthpop::syn() (default 1244).
#
# Returns:
#   Numeric target: list(regression metrics of CART/RF/XGB/SVM, BN metrics).
#   Factor target:  list(classification metrics of CART/RF/XGB/SVM/BN).
simulation <- function(data, outer_folds = 5, inner_folds = 3,
                       cp_steps = 10,  # CART params
                       mtry_steps = 10, ntree_steps = 10,  # RF params
                       nrounds_steps = 10, max_depth_steps = 10, eta_steps = 10,
                       gamma_steps = 10, colsample_bytree_steps = 10,
                       min_child_weight_steps = 10, subsample_steps = 10,  # XGB params
                       cost_steps = 2,  # SVM params
                       seed = 1244
                       ) {

  # Check if the target variable is numeric (regression) or factor (classification)
  target_is_numeric <- is.numeric(data$income)

  # Seed both the R session and the synthesizer
  set.seed(seed)

  # Create synthetic data; `sex` is synthesized first, then all other columns
  # in their original order
  syndata <- synthpop::syn(data, visit.sequence = c("sex", setdiff(colnames(data), "sex")), seed = seed)
  syndata <- syndata$syn

  # Prediction models with nested CV and grid search
  cat("XGB eval")
  XGB_eval <- xgb_pred(syndata, outer_folds, inner_folds)
  cat("CART eval")
  CART_eval <- cart_pred(syndata, outer_folds, cp_steps, inner_folds)
  cat("RF eval")
  RF_eval <- rf_pred(syndata, outer_folds, mtry_steps, ntree_steps, inner_folds)
  cat("SVM eval")
  SVM_eval <- svm_pred(syndata, outer_folds, cost_steps, inner_folds)
  cat("BN eval")
  BN_eval <- bn_pred(syndata, outer_folds, inner_folds)

  cat("CART Evaluation:\n")
  cat(capture.output(CART_eval), sep = "\n")

  cat("RF Evaluation:\n")
  cat(capture.output(RF_eval), sep = "\n")

  cat("BN Evaluation:\n")
  cat(capture.output(BN_eval), sep = "\n")

  cat("XGB Evaluation:\n")
  cat(capture.output(XGB_eval), sep = "\n")

  cat("SVM Evaluation:\n")
  cat(capture.output(SVM_eval), sep = "\n")

  # Free the synthetic data before assembling the results
  rm(syndata)
  gc()

  if (target_is_numeric) {
    # BN works on the discretized target and therefore yields classification
    # metrics; keep them separate from the regression metrics.
    eval <- rbind(CART = CART_eval, RF = RF_eval, XGB = XGB_eval, SVM = SVM_eval)
    eval_bn <- BN_eval

    results <- list(eval, eval_bn)
  } else {
    # Combine all classification metrics, including BN
    eval <- rbind(CART = CART_eval, RF = RF_eval, XGB = XGB_eval, SVM = SVM_eval, BN = BN_eval)

    results <- list(eval)
  }

  return(results)
}
  



# NOTE(review): `eval_bn_avg` is not defined anywhere in the visible part of this file, so these two
# statements would fail if executed as-is; they look like a leftover from an earlier version — confirm before removing.
rownames <- row.names(eval_bn_avg[[1]])
        row.names(eval_bn_avg) <- rownames

## For synthetic data

In [None]:
# Run the model comparison on the adult data (5 outer / 3 inner CV folds)
adult_res <- simulation(
  data = adult,
  outer_folds = 5,
  inner_folds = 3
)

In [None]:
# Run the model comparison on the CPS data (5 outer / 3 inner CV folds)
cps_res <- simulation(
  data = cpspop,
  outer_folds = 5,
  inner_folds = 3
)

In [None]:
# Display the evaluation results returned by simulation() for the adult data
adult_res

In [None]:
# Display the evaluation results returned by simulation() for the CPS data
cps_res

In [None]:
# Stop the parallel cluster
#stopCluster(cl)
#registerDoSEQ()  # Return to sequential computation

In [None]:
# Save the data
# The seed is only a local variable (`s`) inside simulation(), so it is not
# visible here. Use a global `s` when one exists, otherwise fall back to the
# value hard-coded in simulation() (1244).
seed_tag <- if (exists("s", envir = globalenv(), inherits = FALSE)) {
  get("s", envir = globalenv())
} else {
  1244
}

# save() does not create missing directories
results_dir <- file.path(here(), "results")
dir.create(results_dir, showWarnings = FALSE, recursive = TRUE)

save(cps_res, file = file.path(results_dir, paste0("cps_cart_res_", as.character(seed_tag), ".RData")))
save(adult_res, file = file.path(results_dir, paste0("adult_cart_res_", as.character(seed_tag), ".RData")))