## Libraries

In [None]:
Sys.setenv("PKG_CXXFLAGS"="-std=c++14")

# List of required packages
list_of_packages <- c(
  "synthpop", "jsonlite", "codetools", "insight", "party", "haven", "dplyr", "rpart", "rpart.plot",
  "randomForest", "pROC", "caret", "pracma", "here", "Hmisc", "purrr",
  "ranger", "bnlearn", "arulesCBA", "network", "igraph", "xgboost",
  "data.table", "RSNNS", "xgboost", "data.table", "Matrix", "e1071", "ExtDist",
  "doParallel"
)

# Attach package `p` (a single package name given as a string).
# If attaching fails for any reason (package missing, broken dependency, ...)
# the error is caught and reported through message() together with the
# underlying error text, so the script keeps running and the real cause
# stays visible instead of always being reported as "not installed".
load_if_installed <- function(p) {
  tryCatch(
    library(p, character.only = TRUE),
    error = function(e) {
      message(sprintf(
        "Package '%s' could not be loaded: %s", p, conditionMessage(e)
      ))
    }
  )
}

# Load all required packages
lapply(list_of_packages, load_if_installed)


Attache Paket: 'ExtDist'


Das folgende Objekt ist maskiert 'package:stats4':

    BIC


Das folgende Objekt ist maskiert 'package:stats':

    BIC




## Data

In [None]:
# Load the saved workspace file from the project root; it is expected to
# provide the `cpspop` data frame used below.
load(file = file.path(here(), "cpspop.RData"))
# Move race, marital and educ to the last three column positions.
cpspop <- cpspop[
  ,
  c(
    setdiff(names(cpspop), c("race", "marital", "educ")),
    "race", "marital", "educ"
  )
]

adult <- read.csv(file = file.path(here(), "adult_preprocessed.csv"))

# "?" entries are recoded to NA, then only complete rows are kept
adult[adult == "?"] <- NA
adult <- na.omit(adult)

# Store the categorical variables as factors
adult <- local({
  categorical <- c(
    "workclass", "education", "marital_status", "relationship", "race",
    "sex", "native_country", "income", "occupation"
  )
  adult[categorical] <- lapply(adult[categorical], as.factor)
  adult
})

# Keep the analysis columns in a fixed order
adult <- adult[, c(
  "age", "fnlwgt", "capital_gain", "capital_loss", "hours_per_week",
  "income", "sex", "race", "relationship", "marital_status",
  "workclass", "occupation", "education", "native_country"
)]


## Evaluation Functions

In [None]:
## Calculate evaluation metrics for continuous targets
##
## Arguments:
##   predictions  numeric vector of predicted values, one per row of test_set.
##   test_set     data frame whose numeric column `income` holds the true values.
##
## Returns a one-row data frame with the columns MAE, MSE, RMSE, R_squared and
## MAPE. R_squared is NA when the target has zero variance; MAPE is NA when
## any true value is zero (the percentage error is undefined there).
## Stops if predictions and targets differ in length.
evaluation_metrics_cont <- function(predictions, test_set) {
    actual <- test_set$income

    # Without this check R would silently recycle the shorter vector and
    # report metrics for misaligned pairs.
    if (length(predictions) != length(actual)) {
        stop("Length of predictions does not match number of test samples.")
    }

    # Residuals
    residuals <- predictions - actual

    # Mean Absolute Error (MAE)
    MAE <- mean(abs(residuals))

    # Mean Squared Error (MSE) and Root Mean Squared Error (RMSE)
    MSE <- mean(residuals^2)
    RMSE <- sqrt(MSE)

    # R-squared: guard against zero variance in the target.
    # NA_real_ keeps the column numeric so fold results can be averaged.
    SS_res <- sum(residuals^2)
    SS_tot <- sum((actual - mean(actual))^2)
    R_squared <- if (is.na(SS_tot) || SS_tot == 0) {
        NA_real_
    } else {
        1 - (SS_res / SS_tot)
    }

    # Mean Absolute Percentage Error (MAPE): undefined if a true value is zero
    has_zero_target <- any(actual == 0)
    MAPE <- if (is.na(has_zero_target) || has_zero_target) {
        NA_real_
    } else {
        mean(abs(residuals / actual)) * 100
    }

    data.frame(
        MAE = MAE,
        MSE = MSE,
        RMSE = RMSE,
        R_squared = R_squared,
        MAPE = MAPE
    )
}

In [None]:
## Calculate evaluation metrics for factored targets
##
## Arguments:
##   predictions  predicted classes (factor, or anything as.character() can label).
##   test_set     object coercible to a data frame whose column `income` holds
##                the true classes.
##
## Returns a one-row data frame with Accuracy, F1, Sensitivity and Specificity
## taken from caret::confusionMatrix(). With more than two classes the
## per-class F1/Sensitivity/Specificity values are averaged (NAs ignored).
evaluation_metrics_factor <- function(predictions, test_set) {
    # Ensure test_set is a data frame
    test_set <- as.data.frame(test_set)

    reference <- as.factor(test_set$income)
    reference_levels <- levels(reference)

    # Align predictions with the reference by LABEL. Overwriting the levels
    # positionally would silently rename classes whenever the predictions do
    # not contain every class (e.g. c("b", "c") would become c("a", "b")).
    prediction_labels <- as.character(predictions)
    labels_known <- is.na(prediction_labels) |
        prediction_labels %in% reference_levels
    if (all(labels_known)) {
        predictions <- factor(prediction_labels, levels = reference_levels)
    } else {
        # Labels that do not occur in the reference (e.g. numeric class codes):
        # keep the previous positional mapping, but make it visible.
        warning(
            "Prediction labels do not match the reference levels; ",
            "mapping levels by position.",
            call. = FALSE
        )
        predictions <- as.factor(predictions)
        levels(predictions) <- reference_levels
    }

    # Confusion matrix of predictions against the true classes
    cm <- caret::confusionMatrix(predictions, reference, mode = "everything")

    # Saving evaluation metrics
    accuracy <- cm$overall['Accuracy']

    if (length(reference_levels) == 2) {
        # Binary classification: byClass is a named vector
        f1 <- cm$byClass['F1']
        sens <- cm$byClass['Sensitivity']
        spec <- cm$byClass['Specificity']
    } else {
        # Multi-class classification: byClass has one row per class;
        # average each metric over the classes
        f1 <- mean(cm$byClass[, 'F1'], na.rm = TRUE)
        sens <- mean(cm$byClass[, 'Sensitivity'], na.rm = TRUE)
        spec <- mean(cm$byClass[, 'Specificity'], na.rm = TRUE)
    }

    data.frame(
        Accuracy = accuracy,
        F1 = f1,
        Sensitivity = sens,
        Specificity = spec
    )
}

## XGB pred function

In [None]:
# Nested cross-validation of an xgboost model for `data$income`, with the
# outer folds evaluated in parallel through foreach / %dopar%.
#
# Arguments:
#   data         data frame holding the target column `income` plus predictors.
#   outer_folds  number of outer folds passed to createFolds().
#   inner_folds  number of folds xgb.cv() uses for the hyperparameter search.
#
# For every outer fold each row of the tuning grid is scored with xgb.cv() on
# the outer training data; the best combination (lowest mean test metric) and
# its best boosting round are used to fit the final model, which is evaluated
# on the held-out fold.
#
# Returns a one-row data frame with the evaluation metrics averaged over the
# outer folds (evaluation_metrics_factor() for factor targets,
# evaluation_metrics_cont() otherwise).
xgb_pred <- function(data, outer_folds, inner_folds) {
    # Save the original factor levels of the target variable (income)
    original_levels <- levels(data$income)

    # Create outer folds once before the loop
    outer_cv_folds <- createFolds(data$income, k = outer_folds)

    # Resolve the learning task once instead of inside every grid iteration
    n_target_values <- length(unique(data$income))
    is_regression <- is.numeric(data$income)
    is_multiclass <- is.factor(data$income) && n_target_values > 2
    is_binary <- is.factor(data$income) && n_target_values == 2
    if (is_regression) {
        objective <- "reg:squarederror"
        eval_metric <- "rmse"
    } else if (n_target_values == 2) {
        objective <- "binary:logistic"
        eval_metric <- "logloss"
    } else {
        objective <- "multi:softprob"
        eval_metric <- "mlogloss"
    }
    # The xgb.cv log column is named after the metric in use
    # (test_rmse_mean, test_logloss_mean, test_mlogloss_mean); reading
    # test_rmse_mean unconditionally finds nothing for classification.
    metric_column <- paste0("test_", eval_metric, "_mean")
    # Labels are the 0-based level codes, so the class count must cover every
    # level even when a class is absent from a training fold.
    num_classes <- length(original_levels)

    # Define the grid of hyperparameters to tune
    tunegrid <- expand.grid(
        max_depth = c(3, 5, 7),
        eta = c(0.05, 0.1),
        gamma = c(0, 0.1),
        subsample = 0.8,
        colsample_bytree = 0.8
    )

    # Initialize parallel backend: all cores except one, at least one worker,
    # and never more workers than there are folds to process
    num_cores <- parallel::detectCores() - 1
    if (is.na(num_cores) || num_cores < 1) {
        num_cores <- 1
    }
    num_cores <- max(1, min(num_cores, length(outer_cv_folds)))
    cl <- makeCluster(num_cores)
    # Shut the workers down even if a fold fails
    on.exit(stopCluster(cl), add = TRUE)
    registerDoParallel(cl)

    # Outer cross-validation loop using foreach for parallel processing
    outer_results <- foreach(
        i = seq_along(outer_cv_folds),
        .packages = c('xgboost', 'caret', 'dplyr'),
        .export = c("evaluation_metrics_cont", "evaluation_metrics_factor")
    ) %dopar% {
        # Split data into outer folds
        outer_test_index <- outer_cv_folds[[i]]
        outer_testData <- data[outer_test_index, ]
        outer_trainData <- data[-outer_test_index, ]

        # Prepare the training and validation matrices
        # (drop = FALSE keeps a data frame when only one predictor exists)
        feature_cols <- !colnames(data) %in% 'income'
        train_X <- data.matrix(outer_trainData[, feature_cols, drop = FALSE])
        train_y <- outer_trainData$income
        val_X <- data.matrix(outer_testData[, feature_cols, drop = FALSE])
        val_y <- outer_testData$income

        message("train_X dimensions: ", paste(dim(train_X), collapse = " x "))
        message("train_y length: ", length(train_y))
        message("val_X dimensions: ", paste(dim(val_X), collapse = " x "))
        message("val_y length: ", length(val_y))

        if (is.factor(train_y)) {
            # Row subsets keep the full level set, so the level codes are
            # consistent across folds; convert to numeric starting from 0
            train_y <- as.numeric(train_y) - 1
            val_y <- as.numeric(val_y) - 1
        }

        # Handle missing values
        train_X[is.na(train_X)] <- 0
        val_X[is.na(val_X)] <- 0

        # Create the DMatrix required for xgboost
        dtrain <- xgb.DMatrix(data = train_X, label = train_y)
        dtest <- xgb.DMatrix(data = val_X, label = val_y)

        best_params <- list()
        best_eval_metric <- Inf
        best_nrounds <- 200

        # Iterate over all combinations of hyperparameters in tunegrid
        for (j in seq_len(nrow(tunegrid))) {
            params <- list(
                max_depth = tunegrid$max_depth[j],
                eta = tunegrid$eta[j],
                gamma = tunegrid$gamma[j],
                subsample = tunegrid$subsample[j],
                colsample_bytree = tunegrid$colsample_bytree[j],
                objective = objective,
                eval_metric = eval_metric,
                # one thread per worker: the folds already run in parallel
                nthread = 1
            )

            # Include num_class only for multi-class classification
            if (is_multiclass) {
                params$num_class <- num_classes
            }

            # Perform inner cross-validation using xgb.cv
            cv_model <- xgb.cv(
                params = params,
                data = dtrain,
                nrounds = 200,
                nfold = inner_folds,
                verbose = FALSE
            )

            # Mean test metric per boosting round (lower is better for all
            # three metrics used here)
            cv_scores <- cv_model$evaluation_log[[metric_column]]
            if (length(cv_scores) == 0 || all(is.na(cv_scores))) {
                next
            }
            current_eval_metric <- min(cv_scores, na.rm = TRUE)

            # If this model is better, update the best parameters
            if (current_eval_metric < best_eval_metric) {
                best_eval_metric <- current_eval_metric
                best_params <- params
                best_nrounds <- which.min(cv_scores)
            }
        }

        # Training with an empty parameter list would silently fit a default
        # model that does not match the task
        if (length(best_params) == 0) {
            stop("No hyperparameter combination produced a valid cross-validation score.")
        }

        # Train the final model on the outer training set with the best
        # hyperparameters (xgb.train is the DMatrix-based training entry point)
        final_model <- xgb.train(
            params = best_params,
            data = dtrain,
            nrounds = best_nrounds,
            verbose = 0
        )

        # Predict on the outer test set
        predictions <- predict(final_model, dtest)

        if (is_multiclass) {
            # Class probabilities may come back as an n x num_class matrix or
            # as a flat vector stored row by row, depending on the xgboost
            # version; accept both
            prob_matrix <- if (is.matrix(predictions)) {
                predictions
            } else {
                matrix(predictions, ncol = num_classes, byrow = TRUE)
            }
            # ties.method = "first" keeps tie-breaking deterministic
            predicted_class_indices <- max.col(prob_matrix, ties.method = "first") - 1
            predictions <- factor(
                predicted_class_indices,
                levels = 0:(num_classes - 1),
                labels = original_levels
            )
        } else if (is_binary) {
            # Binary classification: threshold the predicted probability at 0.5
            predicted_classes <- ifelse(predictions >= 0.5, 1, 0)
            predictions <- factor(predicted_classes, levels = 0:1, labels = original_levels)
        }

        # Evaluate the model; the value of this expression is the fold result
        if (is.factor(data$income)) {
            evaluation_metrics_factor(predictions, outer_testData)
        } else {
            evaluation_metrics_cont(predictions, outer_testData)
        }
    }

    # Filter out NULL values from outer_results
    valid_results <- Filter(Negate(is.null), outer_results)

    # Summarize the evaluation metrics across folds
    eval_avg_outer_folds <- do.call(rbind, valid_results) %>%
        dplyr::summarise(
            dplyr::across(dplyr::everything(), function(x) mean(x, na.rm = TRUE))
        )

    return(eval_avg_outer_folds)
}

In [None]:
# old function
#
# Nested cross-validation of an xgboost model for `data$income` (sequential,
# with progress messages and a fallback parameter set).
#
# Arguments:
#   data         data frame holding the target column `income` plus predictors.
#   outer_folds  number of outer folds passed to createFolds().
#   inner_folds  number of folds xgb.cv() uses for the hyperparameter search.
#
# Returns a one-row data frame with the evaluation metrics averaged over the
# outer folds. Stops if the target is neither numeric nor a factor, if the
# number of predictions does not match the test fold, or if no fold produced
# a result.
xgb_pred <- function(data, outer_folds, inner_folds) {

    # Validate the target before any model is trained
    if (!is.numeric(data$income) && !is.factor(data$income)) {
        stop("The predicted target has to be numeric or factor.")
    }

    # Save the original factor levels of the target variable (income)
    original_levels <- levels(data$income)

    # Resolve the learning task once instead of inside every grid iteration
    n_target_values <- length(unique(data$income))
    is_regression <- is.numeric(data$income)
    is_multiclass <- is.factor(data$income) && n_target_values > 2
    is_binary <- is.factor(data$income) && n_target_values == 2
    if (is_regression) {
        objective <- "reg:squarederror"
        eval_metric <- "rmse"
    } else if (n_target_values == 2) {
        objective <- "binary:logistic"
        eval_metric <- "logloss"
    } else {
        objective <- "multi:softprob"
        eval_metric <- "mlogloss"
    }
    # The xgb.cv log column is named after the metric in use
    # (test_rmse_mean, test_logloss_mean, test_mlogloss_mean); reading
    # test_rmse_mean unconditionally finds nothing for classification.
    metric_column <- paste0("test_", eval_metric, "_mean")
    # Labels are the 0-based level codes, so the class count must cover every
    # level even when a class is absent from a training fold.
    num_classes <- length(original_levels)

    # Render a parameter list as "name=value, ..." for log messages
    # (message() on a raw list does not produce a readable line)
    describe_params <- function(p) {
        paste(names(p), vapply(p, as.character, character(1)), sep = "=", collapse = ", ")
    }

    # Initialize variables to store results
    outer_cv_folds <- createFolds(data$income, k = outer_folds)
    outer_results <- vector("list", length(outer_cv_folds))

    # Define the grid of hyperparameters to tune
    tunegrid <- expand.grid(
        max_depth = c(3, 5, 7),
        eta = c(0.05, 0.1),
        gamma = c(0, 0.1),
        subsample = 0.8,
        colsample_bytree = 0.8
    )

    feature_cols <- !colnames(data) %in% 'income'

    for (i in seq_along(outer_cv_folds)) {

        # Split data into outer folds
        outer_test_index <- outer_cv_folds[[i]]
        outer_testData <- data[outer_test_index, ]
        outer_trainData <- data[-outer_test_index, ]

        # Prepare the training and validation matrices
        # (drop = FALSE keeps a data frame when only one predictor exists)
        train_X <- data.matrix(outer_trainData[, feature_cols, drop = FALSE])
        train_y <- outer_trainData$income
        val_X <- data.matrix(outer_testData[, feature_cols, drop = FALSE])
        val_y <- outer_testData$income

        if (is.factor(train_y)) {
            # Row subsets keep the full level set, so the level codes are
            # consistent across folds; convert to numeric starting from 0
            train_y <- as.numeric(train_y) - 1
            val_y <- as.numeric(val_y) - 1
        }

        # Handle missing values
        train_X[is.na(train_X)] <- 0
        val_X[is.na(val_X)] <- 0

        # Create the DMatrix required for xgboost
        dtrain <- xgb.DMatrix(data = train_X, label = train_y)
        dtest <- xgb.DMatrix(data = val_X, label = val_y)

        best_params <- list()
        best_eval_metric <- Inf
        best_nrounds <- 200

        # Iterate over all combinations of hyperparameters in tunegrid
        for (j in seq_len(nrow(tunegrid))) {
            params <- list(
                max_depth = tunegrid$max_depth[j],
                eta = tunegrid$eta[j],
                gamma = tunegrid$gamma[j],
                subsample = tunegrid$subsample[j],
                colsample_bytree = tunegrid$colsample_bytree[j],
                objective = objective,
                eval_metric = eval_metric
            )

            # Include num_class only for multi-class classification
            if (is_multiclass) {
                params$num_class <- num_classes
            }

            # Perform inner cross-validation using xgb.cv; a failing
            # combination is logged and skipped instead of aborting the run
            message("Trying params: ", describe_params(params))
            cv_model <- tryCatch(
                xgb.cv(
                    params = params,
                    data = dtrain,
                    nrounds = 200,
                    nfold = inner_folds,
                    verbose = FALSE
                ),
                error = function(e) {
                    message(
                        "Inner CV failed for params: ", describe_params(params),
                        " (", conditionMessage(e), ")"
                    )
                    NULL
                }
            )
            if (is.null(cv_model)) {
                next
            }

            # Mean test metric per boosting round (lower is better for all
            # three metrics used here)
            cv_scores <- cv_model$evaluation_log[[metric_column]]
            if (length(cv_scores) == 0 || all(is.na(cv_scores))) {
                next
            }
            current_eval_metric <- min(cv_scores, na.rm = TRUE)

            # If this model is better, update the best parameters
            if (current_eval_metric < best_eval_metric) {
                best_eval_metric <- current_eval_metric
                best_params <- params
                best_nrounds <- which.min(cv_scores)
            }
        }

        # Ensure best_params is not empty; the fallback uses the objective and
        # metric of the actual task rather than assuming binary classification
        if (length(best_params) == 0) {
            message("No valid parameters found, using default parameters.")
            best_params <- list(
                max_depth = 3,
                eta = 0.1,
                gamma = 0,
                subsample = 0.8,
                colsample_bytree = 0.8,
                objective = objective,
                eval_metric = eval_metric
            )
            if (is_multiclass) {
                best_params$num_class <- num_classes
            }
            best_nrounds <- 200
        }

        # Train the final model on the outer training set with the best
        # hyperparameters (xgb.train is the DMatrix-based training entry point)
        final_model <- xgb.train(
            params = best_params,
            data = dtrain,
            nrounds = best_nrounds,
            verbose = 0
        )

        # Predict on the outer test set
        predictions <- predict(final_model, dtest)

        if (is_multiclass) {
            # Class probabilities may come back as an n x num_class matrix or
            # as a flat vector stored row by row, depending on the xgboost
            # version; accept both
            prob_matrix <- if (is.matrix(predictions)) {
                predictions
            } else {
                matrix(predictions, ncol = num_classes, byrow = TRUE)
            }
            # ties.method = "first" keeps tie-breaking deterministic
            predicted_class_indices <- max.col(prob_matrix, ties.method = "first") - 1
            predictions <- factor(
                predicted_class_indices,
                levels = 0:(num_classes - 1),
                labels = original_levels
            )
        } else if (is_binary) {
            # Binary classification: threshold the predicted probability at 0.5
            predicted_classes <- ifelse(predictions >= 0.5, 1, 0)
            predictions <- factor(predicted_classes, levels = 0:1, labels = original_levels)
        }

        # Ensure predictions and true labels have the same length
        if (length(predictions) != nrow(outer_testData)) {
            stop("Length of predictions does not match number of test samples.")
        }

        # Evaluate the model
        if (is.factor(data$income)) {
            fold_metrics <- evaluation_metrics_factor(predictions, outer_testData)
        } else {
            fold_metrics <- evaluation_metrics_cont(predictions, outer_testData)
        }

        outer_results[[i]] <- fold_metrics
    }

    # Filter out NULL values from outer_results
    valid_results <- Filter(Negate(is.null), outer_results)

    # Check if there are valid results to summarize
    if (length(valid_results) == 0) {
        stop("No valid model results were obtained.")
    }

    # Summarize the evaluation metrics across folds
    eval_avg_outer_folds <- do.call(rbind, valid_results) %>%
        dplyr::summarise(
            dplyr::across(dplyr::everything(), function(x) mean(x, na.rm = TRUE))
        )

    return(eval_avg_outer_folds)

}

In [None]:
# no parallelisation, but otherwise same
#
# Nested cross-validation of an xgboost model for `data$income`, run
# sequentially and printing the matrix dimensions of every outer fold.
#
# Arguments:
#   data         data frame holding the target column `income` plus predictors.
#   outer_folds  number of outer folds passed to createFolds().
#   inner_folds  number of folds xgb.cv() uses for the hyperparameter search.
#
# Returns a one-row data frame with the evaluation metrics averaged over the
# outer folds (evaluation_metrics_factor() for factor targets,
# evaluation_metrics_cont() otherwise).
xgb_pred <- function(data, outer_folds, inner_folds) {
    # Save the original factor levels of the target variable (income)
    original_levels <- levels(data$income)

    # Create outer folds once before the loop
    outer_cv_folds <- createFolds(data$income, k = outer_folds)

    # Resolve the learning task once instead of inside every grid iteration
    n_target_values <- length(unique(data$income))
    is_regression <- is.numeric(data$income)
    is_multiclass <- is.factor(data$income) && n_target_values > 2
    is_binary <- is.factor(data$income) && n_target_values == 2
    if (is_regression) {
        objective <- "reg:squarederror"
        eval_metric <- "rmse"
    } else if (n_target_values == 2) {
        objective <- "binary:logistic"
        eval_metric <- "logloss"
    } else {
        objective <- "multi:softprob"
        eval_metric <- "mlogloss"
    }
    # The xgb.cv log column is named after the metric in use
    # (test_rmse_mean, test_logloss_mean, test_mlogloss_mean); reading
    # test_rmse_mean unconditionally finds nothing for classification.
    metric_column <- paste0("test_", eval_metric, "_mean")
    # Labels are the 0-based level codes, so the class count must cover every
    # level even when a class is absent from a training fold.
    num_classes <- length(original_levels)

    # Define the grid of hyperparameters to tune
    tunegrid <- expand.grid(
        max_depth = c(3, 5, 7),
        eta = c(0.05, 0.1),
        gamma = c(0, 0.1),
        subsample = 0.8,
        colsample_bytree = 0.8
    )

    outer_results <- vector("list", length(outer_cv_folds))
    feature_cols <- !colnames(data) %in% 'income'

    # Outer cross-validation loop using a regular for loop
    for (i in seq_along(outer_cv_folds)) {
        # Split data into outer folds
        outer_test_index <- outer_cv_folds[[i]]
        outer_testData <- data[outer_test_index, ]
        outer_trainData <- data[-outer_test_index, ]

        # Prepare the training and validation matrices
        # (drop = FALSE keeps a data frame when only one predictor exists)
        train_X <- data.matrix(outer_trainData[, feature_cols, drop = FALSE])
        train_y <- outer_trainData$income
        val_X <- data.matrix(outer_testData[, feature_cols, drop = FALSE])
        val_y <- outer_testData$income

        # Print debugging information
        message("train_X dimensions: ", paste(dim(train_X), collapse = " x "))
        message("train_y length: ", length(train_y))
        message("val_X dimensions: ", paste(dim(val_X), collapse = " x "))
        message("val_y length: ", length(val_y))

        if (is.factor(train_y)) {
            # Row subsets keep the full level set, so the level codes are
            # consistent across folds; convert to numeric starting from 0
            train_y <- as.numeric(train_y) - 1
            val_y <- as.numeric(val_y) - 1
        }

        # Handle missing values
        train_X[is.na(train_X)] <- 0
        val_X[is.na(val_X)] <- 0

        # Create the DMatrix required for xgboost
        dtrain <- xgb.DMatrix(data = train_X, label = train_y)
        dtest <- xgb.DMatrix(data = val_X, label = val_y)

        best_params <- list()
        best_eval_metric <- Inf
        best_nrounds <- 200

        # Iterate over all combinations of hyperparameters in tunegrid
        for (j in seq_len(nrow(tunegrid))) {
            params <- list(
                max_depth = tunegrid$max_depth[j],
                eta = tunegrid$eta[j],
                gamma = tunegrid$gamma[j],
                subsample = tunegrid$subsample[j],
                colsample_bytree = tunegrid$colsample_bytree[j],
                objective = objective,
                eval_metric = eval_metric
            )

            # Include num_class only for multi-class classification
            if (is_multiclass) {
                params$num_class <- num_classes
            }

            # Perform inner cross-validation using xgb.cv
            cv_model <- xgb.cv(
                params = params,
                data = dtrain,
                nrounds = 200,
                nfold = inner_folds,
                verbose = FALSE
            )

            # Mean test metric per boosting round (lower is better for all
            # three metrics used here)
            cv_scores <- cv_model$evaluation_log[[metric_column]]
            if (length(cv_scores) == 0 || all(is.na(cv_scores))) {
                next
            }
            current_eval_metric <- min(cv_scores, na.rm = TRUE)

            # If this model is better, update the best parameters
            if (current_eval_metric < best_eval_metric) {
                best_eval_metric <- current_eval_metric
                best_params <- params
                best_nrounds <- which.min(cv_scores)
            }
        }

        # Training with an empty parameter list would silently fit a default
        # model that does not match the task
        if (length(best_params) == 0) {
            stop("No hyperparameter combination produced a valid cross-validation score.")
        }

        # Train the final model on the outer training set with the best
        # hyperparameters (xgb.train is the DMatrix-based training entry point)
        final_model <- xgb.train(
            params = best_params,
            data = dtrain,
            nrounds = best_nrounds,
            verbose = 0
        )

        # Predict on the outer test set
        predictions <- predict(final_model, dtest)

        if (is_multiclass) {
            # Class probabilities may come back as an n x num_class matrix or
            # as a flat vector stored row by row, depending on the xgboost
            # version; accept both
            prob_matrix <- if (is.matrix(predictions)) {
                predictions
            } else {
                matrix(predictions, ncol = num_classes, byrow = TRUE)
            }
            # ties.method = "first" keeps tie-breaking deterministic
            predicted_class_indices <- max.col(prob_matrix, ties.method = "first") - 1
            predictions <- factor(
                predicted_class_indices,
                levels = 0:(num_classes - 1),
                labels = original_levels
            )
        } else if (is_binary) {
            # Binary classification: threshold the predicted probability at 0.5
            predicted_classes <- ifelse(predictions >= 0.5, 1, 0)
            predictions <- factor(predicted_classes, levels = 0:1, labels = original_levels)
        }

        # Evaluate the model
        if (is.factor(data$income)) {
            fold_metrics <- evaluation_metrics_factor(predictions, outer_testData)
        } else {
            fold_metrics <- evaluation_metrics_cont(predictions, outer_testData)
        }

        # Store the evaluation result for this fold
        outer_results[[i]] <- fold_metrics
    }

    # Summarize the evaluation metrics across folds
    eval_avg_outer_folds <- do.call(rbind, outer_results) %>%
        dplyr::summarise(
            dplyr::across(dplyr::everything(), function(x) mean(x, na.rm = TRUE))
        )

    return(eval_avg_outer_folds)
}

In [None]:
# new with fixes
#
# Nested cross-validation of an xgboost model for `data$income`.
#
# Arguments:
#   data         data frame holding the target column `income` plus predictors.
#   outer_folds  number of outer folds passed to createFolds().
#   inner_folds  number of folds xgb.cv() uses for the hyperparameter search.
#
# For every outer fold each row of the tuning grid is scored with xgb.cv() on
# the outer training data; the best combination (lowest mean test metric) and
# its best boosting round are used to fit the final model, which is evaluated
# on the held-out fold.
#
# Returns a one-row data frame with the evaluation metrics averaged over the
# outer folds (evaluation_metrics_factor() for factor targets,
# evaluation_metrics_cont() otherwise). Stops if no grid row yields a valid
# CV score or if the predictions do not match the test fold in length.
xgb_pred <- function(data, outer_folds, inner_folds) {
    original_levels <- levels(data$income)
    outer_cv_folds <- createFolds(data$income, k = outer_folds)

    # Resolve the learning task once instead of inside every grid iteration
    n_target_values <- length(unique(data$income))
    is_regression <- is.numeric(data$income)
    is_multiclass <- is.factor(data$income) && n_target_values > 2
    is_binary <- is.factor(data$income) && n_target_values == 2
    if (is_regression) {
        objective <- "reg:squarederror"
        eval_metric <- "rmse"
    } else if (n_target_values == 2) {
        objective <- "binary:logistic"
        eval_metric <- "logloss"
    } else {
        objective <- "multi:softprob"
        eval_metric <- "mlogloss"
    }
    # The xgb.cv log column is named after the metric in use
    # (test_rmse_mean, test_logloss_mean, test_mlogloss_mean); reading
    # test_rmse_mean unconditionally finds nothing for classification.
    metric_column <- paste0("test_", eval_metric, "_mean")
    # Labels are the 0-based level codes, so the class count must cover every
    # level even when a class is absent from a training fold.
    num_classes <- length(original_levels)

    tunegrid <- expand.grid(
        max_depth = c(3, 5, 7),
        eta = c(0.05, 0.1),
        gamma = c(0, 0.1),
        subsample = 0.8,
        colsample_bytree = 0.8
    )

    outer_results <- vector("list", length(outer_cv_folds))
    feature_cols <- !colnames(data) %in% 'income'

    for (i in seq_along(outer_cv_folds)) {
        outer_test_index <- outer_cv_folds[[i]]
        outer_testData <- data[outer_test_index, ]
        outer_trainData <- data[-outer_test_index, ]

        # drop = FALSE keeps a data frame when only one predictor exists
        train_X <- data.matrix(outer_trainData[, feature_cols, drop = FALSE])
        train_y <- outer_trainData$income
        val_X <- data.matrix(outer_testData[, feature_cols, drop = FALSE])
        val_y <- outer_testData$income

        if (is.factor(train_y)) {
            # Row subsets keep the full level set, so the level codes are
            # consistent across folds; convert to numeric starting from 0
            train_y <- as.numeric(train_y) - 1
            val_y <- as.numeric(val_y) - 1
        }

        # Missing predictor values are replaced by 0
        train_X[is.na(train_X)] <- 0
        val_X[is.na(val_X)] <- 0

        dtrain <- xgb.DMatrix(data = train_X, label = train_y)
        dtest <- xgb.DMatrix(data = val_X, label = val_y)

        best_params <- list()
        best_eval_metric <- Inf
        best_nrounds <- 200

        for (j in seq_len(nrow(tunegrid))) {
            params <- list(
                max_depth = tunegrid$max_depth[j],
                eta = tunegrid$eta[j],
                gamma = tunegrid$gamma[j],
                subsample = tunegrid$subsample[j],
                colsample_bytree = tunegrid$colsample_bytree[j],
                objective = objective,
                eval_metric = eval_metric
            )

            if (is_multiclass) {
                params$num_class <- num_classes
            }

            cv_model <- xgb.cv(
                params = params,
                data = dtrain,
                nrounds = 200,
                nfold = inner_folds,
                verbose = FALSE
            )

            # Mean test metric per boosting round (lower is better for all
            # three metrics used here); skip combinations without a score
            cv_scores <- cv_model$evaluation_log[[metric_column]]
            if (length(cv_scores) > 0 && !all(is.na(cv_scores))) {
                current_eval_metric <- min(cv_scores, na.rm = TRUE)
            } else {
                current_eval_metric <- Inf
            }

            if (current_eval_metric < best_eval_metric) {
                best_eval_metric <- current_eval_metric
                best_params <- params
                best_nrounds <- which.min(cv_scores)
            }
        }

        # Training with an empty parameter list would silently fit a default
        # model that does not match the task
        if (length(best_params) == 0) {
            stop("No hyperparameter combination produced a valid cross-validation score.")
        }

        # xgb.train is the DMatrix-based training entry point
        final_model <- xgb.train(
            params = best_params,
            data = dtrain,
            nrounds = best_nrounds,
            verbose = 0
        )

        predictions <- predict(final_model, dtest)

        if (is_multiclass) {
            # Class probabilities may come back as an n x num_class matrix or
            # as a flat vector stored row by row, depending on the xgboost
            # version; accept both
            if (is.matrix(predictions)) {
                prob_matrix <- predictions
            } else if (length(predictions) %% num_classes == 0) {
                prob_matrix <- matrix(predictions, ncol = num_classes, byrow = TRUE)
            } else {
                stop("Prediction length does not match the expected number of classes.")
            }
            # ties.method = "first" keeps tie-breaking deterministic
            predicted_class_indices <- max.col(prob_matrix, ties.method = "first") - 1
            predictions <- factor(
                predicted_class_indices,
                levels = 0:(num_classes - 1),
                labels = original_levels
            )
        } else if (is_binary) {
            # Threshold the predicted probability at 0.5
            predicted_classes <- ifelse(predictions >= 0.5, 1, 0)
            predictions <- factor(predicted_classes, levels = 0:1, labels = original_levels)
        }

        if (length(predictions) != nrow(outer_testData)) {
            stop("Mismatch between predictions and test data lengths.")
        }

        if (is.factor(data$income)) {
            fold_metrics <- evaluation_metrics_factor(predictions, outer_testData)
        } else {
            fold_metrics <- evaluation_metrics_cont(predictions, outer_testData)
        }

        outer_results[[i]] <- fold_metrics
    }

    # Average every metric over the outer folds
    eval_avg_outer_folds <- do.call(rbind, outer_results) %>%
        dplyr::summarise(
            dplyr::across(dplyr::everything(), function(x) mean(x, na.rm = TRUE))
        )

    return(eval_avg_outer_folds)
}


## Discretize df

In [None]:
# Turn every column of `df` into a factor.
#
# Arguments:
#   df      data frame to discretize.
#   breaks  number of equal-width bins for the non-zero values of a numeric
#           column (default 5). Columns with many zeros get fewer bins:
#           more than 4/5 zeros -> 1 bin, more than 1/4 -> breaks - 2,
#           more than 1/5 -> breaks - 1 (never fewer than 1 bin).
#
# Per column:
#   * factors are left untouched;
#   * other non-numeric columns are converted with as.factor();
#   * numeric columns: zeros become the level "0", non-zero values are cut
#     into equal-width bins over their range and labelled "(lower-upper]",
#     missing values stay NA.
#
# Returns `df` with the same rows and column names.
discretize_df <- function(df, breaks = 5) {
  for (var in colnames(df)) {
    column <- df[[var]]

    if (is.factor(column)) {
      next
    }
    if (!is.numeric(column)) {
      # cut() only works on numeric input; categorical data just becomes a factor
      df[[var]] <- as.factor(column)
      next
    }

    is_zero <- !is.na(column) & column == 0
    is_non_zero <- !is.na(column) & column != 0

    # Share of zeros among the observed (non-missing) values
    n_observed <- sum(!is.na(column))
    zero_proportion <- if (n_observed > 0) sum(is_zero) / n_observed else 0

    # Determine the number of bins based on the zero proportion
    if (zero_proportion > 4 / 5) {
      new_breaks <- 1
    } else if (zero_proportion > 1 / 4) {
      new_breaks <- breaks - 2
    } else if (zero_proportion > 1 / 5) {
      new_breaks <- breaks - 1
    } else {
      new_breaks <- breaks
    }
    # A small `breaks` must not result in zero or negative bins
    new_breaks <- max(1, new_breaks)

    # Fill the result by position so every label stays on its own row
    # (combining a full-length mask with the shorter vector of non-zero
    # labels through ifelse() recycles the labels and shifts them)
    discretized <- rep(NA_character_, length(column))
    discretized[is_zero] <- "0"

    if (any(is_non_zero)) {
      non_zero_values <- column[is_non_zero]
      value_range <- range(non_zero_values)

      if (value_range[1] == value_range[2]) {
        # A single distinct non-zero value: cut() would reject the
        # non-unique breaks, so use one bin directly
        discretized[is_non_zero] <- paste0(
          "(", value_range[1], "-", value_range[2], "]"
        )
      } else {
        breaks_values <- seq(
          value_range[1], value_range[2],
          length.out = new_breaks + 1
        )
        labels <- paste0(
          "(", breaks_values[-length(breaks_values)],
          "-", breaks_values[-1], "]"
        )
        # include.lowest = TRUE puts the minimum into the first bin
        discretized[is_non_zero] <- as.character(cut(
          non_zero_values,
          breaks = breaks_values,
          labels = labels,
          include.lowest = TRUE
        ))
      }
    }

    df[[var]] <- factor(discretized)
  }
  df
}

## Apply

In [None]:
# Discretize the non-factor columns of cpspop with the default `breaks = 5`
dis_data <- discretize_df(cpspop)

In [None]:
# Nested cross-validation on the discretized data: 5 outer folds for
# evaluation, 3 inner folds for the hyperparameter search. `res` holds the
# metrics averaged over the outer folds.
# NOTE(review): no set.seed() call is visible before the fold creation, so
# results presumably vary between runs — confirm whether that is intended.
res <- xgb_pred(dis_data, outer_folds = 5, inner_folds = 3)