# RF prediction function

### Packages

In [1]:
# Set the library path
#.libPaths("/user/emma.foessing01/u11969/new_R_libs")
Sys.setenv("PKG_CXXFLAGS"="-std=c++14")

# List of required packages
list_of_packages <- c(
  "synthpop", "jsonlite", "codetools", "insight", "party", "haven", "dplyr", "rpart", "rpart.plot",
  "randomForest", "pROC", "caret", "pracma", "here", "Hmisc", "purrr",
  "ranger", "bnlearn", "arulesCBA", "network", "igraph", "xgboost",
  "data.table", "RSNNS", "tensorflow","keras", "doParallel"
)

# Function to load packages and handle errors
load_if_installed <- function(p) {
  tryCatch({
    library(p, character.only = TRUE)
  }, error = function(e) {
    message(sprintf("Package '%s' is not installed.", p))
  })
}

# Load all required packages
lapply(list_of_packages, load_if_installed)

Find out more at https://www.synthpop.org.uk/



Loading required package: grid



Loading required package: mvtnorm



Loading required package: modeltools



Loading required package: stats4



Loading required package: strucchange



Loading required package: zoo




Attaching package: ‘zoo’




The following objects are masked from ‘package:base’:

    as.Date, as.Date.numeric




Loading required package: sandwich




Attaching package: ‘dplyr’




The following object is masked from ‘package:party’:

    where




The following objects are masked from ‘package:stats’:

    filter, lag




The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




randomForest 4.7-1.1



Type rfNews() to see new features/changes/bug fixes.




Attaching package: ‘randomForest’




The following object is masked from ‘package:dplyr’:

    combine




Type 'citation("pROC")' for a citation.




Attaching package: ‘pROC’




The following objects are masked from ‘package:stats’:

    cov, smooth, var




Loading required package: ggplot2




Attaching package: ‘ggplot2’




The following object is masked from ‘package:randomForest’:

    margin




Loading required package: lattice



here() starts at /home/uni08/hpc/emma.foessing01/u11969/Master-Thesis




Attaching package: ‘Hmisc’




The following object is masked from ‘package:pracma’:

    ceil




The following objects are masked from ‘package:dplyr’:

    src, summarize




The following objects are masked from ‘package:base’:

    format.pval, units





Attaching package: ‘purrr’




The following object is masked from ‘package:pracma’:

    cross




The following object is masked from ‘package:caret’:

    lift




The following object is masked from ‘package:jsonlite’:

    flatten





Attaching package: ‘ranger’




The following object is masked from ‘package:randomForest’:

    importance





Attaching package: ‘bnlearn’




The following object is masked from ‘package:Hmisc’:

    impute




The following object is masked from ‘package:synthpop’:

    compare




Loading required package: Matrix




Attaching package: ‘Matrix’




The following objects are masked from ‘package:pracma’:

    expm, lu, tril, triu




Loading required package: arules




Attaching package: ‘arules’




The following object is masked from ‘package:bnlearn’:

    discretize




The following object is masked from ‘package:pracma’:

    size




The following object is masked from ‘package:dplyr’:

    recode




The following object is masked from ‘package:modeltools’:

    info




The following objects are masked from ‘package:base’:

    abbreviate, write





Attaching package: ‘arulesCBA’




The following object is masked from ‘package:party’:

    response




Package 'network' is not installed.



Package 'igraph' is not installed.




Attaching package: ‘data.table’




The following object is masked from ‘package:purrr’:

    transpose




The following objects are masked from ‘package:dplyr’:

    between, first, last




The following objects are masked from ‘package:zoo’:

    yearmon, yearqtr




Loading required package: Rcpp




Attaching package: ‘RSNNS’




The following objects are masked from ‘package:caret’:

    confusionMatrix, train




Package 'tensorflow' is not installed.



Package 'keras' is not installed.



Loading required package: foreach




Attaching package: ‘foreach’




The following objects are masked from ‘package:purrr’:

    accumulate, when




Loading required package: iterators



Loading required package: parallel



## Data

In [2]:
# set path
############## adjust to correct directory!
directory <- "/user/emma.foessing01/u11969/Master-Thesis"

In [3]:
load(file = (paste0(directory, "/cpspop.RData")))
adult <- read.csv(file = paste0(directory,"/adult_preprocessed.csv"))
# delete NAs
adult[adult == "?"] <- NA
adult <- na.omit(adult)

adult$workclass <- as.factor(adult$workclass)
adult$education <- as.factor(adult$education)
adult$marital_status <- as.factor(adult$marital_status)
adult$relationship <- as.factor(adult$relationship)
adult$race <- as.factor(adult$race)
adult$sex <- as.factor(adult$sex)
adult$native_country <- as.factor(adult$native_country)
adult$income <- as.factor(adult$income)
adult$occupation <- as.factor(adult$occupation)

### Helper functions

In [4]:
## eval cont targets
evaluation_metrics_cont <- function(predictions, test_set){
  MAE <- mean(abs(predictions - test_set$income))
  MSE <- mean((predictions - test_set$income)^2)
  RMSE <- sqrt(MSE)

  SS_res <- sum((test_set$income - predictions)^2)
  SS_tot <- sum((test_set$income - mean(test_set$income))^2)
  R_squared <- 1 - (SS_res / SS_tot)

  MAPE <- mean(abs((test_set$income - predictions) / test_set$income)) * 100

  # Create the dataframe
  metrics_df <- data.frame(
        MAE = MAE, 
        MSE = MSE, 
        RMSE = RMSE,
        R_squared = R_squared, 
        MAPE = MAPE)

  return(metrics_df)
}

In [5]:
## eval factored targets
evaluation_metrics_factor <- function(predictions, test_set){
    # confusion matrix for the prediction on original data
    cm <- caret::confusionMatrix(predictions, test_set$income,
                mode = "everything")

    # saving evaluation metrics
    accuracy <- cm$overall['Accuracy']
    f1 <- cm$byClass['F1']
    sens <- cm$byClass['Sensitivity']
    spec <- cm$byClass['Specificity']

    # Create the dataframe
    metrics_df <- data.frame(
        Accuracy = accuracy, 
        F1 = f1, 
        Sensitivity = sens, 
        Specificity = spec
    )
    
    return(metrics_df)
}

In [6]:
rf_pred <- function(data, outer_folds, mtry_steps, ntree_steps, inner_folds) {
    # Adjust evaluation metric to fit both numeric and factored targets
    summaryFunctionType <- if (is.numeric(data$income)) defaultSummary else multiClassSummary
    # Metric: train() uses per default RSME and Accuracy for numeric and factored targets

    # Set control args
    outer_control <- trainControl(method = "cv", number = outer_folds,
                                  summaryFunction = summaryFunctionType,
                                  verboseIter = FALSE,
                                  allowParallel = TRUE)
        
    inner_control <- trainControl(method = "cv", number = inner_folds, 
                                  summaryFunction = summaryFunctionType,
                                  verboseIter = FALSE,
                                  allowParallel = TRUE)

    # Define the parameter grid for tuning
    splitrule_value <- if (is.numeric(data$income)) "variance" else "gini"
    
    tunegrid <- expand.grid(mtry = seq(2, ncol(data) - 1, length.out = mtry_steps),
                            splitrule = splitrule_value,
                            min.node.size = 5)
    # You can incorporate ntree_steps into the grid if needed

    # Initialize variables to store results
    outer_cv_folds <- createFolds(data$income, k = outer_folds)

    # Register parallel backend
    cl <- makeCluster(detectCores() - 1)  # Use one less than the total number of cores
    registerDoParallel(cl)
    
    # Outer loop: Cross-validation for model evaluation (Parallelized with foreach)
    outer_results <- foreach(i = seq_along(outer_cv_folds), .combine = 'rbind', 
                             .packages = c('caret', 'ranger'), 
                             .export = c('outer_cv_folds', 'data', 'tunegrid', 'inner_control', 'outer_control', 'summaryFunctionType', 'evaluation_metrics_factor', 'evaluation_metrics_cont')) %dopar% {
        
        # Split data into outer folds
        outer_test_index <- outer_cv_folds[[i]]
        outer_testData <- data[outer_test_index, ]
        outer_trainData <- data[-outer_test_index, ]
        
        # Hyperparameter tuning using inner CV
        model <- caret::train(income ~ ., 
                              data = outer_trainData, 
                              method = "ranger",  
                              tuneGrid = tunegrid, 
                              trControl = inner_control)

        # Store the best hyperparameters
        best_hyperparameters <- model$bestTune

        # Train the final model on the outer training set with the best hyperparameters
        final_model <- caret::train(income ~ ., 
                                    data = outer_trainData, 
                                    method = "ranger", 
                                    trControl = outer_control, 
                                    tuneGrid = best_hyperparameters)

        # Testing the final model on the outer test set
        predictions <- predict(final_model, newdata = outer_testData)
        
        if (is.numeric(data$income)) {
            eval <- evaluation_metrics_cont(predictions, outer_testData)
        } else if (is.factor(data$income)) {
            eval <- evaluation_metrics_factor(predictions, outer_testData)
        } else {
            stop("The predicted target has to be numeric or factor.")
        }

        # Return the evaluation metrics for this outer fold
        return(eval)
    }

    # Stop the parallel backend
    stopCluster(cl)
    registerDoSEQ()  # Reset to sequential computation

    # Average the evaluation metrics over the outer folds
    eval_avg_outer_folds <- outer_results %>%
                            summarise(across(everything(), mean, na.rm = TRUE))

    # Return the average evaluation metrics
    return(eval_avg_outer_folds)
}

In [7]:
adult_res <- rf_pred(adult, 5, 5, 5, 5)

“already exporting variable(s): outer_cv_folds, data, tunegrid, inner_control, outer_control”


[1m[22m[36mℹ[39m In argument: `across(everything(), mean, na.rm = TRUE)`.
[1m[22m[33m![39m The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
Supply arguments directly to `.fns` through an anonymous function instead.

  # Previously
  across(a:b, mean, na.rm = TRUE)

  # Now
  across(a:b, \(x) mean(x, na.rm = TRUE))”


In [8]:
cps_res <- rf_pred(cpspop, 5, 5, 5, 5)

“already exporting variable(s): outer_cv_folds, data, tunegrid, inner_control, outer_control”


### Save results

In [9]:
# Bind results
rf_pred_results <- list(cps_res = cps_res, adult_res = adult_res)
# Save the results to an RData file 
save(rf_pred_results, file = paste0(directory, "/results/rf_pred_results.RData"))