# RF prediction function

### Packages

In [1]:
# Set the library path
#.libPaths("/user/emma.foessing01/u11969/new_R_libs")
Sys.setenv("PKG_CXXFLAGS"="-std=c++14")

# List of required packages
list_of_packages <- c(
  "synthpop", "jsonlite", "codetools", "insight", "party", "haven", "dplyr", "rpart", "rpart.plot",
  "randomForest", "pROC", "caret", "pracma", "here", "Hmisc", "purrr",
  "ranger", "bnlearn", "arulesCBA", "network", "igraph", "xgboost",
  "data.table", "RSNNS", "tensorflow","keras", "doParallel"
)

# Function to load packages and handle errors
load_if_installed <- function(p) {
  tryCatch({
    library(p, character.only = TRUE)
  }, error = function(e) {
    message(sprintf("Package '%s' is not installed.", p))
  })
}

# Load all required packages
lapply(list_of_packages, load_if_installed)

Find out more at https://www.synthpop.org.uk/



Loading required package: grid



Loading required package: mvtnorm



Loading required package: modeltools



Loading required package: stats4



Loading required package: strucchange



Loading required package: zoo




Attaching package: ‘zoo’




The following objects are masked from ‘package:base’:

    as.Date, as.Date.numeric




Loading required package: sandwich




Attaching package: ‘dplyr’




The following object is masked from ‘package:party’:

    where




The following objects are masked from ‘package:stats’:

    filter, lag




The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




randomForest 4.7-1.1



Type rfNews() to see new features/changes/bug fixes.




Attaching package: ‘randomForest’




The following object is masked from ‘package:dplyr’:

    combine




Type 'citation("pROC")' for a citation.




Attaching package: ‘pROC’




The following objects are masked from ‘package:stats’:

    cov, smooth, var




Loading required package: ggplot2




Attaching package: ‘ggplot2’




The following object is masked from ‘package:randomForest’:

    margin




Loading required package: lattice



“running command 'timedatectl' had status 1”


here() starts at /home/uni08/hpc/emma.foessing01/u11969/Master-Thesis




Attaching package: ‘Hmisc’




The following object is masked from ‘package:pracma’:

    ceil




The following objects are masked from ‘package:dplyr’:

    src, summarize




The following objects are masked from ‘package:base’:

    format.pval, units





Attaching package: ‘purrr’




The following object is masked from ‘package:pracma’:

    cross




The following object is masked from ‘package:caret’:

    lift




The following object is masked from ‘package:jsonlite’:

    flatten





Attaching package: ‘ranger’




The following object is masked from ‘package:randomForest’:

    importance





Attaching package: ‘bnlearn’




The following object is masked from ‘package:Hmisc’:

    impute




The following object is masked from ‘package:synthpop’:

    compare




Loading required package: Matrix




Attaching package: ‘Matrix’




The following objects are masked from ‘package:pracma’:

    expm, lu, tril, triu




Loading required package: arules




Attaching package: ‘arules’




The following object is masked from ‘package:bnlearn’:

    discretize




The following object is masked from ‘package:pracma’:

    size




The following object is masked from ‘package:dplyr’:

    recode




The following object is masked from ‘package:modeltools’:

    info




The following objects are masked from ‘package:base’:

    abbreviate, write





Attaching package: ‘arulesCBA’




The following object is masked from ‘package:party’:

    response




Package 'network' is not installed.



Package 'igraph' is not installed.




Attaching package: ‘data.table’




The following object is masked from ‘package:purrr’:

    transpose




The following objects are masked from ‘package:dplyr’:

    between, first, last




The following objects are masked from ‘package:zoo’:

    yearmon, yearqtr




Loading required package: Rcpp




Attaching package: ‘RSNNS’




The following objects are masked from ‘package:caret’:

    confusionMatrix, train




Package 'tensorflow' is not installed.



Package 'keras' is not installed.



Loading required package: foreach




Attaching package: ‘foreach’




The following objects are masked from ‘package:purrr’:

    accumulate, when




Loading required package: iterators



Loading required package: parallel



## Data

In [2]:
# set path
############## adjust to correct directory!
directory <- "/user/emma.foessing01/u11969/Master-Thesis"

In [3]:
load(file = (paste0(here(), "/cpspop.RData")))
adult <- read.csv(file = paste0(here(),"/adult_preprocessed.csv"))
# delete NAs
adult[adult == "?"] <- NA
adult <- na.omit(adult)

adult$workclass <- as.factor(adult$workclass)
adult$education <- as.factor(adult$education)
adult$marital_status <- as.factor(adult$marital_status)
adult$relationship <- as.factor(adult$relationship)
adult$race <- as.factor(adult$race)
adult$sex <- as.factor(adult$sex)
adult$native_country <- as.factor(adult$native_country)
adult$income <- as.factor(adult$income)
adult$occupation <- as.factor(adult$occupation)

### Helper functions

In [4]:
## eval cont targets
evaluation_metrics_cont <- function(predictions, test_set){
  MAE <- mean(abs(predictions - test_set$income))
  MSE <- mean((predictions - test_set$income)^2)
  RMSE <- sqrt(MSE)

  SS_res <- sum((test_set$income - predictions)^2)
  SS_tot <- sum((test_set$income - mean(test_set$income))^2)
  R_squared <- 1 - (SS_res / SS_tot)

  MAPE <- mean(abs((test_set$income - predictions) / test_set$income)) * 100

  # Create the dataframe
  metrics_df <- data.frame(
        MAE = MAE, 
        MSE = MSE, 
        RMSE = RMSE,
        R_squared = R_squared, 
        MAPE = MAPE)

  return(metrics_df)
}

In [5]:
## eval factored targets
evaluation_metrics_factor <- function(predictions, test_set){
    # confusion matrix for the prediction on original data
    cm <- caret::confusionMatrix(predictions, test_set$income,
                mode = "everything")

    # saving evaluation metrics
    accuracy <- cm$overall['Accuracy']
    f1 <- cm$byClass['F1']
    sens <- cm$byClass['Sensitivity']
    spec <- cm$byClass['Specificity']

    # Create the dataframe
    metrics_df <- data.frame(
        Accuracy = accuracy, 
        F1 = f1, 
        Sensitivity = sens, 
        Specificity = spec
    )
    
    return(metrics_df)
}

In [6]:
rf_pred <- function(data, outer_folds, mtry_steps, ntree_steps, inner_folds) {
    # Adjust evaluation metric to fit both numeric and factored targets
    summaryFunctionType <- if (is.numeric(data$income)) defaultSummary else multiClassSummary
    # Metric: train() uses per default RSME and Accuracy for numeric and factored targets

    # Set control args
    outer_control <- trainControl(method = "cv", number = outer_folds,
                                  summaryFunction = summaryFunctionType,
                                  verboseIter = FALSE,
                                  allowParallel = TRUE)
        
    inner_control <- trainControl(method = "cv", number = inner_folds, 
                                  summaryFunction = summaryFunctionType,
                                  verboseIter = FALSE,
                                  allowParallel = TRUE)

    # Define the parameter grid for tuning
    splitrule_value <- if (is.numeric(data$income)) "variance" else "gini"
    
    tunegrid <- expand.grid(mtry = seq(2, ncol(data) - 1, length.out = mtry_steps),
                            splitrule = splitrule_value,
                            min.node.size = 5)
    # You can incorporate ntree_steps into the grid if needed

    # Initialize variables to store results
    outer_cv_folds <- createFolds(data$income, k = outer_folds)

    # Initialize variables to store results
    outer_results <- list()

    outer_cv_folds <- createFolds(data$income, k = outer_folds)
    
    # Outer loop: Cross-validation for model evaluation
    for (i in seq_along(outer_cv_folds)) {
        # Split data into outer folds
        outer_test_index <- outer_cv_folds[[i]]
        outer_testData <- data[outer_test_index, ]
        outer_trainData <- data[-outer_test_index, ]
        
        # Hyperparameter tuning using inner CV
        model <- caret::train(income ~ ., 
                              data = outer_trainData, 
                              method = "ranger",  
                              tuneGrid = tunegrid, 
                              trControl = inner_control)

        # Store the best hyperparameters
        best_hyperparameters <- model$bestTune

        # Train the final model on the outer training set with the best hyperparameters
        final_model <- caret::train(income ~ ., 
                                    data = outer_trainData, 
                                    method = "ranger", 
                                    trControl = outer_control, 
                                    tuneGrid = best_hyperparameters)

        # Testing the final model on the outer test set
        predictions <- predict(final_model, newdata = outer_testData)
        
        if (is.numeric(data$income)) {
            eval <- evaluation_metrics_cont(predictions, outer_testData)
        } else if (is.factor(data$income)) {
            eval <- evaluation_metrics_factor(predictions, outer_testData)
        } else {
            stop("The predicted target has to be numeric or factor.")
        }

        # Store the evaluation metrics for this outer fold
        outer_results[[i]] <- eval
    }

    # Average the evaluation metrics over the outer folds
    eval_avg_outer_folds <- do.call(rbind, outer_results) %>%
                            summarise(across(everything(), mean, na.rm = TRUE))

    # Return the average evaluation metrics
    return(eval_avg_outer_folds)
}

In [7]:
adult_res <- rf_pred(adult, 5, 5, 5, 5)

Growing trees.. Progress: 58%. Estimated remaining time: 22 seconds.
Growing trees.. Progress: 44%. Estimated remaining time: 38 seconds.
Growing trees.. Progress: 97%. Estimated remaining time: 2 seconds.
Growing trees.. Progress: 78%. Estimated remaining time: 8 seconds.
Growing trees.. Progress: 45%. Estimated remaining time: 37 seconds.
Growing trees.. Progress: 92%. Estimated remaining time: 5 seconds.
Growing trees.. Progress: 83%. Estimated remaining time: 6 seconds.
Growing trees.. Progress: 40%. Estimated remaining time: 46 seconds.
Growing trees.. Progress: 84%. Estimated remaining time: 11 seconds.
Growing trees.. Progress: 35%. Estimated remaining time: 57 seconds.
Growing trees.. Progress: 73%. Estimated remaining time: 23 seconds.
Growing trees.. Progress: 44%. Estimated remaining time: 39 seconds.
Growing trees.. Progress: 29%. Estimated remaining time: 1 minute, 16 seconds.
Growing trees.. Progress: 68%. Estimated remaining time: 29 seconds.
Growing trees.. Progress: 35

Growing trees.. Progress: 46%. Estimated remaining time: 36 seconds.
Growing trees.. Progress: 85%. Estimated remaining time: 5 seconds.
Growing trees.. Progress: 56%. Estimated remaining time: 24 seconds.
Growing trees.. Progress: 94%. Estimated remaining time: 1 seconds.
Growing trees.. Progress: 51%. Estimated remaining time: 30 seconds.
Growing trees.. Progress: 31%. Estimated remaining time: 1 minute, 10 seconds.
Growing trees.. Progress: 96%. Estimated remaining time: 2 seconds.


Growing trees.. Progress: 80%. Estimated remaining time: 7 seconds.
Growing trees.. Progress: 48%. Estimated remaining time: 33 seconds.
Growing trees.. Progress: 47%. Estimated remaining time: 35 seconds.
Growing trees.. Progress: 99%. Estimated remaining time: 0 seconds.
Growing trees.. Progress: 84%. Estimated remaining time: 5 seconds.
Growing trees.. Progress: 71%. Estimated remaining time: 12 seconds.
Growing trees.. Progress: 55%. Estimated remaining time: 24 seconds.
Growing trees.. Progress: 82%. Estimated remaining time: 6 seconds.
Growing trees.. Progress: 77%. Estimated remaining time: 9 seconds.
Growing trees.. Progress: 89%. Estimated remaining time: 3 seconds.
Growing trees.. Progress: 81%. Estimated remaining time: 7 seconds.
Growing trees.. Progress: 56%. Estimated remaining time: 24 seconds.
Growing trees.. Progress: 94%. Estimated remaining time: 4 seconds.
Growing trees.. Progress: 68%. Estimated remaining time: 14 seconds.
Growing trees.. Progress: 34%. Estimated r

Growing trees.. Progress: 25%. Estimated remaining time: 1 minute, 33 seconds.
Growing trees.. Progress: 69%. Estimated remaining time: 27 seconds.
Growing trees.. Progress: 19%. Estimated remaining time: 2 minutes, 13 seconds.
Growing trees.. Progress: 45%. Estimated remaining time: 1 minute, 15 seconds.
Growing trees.. Progress: 86%. Estimated remaining time: 15 seconds.
Growing trees.. Progress: 42%. Estimated remaining time: 42 seconds.
Growing trees.. Progress: 90%. Estimated remaining time: 7 seconds.
Growing trees.. Progress: 43%. Estimated remaining time: 40 seconds.
Growing trees.. Progress: 86%. Estimated remaining time: 10 seconds.
Growing trees.. Progress: 37%. Estimated remaining time: 53 seconds.
Growing trees.. Progress: 76%. Estimated remaining time: 20 seconds.
Growing trees.. Progress: 24%. Estimated remaining time: 1 minute, 40 seconds.
Growing trees.. Progress: 51%. Estimated remaining time: 59 seconds.
Growing trees.. Progress: 83%. Estimated remaining time: 19 sec

Growing trees.. Progress: 67%. Estimated remaining time: 15 seconds.
Growing trees.. Progress: 60%. Estimated remaining time: 20 seconds.
Growing trees.. Progress: 47%. Estimated remaining time: 35 seconds.
Growing trees.. Progress: 96%. Estimated remaining time: 1 seconds.
Growing trees.. Progress: 40%. Estimated remaining time: 47 seconds.
Growing trees.. Progress: 35%. Estimated remaining time: 56 seconds.
Growing trees.. Progress: 57%. Estimated remaining time: 47 seconds.
Growing trees.. Progress: 79%. Estimated remaining time: 8 seconds.
Growing trees.. Progress: 48%. Estimated remaining time: 34 seconds.
Growing trees.. Progress: 39%. Estimated remaining time: 48 seconds.
Growing trees.. Progress: 63%. Estimated remaining time: 37 seconds.
Growing trees.. Progress: 73%. Estimated remaining time: 11 seconds.
Growing trees.. Progress: 30%. Estimated remaining time: 1 minute, 13 seconds.
Growing trees.. Progress: 21%. Estimated remaining time: 1 minute, 59 seconds.
Growing trees.. 

Growing trees.. Progress: 56%. Estimated remaining time: 23 seconds.
Growing trees.. Progress: 52%. Estimated remaining time: 29 seconds.
Growing trees.. Progress: 38%. Estimated remaining time: 50 seconds.
Growing trees.. Progress: 79%. Estimated remaining time: 16 seconds.
Growing trees.. Progress: 50%. Estimated remaining time: 31 seconds.
Growing trees.. Progress: 38%. Estimated remaining time: 49 seconds.
Growing trees.. Progress: 74%. Estimated remaining time: 22 seconds.
Growing trees.. Progress: 21%. Estimated remaining time: 1 minute, 55 seconds.
Growing trees.. Progress: 47%. Estimated remaining time: 1 minute, 11 seconds.
Growing trees.. Progress: 72%. Estimated remaining time: 35 seconds.


Growing trees.. Progress: 66%. Estimated remaining time: 15 seconds.
Growing trees.. Progress: 46%. Estimated remaining time: 36 seconds.
Growing trees.. Progress: 96%. Estimated remaining time: 2 seconds.
Growing trees.. Progress: 31%. Estimated remaining time: 1 minute, 8 seconds.
Growing trees.. Progress: 64%. Estimated remaining time: 34 seconds.
Growing trees.. Progress: 75%. Estimated remaining time: 10 seconds.
Growing trees.. Progress: 85%. Estimated remaining time: 5 seconds.
Growing trees.. Progress: 44%. Estimated remaining time: 38 seconds.
Growing trees.. Progress: 90%. Estimated remaining time: 7 seconds.
Growing trees.. Progress: 34%. Estimated remaining time: 1 minute, 0 seconds.
Growing trees.. Progress: 77%. Estimated remaining time: 18 seconds.
Growing trees.. Progress: 50%. Estimated remaining time: 31 seconds.
Growing trees.. Progress: 30%. Estimated remaining time: 1 minute, 12 seconds.
Growing trees.. Progress: 81%. Estimated remaining time: 14 seconds.
Growing t

Growing trees.. Progress: 90%. Estimated remaining time: 3 seconds.
Growing trees.. Progress: 65%. Estimated remaining time: 16 seconds.
Growing trees.. Progress: 80%. Estimated remaining time: 7 seconds.
Growing trees.. Progress: 70%. Estimated remaining time: 13 seconds.


Growing trees.. Progress: 100%. Estimated remaining time: 0 seconds.


[1m[22m[36mℹ[39m In argument: `across(everything(), mean, na.rm = TRUE)`.
[1m[22m[33m![39m The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
Supply arguments directly to `.fns` through an anonymous function instead.

  # Previously
  across(a:b, mean, na.rm = TRUE)

  # Now
  across(a:b, \(x) mean(x, na.rm = TRUE))”


In [8]:
cps_res <- rf_pred(cpspop, 5, 5, 5, 5)

Growing trees.. Progress: 94%. Estimated remaining time: 1 seconds.


Growing trees.. Progress: 44%. Estimated remaining time: 39 seconds.
Growing trees.. Progress: 100%. Estimated remaining time: 0 seconds.
Growing trees.. Progress: 29%. Estimated remaining time: 1 minute, 17 seconds.
Growing trees.. Progress: 52%. Estimated remaining time: 56 seconds.
Growing trees.. Progress: 23%. Estimated remaining time: 1 minute, 46 seconds.
Growing trees.. Progress: 56%. Estimated remaining time: 48 seconds.
Growing trees.. Progress: 90%. Estimated remaining time: 10 seconds.
Growing trees.. Progress: 76%. Estimated remaining time: 10 seconds.
Growing trees.. Progress: 43%. Estimated remaining time: 40 seconds.
Growing trees.. Progress: 97%. Estimated remaining time: 1 seconds.
Growing trees.. Progress: 38%. Estimated remaining time: 50 seconds.
Growing trees.. Progress: 86%. Estimated remaining time: 9 seconds.
Growing trees.. Progress: 25%. Estimated remaining time: 1 minute, 31 seconds.
Growing trees.. Progress: 42%. Estimated remaining time: 1 minute, 26 secon

Growing trees.. Progress: 51%. Estimated remaining time: 30 seconds.
Growing trees.. Progress: 44%. Estimated remaining time: 39 seconds.
Growing trees.. Progress: 96%. Estimated remaining time: 2 seconds.
Growing trees.. Progress: 52%. Estimated remaining time: 28 seconds.
Growing trees.. Progress: 87%. Estimated remaining time: 8 seconds.
Growing trees.. Progress: 18%. Estimated remaining time: 2 minutes, 25 seconds.
Growing trees.. Progress: 46%. Estimated remaining time: 1 minute, 11 seconds.
Growing trees.. Progress: 73%. Estimated remaining time: 35 seconds.
Growing trees.. Progress: 34%. Estimated remaining time: 59 seconds.
Growing trees.. Progress: 60%. Estimated remaining time: 40 seconds.
Growing trees.. Progress: 83%. Estimated remaining time: 19 seconds.
Growing trees.. Progress: 9%. Estimated remaining time: 5 minutes, 21 seconds.
Growing trees.. Progress: 24%. Estimated remaining time: 3 minutes, 21 seconds.
Growing trees.. Progress: 43%. Estimated remaining time: 2 minu

Growing trees.. Progress: 66%. Estimated remaining time: 16 seconds.
Growing trees.. Progress: 37%. Estimated remaining time: 51 seconds.
Growing trees.. Progress: 93%. Estimated remaining time: 4 seconds.
Growing trees.. Progress: 65%. Estimated remaining time: 16 seconds.
Growing trees.. Progress: 25%. Estimated remaining time: 1 minute, 32 seconds.
Growing trees.. Progress: 58%. Estimated remaining time: 44 seconds.
Growing trees.. Progress: 84%. Estimated remaining time: 17 seconds.
Growing trees.. Progress: 55%. Estimated remaining time: 25 seconds.
Growing trees.. Progress: 28%. Estimated remaining time: 1 minute, 20 seconds.
Growing trees.. Progress: 82%. Estimated remaining time: 13 seconds.
Growing trees.. Progress: 20%. Estimated remaining time: 2 minutes, 0 seconds.
Growing trees.. Progress: 55%. Estimated remaining time: 50 seconds.
Growing trees.. Progress: 81%. Estimated remaining time: 21 seconds.
Growing trees.. Progress: 15%. Estimated remaining time: 2 minutes, 52 sec

Growing trees.. Progress: 69%. Estimated remaining time: 14 seconds.
Growing trees.. Progress: 50%. Estimated remaining time: 31 seconds.
Growing trees.. Progress: 84%. Estimated remaining time: 6 seconds.
Growing trees.. Progress: 83%. Estimated remaining time: 6 seconds.
Growing trees.. Progress: 49%. Estimated remaining time: 32 seconds.
Growing trees.. Progress: 100%. Estimated remaining time: 0 seconds.


Growing trees.. Progress: 92%. Estimated remaining time: 2 seconds.
Growing trees.. Progress: 60%. Estimated remaining time: 21 seconds.
Growing trees.. Progress: 25%. Estimated remaining time: 1 minute, 32 seconds.
Growing trees.. Progress: 91%. Estimated remaining time: 6 seconds.
Growing trees.. Progress: 35%. Estimated remaining time: 57 seconds.
Growing trees.. Progress: 64%. Estimated remaining time: 34 seconds.
Growing trees.. Progress: 96%. Estimated remaining time: 3 seconds.
Growing trees.. Progress: 14%. Estimated remaining time: 3 minutes, 10 seconds.
Growing trees.. Progress: 32%. Estimated remaining time: 2 minutes, 11 seconds.
Growing trees.. Progress: 75%. Estimated remaining time: 30 seconds.
Growing trees.. Progress: 49%. Estimated remaining time: 32 seconds.
Growing trees.. Progress: 46%. Estimated remaining time: 36 seconds.
Growing trees.. Progress: 75%. Estimated remaining time: 20 seconds.
Growing trees.. Progress: 22%. Estimated remaining time: 1 minute, 48 seco

Growing trees.. Progress: 35%. Estimated remaining time: 58 seconds.
Growing trees.. Progress: 65%. Estimated remaining time: 33 seconds.
Growing trees.. Progress: 44%. Estimated remaining time: 40 seconds.
Growing trees.. Progress: 90%. Estimated remaining time: 6 seconds.
Growing trees.. Progress: 26%. Estimated remaining time: 1 minute, 30 seconds.
Growing trees.. Progress: 57%. Estimated remaining time: 47 seconds.
Growing trees.. Progress: 39%. Estimated remaining time: 48 seconds.
Growing trees.. Progress: 76%. Estimated remaining time: 20 seconds.
Growing trees.. Progress: 37%. Estimated remaining time: 52 seconds.
Growing trees.. Progress: 70%. Estimated remaining time: 26 seconds.
Growing trees.. Progress: 23%. Estimated remaining time: 1 minute, 42 seconds.
Growing trees.. Progress: 40%. Estimated remaining time: 1 minute, 33 seconds.
Growing trees.. Progress: 67%. Estimated remaining time: 45 seconds.
Growing trees.. Progress: 90%. Estimated remaining time: 14 seconds.


### Save results

In [9]:
# Bind results
rf_pred_results <- list(cps_res = cps_res, adult_res = adult_res)
# Save the results to an RData file 
save(rf_pred_results, file = paste0(directory, "/results/rf_pred_results.RData"))