## Libraries

In [11]:
# Set PKG_CXXFLAGS so that C++ code compiled from this session is built with
# the C++14 standard
Sys.setenv(PKG_CXXFLAGS = "-std=c++14")

# Packages used by this notebook; each name appears exactly once
list_of_packages <- c(
  "synthpop", "jsonlite", "codetools", "insight", "party", "haven", "dplyr",
  "rpart", "rpart.plot", "randomForest", "pROC", "caret", "pracma", "here",
  "Hmisc", "purrr", "ranger", "bnlearn", "arulesCBA", "network", "igraph",
  "xgboost", "data.table", "RSNNS", "Matrix", "e1071", "ExtDist", "doParallel"
)

# Attach a single package without aborting the script on failure.
#
# p: package name as a character string.
# Returns TRUE when the package was attached and FALSE otherwise. On failure
# the underlying condition message is reported, because a load can fail for
# reasons other than the package being absent (e.g. a missing dependency).
load_if_installed <- function(p) {
  tryCatch(
    {
      library(p, character.only = TRUE)
      TRUE
    },
    error = function(e) {
      message(sprintf(
        "Package '%s' could not be loaded: %s", p, conditionMessage(e)
      ))
      FALSE
    }
  )
}

# Attach all required packages; invisible() keeps the list of return values
# from being printed
invisible(lapply(list_of_packages, load_if_installed))

Lade nötiges Paket: foreach




Attache Paket: 'foreach'


Die folgenden Objekte sind maskiert von 'package:purrr':

    accumulate, when


Lade nötiges Paket: iterators

Lade nötiges Paket: parallel



## Data

In [2]:
# CPS data: load the .RData file (expected to provide `cpspop`) and move the
# columns race, marital and educ to the last three positions
load(file = paste0(here(), "/cpspop.RData"))
cpspop <- cpspop[, c(
  setdiff(names(cpspop), c("race", "marital", "educ")),
  "race", "marital", "educ"
)]

# Adult data: read the preprocessed CSV, recode "?" as missing and keep only
# the complete rows
adult <- read.csv(file = paste0(here(), "/adult_preprocessed.csv"))
adult[adult == "?"] <- NA
adult <- na.omit(adult)

# Keep the modelling columns in a fixed order: five columns that are left as
# read, followed by the nine categorical ones
adult <- adult[, c(
  "age", "fnlwgt", "capital_gain", "capital_loss", "hours_per_week",
  "income", "sex", "race", "relationship", "marital_status",
  "workclass", "occupation", "education", "native_country"
)]

# Every column after the first five is categorical; store it as a factor
adult[-seq_len(5)] <- lapply(adult[-seq_len(5)], as.factor)

## BN synthetic data

In [3]:
# Discretize every non-factor column of a data frame.
#
# Arguments:
#   df      a data frame.
#   breaks  maximum number of equal-width bins used for the non-zero values
#           of a numeric column (default 5).
#
# Per column:
#   * factor columns are returned untouched;
#   * non-numeric, non-factor columns (e.g. character, logical) are converted
#     with as.factor(), since they cannot be binned by range;
#   * numeric columns become factors: zeros get their own level "0" and the
#     non-zero values are cut into equal-width bins labelled "(lower-upper]".
#     The number of bins shrinks as the share of zeros grows (> 1/5: one
#     fewer, > 1/4: two fewer, > 4/5: a single bin) and is never below 1.
#     Missing values stay NA.
#
# Returns the data frame with the converted columns.
discretize_df <- function(df, breaks = 5) {
  for (var in colnames(df)) {
    column <- df[[var]]

    # Factors are already discrete
    if (is.factor(column)) {
      next
    }

    if (!is.numeric(column)) {
      df[[var]] <- as.factor(column)
      next
    }

    # Explicit masks so that NA never ends up inside a logical index
    is_zero <- !is.na(column) & column == 0
    is_nonzero <- !is.na(column) & column != 0

    # Share of zeros among the observed (non-missing) values
    n_observed <- sum(is_zero) + sum(is_nonzero)
    zero_proportion <- if (n_observed > 0) sum(is_zero) / n_observed else 0

    # Determine the number of bins based on the zero proportion
    if (zero_proportion > 4 / 5) {
      new_breaks <- 1
    } else if (zero_proportion > 1 / 4) {
      new_breaks <- breaks - 2
    } else if (zero_proportion > 1 / 5) {
      new_breaks <- breaks - 1
    } else {
      new_breaks <- breaks
    }
    # A small `breaks` argument must not produce zero or negative bin counts
    new_breaks <- max(1, new_breaks)

    # Fill the result by position. Combining the zero mask with the shorter
    # vector of binned non-zero values through ifelse() would recycle that
    # vector and attach bin labels to the wrong rows.
    discretized <- rep(NA_character_, length(column))
    discretized[is_zero] <- "0"

    if (any(is_nonzero)) {
      non_zero_values <- column[is_nonzero]
      range_values <- range(non_zero_values)

      if (range_values[1] == range_values[2]) {
        # Only one distinct non-zero value: cut() would reject the
        # duplicated break points, so use a single degenerate bin
        discretized[is_nonzero] <- paste0(
          "(", range_values[1], "-", range_values[2], "]"
        )
      } else {
        breaks_values <- seq(
          range_values[1], range_values[2],
          length.out = new_breaks + 1
        )
        labels <- paste0(
          "(", breaks_values[-length(breaks_values)],
          "-", breaks_values[-1], "]"
        )
        binned <- cut(
          non_zero_values,
          breaks = breaks_values,
          labels = labels,
          include.lowest = TRUE
        )
        discretized[is_nonzero] <- as.character(binned)
      }
      df[[var]] <- factor(discretized)
    } else {
      # No non-zero values: the only possible level is "0"
      df[[var]] <- factor(discretized, levels = "0")
    }
  }
  df
}

In [4]:
# Generate a synthetic copy of `data` from a fitted Bayesian network.
#
# Arguments:
#   data  a data frame; non-factor columns are discretized first via
#         discretize_df().
#   seed  optional seed passed to set.seed() before structure learning and
#         sampling. NULL (the default) leaves the RNG state untouched. The
#         previous default `seed = seed` referred to itself and raised an
#         error whenever the argument was omitted.
#
# Returns a data frame with as many rows as `data`, whose factor columns
# carry the same levels as the discretized input.
synthesize_data_bn <- function(data, seed = NULL) {
  # Discretize the non-factor variables; existing factors are left as they are
  data <- discretize_df(data)

  if (!is.null(seed)) {
    set.seed(seed)
  }

  # Learn the network structure with tabu()
  bn_structure <- tabu(data)

  # Fit the parameters of the learned structure
  bn_fitted <- bn.fit(bn_structure, data, method = "mle")

  # Sample from the fitted network; same number of observations as the input
  syn_data <- rbn(bn_fitted, n = nrow(data))

  # Ensure factor levels match the (discretized) input data
  for (var in colnames(data)) {
    if (is.factor(data[[var]])) {
      syn_data[[var]] <- factor(syn_data[[var]], levels = levels(data[[var]]))
    }
  }

  syn_data
}

In [7]:
# Synthesize the adult data with the Bayesian-network generator (same number
# of rows as `adult`)
syn_adult <- synthesize_data_bn(adult, seed = 1243)

In [8]:
# Synthesize the CPS data with the Bayesian-network generator (same number of
# rows as `cpspop`)
syn_cps <- synthesize_data_bn(cpspop, seed = 1243)

## Eval Functions

In [14]:
### Evaluation Functions
## Regression metrics for a numeric `income` target.
## predictions: numeric vector of predicted values.
## test_set:    data frame holding the observed values in column `income`.
## Returns a one-row data frame with MAE, MSE, RMSE, R_squared and MAPE.
## R_squared is NA when the observed values have zero total variation and
## MAPE is NA when any observed value equals zero.
evaluation_metrics_cont <- function(predictions, test_set){
    observed <- test_set$income
    errors <- predictions - observed

    # Building blocks shared by several metrics
    squared_errors <- errors^2
    mean_squared_error <- mean(squared_errors)
    residual_sum_sq <- sum(squared_errors)
    total_sum_sq <- sum((observed - mean(observed))^2)

    data.frame(
        MAE = mean(abs(errors)),
        MSE = mean_squared_error,
        RMSE = sqrt(mean_squared_error),
        # Guard against a constant target
        R_squared = ifelse(total_sum_sq == 0, NA, 1 - (residual_sum_sq / total_sum_sq)),
        # Guard against division by zero
        MAPE = ifelse(any(observed == 0), NA, mean(abs(errors / observed)) * 100)
    )
}
## Classification metrics for a factor `income` target.
## predictions: predicted classes (factor or coercible to one).
## test_set:    data (coerced to a data frame) holding the observed classes
##              in column `income`.
## Returns a one-row data frame with Accuracy, F1, Sensitivity and
## Specificity. With more than two classes the per-class values of the last
## three metrics are averaged (ignoring NA).
evaluation_metrics_factor <- function(predictions, test_set) {
    # Ensure test_set is a data frame
    test_set <- as.data.frame(test_set)

    reference <- as.factor(test_set$income)

    # Align the predictions with the reference levels by LABEL. Assigning
    # `levels(predictions) <- levels(reference)` would rename levels by
    # position and silently relabel predictions whenever their level set or
    # order differs from the reference (e.g. only one class was predicted).
    predictions <- factor(as.character(predictions), levels = levels(reference))

    # Confusion matrix of predictions against the observed classes
    cm <- caret::confusionMatrix(predictions, reference, mode = "everything")

    accuracy <- cm$overall['Accuracy']

    if (length(levels(reference)) == 2) {
        # Binary classification: byClass is a named vector
        f1 <- cm$byClass['F1']
        sens <- cm$byClass['Sensitivity']
        spec <- cm$byClass['Specificity']
    } else {
        # Multi-class classification: byClass has one row per class, so
        # average each metric over the classes
        f1 <- mean(cm$byClass[,'F1'], na.rm = TRUE)
        sens <- mean(cm$byClass[,'Sensitivity'], na.rm = TRUE)
        spec <- mean(cm$byClass[,'Specificity'], na.rm = TRUE)
    }

    data.frame(
        Accuracy = accuracy,
        F1 = f1,
        Sensitivity = sens,
        Specificity = spec
    )
}

## CART prediction

In [15]:
# Nested cross-validation for a CART (rpart) model predicting `income`.
#
# Arguments:
#   data         data frame with the target column `income` (numeric or
#                factor) and the predictors.
#   outer_folds  number of outer folds used for performance estimation.
#   cp_steps     number of candidate complexity-parameter values, spaced
#                evenly on a log10 scale between 1e-4 and 1e-2.
#   inner_folds  number of inner folds used to tune cp.
#
# The outer folds are processed in parallel on a temporary cluster. The
# cluster is stopped and the sequential foreach backend is restored when the
# function exits, including when a fold raises an error.
#
# Returns a one-row data frame with the evaluation metrics averaged over the
# outer folds.
cart_pred <- function(data, outer_folds, cp_steps, inner_folds) {
    # Adjust evaluation metric to fit both numeric and factored targets
    summaryFunctionType <- if (is.numeric(data$income)) defaultSummary else multiClassSummary

    # Set control args for inner and outer loops
    outer_control <- trainControl(method = "cv", number = outer_folds,
                                  summaryFunction = summaryFunctionType,
                                  verboseIter = FALSE,
                                  allowParallel = TRUE)

    inner_control <- trainControl(method = "cv", number = inner_folds,
                                  summaryFunction = summaryFunctionType,
                                  verboseIter = FALSE,
                                  allowParallel = TRUE)

    # Define the grid for hyperparameter tuning
    complexity <- 10^seq(log10(0.0001), log10(0.01), length.out = cp_steps)
    tunegrid <- expand.grid(cp = complexity)

    # Create outer CV folds
    outer_cv_folds <- createFolds(data$income, k = outer_folds)

    # Use all but one core, but at least one: detectCores() may return 1 or NA
    num_cores <- max(1L, detectCores() - 1L, na.rm = TRUE)
    cl <- makeCluster(num_cores)
    # Always release the workers and unregister the stopped cluster, so that
    # an error inside the loop does not leak processes and later %dopar%
    # calls do not hit a dead backend
    on.exit({
        stopCluster(cl)
        registerDoSEQ()
    }, add = TRUE)
    registerDoParallel(cl)

    # Use foreach to parallelize the outer loop
    outer_results <- foreach(i = seq_along(outer_cv_folds), .packages = c("caret", "dplyr", "rpart"),
                             .export = c("evaluation_metrics_cont", "evaluation_metrics_factor")) %dopar% {
        # Split data into outer folds
        outer_test_index <- outer_cv_folds[[i]]
        outer_testData <- data[outer_test_index, ]
        outer_trainData <- data[-outer_test_index, ]

        # Hyperparameter tuning using inner CV
        model <- caret::train(income ~ .,
                              data = outer_trainData,
                              method = "rpart",
                              tuneGrid = tunegrid,
                              trControl = inner_control,
                              control = rpart.control(maxsurrogate = 0, maxcompete = 1))

        # Get the best hyperparameters
        best_hyperparameters <- model$bestTune

        # Train the final model on the outer training set with the best cp.
        # NOTE(review): unlike the tuning step, this fit does not pass the
        # rpart.control() settings -- confirm this is intended.
        final_model <- caret::train(income ~ .,
                                    data = outer_trainData,
                                    method = "rpart",
                                    trControl = outer_control,
                                    tuneGrid = best_hyperparameters)

        # Testing the final model on the outer test set
        predictions <- predict(final_model, newdata = outer_testData)

        # Evaluate the predictions; the value of this if/else is the result
        # of the fold
        if (is.numeric(data$income)) {
            evaluation_metrics_cont(predictions, outer_testData)
        } else if (is.factor(data$income)) {
            evaluation_metrics_factor(predictions, outer_testData)
        } else {
            stop("The predicted target has to be numeric or factor.")
        }
    }

    # Average the evaluation metrics over the outer folds. The anonymous
    # function replaces the deprecated `across(..., mean, na.rm = TRUE)` form.
    eval_avg_outer_folds <- do.call(rbind, outer_results) %>% as.data.frame() %>%
                        dplyr::summarise(dplyr::across(dplyr::everything(),
                                                       function(x) mean(x, na.rm = TRUE)))

    eval_avg_outer_folds
}

In [None]:
# Seed for the evaluation run; `s` is also embedded in the name of the saved
# results file
s <- 1243
set.seed(s)

In [17]:
# Nested CV for CART: 5 outer folds, 3 inner folds, 10 candidate cp values.
# NOTE(review): this evaluates the original `adult` data, while the results
# are saved under a "cart_on_bn" file name -- confirm whether `syn_adult`
# was the intended input.
cart_eval_adult <- cart_pred(adult, outer_folds = 5, cp_steps = 10, inner_folds = 3)

[1m[22m[36mi[39m In argument: `across(everything(), mean, na.rm = TRUE)`.
[1m[22m[33m![39m The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
Supply arguments directly to `.fns` through an anonymous function instead.

  # Previously
  across(a:b, mean, na.rm = TRUE)

  # Now
  across(a:b, \(x) mean(x, na.rm = TRUE))"


In [18]:
# Display the metrics averaged over the outer folds
cart_eval_adult

Accuracy,F1,Sensitivity,Specificity
<dbl>,<dbl>,<dbl>,<dbl>
0.8548174,0.9072263,0.9451749,0.5821775


In [None]:
# Store the averaged metrics in the "results" folder under here(), with the
# seed in the file name.
# NOTE(review): assumes the "results" directory already exists -- confirm.
save(cart_eval_adult, file = paste0(here(), "/results/adult_cart_on_bn_res_", as.character(s) ,".RData"))