# Random Forests

Link: https://medium.com/@gideonadele/using-random-forests-to-generate-partially-synthetic-categorical-data-4d2b6a664988
Source: G. Caiola and J. P. Reiter (2010). Random Forests for Generating Partially Synthetic, Categorical Data. Transactions on Data Privacy, 3(1). <br>
randomForest documentation: https://cran.r-project.org/web/packages/randomForest/randomForest.pdf


In [8]:
list_of_packages <- c ("synthpop", "insight", "party", "dplyr", "rpart", "rpart.plot", "randomForest", "pROC", "caret", "pracma", "here", "Hmisc", "arf", "randomForest", "caret")

lapply(list_of_packages, FUN= function(X){
  do.call("require", list(X))
})

## Data

In [1]:
# Load the CPS population object (.RData) and the pre-processed adult data
# (.csv); both files are looked up in the directory returned by here().
load(file = paste0(here(), "/cpspop.RData"))
adult <- read.csv(file = paste0(here(), "/adult_preprocessed.csv"))

# "?" marks a missing entry in the adult data: recode it to NA and keep only
# the complete rows.
adult[adult == "?"] <- NA
adult <- na.omit(adult)

# Encode every categorical column as a factor.
categorical_cols <- c(
  "workclass", "education", "marital_status", "relationship",
  "race", "sex", "native_country", "income"
)
for (col in categorical_cols) {
  adult[[col]] <- as.factor(adult[[col]])
}

ERROR: Error in here(): konnte Funktion "here" nicht finden


## Synthetic data

In [4]:
#' Generate synthetic data with sequential random forests
#'
#' The synthetic data set is seeded with a bootstrap sample of the rows of
#' `data`. Then, column by column, a random forest is fitted on the ORIGINAL
#' data (current column as target, all other columns as predictors) and the
#' column in the synthetic data is replaced:
#' * numeric targets by the forest's point prediction,
#' * factor targets by a random draw from the predicted class probabilities.
#'
#' Later columns are therefore predicted from already-synthesised values of
#' the earlier columns.
#'
#' @param data A data frame with at least two columns and one row. Character
#'   columns are converted to factors; every column must then be numeric or
#'   factor.
#' @param n_synthetic Number of synthetic rows to generate (a single positive
#'   whole number).
#' @param ntree Number of trees per forest, forwarded to `randomForest()`.
#'   Defaults to 100.
#' @return A data frame with `n_synthetic` rows and the same column names as
#'   `data`. Factor columns keep the levels of the original column.
generate_synthetic_rf <- function(data, n_synthetic, ntree = 100) {
  if (!is.data.frame(data)) {
    stop("Data must be a dataframe.")
  }
  # Check the type first so that NULL / character / NA input produces the
  # intended message instead of an obscure comparison error.
  valid_n <- is.numeric(n_synthetic) && length(n_synthetic) == 1L &&
    is.finite(n_synthetic) && n_synthetic > 0 &&
    n_synthetic == round(n_synthetic)
  if (!valid_n) {
    stop("Number of synthetic samples must be a positive integer.")
  }
  if (ncol(data) < 2L) {
    stop("Not enough predictor variables.")
  }
  if (nrow(data) == 0L) {
    stop("Data must contain at least one row.")
  }

  # Character columns cannot be modelled directly; treat them as categorical.
  is_chr <- vapply(data, is.character, logical(1))
  data[is_chr] <- lapply(data[is_chr], factor)

  # Seed the synthetic data with real rows. Predicting from an all-NA frame
  # (as an empty placeholder would be) is not possible: the forest needs
  # valid predictor values of the same types/levels as the training data.
  seed_rows <- sample.int(nrow(data), size = n_synthetic, replace = TRUE)
  synthetic_data <- data[seed_rows, , drop = FALSE]
  rownames(synthetic_data) <- NULL

  for (i in seq_along(data)) {
    target_var <- data[[i]]
    if (!is.numeric(target_var) && !is.factor(target_var)) {
      stop("Column '", names(data)[i], "' must be numeric or factor.")
    }
    predictor_vars <- data[, -i, drop = FALSE]
    new_predictors <- synthetic_data[, -i, drop = FALSE]

    # Train a random forest for the current column on the original data
    rf_model <- randomForest(x = predictor_vars, y = target_var, ntree = ntree)

    if (is.numeric(target_var)) {
      synthetic_data[[i]] <- as.numeric(
        predict(rf_model, newdata = new_predictors)
      )
    } else {
      predicted_probs <- predict(
        rf_model,
        newdata = new_predictors,
        type = "prob"
      )
      class_labels <- colnames(predicted_probs)
      # sample() rescales `prob` itself, so no explicit normalisation needed
      drawn <- apply(
        predicted_probs, 1,
        function(p) sample(class_labels, 1, prob = p)
      )
      # Restore the factor type with the original levels so the column can
      # serve as a predictor for the remaining columns.
      synthetic_data[[i]] <- factor(
        drawn,
        levels = levels(target_var),
        ordered = is.ordered(target_var)
      )
    }
  }

  synthetic_data
}

## Reasons for long computation times when generating synthetic data
Complexity of the Random Forest Model: Random Forest is an ensemble method that builds multiple decision trees (specified by the ntree parameter) for each variable in the dataset. Each tree is built on a bootstrap sample of the data, and at each node in these trees, a subset of the predictors is randomly chosen to split on. This process is computationally expensive, especially with larger datasets and a higher number of trees.

Multiple Models: For each variable in the dataset, a separate Random Forest model is trained using all other variables as predictors. This means if you have a dataset with many variables, you will need to train a significant number of Random Forest models. If your dataset has, say, 30 variables, and you decide to use 100 trees per model, you are essentially building 3,000 trees.

Data Size and Dimensionality: The size (number of rows) and the number of variables (columns) in your dataset greatly impact the computation time. Larger datasets or those with many features require more time to build each tree and, consequently, each model.

Handling Factor Variables: If the target variable is a factor (categorical), the model not only has to calculate the probabilities for each category for each case but also needs to handle these during the prediction phase, which can add overhead, especially when sampling from these predicted probabilities to generate synthetic data.

## Adversarial random forest

In [5]:
# Bind the CPS data to the global name `data`.
# NOTE(review): no later visible cell reads this global (the functions below
# receive `data` as an argument) -- confirm it is still needed.
data <- cpspop

#' Generate synthetic data with an adversarial random forest (ARF)
#'
#' Runs the three ARF steps in sequence: fit the forest with
#' `adversarial_rf()`, estimate leaf and distribution parameters with
#' `forde()`, and draw synthetic rows with `forge()`.
#'
#' @param data Data to synthesise; passed as `x` to `adversarial_rf()` and
#'   `forde()`.
#' @param n_synth Number of synthetic rows to draw. Defaults to `nrow(data)`,
#'   i.e. a synthetic data set of the same size as the input.
#' @param num_trees Number of trees for `adversarial_rf()`. Defaults to 100L.
#' @param parallel Forwarded to `adversarial_rf()` and `forde()`. Defaults to
#'   `TRUE`; set to `FALSE` when no parallel backend is registered to avoid
#'   the "executing %dopar% sequentially" warning.
#' @return The synthetic data returned by `forge()`.
arf_syn <- function(data, n_synth = nrow(data), num_trees = 100L,
                    parallel = TRUE) {
  # Fail early, before the expensive forest is trained.
  valid_n <- is.numeric(n_synth) && length(n_synth) == 1L &&
    is.finite(n_synth) && n_synth >= 1
  if (!valid_n) {
    stop("`n_synth` must be a single positive number.", call. = FALSE)
  }

  # Train the adversarial random forest
  arf_model <- adversarial_rf(
    x = data,
    num_trees = num_trees,
    min_node_size = 2L,
    delta = 0,
    max_iters = 10L,
    early_stop = TRUE,
    prune = TRUE,
    verbose = TRUE,
    parallel = parallel
  )

  # Estimate leaf and distribution parameters
  psi <- forde(
    arf = arf_model,
    x = data,
    oob = FALSE,
    family = "truncnorm",
    finite_bounds = FALSE,
    alpha = 0,
    epsilon = 0,
    parallel = parallel
  )

  # Draw the synthetic rows from the estimated density
  forge(psi, n_synth)
}

In [6]:
# Synthesise the CPS data with the adversarial random forest pipeline.
syn_cps <- arf_syn(cpspop)

"Recoding integers with more than 5 unique values as numeric. To override this behavior, explicitly code these variables as factors."


Iteration: 0, Accuracy: 76.47%
Iteration: 1, Accuracy: 35.96%


"executing %dopar% sequentially: no parallel backend registered"


In [None]:
# Synthesise the adult data with the adversarial random forest pipeline.
syn_adult <- arf_syn(adult)

"Recoding integers with more than 5 unique values as numeric. To override this behavior, explicitly code these variables as factors."


Iteration: 0, Accuracy: 86.25%
Iteration: 1, Accuracy: 41.12%


In [7]:
# Fit a random forest that predicts `income` from all other columns of the
# original CPS data, using the randomForest defaults.
rf_model <- randomForest(income ~ ., data = cpspop)

## Simulation

### Helper functions

In [None]:
#' Evaluation metrics for a numeric target
#'
#' @param predictions Numeric vector of predicted values, one per row of
#'   `test_set`.
#' @param test_set Data frame holding the observed target values.
#' @param target Name of the target column in `test_set`. Defaults to
#'   `"income"`.
#' @return A one-row data frame with the columns `MAE`, `MSE`, `RMSE`,
#'   `R_squared` and `MAPE` (in percent). `MAPE` is computed over the
#'   observations whose actual value is non-zero, because the percentage
#'   error is undefined for an actual value of 0; it is `NA` when every
#'   actual value is 0.
evaluation_metrics_cont <- function(predictions, test_set, target = "income") {
  actual <- test_set[[target]]
  if (is.null(actual)) {
    stop("Column '", target, "' not found in `test_set`.", call. = FALSE)
  }
  # Guard against silent recycling of a too-short vector.
  if (length(predictions) != length(actual)) {
    stop("`predictions` must have one value per row of `test_set`.",
         call. = FALSE)
  }

  residuals <- actual - predictions
  mse <- mean(residuals^2)

  ss_res <- sum(residuals^2)
  ss_tot <- sum((actual - mean(actual))^2)

  # Keep NA entries (so missing values still propagate) but drop exact zeros,
  # which would otherwise turn the whole MAPE into Inf/NaN.
  usable <- is.na(actual) | actual != 0
  mape <- if (any(usable)) {
    mean(abs(residuals[usable] / actual[usable])) * 100
  } else {
    NA_real_
  }

  data.frame(
    MAE = mean(abs(residuals)),
    MSE = mse,
    RMSE = sqrt(mse),
    R_squared = 1 - (ss_res / ss_tot),
    MAPE = mape
  )
}

In [None]:
#' Evaluation metrics for a factor target
#'
#' Builds a confusion matrix of `predictions` against `test_set$income` and
#' extracts accuracy, F1, sensitivity and specificity from it.
#'
#' @param predictions Predicted classes, passed as the first argument to
#'   `confusionMatrix()`.
#' @param test_set Data frame whose `income` column holds the reference
#'   classes.
#' @return A one-row data frame with the columns `Accuracy`, `F1`,
#'   `Sensitivity` and `Specificity`.
evaluation_metrics_factor <- function(predictions, test_set) {
  # mode = "everything" makes the F1 score available in `byClass`
  conf_mat <- confusionMatrix(
    predictions,
    test_set$income,
    mode = "everything"
  )
  class_stats <- conf_mat$byClass

  data.frame(
    Accuracy = conf_mat$overall["Accuracy"],
    F1 = class_stats["F1"],
    Sensitivity = class_stats["Sensitivity"],
    Specificity = class_stats["Specificity"]
  )
}

In [12]:
#' Repeated train/test simulation with a tuned random forest
#'
#' For each of `nrun` runs the data are split 80/20, a random forest is tuned
#' over a grid of `mtry` and `ntree` values with k-fold cross-validation on
#' the training part, and the best model is evaluated on the test part. The
#' evaluation metrics are averaged over all runs.
#'
#' @param data Data frame with a numeric or factor column `income` (the
#'   prediction target).
#' @param nrun Number of simulation runs (at least 1). Run `i` uses the seed
#'   `1234 + i` for the synthetic data generation and the data split.
#' @param kfold Number of cross-validation folds.
#' @param steps_mtry Number of `mtry` candidates between 2 and the number of
#'   predictor columns.
#' @param steps_ntree Number of `ntree` candidates between 100 and 1000.
#' @return A one-row data frame with the metrics averaged over all runs (see
#'   `evaluation_metrics_cont()` / `evaluation_metrics_factor()`).
rf_simulation <- function(data, nrun = 10, kfold = 10, steps_mtry = 5,
                          steps_ntree = 5) {
  # Validate the target once, up front. (`break("...")` does not raise an
  # error in R; it silently leaves the loop.)
  target <- data[["income"]]
  is_regression <- is.numeric(target)
  if (!is_regression && !is.factor(target)) {
    stop("The predicted target has to be numeric or factor.")
  }
  if (!is.numeric(nrun) || length(nrun) != 1L || is.na(nrun) || nrun < 1) {
    stop("`nrun` must be a single number >= 1.")
  }

  # Loop-invariant tuning setup. Candidates are rounded and de-duplicated
  # because both `mtry` and `ntree` are integer-valued parameters.
  control <- trainControl(method = "cv", number = kfold)
  tunegrid <- expand.grid(
    mtry = unique(round(seq(2, ncol(data) - 1, length.out = steps_mtry)))
  )
  ntree_values <- unique(round(seq(100, 1000, length.out = steps_ntree)))

  eval_list <- vector("list", nrun)
  base_seed <- 1234

  for (i in seq_len(nrun)) {
    # vary seed with each run
    set.seed(base_seed + i)

    # generate synthetic data
    # NOTE(review): `gen_data` is never used below -- the models are trained
    # and tested on the original data only. Confirm whether training on the
    # synthetic data was intended; the call is kept so that the random
    # number stream (and thus the data split) is unchanged.
    gen_data <- arf_syn(data)

    # Split the data into training and testing sets
    train_index <- createDataPartition(
      data$income,
      p = .8,
      list = FALSE,
      times = 1
    )
    train_data <- data[train_index, ]
    test_data <- data[-train_index, ]

    # Tune mtry by cross-validation, once per ntree candidate
    results <- list()
    for (ntree in ntree_values) {
      set.seed(123)
      results[[paste0("ntree_", ntree)]] <- train(
        income ~ .,
        data = train_data,
        method = "rf",
        trControl = control,
        tuneGrid = tunegrid,
        ntree = ntree
      )
    }

    # Pick the best fit: highest CV accuracy for a factor target, lowest CV
    # RMSE for a numeric target (caret reports no Accuracy for regression).
    scores <- vapply(results, function(fit) {
      if (is_regression) {
        -min(fit$results$RMSE, na.rm = TRUE)
      } else {
        max(fit$results$Accuracy, na.rm = TRUE)
      }
    }, numeric(1))
    best_model <- results[[which.max(scores)]]

    # Make predictions on the test set and evaluate them
    predictions <- predict(best_model, newdata = test_data)
    eval_list[[i]] <- if (is_regression) {
      as.data.frame(evaluation_metrics_cont(predictions, test_data))
    } else {
      as.data.frame(evaluation_metrics_factor(predictions, test_data))
    }

    print(c("run", i, "completed"))
  }

  # Average over all runs; element-wise arithmetic on data frames keeps the
  # column names and the row names of the first run.
  eval_avg <- Reduce(`+`, eval_list) / length(eval_list)

  eval_avg
}


In [13]:
# Run the simulation on the CPS data with the default settings
# (nrun = 10, kfold = 10, steps_mtry = 5, steps_ntree = 5).
cps_res <- rf_simulation(cpspop)

"Recoding integers with more than 5 unique values as numeric. To override this behavior, explicitly code these variables as factors."


Iteration: 0, Accuracy: 76.46%
Iteration: 1, Accuracy: 35.93%
