# Synthetic Data Generator with a Random Forest Model

## Libraries

In [1]:
Sys.setenv("PKG_CXXFLAGS"="-std=c++14")

# List of required packages
list_of_packages <- c(
  "synthpop", "jsonlite", "codetools", "insight", "party", "haven", "dplyr", "rpart", "rpart.plot",
  "randomForest", "pROC", "caret", "pracma", "here", "Hmisc", "purrr",
  "ranger", "bnlearn", "arulesCBA", "network", "igraph", "xgboost",
  "data.table", "RSNNS", "xgboost", "data.table", "Matrix"
)

# Function to load packages and handle errors
load_if_installed <- function(p) {
  tryCatch({
    library(p, character.only = TRUE)
  }, error = function(e) {
    message(sprintf("Package '%s' is not installed.", p))
  })
}

# Load all required packages
lapply(list_of_packages, load_if_installed)

Package 'synthpop' is not installed.



Package 'insight' is not installed.



Loading required package: grid



Loading required package: mvtnorm



Loading required package: modeltools



Loading required package: stats4



Loading required package: strucchange



Loading required package: zoo




Attaching package: ‘zoo’




The following objects are masked from ‘package:base’:

    as.Date, as.Date.numeric




Loading required package: sandwich



Package 'haven' is not installed.




Attaching package: ‘dplyr’




The following object is masked from ‘package:party’:

    where




The following objects are masked from ‘package:stats’:

    filter, lag




The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




Package 'rpart.plot' is not installed.



randomForest 4.7-1.1



Type rfNews() to see new features/changes/bug fixes.




Attaching package: ‘randomForest’




The following object is masked from ‘package:dplyr’:

    combine




Type 'citation("pROC")' for a citation.




Attaching package: ‘pROC’




The following objects are masked from ‘package:stats’:

    cov, smooth, var




Loading required package: ggplot2




Attaching package: ‘ggplot2’




The following object is masked from ‘package:randomForest’:

    margin




Loading required package: lattice



Package 'pracma' is not installed.



here() starts at /home/uni08/hpc/emma.foessing01/u11969/Master-Thesis



Package 'Hmisc' is not installed.




Attaching package: ‘purrr’




The following object is masked from ‘package:caret’:

    lift




The following object is masked from ‘package:jsonlite’:

    flatten





Attaching package: ‘ranger’




The following object is masked from ‘package:randomForest’:

    importance




Package 'bnlearn' is not installed.



Package 'arulesCBA' is not installed.



Package 'network' is not installed.



Package 'igraph' is not installed.



Package 'xgboost' is not installed.




Attaching package: ‘data.table’




The following object is masked from ‘package:purrr’:

    transpose




The following objects are masked from ‘package:dplyr’:

    between, first, last




The following objects are masked from ‘package:zoo’:

    yearmon, yearqtr




Package 'RSNNS' is not installed.



Package 'xgboost' is not installed.



## Data

In [2]:
# set path
############## adjust to correct directory!
directory <- "/user/emma.foessing01/u11969/Master-Thesis"

load(file = (paste0(directory, "/cpspop.RData")))
adult <- read.csv(file = paste0(directory,"/adult_preprocessed.csv"))
# delete NAs
adult[adult == "?"] <- NA
adult <- na.omit(adult)

adult$workclass <- as.factor(adult$workclass)
adult$education <- as.factor(adult$education)
adult$marital_status <- as.factor(adult$marital_status)
adult$relationship <- as.factor(adult$relationship)
adult$race <- as.factor(adult$race)
adult$sex <- as.factor(adult$sex)
adult$native_country <- as.factor(adult$native_country)
adult$income <- as.factor(adult$income)
adult$occupation <- as.factor(adult$occupation)

## Synthetic Data

In [3]:
# Function to convert dataframe columns to appropriate types (factors or numeric)
prepare_data <- function(df) {
  df[] <- lapply(df, function(col) {
    if (is.logical(col)) {
      return(as.factor(col))  # Convert logicals to factors
    } else if (is.character(col)) {
      return(as.factor(col))  # Convert characters to factors
    } else {
      return(col)  # Keep numeric and factor columns as they are
    }
  })
  return(df)
}

# Function to sequentially synthesize data using Random Forests with ranger
synthesize_data_rf <- function(data, n_trees = 500) {
  # Prepare data by converting columns to the correct types
  data <- prepare_data(data)
  
  # Initialize synthetic data frame
  syn_data <- data.frame(matrix(ncol = ncol(data), nrow = nrow(data)))
  colnames(syn_data) <- colnames(data)
  
  # Synthesize each variable sequentially
  for (var in colnames(data)) {
    # Identify predictors (all columns except the one being synthesized and those with any NAs)
    predictors <- colnames(syn_data)[!sapply(syn_data, anyNA) & colnames(syn_data) != var]
    
    # If there are no predictors (first variable), randomly sample from the original data
    if (length(predictors) == 0) {
      syn_data[[var]] <- sample(data[[var]], nrow(data), replace = TRUE)
    } else {
      # Prepare data for ranger
      train_data <- syn_data[, predictors, drop = FALSE]
      train_data[[var]] <- data[[var]]
      
      # Fit Random Forest using ranger
      model <- ranger(
        formula = as.formula(paste(var, "~ .")),
        data = train_data,
        num.trees = n_trees,
        probability = is.factor(data[[var]])  # Use probability estimation for factors
      )
      
      # Predict the values for the synthetic data
      if (is.factor(data[[var]])) {
        predictions <- predict(model, data = syn_data[, predictors, drop = FALSE])$predictions
        syn_data[[var]] <- factor(apply(predictions, 1, which.max), levels = 1:nlevels(data[[var]]), labels = levels(data[[var]]))
      } else {
        predictions <- predict(model, data = syn_data[, predictors, drop = FALSE])$predictions
        syn_data[[var]] <- predictions
      }
    }
  }
  
  return(syn_data)
}

## Apply

### CPS

In [4]:
cps_syndata <- synthesize_data_rf(cpspop)

In [5]:
head(cps_syndata)

Unnamed: 0_level_0,tax,income,csp,age,educ,marital,race,sex,ss
Unnamed: 0_level_1,<int>,<dbl>,<dbl>,<dbl>,<fct>,<fct>,<fct>,<fct>,<dbl>
1,342,40713.7,0.0,48.95159,39,1,1,1,2792.424
2,0,54119.69,126.7822,48.14375,39,1,1,1,2107.766
3,1600,52370.29,189.0203,48.43671,39,1,1,1,2249.23
4,2600,59245.23,228.9901,48.39597,39,1,1,1,2287.761
5,0,54119.69,126.7822,48.14375,39,1,1,1,2107.766
6,0,54119.69,126.7822,48.14375,39,1,1,1,2107.766


### Adult

In [6]:
adult_syndata <- synthesize_data_rf(adult)