# Synthetic Data Generator with a Random Forest Model

## Libraries

In [None]:
# Set the PKG_CXXFLAGS environment variable for this R session.
# NOTE(review): presumably so that packages compiled from source are built
# with the C++14 standard -- confirm this is still required.
Sys.setenv(PKG_CXXFLAGS = "-std=c++14")

# Packages this notebook tries to attach. Each name appears exactly once
# ("xgboost" and "data.table" were previously listed twice).
list_of_packages <- c(
  "synthpop", "jsonlite", "codetools", "insight", "party", "haven", "dplyr",
  "rpart", "rpart.plot", "randomForest", "pROC", "caret", "pracma", "here",
  "Hmisc", "purrr", "ranger", "bnlearn", "arulesCBA", "network", "igraph",
  "xgboost", "data.table", "RSNNS", "Matrix", "foreach", "doParallel"
)

# Attach a package by name, reporting a failure as a message instead of
# stopping the script.
#
# p: package name as a single character string.
#
# Returns the value of library() when the package attaches, and NULL when it
# does not. A package that cannot be found in the library paths produces the
# message "Package '<p>' is not installed."; a package that is found but fails
# to load produces a separate message that includes the underlying error text,
# so a broken installation is not misreported as a missing one.
load_if_installed <- function(p) {
  tryCatch({
    library(p, character.only = TRUE)
  }, error = function(e) {
    # find.package(quiet = TRUE) returns character(0) for an unknown package.
    if (length(find.package(p, quiet = TRUE)) == 0L) {
      message(sprintf("Package '%s' is not installed.", p))
    } else {
      message(sprintf(
        "Package '%s' is installed but could not be loaded: %s",
        p, conditionMessage(e)
      ))
    }
    invisible(NULL)
  })
}

# Try to attach every package in list_of_packages. Failures are reported by
# load_if_installed() as messages; the returned list is wrapped in invisible()
# so it is not printed into the notebook output.
invisible(lapply(list_of_packages, load_if_installed))

## Data

In [2]:
# Restore the objects stored in cpspop.RData (located at the project root
# reported by here()); the next statement relies on it providing `cpspop`.
load(file = file.path(here(), "cpspop.RData"))

# Reorder the columns so that race, marital and educ come last, in that order;
# all other columns keep their relative order.
cpspop <- local({
  trailing_cols <- c("race", "marital", "educ")
  leading_cols <- setdiff(names(cpspop), trailing_cols)
  cpspop[, c(leading_cols, trailing_cols)]
})

# Read the preprocessed adult data set from the project root and prepare it:
#   1. treat every "?" cell as missing and drop all rows containing an NA,
#   2. convert the categorical columns to factors,
#   3. keep only the columns listed below, in that order.
adult <- local({
  df <- read.csv(file = file.path(here(), "adult_preprocessed.csv"))

  # "?" marks a missing value in this file
  df[df == "?"] <- NA
  df <- na.omit(df)

  categorical_cols <- c(
    "workclass", "education", "marital_status", "relationship", "race",
    "sex", "native_country", "income", "occupation"
  )
  for (col_name in categorical_cols) {
    df[[col_name]] <- as.factor(df[[col_name]])
  }

  column_order <- c(
    "age", "fnlwgt", "capital_gain", "capital_loss", "hours_per_week",
    "income", "sex", "race", "relationship", "marital_status", "workclass",
    "occupation", "education", "native_country"
  )
  df[, column_order]
})


## Synthetic Data

In [3]:
# Create a synthetic version of `data` by sequential synthesis with syn().
#
# data:      data set to synthesise; its column names are used to locate
#            `first_var`.
# first_var: name of the column that is visited first. It is assigned the
#            "sample" method; every other column is assigned the "rf" method.
# seed:      passed unchanged to syn() as its `seed` argument.
#
# Returns the `syn` element of the object returned by syn().
#
# Stops with an error when `first_var` is not a single string or does not
# match exactly one column of `data`. Previously an unknown name was silently
# ignored, leaving every column on "rf" and the visit order unchanged.
synthesize_data_rf <- function(data, first_var, seed) {
  if (!is.character(first_var) || length(first_var) != 1L || is.na(first_var)) {
    stop("`first_var` must be a single column name.", call. = FALSE)
  }

  first_idx <- which(colnames(data) == first_var)
  if (length(first_idx) != 1L) {
    stop(
      sprintf(
        "`first_var` '%s' must match exactly one column of `data` (found %d).",
        first_var, length(first_idx)
      ),
      call. = FALSE
    )
  }

  n_cols <- ncol(data)

  # Synthesis method per column: "rf" everywhere except the first visited
  # column, which is drawn with "sample".
  # NOTE(review): earlier comments called this 'ranger', but the method string
  # actually requested is "rf" -- confirm which one is intended.
  method_list <- rep("rf", n_cols)
  method_list[first_idx] <- "sample"

  # Visit order: `first_var` first, then the remaining columns in their
  # original order.
  visit_sequence <- c(first_idx, setdiff(seq_len(n_cols), first_idx))

  syn_data <- syn(
    data,
    method = method_list,
    visit.sequence = visit_sequence,
    seed = seed
  )

  syn_data$syn
}


## Apply

In [4]:
# Write a synthetic data set to <project root>/results/ as an .rds file.
#
# data:         object to serialise with saveRDS().
# dataset_name: prefix of the output file name.
# seed:         seed used for the synthesis; it becomes part of the file name.
#
# The file is named "<dataset_name>_rf_<seed>.rds". The results directory is
# created when it does not exist yet, so the first run on a fresh checkout no
# longer fails inside saveRDS().
save_synthesized_data <- function(data, dataset_name, seed) {
  file_name <- paste0(dataset_name, "_rf_", as.character(seed), ".rds")
  results_dir <- file.path(here(), "results")

  if (!dir.exists(results_dir)) {
    dir.create(results_dir, recursive = TRUE)
  }

  saveRDS(data, file.path(results_dir, file_name))
}

In [None]:
# Seed shared by both synthesis runs
s <- 1236

# Synthesise each data set, visiting the "sex" column first
cps_syn <- synthesize_data_rf(data = cpspop, first_var = "sex", seed = s)
adult_syn <- synthesize_data_rf(data = adult, first_var = "sex", seed = s)

In [None]:
# Persist both synthetic data sets; the seed `s` is passed on for the file name
save_synthesized_data(data = cps_syn, dataset_name = "cps", seed = s)
save_synthesized_data(data = adult_syn, dataset_name = "adult", seed = s)

In [None]:
# Preview the first rows of the synthetic CPS data
head(x = cps_syn)

In [None]:
# Preview the first rows of the synthetic adult data
head(x = adult_syn)