# Synthetic Data Generator with a Random Forest Model

## Libraries

In [1]:
# Export PKG_CXXFLAGS so that C++ code compiled from this session uses the
# C++14 standard (presumably required by one of the packages below -- confirm).
Sys.setenv("PKG_CXXFLAGS" = "-std=c++14")

# Packages attached by this notebook. Each name is listed exactly once, in the
# order in which it should be attached.
list_of_packages <- c(
  "synthpop", "jsonlite", "codetools", "insight", "party", "haven", "dplyr",
  "rpart", "rpart.plot", "randomForest", "pROC", "caret", "pracma", "here",
  "Hmisc", "purrr", "ranger", "bnlearn", "arulesCBA", "network", "igraph",
  "xgboost", "data.table", "RSNNS", "Matrix", "foreach", "doParallel"
)

# Attach a single package by name without aborting the notebook on failure.
#
# p: package name, given as one character string.
#
# Returns the value of library() when the package attaches. When attaching
# fails, the underlying error text is reported through message() and NULL is
# returned invisibly.
load_if_installed <- function(p) {
  tryCatch(
    library(p, character.only = TRUE),
    error = function(e) {
      # library() can fail for reasons other than a missing installation, so
      # pass the original error text on instead of guessing the cause.
      message(sprintf(
        "Package '%s' could not be loaded: %s", p, conditionMessage(e)
      ))
      invisible(NULL)
    }
  )
}

# Try to attach every package named in list_of_packages, one after another
lapply(X = list_of_packages, FUN = load_if_installed)

Find out more at https://www.synthpop.org.uk/

Lade nötiges Paket: grid

Lade nötiges Paket: mvtnorm

Lade nötiges Paket: modeltools

Lade nötiges Paket: stats4

Lade nötiges Paket: strucchange

Lade nötiges Paket: zoo


Attache Paket: 'zoo'


Die folgenden Objekte sind maskiert von 'package:base':

    as.Date, as.Date.numeric


Lade nötiges Paket: sandwich


Attache Paket: 'dplyr'


Das folgende Objekt ist maskiert 'package:party':

    where


Die folgenden Objekte sind maskiert von 'package:stats':

    filter, lag


Die folgenden Objekte sind maskiert von 'package:base':

    intersect, setdiff, setequal, union


randomForest 4.7-1.1

Type rfNews() to see new features/changes/bug fixes.


Attache Paket: 'randomForest'


Das folgende Objekt ist maskiert 'package:dplyr':

    combine


Type 'citation("pROC")' for a citation.


Attache Paket: 'pROC'


Die folgenden Objekte sind maskiert von 'package:stats':

    cov, smooth, var


Lade nötiges Paket: ggplot2


Attache Paket: 

## Data

In [2]:
# --- CPS ---------------------------------------------------------------------
# Restore the objects stored in cpspop.RData (expected to provide `cpspop` --
# confirm), then move race, marital and educ to the end of the column order.
load(file = paste0(here(), "/cpspop.RData"))
trailing_cols <- c("race", "marital", "educ")
cpspop <- cpspop[, c(setdiff(names(cpspop), trailing_cols), trailing_cols)]

# --- Adult -------------------------------------------------------------------
adult <- read.csv(file = paste0(here(), "/adult_preprocessed.csv"))

# "?" marks a missing value in this file: recode it to NA and drop every row
# that contains at least one NA.
adult[adult == "?"] <- NA
adult <- na.omit(adult)

# Convert the categorical columns to factors in one step
factor_cols <- c(
  "workclass", "education", "marital_status", "relationship", "race",
  "sex", "native_country", "income", "occupation"
)
adult[factor_cols] <- lapply(adult[factor_cols], as.factor)

# Keep only the modelling columns, in this fixed order
adult <- adult[, c(
  "age", "fnlwgt", "capital_gain", "capital_loss", "hours_per_week",
  "income", "sex", "race", "relationship", "marital_status",
  "workclass", "occupation", "education", "native_country"
)]


## Synthetic Data

In [7]:
# Sequentially synthesise a data set with synthpop::syn(), using the "ranger"
# method for every column except the first one visited.
#
# data:      data frame to synthesise.
# first_var: name of the column that is visited first; it is synthesised with
#            the "sample" method.
# seed:      optional seed. When supplied it is forwarded to syn(); when NULL
#            (the default) syn() is called without a seed argument, exactly as
#            before this parameter existed.
#
# Returns the `syn` element of the object produced by syn().
synthesize_data_rf <- function(data, first_var, seed = NULL) {
  # Locate the starting column once and fail early on a wrong name, instead of
  # silently running the synthesis without a "sample" column.
  first_idx <- match(first_var, colnames(data))
  if (is.na(first_idx)) {
    stop(
      sprintf("Column '%s' not found in data.", first_var),
      call. = FALSE
    )
  }

  # "ranger" for every column, "sample" for the starting column
  method_list <- rep("ranger", ncol(data))
  method_list[first_idx] <- "sample"

  # Visit the starting column first, then the rest in their original order.
  # seq_len() keeps this correct even for a zero-column input.
  visit_sequence <- c(first_idx, setdiff(seq_len(ncol(data)), first_idx))

  # Only pass `seed` on when the caller asked for one
  if (is.null(seed)) {
    syn_data <- syn(
      data,
      method = method_list,
      visit.sequence = visit_sequence
    )
  } else {
    syn_data <- syn(
      data,
      method = method_list,
      visit.sequence = visit_sequence,
      seed = seed
    )
  }

  syn_data$syn
}


## Apply

In [None]:
# Save a synthetic data set as an .rds file below <project root>/results.
#
# data:         object to store.
# dataset_name: label of the source data set; becomes part of the file name.
# seed:         seed used for the run; becomes part of the file name.
#
# The file is named "data_rf_<dataset_name>_<seed>.rds". The results directory
# is created when it does not exist yet, so a fresh checkout does not fail on
# the first save.
save_synthesized_data <- function(data, dataset_name, seed) {
  file_name <- paste0("data_rf_", dataset_name, "_", seed, ".rds")
  results_dir <- paste0(here(), "/results")
  if (!dir.exists(results_dir)) {
    dir.create(results_dir, recursive = TRUE)
  }
  saveRDS(data, paste0(results_dir, "/", file_name))
}

# One-off run with a fixed seed. R's global RNG is seeded immediately before
# each synthesis call, the same pattern the CPS cell further down uses.
s <- 1235
set.seed(s)
cps_syn <- synthesize_data_rf(cpspop, first_var = "sex")
set.seed(s)
adult_syn <- synthesize_data_rf(adult, first_var = "sex")

# save_synthesized_data() takes (data, dataset_name, seed); the "rf" tag is
# already part of the file name it builds, and each data set is stored under
# its own label.
save_synthesized_data(cps_syn, "cps", s)
save_synthesized_data(adult_syn, "adult", s)

# Datasets
datasets <- list(cps = cpspop, adult = adult)

# Parameters
n <- 10  # Number of runs
initial_seed <- 1234

# Main loop for generating and saving synthetic data: one file per
# (seed, data set) pair, with seeds initial_seed + 1, ..., initial_seed + n.
for (i in seq_len(n)) {
  seed <- initial_seed + i
  for (dataset_name in names(datasets)) {
    dataset <- datasets[[dataset_name]]
    # Re-seed before every call so each output depends only on its own seed
    set.seed(seed)
    synthetic_data <- synthesize_data_rf(dataset, first_var = "sex")
    save_synthesized_data(synthetic_data, dataset_name, seed)
  }
}

### CPS

In [None]:
# Fix the state of R's random number generator for the cells that follow
set.seed(seed = 1235)

In [None]:
# Synthetic CPS data; "sex" is the first column visited
cps_syndata <- synthesize_data_rf(data = cpspop, first_var = "sex")

### Adult

In [None]:
# Synthetic Adult data; "sex" is the first column visited
adult_syndata <- synthesize_data_rf(data = adult, first_var = "sex")

## Save results

# Export both synthetic data sets as CSV files without row names. The project
# root comes from here(), as everywhere else in this file; no `directory`
# variable is defined in this notebook.
write.csv(
  cps_syndata,
  file = paste0(here(), "/results/RF_cps_syndata.csv"),
  row.names = FALSE
)
write.csv(
  adult_syndata,
  file = paste0(here(), "/results/RF_adult_syndata.csv"),
  row.names = FALSE
)