# Synthetic Data Generator with a Random Forest Model

## Libraries

In [1]:
# Set PKG_CXXFLAGS for this session.
# NOTE(review): presumably so that packages compiled from source are built
# with the C++14 standard -- confirm this is still required.
Sys.setenv(PKG_CXXFLAGS = "-std=c++14")

# Packages attached by this notebook. Each package is listed exactly once.
# The order is significant: a package attached later masks same-named
# functions of the packages attached before it.
list_of_packages <- c(
  "synthpop", "jsonlite", "codetools", "insight", "party", "haven", "dplyr",
  "rpart", "rpart.plot", "randomForest", "pROC", "caret", "pracma", "here",
  "Hmisc", "purrr", "ranger", "bnlearn", "arulesCBA", "network", "igraph",
  "xgboost", "data.table", "RSNNS", "Matrix", "foreach", "doParallel"
)

# Attach a package by name, reporting a problem instead of aborting.
#
# Args:
#   p: Package name as a single character string.
#
# Returns:
#   The value of library() when the package is attached successfully.
#   NULL (invisibly) when the package is not installed or fails to load; in
#   both of these cases a message describing the problem is emitted.
load_if_installed <- function(p) {
  # Tell "not installed" apart from "installed but failed to load", so that a
  # broken installation is not misreported as a missing package.
  if (!nzchar(system.file(package = p))) {
    message(sprintf("Package '%s' is not installed.", p))
    return(invisible(NULL))
  }

  tryCatch(
    library(p, character.only = TRUE),
    error = function(e) {
      message(sprintf(
        "Package '%s' could not be loaded: %s", p, conditionMessage(e)
      ))
      invisible(NULL)
    }
  )
}

# Attach every package named in list_of_packages. The call is made only for
# its side effects, so the list of return values is discarded instead of
# being auto-printed at top level.
invisible(lapply(list_of_packages, load_if_installed))

Find out more at https://www.synthpop.org.uk/



Loading required package: grid



Loading required package: mvtnorm



Loading required package: modeltools



Loading required package: stats4



Loading required package: strucchange



Loading required package: zoo




Attaching package: ‘zoo’




The following objects are masked from ‘package:base’:

    as.Date, as.Date.numeric




Loading required package: sandwich




Attaching package: ‘dplyr’




The following object is masked from ‘package:party’:

    where




The following objects are masked from ‘package:stats’:

    filter, lag




The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




randomForest 4.7-1.1



Type rfNews() to see new features/changes/bug fixes.




Attaching package: ‘randomForest’




The following object is masked from ‘package:dplyr’:

    combine




Type 'citation("pROC")' for a citation.




Attaching package: ‘pROC’




The following objects are masked from ‘package:stats’:

    cov, smooth, var




Loading required package: ggplot2




Attaching package: ‘ggplot2’




The following object is masked from ‘package:randomForest’:

    margin




Loading required package: lattice



here() starts at /home/uni08/hpc/emma.foessing01/u11969/Master-Thesis




Attaching package: ‘Hmisc’




The following object is masked from ‘package:pracma’:

    ceil




The following objects are masked from ‘package:dplyr’:

    src, summarize




The following objects are masked from ‘package:base’:

    format.pval, units





Attaching package: ‘purrr’




The following object is masked from ‘package:pracma’:

    cross




The following object is masked from ‘package:caret’:

    lift




The following object is masked from ‘package:jsonlite’:

    flatten





Attaching package: ‘ranger’




The following object is masked from ‘package:randomForest’:

    importance





Attaching package: ‘bnlearn’




The following object is masked from ‘package:Hmisc’:

    impute




The following object is masked from ‘package:synthpop’:

    compare




Loading required package: Matrix




Attaching package: ‘Matrix’




The following objects are masked from ‘package:pracma’:

    expm, lu, tril, triu




Loading required package: arules




Attaching package: ‘arules’




The following object is masked from ‘package:bnlearn’:

    discretize




The following object is masked from ‘package:pracma’:

    size




The following object is masked from ‘package:dplyr’:

    recode




The following object is masked from ‘package:modeltools’:

    info




The following objects are masked from ‘package:base’:

    abbreviate, write





Attaching package: ‘arulesCBA’




The following object is masked from ‘package:party’:

    response




Package 'network' is not installed.




Attaching package: ‘igraph’




The following object is masked from ‘package:arules’:

    union




The following objects are masked from ‘package:bnlearn’:

    as.igraph, compare, degree, subgraph




The following objects are masked from ‘package:purrr’:

    compose, simplify




The following objects are masked from ‘package:dplyr’:

    as_data_frame, groups, union




The following object is masked from ‘package:modeltools’:

    clusters




The following object is masked from ‘package:synthpop’:

    compare




The following objects are masked from ‘package:stats’:

    decompose, spectrum




The following object is masked from ‘package:base’:

    union





Attaching package: ‘data.table’




The following object is masked from ‘package:purrr’:

    transpose




The following objects are masked from ‘package:dplyr’:

    between, first, last




The following objects are masked from ‘package:zoo’:

    yearmon, yearqtr




Loading required package: Rcpp




Attaching package: ‘RSNNS’




The following objects are masked from ‘package:caret’:

    confusionMatrix, train





Attaching package: ‘foreach’




The following objects are masked from ‘package:purrr’:

    accumulate, when




Loading required package: iterators



Loading required package: parallel



## Data

In [2]:
# Load the CPS data; the .RData file is expected to provide the object
# `cpspop`, which is used below.
load(file = file.path(here(), "cpspop.RData"))

# Move race, marital and educ to the end of the data frame while keeping the
# remaining columns in their existing order.
cpspop <- local({
  trailing_cols <- c("race", "marital", "educ")
  leading_cols <- setdiff(names(cpspop), trailing_cols)
  cpspop[, c(leading_cols, trailing_cols)]
})

# Read the preprocessed Adult data.
adult <- read.csv(file = file.path(here(), "adult_preprocessed.csv"))

# Treat "?" entries as missing, then drop every row with a missing value.
adult[adult == "?"] <- NA
adult <- na.omit(adult)

# Convert the categorical columns to factors and arrange the columns in a
# fixed order (numeric columns first, categorical columns afterwards).
adult <- local({
  categorical_cols <- c(
    "workclass", "education", "marital_status", "relationship", "race",
    "sex", "native_country", "income", "occupation"
  )
  column_order <- c(
    "age", "fnlwgt", "capital_gain", "capital_loss", "hours_per_week",
    "income", "sex", "race", "relationship", "marital_status", "workclass",
    "occupation", "education", "native_country"
  )

  out <- adult
  out[categorical_cols] <- lapply(out[categorical_cols], as.factor)
  out[, column_order]
})


## Synthetic Data

In [3]:
# Sequentially synthesize a dataset with synthpop's syn().
#
# The column named by `first_var` is visited first and generated with method
# "sample"; all other columns are visited afterwards, in their existing
# column order, and generated with method "rf".
# NOTE(review): "rf" and "ranger" are distinct method names in synthpop; this
# function passes "rf" -- confirm that this is the intended backend.
#
# Args:
#   data:      Data frame to synthesize.
#   first_var: Name (single string) of the column to synthesize first.
#   seed:      Seed forwarded to syn() via its `seed` argument.
#
# Returns:
#   The `syn` element of the object returned by syn().
#
# Raises:
#   An error if `first_var` is not a single string naming a column of `data`.
#   Without this check a misspelled name would silently leave no column set
#   to "sample" and fall back to the default column order.
synthesize_data_rf <- function(data, first_var, seed) {
  col_names <- colnames(data)
  if (!is.character(first_var) || length(first_var) != 1L ||
        !first_var %in% col_names) {
    stop("`first_var` must be the name of a column in `data`.", call. = FALSE)
  }
  first_idx <- which(col_names == first_var)

  # One synthesis method per column: "rf" everywhere except the first
  # variable, which has no predictors yet and is therefore sampled.
  method_list <- rep("rf", ncol(data))
  method_list[first_idx] <- "sample"

  # Visit the first variable, then every remaining column in column order.
  visit_sequence <- c(first_idx, setdiff(seq_len(ncol(data)), first_idx))

  syn_data <- syn(
    data,
    method = method_list,
    visit.sequence = visit_sequence,
    seed = seed
  )

  syn_data$syn
}


## Apply

In [4]:
# Save a synthetic dataset as an .rds file in the project's results folder.
#
# The file is written to <here()>/results/<dataset_name>_rf_<seed>.rds.
#
# Args:
#   data:         Object to serialize (the synthetic dataset).
#   dataset_name: Prefix of the file name.
#   seed:         Seed used for the synthesis; becomes part of the file name.
#
# Returns:
#   The value of saveRDS().
save_synthesized_data <- function(data, dataset_name, seed) {
  results_dir <- file.path(here(), "results")
  # saveRDS() cannot write into a directory that does not exist, so create
  # the results folder on first use.
  if (!dir.exists(results_dir)) {
    dir.create(results_dir, recursive = TRUE)
  }

  file_name <- paste0(dataset_name, "_rf_", as.character(seed), ".rds")
  saveRDS(data, file.path(results_dir, file_name))
}

In [5]:
# Seed passed to both synthesis runs.
s <- 1243

# Synthesize each dataset with "sex" as the first variable.
cps_syn <- synthesize_data_rf(data = cpspop, first_var = "sex", seed = s)
adult_syn <- synthesize_data_rf(data = adult, first_var = "sex", seed = s)


Synthesis
-----------
 sex tax income csp age ss race marital educ



Synthesis
-----------
 sex age fnlwgt capital_gain capital_loss hours_per_week income race relationship marital_status
 workclass occupation education native_country


In [6]:
# Write both synthetic datasets to disk, tagging each with the seed `s`.
save_synthesized_data(data = cps_syn, dataset_name = "cps", seed = s)
save_synthesized_data(data = adult_syn, dataset_name = "adult", seed = s)

In [7]:
# Show the first six rows of the synthetic CPS data.
head(cps_syn, n = 6L)

Unnamed: 0_level_0,tax,income,csp,age,sex,ss,race,marital,educ
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<fct>,<int>,<fct>,<fct>,<fct>
1,0,15992,0,35,2,0,1,4,35
2,4400,47894,0,35,1,0,1,7,44
3,1040,81000,0,39,1,0,1,1,43
4,5460,47060,0,46,2,0,1,1,39
5,5000,26178,0,39,1,0,1,7,34
6,0,53110,0,74,1,8898,1,3,40


In [8]:
# Show the first six rows of the synthetic Adult data.
head(adult_syn, n = 6L)

Unnamed: 0_level_0,age,fnlwgt,capital_gain,capital_loss,hours_per_week,income,sex,race,relationship,marital_status,workclass,occupation,education,native_country
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>
1,22,75821,2597,1762,37,<=50K,0,4,3,4,2,9,9,38
2,45,221947,0,0,40,<=50K,1,4,0,2,2,11,4,25
3,53,121912,0,0,50,>50K,1,4,0,2,1,9,12,38
4,25,209214,0,0,55,<=50K,0,4,1,4,2,11,1,38
5,59,59496,0,0,35,>50K,1,4,0,2,4,11,11,38
6,57,148492,0,1590,40,<=50K,1,4,0,2,2,13,2,38
