# Synthetic Data Generator with a Random Forest Model

## Libraries

In [1]:
Sys.setenv("PKG_CXXFLAGS"="-std=c++14")

# List of required packages
list_of_packages <- c(
  "synthpop", "jsonlite", "codetools", "insight", "party", "haven", "dplyr", "rpart", "rpart.plot",
  "randomForest", "pROC", "caret", "pracma", "here", "Hmisc", "purrr",
  "ranger", "bnlearn", "arulesCBA", "network", "igraph", "xgboost",
  "data.table", "RSNNS", "xgboost", "data.table", "Matrix"
)

# Function to load packages and handle errors
load_if_installed <- function(p) {
  tryCatch({
    library(p, character.only = TRUE)
  }, error = function(e) {
    message(sprintf("Package '%s' is not installed.", p))
  })
}

# Load all required packages
lapply(list_of_packages, load_if_installed)

Find out more at https://www.synthpop.org.uk/

Lade n"otiges Paket: grid

Lade n"otiges Paket: mvtnorm

Lade n"otiges Paket: modeltools

Lade n"otiges Paket: stats4

Lade n"otiges Paket: strucchange

Lade n"otiges Paket: zoo


Attache Paket: 'zoo'


Die folgenden Objekte sind maskiert von 'package:base':

    as.Date, as.Date.numeric


Lade n"otiges Paket: sandwich


Attache Paket: 'dplyr'


Das folgende Objekt ist maskiert 'package:party':

    where


Die folgenden Objekte sind maskiert von 'package:stats':

    filter, lag


Die folgenden Objekte sind maskiert von 'package:base':

    intersect, setdiff, setequal, union


randomForest 4.7-1.1

Type rfNews() to see new features/changes/bug fixes.


Attache Paket: 'randomForest'


Das folgende Objekt ist maskiert 'package:dplyr':

    combine


Type 'citation("pROC")' for a citation.


Attache Paket: 'pROC'


Die folgenden Objekte sind maskiert von 'package:stats':

    cov, smooth, var


Lade n"otiges Paket: ggplot2


Attache Paket: 

## Data

In [2]:
# set path
############## adjust to correct directory!
directory <- "/user/emma.foessing01/u11969/Master-Thesis"

load(file = (paste0(directory, "/cpspop.RData")))
adult <- read.csv(file = paste0(directory,"/adult_preprocessed.csv"))
# delete NAs
adult[adult == "?"] <- NA
adult <- na.omit(adult)

adult$workclass <- as.factor(adult$workclass)
adult$education <- as.factor(adult$education)
adult$marital_status <- as.factor(adult$marital_status)
adult$relationship <- as.factor(adult$relationship)
adult$race <- as.factor(adult$race)
adult$sex <- as.factor(adult$sex)
adult$native_country <- as.factor(adult$native_country)
adult$income <- as.factor(adult$income)
adult$occupation <- as.factor(adult$occupation)

## Synthetic Data

In [11]:
# Function to convert dataframe columns to appropriate types (factors or numeric)
prepare_data <- function(df) {
  df[] <- lapply(df, function(col) {
    if (is.logical(col)) {
      return(as.factor(col))  # Convert logicals to factors
    } else if (is.character(col)) {
      return(as.factor(col))  # Convert characters to factors
    } else {
      return(col)  # Keep numeric and factor columns as they are
    }
  })
  return(df)
}

# Function to sequentially synthesize data using Random Forests with ranger
synthesize_data_rf <- function(data, n_trees = 500) {
  # Prepare data by converting columns to the correct types
  data <- prepare_data(data)
  
  # Initialize synthetic data frame
  syn_data <- data.frame(matrix(ncol = ncol(data), nrow = nrow(data)))
  colnames(syn_data) <- colnames(data)
  
  # Synthesize each variable sequentially
  for (var in colnames(data)) {
    # Identify predictors (all columns except the one being synthesized and those with any NAs)
    predictors <- colnames(syn_data)[!sapply(syn_data, anyNA) & colnames(syn_data) != var]
    
    # If there are no predictors (first variable), randomly sample from the original data
    if (length(predictors) == 0) {
      syn_data[[var]] <- sample(data[[var]], nrow(data), replace = TRUE)
    } else {
      # Prepare data for ranger
      train_data <- syn_data[, predictors, drop = FALSE]
      train_data[[var]] <- data[[var]]
      
      # Fit Random Forest using ranger
      model <- ranger(
        formula = as.formula(paste(var, "~ .")),
        data = train_data,
        num.trees = n_trees,
        probability = is.factor(data[[var]])  # Use probability estimation for factors
      )
      
      # Predict the values for the synthetic data
      if (is.factor(data[[var]])) {
        predictions <- predict(model, data = syn_data[, predictors, drop = FALSE])$predictions
        syn_data[[var]] <- factor(apply(predictions, 1, which.max), levels = 1:nlevels(data[[var]]), labels = levels(data[[var]]))
      } else {
        predictions <- predict(model, data = syn_data[, predictors, drop = FALSE])$predictions
        syn_data[[var]] <- predictions
      }
    }
  }
  
  return(syn_data)
}

## Apply

### CPS

In [12]:
cps_syndata <- synthesize_data_rf(cpspop)

ERROR: Error in ranger(formula = as.formula(paste(var, "~ .")), data = train_data, : User interrupt or internal error.


In [None]:
head(cps_syndata)

Unnamed: 0_level_0,tax,income,csp,age,educ,marital,race,sex,ss
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<fct>,<fct>,<fct>,<fct>,<int>
1,342,46000,0,51,39,1,1,1,0
2,0,145200,0,55,40,1,1,1,8400
3,1600,35265,0,39,34,1,1,1,10146
4,2600,17384,0,40,43,1,1,2,0
5,0,50062,0,36,46,5,1,1,0
6,0,105596,0,81,39,7,1,1,0


### Adult

In [None]:
adult_syndata <- synthesize_data_rf(adult)

## Save results

In [None]:
write.csv(cps_syndata, file = paste0(directory, "/results/RF_cps_syndata.csv"), row.names = FALSE)
write.csv(adult_syndata, file = paste0(directory, "/results/RF_adult_syndata.csv"), row.names = FALSE)