# Synthetic Data Generator with an Extreme Gradient Boosting (XGBoost) Model

## Libraries

In [1]:
# Set the PKG_CXXFLAGS environment variable to "-std=c++14" for this session.
# NOTE(review): presumably needed when packages are compiled from source - confirm.
Sys.setenv("PKG_CXXFLAGS"="-std=c++14")

# List of required packages (each package listed once; the order is the
# order in which they are attached, which determines function masking)
list_of_packages <- c(
  "synthpop", "jsonlite", "codetools", "insight", "party", "haven", "dplyr", "rpart", "rpart.plot",
  "randomForest", "pROC", "caret", "pracma", "here", "Hmisc", "purrr",
  "ranger", "bnlearn", "arulesCBA", "network", "igraph", "xgboost",
  "data.table", "RSNNS", "Matrix"
)

# Function to load packages and handle errors
#
# Attaches package `p` with library(). A failure never stops the caller; it is
# reported through message() instead:
#   * if no installation of `p` can be found, the package is reported as
#     not installed;
#   * otherwise the package exists but failed to load (e.g. a broken
#     dependency), and the underlying error message is shown so the real
#     cause is not hidden behind a misleading "not installed" note.
#
# Args:
#   p: package name as a single character string.
#
# Returns:
#   The value of library() on success, NULL (invisibly) after a failure.
load_if_installed <- function(p) {
  tryCatch(
    library(p, character.only = TRUE),
    error = function(e) {
      # find.package(quiet = TRUE) returns character(0) for unknown packages
      installed <- is.character(p) && length(p) == 1L &&
        length(find.package(p, quiet = TRUE)) > 0L
      if (installed) {
        message(sprintf(
          "Package '%s' could not be loaded: %s", p, conditionMessage(e)
        ))
      } else {
        message(sprintf("Package '%s' is not installed.", p))
      }
    }
  )
}

# Load all required packages
# load_if_installed() reports a failed load via message() instead of raising
# an error, so one unavailable package does not stop the remaining ones from
# being attached.
lapply(list_of_packages, load_if_installed)

Find out more at https://www.synthpop.org.uk/



Loading required package: grid



Loading required package: mvtnorm



Loading required package: modeltools



Loading required package: stats4



Loading required package: strucchange



Loading required package: zoo




Attaching package: ‘zoo’




The following objects are masked from ‘package:base’:

    as.Date, as.Date.numeric




Loading required package: sandwich



Package 'haven' is not installed.




Attaching package: ‘dplyr’




The following object is masked from ‘package:party’:

    where




The following objects are masked from ‘package:stats’:

    filter, lag




The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




randomForest 4.7-1.1



Type rfNews() to see new features/changes/bug fixes.




Attaching package: ‘randomForest’




The following object is masked from ‘package:dplyr’:

    combine




Type 'citation("pROC")' for a citation.




Attaching package: ‘pROC’




The following objects are masked from ‘package:stats’:

    cov, smooth, var




Loading required package: ggplot2




Attaching package: ‘ggplot2’




The following object is masked from ‘package:randomForest’:

    margin




Loading required package: lattice



here() starts at /home/uni08/hpc/emma.foessing01/u11969/Master-Thesis




Attaching package: ‘Hmisc’




The following object is masked from ‘package:pracma’:

    ceil




The following objects are masked from ‘package:dplyr’:

    src, summarize




The following objects are masked from ‘package:base’:

    format.pval, units





Attaching package: ‘purrr’




The following object is masked from ‘package:pracma’:

    cross




The following object is masked from ‘package:caret’:

    lift




The following object is masked from ‘package:jsonlite’:

    flatten





Attaching package: ‘ranger’




The following object is masked from ‘package:randomForest’:

    importance




Package 'bnlearn' is not installed.



Package 'arulesCBA' is not installed.



Package 'network' is not installed.



Package 'igraph' is not installed.




Attaching package: ‘data.table’




The following object is masked from ‘package:purrr’:

    transpose




The following objects are masked from ‘package:dplyr’:

    between, first, last




The following objects are masked from ‘package:zoo’:

    yearmon, yearqtr




Loading required package: Rcpp




Attaching package: ‘RSNNS’




The following objects are masked from ‘package:caret’:

    confusionMatrix, train





Attaching package: ‘Matrix’




The following objects are masked from ‘package:pracma’:

    expm, lu, tril, triu




## Data

In [2]:
# Project path used for all inputs and outputs
############## adjust to the correct directory!
directory <- "/user/emma.foessing01/u11969/Master-Thesis"

# Read the CPS workspace image and the preprocessed Adult CSV
load(file = paste0(directory, "/cpspop.RData"))
adult <- read.csv(file = paste0(directory, "/adult_preprocessed.csv"))

# Cells holding "?" are treated as missing; incomplete rows are dropped
adult[adult == "?"] <- NA
adult <- na.omit(adult)

# Categorical Adult columns are stored as factors
factor_columns <- c(
  "workclass", "education", "marital_status", "relationship", "race",
  "sex", "native_country", "income", "occupation"
)
adult[factor_columns] <- lapply(adult[factor_columns], as.factor)

## Synthetic Data

In [3]:
# Function to convert dataframe to numeric matrix
#
# Encodes every column numerically and returns the result as a matrix:
#   * factor columns become their integer level codes (1, 2, ...);
#   * character columns are first turned into factors (levels in sorted
#     order) and then encoded the same way - without this step as.matrix()
#     would return a character matrix;
#   * logical columns become 0 / 1;
#   * all other columns are passed through unchanged.
#
# Args:
#   df: a data frame.
#
# Returns:
#   A matrix with one column per column of `df` (column names preserved).
convert_to_numeric_matrix <- function(df) {
  encoded <- lapply(df, function(col) {
    if (is.character(col)) {
      col <- factor(col)
    }
    if (is.factor(col) || is.logical(col)) {
      as.numeric(col)
    } else {
      col
    }
  })
  df[] <- encoded

  as.matrix(df)
}

# Function to restore factors from numeric values
#
# Gives every column of `synthesized_df` the type of the column at the same
# position in `original_df`:
#   * where the original column is a factor, the synthesized column becomes a
#     factor with the original levels. Numeric input is read as 1-based level
#     codes; input that is already a factor or character vector is matched by
#     label (matching labels against the codes would turn every value into NA);
#   * every other column is coerced with as.numeric().
#
# Args:
#   original_df:    data frame that defines the column types and factor levels.
#   synthesized_df: data frame with the same number of columns, in the same
#                   order.
#
# Returns:
#   `synthesized_df` with converted columns.
restore_factors <- function(original_df, synthesized_df) {
  stopifnot(ncol(original_df) == ncol(synthesized_df))

  restored <- lapply(seq_along(original_df), function(i) {
    reference <- original_df[[i]]
    values <- synthesized_df[[i]]

    if (!is.factor(reference)) {
      return(as.numeric(values))
    }

    level_labels <- levels(reference)
    if (is.factor(values) || is.character(values)) {
      # Already labelled: only align the level set with the original
      factor(as.character(values), levels = level_labels)
    } else {
      # Numeric level codes: map code k to the k-th original level
      factor(values, levels = seq_along(level_labels), labels = level_labels)
    }
  })
  synthesized_df[] <- restored

  synthesized_df
}

# Function to synthesize all variables in the dataframe sequentially using XGBoost
#
# Columns are synthesized in their order in `data`:
#   * the first column has no predictors and is drawn with replacement from
#     its observed values;
#   * every later column j is modelled on the ORIGINAL data, using columns
#     1..(j-1) as predictors, and then predicted from the already synthesized
#     columns 1..(j-1).
# Factor targets use the "multi:softmax" objective, all other targets use
# "reg:squarederror" (max_depth = 3, eta = 0.1 in both cases).
#
# Args:
#   data:    data frame; columns are expected to be factor or numeric.
#   nrounds: number of boosting rounds per model.
#
# Returns:
#   A data frame with the same column names and number of rows as `data`;
#   factor columns keep the original levels.
#
# NOTE(review): a later definition with the same name in this file replaces
# this function when both are sourced.
synthesize_data_xgb <- function(data, nrounds = 100) {
  col_names <- colnames(data)
  n_rows <- nrow(data)
  data_synth <- data

  for (j in seq_along(col_names)) {
    var_j <- col_names[j]
    y <- data[[var_j]]

    # seq_len() is empty for j == 1 (1:(j - 1) would give c(1, 0))
    predictors <- col_names[seq_len(j - 1)]

    if (length(predictors) == 0) {
      # Nothing to condition on: resample the observed values
      data_synth[[var_j]] <- y[sample.int(n_rows, n_rows, replace = TRUE)]
      next
    }

    # Fit on the original predictors, predict from the synthesized ones
    X_train <- convert_to_numeric_matrix(data[, predictors, drop = FALSE])
    X_synth <- convert_to_numeric_matrix(data_synth[, predictors, drop = FALSE])

    # Choose the objective based on the type of variable
    if (is.factor(y)) {
      num_classes <- length(levels(y))
      y_numeric <- as.numeric(y) - 1  # class labels must start at 0
      params <- list(
        objective = "multi:softmax",
        num_class = num_classes,
        max_depth = 3,
        eta = 0.1
      )
    } else {
      y_numeric <- as.numeric(y)
      params <- list(
        objective = "reg:squarederror",
        max_depth = 3,
        eta = 0.1
      )
    }

    dtrain <- xgb.DMatrix(data = X_train, label = y_numeric)
    model <- xgb.train(params = params, data = dtrain, nrounds = nrounds)
    predictions <- predict(model, newdata = xgb.DMatrix(data = X_synth))

    if (is.factor(y)) {
      # Map the 0-based predicted class back to the original level labels
      data_synth[[var_j]] <- factor(
        predictions,
        levels = 0:(num_classes - 1),
        labels = levels(y)
      )
    } else {
      data_synth[[var_j]] <- predictions
    }
  }

  # Columns already carry their final types here; running restore_factors()
  # on factor columns would match labels against level codes and yield NA.
  data_synth
}

In [4]:
# Validate the input and normalise its column types.
#
# Args:
#   df: the data to prepare; anything that is not a data frame is rejected.
#
# Returns:
#   `df` with logical and character columns replaced by factors; numeric and
#   factor columns are returned untouched.
prepare_data <- function(df) {
  if (!is.data.frame(df)) {
    stop("Input data must be a data frame.")
  }

  for (idx in seq_along(df)) {
    column <- df[[idx]]
    # Only logical and character columns need converting
    if (is.logical(column) || is.character(column)) {
      df[[idx]] <- as.factor(column)
    }
  }

  df
}

# Function to sequentially synthesize data using XGBoost
#
# Procedure:
#   1. `first_var` is drawn with replacement from its observed values.
#   2. Every other column is handled in the column order of `data`: an XGBoost
#      model for that column is fitted on the ORIGINAL data, using the columns
#      synthesized so far as predictors, and the synthetic values are then
#      predicted from the corresponding SYNTHETIC predictor columns.
#      (Fitting on the synthetic predictors against original targets would
#      pair rows that do not belong together.)
#   Factor targets use "multi:softmax"; other targets use "reg:squarederror"
#   and get Gaussian noise with sd = noise_factor * sd(predictions) added.
#
# Args:
#   data:         data frame to synthesize (passed through prepare_data()).
#   first_var:    name of the column that is resampled directly.
#   nrounds:      number of boosting rounds per model.
#   eta:          XGBoost learning rate.
#   max_depth:    maximum tree depth.
#   noise_factor: scale of the noise added to numeric predictions.
#
# Returns:
#   A data frame with the same column names and number of rows as `data`.
#
# Raises:
#   An error if `first_var` is not a column of `data`.
synthesize_data_xgb <- function(data, first_var, nrounds = 100, eta = 0.1, max_depth = 3, noise_factor = 0.01) {
  # Ensure that the first_var exists in the data
  if (!first_var %in% colnames(data)) {
    stop(paste("The column", first_var, "does not exist in the dataframe."))
  }

  # Prepare data by checking and converting columns to the correct types
  data <- prepare_data(data)
  n_rows <- nrow(data)

  # Dummy-coded predictor matrix without intercept. na.pass keeps rows with
  # missing values so the matrix always has one row per input row.
  design_matrix <- function(df) {
    frame <- model.frame(~ . - 1, data = df, na.action = na.pass)
    model.matrix(attr(frame, "terms"), data = frame)
  }

  # Initialize synthetic data frame with the same structure as the original data
  syn_data <- data.frame(matrix(NA, ncol = ncol(data), nrow = n_rows))
  colnames(syn_data) <- colnames(data)

  # Step 1: resample the first variable. Indexing via sample.int() avoids the
  # sample(x) special case for a single numeric value.
  syn_data[[first_var]] <- data[[first_var]][sample.int(n_rows, n_rows, replace = TRUE)]

  # Columns synthesized so far, tracked explicitly instead of being inferred
  # from the NA status of the first row
  synthesized <- first_var

  # Step 2: Synthesize the remaining variables sequentially
  for (var in setdiff(colnames(data), first_var)) {
    print(paste("Synthesizing variable:", var))

    # Predictors in the column order of the data
    predictors <- colnames(data)[colnames(data) %in% synthesized]

    observed <- data[[var]]
    target_is_factor <- is.factor(observed)

    params <- list(
      objective = if (target_is_factor) "multi:softmax" else "reg:squarederror",
      eta = eta,
      max_depth = max_depth
    )

    if (target_is_factor) {
      params$num_class <- length(levels(observed))
      label <- as.numeric(observed) - 1  # class labels must be 0-based
    } else {
      label <- observed
    }

    # Rows without an observed target cannot be used for fitting
    has_label <- !is.na(label)

    # Fit on the original data
    train_matrix <- design_matrix(data[has_label, predictors, drop = FALSE])
    model <- xgboost::xgboost(
      data = train_matrix,
      label = label[has_label],
      params = params,
      nrounds = nrounds,
      verbose = 0
    )

    # Predict from the synthetic predictors; factor levels are identical in
    # both data frames, so the matrix columns line up
    pred_matrix <- design_matrix(syn_data[, predictors, drop = FALSE])
    predictions <- predict(model, newdata = pred_matrix)

    if (target_is_factor) {
      syn_data[[var]] <- factor(
        predictions + 1,
        levels = seq_along(levels(observed)),
        labels = levels(observed)
      )
    } else {
      # Add random noise to numeric predictions
      syn_data[[var]] <- predictions +
        rnorm(length(predictions), mean = 0, sd = noise_factor * sd(predictions))
    }

    synthesized <- c(synthesized, var)
  }

  syn_data
}

## Apply

### CPS

In [5]:
# Generate synthetic CPS data: "sex" is resampled from the observed column,
# all remaining columns are then synthesized one after another.
# No RNG seed is set in the cells shown here, so repeated runs give
# different results.
cps_syndata <- synthesize_data_xgb(cpspop, first_var = "sex")

[1] "Synthesizing variable: tax"
[1] "Synthesizing variable: income"
[1] "Synthesizing variable: csp"
[1] "Synthesizing variable: age"
[1] "Synthesizing variable: educ"
[1] "Synthesizing variable: marital"
[1] "Synthesizing variable: race"
[1] "Synthesizing variable: ss"


### Adult

In [6]:
# Generate synthetic Adult data: "sex" is resampled from the observed column,
# all remaining columns are then synthesized one after another.
# No RNG seed is set in the cells shown here, so repeated runs give
# different results.
adult_syndata <- synthesize_data_xgb(adult, first_var = "sex")

[1] "Synthesizing variable: age"
[1] "Synthesizing variable: workclass"
[1] "Synthesizing variable: fnlwgt"
[1] "Synthesizing variable: education"
[1] "Synthesizing variable: marital_status"
[1] "Synthesizing variable: occupation"
[1] "Synthesizing variable: relationship"
[1] "Synthesizing variable: race"
[1] "Synthesizing variable: capital_gain"
[1] "Synthesizing variable: capital_loss"
[1] "Synthesizing variable: hours_per_week"
[1] "Synthesizing variable: native_country"
[1] "Synthesizing variable: income"


## Save results

In [7]:
# Write both synthetic data sets as CSV files (without row names) into the
# "results" folder below `directory`.
# NOTE(review): the "results" folder presumably has to exist beforehand - confirm.
write.csv(cps_syndata, file = paste0(directory, "/results/XGB_cps_syndata.csv"), row.names = FALSE)
write.csv(adult_syndata, file = paste0(directory, "/results/XGB_adult_syndata.csv"), row.names = FALSE)