# Synthetic Data Generator with an Extreme Gradient Boosting (XGBoost) Model

## Libraries

In [1]:
Sys.setenv("PKG_CXXFLAGS"="-std=c++14")

# List of required packages.
# Each package is listed exactly once; the order is the order in which the
# packages get attached, which determines the masking between them.
list_of_packages <- c(
  "synthpop", "jsonlite", "codetools", "insight", "party", "haven", "dplyr", "rpart", "rpart.plot",
  "randomForest", "pROC", "caret", "pracma", "here", "Hmisc", "purrr",
  "ranger", "bnlearn", "arulesCBA", "network", "igraph", "xgboost",
  "data.table", "RSNNS", "Matrix", "ExtDist"
)

# Attach a package by name without aborting the script when that fails.
#
# Args:
#   p: Package name as a single character string.
#
# Returns:
#   The value of library() on success (invisibly); NULL (invisibly) when the
#   package could not be attached.
#
# A package that is missing from the library is reported as "not installed".
# A package that is installed but fails to load (e.g. because of a broken
# dependency) is reported together with the underlying error message, so the
# real cause is not hidden behind a misleading "not installed" note.
load_if_installed <- function(p) {
  tryCatch(
    library(p, character.only = TRUE),
    error = function(e) {
      # system.file() returns "" when the package cannot be found
      if (nzchar(system.file(package = p))) {
        message(sprintf(
          "Package '%s' could not be loaded: %s", p, conditionMessage(e)
        ))
      } else {
        message(sprintf("Package '%s' is not installed.", p))
      }
      invisible(NULL)
    }
  )
}

# Load all required packages. invisible() keeps the list of per-package
# return values from being printed; the startup messages are still shown.
invisible(lapply(list_of_packages, load_if_installed))

Find out more at https://www.synthpop.org.uk/



Loading required package: grid



Loading required package: mvtnorm



Loading required package: modeltools



Loading required package: stats4



Loading required package: strucchange



Loading required package: zoo




Attaching package: ‘zoo’




The following objects are masked from ‘package:base’:

    as.Date, as.Date.numeric




Loading required package: sandwich




Attaching package: ‘dplyr’




The following object is masked from ‘package:party’:

    where




The following objects are masked from ‘package:stats’:

    filter, lag




The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




randomForest 4.7-1.1



Type rfNews() to see new features/changes/bug fixes.




Attaching package: ‘randomForest’




The following object is masked from ‘package:dplyr’:

    combine




Type 'citation("pROC")' for a citation.




Attaching package: ‘pROC’




The following objects are masked from ‘package:stats’:

    cov, smooth, var




Loading required package: ggplot2




Attaching package: ‘ggplot2’




The following object is masked from ‘package:randomForest’:

    margin




Loading required package: lattice



here() starts at /home/uni08/hpc/emma.foessing01/u11969/Master-Thesis




Attaching package: ‘Hmisc’




The following object is masked from ‘package:pracma’:

    ceil




The following objects are masked from ‘package:dplyr’:

    src, summarize




The following objects are masked from ‘package:base’:

    format.pval, units





Attaching package: ‘purrr’




The following object is masked from ‘package:pracma’:

    cross




The following object is masked from ‘package:caret’:

    lift




The following object is masked from ‘package:jsonlite’:

    flatten





Attaching package: ‘ranger’




The following object is masked from ‘package:randomForest’:

    importance





Attaching package: ‘bnlearn’




The following object is masked from ‘package:Hmisc’:

    impute




The following object is masked from ‘package:synthpop’:

    compare




Loading required package: Matrix




Attaching package: ‘Matrix’




The following objects are masked from ‘package:pracma’:

    expm, lu, tril, triu




Loading required package: arules




Attaching package: ‘arules’




The following object is masked from ‘package:bnlearn’:

    discretize




The following object is masked from ‘package:pracma’:

    size




The following object is masked from ‘package:dplyr’:

    recode




The following object is masked from ‘package:modeltools’:

    info




The following objects are masked from ‘package:base’:

    abbreviate, write





Attaching package: ‘arulesCBA’




The following object is masked from ‘package:party’:

    response




Package 'network' is not installed.




Attaching package: ‘igraph’




The following object is masked from ‘package:arules’:

    union




The following objects are masked from ‘package:bnlearn’:

    as.igraph, compare, degree, subgraph




The following objects are masked from ‘package:purrr’:

    compose, simplify




The following objects are masked from ‘package:dplyr’:

    as_data_frame, groups, union




The following object is masked from ‘package:modeltools’:

    clusters




The following object is masked from ‘package:synthpop’:

    compare




The following objects are masked from ‘package:stats’:

    decompose, spectrum




The following object is masked from ‘package:base’:

    union





Attaching package: ‘data.table’




The following object is masked from ‘package:purrr’:

    transpose




The following objects are masked from ‘package:dplyr’:

    between, first, last




The following objects are masked from ‘package:zoo’:

    yearmon, yearqtr




Loading required package: Rcpp




Attaching package: ‘RSNNS’




The following objects are masked from ‘package:caret’:

    confusionMatrix, train





Attaching package: ‘ExtDist’




The following object is masked from ‘package:stats4’:

    BIC




The following object is masked from ‘package:stats’:

    BIC




## Data

In [2]:
# Restore the objects stored in cpspop.RData into the workspace
# (presumably this provides `cpspop`, which is used further below - confirm).
load(file = paste0(here(), "/cpspop.RData"))

# Read the preprocessed adult data
adult <- read.csv(file = paste0(here(), "/adult_preprocessed.csv"))

# "?" marks a missing entry: recode it to NA, then drop incomplete rows
adult[adult == "?"] <- NA
adult <- na.omit(adult)

# Categorical columns are stored as factors
factor_columns <- c(
  "workclass", "education", "marital_status", "relationship", "race",
  "sex", "native_country", "income", "occupation"
)
for (factor_column in factor_columns) {
  adult[[factor_column]] <- as.factor(adult[[factor_column]])
}

## Synthetic Data

In [3]:
# Turn a data frame into the all-numeric matrix that XGBoost consumes.
#
# Args:
#   df: Data frame of predictor columns.
#
# Returns:
#   A numeric matrix with one column per column of `df`. Factor and character
#   columns are replaced by their integer level codes (characters are turned
#   into factors first); every other column is passed through as.numeric().
prepare_data_for_xgb <- function(df) {
  encode_column <- function(values) {
    is_categorical <- is.factor(values) || is.character(values)
    if (is_categorical) {
      values <- as.factor(values)  # level codes become the numeric encoding
    }
    as.numeric(values)
  }

  df[] <- lapply(df, encode_column)
  as.matrix(df)
}

# Add bootstrapped residual noise to a vector of predictions.
#
# Args:
#   predictions: Numeric vector of model predictions.
#   residuals:   Numeric vector of residuals to resample from.
#
# Returns:
#   Numeric vector of the same length as `predictions`: each prediction plus
#   one residual drawn with replacement, with negative results clipped to 0.
bootstrap_residuals <- function(predictions, residuals) {
  n_draws <- length(predictions)

  if (length(residuals) == 0L && n_draws > 0L) {
    stop("Cannot bootstrap from an empty residual vector.")
  }

  # Draw indices rather than values: sample(x, ...) interprets a length-one
  # numeric x >= 1 as 1:x, which would invent residuals that never occurred.
  # For longer vectors this consumes the random stream exactly like sample().
  draw_index <- sample.int(length(residuals), n_draws, replace = TRUE)
  bootstrapped_residuals <- residuals[draw_index]

  # Add bootstrapped residuals to the predictions
  noisy_predictions <- predictions + bootstrapped_residuals

  # Clip negative values to 0
  noisy_predictions[noisy_predictions < 0] <- 0

  noisy_predictions
}

# Signed cube-root transformation for continuous variables.
#
# Args:
#   col: Numeric vector.
#
# Returns:
#   Numeric vector holding the cube root of the absolute values, carrying the
#   sign of the input (so negative inputs map to negative outputs).
transform_continuous <- function(col) {
  root_of_magnitude <- abs(col)^(1/3)
  sign(col) * root_of_magnitude
}

# Undo the cube-root transformation.
#
# Args:
#   col: Numeric vector on the cube-root scale.
#
# Returns:
#   Numeric vector with every element raised to the third power.
retransform_continuous <- function(col) {
  col^3
}

# Sequentially synthesize a data set with XGBoost models.
#
# The first variable is resampled from its observed values. Every further
# variable is modelled on the ORIGINAL data as a function of the variables
# synthesized before it, and then generated from that model using the
# already-synthesized columns as predictors.
#
# Args:
#   data:      Data frame to synthesize. Factor columns are treated as
#              categorical targets, all other columns as regression targets.
#   first_var: Name of the column that is drawn from its marginal distribution.
#   seed:      Seed passed to set.seed() (this changes the global RNG state).
#   nrounds:   Number of boosting rounds per model.
#   eta:       Learning rate passed to XGBoost.
#   max_depth: Maximum tree depth passed to XGBoost.
#
# Returns:
#   A data frame with the same number of rows and the same column names as
#   `data`, filled with synthetic values.
#
# Raises:
#   An error if `first_var` is not a column of `data`.
synthesize_data_xgb <- function(data, first_var, seed, nrounds = 100, eta = 0.1, max_depth = 3) {

  set.seed(seed)

  # Ensure that the first_var exists in the data
  if (!first_var %in% colnames(data)) {
    stop(paste("The column", first_var, "does not exist in the dataframe."))
  }

  # Work on a plain data frame so that `[, cols, drop = FALSE]` selects columns
  # by name regardless of the class of the input (e.g. data.table, tibble)
  data <- as.data.frame(data)
  n_obs <- nrow(data)

  # Initialize synthetic data frame with the same structure as the original data
  syn_data <- data.frame(matrix(NA, ncol = ncol(data), nrow = n_obs))
  colnames(syn_data) <- colnames(data)

  # Step 1: Sample the first variable (Y_1) from its marginal distribution
  syn_data[[first_var]] <- sample(data[[first_var]], n_obs, replace = TRUE)

  # Step 2: Apply 3rd root transformation to continuous variables; the
  # transformed columns are only used as regression targets
  transformed_data <- data
  for (var_j in colnames(data)) {
    if (is.numeric(data[[var_j]])) {
      transformed_data[[var_j]] <- transform_continuous(data[[var_j]])
    }
  }

  # Step 3: Sequentially synthesize each subsequent variable
  remaining_vars <- setdiff(colnames(data), first_var)
  synthesized_vars <- first_var  # Columns of syn_data that are already filled

  for (var_j in remaining_vars) {
    print(paste("Synthesizing variable:", var_j))

    is_categorical <- is.factor(data[[var_j]])

    # Predictors are all columns synthesized so far, kept in data column order
    predictors <- colnames(data)[colnames(data) %in% synthesized_vars]

    # Fit on the ORIGINAL records: predictor values and target must come from
    # the same row, otherwise the model cannot learn their relationship.
    # The synthetic predictors are only used to generate new values.
    train_matrix <- prepare_data_for_xgb(data[, predictors, drop = FALSE])
    pred_matrix <- prepare_data_for_xgb(syn_data[, predictors, drop = FALSE])

    # Set up target and XGBoost parameters
    params <- list(eta = eta, max_depth = max_depth)
    if (is_categorical) {
      class_levels <- levels(data[[var_j]])
      target <- as.numeric(data[[var_j]]) - 1  # XGBoost requires 0-based class labels
      params$objective <- "multi:softprob"     # Class probabilities for sampling
      params$num_class <- length(class_levels)
    } else {
      target <- as.numeric(transformed_data[[var_j]])  # Transformed scale for continuous variables
      params$objective <- "reg:squarederror"
    }

    # Create DMatrix and train the XGBoost model
    dtrain <- xgb.DMatrix(data = train_matrix, label = target)
    model <- xgb.train(
      params = params,
      data = dtrain,
      nrounds = nrounds,
      verbose = 0
    )

    if (is_categorical) {
      # Get probability predictions for the synthetic predictor rows
      prob_predictions <- predict(model, newdata = pred_matrix, outputmargin = FALSE)

      # A flat prediction vector holds the class probabilities of one row
      # after the other, hence byrow = TRUE. A result that already is a
      # matrix (presumably rows x classes, as newer xgboost releases may
      # return - confirm for the installed version) is used as it is.
      if (!is.matrix(prob_predictions)) {
        prob_predictions <- matrix(prob_predictions, nrow = n_obs, byrow = TRUE)
      }

      # Sample from the predicted probabilities to introduce variability
      sampled_levels <- apply(prob_predictions, 1, function(prob_row) {
        sample(class_levels, size = 1, prob = prob_row)
      })

      # Same levels (and ordering flag) as the original column
      syn_data[[var_j]] <- factor(
        sampled_levels,
        levels = class_levels,
        ordered = is.ordered(data[[var_j]])
      )

    } else {
      # Predictions for the synthetic rows, on the transformed scale
      predictions <- predict(model, newdata = pred_matrix)

      # In-sample residuals on the original data estimate the error distribution
      fit_residuals <- target - predict(model, newdata = train_matrix)

      # Bootstrap residuals, add them to the predictions, and go back to the
      # original scale
      synthetic_values <- retransform_continuous(
        bootstrap_residuals(predictions, fit_residuals)
      )

      # Force the type of the synthesized column to match the original data
      if (is.integer(data[[var_j]])) {
        syn_data[[var_j]] <- as.integer(round(synthetic_values))
      } else {
        syn_data[[var_j]] <- as.numeric(synthetic_values)
      }
    }

    synthesized_vars <- c(synthesized_vars, var_j)
  }

  return(syn_data)
}


## Save results

In [4]:
# Save a synthesized data set as an .rds file in the project's results folder.
#
# Args:
#   data:         Object to save (the synthetic data frame).
#   dataset_name: Prefix of the file name, e.g. "cps".
#   seed:         Seed used for the synthesis; becomes part of the file name.
#
# The file is written to <here()>/results/<dataset_name>_xgb_<seed>.rds.
# The results folder is created first if it does not exist, because saveRDS()
# cannot write into a missing directory.
save_synthesized_data <- function(data, dataset_name, seed) {
  results_dir <- paste0(here(), "/results")
  if (!dir.exists(results_dir)) {
    dir.create(results_dir, recursive = TRUE)
  }

  file_name <- paste0(dataset_name, "_xgb_", as.character(seed), ".rds")
  saveRDS(data, paste0(results_dir, "/", file_name))
}

In [5]:
# One seed for both runs; it is also used in the names of the saved files
s <- 1238

# Synthesize both data sets, starting from the marginal distribution of "sex"
cps_syn <- synthesize_data_xgb(data = cpspop, first_var = "sex", seed = s)
adult_syn <- synthesize_data_xgb(data = adult, first_var = "sex", seed = s)

[1] "Synthesizing variable: tax"
[1] "Synthesizing variable: income"
[1] "Synthesizing variable: csp"
[1] "Synthesizing variable: age"
[1] "Synthesizing variable: educ"
[1] "Synthesizing variable: marital"
[1] "Synthesizing variable: race"
[1] "Synthesizing variable: ss"


[1] "Synthesizing variable: age"
[1] "Synthesizing variable: workclass"
[1] "Synthesizing variable: fnlwgt"
[1] "Synthesizing variable: education"
[1] "Synthesizing variable: marital_status"
[1] "Synthesizing variable: occupation"
[1] "Synthesizing variable: relationship"
[1] "Synthesizing variable: race"
[1] "Synthesizing variable: capital_gain"
[1] "Synthesizing variable: capital_loss"
[1] "Synthesizing variable: hours_per_week"
[1] "Synthesizing variable: native_country"
[1] "Synthesizing variable: income"


In [6]:
# Persist both synthetic data sets; the seed becomes part of the file name
save_synthesized_data(data = cps_syn, dataset_name = "cps", seed = s)
save_synthesized_data(data = adult_syn, dataset_name = "adult", seed = s)

## Test data

In [7]:
# Sanity checks comparing a synthetic data set with its original.
#
# Args:
#   original_data:  The original data frame.
#   synthetic_data: The synthesized data frame.
#
# Stops with an error if
#   * the number of rows differs,
#   * a column of `original_data` has a different class in `synthetic_data`,
#   * `synthetic_data` contains missing values.
# Factor levels of the original that never occur in the synthetic data are
# only reported with a message; they do not stop the function.
#
# Returns:
#   NULL, invisibly (the value of the final message() call).
test_synthetic_data <- function(original_data, synthetic_data) {
  # Check if the number of observations is the same
  if (nrow(original_data) != nrow(synthetic_data)) {
    stop("Mismatch in the number of observations between original and synthetic data.")
  }

  # Check if variable types are preserved
  for (var_name in colnames(original_data)) {
    original_type <- class(original_data[[var_name]])
    synthetic_type <- class(synthetic_data[[var_name]])

    # class() may return several elements (e.g. c("ordered", "factor")), so
    # the vectors are compared as a whole; `!=` would yield a condition of
    # length > 1, which if() rejects
    if (!identical(original_type, synthetic_type)) {
      stop(paste(
        "Mismatch in variable type for", var_name, ":",
        paste(original_type, collapse = "/"), "vs",
        paste(synthetic_type, collapse = "/")
      ))
    }
  }

  # Check for NAs in the synthetic data
  if (any(is.na(synthetic_data))) {
    stop("There are missing values (NAs) in the synthetic dataset.")
  }

  # Check if all levels of factor variables are present in the synthetic data
  for (var_name in colnames(original_data)) {
    if (is.factor(original_data[[var_name]])) {
      original_levels <- levels(original_data[[var_name]])
      # Re-factoring drops levels that do not occur in the synthetic column
      synthetic_levels <- levels(factor(synthetic_data[[var_name]]))

      missing_levels <- setdiff(original_levels, synthetic_levels)

      if (length(missing_levels) > 0) {
        message(paste("Missing levels in variable", var_name, ":", paste(missing_levels, collapse = ", ")))
      }
    }
  }

  # If all tests pass
  message("All checks passed. Synthetic data is consistent with the original.")
}

In [8]:
# Consistency checks for the synthetic CPS data (original first, synthetic second)
test_synthetic_data(cpspop, cps_syn)

All checks passed. Synthetic data is consistent with the original.



In [9]:
# Consistency checks for the synthetic adult data (original first, synthetic second)
test_synthetic_data(adult, adult_syn)

All checks passed. Synthetic data is consistent with the original.

