# Synthetic Data Generator with an Extreme Gradient Boosting (XGBoost) Model

## Libraries

In [1]:
# Packages this notebook needs; each one is listed exactly once, in the order
# in which they are attached.
list_of_packages <- c(
  "synthpop", "insight", "party", "haven", "dplyr", "rpart", "rpart.plot",
  "randomForest", "pROC", "caret", "pracma", "here", "Hmisc", "purrr",
  "ranger", "xgboost", "data.table"
)

# Attach package `p`, installing it first when it is not available.
#
# Arguments:
#   p  Package name as a single character string.
# Returns: the (invisible) result of library(), i.e. the attached packages.
install_if_missing <- function(p) {
  already_installed <- requireNamespace(p, quietly = TRUE)
  if (!already_installed) {
    install.packages(p)
  }
  # character.only = TRUE because `p` holds the name as a string.
  library(p, character.only = TRUE)
}


# Install (when missing) and attach every package named in list_of_packages.
lapply(X = list_of_packages, FUN = install_if_missing)

Find out more at https://www.synthpop.org.uk/

Lade nötiges Paket: grid

Lade nötiges Paket: mvtnorm

Lade nötiges Paket: modeltools

Lade nötiges Paket: stats4

Lade nötiges Paket: strucchange

Lade nötiges Paket: zoo


Attache Paket: 'zoo'


Die folgenden Objekte sind maskiert von 'package:base':

    as.Date, as.Date.numeric


Lade nötiges Paket: sandwich


Attache Paket: 'dplyr'


Das folgende Objekt ist maskiert 'package:party':

    where


Die folgenden Objekte sind maskiert von 'package:stats':

    filter, lag


Die folgenden Objekte sind maskiert von 'package:base':

    intersect, setdiff, setequal, union


randomForest 4.7-1.1

Type rfNews() to see new features/changes/bug fixes.


Attache Paket: 'randomForest'


Das folgende Objekt ist maskiert 'package:dplyr':

    combine


Type 'citation("pROC")' for a citation.


Attache Paket: 'pROC'


Die folgenden Objekte sind maskiert von 'package:stats':

    cov, smooth, var


Lade nötiges Paket: ggplot2


Attache Paket: 

## Data

In [2]:
# Both input files are read from the project root reported by here().
# NOTE(review): the .RData file presumably provides `cpspop`, which is used
# further down - confirm.
load(file = paste0(here(), "/cpspop.RData"))
adult <- read.csv(file = paste0(here(), "/adult_preprocessed.csv"))

# Cells equal to "?" are recoded to NA, then every row containing an NA is
# dropped.
adult[adult == "?"] <- NA
adult <- na.omit(adult)

# Convert the categorical columns to factors.
# NOTE(review): only the columns listed here are converted; any other
# non-numeric column in the CSV keeps its type - confirm none remain.
categorical_columns <- c(
  "workclass", "education", "marital_status", "relationship",
  "race", "sex", "native_country", "income"
)
adult[categorical_columns] <- lapply(adult[categorical_columns], as.factor)

## Synthetic Data

In [10]:
# Turn a data frame into a matrix, replacing factors by their integer codes.
#
# Arguments:
#   df  Data frame; factor columns are replaced by as.numeric() of the factor
#       (1-based level codes), all other columns are passed through untouched.
# Returns: as.matrix() of the recoded data frame.
convert_to_numeric_matrix <- function(df) {
  for (idx in seq_along(df)) {
    if (is.factor(df[[idx]])) {
      df[[idx]] <- as.numeric(df[[idx]])
    }
  }

  as.matrix(df)
}

# Give a synthesized data frame the column types of the original one.
#
# Arguments:
#   original_df     Data frame used as the template for column types and
#                   factor levels.
#   synthesized_df  Data frame whose columns correspond, by position, to the
#                   columns of `original_df`.
# Returns: `synthesized_df` where every column that is a factor in
#   `original_df` is a factor with the original levels and every other column
#   is coerced with as.numeric().
restore_factors <- function(original_df, synthesized_df) {
  synthesized_df[] <- lapply(seq_along(original_df), function(i) {
    template <- original_df[[i]]
    values <- synthesized_df[[i]]

    if (!is.factor(template)) {
      return(as.numeric(values))
    }

    lvls <- levels(template)
    if (is.factor(values)) {
      # The column already carries labels. Matching those labels against the
      # integer codes 1..k (as done for numeric input below) would turn every
      # value into NA, so re-level by label instead.
      return(factor(as.character(values), levels = lvls))
    }

    # Numeric input is interpreted as 1-based level codes.
    factor(values, levels = seq_along(lvls), labels = lvls)
  })

  synthesized_df
}

# Synthesize every column of a data frame, one column at a time, with XGBoost.
#
# For column j a model is fitted on the working copy of the data (columns
# before j already hold their synthetic values, columns after j still hold the
# original values) using all other columns as predictors. Column j is then
# overwritten with the model's predictions for those same rows. Factor columns
# are modelled with "multi:softmax", all other columns with
# "reg:squarederror".
#
# Arguments:
#   data     Data frame with at least two columns. Non-factor columns are
#            coerced with as.numeric() before being used as a label.
#   nrounds  Number of boosting rounds passed to xgb.train() for each column.
# Returns: a data frame with the same columns as `data`; factor columns keep
#   the levels of the input, all other columns hold the numeric predictions.
# Errors: stops when `data` has exactly one column, because no predictor would
#   be left for it.
synthesize_data_xgb <- function(data, nrounds = 100) {
  col_names <- colnames(data)
  if (length(col_names) == 1L) {
    stop(
      "`data` must have at least two columns: a single column has no ",
      "predictors to be synthesized from.",
      call. = FALSE
    )
  }

  data_synth <- data

  for (j in seq_along(col_names)) {
    var_j <- col_names[j]

    # All columns except the target. Building this from `1:(j - 1)` is wrong:
    # for j = 1 it evaluates to c(1, 0) and puts the target among its own
    # predictors, and for j > 1 it duplicates the earlier columns.
    predictors <- col_names[-j]

    # Prepare data for XGBoost by converting to a numeric matrix
    X <- convert_to_numeric_matrix(data_synth[, predictors, drop = FALSE])
    y <- data_synth[[var_j]]

    # Choose the objective based on the type of variable
    if (is.factor(y)) {
      num_classes <- nlevels(y)
      y_numeric <- as.numeric(y) - 1  # XGBoost expects class labels from 0
      params <- list(
        objective = "multi:softmax",
        num_class = num_classes,
        max_depth = 3,
        eta = 0.1
      )
    } else {
      y_numeric <- as.numeric(y)
      params <- list(
        objective = "reg:squarederror",
        max_depth = 3,
        eta = 0.1
      )
    }

    dtrain <- xgb.DMatrix(data = X, label = y_numeric)
    model <- xgb.train(params = params, data = dtrain, nrounds = nrounds)

    # Predict for the same rows the model was fitted on
    predictions <- predict(model, newdata = dtrain)

    if (is.factor(y)) {
      # "multi:softmax" returns 0-based class indices; map them back to the
      # original labels. No further factor restoration is needed afterwards
      # (re-coding an already labelled factor by integer code yields only NA).
      data_synth[[var_j]] <- factor(
        predictions,
        levels = seq_len(num_classes) - 1,
        labels = levels(y)
      )
    } else {
      data_synth[[var_j]] <- as.numeric(predictions)
    }
  }

  data_synth
}

## Apply

### CPS

In [11]:
# Synthesize every column of the CPS data with the XGBoost synthesizer
synthetic_cpspop <- synthesize_data_xgb(data = cpspop)

# Show the first rows of the synthetic CPS data
head(synthetic_cpspop)

### Adult

In [None]:
# Generate synthetic data for every column of the adult data set with the
# XGBoost synthesizer of this notebook (synthesize_data_xgb always processes
# all columns, so no separate list of target variables is needed).
synthetic_adult <- synthesize_data_xgb(adult)

# View the synthetic dataset
head(synthetic_adult)