# Synthetic Data Generator with an Extreme Gradient Boosting (XGBoost) Model

## Libraries

In [7]:
# Packages used by this notebook. Each name appears exactly once; the order is
# the order in which the packages are attached further down.
list_of_packages <- c(
  "synthpop", "insight", "party", "haven", "dplyr", "rpart", "rpart.plot",
  "randomForest", "pROC", "caret", "pracma", "here", "Hmisc", "purrr",
  "ranger", "xgboost", "data.table", "Matrix"
)

# Attach package `p` (a single package name given as a string), installing it
# first when its namespace cannot be loaded.
# Returns the value of library(), i.e. the attached packages, invisibly.
install_if_missing <- function(p) {
  namespace_available <- requireNamespace(p, quietly = TRUE)
  if (isFALSE(namespace_available)) {
    install.packages(p)
  }
  library(p, character.only = TRUE)
}


# Install (when needed) and attach every package named in list_of_packages
lapply(X = list_of_packages, FUN = install_if_missing)


Attache Paket: 'Matrix'


Die folgenden Objekte sind maskiert von 'package:pracma':

    expm, lu, tril, triu




## Data

In [2]:
# Load the objects stored in cpspop.RData and read the Adult CSV; both files
# are looked up in the directory returned by here()
load(file = paste0(here(), "/cpspop.RData"))
adult <- read.csv(file = paste0(here(), "/adult_preprocessed.csv"))

# Cells equal to "?" are treated as missing; rows with any NA are dropped
adult[adult == "?"] <- NA
adult <- na.omit(adult)

# Store the categorical columns as factors
adult <- within(adult, {
  workclass <- as.factor(workclass)
  education <- as.factor(education)
  marital_status <- as.factor(marital_status)
  relationship <- as.factor(relationship)
  race <- as.factor(race)
  sex <- as.factor(sex)
  native_country <- as.factor(native_country)
  income <- as.factor(income)
  occupation <- as.factor(occupation)
})

## Synthetic Data

In [3]:
# Convert a data frame into a numeric matrix (e.g. as input for xgb.DMatrix).
#
# Factor columns are replaced by their 1-based level codes. Character columns
# are first turned into factors with factor() and then encoded the same way;
# without this step a single character column would make as.matrix() return a
# character matrix. All other columns are passed through unchanged.
#
# Args:
#   df: data frame to convert.
# Returns:
#   Matrix with one column per column of `df`, column names preserved.
convert_to_numeric_matrix <- function(df) {
  encode_column <- function(col) {
    if (is.character(col)) {
      col <- factor(col)
    }
    if (is.factor(col)) {
      return(as.numeric(col))
    }
    col
  }

  df[] <- lapply(df, encode_column)
  as.matrix(df)
}

# Rebuild column types of a synthesized data frame from the original one.
#
# Columns are matched by position. Where the original column is a factor, the
# synthesized values are read as 1-based level codes and relabelled with the
# original levels; every other column is coerced with as.numeric().
#
# Args:
#   original_df: data frame providing the reference column types and levels.
#   synthesized_df: data frame holding numeric codes / values to convert.
# Returns:
#   `synthesized_df` with its columns replaced by the converted versions.
restore_factors <- function(original_df, synthesized_df) {
  restore_column <- function(position) {
    reference <- original_df[[position]]
    values <- synthesized_df[[position]]
    if (!is.factor(reference)) {
      return(as.numeric(values))
    }
    level_names <- levels(reference)
    factor(values, levels = seq_along(level_names), labels = level_names)
  }

  synthesized_df[] <- lapply(seq_along(original_df), restore_column)
  synthesized_df
}

# Sequentially synthesize every column of `data` with XGBoost.
#
# A working copy `data_synth` starts as the original data. For column j a model
# is fitted on all other columns of the working copy (columns before j already
# hold synthesized values, columns after j still hold original values), and the
# model's predictions on those same rows overwrite column j.
#
# Args:
#   data: data frame; columns are assumed to be numeric or factor
#         (NOTE(review): other column types are not handled here).
#   nrounds: number of boosting rounds passed to xgb.train().
# Returns:
#   Data frame with the same column names as `data`; factor columns carry the
#   original level labels.
#
# NOTE(review): a later cell in this file assigns another function to the name
# `synthesize_data_xgb` with a different signature; once that cell has run,
# this definition is no longer the one being called.
synthesize_data_xgb <- function(data, nrounds = 100) {
  if (ncol(data) < 2) {
    stop("At least two columns are required to synthesize data.", call. = FALSE)
  }

  data_synth <- data
  all_vars <- colnames(data)

  for (j in seq_along(all_vars)) {
    var_j <- all_vars[j]

    # Every column except the target, each exactly once. The previous
    # `1:(j - 1)` index evaluated to c(1, 0) for j == 1, which put the target
    # among its own predictors; for j > 1 it only duplicated predictor columns.
    predictors <- all_vars[-j]

    # Prepare data for XGBoost by converting to numeric matrix
    X <- convert_to_numeric_matrix(data_synth[, predictors, drop = FALSE])
    y <- data_synth[[var_j]]

    # Choose the objective based on the type of variable
    if (is.factor(y)) {
      y_numeric <- as.numeric(y) - 1  # class labels must start at 0
      num_classes <- length(levels(y))
      params <- list(
        objective = "multi:softmax",
        num_class = num_classes,
        max_depth = 3,
        eta = 0.1
      )
    } else {
      y_numeric <- as.numeric(y)  # Ensure y is numeric for regression
      params <- list(
        objective = "reg:squarederror",
        max_depth = 3,
        eta = 0.1
      )
    }

    # Train the model for column j and predict on the same predictor rows
    dtrain <- xgb.DMatrix(data = X, label = y_numeric)
    model <- xgb.train(params = params, data = dtrain, nrounds = nrounds)
    predicted <- predict(model, newdata = xgb.DMatrix(data = X))

    if (is.factor(y)) {
      # Map the 0-based class predictions back to the original level labels
      data_synth[[var_j]] <- factor(
        predicted,
        levels = 0:(num_classes - 1),
        labels = levels(y)
      )
    } else {
      data_synth[[var_j]] <- predicted
    }
  }

  # Factor columns were relabelled inside the loop and numeric columns hold
  # numeric predictions, so no further conversion is applied. Re-encoding the
  # labelled factors against integer level codes would match nothing and turn
  # every factor value into NA.
  data_synth
}

In [8]:
# Validate the input and normalise its column types.
#
# Logical and character columns are converted to factors; every other column
# (e.g. numeric or factor) is returned as it came in.
#
# Args:
#   df: the data frame to prepare.
# Returns:
#   `df` with logical/character columns replaced by factors.
# Raises:
#   An error when `df` is not a data frame.
prepare_data <- function(df) {
  if (!is.data.frame(df)) {
    stop("Input data must be a data frame.")
  }

  as_model_column <- function(column) {
    if (is.logical(column) || is.character(column)) {
      as.factor(column)
    } else {
      column
    }
  }

  df[] <- lapply(df, as_model_column)
  df
}

# Sequentially synthesize a data frame with XGBoost.
#
# `first_var` is generated by resampling its original values with replacement.
# Every other column is then handled in turn: a model is fitted on the ORIGINAL
# data (target = the column, predictors = the columns synthesized so far) and
# the new column is predicted from the SYNTHETIC values of those predictors.
# Numeric predictions receive Gaussian noise with
# sd = noise_factor * sd(predictions).
#
# Args:
#   data: data frame to synthesize; passed through prepare_data() first.
#   first_var: name of the column that starts the sequence.
#   nrounds: number of boosting rounds.
#   eta: learning rate passed to XGBoost.
#   max_depth: maximum tree depth passed to XGBoost.
#   noise_factor: scale of the noise added to numeric predictions.
# Returns:
#   Data frame with the same column names and number of rows as `data`.
# Raises:
#   An error when `first_var` is not a column of `data`, or when the predictor
#   matrix and the target do not have the same number of rows.
synthesize_data_xgb <- function(data, first_var, nrounds = 100, eta = 0.1, max_depth = 3, noise_factor = 0.01) {
  # Ensure that the first_var exists in the data
  if (!first_var %in% colnames(data)) {
    stop(paste("The column", first_var, "does not exist in the dataframe."))
  }

  # Prepare data by checking and converting columns to the correct types
  data <- prepare_data(data)
  all_vars <- colnames(data)

  # Initialize synthetic data frame with the same structure as the original data
  syn_data <- data.frame(matrix(NA, ncol = ncol(data), nrow = nrow(data)))
  colnames(syn_data) <- all_vars

  # Step 1: Directly sample the first variable (e.g., 'sex') from the original data
  syn_data[[first_var]] <- sample(data[[first_var]], nrow(data), replace = TRUE)

  # Step 2: Synthesize the remaining variables sequentially
  remaining_vars <- setdiff(all_vars, first_var)

  for (k in seq_along(remaining_vars)) {
    var <- remaining_vars[k]
    print(paste("Synthesizing variable:", var))

    # Predictors are the columns synthesized so far, kept in column order.
    # They are tracked explicitly instead of probing row 1 of syn_data for NA,
    # which misclassifies a column whose first synthetic value is NA.
    done_vars <- c(first_var, remaining_vars[seq_len(k - 1)])
    predictors <- all_vars[all_vars %in% done_vars]

    # Fit on the original rows so predictors and target come from the same
    # record; the synthetic rows are only used for prediction.
    train_matrix <- model.matrix(~ . - 1, data = data[, predictors, drop = FALSE])
    pred_matrix <- model.matrix(~ . - 1, data = syn_data[, predictors, drop = FALSE])

    # Prepare the target variable
    target <- data[[var]]
    target_is_factor <- is.factor(target)

    # model.matrix() drops rows with missing predictors, which would silently
    # misalign predictors and labels.
    if (nrow(train_matrix) != length(target)) {
      stop(
        paste("Predictors for", var, "contain missing values; rows cannot be aligned with the target."),
        call. = FALSE
      )
    }

    # Set up XGBoost parameters
    params <- list(
      objective = if (target_is_factor) "multi:softmax" else "reg:squarederror",
      eta = eta,
      max_depth = max_depth
    )

    # Only set num_class if the target is a factor
    if (target_is_factor) {
      params$num_class <- length(levels(target))
      target <- as.numeric(target) - 1  # class labels must be 0-based
    }

    # Train via xgb.DMatrix + xgb.train, the same calls the earlier definition
    # of this function in the file uses.
    # NOTE(review): the xgboost() convenience wrapper's argument names appear to
    # differ between package versions - confirm against the installed version.
    dtrain <- xgb.DMatrix(data = train_matrix, label = target)
    model <- xgb.train(
      params = params,
      data = dtrain,
      nrounds = nrounds,
      verbose = 0
    )

    # Predict the values for the synthetic data
    predictions <- predict(model, newdata = xgb.DMatrix(data = pred_matrix))

    # Assign predictions back to synthetic data
    if (target_is_factor) {
      original_levels <- levels(data[[var]])
      syn_data[[var]] <- factor(
        predictions + 1,
        levels = seq_along(original_levels),
        labels = original_levels
      )
    } else {
      # Add random noise to numeric predictions
      syn_data[[var]] <- predictions +
        rnorm(length(predictions), mean = 0, sd = noise_factor * sd(predictions))
    }
  }

  syn_data
}

## Apply

### CPS

In [9]:
# Synthesize the CPS data, starting the sequence from the `sex` column
synthetic_cpspop <- synthesize_data_xgb(data = cpspop, first_var = "sex")

# Show the first rows of the synthetic dataset
head(x = synthetic_cpspop)

### Adult

In [None]:
# Generate synthetic data for the Adult dataset with the XGBoost synthesizer,
# starting the sequence from the `sex` column. The synthesizer processes every
# column of its input, so no separate list of target variables is needed.
synthetic_adult <- synthesize_data_xgb(adult, first_var = "sex")

# View the synthetic dataset
head(synthetic_adult)