## Libraries

In [13]:
# Set the PKG_CXXFLAGS environment variable to request the C++14 standard.
Sys.setenv(PKG_CXXFLAGS = "-std=c++14")

# Packages used by this notebook. Each package is listed exactly once; the
# order of first occurrence is kept so attach order (and therefore masking)
# is unchanged.
list_of_packages <- c(
  "synthpop", "jsonlite", "codetools", "insight", "party", "haven", "dplyr",
  "rpart", "rpart.plot", "randomForest", "pROC", "caret", "pracma", "here",
  "Hmisc", "purrr", "ranger", "bnlearn", "arulesCBA", "network", "igraph",
  "xgboost", "data.table", "RSNNS", "Matrix", "e1071", "ExtDist"
)

# Attach package `p` (a single package name given as a string).
#
# A failure to load does not abort the script: it is reported with message()
# together with the underlying error text, so that a missing package can be
# told apart from e.g. a broken dependency.
#
# Returns the value of library() on success and NULL when loading failed.
load_if_installed <- function(p) {
  tryCatch(
    library(p, character.only = TRUE),
    error = function(e) {
      message(sprintf(
        "Package '%s' could not be loaded: %s", p, conditionMessage(e)
      ))
    }
  )
}

# Try to attach every package in list_of_packages, one after the other
lapply(X = list_of_packages, FUN = load_if_installed)


Attache Paket: 'ExtDist'


Das folgende Objekt ist maskiert 'package:stats4':

    BIC


Das folgende Objekt ist maskiert 'package:stats':

    BIC




## Data

In [14]:
# CPS data: load the stored workspace object and move race, marital and educ
# to the last three column positions.
load(file = file.path(here(), "cpspop.RData"))
cpspop <- cpspop[, c(
  setdiff(names(cpspop), c("race", "marital", "educ")),
  "race", "marital", "educ"
)]

# Adult data: read the preprocessed CSV, treat "?" as missing and drop every
# row that contains a missing value.
adult <- read.csv(file = file.path(here(), "adult_preprocessed.csv"))
adult[adult == "?"] <- NA
adult <- na.omit(adult)

# Keep the analysis columns in a fixed order.
adult <- adult[, c(
  "age", "fnlwgt", "capital_gain", "capital_loss", "hours_per_week",
  "income", "sex", "race", "relationship", "marital_status", "workclass",
  "occupation", "education", "native_country"
)]

# Store integer columns as doubles; all other columns are left untouched.
adult[] <- lapply(adult, function(col) if (is.integer(col)) as.numeric(col) else col)

# Convert the categorical columns to factors. local() keeps the helper vector
# out of the global environment.
adult <- local({
  factor_cols <- c(
    "workclass", "education", "marital_status", "relationship", "race",
    "sex", "native_country", "income", "occupation"
  )
  adult[factor_cols] <- lapply(adult[factor_cols], as.factor)
  adult
})

## SDG

Key Features of the Implementation:
1. Sequential Synthesis:

The synthesis process starts with sampling the sex variable (or another specified first variable) and then synthesizes each subsequent variable based on the previously synthesized variables.

2. SVM Model:

An SVM model (svm()) is trained for each variable on the original data, using the variables synthesized up to that point as predictors; the fitted model is then applied to the synthetic predictor values.
For factor variables, SVM classification (C-classification) is used.
For numeric variables, SVM regression (eps-regression) is used.

3. Restoring the Original Structure:

After synthesis, the function restores the original structure of the dataframe, including factor levels and numeric types.

### Parameters:
kernel: The kernel type for the SVM model ("radial", "linear", etc.). It is not an argument of the synthesis function; the implementation below always uses the radial (RBF) kernel. <br>
C: The regularization parameter (passed to svm() as `cost`), which controls the trade-off between achieving a low error on the training data and minimizing the model complexity. <br>
epsilon: The tolerance for regression (only relevant for regression tasks).

In [None]:
# Class probabilities predicted by `model` for every row of `syn_data`.
#
# Despite its name this function does not compute pairwise probabilities
# itself: it calls predict(..., probability = TRUE), reads the
# "probabilities" attribute of the result and reorders its columns to follow
# `class_levels`.
#
# Arguments:
#   model        fitted model whose predict() method attaches a
#                "probabilities" matrix with one named column per class.
#   syn_data     data to predict for (one row per observation).
#   class_levels character vector giving the desired column order.
#
# Returns an unnamed numeric matrix with nrow(syn_data) rows and
# length(class_levels) columns. A level without a column in the model's
# probability matrix (e.g. a factor level absent from the training data)
# gets probability 0 instead of raising "subscript out of bounds".
#
# Stops if the model yields no probabilities, if the number of predicted rows
# differs from nrow(syn_data), or if none of `class_levels` is known to the
# model.
compute_pairwise_probabilities <- function(model, syn_data, class_levels) {
  n <- nrow(syn_data)
  class_levels <- as.character(class_levels)

  predictions <- predict(model, newdata = syn_data, probability = TRUE)
  prob_ij <- attr(predictions, "probabilities")

  if (is.null(prob_ij)) {
    stop("Probabilities not available from the model.")
  }
  if (nrow(prob_ij) != n) {
    stop("Number of predicted rows does not match the number of rows in syn_data.")
  }

  known <- class_levels %in% colnames(prob_ij)
  if (!any(known)) {
    stop("None of the requested class levels is present in the model probabilities.")
  }

  # Start from all-zero probabilities and copy the known classes in one step.
  pairwise_probs <- matrix(0, nrow = n, ncol = length(class_levels))
  pairwise_probs[, known] <- prob_ij[, class_levels[known], drop = FALSE]

  pairwise_probs
}

# Turn a matrix of non-negative class scores into class probabilities.
# No linear system is solved here: every row is simply divided by its own
# sum, so that each observation's probabilities add up to 1.
solve_class_probabilities <- function(pairwise_probs) {
  row_totals <- rowSums(pairwise_probs)
  sweep(pairwise_probs, 1, row_totals, "/")
}


In [None]:
# Add bootstrapped residual noise to a vector of predictions.
#
# Arguments:
#   predictions numeric vector of point predictions.
#   residuals   numeric vector of residuals to resample from.
#
# Returns `predictions` plus residuals drawn with replacement; results below
# zero are set to 0.
#
# The residuals are resampled by index. sample(residuals, ...) would
# interpret a single numeric residual r >= 1 as "sample from 1:r"; for
# residual vectors of length > 1 the index form draws exactly the same values
# from the same RNG stream.
bootstrap_residuals <- function(predictions, residuals) {
  n_draws <- length(predictions)
  if (n_draws > 0 && length(residuals) == 0) {
    stop("Cannot bootstrap from an empty residual vector.")
  }

  drawn <- residuals[sample.int(length(residuals), n_draws, replace = TRUE)]
  noisy_predictions <- predictions + drawn

  # Clip negative values to 0
  noisy_predictions[noisy_predictions < 0] <- 0

  noisy_predictions
}

In [None]:
# Signed cube root of a numeric vector: the root is taken of the absolute
# value and the original sign is put back afterwards, so negative inputs map
# to negative outputs.
transform_continuous <- function(col) {
  root_of_magnitude <- abs(col)^(1/3)
  sign(col) * root_of_magnitude
}

# Inverse of the cube-root transformation: raises every value to the third
# power, which also preserves the sign.
retransform_continuous <- function(col) {
  col^3
}

In [15]:
# Sequentially synthesize a data frame with SVM classifiers and SVR models.
#
# Arguments:
#   data      data frame to synthesize. Factor columns are modelled with SVM
#             classification, every other column with SVM regression.
#   first_var name of the column that is synthesized first, by resampling its
#             observed values with replacement.
#   seed      passed to set.seed() before anything is sampled.
#   C         cost parameter handed to svm().
#   epsilon   epsilon parameter handed to svm() for the regression models.
#
# Returns a data frame with the same column names and number of rows as
# `data`. Factor columns are factors with the original levels and numeric
# columns are on the original scale.
#
# Procedure:
#   1. Integer columns are converted to double and every numeric column is
#      cube-root transformed.
#   2. `first_var` is resampled from its transformed values.
#   3. Each remaining column is modelled on the transformed original data
#      using the already synthesized columns as predictors, and is predicted
#      from the synthetic predictors. Factors are drawn from the predicted
#      class probabilities; numeric columns get bootstrapped residual noise.
#   4. All numeric columns are transformed back at the very end.
#
# The synthetic data stay on the transformed scale and keep their factor
# levels for the whole of step 3, so that the predictors handed to predict()
# have the same scale and coding as the data the models were trained on.
synthesize_data_svm_svr <- function(data, first_var, seed, C = 1, epsilon = 0.1) {

  set.seed(seed)

  # Ensure that the first_var exists in the data
  if (!first_var %in% colnames(data)) {
    stop(paste("The column", first_var, "does not exist in the dataframe."))
  }

  # Treat integer columns as continuous
  data[] <- lapply(data, function(col) {
    if (is.integer(col)) as.numeric(col) else col
  })

  # Columns that are cube-root transformed now and cubed back at the end
  continuous_vars <- colnames(data)[vapply(data, is.numeric, logical(1))]

  transformed_data <- data
  transformed_data[] <- lapply(transformed_data, function(col) {
    if (is.numeric(col)) transform_continuous(col) else col
  })

  # Empty synthetic data frame with the same shape and column names
  n_rows <- nrow(transformed_data)
  syn_data <- data.frame(matrix(NA, ncol = ncol(transformed_data), nrow = n_rows))
  colnames(syn_data) <- colnames(transformed_data)

  # Step 1: resample the first variable from its marginal distribution.
  # Index-based resampling keeps factor levels and avoids sample()'s special
  # treatment of a single numeric value.
  syn_data[[first_var]] <-
    transformed_data[[first_var]][sample.int(n_rows, n_rows, replace = TRUE)]

  remaining_vars <- setdiff(colnames(transformed_data), first_var)

  # Step 2: synthesize the remaining variables one after the other
  for (j in seq_along(remaining_vars)) {
    var_j <- remaining_vars[[j]]
    print(paste("Synthesizing variable:", var_j))

    # Predictors are the columns synthesized so far, in data column order.
    # They are tracked explicitly instead of being inferred from missing
    # values in the synthetic data.
    done <- c(first_var, remaining_vars[seq_len(j - 1L)])
    predictors <- colnames(syn_data)[colnames(syn_data) %in% done]

    train_data <- transformed_data[, predictors, drop = FALSE]
    train_data[[var_j]] <- transformed_data[[var_j]]
    syn_predictors <- syn_data[, predictors, drop = FALSE]
    model_formula <- as.formula(paste(var_j, "~ ."))

    if (is.factor(data[[var_j]])) {
      # Categorical target: SVM classification with probability output
      class_levels <- levels(data[[var_j]])
      model <- svm(model_formula, data = train_data,
                   probability = TRUE, kernel = "radial", cost = C)

      pairwise_probs <- compute_pairwise_probabilities(model, syn_predictors, class_levels)
      class_probs <- solve_class_probabilities(pairwise_probs)

      # Draw one class per row from its predicted class probabilities
      drawn <- apply(class_probs, 1, function(prob_row) {
        sample(class_levels, size = 1, prob = prob_row)
      })

      # Store as a factor with the original levels right away so that later
      # models see the same coding as in the training data.
      syn_data[[var_j]] <- factor(drawn, levels = class_levels)

    } else {
      # Continuous target: SVM regression on the transformed scale
      model <- svm(model_formula, data = train_data,
                   type = "eps-regression", kernel = "radial", cost = C, epsilon = epsilon)

      predictions <- predict(model, newdata = syn_predictors)

      # Add resampled training residuals as noise; the value stays on the
      # transformed scale until all variables have been synthesized.
      syn_data[[var_j]] <- as.numeric(bootstrap_residuals(predictions, model$residuals))
    }
  }

  # Step 3: bring every continuous variable (including a numeric first_var)
  # back to the original scale
  for (var_j in continuous_vars) {
    syn_data[[var_j]] <- retransform_continuous(syn_data[[var_j]])
  }

  syn_data
}


## Save data

In [None]:
# Save a synthesized data set as
# <project root>/results/<dataset_name>_svm_<seed>.rds.
#
# Arguments:
#   data         object to store (written with saveRDS()).
#   dataset_name prefix of the file name.
#   seed         seed used for the synthesis; becomes part of the file name.
#
# The results directory is created when it does not exist yet, so that a
# finished synthesis run is not lost because of a missing folder.
save_synthesized_data <- function(data, dataset_name, seed) {
  results_dir <- file.path(here(), "results")
  if (!dir.exists(results_dir)) {
    dir.create(results_dir, recursive = TRUE)
  }

  file_name <- paste0(dataset_name, "_svm_", as.character(seed), ".rds")
  saveRDS(data, file.path(results_dir, file_name))
}

In [None]:
# Seed value; it is passed to the synthesis function and to the save helper,
# where it becomes part of the output file name.
s <- 1241

In [None]:
#cps_syn <- synthesize_data_svm_svr(cpspop, first_var = "sex", seed = s)
#save_synthesized_data(cps_syn, "cps", s)

In [None]:
# Synthesize the adult data starting from "sex", then store the result
adult_syn <- synthesize_data_svm_svr(data = adult, first_var = "sex", seed = s)
save_synthesized_data(data = adult_syn, dataset_name = "adult", seed = s)

In [None]:
#head(cps_syn)

In [None]:
# Show the first six rows of the synthetic adult data
head(adult_syn, n = 6L)