## Libraries

In [1]:
Sys.setenv("PKG_CXXFLAGS"="-std=c++14")

# List of required packages
list_of_packages <- c(
  "synthpop", "jsonlite", "codetools", "insight", "party", "haven", "dplyr", "rpart", "rpart.plot",
  "randomForest", "pROC", "caret", "pracma", "here", "Hmisc", "purrr",
  "ranger", "bnlearn", "arulesCBA", "network", "igraph", "xgboost",
  "data.table", "RSNNS", "xgboost", "data.table", "Matrix", "e1071", "ExtDist"
)

# Function to load packages and handle errors
load_if_installed <- function(p) {
  tryCatch({
    library(p, character.only = TRUE)
  }, error = function(e) {
    message(sprintf("Package '%s' is not installed.", p))
  })
}

# Load all required packages
lapply(list_of_packages, load_if_installed)

Find out more at https://www.synthpop.org.uk/



Loading required package: grid



Loading required package: mvtnorm



Loading required package: modeltools



Loading required package: stats4



Loading required package: strucchange



Loading required package: zoo




Attaching package: ‘zoo’




The following objects are masked from ‘package:base’:

    as.Date, as.Date.numeric




Loading required package: sandwich




Attaching package: ‘dplyr’




The following object is masked from ‘package:party’:

    where




The following objects are masked from ‘package:stats’:

    filter, lag




The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




randomForest 4.7-1.1



Type rfNews() to see new features/changes/bug fixes.




Attaching package: ‘randomForest’




The following object is masked from ‘package:dplyr’:

    combine




Type 'citation("pROC")' for a citation.




Attaching package: ‘pROC’




The following objects are masked from ‘package:stats’:

    cov, smooth, var




Loading required package: ggplot2




Attaching package: ‘ggplot2’




The following object is masked from ‘package:randomForest’:

    margin




Loading required package: lattice



here() starts at /home/emma.foessing01/u12711/Master-Thesis




Attaching package: ‘Hmisc’




The following object is masked from ‘package:pracma’:

    ceil




The following objects are masked from ‘package:dplyr’:

    src, summarize




The following objects are masked from ‘package:base’:

    format.pval, units





Attaching package: ‘purrr’




The following object is masked from ‘package:pracma’:

    cross




The following object is masked from ‘package:caret’:

    lift




The following object is masked from ‘package:jsonlite’:

    flatten





Attaching package: ‘ranger’




The following object is masked from ‘package:randomForest’:

    importance





Attaching package: ‘bnlearn’




The following object is masked from ‘package:Hmisc’:

    impute




The following object is masked from ‘package:synthpop’:

    compare




Loading required package: Matrix




Attaching package: ‘Matrix’




The following objects are masked from ‘package:pracma’:

    expm, lu, tril, triu




Loading required package: arules




Attaching package: ‘arules’




The following object is masked from ‘package:bnlearn’:

    discretize




The following object is masked from ‘package:pracma’:

    size




The following object is masked from ‘package:dplyr’:

    recode




The following object is masked from ‘package:modeltools’:

    info




The following objects are masked from ‘package:base’:

    abbreviate, write





Attaching package: ‘arulesCBA’




The following object is masked from ‘package:party’:

    response




Package 'network' is not installed.




Attaching package: ‘igraph’




The following object is masked from ‘package:arules’:

    union




The following objects are masked from ‘package:bnlearn’:

    as.igraph, compare, degree, subgraph




The following objects are masked from ‘package:purrr’:

    compose, simplify




The following objects are masked from ‘package:dplyr’:

    as_data_frame, groups, union




The following object is masked from ‘package:modeltools’:

    clusters




The following object is masked from ‘package:synthpop’:

    compare




The following objects are masked from ‘package:stats’:

    decompose, spectrum




The following object is masked from ‘package:base’:

    union





Attaching package: ‘data.table’




The following object is masked from ‘package:purrr’:

    transpose




The following objects are masked from ‘package:dplyr’:

    between, first, last




The following objects are masked from ‘package:zoo’:

    yearmon, yearqtr




Loading required package: Rcpp




Attaching package: ‘RSNNS’




The following objects are masked from ‘package:caret’:

    confusionMatrix, train





Attaching package: ‘e1071’




The following object is masked from ‘package:bnlearn’:

    impute




The following object is masked from ‘package:Hmisc’:

    impute




The following object is masked from ‘package:pracma’:

    sigmoid





Attaching package: ‘ExtDist’




The following object is masked from ‘package:stats4’:

    BIC




The following object is masked from ‘package:stats’:

    BIC




## Data

In [2]:
load(file = (paste0(here(), "/cpspop.RData")))
cpspop <- cpspop[, c(setdiff(names(cpspop), c("race", "marital", "educ")), "race", "marital", "educ")] #

adult <- read.csv(file = (paste0(here(),"/adult_preprocessed.csv")))
# delete NAs
adult[adult == "?"] <- NA
adult <- na.omit(adult)

adult$workclass <- as.factor(adult$workclass)
adult$education <- as.factor(adult$education)
adult$marital_status <- as.factor(adult$marital_status)
adult$relationship <- as.factor(adult$relationship)
adult$race <- as.factor(adult$race)
adult$sex <- as.factor(adult$sex)
adult$native_country <- as.factor(adult$native_country)
adult$income <- as.factor(adult$income)
adult$occupation <- as.factor(adult$occupation)

adult <- adult[, c("age", "fnlwgt", "capital_gain", "capital_loss", "hours_per_week", "income", "sex", "race", "relationship", "marital_status", "workclass", "occupation", "education", "native_country")]
adult[] <- lapply(adult, function(col) {
  if (is.integer(col)) {
    as.numeric(col)
  } else {
    col
  }
})

## SDG

Key Features of the Implementation:
1. Sequential Synthesis:

The synthesis process starts with sampling the sex variable (or another specified first variable) and then synthesizes each subsequent variable based on the previously synthesized variables.

2. SVM Model:

The SVM model (svm()) is trained for each variable, using the synthesized variables up to that point as predictors.
For factor variables, SVM classification (C-classification) is used.
For numeric variables, SVM regression (eps-regression) is used.

3. Restoring the Original Structure:

After synthesis, the function restores the original structure of the dataframe, including factor levels and numeric types.

### Parameters:
kernel: The kernel type for the SVM model ("radial", "linear", etc.). <br>
cost: The regularization parameter, which controls the trade-off between achieving a low error on the training data and minimizing the model complexity. <br>
epsilon: The tolerance for regression (only relevant for regression tasks).

In [3]:
# Function to compute pairwise probabilities using Platt scaling or other methods
compute_pairwise_probabilities <- function(model, syn_data, class_levels) {
  n <- nrow(syn_data)  # Number of observations
  k <- length(class_levels)  # Number of classes
  
  # Initialize matrix to store probabilities for each observation (n x k)
  pairwise_probs <- matrix(0, nrow = n, ncol = k)
  
  # Predict with probabilities
  predictions <- predict(model, newdata = syn_data, probability = TRUE)
  
  # Extract probabilities from the attribute
  prob_ij <- attr(predictions, "probabilities")
  
  # Check if prob_ij is null (if probabilities are not available)
  if (is.null(prob_ij)) {
    stop("Probabilities not available from the model.")
  }
  
  # Fill the pairwise_probs matrix with class probabilities for each observation
  for (i in 1:k) {
    pairwise_probs[, i] <- prob_ij[, class_levels[i]]
  }
  
  return(pairwise_probs)
}

# Solve the linear system to obtain class probabilities
solve_class_probabilities <- function(pairwise_probs) {
  # Normalize row-wise to ensure probabilities sum to 1 for each observation
  class_probs <- pairwise_probs / rowSums(pairwise_probs)
  return(class_probs)
}


In [4]:
# Helper function to bootstrap residuals and add to predictions
bootstrap_residuals <- function(predictions, residuals) {
  # Bootstrap residuals by sampling with replacement
  bootstrapped_residuals <- sample(residuals, length(predictions), replace = TRUE)
  
  # Add bootstrapped residuals to the predictions
  noisy_predictions <- predictions + bootstrapped_residuals
  
  # Clip negative values to 0 (or a small threshold, if needed)
  noisy_predictions[noisy_predictions < 0] <- 0
  
  return(noisy_predictions)
}

In [5]:
# Function to apply 3rd root transformation to continuous variables
transform_continuous <- function(col) {
  return(sign(col) * abs(col)^(1/3))  # Handle negative values properly
}

# Function to retransform continuous variables back to original scale
retransform_continuous <- function(col) {
  return(col^3)  # Cube the transformed values
}

In [6]:
synthesize_data_svm_svr <- function(data, first_var, seed, C = 1, epsilon = 0.1) {
  
  set.seed(seed)

  # Extract dataset name as a string
  dataset_name <- deparse(substitute(data))

  # Ensure transformation is applied (adult continuous variables are integers)
  data[] <- lapply(data, function(col) {
    if (is.integer(col)) {
      as.numeric(col)
    } else {
      col
    }
  })
  
  # Ensure that the first_var exists in the data
  if (!first_var %in% colnames(data)) {
    stop(paste("The column", first_var, "does not exist in the dataframe."))
  }
  
  # Apply 3rd root transformation to continuous variables
  transformed_data <- data
  transformed_data[] <- lapply(transformed_data, function(col) {
    if (is.numeric(col)) {
      transform_continuous(col)
    } else {
      col
    }
  })

  # Initialize synthetic data frame with the same structure as the transformed data
  syn_data <- data.frame(matrix(NA, ncol = ncol(transformed_data), nrow = nrow(transformed_data)))
  colnames(syn_data) <- colnames(transformed_data)
  
  # List to store residuals for plotting later (if needed)
  residuals_list <- list()
  continuous_vars <- c()  # Keep track of continuous variables
  factor_levels <- list() # Store the factor levels for reapplying later
  
  # Step 1: Sample the first variable (Y_1) from its marginal distribution
  syn_data[[first_var]] <- sample(transformed_data[[first_var]], nrow(transformed_data), replace = TRUE)

  # Get the list of remaining variables
  remaining_vars <- setdiff(colnames(transformed_data), first_var)
  
  # Step 2: Sequentially synthesize each subsequent variable using transformed_data
  for (var_j in remaining_vars) {
    print(paste("Synthesizing variable:", var_j))
    
    # Identify predictors (all synthesized columns up to this point)
    predictors <- colnames(syn_data)[!is.na(syn_data[1, ]) & colnames(syn_data) != var_j]
    
    # Prepare the training data
    train_data <- transformed_data[, predictors, drop = FALSE]
    train_data[[var_j]] <- transformed_data[[var_j]]
    
    # Check if the target variable is continuous or categorical
    if (is.factor(data[[var_j]])) {
      # Categorical variable: Use SVM for classification with probabilistic outputs and RBF kernel
      model <- svm(as.formula(paste(var_j, "~ .")), data = train_data, 
                   probability = TRUE, kernel = "radial", cost = C)
      
      # Compute pairwise probabilities (assuming you have compute_pairwise_probabilities)
      syn_predictors <- syn_data[, predictors, drop = FALSE]
      pairwise_probs <- compute_pairwise_probabilities(model, syn_predictors, levels(data[[var_j]]))
      
      # Solve for class probabilities (assuming you have solve_class_probabilities)
      class_probs <- solve_class_probabilities(pairwise_probs)
      
      # Sample from the computed class probabilities
      syn_data[[var_j]] <- apply(class_probs, 1, function(prob_row) {
        sample(levels(data[[var_j]]), size = 1, prob = prob_row)
      })
      
    } else {
      # Continuous variable: Use SVR for regression with RBF kernel
      model <- svm(as.formula(paste(var_j, "~ .")), data = train_data, 
                   type = "eps-regression", kernel = "radial", cost = C, epsilon = epsilon)
      
      # Predict the values for the synthetic data
      syn_predictors <- syn_data[, predictors, drop = FALSE]
      predictions <- predict(model, newdata = syn_predictors)
      
      # Compute residuals from the original data
      residuals <- model$residuals
      residuals_list[[var_j]] <- residuals  # Store residuals (if needed for future use)
      continuous_vars <- c(continuous_vars, var_j)  # Track continuous variables
      
      # Use bootstrap sampling of residuals and add to predictions
      syn_data[[var_j]] <- bootstrap_residuals(predictions, residuals)
      
      # Retransform the continuous variables back to their original scale
      syn_data[[var_j]] <- retransform_continuous(syn_data[[var_j]])
      
      # Ensure numeric type for continuous variables
      syn_data[[var_j]] <- as.numeric(syn_data[[var_j]])  # Convert to numeric
    }
  }
  
  # Step 4: Reapply factor levels to synthesized factor variables
  for (var_j in names(factor_levels)) {
    syn_data[[var_j]] <- factor(syn_data[[var_j]], levels = factor_levels[[var_j]])
  }
  
  return(syn_data)
}


## Save data

In [7]:
save_synthesized_data <- function(data, dataset_name, seed) {
  file_name <- paste0(dataset_name, "_svm_", as.character(seed), ".rds")
  saveRDS(data, paste0(here(), "/results/", file_name))
}

In [8]:
s <- 1243

In [9]:
cps_syn <- synthesize_data_svm_svr(cpspop, first_var = "sex", seed = s)
save_synthesized_data(cps_syn, "cps", s)

[1] "Synthesizing variable: tax"
[1] "Synthesizing variable: income"
[1] "Synthesizing variable: csp"
[1] "Synthesizing variable: age"
[1] "Synthesizing variable: ss"
[1] "Synthesizing variable: race"
[1] "Synthesizing variable: marital"
[1] "Synthesizing variable: educ"


In [10]:
adult_syn <- synthesize_data_svm_svr(adult, first_var = "sex", seed = s)
save_synthesized_data(adult_syn, "adult", s)

[1] "Synthesizing variable: age"
[1] "Synthesizing variable: fnlwgt"
[1] "Synthesizing variable: capital_gain"
[1] "Synthesizing variable: capital_loss"
[1] "Synthesizing variable: hours_per_week"
[1] "Synthesizing variable: income"
[1] "Synthesizing variable: race"
[1] "Synthesizing variable: relationship"
[1] "Synthesizing variable: marital_status"
[1] "Synthesizing variable: workclass"
[1] "Synthesizing variable: occupation"
[1] "Synthesizing variable: education"
[1] "Synthesizing variable: native_country"


In [11]:
head(cps_syn)

Unnamed: 0_level_0,tax,income,csp,age,sex,ss,race,marital,educ
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<fct>,<dbl>,<chr>,<chr>,<chr>
1,127.5021,90088.77,0,75.18234,2,28.21937,1,1,43
2,860.0,61901.88,0,58.21759,1,19.17247,1,1,39
3,470.0,69000.12,0,67.02431,1,0.0,1,7,39
4,3208.0,95573.23,0,36.42339,2,360.16602,2,1,37
5,0.0,104048.3,0,43.10315,1,32.12661,1,1,45
6,1785.0,83115.62,0,28.11344,1,24.92115,1,1,39


In [12]:
head(adult_syn)

Unnamed: 0_level_0,age,fnlwgt,capital_gain,capital_loss,hours_per_week,income,sex,race,relationship,marital_status,workclass,occupation,education,native_country
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<fct>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,28.23656,47577.39,0,0.0,33.63799,<=50K,0,4,2,4,2,0,5,38
2,18.98172,176385.97,0,0.0,29.57438,<=50K,1,4,0,4,2,9,2,38
3,53.0,155554.1,0,0.0,46.37449,<=50K,1,4,2,4,2,5,8,38
4,21.67645,201612.56,0,0.0,46.48305,>50K,0,4,2,3,3,6,13,38
5,36.06124,152395.0,0,1899.983,29.58403,<=50K,1,2,0,4,2,4,7,38
6,44.0,110329.04,0,0.0,18.50369,<=50K,1,1,2,4,3,4,12,38
