## Libraries

In [3]:
# Compile any C++ code built from this session against the C++14 standard
Sys.setenv(PKG_CXXFLAGS = "-std=c++14")

# Packages used in this notebook. Each name appears exactly once so that no
# package is loaded (and reported) twice.
list_of_packages <- c(
  "synthpop", "jsonlite", "codetools", "insight", "party", "haven", "dplyr",
  "rpart", "rpart.plot", "randomForest", "pROC", "caret", "pracma", "here",
  "Hmisc", "purrr", "ranger", "bnlearn", "arulesCBA", "network", "igraph",
  "xgboost", "data.table", "RSNNS", "Matrix", "e1071"
)

# Attach a package by name without aborting the notebook when that fails.
#
# Args:
#   p: package name as a character string.
#
# Returns:
#   The value of library() on success; NULL (after emitting a message) when
#   the package cannot be attached.
load_if_installed <- function(p) {
  tryCatch(
    library(p, character.only = TRUE),
    error = function(e) {
      # system.file() returns "" when the package is not found in any library,
      # which separates "missing" from "present but failed to load"
      if (nzchar(system.file(package = p))) {
        message(sprintf(
          "Package '%s' is installed but could not be loaded: %s",
          p, conditionMessage(e)
        ))
      } else {
        message(sprintf("Package '%s' is not installed.", p))
      }
    }
  )
}

# Try to attach every package in the list; the per-package results are returned as a list
lapply(X = list_of_packages, FUN = load_if_installed)

## Data

In [4]:
# Project folder that holds the input files
############## adjust to correct directory!
directory <- "/Users/emmafoessing/Documents/Master/MA/Code/Master-Thesis"

# Restore the objects saved in cpspop.RData into the workspace
# (presumably this provides `cpspop`, which is used further below)
load(file = paste0(directory, "/cpspop.RData"))

# Adult data: "?" marks a missing entry; recode it to NA and drop incomplete rows
adult <- read.csv(file = paste0(directory, "/adult_preprocessed.csv"))
adult[adult == "?"] <- NA
adult <- na.omit(adult)

# Turn the categorical columns into factors
factor_columns <- c(
  "workclass", "education", "marital_status", "relationship", "race",
  "sex", "native_country", "income", "occupation"
)
adult[factor_columns] <- lapply(adult[factor_columns], as.factor)

## SDG

Key Features of the Implementation:
1. Sequential Synthesis:

The synthesis process starts with sampling the sex variable (or another specified first variable) and then synthesizes each subsequent variable based on the previously synthesized variables.

2. SVM Model:

The SVM model (svm()) is trained for each variable, using the synthesized variables up to that point as predictors.
For factor variables, SVM classification (C-classification) is used.
For numeric variables, SVM regression (eps-regression) is used.

3. Restoring the Original Structure:

After synthesis, the function restores the original structure of the dataframe, including factor levels and numeric types.

### Parameters:
kernel: The kernel type for the SVM model ("radial", "linear", etc.). <br>
cost: The regularization parameter, which controls the trade-off between achieving a low error on the training data and minimizing the model complexity. <br>
epsilon: The width of the epsilon-insensitive zone of the regression loss; residuals smaller than epsilon are not penalized (only relevant for regression tasks).

In [5]:
# Encode a data frame as a numeric design matrix without an intercept column.
#
# Numeric columns are passed through; factor columns are expanded into
# indicator columns by model.matrix(). Note: with the intercept removed and
# R's default contrasts, only the first factor receives one indicator per
# level, while later factors omit their reference level.
#
# Args:
#   df: data frame of predictors.
#
# Returns:
#   A numeric matrix whose columns are named by model.matrix().
convert_to_numeric_matrix <- function(df) {
  design <- model.matrix(~ . - 1, data = df)
  as.matrix(design)
}

# Give a synthesized data frame the column types and names of the original.
#
# Columns are paired by position. For every factor column of `original_df`
# the synthesized column may hold either
#   * integer level codes (1 = first original level, 2 = second, ...), or
#   * the level labels themselves (as a factor or character vector).
# Both are mapped onto a factor with the original levels; values that match
# no level become NA. Numeric columns are coerced with as.numeric(); columns
# of any other type are passed through unchanged.
#
# Args:
#   original_df:    data frame that defines the target structure.
#   synthesized_df: data frame with the same number of columns.
#
# Returns:
#   A data frame with the column names of `original_df`.
restore_factors_and_numeric <- function(original_df, synthesized_df) {
  if (ncol(synthesized_df) != ncol(original_df)) {
    stop(
      "original_df and synthesized_df must have the same number of columns.",
      call. = FALSE
    )
  }

  restore_column <- function(original_col, synth_col) {
    if (is.factor(original_col)) {
      original_levels <- levels(original_col)
      if (is.numeric(synth_col)) {
        # Integer codes: translate position -> label
        return(factor(
          synth_col,
          levels = seq_along(original_levels),
          labels = original_levels
        ))
      }
      # Labels (factor or character): match on the label text. Matching labels
      # against the codes 1..k would turn every value into NA.
      return(factor(as.character(synth_col), levels = original_levels))
    }
    if (is.numeric(original_col)) {
      return(as.numeric(synth_col))
    }
    synth_col
  }

  synthesized_df <- as.data.frame(synthesized_df)
  synthesized_df[] <- unname(Map(restore_column, original_df, synthesized_df))
  colnames(synthesized_df) <- colnames(original_df)
  synthesized_df
}

# Fit a support vector machine with e1071's svm().
#
# Args:
#   X:       predictor matrix.
#   y:       response vector.
#   type:    SVM type, e.g. "C-classification" or "eps-regression".
#   kernel:  kernel name ("radial", "linear", ...).
#   cost:    regularization parameter.
#   epsilon: tolerance for regression (only used if type is regression).
#
# Returns:
#   The fitted model object returned by svm().
build_svm <- function(X, y, type = "C-classification", kernel = "radial", cost = 1, epsilon = 0.1) {
  svm(x = X, y = y, type = type, kernel = kernel, cost = cost, epsilon = epsilon)
}

# Synthesize a data frame one variable at a time with support vector machines.
#
# The first variable is drawn with replacement from its observed values. Each
# further variable (in column order) is modelled on the ORIGINAL data --
# observed predictors -> observed response -- and then predicted from the
# synthetic values generated so far. Fitting on the synthetic predictors
# instead would pair rows that have nothing to do with each other, because the
# first variable is resampled.
#
# Factor responses use C-classification, all other responses eps-regression.
# NOTE(review): the synthetic values are the SVM point predictions; no random
# draw is added.
#
# Args:
#   data:      data frame to synthesize.
#   first_var: name of the column that starts the sequence.
#   kernel:    kernel passed to build_svm().
#   cost:      regularization parameter passed to build_svm().
#   epsilon:   regression tolerance passed to build_svm().
#
# Returns:
#   A data frame with the column names of `data`, post-processed by
#   restore_factors_and_numeric().
synthesize_data_svm <- function(data, first_var, kernel = "radial", cost = 1, epsilon = 0.1) {
  # Ensure the first variable exists in the data
  if (!first_var %in% colnames(data)) {
    stop(paste("The column", first_var, "does not exist in the dataframe."))
  }

  # Plain data frame view so that column subsetting behaves the same for
  # data.frame, tibble and data.table inputs
  data_df <- as.data.frame(data)
  all_vars <- colnames(data_df)
  n_rows <- nrow(data_df)

  # Empty container with the same shape and column names as the input
  data_synth <- data.frame(matrix(NA, ncol = length(all_vars), nrow = n_rows))
  colnames(data_synth) <- all_vars

  # Step 1: bootstrap the first variable. Sampling row indices (rather than
  # the values) avoids sample()'s special case for a single numeric value.
  data_synth[[first_var]] <- data_df[[first_var]][sample.int(n_rows, n_rows, replace = TRUE)]

  # Step 2: synthesize the remaining variables sequentially
  remaining_vars <- setdiff(all_vars, first_var)

  for (j in seq_along(remaining_vars)) {
    target <- remaining_vars[j]
    print(paste("Synthesizing variable:", target))

    # Predictors = everything synthesized so far, tracked explicitly and kept
    # in the original column order
    done <- c(first_var, remaining_vars[seq_len(j - 1)])
    predictors <- all_vars[all_vars %in% done]

    X_train <- convert_to_numeric_matrix(data_df[, predictors, drop = FALSE])
    X_synth <- convert_to_numeric_matrix(data_synth[, predictors, drop = FALSE])
    if (!identical(colnames(X_train), colnames(X_synth))) {
      stop(
        paste("Encoded predictors differ between original and synthetic data for", target),
        call. = FALSE
      )
    }

    y <- data_df[[target]]

    if (is.factor(y)) {
      # Train on the factor itself so predict() returns the level labels
      y_train <- droplevels(y)
      if (nlevels(y_train) < 2) {
        # Only one observed class: nothing to learn, reproduce that class
        predicted <- rep(levels(y_train), length.out = n_rows)
      } else {
        model <- build_svm(
          X_train, y_train,
          type = "C-classification", kernel = kernel, cost = cost, epsilon = epsilon
        )
        predicted <- as.character(predict(model, X_synth))
      }
      data_synth[[target]] <- factor(predicted, levels = levels(y))
    } else {
      model <- build_svm(
        X_train, y,
        type = "eps-regression", kernel = kernel, cost = cost, epsilon = epsilon
      )
      data_synth[[target]] <- as.numeric(predict(model, X_synth))
    }
  }

  # restore_factors_and_numeric() translates integer level codes back into the
  # original labels, so hand factors over as codes
  is_factor_col <- vapply(data_synth, is.factor, logical(1))
  data_synth[is_factor_col] <- lapply(data_synth[is_factor_col], as.integer)

  restore_factors_and_numeric(data, data_synth)
}

### Apply

In [6]:
# Synthesize the CPS data, starting the sequence with "sex"
cps_syndata <- synthesize_data_svm(data = cpspop, first_var = "sex")

In [None]:
# Synthesize the Adult data, starting the sequence with "sex"
adult_syndata <- synthesize_data_svm(data = adult, first_var = "sex")

### Save results

In [None]:
# Write both synthetic data sets as CSV files into <directory>/results.
# The folder is created first so that a missing directory cannot abort the
# export after the synthesis has already been computed.
results_dir <- paste0(directory, "/results")
dir.create(results_dir, showWarnings = FALSE, recursive = TRUE)
write.csv(cps_syndata, file = paste0(results_dir, "/SVM_cps_syndata.csv"), row.names = FALSE)
write.csv(adult_syndata, file = paste0(results_dir, "/SVM_adult_syndata.csv"), row.names = FALSE)