# Synthetic Data Generator with a Bayesian Network Model

## Libraries

In [1]:
list_of_packages <- c ("synthpop", "insight", "party", "haven", "dplyr", "rpart", "rpart.plot", "randomForest", "pROC", "caret", "pracma", "here", "Hmisc", "purrr", "randomForest", "caret", "ranger",  "pracma", "bnlearn", "arulesCBA")

# Attach package `p` (a package name given as a character string),
# installing it first when its namespace cannot be loaded.
install_if_missing <- function(p) {
  is_available <- requireNamespace(p, quietly = TRUE)
  if (!is_available) {
    install.packages(p)
  }
  library(p, character.only = TRUE)
}


lapply(list_of_packages, install_if_missing)

Find out more at https://www.synthpop.org.uk/

Lade nötiges Paket: grid

Lade nötiges Paket: mvtnorm

Lade nötiges Paket: modeltools

Lade nötiges Paket: stats4

Lade nötiges Paket: strucchange

Lade nötiges Paket: zoo


Attache Paket: 'zoo'


Die folgenden Objekte sind maskiert von 'package:base':

    as.Date, as.Date.numeric


Lade nötiges Paket: sandwich


Attache Paket: 'dplyr'


Das folgende Objekt ist maskiert 'package:party':

    where


Die folgenden Objekte sind maskiert von 'package:stats':

    filter, lag


Die folgenden Objekte sind maskiert von 'package:base':

    intersect, setdiff, setequal, union


randomForest 4.7-1.1

Type rfNews() to see new features/changes/bug fixes.


Attache Paket: 'randomForest'


Das folgende Objekt ist maskiert 'package:dplyr':

    combine


Type 'citation("pROC")' for a citation.


Attache Paket: 'pROC'


Die folgenden Objekte sind maskiert von 'package:stats':

    cov, smooth, var


Lade nötiges Paket: ggplot2


Attache Paket: 

## Data

In [2]:
# Restore the objects stored in cpspop.RData (expected to define `cpspop`)
# from the directory returned by here(), then move income, race, marital and
# educ to the end of the column order.
load(file = paste0(here(), "/cpspop.RData"))
cpspop <- local({
  moved_last <- c("income", "race", "marital", "educ")
  kept_first <- setdiff(names(cpspop), moved_last)
  cpspop[, c(kept_first, moved_last)]
})

# Read the preprocessed adult data, drop incomplete rows, convert the
# categorical columns to factors and fix the column order.
adult <- local({
  raw <- read.csv(file = paste0(here(), "/adult_preprocessed.csv"))

  # "?" marks a missing value in this file; recode it and drop those rows.
  raw[raw == "?"] <- NA
  complete <- na.omit(raw)

  categorical <- c(
    "workclass", "education", "marital_status", "relationship", "race",
    "sex", "native_country", "income", "occupation"
  )
  complete[categorical] <- lapply(complete[categorical], as.factor)

  column_order <- c(
    "age", "fnlwgt", "capital_gain", "capital_loss", "hours_per_week",
    "income", "sex", "race", "relationship", "marital_status", "workclass",
    "occupation", "education", "native_country"
  )
  complete[, column_order]
})


## Synthetic Data

In [3]:
# Discretize every non-factor column of a data frame into a factor.
#
# Exact zeros get their own level "0". The remaining values are cut into
# equal-width bins spanning their observed range. The share of zeros among
# the non-missing values decides how many bins are used:
#   > 4/5 zeros -> 1 bin
#   > 1/4 zeros -> breaks - 2 bins
#   > 1/5 zeros -> breaks - 1 bins
#   otherwise   -> breaks bins
# At least one bin is always used. Missing values stay NA. Factor columns
# are returned unchanged.
#
# Args:
#   df:     data frame to discretize.
#   breaks: number of bins for a column with few zeros (default 5).
#
# Returns:
#   `df` with each non-factor column replaced by a factor whose levels are
#   "0" and/or interval labels of the form "(lower-upper]".
discretize_df <- function(df, breaks = 5) {
  for (var in colnames(df)) {
    values <- df[[var]]
    if (is.factor(values)) {
      next
    }

    observed <- !is.na(values)
    is_zero <- observed & values == 0
    is_binned <- observed & !is_zero

    # Share of zeros among the non-missing values.
    zero_proportion <- if (any(observed)) {
      sum(is_zero) / sum(observed)
    } else {
      0
    }

    # Fewer bins for the non-zero part when zeros dominate the column.
    n_bins <- if (zero_proportion > 4 / 5) {
      1
    } else if (zero_proportion > 1 / 4) {
      breaks - 2
    } else if (zero_proportion > 1 / 5) {
      breaks - 1
    } else {
      breaks
    }
    # A small `breaks` must not produce zero or negative bin counts.
    n_bins <- max(1, n_bins)

    # Fill the result by position. Combining zeros and bins with ifelse()
    # would recycle the shorter vector of binned values and assign non-zero
    # rows the bins of other rows.
    binned <- rep(NA_character_, length(values))
    binned[is_zero] <- "0"

    if (any(is_binned)) {
      to_bin <- values[is_binned]
      value_range <- range(to_bin)

      if (value_range[1] == value_range[2]) {
        # A single distinct value would give cut() non-unique breaks.
        binned[is_binned] <- paste0(
          "(", value_range[1], "-", value_range[2], "]"
        )
      } else {
        cut_points <- seq(
          value_range[1], value_range[2],
          length.out = n_bins + 1
        )
        bin_labels <- paste0(
          "(", cut_points[-length(cut_points)], "-", cut_points[-1], "]"
        )
        binned[is_binned] <- as.character(cut(
          to_bin,
          breaks = cut_points,
          labels = bin_labels,
          include.lowest = TRUE
        ))
      }
    }

    df[[var]] <- factor(binned)
  }
  df
}

Sequential synthesis does not make a difference here, since the learned DAG already models all conditional probabilities.

In [None]:
# Generate a synthetic copy of `data` by sampling from a Bayesian network.
#
# All non-factor columns are first discretized with discretize_df(), so the
# network is learned and fitted on factor columns only.
#
# Args:
#   data: data frame to synthesize.
#   seed: value passed to set.seed() before learning and sampling. NULL (the
#         default) leaves the random number generator state untouched.
#
# Returns:
#   Data frame sampled with rbn(), with as many rows as `data` and factor
#   levels aligned to the discretized input.
synthesize_data_bn <- function(data, seed = NULL) {
  # Turn every non-factor column into a factor; factors are kept as they are
  data <- discretize_df(data)

  # The former default `seed = seed` referred to itself and raised an error
  # whenever the argument was omitted.
  if (!is.null(seed)) {
    set.seed(seed)
  }

  # learn structure
  bn_structure <- tabu(data)

  # fit the parameters
  bn_fitted <- bn.fit(bn_structure, data, method = "mle")

  # rbn() generates synthetic data from the fitted Bayesian network;
  # draw as many observations as the original data frame has
  syn_data <- rbn(bn_fitted, n = nrow(data))

  # ensure factor levels match the discretized input
  for (var in colnames(data)) {
    if (is.factor(data[[var]])) {
      syn_data[[var]] <- factor(syn_data[[var]], levels = levels(data[[var]]))
    }
  }

  syn_data
}

## Apply

In [None]:
# Save a synthetic dataset as "<dataset_name>_bn_<seed>.rds" in the results/
# directory below the path returned by here().
#
# Args:
#   data:         object to serialize with saveRDS().
#   dataset_name: prefix of the output file name.
#   seed:         seed used for the synthesis; becomes part of the file name.
save_synthesized_data <- function(data, dataset_name, seed) {
  file_name <- paste0(dataset_name, "_bn_", as.character(seed), ".rds")
  out_dir <- paste0(here(), "/results")

  # saveRDS() does not create missing directories, so make sure it exists.
  if (!dir.exists(out_dir)) {
    dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
  }

  saveRDS(data, paste0(out_dir, "/", file_name))
}

In [None]:
# Seed passed to both synthesis runs below.
s <- 1236

# Synthesize both datasets with the same seed.
cps_syn <- synthesize_data_bn(cpspop, seed = s)
adult_syn <- synthesize_data_bn(adult, seed = s)

In [None]:
# Write both synthetic datasets to disk; the seed becomes part of each
# file name.
save_synthesized_data(cps_syn, "cps", s)
save_synthesized_data(adult_syn, "adult", s)

## Check data

In [None]:
# Tabulate the levels of `x` with their absolute and relative frequencies.
#
# Args:
#   x: vector or factor to tabulate; table() leaves missing values out.
#
# Returns:
#   Numeric matrix with one row per level, named after the level, and the
#   columns "Frequency" (counts) and "Relative" (shares).
tabulate_levels <- function(x) {
  freq_table <- table(x)              # Absolute frequencies
  rel_freq <- prop.table(freq_table)  # Relative frequencies
  result <- cbind(
    Frequency = as.vector(freq_table),
    Relative = as.vector(rel_freq)
  )
  # as.vector() drops the level names; put them back as row names so each
  # row can be matched to its level.
  rownames(result) <- names(freq_table)
  result
}

In [None]:
# Run consistency checks of a synthetic dataset against its original.
#
# Stops with an error when the row counts differ, when a column of
# `original_data` has a different class in `synthetic_data`, or when
# `synthetic_data` contains missing values. Factor levels of the original
# that never occur in the synthetic data are only reported via message().
#
# Args:
#   original_data:  reference data frame.
#   synthetic_data: data frame to check against `original_data`.
#
# Returns:
#   NULL, invisibly; called for its errors and messages.
test_synthetic_data <- function(original_data, synthetic_data) {
  # Check if the number of observations is the same
  if (nrow(original_data) != nrow(synthetic_data)) {
    stop("Mismatch in the number of observations between original and synthetic data.")
  }

  # Check if variable types are preserved
  for (var_name in colnames(original_data)) {
    original_type <- class(original_data[[var_name]])
    synthetic_type <- class(synthetic_data[[var_name]])

    # class() can return several entries (e.g. c("ordered", "factor")).
    # identical() compares the whole vectors and always yields one logical,
    # whereas `!=` would hand if() a condition of length > 1.
    if (!identical(original_type, synthetic_type)) {
      stop(paste(
        "Mismatch in variable type for", var_name, ":",
        paste(original_type, collapse = "/"), "vs",
        paste(synthetic_type, collapse = "/")
      ))
    }
  }

  # Check for NAs in the synthetic data
  if (any(is.na(synthetic_data))) {
    stop("There are missing values (NAs) in the synthetic dataset.")
  }

  # Report factor levels of the original that never occur in the synthetic
  # data; factor() drops unused levels before the comparison
  for (var_name in colnames(original_data)) {
    if (is.factor(original_data[[var_name]])) {
      original_levels <- levels(original_data[[var_name]])
      synthetic_levels <- levels(factor(synthetic_data[[var_name]]))

      missing_levels <- setdiff(original_levels, synthetic_levels)

      if (length(missing_levels) > 0) {
        message(paste("Missing levels in variable", var_name, ":", paste(missing_levels, collapse = ", ")))
      }
    }
  }

  # Reached only when none of the stop() checks above fired
  message("All checks passed. Synthetic data is consistent with the original.")
}

In [None]:
# Level frequencies for every column of the discretized original cpspop data.
sapply(discretize_df(cpspop), tabulate_levels, simplify = FALSE)

In [None]:
# Level frequencies for every column of the synthetic cps data.
sapply(cps_syn, tabulate_levels, simplify = FALSE)

In [None]:
# Compare the synthetic cps data with the discretized original.
test_synthetic_data(original_data = discretize_df(cpspop), synthetic_data = cps_syn)

In [None]:
# Level frequencies for every column of the discretized original adult data.
sapply(discretize_df(adult), tabulate_levels, simplify = FALSE)

In [None]:
# Level frequencies for every column of the synthetic adult data.
sapply(adult_syn, tabulate_levels, simplify = FALSE)

In [None]:
# Compare the synthetic adult data with the discretized original.
test_synthetic_data(original_data = discretize_df(adult), synthetic_data = adult_syn)