# Synthetic Data Generator with a Bayesian Network Model

## Libraries

In [1]:
list_of_packages <- c ("synthpop", "insight", "party", "haven", "dplyr", "rpart", "rpart.plot", "randomForest", "pROC", "caret", "pracma", "here", "Hmisc", "purrr", "randomForest", "caret", "ranger",  "pracma", "bnlearn", "arulesCBA")

install_if_missing <- function(p){
  if(!requireNamespace(p, quietly = TRUE)){
    install.packages(p)
  }
  library(p, character.only=TRUE)
}


lapply(list_of_packages, install_if_missing)

Find out more at https://www.synthpop.org.uk/



Loading required package: grid



Loading required package: mvtnorm



Loading required package: modeltools



Loading required package: stats4



Loading required package: strucchange



Loading required package: zoo




Attaching package: ‘zoo’




The following objects are masked from ‘package:base’:

    as.Date, as.Date.numeric




Loading required package: sandwich




Attaching package: ‘dplyr’




The following object is masked from ‘package:party’:

    where




The following objects are masked from ‘package:stats’:

    filter, lag




The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




randomForest 4.7-1.1



Type rfNews() to see new features/changes/bug fixes.




Attaching package: ‘randomForest’




The following object is masked from ‘package:dplyr’:

    combine




Type 'citation("pROC")' for a citation.




Attaching package: ‘pROC’




The following objects are masked from ‘package:stats’:

    cov, smooth, var




Loading required package: ggplot2




Attaching package: ‘ggplot2’




The following object is masked from ‘package:randomForest’:

    margin




Loading required package: lattice



here() starts at /home/uni08/hpc/emma.foessing01/u11969/Master-Thesis




Attaching package: ‘Hmisc’




The following object is masked from ‘package:pracma’:

    ceil




The following objects are masked from ‘package:dplyr’:

    src, summarize




The following objects are masked from ‘package:base’:

    format.pval, units





Attaching package: ‘purrr’




The following object is masked from ‘package:pracma’:

    cross




The following object is masked from ‘package:caret’:

    lift





Attaching package: ‘ranger’




The following object is masked from ‘package:randomForest’:

    importance





Attaching package: ‘bnlearn’




The following object is masked from ‘package:Hmisc’:

    impute




The following object is masked from ‘package:synthpop’:

    compare




Loading required package: Matrix




Attaching package: ‘Matrix’




The following objects are masked from ‘package:pracma’:

    expm, lu, tril, triu




Loading required package: arules




Attaching package: ‘arules’




The following object is masked from ‘package:bnlearn’:

    discretize




The following object is masked from ‘package:pracma’:

    size




The following object is masked from ‘package:dplyr’:

    recode




The following object is masked from ‘package:modeltools’:

    info




The following objects are masked from ‘package:base’:

    abbreviate, write





Attaching package: ‘arulesCBA’




The following object is masked from ‘package:party’:

    response




## Data

In [2]:
load(file = (paste0(here(), "/cpspop.RData")))
adult <- read.csv(file = (paste0(here(),"/adult_preprocessed.csv")))
# delete NAs
adult[adult == "?"] <- NA
adult <- na.omit(adult)

adult$workclass <- as.factor(adult$workclass)
adult$education <- as.factor(adult$education)
adult$marital_status <- as.factor(adult$marital_status)
adult$relationship <- as.factor(adult$relationship)
adult$race <- as.factor(adult$race)
adult$sex <- as.factor(adult$sex)
adult$native_country <- as.factor(adult$native_country)
adult$income <- as.factor(adult$income)
adult$occupation <- as.factor(adult$occupation)

## Synthetic Data

In [3]:
discretize_df = function(df, breaks = 5) {
  for (var in colnames(df)) {
    # Check if the variable is not a factor
    if (!is.factor(df[[var]])) {

      # Count the frequency of each unique value
      freq_table <- table(df[[var]])

      # Calculate the proportion of zeros, ensuring NA is handled
      zero_proportion <- ifelse(!is.na(freq_table[as.character(0)]), 
                                freq_table[as.character(0)] / sum(freq_table), 
                                0)

      # Determine the number of breaks based on zero proportion
      if (zero_proportion > 4/5) {
        new_breaks = 1
      } else if (zero_proportion > 1/4) {
        new_breaks = breaks - 2
      } else if (zero_proportion > 1/5) {
        new_breaks = breaks - 1
      } else {
        new_breaks = breaks
      }
      
      # Separate zeros and non-zeros
      zero_portion = (df[[var]] == 0)
      non_zero_values = df[[var]][!zero_portion]

      # Discretize non-zero values
      if (length(non_zero_values) > 0) {
        # Calculate breaks for non-zero values
        range_values = range(non_zero_values, na.rm = TRUE)
        breaks_values = seq(range_values[1], range_values[2], length.out = new_breaks + 1)
        
        # Ensure correct number of labels are created
        labels = sapply(1:(length(breaks_values)-1), function(i) 
                        paste("(", breaks_values[i], "-", breaks_values[i+1], "]", sep=""))

        # Use cut to apply these breaks and labels
        discretized_non_zeros = cut(non_zero_values, breaks = breaks_values, labels = labels, include.lowest = TRUE)
        # Combine zero and discretized non-zeros into the original dataframe
        df[[var]] <- factor(ifelse(zero_portion, "0", as.character(discretized_non_zeros)))
      } else {
        # If all values are zero or the number of breaks is zero or negative
        df[[var]] <- factor("0")
      }
    }
  }
  return(df)
}

sequential synthesis does not make a difference since the learned DAG already has all conditional probabilaties modeled

In [4]:
# Function for sequential synthesis with a Bayesian Network
synthesize_data_bn <- function(data, first_var) {

  # Discretize only factors, leave numeric variables as they are
  data <- discretize_df(data)
  
  # Debugging: Print column types after preparation
  print("After preparation:")
  print(sapply(data, class))
  
  # Step 1: Learn the structure of the Bayesian Network
  bn_structure <- hc(data)
  
  # Step 2: Fit the Bayesian Network with parameters
  bn_fitted <- bn.fit(bn_structure, data)
  
  # Step 3: Initialize the synthetic dataset
  syn_data <- data.frame(matrix(NA, ncol = ncol(data), nrow = nrow(data)))
  colnames(syn_data) <- colnames(data)
  
  # Ensure each column in syn_data has the correct type
  for (var in colnames(data)) {
    if (is.factor(data[[var]])) {
      syn_data[[var]] <- factor(NA, levels = levels(data[[var]]))  # Initialize as factor with same levels
    } else if (is.numeric(data[[var]])) {
      syn_data[[var]] <- as.numeric(NA)  # Initialize as numeric
    }
  }
  
  # Step 4: Synthesize the first variable from its marginal distribution
  syn_data[[first_var]] <- sample(data[[first_var]], nrow(data), replace = TRUE)
  
  # Step 5: Sequentially synthesize the remaining variables
  remaining_vars <- setdiff(colnames(data), first_var)

  for (var_j in remaining_vars) {
    print(paste("Synthesizing variable:", var_j))
    
    # Get parents of the current variable
    parents <- parents(bn_structure, var_j)
    
    if (length(parents) == 0) {
      # If no parents, sample from marginal distribution
      syn_data[[var_j]] <- sample(data[[var_j]], nrow(data), replace = TRUE)
    } else {
      # Sample from conditional distribution based on the parents
      parent_data <- syn_data[, parents, drop = FALSE]
      
      # Check for NA values in parent columns and impute if necessary
      for (p in parents) {
        na_rows <- is.na(syn_data[[p]])
        if (any(na_rows)) {
          print(paste("Imputing missing values for parent:", p))
          # Impute missing values by sampling from the marginal distribution of the parent
          syn_data[[p]][na_rows] <- sample(data[[p]], sum(na_rows), replace = TRUE)
        }
      }
      
      # After imputing missing values, check if parent_data is fully populated
      if (any(is.na(parent_data))) {
        print(paste("Some parent values for", var_j, "are still missing. Falling back to marginal sampling."))
        # If there are still NA values in parent_data, sample from marginal distribution
        syn_data[[var_j]] <- sample(data[[var_j]], nrow(data), replace = TRUE)
      } else {
        # If no missing values, predict based on the parents
        syn_data[[var_j]] <- predict(bn_fitted, node = var_j, data = parent_data, method = "bayes-lw")
      }
    }
  }
  
  return(syn_data)
}

## Apply

### CPS

In [5]:
# Generate synthetic data for all target variables
synthetic_cpspop <- synthesize_data_bn(cpspop, first_var= "sex")

# View the synthetic dataset
head(synthetic_cpspop)

[1] "After preparation:"
     tax   income      csp      age     educ  marital     race      sex 
"factor" "factor" "factor" "factor" "factor" "factor" "factor" "factor" 
      ss 
"factor" 
[1] "Synthesizing variable: tax"
[1] "Imputing missing values for parent: age"
[1] "Imputing missing values for parent: marital"
[1] "Some parent values for tax are still missing. Falling back to marginal sampling."
[1] "Synthesizing variable: income"
[1] "Imputing missing values for parent: educ"
[1] "Some parent values for income are still missing. Falling back to marginal sampling."
[1] "Synthesizing variable: csp"


“dropping 3500 observations because generated samples are NAs.”


[1] "Synthesizing variable: age"
[1] "Synthesizing variable: educ"
[1] "Synthesizing variable: marital"
[1] "Synthesizing variable: race"
[1] "Synthesizing variable: ss"


Unnamed: 0_level_0,tax,income,csp,age,educ,marital,race,sex,ss
Unnamed: 0_level_1,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>
1,0,(153749.2-307497.4],0,(60-75],39,1,1,1,(7-16671.3333333333]
2,(1-33333],(153749.2-307497.4],0,(15-30],39,7,1,1,0
3,(1-33333],(1-153749.2],0,(45-60],39,1,1,1,0
4,(1-33333],(1-153749.2],0,(45-60],39,1,1,1,0
5,0,(1-153749.2],0,(45-60],39,1,1,1,0
6,0,(1-153749.2],0,(45-60],39,1,1,1,0


### Adult

In [6]:
# Generate synthetic data for all target variables
synthetic_adult <- synthesize_data_bn(adult, first_var= "sex")

# View the synthetic dataset
head(synthetic_adult)

[1] "After preparation:"
           age      workclass         fnlwgt      education marital_status 
      "factor"       "factor"       "factor"       "factor"       "factor" 
    occupation   relationship           race            sex   capital_gain 
      "factor"       "factor"       "factor"       "factor"       "factor" 
  capital_loss hours_per_week native_country         income 
      "factor"       "factor"       "factor"       "factor" 
[1] "Synthesizing variable: age"
[1] "Synthesizing variable: workclass"
[1] "Imputing missing values for parent: occupation"
[1] "Some parent values for workclass are still missing. Falling back to marginal sampling."
[1] "Synthesizing variable: fnlwgt"
[1] "Imputing missing values for parent: race"
[1] "Some parent values for fnlwgt are still missing. Falling back to marginal sampling."
[1] "Synthesizing variable: education"
[1] "Imputing missing values for parent: income"
[1] "Some parent values for education are still missing. Falling back 

Unnamed: 0_level_0,age,workclass,fnlwgt,education,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
Unnamed: 0_level_1,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>
1,(17-31.6],2,(13769-307956.2],11,4,7,3,4,0,0,0,(20.6-40.2],38,<=50K
2,(60.8-75.4],5,(13769-307956.2],15,2,2,0,4,1,0,0,(20.6-40.2],38,<=50K
3,(17-31.6],2,(13769-307956.2],9,4,3,3,4,1,0,0,(20.6-40.2],38,<=50K
4,(31.6-46.2],2,(13769-307956.2],9,2,9,0,4,0,0,0,(20.6-40.2],38,<=50K
5,(17-31.6],2,(13769-307956.2],11,4,2,3,4,1,0,0,(20.6-40.2],38,<=50K
6,(46.2-60.8],2,(13769-307956.2],12,2,9,0,4,0,0,0,(20.6-40.2],38,<=50K


## Save results

In [7]:
write.csv(synthetic_cpspop, file = paste0(here(), "/results/BN_cps_syndata.csv"), row.names = FALSE)
write.csv(synthetic_adult, file = paste0(here(), "/results/BN_adult_syndata.csv"), row.names = FALSE)