# Synthetic Data Generator with a Bayesian Network Model

## Libraries

In [1]:
list_of_packages <- c ("synthpop", "insight", "party", "haven", "dplyr", "rpart", "rpart.plot", "randomForest", "pROC", "caret", "pracma", "here", "Hmisc", "purrr", "randomForest", "caret", "ranger",  "pracma", "bnlearn", "arulesCBA")

install_if_missing <- function(p){
  if(!requireNamespace(p, quietly = TRUE)){
    install.packages(p)
  }
  library(p, character.only=TRUE)
}


lapply(list_of_packages, install_if_missing)

Find out more at https://www.synthpop.org.uk/



Loading required package: grid



Loading required package: mvtnorm



Loading required package: modeltools



Loading required package: stats4



Loading required package: strucchange



Loading required package: zoo




Attaching package: ‘zoo’




The following objects are masked from ‘package:base’:

    as.Date, as.Date.numeric




Loading required package: sandwich




Attaching package: ‘dplyr’




The following object is masked from ‘package:party’:

    where




The following objects are masked from ‘package:stats’:

    filter, lag




The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




randomForest 4.7-1.1



Type rfNews() to see new features/changes/bug fixes.




Attaching package: ‘randomForest’




The following object is masked from ‘package:dplyr’:

    combine




Type 'citation("pROC")' for a citation.




Attaching package: ‘pROC’




The following objects are masked from ‘package:stats’:

    cov, smooth, var




Loading required package: ggplot2




Attaching package: ‘ggplot2’




The following object is masked from ‘package:randomForest’:

    margin




Loading required package: lattice



here() starts at /home/uni08/hpc/emma.foessing01/u11969/Master-Thesis




Attaching package: ‘Hmisc’




The following object is masked from ‘package:pracma’:

    ceil




The following objects are masked from ‘package:dplyr’:

    src, summarize




The following objects are masked from ‘package:base’:

    format.pval, units





Attaching package: ‘purrr’




The following object is masked from ‘package:pracma’:

    cross




The following object is masked from ‘package:caret’:

    lift





Attaching package: ‘ranger’




The following object is masked from ‘package:randomForest’:

    importance





Attaching package: ‘bnlearn’




The following object is masked from ‘package:Hmisc’:

    impute




The following object is masked from ‘package:synthpop’:

    compare




Loading required package: Matrix




Attaching package: ‘Matrix’




The following objects are masked from ‘package:pracma’:

    expm, lu, tril, triu




Loading required package: arules




Attaching package: ‘arules’




The following object is masked from ‘package:bnlearn’:

    discretize




The following object is masked from ‘package:pracma’:

    size




The following object is masked from ‘package:dplyr’:

    recode




The following object is masked from ‘package:modeltools’:

    info




The following objects are masked from ‘package:base’:

    abbreviate, write





Attaching package: ‘arulesCBA’




The following object is masked from ‘package:party’:

    response




## Data

In [2]:
load(file = (paste0(here(), "/cpspop.RData")))
cpspop <- cpspop[, c(setdiff(names(cpspop), c("income", "race", "marital", "educ")), "income", "race", "marital", "educ")] #

adult <- read.csv(file = (paste0(here(),"/adult_preprocessed.csv")))
# delete NAs
adult[adult == "?"] <- NA
adult <- na.omit(adult)

adult$workclass <- as.factor(adult$workclass)
adult$education <- as.factor(adult$education)
adult$marital_status <- as.factor(adult$marital_status)
adult$relationship <- as.factor(adult$relationship)
adult$race <- as.factor(adult$race)
adult$sex <- as.factor(adult$sex)
adult$native_country <- as.factor(adult$native_country)
adult$income <- as.factor(adult$income)
adult$occupation <- as.factor(adult$occupation)

adult <- adult[, c("age", "fnlwgt", "capital_gain", "capital_loss", "hours_per_week", "income", "sex", "race", "relationship", "marital_status", "workclass", "occupation", "education", "native_country")]


## Synthetic Data

In [3]:
discretize_df = function(df, breaks = 5) {
  for (var in colnames(df)) {
    # Check if the variable is not a factor
    if (!is.factor(df[[var]])) {

      # Count the frequency of each unique value
      freq_table <- table(df[[var]])

      # Calculate the proportion of zeros, ensuring NA is handled
      zero_proportion <- ifelse(!is.na(freq_table[as.character(0)]), 
                                freq_table[as.character(0)] / sum(freq_table), 
                                0)

      # Determine the number of breaks based on zero proportion
      if (zero_proportion > 4/5) {
        new_breaks = 1
      } else if (zero_proportion > 1/4) {
        new_breaks = breaks - 2
      } else if (zero_proportion > 1/5) {
        new_breaks = breaks - 1
      } else {
        new_breaks = breaks
      }
      
      # Separate zeros and non-zeros
      zero_portion = (df[[var]] == 0)
      non_zero_values = df[[var]][!zero_portion]

      # Discretize non-zero values
      if (length(non_zero_values) > 0) {
        # Calculate breaks for non-zero values
        range_values = range(non_zero_values, na.rm = TRUE)
        breaks_values = seq(range_values[1], range_values[2], length.out = new_breaks + 1)
        
        # Ensure correct number of labels are created
        labels = sapply(1:(length(breaks_values)-1), function(i) 
                        paste("(", breaks_values[i], "-", breaks_values[i+1], "]", sep=""))

        # Use cut to apply these breaks and labels
        discretized_non_zeros = cut(non_zero_values, breaks = breaks_values, labels = labels, include.lowest = TRUE)
        # Combine zero and discretized non-zeros into the original dataframe
        df[[var]] <- factor(ifelse(zero_portion, "0", as.character(discretized_non_zeros)))
      } else {
        # If all values are zero or the number of breaks is zero or negative
        df[[var]] <- factor("0")
      }
    }
  }
  return(df)
}

sequential synthesis does not make a difference since the learned DAG already has all conditional probabilaties modeled

In [4]:
synthesize_data_bn <- function(data, first_var) {
  # Discretize only factors, leave numeric variables as they are
  data <- discretize_df(data)
  
  # Debugging: Print column types after preparation
  print("After preparation:")
  print(sapply(data, class))
  
  # Step 1: Learn the structure of the Bayesian Network using Tabu Search
  bn_structure <- tabu(data)  # Applying the Tabu search algorithm instead of hc
  
  # Step 2: Fit the Bayesian Network with parameters using AIC/BIC for parameter learning
  bn_fitted <- bn.fit(bn_structure, data, method = "mle", score = "aic")  # Use "aic" or "bic" as needed
  
  # Step 3: Initialize the synthetic dataset
  syn_data <- data.frame(matrix(NA, ncol = ncol(data), nrow = nrow(data)))
  colnames(syn_data) <- colnames(data)
  
  # Ensure each column in syn_data has the correct type
  for (var in colnames(data)) {
    if (is.factor(data[[var]])) {
      syn_data[[var]] <- factor(NA, levels = levels(data[[var]]))  # Initialize as factor with same levels
    } else if (is.numeric(data[[var]])) {
      syn_data[[var]] <- as.numeric(NA)  # Initialize as numeric
    }
  }
  
  # Step 4: Synthesize the first variable from its marginal distribution
  syn_data[[first_var]] <- sample(data[[first_var]], nrow(data), replace = TRUE)
  
  # Step 5: Sequentially synthesize the remaining variables
  remaining_vars <- setdiff(colnames(data), first_var)

  for (var_j in remaining_vars) {
    print(paste("Synthesizing variable:", var_j))
    
    # Get parents of the current variable
    parents <- parents(bn_structure, var_j)
    
    if (length(parents) == 0) {
      # If no parents, sample from marginal distribution
      syn_data[[var_j]] <- sample(data[[var_j]], nrow(data), replace = TRUE)
    } else {
      # Sample from conditional distribution based on the parents
      parent_data <- syn_data[, parents, drop = FALSE]
      
      # Check for NA values in parent columns and impute if necessary
      for (p in parents) {
        na_rows <- is.na(syn_data[[p]])
        if (any(na_rows)) {
          print(paste("Imputing missing values for parent:", p))
          # Impute missing values by sampling from the marginal distribution of the parent
          syn_data[[p]][na_rows] <- sample(data[[p]], sum(na_rows), replace = TRUE)
        }
      }
      
      # After imputing missing values, check if parent_data is fully populated
      parent_data <- syn_data[, parents, drop = FALSE]  # Refresh parent_data after imputation
      
      if (any(is.na(parent_data))) {
        print(paste("Some parent values for", var_j, "are still missing. Falling back to marginal sampling."))
        # If there are still NA values in parent_data, sample from marginal distribution
        syn_data[[var_j]] <- sample(data[[var_j]], nrow(data), replace = TRUE)
      } else {
        # If no missing values, predict based on the parents
        syn_data[[var_j]] <- predict(bn_fitted, node = var_j, data = parent_data, method = "bayes-lw")
        
        # Check if any NAs are generated during prediction and fall back to marginal sampling
        na_rows <- is.na(syn_data[[var_j]])
        if (any(na_rows)) {
          print(paste("NA values generated during prediction for:", var_j, ". Falling back to marginal sampling."))
          syn_data[[var_j]][na_rows] <- sample(data[[var_j]], sum(na_rows), replace = TRUE)
        }
      }
    }
  }
  
  return(syn_data)
}

## Apply

### CPS

In [5]:
# Generate synthetic data for all target variables
synthetic_cpspop <- synthesize_data_bn(cpspop, first_var= "sex")

# View the synthetic dataset
head(synthetic_cpspop)

[1] "After preparation:"
     tax      csp      age      sex       ss   income     race  marital 
"factor" "factor" "factor" "factor" "factor" "factor" "factor" "factor" 
    educ 
"factor" 


“unused argument(s): 'score'.”


[1] "Synthesizing variable: tax"
[1] "Imputing missing values for parent: age"
[1] "Imputing missing values for parent: marital"


“dropping 3500 observations because generated samples are NAs.”


[1] "NA values generated during prediction for: tax . Falling back to marginal sampling."
[1] "Synthesizing variable: csp"


“dropping 3500 observations because generated samples are NAs.”


[1] "NA values generated during prediction for: csp . Falling back to marginal sampling."
[1] "Synthesizing variable: age"
[1] "Synthesizing variable: ss"
[1] "Synthesizing variable: income"
[1] "Imputing missing values for parent: educ"
[1] "Synthesizing variable: race"


“dropping 5184 observations because generated samples are NAs.”


[1] "Synthesizing variable: marital"
[1] "Synthesizing variable: educ"


Unnamed: 0_level_0,tax,csp,age,sex,ss,income,race,marital,educ
Unnamed: 0_level_1,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>
1,(1-33333],0,(30-45],1,0,(1-153749.2],1,1,39
2,0,0,(30-45],1,0,(1-153749.2],1,1,39
3,0,0,(30-45],1,0,(1-153749.2],1,1,39
4,(1-33333],0,(45-60],1,0,(1-153749.2],1,1,39
5,(1-33333],0,(30-45],1,0,(1-153749.2],1,1,39
6,0,0,(45-60],1,0,(1-153749.2],1,1,39


### Adult

In [6]:
# Generate synthetic data for all target variables
synthetic_adult <- synthesize_data_bn(adult, first_var= "sex")

# View the synthetic dataset
head(synthetic_adult)

[1] "After preparation:"
           age         fnlwgt   capital_gain   capital_loss hours_per_week 
      "factor"       "factor"       "factor"       "factor"       "factor" 
        income            sex           race   relationship marital_status 
      "factor"       "factor"       "factor"       "factor"       "factor" 
     workclass     occupation      education native_country 
      "factor"       "factor"       "factor"       "factor" 


“unused argument(s): 'score'.”


[1] "Synthesizing variable: age"
[1] "Imputing missing values for parent: income"
[1] "Imputing missing values for parent: marital_status"
[1] "Synthesizing variable: fnlwgt"
[1] "Imputing missing values for parent: race"
[1] "Synthesizing variable: capital_gain"
[1] "Synthesizing variable: capital_loss"
[1] "Synthesizing variable: hours_per_week"
[1] "Imputing missing values for parent: relationship"
[1] "Synthesizing variable: income"
[1] "Imputing missing values for parent: education"
[1] "Synthesizing variable: race"
[1] "Synthesizing variable: relationship"
[1] "Imputing missing values for parent: occupation"


“dropping 3000 observations because generated samples are NAs.”


[1] "NA values generated during prediction for: relationship . Falling back to marginal sampling."
[1] "Synthesizing variable: marital_status"


“dropping 1430 observations because generated samples are NAs.”


[1] "Synthesizing variable: workclass"
[1] "Synthesizing variable: occupation"
[1] "Synthesizing variable: education"
[1] "Synthesizing variable: native_country"


Unnamed: 0_level_0,age,fnlwgt,capital_gain,capital_loss,hours_per_week,income,sex,race,relationship,marital_status,workclass,occupation,education,native_country
Unnamed: 0_level_1,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>
1,(46.2-60.8],(13769-307956.2],0,0,(20.6-40.2],<=50K,0,4,4,0,2,11,15,38
2,(31.6-46.2],(13769-307956.2],0,0,(20.6-40.2],<=50K,0,4,1,4,4,2,11,38
3,(17-31.6],(13769-307956.2],0,0,(20.6-40.2],>50K,1,4,0,2,2,2,11,38
4,(31.6-46.2],(13769-307956.2],0,0,(20.6-40.2],<=50K,1,4,0,2,2,2,11,38
5,(31.6-46.2],(13769-307956.2],0,0,(20.6-40.2],<=50K,1,4,0,2,2,2,11,38
6,(31.6-46.2],(13769-307956.2],0,0,(40.2-59.8],<=50K,0,4,4,0,2,2,11,38


## Save results

In [7]:
write.csv(synthetic_cpspop, file = paste0(here(), "/results/BN_cps_syndata.csv"), row.names = FALSE)
write.csv(synthetic_adult, file = paste0(here(), "/results/BN_adult_syndata.csv"), row.names = FALSE)

## Check data

In [8]:
# Function to tabulate levels with their relative frequencies
tabulate_levels <- function(x) {
  freq_table <- table(x)           # Absolute frequencies
  rel_freq <- prop.table(freq_table)  # Relative frequencies
  cbind(Frequency = as.vector(freq_table), Relative = as.vector(rel_freq))
}

In [9]:
test_synthetic_data <- function(original_data, synthetic_data) {
  # Check if the number of observations is the same
  if (nrow(original_data) != nrow(synthetic_data)) {
    stop("Mismatch in the number of observations between original and synthetic data.")
  }
  
  # Check if variable types are preserved
  for (var_name in colnames(original_data)) {
    original_type <- class(original_data[[var_name]])
    synthetic_type <- class(synthetic_data[[var_name]])
    
    if (original_type != synthetic_type) {
      stop(paste("Mismatch in variable type for", var_name, ":", original_type, "vs", synthetic_type))
    }
  }
  
  # Check for NAs in the synthetic data
  if (any(is.na(synthetic_data))) {
    stop("There are missing values (NAs) in the synthetic dataset.")
  }
  
  # Check if all levels of factor variables are present in the synthetic data
  for (var_name in colnames(original_data)) {
    if (is.factor(original_data[[var_name]])) {
      original_levels <- levels(original_data[[var_name]])
      synthetic_levels <- levels(factor(synthetic_data[[var_name]]))
      
      missing_levels <- setdiff(original_levels, synthetic_levels)
      
      if (length(missing_levels) > 0) {
        message(paste("Missing levels in variable", var_name, ":", paste(missing_levels, collapse = ", ")))
      }
    }
  }
  
  # If all tests pass
  message("All checks passed. Synthetic data is consistent with the original.")
}

In [10]:
sapply(discretize_df(cpspop), tabulate_levels, simplify = FALSE)

Frequency,Relative
32315,0.6536734364
8,0.0001618254
22,0.0004450198
17091,0.3457197184

Frequency,Relative
1672,0.03382151
47764,0.96617849

Frequency,Relative
7693,0.15561534
16665,0.33710252
13186,0.2667287
7946,0.16073307
3946,0.07982037

Frequency,Relative
28360,0.573671
21076,0.426329

Frequency,Relative
393,0.0079496723
26,0.0005259325
10369,0.2097459341
38648,0.781778461

Frequency,Relative
47478,0.9603932
1758,0.03556113
183,0.003701756
14,0.0002831944
3,6.068452e-05

Frequency,Relative
42504,0.8597783
4929,0.09970467
561,0.01134801
1442,0.02916903

Frequency,Relative
27065,0.547475524
117,0.002366696
711,0.014382232
4733,0.095739947
6850,0.138562991
1403,0.028380128
8557,0.173092483

Frequency,Relative
170,0.00343879
535,0.01082207
1000,0.02022817
1752,0.03543976
1123,0.02271624
1436,0.02904766
1539,0.03113116
577,0.01167166
15638,0.31632818
9230,0.18670604


In [11]:
sapply(synthetic_cpspop, tabulate_levels, simplify = FALSE)

Frequency,Relative
38741,0.7836597
0,0.0
0,0.0
10695,0.2163403

Frequency,Relative
0,0
49436,1

Frequency,Relative
7774,0.15725382
16733,0.33847803
13087,0.26472611
7911,0.16002508
3931,0.07951695

Frequency,Relative
28397,0.5744195
21039,0.4255805

Frequency,Relative
0,0.0
0,0.0
11842,0.239542
37594,0.760458

Frequency,Relative
49436,1
0,0
0,0
0,0
0,0

Frequency,Relative
49436,1
0,0
0,0
0,0

Frequency,Relative
37784,0.76430132
0,0.0
0,0.0
3931,0.07951695
0,0.0
0,0.0
7721,0.15618173

Frequency,Relative
0,0.0
0,0.0
0,0.0
0,0.0
0,0.0
0,0.0
0,0.0
0,0.0
49304,0.9973299
131,0.002649891


In [12]:
test_synthetic_data(original_data = discretize_df(cpspop), synthetic_data = synthetic_cpspop)

Missing levels in variable tax : (33333-66665], (66665-99997]



Missing levels in variable csp : (1-23917]



Missing levels in variable ss : (16671.3333333333-33335.6666666667], (33335.6666666667-50000]



Missing levels in variable income : (153749.2-307497.4], (307497.4-461245.6], (461245.6-614993.8], (614993.8-768742]



Missing levels in variable race : 2, 3, 4



Missing levels in variable marital : 2, 3, 5, 6



Missing levels in variable educ : 31, 32, 33, 34, 35, 36, 37, 38, 41, 42, 44, 45, 46



All checks passed. Synthetic data is consistent with the original.



In [13]:
sapply(discretize_df(adult), tabulate_levels, simplify = FALSE)

Frequency,Relative
10448,0.346396128
11686,0.387441151
6222,0.206286055
1637,0.054273589
169,0.005603077

Frequency,Relative
5,0.0001657715
26380,0.874610437
3652,0.121079504
111,0.0036801273
14,0.0004641602

Frequency,Relative
2538,0.08414561
27624,0.91585439

Frequency,Relative
1427,0.04731119
28735,0.95268881

Frequency,Relative
2388,0.07917247
18577,0.61590743
6740,0.22345998
2142,0.07101651
315,0.0104436

Frequency,Relative
22654,0.7510775
7508,0.2489225

Frequency,Relative
9782,0.3243154
20380,0.6756846

Frequency,Relative
286,0.00948213
895,0.029673099
2817,0.093395663
231,0.007658643
25933,0.859790465

Frequency,Relative
12463,0.41320204
7726,0.25615012
889,0.02947417
4466,0.1480671
3212,0.10649161
1406,0.04661495

Frequency,Relative
4214,0.1397122207
21,0.0006962403
14065,0.4663152311
370,0.012267091
9726,0.3224587229
939,0.0311318878
827,0.0274186062

Frequency,Relative
943,0.031264505
2067,0.0685299383
22286,0.7388767323
1074,0.0356077183
2499,0.082852596
1279,0.0424043498
14,0.0004641602

Frequency,Relative
3721,0.1233671507
9,0.0002983887
4030,0.1336118295
3992,0.132351966
989,0.0327896028
1350,0.0447583052
1966,0.065181354
3212,0.106491612
143,0.0047410649
4038,0.1338770639

Frequency,Relative
820,0.027186526
1048,0.034745707
377,0.012499171
151,0.005006299
288,0.009548438
557,0.018466945
455,0.015085207
1008,0.033419535
1307,0.04333267
5044,0.16723029

Frequency,Relative
18,0.0005967774
107,0.0035475101
68,0.0022544924
56,0.0018566408
92,0.0030501956
67,0.0022213381
27,0.0008951661
100,0.00331543
86,0.0028512698
27,0.0008951661


In [14]:
sapply(synthetic_adult, tabulate_levels, simplify = FALSE)

Frequency,Relative
7344,0.24348518
21761,0.72147072
699,0.02317486
358,0.01186924
0,0.0

Frequency,Relative
0,0
30162,1
0,0
0,0
0,0

Frequency,Relative
0,0
30162,1

Frequency,Relative
0,0
30162,1

Frequency,Relative
0,0.0
28400,0.94158212
1762,0.05841788
0,0.0
0,0.0

Frequency,Relative
26188,0.8682448
3974,0.1317552

Frequency,Relative
9763,0.3236854
20399,0.6763146

Frequency,Relative
0,0
0,0
0,0
0,0
30162,1

Frequency,Relative
19979,0.662389762
7576,0.2511769777
1,3.31543e-05
1305,0.0432663616
1267,0.0420064982
34,0.0011272462

Frequency,Relative
1267,0.0420065
0,0.0
20013,0.663517
0,0.0
8882,0.2944765
0,0.0
0,0.0

Frequency,Relative
930,0.0308334991
2081,0.0689940985
22339,0.7406339102
1069,0.0354419468
2482,0.0822889729
1250,0.0414428751
11,0.0003646973

Frequency,Relative
4186,0.1387839003
0,0.0
13773,0.4566341755
2551,0.0845766196
180,0.005967774
0,0.0
3,9.94629e-05
1572,0.0521185598
0,0.0
3451,0.1144154897

Frequency,Relative
0,0.0
0,0.0
0,0.0
0,0.0
0,0.0
0,0.0
0,0.0
0,0.0
0,0.0
6021,0.19962204

Frequency,Relative
0,0
0,0
0,0
0,0
0,0
0,0
0,0
0,0
0,0
0,0


In [15]:
test_synthetic_data(original_data = discretize_df(adult), synthetic_data = synthetic_adult)

Missing levels in variable age : (75.4-90]



Missing levels in variable fnlwgt : (1190517.8-1484705], (307956.2-602143.4], (602143.4-896330.6], (896330.6-1190517.8]



Missing levels in variable capital_gain : (114-99999]



Missing levels in variable capital_loss : (155-4356]



Missing levels in variable hours_per_week : (1-20.6], (59.8-79.4], (79.4-99]



Missing levels in variable race : 0, 1, 2, 3



Missing levels in variable marital_status : 1, 3, 5, 6



Missing levels in variable occupation : 1, 5, 8, 10, 12, 13



Missing levels in variable education : 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 13, 14



Missing levels in variable native_country : 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 39, 40



All checks passed. Synthetic data is consistent with the original.

