# Synthetic Data Generator with a Bayesian Network Model

## Libraries

In [1]:
list_of_packages <- c ("synthpop", "insight", "party", "haven", "dplyr", "rpart", "rpart.plot", "randomForest", "pROC", "caret", "pracma", "here", "Hmisc", "purrr", "randomForest", "caret", "ranger",  "pracma", "bnlearn", "arulesCBA")

install_if_missing <- function(p){
  if(!requireNamespace(p, quietly = TRUE)){
    install.packages(p)
  }
  library(p, character.only=TRUE)
}


lapply(list_of_packages, install_if_missing)

Find out more at https://www.synthpop.org.uk/



Loading required package: grid



Loading required package: mvtnorm



Loading required package: modeltools



Loading required package: stats4



Loading required package: strucchange



Loading required package: zoo




Attaching package: ‘zoo’




The following objects are masked from ‘package:base’:

    as.Date, as.Date.numeric




Loading required package: sandwich




Attaching package: ‘dplyr’




The following object is masked from ‘package:party’:

    where




The following objects are masked from ‘package:stats’:

    filter, lag




The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




randomForest 4.7-1.1



Type rfNews() to see new features/changes/bug fixes.




Attaching package: ‘randomForest’




The following object is masked from ‘package:dplyr’:

    combine




Type 'citation("pROC")' for a citation.




Attaching package: ‘pROC’




The following objects are masked from ‘package:stats’:

    cov, smooth, var




Loading required package: ggplot2




Attaching package: ‘ggplot2’




The following object is masked from ‘package:randomForest’:

    margin




Loading required package: lattice



here() starts at /home/uni08/hpc/emma.foessing01/u11969/Master-Thesis




Attaching package: ‘Hmisc’




The following object is masked from ‘package:pracma’:

    ceil




The following objects are masked from ‘package:dplyr’:

    src, summarize




The following objects are masked from ‘package:base’:

    format.pval, units





Attaching package: ‘purrr’




The following object is masked from ‘package:pracma’:

    cross




The following object is masked from ‘package:caret’:

    lift





Attaching package: ‘ranger’




The following object is masked from ‘package:randomForest’:

    importance





Attaching package: ‘bnlearn’




The following object is masked from ‘package:Hmisc’:

    impute




The following object is masked from ‘package:synthpop’:

    compare




Loading required package: Matrix




Attaching package: ‘Matrix’




The following objects are masked from ‘package:pracma’:

    expm, lu, tril, triu




Loading required package: arules




Attaching package: ‘arules’




The following object is masked from ‘package:bnlearn’:

    discretize




The following object is masked from ‘package:pracma’:

    size




The following object is masked from ‘package:dplyr’:

    recode




The following object is masked from ‘package:modeltools’:

    info




The following objects are masked from ‘package:base’:

    abbreviate, write





Attaching package: ‘arulesCBA’




The following object is masked from ‘package:party’:

    response




## Data

In [2]:
load(file = (paste0(here(), "/cpspop.RData")))
cpspop <- cpspop[, c(setdiff(names(cpspop), c("income", "race", "marital", "educ")), "income", "race", "marital", "educ")] #

adult <- read.csv(file = (paste0(here(),"/adult_preprocessed.csv")))
# delete NAs
adult[adult == "?"] <- NA
adult <- na.omit(adult)

adult$workclass <- as.factor(adult$workclass)
adult$education <- as.factor(adult$education)
adult$marital_status <- as.factor(adult$marital_status)
adult$relationship <- as.factor(adult$relationship)
adult$race <- as.factor(adult$race)
adult$sex <- as.factor(adult$sex)
adult$native_country <- as.factor(adult$native_country)
adult$income <- as.factor(adult$income)
adult$occupation <- as.factor(adult$occupation)

adult <- adult[, c("age", "fnlwgt", "capital_gain", "capital_loss", "hours_per_week", "income", "sex", "race", "relationship", "marital_status", "workclass", "occupation", "education", "native_country")]


## Synthetic Data

In [3]:
discretize_df = function(df, breaks = 5) {
  for (var in colnames(df)) {
    # Check if the variable is not a factor
    if (!is.factor(df[[var]])) {

      # Count the frequency of each unique value
      freq_table <- table(df[[var]])

      # Calculate the proportion of zeros, ensuring NA is handled
      zero_proportion <- ifelse(!is.na(freq_table[as.character(0)]), 
                                freq_table[as.character(0)] / sum(freq_table), 
                                0)

      # Determine the number of breaks based on zero proportion
      if (zero_proportion > 4/5) {
        new_breaks = 1
      } else if (zero_proportion > 1/4) {
        new_breaks = breaks - 2
      } else if (zero_proportion > 1/5) {
        new_breaks = breaks - 1
      } else {
        new_breaks = breaks
      }
      
      # Separate zeros and non-zeros
      zero_portion = (df[[var]] == 0)
      non_zero_values = df[[var]][!zero_portion]

      # Discretize non-zero values
      if (length(non_zero_values) > 0) {
        # Calculate breaks for non-zero values
        range_values = range(non_zero_values, na.rm = TRUE)
        breaks_values = seq(range_values[1], range_values[2], length.out = new_breaks + 1)
        
        # Ensure correct number of labels are created
        labels = sapply(1:(length(breaks_values)-1), function(i) 
                        paste("(", breaks_values[i], "-", breaks_values[i+1], "]", sep=""))

        # Use cut to apply these breaks and labels
        discretized_non_zeros = cut(non_zero_values, breaks = breaks_values, labels = labels, include.lowest = TRUE)
        # Combine zero and discretized non-zeros into the original dataframe
        df[[var]] <- factor(ifelse(zero_portion, "0", as.character(discretized_non_zeros)))
      } else {
        # If all values are zero or the number of breaks is zero or negative
        df[[var]] <- factor("0")
      }
    }
  }
  return(df)
}

sequential synthesis does not make a difference since the learned DAG already has all conditional probabilaties modeled

In [4]:
synthesize_data_bn <- function(data, seed = seed) {
  # Discretize only factors, leave numeric variables as they are
  data <- discretize_df(data)

  set.seed(seed)
  
  # learn structure
  bn_structure <- tabu(data)  
  
  # fir with parametres
  bn_fitted <- bn.fit(bn_structure, data, method = "mle")
  
  # rbn() function generates synthetic data based on the fitted Bayesian network
  syn_data <- rbn(bn_fitted, n = nrow(data))  # gen same number of obs as original dataframe
  
  # ensure factor levels match the original dataset
  for (var in colnames(data)) {
    if (is.factor(data[[var]])) {
      syn_data[[var]] <- factor(syn_data[[var]], levels = levels(data[[var]]))  # Match factor levels
    }
  }
  
  return(syn_data)
}

## Apply

In [5]:
save_synthesized_data <- function(data, dataset_name, seed) {
  file_name <- paste0(dataset_name, "_bn_", as.character(seed), ".rds")
  saveRDS(data, paste0(here(), "/results/", file_name))
}

In [6]:
s <- 1241

cps_syn <- synthesize_data_bn(cpspop, seed = s)
adult_syn <- synthesize_data_bn(adult, seed = s)

In [7]:
save_synthesized_data(cps_syn, "cps", s)
save_synthesized_data(adult_syn, "adult", s)

## Check data

In [8]:
# Function to tabulate levels with their relative frequencies
tabulate_levels <- function(x) {
  freq_table <- table(x)           # Absolute frequencies
  rel_freq <- prop.table(freq_table)  # Relative frequencies
  cbind(Frequency = as.vector(freq_table), Relative = as.vector(rel_freq))
}

In [9]:
test_synthetic_data <- function(original_data, synthetic_data) {
  # Check if the number of observations is the same
  if (nrow(original_data) != nrow(synthetic_data)) {
    stop("Mismatch in the number of observations between original and synthetic data.")
  }
  
  # Check if variable types are preserved
  for (var_name in colnames(original_data)) {
    original_type <- class(original_data[[var_name]])
    synthetic_type <- class(synthetic_data[[var_name]])
    
    if (original_type != synthetic_type) {
      stop(paste("Mismatch in variable type for", var_name, ":", original_type, "vs", synthetic_type))
    }
  }
  
  # Check for NAs in the synthetic data
  if (any(is.na(synthetic_data))) {
    stop("There are missing values (NAs) in the synthetic dataset.")
  }
  
  # Check if all levels of factor variables are present in the synthetic data
  for (var_name in colnames(original_data)) {
    if (is.factor(original_data[[var_name]])) {
      original_levels <- levels(original_data[[var_name]])
      synthetic_levels <- levels(factor(synthetic_data[[var_name]]))
      
      missing_levels <- setdiff(original_levels, synthetic_levels)
      
      if (length(missing_levels) > 0) {
        message(paste("Missing levels in variable", var_name, ":", paste(missing_levels, collapse = ", ")))
      }
    }
  }
  
  # If all tests pass
  message("All checks passed. Synthetic data is consistent with the original.")
}

In [10]:
sapply(discretize_df(cpspop), tabulate_levels, simplify = FALSE)

Frequency,Relative
32315,0.6536734364
8,0.0001618254
22,0.0004450198
17091,0.3457197184

Frequency,Relative
1672,0.03382151
47764,0.96617849

Frequency,Relative
7693,0.15561534
16665,0.33710252
13186,0.2667287
7946,0.16073307
3946,0.07982037

Frequency,Relative
28360,0.573671
21076,0.426329

Frequency,Relative
393,0.0079496723
26,0.0005259325
10369,0.2097459341
38648,0.781778461

Frequency,Relative
47478,0.9603932
1758,0.03556113
183,0.003701756
14,0.0002831944
3,6.068452e-05

Frequency,Relative
42504,0.8597783
4929,0.09970467
561,0.01134801
1442,0.02916903

Frequency,Relative
27065,0.547475524
117,0.002366696
711,0.014382232
4733,0.095739947
6850,0.138562991
1403,0.028380128
8557,0.173092483

Frequency,Relative
170,0.00343879
535,0.01082207
1000,0.02022817
1752,0.03543976
1123,0.02271624
1436,0.02904766
1539,0.03113116
577,0.01167166
15638,0.31632818
9230,0.18670604


In [11]:
sapply(cps_syn, tabulate_levels, simplify = FALSE)

Frequency,Relative
32345,0.6542802816
12,0.0002427381
22,0.0004450198
17057,0.3450319605

Frequency,Relative
1729,0.03497451
47707,0.96502549

Frequency,Relative
7622,0.15417914
16631,0.33641476
13197,0.26695121
7984,0.16150174
4002,0.08095315

Frequency,Relative
28560,0.5777166
20876,0.4222834

Frequency,Relative
377,0.007626022
29,0.000586617
10547,0.213346549
38483,0.778440812

Frequency,Relative
47376,0.958329962
1836,0.0371389271
207,0.004187232
13,0.0002629663
4,8.09127e-05

Frequency,Relative
42505,0.85979853
4949,0.10010923
544,0.01100413
1438,0.02908811

Frequency,Relative
27102,0.548223966
117,0.002366696
697,0.014099037
4784,0.096771583
6855,0.138664131
1353,0.027368719
8528,0.172505866

Frequency,Relative
164,0.003317421
540,0.010923214
997,0.020167489
1779,0.035985921
1199,0.02425358
1389,0.028096933
1536,0.031070475
594,0.012015535
15456,0.312646654
9210,0.186301481


In [12]:
test_synthetic_data(original_data = discretize_df(cpspop), synthetic_data = cps_syn)

All checks passed. Synthetic data is consistent with the original.



In [13]:
sapply(discretize_df(adult), tabulate_levels, simplify = FALSE)

Frequency,Relative
10448,0.346396128
11686,0.387441151
6222,0.206286055
1637,0.054273589
169,0.005603077

Frequency,Relative
5,0.0001657715
26380,0.874610437
3652,0.121079504
111,0.0036801273
14,0.0004641602

Frequency,Relative
2538,0.08414561
27624,0.91585439

Frequency,Relative
1427,0.04731119
28735,0.95268881

Frequency,Relative
2388,0.07917247
18577,0.61590743
6740,0.22345998
2142,0.07101651
315,0.0104436

Frequency,Relative
22654,0.7510775
7508,0.2489225

Frequency,Relative
9782,0.3243154
20380,0.6756846

Frequency,Relative
286,0.00948213
895,0.029673099
2817,0.093395663
231,0.007658643
25933,0.859790465

Frequency,Relative
12463,0.41320204
7726,0.25615012
889,0.02947417
4466,0.1480671
3212,0.10649161
1406,0.04661495

Frequency,Relative
4214,0.1397122207
21,0.0006962403
14065,0.4663152311
370,0.012267091
9726,0.3224587229
939,0.0311318878
827,0.0274186062

Frequency,Relative
943,0.031264505
2067,0.0685299383
22286,0.7388767323
1074,0.0356077183
2499,0.082852596
1279,0.0424043498
14,0.0004641602

Frequency,Relative
3721,0.1233671507
9,0.0002983887
4030,0.1336118295
3992,0.132351966
989,0.0327896028
1350,0.0447583052
1966,0.065181354
3212,0.106491612
143,0.0047410649
4038,0.1338770639

Frequency,Relative
820,0.027186526
1048,0.034745707
377,0.012499171
151,0.005006299
288,0.009548438
557,0.018466945
455,0.015085207
1008,0.033419535
1307,0.04333267
5044,0.16723029

Frequency,Relative
18,0.0005967774
107,0.0035475101
68,0.0022544924
56,0.0018566408
92,0.0030501956
67,0.0022213381
27,0.0008951661
100,0.00331543
86,0.0028512698
27,0.0008951661


In [14]:
sapply(adult_syn, tabulate_levels, simplify = FALSE)

Frequency,Relative
10379,0.344108481
11800,0.391220741
6142,0.203633711
1658,0.05496983
183,0.006067237

Frequency,Relative
5,0.0001657715
26378,0.8745441284
3649,0.1209800411
110,0.003646973
20,0.000663086

Frequency,Relative
2497,0.08278629
27665,0.91721371

Frequency,Relative
1410,0.04674756
28752,0.95325244

Frequency,Relative
2390,0.07923878
18498,0.61328824
6723,0.22289636
2241,0.07429879
310,0.01027783

Frequency,Relative
22750,0.7542603
7412,0.2457397

Frequency,Relative
9744,0.3230555
20418,0.6769445

Frequency,Relative
327,0.010841456
870,0.028844241
2788,0.092434189
240,0.007957032
25937,0.859923082

Frequency,Relative
12458,0.41303627
7711,0.25565281
848,0.02811485
4550,0.15085207
3193,0.10586168
1402,0.04648233

Frequency,Relative
4165,0.13808766
19,0.0006299317
14074,0.4666136198
380,0.012598634
9846,0.3264372389
894,0.0296399443
784,0.0259929713

Frequency,Relative
913,0.030269876
2062,0.0683641668
22321,0.7400371328
1109,0.0367681188
2489,0.082521053
1259,0.0417412638
9,0.0002983887

Frequency,Relative
3636,0.120549035
10,0.000331543
4064,0.134739076
3929,0.130263245
977,0.032391751
1317,0.043664213
1983,0.065744977
3198,0.106027452
130,0.004310059
4189,0.138883363

Frequency,Relative
783,0.025959817
1062,0.035209867
351,0.011637159
138,0.004575293
292,0.009681056
533,0.017671242
472,0.01564883
960,0.031828128
1406,0.046614946
5136,0.170280485

Frequency,Relative
17,0.0005636231
106,0.0035143558
70,0.002320801
49,0.0016245607
73,0.0024202639
73,0.0024202639
27,0.0008951661
123,0.0040779789
94,0.0031165042
36,0.0011935548


In [15]:
test_synthetic_data(original_data = discretize_df(adult), synthetic_data = adult_syn)

All checks passed. Synthetic data is consistent with the original.

