# Synthetic Data Generator with a Bayesian Network Model

## Libraries

In [1]:
list_of_packages <- c ("synthpop", "insight", "party", "haven", "dplyr", "rpart", "rpart.plot", "randomForest", "pROC", "caret", "pracma", "here", "Hmisc", "purrr", "randomForest", "caret", "ranger",  "pracma", "bnlearn", "arulesCBA")

install_if_missing <- function(p){
  if(!requireNamespace(p, quietly = TRUE)){
    install.packages(p)
  }
  library(p, character.only=TRUE)
}


lapply(list_of_packages, install_if_missing)

Find out more at https://www.synthpop.org.uk/



Loading required package: grid



Loading required package: mvtnorm



Loading required package: modeltools



Loading required package: stats4



Loading required package: strucchange



Loading required package: zoo




Attaching package: ‘zoo’




The following objects are masked from ‘package:base’:

    as.Date, as.Date.numeric




Loading required package: sandwich




Attaching package: ‘dplyr’




The following object is masked from ‘package:party’:

    where




The following objects are masked from ‘package:stats’:

    filter, lag




The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




randomForest 4.7-1.1



Type rfNews() to see new features/changes/bug fixes.




Attaching package: ‘randomForest’




The following object is masked from ‘package:dplyr’:

    combine




Type 'citation("pROC")' for a citation.




Attaching package: ‘pROC’




The following objects are masked from ‘package:stats’:

    cov, smooth, var




Loading required package: ggplot2




Attaching package: ‘ggplot2’




The following object is masked from ‘package:randomForest’:

    margin




Loading required package: lattice



here() starts at /home/uni08/hpc/emma.foessing01/u11969/Master-Thesis




Attaching package: ‘Hmisc’




The following object is masked from ‘package:pracma’:

    ceil




The following objects are masked from ‘package:dplyr’:

    src, summarize




The following objects are masked from ‘package:base’:

    format.pval, units





Attaching package: ‘purrr’




The following object is masked from ‘package:pracma’:

    cross




The following object is masked from ‘package:caret’:

    lift





Attaching package: ‘ranger’




The following object is masked from ‘package:randomForest’:

    importance





Attaching package: ‘bnlearn’




The following object is masked from ‘package:Hmisc’:

    impute




The following object is masked from ‘package:synthpop’:

    compare




Loading required package: Matrix




Attaching package: ‘Matrix’




The following objects are masked from ‘package:pracma’:

    expm, lu, tril, triu




Loading required package: arules




Attaching package: ‘arules’




The following object is masked from ‘package:bnlearn’:

    discretize




The following object is masked from ‘package:pracma’:

    size




The following object is masked from ‘package:dplyr’:

    recode




The following object is masked from ‘package:modeltools’:

    info




The following objects are masked from ‘package:base’:

    abbreviate, write





Attaching package: ‘arulesCBA’




The following object is masked from ‘package:party’:

    response




## Data

In [2]:
load(file = (paste0(here(), "/cpspop.RData")))
cpspop <- cpspop[, c(setdiff(names(cpspop), c("income", "race", "marital", "educ")), "income", "race", "marital", "educ")] #

adult <- read.csv(file = (paste0(here(),"/adult_preprocessed.csv")))
# delete NAs
adult[adult == "?"] <- NA
adult <- na.omit(adult)

adult$workclass <- as.factor(adult$workclass)
adult$education <- as.factor(adult$education)
adult$marital_status <- as.factor(adult$marital_status)
adult$relationship <- as.factor(adult$relationship)
adult$race <- as.factor(adult$race)
adult$sex <- as.factor(adult$sex)
adult$native_country <- as.factor(adult$native_country)
adult$income <- as.factor(adult$income)
adult$occupation <- as.factor(adult$occupation)

adult <- adult[, c("age", "fnlwgt", "capital_gain", "capital_loss", "hours_per_week", "income", "sex", "race", "relationship", "marital_status", "workclass", "occupation", "education", "native_country")]


## Synthetic Data

In [3]:
discretize_df = function(df, breaks = 5) {
  for (var in colnames(df)) {
    # Check if the variable is not a factor
    if (!is.factor(df[[var]])) {

      # Count the frequency of each unique value
      freq_table <- table(df[[var]])

      # Calculate the proportion of zeros, ensuring NA is handled
      zero_proportion <- ifelse(!is.na(freq_table[as.character(0)]), 
                                freq_table[as.character(0)] / sum(freq_table), 
                                0)

      # Determine the number of breaks based on zero proportion
      if (zero_proportion > 4/5) {
        new_breaks = 1
      } else if (zero_proportion > 1/4) {
        new_breaks = breaks - 2
      } else if (zero_proportion > 1/5) {
        new_breaks = breaks - 1
      } else {
        new_breaks = breaks
      }
      
      # Separate zeros and non-zeros
      zero_portion = (df[[var]] == 0)
      non_zero_values = df[[var]][!zero_portion]

      # Discretize non-zero values
      if (length(non_zero_values) > 0) {
        # Calculate breaks for non-zero values
        range_values = range(non_zero_values, na.rm = TRUE)
        breaks_values = seq(range_values[1], range_values[2], length.out = new_breaks + 1)
        
        # Ensure correct number of labels are created
        labels = sapply(1:(length(breaks_values)-1), function(i) 
                        paste("(", breaks_values[i], "-", breaks_values[i+1], "]", sep=""))

        # Use cut to apply these breaks and labels
        discretized_non_zeros = cut(non_zero_values, breaks = breaks_values, labels = labels, include.lowest = TRUE)
        # Combine zero and discretized non-zeros into the original dataframe
        df[[var]] <- factor(ifelse(zero_portion, "0", as.character(discretized_non_zeros)))
      } else {
        # If all values are zero or the number of breaks is zero or negative
        df[[var]] <- factor("0")
      }
    }
  }
  return(df)
}

sequential synthesis does not make a difference since the learned DAG already has all conditional probabilaties modeled

In [4]:
synthesize_data_bn <- function(data, seed = seed) {
  # Discretize only factors, leave numeric variables as they are
  data <- discretize_df(data)

  set.seed(seed)
  
  # learn structure
  bn_structure <- tabu(data)  
  
  # fir with parametres
  bn_fitted <- bn.fit(bn_structure, data, method = "mle")
  
  # rbn() function generates synthetic data based on the fitted Bayesian network
  syn_data <- rbn(bn_fitted, n = nrow(data))  # gen same number of obs as original dataframe
  
  # ensure factor levels match the original dataset
  for (var in colnames(data)) {
    if (is.factor(data[[var]])) {
      syn_data[[var]] <- factor(syn_data[[var]], levels = levels(data[[var]]))  # Match factor levels
    }
  }
  
  return(syn_data)
}

## Apply

In [5]:
save_synthesized_data <- function(data, dataset_name, seed) {
  file_name <- paste0(dataset_name, "_bn_", as.character(seed), ".rds")
  saveRDS(data, paste0(here(), "/results/", file_name))
}

In [6]:
s <- 1238

cps_syn <- synthesize_data_bn(cpspop, seed = s)
adult_syn <- synthesize_data_bn(adult, seed = s)

In [7]:
save_synthesized_data(cps_syn, "cps", s)
save_synthesized_data(adult_syn, "adult", s)

## Check data

In [8]:
# Function to tabulate levels with their relative frequencies
tabulate_levels <- function(x) {
  freq_table <- table(x)           # Absolute frequencies
  rel_freq <- prop.table(freq_table)  # Relative frequencies
  cbind(Frequency = as.vector(freq_table), Relative = as.vector(rel_freq))
}

In [9]:
test_synthetic_data <- function(original_data, synthetic_data) {
  # Check if the number of observations is the same
  if (nrow(original_data) != nrow(synthetic_data)) {
    stop("Mismatch in the number of observations between original and synthetic data.")
  }
  
  # Check if variable types are preserved
  for (var_name in colnames(original_data)) {
    original_type <- class(original_data[[var_name]])
    synthetic_type <- class(synthetic_data[[var_name]])
    
    if (original_type != synthetic_type) {
      stop(paste("Mismatch in variable type for", var_name, ":", original_type, "vs", synthetic_type))
    }
  }
  
  # Check for NAs in the synthetic data
  if (any(is.na(synthetic_data))) {
    stop("There are missing values (NAs) in the synthetic dataset.")
  }
  
  # Check if all levels of factor variables are present in the synthetic data
  for (var_name in colnames(original_data)) {
    if (is.factor(original_data[[var_name]])) {
      original_levels <- levels(original_data[[var_name]])
      synthetic_levels <- levels(factor(synthetic_data[[var_name]]))
      
      missing_levels <- setdiff(original_levels, synthetic_levels)
      
      if (length(missing_levels) > 0) {
        message(paste("Missing levels in variable", var_name, ":", paste(missing_levels, collapse = ", ")))
      }
    }
  }
  
  # If all tests pass
  message("All checks passed. Synthetic data is consistent with the original.")
}

In [10]:
sapply(discretize_df(cpspop), tabulate_levels, simplify = FALSE)

Frequency,Relative
32315,0.6536734364
8,0.0001618254
22,0.0004450198
17091,0.3457197184

Frequency,Relative
1672,0.03382151
47764,0.96617849

Frequency,Relative
7693,0.15561534
16665,0.33710252
13186,0.2667287
7946,0.16073307
3946,0.07982037

Frequency,Relative
28360,0.573671
21076,0.426329

Frequency,Relative
393,0.0079496723
26,0.0005259325
10369,0.2097459341
38648,0.781778461

Frequency,Relative
47478,0.9603932
1758,0.03556113
183,0.003701756
14,0.0002831944
3,6.068452e-05

Frequency,Relative
42504,0.8597783
4929,0.09970467
561,0.01134801
1442,0.02916903

Frequency,Relative
27065,0.547475524
117,0.002366696
711,0.014382232
4733,0.095739947
6850,0.138562991
1403,0.028380128
8557,0.173092483

Frequency,Relative
170,0.00343879
535,0.01082207
1000,0.02022817
1752,0.03543976
1123,0.02271624
1436,0.02904766
1539,0.03113116
577,0.01167166
15638,0.31632818
9230,0.18670604


In [11]:
sapply(cps_syn, tabulate_levels, simplify = FALSE)

Frequency,Relative
32510,0.6576179303
7,0.0001415972
25,0.0005057043
16894,0.3417347682

Frequency,Relative
1616,0.03268873
47820,0.96731127

Frequency,Relative
7657,0.15488713
16701,0.33783073
13053,0.26403835
8038,0.16259406
3987,0.08064973

Frequency,Relative
28382,0.574116
21054,0.425884

Frequency,Relative
431,0.008718343
23,0.000465248
10424,0.210858484
38558,0.779957925

Frequency,Relative
47474,0.960312323
1765,0.0357027268
182,0.0036815276
10,0.0002022817
5,0.0001011409

Frequency,Relative
42551,0.86072902
4945,0.10002832
566,0.01144915
1374,0.02779351

Frequency,Relative
27101,0.548203738
122,0.002467837
712,0.01440246
4740,0.095881544
6784,0.137227931
1368,0.027672142
8609,0.174144348

Frequency,Relative
179,0.003620843
575,0.0116312
991,0.02004612
1811,0.036633223
1102,0.022291448
1415,0.028622866
1602,0.032405534
572,0.011570515
15581,0.315175176
9260,0.187312889


In [12]:
test_synthetic_data(original_data = discretize_df(cpspop), synthetic_data = cps_syn)

All checks passed. Synthetic data is consistent with the original.



In [13]:
sapply(discretize_df(adult), tabulate_levels, simplify = FALSE)

Frequency,Relative
10448,0.346396128
11686,0.387441151
6222,0.206286055
1637,0.054273589
169,0.005603077

Frequency,Relative
5,0.0001657715
26380,0.874610437
3652,0.121079504
111,0.0036801273
14,0.0004641602

Frequency,Relative
2538,0.08414561
27624,0.91585439

Frequency,Relative
1427,0.04731119
28735,0.95268881

Frequency,Relative
2388,0.07917247
18577,0.61590743
6740,0.22345998
2142,0.07101651
315,0.0104436

Frequency,Relative
22654,0.7510775
7508,0.2489225

Frequency,Relative
9782,0.3243154
20380,0.6756846

Frequency,Relative
286,0.00948213
895,0.029673099
2817,0.093395663
231,0.007658643
25933,0.859790465

Frequency,Relative
12463,0.41320204
7726,0.25615012
889,0.02947417
4466,0.1480671
3212,0.10649161
1406,0.04661495

Frequency,Relative
4214,0.1397122207
21,0.0006962403
14065,0.4663152311
370,0.012267091
9726,0.3224587229
939,0.0311318878
827,0.0274186062

Frequency,Relative
943,0.031264505
2067,0.0685299383
22286,0.7388767323
1074,0.0356077183
2499,0.082852596
1279,0.0424043498
14,0.0004641602

Frequency,Relative
3721,0.1233671507
9,0.0002983887
4030,0.1336118295
3992,0.132351966
989,0.0327896028
1350,0.0447583052
1966,0.065181354
3212,0.106491612
143,0.0047410649
4038,0.1338770639

Frequency,Relative
820,0.027186526
1048,0.034745707
377,0.012499171
151,0.005006299
288,0.009548438
557,0.018466945
455,0.015085207
1008,0.033419535
1307,0.04333267
5044,0.16723029

Frequency,Relative
18,0.0005967774
107,0.0035475101
68,0.0022544924
56,0.0018566408
92,0.0030501956
67,0.0022213381
27,0.0008951661
100,0.00331543
86,0.0028512698
27,0.0008951661


In [14]:
sapply(adult_syn, tabulate_levels, simplify = FALSE)

Frequency,Relative
10397,0.344705258
11793,0.390988661
6222,0.206286055
1589,0.052682183
161,0.005337842

Frequency,Relative
0,0.0
26412,0.8756713746
3618,0.1199522578
113,0.0037464359
19,0.0006299317

Frequency,Relative
2431,0.0805981
27731,0.9194019

Frequency,Relative
1379,0.04571978
28783,0.95428022

Frequency,Relative
2397,0.07947086
18687,0.61955441
6721,0.22283005
2023,0.06707115
334,0.01107354

Frequency,Relative
22833,0.7570121
7329,0.2429879

Frequency,Relative
9819,0.3255421
20343,0.6744579

Frequency,Relative
280,0.009283204
884,0.029308401
2840,0.094158212
235,0.007791261
25923,0.859458922

Frequency,Relative
12443,0.41253896
7702,0.25535442
934,0.03096612
4386,0.14541476
3253,0.10785094
1444,0.04787481

Frequency,Relative
4208,0.1395132949
19,0.0006299317
14070,0.4664810026
346,0.0114713878
9774,0.3240501293
924,0.0306345733
821,0.0272196804

Frequency,Relative
1013,0.033585306
2010,0.0666401432
22294,0.7391419667
1074,0.0356077183
2504,0.0830183675
1254,0.0415754923
13,0.0004310059

Frequency,Relative
3701,0.122704065
10,0.000331543
3961,0.131324183
4024,0.133412904
1007,0.03338638
1301,0.043133744
1960,0.064982428
3229,0.107055235
140,0.004641602
4119,0.136562562

Frequency,Relative
837,0.027750149
1032,0.034215238
365,0.01210132
135,0.004475831
293,0.00971421
580,0.019229494
433,0.014355812
953,0.031596048
1280,0.042437504
4965,0.1646111

Frequency,Relative
15,0.0004973145
99,0.0032822757
70,0.002320801
65,0.0021550295
83,0.0027518069
73,0.0024202639
28,0.0009283204
97,0.0032159671
81,0.0026854983
26,0.0008620118


In [15]:
test_synthetic_data(original_data = discretize_df(adult), synthetic_data = adult_syn)

Missing levels in variable fnlwgt : (1190517.8-1484705]



All checks passed. Synthetic data is consistent with the original.

