# Generate synthetic data
Generating synthetic data using public libraries for the 'adult' dataset.

## Load libraries and define settings

In [1]:
# standard library
import os
import sys
from pathlib import Path

# general dependencies
import numpy as np
import pandas as pd

# synthesis methods
from synthcity.plugins import Plugins
from synthcity.plugins.core.dataloader import GenericDataLoader
from synthcity.benchmark import Benchmarks

from synthesis.synthesizers.privbayes import PrivBayes
from synthesis.evaluation.metrics import MarginalComparison, AssociationsComparison

# repo code
from crnsynth import config
# NOTE(review): 'crnsynthsynth' looks like a find/replace slip for the 'crnsynth'
# package used on the neighbouring lines — confirm the module path.
from crnsynthsynth.generators.privbayes_dk import PrivBayesDK
from crnsynth import util
from crnsynth import visual

# dataset-specific synthesis pipeline
from synthesis_scripts.adult.adult_synthesis import AdultSynthPipe


  from .autonotebook import tqdm as notebook_tqdm
<stdin>:1:10: fatal error: 'omp.h' file not found
#include <omp.h>
         ^~~~~~~
1 error generated.




[2023-11-20T10:08:22.152950+0100][22224][CRITICAL] module disabled: /Users/dknoors/miniconda3/envs/synth/lib/python3.9/site-packages/synthcity/plugins/generic/plugin_goggle.py


In [2]:
# autoreload changes from local files
%load_ext autoreload
%autoreload 2

# pandas show full output
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 200)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# privacy budget for differentially private algorithms;
# passed as `epsilon` when the generator plugin is created below
EPSILON = 1

# dataset key; used to look up the CSV path in config.PATH_DATA
DATASET_NAME = 'adult'

## Load and process dataset

In [4]:
# read the raw dataset from the path registered under DATASET_NAME in config.PATH_DATA
df_adult = pd.read_csv(config.PATH_DATA[DATASET_NAME])
# preview the first rows
df_adult.head()



Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## Generate
Here we show step by step how to generate synthetic data.

We define our synthesis pipeline, which helps us perform all the steps in the right order. You can inherit common functionality across synthesis pipelines, while also adding your own data-specific methods to it.

This ensures that you can benefit from a defined structure without needing to re-implement the generic elements, while still having the flexibility to customize it to your needs.

In [5]:
# choose generator: the 'privbayes-dk' plugin, configured with the privacy budget EPSILON
generator = Plugins().get('privbayes-dk', epsilon=EPSILON)

# initialize custom pipeline for dataset
# (AdultSynthPipe is imported together with the other dependencies in the imports cell)
synth_pipe = AdultSynthPipe(generator=generator)

[2023-11-20T10:08:26.071305+0100][22224][CRITICAL] module disabled: /Users/dknoors/miniconda3/envs/synth/lib/python3.9/site-packages/synthcity/plugins/generic/plugin_goggle.py


Now we will prepare the data and split it into training and testing sets. It's good to leave some holdout data that can be used to compare the performance of models trained on real data and synthetic data.

In [6]:
# create copy to retain original data format
# (df_adult itself is passed to run() in the one-call example further down)
df = df_adult.copy()

# prepare data: process_data returns the training and holdout frames
df_train, df_test = synth_pipe.process_data(df)
df_train.head()

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,income
29101,19,?,Some-college,Never-married,?,Other-relative,White,Female,30,United-States,<=50K
10606,50,Private,Some-college,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,40,United-States,<=50K
9414,62,Private,Some-college,Married-civ-spouse,Exec-managerial,Husband,White,Male,40,United-States,<=50K
1747,20,Private,HS-grad,Never-married,Adm-clerical,Not-in-family,White,Female,40,United-States,<=50K
10677,25,Private,Bachelors,Never-married,Adm-clerical,Own-child,White,Female,40,United-States,<=50K


Train the generator on the processed training data. It might give some warnings about categories that occur quite infrequently and might be a privacy risk to leave in your data. Decide how you would like to deal with them, or proceed.

In [7]:
# train generator on the processed training split;
# progress of the network construction is printed while fitting
synth_pipe.fit(df_train)

1/11 - Root of network: race

2/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 10
Selected attribute: 'income' - with parents: ('race',)

3/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 9
Selected attribute: 'marital-status' - with parents: ('race', 'income')

4/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 16
Selected attribute: 'relationship' - with parents: ('marital-status', 'income')

5/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 24
Selected attribute: 'age' - with parents: ('marital-status', 'income')

6/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 27
Selected attribute: 'sex' - with parents: ('race', 'relationship', 'income')

7/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 34
Selected attribute: 'occupation' - 

After training the synthesizer, we can now sample records from it to generate a synthetic dataset.

In [8]:
# generate synthetic data
# n_records=None leaves the sample size to the pipeline; the recorded run below
# produced 32561 records — NOTE(review): confirm how the default size is chosen
df_synth = synth_pipe.generate(n_records=None)
df_synth.head()

Number of records generated: 32561 / 32561
Synthetic Data Generated

Number of records generated: 32561 / 32561
Synthetic Data Generated



Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,income
0,45,Private,HS-grad,Married-civ-spouse,Sales,Husband,White,Male,47,United-States,>50K
1,22,Private,Bachelors,Never-married,Transport-moving,Own-child,White,Male,58,United-States,<=50K
2,25,Without-pay,Some-college,Never-married,Tech-support,Not-in-family,Black,Male,44,United-States,<=50K
3,33,?,Some-college,Divorced,Other-service,Unmarried,White,Male,47,United-States,<=50K
4,33,Private,Bachelors,Married-civ-spouse,Sales,Husband,White,Male,26,United-States,>50K


We still might have to do some post-processing to fix inconsistencies, reorder columns or add columns.

In [9]:
# post-process data
# NOTE(review): df_synth is rebound to the post-processed result, so re-running this
# cell feeds already post-processed data back in — confirm the step is idempotent
df_synth = synth_pipe.postprocess_synthetic_data(df_synth)

df_synth.head()

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,income
0,45,Private,HS-grad,Married-civ-spouse,Sales,Husband,White,Male,47,United-States,>50K
1,22,Private,Bachelors,Never-married,Transport-moving,Own-child,White,Male,58,United-States,<=50K
2,25,Without-pay,Some-college,Never-married,Tech-support,Not-in-family,Black,Male,44,United-States,<=50K
3,33,?,Some-college,Divorced,Other-service,Unmarried,White,Male,47,United-States,<=50K
4,33,Private,Bachelors,Married-civ-spouse,Sales,Husband,White,Male,26,United-States,>50K


In [12]:
# (rows, columns) of the processed training split
df_train.shape

(26048, 11)

SynthPipe allows you to perform a full synthesis pipeline step by step and inspect what happens along the way. However, you can also use run() to perform all steps in order at once.

In [19]:
# choose generator
# a fresh plugin instance; this rebinds `generator` from the step-by-step example
generator = Plugins().get('privbayes-dk', epsilon=EPSILON)

# initialize custom pipeline for dataset
# NOTE(review): output_train_format is not set in the step-by-step example above;
# presumably it controls the format of the generated output — confirm in AdultSynthPipe
synth_pipe = AdultSynthPipe(generator=generator, output_train_format=True)

# run all pipeline steps in one call on the raw frame
# NOTE(review): the return value is only displayed, not stored
synth_pipe.run(df_adult)

[2023-11-20T12:05:05.018649+0100][22224][CRITICAL] module disabled: /Users/dknoors/miniconda3/envs/synth/lib/python3.9/site-packages/synthcity/plugins/generic/plugin_goggle.py


Using data loader for generic
1/11 - Root of network: race

2/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 10
Selected attribute: 'relationship' - with parents: ('race',)

3/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 13
Selected attribute: 'marital-status' - with parents: ('relationship',)

4/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 21
Selected attribute: 'sex' - with parents: ('relationship', 'marital-status')

5/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 24
Selected attribute: 'age' - with parents: ('sex', 'marital-status')

6/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 27
Selected attribute: 'occupation' - with parents: ('sex', 'age')

7/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 29
Selected attribute

Note: The PrivBayes implementation of the synthetic-data-generation library (DK) is added to plugins. This version of PrivBayes differs from the one included in synthcity:
- R score function instead of Mutual Information - which has a lower sensitivity and thus requires less noise to compute.
- Candidate attribute-parent pairs (AP-pairs) are determined based on the theta-usefulness criterion instead of setting a fixed max degree K.