# Generate synthetic data
Generating synthetic data using public libraries for the 'adult' dataset.

## Load libraries and define settings

In [1]:
# general dependencies
import numpy as np
import pandas as pd
import sys
import os
from pathlib import Path

# synthesis methods
from synthcity.plugins import Plugins
from synthcity.plugins.core.dataloader import GenericDataLoader
from synthcity.benchmark import Benchmarks

from synthesis.synthesizers.privbayes import PrivBayes
from synthesis.evaluation.metrics import MarginalComparison, AssociationsComparison

# repo code
from crnsynth.configs import config
from crnsynth.synth.custom_generators.privbayes_dk import PrivBayesDK
from crnsynth.synth.custom_generators.marginal_dk import MarginalDK
from crnsynth.synth.custom_generators.uniform_dk import UniformDK

from crnsynth.process import util
from crnsynth.evaluation import visual
from crnsynth.synth.synthpipe import BaseSynthPipe

# gives extra pandas methods for dataframes: bin_numeric_column, sample_from_binned_column
from synthesis.transformers import deidentification, generalization


  from .autonotebook import tqdm as notebook_tqdm
<stdin>:1:10: fatal error: 'omp.h' file not found
#include <omp.h>
         ^~~~~~~
1 error generated.




[2023-12-19T15:33:37.375822+0100][23092][CRITICAL] module disabled: /Users/dknoors/miniconda3/envs/synth/lib/python3.9/site-packages/synthcity/plugins/generic/plugin_goggle.py
[2023-12-19T15:33:38.464302+0100][23092][CRITICAL] module disabled: /Users/dknoors/miniconda3/envs/synth/lib/python3.9/site-packages/synthcity/plugins/generic/plugin_goggle.py
[2023-12-19T15:33:38.465193+0100][23092][CRITICAL] module disabled: /Users/dknoors/miniconda3/envs/synth/lib/python3.9/site-packages/synthcity/plugins/generic/plugin_goggle.py


In [2]:
# autoreload changes from local files
%load_ext autoreload
%autoreload 2

# pandas show full output
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 200)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# privacy budget for differentially private algorithms
EPSILON = 1

# dataset
DATASET_NAME = 'adult'

## Load and process dataset

In [5]:
df_adult = pd.read_csv(config.PATH_DATA[DATASET_NAME])
df_adult.head()


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [12]:
from diffprivlib.tools.utils import mean as dp_mean

dp_mean(df_adult['age'], epsilon=0.01, bounds=(0, 100))



38.475845282109844

In [10]:
df_adult['age'].describe()

count    32561.000000
mean        38.581647
std         13.640433
min         17.000000
25%         28.000000
50%         37.000000
75%         48.000000
max         90.000000
Name: age, dtype: float64

## Generate
Here we show how step by step how to generate synthetic data.

We define our synthesis pipeline, this will help us perform all the steps the right order. You can inherit common functionality across synthesis pipelines, while also adding your own data-specific methods to it. 

This ensures that you can benefit from a defined structure without needing to re-implement the generic elements, while still having the flexbility to customize it to your needs.

In [22]:
class AdultSynthPipe(BaseSynthPipe):
    def __init__(
        self,
        generator,
        data_name="adult",
        target_column="income",
        test_size=0.2,
        output_train_format=False,
        generalize=True,
        data_loader_name="generic",
        random_state=None,
        warn=True,
        verbose=2,
    ) -> None:
        super().__init__(
            generator=generator,
            data_name=data_name,
            target_column=target_column,
            test_size=test_size,
            output_train_format=output_train_format,
            generalize=generalize,
            data_loader_name=data_loader_name,
            random_state=random_state,
            warn=warn,
            verbose=verbose,
        )

    def process_data(self, data_real):
        # reduce columns
        columns_subset = [
            "age",
            "workclass",
            "education",
            "marital-status",
            "occupation",
            "relationship",
            "race",
            "sex",
            "hours-per-week",
            "native-country",
            "income",
        ]
        data_real = data_real[columns_subset]

        return super().process_data(data_real)

    def _generalize_data(self, data_real):
        data_real = data_real.bin_numeric_column(
            column_name="age", n_bins=5, col_min=17, col_max=90, strategy="quantile"
        ).bin_numeric_column(
            column_name="hours-per-week",
            n_bins=5,
            col_min=1,
            col_max=99,
            strategy="quantile",
        )
        return super()._generalize_data(data_real)

    def _reverse_generalization(self, data_synth):
        data_synth = data_synth.sample_from_binned_column(
            column_name="age",
            numeric_type="int",
            mean=38,
            std=13,
            random_state=self.random_state,
        ).sample_from_binned_column(
            column_name="hours-per-week",
            numeric_type="int",
            mean=40,
            std=10,
            random_state=self.random_state,
        )
        return super()._reverse_generalization(data_synth)


# choose generator
# generator = Plugins().get('privbayes-dk', epsilon=EPSILON)
generator = Plugins().get('marginal-dk', epsilon=EPSILON)

# initialize custom pipeline for dataset
synth_pipe = AdultSynthPipe(generator=generator)

[2023-11-29T21:24:14.952294+0100][4618][CRITICAL] module disabled: /Users/dknoors/miniconda3/envs/synth/lib/python3.9/site-packages/synthcity/plugins/generic/plugin_goggle.py


Now we will prepare the data and split into training and testing. It's good to leave some holdout data that that can be used to compare the perfomance of models trained on real data and synthetic data.

In [23]:
# create copy to retain original data format
df = df_adult.copy()

# prepare data
df_train, df_test = synth_pipe.process_data(df)
df_train.head()

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,income
29101,19,?,Some-college,Never-married,?,Other-relative,White,Female,30,United-States,<=50K
10606,50,Private,Some-college,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,40,United-States,<=50K
9414,62,Private,Some-college,Married-civ-spouse,Exec-managerial,Husband,White,Male,40,United-States,<=50K
1747,20,Private,HS-grad,Never-married,Adm-clerical,Not-in-family,White,Female,40,United-States,<=50K
10677,25,Private,Bachelors,Never-married,Adm-clerical,Own-child,White,Female,40,United-States,<=50K


Train the generator on the processed training data. It might give some warning on categoreis that occur quite infrequently and might be a privacy risk to leave in your data. Decide how you would like to deal with them or proceed.

In [24]:
# train generator
synth_pipe.fit(df_train)

Bins whose width are too small (i.e., <= 1e-8). Consider decreasing the number of bins.


Using data loader for generic
Marginal fitted: age
Marginal fitted: workclass
Marginal fitted: education
Marginal fitted: marital-status
Marginal fitted: occupation
Marginal fitted: relationship
Marginal fitted: race
Marginal fitted: sex
Marginal fitted: hours-per-week
Marginal fitted: native-country
Marginal fitted: income


After training the synthesizer, we can now sample records from it to generate a synthetic dataset.

In [25]:
# generate synthetic data
df_synth = synth_pipe.generate(n_records=None)
df_synth.head()

Column sampled: age
Column sampled: workclass
Column sampled: education
Column sampled: marital-status
Column sampled: occupation
Column sampled: relationship
Column sampled: race
Column sampled: sex
Column sampled: hours-per-week
Column sampled: native-country
Column sampled: income
Column sampled: age
Column sampled: workclass
Column sampled: education
Column sampled: marital-status
Column sampled: occupation
Column sampled: relationship
Column sampled: race
Column sampled: sex
Column sampled: hours-per-week
Column sampled: native-country
Column sampled: income


Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,income
0,20,Private,HS-grad,Married-civ-spouse,Adm-clerical,Own-child,White,Male,49,United-States,>50K
1,47,Private,11th,Married-civ-spouse,Craft-repair,Own-child,White,Male,40,United-States,<=50K
2,22,Private,HS-grad,Never-married,Transport-moving,Own-child,White,Male,54,United-States,<=50K
3,18,Private,11th,Divorced,Craft-repair,Wife,White,Male,43,United-States,<=50K
4,36,Private,HS-grad,Never-married,Priv-house-serv,Husband,White,Female,31,United-States,<=50K


We still might have to do some post-processing to fix inconsistencies, reorder columns or add columns.

In [26]:
# post-process data
df_synth = synth_pipe.postprocess_synthetic_data(df_synth)

df_synth.head()

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,income
0,20,Private,HS-grad,Married-civ-spouse,Adm-clerical,Own-child,White,Male,49,United-States,>50K
1,47,Private,11th,Married-civ-spouse,Craft-repair,Own-child,White,Male,40,United-States,<=50K
2,22,Private,HS-grad,Never-married,Transport-moving,Own-child,White,Male,54,United-States,<=50K
3,18,Private,11th,Divorced,Craft-repair,Wife,White,Male,43,United-States,<=50K
4,36,Private,HS-grad,Never-married,Priv-house-serv,Husband,White,Female,31,United-States,<=50K


In [27]:
df_train.shape

(26048, 11)

SynthPipe allows you to perform a full synthesis pipeline step by step and allow you to inspect what happens. However, you can also use run() to perform all steps in order at once.

In [15]:
# choose generator
generator = Plugins().get('privbayes-dk', epsilon=EPSILON)

# initialize custom pipeline for dataset
synth_pipe = AdultSynthPipe(generator=generator, output_train_format=True)

synth_pipe.run(df_adult)

[2023-11-29T21:23:29.177221+0100][4618][CRITICAL] module disabled: /Users/dknoors/miniconda3/envs/synth/lib/python3.9/site-packages/synthcity/plugins/generic/plugin_goggle.py
Bins whose width are too small (i.e., <= 1e-8). Consider decreasing the number of bins.


Using data loader for generic
1/11 - Root of network: education

2/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 10
Selected attribute: 'income' - with parents: ('education',)

3/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 11
Selected attribute: 'marital-status' - with parents: ('income',)

4/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 15
Selected attribute: 'relationship' - with parents: ('income', 'marital-status')

5/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 19
Selected attribute: 'sex' - with parents: ('income', 'relationship', 'marital-status')

6/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 24
Selected attribute: 'age' - with parents: ('sex', 'income', 'marital-status')

7/11 - Evaluating next attribute to add to network
Number of AttributeParentPair

invalid value encountered in divide
invalid value encountered in divide
invalid value encountered in divide


Number of records generated: 26048 / 26048
Synthetic Data Generated

Number of records generated: 26048 / 26048
Synthetic Data Generated



Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,income
0,39,Local-gov,Assoc-acdm,Married-civ-spouse,Protective-serv,Husband,White,Male,45,United-States,<=50K
1,46,State-gov,Some-college,Never-married,Tech-support,Own-child,White,Female,48,United-States,<=50K
2,42,Without-pay,10th,Married-civ-spouse,Tech-support,Husband,White,Male,45,United-States,<=50K
3,57,?,11th,Divorced,Other-service,Unmarried,White,Female,38,Nicaragua,<=50K
4,50,Private,Some-college,Married-civ-spouse,Sales,Husband,White,Male,44,United-States,>50K
...,...,...,...,...,...,...,...,...,...,...,...
26043,40,Private,Assoc-voc,Never-married,Machine-op-inspct,Own-child,Black,Male,47,United-States,<=50K
26044,29,Private,HS-grad,Married-civ-spouse,Craft-repair,Husband,White,Male,34,Taiwan,<=50K
26045,50,Private,HS-grad,Widowed,Adm-clerical,Not-in-family,Amer-Indian-Eskimo,Female,43,United-States,<=50K
26046,40,Self-emp-not-inc,Some-college,Separated,?,Other-relative,Other,Male,33,United-States,<=50K


Note: PrivBayes implementation of synthetic-data-generation library (DK) is added to plugins. This version of privbayes differs with the one included synthcity:
- R score function instead of Mutual Information - which has a lower sensitivity and thus requires less noise to compute.
- Candidate attribute-parent pairs (AP-pairs) are determined based on the theta-usefulness criterion instead of setting a fixed max degree K.