# Generate synthetic data
Generating synthetic data using public libraries for the 'adult' dataset.

## Load libraries and define settings

In [1]:
# general dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
from pathlib import Path

In [16]:
# package
from crnsynth.generators.marginal_generator import MarginalGenerator
from crnsynth.generators.privbayes import PrivBayes

from crnsynth.processing import preprocessing, postprocessing
from crnsynth.synthesization import synthesization, pipeline
from crnsynth.processing.generalization import NumericGeneralizationMech

In [3]:
from examples.synthesization import adult_synthesis

In [4]:
# reload edited local modules automatically before each cell execution,
# so changes to the package are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

# raise the pandas display limits so the wide tables below are not truncated
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 200)

In [5]:
# privacy budget for differentially private algorithms; passed as `epsilon` to the
# generators below (the generalization mechanisms get their own, separate epsilon)
EPSILON = 1

# seed passed as `random_state` to the train/holdout split and the synth pipeline
RANDOM_STATE = 42
# passed as `verbose` to PrivBayes and the synth pipeline
VERBOSE = 1

## Load and process dataset

In [6]:
# read the adult dataset from the path configured in the example module
df_adult = pd.read_csv(adult_synthesis.PATH_ADULT)
# preview the first rows
df_adult.head()


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## Generate
Here we show, step by step, how to generate synthetic data.

In [7]:
# define your generator - in this case the simple marginal generator,
# which is given the privacy budget EPSILON
generator = MarginalGenerator(epsilon=EPSILON)

# train the generator on the raw input data (no preprocessing applied yet)
generator.fit(df_adult)

# generate synthetic data with a desired number of rows
# NOTE: the name `generator` is re-bound to a PrivBayes instance further down
df_synth_marginal = generator.generate(n_records=1000)
df_synth_marginal.head()

Marginal fitted: age
Marginal fitted: workclass
Marginal fitted: fnlwgt
Marginal fitted: education
Marginal fitted: education-num
Marginal fitted: marital-status
Marginal fitted: occupation
Marginal fitted: relationship
Marginal fitted: race
Marginal fitted: sex
Marginal fitted: capital-gain
Marginal fitted: capital-loss
Marginal fitted: hours-per-week
Marginal fitted: native-country
Marginal fitted: income
Column sampled: age
Column sampled: workclass
Column sampled: fnlwgt
Column sampled: education
Column sampled: education-num
Column sampled: marital-status
Column sampled: occupation
Column sampled: relationship
Column sampled: race
Column sampled: sex
Column sampled: capital-gain
Column sampled: capital-loss
Column sampled: hours-per-week
Column sampled: native-country
Column sampled: income


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,27,Local-gov,709798,Prof-school,10,Married-civ-spouse,Sales,Husband,White,Female,0,0,40,?,<=50K
1,33,Private,483450,Bachelors,10,Divorced,Craft-repair,Not-in-family,Black,Male,0,3900,40,United-States,<=50K
2,38,Private,188260,Bachelors,5,Never-married,Transport-moving,Husband,White,Female,0,0,25,United-States,<=50K
3,36,Private,204085,HS-grad,10,Separated,Protective-serv,Own-child,White,Male,0,0,50,United-States,>50K
4,60,Private,335716,11th,5,Never-married,Prof-specialty,Not-in-family,Black,Female,0,2174,12,United-States,<=50K


## Improving the quality of the synthetic data
Additional processing steps can be added to the generation process to improve the quality of the synthetic data.

In [8]:
def preprocess_real_data(data_real: pd.DataFrame) -> pd.DataFrame:
    """Restrict the real data to the columns used for synthesis.

    Args:
        data_real: real input data; must contain every column listed below,
            otherwise pandas raises a KeyError.

    Returns:
        Data frame holding only the selected columns, in the listed order.
        All other columns of the input are dropped.
    """
    # columns to keep - everything else is left out of the synthesis
    keep_columns = (
        "age",
        "workclass",
        "education",
        "marital-status",
        "occupation",
        "relationship",
        "race",
        "sex",
        "hours-per-week",
        "native-country",
        "income",
    )
    return data_real[list(keep_columns)]

# create copy to retain original data format
df = df_adult.copy()

# split in training and holdout set - train is used for fitting the generator, holdout for evaluation
# (the split is seeded with RANDOM_STATE; 20% of the rows go to the holdout set)
df_train, df_holdout = preprocessing.split_train_holdout(df, target_column='income', holdout_size=0.2, random_state=RANDOM_STATE)

# preprocess the data - only the training set is reduced to the column subset here
df_train = preprocess_real_data(df_train)
df_train.head()

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,income
15738,32,Private,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,45,United-States,>50K
27985,43,Private,Masters,Never-married,Exec-managerial,Not-in-family,White,Female,45,United-States,<=50K
30673,20,?,HS-grad,Never-married,?,Not-in-family,White,Female,28,United-States,<=50K
9505,40,Local-gov,Some-college,Divorced,Transport-moving,Unmarried,White,Male,40,United-States,<=50K
26417,24,Private,Bachelors,Never-married,Prof-specialty,Own-child,White,Male,40,United-States,<=50K


Reducing the dimensionality of the data can help the generator to learn the underlying distribution better. We can generalize the data by binning the numerical columns.

In [9]:
# specify generalization mechanisms: bin each numeric column into 5 bins within
# its configured bounds; every mechanism is given its own privacy budget.
# random_state is fixed (as in the pipeline cell at the end of this notebook)
# so that re-running the notebook gives the same result
generalizers = [
    NumericGeneralizationMech(
        column="age",
        epsilon=0.05,
        bins=5,
        bounds=adult_synthesis.AGE_BOUNDS,
        random_state=RANDOM_STATE,
    ),
    NumericGeneralizationMech(
        column="hours-per-week",
        epsilon=0.05,
        bins=5,
        bounds=adult_synthesis.HOURS_PER_WEEK_BOUNDS,
        random_state=RANDOM_STATE,
    ),
]

# apply generalization mechanisms; the returned mechanisms are kept because they
# are passed to reverse_generalization later on
df_train_input, generalizers = preprocessing.generalize_data(df_train, generalizers=generalizers)
df_train_input.head()

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,income
15738,1,Private,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,2,United-States,>50K
27985,1,Private,Masters,Never-married,Exec-managerial,Not-in-family,White,Female,2,United-States,<=50K
30673,0,?,HS-grad,Never-married,?,Not-in-family,White,Female,1,United-States,<=50K
9505,1,Local-gov,Some-college,Divorced,Transport-moving,Unmarried,White,Male,1,United-States,<=50K
26417,0,Private,Bachelors,Never-married,Prof-specialty,Own-child,White,Male,1,United-States,<=50K


Fit the generator on the preprocessed data and generate synthetic data.

In [10]:
# fit the PrivBayes generator on the generalized training data
generator = PrivBayes(epsilon=EPSILON, verbose=VERBOSE)
generator.fit(df_train_input)

1/11 - Root of network: education

2/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 10
Selected attribute: 'income' - with parents: ('education',)

3/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 11
Selected attribute: 'relationship' - with parents: ('income', 'education')

4/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 15
Selected attribute: 'marital-status' - with parents: ('relationship', 'income')

5/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 18
Selected attribute: 'sex' - with parents: ('relationship', 'education')

6/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 23
Selected attribute: 'age' - with parents: ('marital-status', 'sex', 'income')

7/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 27
Selected attribute:

  self.values = self.values / row_sum


In [11]:
# sample as many synthetic records as there are rows in the original dataset
df_synth = generator.generate(n_records=len(df))
df_synth.head()

Number of records generated: 2988 / 32561

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Number of records generated: 11715 / 32561

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Number of records generated: 20627 / 32561

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Number of records generated: 29633 / 32561

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



The synthetic data still has generalized values. We can reverse the generalization by sampling within the bounds of the generalized value to retrieve the original data structure.

In [12]:
# map the generalized (binned) columns of the synthetic data back to values,
# using the mechanisms returned by generalize_data
df_synth_inv, generalizers = postprocessing.reverse_generalization(df_synth, generalizers)
df_synth_inv.head()

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,income
0,28,Private,HS-grad,Married-civ-spouse,Adm-clerical,Wife,White,Female,17,United-States,<=50K
1,50,Self-emp-not-inc,Some-college,Married-civ-spouse,Exec-managerial,Husband,White,Male,39,United-States,>50K
2,44,Private,Assoc-voc,Married-civ-spouse,Exec-managerial,Own-child,White,Male,34,United-States,>50K
3,44,Private,Some-college,Married-spouse-absent,Exec-managerial,Husband,Amer-Indian-Eskimo,Male,53,United-States,>50K
4,26,Private,10th,Never-married,Other-service,Not-in-family,White,Female,30,United-States,<=50K


Additional postprocessing steps can be added to the pipeline to improve the quality of the synthetic data.

In [13]:
def postprocess_synth_data(data_synth: pd.DataFrame) -> pd.DataFrame:
    """Post-process the synthetic data.

    Adds an ``id`` column as the first column to notify the user that this is
    synthetic data (``FAKE_PERSON_1``, ``FAKE_PERSON_2``, ...).

    The input frame is not modified; a new frame is returned. An ``id`` column
    that is already present in the input is replaced, so the function can be
    applied repeatedly with the same result.

    Args:
        data_synth: synthetic data.

    Returns:
        New data frame with the ``id`` column first, followed by the remaining
        columns in their original order.
    """
    # identifiers are 1-based and follow the row order of the input
    id_string = 'FAKE_PERSON_'
    identifiers = [id_string + str(i) for i in range(1, data_synth.shape[0] + 1)]

    # work on a copy so the caller's frame is left untouched, and leave out any
    # existing id column so the new one always ends up first
    other_columns = [col for col in data_synth.columns if col != 'id']
    data_out = data_synth[other_columns].copy()
    data_out.insert(0, 'id', identifiers)
    return data_out

# apply the post-processing to the synthetic data with reversed generalization
df_synth_final = postprocess_synth_data(df_synth_inv)
df_synth_final.head()

Unnamed: 0,id,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,income
0,FAKE_PERSON_1,28,Private,HS-grad,Married-civ-spouse,Adm-clerical,Wife,White,Female,17,United-States,<=50K
1,FAKE_PERSON_2,50,Self-emp-not-inc,Some-college,Married-civ-spouse,Exec-managerial,Husband,White,Male,39,United-States,>50K
2,FAKE_PERSON_3,44,Private,Assoc-voc,Married-civ-spouse,Exec-managerial,Own-child,White,Male,34,United-States,>50K
3,FAKE_PERSON_4,44,Private,Some-college,Married-spouse-absent,Exec-managerial,Husband,Amer-Indian-Eskimo,Male,53,United-States,>50K
4,FAKE_PERSON_5,26,Private,10th,Never-married,Other-service,Not-in-family,White,Female,30,United-States,<=50K


## Synth Pipeline
All these steps can be combined in a single function to create a pipeline for generating synthetic data. Note this function provides a consistent structure for creating synthetic data, but can also serve as an example for creating your own pipeline in case you need a bit more flexibility.

In [17]:
# run the complete synthesis in one call: split, preprocess, generalize,
# fit the generator, generate, reverse the generalization and postprocess
df = df_adult.copy()
generator = PrivBayes(epsilon=EPSILON, verbose=VERBOSE)

# generalization mechanisms for the numeric columns, seeded for reproducibility
generalizers = [
    NumericGeneralizationMech(
        column="age",
        epsilon=0.05,
        bins=5,
        bounds=adult_synthesis.AGE_BOUNDS,
        inverse='truncated_normal',
        random_state=RANDOM_STATE,
    ),
    NumericGeneralizationMech(
        column="hours-per-week",
        epsilon=0.05,
        bins=5,
        bounds=adult_synthesis.HOURS_PER_WEEK_BOUNDS,
        inverse='truncated_normal',
        random_state=RANDOM_STATE,
    ),
]

output = pipeline.run_synth_pipeline(
    data_real=df,
    generator=generator,
    preprocess_func=adult_synthesis.preprocess_real_data,
    generalizers=generalizers,
    postprocess_func=adult_synthesis.postprocess_synthetic_data,
    holdout_size=0.2,
    target_column='income',
    random_state=RANDOM_STATE,
    n_records=1000,
    output_keys=["train", "holdout", "synth", "generators"],
    verbose=VERBOSE,
)

# preview the generated synthetic records
output['synth'].head()

Fitting generator PrivBayes({'epsilon': 1, 'verbose': 1, 'theta_usefulness': 4, 'epsilon_split': 0.3, 'score_function': 'R', 'network_init': None, 'n_cpus': None}) on input data
1/11 - Root of network: native-country

2/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 10
Selected attribute: 'sex' - with parents: ('native-country',)

3/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 9
Selected attribute: 'relationship' - with parents: ('sex',)

4/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 9
Selected attribute: 'marital-status' - with parents: ('relationship', 'sex')

5/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 14
Selected attribute: 'age' - with parents: ('marital-status', 'sex')

6/11 - Evaluating next attribute to add to network


  return Factor(marginal, states=states)
  return Factor(marginal, states=states)
  return Factor(self.values, self.states)
  codes = cartesian_product(codes)
  return [
  f2 = Factor(complete_value, states)
  codes = cartesian_product(codes)
  return [
  return Factor(values, states=states)
  return Factor(self.values, self.states)
  return Factor(marginal, states=states)
  return Factor(marginal, states=states)
  return Factor(self.values, self.states)
  codes = cartesian_product(codes)
  return [
  f2 = Factor(complete_value, states)
  codes = cartesian_product(codes)
  return [
  return Factor(values, states=states)
  return Factor(self.values, self.states)
  return Factor(marginal, states=states)
  codes = cartesian_product(codes)
  return [
  f2 = Factor(complete_value, states)
  codes = cartesian_product(codes)
  return [
  return Factor(values, states=states)
  return Factor(self.values, self.states)
  codes = cartesian_product(codes)
  return [
  f2 = Factor(complete_value, st

Number of AttributeParentPair candidates: 23
Selected attribute: 'occupation' - with parents: ('sex', 'age')

7/11 - Evaluating next attribute to add to network


  return Factor(marginal, states=states)
  codes = cartesian_product(codes)
  return [
  f2 = Factor(complete_value, states)
  codes = cartesian_product(codes)
  return [
  return Factor(values, states=states)
  return Factor(self.values, self.states)
  codes = cartesian_product(codes)
  return [
  f2 = Factor(complete_value, states)
  codes = cartesian_product(codes)
  return [
  return Factor(values, states=states)
  return Factor(self.values, self.states)
  return Factor(marginal, states=states)
  codes = cartesian_product(codes)
  return [
  f2 = Factor(complete_value, states)
  codes = cartesian_product(codes)
  return [
  return Factor(values, states=states)
  return Factor(self.values, self.states)
  codes = cartesian_product(codes)
  return [
  f2 = Factor(complete_value, states)
  codes = cartesian_product(codes)
  return [
  return Factor(values, states=states)
  return Factor(self.values, self.states)
  return Factor(marginal, states=states)
  codes = cartesian_product(codes

Number of AttributeParentPair candidates: 26
Selected attribute: 'workclass' - with parents: ('occupation',)

8/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 28


  return Factor(values, states=states)
  return Factor(self.values, self.states)
  codes = cartesian_product(codes)
  return [
  f2 = Factor(complete_value, states)
  codes = cartesian_product(codes)
  return [
  return Factor(values, states=states)
  return Factor(self.values, self.states)
  return Factor(marginal, states=states)
  codes = cartesian_product(codes)
  return [
  f2 = Factor(complete_value, states)
  codes = cartesian_product(codes)
  return [
  return Factor(values, states=states)
  return Factor(self.values, self.states)
  codes = cartesian_product(codes)
  return [
  f2 = Factor(complete_value, states)
  codes = cartesian_product(codes)
  return [
  return Factor(values, states=states)
  return Factor(self.values, self.states)
  return Factor(marginal, states=states)
  codes = cartesian_product(codes)
  return [
  f2 = Factor(complete_value, states)
  codes = cartesian_product(codes)
  return [
  return Factor(values, states=states)
  return Factor(self.values, self.s

Selected attribute: 'income' - with parents: ('relationship', 'occupation')

9/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 23
Selected attribute: 'hours-per-week' - with parents: ('sex', 'occupation')

10/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 21


  codes = cartesian_product(codes)
  return [
  f2 = Factor(complete_value, states)
  codes = cartesian_product(codes)
  return [
  return Factor(values, states=states)
  return Factor(self.values, self.states)
  return Factor(marginal, states=states)
  codes = cartesian_product(codes)
  return [
  f2 = Factor(complete_value, states)
  codes = cartesian_product(codes)
  return [
  return Factor(values, states=states)
  return Factor(self.values, self.states)
  codes = cartesian_product(codes)
  return [
  f2 = Factor(complete_value, states)
  codes = cartesian_product(codes)
  return [
  return Factor(values, states=states)
  return Factor(self.values, self.states)
  return Factor(marginal, states=states)
  codes = cartesian_product(codes)
  return [
  f2 = Factor(complete_value, states)
  codes = cartesian_product(codes)
  return [
  return Factor(values, states=states)
  return Factor(self.values, self.states)
  codes = cartesian_product(codes)
  return [
  f2 = Factor(complete_value

Selected attribute: 'education' - with parents: ('income', 'age')

11/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 14
Selected attribute: 'race' - with parents: ('relationship', 'hours-per-week')

Learned Network Structure

Learning conditional probabilities: native-country - with parents None ~ estimated size: 42
Learning conditional probabilities: sex - with parents ('native-country',) ~ estimated size: 84
Learning conditional probabilities: relationship - with parents ('sex',) ~ estimated size: 12
Learning conditional probabilities: marital-status - with parents ('relationship', 'sex') ~ estimated size: 84
Learning conditional probabilities: age - with parents ('marital-status', 'sex') ~ estimated size: 70
Learning conditional probabilities: occupation - with parents ('sex', 'age') ~ estimated size: 150
Learning conditional probabilities: workclass - with parents ('occupation',) ~ estimated size: 135
Learning conditional probabilities: i

  codes = cartesian_product(codes)
  return [
  f2 = Factor(complete_value, states)
  codes = cartesian_product(codes)
  return [
  return Factor(values, states=states)
  return Factor(self.values, self.states)
  return Factor(marginal, states=states)
  codes = cartesian_product(codes)
  return [
  f2 = Factor(complete_value, states)
  codes = cartesian_product(codes)
  return [
  return Factor(values, states=states)
  return Factor(self.values, self.states)
  codes = cartesian_product(codes)
  return [
  f2 = Factor(complete_value, states)
  codes = cartesian_product(codes)
  return [
  return Factor(values, states=states)
  return Factor(self.values, self.states)
  return Factor(marginal, states=states)
  codes = cartesian_product(codes)
  return [
  f2 = Factor(complete_value, states)
  codes = cartesian_product(codes)
  return [
  return Factor(values, states=states)
  return Factor(self.values, self.states)
  codes = cartesian_product(codes)
  return [
  f2 = Factor(complete_value

Number of records generated: 1000 / 1000
Synthetic Data Generated


Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,income
0,37,Private,HS-grad,Divorced,Machine-op-inspct,Not-in-family,White,Male,46,United-States,<=50K
1,45,Private,HS-grad,Divorced,Craft-repair,Not-in-family,Black,Male,39,United-States,<=50K
2,29,Federal-gov,12th,Never-married,Adm-clerical,Not-in-family,White,Female,36,United-States,<=50K
3,40,State-gov,Assoc-acdm,Married-civ-spouse,Protective-serv,Husband,White,Male,65,United-States,>50K
4,34,Private,HS-grad,Divorced,Tech-support,Own-child,Asian-Pac-Islander,Female,26,United-States,<=50K
