# Generate synthetic data
Generating synthetic data using public libraries for the 'adult' dataset.

## Load libraries and define settings

In [74]:
# general dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
from pathlib import Path

In [75]:
# package
from crnsynth2.generators.marginal_generator import MarginalGenerator
from crnsynth2.generators.privbayes import PrivBayes

from crnsynth2.process import preprocessing, generalization, synthesization, postprocessing

In [72]:
from examples.adult import adult_synthesis

In [33]:
# Reload edited local modules automatically before code is executed, so changes
# to project files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Raise the pandas display limits (up to 500 rows / 200 columns) so previews
# of wide frames are not truncated.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 200)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [42]:
# privacy budget (epsilon) passed to the differentially private generators below
EPSILON = 1

# name of the dataset used throughout this notebook
DATASET_NAME = 'adult'

# RANDOM_STATE seeds the train/holdout split and the generalizers;
# VERBOSE is the verbosity level handed to the generators and the pipeline
RANDOM_STATE = 42
VERBOSE = 1

## Load and process dataset

In [73]:
# read the adult dataset from the path configured in the example module and preview it
df_adult = pd.read_csv(adult_synthesis.PATH_ADULT)
df_adult.head()


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## Generate
Here we show, step by step, how to generate synthetic data.

In [39]:
# define your generator - in this case the simple marginal generator,
# configured with the notebook-wide privacy budget
generator = MarginalGenerator(epsilon=EPSILON)

# train the generator on the raw input data (all columns, no preprocessing yet)
generator.fit(df_adult)

# generate synthetic data with a desired number of rows and preview the result
df_synth_marginal = generator.generate(n_records=1000)
df_synth_marginal.head()

Marginal fitted: age
Marginal fitted: workclass
Marginal fitted: fnlwgt
Marginal fitted: education
Marginal fitted: education-num
Marginal fitted: marital-status
Marginal fitted: occupation
Marginal fitted: relationship
Marginal fitted: race
Marginal fitted: sex
Marginal fitted: capital-gain
Marginal fitted: capital-loss
Marginal fitted: hours-per-week
Marginal fitted: native-country
Marginal fitted: income
Column sampled: age
Column sampled: workclass
Column sampled: fnlwgt
Column sampled: education
Column sampled: education-num
Column sampled: marital-status
Column sampled: occupation
Column sampled: relationship
Column sampled: race
Column sampled: sex
Column sampled: capital-gain
Column sampled: capital-loss
Column sampled: hours-per-week
Column sampled: native-country
Column sampled: income


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,24,Federal-gov,253354,HS-grad,13,Married-civ-spouse,Sales,Wife,White,Female,0,0,70,United-States,<=50K
1,25,Private,100270,Some-college,9,Never-married,Craft-repair,Husband,White,Male,0,2057,45,United-States,<=50K
2,68,Private,203505,HS-grad,13,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,18,United-States,<=50K
3,24,Federal-gov,212091,Some-college,13,Married-civ-spouse,Adm-clerical,Own-child,Asian-Pac-Islander,Male,10520,0,35,United-States,>50K
4,31,Private,186191,11th,10,Married-civ-spouse,Machine-op-inspct,Own-child,White,Female,0,0,40,United-States,>50K


## Improving the quality of the synthetic data
Additional processing steps can be added to the generation process to improve the quality of the synthetic data.

In [47]:
def preprocess_real_data(data_real: pd.DataFrame) -> pd.DataFrame:
    """Restrict the real data to the columns used for synthesis.

    Args:
        data_real: Input frame; it must contain every selected column.

    Returns:
        A frame holding only the selected columns, in the order listed below.

    Raises:
        KeyError: If one of the selected columns is missing from ``data_real``.
    """
    # columns kept for fitting the generator; everything else is dropped
    selected_columns = (
        "age",
        "workclass",
        "education",
        "marital-status",
        "occupation",
        "relationship",
        "race",
        "sex",
        "hours-per-week",
        "native-country",
        "income",
    )
    return data_real[list(selected_columns)]

# create copy to retain original data format
df = df_adult.copy()

# split in training and holdout set - train is used for fitting the generator, holdout for evaluation
# (the split is seeded with RANDOM_STATE; holdout_size=0.2 is presumably the holdout fraction - confirm)
df_train, df_holdout = preprocessing.split_train_holdout(df, target_column='income', holdout_size=0.2, random_state=RANDOM_STATE)

# preprocess the training data only; df_holdout keeps all original columns at this point
df_train = preprocess_real_data(df_train)
df_train.head()

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,income
15738,32,Private,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,45,United-States,>50K
27985,43,Private,Masters,Never-married,Exec-managerial,Not-in-family,White,Female,45,United-States,<=50K
30673,20,?,HS-grad,Never-married,?,Not-in-family,White,Female,28,United-States,<=50K
9505,40,Local-gov,Some-college,Divorced,Transport-moving,Unmarried,White,Male,40,United-States,<=50K
26417,24,Private,Bachelors,Never-married,Prof-specialty,Own-child,White,Male,40,United-States,<=50K


Reducing the dimensionality of the data can help the generator to learn the underlying distribution better. We can generalize the data by binning the numerical columns.

In [49]:
# specify generalization mechanisms: one per numeric column, each configured with
# 5 bins, epsilon=0.05 and the column bounds defined in the example module.
# The bounds are read from `adult_synthesis` (the module imported above) so the
# cell no longer depends on an `adult_config` name that is never imported.
# NOTE(review): NumericGeneralizationMech is not imported in any visible cell -
# add its import to the imports cell so this runs on a fresh kernel.
generalizers = [
    NumericGeneralizationMech(column="age", epsilon=0.05, bins=5, bounds=adult_synthesis.AGE_BOUNDS),
    NumericGeneralizationMech(column="hours-per-week", epsilon=0.05, bins=5, bounds=adult_synthesis.HOURS_PER_WEEK_BOUNDS),
]

# apply generalization mechanisms; the returned generalizers are kept because
# they are needed later to reverse the generalization on the synthetic data
df_train_input, generalizers = preprocessing.generalize_data(df_train, generalizers=generalizers)
df_train_input.head()

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,income
15738,1,Private,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,2,United-States,>50K
27985,1,Private,Masters,Never-married,Exec-managerial,Not-in-family,White,Female,2,United-States,<=50K
30673,0,?,HS-grad,Never-married,?,Not-in-family,White,Female,1,United-States,<=50K
9505,1,Local-gov,Some-college,Divorced,Transport-moving,Unmarried,White,Male,1,United-States,<=50K
26417,0,Private,Bachelors,Never-married,Prof-specialty,Own-child,White,Male,1,United-States,<=50K


Fit the generator on the preprocessed data and generate synthetic data.

In [52]:
# fit PrivBayes on the generalized training data, using the notebook-wide
# privacy budget and verbosity; this replaces the marginal generator bound to `generator`
generator=PrivBayes(epsilon=EPSILON, verbose=VERBOSE)
generator.fit(df_train_input)

1/11 - Root of network: age

2/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 10
Selected attribute: 'marital-status' - with parents: ('age',)

3/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 13
Selected attribute: 'relationship' - with parents: ('marital-status',)

4/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 20
Selected attribute: 'sex' - with parents: ('relationship', 'age')

5/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 23
Selected attribute: 'income' - with parents: ('sex', 'relationship', 'age')

6/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 28
Selected attribute: 'occupation' - with parents: ('sex', 'income')

7/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 27
Selected attribute: 'workclass' - with parents: 

  self.values = self.values / row_sum
  self.values = self.values / row_sum


In [53]:
# generate as many synthetic records as there are rows in the full dataset `df`
# (not just the training split)
# NOTE(review): the per-record progress messages flooded the recorded output
# (IOPub rate limit exceeded) - presumably controlled by `verbose`; consider lowering it.
df_synth = generator.generate(n_records=df.shape[0])
df_synth.head()

Number of records generated: 2939 / 32561

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Number of records generated: 11972 / 32561

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Number of records generated: 21156 / 32561

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Number of records generated: 30359 / 32561

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



The synthetic data still has generalized values. We can reverse the generalization by sampling within the bounds of the generalized value to retrieve the original data structure.

In [54]:
# map the binned values in the synthetic data back to the original scale using
# the generalizers returned by the generalization step, then preview the result
df_synth_inv, generalizers = postprocessing.reverse_generalization(df_synth, generalizers)
df_synth_inv.head()

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,income
0,27,Private,Masters,Never-married,Exec-managerial,Not-in-family,White,Female,33,Iran,<=50K
1,24,Local-gov,Bachelors,Never-married,Prof-specialty,Not-in-family,White,Male,30,United-States,<=50K
2,23,?,10th,Never-married,?,Own-child,White,Female,11,United-States,<=50K
3,50,State-gov,Bachelors,Married-civ-spouse,Prof-specialty,Husband,White,Male,51,United-States,>50K
4,57,Private,Bachelors,Divorced,Other-service,Not-in-family,White,Male,32,Mexico,<=50K


Additional postprocessing steps can be added to the pipeline to improve the quality of the synthetic data.

In [59]:
def postprocess_synth_data(data_synth: pd.DataFrame) -> pd.DataFrame:
    """Post-process the synthetic data.

    Adds an ``id`` column that marks every record as synthetic
    (``FAKE_PERSON_1``, ``FAKE_PERSON_2``, ...) and places it first.

    The input frame is not modified, so re-running the calling cell always
    starts from the same data. If ``data_synth`` already has an ``id`` column
    it is overwritten and still ends up as the first column.

    Args:
        data_synth: Synthetic records.

    Returns:
        A new frame with ``id`` as the first column, followed by the original
        columns in their original order.
    """
    # add identifier to notify user that this is synthetic data (fake)
    id_string = 'FAKE_PERSON_'
    ids = [id_string + str(i) for i in range(1, data_synth.shape[0] + 1)]

    # work on a copy so the caller's frame is left untouched
    data_out = data_synth.copy()
    data_out['id'] = ids

    # make id column first, regardless of where it sat before
    cols = ['id'] + [col for col in data_out.columns if col != 'id']
    return data_out[cols]

# apply the post-processing to the de-generalized synthetic data and preview it
df_synth_final = postprocess_synth_data(df_synth_inv)
df_synth_final.head()

Unnamed: 0,id,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,income
0,FAKE_PERSON_1,27,Private,Masters,Never-married,Exec-managerial,Not-in-family,White,Female,33,Iran,<=50K
1,FAKE_PERSON_2,24,Local-gov,Bachelors,Never-married,Prof-specialty,Not-in-family,White,Male,30,United-States,<=50K
2,FAKE_PERSON_3,23,?,10th,Never-married,?,Own-child,White,Female,11,United-States,<=50K
3,FAKE_PERSON_4,50,State-gov,Bachelors,Married-civ-spouse,Prof-specialty,Husband,White,Male,51,United-States,>50K
4,FAKE_PERSON_5,57,Private,Bachelors,Divorced,Other-service,Not-in-family,White,Male,32,Mexico,<=50K


## Synth Pipeline
All these steps can be combined in a single function to create a pipeline for generating synthetic data. Note this function provides a consistent structure for creating synthetic data, but can be customized to your needs.

In [71]:
# create a synthetic pipeline
df = df_adult.copy()
generator=PrivBayes(epsilon=EPSILON, verbose=VERBOSE)
generalizers = [
    NumericGeneralizationMech(column="age", epsilon=0.05, bins=5, bounds=adult_synthesis.AGE_BOUNDS, inverse='truncated_normal', random_state=RANDOM_STATE),
    NumericGeneralizationMech(column="hours-per-week", epsilon=0.05, bins=5, bounds=adult_synthesis.HOURS_PER_WEEK_BOUNDS, inverse='truncated_normal', random_state=RANDOM_STATE)
]
output = synthesization.run_synth_pipeline(
    data_real=df,
    generator=generator,
    preprocess_func=adult_synthesis.preprocess_real_data,
    generalizers=generalizers,
    postprocess_func=adult_synthesis.postprocess_synthetic_data,
    holdout_size=0.2,
    target_column='income',
    random_state=RANDOM_STATE,
    n_records=1000,
    output_keys= ["train", "holdout", "synth", "generators"],
    verbose=VERBOSE
)
output['synth'].head()

Fitting generator PrivBayes({'epsilon': 1, 'verbose': 1, 'theta_usefulness': 4, 'epsilon_split': 0.3, 'score_function': 'R', 'network_init': None, 'n_cpus': None}) on input data
1/11 - Root of network: workclass

2/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 10
Selected attribute: 'occupation' - with parents: ('workclass',)

3/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 16
Selected attribute: 'sex' - with parents: ('occupation',)

4/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 15
Selected attribute: 'relationship' - with parents: ('sex', 'occupation')

5/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 20
Selected attribute: 'marital-status' - with parents: ('sex', 'relationship')

6/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 22
Selected attribute: 'age' - wit

  self.values = self.values / row_sum
  self.values = self.values / row_sum


Number of records generated: 1000 / 1000
Synthetic Data Generated


Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,income
0,24,Private,HS-grad,Never-married,Machine-op-inspct,Not-in-family,White,Female,30,United-States,<=50K
1,59,Private,Assoc-voc,Divorced,Sales,Husband,White,Male,39,United-States,>50K
2,29,Private,HS-grad,Divorced,Machine-op-inspct,Not-in-family,White,Female,36,United-States,<=50K
3,27,Private,HS-grad,Never-married,Handlers-cleaners,Not-in-family,White,Male,17,United-States,<=50K
4,21,Private,Bachelors,Never-married,Sales,Not-in-family,White,Female,26,United-States,<=50K


## Experiment
Run multiple synthesis experiments

In [84]:
from crnsynth2.experiment.experiment import SynthExperiment
from crnsynth2.metrics import DEFAULT_METRICS, PRIVACY_METRICS
from crnsynth2.experiment import utils

In [17]:
from sklearn.model_selection import ParameterGrid

# Define the grid in this cell so it does not rely on a `param_grid` variable
# left over from a cell further down; the values reproduce the combinations
# shown in this cell's output.
param_grid = {
    'epsilon': [0.01, 0.1, 1],
    'holdout_size': [0.2, 0.3, 0.4],
}

# ParameterGrid expands the dict into every combination of the listed values
pg = ParameterGrid(param_grid=param_grid)
for p in pg:
    print(p)


{'epsilon': 0.01, 'holdout_size': 0.2}
{'epsilon': 0.01, 'holdout_size': 0.3}
{'epsilon': 0.01, 'holdout_size': 0.4}
{'epsilon': 0.1, 'holdout_size': 0.2}
{'epsilon': 0.1, 'holdout_size': 0.3}
{'epsilon': 0.1, 'holdout_size': 0.4}
{'epsilon': 1, 'holdout_size': 0.2}
{'epsilon': 1, 'holdout_size': 0.3}
{'epsilon': 1, 'holdout_size': 0.4}


In [86]:
# NOTE(review): dead code - this helper appears to be superseded by
# utils.init_synthpipes below; remove it once that is confirmed.
# def create_synthpipes(default_pipeline, generators):
#     synthpipes = [default_pipeline.__copy__().set_generator(generator) for generator in generators]
#     return synthpipes

# create synthpipes from one default pipeline, a list of generators and a parameter grid
# NOTE(review): AdultSynthPipe is not imported in any visible cell - confirm the import
default_pipe = AdultSynthPipe(holdout_size=0.2, random_state=RANDOM_STATE)
generators = [PrivBayes(epsilon=1), MarginalGenerator(epsilon=1)]

# parameter values to vary across the generated pipelines
param_grid = {
    'epsilon': [0.01, 0.1,],
    'holdout_size': [0.2, 0.3]
}

# NOTE(review): in the recorded output the first two pipes carry identical
# parameters (epsilon 0.1, holdout_size 0.3) - verify that init_synthpipes does
# not share state between the pipes it creates.
synth_pipes = utils.init_synthpipes(default_pipe, generators, param_grid=param_grid)
synth_pipes

[AdultSynthPipe({'generator': PrivBayes({'epsilon': 0.1, 'verbose': True, 'theta_usefulness': 4, 'epsilon_split': 0.3, 'score_function': 'R', 'network_init': None, 'n_cpus': None}), 'holdout_size': 0.3, 'target_column': None, 'random_state': 42, 'verbose': 2, 'generalize': True, 'dp_params': [DPParam(stat_name=mean, epsilon=0.025, column=age), DPParam(stat_name=std, epsilon=0.025, column=age), DPParam(stat_name=mean, epsilon=0.025, column=hours-per-week), DPParam(stat_name=std, epsilon=0.025, column=hours-per-week)]}),
 AdultSynthPipe({'generator': PrivBayes({'epsilon': 0.1, 'verbose': True, 'theta_usefulness': 4, 'epsilon_split': 0.3, 'score_function': 'R', 'network_init': None, 'n_cpus': None}), 'holdout_size': 0.3, 'target_column': None, 'random_state': 42, 'verbose': 2, 'generalize': True, 'dp_params': [DPParam(stat_name=mean, epsilon=0.025, column=age), DPParam(stat_name=std, epsilon=0.025, column=age), DPParam(stat_name=mean, epsilon=0.025, column=hours-per-week), DPParam(stat_na

In [99]:
# copy the default pipeline and try to override a parameter on the copy
# NOTE(review): the recorded output still shows holdout_size 0.2 after this call -
# confirm whether set_params updates the pipe in place or returns a new object.
new_pipe = default_pipe.__copy__()
new_pipe.set_params(params={'holdout_size': 0.4})
new_pipe

AdultSynthPipe({'generator': None, 'holdout_size': 0.2, 'target_column': None, 'random_state': 42, 'verbose': 2, 'generalize': True, 'dp_params': [DPParam(stat_name=mean, epsilon=0.025, column=age), DPParam(stat_name=std, epsilon=0.025, column=age), DPParam(stat_name=mean, epsilon=0.025, column=hours-per-week), DPParam(stat_name=std, epsilon=0.025, column=hours-per-week)]})

In [11]:
# display the privacy metrics (with their parameters) that are passed to the experiment
PRIVACY_METRICS

[DistanceClosestRecord({'quantile': 0.5, 'metric': 'gower', 'categorical_columns': None}),
 NearestNeighborDistanceRatio({'quantile': 0.5, 'metric': 'gower', 'n_neighbors': 5, 'categorical_columns': None})]

In [12]:
# run every pipeline in synth_pipes on the adult data and score it with the privacy metrics
# NOTE(review): PATH_RESULTS is not defined in any visible cell - define it in the settings cell.
# NOTE(review): the recorded run ends with "ValueError: could not convert string to float: 'Private'";
# possibly the metrics need the categorical columns specified (they show categorical_columns=None) - confirm.
synth_experiment = SynthExperiment(experiment_name='adult_synthesis', synth_pipes=synth_pipes, metrics=PRIVACY_METRICS, path_out=PATH_RESULTS)
synth_experiment.run(df_adult, n_records=1000)

Running synthesis experiment for PrivBayes
Computing DP parameter mean on column age
Computing DP parameter std on column age
Computing DP parameter mean on column hours-per-week
Computing DP parameter std on column hours-per-week

Epsilon params: 0.1
Epsilon generator: 1
Total epsilon: 1.1

1/11 - Root of network: income

2/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 10
Selected attribute: 'relationship' - with parents: ('income',)

3/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 9
Selected attribute: 'marital-status' - with parents: ('relationship', 'income')

4/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 15




Selected attribute: 'sex' - with parents: ('marital-status', 'relationship', 'income')

5/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 21
Selected attribute: 'age' - with parents: ('marital-status', 'sex', 'income')

6/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 29
Selected attribute: 'occupation' - with parents: ('sex', 'income')

7/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 28
Selected attribute: 'workclass' - with parents: ('occupation',)

8/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 26
Selected attribute: 'hours-per-week' - with parents: ('income', 'occupation')

9/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 25
Selected attribute: 'education' - with parents: ('income', 'age')

10/11 - Evaluating next attribute to add to network
Number of AttributePar

  self.values = self.values / row_sum


Number of records generated: 1000 / 1000
Synthetic Data Generated


ValueError: could not convert string to float: 'Private'