# Generate synthetic data
Generating synthetic data using public libraries for the 'adult' dataset.

## Load libraries and define settings

In [59]:
# general dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
from pathlib import Path

In [60]:
# package
from crnsynth2.generators.marginal_generator import MarginalGenerator
from crnsynth2.generators.privbayes import PrivBayes

In [61]:
# data-specific
from examples.adult.adult_config import PATH_ADULT, PATH_RESULTS
from examples.adult.adult_synthpipe import AdultSynthPipe


In [62]:
# autoreload changes from local files
%load_ext autoreload
%autoreload 2

# pandas show full output
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 200)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [63]:
# privacy budget for differentially private algorithms
EPSILON = 1

# dataset
DATASET_NAME = 'adult'

# other
RANDOM_STATE = 42

## Load and process dataset

In [64]:

df_adult = pd.read_csv(PATH_ADULT)
df_adult.head()


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## Generate
Here we show how step by step how to generate synthetic data.

In [65]:
# define your generator - in this case the simple marginal generator
generator = MarginalGenerator(epsilon=EPSILON)

# train the generator on the input data
generator.fit(df_adult)

# generate synthetic data with a desired number of rows
df_synth_marginal = generator.generate(n_records=1000)
df_synth_marginal.head()

Marginal fitted: age
Marginal fitted: workclass
Marginal fitted: fnlwgt
Marginal fitted: education
Marginal fitted: education-num
Marginal fitted: marital-status
Marginal fitted: occupation
Marginal fitted: relationship
Marginal fitted: race
Marginal fitted: sex
Marginal fitted: capital-gain
Marginal fitted: capital-loss
Marginal fitted: hours-per-week
Marginal fitted: native-country
Marginal fitted: income
Column sampled: age
Column sampled: workclass
Column sampled: fnlwgt
Column sampled: education
Column sampled: education-num
Column sampled: marital-status
Column sampled: occupation
Column sampled: relationship
Column sampled: race
Column sampled: sex
Column sampled: capital-gain
Column sampled: capital-loss
Column sampled: hours-per-week
Column sampled: native-country
Column sampled: income


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,44,Private,116608,Some-college,10,Never-married,Farming-fishing,Husband,White,Male,0,1977,46,United-States,<=50K
1,40,Private,150057,HS-grad,10,Married-civ-spouse,Tech-support,Husband,White,Male,0,0,42,United-States,>50K
2,66,Self-emp-not-inc,216666,Prof-school,14,Never-married,Prof-specialty,Not-in-family,White,Female,0,0,40,United-States,>50K
3,30,Private,181863,HS-grad,7,Married-civ-spouse,Exec-managerial,Own-child,White,Male,0,0,40,United-States,>50K
4,46,Local-gov,143083,Some-college,10,Never-married,Transport-moving,Own-child,White,Male,0,0,21,United-States,<=50K


## Synth Pipeline

More often than not, you will need to some extra processing steps to improve the quality of the synthetic data - similar to training a machine learning model. These steps can be put in a pipeline, consisting of the following steps:
1. pre-processing: process the original data by removing or altering information you do not want to capture in the synthetic dataset, e.g. sensitive information or incorrect values.
2. fit: train the generator algorithm on the processed data
3. generate: generate a desired number of samples
4. post-process: post-process the synthetic data to remove inconsistencies or add additional information to the synthetic dataset

We defined a pipeline for the adult dataset. If you're curious on how to define such pipelines yourself, look at the code in `examples/adult/adult_synthpipe.py`

In [66]:
# create copy to retain original data format
df = df_adult.copy()

# create a synthetic pipeline
generator=PrivBayes(epsilon=0.1)
synth_pipe = AdultSynthPipe(generator=generator, holdout_size=0.2, generalize=True, target_column='income', random_state=RANDOM_STATE)

data_out = synth_pipe.run(df, n_records=1000)
data_out['synth'].head()

Computing DP parameter mean on column age
Computing DP parameter std on column age
Computing DP parameter mean on column hours-per-week
Computing DP parameter std on column hours-per-week

Epsilon params: 0.1
Epsilon generator: 0.1
Total epsilon: 0.2

1/11 - Root of network: income

2/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 10
Selected attribute: 'relationship' - with parents: ('income',)

3/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 10
Selected attribute: 'marital-status' - with parents: ('income',)

4/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 10
Selected attribute: 'sex' - with parents: ('marital-status',)

5/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 8
Selected attribute: 'workclass' - with parents: ('income',)

6/11 - Evaluating next attribute to add to network
Number of AttributeParentPai



Selected attribute: 'race' - with parents: ('hours-per-week',)

10/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 2
Selected attribute: 'education' - with parents: None

11/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 1
Selected attribute: 'occupation' - with parents: None

Learned Network Structure

Learning conditional probabilities: income - with parents None ~ estimated size: 2
Learning conditional probabilities: relationship - with parents ('income',) ~ estimated size: 12
Learning conditional probabilities: marital-status - with parents ('income',) ~ estimated size: 14
Learning conditional probabilities: sex - with parents ('marital-status',) ~ estimated size: 14
Learning conditional probabilities: workclass - with parents ('income',) ~ estimated size: 18
Learning conditional probabilities: age - with parents ('income', 'sex') ~ estimated size: 20
Learning conditional probabilities: hours-per

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,income
0,54,Private,Bachelors,Married-civ-spouse,Other-service,Husband,White,Male,25,United-States,>50K
1,49,Private,Assoc-acdm,Married-civ-spouse,Adm-clerical,Husband,White,Female,70,United-States,<=50K
2,47,Private,HS-grad,Never-married,Farming-fishing,Husband,White,Female,60,Thailand,<=50K
3,30,Private,Bachelors,Married-civ-spouse,Adm-clerical,Not-in-family,White,Female,45,United-States,<=50K
4,19,Private,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,51,United-States,<=50K


## Experiment
Run multiple synthesis experiments

In [12]:
from crnsynth2.experiment.experiment import SynthExperiment
from crnsynth2.metrics import DEFAULT_METRICS, PRIVACY_METRICS
from crnsynth2.experiment import utils

In [17]:
from sklearn.model_selection import ParameterGrid

pg = ParameterGrid(param_grid=param_grid)
for p in pg:
    print(p)


{'epsilon': 0.01, 'holdout_size': 0.2}
{'epsilon': 0.01, 'holdout_size': 0.3}
{'epsilon': 0.01, 'holdout_size': 0.4}
{'epsilon': 0.1, 'holdout_size': 0.2}
{'epsilon': 0.1, 'holdout_size': 0.3}
{'epsilon': 0.1, 'holdout_size': 0.4}
{'epsilon': 1, 'holdout_size': 0.2}
{'epsilon': 1, 'holdout_size': 0.3}
{'epsilon': 1, 'holdout_size': 0.4}


In [26]:
# def create_synthpipes(default_pipeline, generators):
#     synthpipes = [default_pipeline.__copy__().set_generator(generator) for generator in generators]
#     return synthpipes

# create synthpipes
default_pipe = AdultSynthPipe(holdout_size=0.2, random_state=RANDOM_STATE)
generators = [PrivBayes(epsilon=1, random_state=RANDOM_STATE), MarginalGenerator(epsilon=1)]

param_grid = {
    'epsilon': [0.01, 0.1, 1],
    'holdout_size': [0.2, 0.3, 0.4]
}

synth_pipes = utils.init_synthpipes(default_pipe, generators, param_grid=param_grid)
synth_pipes

[AdultSynthPipe(generator=PrivBayes({'random_state': 42, 'epsilon': 1, 'model': <synthesis.synthesizers.privbayes.PrivBayes object at 0x2877aab20>})),
 AdultSynthPipe(generator=PrivBayes({'random_state': 42, 'epsilon': 1, 'model': <synthesis.synthesizers.privbayes.PrivBayes object at 0x2877aab20>})),
 AdultSynthPipe(generator=PrivBayes({'random_state': 42, 'epsilon': 1, 'model': <synthesis.synthesizers.privbayes.PrivBayes object at 0x2877aab20>})),
 AdultSynthPipe(generator=PrivBayes({'random_state': 42, 'epsilon': 1, 'model': <synthesis.synthesizers.privbayes.PrivBayes object at 0x2877aab20>})),
 AdultSynthPipe(generator=PrivBayes({'random_state': 42, 'epsilon': 1, 'model': <synthesis.synthesizers.privbayes.PrivBayes object at 0x2877aab20>})),
 AdultSynthPipe(generator=PrivBayes({'random_state': 42, 'epsilon': 1, 'model': <synthesis.synthesizers.privbayes.PrivBayes object at 0x2877aab20>})),
 AdultSynthPipe(generator=PrivBayes({'random_state': 42, 'epsilon': 1, 'model': <synthesis.syn

In [25]:
synth_pipes[1].generator.epsilon

1

In [10]:
from crnsynth2.metrics.privacy.dcr import DistanceClosestRecord
DistanceClosestRecord().name

<function crnsynth2.metrics.privacy.dcr.DistanceClosestRecord.name() -> str>

In [11]:
PRIVACY_METRICS

[DistanceClosestRecord({'quantile': 0.5, 'metric': 'gower', 'categorical_columns': None}),
 NearestNeighborDistanceRatio({'quantile': 0.5, 'metric': 'gower', 'n_neighbors': 5, 'categorical_columns': None})]

In [12]:
synth_experiment = SynthExperiment(experiment_name='adult_synthesis', synth_pipes=synth_pipes, metrics=PRIVACY_METRICS, path_out=PATH_RESULTS)
synth_experiment.run(df_adult, n_records=1000)

Running synthesis experiment for PrivBayes
Computing DP parameter mean on column age
Computing DP parameter std on column age
Computing DP parameter mean on column hours-per-week
Computing DP parameter std on column hours-per-week

Epsilon params: 0.1
Epsilon generator: 1
Total epsilon: 1.1

1/11 - Root of network: income

2/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 10
Selected attribute: 'relationship' - with parents: ('income',)

3/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 9
Selected attribute: 'marital-status' - with parents: ('relationship', 'income')

4/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 15




Selected attribute: 'sex' - with parents: ('marital-status', 'relationship', 'income')

5/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 21
Selected attribute: 'age' - with parents: ('marital-status', 'sex', 'income')

6/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 29
Selected attribute: 'occupation' - with parents: ('sex', 'income')

7/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 28
Selected attribute: 'workclass' - with parents: ('occupation',)

8/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 26
Selected attribute: 'hours-per-week' - with parents: ('income', 'occupation')

9/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 25
Selected attribute: 'education' - with parents: ('income', 'age')

10/11 - Evaluating next attribute to add to network
Number of AttributePar

  self.values = self.values / row_sum


Number of records generated: 1000 / 1000
Synthetic Data Generated



ValueError: could not convert string to float: 'Private'