# Benchmark generators
Run a suite of metrics to evaluate the utility and privacy of synthetic data.

## Load libraries and define settings

In [21]:
# general dependencies
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
from pathlib import Path

In [76]:
# package
from crnsynth.generators.marginal import MarginalGenerator
from crnsynth.generators.privbayes import PrivBayes

from crnsynth.processing import preprocessing

from crnsynth.metrics import PRIVACY_METRICS, ALL_METRICS
from crnsynth.benchmark.review import SyntheticDataReview
from crnsynth.benchmark import benchmark
from crnsynth.serialization import paths, save, load

In [23]:
from examples import adult_synthesis

In [24]:
# ignore deprecation warning
warnings.filterwarnings("ignore", category=DeprecationWarning) 

# autoreload changes from local files
%load_ext autoreload
%autoreload 2

# pandas show full output
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 200)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [25]:
# general run settings
RANDOM_STATE = 42  # seed reused for the train/holdout split and seeded metrics
VERBOSE = 1  # verbosity flag forwarded to the reviewer and the benchmark

# differential privacy: privacy budget for the differentially private generators
EPSILON = 1

## Load and process dataset

In [26]:
# read the raw adult dataset from the path defined in the example module
df_adult = pd.read_csv(adult_synthesis.PATH_ADULT)
# preview the first rows
df_adult.head()


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## Generate
Here we show, step by step, how to generate synthetic data.

In [80]:
# process data: work on a copy so the raw `df_adult` frame stays untouched
df = adult_synthesis.preprocess_real_data(df_adult.copy())

# split data into a training part for the generator and a holdout part
df_train, df_holdout = preprocessing.split_train_holdout(
    df,
    target_column='income',
    holdout_size=0.2,
    random_state=RANDOM_STATE,
)

# define your generator and train it on the training data
generator = PrivBayes(epsilon=EPSILON)
generator.fit(df_train)

# generate the desired number of synthetic rows, then post process them
df_synth = adult_synthesis.postprocess_synthetic_data(
    generator.generate(n_records=1000)
)

# preview the synthetic data
df_synth.head()

1/11 - Root of network: occupation

2/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 10
Selected attribute: 'workclass' - with parents: ('occupation',)

3/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 14
Selected attribute: 'sex' - with parents: ('occupation',)

4/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 13
Selected attribute: 'relationship' - with parents: ('sex', 'occupation')

5/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 16
Selected attribute: 'marital-status' - with parents: ('sex', 'relationship')

6/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 16
Selected attribute: 'income' - with parents: ('workclass', 'marital-status')

7/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 15
Selected attribute: 'education' - 

  self.values = self.values / row_sum


Number of records generated: 1000 / 1000
Synthetic Data Generated



Unnamed: 0,age,hours-per-week,workclass,education,marital-status,occupation,relationship,race,sex,native-country,income
0,76,10,Private,Bachelors,Married-civ-spouse,Transport-moving,Husband,Black,Male,United-States,<=50K
1,49,42,Private,7th-8th,Never-married,Adm-clerical,Not-in-family,Black,Female,Canada,<=50K
2,38,40,Private,Some-college,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K
3,40,45,Private,9th,Married-civ-spouse,Adm-clerical,Other-relative,White,Female,United-States,>50K
4,45,5,?,HS-grad,Married-civ-spouse,?,Husband,White,Male,United-States,>50K


## Synthetic Data Review
Run a suite of metrics to evaluate the utility and privacy of synthetic data.

In [28]:
# metrics can either be a list or dictionary
# here the full default suite is used; the displayed dictionary groups the
# metric instances under the 'privacy' and 'utility' keys
# NOTE(review): this binds the module-level ALL_METRICS object itself, not a
# copy — confirm the reviewer does not modify it in place
metrics = ALL_METRICS
metrics

{'privacy': [DistanceClosestRecord({'encoder': 'ordinal', 'quantile': 0.5, 'metric': 'gower', 'categorical_columns': None}),
  NearestNeighborDistanceRatio({'encoder': 'ordinal', 'quantile': 0.5, 'metric': 'gower', 'n_neighbors': 2, 'categorical_columns': None}),
  CategoricalCAPScore({'encoder': 'ordinal', 'categorical_columns': None, 'frac_sensitive': 0.5, 'random_state': None})],
 'utility': [ContingencySimilarityScore({'encoder': None, 'categorical_columns': None}),
  FeatureCorrelation({'encoder': None, 'numerical_columns': None}),
  CorrelationSimilarityScore({'encoder': None, 'numerical_columns': None})]}

In [29]:
# shared metric parameters: every metric that exposes one of these parameters
# gets the value given here (values can also be set on each metric individually)
metric_kwargs = dict(
    categorical_columns=adult_synthesis.NOMINAL_COLUMNS,
    numerical_columns=adult_synthesis.ORDINAL_COLUMNS,
    frac_sensitive=0.5,
    random_state=RANDOM_STATE,
)

# number of jobs: 1 for sequential computation, > 1 for parallel computation,
# -1 to use all cores
n_jobs = -1

# build the reviewer and score the synthetic data against train and holdout data
synth_review = SyntheticDataReview(
    metrics=metrics,
    metric_kwargs=metric_kwargs,
    encoder='ordinal',
    n_jobs=n_jobs,
    verbose=VERBOSE,
)
scores = synth_review.compute(df_train, df_synth, df_holdout)
scores

{'privacy': {'DistanceClosestRecord': {'holdout': 0.0031013367760234613,
   'synth': 0.10548587908972329},
  'NearestNeighborDistanceRatio': {'holdout': 0.6242069835342642,
   'synth': 0.7947300522973817},
  'CategoricalCAPScore': {'score': 0.7905672980781999}},
 'utility': {'ContingencySimilarityScore': {'score': 0.8683820638820638},
  'FeatureCorrelation': {'score': 0.842010244181217},
  'CorrelationSimilarityScore': {'score': 0.9848911363172838}}}

After computation, the scores can also be accessed via the `scores_` attribute or as a dataframe using `.score_as_dataframe()`.

In [30]:
# scores stored on the reviewer by the preceding compute() call
synth_review.scores_

{'privacy': {'DistanceClosestRecord': {'holdout': 0.0031013367760234613,
   'synth': 0.10548587908972329},
  'NearestNeighborDistanceRatio': {'holdout': 0.6242069835342642,
   'synth': 0.7947300522973817},
  'CategoricalCAPScore': {'score': 0.7905672980781999}},
 'utility': {'ContingencySimilarityScore': {'score': 0.8683820638820638},
  'FeatureCorrelation': {'score': 0.842010244181217},
  'CorrelationSimilarityScore': {'score': 0.9848911363172838}}}

In [31]:
# same scores as a dataframe: one row per metric sub-score, in a column labelled with `name`
synth_review.score_as_dataframe(name='Privbayes')

Unnamed: 0,Privbayes
privacy_DistanceClosestRecord_holdout,0.003101
privacy_DistanceClosestRecord_synth,0.105486
privacy_NearestNeighborDistanceRatio_holdout,0.624207
privacy_NearestNeighborDistanceRatio_synth,0.79473
privacy_CategoricalCAPScore_score,0.790567
utility_ContingencySimilarityScore_score,0.868382
utility_FeatureCorrelation_score,0.84201
utility_CorrelationSimilarityScore_score,0.984891


You can see how the metrics within the `SyntheticDataReview` instance are affected by the `metric_kwargs` parameters and by the computation on the data.

In [32]:
# inspect the metric instances held by the reviewer, including their current parameters
synth_review.metrics

{'privacy': [DistanceClosestRecord({'encoder': None, 'quantile': 0.5, 'metric': 'gower', 'categorical_columns': ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'income']}),
  NearestNeighborDistanceRatio({'encoder': None, 'quantile': 0.5, 'metric': 'gower', 'n_neighbors': 2, 'categorical_columns': ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'income']}),
  CategoricalCAPScore({'encoder': None, 'categorical_columns': ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'income'], 'frac_sensitive': 0.5, 'random_state': 42})],
 'utility': [ContingencySimilarityScore({'encoder': None, 'categorical_columns': ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'income']}),
  FeatureCorrelation({'encoder': None, 'numerical_columns': ['age', 'hours-per-week']}),
 

## Benchmarking
We can benchmark a selection of generators by creating a synthetic dataset with each and comparing their performance.

In [85]:
# generator selection: a marginal generator and PrivBayes at two epsilon values
generators = [
    MarginalGenerator(epsilon=0.1),
    PrivBayes(epsilon=0.1),
    PrivBayes(epsilon=1),
]

# optional: reviewer that runs the metric suite on every generated synthetic
# dataset and combines the results
synth_review = SyntheticDataReview(
    metrics=metrics,
    metric_kwargs=metric_kwargs,
    encoder='ordinal',
    n_jobs=n_jobs,
    verbose=VERBOSE,
)

# run each generator; results are written below `path_out`, with file names
# built from the generator name and its `fname_param` value
benchmark.benchmark_generators(
    generators=generators,
    data_real=df_train,
    data_holdout=df_holdout,
    n_records=1000,
    reviewer=synth_review,
    path_out=paths.PATH_RESULTS / 'adult',
    fname_param='epsilon',
    verbose=VERBOSE,
)


Running generator MarginalGenerator
Fitting generator MarginalGenerator on input data
Marginal fitted: age
Marginal fitted: hours-per-week
Marginal fitted: workclass
Marginal fitted: education
Marginal fitted: marital-status
Marginal fitted: occupation
Marginal fitted: relationship
Marginal fitted: race
Marginal fitted: sex
Marginal fitted: native-country
Marginal fitted: income
Generator fitted. Generating 1000 records
Column sampled: age
Column sampled: hours-per-week
Column sampled: workclass
Column sampled: education
Column sampled: marital-status
Column sampled: occupation
Column sampled: relationship
Column sampled: race
Column sampled: sex
Column sampled: native-country
Column sampled: income
Saved to disk: /Users/dknoors/Projects/synthesis-dk/crn-synth/results/adult/configs/0_MarginalGenerator_epsilon0.1.json
Saved synthetic data, generator and configs for 0_MarginalGenerator_epsilon0.1 at /Users/dknoors/Projects/synthesis-dk/crn-synth/results/adult
Running reviewer for 0_Margi

  self.values = self.values / row_sum
  self.values = self.values / row_sum


Number of records generated: 1000 / 1000
Synthetic Data Generated

Saved to disk: /Users/dknoors/Projects/synthesis-dk/crn-synth/results/adult/configs/2_PrivBayes_epsilon1.json
Saved synthetic data, generator and configs for 2_PrivBayes_epsilon1 at /Users/dknoors/Projects/synthesis-dk/crn-synth/results/adult
Running reviewer for 2_PrivBayes_epsilon1
Saved scores at /Users/dknoors/Projects/synthesis-dk/crn-synth/results/adult/reports/scores.csv


## Load and review results

In [86]:
# load the score report written by the benchmark run
# (one row per metric sub-score, one column per generator run)
scores_benchmark = pd.read_csv(paths.PATH_RESULTS / 'adult/reports/scores.csv')
scores_benchmark

Unnamed: 0,metric,0_MarginalGenerator_epsilon0.1,1_PrivBayes_epsilon0.1,2_PrivBayes_epsilon1
0,privacy_DistanceClosestRecord_holdout,0.003101,0.003101,0.003101
1,privacy_DistanceClosestRecord_synth,1.0,0.991522,0.085904
2,privacy_NearestNeighborDistanceRatio_holdout,0.624207,0.624207,0.624207
3,privacy_NearestNeighborDistanceRatio_synth,0.937875,0.927293,0.791771
4,privacy_CategoricalCAPScore_score,0.96737,0.994094,0.953864
5,utility_ContingencySimilarityScore_score,0.859009,0.84456,0.883557
6,utility_FeatureCorrelation_score,0.799162,0.817108,0.849146
7,utility_CorrelationSimilarityScore_score,0.962402,0.971919,0.964151


In [87]:
# load saved generator and inspect the network stored on its model
# NOTE(review): the instance is constructed with epsilon=0.1 while the loaded
# file is the epsilon=1 run; presumably load() returns the saved generator and
# the constructor argument is irrelevant — confirm, and if load() is a
# classmethod prefer calling it on the class to avoid the misleading value
pb_reload = PrivBayes(epsilon=0.1).load(paths.PATH_RESULTS / 'adult/generators/2_PrivBayes_epsilon1.pkl')
pb_reload.model.network_

[APPair(attribute='marital-status', parents=None),
 APPair(attribute='relationship', parents=('marital-status',)),
 APPair(attribute='sex', parents=('marital-status', 'relationship')),
 APPair(attribute='occupation', parents=('sex', 'relationship')),
 APPair(attribute='workclass', parents=('occupation',)),
 APPair(attribute='income', parents=('occupation', 'relationship')),
 APPair(attribute='education', parents=('relationship', 'income')),
 APPair(attribute='age', parents=('income',)),
 APPair(attribute='hours-per-week', parents=('sex',)),
 APPair(attribute='race', parents=('occupation', 'income')),
 APPair(attribute='native-country', parents=('sex', 'income'))]

In [89]:
# load the JSON config saved for the marginal generator run
# NOTE(review): displaying the whole config prints every fitted marginal
# (see the long output below); consider showing only selected keys
config = load.load_json(paths.PATH_RESULTS / 'adult/configs/0_MarginalGenerator_epsilon0.1.json')
config

Loaded: /Users/dknoors/Projects/synthesis-dk/crn-synth/results/adult/configs/0_MarginalGenerator_epsilon0.1.json


{'model': {'epsilon': 0.1,
  'verbose': 1,
  'columns_': ['age',
   'hours-per-week',
   'workclass',
   'education',
   'marital-status',
   'occupation',
   'relationship',
   'race',
   'sex',
   'native-country',
   'income'],
  'n_records_fit_': 26048,
  'dtypes_fit_': {},
  'model_': {'age': {'31': 0.02953113503180167,
    '23': 0.020206850348045247,
    '36': 0.02440254790165928,
    '35': 0.01588863780532297,
    '28': 0.023892853531741684,
    '34': 0.04283531904345958,
    '37': 0.03515129824025246,
    '27': 0.016135739678395038,
    '30': 0.035155427714805586,
    '33': 0.0,
    '25': 0.030597987474757376,
    '32': 0.03788621682140433,
    '38': 0.011745251300373686,
    '39': 0.016883122659789043,
    '29': 0.01938241244400488,
    '41': 0.0042284603625428385,
    '40': 0.008994927184385241,
    '26': 0.03019003752357534,
    '42': 0.023120106232622874,
    '43': 0.008038548346137302,
    '24': 0.03225537850400474,
    '20': 0.0,
    '22': 0.006547798003392347,
    '46': 