# Benchmark generators
Run a suite of metrics to evaluate the utility and privacy of synthetic data.

## Load libraries and define settings

In [2]:
# general dependencies
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
from pathlib import Path

In [3]:
# package
from crnsynth.generators.marginal_generator import MarginalGenerator
from crnsynth.generators.privbayes import PrivBayes

from crnsynth.processing import preprocessing

from crnsynth.metrics import PRIVACY_METRICS, ALL_METRICS
from crnsynth.benchmark.review import SyntheticDataReview
from crnsynth.benchmark import benchmark
from crnsynth.serialization import paths

In [4]:
from examples import adult_synthesis

In [5]:
# ignore deprecation warning
warnings.filterwarnings("ignore", category=DeprecationWarning) 

# autoreload changes from local files
%load_ext autoreload
%autoreload 2

# pandas show full output
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 200)

In [6]:
# privacy budget (epsilon) for the differentially private algorithms
EPSILON = 1

# remaining notebook-wide settings: output verbosity and random seed
VERBOSE = 1
RANDOM_STATE = 42

## Load and process dataset

In [7]:
# read the Adult dataset from the path configured in the example module
df_adult = pd.read_csv(filepath_or_buffer=adult_synthesis.PATH_ADULT)

# preview the first five rows
df_adult.head(n=5)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## Generate
Here we show, step by step, how to generate synthetic data.

In [8]:
# preprocess a copy of the raw frame so that `df_adult` itself is left untouched
df = adult_synthesis.preprocess_real_data(df_adult.copy())

# split the processed data into a training part and a holdout part
df_train, df_holdout = preprocessing.split_train_holdout(
    df,
    target_column='income',
    holdout_size=0.2,
    random_state=RANDOM_STATE,
)

# define the generator with the configured privacy budget and fit it on the training data
generator = PrivBayes(epsilon=EPSILON)
generator.fit(df_train)

# sample the desired number of synthetic rows, then post-process them
df_synth = adult_synthesis.postprocess_synthetic_data(
    generator.generate(n_records=1000)
)

# preview the synthetic data
df_synth.head()

1/11 - Root of network: relationship

2/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 10
Selected attribute: 'marital-status' - with parents: ('relationship',)

3/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 13
Selected attribute: 'sex' - with parents: ('relationship', 'marital-status')

4/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 12
Selected attribute: 'occupation' - with parents: ('sex', 'relationship')

5/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 14
Selected attribute: 'workclass' - with parents: ('occupation',)

6/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 16
Selected attribute: 'income' - with parents: ('occupation', 'relationship')

7/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 15
Selected attribute: 

  self.values = self.values / row_sum
  self.values = self.values / row_sum


Number of records generated: 1000 / 1000
Synthetic Data Generated



Unnamed: 0,age,hours-per-week,workclass,education,marital-status,occupation,relationship,race,sex,native-country,income
0,39,30,Private,Some-college,Never-married,Craft-repair,Own-child,White,Male,United-States,<=50K
1,19,44,Private,Some-college,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,El-Salvador,<=50K
2,19,40,Private,HS-grad,Divorced,Craft-repair,Unmarried,White,Female,United-States,<=50K
3,18,86,?,Some-college,Divorced,?,Unmarried,White,Female,United-States,<=50K
4,30,50,Private,Assoc-acdm,Never-married,Machine-op-inspct,Own-child,Amer-Indian-Eskimo,Female,Columbia,<=50K


## Synthetic Data Review
Run a suite of metrics to evaluate the utility and privacy of synthetic data.

In [9]:
# `metrics` can either be a list or a dictionary; here the package-provided
# ALL_METRICS collection is used as-is.
# NOTE(review): this binds the imported ALL_METRICS object itself, not a copy —
# confirm whether SyntheticDataReview modifies the metric objects in place.
metrics = ALL_METRICS
# display the selected metrics together with their current parameter values
metrics

{'privacy': [DistanceClosestRecord({'encoder': 'ordinal', 'quantile': 0.5, 'metric': 'gower', 'categorical_columns': None}),
  NearestNeighborDistanceRatio({'encoder': 'ordinal', 'quantile': 0.5, 'metric': 'gower', 'n_neighbors': 2, 'categorical_columns': None}),
  CategoricalCAPScore({'encoder': 'ordinal', 'categorical_columns': None, 'frac_sensitive': 0.5, 'random_state': None})]}

In [10]:
# shared overrides: every metric that has one of these parameters gets the value below
# (the values can alternatively be set on each metric individually)
metric_kwargs = dict(
    categorical_columns=adult_synthesis.NOMINAL_COLUMNS,
    frac_sensitive=0.5,
    random_state=RANDOM_STATE,
)

# n_jobs == 1 computes sequentially; n_jobs > 1 computes in parallel, -1 uses all cores
n_jobs = -1

# build the reviewer and score the synthetic data against the train and holdout sets
synth_review = SyntheticDataReview(
    metrics=metrics,
    metric_kwargs=metric_kwargs,
    encoder='ordinal',
    n_jobs=n_jobs,
    verbose=VERBOSE,
)
scores = synth_review.compute(df_train, df_synth, df_holdout)
scores

{'privacy': {'DistanceClosestRecord': {'holdout': 0.0031013367760234613,
   'synth': 0.07022481592845455},
  'NearestNeighborDistanceRatio': {'holdout': 0.6242069835342642,
   'synth': 0.8147354192497644},
  'CategoricalCAPScore': {'score': 0.9645891978997194}}}

After computation, the scores can also be accessed via the `scores_` attribute or as a dataframe using `.score_as_dataframe()`.

In [11]:
# scores stored on the reviewer after `compute` has run
synth_review.scores_

{'privacy': {'DistanceClosestRecord': {'holdout': 0.0031013367760234613,
   'synth': 0.07022481592845455},
  'NearestNeighborDistanceRatio': {'holdout': 0.6242069835342642,
   'synth': 0.8147354192497644},
  'CategoricalCAPScore': {'score': 0.9645891978997194}}}

In [12]:
# same scores as a dataframe; `name` is used as the label of the score column
synth_review.score_as_dataframe(name='Privbayes')

Unnamed: 0,Privbayes
privacy_DistanceClosestRecord_holdout,0.003101
privacy_DistanceClosestRecord_synth,0.070225
privacy_NearestNeighborDistanceRatio_holdout,0.624207
privacy_NearestNeighborDistanceRatio_synth,0.814735
privacy_CategoricalCAPScore_score,0.964589


You can see how the metrics within the `SyntheticDataReview` object are affected by the `metric_kwargs` parameters and by the computation on the data.

In [13]:
# metric objects held by the reviewer, shown with their current parameter values
synth_review.metrics

{'privacy': [DistanceClosestRecord({'encoder': None, 'quantile': 0.5, 'metric': 'gower', 'categorical_columns': ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'income']}),
  NearestNeighborDistanceRatio({'encoder': None, 'quantile': 0.5, 'metric': 'gower', 'n_neighbors': 2, 'categorical_columns': ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'income']}),
  CategoricalCAPScore({'encoder': None, 'categorical_columns': ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'income'], 'frac_sensitive': 0.5, 'random_state': 42})]}

## Benchmarking
We can benchmark a selection of generators by creating a synthetic dataset with each of them and comparing their performance.

In [14]:
# generators to compare: a MarginalGenerator and PrivBayes at two epsilon values
generators = [
    MarginalGenerator(epsilon=0.1),
    PrivBayes(epsilon=0.1),
    PrivBayes(epsilon=1),
]

# optional: reviewer that runs a suite of metrics and combines the results
# for all generated synthetic datasets
synth_review = SyntheticDataReview(
    metrics=metrics,
    metric_kwargs=metric_kwargs,
    encoder='ordinal',
    n_jobs=n_jobs,
    verbose=VERBOSE,
)

# run every generator on the training data and store the results under `path_out`
benchmark.benchmark_generators(
    generators=generators,
    data_real=df_train,
    data_holdout=df_holdout,
    n_records=1000,
    reviewer=synth_review,
    path_out=paths.PATH_RESULTS / 'adult',
    fname_param='epsilon',
    verbose=VERBOSE,
)


Running generator MarginalGenerator
Fitting generator MarginalGenerator on input data
Marginal fitted: age
Marginal fitted: hours-per-week
Marginal fitted: workclass
Marginal fitted: education
Marginal fitted: marital-status
Marginal fitted: occupation
Marginal fitted: relationship
Marginal fitted: race
Marginal fitted: sex
Marginal fitted: native-country
Marginal fitted: income
Generator fitted. Generating 1000 records
Column sampled: age
Column sampled: hours-per-week
Column sampled: workclass
Column sampled: education
Column sampled: marital-status
Column sampled: occupation
Column sampled: relationship
Column sampled: race
Column sampled: sex
Column sampled: native-country
Column sampled: income
Saved synthetic data and generator for 0_MarginalGenerator_epsilon0.1 at /Users/dknoors/Projects/synthesis-dk/crn-synth/results/adult
Running reviewer for 0_MarginalGenerator_epsilon0.1
Running generator PrivBayes
Fitting generator PrivBayes on input data
1/11 - Root of network: marital-sta

  self.values = self.values / row_sum
  self.values = self.values / row_sum


Number of records generated: 1000 / 1000
Synthetic Data Generated

Saved synthetic data and generator for 2_PrivBayes_epsilon1 at /Users/dknoors/Projects/synthesis-dk/crn-synth/results/adult
Running reviewer for 2_PrivBayes_epsilon1
Saved scores at /Users/dknoors/Projects/synthesis-dk/crn-synth/results/adult/reports/scores.csv


In [15]:
# read back the score table written by the benchmark run
scores_benchmark = pd.read_csv(
    filepath_or_buffer=paths.PATH_RESULTS / 'adult/reports/scores.csv'
)
scores_benchmark

Unnamed: 0,metric,0_MarginalGenerator_epsilon0.1,1_PrivBayes_epsilon0.1,2_PrivBayes_epsilon1
0,privacy_DistanceClosestRecord_holdout,0.003101,0.003101,0.003101
1,privacy_DistanceClosestRecord_synth,0.829048,0.874992,0.073154
2,privacy_NearestNeighborDistanceRatio_holdout,0.624207,0.624207,0.624207
3,privacy_NearestNeighborDistanceRatio_synth,0.933264,0.920203,0.774144
4,privacy_CategoricalCAPScore_score,0.977537,0.962037,0.982125


In [72]:
# restore the PrivBayes generator that the benchmark run saved to disk
# NOTE(review): the `.pkl` extension suggests a pickle file — only load files you produced yourself
pb_reload = PrivBayes(epsilon=1).load(
    paths.PATH_RESULTS / 'adult/generators/2_PrivBayes_epsilon1.pkl'
)

# inspect the model stored on the reloaded generator
pb_reload.model_


<BayesianNetwork name='PrivBayes'>
  <Node RV='workclass' description='' states=['Private', 'Self-emp-not-inc', 'Local-gov', '?', 'State-gov', 'Self-emp-inc', 'Federal-gov', 'Without-pay', 'Never-worked'] />
  <Node RV='occupation' description='' states=['?', 'Adm-clerical', 'Armed-Forces', 'Craft-repair', 'Exec-managerial', 'Farming-fishing', 'Handlers-cleaners', 'Machine-op-inspct', 'Other-service', 'Priv-house-serv', 'Prof-specialty', 'Protective-serv', 'Sales', 'Tech-support', 'Transport-moving'] />
  <Node RV='sex' description='' states=['Female', 'Male'] />
  <Node RV='relationship' description='' states=['Husband', 'Not-in-family', 'Other-relative', 'Own-child', 'Unmarried', 'Wife'] />
  <Node RV='marital-status' description='' states=['Divorced', 'Married-AF-spouse', 'Married-civ-spouse', 'Married-spouse-absent', 'Never-married', 'Separated', 'Widowed'] />
  <Node RV='income' description='' states=['<=50K', '>50K'] />
  <Node RV='education' description='' states=['10th', '11th'