# Benchmark
Run a suite of metrics to evaluate the utility and privacy of synthetic data.

## Load libraries and define settings

In [1]:
# general dependencies
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
from pathlib import Path

In [2]:
# package
from crnsynth.generators.marginal_generator import MarginalGenerator
from crnsynth.generators.privbayes import PrivBayes

from crnsynth.processing import preprocessing, generalization, postprocessing
from crnsynth.synthesization import synthesization, pipeline
from crnsynth.processing.generalization import NumericGeneralizationMech

from crnsynth.metrics import PRIVACY_METRICS, ALL_METRICS
from crnsynth.benchmark.benchmark import SyntheticDataBenchmark

In [3]:
from examples.synthesization import adult_synthesis

In [4]:
# ignore deprecation warning
warnings.filterwarnings("ignore", category=DeprecationWarning) 

# autoreload changes from local files
%load_ext autoreload
%autoreload 2

# pandas show full output
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 200)

In [5]:
# epsilon: the privacy budget used by differentially private algorithms
EPSILON = 1

# general run settings: random seed, number of parallel jobs, log verbosity
RANDOM_STATE = 42
N_JOBS = 1
VERBOSE = 1

## Load and process dataset

In [6]:
# read the adult data from the CSV path defined in the example module
df_adult = pd.read_csv(filepath_or_buffer=adult_synthesis.PATH_ADULT)

# preview the first five rows
df_adult.head(n=5)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## Generate
Here we show, step by step, how to generate synthetic data.

In [7]:
# preprocess a copy of the raw frame so df_adult itself is not passed along
df = adult_synthesis.preprocess_real_data(df_adult.copy())

# split into a training set and a holdout set (holdout_size=0.2)
df_train, df_holdout = preprocessing.split_train_holdout(
    df,
    target_column='income',
    holdout_size=0.2,
    random_state=RANDOM_STATE,
)

# PrivBayes generator configured with the privacy budget
generator = PrivBayes(epsilon=EPSILON)

# fit the generator on the training split only
generator.fit(df_train)

# request 1000 synthetic records
df_synth = generator.generate(n_records=1000)

# apply the example's post-processing to the synthetic records
df_synth = adult_synthesis.postprocess_synthetic_data(df_synth)

# preview the result
df_synth.head()

1/11 - Root of network: income

2/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 10
Selected attribute: 'relationship' - with parents: ('income',)

3/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 9
Selected attribute: 'marital-status' - with parents: ('relationship', 'income')

4/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 12
Selected attribute: 'sex' - with parents: ('marital-status', 'relationship', 'income')

5/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 20
Selected attribute: 'occupation' - with parents: ('sex', 'income')

6/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 19
Selected attribute: 'workclass' - with parents: ('occupation',)

7/11 - Evaluating next attribute to add to network
Number of AttributeParentPair candidates: 15
Selected attribute: 'educat

  self.values = self.values / row_sum


Number of records generated: 1000 / 1000
Synthetic Data Generated


Unnamed: 0,age,hours-per-week,workclass,education,marital-status,occupation,relationship,race,sex,native-country,income
0,33,40,Private,11th,Never-married,Other-service,Own-child,White,Male,?,<=50K
1,46,40,Federal-gov,HS-grad,Married-civ-spouse,Tech-support,Wife,White,Female,United-States,>50K
2,62,40,Private,Some-college,Divorced,?,Not-in-family,White,Female,United-States,<=50K
3,35,28,Federal-gov,HS-grad,Married-civ-spouse,Prof-specialty,Wife,White,Female,United-States,<=50K
4,28,45,Private,7th-8th,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,United-States,<=50K


In [8]:
# display the privacy metrics shipped with crnsynth and their current parameters;
# PRIVACY_METRICS is already imported from crnsynth.metrics in the package-import
# cell at the top of the notebook, so it is not re-imported here
PRIVACY_METRICS

[DistanceClosestRecord({'encoder': 'ordinal', 'quantile': 0.5, 'metric': 'gower', 'categorical_columns': None}),
 NearestNeighborDistanceRatio({'encoder': 'ordinal', 'quantile': 0.5, 'metric': 'gower', 'n_neighbors': 2, 'categorical_columns': None}),
 CategoricalCAPScore({'encoder': 'ordinal', 'categorical_columns': None, 'frac_sensitive': 0.5, 'random_state': None})]

In [11]:
# re-display the privacy metrics and their parameters
PRIVACY_METRICS 

[DistanceClosestRecord({'encoder': 'ordinal', 'quantile': 0.5, 'metric': 'gower', 'categorical_columns': None}),
 NearestNeighborDistanceRatio({'encoder': 'ordinal', 'quantile': 0.5, 'metric': 'gower', 'n_neighbors': 2, 'categorical_columns': None}),
 CategoricalCAPScore({'encoder': 'ordinal', 'categorical_columns': None, 'frac_sensitive': 0.5, 'random_state': None})]

## Benchmark 
Run a suite of metrics to evaluate the utility and privacy of synthetic data.

In [19]:
# shared keyword arguments: any metric that has one of the parameters below
# gets it updated with the value given here; alternatively, instead of a
# global value, the values can be set for each metric individually
metric_kwargs = {
    'categorical_columns': adult_synthesis.NOMINAL_COLUMNS,
    'frac_sensitive': 0.5,
    'random_state': RANDOM_STATE,
}

# metrics can either be a list or dictionary
metrics = ALL_METRICS

# n_jobs and verbose both come from the settings cell, so changing N_JOBS or
# VERBOSE there is enough to reconfigure the benchmark run
benchmark = SyntheticDataBenchmark(
    metrics=metrics,
    metric_kwargs=metric_kwargs,
    encoder='ordinal',
    n_jobs=N_JOBS,
    verbose=VERBOSE,
)

# compute the metric scores from the training, synthetic and holdout data
scores = benchmark.compute(df_train, df_synth, df_holdout)
scores

Running metric distance_closest_record
Running metric nearest_neighbor_distance_ratio
Running metric cap_categorical_score


{'privacy': {'distance_closest_record': {'holdout': 0.0031013367760234613,
   'synth': 0.09704041370533126},
  'nearest_neighbor_distance_ratio': {'holdout': 0.6242069835342642,
   'synth': 0.7754796691059316},
  'cap_categorical_score': {'score': 0.6880591865623489}}}

After computation scores can also be accessed via the `scores_` attribute.

In [20]:
# the scores from the compute() call above are also stored on the benchmark object
benchmark.scores_

{'privacy': {'distance_closest_record': {'holdout': 0.0031013367760234613,
   'synth': 0.09704041370533126},
  'nearest_neighbor_distance_ratio': {'holdout': 0.6242069835342642,
   'synth': 0.7754796691059316},
  'cap_categorical_score': {'score': 0.6880591865623489}}}

You can see how the metrics within the `SyntheticDataBenchmark` class are affected by the `metric_kwargs` parameters and by the computation on the data.

In [21]:
# inspect the metric instances held by the benchmark, including their parameter values
benchmark.metrics

{'privacy': [DistanceClosestRecord({'encoder': None, 'quantile': 0.5, 'metric': 'gower', 'categorical_columns': ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'income']}),
  NearestNeighborDistanceRatio({'encoder': None, 'quantile': 0.5, 'metric': 'gower', 'n_neighbors': 2, 'categorical_columns': ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'income']}),
  CategoricalCAPScore({'encoder': None, 'categorical_columns': ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'income'], 'frac_sensitive': 0.5, 'random_state': 42})]}