 ## Runs Code to Test Components to Ensure they are Working

In [2]:
import pandas as pd

from data_synthesis import prep_metadata, prep_bin_data
#if we are using DP-CTGAN, we will need to bin any continuous data
from privacy_attack import drop_columns

from data_synthesis import DataSynthesis
# ran into error when running above: missing OpenDP even though it was already installed
# solution: pip install opendp each time this happens
from privacy_attack import PrivacyAttack
from synth_evaluator import SynthEvaluator

# predata
train_df = pd.read_csv('test_data/adults_train-test.csv')
control_df = pd.read_csv('test_data/adults_control-test.csv')

metadata = prep_metadata(train_df)
metadata_dict = metadata.to_dict()
# for column in metadata_dict['columns']:
#     # print(column, metadata_dict['columns'][column]['sdtype'])
#     if(metadata_dict['columns'][column]['sdtype'] == 'numeric'):
#         train_df[column] = pd.to_numeric(train_df[column])
#         control_df[column] = pd.to_numeric(control_df[column])

synthevaluator = SynthEvaluator(metadata)

[KeOps] /Users/chhduong/.cache/keops2.1.2/Darwin_CHHDUONG-M-F2GZ_23.3.0_p3.9.6 has been cleaned.
[KeOps] Generating code for formula Sum_Reduction((Var(0,3,0)-Var(1,3,1))|(Var(0,3,0)-Var(1,3,1)),1) ... OK
[pyKeOps] Compiling pykeops cpp cb73cd1bce module ... 

<stdin>:1:10: fatal error: 'omp.h' file not found
#include <omp.h>
         ^~~~~~~
1 error generated.


OK
pyKeOps with numpy bindings is working!
pyKeOps with torch bindings is working!


## Data Synthesis - CTGAN

In [3]:
synthesizer = DataSynthesis(metadata)
approaches = synthesizer.get_approaches()
print(approaches)
print("Selecing Approach[0]")
params = synthesizer.get_default_params(approaches[0])
print(params)
params['sample_size'] = 75
params['epochs'] = 10
params['save_synthesizer'] = True
params['save_filepath'] = 'test_data/dataset_adults_train_ctgan_synthesizer.pkl'
print(params)
ctgan_df = synthesizer.synth_data(data=train_df, approach=approaches[0], parameters=params)
print("Synthesis completed. You can view the resultant data in Jupyter:Variables if you are on VS Code.")
ctgan_df.to_parquet('test_data/adults_syn_ctgan.parquet')

['ctgan', 'dpctgan']
Selecing Approach[0]
{'sample_size': 1000, 'enforce_rounding': False, 'epochs': 500, 'verbose': True, 'save_synthesizer': False, 'save_filepath': ''}
{'sample_size': 75, 'enforce_rounding': False, 'epochs': 10, 'verbose': True, 'save_synthesizer': True, 'save_filepath': 'test_data/dataset_adults_train_ctgan_synthesizer.pkl'}
Epoch 1, Loss G:  1.9357,Loss D: -0.0028
Epoch 2, Loss G:  1.8902,Loss D: -0.0195
Epoch 3, Loss G:  1.9122,Loss D: -0.0571
Epoch 4, Loss G:  1.8779,Loss D: -0.0897
Epoch 5, Loss G:  1.8398,Loss D: -0.0917
Epoch 6, Loss G:  1.8292,Loss D: -0.1899
Epoch 7, Loss G:  1.8467,Loss D: -0.1769
Epoch 8, Loss G:  1.7989,Loss D: -0.2389
Epoch 9, Loss G:  1.7888,Loss D: -0.3107
Epoch 10, Loss G:  1.7550,Loss D: -0.3085
Synthesis completed. You can view the resultant data in Jupyter:Variables if you are on VS Code.


## Data Analysis - CTGAN

In [4]:
synthevaluator.run_data_diagnosis(train_df, ctgan_df)
utility = synthevaluator.run_utility(train_df, ctgan_df)
for column in train_df:
    print(utility[column])
synthevaluator.run_column_diagnosis(train_df, ctgan_df, 'age')
synthevaluator.run_column_diagnosis(train_df, ctgan_df, 'occupation')

=== Quality Report ===
Generating report ...
(1/2) Evaluating Column Shapes: : 100%|██████████| 15/15 [00:00<00:00, 502.67it/s]
(2/2) Evaluating Column Pair Trends: : 100%|██████████| 105/105 [00:01<00:00, 70.65it/s]

Overall Quality Score: 54.8%

Properties:
- Column Shapes: 62.96%
- Column Pair Trends: 46.64%
=== Diagnostic Report ===
Generating report ...
(1/3) Evaluating Coverage: : 100%|██████████| 15/15 [00:00<00:00, 1580.81it/s]
(2/3) Evaluating Boundary: : 100%|██████████| 15/15 [00:00<00:00, 2904.11it/s]
(3/3) Evaluating Synthesis: : 100%|██████████| 1/1 [00:00<00:00,  1.61it/s]

Diagnostic Results:

SUCCESS:
✓ The synthetic data covers over 90% of the categories present in the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data
✓ Over 90% of the synthetic rows are not copies of the real data

! The synthetic data is missing more than 10% of the numerical ranges present in the real data
{'column': 'age', 'coverage': 0.3, 'mds': Kstest

## Preprocessing - DP-CTGAN

In [5]:
bin_size = 50
columns = [
    'age', 
    'fnlwgt', 
    'education_num', 
    'capital_gain', 
    'capital_loss', 
    'hr_per_week'
]
# #all numerical columns are distributed into 50 bins labeled from 1 to 50
train_df = prep_bin_data(train_df, columns, bin_size)

#do the same for control data
control_df = prep_bin_data(control_df, columns, bin_size)

# then detect metadata
# the metadata for train and control is expected to be the same
metadata = prep_metadata(train_df)

## Data Synthesis - DP-CTGAN

In [6]:
synthesizer = DataSynthesis(metadata)
approaches = synthesizer.get_approaches()
print(approaches)
print("Selecing Approach[1]")
params = synthesizer.get_default_params(approaches[1])
print(params)
params['sample_size'] = 75
params['epochs'] = 10
dp_ctgan_df = synthesizer.synth_data(data=train_df, approach=approaches[1], parameters=params)
print("Synthesis completed. You can view the resultant data in Jupyter:Variables if you are on VS Code.")
dp_ctgan_df.to_parquet('test_data/adults_syn_dpctgan.parquet')


['ctgan', 'dpctgan']
Selecing Approach[1]
{'sample_size': 1000, 'generator_decay': 1e-05, 'discriminator_decay': 0.001, 'batch_size': 64, 'epochs': 100, 'epsilon': 32, 'verbose': True, 'preprocessor_eps': 1.0}
Epoch 1, Loss G: 0.6840, Loss D: 1.3911
epsilon is 0.050758636885846496, alpha is 63.0
Epoch 2, Loss G: 0.6806, Loss D: 1.3940
epsilon is 0.41099203158650366, alpha is 22.0
Epoch 3, Loss G: 0.6853, Loss D: 1.3966
epsilon is 0.5895550915085035, alpha is 17.0
Epoch 4, Loss G: 0.6849, Loss D: 1.3962
epsilon is 0.7314390555226387, alpha is 15.0
Epoch 5, Loss G: 0.6800, Loss D: 1.3938
epsilon is 0.85295344754532, alpha is 13.0
Epoch 6, Loss G: 0.6776, Loss D: 1.3981
epsilon is 0.9621828864060221, alpha is 12.0
Epoch 7, Loss G: 0.6752, Loss D: 1.3949
epsilon is 1.0628032345112661, alpha is 10.9
Epoch 8, Loss G: 0.6797, Loss D: 1.3993
epsilon is 1.1555857820715685, alpha is 10.5
Epoch 9, Loss G: 0.6757, Loss D: 1.3969
epsilon is 1.2433065858958203, alpha is 10.0
Epoch 10, Loss G: 0.6786

## Data Analysis - DP-CTGAN

In [7]:
synthevaluator.run_data_diagnosis(train_df, dp_ctgan_df)
utility = synthevaluator.run_utility(train_df, dp_ctgan_df)
for column in train_df:
    print(utility[column])
synthevaluator.run_column_diagnosis(train_df, dp_ctgan_df, 'age')
synthevaluator.run_column_diagnosis(train_df, dp_ctgan_df, 'occupation')

=== Quality Report ===
Generating report ...
(1/2) Evaluating Column Shapes: : 100%|██████████| 15/15 [00:00<00:00, 356.91it/s]
(2/2) Evaluating Column Pair Trends: : 100%|██████████| 105/105 [00:01<00:00, 82.45it/s]

Overall Quality Score: 40.08%

Properties:
- Column Shapes: 46.15%
- Column Pair Trends: 34.01%
=== Diagnostic Report ===
Generating report ...
(1/3) Evaluating Coverage: : 100%|██████████| 15/15 [00:00<00:00, 1342.55it/s]
(2/3) Evaluating Boundary: : 100%|██████████| 15/15 [00:00<00:00, 2261.65it/s]
(3/3) Evaluating Synthesis: : 100%|██████████| 1/1 [00:00<00:00,  1.69it/s]

Diagnostic Results:

SUCCESS:
✓ The synthetic data covers over 90% of the categories present in the real data
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data
✓ Over 90% of the synthetic rows are not copies of the real data
{'column': 'age', 'coverage': 0.3, 'mds': KstestResult(st

### Privacy Attack on both Synthetic Sets
### Preprocessing

In [8]:
privacyattack = PrivacyAttack(metadata)

# If any columns have issues running through the attack
# columns_to_drop = ['country']
# if(len(columns_to_drop) > 0):
#     train_df = drop_columns(train_df, columns_to_drop)
#     control_df = drop_columns(control_df, columns_to_drop)
#     ctgan_df = drop_columns(ctgan_df, columns_to_drop)
#     dp_ctgan_df = drop_columns(dp_ctgan_df, columns_to_drop)


### Privacy Attack

In [9]:
params = privacyattack.get_default_params()
params['domias_mem_set_size'] = train_df.shape[0]
params['domias_reference_set_size'] = control_df.shape[0]

# for ctgan
if params['anon_inf_attacks'] > ctgan_df.shape[0]:
    params['anon_inf_attacks'] = ctgan_df.shape[0]
params['domias_synthetic_sizes'] = ctgan_df.shape[0]
ctgan_results = privacyattack.inference_attack(
    params = params,
    original_data = train_df,
    synth_data = ctgan_df,
    control_data = control_df,    
)

# for dpctgan
if params['anon_inf_attacks'] > dp_ctgan_df.shape[0]:
    params['anon_inf_attacks'] = dp_ctgan_df.shape[0]
params['domias_synthetic_sizes'] = dp_ctgan_df.shape[0]
dp_ctgan_results = privacyattack.inference_attack(
    params = params,
    original_data = train_df,
    synth_data = dp_ctgan_df,
    control_data = control_df,    
)

{'age': {'col': 'age', 'results': <anonymeter.stats.confidence.EvaluationResults object at 0x14153b4c0>, 'guesses': [38, 38, 41, 41, 17, 17, 83, 17, 41, 41, 41, 41, 41, 41, 41, 25, 22, 25, 26, 19, 19, 51, 22, 35, 35, 19, 41, 19, 19, 41, 41, 41, 83, 44, 35, 17, 19, 41, 52, 44, 41, 22, 19, 36, 52, 83, 25, 59, 25, 35, 17, 60, 59, 52, 44, 41, 41, 41, 41, 41, 64, 19, 41, 17, 19, 17, 41, 52, 17, 40, 51, 22, 25, 83, 17], 'targets': [5, 8, 20, 12, 8, 16, 18, 4, 4, 17, 22, 12, 17, 8, 21, 50, 9, 4, 12, 27, 1, 16, 8, 7, 19, 2, 30, 1, 2, 4, 33, 16, 17, 6, 4, 9, 22, 20, 9, 11, 27, 25, 2, 13, 22, 22, 27, 16, 28, 17, 12, 3, 10, 8, 4, 14, 11, 4, 19, 23, 11, 7, 18, 21, 2, 7, 31, 9, 11, 20, 16, 32, 3, 19, 20], 'distances': [array([4.76708589]), array([5.78956602]), array([4.93022809]), array([6.20696603]), array([5.58018805]), array([5.04417375]), array([5.76008084]), array([5.11968746]), array([5.77120256]), array([4.70600471]), array([2.77950025]), array([4.83643538]), array([4.77170739]), array([4.98

### Results Fields

In [10]:
print("\nn_attacks", ctgan_results['anon_inference']['age']['col'])
print("\nn_attacks", ctgan_results['anon_inference']['age']['results'].n_attacks)
print("\nn_success", ctgan_results['anon_inference']['age']['results'].n_success)
print("\nn_baseline", ctgan_results['anon_inference']['age']['results'].n_baseline)
print("\nn_control", ctgan_results['anon_inference']['age']['results'].n_control)
print("\nattack_rate", ctgan_results['anon_inference']['age']['results'].attack_rate)
print("\nbaseline_rate", ctgan_results['anon_inference']['age']['results'].baseline_rate)
print("\ncontrol_rate", ctgan_results['anon_inference']['age']['results'].control_rate)
print("\nguesses", ctgan_results['anon_inference']['age']['guesses'])
print("\ntargets", ctgan_results['anon_inference']['age']['targets'])
print("\ndistances", ctgan_results['anon_inference']['age']['distances'])
print("\ny_true", ctgan_results['anon_inference']['age']['y_true'])
print("\ny_pred", ctgan_results['anon_inference']['age']['y_pred'])

print("\nn_attacks", ctgan_results['domias']['n_attacks'])
print("\nn_success", ctgan_results['domias']['n_success'])
print("\ny_true", ctgan_results['domias']['y_true'])
print("\ny_pred", ctgan_results['domias']['y_pred'])
print("\nmia_scores", ctgan_results['domias']['mia_scores'])



n_attacks age

n_attacks 75

n_success 0

n_baseline 1

n_control 2

attack_rate SuccessRate(value=0.02436192124140293, error=0.02436192124140293)

baseline_rate SuccessRate(value=0.037045603341632184, error=0.03468805220258352)

control_rate SuccessRate(value=0.04972928544186144, error=0.04238559643573005)

guesses [38, 38, 41, 41, 17, 17, 83, 17, 41, 41, 41, 41, 41, 41, 41, 25, 22, 25, 26, 19, 19, 51, 22, 35, 35, 19, 41, 19, 19, 41, 41, 41, 83, 44, 35, 17, 19, 41, 52, 44, 41, 22, 19, 36, 52, 83, 25, 59, 25, 35, 17, 60, 59, 52, 44, 41, 41, 41, 41, 41, 64, 19, 41, 17, 19, 17, 41, 52, 17, 40, 51, 22, 25, 83, 17]

targets [5, 8, 20, 12, 8, 16, 18, 4, 4, 17, 22, 12, 17, 8, 21, 50, 9, 4, 12, 27, 1, 16, 8, 7, 19, 2, 30, 1, 2, 4, 33, 16, 17, 6, 4, 9, 22, 20, 9, 11, 27, 25, 2, 13, 22, 22, 27, 16, 28, 17, 12, 3, 10, 8, 4, 14, 11, 4, 19, 23, 11, 7, 18, 21, 2, 7, 31, 9, 11, 20, 16, 32, 3, 19, 20]

distances [array([4.76708589]), array([5.78956602]), array([4.93022809]), array([6.20696603]), arr

### RESULTS: CTGAN

In [13]:
evaluation_ctgan = synthevaluator.run_defense(ctgan_results)
print(evaluation_ctgan['accuracy'])

age  could not calculate ROCAUC due to single class in y_true
fnlwgt  could not calculate ROCAUC due to single class in y_true
education_num  could not calculate ROCAUC due to single class in y_true
capital_gain  could not calculate ROCAUC due to single class in y_true
capital_loss  could not calculate ROCAUC due to single class in y_true
{'anon_inference': {'age': {'rate1': 0.02436192124140293, 'error1': 0.02436192124140293, 'rate2': 0.0}, 'type_employer': {'rate1': 0.2019334706446125, 'error1': 0.08735237457932168, 'rate2': 0.18666666666666668, 'rocauc': 0.6756440281030446}, 'fnlwgt': {'rate1': 0.02436192124140293, 'error1': 0.02436192124140293, 'rate2': 0.0}, 'education': {'rate1': 0.15119874224369548, 'error1': 0.0771328952032559, 'rate2': 0.13333333333333333, 'rocauc': 0.5215384615384615}, 'education_num': {'rate1': 0.02436192124140293, 'error1': 0.02436192124140293, 'rate2': 0.0}, 'marital': {'rate1': 0.13851506014346623, 'error1': 0.07408134190402109, 'rate2': 0.12, 'rocauc': 0.

### RESULTS: DP-CTGAN

In [14]:
evaluation_dp_ctgan = synthevaluator.run_defense(dp_ctgan_results)
print(evaluation_dp_ctgan['accuracy'])

{'anon_inference': {'age': {'rate1': 0.06241296754209069, 'error1': 0.04871690328194677, 'rate2': 0.02666666666666667, 'rocauc': 0.6506849315068493}, 'type_employer': {'rate1': 0.15119874224369548, 'error1': 0.0771328952032559, 'rate2': 0.13333333333333333, 'rocauc': 0.5276923076923077}, 'fnlwgt': {'rate1': 0.12583137804323696, 'error1': 0.07078225042149378, 'rate2': 0.10666666666666667, 'rocauc': 0.5457089552238806}, 'education': {'rate1': 0.12583137804323696, 'error1': 0.07078225042149378, 'rate2': 0.10666666666666667, 'rocauc': 0.4337686567164179}, 'education_num': {'rate1': 0.07509664964231995, 'error1': 0.05416322189566562, 'rate2': 0.05333333333333334, 'rocauc': 0.4665492957746479}, 'marital': {'rate1': 0.12583137804323696, 'error1': 0.07078225042149378, 'rate2': 0.10666666666666667, 'rocauc': 0.8442164179104477}, 'occupation': {'rate1': 0.037045603341632184, 'error1': 0.03468805220258352, 'rate2': 0.013333333333333334, 'rocauc': 0.3648648648648649}, 'relationship': {'rate1': 0.1