 ## Smoke Test: Run Each Component to Ensure It Works

In [2]:
import pandas as pd

from data_synthesis import prep_metadata, prep_bin_data
#if we are using DP-CTGAN, we will need to bin any continuous data
from privacy_attack import drop_columns

from data_synthesis import DataSynthesis
# ran into error when running above: missing OpenDP even though it was already installed
# solution: pip install opendp each time this happens
from privacy_attack import PrivacyAttack
from synth_evaluator import SynthEvaluator

# Load the train and control splits of the adults test data.
train_path = 'test_data/adults_train-test.csv'
control_path = 'test_data/adults_control-test.csv'
train_df = pd.read_csv(train_path)
control_df = pd.read_csv(control_path)

# Metadata is derived from the training split only; a plain-dict view is kept alongside it.
metadata = prep_metadata(train_df)
metadata_dict = metadata.to_dict()

# Disabled step: coerce the columns whose sdtype marks them as numeric to a numeric dtype.
# NOTE(review): the check compares against 'numeric'; if this metadata follows SDV
# conventions the sdtype is spelled 'numerical' -- confirm before re-enabling.
# for column in metadata_dict['columns']:
#     if metadata_dict['columns'][column]['sdtype'] == 'numeric':
#         train_df[column] = pd.to_numeric(train_df[column])
#         control_df[column] = pd.to_numeric(control_df[column])

# The evaluator is built once, from the metadata detected above.
synthevaluator = SynthEvaluator(metadata)

[KeOps] /Users/chhduong/.cache/keops2.1.2/Darwin_CHHDUONG-M-F2GZ_23.3.0_p3.9.6 has been cleaned.
[KeOps] Generating code for formula Sum_Reduction((Var(0,3,0)-Var(1,3,1))|(Var(0,3,0)-Var(1,3,1)),1) ... OK
[pyKeOps] Compiling pykeops cpp cb73cd1bce module ... 

<stdin>:1:10: fatal error: 'omp.h' file not found
#include <omp.h>
         ^~~~~~~
1 error generated.


OK
pyKeOps with numpy bindings is working!
pyKeOps with torch bindings is working!


## Data Synthesis - CTGAN

In [3]:
# Train a synthesizer with the first listed approach and write a small synthetic sample to disk.
synthesizer = DataSynthesis(metadata)
approaches = synthesizer.get_approaches()
print(approaches)

# Bind the chosen approach once so the default-parameter lookup and the synthesis call
# below cannot end up referring to different approaches.
ctgan_approach = approaches[0]
print("Selecting Approach[0]")
params = synthesizer.get_default_params(ctgan_approach)
print(params)

# Smoke-test overrides: far smaller than the defaults printed above so the cell runs
# quickly, and the fitted synthesizer is persisted next to the test data.
params['sample_size'] = 75
params['epochs'] = 10
params['save_synthesizer'] = True
params['save_filepath'] = 'test_data/dataset_adults_train_ctgan_synthesizer.pkl'
print(params)

ctgan_df = synthesizer.synth_data(data=train_df, approach=ctgan_approach, parameters=params)
print("Synthesis completed. You can view the resultant data in Jupyter:Variables if you are on VS Code.")

# Keep the synthetic sample on disk so later steps can reuse it without retraining.
ctgan_df.to_parquet('test_data/adults_syn_ctgan.parquet')

['ctgan', 'dpctgan']
Selecing Approach[0]
{'sample_size': 1000, 'enforce_rounding': False, 'epochs': 500, 'verbose': True, 'save_synthesizer': False, 'save_filepath': ''}
{'sample_size': 75, 'enforce_rounding': False, 'epochs': 10, 'verbose': True, 'save_synthesizer': True, 'save_filepath': 'test_data/dataset_adults_train_ctgan_synthesizer.pkl'}
Epoch 1, Loss G:  1.7969,Loss D:  0.0126
Epoch 2, Loss G:  1.8325,Loss D: -0.0321
Epoch 3, Loss G:  1.8107,Loss D: -0.0234
Epoch 4, Loss G:  1.7435,Loss D: -0.0566
Epoch 5, Loss G:  1.7637,Loss D: -0.0993
Epoch 6, Loss G:  1.7849,Loss D: -0.1417
Epoch 7, Loss G:  1.7673,Loss D: -0.1739
Epoch 8, Loss G:  1.7324,Loss D: -0.1974
Epoch 9, Loss G:  1.8173,Loss D: -0.2584
Epoch 10, Loss G:  1.7007,Loss D: -0.2805
Synthesis completed. You can view the resultant data in Jupyter:Variables if you are on VS Code.


## Data Analysis - CTGAN

In [4]:
# Dataset-level diagnosis of the CTGAN sample against the training data.
synthevaluator.run_data_diagnosis(train_df, ctgan_df)

# Utility results are looked up per column, in the training frame's column order.
utility = synthevaluator.run_utility(train_df, ctgan_df)
for column_name in train_df.columns:
    print(utility[column_name])

# Column-level diagnosis for two individual columns.
synthevaluator.run_column_diagnosis(train_df, ctgan_df, 'age')
synthevaluator.run_column_diagnosis(train_df, ctgan_df, 'occupation')

=== Quality Report ===
Generating report ...
(1/2) Evaluating Column Shapes: : 100%|██████████| 15/15 [00:00<00:00, 429.16it/s]
(2/2) Evaluating Column Pair Trends: : 100%|██████████| 105/105 [00:01<00:00, 74.30it/s]

Overall Quality Score: 54.39%

Properties:
- Column Shapes: 60.33%
- Column Pair Trends: 48.44%
=== Diagnostic Report ===
Generating report ...
(1/3) Evaluating Coverage: : 100%|██████████| 15/15 [00:00<00:00, 1310.39it/s]
(2/3) Evaluating Boundary: : 100%|██████████| 15/15 [00:00<00:00, 2292.55it/s]
(3/3) Evaluating Synthesis: : 100%|██████████| 1/1 [00:00<00:00,  1.47it/s]

Diagnostic Results:

SUCCESS:
✓ The synthetic data covers over 90% of the categories present in the real data
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data
✓ Over 90% of the synthetic rows are not copies of the real data
{'column': 'age', 'coverage': 0.3, 'mds': KstestResult(st

## Preprocessing - DP-CTGAN

In [5]:
# Columns to bin before DP-CTGAN training, and the number of bins used for each of them.
# (Original author's note: the numerical columns are distributed into 50 bins labeled 1 to 50.)
bin_size = 50
columns = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hr_per_week']

# Bin the training split first, then the control split, with identical settings.
# NOTE(review): this rebinds train_df / control_df, so every later cell (including any
# that pair them with ctgan_df, which was produced before this cell) sees the binned
# frames, and re-running this cell bins already-binned data -- confirm that is acceptable.
train_df, control_df = [
    prep_bin_data(frame, columns, bin_size) for frame in (train_df, control_df)
]

# Re-detect metadata from the binned training split; the control split is expected to
# share the same metadata.
# NOTE(review): `synthevaluator` was constructed from the earlier metadata and is not
# rebuilt here -- confirm it does not need the refreshed metadata.
metadata = prep_metadata(train_df)

## Data Synthesis - DP-CTGAN

In [6]:
# Train a synthesizer with the second listed approach on the binned training split and
# write a small synthetic sample to disk.
synthesizer = DataSynthesis(metadata)
approaches = synthesizer.get_approaches()
print(approaches)

# Bind the chosen approach once so the default-parameter lookup and the synthesis call
# below cannot end up referring to different approaches.
dpctgan_approach = approaches[1]
print("Selecting Approach[1]")
params = synthesizer.get_default_params(dpctgan_approach)
print(params)

# Smoke-test overrides: fewer samples and epochs than the defaults printed above so the
# cell runs quickly. All other defaults are left untouched.
params['sample_size'] = 75
params['epochs'] = 10

dp_ctgan_df = synthesizer.synth_data(data=train_df, approach=dpctgan_approach, parameters=params)
print("Synthesis completed. You can view the resultant data in Jupyter:Variables if you are on VS Code.")

# Keep the synthetic sample on disk so later steps can reuse it without retraining.
dp_ctgan_df.to_parquet('test_data/adults_syn_dpctgan.parquet')


['ctgan', 'dpctgan']
Selecing Approach[1]
{'sample_size': 1000, 'generator_decay': 1e-05, 'discriminator_decay': 0.001, 'batch_size': 64, 'epochs': 100, 'epsilon': 32, 'verbose': True, 'preprocessor_eps': 1.0}
Epoch 1, Loss G: 0.6810, Loss D: 1.3895
epsilon is 0.050758636885846496, alpha is 63.0
Epoch 2, Loss G: 0.6840, Loss D: 1.3906
epsilon is 0.41099203158650366, alpha is 22.0
Epoch 3, Loss G: 0.6829, Loss D: 1.3906
epsilon is 0.5895550915085035, alpha is 17.0
Epoch 4, Loss G: 0.6810, Loss D: 1.3961
epsilon is 0.7314390555226387, alpha is 15.0
Epoch 5, Loss G: 0.6797, Loss D: 1.3945
epsilon is 0.85295344754532, alpha is 13.0
Epoch 6, Loss G: 0.6826, Loss D: 1.3934
epsilon is 0.9621828864060221, alpha is 12.0
Epoch 7, Loss G: 0.6808, Loss D: 1.3955
epsilon is 1.0628032345112661, alpha is 10.9
Epoch 8, Loss G: 0.6762, Loss D: 1.3938
epsilon is 1.1555857820715685, alpha is 10.5
Epoch 9, Loss G: 0.6718, Loss D: 1.3907
epsilon is 1.2433065858958203, alpha is 10.0
Epoch 10, Loss G: 0.6749

## Data Analysis - DP-CTGAN

In [7]:
# Dataset-level diagnosis of the DP-CTGAN sample against the training data.
synthevaluator.run_data_diagnosis(train_df, dp_ctgan_df)

# Utility results are looked up per column, in the training frame's column order.
utility = synthevaluator.run_utility(train_df, dp_ctgan_df)
for column_name in train_df.columns:
    print(utility[column_name])

# Column-level diagnosis for two individual columns.
synthevaluator.run_column_diagnosis(train_df, dp_ctgan_df, 'age')
synthevaluator.run_column_diagnosis(train_df, dp_ctgan_df, 'occupation')

=== Quality Report ===
Generating report ...
(1/2) Evaluating Column Shapes: : 100%|██████████| 15/15 [00:00<00:00, 439.56it/s]
(2/2) Evaluating Column Pair Trends: : 100%|██████████| 105/105 [00:01<00:00, 94.15it/s]

Overall Quality Score: 42.8%

Properties:
- Column Shapes: 49.0%
- Column Pair Trends: 36.61%
=== Diagnostic Report ===
Generating report ...
(1/3) Evaluating Coverage: : 100%|██████████| 15/15 [00:00<00:00, 1207.41it/s]
(2/3) Evaluating Boundary: : 100%|██████████| 15/15 [00:00<00:00, 2288.63it/s]
(3/3) Evaluating Synthesis: : 100%|██████████| 1/1 [00:00<00:00,  1.63it/s]

Diagnostic Results:

SUCCESS:
✓ The synthetic data covers over 90% of the categories present in the real data
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data
✓ Over 90% of the synthetic rows are not copies of the real data
{'column': 'age', 'coverage': 0.3, 'mds': KstestResult(stat

### Privacy Attack on Both Synthetic Sets
### Preprocessing

In [8]:
# Build the attack driver from whatever `metadata` currently holds.
# NOTE(review): when the notebook runs top to bottom, `metadata` at this point is the one
# re-detected after binning in the DP-CTGAN preprocessing cell -- confirm that is also
# the right metadata for attacking the CTGAN sample.
privacyattack = PrivacyAttack(metadata)

# Optional workaround, currently disabled: if any columns have issues running through
# the attack, list them in columns_to_drop and remove them from all four frames so the
# real, control and synthetic data keep the same set of columns.
# columns_to_drop = ['country']
# if(len(columns_to_drop) > 0):
#     train_df = drop_columns(train_df, columns_to_drop)
#     control_df = drop_columns(control_df, columns_to_drop)
#     ctgan_df = drop_columns(ctgan_df, columns_to_drop)
#     dp_ctgan_df = drop_columns(dp_ctgan_df, columns_to_drop)


### Privacy Attack

In [9]:
# Settings shared by both attack runs: the member and reference set sizes follow the
# current train and control frames.
params = privacyattack.get_default_params()
params['domias_mem_set_size'] = train_df.shape[0]
params['domias_reference_set_size'] = control_df.shape[0]


def _run_inference_attack(synth_df, base_params):
    """Run the inference attack against one synthetic frame.

    A per-run copy of ``base_params`` is adjusted so that size settings derived from one
    synthetic frame never carry over into the run for another frame.

    Args:
        synth_df: Synthetic DataFrame to attack.
        base_params: Attack parameters shared by all runs; not modified here.

    Returns:
        Whatever ``privacyattack.inference_attack`` returns for this synthetic frame.
    """
    run_params = dict(base_params)
    synth_rows = synth_df.shape[0]
    # Cap the configured number of attacks at the number of synthetic rows.
    run_params['anon_inf_attacks'] = min(run_params['anon_inf_attacks'], synth_rows)
    run_params['domias_synthetic_sizes'] = synth_rows
    return privacyattack.inference_attack(
        params=run_params,
        original_data=train_df,
        synth_data=synth_df,
        control_data=control_df,
    )


# NOTE(review): the last recorded run of this cell ended in
# "TypeError: unhashable type: 'list'"; the cause is not visible from this notebook.
# One thing to check: ctgan_df was synthesised before train_df / control_df were binned.

# for ctgan
ctgan_results = _run_inference_attack(ctgan_df, params)

# for dpctgan
dp_ctgan_results = _run_inference_attack(dp_ctgan_df, params)

TypeError: unhashable type: 'list'

### RESULTS: CTGAN

### RESULTS: DP-CTGAN