 ## Runs Code to Test Components to Ensure they are Working

In [13]:
import pandas as pd

from data_synthesis import prep_metadata, prep_bin_data
#if we are using DP-CTGAN, we will need to bin any continuous data
from privacy_attack import drop_columns

from data_synthesis import DataSynthesis
# ran into error when running above: missing OpenDP even though it was already installed
# solution: pip install opendp each time this happens
from privacy_attack import PrivacyAttack
from synth_evaluator import SynthEvaluator

# predata
train_df = pd.read_csv('test_data/adults_train-test.csv')
control_df = pd.read_csv('test_data/adults_control-test.csv')

metadata = prep_metadata(train_df)
metadata_dict = metadata.to_dict()
# for column in metadata_dict['columns']:
#     # print(column, metadata_dict['columns'][column]['sdtype'])
#     if(metadata_dict['columns'][column]['sdtype'] == 'numeric'):
#         train_df[column] = pd.to_numeric(train_df[column])
#         control_df[column] = pd.to_numeric(control_df[column])

synthevaluator = SynthEvaluator(metadata)

## Data Synthesis - CTGAN

In [14]:
synthesizer = DataSynthesis(metadata)
approaches = synthesizer.get_approaches()
print(approaches)
print("Selecing Approach[0]")
params = synthesizer.get_default_params(approaches[0])
print(params)
params['sample_size'] = 5000
params['epochs'] = 10
params['save_synthesizer'] = True
params['save_filepath'] = 'test_data/dataset_adults_train_ctgan_synthesizer.pkl'
print(params)
ctgan_df = synthesizer.synth_data(data=train_df, approach=approaches[0], parameters=params)
print("Synthesis completed. You can view the resultant data in Jupyter:Variables if you are on VS Code.")
ctgan_df.to_parquet('test_data/adults_syn_ctgan.parquet')

['ctgan', 'dpctgan']
Selecing Approach[0]
{'sample_size': 1000, 'enforce_rounding': False, 'epochs': 500, 'verbose': True, 'save_synthesizer': False, 'save_filepath': ''}
{'sample_size': 5000, 'enforce_rounding': False, 'epochs': 10, 'verbose': True, 'save_synthesizer': True, 'save_filepath': 'test_data/dataset_adults_train_ctgan_synthesizer.pkl'}
Epoch 1, Loss G:  1.8581,Loss D:  0.0033
Epoch 2, Loss G:  1.8612,Loss D: -0.0372
Epoch 3, Loss G:  1.8187,Loss D: -0.0149
Epoch 4, Loss G:  1.8110,Loss D: -0.0446
Epoch 5, Loss G:  1.8134,Loss D: -0.1067
Epoch 6, Loss G:  1.7660,Loss D: -0.1147
Epoch 7, Loss G:  1.7606,Loss D: -0.1474
Epoch 8, Loss G:  1.8153,Loss D: -0.2396
Epoch 9, Loss G:  1.7687,Loss D: -0.2525
Epoch 10, Loss G:  1.7151,Loss D: -0.3231
Synthesis completed. You can view the resultant data in Jupyter:Variables if you are on VS Code.


## Data Analysis - CTGAN

In [15]:
synthevaluator.run_data_diagnosis(train_df, ctgan_df)
utility = synthevaluator.run_utility(train_df, ctgan_df)
for column in train_df:
    print(utility[column])
synthevaluator.run_column_diagnosis(train_df, ctgan_df, 'age')
synthevaluator.run_column_diagnosis(train_df, ctgan_df, 'occupation')

=== Quality Report ===
Generating report ...
(1/2) Evaluating Column Shapes: : 100%|██████████| 15/15 [00:00<00:00, 56.92it/s]
(2/2) Evaluating Column Pair Trends: : 100%|██████████| 105/105 [00:01<00:00, 62.56it/s]

Overall Quality Score: 60.17%

Properties:
- Column Shapes: 64.9%
- Column Pair Trends: 55.43%
=== Diagnostic Report ===
Generating report ...
(1/3) Evaluating Coverage: : 100%|██████████| 15/15 [00:00<00:00, 806.79it/s]
(2/3) Evaluating Boundary: : 100%|██████████| 15/15 [00:00<00:00, 1489.14it/s]
(3/3) Evaluating Synthesis: : 100%|██████████| 1/1 [00:47<00:00, 47.67s/it]

Diagnostic Results:

SUCCESS:
✓ The synthetic data covers over 90% of the categories present in the real data
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data
✓ Over 90% of the synthetic rows are not copies of the real data
{'column': 'age', 'coverage': 20.0, 'mds': KstestResult(stat

## Preprocessing - DP-CTGAN

In [16]:
bin_size = 50
columns = [
    'age', 
    'fnlwgt', 
    'education_num', 
    'capital_gain', 
    'capital_loss', 
    'hr_per_week'
]
# #all numerical columns are distributed into 50 bins labeled from 1 to 50
train_df = prep_bin_data(train_df, columns, bin_size)

#do the same for control data
control_df = prep_bin_data(control_df, columns, bin_size)

# then detect metadata
# the metadata for train and control is expected to be the same
metadata = prep_metadata(train_df)

## Data Synthesis - DP-CTGAN

In [17]:
synthesizer = DataSynthesis(metadata)
approaches = synthesizer.get_approaches()
print(approaches)
print("Selecing Approach[1]")
params = synthesizer.get_default_params(approaches[1])
print(params)
params['sample_size'] = 5000
params['epochs'] = 10
dp_ctgan_df = synthesizer.synth_data(data=train_df, approach=approaches[1], parameters=params)
print("Synthesis completed. You can view the resultant data in Jupyter:Variables if you are on VS Code.")
dp_ctgan_df.to_parquet('test_data/adults_syn_dpctgan.parquet')


['ctgan', 'dpctgan']
Selecing Approach[1]
{'sample_size': 1000, 'generator_decay': 1e-05, 'discriminator_decay': 0.001, 'batch_size': 64, 'epochs': 100, 'epsilon': 32, 'verbose': True, 'preprocessor_eps': 1.0}
Epoch 1, Loss G: 0.6847, Loss D: 1.3887
epsilon is 0.050758636885846496, alpha is 63.0
Epoch 2, Loss G: 0.6813, Loss D: 1.3852
epsilon is 0.41099203158650366, alpha is 22.0
Epoch 3, Loss G: 0.6779, Loss D: 1.3941
epsilon is 0.5895550915085035, alpha is 17.0
Epoch 4, Loss G: 0.6801, Loss D: 1.3934
epsilon is 0.7314390555226387, alpha is 15.0
Epoch 5, Loss G: 0.6791, Loss D: 1.3949
epsilon is 0.85295344754532, alpha is 13.0
Epoch 6, Loss G: 0.6809, Loss D: 1.3893
epsilon is 0.9621828864060221, alpha is 12.0
Epoch 7, Loss G: 0.6817, Loss D: 1.3984
epsilon is 1.0628032345112661, alpha is 10.9
Epoch 8, Loss G: 0.6782, Loss D: 1.3957
epsilon is 1.1555857820715685, alpha is 10.5
Epoch 9, Loss G: 0.6798, Loss D: 1.3987
epsilon is 1.2433065858958203, alpha is 10.0
Epoch 10, Loss G: 0.6739

## Data Analysis - DP-CTGAN

In [18]:
synthevaluator.run_data_diagnosis(train_df, dp_ctgan_df)
utility = synthevaluator.run_utility(train_df, dp_ctgan_df)
for column in train_df:
    print(utility[column])
synthevaluator.run_column_diagnosis(train_df, dp_ctgan_df, 'age')
synthevaluator.run_column_diagnosis(train_df, dp_ctgan_df, 'occupation')

=== Quality Report ===
Generating report ...
(1/2) Evaluating Column Shapes: : 100%|██████████| 15/15 [00:00<00:00, 55.03it/s]
(2/2) Evaluating Column Pair Trends: : 100%|██████████| 105/105 [00:01<00:00, 70.06it/s]

Overall Quality Score: 43.29%

Properties:
- Column Shapes: 47.26%
- Column Pair Trends: 39.32%
=== Diagnostic Report ===
Generating report ...
(1/3) Evaluating Coverage: : 100%|██████████| 15/15 [00:00<00:00, 1067.23it/s]
(2/3) Evaluating Boundary: : 100%|██████████| 15/15 [00:00<00:00, 2172.31it/s]
(3/3) Evaluating Synthesis: : 100%|██████████| 1/1 [00:45<00:00, 45.89s/it]

Diagnostic Results:

SUCCESS:
✓ The synthetic data covers over 90% of the categories present in the real data
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data
✓ Over 90% of the synthetic rows are not copies of the real data
{'column': 'age', 'coverage': 20.0, 'mds': KstestResult(st

### Privacy Attack on both Synthetic Sets
### Preprocessing

In [19]:
privacyattack = PrivacyAttack(metadata)

# If any columns have issues running through the attack
# columns_to_drop = ['country']
# if(len(columns_to_drop) > 0):
#     train_df = drop_columns(train_df, columns_to_drop)
#     control_df = drop_columns(control_df, columns_to_drop)
#     ctgan_df = drop_columns(ctgan_df, columns_to_drop)
#     dp_ctgan_df = drop_columns(dp_ctgan_df, columns_to_drop)


### Privacy Attack

In [20]:
params = privacyattack.get_default_params()
params['domias_mem_set_size'] = train_df.shape[0]
params['domias_reference_set_size'] = control_df.shape[0]

# for ctgan
if params['anon_inf_attacks'] > ctgan_df.shape[0]:
    params['anon_inf_attacks'] = ctgan_df.shape[0]
params['domias_synthetic_sizes'] = ctgan_df.shape[0]
ctgan_results = privacyattack.inference_attack(
    params = params,
    original_data = train_df,
    synth_data = ctgan_df,
    control_data = control_df,    
)

# for dpctgan
if params['anon_inf_attacks'] > dp_ctgan_df.shape[0]:
    params['anon_inf_attacks'] = dp_ctgan_df.shape[0]
params['domias_synthetic_sizes'] = dp_ctgan_df.shape[0]
dp_ctgan_results = privacyattack.inference_attack(
    params = params,
    original_data = train_df,
    synth_data = dp_ctgan_df,
    control_data = control_df,    
)

### Results Fields

In [21]:
print("\nn_attacks", ctgan_results['anon_inference']['age']['col'])
print("\nn_attacks", ctgan_results['anon_inference']['age']['results'].n_attacks)
print("\nn_success", ctgan_results['anon_inference']['age']['results'].n_success)
print("\nn_baseline", ctgan_results['anon_inference']['age']['results'].n_baseline)
print("\nn_control", ctgan_results['anon_inference']['age']['results'].n_control)
print("\nattack_rate", ctgan_results['anon_inference']['age']['results'].attack_rate)
print("\nbaseline_rate", ctgan_results['anon_inference']['age']['results'].baseline_rate)
print("\ncontrol_rate", ctgan_results['anon_inference']['age']['results'].control_rate)
print("\nguesses", ctgan_results['anon_inference']['age']['guesses'])
print("\ntargets", ctgan_results['anon_inference']['age']['targets'])
print("\ndistances", ctgan_results['anon_inference']['age']['distances'])
print("\ny_true", ctgan_results['anon_inference']['age']['y_true'])
print("\ny_pred", ctgan_results['anon_inference']['age']['y_pred'])

print("\nn_attacks", ctgan_results['domias']['n_attacks'])
print("\nn_success", ctgan_results['domias']['n_success'])
print("\ny_true", ctgan_results['domias']['y_true'])
print("\ny_pred", ctgan_results['domias']['y_pred'])
print("\nmia_scores", ctgan_results['domias']['mia_scores'])



n_attacks age

n_attacks 100

n_success 1

n_baseline 3

n_control 2

attack_rate SuccessRate(value=0.028126814121422983, error=0.026359382057282336)

baseline_rate SuccessRate(value=0.04738694415728327, error=0.037132420133244345)

control_rate SuccessRate(value=0.037756879139353126, error=0.03225491158919078)

guesses [24, 40, 46, 60, 24, 51, 24, 24, 17, 17, 36, 18, 40, 33, 61, 30, 38, 38, 38, 17, 24, 36, 24, 33, 18, 40, 21, 40, 17, 17, 24, 71, 17, 55, 60, 17, 49, 17, 59, 26, 61, 17, 27, 17, 27, 24, 39, 55, 26, 17, 37, 23, 25, 33, 17, 27, 24, 43, 25, 27, 64, 20, 26, 20, 17, 24, 44, 24, 17, 24, 47, 48, 46, 17, 24, 27, 40, 17, 17, 17, 17, 17, 24, 24, 47, 26, 26, 62, 17, 17, 17, 24, 17, 60, 27, 17, 17, 17, 17, 24]

targets [16, 27, 26, 25, 20, 21, 19, 19, 4, 4, 6, 5, 3, 28, 13, 7, 9, 22, 13, 3, 17, 17, 7, 13, 2, 2, 25, 2, 30, 20, 20, 2, 28, 32, 9, 9, 6, 17, 25, 12, 20, 22, 7, 12, 9, 5, 4, 12, 18, 24, 3, 11, 50, 8, 1, 3, 7, 7, 17, 22, 23, 10, 21, 10, 15, 21, 5, 19, 15, 20, 10, 16, 1, 27

### RESULTS: CTGAN

In [22]:
evaluation_ctgan = synthevaluator.run_defense(ctgan_results)
print("\n", evaluation_ctgan['accuracy'])
print("\n", evaluation_ctgan['pairwise_error'])
print("\n", evaluation_ctgan['gda_defense'])

fnlwgt  could not calculate ROCAUC due to single class in y_true
education_num  could not calculate ROCAUC due to single class in y_true
capital_gain  could not calculate ROCAUC due to single class in y_true
capital_loss  could not calculate ROCAUC due to single class in y_true

 {'anon_inference': {'age': {'rate1': 0.028126814121422983, 'error1': 0.026359382057282336, 'rate2': 0.01, 'rocauc': 0.3434343434343434}, 'type_employer': {'rate1': 0.17257778939037513, 'error1': 0.07162490450232731, 'rate2': 0.16, 'rocauc': 0.45163690476190477}, 'fnlwgt': {'rate1': 0.01849674910349284, 'error1': 0.01849674910349284, 'rate2': 0.0}, 'education': {'rate1': 0.0762771392110737, 'error1': 0.04849101524788551, 'rate2': 0.06, 'rocauc': 0.46631205673758863}, 'education_num': {'rate1': 0.01849674910349284, 'error1': 0.01849674910349284, 'rate2': 0.0}, 'marital': {'rate1': 0.230358179497956, 'error1': 0.08034535565190198, 'rate2': 0.22, 'rocauc': 0.6060606060606061}, 'occupation': {'rate1': 0.06664707419

### RESULTS: DP-CTGAN

In [23]:
evaluation_dp_ctgan = synthevaluator.run_defense(dp_ctgan_results)
print("\n", evaluation_dp_ctgan['accuracy'])
print("\n", evaluation_dp_ctgan['pairwise_error'])
print("\n", evaluation_dp_ctgan['gda_defense'])


 {'anon_inference': {'age': {'rate1': 0.06664707419314356, 'error1': 0.045103395038775584, 'rate2': 0.03, 'rocauc': 0.3659793814432989}, 'type_employer': {'rate1': 0.230358179497956, 'error1': 0.08034535565190198, 'rate2': 0.22, 'rocauc': 0.5571095571095571}, 'fnlwgt': {'rate1': 0.10516733426486412, 'error1': 0.05709479426229897, 'rate2': 0.08, 'rocauc': 0.4436141304347826}, 'education': {'rate1': 0.1244274643007244, 'error1': 0.06188550073007873, 'rate2': 0.11, 'rocauc': 0.5566905005107253}, 'education_num': {'rate1': 0.0762771392110737, 'error1': 0.04849101524788551, 'rate2': 0.06, 'rocauc': 0.5186170212765957}, 'marital': {'rate1': 0.10516733426486412, 'error1': 0.05709479426229897, 'rate2': 0.09, 'rocauc': 0.44932844932844934}, 'occupation': {'rate1': 0.08590720422900384, 'error1': 0.05158794316173117, 'rate2': 0.07, 'rocauc': 0.45238095238095233}, 'relationship': {'rate1': 0.13405752931865456, 'error1': 0.06406346494845966, 'rate2': 0.12, 'rocauc': 0.48910984848484845}, 'race': {