In [1]:
import pandas as pd

from components.data_synthesis import prep_metadata, prep_bin_data
from components.privacy_attack import PrivacyAttack

import pickle
import os.path

# ---------------------------------------------------------------------------
# Round parameters
#   A round is identified by: dataset, round number, synthetic-data type.
# ---------------------------------------------------------------------------

# Attack size as a percentage (10, 50 or 100; 10 is the default).
attack_size = 10

# Datasets iterated over when building test set 2.
# Index hints: 0 = adults, 1 = diabetes, 2 = census1990.
# The full list is kept for reference; census1990 is currently left out:
# datasets = ['adults','diabetes','census1990']
datasets = [
    'adults',
    'diabetes',
]

# Column-case flag (1 = all columns, 0 = subset). Not read anywhere in this cell.
all_columns = True

# Column subset per dataset, used by the "some columns" scenarios.
select_columns = {
    'adults': [
        'age', 'education', 'marital', 'occupation', 'income', 'race', 'sex',
    ],
    'diabetes': [
        'Diabetes_012', 'HighBP', 'HighChol', 'CholCheck', 'Stroke',
        'HeartDiseaseorAttack', 'HvyAlcoholConsump', 'AnyHealthcare',
        'GenHlth', 'MentHlth', 'PhysHlth', 'Sex', 'Age',
    ],
    'census1990': [
        'dAge', 'iMartial', 'iOccup', 'iSex', 'iSchool', 'iMilitary',
        'iIndustry', 'iImmigr', 'iCitizen',
    ],
}

# Synthesis methods (index hints: 0 = ctgan, 1 = dpctgan).
synth_type = ['ctgan', 'dpctgan']

# Synthetic sample sizes as a multiple of the training-set size (index hints: 0, 1, 2).
sample_size = [0.25, 1, 2]

# Epoch setting; alternatives kept for reference. Not read anywhere in this cell.
# epochs = [0.1, 0.5, 1] # 0,1,2
epochs = 0.1

# ---------------------------------------------------------------------------
# Test set 1: adults only.
#   (select columns, then all columns) x synth_type x sample_size
# ---------------------------------------------------------------------------

# Each column mode is built once, so every case of a mode shares the same
# `columns` list object (the subset list is select_columns['adults'] itself).
column_modes = [
    (False, select_columns['adults']),
    (True, []),
]

test_cases_1 = [
    {
        'dataset': 'adults',
        'columns_all': use_all_columns,
        'columns': chosen_columns,
        'synth_type': synthesizer,
        'sample_size': size_factor,
    }
    for use_all_columns, chosen_columns in column_modes
    for synthesizer in synth_type
    for size_factor in sample_size
]

print("Test Cases for Test Set 1\n==========")
for test_case in test_cases_1:
    print(test_case)

# ---------------------------------------------------------------------------
# Test set 2: every dataset in `datasets` x synth_type,
#   always with all columns and sample_size fixed at 1.
# ---------------------------------------------------------------------------

test_cases_2 = [
    {
        'dataset': dataset_name,
        'columns_all': True,
        'columns': [],
        'synth_type': synthesizer,
        'sample_size': 1,
    }
    for dataset_name in datasets
    for synthesizer in synth_type
]

# Single-case set for quick manual runs.
test_case_test = [
    {
        'dataset': 'diabetes',
        'columns_all': True,
        'columns': [],
        'synth_type': 'ctgan',
        'sample_size': 1,
    },
]

<stdin>:1:10: fatal error: 'omp.h' file not found
#include <omp.h>
         ^~~~~~~
1 error generated.


[KeOps] /Users/chhduong/.cache/keops2.1.2/Darwin_CHHDUONG-M-F2GZ_23.4.0_p3.9.6 has been cleaned.
[KeOps] Generating code for formula Sum_Reduction((Var(0,3,0)-Var(1,3,1))|(Var(0,3,0)-Var(1,3,1)),1) ... OK
[pyKeOps] Compiling pykeops cpp cb73cd1bce module ... OK
pyKeOps with numpy bindings is working!
pyKeOps with torch bindings is working!
Test Cases for Test Set 1
{'dataset': 'adults', 'columns_all': False, 'columns': ['age', 'education', 'marital', 'occupation', 'income', 'race', 'sex'], 'synth_type': 'ctgan', 'sample_size': 0.25}
{'dataset': 'adults', 'columns_all': False, 'columns': ['age', 'education', 'marital', 'occupation', 'income', 'race', 'sex'], 'synth_type': 'ctgan', 'sample_size': 1}
{'dataset': 'adults', 'columns_all': False, 'columns': ['age', 'education', 'marital', 'occupation', 'income', 'race', 'sex'], 'synth_type': 'ctgan', 'sample_size': 2}
{'dataset': 'adults', 'columns_all': False, 'columns': ['age', 'education', 'marital', 'occupation', 'income', 'race', 'sex']

In [3]:
# Run the privacy attack for every (round, test case) pair of the chosen test
# set. A pair is skipped when its result pickle already exists, so the cell can
# be interrupted and re-run to resume where it stopped.
test_set = test_cases_2
test_set_num = 2  # number embedded in the synthetic-data and result file names
iterations = range(0,10)
# iterations = [0]
# failed test cases: 
# dpctgan 1-1 0.25 10
# ctgan 1-0 1 50

# Bin count handed to prep_bin_data; assumed sufficient in all cases.
bin_size = 50

# Columns to bin, keyed by (dataset, columns_all).
# census1990 has no entry: it is all categorical and is never binned.
bin_columns_by_case = {
    ('adults', True): ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hr_per_week'],
    ('adults', False): ['age'],
    ('diabetes', True): ['BMI', 'GenHlth', 'MentHlth', 'PhysHlth', 'Age', 'Education', 'Income'],
    ('diabetes', False): ['GenHlth', 'MentHlth', 'PhysHlth', 'Age'],
}

# Create the output directory up front so a missing folder is caught before an
# expensive attack runs, not when its results are about to be written.
results_dir = 'dataset/results'
os.makedirs(results_dir, exist_ok=True)

for i in iterations:
    print("Round: ", i)
    for test_case in test_set:
        dataset_name = test_case['dataset']
        use_all_columns = bool(test_case['columns_all'])
        needs_binning = dataset_name != 'census1990'
        # Fresh list per case so the lookup table is never shared with callees.
        bin_columns = list(bin_columns_by_case.get((dataset_name, use_all_columns), []))

        # The training frame is needed first: its row count is part of the file name.
        train_df = pd.read_parquet('dataset/' + dataset_name + '_train.parquet')
        if not use_all_columns:
            train_df = train_df[test_case['columns']]
        if needs_binning:
            train_df = prep_bin_data(train_df, bin_columns, bin_size)

        dataset_size = train_df.shape[0]
        size = int(dataset_size * test_case['sample_size'])
        file_suffix = "_".join([
            str(test_set_num),
            str(i),
            dataset_name,
            test_case['synth_type'],
            "all" if use_all_columns else "some",
            str(size),
        ])
        print(file_suffix)

        # Built once so the existence check and the write always refer to the same file.
        results_path = results_dir + '/attack_' + str(attack_size) + '_testset' + file_suffix + '.pkl'
        if os.path.isfile(results_path):
            print("Attack Results already exist. Skipping... \n")
            continue

        # Control data is only loaded for cases that will actually be attacked.
        control_df = pd.read_parquet('dataset/' + dataset_name + '_control.parquet')
        if not use_all_columns:
            control_df = control_df[test_case['columns']]
        if needs_binning:
            control_df = prep_bin_data(control_df, bin_columns, bin_size)

        synth_df = pd.read_parquet('dataset/synthetic/testset' + file_suffix + '.parquet')
        metadata = prep_metadata(train_df)

        # Attack
        privacyattack = PrivacyAttack(metadata)
        params = privacyattack.get_default_params()
        params['domias_mem_set_size'] = train_df.shape[0]
        params['domias_reference_set_size'] = control_df.shape[0]
        # Both attack sizes are attack_size percent of the synthetic row count.
        attack_rows = int(synth_df.shape[0]*attack_size/100)
        params['anon_inf_attacks'] = attack_rows
        params['domias_synthetic_sizes'] = attack_rows

        # NOTE: a failing attack propagates and stops the run. An earlier draft
        # caught the error and wrote a
        # 'dataset/results/failed_<attack_size>_testset<suffix>.txt' marker instead.
        results = privacyattack.inference_attack(
            params = params,
            original_data = train_df,
            synth_data = synth_df,
            control_data = control_df,
        )

        # Save results: dump to a temporary file and rename it into place, so an
        # interrupted dump never leaves a truncated .pkl that the skip check
        # above would later mistake for a finished run.
        tmp_path = results_path + '.tmp'
        with open(tmp_path, 'wb') as f:
            pickle.dump(results, f)
        os.replace(tmp_path, results_path)

Round:  0
2_0_adults_ctgan_all_39032
Attack Results already exist. Skipping... 

2_0_adults_dpctgan_all_39032
Attack Results already exist. Skipping... 

2_0_diabetes_ctgan_all_202944
Running Anon Attack


KeyboardInterrupt: 