In [10]:
import os
import pickle

import pandas as pd
import xlsxwriter

from components.data_synthesis import prep_metadata, prep_bin_data
from components.synth_evaluator import SynthEvaluator

attack_size = 10
select_columns = {
    'adults': ['age', 'education', 'marital', 'occupation', 'income', 'race', 'sex'],
}

# Synthesis methods to enumerate.
synth_type = ['ctgan', 'dpctgan']
# Sample-size factors to enumerate.
sample_size = [0.25, 1, 2]

dataset_name = 'adults'

# (columns_all, columns) pairs: first the selected-column subset, then the
# "all columns" mode, which carries an empty column list.
column_modes = [
    (False, select_columns[dataset_name]),
    (True, []),
]

# One test case per (column mode, synthesis method, sample size) combination,
# with sample size varying fastest and column mode slowest.
test_cases_1 = [
    {
        'dataset': dataset_name,
        'columns_all': use_all_columns,
        'columns': chosen_columns,
        'synth_type': method,
        'sample_size': factor,
    }
    for use_all_columns, chosen_columns in column_modes
    for method in synth_type
    for factor in sample_size
]

In [16]:
# Export the attack-accuracy, defense and utility evaluation of every test case
# to 'evaluation.xlsx'. The data-preparation steps are copied from attack.ipynb
# so that the same training frame and file suffix are retrieved.
test_set_num = 1
iterations = range(0,1)
bin_size = 50  # assume this bin size is sufficient in all cases and pandas bins efficiently
results_dir = 'dataset/results/'

# Sheets written below. Every row starts with the same identifying fields:
#   suffix, iteration, dataset, synth type, columns, synth size, col type
# followed by the sheet-specific values:
#   accuracy: rate1, error1, rate2   (one row per selected column + one 'domias' row)
#   defense:  defense score          (one row per selected column + one 'domias' row)
#   utility:  coverage, mds, wasKSTest (one row per selected column)
common_header = ['suffix', 'iteration', 'dataset', 'synth type', 'columns', 'synth size', 'col type']


def _write_rows(sheet, first_row, rows):
    """Write each item of `rows` as one worksheet row, starting at `first_row`, column 0.

    Returns the index of the next unused row so the caller can keep a running count.
    """
    for offset, row in enumerate(rows):
        sheet.write_row(first_row + offset, 0, row)
    return first_row + len(rows)


# The context manager closes (and thereby saves) the workbook even if a test
# case raises part-way through; previously close() was only reached on success.
with xlsxwriter.Workbook('evaluation.xlsx') as book:
    sheet1 = book.add_worksheet("accuracy")
    sheet1.write_row(0, 0, common_header + ['rate1', 'error1', 'rate2'])
    sheet2 = book.add_worksheet("defense")
    sheet2.write_row(0, 0, common_header + ['defense score'])
    sheet3 = book.add_worksheet("utility")
    sheet3.write_row(0, 0, common_header + ['coverage', 'mds', 'wasKSTest'])

    # Row 0 holds the header on every sheet, so data starts at row 1.
    row_count_sheet1 = 1
    row_count_sheet2 = 1
    row_count_sheet3 = 1

    for i in iterations:
        for test_case in test_cases_1:
            dataset = test_case['dataset']
            train_df = pd.read_parquet('dataset/' + dataset + '_train.parquet')

            # account for all columns vs some columns
            if not test_case['columns_all']:
                train_df = train_df[test_case['columns']]

            # account for bin sizing
            if dataset == 'adults':
                if test_case['columns_all']:
                    bin_columns = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hr_per_week']
                else:
                    bin_columns = ['age']
            else:
                bin_columns = []
            if dataset != 'census1990':
                train_df = prep_bin_data(train_df, bin_columns, bin_size)

            # Synthetic size is the (binned) training row count scaled by the sample-size factor.
            size = int(train_df.shape[0] * test_case['sample_size'])
            columns_all_string = "all" if test_case['columns_all'] else "some"
            file_suffix = "_".join([
                str(test_set_num),
                str(i),
                dataset,
                test_case['synth_type'],
                columns_all_string,
                str(size),
            ])
            print("\n==========\n", file_suffix)

            attack_results_path = results_dir + 'attack_' + str(attack_size) + '_testset' + file_suffix + '.pkl'
            if not os.path.isfile(attack_results_path):
                print("No Attack Result Available. Skipping... \n")
                continue

            metadata = prep_metadata(train_df)

            # load attack results
            # NOTE(security): pickle.load can execute arbitrary code; only load
            # result files that were produced by this project.
            with open(attack_results_path, 'rb') as f:
                results = pickle.load(f)

            # Evaluation
            synthevaluator = SynthEvaluator(metadata)
            defense_evaluation = synthevaluator.run_defense(results)

            # Identifying fields shared by every row written for this test case.
            row_prefix = [file_suffix, i, dataset, test_case['synth_type'], columns_all_string, str(size)]
            # Report on the selected columns of the test case's own dataset
            # (previously hard-coded to 'adults').
            eval_columns = select_columns[dataset]

            # --- accuracy sheet ---
            accuracy = defense_evaluation['accuracy']
            accuracy_keys = ('rate1', 'error1', 'rate2')
            accuracy_rows = [
                row_prefix + [col] + [accuracy['anon_inference'][col][key] for key in accuracy_keys]
                for col in eval_columns
            ]
            accuracy_rows.append(
                row_prefix + ['domias'] + [accuracy['domias'][key] for key in accuracy_keys]
            )
            row_count_sheet1 = _write_rows(sheet1, row_count_sheet1, accuracy_rows)

            # --- defense sheet ---
            gda_defense = defense_evaluation['gda_defense']
            gdadefense_rows = [
                row_prefix + [col, gda_defense['anon_inference'][col]]
                for col in eval_columns
            ]
            gdadefense_rows.append(row_prefix + ['domias', gda_defense['domias']])
            row_count_sheet2 = _write_rows(sheet2, row_count_sheet2, gdadefense_rows)

            # --- utility sheet ---
            synth_df = pd.read_parquet('dataset/synthetic/testset' + file_suffix + '.parquet')
            utility_evaluation = synthevaluator.run_utility(train_df, synth_df)

            # Utility values are written as text (rounded where numeric), unlike
            # the numeric cells on the other two sheets.
            utility_rows = [
                row_prefix + [
                    col,
                    str(round(utility_evaluation[col]['coverage'], 2)),
                    str(round(utility_evaluation[col]['mds']['mds'], 4)),
                    str(utility_evaluation[col]['mds']['kstest']),
                ]
                for col in eval_columns
            ]
            row_count_sheet3 = _write_rows(sheet3, row_count_sheet3, utility_rows)





 1_0_adults_ctgan_some_9758

 1_0_adults_ctgan_some_39032

 1_0_adults_ctgan_some_78064

 1_0_adults_dpctgan_some_9758

 1_0_adults_dpctgan_some_39032

 1_0_adults_dpctgan_some_78064

 1_0_adults_ctgan_all_9758
fnlwgt  could not calculate ROCAUC due to single class in y_true

 1_0_adults_ctgan_all_39032
fnlwgt  could not calculate ROCAUC due to single class in y_true

 1_0_adults_ctgan_all_78064
fnlwgt  could not calculate ROCAUC due to single class in y_true

 1_0_adults_dpctgan_all_9758

 1_0_adults_dpctgan_all_39032

 1_0_adults_dpctgan_all_78064
