In [1]:
from pathlib import Path

import pandas as pd

from components.data_synthesis import prep_metadata, prep_bin_data
from components.data_synthesis import DataSynthesis

### Test Case Selection

In [2]:
# ---------------------------------------------------------------------------
# Scenario configuration
# ---------------------------------------------------------------------------
# Datasets available for synthesis (list positions 0, 1, 2).
datasets = ['adults', 'diabetes', 'census1990']

# Column-mode switch (1 = all columns, 0 = selected subset).
all_columns = True 

# Per-dataset column subsets used when a scenario does not take all columns.
select_columns = dict(
    adults=['age', 'education', 'marital', 'occupation', 'income', 'race', 'sex'],
    diabetes=['Diabetes_012', 'HighBP', 'HighChol', 'CholCheck', 'Stroke', 'HeartDiseaseorAttack', 'HvyAlcoholConsump', 'AnyHealthcare', 'GenHlth', 'MentHlth', 'PhysHlth', 'Sex', 'Age'],
    census1990=['dAge', 'iMartial', 'iOccup', 'iSex', 'iSchool', 'iMilitary', 'iIndustry', 'iImmigr', 'iCitizen'],
)

# Synthesis methods (positions 0, 1).
synth_type = ['ctgan', 'dpctgan']
# Sample sizes, expressed as a fraction of the training-set size (positions 0, 1, 2).
sample_size = [0.25, 1, 2]
# Single epoch multiplier currently in use; a sweep over [0.1, 0.5, 1] is the
# alternative that was considered.
epochs = 0.1

# ---------------------------------------------------------------------------
# Test set 1
#   Adults only, CTGAN vs DPCTGAN,
#   select columns first and then all columns,
#   every configured sample size.
# ---------------------------------------------------------------------------
test_cases_1 = []
for use_all_columns in (False, True):
    # One column list per column mode, shared by every scenario of that mode.
    columns_for_mode = [] if use_all_columns else select_columns['adults']
    for method in synth_type:
        for fraction in sample_size:
            test_cases_1.append(dict(
                dataset='adults',
                columns_all=use_all_columns,
                columns=columns_for_mode,
                synth_type=method,
                sample_size=fraction,
            ))

print("Test Cases for Test Set 1\n==========")
for test_case in test_cases_1:
    print(test_case)

# ---------------------------------------------------------------------------
# Test set 2
#   Adults vs Diabetes vs US Census,
#   CTGAN vs DPCTGAN,
#   all columns,
#   100% sample size.
# ---------------------------------------------------------------------------
test_cases_2 = [
    {
        'dataset': dataset_name,
        'columns_all': True,
        'columns': [],
        'synth_type': method,
        'sample_size': 1,
    }
    for dataset_name in datasets
    for method in synth_type
]

print("\nTest Cases for Test Set 2\n==========")
for test_case in test_cases_2:
    print(test_case)

Test Cases for Test Set 1
{'dataset': 'adults', 'columns_all': False, 'columns': ['age', 'education', 'marital', 'occupation', 'income', 'race', 'sex'], 'synth_type': 'ctgan', 'sample_size': 0.25}
{'dataset': 'adults', 'columns_all': False, 'columns': ['age', 'education', 'marital', 'occupation', 'income', 'race', 'sex'], 'synth_type': 'ctgan', 'sample_size': 1}
{'dataset': 'adults', 'columns_all': False, 'columns': ['age', 'education', 'marital', 'occupation', 'income', 'race', 'sex'], 'synth_type': 'ctgan', 'sample_size': 2}
{'dataset': 'adults', 'columns_all': False, 'columns': ['age', 'education', 'marital', 'occupation', 'income', 'race', 'sex'], 'synth_type': 'dpctgan', 'sample_size': 0.25}
{'dataset': 'adults', 'columns_all': False, 'columns': ['age', 'education', 'marital', 'occupation', 'income', 'race', 'sex'], 'synth_type': 'dpctgan', 'sample_size': 1}
{'dataset': 'adults', 'columns_all': False, 'columns': ['age', 'education', 'marital', 'occupation', 'income', 'race', 'sex'

### Data Synthesis of Test Set 1

In [4]:
# Test Set 1: for every round and every scenario in `test_cases_1`, train a
# synthesizer on the Adults training data and write the synthetic sample to
# dataset/synthetic/testset1_<round>_<dataset>_<method>_<all|some>_<n>.parquet
adults_df = pd.read_parquet('dataset/adults_train.parquet')
dataset_size = adults_df.shape[0]

range_rounds = range(1,10)

# Create the output folder before any training starts, so a missing directory
# fails here instead of after a finished synthesis run.
output_dir = Path('dataset/synthetic')
output_dir.mkdir(parents=True, exist_ok=True)

known_synth_types = ('ctgan', 'dpctgan')

for i in range_rounds:
    print("Round: ", i)
    for test_case in test_cases_1:
        train_testcase = adults_df
        if not test_case['columns_all']:
            train_testcase = train_testcase[test_case['columns']]
        print("\nTest Case: ", test_case)
        print("===============")

        approach = test_case['synth_type']
        if approach not in known_synth_types:
            print("\nUnknown Synth Type, Skipping...\n")
            continue

        if approach == 'dpctgan':
            # The dpctgan path bins these columns before training; the ctgan
            # path uses the data as loaded.
            bin_size = 50
            if test_case['columns_all']:
                bin_columns = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hr_per_week']
            else:
                bin_columns = ['age']
            # Bin a copy: in the all-columns case train_testcase *is* adults_df,
            # and it is not visible here whether prep_bin_data modifies its
            # input in place (TODO confirm). The copy keeps later scenarios and
            # rounds training on the unbinned data either way.
            train_testcase = prep_bin_data(train_testcase.copy(), bin_columns, bin_size)

        metadata = prep_metadata(train_testcase)
        synthesizer = DataSynthesis(metadata)
        params = synthesizer.get_default_params(approach)
        params['sample_size'] = int(dataset_size * test_case['sample_size'])
        params['verbose'] = False
        # int() truncates, so a small default scaled by `epochs` could reach 0;
        # always train for at least one epoch.
        params['epochs'] = max(1, int(params['epochs'] * epochs))
        print(params)
        print("Running Synthesis:::")
        file_suffix = "_".join([
            str(i),
            test_case['dataset'],
            approach,
            "all" if test_case["columns_all"] else "some",
            str(params['sample_size']),
        ])
        print(file_suffix)
        synthetic_df = synthesizer.synth_data(data=train_testcase, approach=approach, parameters=params)
        synthetic_df.to_parquet(output_dir / ('testset1_' + file_suffix + '.parquet'))

print("\n\nDONE!")

Round:  1

Test Case:  {'dataset': 'adults', 'columns_all': False, 'columns': ['age', 'education', 'marital', 'occupation', 'income', 'race', 'sex'], 'synth_type': 'ctgan', 'sample_size': 0.25}
{'sample_size': 9758, 'enforce_rounding': False, 'epochs': 50, 'verbose': False, 'save_synthesizer': False, 'save_filepath': ''}
Running Synthesis:::
1_adults_ctgan_some_9758

Test Case:  {'dataset': 'adults', 'columns_all': False, 'columns': ['age', 'education', 'marital', 'occupation', 'income', 'race', 'sex'], 'synth_type': 'ctgan', 'sample_size': 1}
{'sample_size': 39032, 'enforce_rounding': False, 'epochs': 50, 'verbose': False, 'save_synthesizer': False, 'save_filepath': ''}
Running Synthesis:::
1_adults_ctgan_some_39032

Test Case:  {'dataset': 'adults', 'columns_all': False, 'columns': ['age', 'education', 'marital', 'occupation', 'income', 'race', 'sex'], 'synth_type': 'ctgan', 'sample_size': 2}
{'sample_size': 78064, 'enforce_rounding': False, 'epochs': 50, 'verbose': False, 'save_synt

### Data Synthesis of Test Set 2

In [29]:
# Test Set 2: for every scenario in `test_cases_2`, train a synthesizer on the
# matching training set and write the synthetic sample to
# dataset/synthetic/testset2_<round>_<dataset>_<method>_<all|some>_<n>.parquet
adults_df = pd.read_parquet('dataset/adults_train.parquet')
diabetes_df = pd.read_parquet('dataset/diabetes_train.parquet')
census1990_df = pd.read_parquet('dataset/census1990_train.parquet')

# Training frame for each dataset name a scenario may reference.
train_frames = {
    'adults': adults_df,
    'diabetes': diabetes_df,
    'census1990': census1990_df,
}

# Columns binned before dpctgan training, keyed by (dataset, columns_all).
# census1990 has no entry: its data is all categorical, so no binning is needed.
bin_columns_by_case = {
    ('adults', True): ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hr_per_week'],
    ('adults', False): ['age'],
    ('diabetes', True): ['BMI', 'GenHlth', 'MentHlth', 'PhysHlth', 'Age', 'Education', 'Income'],
    ('diabetes', False): ['GenHlth', 'MentHlth', 'PhysHlth', 'Age'],
}
# Assumed to be a sufficient bin count in all cases.
bin_size = 50

# True  -> train every model for a single epoch (quick smoke test; this is the
#          behaviour the cell has had so far).
# False -> scale the synthesizer's default epochs by `epochs`, as the Test Set 1
#          synthesis cell does.
quick_test = True

# Round index used in the output file names; only a single round is run here.
i = 0

# Create the output folder before any training starts, so a missing directory
# fails here instead of after a finished synthesis run.
output_dir = Path('dataset/synthetic')
output_dir.mkdir(parents=True, exist_ok=True)

known_synth_types = ('ctgan', 'dpctgan')

for test_case in test_cases_2:
    dataset_name = test_case['dataset']
    if dataset_name not in train_frames:
        print("Error: Unknown Dataset")
        break
    train_testcase = train_frames[dataset_name]
    if not test_case['columns_all']:
        train_testcase = train_testcase[test_case['columns']]
    dataset_size = train_testcase.shape[0]
    print("\nTest Case: ", test_case)
    print("===============")

    approach = test_case['synth_type']
    if approach not in known_synth_types:
        print("\nUnknown Synth Type, Skipping...\n")
        continue

    if approach == 'dpctgan' and dataset_name != 'census1990':
        bin_columns = bin_columns_by_case.get((dataset_name, test_case['columns_all']), [])
        # Bin a copy: in the all-columns case train_testcase is the loaded
        # frame itself, and it is not visible here whether prep_bin_data
        # modifies its input in place (TODO confirm). The copy keeps the
        # loaded frames unbinned for the other scenarios either way.
        train_testcase = prep_bin_data(train_testcase.copy(), bin_columns, bin_size)

    metadata = prep_metadata(train_testcase)
    synthesizer = DataSynthesis(metadata)
    params = synthesizer.get_default_params(approach)
    if quick_test:
        params['epochs'] = 1
    else:
        # int() truncates, so a small default scaled by `epochs` could reach 0;
        # always train for at least one epoch.
        params['epochs'] = max(1, int(params['epochs'] * epochs))
    params['sample_size'] = int(dataset_size * test_case['sample_size'])
    params['verbose'] = False
    print(params)
    print("Running Synthesis:::")
    file_suffix = "_".join([
        str(i),
        dataset_name,
        approach,
        "all" if test_case["columns_all"] else "some",
        str(params['sample_size']),
    ])
    print(file_suffix)
    synthetic_df = synthesizer.synth_data(data=train_testcase, approach=approach, parameters=params)
    synthetic_df.to_parquet(output_dir / ('testset2_' + file_suffix + '.parquet'))

print("\n\nDONE!")


Test Case:  {'dataset': 'adults', 'columns_all': True, 'columns': [], 'synth_type': 'ctgan', 'sample_size': 1}
{'sample_size': 39032, 'enforce_rounding': False, 'epochs': 1, 'verbose': False, 'save_synthesizer': False, 'save_filepath': ''}
Running Synthesis:::
0_adults_ctgan_all_39032

Test Case:  {'dataset': 'adults', 'columns_all': True, 'columns': [], 'synth_type': 'dpctgan', 'sample_size': 1}
{'sample_size': 39032, 'generator_decay': 1e-05, 'discriminator_decay': 0.001, 'batch_size': 64, 'epochs': 1, 'epsilon': 32, 'verbose': False, 'preprocessor_eps': 1.0}
Running Synthesis:::
0_adults_dpctgan_all_39032

Test Case:  {'dataset': 'diabetes', 'columns_all': True, 'columns': [], 'synth_type': 'ctgan', 'sample_size': 1}
{'sample_size': 202944, 'enforce_rounding': False, 'epochs': 1, 'verbose': False, 'save_synthesizer': False, 'save_filepath': ''}
Running Synthesis:::
0_diabetes_ctgan_all_202944

Test Case:  {'dataset': 'diabetes', 'columns_all': True, 'columns': [], 'synth_type': 'dp