In [2]:
import pandas as pd
from snsynth import Synthesizer
from sdv.metadata import SingleTableMetadata
import graphviz

## Preprocessing Block
DP-CTGAN cannot be applied to continuous data. Anonymeter addresses this by binning each continuous column into 50 equal-width bins.

In [7]:
# Load the raw training split and discretise every numeric column.
# Each listed column is replaced by the integer label (1..N_BINS) of the bin
# its value falls into, so the frame handed to the synthesizer has no
# continuous columns left.
N_BINS = 50
NUMERIC_COLUMNS = [
    'age',
    'fnlwgt',
    'education_num',
    'capital_gain',
    'capital_loss',
    'hr_per_week',
]

dataframe = pd.read_parquet('datasets/adults_train.parquet')

# pd.cut with an integer `bins` splits the observed range of the column into
# that many equal-width intervals; `labels` names them 1, 2, ..., N_BINS.
bin_labels = list(range(1, N_BINS + 1))
for column in NUMERIC_COLUMNS:
    dataframe[column] = pd.cut(dataframe[column], bins=N_BINS, labels=bin_labels)

# Describe the binned table with SDV metadata, render that description to an
# image on disk, then persist the binned frame for the synthesis step.
metadata_image_path = 'datasets/adults_train_bin_metadata_stats.png'
binned_parquet_path = 'datasets/adults_train_bin.parquet'

metadata = SingleTableMetadata()
metadata.detect_from_dataframe(dataframe)
metadata.visualize(show_table_details='full', output_filepath=metadata_image_path)

dataframe.to_parquet(binned_parquet_path)

  if _pandas_api.is_sparse(col):


## DP-CTGAN synthesis block

In [11]:

# Reload the binned training data from disk.
dataframe = pd.read_parquet('datasets/adults_train_bin.parquet')

# Keyword arguments forwarded unchanged to the synthesizer factory.
synth_options = dict(
    generator_decay=(10**-5),
    discriminator_decay=(10**-3),
    batch_size=64,
    epochs=100,
    epsilon=32,
    verbose=True,
)

# The first argument selects the synthesizer type (author's note: dpctgan, patectgan).
synth = Synthesizer.create("dpctgan", **synth_options)
synth.fit(dataframe, preprocessor_eps=1.0)

n_synthetic_rows = 1000
dataframe_synth = synth.sample(n_synthetic_rows)

# Author's note: the synthesizer does not have the ability to save/load.
dataframe_synth.to_parquet('datasets/adults_syn_dpctgan.parquet')



Epoch 1, Loss G: 0.6679, Loss D: 1.3900
epsilon is 0.17295241552213905, alpha is 63.0
Epoch 2, Loss G: 0.6655, Loss D: 1.3975
epsilon is 0.1771795990317799, alpha is 63.0
Epoch 3, Loss G: 0.6802, Loss D: 1.3820
epsilon is 0.18140678254142079, alpha is 63.0
Epoch 4, Loss G: 0.6840, Loss D: 1.3924
epsilon is 0.18563396605106164, alpha is 63.0
Epoch 5, Loss G: 0.6893, Loss D: 1.3835
epsilon is 0.18986114956070252, alpha is 63.0
Epoch 6, Loss G: 0.6888, Loss D: 1.3869
epsilon is 0.19408833307034337, alpha is 63.0
Epoch 7, Loss G: 0.6967, Loss D: 1.3950
epsilon is 0.19831551657998422, alpha is 63.0
Epoch 8, Loss G: 0.6957, Loss D: 1.3845
epsilon is 0.2025427000896251, alpha is 63.0
Epoch 9, Loss G: 0.6851, Loss D: 1.3893
epsilon is 0.20676988359926596, alpha is 63.0
Epoch 10, Loss G: 0.6835, Loss D: 1.3935
epsilon is 0.2109970671089068, alpha is 63.0
Epoch 11, Loss G: 0.6842, Loss D: 1.3821
epsilon is 0.2152242506185477, alpha is 63.0
Epoch 12, Loss G: 0.6859, Loss D: 1.3985
epsilon is 0.21

  if _pandas_api.is_sparse(col):
