In [None]:
!pip install synthcity
!pip uninstall -y torchaudio torchdata
!pip install openml

In [None]:
# source code

import os

code_path = 'code/'

# Project scripts that are executed into the notebook namespace, in this order.
# NOTE: exec() runs arbitrary code -- only point `code_path` at trusted files.
source_files = [
    # utility functions
    'utility_functions_syn_tab_sjppds_for_icml_2025.py',
    # additional utility functions
    'utility_functions_additional_for_icml_2025.py',
    # the synth_tab_sjppds method synthcity plugin
    'syn_tab_sjppds_synthcity_plugin_for_icml_2025.py',
]

for source_name in source_files:
    file_path = os.path.join(code_path, source_name)
    with open(os.path.expanduser(file_path)) as file:
        source_text = file.read()
    # Compile with the real path so tracebacks name the originating file
    # instead of the anonymous "<string>" that a bare exec(str) reports.
    exec(compile(source_text, file_path, 'exec'), globals())

In [None]:
# Register the custom generator with synthcity's plugin loader.
from synthcity.plugins import Plugins

generators = Plugins()

# Expose SynTabSjppdsPlugin under the name "syn_tab_sjppds".
# NOTE(review): SynTabSjppdsPlugin is not defined in this notebook; presumably it
# comes from the plugin script sourced in the first code cell -- confirm.
generators.add("syn_tab_sjppds", SynTabSjppdsPlugin)

In [None]:
from sklearn.datasets import fetch_openml

# Fetch version 1 of the OpenML "adult" dataset as pandas objects.
openml_data = fetch_openml(name="adult", version=1, as_frame=True)

X = openml_data.data    # feature columns (pandas DataFrame)
y = openml_data.target  # label column (pandas Series)

# Append the label as a "target" column, discard rows with missing values,
# then apply the project's adult-specific preprocessing.
X["target"] = y
X = process_adult_data(X.dropna())

# Column positions: 2 and 4 are treated as numeric, the other 13 of the
# 15 columns (label included) as categorical.
num_idx = [2, 4]
cat_idx = [position for position in range(15) if position not in num_idx]

X = enforce_dtypes(dat=X, num_variables=num_idx, cat_variables=cat_idx)

# Split the data (fixed seed).
aux = train_test_data_split(X, my_seed=123)
X_train, X_test = aux["X_train"], aux["X_test"]

In [None]:
# Import explicitly so this cell does not depend on a sourced script having
# leaked `GenericDataLoader` into the notebook namespace.
from synthcity.plugins.core.dataloader import GenericDataLoader

# Wrap the train and test frames in synthcity data loaders; the "target"
# column is declared as the prediction target in both.
loader_train = GenericDataLoader(
    X_train,
    target_column = 'target'
)

loader_test = GenericDataLoader(
    X_test,
    target_column = 'target'
)

In [None]:
from synthcity.benchmark import Benchmarks

# Directory that receives the benchmark summary CSV files written below.
out_path = 'outputs/adult/'

In [None]:
# Create the results directory up front: discovering that it is missing only
# after the (long) benchmark has finished would make the CSV export fail.
os.makedirs(out_path, exist_ok=True)

# One entry per synthesizer: (label used in the results, synthcity plugin name,
# plugin hyperparameters).
benchmark_specs = [
    ('TabSDS', 'syn_tab_sjppds', {'n_levels': 20, 
                                  'n_prop': 0.5, 
                                  "num_variables": num_idx, 
                                  "cat_variables": cat_idx}),
    ('ddpm', 'ddpm', {'lr': 0.0009375080542687667,
                      'batch_size': 2929,
                      'num_timesteps': 998,
                      'n_iter': 1051,
                      'is_classification': True}),
    ('arf', 'arf', {}),
    ('tvae', 'tvae', {'n_iter': 300,
                      'lr': 0.0002,
                      'decoder_n_layers_hidden': 4,
                      'weight_decay': 0.001,
                      'batch_size': 256,
                      'n_units_embedding': 200,
                      'decoder_n_units_hidden': 300,
                      'decoder_nonlin': 'elu',
                      'decoder_dropout': 0.194325119117226,
                      'encoder_n_layers_hidden': 1,
                      'encoder_n_units_hidden': 450,
                      'encoder_nonlin': 'leaky_relu',
                      'encoder_dropout': 0.04288563703094718}),
    ('ctgan', 'ctgan', {'generator_n_layers_hidden': 2,
                        'generator_n_units_hidden': 50,
                        'generator_nonlin': 'tanh',
                        'n_iter': 1000,
                        'generator_dropout': 0.0575,
                        'discriminator_n_layers_hidden': 4,
                        'discriminator_n_units_hidden': 150,
                        'discriminator_nonlin': 'relu'}),
    ('bayesnet', 'bayesian_network', {'struct_learning_search_method': 'hillclimb',
                                      'struct_learning_score': 'bic'}),
]

# Evaluate every synthesizer on the train loader against the held-out test
# loader, repeating each run 10 times.
score1 = Benchmarks.evaluate(
    benchmark_specs,
    X=loader_train,
    X_test=loader_test,
    repeats=10,
    metrics={"performance": ["xgb"],
             "detection": ["detection_xgb"], 
             "privacy": ["DomiasMIA_KDE", "DomiasMIA_prior"]},
    task_type = "classification",
)

# save results

# Derive the labels from the spec list so the two can never drift apart.
synthesizer_names = [label for label, _, _ in benchmark_specs]

# extract_summary is a project helper; it is called once per summary statistic.
mean1 = extract_summary(score_output = score1,
                        synthesizer_names = synthesizer_names,
                        summary_name = "mean")
stddev1 = extract_summary(score_output = score1,
                          synthesizer_names = synthesizer_names,
                          summary_name = "stddev")

file_name = os.path.join(out_path, "adult_mean1.csv")
mean1.to_csv(file_name, index = True)

file_name = os.path.join(out_path, "adult_stddev1.csv")
stddev1.to_csv(file_name, index = True)

In [None]:
# Display the benchmark results stored in score1.
Benchmarks.print(score1)

In [None]:
# Directory for the real train/test splits and the synthetic datasets.
out_path = 'outputs/adult/simulated_datasets/'

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(out_path, exist_ok=True)

file_name = os.path.join(out_path, 'train_set.csv')
X_train.to_csv(file_name, index = False)

file_name = os.path.join(out_path, 'test_set.csv')
X_test.to_csv(file_name, index = False)

In [None]:
# ddpm

np.random.seed(123)

syn_model = Plugins().get('ddpm',
                          n_iter = 1051,
                          lr = 0.0009375080542687667,
                          batch_size = 2929,
                          num_timesteps = 998,
                          is_classification = True)

syn_model.fit(loader_train)

n = len(loader_train)
Y = syn_model.generate(count=n).dataframe()

file_name = os.path.join(out_path, 'syn_ddpm.csv')
Y.to_csv(file_name, index = False)

In [None]:
# arf

np.random.seed(123)

syn_model = Plugins().get('arf')

syn_model.fit(loader_train)

n = len(loader_train)
Y = syn_model.generate(count=n).dataframe()

file_name = os.path.join(out_path, 'syn_arf.csv')
Y.to_csv(file_name, index = False)

In [None]:
# tvae

np.random.seed(123)

syn_model = Plugins().get('tvae', 
                          n_iter = 300,
                          lr = 0.0002,
                          decoder_n_layers_hidden = 4,
                          weight_decay = 0.001,
                          batch_size = 256,
                          n_units_embedding = 200,
                          decoder_n_units_hidden = 300,
                          decoder_nonlin = "elu",
                          decoder_dropout = 0.194325119117226,
                          encoder_n_layers_hidden = 1,
                          encoder_n_units_hidden = 450,
                          encoder_nonlin = "leaky_relu",
                          encoder_dropout = 0.04288563703094718)

syn_model.fit(loader_train)

n = len(loader_train)
Y = syn_model.generate(count=n).dataframe()

file_name = os.path.join(out_path, 'syn_tvae.csv')
Y.to_csv(file_name, index = False)

In [None]:
# ctgan

np.random.seed(123)

syn_model = Plugins().get('ctgan',
                            n_iter = 1000,
                            generator_n_layers_hidden = 2,
                            generator_n_units_hidden = 50,
                            generator_nonlin = "tanh",
                            generator_dropout = 0.0575,
                            discriminator_n_layers_hidden = 4,
                            discriminator_n_units_hidden = 150,
                            discriminator_nonlin = "relu")

syn_model.fit(loader_train)

n = len(loader_train)
Y = syn_model.generate(count=n).dataframe()

file_name = os.path.join(out_path, 'syn_ctgan.csv')
Y.to_csv(file_name, index = False)

In [None]:
# bayesnet

np.random.seed(123)

syn_model = Plugins().get('bayesian_network',
                          struct_learning_search_method = 'hillclimb',
                          struct_learning_score = 'bic')

syn_model.fit(loader_train)

n = len(loader_train)
Y = syn_model.generate(count=n).dataframe()

file_name = os.path.join(out_path, 'syn_bayesnet.csv')
Y.to_csv(file_name, index = False)

In [None]:
# syn_tab_sjppds

np.random.seed(123)

syn_model = Plugins().get('syn_tab_sjppds', 
                          num_variables = num_idx,
                          cat_variables = cat_idx,
                          n_levels = 20,
                         n_prop = 0.5)

syn_model.fit(loader_train)

n = len(loader_train)
Y = syn_model.generate(count=n).dataframe()

file_name = os.path.join(out_path, 'syn_tab_sjppds_20_0.5.csv')
Y.to_csv(file_name, index = False)