In [None]:
!pip install synthcity
!pip uninstall -y torchaudio torchdata
!pip install openml

In [None]:
# Load the project's helper code into the notebook namespace

import os

code_path = 'code/'

# Scripts are executed in this order:
#   1. utility functions
#   2. additional utility functions
#   3. the synth_tab_sjppds method synthcity plugin
source_files = (
    'utility_functions_syn_tab_sjppds_for_icml_2025.py',
    'utility_functions_additional_for_icml_2025.py',
    'syn_tab_sjppds_synthcity_plugin_for_icml_2025.py',
)

for source_file in source_files:
    file_path = os.path.join(code_path, source_file)
    with open(os.path.expanduser(file_path)) as file:
        # exec is called at cell level (not inside a function) so that whatever
        # the script defines ends up in the notebook's global namespace
        exec(file.read())

In [None]:
# Register the custom synthesizer with synthcity's plugin loader
from synthcity.plugins import Plugins

generators = Plugins()

# SynTabSjppdsPlugin is not defined in this notebook; presumably it comes from
# the plugin script exec'd in an earlier cell — TODO confirm. It is registered
# under the name "syn_tab_sjppds", the name later cells use to request it.
generators.add("syn_tab_sjppds", SynTabSjppdsPlugin)

In [None]:
import openml

## mushroom data (OpenML dataset id 24)
dataset = openml.datasets.get_dataset(24)

# get_data returns four values; only the features (X) and the target (y) are used below
X, y, _, attribute_names = dataset.get_data(
    target=dataset.default_target_attribute
)

# add the target as a new (last) column, then discard two uninformative features:
# column 10 has too many NAs and column 15 has no variability
X = X.assign(target=y)
X = X.drop(columns=X.columns[[10, 15]])

# no numeric variables; column positions 0..20 are declared categorical
# (presumably this covers every remaining column, target included — TODO confirm)
num_idx = None
cat_idx = list(range(21))

X = enforce_dtypes(dat=X, num_variables=num_idx, cat_variables=cat_idx)

# Split the data
aux = train_test_data_split(X, my_seed=123)
X_train, X_test = aux["X_train"], aux["X_test"]

In [None]:
# Wrap the train and test frames in data loaders; both use the 'target' column as the label
loader_train = GenericDataLoader(X_train, target_column='target')
loader_test = GenericDataLoader(X_test, target_column='target')

In [None]:
from synthcity.benchmark import Benchmarks

# directory that receives the benchmark summary tables (mean / stddev CSV files)
out_path = 'outputs/mushroom/'

# Create the directory up front: DataFrame.to_csv does not create missing
# folders, so without this the save step would fail only after the whole
# benchmark has already been run.
os.makedirs(out_path, exist_ok=True)

In [None]:
# Benchmark all synthesizers on the mushroom data.
# Each entry is (label used in the results, synthcity plugin name, plugin hyper-parameters).
benchmark_tests = [
    (
        'TabSDS',
        'syn_tab_sjppds',
        {
            'n_levels': 40,
            'n_prop': 0.5,
            "num_variables": num_idx,
            "cat_variables": cat_idx,
        },
    ),
    (
        'ddpm',
        'ddpm',
        {
            'lr': 0.00884671824119367,
            'batch_size': 4093,
            'num_timesteps': 853,
            'n_iter': 5127,
            'is_classification': True,
        },
    ),
    (
        'arf',
        'arf',
        {
            'num_trees': 100,
            'delta': 0,
            'max_iters': 1,
            'early_stop': False,
            'min_node_size': 2,
        },
    ),
    (
        'tvae',
        'tvae',
        {
            'n_iter': 200,
            'lr': 0.001,
            'decoder_n_layers_hidden': 5,
            'weight_decay': 0.001,
            'batch_size': 512,
            'n_units_embedding': 150,
            'decoder_n_units_hidden': 150,
            'decoder_nonlin': 'relu',
            'decoder_dropout': 0.1171471896118231,
            'encoder_n_layers_hidden': 4,
            'encoder_n_units_hidden': 300,
            'encoder_nonlin': 'tanh',
            'encoder_dropout': 0.16007215982462047,
        },
    ),
    (
        'ctgan',
        'ctgan',
        {
            'generator_n_layers_hidden': 4,
            'generator_n_units_hidden': 50,
            'generator_nonlin': 'leaky_relu',
            'n_iter': 700,
            'generator_dropout': 0.020973543252274986,
            'discriminator_n_layers_hidden': 3,
            'discriminator_n_units_hidden': 150,
            'discriminator_nonlin': 'tanh',
            'discriminator_n_iter': 4,
            'discriminator_dropout': 0.1644064126493125,
            'lr': 0.001,
            'weight_decay': 0.001,
            'batch_size': 500,
            'encoder_max_clusters': 13,
        },
    ),
    (
        'bayesnet',
        'bayesian_network',
        {
            'struct_learning_search_method': 'hillclimb',
            'struct_learning_score': 'bic',
        },
    ),
]

# 10 repeats per synthesizer, scored on performance, detection and privacy metrics
score1 = Benchmarks.evaluate(
    benchmark_tests,
    X=loader_train,
    X_test=loader_test,
    repeats=10,
    metrics={
        "performance": ["xgb"],
        "detection": ["detection_xgb"],
        "privacy": ["DomiasMIA_KDE", "DomiasMIA_prior"],
    },
    task_type="classification",
)

# save results

# labels of the benchmarked synthesizers, in the order they were evaluated
synthesizer_names = [label for label, _, _ in benchmark_tests]

mean1 = extract_summary(
    score_output=score1,
    synthesizer_names=synthesizer_names,
    summary_name="mean",
)
stddev1 = extract_summary(
    score_output=score1,
    synthesizer_names=synthesizer_names,
    summary_name="stddev",
)

# write both summary tables (index included) into out_path
for summary_table, csv_name in (
    (mean1, "mushroom_optuna_mean1.csv"),
    (stddev1, "mushroom_optuna_stddev1.csv"),
):
    file_name = os.path.join(out_path, csv_name)
    summary_table.to_csv(file_name, index=True)

In [None]:
# Print the benchmark results held in score1
Benchmarks.print(score1)

In [None]:
# From here on out_path points at the folder that holds the train/test splits
# and the synthetic datasets written by the cells below.
out_path = 'outputs/mushroom/simulated_datasets/'

# Make sure the folder exists: to_csv does not create missing directories and
# would otherwise fail here and in every later cell that saves into out_path.
os.makedirs(out_path, exist_ok=True)

# persist the exact splits used for training and evaluation (row index omitted)
file_name = os.path.join(out_path, 'train_set.csv')
X_train.to_csv(file_name, index = False)

file_name = os.path.join(out_path, 'test_set.csv')
X_test.to_csv(file_name, index = False)

In [None]:
# ddpm: fit on the training loader and save a synthetic dataset with as many rows

# NOTE(review): `np` is not imported in this notebook's visible cells; presumably
# it is provided by the helper scripts exec'd earlier — confirm.
np.random.seed(123)

# hyper-parameters passed to the plugin
ddpm_params = {
    'n_iter': 5127,
    'lr': 0.00884671824119367,
    'batch_size': 4093,
    'num_timesteps': 853,
    'is_classification': True,
}

syn_model = Plugins().get('ddpm', **ddpm_params)
syn_model.fit(loader_train)

# generate as many synthetic rows as the training loader holds
n = len(loader_train)
Y = syn_model.generate(count=n).dataframe()

file_name = os.path.join(out_path, 'syn_ddpm.csv')
Y.to_csv(file_name, index=False)

In [None]:
# arf: fit on the training loader and save a synthetic dataset with as many rows

np.random.seed(123)

# hyper-parameters passed to the plugin
arf_params = {
    'num_trees': 100,
    'delta': 0,
    'max_iters': 1,
    'early_stop': False,
    'min_node_size': 2,
}

syn_model = Plugins().get('arf', **arf_params)
syn_model.fit(loader_train)

# generate as many synthetic rows as the training loader holds
n = len(loader_train)
Y = syn_model.generate(count=n).dataframe()

file_name = os.path.join(out_path, 'syn_arf.csv')
Y.to_csv(file_name, index=False)

In [None]:
# tvae: fit on the training loader and save a synthetic dataset with as many rows

np.random.seed(123)

# hyper-parameters passed to the plugin
tvae_params = {
    'n_iter': 200,
    'lr': 0.001,
    'decoder_n_layers_hidden': 5,
    'weight_decay': 0.001,
    'batch_size': 512,
    'n_units_embedding': 150,
    'decoder_n_units_hidden': 150,
    'decoder_nonlin': 'relu',
    'decoder_dropout': 0.1171471896118231,
    'encoder_n_layers_hidden': 4,
    'encoder_n_units_hidden': 300,
    'encoder_nonlin': 'tanh',
    'encoder_dropout': 0.16007215982462047,
}

syn_model = Plugins().get('tvae', **tvae_params)
syn_model.fit(loader_train)

# generate as many synthetic rows as the training loader holds
n = len(loader_train)
Y = syn_model.generate(count=n).dataframe()

file_name = os.path.join(out_path, 'syn_tvae.csv')
Y.to_csv(file_name, index=False)

In [None]:
# ctgan: fit on the training loader and save a synthetic dataset with as many rows

np.random.seed(123)

# hyper-parameters passed to the plugin
ctgan_params = {
    'generator_n_layers_hidden': 4,
    'generator_n_units_hidden': 50,
    'generator_nonlin': 'leaky_relu',
    'n_iter': 700,
    'generator_dropout': 0.020973543252274986,
    'discriminator_n_layers_hidden': 3,
    'discriminator_n_units_hidden': 150,
    'discriminator_nonlin': 'tanh',
    'discriminator_n_iter': 4,
    'discriminator_dropout': 0.1644064126493125,
    'lr': 0.001,
    'weight_decay': 0.001,
    'batch_size': 500,
    'encoder_max_clusters': 13,
}

syn_model = Plugins().get('ctgan', **ctgan_params)
syn_model.fit(loader_train)

# generate as many synthetic rows as the training loader holds
n = len(loader_train)
Y = syn_model.generate(count=n).dataframe()

file_name = os.path.join(out_path, 'syn_ctgan.csv')
Y.to_csv(file_name, index=False)

In [None]:
# bayesnet: fit on the training loader and save a synthetic dataset with as many rows

np.random.seed(123)

# hyper-parameters passed to the plugin
bayesnet_params = {
    'struct_learning_search_method': 'hillclimb',
    'struct_learning_score': 'bic',
}

syn_model = Plugins().get('bayesian_network', **bayesnet_params)
syn_model.fit(loader_train)

# generate as many synthetic rows as the training loader holds
n = len(loader_train)
Y = syn_model.generate(count=n).dataframe()

file_name = os.path.join(out_path, 'syn_bayesnet.csv')
Y.to_csv(file_name, index=False)

In [None]:
# syn_tab_sjppds: fit on the training loader and save a synthetic dataset with as many rows

np.random.seed(123)

# hyper-parameters passed to the plugin; the output file name below
# carries the n_levels / n_prop values (40 and 0.5)
sjppds_params = {
    'num_variables': num_idx,
    'cat_variables': cat_idx,
    'n_levels': 40,
    'n_prop': 0.5,
}

syn_model = Plugins().get('syn_tab_sjppds', **sjppds_params)
syn_model.fit(loader_train)

# generate as many synthetic rows as the training loader holds
n = len(loader_train)
Y = syn_model.generate(count=n).dataframe()

file_name = os.path.join(out_path, 'syn_tab_sjppds_40_0.5.csv')
Y.to_csv(file_name, index=False)