In [None]:
!pip install synthcity
!pip uninstall -y torchaudio torchdata
!pip install openml

In [None]:
# source code

import os

# Later cells call np.random.seed(...) and GenericDataLoader(...), but no cell
# visible in this notebook imports them. Import them here, before the helper
# scripts run, so those names do not depend on whatever the scripts happen to
# leave behind. Anything the scripts define under the same names still wins,
# because they are executed afterwards.
import numpy as np
from synthcity.plugins.core.dataloader import GenericDataLoader

code_path = 'code/'

# Helper scripts, executed in this order directly in the notebook namespace:
#   1. utility functions
#   2. additional utility functions
#   3. the syn_tab_sjppds synthcity plugin
source_files = [
    'utility_functions_syn_tab_sjppds_for_icml_2025.py',
    'utility_functions_additional_for_icml_2025.py',
    'syn_tab_sjppds_synthcity_plugin_for_icml_2025.py',
]

for _source_file in source_files:
    file_path = os.path.join(code_path, _source_file)
    with open(os.path.expanduser(file_path)) as file:
        # Compile with the real path so tracebacks point at the script
        # instead of an anonymous "<string>".
        exec(compile(file.read(), file_path, 'exec'))

In [None]:
# Import synthcity's plugin registry by its absolute package path
from synthcity.plugins import Plugins

generators = Plugins()

# Add SynTabSjppdsPlugin under the name "syn_tab_sjppds"; later cells request it
# by that name via Plugins().get(...) and in the benchmark spec.
# NOTE(review): SynTabSjppdsPlugin is not defined in this cell - presumably it comes
# from the plugin script executed earlier; confirm.
generators.add("syn_tab_sjppds", SynTabSjppdsPlugin)

In [None]:
# load the data

from sklearn.datasets import fetch_openml

# Fetch the Abalone dataset (version 1) from OpenML as pandas objects
abalone = fetch_openml(name="abalone", version=1, as_frame=True)

# Access the data and target.
# Work on a copy: adding the target column below would otherwise write into the
# frame owned by the fetched Bunch (and can trigger a chained-assignment warning).
X = abalone.data.copy()
y = abalone.target

X['target'] = y  # Rings

# Positional column indices handed to enforce_dtypes and to the syn_tab_sjppds plugin.
# NOTE(review): presumably column 0 is the only categorical feature and index 8 is the
# 'target' column appended above - verify against X.columns.
num_idx = [1, 2, 3, 4, 5, 6, 7, 8]
cat_idx = [0]

X = enforce_dtypes(dat = X,
                   num_variables = num_idx,
                   cat_variables = cat_idx)

# Split the data (fixed seed 123); the helper returns a mapping holding both splits
aux = train_test_data_split(X, my_seed=123)

X_train = aux["X_train"]
X_test = aux["X_test"]

In [None]:
# Wrap both splits in loaders, train first and test second;
# each one is told that 'target' is the target column.
loader_train, loader_test = (
    GenericDataLoader(split, target_column = 'target')
    for split in (X_train, X_test)
)

In [None]:
# Benchmarks supplies the evaluate() and print() calls used in the following cells
from synthcity.benchmark import Benchmarks

# Directory that receives the benchmark summary CSVs written after the evaluation
out_path = 'outputs/abalone/'

In [None]:
# Create the results directory up front: to_csv does not create missing folders,
# and failing here is far cheaper than failing after the benchmark has finished.
os.makedirs(out_path, exist_ok=True)

# One (display name, plugin name, hyperparameters) entry per synthesizer to compare
benchmark_specs = [
    ('TabSDS', 'syn_tab_sjppds', {'n_levels': 20,
                                  'n_prop': 0.5,
                                  "num_variables": num_idx,
                                  "cat_variables": cat_idx}),
    ('ddpm', 'ddpm', {'lr': 0.002991978123076162,
                      'batch_size': 970,
                      'num_timesteps': 407,
                      'n_iter': 7605,
                      'is_classification': False}),
    ('arf', 'arf', {'num_trees': 80,
                    'delta': 0,
                    'max_iters': 2,
                    'early_stop': False,
                    'min_node_size': 2}),
    ('tvae', 'tvae', {'n_iter': 400,
                      'lr': 0.001,
                      'decoder_n_layers_hidden': 5,
                      'weight_decay': 0.0001,
                      'batch_size': 128,
                      'n_units_embedding': 200,
                      'decoder_n_units_hidden': 150,
                      'decoder_nonlin': 'tanh',
                      'decoder_dropout': 0.19964446358158816,
                      'encoder_n_layers_hidden': 4,
                      'encoder_n_units_hidden': 100,
                      'encoder_nonlin': 'relu',
                      'encoder_dropout': 0.0820245231222064}),
    ('ctgan', 'ctgan', {'generator_n_layers_hidden': 1,
                        'generator_n_units_hidden': 100,
                        'generator_nonlin': 'elu',
                        'n_iter': 700,
                        'generator_dropout': 0.13836424598477665,
                        'discriminator_n_layers_hidden': 2,
                        'discriminator_n_units_hidden': 100,
                        'discriminator_nonlin': 'tanh',
                        'discriminator_n_iter': 5,
                        'discriminator_dropout': 0.023861565936528797,
                        'lr': 0.001,
                        'weight_decay': 0.0001,
                        'batch_size': 200,
                        'encoder_max_clusters': 8}),
    ('bayesnet', 'bayesian_network', {'struct_learning_search_method': 'hillclimb',
                                      'struct_learning_score': 'bic'}),
]

# Evaluate every synthesizer on the train loader against the held-out test loader,
# repeating each run 10 times, as a regression task.
score1 = Benchmarks.evaluate(
    benchmark_specs,
    X=loader_train,
    X_test=loader_test,
    repeats=10,
    metrics={"performance": ["xgb"],
             "detection": ["detection_xgb"],
             "privacy": ["DomiasMIA_KDE", "DomiasMIA_prior"]},
    task_type = "regression",
)

# save results

# Derive the names from the specs so the two lists cannot drift apart
synthesizer_names = [spec[0] for spec in benchmark_specs]

mean1 = extract_summary(score_output = score1,
                        synthesizer_names = synthesizer_names,
                        summary_name = "mean")
stddev1 = extract_summary(score_output = score1,
                          synthesizer_names = synthesizer_names,
                          summary_name = "stddev")

file_name = os.path.join(out_path, "abalone_optuna_mean1.csv")
mean1.to_csv(file_name, index = True)

file_name = os.path.join(out_path, "abalone_optuna_stddev1.csv")
stddev1.to_csv(file_name, index = True)

In [None]:
# Display the benchmark results collected in score1
Benchmarks.print(score1)

In [None]:
# From here on out_path points at the folder for the real splits and the
# synthetic datasets written by the cells below.
out_path = 'outputs/abalone/simulated_datasets/'

# to_csv does not create missing folders, so make sure the target exists
os.makedirs(out_path, exist_ok=True)

# Persist the real train/test splits (without the index column)
file_name = os.path.join(out_path, 'train_set.csv')
X_train.to_csv(file_name, index = False)

file_name = os.path.join(out_path, 'test_set.csv')
X_test.to_csv(file_name, index = False)

In [None]:
# ddpm: fit on the training loader, then write out a synthetic dataset
# with as many rows as the training loader holds.

# NOTE(review): this seeds NumPy's global RNG only; whether it makes the fit
# reproducible depends on the plugin's own seeding - confirm.
np.random.seed(123)

ddpm_params = {
    'n_iter': 7605,
    'lr': 0.002991978123076162,
    'batch_size': 970,
    'num_timesteps': 407,
    'is_classification': False,
}
syn_model = Plugins().get('ddpm', **ddpm_params)
syn_model.fit(loader_train)

n = len(loader_train)
Y = syn_model.generate(count=n).dataframe()

file_name = os.path.join(out_path, 'syn_ddpm.csv')
Y.to_csv(file_name, index = False)

In [None]:
# arf: fit on the training loader, then write out a synthetic dataset
# with as many rows as the training loader holds.

# NOTE(review): this seeds NumPy's global RNG only; whether it makes the fit
# reproducible depends on the plugin's own seeding - confirm.
np.random.seed(123)

arf_params = {
    'num_trees': 80,
    'delta': 0,
    'max_iters': 2,
    'early_stop': False,
    'min_node_size': 2,
}
syn_model = Plugins().get('arf', **arf_params)
syn_model.fit(loader_train)

n = len(loader_train)
Y = syn_model.generate(count=n).dataframe()

file_name = os.path.join(out_path, 'syn_arf.csv')
Y.to_csv(file_name, index = False)

In [None]:
# tvae: fit on the training loader, then write out a synthetic dataset
# with as many rows as the training loader holds.

# NOTE(review): this seeds NumPy's global RNG only; whether it makes the fit
# reproducible depends on the plugin's own seeding - confirm.
np.random.seed(123)

tvae_params = {
    'n_iter': 400,
    'lr': 0.001,
    'decoder_n_layers_hidden': 5,
    'weight_decay': 0.0001,
    'batch_size': 128,
    'n_units_embedding': 200,
    'decoder_n_units_hidden': 150,
    'decoder_nonlin': 'tanh',
    'decoder_dropout': 0.19964446358158816,
    'encoder_n_layers_hidden': 4,
    'encoder_n_units_hidden': 100,
    'encoder_nonlin': 'relu',
    'encoder_dropout': 0.0820245231222064,
}
syn_model = Plugins().get("tvae", **tvae_params)
syn_model.fit(loader_train)

n = len(loader_train)
Y = syn_model.generate(count=n).dataframe()

file_name = os.path.join(out_path, 'syn_tvae.csv')
Y.to_csv(file_name, index = False)

In [None]:
# ctgan: fit on the training loader, then write out a synthetic dataset
# with as many rows as the training loader holds.

# NOTE(review): this seeds NumPy's global RNG only; whether it makes the fit
# reproducible depends on the plugin's own seeding - confirm.
np.random.seed(123)

ctgan_params = {
    'generator_n_layers_hidden': 1,
    'generator_n_units_hidden': 100,
    'generator_nonlin': 'elu',
    'n_iter': 700,
    'generator_dropout': 0.13836424598477665,
    'discriminator_n_layers_hidden': 2,
    'discriminator_n_units_hidden': 100,
    'discriminator_nonlin': 'tanh',
    'discriminator_n_iter': 5,
    'discriminator_dropout': 0.023861565936528797,
    'lr': 0.001,
    'weight_decay': 0.0001,
    'batch_size': 200,
    'encoder_max_clusters': 8,
}
syn_model = Plugins().get('ctgan', **ctgan_params)
syn_model.fit(loader_train)

n = len(loader_train)
Y = syn_model.generate(count=n).dataframe()

file_name = os.path.join(out_path, 'syn_ctgan.csv')
Y.to_csv(file_name, index = False)

In [None]:
# bayesnet: fit on the training loader, then write out a synthetic dataset
# with as many rows as the training loader holds.

# NOTE(review): this seeds NumPy's global RNG only; whether it makes the fit
# reproducible depends on the plugin's own seeding - confirm.
np.random.seed(123)

bayesnet_params = {
    'struct_learning_search_method': 'hillclimb',
    'struct_learning_score': 'bic',
}
syn_model = Plugins().get('bayesian_network', **bayesnet_params)
syn_model.fit(loader_train)

n = len(loader_train)
Y = syn_model.generate(count=n).dataframe()

file_name = os.path.join(out_path, 'syn_bayesnet.csv')
Y.to_csv(file_name, index = False)

In [None]:
# syn_tab_sjppds: fit the custom plugin on the training loader, then write out
# a synthetic dataset with as many rows as the training loader holds.

# NOTE(review): this seeds NumPy's global RNG only; whether it makes the fit
# reproducible depends on the plugin's own seeding - confirm.
np.random.seed(123)

sjppds_params = {
    'num_variables': num_idx,
    'cat_variables': cat_idx,
    'n_levels': 20,
    'n_prop': 0.5,
}
syn_model = Plugins().get('syn_tab_sjppds', **sjppds_params)
syn_model.fit(loader_train)

n = len(loader_train)
Y = syn_model.generate(count=n).dataframe()

file_name = os.path.join(out_path, 'syn_tab_sjppds_20_0.5.csv')
Y.to_csv(file_name, index = False)