In [None]:
!pip install synthcity
!pip uninstall -y torchaudio torchdata
!pip install openml

In [None]:
# Load the helper scripts that implement the TabSDS approach, together with
# the script that defines its Synthcity plugin.

import os

code_path = 'code/' # path for the folder containing the scripts

# The scripts are executed in this order:
#   1. the utility functions
#   2. the additional utility functions
#   3. the synth_tab_sjppds method synthcity plugin
script_names = (
    'utility_functions_syn_tab_sjppds_for_icml_2025.py',
    'utility_functions_additional_for_icml_2025.py',
    'syn_tab_sjppds_synthcity_plugin_for_icml_2025.py',
)

# exec is called directly in this top-level loop (not inside a helper function)
# so that the names defined by the scripts land in the notebook's namespace.
for script_name in script_names:
    file_path = os.path.join(code_path, script_name)
    with open(os.path.expanduser(file_path)) as file:
        exec(file.read())

In [None]:
# Add the TabSDS plugin to Synthcity's plugin loader.

# Note that the code uses the longer and more descriptive name of the method, namely,
# Synthetic Tabular Sequential Joint Probability Preserving Data Shuffling
# (syn_tab_sjppds)

# Plugin loader provided by synthcity
from synthcity.plugins import Plugins

generators = Plugins()

# SynTabSjppdsPlugin is not defined in this notebook; presumably it comes from the
# plugin script executed in the previous cell -- that cell must be run first.
generators.add("syn_tab_sjppds", SynTabSjppdsPlugin)

In [None]:
# Load the Abalone data

from sklearn.datasets import fetch_openml

# Fetch the Abalone dataset (the version is pinned so the same data is downloaded on every run)
abalone = fetch_openml(name="abalone", version=1, as_frame=True)

# Access the target
y = abalone.target

# Build the working table as a new frame (features + target) instead of inserting
# a column into `abalone.data` itself. This leaves the object returned by
# fetch_openml untouched and avoids pandas chained-assignment warnings.
X = abalone.data.assign(target=y) # Rings is the target variable

# The current TabSDS implementation requires the specification of which
# variables are numeric, and which are categorical. (For the Abalone data
# only the first variable, Sex, is categorical.)

num_idx = [1, 2, 3, 4, 5, 6, 7, 8]
cat_idx = [0]

# The positional indices above must account for every column (features + target).
assert X.shape[1] == len(num_idx) + len(cat_idx), \
    "num_idx and cat_idx must cover every column of X"

# This function (provided in the 'utility_functions_additional_for_icml_2025.py' script)
# ensures that all numeric variables are of type float64 and all categorical variables
# are of type str

X = enforce_dtypes(dat = X,
                   num_variables = num_idx,
                   cat_variables = cat_idx)

# This function (provided in the 'utility_functions_additional_for_icml_2025.py' script)
# splits the data into two equally sized training and test sets. (Change the my_seed parameter
# to change the data split)

aux = train_test_data_split(X, my_seed=123)
X_train = aux["X_train"]
X_test = aux["X_test"]

In [None]:
# Run Synthcity's Benchmarks function on a grid of n_levels (n_c in the paper notation)

from synthcity.benchmark import Benchmarks
# GenericDataLoader is imported explicitly so that this cell does not depend on
# the name having been leaked into the namespace by the exec'd helper scripts.
from synthcity.plugins.core.dataloader import GenericDataLoader

# Create the n_c grid: 5..50 in steps of 5, 60..100 in steps of 10,
# and 200..1000 in steps of 100.
n_level_grid = list(range(5, 51, 5)) + list(range(60, 101, 10)) + list(range(200, 1001, 100))

# Create Synthcity's data loaders (the 'target' column is the prediction target)
loader_train = GenericDataLoader(
    X_train,
    target_column = 'target'
)
loader_test = GenericDataLoader(
    X_test,
    target_column = 'target'
)

# Run the Benchmarks. The output of this function is used to create panels a, b, and c of Figure 11.
# One test case is created per n_levels value; each one is labelled "TabSDS_<n_levels>".
score0 = Benchmarks.evaluate(
    [
        (f"TabSDS_{n_levels}", 'syn_tab_sjppds', {'n_levels': n_levels, 'n_prop': 0.5, "num_variables": num_idx, "cat_variables": cat_idx})
        for n_levels in n_level_grid
    ],
    X=loader_train,
    X_test=loader_test,
    metrics={"performance": ["xgb"],
             "detection": ["detection_xgb"],
             "privacy": ["DomiasMIA_KDE", "DomiasMIA_prior"]},
    repeats=10,
    task_type = "regression",
)

# Test-case labels, in the same order as n_level_grid (used to look up the results later).
synthesizer_names = [f"TabSDS_{n_levels}" for n_levels in n_level_grid]

In [None]:
# Print the benchmark results for the TabSDS n_levels grid (score0 is produced
# by the Benchmarks.evaluate call in the previous cell).
Benchmarks.print(score0)

In [None]:
# Collect the mean of every metric for each TabSDS test case.
# (extract_summary is provided in the 'utility_functions_additional_for_icml_2025.py' script.)
mean = extract_summary(
    score_output=score0,
    synthesizer_names=synthesizer_names,
    summary_name="mean",
)

In [None]:
# Display the table of means (the plotting cell below reads metrics from its
# rows and plots each row against the n_levels grid).
mean

In [None]:
# Plot the results for the detection test, ML efficiency, and domias MIA metrics.
# (Corresponds to panels a, b, and c in Figure 11 in the Supplement.)

import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 3, figsize=(12, 4))

metric_names = mean.index

# Row positions (in `mean`) of the metrics drawn in the left, middle, and right panels.
panel_rows = (3, 2, 7)

# Each panel shows one metric as a function of n_c and is titled with the metric's name.
for ax, row in zip(axes, panel_rows):
    ax.plot(n_level_grid, mean.iloc[row, :], 'o-')
    ax.set_title(metric_names[row])
    ax.set_xlabel("n_c")
    ax.set_ylabel("metric")

plt.tight_layout()
plt.show()

In [None]:
# Compute the DCR distributions between the training set samples and each of the
# synthetic data samples generated by TabSDS at each n_c value, as well as, between
# the training set and the test set.
# (Corresponds to panel d in Figure 11 in the Appendix.)

# numpy is imported explicitly so that this cell does not depend on `np` having
# been leaked into the namespace by the exec'd helper scripts.
import numpy as np

# Fix the global numpy seed before the call below so the result is repeatable
# (assumes syn_tab_sjppds_dcr draws from numpy's global random state -- TODO confirm).
np.random.seed(123)

dcr_AB = syn_tab_sjppds_dcr(dat_train = X_train,
                           dat_test = X_test,
                           num_variables = num_idx,
                           cat_variables = cat_idx,
                           n_prop=0.5,
                           tuning_par_grid=n_level_grid)


## plot DCR distributions across n_levels grid

dcr = dcr_AB

# One box per column of the DCR table; outliers are hidden to keep the boxes readable.
plt.boxplot(dcr.values, showfliers=False)
# boxplot positions are 1-based, hence the tick locations 1..number of columns.
plt.xticks(ticks=np.arange(1, dcr.shape[1] + 1), labels=dcr.columns, rotation=90)

# Add a horizontal line at the median of the first column
# (presumably the training-vs-test DCR reference -- verify against syn_tab_sjppds_dcr).
plt.axhline(y=np.median(dcr.iloc[:, 0]), color='red', linestyle='--')

plt.xlabel("Tuning parameter grid")
plt.ylabel("DCR")
plt.tight_layout()  # Adjust layout to prevent label overlap
plt.show()

In [None]:
# Benchmark the selected TabSDS generator (based on nc = 20) against the
# ddpm, arf, tvae, ctgan, and bayesnet generators. The hyperparameters for each
# of these models were obtained using Optuna (see 'optuna_models_abalone_for_icml_2025.ipynb')

# Hyperparameters, one dictionary per generator.
tabsds_params = {
    'n_levels': 20,
    'n_prop': 0.5,
    "num_variables": num_idx,
    "cat_variables": cat_idx,
}

ddpm_params = {
    'lr': 0.002991978123076162,
    'batch_size': 970,
    'num_timesteps': 407,
    'n_iter': 7605,
    'is_classification': False,
}

arf_params = {
    'num_trees': 80,
    'delta': 0,
    'max_iters': 2,
    'early_stop': False,
    'min_node_size': 2,
}

tvae_params = {
    'n_iter': 400,
    'lr': 0.001,
    'decoder_n_layers_hidden': 5,
    'weight_decay': 0.0001,
    'batch_size': 128,
    'n_units_embedding': 200,
    'decoder_n_units_hidden': 150,
    'decoder_nonlin': 'tanh',
    'decoder_dropout': 0.19964446358158816,
    'encoder_n_layers_hidden': 4,
    'encoder_n_units_hidden': 100,
    'encoder_nonlin': 'relu',
    'encoder_dropout': 0.0820245231222064,
}

ctgan_params = {
    'generator_n_layers_hidden': 1,
    'generator_n_units_hidden': 100,
    'generator_nonlin': 'elu',
    'n_iter': 700,
    'generator_dropout': 0.13836424598477665,
    'discriminator_n_layers_hidden': 2,
    'discriminator_n_units_hidden': 100,
    'discriminator_nonlin': 'tanh',
    'discriminator_n_iter': 5,
    'discriminator_dropout': 0.023861565936528797,
    'lr': 0.001,
    'weight_decay': 0.0001,
    'batch_size': 200,
    'encoder_max_clusters': 8,
}

bayesnet_params = {
    'struct_learning_search_method': 'hillclimb',
    'struct_learning_score': 'bic',
}

# Each entry is a (label, plugin name, hyperparameters) triple.
generator_specs = [
    ('TabSDS', 'syn_tab_sjppds', tabsds_params),
    ('ddpm', 'ddpm', ddpm_params),
    ('arf', 'arf', arf_params),
    ('tvae', 'tvae', tvae_params),
    ('ctgan', 'ctgan', ctgan_params),
    ('bayesnet', 'bayesian_network', bayesnet_params),
]

# Same data loaders, metrics, number of repeats, and task type as the n_c grid benchmark.
score1 = Benchmarks.evaluate(
    generator_specs,
    X=loader_train,
    X_test=loader_test,
    repeats=10,
    metrics={
        "performance": ["xgb"],
        "detection": ["detection_xgb"],
        "privacy": ["DomiasMIA_KDE", "DomiasMIA_prior"],
    },
    task_type="regression",
)



In [None]:
# Print the benchmark results for the comparison between TabSDS and the
# other generators (score1 is produced by the previous cell).
Benchmarks.print(score1)