In [None]:
!pip install synthcity
!pip uninstall -y torchaudio torchdata
!pip install openml

In [None]:
# source code

import os

code_path = 'code/'

# Scripts exec'd into the notebook namespace, in this order:
#   1. utility functions
#   2. additional utility functions
#   3. the synth_tab_sjppds method synthcity plugin
#   4. the additive noise method synthcity plugin
_script_names = (
    'utility_functions_syn_tab_sjppds_for_icml_2025.py',
    'utility_functions_additional_for_icml_2025.py',
    'syn_tab_sjppds_synthcity_plugin_for_icml_2025.py',
    'additive_noise_synthcity_plugin_for_icml_2025.py',
)

for _script_name in _script_names:
    # Each script is read in full and executed at notebook (global) scope,
    # so whatever it defines becomes available to the cells below.
    file_path = os.path.join(code_path, _script_name)
    with open(os.path.expanduser(file_path)) as file:
        exec(file.read())

In [None]:
# synthcity: plugin registry and benchmark runner
from synthcity.plugins import Plugins
from synthcity.benchmark import Benchmarks

generators = Plugins()

# Register the two custom plugins under the names used by the benchmark cells.
# The plugin classes are not imported in this cell; presumably they are defined
# by the plugin scripts exec'd earlier in the notebook.
for _plugin_name, _plugin_cls in (
    ("syn_tab_sjppds", SynTabSjppdsPlugin),
    ("additive_noise", AdditiveNoisePlugin),
):
    generators.add(_plugin_name, _plugin_cls)

In [None]:
# Experiment configuration shared by every dataset cell below.

# Value passed as `repeats=` to each Benchmarks.evaluate call.
num_repeats = 10

# Directory that receives the per-dataset mean/stddev summary CSV files.
out_path = 'outputs/additive_noise/'
# Create the directory up front: DataFrame.to_csv does not create missing
# parent directories, so without this a long benchmark run would only fail
# at the very end, when the results are saved.
os.makedirs(out_path, exist_ok=True)

# Grid of `noise_percent` values to evaluate: 1, then 5, 10, ..., 115.
noise_percent_grid = [1] + list(range(5, 116, 5))

In [None]:
# Select tuning parameter for abalone

from sklearn.datasets import fetch_openml

# Fetch the Abalone dataset (OpenML name "abalone", version 1) as pandas objects
abalone = fetch_openml(name="abalone", version=1, as_frame=True)

# Features and response; the response is appended to X as the 'target' column
X, y = abalone.data, abalone.target
X['target'] = y  # Rings

# Indices handed to enforce_dtypes: 0 as categorical, 1-8 as numeric
# (index 8 is presumably the appended 'target' column)
cat_idx = [0]
num_idx = list(range(1, 9))

X = enforce_dtypes(dat=X, num_variables=num_idx, cat_variables=cat_idx)

# Split the data (fixed seed)
aux = train_test_data_split(X, my_seed=123)
X_train, X_test = aux["X_train"], aux["X_test"]

loader_train = GenericDataLoader(X_train, target_column='target')
loader_test = GenericDataLoader(X_test, target_column='target')

# One benchmark entry per noise level in the grid
synthesizer_names = [f"addnoise_{noise_level}" for noise_level in noise_percent_grid]

score = Benchmarks.evaluate(
    [
        (name, 'additive_noise', {'noise_percent': noise_level, "num_variables": num_idx})
        for name, noise_level in zip(synthesizer_names, noise_percent_grid)
    ],
    X=loader_train,
    X_test=loader_test,
    metrics={
        "performance": ["xgb"],
        "detection": ["detection_xgb"],
        "privacy": ["DomiasMIA_KDE", "DomiasMIA_prior"],
    },
    repeats=num_repeats,
    task_type="regression",
)

# Summaries of the benchmark output across synthesizers
_summary_args = dict(score_output=score, synthesizer_names=synthesizer_names)
mean = extract_summary(summary_name="mean", **_summary_args)
stddev = extract_summary(summary_name="stddev", **_summary_args)

# Save both summaries as CSV files under out_path
for _summary, _csv_name in (
    (mean, 'mean_tuning_param_selection_abalone.csv'),
    (stddev, 'stddev_tuning_param_selection_abalone.csv'),
):
    file_name = os.path.join(out_path, _csv_name)
    _summary.to_csv(file_name, index=True)

In [None]:
# Select tuning parameter for bank marketing

import openml

# bank marketing (OpenML dataset id 44126)
dataset = openml.datasets.get_dataset(44126)

# Features and default target; the target is appended to X as 'target'
X, y, _, attribute_names = dataset.get_data(target=dataset.default_target_attribute)
X['target'] = y

# Indices handed to enforce_dtypes: 0-6 as numeric, 7 as categorical
# (index 7 is presumably the appended 'target' column)
num_idx = list(range(7))
cat_idx = [7]

X = enforce_dtypes(dat=X, num_variables=num_idx, cat_variables=cat_idx)

# Split the data (fixed seed)
aux = train_test_data_split(X, my_seed=123)
X_train, X_test = aux["X_train"], aux["X_test"]

loader_train = GenericDataLoader(X_train, target_column='target')
loader_test = GenericDataLoader(X_test, target_column='target')

# One benchmark entry per noise level in the grid
synthesizer_names = [f"addnoise_{noise_level}" for noise_level in noise_percent_grid]

score = Benchmarks.evaluate(
    [
        (name, 'additive_noise', {'noise_percent': noise_level, "num_variables": num_idx})
        for name, noise_level in zip(synthesizer_names, noise_percent_grid)
    ],
    X=loader_train,
    X_test=loader_test,
    metrics={
        "performance": ["xgb"],
        "detection": ["detection_xgb"],
        "privacy": ["DomiasMIA_KDE", "DomiasMIA_prior"],
    },
    repeats=num_repeats,
    task_type="classification",
)

# Summaries of the benchmark output across synthesizers
_summary_args = dict(score_output=score, synthesizer_names=synthesizer_names)
mean = extract_summary(summary_name="mean", **_summary_args)
stddev = extract_summary(summary_name="stddev", **_summary_args)

# Save both summaries as CSV files under out_path
for _summary, _csv_name in (
    (mean, 'mean_tuning_param_selection_bank_marketing.csv'),
    (stddev, 'stddev_tuning_param_selection_bank_marketing.csv'),
):
    file_name = os.path.join(out_path, _csv_name)
    _summary.to_csv(file_name, index=True)

In [None]:
# Select tuning parameter for California Housing (original)

from sklearn.datasets import fetch_california_housing

# Load the dataset as pandas objects
california_housing = fetch_california_housing(as_frame=True)

# Features (X) and target (y); the target is appended to X as 'target'
X, y = california_housing.data, california_housing.target
X["target"] = y

# Indices handed to enforce_dtypes: 0-8 as numeric, no categorical columns
num_idx = list(range(9))
cat_idx = None

X = enforce_dtypes(dat=X, num_variables=num_idx, cat_variables=cat_idx)

# Split the data (fixed seed)
aux = train_test_data_split(X, my_seed=123)
X_train, X_test = aux["X_train"], aux["X_test"]

loader_train = GenericDataLoader(X_train, target_column='target')
loader_test = GenericDataLoader(X_test, target_column='target')

# One benchmark entry per noise level in the grid
synthesizer_names = [f"addnoise_{noise_level}" for noise_level in noise_percent_grid]

# This cell requests a wider metric set than the other dataset cells
# (adds "stats" and extra "privacy" entries).
score = Benchmarks.evaluate(
    [
        (name, 'additive_noise', {'noise_percent': noise_level, "num_variables": num_idx})
        for name, noise_level in zip(synthesizer_names, noise_percent_grid)
    ],
    X=loader_train,
    X_test=loader_test,
    metrics={
        "stats": ["ks_test", "chi_squared_test", "inv_kl_divergence"],
        "performance": ["xgb"],
        "detection": ["detection_xgb"],
        "privacy": [
            "DomiasMIA_KDE",
            "DomiasMIA_prior",
            "k-anonymization",
            "distinct l-diversity",
            "delta-presence",
            "identifiability_score",
        ],
    },
    repeats=num_repeats,
    task_type="regression",
    #synthetic_cache=False,
    #synthetic_reuse_if_exists=False,
    #augmented_reuse_if_exists=False,
    #use_metric_cache=False,
)

# Summaries of the benchmark output across synthesizers
_summary_args = dict(score_output=score, synthesizer_names=synthesizer_names)
mean = extract_summary(summary_name="mean", **_summary_args)
stddev = extract_summary(summary_name="stddev", **_summary_args)

# Save both summaries as CSV files under out_path
for _summary, _csv_name in (
    (mean, 'mean_tuning_param_selection_california_orig.csv'),
    (stddev, 'stddev_tuning_param_selection_california_orig.csv'),
):
    file_name = os.path.join(out_path, _csv_name)
    _summary.to_csv(file_name, index=True)

In [None]:
# Select tuning parameter for credit

import openml

# credit (OpenML dataset id 44089)
dataset = openml.datasets.get_dataset(44089)

# Features and default target; the target is appended to X as 'target'
X, y, _, attribute_names = dataset.get_data(target=dataset.default_target_attribute)
X['target'] = y

# Indices handed to enforce_dtypes: 0-9 as numeric, 10 as categorical
# (index 10 is presumably the appended 'target' column)
num_idx = list(range(10))
cat_idx = [10]

X = enforce_dtypes(dat=X, num_variables=num_idx, cat_variables=cat_idx)

# Split the data (fixed seed)
aux = train_test_data_split(X, my_seed=123)
X_train, X_test = aux["X_train"], aux["X_test"]

loader_train = GenericDataLoader(X_train, target_column='target')
loader_test = GenericDataLoader(X_test, target_column='target')

# One benchmark entry per noise level in the grid
synthesizer_names = [f"addnoise_{noise_level}" for noise_level in noise_percent_grid]

score = Benchmarks.evaluate(
    [
        (name, 'additive_noise', {'noise_percent': noise_level, "num_variables": num_idx})
        for name, noise_level in zip(synthesizer_names, noise_percent_grid)
    ],
    X=loader_train,
    X_test=loader_test,
    metrics={
        "performance": ["xgb"],
        "detection": ["detection_xgb"],
        "privacy": ["DomiasMIA_KDE", "DomiasMIA_prior"],
    },
    repeats=num_repeats,
    task_type="classification",
)

# Summaries of the benchmark output across synthesizers
_summary_args = dict(score_output=score, synthesizer_names=synthesizer_names)
mean = extract_summary(summary_name="mean", **_summary_args)
stddev = extract_summary(summary_name="stddev", **_summary_args)

# Save both summaries as CSV files under out_path
for _summary, _csv_name in (
    (mean, 'mean_tuning_param_selection_credit.csv'),
    (stddev, 'stddev_tuning_param_selection_credit.csv'),
):
    file_name = os.path.join(out_path, _csv_name)
    _summary.to_csv(file_name, index=True)

In [None]:
# Select tuning parameter for Diabetes130US

import openml

# Diabetes130US (OpenML dataset id 45022)
dataset = openml.datasets.get_dataset(45022)

# Features and default target; the target is appended to X as 'target'
X, y, _, attribute_names = dataset.get_data(target=dataset.default_target_attribute)
X['target'] = y

# Indices handed to enforce_dtypes: 0-6 as numeric, 7 as categorical
# (index 7 is presumably the appended 'target' column)
num_idx = list(range(7))
cat_idx = [7]

X = enforce_dtypes(dat=X, num_variables=num_idx, cat_variables=cat_idx)

# Split the data (fixed seed)
aux = train_test_data_split(X, my_seed=123)
X_train, X_test = aux["X_train"], aux["X_test"]

loader_train = GenericDataLoader(X_train, target_column='target')
loader_test = GenericDataLoader(X_test, target_column='target')

# One benchmark entry per noise level in the grid
synthesizer_names = [f"addnoise_{noise_level}" for noise_level in noise_percent_grid]

score = Benchmarks.evaluate(
    [
        (name, 'additive_noise', {'noise_percent': noise_level, "num_variables": num_idx})
        for name, noise_level in zip(synthesizer_names, noise_percent_grid)
    ],
    X=loader_train,
    X_test=loader_test,
    metrics={
        "performance": ["xgb"],
        "detection": ["detection_xgb"],
        "privacy": ["DomiasMIA_KDE", "DomiasMIA_prior"],
    },
    repeats=num_repeats,
    task_type="classification",
)

# Summaries of the benchmark output across synthesizers
_summary_args = dict(score_output=score, synthesizer_names=synthesizer_names)
mean = extract_summary(summary_name="mean", **_summary_args)
stddev = extract_summary(summary_name="stddev", **_summary_args)

# Save both summaries as CSV files under out_path
for _summary, _csv_name in (
    (mean, 'mean_tuning_param_selection_diabetes130us.csv'),
    (stddev, 'stddev_tuning_param_selection_diabetes130us.csv'),
):
    file_name = os.path.join(out_path, _csv_name)
    _summary.to_csv(file_name, index=True)

In [None]:
# Select tuning parameter for electricity

import openml

# load the electricity data (OpenML dataset id 44120)
dataset = openml.datasets.get_dataset(44120)

# Features and default target; the target is appended to X as 'target'
X, y, _, attribute_names = dataset.get_data(target=dataset.default_target_attribute)
X['target'] = y

# Indices handed to enforce_dtypes: 0-6 as numeric, 7 as categorical
# (index 7 is presumably the appended 'target' column)
num_idx = list(range(7))
cat_idx = [7]

X = enforce_dtypes(dat=X, num_variables=num_idx, cat_variables=cat_idx)

# Split the data (fixed seed)
aux = train_test_data_split(X, my_seed=123)
X_train, X_test = aux["X_train"], aux["X_test"]

loader_train = GenericDataLoader(X_train, target_column='target')
loader_test = GenericDataLoader(X_test, target_column='target')

# One benchmark entry per noise level in the grid
synthesizer_names = [f"addnoise_{noise_level}" for noise_level in noise_percent_grid]

score = Benchmarks.evaluate(
    [
        (name, 'additive_noise', {'noise_percent': noise_level, "num_variables": num_idx})
        for name, noise_level in zip(synthesizer_names, noise_percent_grid)
    ],
    X=loader_train,
    X_test=loader_test,
    metrics={
        "performance": ["xgb"],
        "detection": ["detection_xgb"],
        "privacy": ["DomiasMIA_KDE", "DomiasMIA_prior"],
    },
    repeats=num_repeats,
    task_type="classification",
)

# Summaries of the benchmark output across synthesizers
_summary_args = dict(score_output=score, synthesizer_names=synthesizer_names)
mean = extract_summary(summary_name="mean", **_summary_args)
stddev = extract_summary(summary_name="stddev", **_summary_args)

# Save both summaries as CSV files under out_path
for _summary, _csv_name in (
    (mean, 'mean_tuning_param_selection_electricity.csv'),
    (stddev, 'stddev_tuning_param_selection_electricity.csv'),
):
    file_name = os.path.join(out_path, _csv_name)
    _summary.to_csv(file_name, index=True)

In [None]:
# Select tuning parameter for eye movements data

import openml

# eye movements (OpenML dataset id 44130)
dataset = openml.datasets.get_dataset(44130)

# Features and default target; the target is appended to X as 'target'
X, y, _, attribute_names = dataset.get_data(target=dataset.default_target_attribute)
X['target'] = y

# Indices handed to enforce_dtypes: 0-19 as numeric, 20 as categorical
# (index 20 is presumably the appended 'target' column)
num_idx = list(range(20))
cat_idx = [20]

X = enforce_dtypes(dat=X, num_variables=num_idx, cat_variables=cat_idx)

# Split the data (fixed seed)
aux = train_test_data_split(X, my_seed=123)
X_train, X_test = aux["X_train"], aux["X_test"]

loader_train = GenericDataLoader(X_train, target_column='target')
loader_test = GenericDataLoader(X_test, target_column='target')

# One benchmark entry per noise level in the grid
synthesizer_names = [f"addnoise_{noise_level}" for noise_level in noise_percent_grid]

score = Benchmarks.evaluate(
    [
        (name, 'additive_noise', {'noise_percent': noise_level, "num_variables": num_idx})
        for name, noise_level in zip(synthesizer_names, noise_percent_grid)
    ],
    X=loader_train,
    X_test=loader_test,
    metrics={
        "performance": ["xgb"],
        "detection": ["detection_xgb"],
        "privacy": ["DomiasMIA_KDE", "DomiasMIA_prior"],
    },
    repeats=num_repeats,
    task_type="classification",
)

# Summaries of the benchmark output across synthesizers
_summary_args = dict(score_output=score, synthesizer_names=synthesizer_names)
mean = extract_summary(summary_name="mean", **_summary_args)
stddev = extract_summary(summary_name="stddev", **_summary_args)

# Save both summaries as CSV files under out_path
for _summary, _csv_name in (
    (mean, 'mean_tuning_param_selection_eye_movement.csv'),
    (stddev, 'stddev_tuning_param_selection_eye_movement.csv'),
):
    file_name = os.path.join(out_path, _csv_name)
    _summary.to_csv(file_name, index=True)

In [None]:
# Select tuning parameter for house_16h

import openml

# house_16H (OpenML dataset id 44123)
dataset = openml.datasets.get_dataset(44123)

# Features and default target; the target is appended to X as 'target'
X, y, _, attribute_names = dataset.get_data(target=dataset.default_target_attribute)
X['target'] = y

# Indices handed to enforce_dtypes: 0-15 as numeric, 16 as categorical
# (index 16 is presumably the appended 'target' column)
num_idx = list(range(16))
cat_idx = [16]

X = enforce_dtypes(dat=X, num_variables=num_idx, cat_variables=cat_idx)

# Split the data (fixed seed)
aux = train_test_data_split(X, my_seed=123)
X_train, X_test = aux["X_train"], aux["X_test"]

loader_train = GenericDataLoader(X_train, target_column='target')
loader_test = GenericDataLoader(X_test, target_column='target')

# One benchmark entry per noise level in the grid
synthesizer_names = [f"addnoise_{noise_level}" for noise_level in noise_percent_grid]

score = Benchmarks.evaluate(
    [
        (name, 'additive_noise', {'noise_percent': noise_level, "num_variables": num_idx})
        for name, noise_level in zip(synthesizer_names, noise_percent_grid)
    ],
    X=loader_train,
    X_test=loader_test,
    metrics={
        "performance": ["xgb"],
        "detection": ["detection_xgb"],
        "privacy": ["DomiasMIA_KDE", "DomiasMIA_prior"],
    },
    repeats=num_repeats,
    task_type="classification",
)

# Summaries of the benchmark output across synthesizers
_summary_args = dict(score_output=score, synthesizer_names=synthesizer_names)
mean = extract_summary(summary_name="mean", **_summary_args)
stddev = extract_summary(summary_name="stddev", **_summary_args)

# Save both summaries as CSV files under out_path
for _summary, _csv_name in (
    (mean, 'mean_tuning_param_selection_house_16h.csv'),
    (stddev, 'stddev_tuning_param_selection_house_16h.csv'),
):
    file_name = os.path.join(out_path, _csv_name)
    _summary.to_csv(file_name, index=True)

In [None]:
# Select tuning parameter for magic telescope

import openml

# MagicTelescope (OpenML dataset id 44125)
dataset = openml.datasets.get_dataset(44125)

# Features and default target; the target is appended to X as 'target'
X, y, _, attribute_names = dataset.get_data(target=dataset.default_target_attribute)
X['target'] = y

# Indices handed to enforce_dtypes: 0-9 as numeric, 10 as categorical
# (index 10 is presumably the appended 'target' column)
num_idx = list(range(10))
cat_idx = [10]

X = enforce_dtypes(dat=X, num_variables=num_idx, cat_variables=cat_idx)

# Split the data (fixed seed)
aux = train_test_data_split(X, my_seed=123)
X_train, X_test = aux["X_train"], aux["X_test"]

loader_train = GenericDataLoader(X_train, target_column='target')
loader_test = GenericDataLoader(X_test, target_column='target')

# One benchmark entry per noise level in the grid
synthesizer_names = [f"addnoise_{noise_level}" for noise_level in noise_percent_grid]

score = Benchmarks.evaluate(
    [
        (name, 'additive_noise', {'noise_percent': noise_level, "num_variables": num_idx})
        for name, noise_level in zip(synthesizer_names, noise_percent_grid)
    ],
    X=loader_train,
    X_test=loader_test,
    metrics={
        "performance": ["xgb"],
        "detection": ["detection_xgb"],
        "privacy": ["DomiasMIA_KDE", "DomiasMIA_prior"],
    },
    repeats=num_repeats,
    task_type="classification",
)

# Summaries of the benchmark output across synthesizers
_summary_args = dict(score_output=score, synthesizer_names=synthesizer_names)
mean = extract_summary(summary_name="mean", **_summary_args)
stddev = extract_summary(summary_name="stddev", **_summary_args)

# Save both summaries as CSV files under out_path
for _summary, _csv_name in (
    (mean, 'mean_tuning_param_selection_magic_telescope.csv'),
    (stddev, 'stddev_tuning_param_selection_magic_telescope.csv'),
):
    file_name = os.path.join(out_path, _csv_name)
    _summary.to_csv(file_name, index=True)

In [None]:
# Select tuning parameter for pol

import openml

# pol (OpenML dataset id 44122)
dataset = openml.datasets.get_dataset(44122)

# Features and default target; the target is appended to X as 'target'
X, y, _, attribute_names = dataset.get_data(target=dataset.default_target_attribute)
X['target'] = y

# Indices handed to enforce_dtypes: 0-25 as numeric, 26 as categorical.
# With the target appended last at index 26, the feature columns are 0-25, so
# the numeric list must run up to 25. Stopping at 22 would leave columns 23-25
# in neither list: they would get no dtype enforcement and would be missing
# from the `num_variables` handed to the additive_noise plugin.
# NOTE(review): assumes all 26 pol features are numeric -- confirm against the
# OpenML description of dataset 44122.
num_idx = list(range(0, 26))
cat_idx = [26]

X = enforce_dtypes(dat = X,
                   num_variables = num_idx,
                   cat_variables = cat_idx)

# Split the data (fixed seed)
aux = train_test_data_split(X, my_seed=123)

X_train = aux["X_train"]
X_test = aux["X_test"]

loader_train = GenericDataLoader(
    X_train,
    target_column = 'target'
)

loader_test = GenericDataLoader(
    X_test,
    target_column = 'target'
)

# One benchmark entry per noise level in the grid; each entry passes the
# noise level and the numeric column indices to the additive_noise plugin.
score = Benchmarks.evaluate(
    [
        (f"addnoise_{noise_level}", 'additive_noise', {'noise_percent': noise_level, "num_variables": num_idx})
        for noise_level in noise_percent_grid
    ],
    X=loader_train,
    X_test=loader_test,
    metrics={"performance": ["xgb"],
             "detection": ["detection_xgb"],
             "privacy": ["DomiasMIA_KDE", "DomiasMIA_prior"]},
    repeats=num_repeats,
    task_type = "classification",
)

# Names must match the first element of the benchmark entries above
synthesizer_names = [f"addnoise_{noise_level}" for noise_level in noise_percent_grid]

# Summaries of the benchmark output across synthesizers
mean = extract_summary(score_output = score,
                       synthesizer_names = synthesizer_names,
                       summary_name = "mean")
stddev = extract_summary(score_output = score,
                         synthesizer_names = synthesizer_names,
                         summary_name = "stddev")

# Save both summaries as CSV files under out_path
file_name = os.path.join(out_path, 'mean_tuning_param_selection_pol.csv')
mean.to_csv(file_name, index = True)

file_name = os.path.join(out_path, 'stddev_tuning_param_selection_pol.csv')
stddev.to_csv(file_name, index = True)