In [None]:
!pip install synthcity
!pip uninstall -y torchaudio torchdata
!pip install openml

In [None]:
# source code

import timeit

# source code

import os

# Directory that holds the helper scripts shipped with this notebook.
code_path = 'code/'

# Execute each helper script in the notebook namespace, in this order:
# utility functions, additional utility functions, and the syn_tab_sjppds
# synthcity plugin. The loop stays at top level so exec() keeps writing into
# the notebook globals.
for script_name in (
    'utility_functions_syn_tab_sjppds_for_icml_2025.py',
    'utility_functions_additional_for_icml_2025.py',
    'syn_tab_sjppds_synthcity_plugin_for_icml_2025.py',
):
    file_path = os.path.join(code_path, script_name)
    with open(os.path.expanduser(file_path)) as file:
        exec(file.read())

In [None]:
# synthcity absolute
from synthcity.plugins import Plugins

# Create a synthcity plugin registry instance.
generators = Plugins()

# Register the custom plugin class under the name "syn_tab_sjppds", the same
# name the train_and_generate_syn_tab_sjppds functions look up.
generators.add("syn_tab_sjppds", SynTabSjppdsPlugin)

In [None]:
# Number of repetitions passed to timeit.timeit in the timing cells; each
# printed average divides the total time by this value.
num_repli = 5

In [None]:
from sklearn.datasets import fetch_california_housing

# Fetch the California housing data with as_frame=True
california_housing = fetch_california_housing(as_frame=True)

# Append the target to the feature frame as a "target" column
X = california_housing.data
y = california_housing.target
X["target"] = y

# Column positions 0-8 are declared numeric; no categorical columns
num_idx = list(range(9))
cat_idx = None
X = enforce_dtypes(dat=X, num_variables=num_idx, cat_variables=cat_idx)

# Split the data with a fixed seed and keep both partitions
aux = train_test_data_split(X, my_seed=123)
X_train, X_test = aux["X_train"], aux["X_test"]

In [None]:
# Wrap the California training split in a loader; 'target' is the target column.
loader_train = GenericDataLoader(X_train, target_column='target')

In [None]:
def train_and_generate_syn_tab_sjppds():
    """Fit the ``syn_tab_sjppds`` plugin on ``loader_train`` and sample from it.

    Reads the module-level ``loader_train`` at call time, requests
    ``len(loader_train)`` synthetic rows and returns them via ``.dataframe()``.
    """
    # NOTE(review): a fresh Plugins() is used here rather than the `generators`
    # instance the plugin was added to -- confirm the registration is shared.
    settings = dict(
        num_variables=list(range(9)),
        cat_variables=None,
        n_levels=200,
        n_prop=0.5,
    )
    generator = Plugins().get('syn_tab_sjppds', **settings)
    sample_size = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=sample_size).dataframe()


def train_and_generate_bayesnet():
    """Fit the ``bayesian_network`` plugin on ``loader_train`` and sample from it.

    Reads the module-level ``loader_train`` at call time, requests
    ``len(loader_train)`` synthetic rows and returns them via ``.dataframe()``.
    """
    settings = dict(
        struct_learning_search_method='hillclimb',
        struct_learning_score='bic',
    )
    generator = Plugins().get('bayesian_network', **settings)
    sample_size = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=sample_size).dataframe()
    

def train_and_generate_arf():
    """Fit the ``arf`` plugin on ``loader_train`` and sample from it.

    Reads the module-level ``loader_train`` at call time, requests
    ``len(loader_train)`` synthetic rows and returns them via ``.dataframe()``.
    """
    settings = dict(
        num_trees=70,
        delta=0,
        max_iters=2,
        early_stop=True,
        min_node_size=6,
    )
    generator = Plugins().get('arf', **settings)
    sample_size = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=sample_size).dataframe()


def train_and_generate_ddpm():
    """Fit the ``ddpm`` plugin on ``loader_train`` and sample from it.

    Reads the module-level ``loader_train`` at call time, requests
    ``len(loader_train)`` synthetic rows and returns them via ``.dataframe()``.
    """
    settings = dict(
        n_iter=8300,
        lr=0.009824330156648882,
        batch_size=3177,
        num_timesteps=200,
        is_classification=False,
    )
    generator = Plugins().get('ddpm', **settings)
    sample_size = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=sample_size).dataframe()


def train_and_generate_tvae():
    """Fit the ``tvae`` plugin on ``loader_train`` and sample from it.

    Reads the module-level ``loader_train`` at call time, requests
    ``len(loader_train)`` synthetic rows and returns them via ``.dataframe()``.
    """
    settings = dict(
        n_iter=200,
        lr=0.001,
        decoder_n_layers_hidden=4,
        weight_decay=0.001,
        batch_size=512,
        n_units_embedding=150,
        decoder_n_units_hidden=300,
        decoder_nonlin="leaky_relu",
        decoder_dropout=0.13648576055463643,
        encoder_n_layers_hidden=2,
        encoder_n_units_hidden=400,
        encoder_nonlin="tanh",
        encoder_dropout=0.02705334756273372,
    )
    generator = Plugins().get('tvae', **settings)
    sample_size = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=sample_size).dataframe()


def train_and_generate_ctgan():
    """Fit the ``ctgan`` plugin on ``loader_train`` and sample from it.

    Reads the module-level ``loader_train`` at call time, requests
    ``len(loader_train)`` synthetic rows and returns them via ``.dataframe()``.
    """
    settings = dict(
        generator_n_layers_hidden=1,
        generator_n_units_hidden=150,
        generator_nonlin='relu',
        n_iter=600,
        generator_dropout=0.16863490048383495,
        discriminator_n_layers_hidden=3,
        discriminator_n_units_hidden=150,
        discriminator_nonlin='relu',
        discriminator_n_iter=4,
        discriminator_dropout=0.06303278452420555,
        lr=0.0002,
        weight_decay=0.001,
        batch_size=200,
        encoder_max_clusters=5,
    )
    generator = Plugins().get('ctgan', **settings)
    sample_size = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=sample_size).dataframe()

In [None]:
# Total time for num_repli fit-and-generate runs; the per-run mean is printed.
rt_california_syn_tab_sjppds = timeit.timeit(
    stmt=train_and_generate_syn_tab_sjppds,
    number=num_repli,
)
print(f"Average execution time: {rt_california_syn_tab_sjppds / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli fit-and-generate runs; the per-run mean is printed.
rt_california_bayesnet = timeit.timeit(
    stmt=train_and_generate_bayesnet,
    number=num_repli,
)
print(f"Average execution time: {rt_california_bayesnet / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli fit-and-generate runs; the per-run mean is printed.
rt_california_arf = timeit.timeit(
    stmt=train_and_generate_arf,
    number=num_repli,
)
print(f"Average execution time: {rt_california_arf / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli fit-and-generate runs; the per-run mean is printed.
rt_california_ddpm = timeit.timeit(
    stmt=train_and_generate_ddpm,
    number=num_repli,
)
print(f"Average execution time: {rt_california_ddpm / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli fit-and-generate runs; the per-run mean is printed.
rt_california_tvae = timeit.timeit(
    stmt=train_and_generate_tvae,
    number=num_repli,
)
print(f"Average execution time: {rt_california_tvae / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli fit-and-generate runs; the per-run mean is printed.
rt_california_ctgan = timeit.timeit(
    stmt=train_and_generate_ctgan,
    number=num_repli,
)
print(f"Average execution time: {rt_california_ctgan / num_repli:.6f} seconds")

In [None]:
import openml

## mushroom data (OpenML dataset id 24)
dataset = openml.datasets.get_dataset(24)
X, y, _, attribute_names = dataset.get_data(
    target=dataset.default_target_attribute
)

# Append the target to the feature frame as a 'target' column
X['target'] = y

# column 10 has too many NAs and column 15 has no variability
X = X.drop(columns=X.columns[[10, 15]])

# Column positions 0-20 are declared categorical; no numeric columns
num_idx = None
cat_idx = list(range(21))
X = enforce_dtypes(dat=X, num_variables=num_idx, cat_variables=cat_idx)

# Split the data with a fixed seed and keep both partitions
aux = train_test_data_split(X, my_seed=123)
X_train, X_test = aux["X_train"], aux["X_test"]

In [None]:
# Wrap the mushroom training split in a loader; 'target' is the target column.
loader_train = GenericDataLoader(X_train, target_column='target')

In [None]:
def train_and_generate_syn_tab_sjppds():
    """Fit the ``syn_tab_sjppds`` plugin on ``loader_train`` and sample from it.

    Reads the module-level ``loader_train`` at call time, requests
    ``len(loader_train)`` synthetic rows and returns them via ``.dataframe()``.
    """
    # NOTE(review): a fresh Plugins() is used here rather than the `generators`
    # instance the plugin was added to -- confirm the registration is shared.
    settings = dict(
        num_variables=None,
        cat_variables=list(range(21)),
        n_levels=40,
        n_prop=0.5,
    )
    generator = Plugins().get('syn_tab_sjppds', **settings)
    sample_size = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=sample_size).dataframe()


def train_and_generate_bayesnet():
    """Fit the ``bayesian_network`` plugin on ``loader_train`` and sample from it.

    Reads the module-level ``loader_train`` at call time, requests
    ``len(loader_train)`` synthetic rows and returns them via ``.dataframe()``.
    """
    settings = dict(
        struct_learning_search_method='hillclimb',
        struct_learning_score='bic',
    )
    generator = Plugins().get('bayesian_network', **settings)
    sample_size = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=sample_size).dataframe()
    

def train_and_generate_arf():
    """Fit the ``arf`` plugin on ``loader_train`` and sample from it.

    Reads the module-level ``loader_train`` at call time, requests
    ``len(loader_train)`` synthetic rows and returns them via ``.dataframe()``.
    """
    settings = dict(
        num_trees=100,
        delta=0,
        max_iters=1,
        early_stop=False,
        min_node_size=2,
    )
    generator = Plugins().get('arf', **settings)
    sample_size = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=sample_size).dataframe()


def train_and_generate_ddpm():
    """Fit the ``ddpm`` plugin on ``loader_train`` and sample from it.

    Reads the module-level ``loader_train`` at call time, requests
    ``len(loader_train)`` synthetic rows and returns them via ``.dataframe()``.
    """
    settings = dict(
        n_iter=5127,
        lr=0.00884671824119367,
        batch_size=4093,
        num_timesteps=853,
        is_classification=True,
    )
    generator = Plugins().get('ddpm', **settings)
    sample_size = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=sample_size).dataframe()


def train_and_generate_tvae():
    """Fit the ``tvae`` plugin on ``loader_train`` and sample from it.

    Reads the module-level ``loader_train`` at call time, requests
    ``len(loader_train)`` synthetic rows and returns them via ``.dataframe()``.
    """
    settings = dict(
        n_iter=200,
        lr=0.001,
        decoder_n_layers_hidden=5,
        weight_decay=0.001,
        batch_size=512,
        n_units_embedding=150,
        decoder_n_units_hidden=150,
        decoder_nonlin='relu',
        decoder_dropout=0.1171471896118231,
        encoder_n_layers_hidden=4,
        encoder_n_units_hidden=300,
        encoder_nonlin='tanh',
        encoder_dropout=0.16007215982462047,
    )
    generator = Plugins().get('tvae', **settings)
    sample_size = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=sample_size).dataframe()


def train_and_generate_ctgan():
    """Fit the ``ctgan`` plugin on ``loader_train`` and sample from it.

    Reads the module-level ``loader_train`` at call time, requests
    ``len(loader_train)`` synthetic rows and returns them via ``.dataframe()``.
    """
    settings = dict(
        generator_n_layers_hidden=4,
        generator_n_units_hidden=50,
        generator_nonlin='leaky_relu',
        n_iter=700,
        generator_dropout=0.020973543252274986,
        discriminator_n_layers_hidden=3,
        discriminator_n_units_hidden=150,
        discriminator_nonlin='tanh',
        discriminator_n_iter=4,
        discriminator_dropout=0.1644064126493125,
        lr=0.001,
        weight_decay=0.001,
        batch_size=500,
        encoder_max_clusters=13,
    )
    generator = Plugins().get('ctgan', **settings)
    sample_size = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=sample_size).dataframe()

In [None]:
# Total time for num_repli fit-and-generate runs; the per-run mean is printed.
rt_mushroom_syn_tab_sjppds = timeit.timeit(
    stmt=train_and_generate_syn_tab_sjppds,
    number=num_repli,
)
print(f"Average execution time: {rt_mushroom_syn_tab_sjppds / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli fit-and-generate runs; the per-run mean is printed.
rt_mushroom_bayesnet = timeit.timeit(
    stmt=train_and_generate_bayesnet,
    number=num_repli,
)
print(f"Average execution time: {rt_mushroom_bayesnet / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli fit-and-generate runs; the per-run mean is printed.
rt_mushroom_arf = timeit.timeit(
    stmt=train_and_generate_arf,
    number=num_repli,
)
print(f"Average execution time: {rt_mushroom_arf / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli fit-and-generate runs; the per-run mean is printed.
rt_mushroom_ddpm = timeit.timeit(
    stmt=train_and_generate_ddpm,
    number=num_repli,
)
print(f"Average execution time: {rt_mushroom_ddpm / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli fit-and-generate runs; the per-run mean is printed.
rt_mushroom_tvae = timeit.timeit(
    stmt=train_and_generate_tvae,
    number=num_repli,
)
print(f"Average execution time: {rt_mushroom_tvae / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli fit-and-generate runs; the per-run mean is printed.
rt_mushroom_ctgan = timeit.timeit(
    stmt=train_and_generate_ctgan,
    number=num_repli,
)
print(f"Average execution time: {rt_mushroom_ctgan / num_repli:.6f} seconds")

In [None]:
# adult dataset

from sklearn.datasets import fetch_openml

# Fetch version 1 of the "adult" dataset from OpenML with as_frame=True
openml_data = fetch_openml(name="adult", as_frame=True, version=1)

# Append the target to the feature frame as a "target" column
X = openml_data.data
y = openml_data.target
X["target"] = y

# Drop rows with missing values, then apply process_adult_data
# (defined outside this cell)
X = process_adult_data(X.dropna())

# Column positions 2 and 4 are declared numeric, all others categorical
num_idx = [2, 4]
cat_idx = [0, 1, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
X = enforce_dtypes(dat=X, num_variables=num_idx, cat_variables=cat_idx)

# Split the data with a fixed seed and keep both partitions
aux = train_test_data_split(X, my_seed=123)
X_train, X_test = aux["X_train"], aux["X_test"]

In [None]:
# Wrap the adult training split in a loader; 'target' is the target column.
loader_train = GenericDataLoader(X_train, target_column='target')

In [None]:
def train_and_generate_syn_tab_sjppds():
    """Fit the ``syn_tab_sjppds`` plugin on ``loader_train`` and sample from it.

    Reads the module-level ``loader_train`` at call time, requests
    ``len(loader_train)`` synthetic rows and returns them via ``.dataframe()``.
    """
    # NOTE(review): a fresh Plugins() is used here rather than the `generators`
    # instance the plugin was added to -- confirm the registration is shared.
    settings = dict(
        num_variables=[2, 4],
        cat_variables=[0, 1, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
        n_levels=20,
        n_prop=0.5,
    )
    generator = Plugins().get('syn_tab_sjppds', **settings)
    sample_size = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=sample_size).dataframe()


def train_and_generate_bayesnet():
    """Fit the ``bayesian_network`` plugin on ``loader_train`` and sample from it.

    Reads the module-level ``loader_train`` at call time, requests
    ``len(loader_train)`` synthetic rows and returns them via ``.dataframe()``.
    """
    settings = dict(
        struct_learning_search_method='hillclimb',
        struct_learning_score='bic',
    )
    generator = Plugins().get('bayesian_network', **settings)
    sample_size = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=sample_size).dataframe()
    

def train_and_generate_arf():
    """Fit the ``arf`` plugin (no keyword overrides) on ``loader_train`` and sample from it.

    Reads the module-level ``loader_train`` at call time, requests
    ``len(loader_train)`` synthetic rows and returns them via ``.dataframe()``.
    """
    generator = Plugins().get('arf')
    sample_size = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=sample_size).dataframe()


def train_and_generate_ddpm():
    """Fit the ``ddpm`` plugin on ``loader_train`` and sample from it.

    Reads the module-level ``loader_train`` at call time, requests
    ``len(loader_train)`` synthetic rows and returns them via ``.dataframe()``.
    """
    settings = dict(
        n_iter=1051,
        lr=0.0009375080542687667,
        batch_size=2929,
        num_timesteps=998,
        is_classification=True,
    )
    generator = Plugins().get("ddpm", **settings)
    sample_size = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=sample_size).dataframe()


def train_and_generate_tvae():
    """Fit the ``tvae`` plugin on ``loader_train`` and sample from it.

    Reads the module-level ``loader_train`` at call time, requests
    ``len(loader_train)`` synthetic rows and returns them via ``.dataframe()``.
    """
    settings = dict(
        n_iter=300,
        lr=0.0002,
        decoder_n_layers_hidden=4,
        weight_decay=0.001,
        batch_size=256,
        n_units_embedding=200,
        decoder_n_units_hidden=300,
        decoder_nonlin="elu",
        decoder_dropout=0.194325119117226,
        encoder_n_layers_hidden=1,
        encoder_n_units_hidden=450,
        encoder_nonlin="leaky_relu",
        encoder_dropout=0.04288563703094718,
    )
    generator = Plugins().get("tvae", **settings)
    sample_size = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=sample_size).dataframe()


def train_and_generate_ctgan():
    """Fit the ``ctgan`` plugin on ``loader_train`` and sample from it.

    Reads the module-level ``loader_train`` at call time, requests
    ``len(loader_train)`` synthetic rows and returns them via ``.dataframe()``.
    """
    settings = dict(
        n_iter=1000,
        generator_n_layers_hidden=2,
        generator_n_units_hidden=50,
        generator_nonlin="tanh",
        generator_dropout=0.0575,
        discriminator_n_layers_hidden=4,
        discriminator_n_units_hidden=150,
        discriminator_nonlin="relu",
    )
    generator = Plugins().get("ctgan", **settings)
    sample_size = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=sample_size).dataframe()

In [None]:
# Total time for num_repli fit-and-generate runs; the per-run mean is printed.
rt_adult_syn_tab_sjppds = timeit.timeit(
    stmt=train_and_generate_syn_tab_sjppds,
    number=num_repli,
)
print(f"Average execution time: {rt_adult_syn_tab_sjppds / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli fit-and-generate runs; the per-run mean is printed.
rt_adult_bayesnet = timeit.timeit(
    stmt=train_and_generate_bayesnet,
    number=num_repli,
)
print(f"Average execution time: {rt_adult_bayesnet / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli fit-and-generate runs; the per-run mean is printed.
rt_adult_arf = timeit.timeit(
    stmt=train_and_generate_arf,
    number=num_repli,
)
print(f"Average execution time: {rt_adult_arf / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli fit-and-generate runs; the per-run mean is printed.
rt_adult_ddpm = timeit.timeit(
    stmt=train_and_generate_ddpm,
    number=num_repli,
)
print(f"Average execution time: {rt_adult_ddpm / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli fit-and-generate runs; the per-run mean is printed.
rt_adult_tvae = timeit.timeit(
    stmt=train_and_generate_tvae,
    number=num_repli,
)
print(f"Average execution time: {rt_adult_tvae / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli fit-and-generate runs; the per-run mean is printed.
rt_adult_ctgan = timeit.timeit(
    stmt=train_and_generate_ctgan,
    number=num_repli,
)
print(f"Average execution time: {rt_adult_ctgan / num_repli:.6f} seconds")

In [None]:
# eye movements dataset

import openml

# eye movements (OpenML dataset id 44130)
dataset = openml.datasets.get_dataset(44130)
X, y, _, attribute_names = dataset.get_data(
    target=dataset.default_target_attribute
)

# Append the target to the feature frame as a 'target' column
X['target'] = y

# Column positions 0-19 are declared numeric, position 20 categorical
num_idx = list(range(20))
cat_idx = [20]
X = enforce_dtypes(dat=X, num_variables=num_idx, cat_variables=cat_idx)

# Split the data with a fixed seed and keep both partitions
aux = train_test_data_split(X, my_seed=123)
X_train, X_test = aux["X_train"], aux["X_test"]

In [None]:
# Wrap the eye movements training split in a loader; 'target' is the target column.
loader_train = GenericDataLoader(X_train, target_column='target')

In [None]:
def train_and_generate_syn_tab_sjppds():
    """Fit the ``syn_tab_sjppds`` plugin on ``loader_train`` and sample from it.

    Reads the module-level ``loader_train`` at call time, requests
    ``len(loader_train)`` synthetic rows and returns them via ``.dataframe()``.
    """
    # NOTE(review): a fresh Plugins() is used here rather than the `generators`
    # instance the plugin was added to -- confirm the registration is shared.
    settings = dict(
        num_variables=list(range(0, 20)),
        cat_variables=[20],
        n_levels=20,
        n_prop=0.5,
    )
    generator = Plugins().get('syn_tab_sjppds', **settings)
    sample_size = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=sample_size).dataframe()


def train_and_generate_bayesnet():
    """Fit the ``bayesian_network`` plugin on ``loader_train`` and sample from it.

    Reads the module-level ``loader_train`` at call time, requests
    ``len(loader_train)`` synthetic rows and returns them via ``.dataframe()``.
    """
    settings = dict(
        struct_learning_search_method='hillclimb',
        struct_learning_score='bic',
    )
    generator = Plugins().get('bayesian_network', **settings)
    sample_size = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=sample_size).dataframe()
    

def train_and_generate_arf():
    """Fit the ``arf`` plugin (no keyword overrides) on ``loader_train`` and sample from it.

    Reads the module-level ``loader_train`` at call time, requests
    ``len(loader_train)`` synthetic rows and returns them via ``.dataframe()``.
    """
    generator = Plugins().get('arf')
    sample_size = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=sample_size).dataframe()


def train_and_generate_ddpm():
    """Fit the ``ddpm`` plugin on ``loader_train`` and sample from it.

    Reads the module-level ``loader_train`` at call time, requests
    ``len(loader_train)`` synthetic rows and returns them via ``.dataframe()``.
    """
    settings = dict(
        n_iter=1051,
        lr=0.0009375080542687667,
        batch_size=2929,
        num_timesteps=998,
        is_classification=True,
    )
    generator = Plugins().get("ddpm", **settings)
    sample_size = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=sample_size).dataframe()


def train_and_generate_tvae():
    """Fit the ``tvae`` plugin on ``loader_train`` and sample from it.

    Reads the module-level ``loader_train`` at call time, requests
    ``len(loader_train)`` synthetic rows and returns them via ``.dataframe()``.
    """
    settings = dict(
        n_iter=300,
        lr=0.0002,
        decoder_n_layers_hidden=4,
        weight_decay=0.001,
        batch_size=256,
        n_units_embedding=200,
        decoder_n_units_hidden=300,
        decoder_nonlin="elu",
        decoder_dropout=0.194325119117226,
        encoder_n_layers_hidden=1,
        encoder_n_units_hidden=450,
        encoder_nonlin="leaky_relu",
        encoder_dropout=0.04288563703094718,
    )
    generator = Plugins().get("tvae", **settings)
    sample_size = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=sample_size).dataframe()


def train_and_generate_ctgan():
    """Fit the ``ctgan`` plugin on ``loader_train`` and sample from it.

    Reads the module-level ``loader_train`` at call time, requests
    ``len(loader_train)`` synthetic rows and returns them via ``.dataframe()``.
    """
    settings = dict(
        n_iter=1000,
        generator_n_layers_hidden=2,
        generator_n_units_hidden=50,
        generator_nonlin="tanh",
        generator_dropout=0.0575,
        discriminator_n_layers_hidden=4,
        discriminator_n_units_hidden=150,
        discriminator_nonlin="relu",
    )
    generator = Plugins().get("ctgan", **settings)
    sample_size = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=sample_size).dataframe()

In [None]:
# Total time for num_repli fit-and-generate runs; the per-run mean is printed.
rt_eye_syn_tab_sjppds = timeit.timeit(
    stmt=train_and_generate_syn_tab_sjppds,
    number=num_repli,
)
print(f"Average execution time: {rt_eye_syn_tab_sjppds / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli fit-and-generate runs of the Bayesian-network
# generator on the eye movements data; the per-run mean is printed.
# The average is computed from rt_eye_bayesnet (the truncated name `rt_eye_`
# is undefined and raises NameError).
rt_eye_bayesnet = timeit.timeit(train_and_generate_bayesnet, number=num_repli)  # Repeat num_repli times
print(f"Average execution time: {rt_eye_bayesnet / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli fit-and-generate runs; the per-run mean is printed.
rt_eye_arf = timeit.timeit(
    stmt=train_and_generate_arf,
    number=num_repli,
)
print(f"Average execution time: {rt_eye_arf / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli fit-and-generate runs; the per-run mean is printed.
rt_eye_ddpm = timeit.timeit(
    stmt=train_and_generate_ddpm,
    number=num_repli,
)
print(f"Average execution time: {rt_eye_ddpm / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli fit-and-generate runs; the per-run mean is printed.
rt_eye_tvae = timeit.timeit(
    stmt=train_and_generate_tvae,
    number=num_repli,
)
print(f"Average execution time: {rt_eye_tvae / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli fit-and-generate runs; the per-run mean is printed.
rt_eye_ctgan = timeit.timeit(
    stmt=train_and_generate_ctgan,
    number=num_repli,
)
print(f"Average execution time: {rt_eye_ctgan / num_repli:.6f} seconds")

In [None]:
# Repetition count passed to timeit.timeit in the timing cells that follow
# (same value as the assignment near the top of the notebook).
num_repli = 5

In [None]:
import openml

# credit (OpenML dataset id 44089)
dataset = openml.datasets.get_dataset(44089)
X, y, _, attribute_names = dataset.get_data(
    target=dataset.default_target_attribute
)

# Append the target to the feature frame as a 'target' column
X['target'] = y

# Column positions 0-9 are declared numeric, position 10 categorical
num_idx = list(range(10))
cat_idx = [10]
X = enforce_dtypes(dat=X, num_variables=num_idx, cat_variables=cat_idx)

# Split the data with a fixed seed and keep both partitions
aux = train_test_data_split(X, my_seed=123)
X_train, X_test = aux["X_train"], aux["X_test"]

In [None]:
# Wrap the credit training split in a loader; 'target' is the target column.
loader_train = GenericDataLoader(X_train, target_column='target')

In [None]:
def train_and_generate_syn_tab_sjppds():
    """Fit the ``syn_tab_sjppds`` plugin on ``loader_train`` and sample from it.

    Reads the module-level ``loader_train`` at call time, requests
    ``len(loader_train)`` synthetic rows and returns them via ``.dataframe()``.
    """
    # NOTE(review): a fresh Plugins() is used here rather than the `generators`
    # instance the plugin was added to -- confirm the registration is shared.
    settings = dict(
        num_variables=list(range(0, 10)),
        cat_variables=[10],
        n_levels=1000,
        n_prop=0.5,
    )
    generator = Plugins().get('syn_tab_sjppds', **settings)
    sample_size = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=sample_size).dataframe()


def train_and_generate_bayesnet():
    """Fit the ``bayesian_network`` plugin on ``loader_train`` and sample from it.

    Reads the module-level ``loader_train`` at call time, requests
    ``len(loader_train)`` synthetic rows and returns them via ``.dataframe()``.
    """
    settings = dict(
        struct_learning_search_method='hillclimb',
        struct_learning_score='bic',
    )
    generator = Plugins().get('bayesian_network', **settings)
    sample_size = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=sample_size).dataframe()
    

def train_and_generate_arf():
    """Fit the ``arf`` plugin (no keyword overrides) on ``loader_train`` and sample from it.

    Reads the module-level ``loader_train`` at call time, requests
    ``len(loader_train)`` synthetic rows and returns them via ``.dataframe()``.
    """
    generator = Plugins().get('arf')
    sample_size = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=sample_size).dataframe()


def train_and_generate_ddpm():
    """Fit the ``ddpm`` plugin on ``loader_train`` and sample from it.

    Reads the module-level ``loader_train`` at call time, requests
    ``len(loader_train)`` synthetic rows and returns them via ``.dataframe()``.
    """
    settings = dict(
        n_iter=1051,
        lr=0.0009375080542687667,
        batch_size=2929,
        num_timesteps=998,
        is_classification=True,
    )
    generator = Plugins().get("ddpm", **settings)
    sample_size = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=sample_size).dataframe()


def train_and_generate_tvae():
    """Fit the ``tvae`` plugin on ``loader_train`` and sample from it.

    Reads the module-level ``loader_train`` at call time, requests
    ``len(loader_train)`` synthetic rows and returns them via ``.dataframe()``.
    """
    settings = dict(
        n_iter=300,
        lr=0.0002,
        decoder_n_layers_hidden=4,
        weight_decay=0.001,
        batch_size=256,
        n_units_embedding=200,
        decoder_n_units_hidden=300,
        decoder_nonlin="elu",
        decoder_dropout=0.194325119117226,
        encoder_n_layers_hidden=1,
        encoder_n_units_hidden=450,
        encoder_nonlin="leaky_relu",
        encoder_dropout=0.04288563703094718,
    )
    generator = Plugins().get("tvae", **settings)
    sample_size = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=sample_size).dataframe()


def train_and_generate_ctgan():
    """Fit the ``ctgan`` plugin on ``loader_train`` and sample from it.

    Reads the module-level ``loader_train`` at call time, requests
    ``len(loader_train)`` synthetic rows and returns them via ``.dataframe()``.
    """
    settings = dict(
        n_iter=1000,
        generator_n_layers_hidden=2,
        generator_n_units_hidden=50,
        generator_nonlin="tanh",
        generator_dropout=0.0575,
        discriminator_n_layers_hidden=4,
        discriminator_n_units_hidden=150,
        discriminator_nonlin="relu",
    )
    generator = Plugins().get("ctgan", **settings)
    sample_size = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=sample_size).dataframe()

In [None]:
# Total time for num_repli fit-and-generate runs; the per-run mean is printed.
rt_credit_syn_tab_sjppds = timeit.timeit(
    stmt=train_and_generate_syn_tab_sjppds,
    number=num_repli,
)
print(f"Average execution time: {rt_credit_syn_tab_sjppds / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli fit-and-generate runs; the per-run mean is printed.
rt_credit_bayesnet = timeit.timeit(
    stmt=train_and_generate_bayesnet,
    number=num_repli,
)
print(f"Average execution time: {rt_credit_bayesnet / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli fit-and-generate runs; the per-run mean is printed.
rt_credit_arf = timeit.timeit(
    stmt=train_and_generate_arf,
    number=num_repli,
)
print(f"Average execution time: {rt_credit_arf / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli fit-and-generate runs; the per-run mean is printed.
rt_credit_ddpm = timeit.timeit(
    stmt=train_and_generate_ddpm,
    number=num_repli,
)
print(f"Average execution time: {rt_credit_ddpm / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli fit-and-generate runs; the per-run mean is printed.
rt_credit_tvae = timeit.timeit(
    stmt=train_and_generate_tvae,
    number=num_repli,
)
print(f"Average execution time: {rt_credit_tvae / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli fit-and-generate runs; the per-run mean is printed.
rt_credit_ctgan = timeit.timeit(
    stmt=train_and_generate_ctgan,
    number=num_repli,
)
print(f"Average execution time: {rt_credit_ctgan / num_repli:.6f} seconds")

In [None]:
# pol dataset

import openml

# pol (OpenML dataset id 44122)
dataset = openml.datasets.get_dataset(44122)
X, y, _, attribute_names = dataset.get_data(
    target=dataset.default_target_attribute
)

# Append the target to the feature frame as a 'target' column
X['target'] = y

# Column positions 0-25 are declared numeric, position 26 categorical
num_idx = list(range(26))
cat_idx = [26]
X = enforce_dtypes(dat=X, num_variables=num_idx, cat_variables=cat_idx)

# Split the data with a fixed seed and keep both partitions
aux = train_test_data_split(X, my_seed=123)
X_train, X_test = aux["X_train"], aux["X_test"]

In [None]:
# Wrap the pol training split in a loader; 'target' is the target column.
loader_train = GenericDataLoader(X_train, target_column='target')

In [None]:
def train_and_generate_syn_tab_sjppds():
    """Fit the ``syn_tab_sjppds`` plugin on ``loader_train`` and sample from it.

    Reads the module-level ``loader_train`` at call time, requests
    ``len(loader_train)`` synthetic rows and returns them via ``.dataframe()``.
    """
    # NOTE(review): a fresh Plugins() is used here rather than the `generators`
    # instance the plugin was added to -- confirm the registration is shared.
    settings = dict(
        num_variables=list(range(0, 26)),
        cat_variables=[26],
        n_levels=15,
        n_prop=0.5,
    )
    generator = Plugins().get('syn_tab_sjppds', **settings)
    sample_size = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=sample_size).dataframe()


def train_and_generate_bayesnet():
    """Fit the ``bayesian_network`` plugin on ``loader_train`` and sample from it.

    Reads the module-level ``loader_train`` at call time, requests
    ``len(loader_train)`` synthetic rows and returns them via ``.dataframe()``.
    """
    settings = dict(
        struct_learning_search_method='hillclimb',
        struct_learning_score='bic',
    )
    generator = Plugins().get('bayesian_network', **settings)
    sample_size = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=sample_size).dataframe()
    

def train_and_generate_arf():
    """Fit the ``arf`` plugin (no keyword overrides) on ``loader_train`` and sample from it.

    Reads the module-level ``loader_train`` at call time, requests
    ``len(loader_train)`` synthetic rows and returns them via ``.dataframe()``.
    """
    generator = Plugins().get('arf')
    sample_size = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=sample_size).dataframe()


def train_and_generate_ddpm():
    """Fit the ``ddpm`` plugin on ``loader_train`` and sample from it.

    Reads the module-level ``loader_train`` at call time, requests
    ``len(loader_train)`` synthetic rows and returns them via ``.dataframe()``.
    """
    settings = dict(
        n_iter=1051,
        lr=0.0009375080542687667,
        batch_size=2929,
        num_timesteps=998,
        is_classification=True,
    )
    generator = Plugins().get("ddpm", **settings)
    sample_size = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=sample_size).dataframe()


def train_and_generate_tvae():
    """Fit the ``tvae`` plugin on ``loader_train`` and sample from it.

    Reads the module-level ``loader_train`` at call time, requests
    ``len(loader_train)`` synthetic rows and returns them via ``.dataframe()``.
    """
    settings = dict(
        n_iter=300,
        lr=0.0002,
        decoder_n_layers_hidden=4,
        weight_decay=0.001,
        batch_size=256,
        n_units_embedding=200,
        decoder_n_units_hidden=300,
        decoder_nonlin="elu",
        decoder_dropout=0.194325119117226,
        encoder_n_layers_hidden=1,
        encoder_n_units_hidden=450,
        encoder_nonlin="leaky_relu",
        encoder_dropout=0.04288563703094718,
    )
    generator = Plugins().get("tvae", **settings)
    sample_size = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=sample_size).dataframe()


def train_and_generate_ctgan():
    """Fit the ``ctgan`` plugin on ``loader_train`` and sample from it.

    Reads the module-level ``loader_train`` at call time, requests
    ``len(loader_train)`` synthetic rows and returns them via ``.dataframe()``.
    """
    settings = dict(
        n_iter=1000,
        generator_n_layers_hidden=2,
        generator_n_units_hidden=50,
        generator_nonlin="tanh",
        generator_dropout=0.0575,
        discriminator_n_layers_hidden=4,
        discriminator_n_units_hidden=150,
        discriminator_nonlin="relu",
    )
    generator = Plugins().get("ctgan", **settings)
    sample_size = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=sample_size).dataframe()

In [None]:
# Total time for num_repli fit-and-generate runs; the per-run mean is printed.
rt_pol_syn_tab_sjppds = timeit.timeit(
    stmt=train_and_generate_syn_tab_sjppds,
    number=num_repli,
)
print(f"Average execution time: {rt_pol_syn_tab_sjppds / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli fit-and-generate runs; the per-run mean is printed.
rt_pol_bayesnet = timeit.timeit(
    stmt=train_and_generate_bayesnet,
    number=num_repli,
)
print(f"Average execution time: {rt_pol_bayesnet / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli fit-and-generate runs; the per-run mean is printed.
rt_pol_arf = timeit.timeit(
    stmt=train_and_generate_arf,
    number=num_repli,
)
print(f"Average execution time: {rt_pol_arf / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli fit-and-generate runs; the per-run mean is printed.
rt_pol_ddpm = timeit.timeit(
    stmt=train_and_generate_ddpm,
    number=num_repli,
)
print(f"Average execution time: {rt_pol_ddpm / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli fit-and-generate runs; the per-run mean is printed.
rt_pol_tvae = timeit.timeit(
    stmt=train_and_generate_tvae,
    number=num_repli,
)
print(f"Average execution time: {rt_pol_tvae / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli fit-and-generate runs; the per-run mean is printed.
rt_pol_ctgan = timeit.timeit(
    stmt=train_and_generate_ctgan,
    number=num_repli,
)
print(f"Average execution time: {rt_pol_ctgan / num_repli:.6f} seconds")

In [None]:
import openml

# house_16H (OpenML dataset id 44123)
dataset = openml.datasets.get_dataset(44123)
X, y, _, attribute_names = dataset.get_data(
    target=dataset.default_target_attribute
)

# Append the target to the feature frame as a 'target' column
X['target'] = y

# Column positions 0-15 are declared numeric, position 16 categorical
num_idx = list(range(16))
cat_idx = [16]
X = enforce_dtypes(dat=X, num_variables=num_idx, cat_variables=cat_idx)

# Split the data with a fixed seed and keep both partitions
aux = train_test_data_split(X, my_seed=123)
X_train, X_test = aux["X_train"], aux["X_test"]

In [None]:
# Wrap the house_16H training split in a loader; 'target' is the target column.
loader_train = GenericDataLoader(X_train, target_column='target')

In [None]:
def train_and_generate_syn_tab_sjppds():
    """Fit the ``syn_tab_sjppds`` plugin on ``loader_train`` and sample from it.

    Reads the module-level ``loader_train`` at call time, requests
    ``len(loader_train)`` synthetic rows and returns them via ``.dataframe()``.
    """
    # NOTE(review): a fresh Plugins() is used here rather than the `generators`
    # instance the plugin was added to -- confirm the registration is shared.
    settings = dict(
        num_variables=list(range(0, 16)),
        cat_variables=[16],
        n_levels=1000,
        n_prop=0.5,
    )
    generator = Plugins().get('syn_tab_sjppds', **settings)
    sample_size = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=sample_size).dataframe()


def train_and_generate_bayesnet():
    """Fit the ``bayesian_network`` plugin on ``loader_train`` and sample from it.

    Reads the module-level ``loader_train`` at call time, requests
    ``len(loader_train)`` synthetic rows and returns them via ``.dataframe()``.
    """
    settings = dict(
        struct_learning_search_method='hillclimb',
        struct_learning_score='bic',
    )
    generator = Plugins().get('bayesian_network', **settings)
    sample_size = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=sample_size).dataframe()
    

def train_and_generate_arf():
    """Fit the ``arf`` plugin (no keyword overrides) on ``loader_train`` and sample from it.

    Reads the module-level ``loader_train`` at call time, requests
    ``len(loader_train)`` synthetic rows and returns them via ``.dataframe()``.
    """
    generator = Plugins().get('arf')
    sample_size = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=sample_size).dataframe()


def train_and_generate_ddpm():
    """Fit the ``ddpm`` plugin on ``loader_train`` and sample from it.

    Reads the module-level ``loader_train`` at call time, requests
    ``len(loader_train)`` synthetic rows and returns them via ``.dataframe()``.
    """
    settings = dict(
        n_iter=1051,
        lr=0.0009375080542687667,
        batch_size=2929,
        num_timesteps=998,
        is_classification=True,
    )
    generator = Plugins().get("ddpm", **settings)
    sample_size = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=sample_size).dataframe()


def train_and_generate_tvae():
    """Fit a fresh "tvae" plugin on the global `loader_train` and sample from it.

    Optimiser, encoder and decoder settings are fixed below. Returns
    `.dataframe()` of `len(loader_train)` generated samples.
    """
    hparams = dict(
        n_iter = 300,
        lr = 0.0002,
        decoder_n_layers_hidden = 4,
        weight_decay = 0.001,
        batch_size = 256,
        n_units_embedding = 200,
        decoder_n_units_hidden = 300,
        decoder_nonlin = "elu",
        decoder_dropout = 0.194325119117226,
        encoder_n_layers_hidden = 1,
        encoder_n_units_hidden = 450,
        encoder_nonlin = "leaky_relu",
        encoder_dropout = 0.04288563703094718,
    )
    generator = Plugins().get("tvae", **hparams)
    n_rows = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=n_rows).dataframe()


def train_and_generate_ctgan():
    """Fit a fresh "ctgan" plugin on the global `loader_train` and sample from it.

    Generator and discriminator settings are fixed below. Returns
    `.dataframe()` of `len(loader_train)` generated samples.
    """
    hparams = dict(
        n_iter = 1000,
        generator_n_layers_hidden = 2,
        generator_n_units_hidden = 50,
        generator_nonlin = "tanh",
        generator_dropout = 0.0575,
        discriminator_n_layers_hidden = 4,
        discriminator_n_units_hidden = 150,
        discriminator_nonlin = "relu",
    )
    generator = Plugins().get("ctgan", **hparams)
    n_rows = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=n_rows).dataframe()

In [None]:
# Total time for num_repli consecutive fit+generate runs (house_16H, syn_tab_sjppds).
rt_house16h_syn_tab_sjppds = timeit.timeit(
    stmt=train_and_generate_syn_tab_sjppds,
    number=num_repli,
)
print(f"Average execution time: {rt_house16h_syn_tab_sjppds / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli consecutive fit+generate runs (house_16H, bayesian_network).
rt_house16h_bayesnet = timeit.timeit(
    stmt=train_and_generate_bayesnet,
    number=num_repli,
)
print(f"Average execution time: {rt_house16h_bayesnet / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli consecutive fit+generate runs (house_16H, arf).
rt_house16h_arf = timeit.timeit(
    stmt=train_and_generate_arf,
    number=num_repli,
)
print(f"Average execution time: {rt_house16h_arf / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli consecutive fit+generate runs (house_16H, ddpm).
rt_house16h_ddpm = timeit.timeit(
    stmt=train_and_generate_ddpm,
    number=num_repli,
)
print(f"Average execution time: {rt_house16h_ddpm / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli consecutive fit+generate runs (house_16H, tvae).
rt_house16h_tvae = timeit.timeit(
    stmt=train_and_generate_tvae,
    number=num_repli,
)
print(f"Average execution time: {rt_house16h_tvae / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli consecutive fit+generate runs (house_16H, ctgan).
rt_house16h_ctgan = timeit.timeit(
    stmt=train_and_generate_ctgan,
    number=num_repli,
)
print(f"Average execution time: {rt_house16h_ctgan / num_repli:.6f} seconds")

In [None]:
# bank marketing

import openml

# bank marketing (OpenML dataset id 44126)
dataset = openml.datasets.get_dataset(44126)

# Features and the dataset's default target are returned separately;
# the third return value is not used here.
X, y, _, attribute_names = dataset.get_data(
    target=dataset.default_target_attribute
)

# Append the target as an extra column.
# NOTE(review): the index lists below assume it lands at position 7 — confirm.
X['target'] = y

# Positional column roles passed to enforce_dtypes (helper defined elsewhere).
num_idx = list(range(7))
cat_idx = [7]

X = enforce_dtypes(dat=X, num_variables=num_idx, cat_variables=cat_idx)

# Split the data (seed fixed at 123)
aux = train_test_data_split(X, my_seed=123)

X_train, X_test = aux["X_train"], aux["X_test"]

In [None]:
# Wrap the current training split (bank marketing when cells run in order);
# 'target' is declared as the target column. The generator functions fit on this object.
loader_train = GenericDataLoader(X_train, target_column = 'target')

In [None]:
def train_and_generate_syn_tab_sjppds():
    """Fit a fresh 'syn_tab_sjppds' plugin on the global `loader_train` and sample from it.

    Requests `len(loader_train)` synthetic samples and returns the result of
    `.dataframe()` on them.
    """
    hparams = dict(
        num_variables = list(range(7)),
        cat_variables = [7],
        n_levels = 100,
        n_prop = 0.5,
    )
    generator = Plugins().get('syn_tab_sjppds', **hparams)
    n_rows = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=n_rows).dataframe()


def train_and_generate_bayesnet():
    """Fit a fresh 'bayesian_network' plugin on the global `loader_train` and sample from it.

    Structure learning is configured with the 'hillclimb' search method and
    the 'bic' score. Returns `.dataframe()` of `len(loader_train)` samples.
    """
    generator = Plugins().get(
        'bayesian_network',
        struct_learning_search_method = 'hillclimb',
        struct_learning_score = 'bic',
    )
    n_rows = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=n_rows).dataframe()
    

def train_and_generate_arf():
    """Fit a fresh 'arf' plugin (no keyword overrides) on the global `loader_train`.

    Returns `.dataframe()` of `len(loader_train)` generated samples.
    """
    generator = Plugins().get('arf')
    n_rows = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=n_rows).dataframe()


def train_and_generate_ddpm():
    """Fit a fresh "ddpm" plugin on the global `loader_train` and sample from it.

    Uses the fixed hyperparameters below (with `is_classification = True`).
    Returns `.dataframe()` of `len(loader_train)` generated samples.
    """
    hparams = dict(
        n_iter = 1051,
        lr = 0.0009375080542687667,
        batch_size = 2929,
        num_timesteps = 998,
        is_classification = True,
    )
    generator = Plugins().get("ddpm", **hparams)
    n_rows = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=n_rows).dataframe()


def train_and_generate_tvae():
    """Fit a fresh "tvae" plugin on the global `loader_train` and sample from it.

    Optimiser, encoder and decoder settings are fixed below. Returns
    `.dataframe()` of `len(loader_train)` generated samples.
    """
    hparams = dict(
        n_iter = 300,
        lr = 0.0002,
        decoder_n_layers_hidden = 4,
        weight_decay = 0.001,
        batch_size = 256,
        n_units_embedding = 200,
        decoder_n_units_hidden = 300,
        decoder_nonlin = "elu",
        decoder_dropout = 0.194325119117226,
        encoder_n_layers_hidden = 1,
        encoder_n_units_hidden = 450,
        encoder_nonlin = "leaky_relu",
        encoder_dropout = 0.04288563703094718,
    )
    generator = Plugins().get("tvae", **hparams)
    n_rows = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=n_rows).dataframe()


def train_and_generate_ctgan():
    """Fit a fresh "ctgan" plugin on the global `loader_train` and sample from it.

    Generator and discriminator settings are fixed below. Returns
    `.dataframe()` of `len(loader_train)` generated samples.
    """
    hparams = dict(
        n_iter = 1000,
        generator_n_layers_hidden = 2,
        generator_n_units_hidden = 50,
        generator_nonlin = "tanh",
        generator_dropout = 0.0575,
        discriminator_n_layers_hidden = 4,
        discriminator_n_units_hidden = 150,
        discriminator_nonlin = "relu",
    )
    generator = Plugins().get("ctgan", **hparams)
    n_rows = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=n_rows).dataframe()

In [None]:
# Total time for num_repli consecutive fit+generate runs (bank marketing, syn_tab_sjppds).
rt_bankmark_syn_tab_sjppds = timeit.timeit(
    stmt=train_and_generate_syn_tab_sjppds,
    number=num_repli,
)
print(f"Average execution time: {rt_bankmark_syn_tab_sjppds / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli consecutive fit+generate runs (bank marketing, bayesian_network).
rt_bankmark_bayesnet = timeit.timeit(
    stmt=train_and_generate_bayesnet,
    number=num_repli,
)
print(f"Average execution time: {rt_bankmark_bayesnet / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli consecutive fit+generate runs (bank marketing, arf).
rt_bankmark_arf = timeit.timeit(
    stmt=train_and_generate_arf,
    number=num_repli,
)
print(f"Average execution time: {rt_bankmark_arf / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli consecutive fit+generate runs (bank marketing, ddpm).
rt_bankmark_ddpm = timeit.timeit(
    stmt=train_and_generate_ddpm,
    number=num_repli,
)
print(f"Average execution time: {rt_bankmark_ddpm / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli consecutive fit+generate runs (bank marketing, tvae).
rt_bankmark_tvae = timeit.timeit(
    stmt=train_and_generate_tvae,
    number=num_repli,
)
print(f"Average execution time: {rt_bankmark_tvae / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli consecutive fit+generate runs (bank marketing, ctgan).
rt_bankmark_ctgan = timeit.timeit(
    stmt=train_and_generate_ctgan,
    number=num_repli,
)
print(f"Average execution time: {rt_bankmark_ctgan / num_repli:.6f} seconds")

In [None]:
# load the data

from sklearn.datasets import fetch_openml

# Fetch the Abalone dataset (OpenML name "abalone", version 1) as pandas objects
abalone = fetch_openml(name="abalone", version=1, as_frame=True)

# Pull out the feature frame and the target
X, y = abalone.data, abalone.target

# Append the target as an extra column.
# NOTE(review): the index lists below assume it lands at position 8 — confirm.
X['target'] = y  # Rings

# Positional column roles passed to enforce_dtypes (helper defined elsewhere):
# column 0 is treated as categorical, columns 1-8 as numeric.
num_idx = list(range(1, 9))
cat_idx = [0]

X = enforce_dtypes(dat=X, num_variables=num_idx, cat_variables=cat_idx)

# Split the data (seed fixed at 123)
aux = train_test_data_split(X, my_seed=123)

X_train, X_test = aux["X_train"], aux["X_test"]

In [None]:
# Wrap the current training split (abalone when cells run in order);
# 'target' is declared as the target column. The generator functions fit on this object.
loader_train = GenericDataLoader(X_train, target_column = 'target')

In [None]:
def train_and_generate_syn_tab_sjppds():
    """Fit a fresh 'syn_tab_sjppds' plugin on the global `loader_train` and sample from it.

    Requests `len(loader_train)` synthetic samples and returns the result of
    `.dataframe()` on them.
    """
    hparams = dict(
        num_variables = list(range(1, 9)),
        cat_variables = [0],
        n_levels = 20,
        n_prop = 0.5,
    )
    generator = Plugins().get('syn_tab_sjppds', **hparams)
    n_rows = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=n_rows).dataframe()


def train_and_generate_bayesnet():
    """Fit a fresh 'bayesian_network' plugin on the global `loader_train` and sample from it.

    Structure learning is configured with the 'hillclimb' search method and
    the 'bic' score. Returns `.dataframe()` of `len(loader_train)` samples.
    """
    generator = Plugins().get(
        'bayesian_network',
        struct_learning_search_method = 'hillclimb',
        struct_learning_score = 'bic',
    )
    n_rows = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=n_rows).dataframe()
    

def train_and_generate_arf():
    """Fit a fresh 'arf' plugin on the global `loader_train` and sample from it.

    Unlike the other datasets' versions in this notebook, explicit keyword
    settings are passed here. Returns `.dataframe()` of `len(loader_train)`
    generated samples.
    """
    hparams = dict(
        num_trees = 80,
        delta = 0,
        max_iters = 2,
        early_stop = False,
        min_node_size = 2,
    )
    generator = Plugins().get('arf', **hparams)
    n_rows = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=n_rows).dataframe()


def train_and_generate_ddpm():
    """Fit a fresh "ddpm" plugin on the global `loader_train` and sample from it.

    Uses the fixed hyperparameters below (with `is_classification = False`).
    Returns `.dataframe()` of `len(loader_train)` generated samples.
    """
    hparams = dict(
        n_iter = 7605,
        lr = 0.002991978123076162,
        batch_size = 970,
        num_timesteps = 407,
        is_classification = False,
    )
    generator = Plugins().get("ddpm", **hparams)
    n_rows = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=n_rows).dataframe()


def train_and_generate_tvae():
    """Fit a fresh "tvae" plugin on the global `loader_train` and sample from it.

    Optimiser, encoder and decoder settings are fixed below. Returns
    `.dataframe()` of `len(loader_train)` generated samples.
    """
    hparams = dict(
        n_iter = 400,
        lr = 0.001,
        decoder_n_layers_hidden = 5,
        weight_decay = 0.0001,
        batch_size = 128,
        n_units_embedding = 200,
        decoder_n_units_hidden = 150,
        decoder_nonlin = 'tanh',
        decoder_dropout = 0.19964446358158816,
        encoder_n_layers_hidden = 4,
        encoder_n_units_hidden = 100,
        encoder_nonlin = 'relu',
        encoder_dropout = 0.0820245231222064,
    )
    generator = Plugins().get("tvae", **hparams)
    n_rows = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=n_rows).dataframe()


def train_and_generate_ctgan():
    """Fit a fresh "ctgan" plugin on the global `loader_train` and sample from it.

    Generator, discriminator and optimiser settings are fixed below. Returns
    `.dataframe()` of `len(loader_train)` generated samples.
    """
    hparams = dict(
        generator_n_layers_hidden = 1,
        generator_n_units_hidden = 100,
        generator_nonlin = 'elu',
        n_iter = 700,
        generator_dropout = 0.13836424598477665,
        discriminator_n_layers_hidden = 2,
        discriminator_n_units_hidden = 100,
        discriminator_nonlin = 'tanh',
        discriminator_n_iter = 5,
        discriminator_dropout = 0.023861565936528797,
        lr = 0.001,
        weight_decay = 0.0001,
        batch_size = 200,
        encoder_max_clusters = 8,
    )
    generator = Plugins().get("ctgan", **hparams)
    n_rows = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=n_rows).dataframe()

In [None]:
# Total time for num_repli consecutive fit+generate runs (abalone, syn_tab_sjppds).
rt_abalone_syn_tab_sjppds = timeit.timeit(
    stmt=train_and_generate_syn_tab_sjppds,
    number=num_repli,
)
print(f"Average execution time: {rt_abalone_syn_tab_sjppds / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli consecutive fit+generate runs (abalone, bayesian_network).
rt_abalone_bayesnet = timeit.timeit(
    stmt=train_and_generate_bayesnet,
    number=num_repli,
)
print(f"Average execution time: {rt_abalone_bayesnet / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli consecutive fit+generate runs (abalone, arf).
rt_abalone_arf = timeit.timeit(
    stmt=train_and_generate_arf,
    number=num_repli,
)
print(f"Average execution time: {rt_abalone_arf / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli consecutive fit+generate runs (abalone, ddpm).
rt_abalone_ddpm = timeit.timeit(
    stmt=train_and_generate_ddpm,
    number=num_repli,
)
print(f"Average execution time: {rt_abalone_ddpm / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli consecutive fit+generate runs (abalone, tvae).
rt_abalone_tvae = timeit.timeit(
    stmt=train_and_generate_tvae,
    number=num_repli,
)
print(f"Average execution time: {rt_abalone_tvae / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli consecutive fit+generate runs (abalone, ctgan).
rt_abalone_ctgan = timeit.timeit(
    stmt=train_and_generate_ctgan,
    number=num_repli,
)
print(f"Average execution time: {rt_abalone_ctgan / num_repli:.6f} seconds")

In [None]:
# MagicTelescope dataset

import openml

# MagicTelescope (OpenML dataset id 44125)
dataset = openml.datasets.get_dataset(44125)

# Features and the dataset's default target are returned separately;
# the third return value is not used here.
X, y, _, attribute_names = dataset.get_data(
    target=dataset.default_target_attribute
)

# Append the target as an extra column.
# NOTE(review): the index lists below assume it lands at position 10 — confirm.
X['target'] = y

# Positional column roles passed to enforce_dtypes (helper defined elsewhere).
num_idx = list(range(10))
cat_idx = [10]

X = enforce_dtypes(dat=X, num_variables=num_idx, cat_variables=cat_idx)

# Split the data (seed fixed at 123)
aux = train_test_data_split(X, my_seed=123)

X_train, X_test = aux["X_train"], aux["X_test"]

In [None]:
# Wrap the current training split (MagicTelescope when cells run in order);
# 'target' is declared as the target column. The generator functions fit on this object.
loader_train = GenericDataLoader(X_train, target_column = 'target')

In [None]:
def train_and_generate_syn_tab_sjppds():
    """Fit a fresh 'syn_tab_sjppds' plugin on the global `loader_train` and sample from it.

    Requests `len(loader_train)` synthetic samples and returns the result of
    `.dataframe()` on them.
    """
    hparams = dict(
        num_variables = list(range(10)),
        cat_variables = [10],
        n_levels = 25,
        n_prop = 0.5,
    )
    generator = Plugins().get('syn_tab_sjppds', **hparams)
    n_rows = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=n_rows).dataframe()


def train_and_generate_bayesnet():
    """Fit a fresh 'bayesian_network' plugin on the global `loader_train` and sample from it.

    Structure learning is configured with the 'hillclimb' search method and
    the 'bic' score. Returns `.dataframe()` of `len(loader_train)` samples.
    """
    generator = Plugins().get(
        'bayesian_network',
        struct_learning_search_method = 'hillclimb',
        struct_learning_score = 'bic',
    )
    n_rows = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=n_rows).dataframe()
    

def train_and_generate_arf():
    """Fit a fresh 'arf' plugin (no keyword overrides) on the global `loader_train`.

    Returns `.dataframe()` of `len(loader_train)` generated samples.
    """
    generator = Plugins().get('arf')
    n_rows = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=n_rows).dataframe()


def train_and_generate_ddpm():
    """Fit a fresh "ddpm" plugin on the global `loader_train` and sample from it.

    Uses the fixed hyperparameters below (with `is_classification = True`).
    Returns `.dataframe()` of `len(loader_train)` generated samples.
    """
    hparams = dict(
        n_iter = 1051,
        lr = 0.0009375080542687667,
        batch_size = 2929,
        num_timesteps = 998,
        is_classification = True,
    )
    generator = Plugins().get("ddpm", **hparams)
    n_rows = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=n_rows).dataframe()


def train_and_generate_tvae():
    """Fit a fresh "tvae" plugin on the global `loader_train` and sample from it.

    Optimiser, encoder and decoder settings are fixed below. Returns
    `.dataframe()` of `len(loader_train)` generated samples.
    """
    hparams = dict(
        n_iter = 300,
        lr = 0.0002,
        decoder_n_layers_hidden = 4,
        weight_decay = 0.001,
        batch_size = 256,
        n_units_embedding = 200,
        decoder_n_units_hidden = 300,
        decoder_nonlin = "elu",
        decoder_dropout = 0.194325119117226,
        encoder_n_layers_hidden = 1,
        encoder_n_units_hidden = 450,
        encoder_nonlin = "leaky_relu",
        encoder_dropout = 0.04288563703094718,
    )
    generator = Plugins().get("tvae", **hparams)
    n_rows = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=n_rows).dataframe()


def train_and_generate_ctgan():
    """Fit a fresh "ctgan" plugin on the global `loader_train` and sample from it.

    Generator and discriminator settings are fixed below. Returns
    `.dataframe()` of `len(loader_train)` generated samples.
    """
    hparams = dict(
        n_iter = 1000,
        generator_n_layers_hidden = 2,
        generator_n_units_hidden = 50,
        generator_nonlin = "tanh",
        generator_dropout = 0.0575,
        discriminator_n_layers_hidden = 4,
        discriminator_n_units_hidden = 150,
        discriminator_nonlin = "relu",
    )
    generator = Plugins().get("ctgan", **hparams)
    n_rows = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=n_rows).dataframe()

In [None]:
# Total time for num_repli consecutive fit+generate runs (MagicTelescope, syn_tab_sjppds).
rt_magtel_syn_tab_sjppds = timeit.timeit(
    stmt=train_and_generate_syn_tab_sjppds,
    number=num_repli,
)
print(f"Average execution time: {rt_magtel_syn_tab_sjppds / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli consecutive fit+generate runs (MagicTelescope, bayesian_network).
rt_magtel_bayesnet = timeit.timeit(
    stmt=train_and_generate_bayesnet,
    number=num_repli,
)
print(f"Average execution time: {rt_magtel_bayesnet / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli consecutive fit+generate runs (MagicTelescope, arf).
rt_magtel_arf = timeit.timeit(
    stmt=train_and_generate_arf,
    number=num_repli,
)
print(f"Average execution time: {rt_magtel_arf / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli consecutive fit+generate runs (MagicTelescope, ddpm).
rt_magtel_ddpm = timeit.timeit(
    stmt=train_and_generate_ddpm,
    number=num_repli,
)
print(f"Average execution time: {rt_magtel_ddpm / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli consecutive fit+generate runs (MagicTelescope, tvae).
rt_magtel_tvae = timeit.timeit(
    stmt=train_and_generate_tvae,
    number=num_repli,
)
print(f"Average execution time: {rt_magtel_tvae / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli consecutive fit+generate runs (MagicTelescope, ctgan).
rt_magtel_ctgan = timeit.timeit(
    stmt=train_and_generate_ctgan,
    number=num_repli,
)
print(f"Average execution time: {rt_magtel_ctgan / num_repli:.6f} seconds")

In [None]:
# load the electricity data 

import openml

# electricity (OpenML dataset id 44120)
dataset = openml.datasets.get_dataset(44120)

# Features and the dataset's default target are returned separately;
# the third return value is not used here.
X, y, _, attribute_names = dataset.get_data(
    target=dataset.default_target_attribute
)

# Append the target as an extra column.
# NOTE(review): the index lists below assume it lands at position 7 — confirm.
X['target'] = y

# Positional column roles passed to enforce_dtypes (helper defined elsewhere).
num_idx = list(range(7))
cat_idx = [7]

X = enforce_dtypes(dat=X, num_variables=num_idx, cat_variables=cat_idx)

# Split the data (seed fixed at 123)
aux = train_test_data_split(X, my_seed=123)

X_train, X_test = aux["X_train"], aux["X_test"]

In [None]:
# Wrap the current training split (electricity when cells run in order);
# 'target' is declared as the target column. The generator functions fit on this object.
loader_train = GenericDataLoader(X_train, target_column = 'target')

In [None]:
def train_and_generate_syn_tab_sjppds():
    """Fit a fresh 'syn_tab_sjppds' plugin on the global `loader_train` and sample from it.

    Requests `len(loader_train)` synthetic samples and returns the result of
    `.dataframe()` on them.
    """
    hparams = dict(
        num_variables = list(range(7)),
        cat_variables = [7],
        n_levels = 20,
        n_prop = 0.5,
    )
    generator = Plugins().get('syn_tab_sjppds', **hparams)
    n_rows = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=n_rows).dataframe()


def train_and_generate_bayesnet():
    """Fit a fresh 'bayesian_network' plugin on the global `loader_train` and sample from it.

    Structure learning is configured with the 'hillclimb' search method and
    the 'bic' score. Returns `.dataframe()` of `len(loader_train)` samples.
    """
    generator = Plugins().get(
        'bayesian_network',
        struct_learning_search_method = 'hillclimb',
        struct_learning_score = 'bic',
    )
    n_rows = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=n_rows).dataframe()
    

def train_and_generate_arf():
    """Fit a fresh 'arf' plugin (no keyword overrides) on the global `loader_train`.

    Returns `.dataframe()` of `len(loader_train)` generated samples.
    """
    generator = Plugins().get('arf')
    n_rows = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=n_rows).dataframe()


def train_and_generate_ddpm():
    """Fit a fresh "ddpm" plugin on the global `loader_train` and sample from it.

    Uses the fixed hyperparameters below (with `is_classification = True`).
    Returns `.dataframe()` of `len(loader_train)` generated samples.
    """
    hparams = dict(
        n_iter = 1051,
        lr = 0.0009375080542687667,
        batch_size = 2929,
        num_timesteps = 998,
        is_classification = True,
    )
    generator = Plugins().get("ddpm", **hparams)
    n_rows = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=n_rows).dataframe()


def train_and_generate_tvae():
    """Fit a fresh "tvae" plugin on the global `loader_train` and sample from it.

    Optimiser, encoder and decoder settings are fixed below. Returns
    `.dataframe()` of `len(loader_train)` generated samples.
    """
    hparams = dict(
        n_iter = 300,
        lr = 0.0002,
        decoder_n_layers_hidden = 4,
        weight_decay = 0.001,
        batch_size = 256,
        n_units_embedding = 200,
        decoder_n_units_hidden = 300,
        decoder_nonlin = "elu",
        decoder_dropout = 0.194325119117226,
        encoder_n_layers_hidden = 1,
        encoder_n_units_hidden = 450,
        encoder_nonlin = "leaky_relu",
        encoder_dropout = 0.04288563703094718,
    )
    generator = Plugins().get("tvae", **hparams)
    n_rows = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=n_rows).dataframe()


def train_and_generate_ctgan():
    """Fit a fresh "ctgan" plugin on the global `loader_train` and sample from it.

    Generator and discriminator settings are fixed below. Returns
    `.dataframe()` of `len(loader_train)` generated samples.
    """
    hparams = dict(
        n_iter = 1000,
        generator_n_layers_hidden = 2,
        generator_n_units_hidden = 50,
        generator_nonlin = "tanh",
        generator_dropout = 0.0575,
        discriminator_n_layers_hidden = 4,
        discriminator_n_units_hidden = 150,
        discriminator_nonlin = "relu",
    )
    generator = Plugins().get("ctgan", **hparams)
    n_rows = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=n_rows).dataframe()

In [None]:
# Total time for num_repli consecutive fit+generate runs (electricity, syn_tab_sjppds).
rt_electricity_syn_tab_sjppds = timeit.timeit(
    stmt=train_and_generate_syn_tab_sjppds,
    number=num_repli,
)
print(f"Average execution time: {rt_electricity_syn_tab_sjppds / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli consecutive fit+generate runs (electricity, bayesian_network).
rt_electricity_bayesnet = timeit.timeit(
    stmt=train_and_generate_bayesnet,
    number=num_repli,
)
print(f"Average execution time: {rt_electricity_bayesnet / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli consecutive fit+generate runs (electricity, arf).
rt_electricity_arf = timeit.timeit(
    stmt=train_and_generate_arf,
    number=num_repli,
)
print(f"Average execution time: {rt_electricity_arf / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli consecutive fit+generate runs (electricity, ddpm).
rt_electricity_ddpm = timeit.timeit(
    stmt=train_and_generate_ddpm,
    number=num_repli,
)
print(f"Average execution time: {rt_electricity_ddpm / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli consecutive fit+generate runs (electricity, tvae).
rt_electricity_tvae = timeit.timeit(
    stmt=train_and_generate_tvae,
    number=num_repli,
)
print(f"Average execution time: {rt_electricity_tvae / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli consecutive fit+generate runs (electricity, ctgan).
rt_electricity_ctgan = timeit.timeit(
    stmt=train_and_generate_ctgan,
    number=num_repli,
)
print(f"Average execution time: {rt_electricity_ctgan / num_repli:.6f} seconds")

In [None]:
import openml

# Diabetes130US (OpenML dataset id 45022)
dataset = openml.datasets.get_dataset(45022)

# Features and the dataset's default target are returned separately;
# the third return value is not used here.
X, y, _, attribute_names = dataset.get_data(
    target=dataset.default_target_attribute
)

# Append the target as an extra column.
# NOTE(review): the index lists below assume it lands at position 7 — confirm.
X['target'] = y

# Positional column roles passed to enforce_dtypes (helper defined elsewhere).
num_idx = list(range(7))
cat_idx = [7]

X = enforce_dtypes(dat=X, num_variables=num_idx, cat_variables=cat_idx)

# Split the data (seed fixed at 123)
aux = train_test_data_split(X, my_seed=123)

X_train, X_test = aux["X_train"], aux["X_test"]

In [None]:
# Wrap the current training split (Diabetes130US when cells run in order);
# 'target' is declared as the target column. The generator functions fit on this object.
loader_train = GenericDataLoader(X_train, target_column = 'target')

In [None]:
def train_and_generate_syn_tab_sjppds():
    """Fit a fresh 'syn_tab_sjppds' plugin on the global `loader_train` and sample from it.

    Requests `len(loader_train)` synthetic samples and returns the result of
    `.dataframe()` on them.
    """
    hparams = dict(
        num_variables = list(range(7)),
        cat_variables = [7],
        n_levels = 35,
        n_prop = 0.5,
    )
    generator = Plugins().get('syn_tab_sjppds', **hparams)
    n_rows = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=n_rows).dataframe()


def train_and_generate_bayesnet():
    """Fit a fresh 'bayesian_network' plugin on the global `loader_train` and sample from it.

    Structure learning is configured with the 'hillclimb' search method and
    the 'bic' score. Returns `.dataframe()` of `len(loader_train)` samples.
    """
    generator = Plugins().get(
        'bayesian_network',
        struct_learning_search_method = 'hillclimb',
        struct_learning_score = 'bic',
    )
    n_rows = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=n_rows).dataframe()
    

def train_and_generate_arf():
    """Fit a fresh 'arf' plugin (no keyword overrides) on the global `loader_train`.

    Returns `.dataframe()` of `len(loader_train)` generated samples.
    """
    generator = Plugins().get('arf')
    n_rows = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=n_rows).dataframe()


def train_and_generate_ddpm():
    """Fit a fresh "ddpm" plugin on the global `loader_train` and sample from it.

    Uses the fixed hyperparameters below (with `is_classification = True`).
    Returns `.dataframe()` of `len(loader_train)` generated samples.
    """
    hparams = dict(
        n_iter = 1051,
        lr = 0.0009375080542687667,
        batch_size = 2929,
        num_timesteps = 998,
        is_classification = True,
    )
    generator = Plugins().get("ddpm", **hparams)
    n_rows = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=n_rows).dataframe()


def train_and_generate_tvae():
    """Fit a fresh "tvae" plugin on the global `loader_train` and sample from it.

    Optimiser, encoder and decoder settings are fixed below. Returns
    `.dataframe()` of `len(loader_train)` generated samples.
    """
    hparams = dict(
        n_iter = 300,
        lr = 0.0002,
        decoder_n_layers_hidden = 4,
        weight_decay = 0.001,
        batch_size = 256,
        n_units_embedding = 200,
        decoder_n_units_hidden = 300,
        decoder_nonlin = "elu",
        decoder_dropout = 0.194325119117226,
        encoder_n_layers_hidden = 1,
        encoder_n_units_hidden = 450,
        encoder_nonlin = "leaky_relu",
        encoder_dropout = 0.04288563703094718,
    )
    generator = Plugins().get("tvae", **hparams)
    n_rows = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=n_rows).dataframe()


def train_and_generate_ctgan():
    """Fit a fresh "ctgan" plugin on the global `loader_train` and sample from it.

    Generator and discriminator settings are fixed below. Returns
    `.dataframe()` of `len(loader_train)` generated samples.
    """
    hparams = dict(
        n_iter = 1000,
        generator_n_layers_hidden = 2,
        generator_n_units_hidden = 50,
        generator_nonlin = "tanh",
        generator_dropout = 0.0575,
        discriminator_n_layers_hidden = 4,
        discriminator_n_units_hidden = 150,
        discriminator_nonlin = "relu",
    )
    generator = Plugins().get("ctgan", **hparams)
    n_rows = len(loader_train)
    generator.fit(loader_train)
    return generator.generate(count=n_rows).dataframe()

In [None]:
# Total time for num_repli consecutive fit+generate runs (Diabetes130US, syn_tab_sjppds).
rt_diabetes130us_syn_tab_sjppds = timeit.timeit(
    stmt=train_and_generate_syn_tab_sjppds,
    number=num_repli,
)
print(f"Average execution time: {rt_diabetes130us_syn_tab_sjppds / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli consecutive fit+generate runs (Diabetes130US, bayesian_network).
rt_diabetes130us_bayesnet = timeit.timeit(
    stmt=train_and_generate_bayesnet,
    number=num_repli,
)
print(f"Average execution time: {rt_diabetes130us_bayesnet / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli consecutive fit+generate runs (Diabetes130US, arf).
rt_diabetes130us_arf = timeit.timeit(
    stmt=train_and_generate_arf,
    number=num_repli,
)
print(f"Average execution time: {rt_diabetes130us_arf / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli consecutive fit+generate runs (Diabetes130US, ddpm).
rt_diabetes130us_ddpm = timeit.timeit(
    stmt=train_and_generate_ddpm,
    number=num_repli,
)
print(f"Average execution time: {rt_diabetes130us_ddpm / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli consecutive fit+generate runs (Diabetes130US, tvae).
rt_diabetes130us_tvae = timeit.timeit(
    stmt=train_and_generate_tvae,
    number=num_repli,
)
print(f"Average execution time: {rt_diabetes130us_tvae / num_repli:.6f} seconds")

In [None]:
# Total time for num_repli consecutive fit+generate runs (Diabetes130US, ctgan).
rt_diabetes130us_ctgan = timeit.timeit(
    stmt=train_and_generate_ctgan,
    number=num_repli,
)
print(f"Average execution time: {rt_diabetes130us_ctgan / num_repli:.6f} seconds")