In [None]:
!pip install synthcity
!pip uninstall -y torchaudio torchdata
!pip install openml

In [None]:
# source code

import os

code_path = 'code/'

# Scripts executed, in this order, directly into the notebook namespace:
#   1. utility functions
#   2. additional utility functions
#   3. the synth_tab_sjppds method synthcity plugin
# The loop stays at top level (not inside a function) so that exec() keeps
# defining names in the notebook's global scope.
for script_name in (
    'utility_functions_syn_tab_sjppds_for_icml_2025.py',
    'utility_functions_additional_for_icml_2025.py',
    'syn_tab_sjppds_synthcity_plugin_for_icml_2025.py',
):
    file_path = os.path.join(code_path, script_name)
    with open(os.path.expanduser(file_path)) as file:
        exec(file.read())

In [None]:
# Register the syn_tab_sjppds plugin with synthcity's plugin registry.
from synthcity.plugins import Plugins

generators = Plugins()

# SynTabSjppdsPlugin is not imported in this cell; it is presumably defined
# by the plugin script exec'd in the "source code" cell, which must run first.
generators.add("syn_tab_sjppds", SynTabSjppdsPlugin)

In [None]:
import matplotlib.pyplot as plt

# The dataset cells below call np.random.seed / np.arange / np.median, but no
# numpy import is visible in this notebook; import it explicitly instead of
# relying on a name leaked by the exec'd helper scripts.
import numpy as np

# Tuning-parameter grid passed to syn_tab_sjppds_dcr as `tuning_par_grid`:
# 5..50 in steps of 5, 60..100 in steps of 10, 200..1000 in steps of 100.
n_level_grid = list(range(5, 51, 5)) + list(range(60, 101, 10)) + list(range(200, 1001, 100))

# Directory that receives one DCR csv per dataset.
out_path = 'outputs/tuning_param_dcr_TabSDS/'

# Create the output directory up front so the to_csv calls in the dataset
# cells do not fail (after the expensive DCR computation) on a fresh checkout.
os.makedirs(out_path, exist_ok=True)

In [None]:
from sklearn.datasets import fetch_openml

# Abalone: compute, save and plot the syn_tab_sjppds DCR results across the
# n_levels grid.

# Fetch the Abalone dataset
abalone = fetch_openml(name="abalone", version=1, as_frame=True)

# Copy the feature frame before appending the target so the object held by
# the fetched Bunch stays untouched and pandas has no reason to emit a
# SettingWithCopyWarning on the column assignment below.
X = abalone.data.copy()
y = abalone.target

X['target'] = y  # Rings

# Column indices handed to the helpers: 0 as categorical, 1-8 as numeric
# (8 is presumably the appended target -- confirm against X.columns).
num_idx = [1, 2, 3, 4, 5, 6, 7, 8]
cat_idx = [0]

X = enforce_dtypes(dat = X, 
                   num_variables = num_idx, 
                   cat_variables = cat_idx)

# Split the data
aux = train_test_data_split(X, my_seed=123)

X_train = aux["X_train"]
X_test = aux["X_test"]

# Seed NumPy's global RNG immediately before the DCR computation.
np.random.seed(123)

dcr_AB = syn_tab_sjppds_dcr(dat_train = X_train, 
                           dat_test = X_test, 
                           num_variables = num_idx, 
                           cat_variables = cat_idx, 
                           n_prop=0.5, 
                           tuning_par_grid=n_level_grid)

file_name = os.path.join(out_path, 'syn_tab_sjppds_dcr_AB.csv')
dcr_AB.to_csv(file_name, index = False)

## plot DCR distributions across n_levels grid

dcr = dcr_AB

# Draw on an explicit, freshly created figure so the plot never lands on a
# stale "current" pyplot figure, and title it so it can be told apart from
# the other datasets' otherwise identical-looking plots.
fig, ax = plt.subplots()
ax.boxplot(dcr.values, showfliers=False)
ax.set_xticks(np.arange(1, dcr.shape[1] + 1))
ax.set_xticklabels(dcr.columns, rotation=90)

# Add a horizontal line at the median of the first column
ax.axhline(y=np.median(dcr.iloc[:, 0]), color='red', linestyle='--')

ax.set_title("Abalone: DCR across the tuning parameter grid")
ax.set_xlabel("Tuning parameter grid")
ax.set_ylabel("DCR")
fig.tight_layout()  # Adjust layout to prevent label overlap
plt.show()

In [None]:
# bank marketing dataset: compute, save and plot the syn_tab_sjppds DCR
# results across the n_levels grid.

import openml

# bank marketing
dataset = openml.datasets.get_dataset(44126) 

X, y, _, attribute_names = dataset.get_data(target=dataset.default_target_attribute)

X['target'] = y

# Column indices handed to the helpers: 0-6 as numeric, 7 as categorical
# (7 is presumably the appended target -- confirm against X.columns).
num_idx = list(range(0, 7))
cat_idx = [7]

X = enforce_dtypes(dat = X, 
                   num_variables = num_idx, 
                   cat_variables = cat_idx)

# Split the data
aux = train_test_data_split(X, my_seed=123)

X_train = aux["X_train"]
X_test = aux["X_test"]

# Seed NumPy's global RNG immediately before the DCR computation.
np.random.seed(123)

dcr_BM = syn_tab_sjppds_dcr(dat_train = X_train, 
                           dat_test = X_test, 
                           num_variables = num_idx, 
                           cat_variables = cat_idx, 
                           n_prop=0.5, 
                           tuning_par_grid=n_level_grid)

file_name = os.path.join(out_path, 'syn_tab_sjppds_dcr_BM.csv')
dcr_BM.to_csv(file_name, index = False)

## plot DCR distributions across n_levels grid

dcr = dcr_BM

# Draw on an explicit, freshly created figure so the plot never lands on a
# stale "current" pyplot figure, and title it so it can be told apart from
# the other datasets' otherwise identical-looking plots.
fig, ax = plt.subplots()
ax.boxplot(dcr.values, showfliers=False)
ax.set_xticks(np.arange(1, dcr.shape[1] + 1))
ax.set_xticklabels(dcr.columns, rotation=90)

# Add a horizontal line at the median of the first column
ax.axhline(y=np.median(dcr.iloc[:, 0]), color='red', linestyle='--')

ax.set_title("Bank marketing: DCR across the tuning parameter grid")
ax.set_xlabel("Tuning parameter grid")
ax.set_ylabel("DCR")
fig.tight_layout()  # Adjust layout to prevent label overlap
plt.show()

In [None]:
# california housing dataset: compute, save and plot the syn_tab_sjppds DCR
# results across the n_levels grid.

from sklearn.datasets import fetch_california_housing

# Load the dataset
california_housing = fetch_california_housing(as_frame=True)

# Features (X) and target (y).
# Copy the feature frame before appending the target so the object held by
# the fetched Bunch stays untouched and pandas has no reason to emit a
# SettingWithCopyWarning on the column assignment below.
X = california_housing.data.copy()
y = california_housing.target

X["MedHouseVal"] = y

# All nine columns (features plus the appended MedHouseVal) are passed as
# numeric; there are no categorical columns for this dataset.
num_idx = list(range(9))
cat_idx = None

X = enforce_dtypes(dat = X, 
                   num_variables = num_idx, 
                   cat_variables = cat_idx)

# Split the data
aux = train_test_data_split(X, my_seed=123)

X_train = aux["X_train"]
X_test = aux["X_test"]

# Seed NumPy's global RNG immediately before the DCR computation.
np.random.seed(123)

dcr_CH = syn_tab_sjppds_dcr(dat_train = X_train, 
                           dat_test = X_test, 
                           num_variables = num_idx, 
                           cat_variables = cat_idx, 
                           n_prop=0.5, 
                           tuning_par_grid=n_level_grid)

file_name = os.path.join(out_path, 'syn_tab_sjppds_dcr_CH.csv')
dcr_CH.to_csv(file_name, index = False)

## plot DCR distributions across n_levels grid

dcr = dcr_CH

# Draw on an explicit, freshly created figure so the plot never lands on a
# stale "current" pyplot figure, and title it so it can be told apart from
# the other datasets' otherwise identical-looking plots.
fig, ax = plt.subplots()
ax.boxplot(dcr.values, showfliers=False)
ax.set_xticks(np.arange(1, dcr.shape[1] + 1))
ax.set_xticklabels(dcr.columns, rotation=90)

# Add a horizontal line at the median of the first column
ax.axhline(y=np.median(dcr.iloc[:, 0]), color='red', linestyle='--')

ax.set_title("California housing: DCR across the tuning parameter grid")
ax.set_xlabel("Tuning parameter grid")
ax.set_ylabel("DCR")
fig.tight_layout()  # Adjust layout to prevent label overlap
plt.show()

In [None]:
# Select tuning parameter for credit: compute, save and plot the
# syn_tab_sjppds DCR results across the n_levels grid.

import openml

# credit
dataset = openml.datasets.get_dataset(44089) 

X, y, _, attribute_names = dataset.get_data(target=dataset.default_target_attribute)

X['target'] = y

# Column indices handed to the helpers: 0-9 as numeric, 10 as categorical
# (10 is presumably the appended target -- confirm against X.columns).
num_idx = list(range(0, 10))
cat_idx = [10]

X = enforce_dtypes(dat = X, 
                   num_variables = num_idx, 
                   cat_variables = cat_idx)

# Split the data
aux = train_test_data_split(X, my_seed=123)

X_train = aux["X_train"]
X_test = aux["X_test"]


# Seed NumPy's global RNG immediately before the DCR computation.
np.random.seed(123)

dcr_CR = syn_tab_sjppds_dcr(dat_train = X_train, 
                           dat_test = X_test, 
                           num_variables = num_idx, 
                           cat_variables = cat_idx, 
                           n_prop=0.5, 
                           tuning_par_grid=n_level_grid)

file_name = os.path.join(out_path, 'syn_tab_sjppds_dcr_CR.csv')
dcr_CR.to_csv(file_name, index = False)

## plot DCR distributions across n_levels grid

dcr = dcr_CR

# Draw on an explicit, freshly created figure so the plot never lands on a
# stale "current" pyplot figure, and title it so it can be told apart from
# the other datasets' otherwise identical-looking plots.
fig, ax = plt.subplots()
ax.boxplot(dcr.values, showfliers=False)
ax.set_xticks(np.arange(1, dcr.shape[1] + 1))
ax.set_xticklabels(dcr.columns, rotation=90)

# Add a horizontal line at the median of the first column
ax.axhline(y=np.median(dcr.iloc[:, 0]), color='red', linestyle='--')

ax.set_title("Credit: DCR across the tuning parameter grid")
ax.set_xlabel("Tuning parameter grid")
ax.set_ylabel("DCR")
fig.tight_layout()  # Adjust layout to prevent label overlap
plt.show()

In [None]:
# Select tuning parameter for Diabetes130US: compute, save and plot the
# syn_tab_sjppds DCR results across the n_levels grid.

import openml

# Diabetes130US
dataset = openml.datasets.get_dataset(45022) 

X, y, _, attribute_names = dataset.get_data(target=dataset.default_target_attribute)

X['target'] = y

# Column indices handed to the helpers: 0-6 as numeric, 7 as categorical
# (7 is presumably the appended target -- confirm against X.columns).
num_idx = list(range(0, 7))
cat_idx = [7]

X = enforce_dtypes(dat = X, 
                   num_variables = num_idx, 
                   cat_variables = cat_idx)

# Split the data
aux = train_test_data_split(X, my_seed=123)

X_train = aux["X_train"]
X_test = aux["X_test"]


# Seed NumPy's global RNG immediately before the DCR computation.
np.random.seed(123)

dcr_DI = syn_tab_sjppds_dcr(dat_train = X_train, 
                           dat_test = X_test, 
                           num_variables = num_idx, 
                           cat_variables = cat_idx, 
                           n_prop=0.5, 
                           tuning_par_grid=n_level_grid)

file_name = os.path.join(out_path, 'syn_tab_sjppds_dcr_DI.csv')
dcr_DI.to_csv(file_name, index = False)

## plot DCR distributions across n_levels grid

dcr = dcr_DI

# Draw on an explicit, freshly created figure so the plot never lands on a
# stale "current" pyplot figure, and title it so it can be told apart from
# the other datasets' otherwise identical-looking plots.
fig, ax = plt.subplots()
ax.boxplot(dcr.values, showfliers=False)
ax.set_xticks(np.arange(1, dcr.shape[1] + 1))
ax.set_xticklabels(dcr.columns, rotation=90)

# Add a horizontal line at the median of the first column
ax.axhline(y=np.median(dcr.iloc[:, 0]), color='red', linestyle='--')

ax.set_title("Diabetes130US: DCR across the tuning parameter grid")
ax.set_xlabel("Tuning parameter grid")
ax.set_ylabel("DCR")
fig.tight_layout()  # Adjust layout to prevent label overlap
plt.show()

In [None]:
# load the electricity data, then compute, save and plot the syn_tab_sjppds
# DCR results across the n_levels grid.

import openml

# electricity
dataset = openml.datasets.get_dataset(44120) 

X, y, _, attribute_names = dataset.get_data(target=dataset.default_target_attribute)

X['target'] = y

# Column indices handed to the helpers: 0-6 as numeric, 7 as categorical
# (7 is presumably the appended target -- confirm against X.columns).
num_idx = list(range(0, 7))
cat_idx = [7]

X = enforce_dtypes(dat = X, 
                   num_variables = num_idx, 
                   cat_variables = cat_idx)

# Split the data
aux = train_test_data_split(X, my_seed=123)

X_train = aux["X_train"]
X_test = aux["X_test"]

# Seed NumPy's global RNG immediately before the DCR computation.
np.random.seed(123)

dcr_EL = syn_tab_sjppds_dcr(dat_train = X_train, 
                           dat_test = X_test, 
                           num_variables = num_idx, 
                           cat_variables = cat_idx, 
                           n_prop=0.5, 
                           tuning_par_grid=n_level_grid)

file_name = os.path.join(out_path, 'syn_tab_sjppds_dcr_EL.csv')
dcr_EL.to_csv(file_name, index = False)

## plot DCR distributions across n_levels grid

dcr = dcr_EL

# Draw on an explicit, freshly created figure so the plot never lands on a
# stale "current" pyplot figure, and title it so it can be told apart from
# the other datasets' otherwise identical-looking plots.
fig, ax = plt.subplots()
ax.boxplot(dcr.values, showfliers=False)
ax.set_xticks(np.arange(1, dcr.shape[1] + 1))
ax.set_xticklabels(dcr.columns, rotation=90)

# Add a horizontal line at the median of the first column
ax.axhline(y=np.median(dcr.iloc[:, 0]), color='red', linestyle='--')

ax.set_title("Electricity: DCR across the tuning parameter grid")
ax.set_xlabel("Tuning parameter grid")
ax.set_ylabel("DCR")
fig.tight_layout()  # Adjust layout to prevent label overlap
plt.show()

In [None]:
# eye movements dataset: compute, save and plot the syn_tab_sjppds DCR
# results across the n_levels grid.

import openml

# eye movements
dataset = openml.datasets.get_dataset(44130) 

X, y, _, attribute_names = dataset.get_data(target=dataset.default_target_attribute)

X['target'] = y

# Column indices handed to the helpers: 0-19 as numeric, 20 as categorical
# (20 is presumably the appended target -- confirm against X.columns).
num_idx = list(range(0, 20))
cat_idx = [20]

X = enforce_dtypes(dat = X, 
                   num_variables = num_idx, 
                   cat_variables = cat_idx)

# Split the data
aux = train_test_data_split(X, my_seed=123)

X_train = aux["X_train"]
X_test = aux["X_test"]

# Seed NumPy's global RNG immediately before the DCR computation.
np.random.seed(123)

dcr_EM = syn_tab_sjppds_dcr(dat_train = X_train, 
                           dat_test = X_test, 
                           num_variables = num_idx, 
                           cat_variables = cat_idx, 
                           n_prop=0.5, 
                           tuning_par_grid=n_level_grid)

file_name = os.path.join(out_path, 'syn_tab_sjppds_dcr_EM.csv')
dcr_EM.to_csv(file_name, index = False)

## plot DCR distributions across n_levels grid

dcr = dcr_EM

# Draw on an explicit, freshly created figure so the plot never lands on a
# stale "current" pyplot figure, and title it so it can be told apart from
# the other datasets' otherwise identical-looking plots.
fig, ax = plt.subplots()
ax.boxplot(dcr.values, showfliers=False)
ax.set_xticks(np.arange(1, dcr.shape[1] + 1))
ax.set_xticklabels(dcr.columns, rotation=90)

# Add a horizontal line at the median of the first column
ax.axhline(y=np.median(dcr.iloc[:, 0]), color='red', linestyle='--')

ax.set_title("Eye movements: DCR across the tuning parameter grid")
ax.set_xlabel("Tuning parameter grid")
ax.set_ylabel("DCR")
fig.tight_layout()  # Adjust layout to prevent label overlap
plt.show()

In [None]:
# house 16h dataset: compute, save and plot the syn_tab_sjppds DCR results
# across the n_levels grid.

import openml

# house_16H
dataset = openml.datasets.get_dataset(44123) 

X, y, _, attribute_names = dataset.get_data(target=dataset.default_target_attribute)

X['target'] = y

# Column indices handed to the helpers: 0-15 as numeric, 16 as categorical
# (16 is presumably the appended target -- confirm against X.columns).
num_idx = list(range(0, 16))
cat_idx = [16]

X = enforce_dtypes(dat = X, 
                   num_variables = num_idx, 
                   cat_variables = cat_idx)

# Split the data
aux = train_test_data_split(X, my_seed=123)

X_train = aux["X_train"]
X_test = aux["X_test"]

# Seed NumPy's global RNG immediately before the DCR computation.
np.random.seed(123)

dcr_HO = syn_tab_sjppds_dcr(dat_train = X_train, 
                           dat_test = X_test, 
                           num_variables = num_idx, 
                           cat_variables = cat_idx, 
                           n_prop=0.5, 
                           tuning_par_grid=n_level_grid)

file_name = os.path.join(out_path, 'syn_tab_sjppds_dcr_HO.csv')
dcr_HO.to_csv(file_name, index = False)

## plot DCR distributions across n_levels grid

dcr = dcr_HO

# Draw on an explicit, freshly created figure so the plot never lands on a
# stale "current" pyplot figure, and title it so it can be told apart from
# the other datasets' otherwise identical-looking plots.
fig, ax = plt.subplots()
ax.boxplot(dcr.values, showfliers=False)
ax.set_xticks(np.arange(1, dcr.shape[1] + 1))
ax.set_xticklabels(dcr.columns, rotation=90)

# Add a horizontal line at the median of the first column
ax.axhline(y=np.median(dcr.iloc[:, 0]), color='red', linestyle='--')

ax.set_title("House 16H: DCR across the tuning parameter grid")
ax.set_xlabel("Tuning parameter grid")
ax.set_ylabel("DCR")
fig.tight_layout()  # Adjust layout to prevent label overlap
plt.show()

In [None]:
# magic telescope dataset: compute, save and plot the syn_tab_sjppds DCR
# results across the n_levels grid.

import openml

# magic telescope
dataset = openml.datasets.get_dataset(44125) 

X, y, _, attribute_names = dataset.get_data(target=dataset.default_target_attribute)

X['target'] = y

# Column indices handed to the helpers: 0-9 as numeric, 10 as categorical
# (10 is presumably the appended target -- confirm against X.columns).
num_idx = list(range(0, 10))
cat_idx = [10]

X = enforce_dtypes(dat = X, 
                   num_variables = num_idx, 
                   cat_variables = cat_idx)

# Split the data
aux = train_test_data_split(X, my_seed=123)

X_train = aux["X_train"]
X_test = aux["X_test"]

# Seed NumPy's global RNG immediately before the DCR computation.
np.random.seed(123)

dcr_MT = syn_tab_sjppds_dcr(dat_train = X_train, 
                           dat_test = X_test, 
                           num_variables = num_idx, 
                           cat_variables = cat_idx, 
                           n_prop=0.5, 
                           tuning_par_grid=n_level_grid)

file_name = os.path.join(out_path, 'syn_tab_sjppds_dcr_MT.csv')
dcr_MT.to_csv(file_name, index = False)

## plot DCR distributions across n_levels grid

dcr = dcr_MT

# Draw on an explicit, freshly created figure so the plot never lands on a
# stale "current" pyplot figure, and title it so it can be told apart from
# the other datasets' otherwise identical-looking plots.
fig, ax = plt.subplots()
ax.boxplot(dcr.values, showfliers=False)
ax.set_xticks(np.arange(1, dcr.shape[1] + 1))
ax.set_xticklabels(dcr.columns, rotation=90)

# Add a horizontal line at the median of the first column
ax.axhline(y=np.median(dcr.iloc[:, 0]), color='red', linestyle='--')

ax.set_title("Magic telescope: DCR across the tuning parameter grid")
ax.set_xlabel("Tuning parameter grid")
ax.set_ylabel("DCR")
fig.tight_layout()  # Adjust layout to prevent label overlap
plt.show()

In [None]:
# pol dataset: compute, save and plot the syn_tab_sjppds DCR results across
# the n_levels grid.

import openml

# pol
dataset = openml.datasets.get_dataset(44122) 

X, y, _, attribute_names = dataset.get_data(target=dataset.default_target_attribute)

X['target'] = y

# Column indices handed to the helpers: 0-25 as numeric, 26 as categorical
# (26 is presumably the appended target -- confirm against X.columns).
num_idx = list(range(0, 26))
cat_idx = [26]

X = enforce_dtypes(dat = X, 
                   num_variables = num_idx, 
                   cat_variables = cat_idx)

# Split the data
aux = train_test_data_split(X, my_seed=123)

X_train = aux["X_train"]
X_test = aux["X_test"]

# Seed NumPy's global RNG immediately before the DCR computation.
np.random.seed(123)

dcr_PO = syn_tab_sjppds_dcr(dat_train = X_train, 
                           dat_test = X_test, 
                           num_variables = num_idx, 
                           cat_variables = cat_idx, 
                           n_prop=0.5, 
                           tuning_par_grid=n_level_grid)

file_name = os.path.join(out_path, 'syn_tab_sjppds_dcr_PO.csv')
dcr_PO.to_csv(file_name, index = False)

## plot DCR distributions across n_levels grid

dcr = dcr_PO

# Draw on an explicit, freshly created figure so the plot never lands on a
# stale "current" pyplot figure, and title it so it can be told apart from
# the other datasets' otherwise identical-looking plots.
fig, ax = plt.subplots()
ax.boxplot(dcr.values, showfliers=False)
ax.set_xticks(np.arange(1, dcr.shape[1] + 1))
ax.set_xticklabels(dcr.columns, rotation=90)

# Add a horizontal line at the median of the first column
ax.axhline(y=np.median(dcr.iloc[:, 0]), color='red', linestyle='--')

ax.set_title("Pol: DCR across the tuning parameter grid")
ax.set_xlabel("Tuning parameter grid")
ax.set_ylabel("DCR")
fig.tight_layout()  # Adjust layout to prevent label overlap
plt.show()