In [1]:
import os
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from imblearn.combine import SMOTEENN
from boruta import BorutaPy
import xgboost as xgb
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow_privacy.privacy.optimizers.dp_optimizer_keras import DPKerasSGDOptimizer
from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy


In [2]:

# Reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)

# Create output folders
os.makedirs('results', exist_ok=True)
os.makedirs('figures', exist_ok=True)

# Load and preprocess data
data = pd.read_csv('C:/Users/danie/OneDrive/Documentos/1 UNIANDES/10 semestre/Tesis/differential-privacy-banking-sector/data/processed/bank-processed.csv')
X = data.drop(columns=['y'])
y = data['y']

numeric_cols = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
X[numeric_cols] = MinMaxScaler().fit_transform(X[numeric_cols])

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED, stratify=y)


In [3]:

# Apply SMOTEENN
X_resample, y_resample = SMOTEENN(random_state=SEED).fit_resample(X_train, y_train)
X_resample = pd.DataFrame(X_resample, columns=X.columns)
y_resample = pd.Series(y_resample)


In [4]:

# Boruta feature selection
rf = xgb.XGBClassifier(eval_metric='logloss', random_state=SEED)
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=0, random_state=SEED)
feat_selector.fit(X_resample.values, y_resample.values.ravel())




BorutaPy(estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                 colsample_bylevel=1, colsample_bynode=1,
                                 colsample_bytree=1, enable_categorical=False,
                                 eval_metric='logloss', gamma=0, gpu_id=-1,
                                 importance_type=None,
                                 interaction_constraints='',
                                 learning_rate=0.300000012, max_delta_step=0,
                                 max_depth=6, min_child_weight=1, missing=nan,
                                 monotone_constraints='()', n_estimators=129,
                                 n_jobs=12, num_parallel_tree=1,
                                 predictor='auto', random_state=1436615557,
                                 reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                                 subsample=1, tree_method='exact',
                                 validate_parameters=1, verbosity

In [5]:

if not feat_selector.support_.any():
    raise ValueError("Boruta did not select any features. Please check your preprocessing or model setup.")

X_filtered = X.columns[feat_selector.support_].tolist()


In [6]:
X_train_filtered = X_resample[X_filtered].values
X_test_filtered = X_test[X_filtered].values
y_train_filtered = y_resample.values
y_test_filtered = y_test.values

# Training parameters
input_size = len(X_filtered)
hidden_units = 64
hidden_layers = 2
dropout_rate = 0.2
epochs = 50


In [7]:
# DP parameters
def compute_privacy_budget(n, batch_size, noise_multiplier, epochs, delta=1e-5):
    try:
        return compute_dp_sgd_privacy.compute_dp_sgd_privacy(
            n=n, batch_size=batch_size, noise_multiplier=noise_multiplier, epochs=epochs, delta=delta)[0]
    except:
        return float('inf')

def create_model(input_size, hidden_units, hidden_layers, dropout_rate, learning_rate, num_microbatches, l2_norm_clip, noise_multiplier, use_dp):
    model = Sequential()
    model.add(Dense(hidden_units, activation='relu', input_shape=(input_size,)))
    for _ in range(hidden_layers - 1):
        model.add(Dense(hidden_units, activation='relu'))
        model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid'))

    if use_dp:
        optimizer = DPKerasSGDOptimizer(
            l2_norm_clip=l2_norm_clip,
            noise_multiplier=noise_multiplier,
            num_microbatches=num_microbatches,
            learning_rate=learning_rate
        )
    else:
        optimizer = Adam(learning_rate=learning_rate)

    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

def train_model(X_train, y_train, X_test, y_test, batch_size, epochs, learning_rate, use_dp, noise_multiplier, l2_norm_clip, use_early_stopping):
    num_microbatches = batch_size
    model = create_model(
        input_size, hidden_units, hidden_layers, dropout_rate,
        learning_rate, num_microbatches, l2_norm_clip,
        noise_multiplier, use_dp
    )

    callbacks = []
    if use_early_stopping:
        callbacks.append(EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True))

    model.fit(
        X_train, y_train,
        batch_size=batch_size,
        epochs=epochs,
        verbose=0,
        validation_split=0.2,
        callbacks=callbacks
    )
    y_pred_prob = model.predict(X_test, batch_size=batch_size).flatten()
    y_pred = (y_pred_prob > 0.5).astype(int)
    return y_pred_prob, y_pred

def evaluate_model(y_true, y_pred, y_pred_prob):
    cm = confusion_matrix(y_true, y_pred)
    return {
        'ROC AUC': roc_auc_score(y_true, y_pred_prob),
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'F1 Score': f1_score(y_true, y_pred),
        'Type I Error': cm[0][1] / cm[0].sum() if cm[0].sum() > 0 else 0,
        'Type II Error': cm[1][0] / cm[1].sum() if cm[1].sum() > 0 else 0
    }

def run_experiment(X_train_data, y_train_data, batch_size, learning_rate, noise_multiplier, l2_norm_clip, use_dp=True, use_early_stopping=True):
    eps = compute_privacy_budget(len(X_train_data), batch_size, noise_multiplier, epochs)
    y_prob, y_pred = train_model(
        X_train_data, y_train_data, X_test_filtered, y_test_filtered,
        batch_size=batch_size, epochs=epochs, learning_rate=learning_rate,
        use_dp=use_dp, noise_multiplier=noise_multiplier, l2_norm_clip=l2_norm_clip,
        use_early_stopping=use_early_stopping
    )
    results = evaluate_model(y_test_filtered, y_pred, y_prob)
    results['Epsilon'] = eps
    results['Batch Size'] = batch_size
    results['Noise Multiplier'] = noise_multiplier
    results['Learning Rate'] = learning_rate
    results['Clipping Norm'] = l2_norm_clip
    results['Sample Ratio'] = len(X_train_data) / len(X_train_filtered)
    results['DP Enabled'] = use_dp
    return results

def grid_search_experiments(use_early_stopping=True):
    batch_sizes = [16, 32, 64, 128]
    noise_multipliers = [0.8, 1.1, 1.5, 2.0]
    learning_rates = [0.001, 0.003, 0.005]
    clip_norms = [0.5, 1.0, 2.0]
    sample_ratios = [1.0, 0.5, 0.1, 0.05]

    results = []

    for ratio in sample_ratios:
        n_samples = int(len(X_train_filtered) * ratio)
        idx = np.random.choice(len(X_train_filtered), n_samples, replace=False)
        X_sample = X_train_filtered[idx]
        y_sample = y_train_filtered[idx]

        for batch_size in batch_sizes:
            for lr in learning_rates:
                res = run_experiment(
                    X_sample, y_sample,
                    batch_size=batch_size,
                    learning_rate=lr,
                    noise_multiplier=0.0,
                    l2_norm_clip=0.0,
                    use_dp=False,
                    use_early_stopping=use_early_stopping
                )
                results.append(res)

        for batch_size in batch_sizes:
            for noise in noise_multipliers:
                for lr in learning_rates:
                    for clip in clip_norms:
                        res = run_experiment(
                            X_sample, y_sample,
                            batch_size=batch_size,
                            learning_rate=lr,
                            noise_multiplier=noise,
                            l2_norm_clip=clip,
                            use_dp=True,
                            use_early_stopping=use_early_stopping
                        )
                        results.append(res)

    return pd.DataFrame(results)




In [8]:
# Run experiments multiple times
all_runs = []
n_repeats = 10
for run_id in range(n_repeats):
    print(f"\n--- Running grid search iteration {run_id + 1} ---")
    df_run = grid_search_experiments(use_early_stopping=True)
    df_run['Run'] = run_id + 1
    all_runs.append(df_run)



--- Running grid search iteration 1 ---
DP-SGD with sampling rate = 0.0299% and noise_multiplier = 0.8 iterated over 167200 steps satisfies differential privacy with eps = 1.7 and delta = 1e-05.
The optimal RDP order is 9.0.
DP-SGD with sampling rate = 0.0299% and noise_multiplier = 0.8 iterated over 167200 steps satisfies differential privacy with eps = 1.7 and delta = 1e-05.
The optimal RDP order is 9.0.
DP-SGD with sampling rate = 0.0299% and noise_multiplier = 0.8 iterated over 167200 steps satisfies differential privacy with eps = 1.7 and delta = 1e-05.
The optimal RDP order is 9.0.
DP-SGD with sampling rate = 0.0299% and noise_multiplier = 0.8 iterated over 167200 steps satisfies differential privacy with eps = 1.7 and delta = 1e-05.
The optimal RDP order is 9.0.
DP-SGD with sampling rate = 0.0299% and noise_multiplier = 0.8 iterated over 167200 steps satisfies differential privacy with eps = 1.7 and delta = 1e-05.
The optimal RDP order is 9.0.
DP-SGD with sampling rate = 0.0299

In [9]:
df_all = pd.concat(all_runs, ignore_index=True)
df_all.round(3).to_csv('results/cdp_all_runs.csv', index=False)

# Aggregate statistics
metrics = ['ROC AUC', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'Type I Error', 'Type II Error', 'Epsilon']
group_cols = ['Batch Size', 'Noise Multiplier', 'Learning Rate', 'Clipping Norm', 'Sample Ratio', 'DP Enabled']

agg_results = df_all.groupby(group_cols)[metrics].agg(['mean', 'min', 'max']).reset_index()
agg_results.columns = [' '.join(col).strip() for col in agg_results.columns.values]
agg_results.round(3).to_csv('results/cdp_aggregated_results.csv', index=False)

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Set style for plots
plt.style.use('seaborn')
os.makedirs('figures/cdp1', exist_ok=True)
os.makedirs('figures/cdp2', exist_ok=True)
os.makedirs('figures/cdp3', exist_ok=True)
os.makedirs('figures/cdp4', exist_ok=True)

# Load aggregated results
df_results = pd.read_csv('results/cdp_aggregated_results.csv')

# Define parameters
sample_ratios = [1.0, 0.5, 0.1, 0.05]
batch_sizes = [16, 32, 64, 128]
learning_rates = [0.001, 0.003, 0.005]
noise_multipliers = [0.8, 1.1, 1.5, 2.0]
clip_norms = [0.5, 1.0, 2.0]


In [29]:
# 1. Non-DP ROC AUC Heatmaps with fixed color threshold
non_dp_data = df_results[df_results['DP Enabled'] == False]

fig1, axes1 = plt.subplots(2, 2, figsize=(11, 9))
axes1 = axes1.ravel()
fig1.suptitle('ROC AUC for Non-DP Experiments by Sample Ratio', fontsize=18, fontweight='bold')

for i, ratio in enumerate(sample_ratios):
    data_subset = non_dp_data[non_dp_data['Sample Ratio'] == ratio]
    pivot = data_subset.pivot(index='Batch Size', columns='Learning Rate', values='ROC AUC mean')
    pivot = pivot.reindex(index=batch_sizes, columns=learning_rates, fill_value=np.nan)

    sns.heatmap(pivot, annot=True, fmt='.3f', cmap='YlGnBu', ax=axes1[i],
                cbar_kws={'label': 'ROC AUC'}, annot_kws={"size": 13}, vmin=0.8, vmax=0.905)
    axes1[i].set_title(f'Sample Ratio: {ratio}', fontsize=16)
    axes1[i].set_xlabel('Learning Rate', fontsize=14)
    axes1[i].set_ylabel('Batch Size', fontsize=14)
    axes1[i].tick_params(axis='x', labelsize=12)
    axes1[i].tick_params(axis='y', labelsize=12)
    cbar = axes1[i].collections[0].colorbar
    cbar.ax.tick_params(labelsize=12)
    cbar.set_label('ROC AUC', fontsize=14)

plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.savefig('figures/cdp1/non_dp_roc_auc_heatmaps.png')
plt.close()


In [None]:
# 2. DP Epsilon Heatmaps
dp_data = df_results[df_results['DP Enabled'] == True]

fig2, axes2 = plt.subplots(2, 2, figsize=(11, 9))
axes2 = axes2.ravel()
fig2.suptitle('Epsilon for DP Experiments by Sample Ratio', fontsize=18, fontweight='bold')

for i, ratio in enumerate(sample_ratios):
    data_subset = dp_data[dp_data['Sample Ratio'] == ratio]
    pivot = data_subset.pivot_table(index='Batch Size', columns='Noise Multiplier', values='Epsilon mean', aggfunc='mean')
    pivot = pivot.reindex(index=batch_sizes, columns=noise_multipliers, fill_value=np.nan)

    sns.heatmap(pivot, annot=True, fmt='.3f', cmap='YlGnBu', ax=axes2[i],
                cbar_kws={'label': 'Epsilon'}, annot_kws={"size": 13})
    axes2[i].set_title(f'Sample Ratio: {ratio}', fontsize=16)
    axes2[i].set_xlabel('Noise Multiplier', fontsize=14)
    axes2[i].set_ylabel('Batch Size', fontsize=14)
    axes2[i].tick_params(axis='x', labelsize=12)
    axes2[i].tick_params(axis='y', labelsize=12)
    cbar = axes2[i].collections[0].colorbar
    cbar.ax.tick_params(labelsize=12)
    cbar.set_label('Epsilon', fontsize=14)

plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.savefig('figures/cdp2/dp_epsilon_heatmaps.png')
plt.close()

In [88]:
# 3. DP ROC AUC for Specific Configurations
configurations3 = [
    {'batch_size': 16, 'noise_multiplier': 0.8, 'fig_name': 'batch16_noise0.8', 'vmin': 0.59, 'vmax': 0.9},
    {'batch_size': 16, 'noise_multiplier': 1.1, 'fig_name': 'batch16_noise1.1', 'vmin': 0.59, 'vmax': 0.9},
    {'batch_size': 16, 'noise_multiplier': 1.5, 'fig_name': 'batch16_noise1.5', 'vmin': 0.59, 'vmax': 0.9},
    {'batch_size': 16, 'noise_multiplier': 2.0, 'fig_name': 'batch16_noise2.0', 'vmin': 0.59, 'vmax': 0.9}
]

for config in configurations3:
    bs, nm, fig_name, vmin, vmax = config.values()
    config_data = dp_data[(dp_data['Batch Size'] == bs) & (dp_data['Noise Multiplier'] == nm)]

    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    axes = axes.ravel()
    fig.suptitle(f'ROC AUC for DP (Batch Size={bs}, Noise Multiplier={nm})', fontsize=18, fontweight='bold')

    for i, ratio in enumerate(sample_ratios):
        subset = config_data[config_data['Sample Ratio'] == ratio]
        epsilon = subset['Epsilon mean'].mean() if not subset.empty else np.nan
        epsilon_str = f'{epsilon:.3f}' if not np.isnan(epsilon) else 'N/A'

        pivot = subset.pivot(index='Clipping Norm', columns='Learning Rate', values='ROC AUC mean')
        pivot = pivot.reindex(index=clip_norms, columns=learning_rates, fill_value=np.nan)

        sns.heatmap(pivot, annot=True, fmt='.3f', cmap='YlGnBu', ax=axes[i],
                    cbar_kws={'label': 'ROC AUC'}, vmin=vmin, vmax=vmax, annot_kws={"size": 13})
        axes[i].set_title(f'Sample Ratio: {ratio}, Epsilon: {epsilon_str}', fontsize=16)
        axes[i].set_xlabel('Learning Rate', fontsize=14)
        axes[i].set_ylabel('Clipping Norm', fontsize=14)
        axes[i].tick_params(axis='x', labelsize=12)
        axes[i].tick_params(axis='y', labelsize=12)
        cbar = axes[i].collections[0].colorbar
        cbar.ax.tick_params(labelsize=12)
        cbar.set_label('ROC AUC', fontsize=14)

    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.savefig(f'figures/cdp3/dp_roc_auc_{fig_name}.png')
    plt.close()

In [89]:
# 4. Grouped by Sample Ratio changing noise multiplier
configurations4 = [{'batch_size': 16, 'noise_multiplier': nm} for nm in [0.8, 1.1, 1.5, 2.0]]
sample_ratio_color_scales = {r: {'vmin': 0.58, 'vmax': 0.9} for r in sample_ratios}

for ratio in sample_ratios:
    vmin = sample_ratio_color_scales[ratio]['vmin']
    vmax = sample_ratio_color_scales[ratio]['vmax']
    ratio_data = dp_data[dp_data['Sample Ratio'] == ratio]

    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    axes = axes.ravel()
    fig.suptitle(f'ROC AUC for DP (Sample Ratio={ratio})', fontsize=18, fontweight='bold')

    for i, config in enumerate(configurations4):
        bs, nm = config['batch_size'], config['noise_multiplier']
        subset = ratio_data[(ratio_data['Batch Size'] == bs) & (ratio_data['Noise Multiplier'] == nm)]
        epsilon = subset['Epsilon mean'].mean() if not subset.empty else np.nan
        epsilon_str = f'{epsilon:.3f}' if not np.isnan(epsilon) else 'N/A'

        pivot = subset.pivot(index='Clipping Norm', columns='Learning Rate', values='ROC AUC mean')
        pivot = pivot.reindex(index=clip_norms, columns=learning_rates, fill_value=np.nan)

        sns.heatmap(pivot, annot=True, fmt='.3f', cmap='YlGnBu', ax=axes[i],
                    cbar_kws={'label': 'ROC AUC'}, vmin=vmin, vmax=vmax, annot_kws={"size": 13})
        axes[i].set_title(f'Batch Size: {bs}, Noise: {nm}\nEpsilon: {epsilon_str}', fontsize=16)
        axes[i].set_xlabel('Learning Rate', fontsize=14)
        axes[i].set_ylabel('Clipping Norm', fontsize=14)
        axes[i].tick_params(axis='x', labelsize=12)
        axes[i].tick_params(axis='y', labelsize=12)
        cbar = axes[i].collections[0].colorbar
        cbar.ax.tick_params(labelsize=12)
        cbar.set_label('ROC AUC', fontsize=14)

    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.savefig(f'figures/cdp4/dp_roc_auc_sample_ratio_{ratio}.png')
    plt.close()


In [90]:
# 5. Grouped by Sample Ratio changing batch size
configurations4 = [{'batch_size': bs, 'noise_multiplier': 1.5} for bs in [16, 32, 64, 128]]
sample_ratio_color_scales = {r: {'vmin': 0.58, 'vmax': 0.9} for r in sample_ratios}

for ratio in sample_ratios:
    vmin = sample_ratio_color_scales[ratio]['vmin']
    vmax = sample_ratio_color_scales[ratio]['vmax']
    ratio_data = dp_data[dp_data['Sample Ratio'] == ratio]

    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    axes = axes.ravel()
    fig.suptitle(f'ROC AUC for DP (Sample Ratio={ratio})', fontsize=18, fontweight='bold')

    for i, config in enumerate(configurations4):
        bs, nm = config['batch_size'], config['noise_multiplier']
        subset = ratio_data[(ratio_data['Batch Size'] == bs) & (ratio_data['Noise Multiplier'] == nm)]
        epsilon = subset['Epsilon mean'].mean() if not subset.empty else np.nan
        epsilon_str = f'{epsilon:.3f}' if not np.isnan(epsilon) else 'N/A'

        pivot = subset.pivot(index='Clipping Norm', columns='Learning Rate', values='ROC AUC mean')
        pivot = pivot.reindex(index=clip_norms, columns=learning_rates, fill_value=np.nan)

        sns.heatmap(pivot, annot=True, fmt='.3f', cmap='YlGnBu', ax=axes[i],
                    cbar_kws={'label': 'ROC AUC'}, vmin=vmin, vmax=vmax, annot_kws={"size": 13})
        axes[i].set_title(f'Batch Size: {bs}, Noise: {nm}\nEpsilon: {epsilon_str}', fontsize=16)
        axes[i].set_xlabel('Learning Rate', fontsize=14)
        axes[i].set_ylabel('Clipping Norm', fontsize=14)
        axes[i].tick_params(axis='x', labelsize=12)
        axes[i].tick_params(axis='y', labelsize=12)
        cbar = axes[i].collections[0].colorbar
        cbar.ax.tick_params(labelsize=12)
        cbar.set_label('ROC AUC', fontsize=14)

    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.savefig(f'figures/cdp5/dp_roc_auc_sample_ratio_{ratio}.png')
    plt.close()


In [20]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.anova import anova_lm

In [65]:
non_dp_data = df_results[df_results['DP Enabled'] == False]

anova_non_dp_data = non_dp_data[['ROC AUC mean', 'Batch Size', 'Learning Rate', 'Sample Ratio']]
anova_non_dp_data = anova_non_dp_data.astype({
    'Batch Size': 'category',
    'Learning Rate': 'category',
    'Sample Ratio': 'category',
})

# Define the ANOVA model with all main effects and 2-way + 3-way interaction
formula = 'Q("ROC AUC mean") ~ C(Q("Batch Size")) * C(Q("Sample Ratio")) + C(Q("Batch Size")) * C(Q("Learning Rate")) + C(Q("Sample Ratio")) * C(Q("Learning Rate"))'


# Fit the model
model = smf.ols(formula=formula, data=anova_non_dp_data).fit()

# Perform ANOVA
anova_results = anova_lm(model, typ=2)

# Display the results
anova_results


Unnamed: 0,sum_sq,df,F,PR(>F)
"C(Q(""Batch Size""))",3.15625e-05,3.0,23.795812,1.735987e-06
"C(Q(""Sample Ratio""))",0.003862729,3.0,2912.21466,2.311706e-24
"C(Q(""Learning Rate""))",2.916667e-07,2.0,0.329843,0.7232919
"C(Q(""Batch Size"")):C(Q(""Sample Ratio""))",3.435417e-05,9.0,8.633508,6.24905e-05
"C(Q(""Batch Size"")):C(Q(""Learning Rate""))",1.1875e-05,6.0,4.47644,0.006073218
"C(Q(""Sample Ratio"")):C(Q(""Learning Rate""))",1.208333e-06,6.0,0.455497,0.8317672
Residual,7.958333e-06,18.0,,


In [None]:
anova_data = dp_data[['ROC AUC mean', 'Batch Size', 'Noise Multiplier', 'Clipping Norm', 'Learning Rate', 'Sample Ratio']]
anova_data = anova_data.astype({
    'Batch Size': 'category',
    'Noise Multiplier': 'category',
    'Clipping Norm': 'category',
    'Learning Rate': 'category',
    'Sample Ratio': 'category',
})

# Build the formula (using C() to treat as categorical)
formula = 'Q("ROC AUC mean") ~ C(Q("Batch Size")) + C(Q("Noise Multiplier")) + C(Q("Clipping Norm")) + C(Q("Learning Rate")) + C(Q("Sample Ratio"))'

# Fit the model
model = smf.ols(formula=formula, data=anova_data).fit()

# Perform ANOVA
anova_results = anova_lm(model, typ=2)  # Type II sums of squares
anova_results

Unnamed: 0,sum_sq,df,F,PR(>F)
"C(Q(""Batch Size""))",0.577754,3.0,732.489739,1.0766059999999999e-193
"C(Q(""Noise Multiplier""))",7e-06,3.0,0.008353,0.9989515
"C(Q(""Clipping Norm""))",3.9e-05,2.0,0.074095,0.9285925
"C(Q(""Learning Rate""))",0.436163,2.0,829.46589,1.993005e-168
"C(Q(""Sample Ratio""))",2.052662,3.0,2602.413146,0.0
Residual,0.14776,562.0,,


In [None]:
import statsmodels.formula.api as smf
from statsmodels.stats.anova import anova_lm

# Define your model with all pairwise (2-way) interactions
formula = '''
Q("ROC AUC mean") ~ 
                   C(Q("Sample Ratio")) + 
                   C(Q("Batch Size")) + 
                   C(Q("Learning Rate")) +
                   C(Q("Noise Multiplier")) + 
                   C(Q("Clipping Norm")) +  
                   C(Q("Batch Size")):C(Q("Noise Multiplier")) + 
                   C(Q("Batch Size")):C(Q("Clipping Norm")) + 
                   C(Q("Batch Size")):C(Q("Learning Rate")) + 
                   C(Q("Batch Size")):C(Q("Sample Ratio")) + 
                   C(Q("Noise Multiplier")):C(Q("Clipping Norm")) + 
                   C(Q("Noise Multiplier")):C(Q("Learning Rate")) + 
                   C(Q("Noise Multiplier")):C(Q("Sample Ratio")) + 
                   C(Q("Clipping Norm")):C(Q("Learning Rate")) + 
                   C(Q("Clipping Norm")):C(Q("Sample Ratio")) + 
                   C(Q("Learning Rate")):C(Q("Sample Ratio"))
'''

# Fit the model
model = smf.ols(formula=formula, data=anova_data).fit()

# Run ANOVA
anova_results = anova_lm(model, typ=2)

# Display results
anova_results


Unnamed: 0,sum_sq,df,F,PR(>F)
"C(Q(""Batch Size""))",0.577754,3.0,1134.983853,2.229381e-221
"C(Q(""Noise Multiplier""))",7e-06,3.0,0.012943,0.9979857
"C(Q(""Clipping Norm""))",3.9e-05,2.0,0.114809,0.8915598
"C(Q(""Learning Rate""))",0.436163,2.0,1285.247207,1.0136910000000002e-196
"C(Q(""Sample Ratio""))",2.052662,3.0,4032.407203,0.0
"C(Q(""Batch Size"")):C(Q(""Noise Multiplier""))",4.6e-05,9.0,0.030014,0.9999978
"C(Q(""Batch Size"")):C(Q(""Clipping Norm""))",8.5e-05,6.0,0.083705,0.9977895
"C(Q(""Batch Size"")):C(Q(""Learning Rate""))",0.028241,6.0,27.739411,1.4036380000000001e-28
"C(Q(""Batch Size"")):C(Q(""Sample Ratio""))",0.028006,9.0,18.339005,2.025869e-26
"C(Q(""Noise Multiplier"")):C(Q(""Clipping Norm""))",2.9e-05,6.0,0.028004,0.9999061


In [76]:
import statsmodels.formula.api as smf
from statsmodels.stats.anova import anova_lm

# Model with main effects + 2-way + 3-way interaction of Batch Size, Noise Multiplier, and Sample Ratio
formula = '''
Q("ROC AUC mean") ~ C(Q("Noise Multiplier")) * C(Q("Sample Ratio")) * C(Q("Learning Rate")) * C(Q("Clipping Norm"))
'''

# Fit the model
model = smf.ols(formula=formula, data=anova_data).fit()

# Perform ANOVA
anova_results = anova_lm(model, typ=2)

# Display results
anova_results


Unnamed: 0,sum_sq,df,F,PR(>F)
"C(Q(""Noise Multiplier""))",7e-06,3.0,0.001323,0.9999334
"C(Q(""Sample Ratio""))",2.052662,3.0,412.315464,2.358695e-126
"C(Q(""Learning Rate""))",0.436163,2.0,131.417109,2.61909e-45
"C(Q(""Clipping Norm""))",3.9e-05,2.0,0.011739,0.9883296
"C(Q(""Noise Multiplier"")):C(Q(""Sample Ratio""))",6.6e-05,9.0,0.004451,1.0
"C(Q(""Noise Multiplier"")):C(Q(""Learning Rate""))",9.7e-05,6.0,0.009747,0.9999959
"C(Q(""Sample Ratio"")):C(Q(""Learning Rate""))",0.007097,6.0,0.712813,0.6394483
"C(Q(""Noise Multiplier"")):C(Q(""Clipping Norm""))",2.9e-05,6.0,0.002863,0.9999999
"C(Q(""Sample Ratio"")):C(Q(""Clipping Norm""))",4.5e-05,6.0,0.004504,0.9999996
"C(Q(""Learning Rate"")):C(Q(""Clipping Norm""))",5.6e-05,4.0,0.008403,0.9998597
