In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from imblearn.combine import SMOTEENN
from boruta import BorutaPy
import xgboost as xgb
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow_privacy.privacy.optimizers.dp_optimizer_keras import DPKerasSGDOptimizer
from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy
import os

In [3]:
# Output folders for result tables and figures (relative to the working directory).
os.makedirs('results', exist_ok=True)
os.makedirs('figures', exist_ok=True)

# Load data
# The dataset location can be overridden through the BANK_DATA_PATH environment
# variable so the notebook is not tied to a single machine; when the variable is
# unset the original location is used.
data_path = os.environ.get(
    'BANK_DATA_PATH',
    'C:/Users/danie/OneDrive/Documentos/1 UNIANDES/10 semestre/Tesis/differential-privacy-banking-sector/data/processed/bank-processed.csv'
)
data = pd.read_csv(data_path)
X = data.drop(columns=['y'])
y = data['y']

# Train/test split (stratified on the target so both splits keep its class ratio).
# The split happens BEFORE scaling so that the scaler never sees test rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Normalize numeric columns
# The scaler is fitted on the training split only and then applied everywhere;
# fitting it on the full dataset would leak test-set min/max into training.
numeric_cols = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
scaler = MinMaxScaler().fit(X_train[numeric_cols])

# Work on explicit copies so the assignments below do not touch views of `X`.
X_train = X_train.copy()
X_test = X_test.copy()
X_train[numeric_cols] = scaler.transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

# Keep `X` itself normalized (with the train-fitted scaler) for any later cell
# that still reads it.
X[numeric_cols] = scaler.transform(X[numeric_cols])


In [4]:
# Rebalance the training split with SMOTEENN (fixed seed for repeatability).
X_resample, y_resample = SMOTEENN(random_state=42).fit_resample(X_train, y_train)
# Re-wrap both outputs as pandas objects so columns can be selected by name later.
X_resample, y_resample = pd.DataFrame(X_resample, columns=X.columns), pd.Series(y_resample)


In [None]:
# Boruta feature selection
# An XGBoost classifier is the importance estimator that Boruta wraps.
rf = xgb.XGBClassifier(eval_metric='logloss')
feat_selector = BorutaPy(estimator=rf, n_estimators='auto', verbose=0, random_state=42)
# Boruta is fitted on plain NumPy arrays built from the resampled training data.
feat_selector.fit(X_resample.to_numpy(), y_resample.to_numpy().ravel())
# Names of the columns whose entry in the fitted selector's support mask is True.
X_filtered = list(X.columns[feat_selector.support_])




In [None]:
# Restrict features to the selected columns and hand everything over as NumPy arrays.
# Training data comes from the resampled split; test data is left un-resampled.
X_train_filtered = X_resample.loc[:, X_filtered].to_numpy()
X_test_filtered = X_test.loc[:, X_filtered].to_numpy()
y_train_filtered = y_resample.to_numpy()
y_test_filtered = y_test.to_numpy()


In [None]:
# Training parameters
# These are notebook-level globals; train_model() and run_experiment() read
# input_size, hidden_units, hidden_layers, dropout_rate and epochs directly.
input_size = len(X_filtered)      # number of selected feature columns = network input width
hidden_units = 64                 # units in every hidden Dense layer
hidden_layers = 2                 # total hidden Dense layers (first layer + hidden_layers - 1 more)
dropout_rate = 0.2                # dropout applied after each hidden layer except the first
epochs = 50                       # training epochs; also used for the privacy-budget computation
# The three values below are defaults only: the grid search in this notebook
# passes its own learning rate, clipping norm and noise multiplier per run.
learning_rate = 0.001
l2_norm_clip = 1.0
default_noise_multiplier = 1.1


In [None]:
# DP helpers
def compute_privacy_budget(n, batch_size, noise_multiplier, epochs, delta=1e-5):
    """Return the epsilon spent by DP-SGD for the given training configuration.

    Args:
        n: Number of training examples.
        batch_size: Mini-batch size used during training.
        noise_multiplier: Gaussian noise multiplier of the DP optimizer.
            ``None`` or a non-positive value means no noise is added.
        epochs: Number of training epochs accounted for.
        delta: Target delta of the (epsilon, delta)-DP guarantee.

    Returns:
        The epsilon reported by ``compute_dp_sgd_privacy``, or ``float('inf')``
        when no noise is used or when the accountant fails for this
        configuration (a warning is emitted in the latter case).
    """
    import warnings  # local import: keeps this cell independent of the import cell

    # Without noise there is no privacy guarantee; answer directly instead of
    # relying on the accountant failing on a zero multiplier.
    if noise_multiplier is None or noise_multiplier <= 0:
        return float('inf')

    try:
        return compute_dp_sgd_privacy.compute_dp_sgd_privacy(
            n=n, batch_size=batch_size, noise_multiplier=noise_multiplier, epochs=epochs, delta=delta)[0]
    except Exception as exc:
        # Best-effort fallback is kept (the grid search must not abort), but the
        # failure is surfaced, and KeyboardInterrupt/SystemExit are no longer swallowed.
        warnings.warn(
            f"Privacy accounting failed (n={n}, batch_size={batch_size}, "
            f"noise_multiplier={noise_multiplier}, epochs={epochs}): {exc!r}; reporting epsilon=inf"
        )
        return float('inf')

def create_model(input_size, hidden_units, hidden_layers, dropout_rate, learning_rate, num_microbatches, l2_norm_clip, noise_multiplier, use_dp):
    """Build and compile the binary-classification network.

    Architecture: one ReLU Dense layer on the input, then ``hidden_layers - 1``
    further ReLU Dense layers each followed by Dropout, then a single sigmoid unit.

    Args:
        input_size: Number of input features.
        hidden_units: Units in every hidden Dense layer.
        hidden_layers: Total number of hidden Dense layers.
        dropout_rate: Dropout rate applied after each hidden layer except the first.
        learning_rate: Optimizer learning rate.
        num_microbatches: Microbatch count for the DP optimizer (ignored when ``use_dp`` is False).
        l2_norm_clip: Gradient clipping norm for the DP optimizer (ignored when ``use_dp`` is False).
        noise_multiplier: Noise multiplier for the DP optimizer (ignored when ``use_dp`` is False).
        use_dp: If True compile with ``DPKerasSGDOptimizer``, otherwise with ``Adam``.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    model = Sequential()
    model.add(Dense(hidden_units, activation='relu', input_shape=(input_size,)))
    for _ in range(hidden_layers - 1):
        model.add(Dense(hidden_units, activation='relu'))
        model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid'))

    if use_dp:
        optimizer = DPKerasSGDOptimizer(
            l2_norm_clip=l2_norm_clip,
            noise_multiplier=noise_multiplier,
            num_microbatches=num_microbatches,
            learning_rate=learning_rate
        )
        # The DP optimizer clips gradients per microbatch, so it needs the
        # per-example loss vector rather than the batch mean that the string
        # shortcut 'binary_crossentropy' produces.
        loss = tf.keras.losses.BinaryCrossentropy(reduction=tf.losses.Reduction.NONE)
    else:
        optimizer = Adam(learning_rate=learning_rate)
        loss = 'binary_crossentropy'

    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
    return model

def train_model(X_train, y_train, X_test, y_test, batch_size, epochs, learning_rate, use_dp, noise_multiplier, l2_norm_clip, use_early_stopping):
    """Train one model and return its predictions on ``X_test``.

    The network shape is taken from the notebook-level globals ``input_size``,
    ``hidden_units``, ``hidden_layers`` and ``dropout_rate``.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Evaluation features and labels; also passed to Keras as
            validation data.
        batch_size: Mini-batch size; also used as the number of DP microbatches.
        epochs: Maximum number of training epochs.
        learning_rate: Optimizer learning rate.
        use_dp: Train with the DP optimizer when True, with Adam otherwise.
        noise_multiplier: DP noise multiplier (used only when ``use_dp``).
        l2_norm_clip: DP clipping norm (used only when ``use_dp``).
        use_early_stopping: Stop on stalled ``val_loss`` (patience 5) and
            restore the best weights.

    Returns:
        ``(y_pred_prob, y_pred)``: flattened predicted probabilities and the
        0/1 labels obtained by thresholding them at 0.5.

    Raises:
        ValueError: In DP mode, if there are fewer training rows than ``batch_size``.
    """
    if use_dp:
        # The DP optimizer splits every batch into `batch_size` microbatches, so
        # a shorter final batch cannot be reshaped. Keep only whole batches
        # (drops at most batch_size - 1 rows).
        usable = (len(X_train) // batch_size) * batch_size
        if usable == 0:
            raise ValueError(
                f"DP training needs at least batch_size={batch_size} samples, got {len(X_train)}"
            )
        X_train, y_train = X_train[:usable], y_train[:usable]

    # The grid search builds hundreds of models in one kernel; release the
    # previous model's Keras state before building the next one.
    tf.keras.backend.clear_session()

    num_microbatches = batch_size
    model = create_model(
        input_size, hidden_units, hidden_layers, dropout_rate,
        learning_rate, num_microbatches, l2_norm_clip,
        noise_multiplier, use_dp
    )

    callbacks = []
    if use_early_stopping:
        callbacks.append(EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True))

    # NOTE(review): the evaluation set doubles as validation data, so early
    # stopping selects weights using the same rows the metrics are reported on.
    # Consider a separate validation split.
    model.fit(
        X_train, y_train,
        batch_size=batch_size,
        epochs=epochs,
        verbose=1,
        validation_data=(X_test, y_test),
        callbacks=callbacks
    )

    y_pred_prob = model.predict(X_test, batch_size=batch_size).flatten()
    y_pred = (y_pred_prob > 0.5).astype(int)
    return y_pred_prob, y_pred

def evaluate_model(y_true, y_pred, y_pred_prob):
    """Summarize binary-classification quality as a dict of named metrics.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted hard labels.
        y_pred_prob: Predicted scores, used for ROC AUC only.

    Returns:
        Dict with 'ROC AUC', 'Accuracy', 'Precision', 'Recall', 'F1 Score',
        'Type I Error' (confusion-matrix row 0: off-diagonal count / row total)
        and 'Type II Error' (row 1: off-diagonal count / row total). An error
        rate is 0 when its row is empty.
    """
    cm = confusion_matrix(y_true, y_pred)

    metrics = {
        'ROC AUC': roc_auc_score(y_true, y_pred_prob),
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'F1 Score': f1_score(y_true, y_pred),
    }

    # Row 0 of the confusion matrix: share that landed in column 1.
    row0_total = cm[0].sum()
    if row0_total > 0:
        metrics['Type I Error'] = cm[0][1] / row0_total
    else:
        metrics['Type I Error'] = 0

    # Row 1 of the confusion matrix: share that landed in column 0.
    row1_total = cm[1].sum()
    if row1_total > 0:
        metrics['Type II Error'] = cm[1][0] / row1_total
    else:
        metrics['Type II Error'] = 0

    return metrics

def run_experiment(X_train_data, y_train_data, batch_size, learning_rate, noise_multiplier, l2_norm_clip, use_dp=True, use_early_stopping=True):
    """Train and evaluate one configuration and return its metrics row.

    Training uses the given data; evaluation always uses the notebook-level
    ``X_test_filtered`` / ``y_test_filtered``. The epoch count comes from the
    notebook-level ``epochs``.

    Args:
        X_train_data, y_train_data: Training features and labels (possibly a subsample).
        batch_size: Mini-batch size.
        learning_rate: Optimizer learning rate.
        noise_multiplier: DP noise multiplier.
        l2_norm_clip: DP clipping norm.
        use_dp: Train with differential privacy when True.
        use_early_stopping: Enable early stopping during training.

    Returns:
        Dict of evaluation metrics plus 'Epsilon', 'Batch Size',
        'Noise Multiplier', 'Learning Rate', 'Clipping Norm' and 'Sample Ratio'
        (size of the training data relative to ``X_train_filtered``).
    """
    # A run trained without the DP optimizer has no privacy guarantee, whatever
    # noise_multiplier was passed, so epsilon is only computed for DP runs.
    # It is computed for the full `epochs` budget even if early stopping ends sooner.
    if use_dp:
        eps = compute_privacy_budget(len(X_train_data), batch_size, noise_multiplier, epochs)
    else:
        eps = float('inf')

    print(f"Running: batch_size={batch_size}, lr={learning_rate}, noise={noise_multiplier}, clip={l2_norm_clip}, eps={eps:.2f}")
    y_prob, y_pred = train_model(
        X_train_data, y_train_data, X_test_filtered, y_test_filtered,
        batch_size=batch_size, epochs=epochs, learning_rate=learning_rate,
        use_dp=use_dp, noise_multiplier=noise_multiplier, l2_norm_clip=l2_norm_clip,
        use_early_stopping=use_early_stopping
    )
    results = evaluate_model(y_test_filtered, y_pred, y_prob)
    results['Epsilon'] = eps
    results['Batch Size'] = batch_size
    results['Noise Multiplier'] = noise_multiplier
    results['Learning Rate'] = learning_rate
    results['Clipping Norm'] = l2_norm_clip
    results['Sample Ratio'] = len(X_train_data) / len(X_train_filtered)
    return results

def grid_search_experiments(use_early_stopping=True, seed=42):
    """Run the full non-DP and DP hyper-parameter grid on several subsample sizes.

    For each sample ratio a random subset of ``X_train_filtered`` /
    ``y_train_filtered`` is drawn without replacement. On that subset the
    function runs every (batch size, learning rate) combination without DP,
    then every (batch size, noise multiplier, learning rate, clipping norm)
    combination with DP.

    Args:
        use_early_stopping: Forwarded to every run.
        seed: Seed for the subsampling generator. With the same seed, separate
            calls (e.g. with and without early stopping) train on identical
            subsets, which keeps their results comparable and reproducible.

    Returns:
        ``pandas.DataFrame`` with one row per run, as produced by ``run_experiment``.
    """
    batch_sizes = [16, 32, 64, 128]
    noise_multipliers = [0.8, 1.1, 1.5, 2.0]
    learning_rates = [0.001, 0.003, 0.005]
    clip_norms = [0.5, 1.0, 2.0]
    sample_ratios = [1.0, 0.5, 0.1, 0.05]

    # Dedicated generator instead of the unseeded global np.random state.
    rng = np.random.default_rng(seed)
    n_total = len(X_train_filtered)

    results = []

    for ratio in sample_ratios:
        n_samples = int(n_total * ratio)
        idx = rng.choice(n_total, size=n_samples, replace=False)
        X_sample = X_train_filtered[idx]
        y_sample = y_train_filtered[idx]

        # Run non-DP experiments for all batch sizes and learning rates
        for batch_size in batch_sizes:
            for lr in learning_rates:
                res = run_experiment(
                    X_sample, y_sample,
                    batch_size=batch_size,
                    learning_rate=lr,
                    noise_multiplier=0.0,
                    l2_norm_clip=0.0,
                    use_dp=False,
                    use_early_stopping=use_early_stopping
                )
                results.append(res)

        # Run DP experiments for all combinations
        for batch_size in batch_sizes:
            for noise in noise_multipliers:
                for lr in learning_rates:
                    for clip in clip_norms:
                        res = run_experiment(
                            X_sample, y_sample,
                            batch_size=batch_size,
                            learning_rate=lr,
                            noise_multiplier=noise,
                            l2_norm_clip=clip,
                            use_dp=True,
                            use_early_stopping=use_early_stopping
                        )
                        results.append(res)

    return pd.DataFrame(results)


In [None]:
# Full grid with early stopping enabled; metrics are rounded to 3 decimals
# and written without the index column.
df_results = grid_search_experiments(use_early_stopping=True)
df_results.round(decimals=3).to_csv(
    'results/cdp_results_early_stopping.csv',
    index=False,
)

In [None]:
# Full grid with early stopping disabled (every run trains for all epochs);
# metrics are rounded to 3 decimals and written without the index column.
df_results_no_early_stopping = grid_search_experiments(use_early_stopping=False)
df_results_no_early_stopping.round(decimals=3).to_csv(
    'results/cdp_results_no_early_stopping.csv',
    index=False,
)