In [1]:
import numpy as np
import time
import pandas as pd
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings("ignore")

sns.set_style("whitegrid")

def laplace_mech(v, sensitivity, epsilon):
    """Return `v` plus one Laplace sample centred at 0 with scale `sensitivity / epsilon`."""
    noise_scale = sensitivity / epsilon
    laplace_noise = np.random.laplace(loc=0, scale=noise_scale)
    return v + laplace_noise

def gaussian_mech(v, sensitivity, epsilon, delta):
    """Return `v` plus one zero-mean Gaussian sample.

    The standard deviation is sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon.
    """
    sigma = sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon
    gaussian_noise = np.random.normal(loc=0, scale=sigma)
    return v + gaussian_noise

def gaussian_mech_vec(vec, sensitivity, epsilon, delta):
    """Add independent zero-mean Gaussian noise to every element of `vec`.

    Parameters:
    vec (iterable): Values to perturb.
    sensitivity (float): Sensitivity used to scale the noise.
    epsilon (float): Privacy parameter; larger values mean less noise.
    delta (float): Privacy parameter entering through ln(1.25 / delta).

    Returns:
    list: One noisy value per input element, in the same order.
    """
    # The noise scale does not depend on the element, so compute it once
    # instead of re-evaluating the log and sqrt for every entry.
    sigma = sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon
    return [v + np.random.normal(loc=0, scale=sigma) for v in vec]

def gaussian_mech_RDP_vec(vec, sensitivity, alpha, epsilon):
    """Add independent zero-mean Gaussian noise to each element of `vec`.

    The standard deviation is sqrt(sensitivity**2 * alpha / (2 * epsilon)).
    Returns a list with one noisy value per input element.
    """
    noise_std = np.sqrt((sensitivity**2 * alpha) / (2 * epsilon))
    noisy_values = []
    for entry in vec:
        noisy_values.append(entry + np.random.normal(loc=0, scale=noise_std))
    return noisy_values


def loss(weights, xi, yi):
    """Logistic loss log(1 + exp(-yi * (xi . weights))) for one example.

    Parameters:
    weights (np.ndarray): Model weights.
    xi (np.ndarray): Feature vector of the example.
    yi: Label of the example, used as a multiplicative sign on the score.

    Returns:
    The loss value for this example.
    """
    exponent = - yi * (xi.dot(weights))
    # logaddexp(0, z) evaluates log(exp(0) + exp(z)) == log(1 + exp(z)) without
    # overflowing to inf when z is large (a badly misclassified example).
    return np.logaddexp(0, exponent)


def gradient(weights, xi, yi):
    """Return -(yi * xi) / (1 + exp(yi * (xi . weights))) for a single example."""
    margin = yi * (xi.dot(weights))
    denominator = 1+np.exp(margin)
    return - (yi*xi) / denominator

def predict(xi, weights, bias=0):
    """Return the sign of the linear score `xi @ weights + bias` (0 when the score is exactly 0)."""
    return np.sign(xi @ weights + bias)

def accuracy(weights):
    """Fraction of rows in the global test split whose predicted sign equals `y_test`."""
    is_correct = predict(X_test, weights) == y_test
    return np.sum(is_correct)/X_test.shape[0]


# Load the preprocessed feature matrix and label vector from disk.
X = np.load('adult_processed_x.npy')
y = np.load('adult_processed_y.npy')

# The first 80% of the rows (in file order) form the training split,
# the remaining rows the test split.
training_size = int(X.shape[0] * 0.8)

X_train, X_test = X[:training_size], X[training_size:]
y_train, y_test = y[:training_size], y[training_size:]

print('Train and test set sizes:', len(y_train), len(y_test))

Train and test set sizes: 36176 9044


In [2]:
def train_model():
    """Fit a LogisticRegression (max_iter=1000) on the global training split and return it."""
    classifier = LogisticRegression(max_iter=1000)

    # fit() returns the fitted estimator itself.
    return classifier.fit(X_train, y_train)

# Fit the non-private baseline model and report its coefficients.
model = train_model()
print('Model coefficients:', model.coef_[0])
# Accuracy = fraction of test rows whose predicted label equals y_test.
print('Model accuracy:', np.sum(model.predict(X_test) == y_test)/X_test.shape[0])

Model coefficients: [ 7.25273625e-01  5.28692744e-02  2.16164416e-01  3.15914290e-01
 -3.36810399e-01 -1.17398041e-01 -7.92758753e-01 -4.76246610e-01
 -4.75825890e-01 -3.48796888e-01 -4.75751160e-01 -1.04259054e-01
 -5.00962087e-01 -6.00407682e-01  9.53302018e-02  1.75102572e-01
  5.11977080e-01  9.32111483e-01 -6.71637360e-03  7.18722508e-01
 -7.89427608e-01  1.21778497e+00  1.90618957e-01 -6.43742600e-01
  1.72193131e+00  1.53039377e+00 -3.43725509e-01 -1.05523627e+00
 -6.42126608e-01 -5.04239687e-01  8.76083921e-02  1.26578768e-01
  1.43387358e-01  8.63652165e-01 -7.57995142e-01 -5.51821340e-01
 -1.80600184e-01 -7.92412364e-01 -1.12385567e+00  6.14533505e-01
  6.16375499e-01  3.62331762e-01  6.48779468e-01  6.69219649e-03
 -9.51169529e-02  3.05253385e-01 -4.86064983e-01 -7.56624074e-01
  1.07014947e-01  9.88792092e-01 -1.80816970e-01  3.33349291e-01
 -1.41148528e-01 -6.66401519e-02  1.18510773e-01 -3.02875788e-01
  3.66130202e-01  9.42949358e-01  6.69469573e-01 -2.38027432e-01
 -1.2

In [3]:
def L2_clip(v: np.ndarray, C: float) -> np.ndarray:
    """
    Clip a vector by its L2 norm.

    Parameters:
    v (np.ndarray): The vector to clip.
    C (float): The clipping parameter (maximum allowed L2 norm).

    Returns:
    np.ndarray: `v` unchanged if its L2 norm is at most C, otherwise `v`
    rescaled so that its L2 norm equals C.
    """
    norm = np.linalg.norm(v, ord=2)
    if norm == 0:
        # The zero vector needs no scaling; returning early avoids the
        # division by zero in C / norm below. `v * 1` yields a new array,
        # as the scaling path does.
        return v * 1
    factor = min(1, C / norm)
    return v * factor

def gradient_sum(weights: np.ndarray, X: np.ndarray, y: np.ndarray, C: float) -> np.ndarray:
    """
    Sum the per-example gradients at `weights` after clipping each one to L2 norm C.

    Parameters:
    weights (np.ndarray): The current weights of the model.
    X (np.ndarray): The training data features, one example per row.
    y (np.ndarray): The training data labels, aligned with the rows of X.
    C (float): The clipping parameter.
    """
    clipped_gradients = []
    for features, label in zip(X, y):
        per_example_gradient = gradient(weights, features, label)
        clipped_gradients.append(L2_clip(per_example_gradient, C))
    # Stack the per-example gradients and add them up along the example axis.
    return np.sum(np.array(clipped_gradients), axis=0)

def noisy_gradient_descent_RDP(iterations: int, alpha: float, epsilon_bar: float) -> np.ndarray:
    """
    Full-batch gradient descent on the global training split with Gaussian noise
    calibrated for Renyi Differential Privacy.

    Parameters:
    iterations (int): Number of iterations for gradient descent.
    alpha (float): The order of Renyi Divergence.
    epsilon_bar (float): Total privacy budget.

    Returns the weight vector after the final update.
    """
    # 90% of the budget is split evenly over the gradient releases ...
    per_step_epsilon = (0.9 * epsilon_bar) / iterations
    # ... and the remaining 10% pays for the noisy training-set size.
    count_epsilon = 0.1 * epsilon_bar

    clip_bound = 4
    weights = np.zeros(X_train.shape[1])

    # The noisy count is drawn once and reused as the divisor in every step.
    noisy_count = laplace_mech(X_train.shape[0], 1, count_epsilon)

    for _ in range(iterations):
        grad_sum = gradient_sum(weights, X_train, y_train, clip_bound)
        noisy_sum = np.array(gaussian_mech_RDP_vec(grad_sum, clip_bound, alpha, per_step_epsilon))
        # Step against the noisy average gradient (implicit step size of 1).
        weights -= noisy_sum / noisy_count

    return weights

# Train with 10 iterations at Renyi order alpha=20 and total budget epsilon_bar=0.1,
# then evaluate on the test split.
weights = noisy_gradient_descent_RDP(10, 20, 0.1)
print('Final accuracy:', accuracy(weights))

Final accuracy: 0.7784166298098186


- The total privacy cost of this solution is $(\alpha, \bar{\epsilon})$-RDP. This is achieved by splitting 90% of $\bar{\epsilon}$ evenly across the iterations and using the remaining 10% of $\bar{\epsilon}$ for the noisy count. Specifically, with $T$ iterations each noisy gradient release satisfies $(\alpha, 0.9\bar{\epsilon}/T)$-RDP, and under sequential composition at the same order $\alpha$ the per-iteration $\epsilon$ values add up.
- The noisy gradient descent method with Renyi Differential Privacy (RDP) tends to be less accurate than a standard Logistic Regression model because:

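1. Noise Addition:
   - To ensure privacy, the noisy gradient descent method adds noise to the gradients during the training process, so every update step deviates from the true gradient direction.
2. Clipping:
   - The gradients are clipped to ensure they do not exceed a certain norm, which shrinks large per-example gradients and therefore biases the update.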

In [4]:
def dpsgd(iterations, epsilon, delta, learning_rate, batch_size, C):
    """
    Perform Differentially Private Stochastic Gradient Descent (DPSGD).

    Parameters:
    iterations (int): Number of iterations for gradient descent.
    epsilon (float): Privacy parameter passed to the Gaussian mechanism in
        every iteration (a per-iteration value, not the total over the run).
    delta (float): Privacy parameter passed to the Gaussian mechanism in
        every iteration.
    learning_rate (float): Learning rate for the gradient descent.
    batch_size (int): Size of the mini-batch for each iteration.
    C (float): Clipping parameter for the gradient.

    Returns:
    np.ndarray: The weight vector after the final update.
    """

    # Initialize weights to zeros
    weights = np.zeros(X_train.shape[1])

    # Number of samples in the training data
    num_samples = X_train.shape[0]

    for _ in range(iterations):
        # Randomly sample a mini-batch (without replacement) from the training data
        batch_indices = np.random.choice(num_samples, size=batch_size, replace=False)
        X_batch = X_train[batch_indices]
        y_batch = y_train[batch_indices]

        # Compute the clipped gradient sum for the mini-batch
        clipped_gradient_sum = gradient_sum(weights, X_batch, y_batch, C)

        # Add Gaussian noise to the gradient sum for differential privacy
        noisy_gradient_sum = np.array(gaussian_mech_vec(clipped_gradient_sum, C, epsilon, delta))

        # Average over the batch. Every batch holds exactly `batch_size` rows,
        # and that number is a caller-chosen hyperparameter rather than
        # something derived from the data, so it is used directly. Perturbing
        # it with Laplace noise could make the divisor near zero or negative
        # for small batches or small epsilon, blowing up or flipping the sign
        # of every update in the run.
        noisy_avg_gradient = noisy_gradient_sum / batch_size

        # Update the weights using the noisy average gradient
        weights = weights - (learning_rate * noisy_avg_gradient)

    return weights

# Run DPSGD with 1000 iterations, epsilon=1, delta=1e-5, learning rate 0.1,
# batch size 16 and clipping parameter 3, then evaluate on the test split.
weights = dpsgd(1000, 1, 1e-5, 0.1, 16, 3)
print('Final accuracy:', accuracy(weights))

Final accuracy: 0.7516585581601062


In [None]:
# Hyperparameter grid for DPSGD; delta (1e-5) and the clipping parameter (3) stay fixed.
epsilons = [0.1, 0.5, 0.4, 1.0]
iterations_values = [100, 500, 1000, 1500, 2000, 3000]
learning_rates = [0.01, 0.1, 0.5]
batch_sizes = [8, 16, 32, 64]

# One record per trained configuration
results = []

# Exhaustively train and evaluate every combination in the grid
for epsilon in epsilons:
    for iterations in iterations_values:
        for learning_rate in learning_rates:
            for batch_size in batch_sizes:
                weights = dpsgd(iterations, epsilon, 1e-5, learning_rate, batch_size, 3)
                test_accuracy = accuracy(weights)

                # Keep the configuration together with its test accuracy
                results.append(dict(
                    epsilon=epsilon,
                    iterations=iterations,
                    learning_rate=learning_rate,
                    batch_size=batch_size,
                    test_accuracy=test_accuracy,
                ))

In [None]:
# Rank all grid-search runs by test accuracy (highest first) and keep the best five.
top_5_accuracies = sorted(results, key=lambda x: x['test_accuracy'], reverse=True)[:5]
print(top_5_accuracies)

In [None]:
# Convert data to DataFrame
df = pd.DataFrame(results)


def plot_accuracy_scatter(data, x, hue, title, xlabel, legend_title):
    """Draw one scatter plot of test accuracy against column `x`, colored by column `hue`.

    Parameters:
    data (pd.DataFrame): Results table; must contain `x`, `hue` and 'test_accuracy'.
    x (str): Column shown on the x axis.
    hue (str): Column used to color the points.
    title (str): Figure title.
    xlabel (str): Label of the x axis.
    legend_title (str): Title of the legend.
    """
    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=data, x=x, y='test_accuracy', hue=hue, palette='viridis')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Test Accuracy')
    plt.legend(title=legend_title)
    plt.show()


# Scatter plot of test accuracies against epsilon, colored by learning rate
plot_accuracy_scatter(df, 'epsilon', 'learning_rate',
                      'Test Accuracy vs. Epsilon', 'Epsilon', 'Learning Rate')

# Scatter plot of test accuracies against number of iterations, colored by epsilon
plot_accuracy_scatter(df, 'iterations', 'epsilon',
                      'Test Accuracy vs. Iterations', 'Iterations', 'Epsilon')

# Scatter plot of test accuracies against batch size, colored by learning rate
plot_accuracy_scatter(df, 'batch_size', 'learning_rate',
                      'Test Accuracy vs. Batch Size', 'Batch Size', 'Learning Rate')

# Box plot of test accuracies for different epsilons
plt.figure(figsize=(10, 6))
sns.boxplot(data=df, x='epsilon', y='test_accuracy')
plt.title('Test Accuracy Distribution for Different Epsilons')
plt.xlabel('Epsilon')
plt.ylabel('Test Accuracy')
plt.show()

#### 4
1. Test Accuracy vs. Epsilon:

- As epsilon increases (indicating lower privacy), test accuracy generally improves. This demonstrates the privacy-utility trade-off: higher privacy (lower epsilon) leads to lower accuracy, and lower privacy (higher epsilon) leads to higher accuracy.
- However, the best trade-off is epsilon = 0.5, which leads to almost the same test accuracy as epsilon = 1.
 
2. Test Accuracy vs. Iterations:

- Increasing the number of iterations tends to improve test accuracy, especially for higher values of epsilon (lower privacy). This trend suggests that more iterations allow the model to better converge, even when differential privacy constraints are applied.
- So 3000 iterations led to the best accuracy at the lower epsilon values.

3. Test Accuracy vs. Batch Size:

- Larger batch sizes generally result in better test accuracies.
Larger batch sizes provide more stable gradient estimates, which can improve the training process. They can also affect learning dynamics and generalization, but a batch size of 64 is still acceptable.
- So the best batch size is 64.

4. Learning Rate:
 - Lower learning rates (e.g., 0.01 or 0.1) generally yield more stable and better performance compared to higher learning rates.
 - According to my plots, 0.01 was the best.

Based on the observed results, a recommended parameter configuration could be:
- Epsilon: 0.5
- Iterations: 3000
- Learning Rate: 0.01
- Batch Size: 64

This configuration reached a test accuracy of 0.805.

#### 5
With $q = B/N$ (batch size $B$, training set size $N$) and $T$ iterations, the privacy cost is therefore $(q \cdot \epsilon \cdot \sqrt{T}, \delta)$.

In [None]:
# Sampling ratio q = B/N for batch size B = 64 over the training set.
q =64/X_train.shape[0]
# q * epsilon * sqrt(T), evaluated for epsilon = 1 and T = 1000 iterations.
cost = q * 1 * np.sqrt(1000)
cost

In [None]:
# Measure wall-clock time and test accuracy of DPSGD and of noisy gradient
# descent (RDP) over several batch sizes and iteration counts.
epsilon = 1
iterations = [10, 100, 500, 1000, 2000]
iterations_2 = [1, 2, 5, 10, 100]
delta = 1e-5
learning_rate = 0.1
alpha = 20
batch_size = [1, 4, 16, 32, 64, 128]
C = 3

res = []
for b in batch_size:
    for i in iterations:
        # perf_counter is monotonic and high-resolution, so the measured
        # interval cannot be distorted by system clock adjustments.
        # The timed span covers training plus the test-set evaluation.
        start = time.perf_counter()
        acc = accuracy(dpsgd(i, epsilon, delta, learning_rate, b, C))
        end = time.perf_counter()
        res.append((end-start, acc, b, i, "dspg"))
    # noisy_gradient_descent_RDP takes no batch size; it is re-run once per
    # value of b so that every batch size has an "ngd" row to plot against.
    for i in iterations_2:
        start = time.perf_counter()
        acc_ngd = accuracy(noisy_gradient_descent_RDP(i, alpha, np.log(1/delta)/(alpha - 1)))
        end = time.perf_counter()
        res.append((end-start, acc_ngd, b, i, "ngd"))

df = pd.DataFrame(res, columns=['time', 'accuracy', 'batch_size', 'iterations', 'type'])

In [None]:
# Create a FacetGrid with one panel per method (column "type"), colored by iteration count
g = sns.FacetGrid(df, col="type", hue="iterations", col_wrap=2, palette='colorblind', height=5)

# FacetGrid.map applies the plotting function to every facet, so a single call
# draws time vs. batch size for both methods; a second identical call would
# only re-draw the same lines on top of themselves.
g.map(plt.plot, 'batch_size', 'time', marker='o', ms=8, ls='-')

# Set labels and title
g.set_axis_labels("Batch Size", "Time (s)")
g.fig.suptitle('Time vs. Batch Size (Color: Iterations)', fontsize=16)
plt.subplots_adjust(top=0.85)

# Add legend
g.add_legend(title='Iterations', fontsize=12, title_fontsize=12)

plt.show()


In [None]:


# Create a FacetGrid with one panel per method (column "type"), colored by iteration count
g = sns.FacetGrid(df, col="type", hue="iterations", col_wrap=2, palette='colorblind', height=5)

# FacetGrid.map applies the plotting function to every facet, so a single call
# draws accuracy vs. batch size for both methods; a second identical call would
# only re-draw the same lines on top of themselves.
g.map(plt.plot, 'batch_size', 'accuracy', marker='o', ms=8, ls='-')

# Set labels and title
g.set_axis_labels("Batch Size", "Accuracy")
g.fig.suptitle('Accuracy vs. Batch Size (Color: Iterations)', fontsize=16)
plt.subplots_adjust(top=0.85)

# Add legend
g.add_legend(title='Iterations', fontsize=12, title_fontsize=12)

plt.show()


When it comes to execution time, the batch size mainly impacts the results of DPSGD because it only operates on a small batch of data rather than the entire training set. This difference in approach explains the time saved with DPSGD compared to noisy gradient descent, which processes the entire training set.

However, in terms of accuracy or usefulness, noisy gradient descent consistently delivers good results, whereas DPSGD tends to fluctuate more, especially with smaller batch sizes.