# 🛡️ Ethics, Safety & Robustness

**Topics:** Adversarial Attacks, Robustness, Fairness

In [None]:
# Setup
!pip install torch torchvision numpy matplotlib -q
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
print('✅ Setup complete!')

In [None]:
# FGSM Attack
def fgsm_attack(image, epsilon, gradient):
    """Apply a single Fast Gradient Sign Method (FGSM) step.

    Every element of ``image`` is shifted by ``epsilon`` in the direction
    given by the sign of the matching element of ``gradient``; the result
    is then clipped to the [0, 1] interval.

    Args:
        image: Tensor to perturb.
        epsilon: Step size multiplied onto the gradient sign.
        gradient: Tensor whose element-wise sign sets the step direction.

    Returns:
        The perturbed tensor, clamped to [0, 1].
    """
    # Only the direction of the gradient is used; its magnitude is discarded.
    step = epsilon * gradient.sign()
    return torch.clamp(image + step, 0, 1)

# Demo with synthetic data: a random image paired with a random stand-in
# for the loss gradient (no model is involved in this cell).
image = torch.rand(1, 1, 28, 28, requires_grad=True)
gradient = torch.randn_like(image)  # Simulated gradient

# Report the mean absolute element change produced by each epsilon.
for eps in (0.0, 0.1, 0.3):
    perturbed = fgsm_attack(image, eps, gradient)
    diff = torch.mean(torch.abs(perturbed - image)).item()
    print(f'ε={eps}: Mean perturbation = {diff:.4f}')

In [None]:
# Visualize Attack: the original image beside FGSM-style perturbed copies.
image = torch.rand(28, 28)
gradient = torch.randn(28, 28)

fig, axes = plt.subplots(1, 4, figsize=(12, 3))

# Leftmost panel shows the unperturbed image.
axes[0].imshow(image.numpy(), cmap='gray')
axes[0].set_title('Original')

# Each remaining panel shows the image after one sign-gradient step of size ε,
# clipped to [0, 1].
for i, eps in enumerate([0.1, 0.2, 0.3]):
    panel = axes[i + 1]
    perturbed = (image + eps * torch.sign(gradient)).clamp(0, 1)
    panel.imshow(perturbed.numpy(), cmap='gray')
    panel.set_title(f'ε = {eps}')

# Hide the axis decorations on every panel.
for ax in axes:
    ax.axis('off')

plt.suptitle('FGSM Attack with Increasing ε')
plt.tight_layout()
plt.show()

In [None]:
# PGD Attack (Iterative FGSM)
def pgd_attack(image, epsilon, alpha, num_steps, gradient_fn):
    """Run an iterative sign-gradient attack with projection (PGD).

    Starting from a copy of ``image``, each iteration moves the current
    iterate by ``alpha`` along the sign of ``gradient_fn(iterate)``, limits
    the total offset from the starting image to ``[-epsilon, epsilon]`` per
    element, and clips the result to [0, 1]. ``image`` itself is not
    modified.

    Args:
        image: Tensor to attack.
        epsilon: Maximum absolute per-element offset from ``image``.
        alpha: Step size of each iteration.
        num_steps: Number of iterations to run.
        gradient_fn: Callable that takes the current iterate and returns the
            gradient tensor used for that step.

    Returns:
        The perturbed tensor after ``num_steps`` iterations. With zero
        iterations this is an unmodified copy of ``image``.
    """
    clean = image.clone()
    adversarial = image.clone()

    for _step in range(num_steps):
        # Step along the sign of the gradient at the current iterate.
        direction = gradient_fn(adversarial).sign()
        candidate = adversarial + alpha * direction
        # Project the offset into the epsilon box, then into valid range.
        offset = (candidate - clean).clamp(-epsilon, epsilon)
        adversarial = (clean + offset).clamp(0, 1)

    return adversarial

# Demo: attack a random image, using random noise as a stand-in gradient.
image = torch.rand(28, 28)


def gradient_fn(x):
    """Return random noise shaped like ``x`` in place of a real gradient."""
    return torch.randn_like(x)


pgd_image = pgd_attack(
    image,
    epsilon=0.3,
    alpha=0.01,
    num_steps=40,
    gradient_fn=gradient_fn,
)
# Largest absolute per-element change made by the attack.
print(f'PGD perturbation: max={(pgd_image - image).abs().max():.3f}')