# Adversarial attack

Question: what does it take to turn a little dog into a pineapple?

In [None]:
# Download the demo image and the ImageNet label map into the working
# directory. A plain (non-recursive) wget keeps the remote file name in the
# current directory, which is where the code below opens "dog.png" and
# "imagenet_class_index.json". Recursive mode (-r) would instead store them
# under a host/path directory tree. -nc skips files that already exist, so the
# cell can be re-run safely.
!wget -nc "https://ds4440.baulab.info/data/dog.png"
!wget -nc "https://ds4440.baulab.info/data/imagenet_class_index.json"

In [None]:
# Imports
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
import requests
from torchvision.utils import save_image
from matplotlib import pyplot as plt
import json

# Read the ImageNet class table and build a list so that idx2label[i] is the
# name stored at position [1] of entry str(i).
with open("imagenet_class_index.json") as f:
    class_idx = json.load(f)
idx2label = [class_idx[str(k)][1] for k in range(len(class_idx))]

# Choose the compute device once: GPU when one is available, CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Pretrained ResNet18, used purely as a fixed classifier.
# NOTE(review): newer torchvision releases deprecate `pretrained=` in favour of
# `weights=` — confirm against the installed version before changing it.
model = models.resnet18(pretrained=True)

# We will NOT train this model: inference mode, and every weight frozen so
# that gradients are only ever needed for the input image.
model.eval()
model.requires_grad_(False)
model.to(device)

class UnNormalize(transforms.Normalize):
    """Inverse of ``transforms.Normalize(mean, std)``.

    ``Normalize`` maps ``x`` to ``(x - mean) / std``. Running the same
    operation with mean ``-mean / std`` and std ``1 / std`` gives
    ``x * std + mean``, which undoes the original normalization.

    Args:
        mean: Per-channel means that were used for normalization.
        std: Per-channel standard deviations that were used for normalization.
        *args, **kwargs: Forwarded unchanged to ``transforms.Normalize``.
    """

    def __init__(self, mean, std, *args, **kwargs):
        # Mean of the inverse transform: -mean / std, channel by channel.
        shifted_mean = []
        for m, s in zip(mean, std):
            shifted_mean.append(-m / s)

        # Std of the inverse transform: reciprocal of each original std.
        reciprocal_std = [1 / s for s in std]

        super().__init__(shifted_mean, reciprocal_std, *args, **kwargs)

# Per-channel mean / std shared by the forward and inverse normalization, so
# the two transforms below cannot drift apart.
imagenet_mean = [0.485, 0.456, 0.406]
imagenet_std = [0.229, 0.224, 0.225]

# Preprocessing transform: resize, center-crop to 224x224, convert to a float
# tensor with pixel values in [0, 1], then normalize each channel.
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=imagenet_mean, std=imagenet_std),
])

# Normalized per-channel values of pure black and pure white, shape (3, 1, 1)
# so they broadcast over an image. ToTensor yields pixels in [0, 1], so black
# is 0.0 (not -1.0) and white is 1.0 in pixel space; these are the bounds used
# to keep a perturbed image displayable.
black = preprocess.transforms[-1](torch.zeros(3, 1, 1))
white = preprocess.transforms[-1](torch.ones(3, 1, 1))

# Inverse of `preprocess`'s last two steps: undo the normalization and turn the
# tensor back into a PIL image for display.
unprocess = transforms.Compose([
    UnNormalize(mean=imagenet_mean, std=imagenet_std),
    transforms.ToPILImage(),
])

# Load and preprocess image
img = Image.open("dog.png").convert('RGB')
input_tensor = preprocess(img)

# PGD Attack
def pgd_attack(image, model, target_class, epsilon=0.01, alpha=0.01, num_steps=10):
    """Iteratively perturb ``image`` so that ``model`` predicts ``target_class``.

    Each step takes the sign of the gradient of the cross-entropy loss (towards
    ``target_class``) with respect to the image and moves every pixel by
    ``alpha`` in the direction that lowers that loss. Only the image is
    changed; the model's weights are never updated.

    Args:
        image: Batched input tensor for ``model``. Presumably of shape
            (N, 3, H, W) in the model's normalized input space, since it is
            clamped against the module-level ``black`` / ``white`` bounds —
            confirm against the caller. The tensor is not modified.
        model: Classifier returning one row of logits per batch element.
        target_class: Index of the class the attack should produce.
        epsilon: Elementwise bound applied to each individual step. NOTE: the
            accumulated change over all steps is not re-projected, so a pixel
            can move by up to ``num_steps * min(alpha, epsilon)`` in total.
        alpha: Step size per iteration.
        num_steps: Number of gradient steps.

    Returns:
        A detached tensor with the same shape as ``image`` holding the
        perturbed image, clamped to the ``[black, white]`` range.
    """
    # Set model to evaluation mode
    model.eval()

    # Work on a private copy so the caller's tensor is left untouched
    adv = image.clone().detach()

    # Everything the loss and the clamp touch must live on the image's device,
    # otherwise the attack fails as soon as the model runs on a GPU.
    target = torch.full(
        (adv.shape[0],), target_class, dtype=torch.long, device=adv.device
    )
    lower = black.to(device=adv.device, dtype=adv.dtype)
    upper = white.to(device=adv.device, dtype=adv.dtype)
    criterion = nn.CrossEntropyLoss()

    for _ in range(num_steps):
        # Track gradients with respect to the IMAGE (not the network!)
        adv.requires_grad_(True)

        # Forward pass and loss towards the target class
        loss = criterion(model(adv), target)

        # Fresh gradient for this step only. Using autograd.grad instead of
        # loss.backward() avoids summing gradients across iterations in
        # adv.grad (model.zero_grad() would not reset it).
        (grad,) = torch.autograd.grad(loss, adv)

        # Step that decreases the target-class loss, with each element limited
        # to the epsilon range to keep the per-step change small
        eta = torch.clamp(-alpha * grad.sign(), -epsilon, epsilon)

        # Apply the step, then prevent image pixels brighter than white or
        # darker than black
        adv = torch.clamp(adv.detach() + eta, min=lower, max=upper)

    return adv.detach()

# Set target class and attack hyperparameters
target_class = 953  # example target class (pineapple)
epsilon = 0.01
alpha = 0.01
num_steps = 10

# Add a batch dimension and move the input to the model's device
input_batch = input_tensor[None].to(device)

# Generate adversarial example (index [0] drops the batch dimension again)
adversarial_ex = pgd_attack(input_batch, model, target_class, epsilon, alpha, num_steps)[0]

# Save the adversarial image. The tensor is in normalized space, while
# save_image expects pixel values in [0, 1], so undo the normalization first
# (the first stage of `unprocess`); otherwise the PNG is clipped and has the
# wrong colors.
save_image(unprocess.transforms[0](adversarial_ex), 'adversarial_image.png')

# Show the results. The adversarial tensor lives on `device`, the original on
# the CPU, so bring it to the CPU before mixing the two.
adversarial_cpu = adversarial_ex.cpu()
panels = [
    (input_tensor, 'Original'),
    (adversarial_cpu, 'Adversarial'),
    # unprocess adds the channel means back, so a zero difference is drawn as
    # the mean color rather than black
    (input_tensor - adversarial_cpu, 'Difference'),
]
fig, ax = plt.subplots(1, 3, figsize=(12, 4))
for axis, (im, title) in zip(ax, panels):
    axis.imshow(unprocess(im))
    axis.axis('off')
    axis.set_title(title)
plt.show()

# Compare the model's top prediction before and after the attack
with torch.no_grad():
    original_idx = model(input_batch).argmax().item()
    adversarial_idx = model(adversarial_ex[None]).argmax().item()
print("Original prediction:", idx2label[original_idx])
print("Adversarial prediction:", idx2label[adversarial_idx])
