# Model definition

Below is an implementation of the [LeNet](https://pytorch.org/tutorials/beginner/blitz/neural_networks_tutorial.html) architecture for the MNIST dataset.

In [0]:
from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torchvision import datasets, transforms
import numpy as np
import matplotlib.pyplot as plt

In [0]:
# LeNet Model definition
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        
        return F.log_softmax(x, dim=1)

In [7]:
# Define what device we are using
print("CUDA Available: ",torch.cuda.is_available())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

CUDA Available:  True


Of note: This notebook uses only a single GPU.
PyTorch can run models on several GPU, try to search how to specify several GPUs and create several devices.
model = Net().to(device)
model

In [8]:
model = Net().to(device)
model

Net(
  (conv1): Conv2d(1, 10, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(10, 20, kernel_size=(5, 5), stride=(1, 1))
  (conv2_drop): Dropout2d(p=0.5, inplace=False)
  (fc1): Linear(in_features=320, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
)

## Master way

Or inspect code for training a model

In [9]:
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('./mnist', 
                   download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                   ])),
    batch_size=128, shuffle=True)

0it [00:00, ?it/s]

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./mnist/MNIST/raw/train-images-idx3-ubyte.gz


9920512it [00:01, 8089202.28it/s]                            


Extracting ./mnist/MNIST/raw/train-images-idx3-ubyte.gz to ./mnist/MNIST/raw


  0%|          | 0/28881 [00:00<?, ?it/s]

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./mnist/MNIST/raw/train-labels-idx1-ubyte.gz


32768it [00:00, 129902.16it/s]           
  0%|          | 0/1648877 [00:00<?, ?it/s]

Extracting ./mnist/MNIST/raw/train-labels-idx1-ubyte.gz to ./mnist/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./mnist/MNIST/raw/t10k-images-idx3-ubyte.gz


1654784it [00:00, 2140331.55it/s]                            
0it [00:00, ?it/s]

Extracting ./mnist/MNIST/raw/t10k-images-idx3-ubyte.gz to ./mnist/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./mnist/MNIST/raw/t10k-labels-idx1-ubyte.gz


8192it [00:00, 49840.13it/s]            


Extracting ./mnist/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./mnist/MNIST/raw
Processing...
Done!


In [0]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss()

# Use Stochastic Gradient Descent
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

In [11]:
epoch_num = 20
for epoch in range(epoch_num):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data

        # zero the parameter gradients
        optimizer.zero_grad()
        
        # Remember a line with model.to(device)?
        # It moves a model to a GPU and PyTorch expects that
        # input data also will be on the GPU where the model resides
        inputs, labels = inputs.to(device), labels.to(device)

        # forward + backward + optimize
        outputs = model(inputs)
        
        # Calculate the error between model predictins and actual labels
        loss = criterion(outputs, labels)
        
        # Initiate backward propagation
        loss.backward()
        
        # Update weights
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 200 == 199:    # print every 200 mini-batches
            print(f'[Epoch {epoch + 1}: batch {i + 1}] loss: {running_loss / 200}')
            running_loss = 0.0

print('Finished Training')

[Epoch 1: batch 200] loss: 2.291912612915039
[Epoch 1: batch 400] loss: 2.2420103108882903
[Epoch 2: batch 200] loss: 1.7662313979864122
[Epoch 2: batch 400] loss: 1.1717280262708665
[Epoch 3: batch 200] loss: 0.8511688721179962
[Epoch 3: batch 400] loss: 0.742559602111578
[Epoch 4: batch 200] loss: 0.6571808513998986
[Epoch 4: batch 400] loss: 0.6158310669660568
[Epoch 5: batch 200] loss: 0.5636150392889977
[Epoch 5: batch 400] loss: 0.5445651832222939
[Epoch 6: batch 200] loss: 0.5125834642350674
[Epoch 6: batch 400] loss: 0.4894846361875534
[Epoch 7: batch 200] loss: 0.4771484616398811
[Epoch 7: batch 400] loss: 0.45511133134365084
[Epoch 8: batch 200] loss: 0.4432361833006144
[Epoch 8: batch 400] loss: 0.42647586211562155
[Epoch 9: batch 200] loss: 0.4308468564599752
[Epoch 9: batch 400] loss: 0.4117084227502346
[Epoch 10: batch 200] loss: 0.391351705789566
[Epoch 10: batch 400] loss: 0.386165821775794
[Epoch 11: batch 200] loss: 0.3767972456663847
[Epoch 11: batch 400] loss: 0.369

Let's check how accurate is our network

In [0]:
data_test = datasets.MNIST('./mnist',
                           train=False,
                           download=True,
                           transform=transforms.Compose([
                               transforms.ToTensor(),
                           ]))

test_loader = torch.utils.data.DataLoader(data_test, batch_size=256)

In [13]:
# Prevent training
model.eval()

total_correct = 0
avg_loss = 0.0

for i, data in enumerate(test_loader, 0):
    inputs, labels = data
    inputs, labels = inputs.to(device), labels.to(device)
   
    output = model(inputs)
    avg_loss += criterion(output, labels).sum()
    
    # tensor.detach() creates a tensor that shares storage with tensor that does not require grad.
    # It detaches the output from the computational graph.
    # So no gradient will be backpropagated along this variable.
    pred = output.detach().max(1)[1]
    total_correct += pred.eq(labels.view_as(pred)).sum()

avg_loss /= len(data_test)
avg_loss = avg_loss.detach().cpu().item()
accuracy = float(total_correct) / len(data_test)
print(f'Test Avg. Loss: {avg_loss}, Accuracy: {accuracy * 100}%')


Test Avg. Loss: 0.0004083073581568897, Accuracy: 96.71%


In [0]:
# Save model state for re-use
my_awesome_model = 'my-lenet.pth'
torch.save(model.state_dict(), my_awesome_model)

**End of training code**

**FGSM Attack**
The fgsm_attack function takes three inputs: image is the original clean image (x), epsilon is the pixel-wise perturbation amount (ϵ), and data_grad is gradient of the loss w.r.t the input image.

The function then creates perturbed image as

$$\text{perturbed image} = image + epsilon * sign(gradient) = x + ϵ ∗ sign(∇xJ(θ,x,y))$$
The fgsm_attack function takes three inputs: image is the original clean image (x), epsilon is the pixel-wise perturbation amount (ϵ), and data_grad is gradient of the loss w.r.t the input image.

The function then creates perturbed image as

$$\text{perturbed image} = image + epsilon * sign(gradient) = x + ϵ ∗ sign(∇xJ(θ,x,y))$$

In [0]:
# # FGSM attack code
# def fgsm_attack(image, epsilon, data_grad):
#     # Collect the element-wise sign of the data gradient
#     sign_data_grad = data_grad.sign()
    
#     # Create the perturbed image by adjusting each pixel of the input image
#     perturbed_image = image + epsilon * sign_data_grad
    
#     # Adding clipping to maintain [0,1] range
#     perturbed_image = torch.clamp(perturbed_image, 0, 1)
    
#     # Return the perturbed image
#     return perturbed_image

In [0]:
 # MNIST Test dataset and dataloader declaration
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('./mnist',
                   train=False,
                   download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                   ])),
    batch_size=1, shuffle=True)

Test function performs a full test step on the MNIST test set and reports a final accuracy.

For each sample in the test set, the function computes the gradient of the loss w.r.t the input data (data_grad),  creates a perturbed image with fgsm_attack (perturbed_data), then checks to see if the perturbed example is adversarial. 

In [0]:
# def test( model, device, test_loader, epsilon ):

#     # Accuracy counter
#     correct = 0
#     adv_examples = []

#     # Loop over all examples in test set
#     for data, target in test_loader:

#         # Send the data and label to the device
#         data, target = data.to(device), target.to(device)

#         # Set requires_grad attribute of tensor. Important for Attack
#         data.requires_grad = True

#         # Forward pass the data through the model
#         output = model(data)
#         init_pred = output.max(1, keepdim=True)[1] # get the index of the max log-probability

#         # If the initial prediction is wrong, dont bother attacking, just move on
#         if init_pred.item() != target.item():
#             continue

#         # Calculate the loss - Negative Log Likehood
#         # Loosely explained at https://medium.com/deeplearningmadeeasy/negative-log-likelihood-6bd79b55d8b6
#         loss = F.nll_loss(output, target)

#         # Zero all existing gradients
#         model.zero_grad()

#         # Calculate gradients of model in backward pass
#         loss.backward()

#         # Collect datagrad
#         data_grad = data.grad.data

#         # Call FGSM Attack
#         perturbed_data = fgsm_attack(data, epsilon, data_grad)

#         # Re-classify the perturbed image
#         output = model(perturbed_data)

#         # Check for success
#         final_pred = output.max(1, keepdim=True)[1] # get the index of the max log-probability
#         if final_pred.item() == target.item():
#             correct += 1
#             # Special case for saving 0 epsilon examples
#             if (epsilon == 0) and (len(adv_examples) < 5):
#                 adv_ex = perturbed_data.squeeze().detach().cpu().numpy()
#                 adv_examples.append( (init_pred.item(), final_pred.item(), adv_ex) )
#         else:
#             # Save some adv examples for visualization later
#             if len(adv_examples) < 5:
#                 adv_ex = perturbed_data.squeeze().detach().cpu().numpy()
#                 adv_examples.append( (init_pred.item(), final_pred.item(), adv_ex) )

#     # Calculate final accuracy for this epsilon
#     final_acc = correct/float(len(test_loader))
#     print("Epsilon: {}\tTest Accuracy = {} / {} = {}".format(epsilon, correct, len(test_loader), final_acc))

#     # Return the accuracy and an adversarial example
#     return final_acc, adv_examples

# Let's attack!

In [0]:
# accuracies = []
# examples = []

# # Epsilon 0 means no attack at all
# epsilons = [0, .05, .1, .15, .2, .25, .3]

# # Run test for each epsilon
# for eps in epsilons:
#     acc, ex = test(model, device, test_loader, eps)
#     accuracies.append(acc)
#     examples.append(ex)

The first result is the accuracy versus epsilon plot.

As epsilon increases we expect the test accuracy to decrease and it actually does. 

In [0]:
# plt.figure(figsize=(5,5))
# plt.plot(epsilons, accuracies, "*-")
# plt.yticks(np.arange(0, 1.1, step=0.1))
# plt.xticks(np.arange(0, epsilons[-1] + 0.05, step=0.05))
# plt.title("Accuracy vs Epsilon")
# plt.xlabel("Epsilon")
# plt.ylabel("Accuracy")
# plt.show()

There is a tradeoff between accuracy degredation and perceptibility that an attacker must consider.  

Below are some examples of successful adversarial examples at each epsilon value.  
Each row of the plot shows a different epsilon value. The first row is the ϵ=0 examples which represent the original “clean” images with no perturbation. The title of each image shows the “original classification -> adversarial classification.”

In [0]:
#    # Plot several examples of adversarial samples at each epsilon
# cnt = 0
# plt.figure(figsize=(8,10))

# for i in range(len(epsilons)):
#     for j in range(len(examples[i])):
#         cnt += 1
#         plt.subplot(len(epsilons),len(examples[0]),cnt)
#         plt.xticks([], [])
#         plt.yticks([], [])
#         if j == 0:
#             plt.ylabel("Eps: {}".format(epsilons[i]), fontsize=14)
#         orig, adv, ex = examples[i][j]
#         plt.title("True {} -> Adv {}".format(orig, adv))
#         plt.imshow(ex, cmap="gray")
        
# plt.tight_layout()
# plt.show()

Take a closer look at how we calculated gradient. 

In [21]:
epsilon = epsilons[2]
for data, target in test_loader:  
    data, target = data.to(device), target.to(device)
    data.requires_grad = True
    
    output = model(data)
    loss = F.nll_loss(output, target)
   
    model.zero_grad()
    loss.backward()

    data_grad = data.grad.data
    perturbed_data = fgsm_attack(data, epsilon, data_grad)

NameError: ignored

The second parameter to the loss function is a true label of the current image. After that gradient is used for perturbation generation.
<br>
<br>
Remember that gradient is obtained for the whole image
pixels, namely gradient shape is equal to the image shape.  
In our case it's (28, 28, 1). So **each pixel gets own perturbation**.

Now recall how we apply gradient.

In [0]:
# sign_data_grad = data_grad.sign()
# perturbed_image = img + epsilon * sign_data_grad

We add the scaled gradient to the original image. The result of this action is misclassification, but we don't know what label gain higher confidence. This is a case of *untargeted misclassification*.

*Targeted misclassification* implies that you add perturbation and model predicts one of the possible labels but not the correct one. For achieving this you should take gradient with respect to you selected label.

But it's not enough. Also, you should subtract scaled gradient instead of adding. Doing this we actually perform gradient descent on the loss function surface in direction to the target class. Remember that we want **to minimize** loss function with respect to **selected class**.

# PRACTICUM TASK

1 - For each class select 10 images not from this class. Perform attack to move selected images to this class. for each class select image with highest confidence.
    
Best sample is the one with higher confidence.

2 - Try different epsilons for one selected class and collect the number of iterations required to achieve success

3* OPTIONAL - make attacks using a model trained on Cifar10 obtained from the previous task.

# Let's rock!

importing foolbox and torch



In [0]:
from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torchvision import datasets, transforms
import numpy as np
import matplotlib.pyplot as plt

If it's necessary - install foolbox

In [23]:
pip install foolbox

Collecting foolbox
[?25l  Downloading https://files.pythonhosted.org/packages/0a/3f/cee46491ba9546d8c7bf14e18a4ecbdae411ca3d2e2ccdb227aad6de1782/foolbox-2.3.0.tar.gz (1.9MB)
[K     |████████████████████████████████| 1.9MB 9.0MB/s 
Collecting GitPython
[?25l  Downloading https://files.pythonhosted.org/packages/20/8c/4543981439d23c4ff65b2e62dddd767ebc84a8e664a9b67e840d1e2730d3/GitPython-3.0.5-py3-none-any.whl (455kB)
[K     |████████████████████████████████| 460kB 49.3MB/s 
Collecting gitdb2>=2.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/03/6c/99296f89bad2ef85626e1df9f677acbee8885bb043ad82ad3ed4746d2325/gitdb2-2.0.6-py2.py3-none-any.whl (63kB)
[K     |████████████████████████████████| 71kB 13.5MB/s 
[?25hCollecting smmap2>=2.0.0
  Downloading https://files.pythonhosted.org/packages/55/d2/866d45e3a121ee15a1dc013824d58072fd5c7799c9c34d01378eb262ca8f/smmap2-2.0.5-py2.py3-none-any.whl
Building wheels for collected packages: foolbox
  Building wheel for foolbox (setu

In [0]:
import foolbox
from foolbox.models import PyTorchModel
from foolbox.attacks import L2BasicIterativeAttack, FGSM
from foolbox.criteria import Misclassification, ConfidentMisclassification, TargetClassProbability

Loader and criterion. We have chosen class of digit 2.

In [0]:
loader = torch.utils.data.DataLoader(
    datasets.MNIST('./mnist', 
                   download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                   ])),
    batch_size=128, shuffle=True)

criterion = TargetClassProbability(2, 0.90)

Visualization if it is necessary.

In [0]:
# # visualize data
# fig=plt.figure(figsize=(16, 8))
# data, label = next(iter(loader))

# for i in range(1, 51):
#     img = data[i][0]
#     fig.add_subplot(5, 10, i)
#     plt.imshow(img, cmap='gray', vmin=0, vmax=1)
# plt.show()


Creating a dictionary, which has keys = digits and values are ten pictures, which contains these digits.

In [0]:
from collections import defaultdict
digit_images = defaultdict(list)
data, label = next(iter(loader))
# label = np.array(lebel)

for digit in range(10):
  i = 0
  while (len(digit_images[digit])) != 10 and i < len(data):
    if label[i] == digit:
      digit_images[digit].append(data[i][0])
    i += 1

Functions for adversarials

In [0]:
model.eval()
fb_model = PyTorchModel(
    model, 
    bounds=(-4, +4), 
    num_classes=10,
    channel_axis=1,
)
 
transform = transforms.Compose(
    [
        # resize image to 299 pixels in width and hight
        transforms.Resize((299,299)),
        
        # transorm "Image" object to "tensor" onject. Used when working with PIL.Image
        transforms.ToTensor(),
        
        # Normalize image per chanel
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225]
        )
    ]
)

In [0]:
from PIL import Image

def restore_image(image):
    # Move chanel axis [3, 299, 299] -> [299, 299, 3]
    new_image = np.rollaxis(np.rollaxis((image), 2), 2)
    
    # Multiply by std and add mean
    new_image = (new_image * [0.229, 0.224, 0.225]) + [0.485, 0.456, 0.406]
    
    # Move from range 0-1 to the range 0-255
    new_image = new_image * 255
    
    # Make sure to remove all values that lower that 0 or higher than 255
    # as it not valid images
    new_image = np.clip(new_image, 0, 255)
    
    # Put image to the "byte" format. 
    # That`s required by PIL.Image to be abble to restore image from numpy array
    new_image = new_image.astype(np.uint8)
    return new_image

In [0]:
def generate_adversarial(foolbox_model, attack, selected_criterion, image):
    attack = attack(
        model=foolbox_model,
        criterion=selected_criterion
    )
    normalized_image = image.unsqueeze(0).to(device)
    ########### I added  another unsqueeze, because it caused an error ########
    normalized_image = normalized_image.unsqueeze(0).to(device)
    prediction = model(normalized_image)[0]
    predicted_class = prediction.argmax(-1).cpu().numpy()
    
    normalazed_input_numpy = normalized_image.cpu().numpy()
    predicted_labels = np.array([int(predicted_class)])

    new_images = attack(normalazed_input_numpy, labels=predicted_labels)
    
    restored_numpy_array = restore_image(new_images[0])
    restored_image = Image.fromarray(restored_numpy_array)
    return restored_image

def print_prediction(image):
  normalized_image = transform(image).unsqueeze(0).to(device)
  # print(normalized_image.size())
  # normalized_image = normalized_image.unsqueeze(0).to(device)
  prediction = model(normalized_image)[0]
  predicted_class = prediction.argmax(-1).cpu().numpy()
  print(f"Predicted class {int(predicted_class)} : {imagenet_labels[int(predicted_class)]}")
  print(f"Probability: {torch.softmax(prediction, -1)[predicted_class]:.3f}")

#Checking

In [70]:
adv_image = generate_adversarial(
    fb_model, 
    L2BasicIterativeAttack,
    TargetClassProbability(2, 0.95), 
    digit_images[7][0]
)
print_prediction(adv_image)
adv_image

RuntimeError: ignored

Fuctions for generating adversarials

In [0]:
# normalized_image = digit_images[7][1].unsqueeze(0).to(device)
# normalized_image = normalized_image.unsqueeze(0).to(device)
# normalized_image.size()
# prediction = model(normalized_image)[0]
# predicted_class = prediction.argmax(-1).cpu().numpy()
# print(f"Predicted class {int(predicted_class)} : {'7'}")
# print(f"Probability: {torch.softmax(prediction, -1)[predicted_class]:.4f}")

# # First we need to move our input from torch to numpy array, as it required by FoolBox
# normalazed_input_numpy = normalized_image.cpu().numpy()

# # Than we need to do a little modification to the predicted class
# # Namely move it to array of shape [batch_size], in our case [1]
# predicted_labels = np.array([int(predicted_class)])
# # And finaly we can generate new image provided starting point and true label
# new_images = attack(normalazed_input_numpy, labels=predicted_labels)
# new_images = attack(new_images, labels=predicted_labels)
# prediction = model(new_images)[0]
# predicted_class = prediction.argmax(-1).cpu().numpy()
# print(f"Predicted class {int(predicted_class)} : {'7'}")
# print(f"Probability: {torch.softmax(prediction, -1)[predicted_class]:.4f}")