In [16]:
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchvision import datasets, transforms
from torch.utils.data import DataLoader

cuda = torch.cuda.is_available()

## From adversarial examples to training robust models

In the previous notebooks, we focused on methods for solving the maximization problem over perturbations; that is, to finding the solution to the problem
\begin{equation}
\DeclareMathOperator*{\maximize}{maximize}
\maximize_{\|\delta\| \leq \epsilon} \ell(h_\theta(x + \delta), y).
\end{equation}

In this notebook, we will focus on training a robust classifier. More precisly, we aim at solving following minimization problem, namely Adversarial Training:
\begin{equation}
\DeclareMathOperator*{\minimize}{minimize}
\minimize_\theta \frac{1}{|S|} \sum_{x,y \in S} \max_{\|\delta\| \leq \epsilon} \ell(h_\theta(x + \delta), y).
\end{equation}
The order of the min-max operations is important here.  Specially, the max is inside the minimization, meaning that the adversary (trying to maximize the loss) gets to "move" _second_.  We assume, essentially, that the adversary has full knowledge of the classifier parameters $\theta$, and that they get to specialize their attack to whatever parameters we have chosen in the outer minimization. The goal of the robust optimization formulation, therefore, is to ensure that the model cannot be attacked _even if_ the adversary has full knowledge of the model.  Of course, in practice we may want to make assumptions about the power of the adversary but it can be difficult to pin down a precise definition of what we mean by the "power" of the adversary, so extra care should be taken in evaluating models against possible "realistic" adversaries.

## Exercice 1
1. Train a robust classifier using Adversarial Training with a specific norm
2. Evaluate your classifier on natural and adversarial examples crafted with the norm of the training and other norms
3. Make an analysis and conclude

In [2]:
# load CIFAR10 dataset
def load_cifar(split, batch_size):
  train = True if split == 'train' else False
  dataset = datasets.CIFAR10("./docs", train=split, download=True, transform=transforms.ToTensor())
  return DataLoader(dataset, batch_size=batch_size, shuffle=train)

batch_size = 100
train_loader = load_cifar('train', batch_size)
test_loader = load_cifar('test', batch_size)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./docs/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:18<00:00, 9109268.90it/s]


Extracting ./docs/cifar-10-python.tar.gz to ./docs
Files already downloaded and verified


In [3]:
class ConvModel(torch.nn.Module):

  def __init__(self):
    super(ConvModel, self).__init__()
    self.conv1 = nn.Conv2d(3, 6, 3, padding=1) # 3 input channels, 6 output channels (doit être sup a input), 3x3 kernel
    self.pool = nn.MaxPool2d(2) # 2x2 kernel
    self.conv2 = nn.Conv2d(6, 16, 3, padding=1) # 6 input channels, 16 output channels, 3x3 kernel
    self.fc1 = nn.Linear(16 * 8 * 8, 120) # 16x8x8 input features, 120 output features
    self.fc2 = nn.Linear(120, 84) # 120 input features, 84 output features
    self.fc3 = nn.Linear(84, 10) # 84 input features, 10 output features


  def forward(self, x):
    x = self.pool(F.relu(self.conv1(x)))
    x = self.pool(F.relu(self.conv2(x)))
    #print(x.size())
    x = x.view(-1, 16 * 8 * 8)
    x = F.relu(self.fc1(x))
    x = F.relu(self.fc2(x))
    x = self.fc3(x)
    return x

In [14]:
class FastGradientSignMethod:

  def __init__(self, model, eps):
    super().__init__()
    self.model = model
    self.eps = eps
    self.loss = nn.CrossEntropyLoss()

  def compute(self, x, y):
    """ Construct FGSM adversarial perturbation for examples x"""
    x.requires_grad = True
    y_pred = self.model(x)
    loss = self.loss(y_pred, y)
    loss.backward()
    grad = x.grad.data
    grad = grad.sign()
    return self.eps * grad
    #delta = torch.zeros_like(x, requires_grad=True)
    # Use variable.grad.detach() to retreive the gradient with respect to a loss

eps = 0.007 # define eps here
fgsm = FastGradientSignMethod(model,eps)

class ProjectedGradientDescent:

  def __init__(self, model, eps, alpha, num_iter):
    self.model = model
    self.eps = eps
    self.alpha = alpha #le pas
    self.num_iter = num_iter #nb iteration
    self.loss = nn.CrossEntropyLoss()



  def compute(self, x, y):
    """ Construct PGD adversarial pertubration on the examples x."""
    delta = torch.zeros_like(x, requires_grad=True)
    for _ in range(self.num_iter):
      y_pred = self.model(x+delta)
      loss = self.loss(y_pred, y)
      loss.backward()
      grad = delta.grad
      delta.data = delta.data + self.alpha * grad.detach().sign()
      delta.data = torch.clamp(delta, -self.eps, self.eps) #projection sur le cercle
      #delta.grad.zero_()
    return delta.detach()

alpha = 1e-2
pgd = ProjectedGradientDescent(model, eps, alpha, 5)


In [23]:
def adversarial_train_model(model, criterion, optimizer, loader, attack):
  """Function to train the model"""
  train_loss = 0.
  model.train()
  epochs = 10

  for e in range(epochs):
    for batch_idx, (imgs, labels) in enumerate(train_loader):
      if cuda:
       imgs, labels = imgs.cuda(), labels.cuda()
      # clear the gradients of all optimized variables
      optimizer.zero_grad()
      # forward pass: compute predicted outputs by passing inputs to the model
      img_attack = imgs + attack.compute(imgs, labels)
      output = model(img_attack)
      # calculate the batch loss
      loss = criterion(output, labels)
      # backward pass: compute gradient of the loss with respect to model parameters
      loss.backward()
      # perform a single optimization step (parameter update)
      optimizer.step()
      # update training loss
      train_loss += loss.item()*imgs.size(0)

  return train_loss / len(train_loader.dataset)

# adverserial training with PGD
model = ConvModel()
if cuda:
  model = model.cuda()

# define your loss
criterion = nn.CrossEntropyLoss()

# define the optimizer
opt = optim.SGD(model.parameters(), lr=1e-2)

# define the attack
attack = fgsm

adversarial_train_model(model, criterion, opt, train_loader, attack)

20.64851740837097

In [41]:
#train the model depending on the attack

model1=ConvModel()
if cuda:
  model1 = model1.cuda()

model2=ConvModel()
if cuda:
  model2 = model2.cuda()

def adversarial_train_model_2(model, criterion, optimizer, loader, attack):

 train_loss = 0.
 model.train()
 epochs = 10
 k=3

 for e in range(epochs):
    for batch_idx, (imgs, labels) in enumerate(train_loader):
      if cuda:
       imgs, labels = imgs.cuda(), labels.cuda()
       if batch_idx % k == 0:
          delta = attack.compute(imgs, labels)
          adv_imgs = torch.clamp(imgs + delta, min=0, max=1)
          output = model(adv_imgs)
       else:
          output = model(imgs)

      loss = criterion(output, labels)
      loss.backward()
      opt.step()

      train_loss += loss.item() * imgs.size(0)

 return train_loss / len(loader.dataset)


In [42]:
def eval_model(model, loader, attack=None):
  """Function to evaluate your model on a specific loader"""
  accuracy = 0.
  n_inputs = 0.
  for n_batch, (imgs, labels) in enumerate(loader):
    if cuda:
      imgs, labels = imgs.cuda(), labels.cuda()
    if attack is None:
      outputs = model(imgs)
    else:
      delta = attack.compute(imgs, labels)
      adv = imgs + delta
      outputs = model(adv)
    _, predicted = torch.max(outputs.data, 1)
    accuracy += predicted.eq(labels.data).cpu().sum().numpy()
    n_inputs += imgs.shape[0]
  accuracy /= n_inputs
  print('accuracy on testset: {:.4f}'.format(accuracy))

attack = fgsm
eval_model(model, test_loader)
eval_model(model, test_loader, attack)

accuracy on testset: 0.1000
accuracy on testset: 0.1000
