In [3]:
%matplotlib inline

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
import logging
import os
import time

import numpy as np
import matplotlib.pyplot as plt
import foolbox
import torch
import torch.backends.cudnn as cudnn
import torch.nn as nn
import torch.optim as optim

from lib.dataset_utils import *
from lib.mnist_model import *
from lib.adv_model import *
from lib.dknn_attack import DKNNAttack
from lib.dknn_attack_l2 import DKNNL2Attack
from lib.dknn_attack_linf import DKNNLinfAttack
from lib.cwl2_attack import CWL2Attack
from lib.dknn import DKNNL2
from lib.utils import *
from lib.lip_model import *

Loading faiss with AVX2 support.


In [5]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [8]:
# Subsamples dataset and reduces to two classes (1 and 7)
# Code adapted from https://github.com/yangarbiter/adversarial-nonparametrics

num_samples = 2200
num_val = 300 # can pick anything > 0, does not really get used here

from keras.datasets import mnist
(X, y), (_, _) = mnist.load_data()
np.random.seed(2019)
idx1 = np.random.choice(
    np.where(y == 1)[0], num_samples // 2, replace=False)
idx2 = np.random.choice(
    np.where(y == 7)[0], num_samples // 2, replace=False)
y[idx1] = 0
y[idx2] = 1
X = np.vstack((X[idx1], X[idx2])).astype(np.float32) / 255.
y = np.concatenate((y[idx1], y[idx2]))

idxs = np.arange(num_samples)
np.random.shuffle(idxs)
x_train_sub = torch.tensor(X[idxs[:-200]])
x_test_sub = torch.tensor(X[idxs[-200:]])
y_train_sub = torch.tensor(y[idxs[:-200]])
y_test_sub = torch.tensor(y[idxs[-200:]])

Using TensorFlow backend.


Downloading data from https://s3.amazonaws.com/img-datasets/mnist.npz


In [9]:
net_knn = KNNModel()

In [99]:
layers = ['identity']
knn = DKNNL2(net_knn, x_train_sub, y_train_sub, 
             x_test_sub, y_test_sub, layers, 
             k=1, num_classes=2)

In [100]:
with torch.no_grad():
    y_pred = knn.classify(x_test_sub)
    ind = np.where(y_pred.argmax(1) == y_test_sub.numpy())[0]
    print((y_pred.argmax(1) == y_test_sub.numpy()).sum() / y_test_sub.size(0))

0.99


In [None]:
# Attack for L2 DkNN

attack = DKNNL2Attack()
# attack = DKNNLinfAttack()

def attack_batch(x, y, batch_size):
    x_adv = torch.zeros_like(x)
    total_num = x.size(0)
    num_batches = total_num // batch_size
    for i in range(num_batches):
        begin = i * batch_size
        end = (i + 1) * batch_size
        x_adv[begin:end] = attack(
            knn, x[begin:end], y[begin:end],
            guide_layer=layers[0], m=75, binary_search_steps=15,
            max_iterations=1000, learning_rate=1e-2, guide_mode=2,
            initial_const=1e-2, abort_early=True, random_start=False)
    return x_adv

num = 200
x_adv = attack_batch(x_test_sub[:num].cuda(), y_test_sub[:num], 200)

In [180]:
'''Implement gradient-based attack on DkNN with L-2 constraint'''

import logging

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim

INFTY = 1e20


class DKNNExpAttack(object):
    """
    Implement gradient-based attack on Deep k-Nearest Neigbhor that uses
    L-2 distance as a metric
    """
    def __init__(self, dknn):
        self.dknn = dknn
        self.device = dknn.device
        self.layers = dknn.layers
        self.guide_reps = {}
        self.thres = None
        self.coeff = None

    def __call__(self, x_orig, label, guide_layer='relu1', m=100,
                 binary_search_steps=5, max_iterations=500,
                 learning_rate=1e-2, initial_const=1, max_linf=None, 
                 random_start=False, thres_steps=100, check_adv_steps=100,
                 verbose=True):
        """
        Parameters
        ----------
        dknn : DKNN object
            DkNN (defined in lin/dknn.py) that we want to attack
        x_orig : torch.tensor
            tensor of the original samples to attack. Does not need to require
            gradients, shape is (num_samples, ) + input_shape
        label : torch.tensor
            tensor of the label corresponding to x_orig
        guide_layer : str. optional
            layer name in which we want to find guide samples. Default is
            'relu1'
        m : int, optional
            number of guide samples. Default is 100
        binary_search_step : int, optional
            number of steps for binary search on the norm penalty constant.
            Default is 5
        max_iterations : int, optional
            number of optimization steps (per one binary search). Default is
            500
        learning_rate : float , optional
            step size or learning rate for the optimizer. Default is 1e-2
        initial_const : float, optional
            a number the norm penalty constant should be initialized to.
            Default is 1
        abort_early : bool, optional
            whether or not to abort the optimization early (before reaching
            max_iterations) if the objective does not improve from the past
            (max_iterations // 10) steps. Default is True
        max_linf : float, optional
            use to bound the L-inf norm of the attacks (addition to L-2 norm
            penalty). Set to None to not use this option. Default is None
        random_start : bool, optional
            whether or not to initialize the perturbation with small isotropic
            Gaussian noise. Default is False
        guide_mode : int, optional
            Choose the guide_mode to use between 1 and 2. Default is 1
            - guide_mode == 1: find m nearest neighbors to input that all have
            the same class but not equal its original label.
            - guide_mode == 2: find the nearest neighbor that has a different
            class from the input and find its m - 1 neighbors

        Returns
        -------
        x_adv : torch.tensor
            adversarial examples found. If adversarial examples for some inputs
            are not found, return those inputs.
        """

        min_, max_ = x_orig.min(), x_orig.max()
        if max_linf is not None:
            min_ = torch.max(x_orig - max_linf, min_)
            max_ = torch.min(x_orig + max_linf, max_)
        batch_size = x_orig.size(0)
        x_adv = x_orig.clone()
        label = label.cpu().numpy()
        input_shape = x_orig.detach().cpu().numpy().shape
        # initialize coeff for guide samples
        self.coeff = torch.zeros((x_orig.size(0), m))
        self.coeff[:, :m // 2] += 1
        self.coeff[:, m // 2:] -= 1

        def to_attack_space(x):
            # map from [min_, max_] to [-1, +1]
            a = (min_ + max_) / 2
            b = (max_ - min_) / 2
            x = (x - a) / b

            # from [-1, +1] to approx. (-1, +1)
            x = x * 0.999999

            # from (-1, +1) to (-inf, +inf)
            return self.atanh(x)

        def to_model_space(x):
            """Transforms an input from the attack space
            to the model space. This transformation and
            the returned gradient are elementwise."""

            # from (-inf, +inf) to (-1, +1)
            x = torch.tanh(x)

            # map from (-1, +1) to (min_, max_)
            a = (min_ + max_) / 2
            b = (max_ - min_) / 2
            x = x * b + a

            return x

        # variables representing inputs in attack space will be prefixed with z
        z_orig = to_attack_space(x_orig)
        x_recon = to_model_space(z_orig)

        # declare tensors that keep track of constants and binary search
        const = torch.zeros((batch_size, ), device=self.device)
        const += initial_const
        lower_bound = torch.zeros_like(const)
        upper_bound = torch.zeros_like(const) + INFTY
        best_l2dist = torch.zeros_like(const) + INFTY

        for binary_search_step in range(binary_search_steps):
            if (binary_search_step == binary_search_steps - 1 and
                    binary_search_steps >= 10):
                    # in the last binary search step, use the upper_bound instead
                    # to ensure that unsuccessful attacks use the largest
                    # possible constant
                const = upper_bound

            # initialize perturbation in transformed space
            if not random_start:
                z_delta = torch.zeros_like(z_orig, requires_grad=True)
            else:
                rand = np.random.randn(*input_shape) * 1e-2
                z_delta = torch.tensor(
                    rand, dtype=torch.float32, requires_grad=True, 
                    device=self.device)
            loss_at_previous_check = torch.zeros(1, device=self.device) + INFTY

            # create a new optimizer
            optimizer = optim.RMSprop([z_delta], lr=learning_rate)
#             optimizer = torch.optim.LBFGS([z_delta], lr=1, max_iter=20, max_eval=None, 
#                                           tolerance_grad=1e-07, tolerance_change=1e-09, 
#                                           history_size=100, line_search_fn=None)
            
            # add learning rate scheduler
            lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                optimizer, mode='min', factor=0.5, patience=150, 
                threshold=0.01, threshold_mode='rel')

            for iteration in range(max_iterations):
                optimizer.zero_grad()
                x = to_model_space(z_orig + z_delta)
                
                # adaptively choose threshold and guide samples every 
                # <thres_steps> iterations
                with torch.no_grad():
                    if iteration % thres_steps == 0:
#                         order = (self.dknn.k + 1) // 2 - 1
#                         thres = self.dknn.get_neighbors(x)[0][0][:, order]
                        thres = self.dknn.get_neighbors(x)[0][0][:, -1]
                        self.thres = torch.tensor(thres).to(self.device).view(
                            batch_size, 1)
                        self.find_guide_samples(
                            x, label, m=m, layer=guide_layer)

                reps = self.dknn.get_activations(x, requires_grad=True)
                loss, l2dist = self.loss_function(
                    x, reps, const, x_recon)
                loss.backward()
                optimizer.step()
#                 lr_scheduler.step(loss)

                if (verbose and iteration % \
                    (np.ceil(max_iterations / 10)) == 0):
                    print('    step: %d; loss: %.3f; l2dist: %.3f' %
                          (iteration, loss.cpu().detach().numpy(),
                           l2dist.mean().cpu().detach().numpy()))
                
                # every <check_adv_steps>, save adversarial samples 
                # with minimal perturbation
                if iteration % check_adv_steps == 0:
                    is_adv = self.check_adv(x, label)
                    for i in range(batch_size):
                        if is_adv[i] and best_l2dist[i] > l2dist[i]:
                            x_adv[i] = x[i]
                            best_l2dist[i] = l2dist[i]

            # check how many attacks have succeeded
            with torch.no_grad():
                is_adv = self.check_adv(x, label)
                if verbose:
                    print(is_adv.sum())

            for i in range(batch_size):
                # set new upper and lower bounds
                if is_adv[i]:
                    upper_bound[i] = const[i]
                else:
                    lower_bound[i] = const[i]
                # set new const
                if upper_bound[i] == INFTY:
                    # exponential search if adv has not been found
                    const[i] *= 10
                else:
                    # binary search if adv has been found
                    const[i] = (lower_bound[i] + upper_bound[i]) / 2
                # only keep adv with smallest l2dist
                if is_adv[i] and best_l2dist[i] > l2dist[i]:
                    x_adv[i] = x[i]
                    best_l2dist[i] = l2dist[i]

            # check the current attack success rate (combined with previous
            # binary search steps)
            if verbose:
                with torch.no_grad():
                    is_adv = self.check_adv(x_adv, label)
                    print('binary step: %d; number of successful adv: %d/%d' %
                          (binary_search_step, is_adv.sum(), batch_size))

        return x_adv

    def check_adv(self, x, label):
        """Check if label of <x> predicted by <dknn> matches with <label>"""
        y_pred = self.dknn.classify(x).argmax(1)
        return torch.tensor((y_pred != label).astype(np.float32)).to(self.device)

    def loss_function(self, x, reps, const, x_recon):
        """Returns the loss averaged over the batch (first dimension of x) and
        L-2 norm squared of the perturbation
        """

        batch_size = x.size(0)
        adv_loss = torch.zeros((batch_size, len(layers)), device=self.device)
        # find squared L-2 distance between original samples and their
        # adversarial examples at each layer
        for l, layer in enumerate(self.layers):
            rep = reps[layer].view(batch_size, 1, -1)
            dist = ((rep - self.guide_reps[layer])**2).sum(2)
#             fx = self.sigmoid((self.thres - dist).clamp(-80 / self.a, 80 / self.a), a=self.a)
#             fx = -dist
            fx = self.thres - dist
#             Fx = (coeff.to(device) * fx).sum(1)
            Fx = torch.max(torch.tensor(0., device=self.device), 
                           self.coeff.to(self.device) * fx).sum(1)
#             adv_loss[:, l] = torch.max(torch.tensor(-1., device=device), Fx)
            adv_loss[:, l] = Fx
        # find L-2 norm squared of perturbation
        l2dist = torch.norm((x - x_recon).view(batch_size, -1), dim=1)**2
        # total_loss is sum of squared perturbation norm and squared distance
        # of representations, multiplied by constant
        total_loss = l2dist + const * adv_loss.mean(1)

        return total_loss.mean(), l2dist.sqrt()

    def find_guide_samples(self, x, label, m=100, layer='relu1'):
        """Find k nearest neighbors to <x> that all have the same class but not
        equal to <label>
        """
        num_classes = self.dknn.num_classes
        x_train = self.dknn.x_train
        y_train = self.dknn.y_train
        batch_size = x.size(0)
        nn = torch.zeros((m, ) + x.size()).transpose(0, 1)
        D, I = self.dknn.get_neighbors(
            x, k=x_train.size(0), layers=[layer])[0]

        for i, (d, ind) in enumerate(zip(D, I)):
            mean_dist = np.zeros((num_classes, ))
            for j in range(num_classes):
                mean_dist[j] = np.mean(
                    d[np.where(y_train[ind] == j)[0]][:m // 2])
            mean_dist[label[i]] += INFTY
            nearest_label = mean_dist.argmin()
            nn_ind = np.where(y_train[ind] == nearest_label)[0][:m // 2]
            nn[i, m // 2:] = x_train[ind[nn_ind]]
            nn_ind = np.where(y_train[ind] == label[i])[0][:m // 2]
            nn[i, :m // 2] = x_train[ind[nn_ind]]
            
        # initialize self.guide_reps if empty
        if not self.guide_reps:
            guide_rep = self.dknn.get_activations(
                nn[0], requires_grad=False)
            for layer in self.layers:
                # set a zero tensor before filling it
                size = (batch_size, ) + guide_rep[layer].view(m, -1).size()
                self.guide_reps[layer] = torch.zeros(size, device=self.device)
        
        # fill self.guide_reps
        for i in range(batch_size):
            guide_rep = self.dknn.get_activations(
                nn[i], requires_grad=False)
            self.guide_reps[layer][i] = guide_rep[layer].view(
                m, -1).detach()

    @staticmethod
    def atanh(x):
        return 0.5 * torch.log((1 + x) / (1 - x))

    @staticmethod
    def sigmoid(x, a=1):
        return 1 / (1 + torch.exp(-a * x))

In [104]:
# '''Implement gradient-based attack on DkNN with L-2 constraint'''

# import logging

# import numpy as np
# import torch
# import torch.nn.functional as F
# import torch.optim as optim

# INFTY = 1e20


# class DKNNExpAttack(object):
#     """
#     Implement gradient-based attack on Deep k-Nearest Neigbhor that uses
#     L-2 distance as a metric
#     """

#     def __call__(self, dknn, x_orig, label, guide_layer='relu1', m=100,
#                  binary_search_steps=5, max_iterations=500,
#                  learning_rate=1e-2, initial_const=1, abort_early=True,
#                  max_linf=None, random_start=False, guide_mode=1, thres=0, a=1):
#         """
#         Parameters
#         ----------
#         dknn : DKNN object
#             DkNN (defined in lin/dknn.py) that we want to attack
#         x_orig : torch.tensor
#             tensor of the original samples to attack. Does not need to require
#             gradients, shape is (num_samples, ) + input_shape
#         label : torch.tensor
#             tensor of the label corresponding to x_orig
#         guide_layer : str. optional
#             layer name in which we want to find guide samples. Default is
#             'relu1'
#         m : int, optional
#             number of guide samples. Default is 100
#         binary_search_step : int, optional
#             number of steps for binary search on the norm penalty constant.
#             Default is 5
#         max_iterations : int, optional
#             number of optimization steps (per one binary search). Default is
#             500
#         learning_rate : float , optional
#             step size or learning rate for the optimizer. Default is 1e-2
#         initial_const : float, optional
#             a number the norm penalty constant should be initialized to.
#             Default is 1
#         abort_early : bool, optional
#             whether or not to abort the optimization early (before reaching
#             max_iterations) if the objective does not improve from the past
#             (max_iterations // 10) steps. Default is True
#         max_linf : float, optional
#             use to bound the L-inf norm of the attacks (addition to L-2 norm
#             penalty). Set to None to not use this option. Default is None
#         random_start : bool, optional
#             whether or not to initialize the perturbation with small isotropic
#             Gaussian noise. Default is False
#         guide_mode : int, optional
#             Choose the guide_mode to use between 1 and 2. Default is 1
#             - guide_mode == 1: find m nearest neighbors to input that all have
#             the same class but not equal its original label.
#             - guide_mode == 2: find the nearest neighbor that has a different
#             class from the input and find its m - 1 neighbors

#         Returns
#         -------
#         x_adv : torch.tensor
#             adversarial examples found. If adversarial examples for some inputs
#             are not found, return those inputs.
#         """

#         min_, max_ = x_orig.min(), x_orig.max()
#         if max_linf is not None:
#             min_ = torch.max(x_orig - max_linf, min_)
#             max_ = torch.min(x_orig + max_linf, max_)
#         batch_size = x_orig.size(0)
#         x_adv = x_orig.clone()
#         label = label.cpu().numpy()
#         input_shape = x_orig.detach().cpu().numpy().shape
#         device = dknn.device
#         self.a = a
#         self.thres = thres.to(device).view(batch_size, 1)

#         def to_attack_space(x):
#             # map from [min_, max_] to [-1, +1]
#             a = (min_ + max_) / 2
#             b = (max_ - min_) / 2
#             x = (x - a) / b

#             # from [-1, +1] to approx. (-1, +1)
#             x = x * 0.999999

#             # from (-1, +1) to (-inf, +inf)
#             return self.atanh(x)

#         def to_model_space(x):
#             """Transforms an input from the attack space
#             to the model space. This transformation and
#             the returned gradient are elementwise."""

#             # from (-inf, +inf) to (-1, +1)
#             x = torch.tanh(x)

#             # map from (-1, +1) to (min_, max_)
#             a = (min_ + max_) / 2
#             b = (max_ - min_) / 2
#             x = x * b + a

#             return x

#         # variables representing inputs in attack space will be prefixed with z
#         z_orig = to_attack_space(x_orig)
#         x_recon = to_model_space(z_orig)

#         # declare tensors that keep track of constants and binary search
#         const = torch.zeros((batch_size, ), device=device)
#         const += initial_const
#         lower_bound = torch.zeros_like(const)
#         upper_bound = torch.zeros_like(const) + INFTY
#         best_l2dist = torch.zeros_like(const) + INFTY

#         with torch.no_grad():

#             # choose guide samples and get their representations
#             x_guide, coeff = self.find_guide_samples_v2(
#                 dknn, x_orig, label, k=m, layer=guide_layer)
#             guide_reps = {}
#             for i in range(batch_size):
#                 guide_rep = dknn.get_activations(
#                     x_guide[i], requires_grad=False)
#                 for layer in dknn.layers:
#                     if i == 0:
#                         # set a zero tensor before filling it
#                         size = (batch_size, ) + \
#                             guide_rep[layer].view(m, -1).size()
#                         guide_reps[layer] = torch.zeros(size, device=device)
#                     guide_reps[layer][i] = guide_rep[layer].view(
#                         m, -1).detach()

#         for binary_search_step in range(binary_search_steps):
#             if (binary_search_step == binary_search_steps - 1 and
#                     binary_search_steps >= 10):
#                     # in the last binary search step, use the upper_bound instead
#                     # to ensure that unsuccessful attacks use the largest
#                     # possible constant
#                 const = upper_bound

#             if not random_start:
#                 z_delta = torch.zeros_like(z_orig, requires_grad=True)
#             else:
#                 rand = np.random.randn(*input_shape) * 1e-2
#                 z_delta = torch.tensor(
#                     rand, dtype=torch.float32, requires_grad=True, device=device)
#             loss_at_previous_check = torch.zeros(1, device=device) + INFTY

#             # create a new optimizer
# #             optimizer = optim.Adam([z_delta], lr=learning_rate)
#             # optimizer = optim.SGD([z_delta], lr=learning_rate)
#             optimizer = optim.RMSprop([z_delta], lr=learning_rate)
            
#             # add learning rate scheduler
#             lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
#                 optimizer, mode='min', factor=0.5, patience=150, 
#                 threshold=0.01, threshold_mode='rel')

#             for iteration in range(max_iterations):
#                 optimizer.zero_grad()
#                 x = to_model_space(z_orig + z_delta)
#                 # adjust threshold
# #                 with torch.no_grad():
# #                     if iteration % 100 == 0:
# #                         thres = dknn.get_neighbors(x)[0][0][:, -1]
# #                         self.thres = torch.tensor(thres).to(device).view(batch_size, 1)
# #                         x_guide, coeff = self.find_guide_samples_v2(
# #                             dknn, x, label, k=m, layer=guide_layer)
# #                         guide_reps = {}
# #                         for i in range(batch_size):
# #                             guide_rep = dknn.get_activations(
# #                                 x_guide[i], requires_grad=False)
# #                             for layer in dknn.layers:
# #                                 if i == 0:
# #                                     # set a zero tensor before filling it
# #                                     size = (batch_size, ) + \
# #                                         guide_rep[layer].view(m, -1).size()
# #                                     guide_reps[layer] = torch.zeros(size, device=device)
# #                                 guide_reps[layer][i] = guide_rep[layer].view(
# #                                     m, -1).detach()
                    
#                 reps = dknn.get_activations(x, requires_grad=True)
#                 loss, l2dist = self.loss_function(
#                     x, reps, guide_reps, coeff, dknn.layers, const, x_recon, device)
#                 loss.backward()
#                 optimizer.step()
# #                 lr_scheduler.step(loss)

# #                 if iteration == 0 or iteration == 900:
# #                     import pdb
# #                     pdb.set_trace()
# #                 import pdb; pdb.set_trace()

#                 if iteration % (np.ceil(max_iterations / 10)) == 0:
#                     print('    step: %d; loss: %.3f; l2dist: %.3f' %
#                           (iteration, loss.cpu().detach().numpy(),
#                            l2dist.mean().cpu().detach().numpy()))
#                 # DEBUG:
#                 # for i in range(5):
#                 #     print(z_delta.grad[i].view(-1).norm().item())

#                 if abort_early and iteration % 10 == 0:
#                     # # after each tenth of the iterations, check progress
#                     # if torch.gt(loss, .9999 * loss_at_previous_check):
#                     #     break  # stop Adam if there has not been progress
#                     # loss_at_previous_check = loss
#                     is_adv = self.check_adv(dknn, x, label)
# #                     print(is_adv.sum())
#                     for i in range(batch_size):
#                         if is_adv[i] and best_l2dist[i] > l2dist[i]:
#                             x_adv[i] = x[i]
#                             best_l2dist[i] = l2dist[i]

#             # check how many attacks have succeeded
#             with torch.no_grad():
#                 is_adv = self.check_adv(dknn, x, label)
#                 print(is_adv.sum())

#             # if binary_search_step == 14:
#             #     import pdb
#             #     pdb.set_trace()

#             for i in range(batch_size):
#                 # set new upper and lower bounds
#                 if is_adv[i]:
#                     upper_bound[i] = const[i]
#                 else:
#                     lower_bound[i] = const[i]
#                 # set new const
#                 if upper_bound[i] == INFTY:
#                     # exponential search if adv has not been found
#                     const[i] *= 10
#                 else:
#                     # binary search if adv has been found
#                     const[i] = (lower_bound[i] + upper_bound[i]) / 2
#                 # only keep adv with smallest l2dist
#                 if is_adv[i] and best_l2dist[i] > l2dist[i]:
#                     x_adv[i] = x[i]
#                     best_l2dist[i] = l2dist[i]

#             # check the current attack success rate (combined with previous
#             # binary search steps)
#             with torch.no_grad():
#                 is_adv = self.check_adv(dknn, x_adv, label)
#             print('binary step: %d; number of successful adv: %d/%d' %
#                   (binary_search_step, is_adv.sum(), batch_size))

#         return x_adv

#     @classmethod
#     def check_adv(cls, dknn, x, label):
#         """Check if label of <x> predicted by <dknn> matches with <label>"""
#         y_pred = dknn.classify(x).argmax(1)
#         return torch.tensor((y_pred != label).astype(np.float32)).to(dknn.device)

#     def loss_function(self, x, reps, guide_reps, coeff, layers, const, x_recon, device):
#         """Returns the loss averaged over the batch (first dimension of x) and
#         L-2 norm squared of the perturbation
#         """

#         batch_size = x.size(0)
#         adv_loss = torch.zeros((batch_size, len(layers)), device=device)
#         # find squared L-2 distance between original samples and their
#         # adversarial examples at each layer
#         for l, layer in enumerate(layers):
#             rep = reps[layer].view(batch_size, 1, -1)
#             dist = ((rep - guide_reps[layer])**2).sum(2)
# #             import pdb; pdb.set_trace()
# #             fx = self.sigmoid((self.thres - dist).clamp(-80 / self.a, 80 / self.a), a=self.a)
# #             fx = -dist
#             fx = self.thres - dist
# #             Fx = (coeff.to(device) * fx).sum(1)
#             Fx = torch.max(torch.tensor(0., device=device), coeff.to(device) * fx).sum(1)
# #             adv_loss[:, l] = torch.max(torch.tensor(-1., device=device), Fx)
#             adv_loss[:, l] = Fx
#         # find L-2 norm squared of perturbation
#         l2dist = torch.norm((x - x_recon).view(batch_size, -1), dim=1)**2
#         # total_loss is sum of squared perturbation norm and squared distance
#         # of representations, multiplied by constant
#         total_loss = l2dist + const * adv_loss.mean(1)

#         return total_loss.mean(), l2dist.sqrt()

#     @staticmethod
#     def find_guide_samples(dknn, x, label, k=100, layer='relu1'):
#         """Find k nearest neighbors to <x> that all have the same class but not
#         equal to <label>
#         """
#         num_classes = dknn.num_classes
#         nn = torch.zeros((k, ) + x.size()).transpose(0, 1)
#         coeff = torch.zeros((x.size(0), k))
#         # coeff[:, :k // 2] -= 1
#         # coeff[:, k // 2:] += 1
#         coeff[:, :k // 2] += 1
#         coeff[:, k // 2:] -= 1
#         # coeff += 1
#         D, I = dknn.get_neighbors(
#             x, k=dknn.x_train.size(0), layers=[layer])[0]

#         for i, (d, ind) in enumerate(zip(D, I)):
#             mean_dist = np.zeros((num_classes, ))
#             for j in range(num_classes):
#                 mean_dist[j] = np.mean(
#                     d[np.where(dknn.y_train[ind] == j)[0]][:k // 2])
#             mean_dist[label[i]] += INFTY
#             nearest_label = mean_dist.argmin()
#             nn_ind = np.where(dknn.y_train[ind] == nearest_label)[0][:k // 2]
#             nn[i, k // 2:] = dknn.x_train[ind[nn_ind]]
#             nn_ind = np.where(dknn.y_train[ind] == label[i])[0][:k // 2]
#             nn[i, :k // 2] = dknn.x_train[ind[nn_ind]]
#             # nn_ind = np.where(dknn.y_train[ind] == nearest_label)[0][:k]
#             # nn[i] = dknn.x_train[ind[nn_ind]]

#         return nn, coeff
    
#     def find_guide_samples_v2(cls, dknn, x, label, k=100, layer='relu1'):
#         """Find the nearest neighbor to <x> that has a different label from
#         <label>. Then find other <k> - 1 training samples that are closest to
#         the neighbor and has the same class
#         """
#         x_nn = torch.zeros((k, ) + x.size()).transpose(0, 1)
#         # find nearest sample with same class
#         _, I = dknn.get_neighbors(x, k=1, layers=[layer])[0]
#         x_nn[:, :k // 2] = cls.find_nn_same_class(dknn, I[0], k=k // 2, layer=layer)
        
#         # find nearest sample with different class
#         nn = dknn.find_nn_diff_class(x, label)
#         # now find k neighbors that has the same class as x_nn
#         x_nn[:, k // 2:] = cls.find_nn_same_class(dknn, nn, k=k // 2, layer=layer)
        
#         coeff = torch.zeros((x.size(0), k))
#         # coeff[:, :k // 2] -= 1
#         # coeff[:, k // 2:] += 1
#         coeff[:, :k // 2] += 1
#         coeff[:, k // 2:] -= 1
        
#         return x_nn, coeff

#     @staticmethod
#     def find_nn_same_class(dknn, ind_x, k=100, layer='relu1'):
#         """Find <k> training samples with the same class as and closest to the
#         training sample with index <ind_x> in representation space at <layer>
#         """

#         batch_size = ind_x.shape[0]
#         label = dknn.y_train[ind_x]
#         x_nn = torch.zeros((batch_size, k) + dknn.x_train[0].size())
#         _, I = dknn.get_neighbors(
#             dknn.x_train[ind_x], k=dknn.x_train.size(0), layers=[layer])[0]

#         for i, ind in enumerate(I):
#             nn_ind = np.where(dknn.y_train[ind] == label[i])[0][:k]
#             x_nn[i] = dknn.x_train[ind[nn_ind]]

#         return x_nn

#     @staticmethod
#     def atanh(x):
#         return 0.5 * torch.log((1 + x) / (1 - x))

#     @staticmethod
#     def sigmoid(x, a=1):
#         return 1 / (1 + torch.exp(-a * x))

In [114]:
# Attack for L2 DkNN

# attack = DKNNL2Attack()
# attack = DKNNLinfAttack()
from lib.dknn_attack_exp import DKNNExpAttack
attack = DKNNExpAttack(knn)

def attack_batch(x, y, batch_size):
    x_adv = torch.zeros_like(x)
    total_num = x.size(0)
    num_batches = total_num // batch_size
    for i in range(num_batches):
        begin = i * batch_size
        end = (i + 1) * batch_size
        x_adv[begin:end] = attack(
            x[begin:end], y[begin:end],
            guide_layer=layers[0], m=2, binary_search_steps=10,
            max_iterations=1000, learning_rate=1e-2,
            initial_const=1e-1, random_start=False,
            thres_steps=10, check_adv_steps=50, verbose=True,
            max_linf=0.3)
    return x_adv

x_adv = attack_batch(x_test_sub[:num].cuda(), y_test_sub[:num], 200)
# x_adv = attack_batch(x_test_sub[num-2:num-1].cuda(), y_test_sub[num-2:num-1], 1)

    step: 0; loss: 3.188; l2dist: 0.000
    step: 100; loss: 2.927; l2dist: 0.472
    step: 200; loss: 2.926; l2dist: 0.472
    step: 300; loss: 2.926; l2dist: 0.473
    step: 400; loss: 2.927; l2dist: 0.473
    step: 500; loss: 2.927; l2dist: 0.473
    step: 600; loss: 2.927; l2dist: 0.473
    step: 700; loss: 2.927; l2dist: 0.473
    step: 800; loss: 2.927; l2dist: 0.473
    step: 900; loss: 2.927; l2dist: 0.472
tensor(2., device='cuda:0')
binary step: 0; number of successful adv: 3/200
    step: 0; loss: 31.876; l2dist: 0.000
    step: 100; loss: 19.202; l2dist: 1.654
    step: 200; loss: 18.137; l2dist: 1.851
    step: 300; loss: 17.832; l2dist: 1.921
    step: 400; loss: 16.257; l2dist: 1.982
    step: 500; loss: 13.356; l2dist: 2.139
    step: 600; loss: 12.258; l2dist: 2.227
    step: 700; loss: 11.792; l2dist: 2.278
    step: 800; loss: 11.596; l2dist: 2.303
    step: 900; loss: 11.509; l2dist: 2.320
tensor(34., device='cuda:0')
binary step: 1; number of successful adv: 41/200


In [95]:
# Attack for L2 DkNN

attack = DKNNExpAttack()

def attack_batch(x, y, batch_size):
    x_adv = torch.zeros_like(x)
    total_num = x.size(0)
    for i in range(batch_size):
        x_adv[i:i+1] = attack(
            knn, x[i:i+1], y[i:i+1],
            guide_layer=layers[0], m=4, binary_search_steps=15,
            max_iterations=1500, learning_rate=1e-2, guide_mode=1,
            initial_const=1e-1, abort_early=True, random_start=False,
            thres=thres[i:i+1], a=0.1)
    return x_adv

x_adv = attack_batch(x_test_sub[:num].cuda(), y_test_sub[:num], 200)

    step: 0; loss: 7.592; l2dist: 0.000
    step: 150; loss: 5.997; l2dist: 1.029
    step: 300; loss: 5.585; l2dist: 1.400
    step: 450; loss: 5.588; l2dist: 1.394
    step: 600; loss: 5.583; l2dist: 1.405
    step: 750; loss: 5.584; l2dist: 1.405
    step: 900; loss: 5.585; l2dist: 1.404
    step: 1050; loss: 5.587; l2dist: 1.395
    step: 1200; loss: 5.584; l2dist: 1.404
    step: 1350; loss: 5.586; l2dist: 1.405
tensor(0., device='cuda:0')
binary step: 0; number of successful adv: 0/1
    step: 0; loss: 75.919; l2dist: 0.000
    step: 150; loss: 17.589; l2dist: 3.628
    step: 300; loss: 15.533; l2dist: 2.641
    step: 450; loss: 8.157; l2dist: 2.856
    step: 600; loss: 7.948; l2dist: 2.798
    step: 750; loss: 8.104; l2dist: 2.847
    step: 900; loss: 7.841; l2dist: 2.800
    step: 1050; loss: 7.986; l2dist: 2.826
    step: 1200; loss: 7.870; l2dist: 2.805
    step: 1350; loss: 7.879; l2dist: 2.807
tensor(0., device='cuda:0')
binary step: 1; number of successful adv: 1/1
    ste

KeyboardInterrupt: 

In [None]:
rep = reps[layer].view(batch_size, 1, -1);dist = ((rep - guide_reps[layer])**2).sum(2)
fx = self.sigmoid((self.thres - dist).clamp(-80 / self.a, 80 / self.a), a=self.a)

In [189]:
with torch.no_grad():
    y_pred = knn.classify(x_adv)
    ind = np.where(y_pred.argmax(1) == y_test_sub[:num].numpy())[0]
    print(len(ind) / x_adv.size(0))
(x_test_sub - x_adv.cpu()).view(num, -1).norm(2, 1).mean()

0.0


tensor(3.0325, grad_fn=<MeanBackward0>)

In [13]:
thres = ((knn.x_train[knn.get_neighbors(x_test_sub)[0][1]].squeeze() - x_test_sub)**2).sum((1, 2))

RuntimeError: The size of tensor a (3) must match the size of tensor b (200) at non-singleton dimension 1

In [32]:
thres.mean()

tensor(13.5435)

In [63]:
INFTY = 1e20


class DKNN_PGD(object):
    """
    """

    def __init__(self, dknn):
        self.dknn = dknn
        self.device = dknn.device
        self.layers = dknn.layers
        self.guide_reps = {}
        self.thres = None
        self.coeff = None

    def __call__(self, x_orig, label, guide_layer, m, epsilon=0.1,
                 max_epsilon=0.3, max_iterations=1000, num_restart=1,
                 rand_start=True, thres_steps=100, check_adv_steps=100,
                 verbose=True):
        """
        x_orig is tensor (requires_grad=False)
        """

        # make sure we run at least once
        if num_restart < 1:
            num_restart = 1

        # if not using randomized start, no point in doing more than one start
        if not rand_start:
            num_restart = 1

        label = label.cpu().numpy()
        batch_size = x_orig.size(0)
        min_, max_ = x_orig.min(), x_orig.max()
        x_adv = x_orig.detach()
        best_num_nn = np.zeros((batch_size, ))
        
        self.coeff = torch.zeros((x_orig.size(0), m))
        self.coeff[:, :m // 2] += 1
        self.coeff[:, m // 2:] -= 1
        
        for i in range(num_restart):

            # initialize perturbation
            delta = torch.zeros_like(x_adv)
            if rand_start:
                delta.uniform_(- max_epsilon * 0.1, max_epsilon * 0.1)
            delta.requires_grad_()

            for iteration in range(max_iterations):
                x = torch.clamp(x_orig + delta, min_, max_)

                # adaptively choose threshold and guide samples every
                # <thres_steps> iterations
                with torch.no_grad():
                    if iteration % thres_steps == 0:
                        thres = self.dknn.get_neighbors(x)[0][0][:, -1]
                        self.thres = torch.tensor(thres).to(self.device).view(
                            batch_size, 1)
                        self.find_guide_samples(
                            x, label, m=m, layer=guide_layer)

                reps = self.dknn.get_activations(x, requires_grad=True)
                loss = self.loss_function(reps)
                loss.backward()
                # perform update on delta
                with torch.no_grad():
                    delta -= epsilon * delta.grad.detach().sign()
                    delta.clamp_(- max_epsilon, max_epsilon)

                if (verbose and iteration % (np.ceil(max_iterations / 10)) == 0):
                    print('    step: %d; loss: %.3f' %
                          (iteration, loss.cpu().detach().numpy()))
                
                if ((iteration + 1) % check_adv_steps == 0 or
                        iteration == max_iterations):
                    with torch.no_grad():
                        # check if x are adversarial. Only store adversarial examples
                        # if they have a larger number of wrong neighbors than orevious
                        is_adv, num_nn = self.check_adv(x, label)
                        for j in range(batch_size):
                            if is_adv[j] and num_nn[j] > best_num_nn[j]:
                                x_adv[j] = x[j]
                                best_num_nn[j] = num_nn[j]

            with torch.no_grad():
                is_adv, _ = self.check_adv(x_adv, label)
            if verbose:
                print('number of successful adv: %d/%d' % (is_adv.sum(), batch_size))

        return x_adv

    def check_adv(self, x, label):
        """Check if label of <x> predicted by <dknn> matches with <label>"""
        output = self.dknn.classify(x)
        num_nn = output.max(1)
        y_pred = output.argmax(1)
        is_adv = (y_pred != label).astype(np.float32)
        return is_adv, num_nn

    def loss_function(self, reps):
        """Returns the loss averaged over the batch (first dimension of x) and
        L-2 norm squared of the perturbation
        """

        batch_size = reps[self.layers[0]].size(0)
        adv_loss = torch.zeros(
            (batch_size, len(self.layers)), device=self.device)
        # find squared L-2 distance between original samples and their
        # adversarial examples at each layer
        for l, layer in enumerate(self.layers):
            rep = reps[layer].view(batch_size, 1, -1)
            dist = ((rep - self.guide_reps[layer])**2).sum(2)
            fx = self.thres - dist
            Fx = torch.max(torch.tensor(0., device=self.device),
                           self.coeff.to(self.device) * fx).sum(1)
            Fx = (- self.coeff.to(self.device) * dist).sum(1)
            adv_loss[:, l] = Fx

        return adv_loss.mean()

    def find_guide_samples(self, x, label, m=100, layer='relu1'):
        """Find k nearest neighbors to <x> that all have the same class but not
        equal to <label>
        """
        num_classes = self.dknn.num_classes
        x_train = self.dknn.x_train
        y_train = self.dknn.y_train
        batch_size = x.size(0)
        nn = torch.zeros((m, ) + x.size()).transpose(0, 1)
        D, I = self.dknn.get_neighbors(
            x, k=x_train.size(0), layers=[layer])[0]

        for i, (d, ind) in enumerate(zip(D, I)):
            mean_dist = np.zeros((num_classes, ))
            for j in range(num_classes):
                mean_dist[j] = np.mean(
                    d[np.where(y_train[ind] == j)[0]][:m // 2])
            mean_dist[label[i]] += INFTY
            nearest_label = mean_dist.argmin()
            nn_ind = np.where(y_train[ind] == nearest_label)[0][:m // 2]
            nn[i, m // 2:] = x_train[ind[nn_ind]]
            nn_ind = np.where(y_train[ind] == label[i])[0][:m // 2]
            nn[i, :m // 2] = x_train[ind[nn_ind]]

        # initialize self.guide_reps if empty
        if not self.guide_reps:
            guide_rep = self.dknn.get_activations(
                nn[0], requires_grad=False)
            for l in self.layers:
                # set a zero tensor before filling it
                size = (batch_size, ) + guide_rep[l].view(m, -1).size()
                self.guide_reps[l] = torch.zeros(size, device=self.device)

        # fill self.guide_reps
        for i in range(batch_size):
            guide_rep = self.dknn.get_activations(
                nn[i], requires_grad=False)
            self.guide_reps[layer][i] = guide_rep[layer].view(
                m, -1).detach()

In [106]:
INFTY = 1e20


class DKNN_PGD(object):
    """
    """

    def __init__(self, dknn):
        self.dknn = dknn
        self.device = dknn.device
        self.layers = dknn.layers
        self.guide_reps = {}
        self.thres = None
        self.coeff = None

    def __call__(self, x_orig, label, guide_layer, m, epsilon=0.1,
                 max_epsilon=0.3, max_iterations=1000, num_restart=1,
                 rand_start=True, thres_steps=100, check_adv_steps=100,
                 verbose=True):
        """
        x_orig is tensor (requires_grad=False)
        """

        # make sure we run at least once
        if num_restart < 1:
            num_restart = 1

        # if not using randomized start, no point in doing more than one start
        if not rand_start:
            num_restart = 1

        label = label.cpu().numpy()
        batch_size = x_orig.size(0)
        min_, max_ = x_orig.min(), x_orig.max()
        x_adv = x_orig.detach()
        best_num_nn = np.zeros((batch_size, ))
        
        self.coeff = torch.zeros((x_orig.size(0), m))
        self.coeff[:, :m // 2] += 1
        self.coeff[:, m // 2:] -= 1
        
        for i in range(num_restart):

            # initialize perturbation
            delta = torch.zeros_like(x_adv)
            if rand_start:
                delta.uniform_(- max_epsilon * 0.1, max_epsilon * 0.1)
            delta.requires_grad_()

            for iteration in range(max_iterations):
                x = torch.clamp(x_orig + delta, min_, max_)

                # adaptively choose threshold and guide samples every
                # <thres_steps> iterations
                with torch.no_grad():
                    if iteration % thres_steps == 0:
                        thres = self.dknn.get_neighbors(x)[0][0][:, -1]
                        self.thres = torch.tensor(thres).to(self.device).view(
                            batch_size, 1)
                        self.find_guide_samples(
                            x, label, m=m, layer=guide_layer)

                reps = self.dknn.get_activations(x, requires_grad=True)
                loss = self.loss_function(reps)
                loss.backward()
                # perform update on delta
                with torch.no_grad():
#                     import pdb; pdb.set_trace()
                    delta -= epsilon * delta.grad.detach().sign()
#                     delta -= epsilon * delta.grad.detach()
                    delta.clamp_(- max_epsilon, max_epsilon)

                if (verbose and iteration % (np.ceil(max_iterations / 10)) == 0):
                    print('    step: %d; loss: %.3f' %
                          (iteration, loss.cpu().detach().numpy()))
                
                if ((iteration + 1) % check_adv_steps == 0 or
                        iteration == max_iterations):
                    with torch.no_grad():
                        # check if x are adversarial. Only store adversarial examples
                        # if they have a larger number of wrong neighbors than orevious
                        is_adv, num_nn = self.check_adv(x, label)
                        for j in range(batch_size):
                            if is_adv[j] and num_nn[j] > best_num_nn[j]:
                                x_adv[j] = x[j]
                                best_num_nn[j] = num_nn[j]

            with torch.no_grad():
                is_adv, _ = self.check_adv(x_adv, label)
            if verbose:
                print('number of successful adv: %d/%d' % (is_adv.sum(), batch_size))

        return x_adv

    def check_adv(self, x, label):
        """Check if label of <x> predicted by <dknn> matches with <label>"""
        output = self.dknn.classify(x)
        num_nn = output.max(1)
        y_pred = output.argmax(1)
        is_adv = (y_pred != label).astype(np.float32)
#         import pdb; pdb.set_trace()
        return is_adv, num_nn

    def loss_function(self, reps):
        """Returns the loss averaged over the batch (first dimension of x) and
        L-2 norm squared of the perturbation
        """

        batch_size = reps[self.layers[0]].size(0)
        adv_loss = torch.zeros(
            (batch_size, len(self.layers)), device=self.device)
        # find squared L-2 distance between original samples and their
        # adversarial examples at each layer
        for l, layer in enumerate(self.layers):
            rep = reps[layer].view(batch_size, 1, -1)
            dist = ((rep - self.guide_reps[layer])**2).sum(2)
#             fx = self.thres - dist
#             Fx = torch.max(torch.tensor(0., device=self.device),
#                            self.coeff.to(self.device) * fx).sum(1)
#             Fx = (- self.coeff.to(self.device) * dist).sum(1)
            Fx = dist[:, 1]
            adv_loss[:, l] = Fx

        return adv_loss.mean()

    def find_guide_samples(self, x, label, m=100, layer='relu1'):
        """Find k nearest neighbors to <x> that all have the same class but not
        equal to <label>
        """
        num_classes = self.dknn.num_classes
        x_train = self.dknn.x_train
        y_train = self.dknn.y_train
        batch_size = x.size(0)
        nn = torch.zeros((m, ) + x.size()).transpose(0, 1)
        D, I = self.dknn.get_neighbors(
            x, k=x_train.size(0), layers=[layer])[0]
        y_pred = self.dknn.classify(x_train).argmax(1)
        is_correct = y_pred == y_train.numpy()

        for i, (d, ind) in enumerate(zip(D, I)):
#             mean_dist = np.zeros((num_classes, ))
#             for j in range(num_classes):
#                 mean_dist[j] = np.mean(
#                     d[np.where(y_train[ind] == j)[0]][:m // 2])
#             mean_dist[label[i]] += INFTY
#             nearest_label = mean_dist.argmin()
#             nn_ind = np.where(y_train[ind] == nearest_label)[0][:m // 2]
#             nn[i, m // 2:] = x_train[ind[nn_ind]]
#             nn_ind = np.where(y_train[ind] == label[i])[0][:m // 2]
#             nn[i, :m // 2] = x_train[ind[nn_ind]]
            # find nearest sample that is correctly classified as j
#             import pdb; pdb.set_trace()
            is_not_label = y_train.numpy() != label[i]
            idx = np.where(is_correct & is_not_label)[0]
            nn_ind = (x_train - x[i].cpu())[idx].view(
                idx.shape[0], -1).norm(2, 1).argmin()
            nn[i, 1] = x_train[idx][nn_ind]

        # initialize self.guide_reps if empty
        if not self.guide_reps:
            guide_rep = self.dknn.get_activations(
                nn[0], requires_grad=False)
            for l in self.layers:
                # set a zero tensor before filling it
                size = (batch_size, ) + guide_rep[l].view(m, -1).size()
                self.guide_reps[l] = torch.zeros(size, device=self.device)

        # fill self.guide_reps
        for i in range(batch_size):
            guide_rep = self.dknn.get_activations(
                nn[i], requires_grad=False)
            self.guide_reps[layer][i] = guide_rep[layer].view(
                m, -1).detach()

In [109]:
attack = DKNN_PGD(knn)

def attack_batch(x, y, batch_size):
    x_adv = torch.zeros_like(x)
    total_num = x.size(0)
    num_batches = total_num // batch_size
    for i in range(num_batches):
        begin = i * batch_size
        end = (i + 1) * batch_size
        x_adv[begin:end] = attack(
            x[begin:end], y[begin:end],
            guide_layer=layers[0], m=2, epsilon=0.001,
            max_epsilon=0.3, max_iterations=1000, num_restart=10,
            rand_start=False, thres_steps=1000, check_adv_steps=1000,
            verbose=True)
    return x_adv

num = 200
x_adv = attack_batch(x_test_sub[:num].cuda(), y_test_sub[:num], 200)

    step: 0; loss: 45.419
    step: 100; loss: 34.209
    step: 200; loss: 25.145
    step: 300; loss: 17.951
    step: 400; loss: 17.896
    step: 500; loss: 17.885
    step: 600; loss: 17.942
    step: 700; loss: 17.890
    step: 800; loss: 17.899
    step: 900; loss: 17.923
number of successful adv: 54/200


In [115]:
knn.device

'cuda'