In [1]:
%matplotlib inline

%load_ext autoreload
%autoreload 2

In [2]:
import logging
import os
import time

import numpy as np
import matplotlib.pyplot as plt
import foolbox
import torch
import torch.backends.cudnn as cudnn
import torch.nn as nn
import torch.optim as optim

from lib.dataset_utils import *
from lib.mnist_model import *
from lib.adv_model import *
from lib.dknn_attack_v2 import DKNNAttackV2
from lib.cwl2_attack import CWL2Attack
from lib.dknn import DKNNL2
from lib.utils import *
from lib.lip_model import *

Loading faiss with AVX2 support.


In [3]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [10]:
# Subsamples dataset and reduces to two classes (1 and 7)
# Code adapted from https://github.com/yangarbiter/adversarial-nonparametrics

num_samples = 2200
num_val = 300 # can pick anything > 0, does not really get used here

from keras.datasets import mnist
(X, y), (_, _) = mnist.load_data()
np.random.seed(2019)
idx1 = np.random.choice(
    np.where(y == 1)[0], num_samples // 2, replace=False)
idx2 = np.random.choice(
    np.where(y == 7)[0], num_samples // 2, replace=False)
y[idx1] = 0
y[idx2] = 1
X = np.vstack((X[idx1], X[idx2])).astype(np.float32) / 255.
y = np.concatenate((y[idx1], y[idx2]))

idxs = np.arange(num_samples)
np.random.shuffle(idxs)
x_train = torch.tensor(X[idxs[:-200]])
x_test = torch.tensor(X[idxs[-200:]])
y_train = torch.tensor(y[idxs[:-200]])
y_test = torch.tensor(y[idxs[-200:]])

In [11]:
net_knn = KNNModel()

In [88]:
layers = ['identity']
knn = DKNNL2(net_knn, x_train, y_train, 
             x_test, y_test, layers, 
             k=5, num_classes=2)

In [89]:
with torch.no_grad():
    y_pred = knn.classify(x_test_sub)
    ind = np.where(y_pred.argmax(1) == y_test_sub.numpy())[0]
    print((y_pred.argmax(1) == y_test_sub.numpy()).sum() / y_test_sub.size(0))

0.99


In [90]:
def attack_batch(attack, x, y, init_mode, init_mode_k, batch_size):
    x_adv = torch.zeros_like(x)
    total_num = x.size(0)
    num_batches = total_num // batch_size
    for i in range(num_batches):
        begin = i * batch_size
        end = (i + 1) * batch_size
        x_adv[begin:end] = attack(
            x[begin:end], y[begin:end], 2, guide_layer='identity', m=6,
            init_mode=init_mode, init_mode_k=init_mode_k,
            binary_search_steps=10, max_iterations=1000, learning_rate=1e-2,
            initial_const=1e-1, max_linf=None, random_start=False,
            thres_steps=20, check_adv_steps=20, verbose=False)
    return x_adv

num = 200

def full_eval(dknn):
    with torch.no_grad():
        y_pred = dknn.classify(x_test)
        ind = np.where(y_pred.argmax(1) == y_test.numpy())[0]
    print((y_pred.argmax(1) == y_test.numpy()).sum() / y_test.size(0))
    
    dist_all = np.zeros(num) + 1e9
    attack = DKNNAttackV2(dknn)
    
    x_adv = attack_batch(
        attack, x_test[:num].cuda(), y_test[:num], 1, 1, 200)
    with torch.no_grad():
        y_pred = dknn.classify(x_adv)
        ind_adv = y_pred.argmax(1) != y_test[:num].numpy()
        dist = (x_adv.cpu() - x_test[:num]).view(
            num, -1).norm(2, 1).numpy()
    for i in range(num):
        if ind_adv[i] and (dist[i] < dist_all[i]):
            dist_all[i] = dist[i]
            
    for k in range(1, 4):
        x_adv = attack_batch(
            attack, x_test[:num].cuda(), y_test[:num], 2, k, 200)
        with torch.no_grad():
            y_pred = dknn.classify(x_adv)
            ind_adv = y_pred.argmax(1) != y_test[:num].numpy()
            dist = (x_adv.cpu() - x_test[:num]).view(
                num, -1).norm(2, 1).numpy()
        for i in range(num):
            if ind_adv[i] and (dist[i] < dist_all[i]):
                dist_all[i] = dist[i]
                
    adv_acc = (dist_all == 1e9).mean()
    print('adv accuracy: %.4f, mean dist: %.4f' % (
        adv_acc, dist_all[dist_all < 1e9].mean()))
    return dist_all

In [91]:
start = time.time()
dist = full_eval(knn)
print(time.time() - start)

0.99
adv accuracy: 0.0000, mean dist: 3.0913
284.73527669906616


In [75]:
dist.mean()

2.9384045471162654

In [55]:
# Attack for L2 DkNN

attack = DKNNL2Attack()
# attack = DKNNLinfAttack()

def attack_batch(x, y, batch_size):
    x_adv = torch.zeros_like(x)
    total_num = x.size(0)
    num_batches = total_num // batch_size
    for i in range(num_batches):
        begin = i * batch_size
        end = (i + 1) * batch_size
        x_adv[begin:end] = attack(
            knn, x[begin:end], y[begin:end],
            guide_layer=layers[0], m=5, binary_search_steps=15,
            max_iterations=1500, learning_rate=1e-2, guide_mode=2,
            initial_const=1e-2, abort_early=True, random_start=False)
    return x_adv

num = 200
x_adv = attack_batch(x_test_sub[:num].cuda(), y_test_sub[:num], 200)

    step: 0; loss: 0.046; l2dist: 0.000
    step: 100; loss: 0.046; l2dist: 0.001
binary step: 0; num successful adv: 2/200
binary step: 0; num successful adv so far: 2/200
    step: 0; loss: 0.462; l2dist: 0.000
    step: 100; loss: 0.461; l2dist: 0.012
    step: 200; loss: 0.461; l2dist: 0.017
binary step: 1; num successful adv: 2/200
binary step: 1; num successful adv so far: 2/200
    step: 0; loss: 4.616; l2dist: 0.000
    step: 100; loss: 4.581; l2dist: 0.132
    step: 200; loss: 4.581; l2dist: 0.134
binary step: 2; num successful adv: 3/200
binary step: 2; num successful adv so far: 3/200
    step: 0; loss: 46.030; l2dist: 0.000
    step: 100; loss: 41.627; l2dist: 1.587
    step: 200; loss: 41.554; l2dist: 1.625
    step: 300; loss: 39.644; l2dist: 1.736
    step: 400; loss: 38.801; l2dist: 1.949
    step: 500; loss: 38.461; l2dist: 2.077
    step: 600; loss: 38.309; l2dist: 2.132
    step: 700; loss: 38.214; l2dist: 2.175
    step: 800; loss: 38.140; l2dist: 2.213
    step: 90

In [56]:
with torch.no_grad():
    y_pred = knn.classify(x_adv)
    ind = np.where(y_pred.argmax(1) == y_test_sub[:num].numpy())[0]
    print(len(ind) / x_adv.size(0))
(x_test_sub - x_adv.cpu()).view(num, -1).norm(2, 1).mean()

0.0


tensor(3.9757, grad_fn=<MeanBackward0>)

In [57]:
# Attack for L2 DkNN

# attack = DKNNL2Attack()
# attack = DKNNLinfAttack()
from lib.dknn_attack_exp import DKNNExpAttack
attack = DKNNExpAttack(knn)

def attack_batch(x, y, batch_size):
    x_adv = torch.zeros_like(x)
    total_num = x.size(0)
    num_batches = total_num // batch_size
    for i in range(num_batches):
        begin = i * batch_size
        end = (i + 1) * batch_size
        x_adv[begin:end] = attack(
            x[begin:end], y[begin:end],
            guide_layer=layers[0], m=6, binary_search_steps=10,
            max_iterations=1000, learning_rate=1e-2,
            initial_const=1e-1, random_start=False,
            thres_steps=10, check_adv_steps=50, verbose=True,
            max_linf=None)
    return x_adv

x_adv = attack_batch(x_test_sub[:num].cuda(), y_test_sub[:num], 200)
# x_adv = attack_batch(x_test_sub[num-2:num-1].cuda(), y_test_sub[num-2:num-1], 1)

    step: 0; loss: 9.961; l2dist: 0.000
    step: 100; loss: 8.053; l2dist: 1.232
    step: 200; loss: 8.048; l2dist: 1.235
    step: 300; loss: 8.046; l2dist: 1.234
    step: 400; loss: 7.850; l2dist: 1.271
    step: 500; loss: 7.552; l2dist: 1.395
    step: 600; loss: 7.440; l2dist: 1.444
    step: 700; loss: 7.380; l2dist: 1.463
    step: 800; loss: 7.336; l2dist: 1.482
    step: 900; loss: 7.312; l2dist: 1.492
binary step: 0; num successful adv: 5/200
binary step: 0; num successful adv so far: 6/200
    step: 0; loss: 99.356; l2dist: 0.000
    step: 100; loss: 26.939; l2dist: 3.411
    step: 200; loss: 21.338; l2dist: 3.496
    step: 300; loss: 13.465; l2dist: 3.467
    step: 400; loss: 12.395; l2dist: 3.356
    step: 500; loss: 11.989; l2dist: 3.315
    step: 600; loss: 11.904; l2dist: 3.303
    step: 700; loss: 11.893; l2dist: 3.301
    step: 800; loss: 11.940; l2dist: 3.310
    step: 900; loss: 12.002; l2dist: 3.319
binary step: 1; num successful adv: 147/200
binary step: 1; num

In [95]:
# Attack for L2 DkNN

attack = DKNNExpAttack()

def attack_batch(x, y, batch_size):
    x_adv = torch.zeros_like(x)
    total_num = x.size(0)
    for i in range(batch_size):
        x_adv[i:i+1] = attack(
            knn, x[i:i+1], y[i:i+1],
            guide_layer=layers[0], m=4, binary_search_steps=15,
            max_iterations=1500, learning_rate=1e-2, guide_mode=1,
            initial_const=1e-1, abort_early=True, random_start=False,
            thres=thres[i:i+1], a=0.1)
    return x_adv

x_adv = attack_batch(x_test_sub[:num].cuda(), y_test_sub[:num], 200)

    step: 0; loss: 7.592; l2dist: 0.000
    step: 150; loss: 5.997; l2dist: 1.029
    step: 300; loss: 5.585; l2dist: 1.400
    step: 450; loss: 5.588; l2dist: 1.394
    step: 600; loss: 5.583; l2dist: 1.405
    step: 750; loss: 5.584; l2dist: 1.405
    step: 900; loss: 5.585; l2dist: 1.404
    step: 1050; loss: 5.587; l2dist: 1.395
    step: 1200; loss: 5.584; l2dist: 1.404
    step: 1350; loss: 5.586; l2dist: 1.405
tensor(0., device='cuda:0')
binary step: 0; number of successful adv: 0/1
    step: 0; loss: 75.919; l2dist: 0.000
    step: 150; loss: 17.589; l2dist: 3.628
    step: 300; loss: 15.533; l2dist: 2.641
    step: 450; loss: 8.157; l2dist: 2.856
    step: 600; loss: 7.948; l2dist: 2.798
    step: 750; loss: 8.104; l2dist: 2.847
    step: 900; loss: 7.841; l2dist: 2.800
    step: 1050; loss: 7.986; l2dist: 2.826
    step: 1200; loss: 7.870; l2dist: 2.805
    step: 1350; loss: 7.879; l2dist: 2.807
tensor(0., device='cuda:0')
binary step: 1; number of successful adv: 1/1
    ste

KeyboardInterrupt: 

In [20]:
from lib.dknn_attack_exp import DKNNExpAttack
attack = DKNNExpAttack(knn)

def attack_batch(x, y, batch_size, init_mode):
    x_adv = torch.zeros_like(x)
    total_num = x.size(0)
    num_batches = int(np.ceil(total_num / batch_size))
    for i in range(num_batches):
        begin = i * batch_size
        end = (i + 1) * batch_size
        x_adv[begin:end] = attack(
            x[begin:end], y[begin:end],
            guide_layer=layers[0], m=2, init_mode=init_mode, 
            binary_search_steps=15, max_iterations=1500, learning_rate=1e-2,
            initial_const=1e-1, random_start=False,
            thres_steps=20, check_adv_steps=20, verbose=True,
            max_linf=None)
    return x_adv

num = 200
x_adv0 = attack_batch(x_test_sub[:num].cuda(), y_test_sub[:num], 200, 0)
with torch.no_grad():
    y_pred = knn.classify(x_adv0)
    ind_adv = np.where(y_pred.argmax(1) == y_test_sub[:num].numpy())[0]
    print(len(ind_adv) / y_pred.shape[0])
pert0 = (x_test_sub - x_adv0.cpu()).view(num, -1).norm(2, 1)

x_adv1 = attack_batch(x_test_sub[:num].cuda(), y_test_sub[:num], 200, 1)
with torch.no_grad():
    y_pred = knn.classify(x_adv1)
    ind_adv = np.where(y_pred.argmax(1) == y_test_sub[:num].numpy())[0]
    print(len(ind_adv) / y_pred.shape[0])
pert1 = (x_test_sub - x_adv1.cpu()).view(num, -1).norm(2, 1)
print(torch.min(pert0, pert1).mean())

    step: 0; loss: 3.188; l2dist: 0.000
    step: 150; loss: 2.933; l2dist: 0.473
    step: 300; loss: 2.930; l2dist: 0.474
    step: 450; loss: 2.933; l2dist: 0.474
    step: 600; loss: 2.931; l2dist: 0.473
    step: 750; loss: 2.877; l2dist: 0.516
    step: 900; loss: 2.839; l2dist: 0.557
    step: 1050; loss: 2.831; l2dist: 0.569
    step: 1200; loss: 2.823; l2dist: 0.573
    step: 1350; loss: 2.821; l2dist: 0.578
binary step: 0; num successful adv: 2/200
binary step: 0; num successful adv so far: 3/200
    step: 0; loss: 31.876; l2dist: 0.000
    step: 150; loss: 15.769; l2dist: 2.569
    step: 300; loss: 9.648; l2dist: 2.827
    step: 450; loss: 9.046; l2dist: 2.802
    step: 600; loss: 8.944; l2dist: 2.803
    step: 750; loss: 8.984; l2dist: 2.806
    step: 900; loss: 8.948; l2dist: 2.809
    step: 1050; loss: 8.984; l2dist: 2.804
    step: 1200; loss: 8.944; l2dist: 2.808
    step: 1350; loss: 8.981; l2dist: 2.805
binary step: 1; num successful adv: 101/200
binary step: 1; num s

In [None]:
rep = reps[layer].view(batch_size, 1, -1);dist = ((rep - guide_reps[layer])**2).sum(2)
fx = self.sigmoid((self.thres - dist).clamp(-80 / self.a, 80 / self.a), a=self.a)

In [58]:
with torch.no_grad():
    y_pred = knn.classify(x_adv)
    ind = np.where(y_pred.argmax(1) == y_test_sub[:num].numpy())[0]
    print(len(ind) / x_adv.size(0))
(x_test_sub - x_adv.cpu()).view(num, -1).norm(2, 1).mean()

0.0


tensor(3.1310, grad_fn=<MeanBackward0>)

In [13]:
thres = ((knn.x_train[knn.get_neighbors(x_test_sub)[0][1]].squeeze() - x_test_sub)**2).sum((1, 2))

RuntimeError: The size of tensor a (3) must match the size of tensor b (200) at non-singleton dimension 1

In [32]:
thres.mean()

tensor(13.5435)

In [63]:
INFTY = 1e20


class DKNN_PGD(object):
    """
    """

    def __init__(self, dknn):
        self.dknn = dknn
        self.device = dknn.device
        self.layers = dknn.layers
        self.guide_reps = {}
        self.thres = None
        self.coeff = None

    def __call__(self, x_orig, label, guide_layer, m, epsilon=0.1,
                 max_epsilon=0.3, max_iterations=1000, num_restart=1,
                 rand_start=True, thres_steps=100, check_adv_steps=100,
                 verbose=True):
        """
        x_orig is tensor (requires_grad=False)
        """

        # make sure we run at least once
        if num_restart < 1:
            num_restart = 1

        # if not using randomized start, no point in doing more than one start
        if not rand_start:
            num_restart = 1

        label = label.cpu().numpy()
        batch_size = x_orig.size(0)
        min_, max_ = x_orig.min(), x_orig.max()
        x_adv = x_orig.detach()
        best_num_nn = np.zeros((batch_size, ))
        
        self.coeff = torch.zeros((x_orig.size(0), m))
        self.coeff[:, :m // 2] += 1
        self.coeff[:, m // 2:] -= 1
        
        for i in range(num_restart):

            # initialize perturbation
            delta = torch.zeros_like(x_adv)
            if rand_start:
                delta.uniform_(- max_epsilon * 0.1, max_epsilon * 0.1)
            delta.requires_grad_()

            for iteration in range(max_iterations):
                x = torch.clamp(x_orig + delta, min_, max_)

                # adaptively choose threshold and guide samples every
                # <thres_steps> iterations
                with torch.no_grad():
                    if iteration % thres_steps == 0:
                        thres = self.dknn.get_neighbors(x)[0][0][:, -1]
                        self.thres = torch.tensor(thres).to(self.device).view(
                            batch_size, 1)
                        self.find_guide_samples(
                            x, label, m=m, layer=guide_layer)

                reps = self.dknn.get_activations(x, requires_grad=True)
                loss = self.loss_function(reps)
                loss.backward()
                # perform update on delta
                with torch.no_grad():
                    delta -= epsilon * delta.grad.detach().sign()
                    delta.clamp_(- max_epsilon, max_epsilon)

                if (verbose and iteration % (np.ceil(max_iterations / 10)) == 0):
                    print('    step: %d; loss: %.3f' %
                          (iteration, loss.cpu().detach().numpy()))
                
                if ((iteration + 1) % check_adv_steps == 0 or
                        iteration == max_iterations):
                    with torch.no_grad():
                        # check if x are adversarial. Only store adversarial examples
                        # if they have a larger number of wrong neighbors than orevious
                        is_adv, num_nn = self.check_adv(x, label)
                        for j in range(batch_size):
                            if is_adv[j] and num_nn[j] > best_num_nn[j]:
                                x_adv[j] = x[j]
                                best_num_nn[j] = num_nn[j]

            with torch.no_grad():
                is_adv, _ = self.check_adv(x_adv, label)
            if verbose:
                print('number of successful adv: %d/%d' % (is_adv.sum(), batch_size))

        return x_adv

    def check_adv(self, x, label):
        """Check if label of <x> predicted by <dknn> matches with <label>"""
        output = self.dknn.classify(x)
        num_nn = output.max(1)
        y_pred = output.argmax(1)
        is_adv = (y_pred != label).astype(np.float32)
        return is_adv, num_nn

    def loss_function(self, reps):
        """Returns the loss averaged over the batch (first dimension of x) and
        L-2 norm squared of the perturbation
        """

        batch_size = reps[self.layers[0]].size(0)
        adv_loss = torch.zeros(
            (batch_size, len(self.layers)), device=self.device)
        # find squared L-2 distance between original samples and their
        # adversarial examples at each layer
        for l, layer in enumerate(self.layers):
            rep = reps[layer].view(batch_size, 1, -1)
            dist = ((rep - self.guide_reps[layer])**2).sum(2)
            fx = self.thres - dist
            Fx = torch.max(torch.tensor(0., device=self.device),
                           self.coeff.to(self.device) * fx).sum(1)
            Fx = (- self.coeff.to(self.device) * dist).sum(1)
            adv_loss[:, l] = Fx

        return adv_loss.mean()

    def find_guide_samples(self, x, label, m=100, layer='relu1'):
        """Find k nearest neighbors to <x> that all have the same class but not
        equal to <label>
        """
        num_classes = self.dknn.num_classes
        x_train = self.dknn.x_train
        y_train = self.dknn.y_train
        batch_size = x.size(0)
        nn = torch.zeros((m, ) + x.size()).transpose(0, 1)
        D, I = self.dknn.get_neighbors(
            x, k=x_train.size(0), layers=[layer])[0]

        for i, (d, ind) in enumerate(zip(D, I)):
            mean_dist = np.zeros((num_classes, ))
            for j in range(num_classes):
                mean_dist[j] = np.mean(
                    d[np.where(y_train[ind] == j)[0]][:m // 2])
            mean_dist[label[i]] += INFTY
            nearest_label = mean_dist.argmin()
            nn_ind = np.where(y_train[ind] == nearest_label)[0][:m // 2]
            nn[i, m // 2:] = x_train[ind[nn_ind]]
            nn_ind = np.where(y_train[ind] == label[i])[0][:m // 2]
            nn[i, :m // 2] = x_train[ind[nn_ind]]

        # initialize self.guide_reps if empty
        if not self.guide_reps:
            guide_rep = self.dknn.get_activations(
                nn[0], requires_grad=False)
            for l in self.layers:
                # set a zero tensor before filling it
                size = (batch_size, ) + guide_rep[l].view(m, -1).size()
                self.guide_reps[l] = torch.zeros(size, device=self.device)

        # fill self.guide_reps
        for i in range(batch_size):
            guide_rep = self.dknn.get_activations(
                nn[i], requires_grad=False)
            self.guide_reps[layer][i] = guide_rep[layer].view(
                m, -1).detach()

In [106]:
INFTY = 1e20


class DKNN_PGD(object):
    """
    """

    def __init__(self, dknn):
        self.dknn = dknn
        self.device = dknn.device
        self.layers = dknn.layers
        self.guide_reps = {}
        self.thres = None
        self.coeff = None

    def __call__(self, x_orig, label, guide_layer, m, epsilon=0.1,
                 max_epsilon=0.3, max_iterations=1000, num_restart=1,
                 rand_start=True, thres_steps=100, check_adv_steps=100,
                 verbose=True):
        """
        x_orig is tensor (requires_grad=False)
        """

        # make sure we run at least once
        if num_restart < 1:
            num_restart = 1

        # if not using randomized start, no point in doing more than one start
        if not rand_start:
            num_restart = 1

        label = label.cpu().numpy()
        batch_size = x_orig.size(0)
        min_, max_ = x_orig.min(), x_orig.max()
        x_adv = x_orig.detach()
        best_num_nn = np.zeros((batch_size, ))
        
        self.coeff = torch.zeros((x_orig.size(0), m))
        self.coeff[:, :m // 2] += 1
        self.coeff[:, m // 2:] -= 1
        
        for i in range(num_restart):

            # initialize perturbation
            delta = torch.zeros_like(x_adv)
            if rand_start:
                delta.uniform_(- max_epsilon * 0.1, max_epsilon * 0.1)
            delta.requires_grad_()

            for iteration in range(max_iterations):
                x = torch.clamp(x_orig + delta, min_, max_)

                # adaptively choose threshold and guide samples every
                # <thres_steps> iterations
                with torch.no_grad():
                    if iteration % thres_steps == 0:
                        thres = self.dknn.get_neighbors(x)[0][0][:, -1]
                        self.thres = torch.tensor(thres).to(self.device).view(
                            batch_size, 1)
                        self.find_guide_samples(
                            x, label, m=m, layer=guide_layer)

                reps = self.dknn.get_activations(x, requires_grad=True)
                loss = self.loss_function(reps)
                loss.backward()
                # perform update on delta
                with torch.no_grad():
#                     import pdb; pdb.set_trace()
                    delta -= epsilon * delta.grad.detach().sign()
#                     delta -= epsilon * delta.grad.detach()
                    delta.clamp_(- max_epsilon, max_epsilon)

                if (verbose and iteration % (np.ceil(max_iterations / 10)) == 0):
                    print('    step: %d; loss: %.3f' %
                          (iteration, loss.cpu().detach().numpy()))
                
                if ((iteration + 1) % check_adv_steps == 0 or
                        iteration == max_iterations):
                    with torch.no_grad():
                        # check if x are adversarial. Only store adversarial examples
                        # if they have a larger number of wrong neighbors than orevious
                        is_adv, num_nn = self.check_adv(x, label)
                        for j in range(batch_size):
                            if is_adv[j] and num_nn[j] > best_num_nn[j]:
                                x_adv[j] = x[j]
                                best_num_nn[j] = num_nn[j]

            with torch.no_grad():
                is_adv, _ = self.check_adv(x_adv, label)
            if verbose:
                print('number of successful adv: %d/%d' % (is_adv.sum(), batch_size))

        return x_adv

    def check_adv(self, x, label):
        """Check if label of <x> predicted by <dknn> matches with <label>"""
        output = self.dknn.classify(x)
        num_nn = output.max(1)
        y_pred = output.argmax(1)
        is_adv = (y_pred != label).astype(np.float32)
#         import pdb; pdb.set_trace()
        return is_adv, num_nn

    def loss_function(self, reps):
        """Returns the loss averaged over the batch (first dimension of x) and
        L-2 norm squared of the perturbation
        """

        batch_size = reps[self.layers[0]].size(0)
        adv_loss = torch.zeros(
            (batch_size, len(self.layers)), device=self.device)
        # find squared L-2 distance between original samples and their
        # adversarial examples at each layer
        for l, layer in enumerate(self.layers):
            rep = reps[layer].view(batch_size, 1, -1)
            dist = ((rep - self.guide_reps[layer])**2).sum(2)
#             fx = self.thres - dist
#             Fx = torch.max(torch.tensor(0., device=self.device),
#                            self.coeff.to(self.device) * fx).sum(1)
#             Fx = (- self.coeff.to(self.device) * dist).sum(1)
            Fx = dist[:, 1]
            adv_loss[:, l] = Fx

        return adv_loss.mean()

    def find_guide_samples(self, x, label, m=100, layer='relu1'):
        """Find k nearest neighbors to <x> that all have the same class but not
        equal to <label>
        """
        num_classes = self.dknn.num_classes
        x_train = self.dknn.x_train
        y_train = self.dknn.y_train
        batch_size = x.size(0)
        nn = torch.zeros((m, ) + x.size()).transpose(0, 1)
        D, I = self.dknn.get_neighbors(
            x, k=x_train.size(0), layers=[layer])[0]
        y_pred = self.dknn.classify(x_train).argmax(1)
        is_correct = y_pred == y_train.numpy()

        for i, (d, ind) in enumerate(zip(D, I)):
#             mean_dist = np.zeros((num_classes, ))
#             for j in range(num_classes):
#                 mean_dist[j] = np.mean(
#                     d[np.where(y_train[ind] == j)[0]][:m // 2])
#             mean_dist[label[i]] += INFTY
#             nearest_label = mean_dist.argmin()
#             nn_ind = np.where(y_train[ind] == nearest_label)[0][:m // 2]
#             nn[i, m // 2:] = x_train[ind[nn_ind]]
#             nn_ind = np.where(y_train[ind] == label[i])[0][:m // 2]
#             nn[i, :m // 2] = x_train[ind[nn_ind]]
            # find nearest sample that is correctly classified as j
#             import pdb; pdb.set_trace()
            is_not_label = y_train.numpy() != label[i]
            idx = np.where(is_correct & is_not_label)[0]
            nn_ind = (x_train - x[i].cpu())[idx].view(
                idx.shape[0], -1).norm(2, 1).argmin()
            nn[i, 1] = x_train[idx][nn_ind]

        # initialize self.guide_reps if empty
        if not self.guide_reps:
            guide_rep = self.dknn.get_activations(
                nn[0], requires_grad=False)
            for l in self.layers:
                # set a zero tensor before filling it
                size = (batch_size, ) + guide_rep[l].view(m, -1).size()
                self.guide_reps[l] = torch.zeros(size, device=self.device)

        # fill self.guide_reps
        for i in range(batch_size):
            guide_rep = self.dknn.get_activations(
                nn[i], requires_grad=False)
            self.guide_reps[layer][i] = guide_rep[layer].view(
                m, -1).detach()

In [109]:
attack = DKNN_PGD(knn)

def attack_batch(x, y, batch_size):
    x_adv = torch.zeros_like(x)
    total_num = x.size(0)
    num_batches = total_num // batch_size
    for i in range(num_batches):
        begin = i * batch_size
        end = (i + 1) * batch_size
        x_adv[begin:end] = attack(
            x[begin:end], y[begin:end],
            guide_layer=layers[0], m=2, epsilon=0.001,
            max_epsilon=0.3, max_iterations=1000, num_restart=10,
            rand_start=False, thres_steps=1000, check_adv_steps=1000,
            verbose=True)
    return x_adv

num = 200
x_adv = attack_batch(x_test_sub[:num].cuda(), y_test_sub[:num], 200)

    step: 0; loss: 45.419
    step: 100; loss: 34.209
    step: 200; loss: 25.145
    step: 300; loss: 17.951
    step: 400; loss: 17.896
    step: 500; loss: 17.885
    step: 600; loss: 17.942
    step: 700; loss: 17.890
    step: 800; loss: 17.899
    step: 900; loss: 17.923
number of successful adv: 54/200


In [115]:
knn.device

'cuda'