In [2]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

"""
Zero-order Stochastic Conditional Gradient (ZSCG) with Inexact Conditional Gradient (ICG) update.
Based on the paper:
    Zeroth-order Nonconvex Stochastic Optimization: Handling Constraints, High-Dimensionality and Saddle-Points∗
by:
    Krishnakumar Balasubramanian†1 and Saeed Ghadimi‡2
ALG.4 (modified version for non-convex problem)
"""
class InexactZSCG(object):
    """
    Args:
    Name            Type                Description
    model:          (nn.Module)         The model to use to get the output
    loss:           (nn.Module)         The loss to minimize
    device:
    """
    def __init__(self, model, loss, device=torch.device('cuda')):
        self.device = device
        self.loss = loss
        self.model = model.to(self.device)
        self.model.eval()


    """
    Perform an attack against the model given an input and the required run params
    """
    def run(self, x, v, n_gradient, gamma_k, mu_k, epsilon, L_type = -1, batch_size = -1, C = (0, 1),
            max_steps=100, verbose=0, additional_out=False, tqdm_disabled=False, max_t=1000):
        """
        Args:
        Name            Type                Description
        x               (torch.tensor)      The variable of our optimization problem. Should be a 3D tensor (img)
        v               (float)             The gaussian smoothing
        n_gradient      (list)              Number of normal vector to generate at every step
        gamma_k         (list)              Momentum at every step inside ICG
        mu_k            (list)              Stoppinc criterion at every step k inside ICG
        max_t           (int)               The maximum number of iteration inside of ICG.
        epsilon         (float)             The upper bound of norm
        L_type          (int)               Either -1 for L_infinity or x for Lx. Default is -1
        batch_size      (int)               Maximum parallelization during the gradient estimation. Default is -1 (=n_grad)
        C               (tuple)             The boundaires of the pixel. Default is (0, 1)
        max_steps       (int)               The maximum number of steps. Default is 100
        verbose         (int)               Display information or not. Default is 0
        additional_out  (bool)              Return also all the x. Default is False
        tqdm_disable    (bool)              Disable the tqdm bar. Default is False
        """

        x = x.to(self.device)

        # 1. Init class attributes
        self.x_original = x.clone()
        self.dim = x.shape
        self.total_dim = torch.prod(torch.tensor(x.shape))
        self.epsilon = epsilon
        self.L_type = L_type
        self.C = C
        self.batch = batch_size
        self.max_t = max_t

        # 2. Init list of results
        losses, outs = [], []
        x_list = []

        # 3. Main optimization cycle
        for ep in tqdm(range(max_steps), disable=tqdm_disabled):

            if verbose:
                print("---------------")
                print("Step number: {}".format(ep))

            # 3.1 Call the step
            x, gk = self.step(x, v, gamma_k[ep], mu_k[ep], n_gradient[ep], verbose)
            x = x.reshape(self.dim[0], self.dim[1], self.dim[2]).detach()

            # 3.2 Compute loss
            out = self.model(x.view(1, self.dim[0], self.dim[1], self.dim[2]))
            loss = self.loss(out).view(-1, 1)

            # 3.3 Save results
            losses.append(loss.detach().cpu().item())
            outs.append(out.detach().cpu()[0, self.loss.neuron].item())
            if additional_out:
                x_list.append(x.cpu())

            # 3.4 Display current info
            if verbose:
                print("Loss:        {}".format(losses[-1]))
                print("Output:      {}".format(outs[-1]))

            # 3.5 Check Stopping criterion
            condition1 = (int(torch.argmax(out)) != self.loss.neuron) and (self.loss.maximise == 0)
            condition2 = (int(torch.argmax(out)) == self.loss.neuron) and (self.loss.maximise == 1)
            if condition1 or condition2:
                break

        if additional_out:
            return x, losses, outs, input_list
        return  x, losses, outs

    """
    Do an optimization step
    """
    def step(self, x, v, gamma, mu, mk, verbose=0):
        """
        Args:
        Name            Type                Description
        x:              (torch.tensor)      The variable of our optimization problem. Should be a 3D tensor (img)
        v:              (float)             The gaussian smoothing
        gamma:          (float)             The update parameters of g
        mu:             (float)             The stopping criterion
        mk:             (int)               The number of Gaussian Random Vector to generate
        verbose:        (bool)              Display information or not. Default is 0
        """
        # Compute the approximated gradient
        g = self.compute_Gk(x, v, mk, verbose)
        # Call the inexact conditional gradient
        x_new = self.compute_ICG(x, g, gamma, mu, verbose).reshape(x.shape[0], x.shape[1], x.shape[2])

        if verbose > 1:
            print("\nINSIDE STEP")
            print("Gradient has shape: {}".format(g.shape))
            print("Gradient is:\n{}".format(g))
            print("x_new has shape: {}".format(x_new.shape))
            print("x_new is:\n{}".format(x_new))

        return x_new.detach(), g.detach()


    """
    Prepare parallelization for the gradient estimation
    """
    def get_parallel(self, x, bs, v):
        """
        Args:
        Name            Type                Description
        x               (torch.tensor)      The current variable
        bs              (int)               The maximum bacth size
        v               (int)               The Gaussian smoothing
        """
        uk     = torch.empty(bs, self.total_dim).normal_(mean=0, std=1).to(self.device) # Dim (bs, channel*width*height)
        img_u  = uk.reshape(bs, self.dim[0], self.dim[1], self.dim[2])                  # Dim (bs, channel, width, height)
        img_x  = x.expand(bs, self.dim[0], self.dim[1], self.dim[2])                    # Dim (bs, channel, width, height)
        m_x    = (img_x + v*img_u)

        return m_x, uk

    """
    Compute the Gv(x(k-1), chi(k-1), u(k)) in order to compute an approximation of the gradient of f(x(k-1), chi(k-1))
    """
    def compute_Gk(self, x, v, mk, verbose=0):
        """
        Args:
        Name            Type                Description
        x:              (torch.tensor)      The variable of our optimization problem. Should be a 3D tensor (img)
        v:              (float)             The gaussian smoothing
        mk:             (int)               The number of Gaussian Random Vector to generate
        verbose:        (bool)              Display information or not. Default is 0
        """

        # 1. Get objective functions
        # CASE BATCH_SIZE == N_GRADIENT
        if self.batch == -1:

            # 1.a Compute standard Loss
            standard_loss = self.loss(self.model(x.view(1, *list(self.dim))))

            # 1.b Compute gaussian loss
            m_x, uk = self.get_parallel(x, mk, v)
            gaussian_loss = self.loss(self.model(m_x))

            if verbose > 1:
                print('\nINSIDE GRADIENT')
                print('The Gaussian vector uk has shape:{}'.format(uk.shape))
                print('The input x has shape:\t\t{}'.format(x.shape))
                print('The input x + vu has shape:\t{}'.format(m_x.shape))

            # 1.c Compute Gv(x(k-1), chi(k-1), u(k))
            fv = ((gaussian_loss - standard_loss.expand(uk.shape[0]))/v).view(-1, 1)        # Dim (mk, 1)
            G = fv * uk                                                                     # Dim (mk, channel*width*height)

            return torch.mean(G, axis=0).detach()

        # CASE BATCH_SIZE < N_GRADIENT
        else:

            # 1.a Compute standard loss
            standard_loss = self.loss(self.model(x.view(1, *list(self.dim))))                   # Dim (1)
            G_tot = torch.zeros(mk//self.batch, self.total_dim).to(self.device)                 # Dim (n_batches, hannel*width*height)

            #1.b Compute Gradient
            for n in range(mk//self.batch):
                from_, to_ = n*self.batch, (n+1)*self.batch

                # 1.b Create batch x(k-1) + v*u(k-1)
                m_x, uk = self.get_parallel(x, self.batch, v)

                # 1.c Compute
                tmp_gaussian_loss = self.loss(self.model(m_x)).detach()                                 # Dim(bs)

                if verbose > 1:
                    print('\nINSIDE GRADIENT')
                    print('The Gaussian vector uk has shape:{}'.format(uk.shape))
                    print('The input x has shape:\t\t{}'.format(x.shape))
                    print('The input x + vu has shape:\t{}'.format(m_x.shape))

                # 1.d Compute Gradient
                fv = ((tmp_gaussian_loss - standard_loss.expand(uk.shape[0]))/v).view(-1, 1)            # Dim (bs, 1)
                G = fv * uk                                                                             # Dim (bs, channel*width*height)

                if verbose > 1:
                    print('Gaussian cycle loss has shape:\t{}'.format(tmp_gaussian_loss.shape))
                    print('Function approx has shape:\t{}'.format(fv.shape))
                    print('Gradient has shape:\t\t{}'.format(G.shape))

                G_tot[n] = torch.mean(G, axis=0).detach()

        return torch.mean(G_tot, axis=0).detach()



    """
    Compute the Inexact Condtion Gradient (Algorithm 3 of source article)
    """
    def compute_ICG(self, x, g, gamma, mu, verbose):
        """
        Args:
        Name            Type                Description
        x:              (torch.tensor)      The variable of our optimization problem. Should be a 3D tensor (img)
        g:              (torch.tensor)      The approximated gradient. Should be a 1D tensor
        gamma:          (float)             The update parameters of g
        mu:             (float)             The stopping criterion
        """
        # 1. Init variables
        y_old = x.view(-1).clone() # dim = (n_channel * width * height)
        t = 1
        k = 0
        # 2. Main cycle
        while(k==0):
            # 2.1 Compute gradient
            grad = g + gamma*(y_old - x.view(-1))

            # 2.2 Perform LMO
            # Infinity norm
            if self.L_type == -1:
                y_new = self.x_original.view(-1) - self.epsilon*torch.sign(g)
            # L1 norm
            elif self.L_type == 1:
                raise NotImplementedError
            elif self.L_type == 2:
                y_new = self.x_original.view(-1) - (self.epsilon*g)/torch.norm(g, 2)
            # Generic Lp norm (1 < p < +inf)
            else:
                p = self.L_type
                gp = torch.abs(g)**(1/p-1)
                h = torch.sign(g) * (gp) / torch.norm(gp, p)
                y_new = self.x_original.view(-1) - self.epsilon*h

            # 2.3 Compute new function value
            h = torch.dot(grad, y_new - y_old)

            if verbose > 1:
                print('\nINSIDE ICG')
                print('Difference between g and y_old -x: {}')
                print(torch.norm(g/torch.norm(g) - (y_old - x.view(-1))/torch.norm(y_old - x.view(-1))))
                print('Time t = {}'.format(t))
                print('The original gradient is:\n{}'.format(g))
                print('The ICG gradient is:\n{}'.format(grad))
                print('The new y is:\n {}'.format(y_new))
                print('The function h(y_new) is {}'.format(h))
                print('Mu is: {}'.format(mu))

            # 2.4 Check conditions
            if h >= -mu or t > self.max_t:
                k = 1
            else:
                y_old = (t-1)/(t+1)*y_old + 2/(t+1)*y_new
                t += 1

        return self.project_boundaries(y_old.detach())


    """
    Check the boundaries of our constraint optimization problem
    """
    def project_boundaries(self, x):
        x[x > self.C[1]] = self.C[1]
        x[x < self.C[0]] = self.C[0]
        return x

In [7]:
import torch
from torch import nn

"""
Abstract object for the Custom Loss. Child of nn.Module
"""
class Loss(object):

    def __init__(self, neuron, maximise=0):
        """
        Args:
            Name       Type    Desc
            neuron     int     The output neuron to minimize
            maximise   bool    The desired activation
        """
        self.neuron = neuron
        self.maximise = maximise

    def __call__(self, *args, **kwargs):
        return self.forward(*args, **kwargs)

    def forward(self, *args, **kwargs):
        raise NotImplementedError



"""
Given a target neuron and a target (y_true).
Compute the Mean Squared Difference between the softmax output and the target
"""
class MSELoss(Loss):

    def __init__(self, neuron, maximise=0, is_softmax=False, dim=1):
        """
        Args:
            Name        Type    Desc
            neuron:     int     The output neuron to minimize
            maximise    bool.   The desired activation (0/1)
            is_softmax:  bool     Bool indicating if the model output is probability distribution. Defaulti is False
            dim:         int      Dimension of softmax application. Default is 1
        """
        super().__init__(neuron, maximise)
        self.is_softmax = is_softmax
        self.dim = dim

    """
    Compute the MSE after computing the softmax of input.
    Forward is implemented in the __call__ method of super
    """
    def forward(self, y_pred):
        """
        Args
            y_pred  torch.tensor The output of the network. Preferable shape (n_batch, n_classes)
        """
        # Deal with 1D input
        if len(y_pred.shape) == 1:
            y_pred = y_pred.view(-1, 1)
        # Case model output does not have non-softmax model output
        if not self.is_softmax:
            # Compute logits
            y_pred = nn.Softmax(dim=self.dim)(y_pred)
        # Return loss
        return 0.5*(int(self.maximise) - y_pred[:, self.neuron])**2




In [14]:
import matplotlib.pyplot as plt
from torch import nn
from tqdm import tqdm

class Net(nn.Module):

    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(1, 3, (2, 2), stride=2, padding=1)
        self.linear = nn.Linear(3, 3)


    def forward(self, x):
        x = nn.ReLU()(self.conv(x))
        return nn.Sigmoid()(self.linear(x.view(x.shape[0], -1)))

epoch = 50
m = [50]*epoch
a = [0.9]*epoch
v = 1


net = Net()
loss = MSELoss(neuron=2, maximise=0)
optim = InexactZSCG(model=net, loss=loss)

x = torch.tensor([1])
x, loss_curve, out, xs = optim.run(x.view(1, 1, 1),
                                    m, a,
                                   v=0.01,
                                   n_gradient=5,
                                   gamma_k=2,
                                   epsilon=0.5,
                                   max_steps=epoch,
                                   verbose=0, additional_out=True)

min_, max_ = min(xs), max(xs)
losses = []
for i in tqdm(range(int(min_-1)*10, int(max_+1)*10)):
    x = torch.tensor([i/10]).to(torch.device('cuda'))
    out = net(x.view(1, 1, 1, 1))
    losses.append(loss(out))

plt.plot([i/10 for i in range(int(min_-1)*10, int(max_+1)*10)], losses, label='Loss curve')
plt.scatter(xs, loss_curve, label='Parameters')
plt.legend()
plt.xlabel('Input')
plt.ylabel('Loss')
plt.title('Loss function')
plt.grid()
plt.show()


TypeError: run() got multiple values for argument 'v'