In [1]:
# Widen the notebook container to the full browser width.
# `IPython.display` is the public import location for these helpers; importing
# `display` from `IPython.core.display` is deprecated in newer IPython releases.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import numpy as np
from six.moves import xrange

import logging
import tensorflow as tf
from tensorflow.python.platform import flags

from cleverhans.utils_mnist import data_mnist
from cleverhans.utils import to_categorical
from cleverhans.utils import set_log_level
from cleverhans.utils_tf import model_train, model_eval, batch_eval
from cleverhans.attacks import FastGradientMethod
from cleverhans.attacks_tf import jacobian_graph, jacobian_augmentation

from cleverhans_tutorials.tutorial_models import make_basic_cnn, MLP
from cleverhans_tutorials.tutorial_models import Flatten, Linear, ReLU, Softmax

In [3]:
# General setup: logging, random seeds, shared hyper-parameters and the
# TensorFlow session / input placeholders used by every later cell.
set_log_level(logging.DEBUG)

# Seed both sources of randomness so a fresh-kernel "Run All" is repeatable.
# `rng` is passed to model_train in the cells below; the graph-level TF seed
# makes random ops created on the default graph derive their seeds
# deterministically instead of from an unseeded source.
rng = np.random.RandomState([2017, 8, 30])
tf.set_random_seed(1234)

# Accuracy figures collected by the later cells, keyed by a short name
accuracies = {}

# Default training hyper-parameters, reused by the black-box and substitute
n_epochs = 10
batch_size = 128
learning_rate = 0.001

sess = tf.Session()
# Image batch of shape (N, 28, 28, 1) and label batch of shape (N, 10)
# (labels are presumably one-hot encoded -- see to_categorical usage below)
x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
y = tf.placeholder(tf.float32, shape=(None, 10))

In [4]:
# Dataset setup: which MNIST index ranges to load, and how many test samples
# are handed to the adversary as its initial substitute training set.
train_start, train_end = 0, 60000
test_start, test_end = 0, 10000
holdout = 150

# Load MNIST for the requested ranges
X_train, Y_train, X_test, Y_test = data_mnist(
    train_start=train_start, train_end=train_end,
    test_start=test_start, test_end=test_end)

# Split the test set: the first `holdout` samples go to the adversary
# (with integer class labels taken via argmax), the rest stay as the test
# set that the adversary never sees.
X_sub, X_test = X_test[:holdout], X_test[holdout:]
Y_sub, Y_test = np.argmax(Y_test[:holdout], axis=1), Y_test[holdout:]

Extracting /tmp/train-images-idx3-ubyte.gz
Extracting /tmp/train-labels-idx1-ubyte.gz
Extracting /tmp/t10k-images-idx3-ubyte.gz
Extracting /tmp/t10k-labels-idx1-ubyte.gz
X_train shape: (60000, 28, 28, 1)
X_test shape: (10000, 28, 28, 1)


In [5]:
# Train the oracle ("black-box") model that the adversary will later query.
# Its hyper-parameters mirror the notebook-wide defaults.
n_blackbox_epochs = n_epochs
n_blackbox_batch_size = batch_size
blackbox_learning_rate = learning_rate

# Build the model and attach it to the shared input placeholder
model = make_basic_cnn()
bbox_preds = model(x)
print("Defined TensorFlow model graph.")

# Fit the black-box on the MNIST training set
train_params = dict(nb_epochs=n_blackbox_epochs,
                    batch_size=n_blackbox_batch_size,
                    learning_rate=blackbox_learning_rate)
model_train(sess, x, y, bbox_preds, X_train, Y_train,
            args=train_params, rng=rng, verbose=False)

# Baseline: accuracy on the clean test samples not given to the adversary
eval_params = dict(batch_size=n_blackbox_batch_size)
accuracies['bbox'] = model_eval(sess, x, y, bbox_preds, X_test, Y_test,
                                args=eval_params)
print('Test accuracy of black-box on legitimate test examples: '
      + str(accuracies['bbox']))

Defined TensorFlow model graph.




Test accuracy of black-box on legitimate test examples: 0.98923857868


In [6]:
# Train the adversary's substitute model with Jacobian-based dataset
# augmentation: alternate between fitting the substitute on (X_sub, Y_sub)
# and growing X_sub with synthetic points that the black-box then labels.
#
# NOTE: X_sub / Y_sub are overwritten by the loop below, so re-running this
# cell without first re-running the dataset cell continues from the
# already-augmented set instead of the original `holdout` samples.
n_blackbox_classes = 10
data_aug = 6            # number of substitute training rounds
sub_epochs = 10
sub_batch_size = batch_size
sub_learning_rate = learning_rate
sub_lmbda = 0.1         # step scale handed to jacobian_augmentation (sign set per round)

# Define a fully connected model (deliberately different from the black-box)
img_rows, img_cols = 28, 28
layers = [Flatten(),
          Linear(200),
          ReLU(),
          Linear(200),
          ReLU(),
          Linear(n_blackbox_classes),
          Softmax()]
input_shape = (None, img_rows, img_cols, 1)
model_sub = MLP(layers, input_shape)

# Define TF model graph (for the substitute model)
preds_sub = model_sub(x)

# Define the Jacobian symbolically using TensorFlow
grads = jacobian_graph(preds_sub, x, n_blackbox_classes)

# Both parameter dicts are the same for every round, so build them once
train_params = {
    'nb_epochs': sub_epochs,
    'batch_size': sub_batch_size,
    'learning_rate': sub_learning_rate
}
eval_params = {'batch_size': sub_batch_size}

# Train the substitute and augment the dataset alternately
for rho in xrange(data_aug):
    print("Substitute training epoch #" + str(rho))
    # Pass the class count explicitly so the one-hot width always matches the
    # (None, 10) label placeholder, even if the largest label is absent from
    # Y_sub. init_all=False is kept from the original call.
    model_train(sess, x, y, preds_sub, X_sub,
                to_categorical(Y_sub, n_blackbox_classes),
                init_all=False, verbose=False, args=train_params,
                rng=rng)

    # If we are not at the last substitute training iteration, augment dataset
    if rho < data_aug - 1:
        print("Augmenting substitute training data.")
        # Step direction: -1 for rho in {0, 1, 2}, +1 from rho == 3 onwards.
        # NOTE(review): the sign flips only once rather than periodically --
        # confirm that is intended before raising data_aug above 6.
        lmbda_coef = 1 if rho >= 3 else -1
        X_sub = jacobian_augmentation(sess, x, X_sub, Y_sub, grads,
                                      lmbda_coef * sub_lmbda)

        print("Labeling substitute training data.")
        # Label the newly generated synthetic points using the black-box.
        # The code treats the second half of the augmented X_sub as the new
        # points (assumes jacobian_augmentation doubles the set -- verify),
        # so the labels are duplicated and the second half is overwritten.
        n_prev = len(X_sub) // 2
        Y_sub = np.hstack([Y_sub, Y_sub])
        X_sub_prev = X_sub[n_prev:]  # despite the name: the points still to be labeled
        bbox_val = batch_eval(sess, [x], [bbox_preds], [X_sub_prev],
                              args=eval_params)[0]
        # Note here that we take the argmax because the adversary
        # only has access to the label (not the probabilities) output
        # by the black-box model
        Y_sub[n_prev:] = np.argmax(bbox_val, axis=1)

# Evaluate the substitute model on clean test examples
acc = model_eval(sess, x, y, preds_sub, X_test, Y_test, args=eval_params)
accuracies['sub'] = acc

Substitute training epoch #0




Augmenting substitute training data.
Labeling substitute training data.
Substitute training epoch #1
Augmenting substitute training data.
Labeling substitute training data.
Substitute training epoch #2
Augmenting substitute training data.
Labeling substitute training data.
Substitute training epoch #3
Augmenting substitute training data.
Labeling substitute training data.
Substitute training epoch #4
Augmenting substitute training data.
Labeling substitute training data.
Substitute training epoch #5


In [7]:
# Transfer attack: craft FGSM adversarial examples against the substitute
# model and measure how well the black-box classifies them on the remaining
# test split (X_test / Y_test, i.e. the samples not handed to the adversary).

eval_params = dict(batch_size=sub_batch_size)

# Fast Gradient Sign Method settings: eps 0.3 under the infinity norm,
# with a clip range of [0, 1]
fgsm_par = dict(eps=0.3, ord=np.inf, clip_min=0., clip_max=1.)

# The attack object is built around the substitute only
fgsm = FastGradientMethod(model_sub, sess=sess)
x_adv_sub = fgsm.generate(x, **fgsm_par)

# Run the adversarial tensor through the black-box ("oracle") and score it
accuracy = model_eval(sess, x, y, model(x_adv_sub), X_test, Y_test,
                      args=eval_params)
accuracies['bbox_on_sub_adv_ex'] = accuracy
print('Test accuracy of oracle on adversarial examples generated '
      'using the substitute: ' + str(accuracy))

Test accuracy of oracle on adversarial examples generated using the substitute: 0.716040609137


In [8]:
# Summary of the accuracies recorded by the cells above:
# 'bbox' (black-box on clean test data), 'sub' (substitute on clean test data)
# and 'bbox_on_sub_adv_ex' (black-box on adversarial examples from the substitute).
print(accuracies)

{'bbox_on_sub_adv_ex': 0.71604060913705581, 'sub': 0.76730964467005081, 'bbox': 0.98923857868020304}
