# Adversarial examples

This notebook shows a step-by-step guide how to craft adversarial examples. The examples are optimized using TensorFlow, and the results can be manually cross-checked with our pure-Python CNN implementation.
The notebook provided the steps necessary to exploit two vulnerabilities:
1. Coming up with an image that triggers an activation at the desired output class.
2. Creating an overflow in `np.exp()` if we forget to use softmax normalization.

In [41]:
%matplotlib notebook
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import os
from PIL import Image
from io import BytesIO
from scipy.misc import imread
import math
from vggface.tf_vggface_v2 import VGGFace as TfVGGFace
from vggface.model import VGGFace as MyVGGFace
import tensorflow as tf
import random
import json
import re


dataset_path = "/media/explicat/Moosilauke/ctf/facescrub/training"
tf_checkpoint_dir = "/media/explicat/Moosilauke/ctf/vggface/checkpoint_pretrained_530_lr_0.000001"
my_cnn_weights_file = "/home/explicat/ctf/jodlgang/prototype/vggface/weights_pretrained_530_lr_0.000001.h5"
num_classes = 530

Find all image files.

In [2]:
img_files = [os.path.join(dp, f) for dp, dn, filenames in os.walk(dataset_path) for f in filenames if re.search(".(jpg|jpeg|png)$", f.lower()) is not None]
random.shuffle(img_files)

Load the mapping from class number to person.

In [30]:
name_to_class_label_mapping_file = os.path.join(dataset_path, "class_label_mapping.json")
with open(name_to_class_label_mapping_file, "r") as f:
    name_to_class_label_mapping = json.load(f)

def decode_single_prediction(probabilities):
    top_5_classes = np.argsort(-probabilities)[:5]
    top_5_identities = np.array(name_to_class_label_mapping)[top_5_classes]
    return list(zip(top_5_identities, top_5_classes, probabilities[top_5_classes]))

def tf_classify(img):
    feed_dict = {img_var: img}
    class_probabilities = sess.run(cnn_output, feed_dict=feed_dict)[0]
    print(decode_single_prediction(class_probabilities))
    
def my_classify(img):
    class_probabilities = my_cnn.inference(img[None, :])[0]
    print(decode_single_prediction(class_probabilities))

Restore TensorFlow model from checkpoint.

In [4]:
sess = tf.InteractiveSession()
img_var = tf.Variable(tf.zeros((224, 224, 3)))
tf_cnn = TfVGGFace(sess, num_classes, auto_setup_model=False)

cnn_output, cnn_output_logits = tf_cnn.model(tf.expand_dims(img_var, 0), drop_rate=0, num_classes=num_classes)
trainable_vars = tf.trainable_variables()
trainable_vars.remove(img_var)
saver = tf.train.Saver(var_list=trainable_vars)
tf_cnn.load(tf_checkpoint_dir, saver);

Attempting to read checkpoint from /media/explicat/Moosilauke/ctf/vggface/checkpoint_pretrained_530_lr_0.000001
INFO:tensorflow:Restoring parameters from /media/explicat/Moosilauke/ctf/vggface/checkpoint_pretrained_530_lr_0.000001/VGGFace_v2-78000
Successfully restored checkpoint


Restore our pure python model.

In [42]:
my_cnn = MyVGGFace()
my_cnn.restore_weights(my_cnn_weights_file)

## Crafting adversarial examples

### Initialization
Copy the placeholder into the variable which is the input to the computational graph.

In [14]:
x = tf.placeholder(tf.float32, (224, 224, 3))

# Set up trainable adversarial input
x_hat = img_var
assign_op = tf.assign(x_hat, x)

### Gradient descent step

Use gradient descent to maximize the log probability of the desired output class.

In [15]:
learning_rate = tf.placeholder(tf.float32, ())
y_hat = tf.placeholder(tf.int32, ())

labels = tf.one_hot(y_hat, num_classes)
loss_sample = tf.nn.softmax_cross_entropy_with_logits(logits=cnn_output_logits, labels=[labels])
loss = tf.reduce_sum(loss_sample)
optimization_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, var_list=[x_hat])

### Projection step
Keep adversarial example visually close to the original image. Clip to `[0, 255]` range.

In [17]:
epsilon = tf.placeholder(tf.float32, ())

below = x - epsilon
above = x + epsilon
epsilon_clipped = tf.clip_by_value(x_hat, below, above)
valid_clipped = tf.clip_by_value(epsilon_clipped, 0, 255)

with tf.control_dependencies([valid_clipped]):
    project_step = tf.assign(x_hat, valid_clipped)

### Execution

Turn random image into class 42

In [27]:
demo_epsilon = 5.0
demo_learning_rate = 1e4
demo_steps = 50
demo_target = 42

# Initialization step
original_img = imread(img_files[0])[:, :, :3]
sess.run(assign_op, feed_dict={x: original_img})

# Projected gradient descent
for i in range(demo_steps):
    # Gradient descent step
    _, loss_val = sess.run([optimization_step, loss], feed_dict={learning_rate: demo_learning_rate, y_hat: demo_target})
    # Projection step
    sess.run(project_step, feed_dict={x: original_img, epsilon: demo_epsilon})
    # Pront progress after every 10 steps
    if (i + 1) % 10 == 0:
        print("Step: {:d}, loss {:3.2f}".format(i + 1, loss_val))
        
# Retrieve the adversarial example
adv = x_hat.eval()
adv_uint8 = adv.astype(np.uint8)

Step: 10, loss 0.00
Step: 20, loss 0.00
Step: 30, loss 0.00
Step: 40, loss 0.00
Step: 50, loss 0.00


Print classification.

In [33]:
tf_classify(adv_uint8)

[('Robert_Redford', 42, 0.99992526), ('Jenilee_Harrison', 403, 1.9556934e-05), ('Christian_Bale', 24, 8.629233e-06), ('Jerry_Seinfeld', 123, 7.749858e-06), ('Kevin_Connolly', 297, 5.350144e-06)]


## Check whether our own CNN implementation produces the same output

In [43]:
my_classify(adv_uint8.astype(np.float))

Feeding through layer conv1_1
Feeding through layer conv1_2
Feeding through layer pool1
Feeding through layer conv2_1
Feeding through layer conv2_2
Feeding through layer pool2
Feeding through layer conv3_1
Feeding through layer conv3_2
Feeding through layer conv3_3
Feeding through layer pool3
Feeding through layer conv4_1
Feeding through layer conv4_2
Feeding through layer conv4_3
Feeding through layer pool4
Feeding through layer conv5_1
Feeding through layer conv5_2
Feeding through layer conv5_3
Feeding through layer pool5
Feeding through layer fc6
Feeding through layer fc7
Feeding through layer fc8
[('Robert_Redford', 42, 0.99992537770515921), ('Jenilee_Harrison', 403, 1.9556848866794792e-05), ('Christian_Bale', 24, 8.6291494600134487e-06), ('Jerry_Seinfeld', 123, 7.749831161957529e-06), ('Kevin_Connolly', 297, 5.3501524802967576e-06)]


# Overflow the softmax for fun and profit

We need to achieve `np.exp(710)`

In [51]:
negative_max = -tf.reduce_max(cnn_output_logits)
smashing_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(negative_max, var_list=[x_hat])

demo_epsilon = 25.0
demo_learning_rate = 1e5
demo_steps = 50

# Initialization step
sess.run(assign_op, feed_dict={x: original_img})

# Projected gradient descent
for i in range(demo_steps):
    # Gradient descent step
    _, max_val = sess.run([smashing_step, negative_max], feed_dict={learning_rate: demo_learning_rate})
    # Projection step
    sess.run(project_step, feed_dict={x: original_img, epsilon: demo_epsilon})
    # Print progress after every 10 steps
    if (i + 1) % 10 == 0:
        print("Step: {:d}, max val {:8.2f}".format(i + 1, -max_val))
        print(np.exp(-max_val))
        
# Retrieve the adversarial example
maximum_adv = x_hat.eval()
maximum_adv_uint8 = maximum_adv.astype(np.uint8)

Step: 10, max val    82.92
1.02456e+36
Step: 20, max val   111.75
inf




Step: 30, max val    99.22
inf
Step: 40, max val   119.37
inf
Step: 50, max val    92.76
inf


**TensorFlow**: Feed through all layers except for softmax activation

In [52]:
logits_vals = sess.run(cnn_output_logits, feed_dict={x: maximum_adv_uint8})
print(np.exp(logits_vals) / np.sum(np.exp(logits_vals)))

[[  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.  nan   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   

  
  


**MyCNN**: We expect an overflow error

In [53]:
print(my_cnn.inference(maximum_adv_uint8[None, :].astype(np.float)))

Feeding through layer conv1_1
Feeding through layer conv1_2
Feeding through layer pool1
Feeding through layer conv2_1
Feeding through layer conv2_2
Feeding through layer pool2
Feeding through layer conv3_1
Feeding through layer conv3_2
Feeding through layer conv3_3
Feeding through layer pool3
Feeding through layer conv4_1
Feeding through layer conv4_2
Feeding through layer conv4_3
Feeding through layer pool4
Feeding through layer conv5_1
Feeding through layer conv5_2
Feeding through layer conv5_3
Feeding through layer pool5
Feeding through layer fc6
Feeding through layer fc7
Feeding through layer fc8
[[  7.39700886e-61   4.82014565e-53   8.67390142e-64   4.50857340e-55
    7.06640442e-55   1.15708161e-49   2.88871853e-59   7.62853331e-60
    3.02918597e-49   3.77402565e-52   1.08593982e-54   1.52134138e-60
    2.19832353e-41   4.15977120e-63   4.13991352e-56   1.44274893e-61
    1.44724779e-65   2.64357920e-62   1.20841283e-52   7.43530906e-53
    9.31298117e-48   7.91140647e-63   1.85