# Fooling a NN (Adversarial examples)

Here, I'm trying to create an adversarial example using gradient descent, with some of the ideas coming from [Attacking Machine Learning
with Adversarial Examples](https://openai.com/blog/adversarial-example-research/). The model under attack is Inception v3 trained on ImageNet.

Essentially, this is what I do:
- Choose a test image and verify that the model can classify it.
- Choose an adversarial class and perform gradient descent on the test image.
- Verify that the model has been fooled.


In [None]:
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras.preprocessing import image
import tensorflow_hub as hub
import PIL
import PIL.Image
import json

from tensorflow.keras.preprocessing import image_dataset_from_directory

In [None]:
# Build InceptionV3 with its classification head and ImageNet weights.
# Both arguments are the Keras defaults, spelled out here for readability.
inception_model = tf.keras.applications.InceptionV3(
    include_top=True,
    weights="imagenet",
)

# Print the layer-by-layer description of the network.
inception_model.summary()

In [None]:
import ast  # only needed in this cell, to parse the class-name file safely

# Read the expected image size from the model's first layer.  Indices 1 and 2
# of the shape are used as height and width.
# NOTE(review): per-layer `input_shape` / `output_shape` attributes may not
# exist in every Keras version -- confirm against the installed release.
input_tensor_shape = inception_model.layers[0].input_shape
image_height = input_tensor_shape[0][1]
image_width = input_tensor_shape[0][2]
image_shape = (image_height, image_width)
print(f"input image shape (height, width) = {image_shape}")

# The second dimension of the last layer's output is the number of classes.
output_tensor_shape = inception_model.layers[-1].output_shape
n_classes = output_tensor_shape[1]
print(f"number of classes = {n_classes}")

# The file is expected to contain a single Python literal holding the class
# names (presumably a list or dict -- verify against the data file).
# `ast.literal_eval` parses literals only, so unlike `eval` it cannot execute
# arbitrary code that might be embedded in the file.
with open('imagenet_class_names.txt', 'r') as f:
    class_names = ast.literal_eval(f.read())

# Fail loudly (even under `python -O`, where `assert` is stripped) if the
# label file does not match the model's output size.
if n_classes != len(class_names):
    raise ValueError(
        f"model outputs {n_classes} classes but the class-name file "
        f"contains {len(class_names)} entries"
    )

In [None]:
def load_image_from_path(img_path, target_size):
    """Load an image file and build a batched, rescaled array from it.

    Args:
        img_path: Path of the image file to load.
        target_size: Forwarded to ``image.load_img`` as ``target_size``.

    Returns:
        A ``(img, img_tensor)`` tuple: the image object returned by
        ``image.load_img``, and its pixel array divided by 255 with a
        leading axis of length 1 added.
    """
    loaded = image.load_img(img_path, target_size=target_size)
    pixels = image.img_to_array(loaded)
    # NOTE(review): pixels are scaled by 1/255 here; confirm this matches the
    # input range the downstream model was trained with.
    batch = pixels[np.newaxis, ...] / 255.
    return (loaded, batch)

def get_prediction(tensor, class_names):
    """Classify a single-image batch with the module-level ``inception_model``.

    Args:
        tensor: Input passed to ``inception_model.predict``; the prediction
            must have exactly one row.
        class_names: Collection indexed by class index; its length must equal
            the number of model outputs.

    Returns:
        A ``(name, score)`` tuple: the ``class_names`` entry for the highest
        scoring output, and that output's value.
    """
    outputs = inception_model.predict(tensor)
    expected_shape = (1, len(class_names))
    assert outputs.shape == expected_shape

    scores = outputs[0]
    best_index = tf.argmax(scores).numpy()
    return class_names[best_index], scores[best_index]

In [None]:
# Load the test picture at the size computed earlier as `image_shape`.
test_img, test_img_tensor = load_image_from_path(
    './test_images/test_dog.jpg',
    target_size=image_shape,
)
print(f'Input image shape: {test_img_tensor.shape}')

# Show the unmodified picture.
plt.imshow(test_img)
plt.show()

# Check what the model predicts for the unmodified picture.
class_name, confidence = get_prediction(test_img_tensor, class_names)
print(f"'{class_name}' with confidence {confidence}")

In [None]:
def tensor_to_image(tensor):
    """Convert an image tensor with values in [0, 1] to a ``PIL.Image``.

    Args:
        tensor: Array-like of pixel values scaled to [0, 1].  If it has more
            than three dimensions, the leading (batch) dimension must be 1
            and is dropped.

    Returns:
        The ``PIL.Image`` built from the 8-bit pixel array.

    Raises:
        ValueError: If a batched tensor holds more than one image.
    """
    pixels = np.asarray(tensor) * 255.0
    # Round before casting: a bare uint8 cast truncates, so a value such as
    # 254.9999 (typical float error after a /255 then *255 round trip) would
    # become 254.  Clip so slightly out-of-range values saturate instead of
    # wrapping around.
    pixels = np.clip(np.rint(pixels), 0, 255).astype(np.uint8)

    if pixels.ndim > 3:
        if pixels.shape[0] != 1:
            raise ValueError(
                f"expected a batch of exactly one image, got {pixels.shape[0]}"
            )
        pixels = pixels[0]
    return PIL.Image.fromarray(pixels)

def create_label_for_class(class_index, n_classes):
    """Build a one-hot label row for ``class_index``.

    Args:
        class_index: Position that is set to 1.
        n_classes: Length of the label vector.

    Returns:
        A float32 tensor of shape ``(1, n_classes)`` that is 1 at
        ``class_index`` and 0 everywhere else.
    """
    one_hot = np.zeros(n_classes, dtype=np.float32)
    one_hot[class_index] = 1
    # Add the leading batch axis before handing the array to TensorFlow.
    return tf.convert_to_tensor(one_hot[np.newaxis, :])

In [None]:
# Adam is used to update the image passed to `train_step`; the model's own
# weights are never handed to this optimizer.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.03)

@tf.function()
def train_step(img, labels):
    """Run one optimisation step that pushes ``img`` towards ``labels``.

    Args:
        img: ``tf.Variable`` holding the image batch.  It is updated in place
            and then clipped to [0, 1].
        labels: One-hot target labels with one row per image.

    Returns:
        The cross-entropy cost, evaluated before this step's update.
    """
    with tf.GradientTape() as tape:
        # InceptionV3 built with its default arguments ends in a softmax, so
        # the model output is already a probability distribution.  Feeding it
        # to a "*_with_logits" loss would apply softmax a second time and
        # flatten the gradients; use the probability-based cross-entropy.
        probabilities = inception_model(img, training=False)
        cost = tf.keras.losses.categorical_crossentropy(labels, probabilities)
    # Differentiate with respect to the image, not the model weights.
    grad = tape.gradient(cost, img)
    optimizer.apply_gradients([(grad, img)])
    # Keep the pixels inside the valid [0, 1] range after the update.
    img.assign(tf.clip_by_value(img, clip_value_min=0.0, clip_value_max=1.0))
    return cost

# Optimise a separate variable so the original test array is left unchanged.
img_tensor = tf.Variable(tf.identity(test_img_tensor))
epochs = 1001
hack_labels = create_label_for_class(23, n_classes) # vulture

# Two initial steps with the cost printed, to compare the cost between steps.
j1 = train_step(img_tensor, hack_labels)
print(j1)
j2 = train_step(img_tensor, hack_labels)
print(j2)

for i in range(1, epochs):
    train_step(img_tensor, hack_labels)
    # Every 250 steps, show the current image and the model's prediction.
    if i % 250 == 0:
        print(f"Epoch {i}")
        img = tensor_to_image(img_tensor)
        imshow(img)
        plt.show()
        name, confidence = get_prediction(tf.convert_to_tensor(img_tensor), class_names)
        print(f"Prediction: '{name}', confidence={confidence}")

# Build the saved image from the final tensor instead of relying on whatever
# `img` was last bound inside the progress branch above; otherwise a change
# to `epochs` could save a stale image or hit an undefined name.
img = tensor_to_image(img_tensor)
img.save('./test_images/trained_image.jpg')

As you can see the model is predicting the image is a 'vulture'. 

This doesn't work well once the image is saved. I suspect that the JPEG compression removes some of the details, so the initial effect needs to be prominent for this to work. The opposite can be checked by using a lower value of alpha (the learning rate) in the optimizer.

In [None]:
# Reload the saved image from disk and classify it again.
test_img2, test_img2_tensor = load_image_from_path(
    './test_images/trained_image.jpg',
    target_size=image_shape,
)
reloaded_prediction = get_prediction(test_img2_tensor, class_names)
print(reloaded_prediction)

Other references
- ["Explaining and Harnessing Adversarial Examples", Ian J. Goodfellow, Jonathon Shlens, Christian Szegedy](https://arxiv.org/abs/1412.6572)
