## MNIST

In this first section, the base DeepFool attack is demonstrated on the MNIST dataset.

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

import sys
sys.path.append("..")

%matplotlib inline

import numpy as np

from sklearn.metrics import classification_report

import keras
from keras import metrics
from keras.preprocessing import image
from keras.applications.resnet50 import ResNet50, preprocess_input
from keras.layers import Dense, Flatten
from keras.models import Model, load_model
import keras.backend as k
from matplotlib import pyplot as plt
from IPython.display import clear_output

from art.config import ART_DATA_PATH
from art.estimators.classification import KerasClassifier
from art.utils import to_categorical, load_dataset, get_file

import warnings
# NOTE(review): this installs a global "ignore" filter, so every Python warning
# (including deprecation warnings from Keras/TensorFlow/ART) is hidden from here on.
# Consider narrowing it to specific categories or modules.
warnings.filterwarnings('ignore')
import tensorflow as tf
# Run TensorFlow in graph (non-eager) mode through the TF1 compatibility API.
# Presumably required by the Keras-based ART classifier used below -- TODO confirm.
tf.compat.v1.disable_eager_execution()
# Only emit TensorFlow log records at ERROR level.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

from algorithms.deepfool import DeepFool

In [2]:
# Load MNIST. min_ and max_ are returned alongside the data and are reused
# below as the classifier's clip range.
(x_train, y_train), (x_test, y_test), min_, max_ = load_dataset('mnist')

# Location of the pre-trained MNIST CNN (stored under ART_DATA_PATH).
path = get_file('mnist_cnn_original.h5', extract=False, path=ART_DATA_PATH,
                url='https://www.dropbox.com/s/p2nyzne9chcerid/mnist_cnn_original.h5?dl=1')

# Metrics reported by model.evaluate(); the `name` values become the keys of
# the result dictionaries printed in the evaluation cells.
METRICS = [
    keras.metrics.TruePositives(name='tp'),
    keras.metrics.FalsePositives(name='fp'),
    keras.metrics.TrueNegatives(name='tn'),
    keras.metrics.FalseNegatives(name='fn'),
    keras.metrics.CategoricalAccuracy(name='categorical_accuracy'),
    keras.metrics.Precision(name='precision'),
    keras.metrics.Recall(name='recall'),
    keras.metrics.AUC(name='auc'),
]

classifier_model = load_model(path)

# Compile BEFORE wrapping, matching the CIFAR-10 cells: the wrapper then sees
# the final loss/metrics configuration instead of one that is replaced
# immediately after it has been constructed.
classifier_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=METRICS)

# The network is trained with plain categorical cross-entropy on its outputs
# and the attack reports that it predicts probabilities, so it must not be
# declared as a logit model.
classifier = KerasClassifier(model=classifier_model, clip_values=(min_, max_), use_logits=False)

# One extra epoch on the training set on top of the downloaded weights.
classifier.fit(x_train, y_train, nb_epochs=1, batch_size=128, verbose=1)



In [3]:
classifier_model.summary()

# Predict the whole test set once and derive both label vectors from it.
pred = classifier.predict(x_test)
x_test_pred = np.argmax(pred, axis=1)          # predicted digit per image
true_labels = np.argmax(y_test, axis=1)        # one-hot targets -> digit per image
is_correct = x_test_pred == true_labels

# Raw count and fraction of correctly classified test images.
nb_correct_pred = np.sum(is_correct)
accuracy = np.mean(is_correct)

# Full Keras metric set (loss + METRICS) on the clean test data.
base_results = classifier.model.evaluate(x_test, y_test, verbose=1)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 26, 26, 32)        320       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 13, 13, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 11, 11, 64)        18496     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 5, 5, 64)          0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 1600)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               204928    
_________________________________________________________________
dense_2 (Dense)              (None, 10)                1

In [4]:
# Report the clean-data baseline; the trailing expression renders the Keras
# metrics as a name -> value dictionary.
print("---Original test images---:")
print(f"Correctly classified: {nb_correct_pred}")
print(f"Accuracy on test samples: {accuracy:f}")
dict(zip(classifier.model.metrics_names, base_results))

---Original test images---:
Correctly classified: 9928
Accuracy on test samples: 0.992800


{'loss': 0.06432244380980565,
 'tp': 9928.0,
 'fp': 72.0,
 'tn': 89928.0,
 'fn': 72.0,
 'categorical_accuracy': 0.9928,
 'precision': 0.9928,
 'recall': 0.9928,
 'auc': 0.9972277}

In [5]:
# Build the DeepFool attack around the wrapped MNIST classifier
# (DeepFool comes from the project-local `algorithms.deepfool` module).
attacker = DeepFool(classifier)
# Craft adversarial versions of the first 1000 test images only -- presumably a
# subset to keep the run time manageable (the recorded run took over 4 minutes).
x_test_adv = attacker.generate(x_test[:1000])

Targeted model should output logits, not probabilities for predictions.
DeepFool: 100%|██████████| 1000/1000 [04:25<00:00,  3.76it/s]


In [6]:
# Score the classifier on the adversarial MNIST images.
# The label slice is derived from x_test_adv so it always lines up with the
# number of attacked samples instead of repeating a hard-coded 1000.
n_adv = len(x_test_adv)
y_adv = y_test[:n_adv]

preds = np.argmax(classifier.predict(x_test_adv), axis=1)
nb_correct_adv = np.sum(preds == np.argmax(y_adv, axis=1))
acc = nb_correct_adv / n_adv

adv_results = classifier.model.evaluate(x_test_adv, y_adv, verbose=1)

# Number of adversarial images that are still classified correctly.
print(nb_correct_adv)
dict(zip(classifier.model.metrics_names, adv_results))

923


{'loss': 0.9035765079259872,
 'tp': 922.0,
 'fp': 76.0,
 'tn': 8924.0,
 'fn': 78.0,
 'categorical_accuracy': 0.923,
 'precision': 0.9238477,
 'recall': 0.922,
 'auc': 0.96704066}

## CIFAR-10

In [7]:
import logging

from keras.models import Sequential
from keras.layers import Dense, Flatten, Conv2D, MaxPooling2D, Activation, Dropout

In [8]:
# Name used to recognise the handler installed by this cell.
NOTEBOOK_LOG_HANDLER_NAME = "deepfool-notebook-stream"


def configure_notebook_logging(level=logging.INFO):
    """Route root-logger records to a stream handler formatted as "[LEVEL] message".

    The call is idempotent: the handler is tagged with NOTEBOOK_LOG_HANDLER_NAME
    and is only attached if no handler with that name is present yet. Without
    this guard, re-running the cell adds one more handler each time and every
    log line is printed once per handler.

    Args:
        level: Logging level applied to the root logger.

    Returns:
        The root logger.
    """
    root_logger = logging.getLogger()
    root_logger.setLevel(level)

    already_attached = any(
        existing.get_name() == NOTEBOOK_LOG_HANDLER_NAME
        for existing in root_logger.handlers
    )
    if not already_attached:
        stream_handler = logging.StreamHandler()
        stream_handler.set_name(NOTEBOOK_LOG_HANDLER_NAME)
        stream_handler.setFormatter(logging.Formatter("[%(levelname)s] %(message)s"))
        root_logger.addHandler(stream_handler)
    return root_logger


logger = configure_notebook_logging()

In [9]:
# Switch the working data set to CIFAR-10 (rebinds the MNIST arrays and clip bounds).
(x_train, y_train), (x_test, y_test), min_, max_ = load_dataset("cifar10")

In [10]:
# Metrics reported by model.evaluate(); the names become the result-dict keys.
METRICS = [
    keras.metrics.TruePositives(name='tp'),
    keras.metrics.FalsePositives(name='fp'),
    keras.metrics.TrueNegatives(name='tn'),
    keras.metrics.FalseNegatives(name='fn'),
    keras.metrics.CategoricalAccuracy(name='categorical_accuracy'),
    keras.metrics.Precision(name='precision'),
    keras.metrics.Recall(name='recall'),
    keras.metrics.AUC(name='auc'),
]

# Small CNN: two conv/conv/pool/dropout stages followed by a dense head.
# Activations are kept as separate layers, exactly as in the add()-based form.
model = Sequential([
    # Stage 1: 32 filters
    Conv2D(32, (3, 3), padding="same", input_shape=x_train.shape[1:]),
    Activation("relu"),
    Conv2D(32, (3, 3)),
    Activation("relu"),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    # Stage 2: 64 filters
    Conv2D(64, (3, 3), padding="same"),
    Activation("relu"),
    Conv2D(64, (3, 3)),
    Activation("relu"),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    # Classification head: 10-way softmax
    Flatten(),
    Dense(512),
    Activation("relu"),
    Dropout(0.5),
    Dense(10),
    Activation("softmax"),
])

model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=METRICS)

In [11]:
# Wrap the compiled CIFAR-10 CNN for ART, clipping inputs to the data set's
# value range, then train it silently for 15 epochs.
classifier = KerasClassifier(clip_values=(min_, max_), model=model)
classifier.fit(x_train, y_train, batch_size=128, nb_epochs=15, verbose=0)

[INFO] Inferred 17 hidden layers on Keras classifier.


In [12]:
# Clean-data baseline on the full CIFAR-10 test set.
scores = classifier.predict(x_test)
preds = scores.argmax(axis=1)
n_hits = np.sum(preds == y_test.argmax(axis=1))
acc = n_hits / len(y_test)

base_results = classifier.model.evaluate(x_test, y_test, verbose=1)
print("Classifier results before attack: ")
dict(zip(classifier.model.metrics_names, base_results))

Classifier results before attack: 


{'loss': 0.6200162759780884,
 'tp': 7389.0,
 'fp': 1342.0,
 'tn': 88658.0,
 'fn': 2611.0,
 'categorical_accuracy': 0.7872,
 'precision': 0.8462948,
 'recall': 0.7389,
 'auc': 0.9763044}

In [13]:
# Craft DeepFool adversarial examples for the first 500 CIFAR-10 test images.
logger.info("Create DeepFool attack")
adv_crafter = DeepFool(classifier)
logger.info("Craft attack test examples")
x_test_adv = adv_crafter.generate(x_test[:500])

[INFO] Create DeepFool attack
[INFO] Craft attack test examples
DeepFool: 100%|██████████| 500/500 [01:20<00:00,  6.19it/s]
[INFO] DeepFool attack success rate: 87.60%


In [14]:
# Score the classifier on the adversarial examples.
# The label slice is derived from x_test_adv so it always matches the number
# of attacked samples instead of repeating a hard-coded 500.
n_adv = len(x_test_adv)
y_adv = y_test[:n_adv]

preds = np.argmax(classifier.predict(x_test_adv), axis=1)
acc = np.sum(preds == np.argmax(y_adv, axis=1)) / n_adv
# No adversarial training is performed before this cell: the same classifier
# is simply scored on attacked inputs, so the log line says exactly that.
logger.info("Classifier after DeepFool attack")
logger.info("Accuracy on adversarial samples: %.2f%%", (acc * 100))
adv_results = classifier.model.evaluate(x_test_adv, y_adv, verbose=1)
print("Classifier results after attack: ")
dict(zip(classifier.model.metrics_names, adv_results))

[INFO] Classifier after adversarial training
[INFO] Accuracy on adversarial samples: 19.20%


Classifier results after attack: 


{'loss': 3.783032844543457,
 'tp': 15.0,
 'fp': 251.0,
 'tn': 4249.0,
 'fn': 485.0,
 'categorical_accuracy': 0.192,
 'precision': 0.05639098,
 'recall': 0.03,
 'auc': 0.7463559}

In [15]:
# Second DeepFool run: wrap the CNN again and fit for another 15 epochs.
# NOTE(review): `model` is not rebuilt before this cell, so this looks like it
# continues training the already-fitted network (presumably 30 epochs in total)
# rather than starting an independent repeat -- confirm this is intended.
classifier = KerasClassifier(model=model, clip_values=(min_, max_))
classifier.fit(x_train, y_train, nb_epochs=15, batch_size=128, verbose=0)

[INFO] Inferred 17 hidden layers on Keras classifier.


In [16]:
# Craft DeepFool adversarial examples for the first 500 CIFAR-10 test images.
logger.info("Create DeepFool attack")
adv_crafter = DeepFool(classifier)
logger.info("Craft attack test examples")
x_test_adv = adv_crafter.generate(x_test[:500])

[INFO] Create DeepFool attack
[INFO] Craft attack test examples
DeepFool: 100%|██████████| 500/500 [01:08<00:00,  7.30it/s]
[INFO] DeepFool attack success rate: 90.20%


In [17]:
# Score the classifier on the adversarial examples.
# The label slice is derived from x_test_adv so it always matches the number
# of attacked samples instead of repeating a hard-coded 500.
n_adv = len(x_test_adv)
y_adv = y_test[:n_adv]

preds = np.argmax(classifier.predict(x_test_adv), axis=1)
acc = np.sum(preds == np.argmax(y_adv, axis=1)) / n_adv
# No adversarial training is performed before this cell: the same classifier
# is simply scored on attacked inputs, so the log line says exactly that.
logger.info("Classifier after DeepFool attack")
logger.info("Accuracy on adversarial samples: %.2f%%", (acc * 100))
adv_results = classifier.model.evaluate(x_test_adv, y_adv, verbose=1)
print("Classifier results after attack: ")
dict(zip(classifier.model.metrics_names, adv_results))

[INFO] Classifier after adversarial training
[INFO] Accuracy on adversarial samples: 15.20%


Classifier results after attack: 


{'loss': 5.0207947158813475,
 'tp': 22.0,
 'fp': 304.0,
 'tn': 4196.0,
 'fn': 478.0,
 'categorical_accuracy': 0.152,
 'precision': 0.06748466,
 'recall': 0.044,
 'auc': 0.70780313}

In [18]:
# Third DeepFool run: wrap the CNN again and fit for another 15 epochs.
# NOTE(review): `model` is not rebuilt before this cell, so this looks like it
# continues training the already-fitted network rather than starting an
# independent repeat -- confirm this is intended.
classifier = KerasClassifier(model=model, clip_values=(min_, max_))
classifier.fit(x_train, y_train, nb_epochs=15, batch_size=128, verbose=0)

[INFO] Inferred 17 hidden layers on Keras classifier.


In [19]:
# Craft DeepFool adversarial examples for the first 500 CIFAR-10 test images.
logger.info("Create DeepFool attack")
adv_crafter = DeepFool(classifier)
logger.info("Craft attack test examples")
x_test_adv = adv_crafter.generate(x_test[:500])

[INFO] Create DeepFool attack
[INFO] Craft attack test examples
DeepFool: 100%|██████████| 500/500 [00:48<00:00, 10.21it/s]
[INFO] DeepFool attack success rate: 92.60%


In [20]:
# Score the classifier on the adversarial examples.
# The label slice is derived from x_test_adv so it always matches the number
# of attacked samples instead of repeating a hard-coded 500.
n_adv = len(x_test_adv)
y_adv = y_test[:n_adv]

preds = np.argmax(classifier.predict(x_test_adv), axis=1)
acc = np.sum(preds == np.argmax(y_adv, axis=1)) / n_adv
# No adversarial training is performed before this cell: the same classifier
# is simply scored on attacked inputs, so the log line says exactly that.
logger.info("Classifier after DeepFool attack")
logger.info("Accuracy on adversarial samples: %.2f%%", (acc * 100))
adv_results = classifier.model.evaluate(x_test_adv, y_adv, verbose=1)
print("Classifier results after attack: ")
dict(zip(classifier.model.metrics_names, adv_results))

[INFO] Classifier after adversarial training
[INFO] Accuracy on adversarial samples: 14.80%


Classifier results after attack: 


{'loss': 5.2573206329345705,
 'tp': 33.0,
 'fp': 319.0,
 'tn': 4181.0,
 'fn': 467.0,
 'categorical_accuracy': 0.148,
 'precision': 0.09375,
 'recall': 0.066,
 'auc': 0.6885616}

## Extension

In this extension, a DeepFool variant (`DynamicDeepFool`) is implemented with a dynamic epsilon value for its overshoot correction parameter: epsilon decreases logarithmically as the attack progresses. This allows larger corrections in the early iterations and finer adjustments in later ones.

In [21]:
from algorithms.dynamic_deepfool import DynamicDeepFool

In [22]:
# Load CIFAR-10 again so the extension section starts from freshly loaded arrays.
(x_train, y_train), (x_test, y_test), min_, max_ = load_dataset("cifar10")

In [23]:
# Fresh metric objects for the extension's model; names become result-dict keys.
METRICS = [
    keras.metrics.TruePositives(name='tp'),
    keras.metrics.FalsePositives(name='fp'),
    keras.metrics.TrueNegatives(name='tn'),
    keras.metrics.FalseNegatives(name='fn'),
    keras.metrics.CategoricalAccuracy(name='categorical_accuracy'),
    keras.metrics.Precision(name='precision'),
    keras.metrics.Recall(name='recall'),
    keras.metrics.AUC(name='auc'),
]

# Same CNN layout as in the CIFAR-10 section, rebuilt so that `model` refers
# to a new, untrained network.
model = Sequential([
    # Stage 1: 32 filters
    Conv2D(32, (3, 3), padding="same", input_shape=x_train.shape[1:]),
    Activation("relu"),
    Conv2D(32, (3, 3)),
    Activation("relu"),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    # Stage 2: 64 filters
    Conv2D(64, (3, 3), padding="same"),
    Activation("relu"),
    Conv2D(64, (3, 3)),
    Activation("relu"),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    # Classification head: 10-way softmax
    Flatten(),
    Dense(512),
    Activation("relu"),
    Dropout(0.5),
    Dense(10),
    Activation("softmax"),
])

model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=METRICS)

In [24]:
# Wrap the newly built CNN for ART and train it for 15 epochs, this time with
# per-epoch progress output.
classifier = KerasClassifier(clip_values=(min_, max_), model=model)
classifier.fit(x_train, y_train, batch_size=128, nb_epochs=15, verbose=1)

[INFO] Inferred 17 hidden layers on Keras classifier.


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [25]:
# Clean-data baseline on the 500 test images used for the attack below.
x_clean = x_test[:500]
y_clean = y_test[:500]

preds = classifier.predict(x_clean).argmax(axis=1)
n_hits = np.sum(preds == y_clean.argmax(axis=1))
acc = n_hits / len(y_clean)

base_results = classifier.model.evaluate(x_clean, y_clean, verbose=1)
print("Classifier results before attack: ")
dict(zip(classifier.model.metrics_names, base_results))

Classifier results before attack: 


{'loss': 0.5499661130905151,
 'tp': 383.0,
 'fp': 60.0,
 'tn': 4440.0,
 'fn': 117.0,
 'categorical_accuracy': 0.814,
 'precision': 0.8645598,
 'recall': 0.766,
 'auc': 0.9806411}

In [26]:
# Craft adversarial examples for the first 500 test images with the
# dynamic-epsilon variant; the log line names the attack class actually used.
logger.info("Create DynamicDeepFool attack")
adv_crafter = DynamicDeepFool(classifier)
logger.info("Craft attack on test examples")
x_test_adv = adv_crafter.generate(x_test[:500])

[INFO] Create DeepFool attack
[INFO] Craft attack on test examples
DeepFool: 100%|██████████| 500/500 [01:11<00:00,  6.96it/s]
[INFO] DeepFool attack success rate: 97.60%


In [27]:
# Score the classifier on the DynamicDeepFool adversarial examples.
# The label slice is derived from x_test_adv so it always matches the number
# of attacked samples instead of repeating a hard-coded 500.
n_adv = len(x_test_adv)
y_adv = y_test[:n_adv]

preds = np.argmax(classifier.predict(x_test_adv), axis=1)
acc = np.sum(preds == np.argmax(y_adv, axis=1)) / n_adv
# No adversarial training is performed before this cell: the same classifier
# is simply scored on attacked inputs, so the log line says exactly that.
logger.info("Classifier after DynamicDeepFool attack")
logger.info("Accuracy on adversarial samples: %.2f%%", (acc * 100))
adv_results = classifier.model.evaluate(x_test_adv, y_adv, verbose=1)
print("Classifier results after attack: ")
dict(zip(classifier.model.metrics_names, adv_results))

[INFO] Classifier after adversarial training
[INFO] Accuracy on adversarial samples: 12.20%


Classifier results after attack: 


{'loss': 3.938795997619629,
 'tp': 17.0,
 'fp': 264.0,
 'tn': 4236.0,
 'fn': 483.0,
 'categorical_accuracy': 0.122,
 'precision': 0.06049822,
 'recall': 0.034,
 'auc': 0.7303899}

In [30]:
# Second DynamicDeepFool run: wrap the CNN again and fit for another 15 epochs.
# NOTE(review): `model` is not rebuilt before this cell, so this looks like it
# continues training the already-fitted network rather than starting an
# independent repeat -- confirm this is intended.
classifier = KerasClassifier(model=model, clip_values=(min_, max_))
classifier.fit(x_train, y_train, nb_epochs=15, batch_size=128, verbose=0)

[INFO] Inferred 17 hidden layers on Keras classifier.


In [31]:
# Craft adversarial examples for the first 500 test images with the
# dynamic-epsilon variant; the log line names the attack class actually used.
logger.info("Create DynamicDeepFool attack")
adv_crafter = DynamicDeepFool(classifier)
logger.info("Craft attack on test examples")
x_test_adv = adv_crafter.generate(x_test[:500])

[INFO] Create DeepFool attack
[INFO] Craft attack on test examples
DeepFool: 100%|██████████| 500/500 [00:56<00:00,  8.78it/s]
[INFO] DeepFool attack success rate: 98.60%


In [32]:
# Score the classifier on the DynamicDeepFool adversarial examples.
# The label slice is derived from x_test_adv so it always matches the number
# of attacked samples instead of repeating a hard-coded 500.
n_adv = len(x_test_adv)
y_adv = y_test[:n_adv]

preds = np.argmax(classifier.predict(x_test_adv), axis=1)
acc = np.sum(preds == np.argmax(y_adv, axis=1)) / n_adv
# No adversarial training is performed before this cell: the same classifier
# is simply scored on attacked inputs, so the log line says exactly that.
logger.info("Classifier after DynamicDeepFool attack")
logger.info("Accuracy on adversarial samples: %.2f%%", (acc * 100))
adv_results = classifier.model.evaluate(x_test_adv, y_adv, verbose=1)
print("Classifier results after attack: ")
dict(zip(classifier.model.metrics_names, adv_results))

[INFO] Classifier after adversarial training
[INFO] Accuracy on adversarial samples: 11.60%


Classifier results after attack: 


{'loss': 5.571189750671387,
 'tp': 26.0,
 'fp': 334.0,
 'tn': 4166.0,
 'fn': 474.0,
 'categorical_accuracy': 0.116,
 'precision': 0.072222225,
 'recall': 0.052,
 'auc': 0.69014335}

In [33]:
# Third DynamicDeepFool run: wrap the CNN again and fit for another 15 epochs.
# NOTE(review): `model` is not rebuilt before this cell, so this looks like it
# continues training the already-fitted network rather than starting an
# independent repeat -- confirm this is intended.
classifier = KerasClassifier(model=model, clip_values=(min_, max_))
classifier.fit(x_train, y_train, nb_epochs=15, batch_size=128, verbose=0)

[INFO] Inferred 17 hidden layers on Keras classifier.


In [34]:
# Craft adversarial examples for the first 500 test images with the
# dynamic-epsilon variant; the log line names the attack class actually used.
logger.info("Create DynamicDeepFool attack")
adv_crafter = DynamicDeepFool(classifier)
logger.info("Craft attack on test examples")
x_test_adv = adv_crafter.generate(x_test[:500])

[INFO] Create DeepFool attack
[INFO] Craft attack on test examples
DeepFool: 100%|██████████| 500/500 [00:54<00:00,  9.21it/s]
[INFO] DeepFool attack success rate: 98.40%


In [35]:
# Score the classifier on the DynamicDeepFool adversarial examples.
# The label slice is derived from x_test_adv so it always matches the number
# of attacked samples instead of repeating a hard-coded 500.
n_adv = len(x_test_adv)
y_adv = y_test[:n_adv]

preds = np.argmax(classifier.predict(x_test_adv), axis=1)
acc = np.sum(preds == np.argmax(y_adv, axis=1)) / n_adv
# No adversarial training is performed before this cell: the same classifier
# is simply scored on attacked inputs, so the log line says exactly that.
logger.info("Classifier after DynamicDeepFool attack")
logger.info("Accuracy on adversarial samples: %.2f%%", (acc * 100))
adv_results = classifier.model.evaluate(x_test_adv, y_adv, verbose=1)
print("Classifier results after attack: ")
dict(zip(classifier.model.metrics_names, adv_results))

[INFO] Classifier after adversarial training
[INFO] Accuracy on adversarial samples: 13.40%


Classifier results after attack: 


{'loss': 4.564453834533691,
 'tp': 21.0,
 'fp': 313.0,
 'tn': 4187.0,
 'fn': 479.0,
 'categorical_accuracy': 0.134,
 'precision': 0.06287425,
 'recall': 0.042,
 'auc': 0.7094519}