In [1]:
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/mnist/", one_hot=True)

Extracting /tmp/mnist/train-images-idx3-ubyte.gz
Extracting /tmp/mnist/train-labels-idx1-ubyte.gz
Extracting /tmp/mnist/t10k-images-idx3-ubyte.gz
Extracting /tmp/mnist/t10k-labels-idx1-ubyte.gz


In [2]:
import os
import tensorflow as tf
import numpy as np

from tqdm import tqdm

from models import modelA, modelB, modelC, modelD
from attacks import fgmt

class AdvTrainer(object):
    
    def __init__(self, sess, mnist, type=0):

        self.mnist = mnist
        self.sess = sess
        
        # Parameters
        self.learning_rate = 1e-4
        self.total_epoch = 10
        self.batch_size = 128
        
        # Network Parameters
        self.n_input = 784
        self.n_classes = 10
        self.n_size = 28
        self.n_channel = 1
        self.dropout = 0.75
        self.naive_scope='naive_{}'.format(type)
        self.adv_scope='adv_{}'.format(type)
        
        
        self.checkpoint_dir = './checkpoints'
        self.save_file_name = 'mnist_cnn_weight_type{}.ckpt'.format(type)
        self.adv_save_file_name = 'iter_fgsm_mnist_cnn_weight_type{}.ckpt'.format(type)
        if not os.path.exists(self.checkpoint_dir):
            os.makedirs(self.checkpoint_dir)
            
        models = [modelA, modelB, modelC, modelD]
        
        self.conv_net = models[type]
        self.pretrained_net = models[type]
        self.build()
        
        self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.naive_scope)
        self.saver = tf.train.Saver(var_list = self.vars)
        self.naive_restore()
        
        self.adv_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.adv_scope)
        self.adv_saver = tf.train.Saver(var_list = self.adv_vars)
        
        print('setting done!')
    
    def naive_restore(self):
        self.saver.restore(self.sess, os.path.join(self.checkpoint_dir, self.save_file_name))
        
        
    def build(self):
        ## Modelling
        # Input, Output
        self.X = tf.placeholder(tf.float32, [None, 784], name='cnn_X')
        self.Y = tf.placeholder(tf.float32, [None, 10], name='cnn_Y')
        self.target_Y = tf.placeholder(tf.float32, [None, 10], name='cnn_Y')
        self.is_training = tf.placeholder(tf.bool, name='cnn_placeholder')
        
        self.X_img = tf.reshape(self.X, (-1, self.n_size, self.n_size, self.n_channel))
        self.trained_pred = self.pretrained_net(self.X_img, is_training=False, scope=self.naive_scope, reuse=False)
        self.pred = self.conv_net(self.X_img, is_training=self.is_training, scope=self.adv_scope, reuse=False)
        
        self.org_cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = self.pred, labels = self.Y))
        self.target_cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = self.pred, labels = self.target_Y))
        
        self.target_grad = tf.gradients(self.target_cost, self.X_img)[0]
        
        self.adv_X_img = fgmt(self.conv_net, self.X_img, epochs=10, eps=0.3)
        self.adv_pred = self.conv_net(self.adv_X_img, is_training=self.is_training, scope=self.adv_scope, reuse=True)
        self.trained_adv_pred = self.pretrained_net(self.adv_X_img, is_training=False, scope=self.naive_scope, reuse=True)
        self.adv_cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = self.adv_pred, labels = self.Y))
        
        self.cost = (self.org_cost + self.adv_cost) / 2
        self.optimizer = tf.train.AdamOptimizer(learning_rate = self.learning_rate).minimize(self.cost)

        correct_pred = tf.equal(tf.argmax(self.pred, 1), tf.argmax(self.Y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
        
        adv_pred = tf.equal(tf.argmax(self.adv_pred, 1), tf.argmax(self.Y, 1))
        self.adv_accuracy = tf.reduce_mean(tf.cast(adv_pred, tf.float32))
        
        trained_adv_pred = tf.equal(tf.argmax(self.trained_adv_pred, 1), tf.argmax(self.Y, 1))
        self.trained_adv_accuracy = tf.reduce_mean(tf.cast(trained_adv_pred, tf.float32))
        
        print('build done!')
    
    def train(self):

        init = tf.global_variables_initializer()
        self.sess.run(init)
        
        total_iter = mnist.train.num_examples // self.batch_size
        
        self.sample_size = 256
        
        step = 1
        for epochs in range(self.total_epoch):
            for ii in tqdm(range(total_iter)):
                batch_x, batch_y = self.mnist.train.next_batch(self.batch_size)
                pred_y = self.get_pred_label(batch_x)
                label_y = (pred_y + 1) % self.n_classes
                target_y = np.zeros((self.batch_size, self.n_classes))
                target_y[np.arange(self.batch_size), label_y] = 1
                self.sess.run(self.optimizer, feed_dict={self.X: batch_x, self.Y: batch_y, self.target_Y: target_y, self.is_training: True})
                
            print("finished!")
            
            test_x, test_y = self.mnist.test.next_batch(self.sample_size)
            test_pred_y = self.get_pred_label(test_x)
            test_label_y = (test_pred_y + 1) % self.n_classes
            test_target_y = np.zeros((self.sample_size, self.n_classes))
            test_target_y[np.arange(self.sample_size), test_label_y] = 1

            print("Testing Accuracy on Original Images:", \
                  self.sess.run(self.accuracy, feed_dict={self.X: test_x,
                                                          self.Y: test_y,
                                                          self.is_training: False}))

            print("Testing Accuracy on Adversarial Images:", \
                  self.sess.run(self.adv_accuracy, feed_dict={self.X: test_x,
                                                              self.Y: test_y,
                                                              self.target_Y: test_target_y,
                                                              self.is_training: False}))
            
            print("Testing Accuracy of original model on Adversarial Images:", \
                  self.sess.run(self.trained_adv_accuracy, feed_dict={self.X: test_x,
                                                              self.Y: test_y,
                                                              self.target_Y: test_target_y,
                                                              self.is_training: False}))
            
        saved_path = self.adv_saver.save(self.sess, os.path.join(self.checkpoint_dir, self.adv_save_file_name))
        print("Model saved in {}".format(saved_path))
    
    def make_adv(self, X, Y):
        num_img = X.shape[0]
        target_y = np.zeros((num_img, self.n_classes))
        target_y[np.arange(num_img), Y] = 1
        return self.sess.run(self.adv_X_img, feed_dict={self.X: X, self.target_Y: target_y, self.is_training: False})
    
    def get_pred_label(self, X):
        return np.argmax(self.sess.run(self.trained_pred, feed_dict={self.X: X, self.is_training: False}), axis=1)
    
    def test(self, X):
        return self.sess.run(self.pred, feed_dict={self.X: X, self.is_training: False})
    
    def naive_test(self, X):
        return self.sess.run(self.trained_pred, feed_dict={self.X: X, self.is_training: False})

In [3]:
os.environ["CUDA_VISIBLE_DEVICES"]="1"
sess = tf.Session()
trainer = AdvTrainer(sess, mnist, type=3)
trainer.train()

build done!
INFO:tensorflow:Restoring parameters from ./checkpoints/mnist_cnn_weight_type3.ckpt


  0%|          | 0/429 [00:00<?, ?it/s]

setting done!


100%|██████████| 429/429 [00:08<00:00, 53.40it/s]
  0%|          | 0/429 [00:00<?, ?it/s]

finished!
Testing Accuracy on Original Images: 0.51953125
Testing Accuracy on Adversarial Images: 0.15234375
Testing Accuracy of original model on Adversarial Images: 0.10546875


100%|██████████| 429/429 [00:07<00:00, 55.63it/s]
  0%|          | 2/429 [00:00<00:29, 14.52it/s]

finished!
Testing Accuracy on Original Images: 0.67578125
Testing Accuracy on Adversarial Images: 0.2421875
Testing Accuracy of original model on Adversarial Images: 0.09375


100%|██████████| 429/429 [00:07<00:00, 57.20it/s]
  1%|          | 3/429 [00:00<00:21, 19.97it/s]

finished!
Testing Accuracy on Original Images: 0.86328125
Testing Accuracy on Adversarial Images: 0.14453125
Testing Accuracy of original model on Adversarial Images: 0.10546875


100%|██████████| 429/429 [00:07<00:00, 55.66it/s]
  1%|          | 3/429 [00:00<00:21, 20.24it/s]

finished!
Testing Accuracy on Original Images: 0.9140625
Testing Accuracy on Adversarial Images: 0.1640625
Testing Accuracy of original model on Adversarial Images: 0.05859375


100%|██████████| 429/429 [00:07<00:00, 59.56it/s]
  0%|          | 0/429 [00:00<?, ?it/s]

finished!
Testing Accuracy on Original Images: 0.91015625
Testing Accuracy on Adversarial Images: 0.19921875
Testing Accuracy of original model on Adversarial Images: 0.09375


100%|██████████| 429/429 [00:07<00:00, 55.98it/s]
  0%|          | 0/429 [00:00<?, ?it/s]

finished!
Testing Accuracy on Original Images: 0.890625
Testing Accuracy on Adversarial Images: 0.19140625
Testing Accuracy of original model on Adversarial Images: 0.12890625


100%|██████████| 429/429 [00:07<00:00, 55.19it/s]
  0%|          | 0/429 [00:00<?, ?it/s]

finished!
Testing Accuracy on Original Images: 0.94921875
Testing Accuracy on Adversarial Images: 0.2265625
Testing Accuracy of original model on Adversarial Images: 0.08203125


100%|██████████| 429/429 [00:07<00:00, 55.36it/s]
  0%|          | 0/429 [00:00<?, ?it/s]

finished!
Testing Accuracy on Original Images: 0.91015625
Testing Accuracy on Adversarial Images: 0.1796875
Testing Accuracy of original model on Adversarial Images: 0.09765625


100%|██████████| 429/429 [00:07<00:00, 55.68it/s]
  1%|▏         | 6/429 [00:00<00:07, 54.77it/s]

finished!
Testing Accuracy on Original Images: 0.93359375
Testing Accuracy on Adversarial Images: 0.19140625
Testing Accuracy of original model on Adversarial Images: 0.1015625


100%|██████████| 429/429 [00:07<00:00, 55.48it/s]


finished!
Testing Accuracy on Original Images: 0.9375
Testing Accuracy on Adversarial Images: 0.2109375
Testing Accuracy of original model on Adversarial Images: 0.09765625
Model saved in ./checkpoints/iter_fgsm_mnist_cnn_weight_type3.ckpt


In [4]:
import matplotlib.pyplot as plt

x = mnist.test.images[0]
plt.imshow(np.reshape(x, (28, 28)))
plt.show()
adv_x = trainer.make_adv(np.array([x]), 1)[0]

plt.imshow(np.reshape(adv_x, (28, 28)))
plt.show()

<matplotlib.figure.Figure at 0x7f1b1dafc048>

<matplotlib.figure.Figure at 0x7f1b12124160>