In [1]:
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from collections import OrderedDict

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.utils import to_categorical

import sys
import os
sys.path.append('../libs')
import model_zoo as zoo
import measure
from adversarial import AdvTraining

import foolbox
from foolbox.models import KerasModel
from foolbox.attacks import LBFGSAttack, FGSM, CarliniWagnerL2Attack, DeepFoolAttack, SaliencyMapAttack

Using TensorFlow backend.


In [2]:
def _to_model_input(images):
    """Return `images` reshaped to (-1, 28, 28, 1) and divided by 255.

    The trailing axis of size 1 matches the `input_shape=(28, 28, 1)` the CNN in this
    notebook is built with; the division presumably maps 8-bit pixel values into [0, 1].
    """
    return images.reshape(-1, 28, 28, 1) / 255.


# Fetch the MNIST train/test split, then convert both image arrays to model input.
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = _to_model_input(x_train)
x_test = _to_model_input(x_test)

## Adversarial Training用の学習モデル

In [3]:
# Where the CNN under attack is stored: written after training, read back on later runs.
model_dir = Path("..", "model_dir")
model_dir.mkdir(exist_ok=True)
model_path = model_dir / "target_cnn.h5"

if not model_path.exists():
    # No checkpoint yet: build the CNN, train it on MNIST, and save it for next time.
    model = zoo.CNN(input_shape=(28, 28, 1))  # sized for MNIST
    model.compile(optimizer="Adam", loss="sparse_categorical_crossentropy", metrics=["acc"])

    # Training curves are written for TensorBoard under ../logs/cnn.
    log_dir = Path('..', 'logs', 'cnn')
    tb = TensorBoard(log_dir=str(log_dir))

    model.fit(x=x_train, y=y_train, epochs=5, validation_data=(x_test, y_test), callbacks=[tb])
    model.save(str(model_path))
else:
    # A trained CNN is already on disk: reuse it instead of retraining.
    model = load_model(str(model_path))

Instructions for updating:
Colocations handled automatically by placer.


## Adversarial Training

In [4]:
import warnings

# NOTE: this silences every Python warning for the rest of the kernel session,
# not only for this cell.
warnings.filterwarnings('ignore')

# AdvTraining trains the Keras model object it is given, so passing `model` directly
# overwrites the baseline weights in memory and makes this cell non-idempotent
# (a re-run would continue from already adversarially-trained weights).
# Train an independent copy loaded from the checkpoint instead: `model` stays the
# pre-adversarial-training baseline and every run of this cell starts from the same weights.
adv_base_model = load_model(str(model_path))

# FGSM generates the adversarial examples; Misclassification is the attack success criterion.
adv_tr = AdvTraining(adv_base_model, foolbox.attacks.FGSM, foolbox.criteria.Misclassification())
adv_tr.fit(x_train, y_train, epochs=10, validation_data=(x_test, y_test), batch_size=32, verbose=2)


1 / 10 epochs
Train on 60030 samples, validate on 10000 samples
 - 25s - loss: 0.0356 - acc: 0.9902 - val_loss: 0.0287 - val_acc: 0.9913

2 / 10 epochs
Train on 60058 samples, validate on 10000 samples
 - 25s - loss: 0.0148 - acc: 0.9958 - val_loss: 0.0339 - val_acc: 0.9914

3 / 10 epochs
Train on 60089 samples, validate on 10000 samples
 - 25s - loss: 0.0167 - acc: 0.9955 - val_loss: 0.0284 - val_acc: 0.9921

4 / 10 epochs
Train on 60120 samples, validate on 10000 samples
 - 25s - loss: 0.0122 - acc: 0.9963 - val_loss: 0.0338 - val_acc: 0.9917

5 / 10 epochs
Train on 60151 samples, validate on 10000 samples
 - 24s - loss: 0.0124 - acc: 0.9965 - val_loss: 0.0322 - val_acc: 0.9918

6 / 10 epochs
Train on 60182 samples, validate on 10000 samples
 - 24s - loss: 0.0111 - acc: 0.9969 - val_loss: 0.0270 - val_acc: 0.9928

7 / 10 epochs
Train on 60212 samples, validate on 10000 samples
 - 24s - loss: 0.0100 - acc: 0.9969 - val_loss: 0.0390 - val_acc: 0.9918

8 / 10 epochs
Train on 60241 samp

## Adversarial TrainingによってAdversarial Examplesへの耐性がついたのか評価

### もう一度modelをファイルから読み込み
`AdvTraining`に渡したモデルは、Adversarial Trainingによってそのまま更新される（Pythonではオブジェクトがコピーではなく参照として渡されるためと思われる）。Adversarial Training前のモデルと比較できるように、保存しておいた学習済みモデルをファイルからもう一度読み込む。

In [5]:
# Load the checkpoint that was saved before adversarial training, so that `model`
# is guaranteed to hold the baseline weights for the comparison below.
model = load_model(str(model_path))

### 各結果

In [9]:
def report_accuracy(title, keras_model, images, labels):
    """Print `title`, then the accuracy (%) and loss of `keras_model` on the given data.

    Assumes `keras_model.evaluate` returns a `(loss, accuracy)` pair, as it does for
    the models in this notebook (compiled with one loss and the single metric "acc").

    Returns:
        The `(loss, acc)` pair reported by `evaluate`.
    """
    print(title)
    loss, acc = keras_model.evaluate(images, labels, verbose=0)
    # np.round accepts both Python floats and NumPy scalars; calling `.round()` on the
    # value itself fails when `evaluate` returns a plain float.
    print(f'精度：{np.round(acc*100, 3)} % , 損失：{np.round(loss, 3)}')
    return loss, acc


# Pre-generated adversarial examples for the test set and the true labels of their
# source images. Each key access on the .npz archive re-reads that array from disk,
# so pull both arrays out once and reuse them.
advs = np.load('../data/adv_data.npz')
adv_images, adv_labels = advs['adv_img'], advs['orig_label']

# Baseline: the CNN as it was before adversarial training.
print('[Adversarial Training前]')
report_accuracy('テストセット', model, x_test, y_test)
report_accuracy('テストセットのAdversarial Examples', model, adv_images, adv_labels)

print('')

# The adversarially trained model held by the AdvTraining wrapper.
print('[Adversarial Training後]')
report_accuracy('テストセット', adv_tr.kmodel, x_test, y_test)
# Keep `loss` / `acc` bound to the last evaluation, as the original cell did.
loss, acc = report_accuracy('テストセットのAdversarial Examples', adv_tr.kmodel, adv_images, adv_labels)

[Adversarial Training前]
テストセット
精度：99.13 % , 損失：0.031
テストセットのAdversarial Examples
精度：0.0 % , 損失：1.114

[Adversarial Training後]
テストセット
精度：99.13 % , 損失：0.045
テストセットのAdversarial Examples
精度：89.748 % , 損失：0.463
