In [1]:
import logging
import os
import sys

import keras
import keras.backend as K
import numpy as np
import tensorflow as tf
from keras.callbacks import ModelCheckpoint
from keras.initializers import Constant
from keras.layers import (Activation, Conv2D, Conv3D, Dense, Dropout, Flatten,
                          Input, Lambda, MaxPooling2D)
from keras.models import Model
from keras.optimizers import SGD, Adam, RMSprop
from keras.utils import to_categorical
from matplotlib import pyplot as plt
from PIL import Image
from scipy import ndimage
from skimage.restoration import (denoise_bilateral, denoise_tv_chambolle,
                                 denoise_wavelet, estimate_sigma)
from sklearn.model_selection import StratifiedShuffleSplit
from vis.regularizers import LPNorm, TotalVariation
from vis.utils import utils
from vis.visualization import (visualize_activation,
                               visualize_activation_with_losses)
from tensorflow.contrib import graph_editor as ge
from cleverhans.attacks import (FastGradientMethod, MadryEtAl,
                                ProjectedGradientDescent, SparseL1Descent)
from cleverhans.model import Model as CHModel

sys.path.append('..')

from attribution.InfluenceInvariants import InfluenceInvariants
from attribution.ActivationInvariants import ActivationInvariants
from attribution.invariant_utils import (probits_from_invariants,
                                         smooth_logits_from_invariants,
                                         smooth_logit_tensor_from_invariants,
                                         smooth_logit_tensor_from_invariant,
                                         smooth_probits_from_invariants,
                                         smooth_probit_tensor_from_invariants,
                                         tally_total_stats)

# Route messages issued through the `warnings` module into the logging system.
logging.captureWarnings(True)
# Only CRITICAL records from the 'tensorflow' and 'cleverhans' loggers are let through.
logging.getLogger('tensorflow').setLevel(logging.CRITICAL)
logging.getLogger('cleverhans').setLevel(logging.CRITICAL)

Using TensorFlow backend.


In [2]:
# Use the channels-last layout for image tensors (the model summary shows (None, 64, 64, 3)).
K.set_image_data_format('channels_last')

In [3]:
from sklearn.datasets import fetch_olivetti_faces, fetch_lfw_people

# Keep only people with at least 100 images (five classes in LFW, per the
# original note) and crop every image with a fixed slice.
lfw_slice = (slice(68, 196, None), slice(61, 190, None))
faces_data = fetch_lfw_people(min_faces_per_person=100, color=True, slice_=lfw_slice)
images = faces_data.images
n_classes = faces_data.target.max() + 1
x = faces_data.data
y = to_categorical(faces_data.target, n_classes)
# Deliberately in place: `images` is the same array object as
# `faces_data.images`, so that array is rescaled too.
# NOTE(review): if `faces_data.data` shares its buffer with `images`, `x` is
# rescaled by this line as well -- confirm against the installed scikit-learn.
images /= 255.0

# The leading 75% of the samples (in dataset order, no shuffling) form the
# training split; the remainder is the test split.
N_tr = int(0.75 * len(x))
N_te = len(x) - N_tr
x_tr = x[:N_tr]
y_tr = y[:N_tr]
x_te = x[N_tr:]
y_te = y[N_tr:]
im_tr = images[:N_tr]
im_te = images[N_tr:]

In [4]:
# Small CNN classifier: four (3x3 conv + ReLU, 2x2 max-pool) stages followed by
# a 16-unit dense layer, a linear 'logits' layer and a separate softmax layer.
inp = Input(shape=im_tr[0].shape, name='features')
out = Conv2D(128, (3, 3), activation='relu')(inp)
out = MaxPooling2D(pool_size=(2, 2))(out)
out = Conv2D(64, (3, 3), activation='relu')(out)
out = MaxPooling2D(pool_size=(2, 2))(out)
out = Conv2D(32, (3, 3), activation='relu')(out)
out = MaxPooling2D(pool_size=(2, 2))(out)
out = Conv2D(16, (3, 3), activation='relu')(out)
out = MaxPooling2D(pool_size=(2, 2))(out)
out = Flatten()(out)
out = Dense(16, activation='relu')(out)
# One output unit per one-hot label column; softmax is applied by its own
# named layer so the pre-softmax layer can be addressed as model.layers[-2].
out = Dense(y[0].shape[0], name='logits')(out)
out = Activation('softmax', name='softmax')(out)

model = Model(inp, out)
model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['acc'])
model.summary()

# Weights come from a file on disk; no training happens in this notebook.
model.load_weights('weights/lfw-small-tf.h5')

# evaluate() returns [loss, acc]; index 1 is the accuracy metric.
print('accuracy:')
print('train={:.2}'.format(model.evaluate(im_tr, y_tr, verbose=False)[1]))
print('test={:.2}'.format(model.evaluate(im_te, y_te, verbose=False)[1]))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
features (InputLayer)        (None, 64, 64, 3)         0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 62, 62, 128)       3584      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 31, 31, 128)       0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 29, 29, 64)        73792     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 14, 14, 64)        0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 12, 12, 32)        18464     
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 6, 6, 32)          0         
__________

The method below, `get_random_invariants`, does the following:
1. Samples $n$ uniform-random points from $[0,1]$ into $x$
2. For each class label $l$, runs targeted projected gradient descent (`MadryEtAl` from `cleverhans`) over $x$, once with the $L_2$ norm and once with the $L_\infty$ norm, to create approximately $2n$ points with label $l$, stored in $x_l$
3. Concatenates the labeled random points $x_l$ of all class labels $l$ into $x'$
4. For each layer $t$ specified in `layers`, generates a set of invariants using $x'$

Importantly, for influence invariants we exclude activation information (i.e., `multiply_activation=False` in the invariant constructor). The reason for this will become clear later.

In [10]:
class KerasModel(CHModel):
    """cleverhans model wrapper that exposes every layer output of a Keras model."""

    def __init__(self, model, **kwargs):
        """Wrap `model`; any extra keyword arguments are accepted and dropped."""
        del kwargs
        # locals() is taken right after dropping kwargs, so it holds only
        # `self` and `model`; do not introduce new locals above this line.
        CHModel.__init__(self, 'model_b', model.output_shape[1], locals())

        self.model = model
        # Call fprop once on a float32 placeholder with a leading dimension of
        # 128; the returned dict is discarded.
        self.fprop(tf.placeholder(tf.float32, (128,) + model.input_shape[1:]))

    def fprop(self, x, **kwargs):
        """Return a dict mapping each layer name to that layer's output for `x`.

        A NumPy array is first wrapped in a backend variable. For every layer
        of the wrapped model, a sub-model from the model input to that layer's
        output is built and applied to `x`.
        """
        del kwargs
        if isinstance(x, np.ndarray):
            x = K.variable(x)
        outputs_by_name = {}
        with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE):
            for layer in self.model.layers:
                layer_name = layer.name
                up_to_layer = keras.Model(self.model.input, layer.output)
                outputs_by_name[layer_name] = up_to_layer(x)
        return outputs_by_name

def get_random_invariants(model, layers, n, x_shape, n_classes, nbiter=50, eps_linf=0.1, eps_l2=5., agg_fn=None, seed=0):
    """Mine influence invariants from class-targeted perturbations of random inputs.

    ``n`` points of shape ``x_shape`` are drawn with ``np.random.uniform``
    (default range). For every class label the same points are pushed toward
    that label with a targeted ``MadryEtAl`` attack, once under an L2 budget
    and once under an L-infinity budget. The results for all classes are
    concatenated (``2 * n * n_classes`` points) and fed to one
    ``InfluenceInvariants`` generator per entry of ``layers``.

    Args:
        model: Keras model to attack. The invariant generators receive a copy
            truncated at ``model.layers[-2]`` whose last activation is
            replaced by softplus.
        layers: Iterable of layer identifiers; each one is passed as
            ``layer=`` to ``InfluenceInvariants``.
        n: Number of random seed points.
        x_shape: Shape tuple of a single input.
        n_classes: Number of class labels to target (``0 .. n_classes - 1``).
        nbiter: Number of attack iterations. The per-step size is
            ``eps / nbiter``. This argument is honoured; it used to be
            overwritten by a hard-coded 50 inside the function.
        eps_linf: Perturbation budget for the L-infinity attack.
        eps_l2: Perturbation budget for the L2 attack.
        agg_fn: Forwarded unchanged to ``InfluenceInvariants``.
        seed: Seed for NumPy's global random generator.

    Returns:
        Tuple ``(invs_by_layer, gens)``: the result of ``get_invariants`` for
        each layer, and the compiled generators, both in ``layers`` order.
    """
    np.random.seed(seed)
    rand_x = np.random.uniform(size=(n,) + x_shape)

    # The cleverhans wrapper, the session and the attack object do not depend
    # on the target class, so they are built once instead of once per class.
    model_ch = KerasModel(model)
    sess = K.get_session()
    pgd = MadryEtAl(model_ch, sess=sess)

    def pgd_params(eps, norm, y_target):
        # Shared attack configuration; only the budget, the norm and the
        # target label differ between the calls below.
        return {'eps': eps,
                'y_target': y_target,
                'eps_iter': eps / nbiter,
                'nb_iter': nbiter,
                'ord': norm,
                'clip_min': 0.,
                'clip_max': 1.}

    rand_data = []
    for l in range(n_classes):
        # A single one-hot row for class `l`, used as the target for all points.
        y_target = to_categorical(np.zeros((1,)) + l, n_classes)
        cur_data = [
            pgd.generate_np(rand_x, **pgd_params(eps_l2, 2, y_target)),
            pgd.generate_np(rand_x, **pgd_params(eps_linf, np.inf, y_target)),
        ]
        rand_data.append(np.concatenate(cur_data, axis=0))
        print('generated class', l)

    rand_data = np.concatenate(rand_data)

    # Truncate the model at its second-to-last layer and swap that layer's
    # activation for softplus; apply_modifications returns the model with the
    # change applied.
    # NOTE(review): the layer object is shared with `model`, so the
    # `activation` attribute of `model.layers[-2]` is changed as well --
    # confirm this side effect is harmless for later cells.
    log_model = Model(model.inputs, model.layers[-2].output)
    log_model.layers[-1].activation = keras.activations.softplus
    log_model = utils.apply_modifications(log_model)

    gens = [InfluenceInvariants(log_model, layer=target, agg_fn=agg_fn,
                                multiply_activation=False).compile() for target in layers]
    invs_by_layer = [gen.get_invariants(rand_data, batch_size=1) for gen in gens]

    return invs_by_layer, gens

# Influence invariants

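We'll start with `agg_fn=None`, so the invariants refer to specific locations on the feature maps. We'll test layers 7 and 8 of the model, i.e. the last convolutional layer (`conv2d_4`) and the max-pooling layer that follows it (`max_pooling2d_4`).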

In [15]:
# Layer indices handed to the invariant generators; in this model index 7 is
# conv2d_4 and index 8 is max_pooling2d_4.
layers = [7,8]

In [11]:
# 256 random seed points, 5 target classes, no aggregation function.
invs_by_layer, gens = get_random_invariants(
    model,
    layers,
    n=256,
    x_shape=im_tr[0].shape,
    n_classes=5,
    agg_fn=None,
)

generated class 0
generated class 1
generated class 2
generated class 3
generated class 4


In [12]:
# Number of invariants found for the first and the second entry of `layers`.
len(invs_by_layer[0]), len(invs_by_layer[1])

(16, 13)

In [13]:
# Show the first invariant of each of the two layers as an example.
print(invs_by_layer[0][0])
print(invs_by_layer[1][0])

(conv2d_4[133] > 0.0 &
 conv2d_4[43] > 0.0 &
 conv2d_4[1] > 0.0)
	--> Q = 4
support=0.0148, precision=1.0
(max_pooling2d_4[42] > 0.0 &
 max_pooling2d_4[43] > 0.0 &
 max_pooling2d_4[40] > 0.0 &
 max_pooling2d_4[18] > 0.0)
	--> Q = 0
support=0.00193, precision=1.0


In [14]:
# Rebind x_tr / x_te to the image arrays (they previously held slices of
# faces_data.data).
x_tr = im_tr
x_te = im_te
inv_models = list(map(probits_from_invariants, invs_by_layer))
# Header text and data for each split, reported in this order for every layer.
splits = (('training set results, layer', x_tr, y_tr),
          ('test set results, layer', x_te, y_te))
for l, layer in enumerate(layers):
    invs, inv_model = invs_by_layer[l], inv_models[l]
    for header, x_split, y_split in splits:
        print('-' * 10, header, layer)
        n_per, sup, prec = tally_total_stats(invs, model, x_split, batch_size=1)
        print('# invariants per class:', n_per)
        print('support by class:', sup)
        print('precision by class:', prec)
        # Accuracy of the invariant-based predictor against the one-hot labels.
        predicted = inv_model(x_split).argmax(axis=1)
        expected = y_split.argmax(axis=1)
        print('overall prediction accuracy: {:.3}'.format((predicted == expected).mean()))

---------- training set results, layer 7
# invariants per class: {0: 4, 1: 5, 2: 4, 3: 1, 4: 2}
support by class: {0: 0.9836956521739131, 1: 0.9148936170212766, 2: 0.9974811083123426, 3: 0.8888888888888888, 4: 0.8585858585858586}
precision by class: {0: 0.8916256157635468, 1: 1.0, 2: 0.9974811083123426, 3: 0.972972972972973, 4: 0.8947368421052632}
overall prediction accuracy: 0.959
---------- test set results, layer 7
# invariants per class: {0: 4, 1: 5, 2: 4, 3: 1, 4: 2}
support by class: {0: 1.0, 1: 0.9142857142857143, 2: 1.0, 3: 0.9705882352941176, 4: 0.8787878787878788}
precision by class: {0: 0.8727272727272727, 1: 1.0, 2: 1.0, 3: 1.0, 4: 0.9666666666666667}
overall prediction accuracy: 0.804
---------- training set results, layer 8
# invariants per class: {0: 3, 1: 3, 2: 4, 3: 2, 4: 1}
support by class: {0: 1.0, 1: 0.9893617021276596, 2: 0.9924433249370277, 3: 0.7530864197530864, 4: 1.0}
precision by class: {0: 0.9945945945945946, 1: 0.9893617021276596, 2: 1.0, 3: 1.0, 4: 0.83193

#### Summary: random influence invariants at layers 7, 8 achieve 95-97% training accuracy, and 80% test accuracy. The original model achieved 100% training accuracy and 81% test accuracy.

Now we'll generate influence invariants with `agg_fn=K.sum`

In [16]:
# 256 random seed points, 5 target classes, aggregating with K.sum.
invs_by_layer, gens = get_random_invariants(
    model,
    layers,
    n=256,
    x_shape=im_tr[0].shape,
    n_classes=5,
    agg_fn=K.sum,
)

generated class 0
generated class 1
generated class 2
generated class 3
generated class 4


In [17]:
# Rebind x_tr / x_te to the image arrays (they previously held slices of
# faces_data.data).
x_tr = im_tr
x_te = im_te
inv_models = list(map(probits_from_invariants, invs_by_layer))
# Header text and data for each split, reported in this order for every layer.
splits = (('training set results, layer', x_tr, y_tr),
          ('test set results, layer', x_te, y_te))
for l, layer in enumerate(layers):
    invs, inv_model = invs_by_layer[l], inv_models[l]
    for header, x_split, y_split in splits:
        print('-' * 10, header, layer)
        n_per, sup, prec = tally_total_stats(invs, model, x_split, batch_size=1)
        print('# invariants per class:', n_per)
        print('support by class:', sup)
        print('precision by class:', prec)
        # Accuracy of the invariant-based predictor against the one-hot labels.
        predicted = inv_model(x_split).argmax(axis=1)
        expected = y_split.argmax(axis=1)
        print('overall prediction accuracy: {:.3}'.format((predicted == expected).mean()))

---------- training set results, layer 7
# invariants per class: {0: 3, 1: 3, 2: 4, 3: 6, 4: 3}
support by class: {0: 0.717391304347826, 1: 0.7021276595744681, 2: 0.7531486146095718, 3: 0.9382716049382716, 4: 0.9393939393939394}
precision by class: {0: 0.9850746268656716, 1: 0.9705882352941176, 2: 0.9835526315789473, 3: 0.39790575916230364, 4: 1.0}
overall prediction accuracy: 0.836
---------- test set results, layer 7
# invariants per class: {0: 3, 1: 3, 2: 4, 3: 6, 4: 3}
support by class: {0: 0.7291666666666666, 1: 0.7142857142857143, 2: 0.6814814814814815, 3: 1.0, 4: 1.0}
precision by class: {0: 1.0, 1: 1.0, 2: 0.989247311827957, 3: 0.4, 4: 1.0}
overall prediction accuracy: 0.667
---------- training set results, layer 8
# invariants per class: {0: 3, 1: 3, 2: 4, 3: 6, 4: 3}
support by class: {0: 0.717391304347826, 1: 0.7021276595744681, 2: 0.7531486146095718, 3: 0.9382716049382716, 4: 0.7878787878787878}
precision by class: {0: 0.9850746268656716, 1: 0.9705882352941176, 2: 0.9835526

It seems that aggregating across spatial dimensions does not improve things. This is somewhat surprising especially in the lower recall results, as one might expect spatial dependencies to narrow the scope of an invariant.

# Activation invariants

Now we'll generate activation invariants from random data instead of influence.

In [20]:
def get_random_invariants(model, layers, n, x_shape, n_classes, nbiter=50, eps_linf=0.1, eps_l2=5., agg_fn=None, seed=0):
    """Mine activation invariants from class-targeted perturbations of random inputs.

    NOTE: this definition reuses the name of the influence-based
    ``get_random_invariants`` defined earlier in the notebook and replaces it
    once this cell has run; re-running earlier cells afterwards will use this
    activation-based version.

    ``n`` points of shape ``x_shape`` are drawn with ``np.random.uniform``
    (default range). For every class label the same points are pushed toward
    that label with a targeted ``MadryEtAl`` attack, once under an L2 budget
    and once under an L-infinity budget. The results for all classes are
    concatenated (``2 * n * n_classes`` points) and fed to one
    ``ActivationInvariants`` generator per entry of ``layers``.

    Args:
        model: Keras model to attack. The invariant generators receive a copy
            truncated at ``model.layers[-2]`` whose last activation is
            replaced by softplus.
        layers: Iterable of layer identifiers; each one is passed as the
            single-element list ``layers=[target]`` to ``ActivationInvariants``.
        n: Number of random seed points.
        x_shape: Shape tuple of a single input.
        n_classes: Number of class labels to target (``0 .. n_classes - 1``).
        nbiter: Number of attack iterations. The per-step size is
            ``eps / nbiter``. This argument is honoured; it used to be
            overwritten by a hard-coded 50 inside the function.
        eps_linf: Perturbation budget for the L-infinity attack.
        eps_l2: Perturbation budget for the L2 attack.
        agg_fn: Forwarded unchanged to ``ActivationInvariants``.
        seed: Seed for NumPy's global random generator.

    Returns:
        Tuple ``(invs_by_layer, gens)``: the result of ``get_invariants`` for
        each layer, and the compiled generators, both in ``layers`` order.
    """
    np.random.seed(seed)
    rand_x = np.random.uniform(size=(n,) + x_shape)

    # The cleverhans wrapper, the session and the attack object do not depend
    # on the target class, so they are built once instead of once per class.
    model_ch = KerasModel(model)
    sess = K.get_session()
    pgd = MadryEtAl(model_ch, sess=sess)

    def pgd_params(eps, norm, y_target):
        # Shared attack configuration; only the budget, the norm and the
        # target label differ between the calls below.
        return {'eps': eps,
                'y_target': y_target,
                'eps_iter': eps / nbiter,
                'nb_iter': nbiter,
                'ord': norm,
                'clip_min': 0.,
                'clip_max': 1.}

    rand_data = []
    for l in range(n_classes):
        # A single one-hot row for class `l`, used as the target for all points.
        y_target = to_categorical(np.zeros((1,)) + l, n_classes)
        cur_data = [
            pgd.generate_np(rand_x, **pgd_params(eps_l2, 2, y_target)),
            pgd.generate_np(rand_x, **pgd_params(eps_linf, np.inf, y_target)),
        ]
        rand_data.append(np.concatenate(cur_data, axis=0))
        print('generated class', l)

    rand_data = np.concatenate(rand_data)

    # Truncate the model at its second-to-last layer and swap that layer's
    # activation for softplus; apply_modifications returns the model with the
    # change applied.
    # NOTE(review): the layer object is shared with `model`, so the
    # `activation` attribute of `model.layers[-2]` is changed as well --
    # confirm this side effect is harmless for later cells.
    log_model = Model(model.inputs, model.layers[-2].output)
    log_model.layers[-1].activation = keras.activations.softplus
    log_model = utils.apply_modifications(log_model)

    gens = [ActivationInvariants(log_model, layers=[target], agg_fn=agg_fn).compile() for target in layers]
    invs_by_layer = [gen.get_invariants(rand_data, batch_size=1) for gen in gens]

    return invs_by_layer, gens

In [22]:
# 256 random seed points, 5 target classes, no aggregation function.
invs_by_layer, gens = get_random_invariants(
    model,
    layers,
    n=256,
    x_shape=im_tr[0].shape,
    n_classes=5,
    agg_fn=None,
)

generated class 0
generated class 1
generated class 2
generated class 3
generated class 4


In [23]:
# Rebind x_tr / x_te to the image arrays (they previously held slices of
# faces_data.data).
x_tr = im_tr
x_te = im_te
inv_models = list(map(probits_from_invariants, invs_by_layer))
# Header text and data for each split, reported in this order for every layer.
splits = (('training set results, layer', x_tr, y_tr),
          ('test set results, layer', x_te, y_te))
for l, layer in enumerate(layers):
    invs, inv_model = invs_by_layer[l], inv_models[l]
    for header, x_split, y_split in splits:
        print('-' * 10, header, layer)
        n_per, sup, prec = tally_total_stats(invs, model, x_split, batch_size=1)
        print('# invariants per class:', n_per)
        print('support by class:', sup)
        print('precision by class:', prec)
        # Accuracy of the invariant-based predictor against the one-hot labels.
        predicted = inv_model(x_split).argmax(axis=1)
        expected = y_split.argmax(axis=1)
        print('overall prediction accuracy: {:.3}'.format((predicted == expected).mean()))

---------- training set results, layer 7
# invariants per class: {0: 96, 1: 98, 2: 75, 3: 57, 4: 99}
support by class: {0: 0.09239130434782608, 1: 0.20212765957446807, 2: 0.5113350125944585, 3: 0.14814814814814814, 4: 0.42424242424242425}
precision by class: {0: 0.3148148148148148, 1: 0.2087912087912088, 2: 0.5152284263959391, 3: 0.13636363636363635, 4: 0.18421052631578946}
overall prediction accuracy: 0.343
---------- test set results, layer 7
# invariants per class: {0: 96, 1: 98, 2: 75, 3: 57, 4: 99}
support by class: {0: 0.0625, 1: 0.11428571428571428, 2: 0.4, 3: 0.058823529411764705, 4: 0.3939393939393939}
precision by class: {0: 0.17647058823529413, 1: 0.12903225806451613, 2: 0.42857142857142855, 3: 0.06451612903225806, 4: 0.1625}
overall prediction accuracy: 0.281
---------- training set results, layer 8
# invariants per class: {0: 10, 1: 0, 2: 10, 3: 16, 4: 5}
support by class: {0: 0.09239130434782608, 1: 0, 2: 0.04785894206549118, 3: 0.012345679012345678, 4: 0.0202020202020202

As these results indicate, the property does not hold for activation invariants.

As one final experiment, we'll try aggregating across spatial dimensions with activations.

In [None]:
# 256 random seed points, 5 target classes, aggregating with K.sum.
invs_by_layer, gens = get_random_invariants(
    model,
    layers,
    n=256,
    x_shape=im_tr[0].shape,
    n_classes=5,
    agg_fn=K.sum,
)

generated class 0
generated class 1
generated class 2
generated class 3
generated class 4


In [None]:
# Rebind x_tr / x_te to the image arrays (they previously held slices of
# faces_data.data).
x_tr = im_tr
x_te = im_te
inv_models = list(map(probits_from_invariants, invs_by_layer))
# Header text and data for each split, reported in this order for every layer.
splits = (('training set results, layer', x_tr, y_tr),
          ('test set results, layer', x_te, y_te))
for l, layer in enumerate(layers):
    invs, inv_model = invs_by_layer[l], inv_models[l]
    for header, x_split, y_split in splits:
        print('-' * 10, header, layer)
        n_per, sup, prec = tally_total_stats(invs, model, x_split, batch_size=1)
        print('# invariants per class:', n_per)
        print('support by class:', sup)
        print('precision by class:', prec)
        # Accuracy of the invariant-based predictor against the one-hot labels.
        predicted = inv_model(x_split).argmax(axis=1)
        expected = y_split.argmax(axis=1)
        print('overall prediction accuracy: {:.3}'.format((predicted == expected).mean()))