import cv2
import numpy as np
import tensorflow as tf

from . import layers, vit


def attention_map(model, image):
    """Get an attention map for an image and model using the technique
    described in Appendix D.7 in the paper (unofficial).

    Args:
        model: A ViT model
        image: An image for which we will compute the attention map.
    """
    img_height, img_width = model.input_shape[1], model.input_shape[2]
    # Infer the patch grid size from the token count (minus the class token).
    grid_size = int(np.sqrt(model.layers[5].output_shape[0][-2] - 1))

    # Prepare the input
    X = vit.preprocess_inputs(cv2.resize(image, (img_height, img_width)))[
        np.newaxis, :
    ]  # type: ignore

    # Get the attention weights from each transformer.
    outputs = [
        l.output[1] for l in model.layers if isinstance(l, layers.TransformerBlock)
    ]
    weights = np.array(
        tf.keras.models.Model(inputs=model.inputs, outputs=outputs).predict(X)
    )
    num_layers = weights.shape[0]
    num_heads = weights.shape[2]
    reshaped = weights.reshape(
        (num_layers, num_heads, grid_size**2 + 1, grid_size**2 + 1)
    )

    # From Appendix D.6 in the paper ...
    # Average the attention weights across all heads.
    reshaped = reshaped.mean(axis=1)

    # From Section 3 in https://arxiv.org/pdf/2005.00928.pdf ...
    # To account for residual connections, we add an identity matrix to the
    # attention matrix and re-normalize the weights.
    reshaped = reshaped + np.eye(reshaped.shape[1])
    reshaped = reshaped / reshaped.sum(axis=(1, 2))[:, np.newaxis, np.newaxis]

    # Recursively multiply the weight matrices
    v = reshaped[-1]
    for n in range(1, len(reshaped)):
        v = np.matmul(v, reshaped[-1 - n])

    # Attention from the output token to the input space.
    mask = v[0, 1:].reshape(grid_size, grid_size)
    mask = cv2.resize(mask / mask.max(), (image.shape[1], image.shape[0]))[
        ..., np.newaxis
    ]
    return (mask * image).astype("uint8")
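

# Usage sketch (not part of the original module): this assumes the surrounding
# vit_keras package, whose README builds a pretrained ViT-B/16 via
# `vit.vit_b16` and passes an RGB numpy image to `attention_map`. Run it as a
# module, e.g. `python -m vit_keras.visualize path/to/image.jpg`, so the
# relative imports above resolve.
if __name__ == "__main__":
    import sys

    # Build a pretrained ViT-B/16 classifier; `image_size` fixes the input
    # resolution that `attention_map` reads back from `model.input_shape`.
    vit_model = vit.vit_b16(
        image_size=384,
        activation="sigmoid",
        pretrained=True,
        include_top=True,
        pretrained_top=True,
    )
    # Read the image given on the command line and convert BGR -> RGB.
    input_image = cv2.cvtColor(cv2.imread(sys.argv[1]), cv2.COLOR_BGR2RGB)
    overlay = attention_map(model=vit_model, image=input_image)
    # Save the attention-weighted overlay to the working directory.
    cv2.imwrite("attention_map.jpg", cv2.cvtColor(overlay, cv2.COLOR_RGB2BGR))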