In [1]:
import io
import typing
from urllib.request import urlopen

import matplotlib.pyplot as plt
import numpy as np
import PIL
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

2023-01-12 16:59:42.408490: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Show the installed TensorFlow version string as this cell's output.
tf.__version__

'2.11.0'

In [None]:
class LayerScale(layers.Layer):
    """LayerScale as introduced in CaiT: https://arxiv.org/abs/2103.17239
    ("Going deeper with Image Transformers").

    Multiplies its input element-wise by a trainable vector ``gamma`` of
    length ``projection_dim``; the vector broadcasts against the input's
    trailing axis.

    Args:
        init_values (float): Initial value of every entry of ``gamma``
            (the diagonal of LayerScale's diagonal matrix).
        projection_dim (int): Length of ``gamma``, i.e. the projection
            dimension the layer scales.
    """

    def __init__(self, init_values: float, projection_dim: int, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so `get_config` can report them.
        self.init_values = init_values
        self.projection_dim = projection_dim
        # tf.ones((projection_dim,)) is a 1-D vector of ones with shape
        # (projection_dim,), i.e. [1, 1, ..., 1]; scaling it by `init_values`
        # gives the initial gamma.
        self.gamma = tf.Variable(init_values * tf.ones((projection_dim,)))

    def call(self, x, training=False):
        # `training` is accepted for interface compatibility; it is not used.
        return x * self.gamma

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update(
            {
                "init_values": self.init_values,
                "projection_dim": self.projection_dim,
            }
        )
        return config

In [None]:
class ClassAttention(layers.Layer):
    """Class attention as proposed in CaiT: https://arxiv.org/abs/2103.17239
    ("Going deeper with Image Transformers").

    The query is built only from the first token of the input (``x[:, 0]``,
    the `cls_token` embedding); keys and values are built from every token of
    the input (cls embedding and patch embeddings).

    Args:
        projection_dim (int): Projection dimension of the query, key and
            value used in the attention.
        num_heads      (int): Number of attention heads.
        dropout_rate (float): Dropout rate applied to the attention scores
            and to the final projected output.

    Raises:
        ValueError: If ``projection_dim`` is not divisible by ``num_heads``.
    """

    def __init__(
        self, projection_dim: int, num_heads: int, dropout_rate: float, **kwargs
    ):
        super().__init__(**kwargs)
        # The per-head reshape in `call` needs an exact split of the
        # projected channels; fail early with a clear message instead of
        # inside a reshape op.
        if projection_dim % num_heads != 0:
            raise ValueError(
                f"projection_dim ({projection_dim}) must be divisible by "
                f"num_heads ({num_heads})."
            )

        self.projection_dim = projection_dim
        self.num_heads = num_heads
        self.dropout_rate = dropout_rate

        self.head_dim = projection_dim // num_heads
        self.scale = self.head_dim**-0.5

        self.q = layers.Dense(projection_dim)
        self.k = layers.Dense(projection_dim)
        self.v = layers.Dense(projection_dim)
        self.attn_drop = layers.Dropout(dropout_rate)
        self.proj = layers.Dense(projection_dim)
        self.proj_drop = layers.Dropout(dropout_rate)

    def _split_heads(self, t, batch_size, num_tokens):
        """Reshape (batch, tokens, projection_dim) to (batch, heads, tokens, head_dim)."""
        t = tf.reshape(t, (batch_size, num_tokens, self.num_heads, self.head_dim))
        return tf.transpose(t, perm=[0, 2, 1, 3])

    def call(self, x, training=False):
        """Attend from the first token of ``x`` to all tokens of ``x``.

        Args:
            x: Rank-3 input indexed as (batch, tokens, channels); the first
                token along axis 1 is used as the query.
            training: Forwarded to the dropout layers.

        Returns:
            A tuple ``(x_cls, attn)``: the projected output for the first
            token with shape (batch, 1, projection_dim), and the attention
            weights with shape (batch, num_heads, 1, tokens).
        """
        batch_size = tf.shape(x)[0]
        num_tokens = tf.shape(x)[1]

        # Query projection. The `cls_token` embedding is used as the query.
        # The head split uses the configured projection size (not the input
        # channel count), because that is what the Dense layers output.
        q = self._split_heads(self.q(x[:, 0]), batch_size, 1)
        # Shape: (batch_size, num_heads, 1, dimension_per_head)
        q = q * tf.cast(self.scale, dtype=q.dtype)

        # Key projection. Patch embeddings and the cls embedding are the keys.
        # Shape: (batch_size, num_heads, num_tokens, dimension_per_head)
        k = self._split_heads(self.k(x), batch_size, num_tokens)

        # Value projection. Patch embeddings and the cls embedding are the
        # values.
        v = self._split_heads(self.v(x), batch_size, num_tokens)

        # Attention scores between the cls_token embedding and all tokens.
        attn = tf.matmul(q, k, transpose_b=True)
        attn = tf.nn.softmax(attn, axis=-1)
        attn = self.attn_drop(attn, training=training)

        # Weighted sum of the values, then merge the heads back together.
        x_cls = tf.matmul(attn, v)
        x_cls = tf.transpose(x_cls, perm=[0, 2, 1, 3])
        x_cls = tf.reshape(x_cls, (batch_size, 1, self.projection_dim))
        x_cls = self.proj(x_cls)
        x_cls = self.proj_drop(x_cls, training=training)

        return x_cls, attn

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update(
            {
                "projection_dim": self.projection_dim,
                "num_heads": self.num_heads,
                "dropout_rate": self.dropout_rate,
            }
        )
        return config