In [1]:
!pip install einops

Collecting einops
  Downloading https://files.pythonhosted.org/packages/5d/a0/9935e030634bf60ecd572c775f64ace82ceddf2f504a5fd3902438f07090/einops-0.3.0-py2.py3-none-any.whl
Installing collected packages: einops
Successfully installed einops-0.3.0


In [2]:
#구현하는 모델에서 쓰이는 모든 activation함수는 정의하여 드린 GELU 함수를 사용해야함.
#MultiHeadAttention에서 Head로 나눌때, 이미지를 patch로자른후 sequence로 만들때 Rearrange함수를 사용하면 편리함.(사용하지 않으셔도 됩니다)
#CIFAR10에 대한 test accuracy가 60프로 이상인 ViT모델을 만드시오.
import tensorflow as tf
from einops.layers.tensorflow import Rearrange
from tensorflow.keras.activations import gelu
def GELU(x):
    """Apply the Keras ``gelu`` activation to ``x`` and return the result."""
    return gelu(x)

In [3]:
#논문[1]에서 설명하는 MultiHeadAttention을 만들어라.
class MultiHeadedAttention(tf.keras.Model):
    """Multi-head scaled dot-product self-attention.

    A single bias-free Dense layer produces queries, keys and values in one
    projection. They are split into ``heads`` heads, attended independently,
    merged back together and passed through an output Dense layer of size
    ``dimension``.

    Args:
        dimension: model dimension (the dimension after passing through MHA).
            Must be divisible by ``heads``.
        heads: number of attention heads.

    Raises:
        ValueError: if ``dimension`` is not divisible by ``heads``.
    """
    def __init__(self, dimension, heads=8):
        super(MultiHeadedAttention, self).__init__()
        ############Write your code Here############
        # The head split below needs an integral per-head size; fail early
        # with a clear message instead of inside the rearrange at call time.
        if dimension % heads != 0:
            raise ValueError(
                'dimension ({}) must be divisible by heads ({})'.format(dimension, heads))
        self.heads = heads
        # Scale attention logits by 1/sqrt(d_k), where d_k is the PER-HEAD
        # key size (dimension // heads), not the full model dimension.
        self.scale = (dimension // heads) ** -0.5

        # Joint query/key/value projection => dk + dk + dv (dk == dv)
        self.multihead_in = tf.keras.layers.Dense(dimension * 3, use_bias=False)
        # Output projection back to the model dimension
        self.multihead_out = tf.keras.layers.Dense(dimension)

        # b = batch size, n = sequence length, qkv = query/key/value selector,
        # h = heads, d = per-head size
        self.rearrange_attention = Rearrange(
            'b n (qkv h d) -> qkv b h n d', qkv=3, h=self.heads)
        # Merge the heads back into a single feature axis
        self.rearrange_output = Rearrange('b h n d -> b n (h d)')
        ############################################
    def call(self, inputs):
        """Run self-attention over ``inputs`` and return the projected result."""
        ############Write your code Here############
        qkv = self.multihead_in(inputs)
        qkv = self.rearrange_attention(qkv)

        # Leading axis of the rearranged tensor selects query / key / value
        query, key, value = qkv[0], qkv[1], qkv[2]

        # Pairwise query-key similarity per head, scaled by 1/sqrt(d_k)
        dot_product = tf.einsum('bhid,bhjd->bhij', query, key) * self.scale
        # Normalize over the key axis so each query's weights sum to 1
        attention = tf.nn.softmax(dot_product, axis=-1)
        # Attention-weighted sum of the values
        output = tf.einsum('bhij,bhjd->bhid', attention, value)
        output = self.rearrange_output(output)
        output = self.multihead_out(output)
        ############################################
        return output

#인자로 받은 residual_function을 사용하여 real_function값을 return하여주는 Class를 만들어라.(call함수 참고)
class ResidualBlock(tf.keras.Model):
    """Wrap a callable with a skip connection: returns ``f(inputs) + inputs``.

    Args:
        residual_function: callable applied to the inputs; its result is
            added to the unchanged inputs.
    """

    def __init__(self, residual_function):
        super().__init__()
        ############Write your code Here############
        self.residual_function = residual_function
        ############################################

    def call(self, inputs):
        """Return the wrapped function's output plus the original inputs."""
        transformed = self.residual_function(inputs)
        return transformed + inputs

#인자로 받은 normfunction에 들어가기전에 LayerNormalization을 해주는 Class를 만들어라.(call함수 참고)
class NormalizationBlock(tf.keras.Model):
    """Apply LayerNormalization to the inputs, then call the wrapped function.

    Args:
        norm_function: callable applied to the normalized inputs.
        epsilon: epsilon passed to the LayerNormalization layer.
    """
    def __init__(self, norm_function, epsilon=1e-5):
        super(NormalizationBlock, self).__init__()
        ############Write your code Here############
        self.norm_function = norm_function
        # Use the caller-supplied epsilon rather than a fixed constant.
        self.norm = tf.keras.layers.LayerNormalization(epsilon=epsilon)
        ############################################

    def call(self, inputs):
        """Return ``norm_function(LayerNorm(inputs))``."""
        return self.norm_function(self.norm(inputs))

#논문[1]에서의 MLPBlock을 만들어라.
class MLPBlock(tf.keras.Model):
    """Feed-forward block: Dense -> GELU -> Dropout(0.1) -> Dense.

    Args:
        output_dimension: output dimension of the MLPBlock.
        hidden_dimension: hidden layer dimension of the MLPBlock.
    """

    def __init__(self, output_dimension, hidden_dimension):
        super().__init__()
        ############Write your code Here############
        dense = tf.keras.layers.Dense
        # Expand to the hidden width, then project to the output width
        self.fc_1 = dense(hidden_dimension)
        self.fc_2 = dense(output_dimension)
        self.dropout = tf.keras.layers.Dropout(0.1)
        ############################################

    def call(self, inputs):
        """Return the block's output for ``inputs``."""
        ############Write your code Here############
        # Hidden projection, activation, then dropout on the activations
        hidden = self.dropout(GELU(self.fc_1(inputs)))
        ############################################
        return self.fc_2(hidden)

#논문[1]을 읽고 TransformerEncoder를 위에서 정의한 class들을 사용하여 만들어라.
class TransformerEncoder(tf.keras.Model):
    """Stack of ``depth`` encoder layers applied sequentially.

    Every encoder layer consists of an attention sub-block and an MLP
    sub-block. Each sub-block is pre-normalized and wrapped in a skip
    connection, i.e. ``x + f(LayerNorm(x))``, and is followed by Dropout(0.1).

    Args:
        dimension: model dimension (the dimension after passing through MHA).
        depth: number of encoder layers.
        heads: number of heads in the MHA.
        mlp_dimension: hidden layer dimension of the MLP block.
    """
    def __init__(self, dimension, depth, heads, mlp_dimension): 
        super(TransformerEncoder, self).__init__()
        layers_ = []
        for _ in range(depth):
            ############Write your code Here############
            # ResidualBlock must be the OUTER wrapper so the skip connection
            # carries the un-normalized input: x + f(LayerNorm(x)).
            # The reverse nesting would add the normalized input instead,
            # leaving no identity path through the stack.
            layers_.extend([
                ResidualBlock(NormalizationBlock(MultiHeadedAttention(dimension, heads=heads))),
                tf.keras.layers.Dropout(0.1),
                ResidualBlock(NormalizationBlock(MLPBlock(dimension, mlp_dimension))),
                tf.keras.layers.Dropout(0.1)
            ])
            ############################################
        self.layers_ = tf.keras.Sequential(layers_)

    def call(self, inputs):
        """Pass ``inputs`` through all encoder layers in order."""
        return self.layers_(inputs)

#논문[2]를 읽고 ViT모델을 위에서 정의한 class들을 사용하여 만들어라.
class ImageTransformer(tf.keras.Model):
    """Vision Transformer (ViT) classifier for square, channels-first images.

    The image is cut into non-overlapping ``patch_size`` x ``patch_size``
    patches, each patch is flattened and linearly embedded, a learnable
    classification token is prepended, learnable positional embeddings are
    added, and the sequence is run through a TransformerEncoder. The encoder
    output at the classification token feeds a Dense -> GELU -> Dense head
    that returns ``n_classes`` logits.

    Args:
        image_size: size of the image, W == H (int).
        patch_size: size of the patches the image is split into (int);
            must divide ``image_size``.
        n_classes: number of output classes.
        batch_size: batch size. Stored on the instance but not used in
            ``call``; the batch size is read from the input tensor instead.
        dimension: model dimension (the dimension after passing through MHA).
        depth: number of encoder layers.
        heads: number of heads in the MHA.
        mlp_dimension: hidden layer dimension of the MLP block (also used as
            the hidden width of the classification head).
        channels: number of channels of the input image. Accepted for
            interface compatibility; not referenced by this class.
    """
    def __init__(
            self, image_size, patch_size, n_classes, batch_size,
            dimension, depth, heads, mlp_dimension, channels=3):
        super(ImageTransformer, self).__init__()
        assert image_size % patch_size == 0, 'invalid patch size for image size'

        num_patches = (image_size // patch_size) ** 2
        self.patch_size = patch_size
        self.dimension = dimension
        self.batch_size = batch_size

        # Pass every add_weight argument by keyword: the position of `name`
        # in the signature is not the same across Keras versions.
        # One embedding per patch plus one for the classification token.
        self.positional_embedding = self.add_weight(
            name="position_embeddings", shape=[num_patches + 1, dimension],
            initializer=tf.keras.initializers.RandomNormal(), dtype=tf.float32
        )
        self.classification_token = self.add_weight(
            name="classification_token", shape=[1, 1, dimension],
            initializer=tf.keras.initializers.RandomNormal(), dtype=tf.float32
        )
        ############Write your code Here############
        # Split a (b, c, H, W) image into (p1 x p2) patches and flatten each
        # patch, channels included, into one vector
        self.rearrange = Rearrange(
            'b c (h p1) (w p2) -> b (h w) (p1 p2 c)',
            p1=self.patch_size, p2=self.patch_size
        )

        # flattened patch -> embedding of size `dimension`
        self.patch_to_embedding = tf.keras.layers.Dense(dimension)
        self.transformer = TransformerEncoder(dimension, depth, heads, mlp_dimension)
        self.classification_identity = tf.identity
        # Classification head: Dense -> GELU -> Dense(n_classes)
        self.fc_1 = tf.keras.layers.Dense(mlp_dimension)
        self._output = tf.keras.layers.Dense(n_classes)
        ############################################

    def call(self, inputs):
        """Return class logits of shape (batch, n_classes) for ``inputs``.

        ``inputs`` is expected in channels-first layout (b, c, H, W), as
        required by the patch rearrange pattern.
        """
        ############Write your code Here############
        # Dynamic batch size, so any batch size works at call time
        batch = tf.shape(inputs)[0]
        y = self.rearrange(inputs)
        y = self.patch_to_embedding(y)

        # One copy of the learnable classification token per example
        cls_tokens = tf.broadcast_to(
            self.classification_token,
            (batch, 1, self.dimension)
        )
        # prepend the classification token to the patch sequence
        y = tf.concat((cls_tokens, y), axis=1)
        # add positional embeddings (broadcast over the batch axis)
        y += self.positional_embedding
        # run the token sequence through the transformer encoder
        y = self.transformer(y)
        # keep only the encoder output at the classification token
        y = self.classification_identity(y[:, 0])
        # MLP head
        y = self.fc_1(y)
        y = GELU(y)
        output = self._output(y)
        ############################################
        return output

In [4]:
from tensorflow.keras import datasets

# Hyper-parameters shared by model construction and training
SEED = 42
BATCH_SIZE = 64
EPOCHS = 100

# Fix the global TensorFlow seed so weight initialization and shuffling
# are repeatable across runs
tf.random.set_seed(SEED)

# Download and prepare the CIFAR10 dataset
(train_images, train_labels), (test_images, test_labels) = datasets.cifar10.load_data()
# Normalize pixel values to be between 0 and 1
############Write your code Here############
train_images = tf.cast(train_images, dtype=tf.float32) / 255.
test_images = tf.cast(test_images, dtype=tf.float32) / 255.
############################################
# Make image shape (BS, H, W, C) to (BS, C, H, W)
############Write your code Here############
train_images = tf.transpose(train_images, [0, 3, 1, 2])
test_images = tf.transpose(test_images, [0, 3, 1, 2])
############################################

#Initialize your model
#Initialize optimizer and loss and compile it to the model
############Write your code Here############
model = ImageTransformer(
        image_size=32, patch_size=2, n_classes=10, batch_size=BATCH_SIZE,
        dimension=64, depth=3, heads=8, mlp_dimension=256
    )
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
# The model outputs raw logits, hence from_logits=True. The default reduction
# is kept so the loss handed to compile/fit is reduced over the batch instead
# of being an unreduced per-example vector.
cross_entropy_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=cross_entropy_loss, metrics=['accuracy'])
############################################


#Train your model
############Write your code Here############
model.fit(train_images, train_labels, batch_size=BATCH_SIZE, epochs=EPOCHS)
############################################
print('==============Training Finished===============')

#Evaluate your test samples
############Write your code Here############
# evaluate returns [loss, accuracy] because 'accuracy' is the only metric
_, accuracy = model.evaluate(test_images, test_labels)
# Convert the fraction to a percentage
accuracy = accuracy * 100
############################################

print('Test Accuracy :', accuracy)

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72