# LIbrary

In [1]:
!pip install einops



In [2]:
#구현하는 모델에서 쓰이는 모든 activation함수는 정의하여 드린 GELU 함수를 사용해야함.
#MultiHeadAttention에서 Head로 나눌때, 이미지를 patch로자른후 sequence로 만들때 Rearrange함수를 사용하면 편리함.(사용하지 않으셔도 됩니다)
#CIFAR10에 대한 test accuracy가 60프로 이상인 ViT모델을 만드시오.
import tensorflow as tf
from einops.layers.tensorflow import Rearrange
from tensorflow.keras.activations import gelu
GELU = lambda x : gelu(x)

In [3]:
#논문[1]에서 설명하는 MultiHeadAttention을 만들어라.
class MultiHeadAttention(tf.keras.Model):
    #dimension - 모델의 dimension(MHA를 거친 후의 dimension)
    def __init__(self, dimension, heads=8):
        super(MultiHeadAttention, self).__init__()
        ############Write your code Here############
        self.heads = heads
        self.scale = dimension ** -0.5

        self.to_qkv = tf.keras.layers.Dense(dimension * 3, use_bias=False)
        self.to_out = tf.keras.layers.Dense(dimension)

        self.rearrange_qkv = Rearrange('b n (qkv h d) -> qkv b h n d', qkv = 3, h = self.heads)
        self.rearrange_out = Rearrange('b h n d -> b n (h d)')
        ############################################
    def call(self, inputs):
        output = None
        ############Write your code Here############
        qkv = self.to_qkv(inputs)
        qkv = self.rearrange_qkv(qkv)
        q = qkv[0]
        k = qkv[1]
        v = qkv[2]

        dots = tf.einsum('bhid,bhjd->bhij', q, k) * self.scale
        attn = tf.nn.softmax(dots,axis=-1)

        output = tf.einsum('bhij,bhjd->bhid', attn, v)
        output = self.rearrange_out(output)
        output = self.to_out(output)
        ############################################
        return output

#인자로 받은 residual_function을 사용하여 real_function값을 return하여주는 Class를 만들어라.(call함수 참고)
class ResidualBlock(tf.keras.Model):
    def __init__(self, residual_function):
        super(ResidualBlock, self).__init__()
        ############Write your code Here############
        self.residual_function = residual_function
        ############################################

    def call(self, inputs):
        return self.residual_function(inputs) + inputs
        
#인자로 받은 normfunction에 들어가기전에 LayerNormalization을 해주는 Class를 만들어라.(call함수 참고)
class NormalizationBlock(tf.keras.Model):
    def __init__(self, norm_function, epsilon=1e-5):
        super(NormalizationBlock, self).__init__()
        ############Write your code Here############
        self.norm_function = norm_function
        self.normalize = tf.keras.layers.LayerNormalization(epsilon=epsilon)
        ############################################

    def call(self, inputs):
        return self.norm_function(self.normalize(inputs))

#논문[1]에서의 MLPBlock을 만들어라.
class MLPBlock(tf.keras.Model):
    #output_dimension - MLPBlock의 output dimension
    #hidden_dimension - MLPBlock의 hidden layer dimension
    def __init__(self, output_dimension, hidden_dimension):
        super(MLPBlock, self).__init__()
        ############Write your code Here############
        self.hidden_layer = tf.keras.layers.Dense(hidden_dimension, activation=GELU)
        self.output_layer = tf.keras.layers.Dense(output_dimension)
        ############################################

    def call(self, inputs):
        output = None
        ############Write your code Here############
        hidden = GELU(self.hidden_layer(inputs))
        output = GELU(self.output_layer(hidden))
        ############################################
        return output

#논문[1]을 읽고 TransformerEncoder를 위에서 정의한 class들을 사용하여 만들어라.
class TransformerEncoder(tf.keras.Model):
    #dimension - 모델의 dimension(MHA를 거친 후의 dimension), heads - MHA에서 head의 개수
    #depth - encoder layer의 개수, mlp_dimension - MLP block의 hidden layer의 dimension
    def __init__(self, dimension, depth, heads, mlp_dimension): 
        super(TransformerEncoder, self).__init__()
        layers_ = []
        for _ in range(depth):
            ############Write your code Here############
            ResidualBlock(NormalizationBlock(MultiHeadAttention(dimension, heads)))
            ResidualBlock(NormalizationBlock(MLPBlock(dimension, mlp_dimension)))
            ############################################
        self.layers_ = tf.keras.Sequential(layers_)

    def call(self, inputs):
        return self.layers_(inputs)

#논문[2]를 읽고 ViT모델을 위에서 정의한 class들을 사용하여 만들어라.
class ImageTransformer(tf.keras.Model):
    #image_size - 이미지의 W==H의 크기(int), patch_size - 이미지를 쪼갤 patch의 크기(int)
    #n_classes - 최종 class의 개수, batch_size - 배치사이즈
    #dimension - 모델의 dimension(MHA를 거친 후의 dimension), depth - encoder layer의 개수
    #heads - MHA에서 head의 개수, mlp_dimension - MLP block의 hidden layer의 dimension
    #channel - input image에 대한 channel의 수
    def __init__(
            self, image_size, patch_size, n_classes, batch_size,
            dimension, depth, heads, mlp_dimension, channels=3):
        super(ImageTransformer, self).__init__()
        assert image_size % patch_size == 0, 'invalid patch size for image size'

        num_patches = (image_size // patch_size) ** 2
        self.patch_size = patch_size
        self.dimension = dimension
        self.batch_size = batch_size

        self.positional_embedding = self.add_weight(
            "position_embeddings", shape=[num_patches + 1, dimension],
            initializer=tf.keras.initializers.RandomNormal(), dtype=tf.float32
        )
        self.classification_token = self.add_weight(
            "classification_token", shape=[1, 1, dimension],
            initializer=tf.keras.initializers.RandomNormal(), dtype=tf.float32
        )
        ############Write your code Here############
        self.patch_to_embedding = tf.keras.layers.Dense(dimension)
        self.rearrange = Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', 
                                   p1=self.patch_size, 
                                   p2=self.patch_size)
        self.transformer_encoder = TransformerEncoder(dimension, depth, heads, mlp_dimension)
        self.to_cls_token = tf.identity
        self.mlp_head = tf.keras.Sequential([
                                             tf.keras.layers.Dense(mlp_dimension, activation=GELU),
                                             tf.keras.layers.Dense(n_classes)
                                             ])
        ############################################

    def call(self, inputs):
        output = None
        ############Write your code Here############
        shapes = tf.shape(inputs)

        x = self.rearrange(inputs)
        x = self.patch_to_embedding(x)

        cls_tokens = tf.broadcast_to(self.classification_token,(shapes[0],1,self.dimension))
        print(cls_tokens.shape)
        x = tf.concat((cls_tokens, x), axis=1)
        x += self.positional_embedding
        x = self.transformer_encoder(x)

        x = self.to_cls_token(x[:, 0])
        output = self.mlp_head(x)
        ############################################
        return output

In [4]:
from tensorflow.keras import datasets
# Download and prepare the CIFAR10 dataset
(train_images, train_labels), (test_images, test_labels) = datasets.cifar10.load_data()
# Normalize pixel values to be between 0 and 1
############Write your code Here############
import numpy as np
def zero_one_normalize(X):
    X = X.astype(np.float64)
    X /= 255.
    return X
train_images = zero_one_normalize(train_images)
test_images = zero_one_normalize(test_images)
############################################
# Make image shape (BS, H, W, C) to (BS, C, H, W)
############Write your code Here############
def image_reshape(X):
    (FB, H, W, C) = X.shape
    X = X.reshape((FB,C,H,W))
    return X
train_images = image_reshape(train_images)
test_images = image_reshape(test_images)
############################################

#Initialize your model
#Initialize optimizer and loss and compile it to the model
############Write your code Here############
image_size = 32
patch_size = 4
n_classes = 10
dimension = 64
depth = 3
heads = 4
mlp_dimension = 128
max_epochs = 10
batch_size = 64
learning_rate = 1e-3
############################################

#Train your model
############Write your code Here############
model = ImageTransformer(image_size, patch_size, n_classes, batch_size, dimension, depth, heads, mlp_dimension, channels=3)
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.fit(train_images, train_labels, epochs=10)

############################################
print('==============Training Finished===============')

#Evaluate your test samples
accuracy = 0
############Write your code Here############
accuracy = model.evaluate(test_images,  test_labels, verbose=2)
############################################

print('Test Accuracy :', accuracy)

Epoch 1/10
(None, 1, 64)
(None, 1, 64)
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
(None, 1, 64)
313/313 - 1s - loss: 2.3026 - accuracy: 0.1000
Test Accuracy : [2.30259108543396, 0.10000000149011612]
