In [9]:
from tensorflow.keras import Input, layers, Model
from tensorflow.keras.layers import Conv2D, MaxPool2D, Dense, Flatten, Dropout, Softmax, Embedding, Concatenate
from tensorflow.keras.optimizers import Adam
import numpy as np
from keras.utils import np_utils
from tensorflow.keras.datasets import mnist
from keras.utils.np_utils import to_categorical
import tensorflow as tf
(X_train, Y_train), (X_test, Y_test) = mnist.load_data()
Y_train = to_categorical(Y_train)
X_train = X_train.reshape(X_train.shape[0], -1)
#tf.config.run_functions_eagerly(True)

In [10]:
n_vocabs = 256
class DotProduct(layers.Layer):
  def __init__(self, **kwargs):
    super().__init__(**kwargs)
  def call(self, t_queries, t_keys, t_values):
    softmax_layer = Softmax(axis = -1)
    softmax_output = softmax_layer(tf.matmul(t_queries, tf.transpose(t_keys, [0, 2, 1]))/np.sqrt(t_queries.shape[2]))
    return tf.matmul(softmax_output, t_values)

In [11]:
class Attention(layers.Layer):
  def __init__(self, qk_hidden,  v_hidden, **kwargs):
    super().__init__(**kwargs)
    self.dense_queries = Dense(qk_hidden)
    self.dense_keys = Dense(qk_hidden)
    self.dense_values = Dense(v_hidden)
    self.dot_product = DotProduct()
    self.flatten = Flatten()
  def call(self, embedding):
    t_queries = self.dense_queries(embedding)
    t_keys = self.dense_keys(embedding)
    t_values = self.dense_values(embedding)
    values_probability = self.dot_product(t_queries, t_keys, t_values)
    return values_probability
class MultiAttention(layers.Layer):
  def __init__(self, qk_hiddens, v_hidden, **kwargs):
    super().__init__(**kwargs)
    self.qk_hiddens = qk_hiddens
    self.v_hidden = v_hidden
    self.list_attention = [Attention(self.qk_hiddens[i], self.v_hidden) for i in range(len(self.qk_hiddens))]
    self.concatenate = Concatenate(axis = 2)
  def call(self, embedding):
    mutil_attention = [attention(embedding) for attention in self.list_attention]
    return self.concatenate(mutil_attention)

In [12]:
input = Input((X_train.shape[1]))
n_embedding = 128
embedding = Embedding(n_vocabs, n_embedding, input_length = X_train.shape[1])(input)
multi_attention = MultiAttention([128, 256, 512], 256)(embedding)
flatten = Flatten()(multi_attention)
dense = Dense(128, activation = "relu")(flatten)
output = Dense(10, activation = "softmax")(dense)
model_attention = Model(input, output)
model_attention.compile(optimizer = "adam", loss = "categorical_crossentropy", metrics = ["acc"])
with tf.device("/gpu:0"):
  model_attention.fit(X_train, Y_train, batch_size = 32, epochs = 10, verbose = 1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [14]:
#Nếu k dùng attention layer phía trước mà dùng các pixels ảnh làm feature luôn.
input = Input((X_train.shape[1]))
dense1 = Dense(128, activation = "relu")(input)
output = Dense(10, activation = "softmax")(dense1)
model_mlp = Model(input, output)
model_mlp.compile(optimizer = "adam", loss = "categorical_crossentropy", metrics = ["acc"])
with tf.device("/gpu:0"):
  model_mlp.fit(X_train, Y_train, batch_size = 32, epochs = 10, verbose = 1)
#Hiệu năng thấp hơn là training với Attention Layer

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [15]:
(X_test, Y_test) = mnist.load_data()[1]
X_test = X_test.reshape(X_test.shape[0], -1)
Y_test = np_utils.to_categorical(Y_test)
model_attention.evaluate(X_test, Y_test)
model_mlp.evaluate(X_test, Y_test)
#Dùng thêm Attention Layer có hiệu năng tốt hơn trên tập test
#Tuy nhiên thời gian training dùng thêm attention layer rất lâu ! dù đã có GPU của colab



[0.2867424190044403, 0.947700023651123]