In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf
import keras

# Multi-Head Attention

In [2]:
class Multi_Head_Attention(keras.layers.Layer):
    """Multi-head scaled dot-product self-attention layer.

    The input is projected to queries, keys and values, each projection is
    split into `num_heads` heads of width `embedding_dims // num_heads`,
    attention is computed independently per head, and the heads are merged
    and passed through a final output projection.

    Args:
        embedding_dims: Width of the query/key/value and output projections.
            Must be divisible by `num_heads`.
        num_heads: Number of attention heads.
        **kwargs: Forwarded to the base `keras.layers.Layer` (e.g. `name`).

    Raises:
        ValueError: If `num_heads` is not positive or does not evenly divide
            `embedding_dims`.
    """

    def __init__(self, embedding_dims, num_heads=8, **kwargs):
        # Fail early: with a non-divisible pair the head reshape below would
        # silently regroup values (or fail with an opaque shape error).
        if num_heads <= 0 or embedding_dims % num_heads != 0:
            raise ValueError(
                "embedding_dims ({}) must be divisible by a positive num_heads ({})".format(
                    embedding_dims, num_heads
                )
            )
        super().__init__(**kwargs)
        self.embedding_dims = embedding_dims
        self.num_heads = num_heads
        self.head_dims = embedding_dims // num_heads

        # Linear projections for queries, keys, values and the merged output.
        self.WQ = keras.layers.Dense(embedding_dims)
        self.WK = keras.layers.Dense(embedding_dims)
        self.WV = keras.layers.Dense(embedding_dims)
        self.WO = keras.layers.Dense(embedding_dims)

    def scaled_dot_attention(self, query, key, value):
        """Return `(attended_values, attention_weights)` for one set of heads.

        `query`, `key` and `value` are the outputs of `split_heads`, i.e.
        shaped (batch_size, num_heads, seq_len, head_dims).
        """
        matmul_qk = tf.matmul(query, key, transpose_b=True)
        # Scale by sqrt(size of the key's last axis). The cast follows the
        # logits' dtype so the division also works when the layer computes in
        # float16/bfloat16 rather than float32.
        depth = tf.cast(tf.shape(key)[-1], matmul_qk.dtype)
        logits = matmul_qk / tf.math.sqrt(depth)
        # Normalize over the last axis (the key positions).
        attention_weights = tf.nn.softmax(logits, axis=-1)
        output = tf.matmul(attention_weights, value)
        return output, attention_weights

    def split_heads(self, x, batch_size):
        """Reshape (batch_size, seq_len, embedding_dims) into per-head form.

        Returns a tensor shaped (batch_size, num_heads, seq_len, head_dims).
        """
        # Split the last dimension into (num_heads, head_dims).
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.head_dims))
        # Move the head axis in front of the sequence axis.
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, input):
        """Apply self-attention to `input` (batch_size, seq_len, features).

        Returns a tensor shaped (batch_size, seq_len, embedding_dims).
        """
        batch_size = tf.shape(input)[0]

        query = self.WQ(input)  # (batch_size, seq_len, embedding_dims)
        key = self.WK(input)  # (batch_size, seq_len, embedding_dims)
        value = self.WV(input)  # (batch_size, seq_len, embedding_dims)

        # Each becomes (batch_size, num_heads, seq_len, head_dims).
        query = self.split_heads(query, batch_size)
        key = self.split_heads(key, batch_size)
        value = self.split_heads(value, batch_size)

        # The attention weights are computed but not returned by this layer.
        scaled_attention, _ = self.scaled_dot_attention(query, key, value)
        # Back to (batch_size, seq_len, num_heads, head_dims) before merging.
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])

        # Merge the heads: (batch_size, seq_len, embedding_dims).
        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.embedding_dims))
        return self.WO(concat_attention)

In [3]:
class TransformerBlock(keras.layers.Layer):
    """Transformer encoder block: self-attention followed by a feed-forward net.

    Each of the two sub-layers is followed by dropout, a residual connection
    and layer normalization.

    The base class comes from the same `keras` namespace as every sub-layer
    used below, so the block never mixes `keras` and `tf.keras` objects.

    Args:
        embedding_dims: Width of the attention projections and of the block
            output. The residual additions require the input's last axis to
            have this size as well.
        num_heads: Number of attention heads.
        ff_dims: Hidden width of the feed-forward network.
        dropout_rate: Dropout rate applied after each sub-layer.
        **kwargs: Forwarded to the base `keras.layers.Layer` (e.g. `name`).
    """

    def __init__(self, embedding_dims, num_heads, ff_dims, dropout_rate=0.1, **kwargs):
        super().__init__(**kwargs)
        self.att = Multi_Head_Attention(embedding_dims, num_heads)
        # Position-wise feed-forward network: expand to ff_dims, project back.
        self.ffn = keras.Sequential(layers=[
            keras.layers.Dense(units=ff_dims, activation='relu'),
            keras.layers.Dense(units=embedding_dims)
        ])

        self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = keras.layers.Dropout(rate=dropout_rate)
        self.dropout2 = keras.layers.Dropout(rate=dropout_rate)

    def call(self, input, training=None):
        """Run the block on `input`.

        Args:
            input: Tensor whose last axis has size `embedding_dims`.
            training: Forwarded to the dropout layers. Defaults to `None` so
                the block can also be called without the argument.

        Returns:
            Tensor with the same shape as `input`.
        """
        # Sub-layer 1: self-attention -> dropout -> residual -> layer norm.
        attention_output = self.att(input)
        attention_output = self.dropout1(attention_output, training=training)
        attention_value = self.layernorm1(input + attention_output)

        # Sub-layer 2: feed-forward -> dropout -> residual -> layer norm.
        ffn_output = self.ffn(attention_value)
        ffn_output = self.dropout2(ffn_output, training=training)
        final_output = self.layernorm2(attention_value + ffn_output)
        return final_output

### tf.keras.layers
 - `call(self, inputs, *args, **kwargs)`: Called in `__call__` after making
      sure `build()` has been called. `call()` performs the logic of applying
      the layer to the `inputs`. The first invocation may additionally create
      state that could not be conveniently created in `build()`; see its
      docstring for details.
      Two reserved keyword arguments you can optionally use in `call()` are:
        - `training` (boolean, whether the call is in inference mode or training
          mode). See more details in [the layer/model subclassing guide](
          https://www.tensorflow.org/guide/keras/custom_layers_and_models#privileged_training_argument_in_the_call_method)
        - `mask` (boolean tensor encoding masked timesteps in the input, used
          in RNN layers). See more details in
          [the layer/model subclassing guide](
          https://www.tensorflow.org/guide/keras/custom_layers_and_models#privileged_mask_argument_in_the_call_method)
      A typical signature for this method is `call(self, inputs)`, and user
      could optionally add `training` and `mask` if the layer need them. `*args`
      and `**kwargs` is only useful for future extension when more input
      parameters are planned to be added.

In [4]:
class Token_And_Position_Embedding(keras.layers.Layer):
    """Embed token ids and add a learned embedding of each token's position.

    Args:
        maxlen: Number of rows in the position embedding table.
        vocab_size: Number of rows in the token embedding table.
        embedding_dims: Width of both embedding tables.
    """

    def __init__(self, maxlen, vocab_size, embedding_dims):
        super().__init__()
        # One vector per token id.
        self.token_emb = keras.layers.Embedding(vocab_size, embedding_dims)
        # One vector per position index.
        self.pos_emb = keras.layers.Embedding(maxlen, embedding_dims)

    def call(self, input):
        """Return token embeddings plus position embeddings.

        `input` holds integer token ids; its last axis is treated as the
        sequence axis. The position vectors are broadcast over the leading
        axes when added to the token vectors.
        """
        seq_len = tf.shape(input)[-1]
        # Position ids 0 .. seq_len - 1, shape (seq_len,).
        # NOTE(review): the position table only has `maxlen` rows, so this
        # presumably expects seq_len <= maxlen -- confirm with callers.
        position_ids = tf.range(seq_len)
        position_vectors = self.pos_emb(position_ids)  # (seq_len, embedding_dims)
        token_vectors = self.token_emb(input)  # (..., seq_len, embedding_dims)

        return token_vectors + position_vectors

```python
maxlen = 100
positions = tf.range(start=0, limit=maxlen, delta=1)
print(positions) # tf.Tensor([ 0  1  2 ... 97 98 99], shape=(100,), dtype=int32)
positions_embedding = tf.keras.layers.Embedding(input_dim=maxlen, output_dim=512)(positions)
print(positions_embedding) # tf.Tensor(..., shape=(100, 512), dtype=float32))
print(positions.shape, positions_embedding.shape) # (100,) (100, 512)
```

### layer.embedding
```python
model.add(tf.keras.layers.Embedding(input_dims=1000, output_dims=64, input_length=10))
# The model will take as input an integer matrix of size (batch, input_length).
# And the largest integer (i.e. word index) in the input should be no larger than 999 (vocabulary size).
# Now model.output_shape is (None, 10, 64), where `None` is the batch
```
```python
print('vocab_size: ', vocab_size) # 20000
print('max_len: ', max_len) # 200
print('embedding_dims: ', embedding_dims) # 32

model2 = keras.Sequential()
model2.add(keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dims, input_length=max_len)) # (None, 200, 32)
# model2.add(keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dims)) # (None, None, 32)
model2.summary()
```

# Data Preprocessing

In [5]:
vocab_size = 20000  # keep only the 20,000 most frequent words
max_len = 200  # maximum length of a review, in tokens

# Load the IMDB reviews, restricted to the `vocab_size` most frequent words.
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.imdb.load_data(num_words=vocab_size)

# Report the size of each split (one line per split).
print(
    '훈련용 리뷰 개수 : {}'.format(len(X_train)),
    '테스트용 리뷰 개수 : {}'.format(len(X_test)),
    sep='\n',
)

# Bring every review in both splits to exactly `max_len` tokens.
X_train, X_test = (
    tf.keras.preprocessing.sequence.pad_sequences(reviews, maxlen=max_len)
    for reviews in (X_train, X_test)
)

훈련용 리뷰 개수 : 25000
테스트용 리뷰 개수 : 25000


# Modeling

In [6]:
from keras.layers import Input, GlobalAveragePooling1D, Dropout, Dense

In [7]:
# hyperparameters
embedding_dims = 32  # width of the token/position embeddings
num_heads = 2  # number of attention heads
ff_dims = 32  # hidden layer size in feed forward network inside transformer

embedding = Token_And_Position_Embedding(max_len, vocab_size, embedding_dims)
transformer_block = TransformerBlock(embedding_dims, num_heads, ff_dims)

# Named `inputs` / `outputs` so the Python builtin `input` is not shadowed
# for the rest of the notebook.
inputs = Input(shape=(max_len,))

x = embedding(inputs)
x = transformer_block(x)
# Average over the sequence axis to get one vector per review.
x = GlobalAveragePooling1D()(x)
x = Dropout(0.1)(x)
x = Dense(20, activation='relu')(x)
x = Dropout(0.1)(x)

# Two-way softmax over the classes; paired with the sparse (integer-label)
# cross-entropy loss below.
outputs = Dense(2, activation='softmax')(x)

model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 200)]             0         
                                                                 
 token__and__position__embe  (None, 200, 32)           646400    
 dding (Token_And_Position_                                      
 Embedding)                                                      
                                                                 
 transformer_block (Transfo  (None, 200, 32)           6464      
 rmerBlock)                                                      
                                                                 
 global_average_pooling1d (  (None, 32)                0         
 GlobalAveragePooling1D)                                         
                                                                 
 dropout_2 (Dropout)         (None, 32)                0     

In [8]:
# Train for two epochs with mini-batches of 32 reviews.
# NOTE(review): the test split is also passed as validation data, so the
# per-epoch validation metrics are not an independent hold-out.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=2,
    validation_data=(X_test, y_test),
)

Epoch 1/2
Epoch 2/2


In [9]:
# The model is compiled with a single metric ('accuracy'), so index 1 of the
# evaluate() result is presumably the accuracy (index 0 being the loss).
test_metrics = model.evaluate(X_test, y_test)
print("테스트 정확도: %.4f" % test_metrics[1])

테스트 정확도: 0.8674
