In [1]:
from keras.datasets import imdb
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

In [2]:
vocab_size = 10000
max_len = 500
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=vocab_size)

print(max(map(len, X_train)))
print(sum(map(len, X_train))/len(X_train))

X_train_padded = pad_sequences(X_train, maxlen=max_len)
X_test_padded = pad_sequences(X_test, maxlen=max_len)

2494
238.71364


# Bahdanau Attention

In [5]:
import numpy as np

import tensorflow as tf
from keras import Model
from keras.layers import Dense

In [37]:
class BahdanauAttention(Model):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = Dense(units) # units : output dimensionality
        self.W2 = Dense(units)
        self.V = Dense(1)
        
    def call(self, values, query):
        # query shape == (batch_size, hidden size)
        # hidden_with_time_axis shape == (batch_size, 1, hidden size)
        # score 계산을 위해 뒤에서 할 덧셈을 위해서 차원을 변경해줍니다.
        hidden_with_time_axis = tf.expand_dims(query, 1)
        
        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        # the shape of the tensor before applying self.V is (batch_size, max_length, units)
        score = self.V(tf.nn.tanh(
            self.W1(values) + self.W2(hidden_with_time_axis)))
        
        # attention_weights : (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)
        
        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)
        
        return context_vector, attention_weights

### super(BahdanauAttention, self).__init__()
 - super() 라는 함수는 super class 즉, 부모 클래스의 임시적인 객체를 반환하여 부모클래스의 메소드를 사용할 수 있게 하는 것.
 - ``super(BahdanauAttention, self)`` : returns a temporary object of the superclass, which in this case is ``tf.keras.Model``, as ``BahdanauAttention`` is a subclass of tf.keras.Model.
 - ``super(BahdanauAttention, self).__init__()`` :  calls the constructor of the superclass ``(tf.keras.Model)``. This is necessary to ensure that the initialization code in the base class ``(tf.keras.Model)`` is executed.
 - In summary, the line ``super(BahdanauAttention, self).__init__()`` in the ``BahdanauAttention`` class ensures that the class inherits and initializes all necessary properties and methods from its parent class ``tf.keras.Model``.

### call
 - ``call`` method is called in the ``BahdanauAttention`` class when you call the instance of the ``BahdanauAttention`` class.
 - By subclassing the `Model` class: in that case, you should define your
    layers in `__init__()` and you should implement the model's forward pass
    in `call()`. (https://www.tensorflow.org/api_docs/python/tf/keras/Model)

### tf.layers.Dense
 - Dense implements the operation: ``output = activation(dot(input, kernel) + bias)`` , Belows are all attributes of Dense.
  - ``activation`` is the element-wise activation function passed as the activation argument.
  - ``kernel`` is a weights matrix created by the layer.
  - ``bias`` is a bias vector created by the layer (only applicable if use_bias is True). 
 - If the input to the layer has a rank greater than 2, then Dense computes the dot product between the inputs and the kernel along the last axis of the inputs and axis 0 of the kernel (using tf.tensordot). For example, if input has dimensions (batch_size, d0, d1), then we create a kernel with shape (d1, units), and the kernel operates along axis 2 of the input, on every sub-tensor of shape (1, 1, d1) (there are batch_size * d0 such sub-tensors). The output in this case will have shape (batch_size, d0, units). (https://www.tensorflow.org/api_docs/python/tf/keras/layers/Dense?hl=en)

# TensorDot (Example)
## https://www.tensorflow.org/api_docs/python/tf/tensordot
## https://numpy.org/doc/stable/reference/generated/numpy.tensordot.html

### tensordot.axes : int or (2,) array_like
 - integer_like If an int N, sum over the last N axes of a and the first N axes of b in order. The sizes of the corresponding axes must match.
 - (2,) array_like Or, a list of axes to be summed over, first sequence applying to a, second to b. Both elements array_like must be of the same length.



In [17]:
a = np.arange(8.).reshape(1,2,4)
b = np.arange(6.).reshape(1,3,2)
c = np.tensordot(a,b, axes=([0,1],[0,2]))

print(c.shape)

(4, 3)
[[ 4. 12. 20.]
 [ 5. 17. 29.]
 [ 6. 22. 38.]
 [ 7. 27. 47.]]


In [18]:
print(a)
print(b)

[[[0. 1. 2. 3.]
  [4. 5. 6. 7.]]]
[[[0. 1.]
  [2. 3.]
  [4. 5.]]]


In [19]:
print(c)

[[ 4. 12. 20.]
 [ 5. 17. 29.]
 [ 6. 22. 38.]
 [ 7. 27. 47.]]


In [22]:
_c = np.zeros((4,3))

for i in range(3):
    for j in range (4):
        for k in range(2):
            for l in range(1):
                _c[j,i] += a[l,k,j] * b[l,i,k]
print(_c)

[[ 4. 12. 20.]
 [ 5. 17. 29.]
 [ 6. 22. 38.]
 [ 7. 27. 47.]]


In [31]:
a = np.arange(8.).reshape(1,2,4)
b = np.arange(12.).reshape(4,3)
c = np.tensordot(a,b, axes=([2],[0]))

print(c.shape)

__c = np.zeros((1,2,3))

for i in range(1):
    for j in range (2):
        for k in range(3):
            for l in range(4):
                __c[i,j,k] += a[i,j,l] * b[l,k]

print(c)
print(__c)

# numpy.tensordot == tf.tensordot
tc = tf.tensordot(a,b, axes=([2],[0]))
print(tc.shape)
print(tc)

(1, 2, 3)
[[[ 42.  48.  54.]
  [114. 136. 158.]]]
[[[ 42.  48.  54.]
  [114. 136. 158.]]]
(1, 2, 3)
tf.Tensor(
[[[ 42.  48.  54.]
  [114. 136. 158.]]], shape=(1, 2, 3), dtype=float64)


# Modeling

In [32]:
from keras.layers import Dense, Embedding, Bidirectional, LSTM, Concatenate, Dropout
from keras import Input, Model
from keras import optimizers

In [39]:
# hyperparameters
embedding_dim = 128
hidden_units = 64
dropout_ratio = 0.5

# input-embedding
input = Input(shape=(max_len,), dtype='int32')
embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len, mask_zero=True)(input)
# bi-LSTM
lstm = Bidirectional(LSTM(hidden_units, dropout=dropout_ratio, return_sequences=True))(embedding)
lstm, forward_h, forward_c, backward_h, backward_c = Bidirectional(LSTM(hidden_units, dropout=dropout_ratio, return_sequences=True, return_state=True))(lstm)
print(lstm.shape, forward_h.shape, forward_c.shape, backward_h.shape, backward_c.shape)

state_h = Concatenate()([forward_h, backward_h]) # hidden state
state_c = Concatenate()([forward_c, backward_c]) # cell state
print(state_h.shape, state_c.shape)

# Attention
# use hidden state to compute attention score
attention = BahdanauAttention(64) 
context_vector, attention_weights = attention(lstm, state_h)

# output
output = Dense(1, activation='relu')(context_vector)
output = Dropout(dropout_ratio)(output)
output = Dense(1, activation='sigmoid')(output)

# model
model = Model(inputs=input, outputs=output)
model.compile(loss='binary_crossentropy', optimizer=optimizers.Adam(lr=0.001), metrics=['accuracy'])
model.summary()



(None, 500, 128) (None, 64) (None, 64) (None, 64) (None, 64)
(None, 128) (None, 128)
<keras.src.layers.core.dense.Dense object at 0x2c48ef2d0>
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_5 (InputLayer)        [(None, 500)]                0         []                            
                                                                                                  
 embedding_4 (Embedding)     (None, 500, 128)             1280000   ['input_5[0][0]']             
                                                                                                  
 bidirectional_8 (Bidirecti  (None, 500, 128)             98816     ['embedding_4[0][0]']         
 onal)                                                                                            
                                                  

In [40]:
history = model.fit(X_train_padded, y_train, epochs=3, batch_size=256, validation_data=(X_test_padded, y_test))

Epoch 1/3
Epoch 2/3
Epoch 3/3


# Evaluating

In [41]:
model.evaluate(X_test_padded, y_test)



[0.47830942273139954, 0.8667200207710266]