**Goal:**
- Train a Transformer model using Keras to perform sentiment analysis on the IMDB movie reviews dataset.
- Inputs: movie reviews, tokenized and padded to a fixed lenght
- Output: binary classification (positive

In [44]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout
from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization, GlobalAveragePooling1D
import keras

In [45]:
# Load the dataset
context = np.load('../data/custom_context_sequences.npz')
question = np.load('../data/custom_question_sequences.npz')
answer = np.load('../data/custom_answer_sequences.npz')

In [46]:
# Extract the train, validation, and test data for context, question, and answer
context_train = context['train']
context_val = context['val']
context_test = context['test']

question_train = question['train']
question_val = question['val']
question_test = question['test']

answer_train = answer['train']
answer_val = answer['val']
answer_test = answer['test']

In [47]:
print(f'Context Traning Data Shape: {context_train.shape}')
print(f'Context Validation Data Shape: {context_val.shape}')
print(f'Context Test Data Shape: {context_test.shape}\n')

print(f'Question Traning Data Shape: {question_train.shape}')
print(f'Question Validation Data Shape: {question_val.shape}')
print(f'Question Test Data Shape: {question_test.shape}\n')

print(f'Answer Traning Data Shape:  {answer_train.shape}')
print(f'Answer Validation Data Shap: {answer_val.shape}')
print(f'Answer Test Data Shape: {answer_test.shape}')

Context Traning Data Shape: (67275, 653)
Context Validation Data Shape: (14016, 653)
Context Test Data Shape: (17520, 653)

Question Traning Data Shape: (67275, 653)
Question Validation Data Shape: (14016, 653)
Question Test Data Shape: (17520, 653)

Answer Traning Data Shape:  (67275, 43)
Answer Validation Data Shap: (14016, 43)
Answer Test Data Shape: (17520, 43)


In [51]:
# Build the Transformer Block
def transformer_block(context_embedding, question_embedding):
    # MultiHead Attention with context attending to itself
    attention_layer = MultiHeadAttention(num_heads=2, key_dim=128, dropout=0.2)(context_embedding, context_embedding)
    out1 = LayerNormalization()(context_embedding + attention_layer)
    
    # Attention between encoded context and question
    combined_representation = keras.layers.concatenate([out1, question_embedding], axis=1)  # Concatenate context and question
    attention_layer = MultiHeadAttention(num_heads=2, key_dim=128, dropout=0.2)(combined_representation, combined_representation)
    out2 = LayerNormalization()(combined_representation + attention_layer)
    
    # Feed Forward Network (applied to the combined representation)
    ffn_output = Dense(2048, activation='relu')(out2)
    ffn_output = Dropout(0.2)(ffn_output)
    ffn_output = Dense(128, activation='relu')(ffn_output)
    
    return LayerNormalization()(out2 + ffn_output)

In [61]:
# Assemble full model
def build_model():
    context_input = Input(shape=(653,))
    question_input = Input(shape=(653,))
    context_embedding_layer = Embedding(10000, 128)(context_input)
    question_embedding_layer = Embedding(10000, 128)(question_input)
    
    # Pass transformer block output directly as model output
    encoded_representation = transformer_block(context_embedding_layer, question_embedding_layer)
    
    # Pointer Network within the model definition (using returned encoded_representation)
    attention_weights = encoded_representation
    pointer_network_input = Dense(1)(attention_weights)
    answer_logits = keras.activations.softmax(pointer_network_input, axis=1)
    
    model = Model(inputs=[context_input, question_input], outputs=answer_logits)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


# def build_model():

#     context_input = Input(shape = (653,))
#     question_input = Input(shape = (40,))
#     context_embedding_layer = Embedding(10000,128)(context_input)
#     question_embedding_layer = Embedding(10000, 32)(question_input)
#     encoded_representation = transformer_block(context_embedding_layer, question_embedding_layer)
    
#     attention_weights = encoded_representation  # Assuming attention weights are the last output of transformer_block

#     # Pointer Network
#     # Option 1: Dense layer before softmax (common approach)
#     pointer_network_input = Dense(1, activation='none')(attention_weights)  # Project to single dimension
#     answer_logits = tf.nn.softmax(pointer_network_input, axis=-1)

#     model = Model(inputs=[context_input, question_input], outputs=answer_logits)
#     model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
#     return model

In [62]:
model = build_model()
model.summary()

In [83]:
# train model

model.fit([context_train, question_train], answer_train, batch_size = 32, epochs = 3, validation_data=([context_val,x_test,y_test)) #define params

Epoch 1/3
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m292s[0m 374ms/step - accuracy: 0.8203 - loss: 0.3945 - val_accuracy: 0.8686 - val_loss: 0.3100
Epoch 2/3
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m241s[0m 309ms/step - accuracy: 0.9215 - loss: 0.2039 - val_accuracy: 0.8601 - val_loss: 0.3354
Epoch 3/3
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m225s[0m 288ms/step - accuracy: 0.9482 - loss: 0.1447 - val_accuracy: 0.8554 - val_loss: 0.3736


<keras.src.callbacks.history.History at 0x23014e4db10>

In [84]:
# Evaluate model

loss, accuracy = model.evaluate(x_test,y_test)
print(f'Test Accuracy: {accuracy:.2f}')

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 67ms/step - accuracy: 0.8577 - loss: 0.3712
Test Accuracy: 0.86
