## Setup

Install `tf-nightly` via `pip install tf-nightly`.

In [None]:
! pip install tf-nightly

In [None]:
import os

os.environ["KERAS_BACKEND"] = "tensorflow"
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization
from dataclasses import dataclass
import pandas as pd
import numpy as np
import random
import pickle
import glob
import re
from pprint import pprint
import apache_beam
from datasets import load_dataset

## Test Tensorflow and GPU use

In [None]:
import tensorflow as tf

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


## Set-up Configuration

In [None]:
@dataclass
class Config:
    MAX_LEN = 64
    BATCH_SIZE = 32
    LR = 0.001
    VOCAB_SIZE = 30000
    EMBED_DIM = 128
    NUM_HEAD = 8  # used in bert model
    FF_DIM = 128  # used in bert model
    NUM_LAYERS = 1

config = Config()

## <span style="color: #333399;">Load the Data</span>

- **Data Loading**: We will first load the Amazon Polarity dataset and separate it into training and testing sets.

In [None]:
dataset = load_dataset("amazon_polarity")
# print(dataset)

# Calculate lengths
length_train = len(dataset['train'])
length_test = len(dataset['test'])

# Use list comprehensions to create the lists
train_amazonreview = [dataset['train'][i]['content'] for i in range(length_train)]
test_amazonreview = [dataset['test'][i]['content'] for i in range(length_test)]

# Print the results
print(length_train)
print(length_test)
# print(train_amazonreview[:10])

# Access a specific test sentence
test_sentence = dataset['test'][100]['content']
# print(test_sentence)

## <span style="color: #333399;">Sample Data for Training</span>

### <span style="color: #333399;">Current Status</span>

- **Training Data Utilization**: 100,000 out of 3,600,000 samples have been used for training purposes so far.

In [66]:
# Sample from the list
amazon_sample = random.sample(train_amazonreview, 100000)
amazon_test_sample = random.sample(test_amazonreview, 50)

# Convert the samples to numpy arrays if needed
arr_amazon_sample = np.array(amazon_sample)
arr_amazon_test_sample = np.array(amazon_test_sample)

# Print the sampled test reviews
# print(arr_amazon_test_sample)

## <span style="color: #333399;">Overview of Text Processing and Vectorization</span>

### <span style="color: #333399;">General Functionality</span>

- **Text Preprocessing**: Normalizes input text by lowercasing and cleaning up special characters, preparing it for further processing.
- **Text Vectorization**: Converts text into a sequence of integers where each integer represents a unique token in the vocabulary. This transformation is essential for feeding textual data into machine learning models.

### <span style="color: #333399;">Customization Details</span>

- **Custom Preprocessing Function**: Uses a specialized function to preprocess text data by specifically targeting sentence-ending punctuations and incorporating a unique <span style="color: #0077CC;">**[sep]**</span> token to clearly demarcate sentence boundaries.
- **Adjustable Vocabulary**: The vectorization setup includes a mechanism to adapt the vocabulary based on the input texts and extend it by incorporating special tokens like <span style="color: #CC0077;">**[mask]**</span>, which are crucial for tasks such as masked language modeling.
- **Devectorization Method**: Provides a method to reverse the vectorization process, which is helpful for verifying the correctness of the vectorization process and understanding model predictions.

These customizations enhance the model's ability to handle various textual nuances, making it suitable for advanced NLP tasks.

In [None]:
def preprocess_text(input_data):
    lowercase = tf.strings.lower(input_data)
    
    # Replace multiple sentence-ending punctuations with [sep]
    sentence_sep_pattern = r"([!?.]+)"
    standardized_text = tf.strings.regex_replace(lowercase, sentence_sep_pattern, " [sep] ")

    # Remove other special characters except the [sep] token
    stripped_html = tf.strings.regex_replace(standardized_text, "<br />", " ")
    cleaned_text = tf.strings.regex_replace(
        stripped_html, r"[{}]+".format(re.escape("#$%&'()*+,-/:;<=>@^_`{|}~\"")), ""
    )
    
    # Ensure there is no trailing [sep] token at the end
    cleaned_text = tf.strings.regex_replace(cleaned_text, r"\s*\[sep\]\s*$", "")
    
    return cleaned_text

def get_vectorize_layer(texts, vocab_size, max_seq, special_tokens=["[MASK]"]):

    # Initialize TextVectorization layer with custom standardization
    vectorize_layer = TextVectorization(
        max_tokens=vocab_size,
        output_mode="int",
        standardize=preprocess_text,
        output_sequence_length=max_seq,
    )

    # Adapt the vectorize layer to the input texts
    vectorize_layer.adapt(texts)

    # Get the current vocabulary
    vocab = vectorize_layer.get_vocabulary()

    vocab = vocab[:-1] + special_tokens

    # Reapply the updated vocabulary to the vectorization layer
    vectorize_layer.set_vocabulary(vocab)

    return vectorize_layer

def devectorize_sequences(vectorize_layer, sequences):
    # Get the vocabulary and reverse map from indices to tokens
    vocab = vectorize_layer.get_vocabulary()
    index_to_token = {i: token for i, token in enumerate(vocab)}

    # Convert integer sequences back to text
    def index_to_text(indices):
        return ' '.join(index_to_token.get(index, '') for index in indices)
    
    # Apply the conversion to all sequences
    return [index_to_text(sequence) for sequence in sequences]

# Configure the vectorize layer
vectorize_layer = get_vectorize_layer(
    train_amazonreview,
    config.VOCAB_SIZE,
    config.MAX_LEN,
    special_tokens=["[mask]"],
)

# Get mask token id for masked language model
mask_token_id = vectorize_layer(["[mask]"]).numpy()[0][0]
print("\nMask token ID:", mask_token_id)

# print(vectorize_layer.get_vocabulary())

## <span style="color: #333399;">Save Text Vectorization Configuration</span>

- **Function Overview**: This function saves the configuration and vocabulary of a TensorFlow `TextVectorization` layer to ensure model consistency during deployment or further training sessions.

### <span style="color: #333399;">Key Processes:</span>
- **Vocabulary Saving**: The vocabulary of the `TextVectorization` layer, crucial for encoding text data, is serialized and saved using Python's `pickle` module.
- **Configuration Saving**: Additional configuration parameters like maximum token counts and output sequence lengths are also serialized and saved. This step is essential for recreating the vectorization environment accurately in different computational contexts.

In [None]:
# Assuming `vectorize_layer` is your TextVectorization layer
def save_vectorize_layer(vectorize_layer, filename):
    # Save the vocabulary
    vocab = vectorize_layer.get_vocabulary()
    with open(filename + '_vocab.pkl', 'wb') as f:
        pickle.dump(vocab, f)

    # Save other necessary parameters if needed
    config_vec = {
        'max_tokens': config.VOCAB_SIZE,
        'output_sequence_length': config.MAX_LEN
    }
    with open(filename + '_config.pkl', 'wb') as f:
        pickle.dump(config_vec, f)

save_vectorize_layer(vectorize_layer, 'vectorize_layer')

## <span style="color: #333399;">Load Text Vectorization Configuration</span>

- **Function Overview**: This function retrieves and reinstates the configuration and vocabulary of a TensorFlow `TextVectorization` layer from previously saved files, ensuring seamless continuation of text processing capabilities in different operational environments.

### <span style="color: #333399;">Key Processes:</span>
- **Vocabulary Loading**: The vocabulary, crucial for the correct encoding of text data, is loaded from a pickle file. This step re-establishes the essential word-to-index mappings used in text vectorization.
- **Configuration Loading**: Additional parameters such as maximum token counts and output sequence lengths are loaded from another pickle file. These settings are vital for maintaining the operational consistency of the vectorization process across different setups.
- **Layer Recreation**: After loading the necessary configurations, the `TextVectorization` layer is recreated with the exact parameters that were previously in use, ensuring that the layer operates exactly as it did before saving.



In [None]:
def load_vectorize_layer(filename):
    # Load the vocabulary
    with open(filename + '_vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)

    # Load other necessary parameters
    with open(filename + '_config.pkl', 'rb') as f:
        config_vec = pickle.load(f)

    # Recreate the TextVectorization layer with the loaded parameters
    vectorize_layer = TextVectorization(
        max_tokens=config_vec['max_tokens'],
        output_mode='int',
        output_sequence_length=config_vec['output_sequence_length']
    )

    vectorize_layer.set_vocabulary(vocab)
    return vectorize_layer

vectorize_layer = load_vectorize_layer('vectorize_layer')

## <span style="color: #333399;">Left-Padding Sequences Function</span>

- **Purpose**: Adjusts the alignment of token sequences by left-padding them, which is essential for certain types of data processing where alignment affects model performance.
- **Process**:
    - **Initialization**: Creates a new array of zeros with the same dimensions as the input, ensuring a clean slate for padding adjustments.
    - **Non-zero Counting**: Determines the count of non-zero tokens (actual data) in each sequence to avoid padding these meaningful tokens.
    - **Padding Calculation**: Computes the correct starting position for the non-zero tokens in the new array to shift them towards the end, ensuring the padding is on the left.
    - **Data Rearrangement**: Moves the non-zero elements to their new positions, effectively left-padding the sequence with zeros.

This function is crucial for preparing data in a format that is compatible with neural network architectures that require fixed-length input sequences.


In [64]:
# Function to left-pad sequences
def left_pad_sequences(vectorized_texts):
    # Initialize a new array of zeros with the same shape as the input
    padded_texts = np.zeros_like(vectorized_texts)
    
    # For each sequence in the input array
    for i, seq in enumerate(vectorized_texts):
        # Count the number of non-zero (non-padding) elements
        non_zeros = np.count_nonzero(seq)
        
        # Compute the starting index for non-padding elements in the new array
        start_index = len(seq) - non_zeros
        
        # Place the non-zero elements at the end of the corresponding new sequence
        padded_texts[i, start_index:] = seq[:non_zeros]
    
    return padded_texts

## <span style="color: #333399;">Text Encoding and Token Masking Functions</span>

### <span style="color: #333399;">Text Encoding</span>
- **Purpose**: Converts input texts into a vectorized format using a provided `TextVectorization` layer, facilitating machine learning processing.
- **Process**: 
    - The input text is transformed into a sequence of integer tokens that represent each word based on a predefined vocabulary.
    - The resulting tensor is then converted into a NumPy array, preparing it for further manipulation such as padding or masking.

### <span style="color: #333399;">Token Masking</span>
- **Purpose**: Selectively masks tokens in the vectorized text to support tasks like training a masked language model (MLM), which can improve the model's ability to predict missing or masked words.
- **Process**:
    - **Initialization**: Masks 15% of the non-special, non-zero-padding tokens in each sequence. Special tokens and padding are excluded from masking to maintain contextual integrity and avoid altering structural elements.
    - **Masking Logic**: Ensures that at least the last valid token in each sequence is masked, which helps the model learn to predict token endings in sentences.
    - **Output Creation**: Generates three outputs:
        - **Masked Text**: The input text with selected tokens replaced by a mask token ID.
        - **Labels**: The original tokens at the masked positions, used as targets for model training.
        - **Sample Weights**: Indicators (1s and 0s) for which tokens should be considered during the loss calculation, focusing model learning on the masked positions.

### <span style="color: #333399;">TensorFlow Dataset Preparation</span>
- **Purpose**: Converts the masked text, labels, and sample weights into a TensorFlow dataset, suitable for training machine learning models in a batched and shuffled manner.
- **Process**:
    - The data is shuffled and batched according to a predefined batch size, making it ready for efficient training cycles.
    - A sample batch is retrieved and printed, providing a snapshot of the data that will be fed into the model.



In [67]:
# Encode function
def encode(texts, vectorize_layer):
    """Encodes the input texts using the provided vectorization layer."""
    vectorized = vectorize_layer(texts)
    
    # Convert the vectorized text tensor to a NumPy array using .numpy()
    vectorized_numpy = vectorized.numpy()
    
    # Apply the left-padding function to the NumPy array
    left_padded = left_pad_sequences(vectorized_numpy)

    return left_padded

# Mask tokens function
def mask_tokens(vectorized_text, mask_token_id=29999, special_token_ids=[0, 1, 2]):
    """Masks 15% of the tokens in the vectorized text, excluding special tokens, and includes the last token in the sequence in every sequence."""
    masked_text = np.copy(vectorized_text)
    labels = -1 * np.ones_like(vectorized_text)  # Initialize labels with -1
    sample_weights = np.zeros_like(vectorized_text)  # Initialize sample weights with 0

    for i, sequence in enumerate(masked_text):
        # Identify valid indices (not special tokens and not zero-padding)
        valid_indices = [idx for idx, token in enumerate(sequence) if token not in special_token_ids and token != 0]
        
        # Continue only if there are any valid indices
        if not valid_indices:
            continue
        
        # Calculate the number of tokens to mask: 15% of valid tokens
        num_to_mask = max(1, int(0.15 * len(valid_indices)))
        
        # Randomly choose indices to mask, ensuring not to include the last valid token yet
        if len(valid_indices) > 1:  # Check to ensure there's more than one valid index to avoid selection error
            mask_indices = np.random.choice(valid_indices[:-1], num_to_mask - 1, replace=False)
        else:
            mask_indices = np.array([], dtype=int)
        
        # Always include the last valid token in the mask indices
        mask_indices = np.append(mask_indices, valid_indices[-1])

        # Mask the chosen indices
        masked_text[i, mask_indices] = mask_token_id
        # Set labels for masked tokens (useful for training to predict these tokens)
        labels[i, mask_indices] = vectorized_text[i, mask_indices]
        # Set sample weights to 1 for masked tokens (indicating these should be learned)
        sample_weights[i, mask_indices] = 1  

    return masked_text, labels, sample_weights
    
# Vectorize the example texts
vectorized_text = encode(amazon_sample, vectorize_layer)

# Apply the masking function
masked_text, labels, sample_weights = mask_tokens(vectorized_text)

# Creating the TensorFlow dataset
mlm_ds = tf.data.Dataset.from_tensor_slices((masked_text, labels, sample_weights))
mlm_ds = mlm_ds.shuffle(1000).batch(Config.BATCH_SIZE)

# Take one batch and print its contents
for x_batch, y_batch, sw_batch in mlm_ds.take(1):
    print("Masked Inputs:\n", x_batch.numpy())
    print("Labels:\n", y_batch.numpy())
    print("Sample Weights:\n", sw_batch.numpy())
# Sample texts (fit the vectorize layer to these)

Masked Inputs:
 [[   22   399    45 ...     2    74 29999]
 [    0     0     0 ...     2   270 29999]
 [    5 29999 29999 ...   131     7 29999]
 ...
 [21533     1    49 ...  4251  1947 29999]
 [    0     0     0 ...    11     3 29999]
 [29999    21    83 ...  2527    18 29999]]
Labels:
 [[ -1  -1  -1 ...  -1  -1  75]
 [ -1  -1  -1 ...  -1  -1 499]
 [ -1  94  36 ...  -1  -1 730]
 ...
 [ -1  -1  -1 ...  -1  -1  14]
 [ -1  -1  -1 ...  -1  -1 110]
 [  5  -1  -1 ...  -1  -1   6]]
Sample Weights:
 [[0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 1]
 [0 1 1 ... 0 0 1]
 ...
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 1]
 [1 0 0 ... 0 0 1]]


In [None]:
amazon_example = [
    "I got one for christmas and it is one of the worst cars i have had I have went through two motors the last one i got and ran it twice and it would not work so I called traxxas and they sent me a new one and they didn't say sorry or anything they acted like they didn't even care. they don't even want to make their customers happy I will not ever buy another traxxas product again just because of the rustler and their support when i called them. and the car is not all that great the battery charger takes eight hours to charge which is terrible because people don't want to wait eight hours to play with the car. they make the car out of very cheap materials and they want you to have to buy a faster charger because they know that most people are going to buy a faster charger.",
    "I would give it 20 stars if I could. Absolutely fantastic storyline. Couldn't put my Kindle Fire down. Wish it would happen to me for real.",
    "Nice little cooker, but the button broke in about 2 months. I took it apart to try to fix it and the button is held on by two tiny screws into little plastic holders that were broken and ruined. I'm not hamfisted, so not a very strong setup for the only moving button on the thing. Oh well, that's why it's $15. Makes me mad though because now it's in the landfill and I would have gladly paid a few extra bucks (or more like 25 cents) so it would last longer."
]
aa = encode(amazon_example, vectorize_layer)
print(aa)

## <span style="color: #333399;">Position Embedding Layer Implementation</span>

- **Purpose**: Implements a custom `PositionEmbeddingLayer` class to provide positional context to tokens in sequence data, enhancing the model's understanding of sequence order.
- **Key Features**:
  - **Initialization**: Sets up an embedding layer specifically for encoding positional information of tokens in a sequence, using predefined sequence length and output dimensions.
  - **Position Indices**: Dynamically generates position indices for each token in a sequence and retrieves their embeddings.
  - **Batch Handling**: Scales the position embeddings to match the batch size of the inputs, ensuring every sequence in the batch receives a corresponding position embedding.
  - **Serialization Support**: Includes customization of the `get_config` method to ensure that the layer's configuration can be saved and loaded effectively.

This layer is crucial for models that process sequence data where the order of elements is important, such as in natural language processing tasks.


In [68]:
import tensorflow as tf
from tensorflow.keras.layers import Layer, Embedding

class PositionEmbeddingLayer(Layer):
    def __init__(self, sequence_length, output_dim, **kwargs):
        super(PositionEmbeddingLayer, self).__init__(**kwargs)
        self.sequence_length = sequence_length
        self.output_dim = output_dim
        
        # Initialize the position embedding layer
        self.position_embedding_layer = Embedding(
            input_dim=sequence_length, 
            output_dim=output_dim,
            name="position_embeddings"
        )
    
    def call(self, inputs):
        # Determine the batch size from the inputs tensor
        batch_size = tf.shape(inputs)[0]

        # Generate position indices based on the provided sequence length
        position_indices = tf.range(start=0, limit=self.sequence_length, delta=1)
        
        # Get embeddings for these position indices
        embedded_positions = self.position_embedding_layer(position_indices)
        
        # Replicate the position embeddings for each instance in the batch
        embedded_positions = tf.tile(tf.expand_dims(embedded_positions, 0), [batch_size, 1, 1])

        return embedded_positions

    # Override get_config to enable serialization of the custom layer
    def get_config(self):
        config = super(PositionEmbeddingLayer, self).get_config()
        config.update({
            "sequence_length": self.sequence_length,
            "output_dim": self.output_dim,
        })
        return config

In [69]:
import tensorflow as tf
from tensorflow.keras import layers

def attention(query, key, value, layer_idx):
    """Multi-headed self-attention and feed-forward network for transformer models.

    Args:
        query: Input query for self-attention.
        key: Input key for self-attention.
        value: Input value for self-attention.
        layer_idx (int): Layer index to name layers uniquely.
        config: Configuration object containing model parameters.

    Returns:
        Tensor: Output tensor from the transformer encoder block.
    """
    # Multi-headed self-attention layer
    attention_layer = layers.MultiHeadAttention(
        num_heads=config.NUM_HEAD,
        key_dim=config.EMBED_DIM // config.NUM_HEAD,
        name=f"encoder_{layer_idx}/multiheadattention"
    )
    attention_output = attention_layer(query, key, value)
    attention_output = layers.Dropout(0.1, name=f"encoder_{layer_idx}/att_dropout")(attention_output)
    attention_output = layers.LayerNormalization(
        epsilon=1e-6, name=f"encoder_{layer_idx}/att_layernormalization"
    )(query + attention_output)

    # Position-wise feed-forward network
    ffn = tf.keras.Sequential([
        layers.Dense(config.FF_DIM, activation="relu", name=f"encoder_{layer_idx}/ffn_dense1"),
        layers.Dense(config.EMBED_DIM, name=f"encoder_{layer_idx}/ffn_dense2")
    ], name=f"encoder_{layer_idx}/ffn")

    ffn_output = ffn(attention_output)
    ffn_output = layers.Dropout(0.1, name=f"encoder_{layer_idx}/ffn_dropout")(ffn_output)
    sequence_output = layers.LayerNormalization(
        epsilon=1e-6, name=f"encoder_{layer_idx}/ffn_layernormalization"
    )(attention_output + ffn_output)

    return sequence_output


In [70]:
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='auto')
loss_tracker = keras.metrics.Mean(name="loss")


class MaskedLanguageModel(keras.Model):
    def train_step(self, inputs):
        if len(inputs) == 3:
            features, labels, sample_weight = inputs
        else:
            features, labels = inputs
            sample_weight = None

        with tf.GradientTape() as tape:
            predictions = self(features, training=True)
            loss = loss_fn(labels, predictions, sample_weight=sample_weight)

        # Compute gradients
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)

        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        # Compute our own metrics
        loss_tracker.update_state(loss, sample_weight=sample_weight)

        # Return a dict mapping metric names to current value
        return {"loss": loss_tracker.result()}

    @property
    def metrics(self):
        # We list our `Metric` objects here so that `reset_states()` can be
        # called automatically at the start of each epoch
        # or at the start of `evaluate()`.
        # If you don't implement this property, you have to call
        # `reset_states()` yourself at the time of your choosing.
        return [loss_tracker]


In [71]:
def build_bert_encoder():
    """
    Constructs a BERT-like masked language model encoder.

    Args:
        config: A configuration object containing attributes like MAX_LEN, VOCAB_SIZE, EMBED_DIM, NUM_LAYERS, LR.

    Returns:
        A compiled TensorFlow model ready for training.
    """
    # Define the input layer
    inputs = layers.Input(shape=(config.MAX_LEN,), dtype=tf.int64, name="input_ids")

    print(inputs.shape)
    
    # Embedding layers: token embeddings + positional embeddings
    word_embeddings = layers.Embedding(
        input_dim=config.VOCAB_SIZE, output_dim=config.EMBED_DIM, name="word_embedding"
    )(inputs)

    print(word_embeddings.shape)
    
    position_embeddings = PositionEmbeddingLayer(
        sequence_length=config.MAX_LEN, output_dim=config.EMBED_DIM, name="position_embedding"
    )(inputs)

    print(position_embeddings.shape)
    # Sum embeddings
    embeddings = layers.Add(name="embeddings_add")([word_embeddings, position_embeddings])

    print(embeddings.shape)
    # Encoder layers
    encoder_output = embeddings
    for i in range(config.NUM_LAYERS):
        encoder_output = attention(encoder_output, encoder_output, encoder_output, i)

    print(encoder_output.shape)
    # MLM prediction head
    mlm_output = layers.Dense(config.VOCAB_SIZE, activation="softmax", name="mlm_cls")(encoder_output)

    # Build and compile the model
    print(mlm_output.shape)

    mlm_model = MaskedLanguageModel(inputs=inputs, outputs=mlm_output, name="masked_bert_model")

    optimizer = keras.optimizers.Adam(learning_rate=config.LR)
    mlm_model.compile(optimizer=optimizer)
    
    return mlm_model

bert_mlm = build_bert_encoder()
bert_mlm.summary()

(None, 64)
(None, 64, 128)
(None, 64, 128)
(None, 64, 128)
(None, 64, 128)
(None, 64, 30000)
Model: "masked_bert_model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 64)]         0           []                               
                                                                                                  
 word_embedding (Embedding)     (None, 64, 128)      3840000     ['input_ids[0][0]']              
                                                                                                  
 position_embedding (PositionEm  (None, 64, 128)     8192        ['input_ids[0][0]']              
 beddingLayer)                                                                                    
                                                                                        

In [72]:
id2token = dict(enumerate(vectorize_layer.get_vocabulary()))
token2id = {y: x for x, y in id2token.items()}

class MaskedTextGenerator(keras.callbacks.Callback):
    def __init__(self, sample_tokens, mask_token_id, top_k=5):
        self.sample_tokens = sample_tokens
        self.mask_token_id = mask_token_id
        self.k = top_k

    def decode(self, tokens):
        """Converts token IDs back to text. Converts tensor to a list first."""
        if isinstance(tokens, tf.Tensor):
            tokens = tokens.numpy()  # Convert tensor to NumPy array
        elif isinstance(tokens, np.ndarray):
            tokens = tokens.tolist()  # Convert NumPy array to list
        return " ".join([id2token[t] for t in tokens if t != 0])

    def convert_ids_to_tokens(self, id):
        """Converts a single token ID to the corresponding token."""
        return id2token.get(id, "[UNK]")

    def on_epoch_end(self, epoch, logs=None):
        """Generates and prints predictions for the masked tokens at the end of each epoch."""
        print(f"Epoch {epoch + 1} ended. Generating predictions...")

        # Make predictions on the sample tokens
        prediction = self.model.predict(self.sample_tokens)
        print(f"Prediction shape: {prediction.shape}")

        # Find the indices of masked tokens
        masked_indices = np.where(self.sample_tokens == self.mask_token_id)
        if len(masked_indices[0]) == 0:
            print("No masked tokens found in the sample.")
            return

        masked_indices = masked_indices[1]
        print(f"Masked token indices: {masked_indices}")

        # Make a copy of the original sample tokens to update incrementally
        tokens = np.copy(self.sample_tokens[0])

        # Process each masked token and its predictions
        for idx in masked_indices:
            mask_prediction = prediction[0][idx]

            # Get the top-k predictions
            top_indices = mask_prediction.argsort()[-self.k:][::-1]
            top_probabilities = mask_prediction[top_indices]

            # Iterate over the top-k predictions
            for rank, (predicted_id, probability) in enumerate(zip(top_indices, top_probabilities), start=1):
                # Update the tokens incrementally for each masked index
                tokens[idx] = predicted_id

                result = {
                    "epoch": epoch + 1,
                    "input_text": self.decode(self.sample_tokens[0]),
                    "predicted_text": self.decode(tokens),
                    "probability": probability,
                    "predicted_token": self.convert_ids_to_tokens(predicted_id),
                    "rank": rank
                }
                pprint(result)

# # Initialize sample tokens and the callback
sample_text = ["Definitely a [mask] buy. I [mask] recommend it for anyone [mask] for quality and reliability."]
sample_tokens = encode(sample_text, vectorize_layer)

generator_callback = MaskedTextGenerator(sample_tokens, mask_token_id)


# sample_tokens = vectorizer(["Definitely a good buy. I strongly [mask] it"])
# generator_callback = MaskedTextGenerator(sample_tokens.numpy())
# sample_tokens2 = vectorizer(["Definitely a [mask] buy. I strongly recommend it"])
# generator_callback2 = MaskedTextGenerator(sample_tokens2.numpy())
# sample_tokens3 = vectorizer(["Definitely a good [mask]. I strongly recommend it"])
# generator_callback3 = MaskedTextGenerator(sample_tokens3.numpy())


In [73]:
bert_mlm.fit(mlm_ds, epochs=20, callbacks=[generator_callback])
bert_mlm.save("bert_mlm_amazon_sep_epoch_20.h5")

Epoch 1/20


2024-09-08 12:52:14.473059: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


 408/3125 [==>...........................] - ETA: 12:04 - loss: 0.7648Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/Users/esraonal/miniconda3/envs/tensorflow2/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3508, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/_7/cxy2r8fn70774bt9rncztq580000gn/T/ipykernel_12290/939367481.py", line 1, in <module>
    bert_mlm.fit(mlm_ds, epochs=20, callbacks=[generator_callback])
  File "/Users/esraonal/miniconda3/envs/tensorflow2/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
    return fn(*args, **kwargs)
  File "/Users/esraonal/miniconda3/envs/tensorflow2/lib/python3.8/site-packages/keras/engine/training.py", line 1409, in fit
    tmp_logs = self.train_function(iterator)
  File "/Users/esraonal/miniconda3/envs/tensorflow2/lib/python3.8/site-packages/tensorflow/python/util/traceback_utils.py", line 150, in error_handler
    return fn(*args, **kwargs)
  File "/Users/esraonal/miniconda3/envs/tensorflow2/lib/python3.8/site-pack

In [None]:
def build_bert_lstm_encoder():
    """
    Constructs a BERT-like masked language model encoder.

    Args:
        config: A configuration object containing attributes like MAX_LEN, VOCAB_SIZE, EMBED_DIM, NUM_LAYERS, LR.

    Returns:
        A compiled TensorFlow model ready for training.
    """
    # Define the input layer
    inputs = layers.Input(shape=(config.MAX_LEN,), dtype=tf.int64, name="input_ids")
    
    # Embedding layers: token embeddings + positional embeddings
    word_embeddings = layers.Embedding(
        input_dim=config.VOCAB_SIZE, output_dim=config.EMBED_DIM, name="word_embedding"
    )(inputs)
    
    position_embeddings = PositionEmbeddingLayer(
        sequence_length=config.MAX_LEN, output_dim=config.EMBED_DIM, name="position_embedding"
    )(inputs)

    # Sum embeddings
    embeddings = layers.Add(name="embeddings_add")([word_embeddings, position_embeddings])

    # Encoder layers
    encoder_output = embeddings
    for i in range(config.NUM_LAYERS):
        encoder_output = attention(encoder_output, encoder_output, encoder_output, i)

    # LSTM layers taking word embeddings directly
    lstm_output = layers.LSTM(config.EMBED_DIM, return_sequences=True)(word_embeddings)
    lstm_output = layers.Dropout(0.1)(lstm_output)
    lstm_output = layers.LSTM(config.EMBED_DIM, return_sequences=True)(lstm_output)
    lstm_output = layers.Dropout(0.1)(lstm_output)

    # Encoder output and LSTM output are added together
    lstm_attention = encoder_output + lstm_output

    # MLM prediction head
    mlm_output = layers.Dense(config.VOCAB_SIZE, activation="softmax", name="mlm_cls")(encoder_output)

    # Build and compile the model
    print(mlm_output.shape)

    mlm_model = MaskedLanguageModel(inputs=inputs, outputs=mlm_output, name="masked_bert_lstm_model")

    optimizer = keras.optimizers.Adam(learning_rate=config.LR)
    mlm_model.compile(optimizer=optimizer)
    
    return mlm_model

bert_lstm_mlm = build_bert_lstm_encoder()
bert_lstm_mlm.summary()


In [None]:
bert_lstm_mlm.fit(mlm_ds, epochs=20, callbacks=[generator_callback])
bert_lstm_mlm.save("bert_lstm_mlm_amazon_sep_epoch_20.h5")

In [None]:
def build_lstm_encoder():
    """
    Constructs a BERT-like masked language model encoder.

    Args:
        config: A configuration object containing attributes like MAX_LEN, VOCAB_SIZE, EMBED_DIM, NUM_LAYERS, LR.

    Returns:
        A compiled TensorFlow model ready for training.
    """
    # Define the input layer
    inputs = layers.Input(shape=(config.MAX_LEN,), dtype=tf.int64, name="input_ids")
    
    word_embeddings = layers.Embedding(
        input_dim=config.VOCAB_SIZE, output_dim=config.EMBED_DIM, name="word_embedding"
    )(inputs)
    
    # LSTM layers taking word embeddings directly
    lstm_output = layers.LSTM(config.EMBED_DIM, return_sequences=True)(word_embeddings)
    lstm_output = layers.Dropout(0.1)(lstm_output)
    lstm_output = layers.LSTM(config.EMBED_DIM, return_sequences=True)(lstm_output)
    lstm_output = layers.Dropout(0.1)(lstm_output)

    # MLM prediction head
    lstm_output = layers.Dense(config.VOCAB_SIZE, activation="softmax", name="mlm_cls")(lstm_output)

    mlm_lstm_model = MaskedLanguageModel(inputs=inputs, outputs=lstm_output, name="masked_bert_lstm_model")

    optimizer = keras.optimizers.Adam(learning_rate=config.LR)
    mlm_lstm_model.compile(optimizer=optimizer)
    
    return mlm_lstm_model

lstm_mlm = build_lstm_encoder()
lstm_mlm.summary()

In [None]:
lstm_mlm.fit(mlm_ds, epochs=20, callbacks=[generator_callback])
lstm_mlm.save("lstm_mlm_amazon_sep_epoch_20.h5")

In [None]:
import os
import numpy as np

# Suppress TensorFlow logging
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

def decode(tokens):
    """Converts a sequence of token IDs into a text string."""
    return " ".join([id2token.get(t, "[UNK]") for t in tokens if t != 0])

def convert_ids_to_tokens(token_id):
    """Converts a single token ID to its corresponding token."""
    return id2token.get(token_id, "[UNK]")

def generate_next_word(text, max_length, model):
    """
    Generates the next word in the sequence for a given input text, up to max_length.

    Args:
        text (str): The input text to which words will be added.
        max_length (int): Maximum number of words to generate.
        model (tf.Model): Trained language model used to predict the next word.

    Returns:
        str: The generated sequence after appending predicted words.
    """
    for _ in range(max_length):
        # Append [mask] token to the text to predict the next word
        predict_input = text + " [mask]"
        sample_tokens = encode(predict_input, vectorize_layer)

        # Predict masked word
        prediction = model.predict(sample_tokens.numpy(), verbose=0)

        # Find the index of the [mask] token in the sample
        masked_index = np.where(sample_tokens == mask_token_id)
        if len(masked_index[1]) == 0:
            print("No masked tokens found.")
            return text
        
        masked_index = masked_index[1][0]  # Get first masked index

        # Get the prediction for the masked token
        mask_prediction = prediction[0][masked_index]

        # Get the top prediction
        top_index = mask_prediction.argsort()[-1:][::-1][0]  # Get top prediction index
        predicted_token = convert_ids_to_tokens(top_index)

        # Add the predicted word to the text and continue
        text = text + " " + predicted_token

    return text

# Example usage
text = "Absolutely amazing"

In [None]:
generated_text1 = generate_next_word(text, 25, bert_mlm)
generated_text2 = generate_next_word(text, 25, lstm_mlm)
generated_text3 = generate_next_word(text, 25, bert_lstm_mlm)

print(f"Final generated text from BERT: {generated_text1}")
print(f"Final generated text from LSTM: {generated_text2}")
print(f"Final generated text from BERT_LSTM: {generated_text3}")