In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np

# 1. Prepare the dataset
data = {
    'id': [1, 2, 3],
    'input_text': [
        "I love machine learning!",
        "Deep learning models are powerful.",
        "Natural Language Processing with TensorFlow."
    ],
    'target_text': [
        "Love ML!",
        "Powerful deep models.",
        "NLP using TensorFlow."
    ]
}

df = pd.DataFrame(data)

# 2. Add special tokens to target texts
df['target_text'] = df['target_text'].apply(lambda x: f"<sos> {x} <eos>")

# 3. Initialize and fit tokenizers
input_vocab_size = 10000
target_vocab_size = 10000
oov_token = "<OOV>"

input_tokenizer = Tokenizer(num_words=input_vocab_size, oov_token=oov_token,
                            filters='!"#$%&()*+,-./:;=?@[\]^_`{|}~')
target_tokenizer = Tokenizer(num_words=target_vocab_size, oov_token=oov_token,
                             filters='!"#$%&()*+,-./:;=?@[\]^_`{|}~')

input_tokenizer.fit_on_texts(df['input_text'])
target_tokenizer.fit_on_texts(df['target_text'])

input_word_index = input_tokenizer.word_index
target_word_index = target_tokenizer.word_index

print("Input Word Index:", input_word_index)
print("Target Word Index:", target_word_index)

# 4. Convert texts to sequences
input_sequences = input_tokenizer.texts_to_sequences(df['input_text'])
target_sequences = target_tokenizer.texts_to_sequences(df['target_text'])

# 5. Determine maximum sequence lengths
max_input_length = max(len(seq) for seq in input_sequences)
max_target_length = max(len(seq) for seq in target_sequences)

print("Max Input Length:", max_input_length)
print("Max Target Length:", max_target_length)

# 6. Pad sequences
input_padded = pad_sequences(input_sequences, maxlen=max_input_length, padding='post', truncating='post')
target_padded = pad_sequences(target_sequences, maxlen=max_target_length, padding='post', truncating='post')

print("Padded Input Sequences:\n", input_padded)
print("Padded Target Sequences:\n", target_padded)

# 7. Create decoder input and target
decoder_input = target_padded[:, :-1]
decoder_target = target_padded[:, 1:]

print("Decoder Input:\n", decoder_input)
print("Decoder Target:\n", decoder_target)

# 8. Define the Encoder, Attention, and Decoder classes
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.lstm = tf.keras.layers.LSTM(self.enc_units,
                                        return_sequences=True,
                                        return_state=True,
                                        recurrent_initializer='glorot_uniform')
    
    def call(self, x, hidden):
        x = self.embedding(x)
        output, state_h, state_c = self.lstm(x, initial_state=hidden)
        return output, state_h, state_c
    
    def initialize_hidden_state(self):
        return [tf.zeros((self.batch_sz, self.enc_units)),
                tf.zeros((self.batch_sz, self.enc_units))]

class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)
    
    def call(self, query, values):
        # query: (batch_size, hidden size)
        # values: (batch_size, max_len, hidden size)
        query_with_time_axis = tf.expand_dims(query, 1)  # (batch_size, 1, hidden size)
        score = self.V(tf.nn.tanh(
            self.W1(values) + self.W2(query_with_time_axis)
        ))  # (batch_size, max_len, 1)
        
        attention_weights = tf.nn.softmax(score, axis=1)  # (batch_size, max_len, 1)
        context_vector = attention_weights * values  # (batch_size, max_len, hidden size)
        context_vector = tf.reduce_sum(context_vector, axis=1)  # (batch_size, hidden size)
        
        return context_vector, attention_weights

class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.lstm = tf.keras.layers.LSTM(self.dec_units,
                                        return_sequences=True,
                                        return_state=True,
                                        recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)
        
        self.attention = BahdanauAttention(self.dec_units)
    
    def call(self, x, hidden, enc_output):
        # x: (batch_size, 1)
        # hidden: decoder hidden state (batch_size, hidden size)
        # enc_output: (batch_size, max_len, hidden size)
        
        context_vector, attention_weights = self.attention(hidden, enc_output)
        
        x = self.embedding(x)  # (batch_size, 1, embedding_dim)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)  # (batch_size, 1, embedding_dim + hidden size)
        
        output, state_h, state_c = self.lstm(x)
        
        output = tf.reshape(output, (-1, output.shape[2]))  # (batch_size, hidden size)
        
        x = self.fc(output)  # (batch_size, vocab_size)
        
        return x, state_h, state_c, attention_weights

# 9. Initialize Encoder and Decoder
embedding_dim = 256
units = 512
batch_size = 2

encoder = Encoder(input_vocab_size, embedding_dim, units, batch_size)
decoder = Decoder(target_vocab_size, embedding_dim, units, batch_size)

# 10. Define the optimizer and loss
optimizer = tf.keras.optimizers.Adam()

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none'
)

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))  # Padding mask
    loss_ = loss_object(real, pred)
    
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask  # Apply mask
    
    return tf.reduce_mean(loss_)

# 11. Checkpointing
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = f"{checkpoint_dir}/ckpt"
checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)

# 12. Define the training step
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0
    
    with tf.GradientTape() as tape:
        enc_output, enc_hidden_h, enc_hidden_c = encoder(inp, enc_hidden)
        
        dec_hidden_h = enc_hidden_h
        dec_hidden_c = enc_hidden_c
        
        dec_input = tf.expand_dims([target_word_index['<sos>']] * batch_size, 1)
        
        # Teacher Forcing
        for t in range(1, targ.shape[1]):
            predictions, dec_hidden_h, dec_hidden_c, _ = decoder(dec_input, dec_hidden_h, enc_output)
            
            loss += loss_function(targ[:, t], predictions)
            
            dec_input = tf.expand_dims(targ[:, t], 1)
    
    batch_loss = loss / int(targ.shape[1])
    
    variables = encoder.trainable_variables + decoder.trainable_variables
    
    gradients = tape.gradient(loss, variables)
    
    optimizer.apply_gradients(zip(gradients, variables))
    
    return batch_loss

# 13. Prepare the dataset
BUFFER_SIZE = len(df)
steps_per_epoch = BUFFER_SIZE // batch_size

dataset = tf.data.Dataset.from_tensor_slices((input_padded, decoder_target)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(batch_size, drop_remainder=True)

# 14. Train the model
EPOCHS = 100  # Increase as needed

for epoch in range(EPOCHS):
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0
    
    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss
    
    # Save the model every 10 epochs
    if (epoch + 1) % 10 == 0:
        checkpoint.save(file_prefix=checkpoint_prefix)
    
    print(f'Epoch {epoch+1} Loss {total_loss / steps_per_epoch:.4f}')


Input Word Index: {'<OOV>': 1, 'learning': 2, 'i': 3, 'love': 4, 'machine': 5, 'deep': 6, 'models': 7, 'are': 8, 'powerful': 9, 'natural': 10, 'language': 11, 'processing': 12, 'with': 13, 'tensorflow': 14}
Target Word Index: {'<OOV>': 1, '<sos>': 2, '<eos>': 3, 'love': 4, 'ml': 5, 'powerful': 6, 'deep': 7, 'models': 8, 'nlp': 9, 'using': 10, 'tensorflow': 11}
Max Input Length: 5
Max Target Length: 5
Padded Input Sequences:
 [[ 3  4  5  2  0]
 [ 6  2  7  8  9]
 [10 11 12 13 14]]
Padded Target Sequences:
 [[ 2  4  5  3  0]
 [ 2  6  7  8  3]
 [ 2  9 10 11  3]]
Decoder Input:
 [[ 2  4  5  3]
 [ 2  6  7  8]
 [ 2  9 10 11]]
Decoder Target:
 [[ 4  5  3  0]
 [ 6  7  8  3]
 [ 9 10 11  3]]


2024-12-29 21:09:36.316866: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-12-29 21:09:36.369852: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-12-29 21:09:36.482116: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Epoch 1 Loss 5.7567
Epoch 2 Loss 6.9031
Epoch 3 Loss 5.7492
Epoch 4 Loss 6.8893
Epoch 5 Loss 6.8803
Epoch 6 Loss 5.7285
Epoch 7 Loss 5.7165
Epoch 8 Loss 6.8310


2024-12-29 21:09:36.696502: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Epoch 9 Loss 6.7967
Epoch 10 Loss 6.7352
Epoch 11 Loss 5.5364
Epoch 12 Loss 5.3564
Epoch 13 Loss 5.1454
Epoch 14 Loss 5.4079
Epoch 15 Loss 4.6552
Epoch 16 Loss 3.6959
Epoch 17 Loss 2.6733
Epoch 18 Loss 1.9608


2024-12-29 21:09:37.220716: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Epoch 19 Loss 1.6858
Epoch 20 Loss 1.5314
Epoch 21 Loss 1.3950
Epoch 22 Loss 1.2878
Epoch 23 Loss 1.2221
Epoch 24 Loss 1.2005
Epoch 25 Loss 1.5894
Epoch 26 Loss 1.2433
Epoch 27 Loss 1.6939
Epoch 28 Loss 1.2739
Epoch 29 Loss 1.2577
Epoch 30 Loss 1.2307
Epoch 31 Loss 1.3926


2024-12-29 21:09:38.251811: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Epoch 32 Loss 1.4230
Epoch 33 Loss 1.1777
Epoch 34 Loss 1.2442
Epoch 35 Loss 1.2150
Epoch 36 Loss 1.1802
Epoch 37 Loss 1.1429
Epoch 38 Loss 1.0859
Epoch 39 Loss 1.3258
Epoch 40 Loss 1.3334
Epoch 41 Loss 0.9850
Epoch 42 Loss 1.0056
Epoch 43 Loss 0.9722
Epoch 44 Loss 1.0263
Epoch 45 Loss 1.4292
Epoch 46 Loss 1.0144
Epoch 47 Loss 1.3757
Epoch 48 Loss 0.9555
Epoch 49 Loss 1.2983
Epoch 50 Loss 1.0452
Epoch 51 Loss 1.2412
Epoch 52 Loss 1.2138
Epoch 53 Loss 1.0571
Epoch 54 Loss 1.0408
Epoch 55 Loss 1.0079
Epoch 56 Loss 0.9924
Epoch 57 Loss 1.1539
Epoch 58 Loss 0.9453
Epoch 59 Loss 1.1598
Epoch 60 Loss 0.9006
Epoch 61 Loss 1.1657
Epoch 62 Loss 1.1613
Epoch 63 Loss 1.0959
Epoch 64 Loss 1.1253
Epoch 65 Loss 0.8730


2024-12-29 21:09:40.371074: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Epoch 66 Loss 0.8671
Epoch 67 Loss 0.9377
Epoch 68 Loss 0.8943
Epoch 69 Loss 0.8431
Epoch 70 Loss 1.0976
Epoch 71 Loss 0.8860
Epoch 72 Loss 0.8876
Epoch 73 Loss 1.1103
Epoch 74 Loss 1.0970
Epoch 75 Loss 0.7207
Epoch 76 Loss 1.0545
Epoch 77 Loss 0.7128
Epoch 78 Loss 0.8216
Epoch 79 Loss 0.6969
Epoch 80 Loss 0.9853
Epoch 81 Loss 0.7797
Epoch 82 Loss 0.6858
Epoch 83 Loss 0.9483
Epoch 84 Loss 0.6774
Epoch 85 Loss 0.7294
Epoch 86 Loss 0.9135
Epoch 87 Loss 0.6622
Epoch 88 Loss 0.8858
Epoch 89 Loss 0.6441
Epoch 90 Loss 0.7092
Epoch 91 Loss 0.8489
Epoch 92 Loss 0.6872
Epoch 93 Loss 0.6614
Epoch 94 Loss 0.8278
Epoch 95 Loss 0.8225
Epoch 96 Loss 0.6773
Epoch 97 Loss 0.6615
Epoch 98 Loss 0.6309
Epoch 99 Loss 0.7833
Epoch 100 Loss 0.6255
