<a href="https://colab.research.google.com/github/gowtamyreddy/NLP/blob/main/transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Understanding the Transformer in decoder form
 *This decoder omits the causal masking that a regular decoder uses.*

 *As a result, the model can see future words during training.*

 *This Model has a single Transformer block.*

In [3]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.layers import Layer, Embedding, Dense, LayerNormalization, Dropout
import numpy as np

#load the harry potter book as the dataset ->  url - https://www.kaggle.com/datasets/shubhammaindola/harry-potter-books
def load_data(file_path):
    """Return the full contents of the UTF-8 text file at ``file_path`` as one string."""
    with open(file_path, mode='r', encoding='utf-8') as source:
        return source.read()

file_path = "/content/sample_data/01 Harry Potter and the Sorcerers Stone.txt"
text = load_data(file_path).lower()


# Fit a word-level tokenizer on the whole book. The vocabulary size is the
# number of known words plus one extra slot.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1


# Slide a window over the token stream: every sample holds `seq_length`
# context tokens followed by the single token the model must predict.
seq_length = 50
tokens = tokenizer.texts_to_sequences([text])[0]
input_sequences = [
    tokens[end - seq_length:end + 1]
    for end in range(seq_length, len(tokens))
]

input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=seq_length + 1, padding='pre')
)

# Inputs are the first `seq_length` tokens of each window; the target is the
# last token, one-hot encoded over the vocabulary.
X = input_sequences[:, :-1]
y = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_words)

Transformer model


In [4]:
import tensorflow as tf
from tensorflow.keras.layers import Layer, Embedding, Dense, LayerNormalization, Dropout

class MultiHeadAttention(Layer):
    """Multi-head scaled dot-product attention with no masking.

    The query, key and value inputs are each projected to ``embed_dim``
    features, split into ``num_heads`` heads of ``embed_dim // num_heads``
    features, attended independently, then concatenated and projected back
    to ``embed_dim``.

    Args:
        embed_dim: Feature size of Q, K and V before splitting into heads.
        num_heads: Number of attention heads; must divide ``embed_dim``.
        **kwargs: Standard Keras ``Layer`` arguments (``name``, ``dtype``, ...).

    Raises:
        ValueError: If ``num_heads`` is not positive or does not divide
            ``embed_dim`` evenly.
    """

    def __init__(self, embed_dim, num_heads, **kwargs):
        # Fail early with a clear message; otherwise the mismatch only shows
        # up later as an opaque reshape error inside `split_heads`.
        if num_heads <= 0 or embed_dim % num_heads != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by a positive "
                f"num_heads ({num_heads})"
            )
        super(MultiHeadAttention, self).__init__(**kwargs)
        self.num_heads = num_heads # example - 8

        self.embed_dim = embed_dim # example - 512
        # embed_dim = dimension of Q, K, and V before splitting into multiple heads
        self.projection_dim = embed_dim // num_heads # Size of Each Attention Head's Subspace
        self.query_dense = Dense(embed_dim) # Q Determines "what to focus on"
        self.key_dense = Dense(embed_dim) # K Acts as "labels" to be matched with queries
        self.value_dense = Dense(embed_dim) # V Holds the actual information

        self.combine_heads = Dense(embed_dim)

    def attention(self, query, key, value):
        """Return ``(weighted values, attention probabilities)`` for one set of heads."""
        scores = tf.matmul(query, key, transpose_b=True)
        # Scale by sqrt(head size); cast to the scores' own dtype so the
        # division also works when the layer does not run in float32.
        scores /= tf.math.sqrt(tf.cast(self.projection_dim, scores.dtype))

        attention_probs = tf.nn.softmax(scores, axis=-1) # how much attention each token should give to other tokens
        return tf.matmul(attention_probs, value), attention_probs

    def split_heads(self, x, batch_size):
        """Reshape ``x`` to (batch, seq, heads, head_dim) and move heads ahead of seq."""
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        """Apply attention to ``inputs``, a ``(query, key, value)`` triple of tensors."""
        query, key, value = inputs
        batch_size = tf.shape(query)[0] # (batch_size, seq_len, embed_dim)

        query = self.split_heads(self.query_dense(query), batch_size)
        key = self.split_heads(self.key_dense(key), batch_size)
        value = self.split_heads(self.value_dense(value), batch_size)

        attention, _ = self.attention(query, key, value)
        # Undo the head split: heads go back next to head_dim, then merge.
        attention = tf.transpose(attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(attention, (batch_size, -1, self.embed_dim))

        return self.combine_heads(concat_attention)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created from config."""
        config = super(MultiHeadAttention, self).get_config()
        config.update({"embed_dim": self.embed_dim, "num_heads": self.num_heads})
        return config

class TransformerBlock(Layer):
    """One Transformer block: self-attention and a feed-forward network.

    Each sub-layer is followed by dropout, a residual connection and layer
    normalization.

    Args:
        embed_dim: Feature size of the block's input and output.
        num_heads: Number of attention heads.
        ff_dim: Hidden size of the feed-forward network.
        rate: Dropout rate applied after attention and after the feed-forward network.
        **kwargs: Standard Keras ``Layer`` arguments (``name``, ``dtype``, ...).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super(TransformerBlock, self).__init__(**kwargs)
        # Kept only so `get_config` can report the constructor arguments.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = MultiHeadAttention(embed_dim, num_heads)
        self.ffn = tf.keras.Sequential([
            Dense(ff_dim, activation="relu"),
            Dense(embed_dim),
        ])

        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=None):
        """Run the block on ``inputs``.

        ``training`` defaults to ``None`` so callers may omit it and let Keras
        supply the mode, instead of having to pass a value by hand.
        """
        attn_output = self.att([inputs, inputs, inputs])

        # Dropout randomly deactivates some neurons during training to reduce overfitting
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output) # Residual Connection
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output) # Residual Connection

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created from config."""
        config = super(TransformerBlock, self).get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
        })
        return config

class TokenAndPositionEmbedding(Layer):
    """Adds a learned position embedding to a learned token embedding.

    Args:
        maxlen: Number of positions the position-embedding table holds.
        vocab_size: Number of rows in the token-embedding table.
        embed_dim: Width of both embedding tables.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed the token ids in ``x`` and add the embedding of each position."""
        # The last axis of `x` is the sequence, so its size gives the number of positions.
        seq_len = tf.shape(x)[-1]
        position_ids = tf.range(start=0, limit=seq_len, delta=1)  # 0, 1, ..., seq_len - 1
        position_vectors = self.pos_emb(position_ids)
        token_vectors = self.token_emb(x)
        # The position vectors have no batch axis, so the sum broadcasts them over the batch.
        return token_vectors + position_vectors



In [5]:
# Model Parameters
embed_dim = 128  # Embedding size
num_heads = 4    # Number of attention heads
ff_dim = 512     # Feed-forward layer size
maxlen = seq_length # here it is 50 defined above


# Build the model
inputs = tf.keras.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, total_words, embed_dim)
x = embedding_layer(inputs)
print(x.shape)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
# Do not pin `training=True` into the graph: a hard-coded flag can keep dropout
# active outside of `fit`. Passing None leaves the mode for Keras to supply at
# run time (fit -> training, predict -> inference). It is passed explicitly so
# the call is valid whether or not `TransformerBlock.call` gives `training` a default.
x = transformer_block(x, training=None)
print(x.shape)
# Keep only the representation of the last position; it feeds the next-word classifier.
x = x[:, -1, :]
print(x.shape)
x = Dense(total_words, activation="softmax")(x)
print(x.shape)
model = tf.keras.Model(inputs=inputs, outputs=x)

# Compile the model
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

model.summary()

(None, 50, 128)
(None, 50, 128)
(None, 128)
(None, 6663)


In [6]:
# Train the model on the (X, y) pairs and keep the object returned by `fit`.
history = model.fit(
    X,
    y,
    batch_size=32,
    epochs=15,
)

Epoch 1/15
[1m2531/2531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m180s[0m 70ms/step - accuracy: 0.0840 - loss: 6.5001
Epoch 2/15
[1m2531/2531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 70ms/step - accuracy: 0.1612 - loss: 5.0723
Epoch 3/15
[1m2531/2531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m176s[0m 69ms/step - accuracy: 0.2114 - loss: 4.2803
Epoch 4/15
[1m2531/2531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 70ms/step - accuracy: 0.2569 - loss: 3.6334
Epoch 5/15
[1m2531/2531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 70ms/step - accuracy: 0.3186 - loss: 3.1030
Epoch 6/15
[1m2531/2531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m204s[0m 71ms/step - accuracy: 0.3991 - loss: 2.6039
Epoch 7/15
[1m2531/2531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m182s[0m 72ms/step - accuracy: 0.4792 - loss: 2.1852
Epoch 8/15
[1m2531/2531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 71ms/step - accuracy: 0.5494 - loss: 1.8167


In [13]:
def generate_text(seed_text, next_words, max_sequence_len):
    """Greedily extend ``seed_text`` with ``next_words`` predicted words.

    Uses the notebook-level ``tokenizer``, ``model`` and ``pad_sequences``.

    Args:
        seed_text: Prompt to continue.
        next_words: Number of words to append.
        max_sequence_len: Context length plus one; the model is fed the last
            ``max_sequence_len - 1`` tokens of the running text.

    Returns:
        ``seed_text`` followed by the generated words, separated by spaces.
    """
    context_len = max_sequence_len - 1  # loop-invariant, so computed once
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=context_len, padding='pre')
        predicted = model.predict(token_list, verbose=0)
        probs = np.asarray(predicted)[0]  # a single sequence is fed, so take its row
        # Index 0 is the padding id and has no entry in `tokenizer.index_word`;
        # leave it out of the argmax so a padding prediction cannot raise KeyError.
        next_id = int(np.argmax(probs[1:])) + 1
        seed_text += " " + tokenizer.index_word[next_id]
    return seed_text

# Continue a short prompt by 50 words and report the length of the result in characters.
seed_text = "harry at Hogwarts"
generated_text = generate_text(
    seed_text,
    next_words=50,
    max_sequence_len=seq_length + 1,
)
print(len(generated_text))

237


In [14]:
# Display the text produced by the previous generation cell.
print(generated_text)

harry at Hogwarts and he heard of course books the first few days mr dursley however was going to do he put it back in his mind as he went on to bed and stared at him if he wouldn’t be able to see if he was doing very that he was wearing


In [15]:


# Continue a second prompt by 50 words.
# NOTE(review): the recorded output of this cell is a number rather than text,
# so the cell was probably edited after its last run; re-run to refresh it.
seed_text = "harry and Snape"
generated_text = generate_text(
    seed_text,
    next_words=50,
    max_sequence_len=seq_length + 1,
)
print(generated_text)

289


In [16]:
# Display the text produced by the previous generation cell.
print(generated_text)

harry and Snape went on the table piled so teach him magic tricks and yelled uncle vernon but he had finally gone he hadn’t brought the umbrella swishing down through his newspaper at hagrid and had pulled the wild cart ride later in this was empty there was he looking stunned and stroked


In [19]:
# Continue a longer prompt by 50 words and show the result.
seed_text = "harry in the team of gryffindor"
generated_text = generate_text(
    seed_text,
    next_words=50,
    max_sequence_len=seq_length + 1,
)
print(generated_text)


harry in the team of gryffindor meant to turn up at first years it’s really dangerous with the sorcerer’s stone and neville was the very quiet until he was wearing bits of wood and reach them he had a few seconds later he had complete twelve the thought snape would never have been sick on as


In [27]:
# Continue a prompt whose two names are joined by a comma, with no space.
seed_text = "Harry,Ron"
generated_text = generate_text(
    seed_text,
    next_words=50,
    max_sequence_len=seq_length + 1,
)
print(generated_text)

Harry,Ron and hermione at all over the bundle of blankets and then came floating in midair over the floor it was standing on the floor and tiptoed toward the trophy room malfoy and crabbe weren’t gringotts the candles in the direction that ron wasn’t field where hermione was going to tell
