# Transformers - Architecture

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as torch_data
import math
import copy
import os
import json

### Input Embeddings


In essence, InputEmbeddings is the first step in a Transformer model that converts input data (like words in a sentence) into a format that the neural network can understand and process. 

* Embedding: The core of this class is the nn.Embedding layer from PyTorch. This layer is responsible for representing each unique word (or token) in your vocabulary as a dense vector. These vectors capture semantic information about the words, making them suitable for processing by the Transformer.

* Model Dimension: The model_dim parameter specifies the size of the embedding vectors. This dimension dictates how much information about each word is encoded in the embedding. A larger model dimension allows for more complex representations but also increases computational requirements.

* Vocabulary Size: The vocab_size parameter determines how many unique words (or tokens) your model needs to represent. Each word in your vocabulary gets its own unique embedding.

##### Code Implementation

In [2]:
import math
import torch.nn as nn

class InputEmbeddings(nn.Module):
    """
    This class defines the input embedding layer for a neural network. 
    It converts input words (represented as indices) into dense vector representations (embeddings).
    """
    def __init__(self, model_dim: int, vocab_size: int) -> None:
        """
        Initializes the InputEmbeddings module.

        Args:
            model_dim (int): The dimensionality of the embedding vectors.
            vocab_size (int): The size of the vocabulary.
        """
        super().__init__() # Calls Parent's constructor , i.e. nn.Module
        self.model_dim = model_dim
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(num_embeddings=vocab_size,embedding_dim=model_dim)
    
    def forward(self,x):
        """
        Performs the forward pass of the module.

        Args:
            x (torch.Tensor): Input tensor representing a sequence of word indices.

        Returns:
            torch.Tensor: Embedded representation of the input sequence.
        """
        # This is the core function of the class, defining how the module processes input data.
        # It takes an input tensor x, representing a sequence of words (usually indices into the vocabulary).
        # It uses the embedding layer to lookup the corresponding embedding vector for each word in x.
        # Scaling: The result is then multiplied by the square root of model_dim. 
        # NOTE : It helps with numerical stability and prevents the gradients from vanishing or exploding during training.
        
        return self.embedding(x) * math.sqrt(self.model_dim)


##### Example

In [3]:
sentence = "The cat sat on the mat."

"""
First, you'd convert this sentence into a sequence of integers, 
representing each word's index in your vocabulary. 
Let's say:
"""

sentence_indices = [1, 2, 3, 4, 1, 5]  

embeddings = InputEmbeddings(model_dim=512, vocab_size=10000)  # Example values
# PyTorch automatically invokes the forward() method behind the scenes.
embedded_sentence = embeddings(torch.tensor(sentence_indices)) 


In [4]:
embedded_sentence.shape

torch.Size([6, 512])

In [5]:
embedded_sentence

tensor([[ 41.9009,   8.8849,  -6.2731,  ...,  -9.9132, -21.3264, -11.4548],
        [ 26.8885, -26.3593,  27.7979,  ..., -32.8299,  20.4210,   0.2708],
        [-11.2794,  52.2548,  -4.8354,  ...,  -2.5915,  -2.3917, -17.4102],
        [ 14.0425, -38.6314,  26.4700,  ...,  27.4415,  18.1834, -14.9053],
        [ 41.9009,   8.8849,  -6.2731,  ...,  -9.9132, -21.3264, -11.4548],
        [  5.6508,  13.5732,  -9.1930,  ...,  10.7587,  -7.4208, -26.4575]],
       grad_fn=<MulBackward0>)

##### Summary
The InputEmbeddings class effectively converts your input data from raw words or indices into dense vectors that the Transformer model can understand and manipulate. 
These vectors are crucial for the Transformer to learn relationships between words, understand the context of sentences, and ultimately perform tasks like translation, text generation, or question answering.

### Positional Encoding

##### <b>Importance</b>
Positional encoding is essential for Transformers to understand the order of words in a sequence. Without it, the model wouldn't be able to differentiate between "The cat sat on the mat" and "Mat the on sat cat the."

![image.png][def]

[def]: ./image.png

Where:

- pos is the position of the word in the sequence.
- i is the dimension index.
- d_model is the size of the word embeddings.


#### Code Implementation

In [6]:
class PositionalEncoding(nn.Module):
    def __init__(self, model_dim:int, seq_len: int,dropout:float) -> None:
        super().__init__()
        self.model_dim = model_dim
        self.seq_length = seq_len
        self.dropout = nn.Dropout(p=dropout)
        #create a matrix of shape (seq_len, model_dim)
        positional_encoder = torch.zeros(seq_len,model_dim)

        # create a vector of shape(seq_len)
        position_vector = torch.arange(0,seq_len,dtype=torch.float).unsqueeze(1)

        # create a vector of shape(d_model)
        div_term = torch.exp(torch.arange(0,model_dim,2).float() * (-math.log(10000.0)/model_dim))

        # Apply sine to even indexes
        positional_encoder[:,0::2] = torch.sin(position_vector * div_term)

        # Apply cosine to odd indexes
        positional_encoder[:,1::2] = torch.cos(position_vector * div_term)

        ## Add a batch dimension to positional encoding
        positional_encoder = positional_encoder.unsqueeze(0)

        # Register the encoding as buffer
        self.register_buffer('pe',positional_encoder)

    """
    The forward method takes the output of the input embeddings (x) as input.
    It adds the positional encoding matrix (sliced to the length of the input sequence) 
    to the input embeddings.
    The resulting tensor is passed through the dropout layer to prevent overfitting.
    """
    
    def forward(self,x):
        x = x +  (self.pe[:,:x.shape[1],:]).requires_grad_(False)
        return self.dropout(x)


#### Example

In [7]:


# Set parameters
vocab_size = 1000  # Example vocabulary size
model_dim = 256    # Embedding dimension
seq_length = 20  # Maximum sequence length
dropout = 0.1    # Dropout probability

# Create instances of the modules
input_embeddings = InputEmbeddings(model_dim, vocab_size)
positional_encoding = PositionalEncoding(model_dim, seq_length, dropout)

# Sample input (random indices into the vocabulary)
input_indices = torch.randint(0, vocab_size, (1, seq_length))

# Perform the embedding and positional encoding
embedded_input = input_embeddings(input_indices)
encoded_input = positional_encoding(embedded_input)

# Print the results
print("Input Indices:", input_indices)
print("Embedded Input Shape:", embedded_input.shape)
print("Encoded Input Shape:", encoded_input.shape)

# You can also visualize the encoded input to see how positional information is added
print("Encoded Input (First Few Elements):\n", encoded_input[0, :5, :])

Input Indices: tensor([[632, 213, 745, 631, 616, 670, 124, 128, 333, 359, 439, 997, 618, 839,
         600, 316, 935, 981, 606, 530]])
Embedded Input Shape: torch.Size([1, 20, 256])
Encoded Input Shape: torch.Size([1, 20, 256])
Encoded Input (First Few Elements):
 tensor([[ -5.8089,  29.9849,  -0.0000,  ...,  -4.6928,  -1.7949,  -0.0000],
        [ -3.8191,  -3.2951,  -6.8730,  ...,  39.1492,  14.5779,  21.3420],
        [  8.8207,  13.9228,  20.5074,  ...,   0.1306,  38.0097,   8.8268],
        [ -0.0000, -24.6249,  13.2669,  ...,  -0.1755,  -6.7886,  20.4346],
        [  6.7219,  18.7789,  15.6983,  ..., -14.4984,  10.3004,   5.5087]],
       grad_fn=<SliceBackward0>)


## MultiHead-Attention

$$
\text{MultiHead}(Q, K, V) = \text{Concat}(\text{head}_1,..., \text{head}_h) W^O
$$
$$
\text{where}, \quad \text{head}_i = \text{Attention}(Q W_Q^i, K W_K^i, V W_V^i)
$$

#### Scaled Dot-Product Attention

\begin{align}
\text Attention(Q, K, V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right) V
\end{align}

In [8]:
class MultiHeadAttention(nn.Module):
    def __init__(self, model_dim:int, h:int, dropout:float) -> None:
        super().__init__()

        # Embedding Model Size
        self.model_dim = model_dim
        # Number of heads
        self.h = h

        # Make sure d_model is divisible by h
        assert model_dim % h == 0, "d_model is not divisible by h"

        # Initializing dimensions
        ## Dimension of vector seen by each head
        self.d_k = self.model_dim // h
        # Wq, Wk, Wv, Wo - Linear layers for query, key, value and output transformations
        self.w_q = nn.Linear(model_dim,model_dim,bias=False)
        self.w_k = nn.Linear(model_dim,model_dim,bias=False)
        self.w_v = nn.Linear(model_dim,model_dim,bias=False)
        self.w_o = nn.Linear(model_dim,model_dim,bias=False)
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query,key,value,mask, dropout : nn.Dropout):
        """
        Scaled Dot-Product Attention

        Args:
            query (torch.Tensor): Query tensor of shape (batch_size, seq_len, model_dim).
            key (torch.Tensor): Key tensor of shape (batch_size, seq_len, model_dim).
            value (torch.Tensor): Value tensor of shape (batch_size, seq_len, model_dim).
            mask (torch.Tensor): Mask tensor of shape (batch_size, seq_len), where 1 indicates a valid position and 0 indicates a masked position.
            dropout (nn.Dropout): _description_

        Returns:
            _type_: _description_
        """
        # Calculate dimension of the query vector
        d_k = query.shape[-1]
        # Perform scaled dot-product attention here... 
        # (Calculations for attention weights and output)
        # (batch, h, seq_len, d_k) --> (batch, h, seq_len, seq_len) 
        attention_scores = (query @ key.transpose(-2,-1))/ math.sqrt(d_k)

        # Apply mask if provided
        if mask is not None:
            # ref: https://pytorch.org/docs/stable/generated/torch.Tensor.masked_fill_.html
            # Set the attention scores corresponding to masked positions to a very low value (-1e9).
            # This ensures that the model does not attend to the masked positions.
            attention_scores.masked_fill_(mask==0,-1e9)
        
        # Normalize attention scores using softmax
        attention_scores = attention_scores.softmax(dim=-1)

        # Apply dropout to attention scores if dropout is not None
        if dropout is not None:
            attention_scores = dropout(attention_scores)
        
        # Multiply attention scores by the value matrix to get the output
        return (attention_scores @ value) , attention_scores

    def forward(self, q, k, v, mask):
        """
        Forward pass of the Multi-Head Attention layer.

        Args:
            q (torch.Tensor): Query tensor of shape (batch_size, seq_len, model_dim).
            k (torch.Tensor): Key tensor of shape (batch_size, seq_len, model_dim).
            v (torch.Tensor): Value tensor of shape (batch_size, seq_len, model_dim).
            mask (torch.Tensor): Mask tensor of shape (batch_size, seq_len), where 1 indicates a valid position and 0 indicates a masked position.

        Returns:
            torch.Tensor: Output tensor of shape (batch_size, seq_len, model_dim).
        """

        # Apply linear transformations to query, key, and value tensors
        query = self.w_q(q)
        key = self.w_k(k)
        value = self.w_v(v)

        # Reshape and transpose query, key, and value tensors to prepare for multi-head attention
        query = query.view(query.shape[0],query.shape[1],self.h,self.d_k).transpose(1,2)
        key = key.view(key.shape[0],key.shape[1],self.h,self.d_k).transpose(1,2)
        value = value.view(value.shape[0],value.shape[1],self.h,self.d_k).transpose(1,2)
        
        # Calculate attention
        # The static method 'attention' is used here to perform the scaled dot-product attention
        # The returned value 'x' is the output of the attention mechanism
        # 'self.attention_scores' stores the attention weights for later use
        x, self.attention_scores = MultiHeadAttention.attention(query,key, value, mask, self.dropout)

        # Combine all the heads together
        # The output 'x' is reshaped and transposed to combine the outputs of all heads
        # The output 'x' now has the shape (batch_size, seq_len, model_dim)
        x = x.transpose(1,2).contiguous().view(x.shape[0],-1, self.h * self.d_k)

        # Multiply by Wo
        # The final output is obtained by applying a linear transformation to the combined output 'x'
        return self.w_o(x)


#### Example

In [9]:
# Set parameters
vocab_size = 1000
model_dim = 256
seq_length = 20
h = 8      # Number of attention heads
dropout = 0.1

# Create instances of the modules
input_embeddings = InputEmbeddings(model_dim, vocab_size)
positional_encoding = PositionalEncoding(model_dim, seq_length, dropout)
multihead_attention = MultiHeadAttention(model_dim, h, dropout)

# Sample input (random indices)
input_indices = torch.randint(0, vocab_size, (1, seq_length))

# Embed and encode the input
embedded_input = input_embeddings(input_indices)
encoded_input = positional_encoding(embedded_input)

# Perform multi-head attention (using encoded input as query, key, and value)
attention_output = multihead_attention(encoded_input, encoded_input, encoded_input, None)  # No mask in this example

print("Attention Output Shape:", attention_output.shape)  # (batch_size, seq_len, model_dim)
attention_output


Attention Output Shape: torch.Size([1, 20, 256])


tensor([[[ 2.2879,  5.5303, -0.8271,  ..., -3.5598, -0.0692,  2.5537],
         [-0.4942, -3.5450, -4.1114,  ...,  6.4556, -3.5498, -7.0402],
         [-4.7805, -0.1277,  2.7567,  ..., -3.5298,  0.5763, -1.8399],
         ...,
         [-7.3817,  0.5795,  8.3423,  ...,  0.8757, -5.3898,  7.5155],
         [-8.9999, -2.4231, -0.7978,  ...,  0.7756,  2.1163,  0.4824],
         [ 1.5358, -5.0703, -4.6353,  ...,  1.4985,  3.1268, 13.1597]]],
       grad_fn=<UnsafeViewBackward0>)

## LayerNormalization

In [10]:
class LayerNormalization(nn.Module):

    def __init__(self, features: int , eps: float= 10 ** -6) -> None:
        """
        Initializes the LayerNormalization module.

        Args:
            features (int): The number of features in the input tensor.
            eps (float, optional): A small value added to the denominator to prevent division by zero. Defaults to 10^-6.
        """
        super().__init__()
        self.eps = eps
        # alpha and bias are learnable parameters
        self.alpha = nn.Parameter(torch.ones(features))
        self.bias = nn.Parameter(torch.ones(features))

    def forward(self,x):
        """
        Applies LayerNormalization to the input tensor.

        Args:
            x (torch.Tensor): The input tensor with shape (batch_size, seq_len, hidden_size).

        Returns:
            torch.Tensor: The normalized output tensor with the same shape as the input.
        """
        #x :( batch_size, seq_len, hidden_size)
        
        mean = x.mean(dim = -1, keepdim = True) #(batch,seq_len, 1)
        std = x.std(dim = -1 , keepdim = True)

        return self.alpha * (x - mean) / (std + self.eps) +  self.bias
    

## FeedForward Network

In [11]:
import torch
import torch.nn as nn

class FeedForward(nn.Module):
    """
    A feedforward neural network with two linear layers and ReLU activation.
    """
    def __init__(self, model_dim:int, d_ff:int, dropout:float) -> None:
        """
        Initializes the FeedForward module.

        Args:
            model_dim (int): The dimension of the input and output.
            d_ff (int): The dimension of the hidden layer.
            dropout (float): The dropout probability.
        """
        super().__init__()
        # w1 and b1
        self.linear_1 = nn.Linear(model_dim,d_ff) # First linear layer
        self.dropout = nn.Dropout(dropout) # Dropout layer
        # w2 and b2
        self.linear_2 = nn.Linear(d_ff,model_dim) # Second linear layer
    
    def forward(self,x):
        """
        Performs the forward pass of the network.

        Args:
            x (torch.Tensor): The input tensor.

        Returns:
            torch.Tensor: The output tensor.
        """
        return self.linear_2(self.dropout(torch.relu(self.linear_1(x))))


## Optimizer : ResidualConnection

In [12]:
class ResidualConnection(nn.Module):
    """
    Implements a residual connection with dropout and layer normalization.

    Args:
        features (int): The number of features in the input tensor.
        dropout (float): The dropout probability.
    """
    def __init__(self, features: int, dropout: float) -> None:
        super().__init__()
        # Initialize dropout layer
        self.dropout = nn.Dropout(dropout)
        # Initialize layer normalization
        self.norm  = LayerNormalization(features=features)
    
    def forward(self, x, sublayer):
        """
        Performs the forward pass of the residual connection.

        Args:
            x (torch.Tensor): The input tensor.
            sublayer (nn.Module): The sublayer to apply to the input.

        Returns:
            torch.Tensor: The output tensor.
        """
        # Apply layer normalization to the input
        normalized_x = self.norm(x)
        # Apply the sublayer to the normalized input
        sublayer_output = sublayer(normalized_x)
        # Apply dropout to the sublayer output
        dropped_output = self.dropout(sublayer_output)
        # Add the original input to the dropped sublayer output
        return x + dropped_output
    

## Encoder

In [13]:
class EncoderBlock(nn.Module):
    """
    Represents a single encoder block in a Transformer architecture. 

    Args:
        features (int): Number of features in the input and output of the block.
        self_attention (MultiHeadAttention): The multi-head self-attention module.
        feed_forward (FeedForward): The feed-forward neural network.
        dropout (float): Dropout probability.
    """
    def __init__(self, features: int, self_attention:MultiHeadAttention, feed_forward: FeedForward, dropout: float) -> None:
        super().__init__()

        # Initialize the self-attention and feed-forward modules
        self.self_attention = self_attention
        self.feed_forward = feed_forward

        # Create a list of residual connections, one for each sub-layer
        self.residual_connections = nn.ModuleList([ResidualConnection(features,dropout) for _ in range(2)])
    
    def forward(self, x, src_mask):
        """
        Performs a forward pass through the encoder block.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, sequence_length, features).
            src_mask (torch.Tensor): Mask for the source sequence, used in the self-attention layer.
        Returns:
            torch.Tensor: Output tensor of shape (batch_size, sequence_length, features).
        """
        # Apply self-attention with residual connection
        x = self.residual_connections[0](x, lambda x: self.self_attention(x,x,x,src_mask))

        # Apply feed-forward network with residual connection
        x = self.residual_connections[1](x, self.feed_forward)

        # Return the output of the encoder block
        return x


In [14]:
class Encoder(nn.Module):
    """
    Encoder module for the Transformer model.

    Args:
        features (int): The number of features in the input sequence.
        layers (nn.ModuleList): A list of encoder layers.
    """
    def __init__(self, features: int, layers: nn.ModuleList ) -> None:
        super().__init__()
        self.layers = layers  # Store the encoder layers
        self.norm = LayerNormalization(features=features) # Initialize layer normalization

    def forward(self, x, mask):
        """
        Forward pass of the encoder.

        Args:
            x (torch.Tensor): The input sequence.
            mask (torch.Tensor): The attention mask.

        Returns:
            torch.Tensor: The encoded sequence.
        """
        for layer in self.layers:  # Iterate through each encoder layer
            x = layer(x, mask)  # Apply the layer to the input
            
        return self.norm(x)  # Apply layer normalization to the output


## Decoder

In [15]:
class DecoderBlock(nn.Module):
    """
    A single decoder block in the Transformer architecture.

    Args:
        features (int): Number of features in the input and output.
        self_attention (MultiHeadAttention): Multi-head self-attention module.
        cross_attention (MultiHeadAttention): Multi-head cross-attention module.
        feed_forward (FeedForward): Feed-forward neural network module.
        dropout (float): Dropout probability.
    """
    def __init__(self, features: int, self_attention: MultiHeadAttention, cross_attention:MultiHeadAttention, feed_forward: FeedForward, dropout: float) -> None:
        super().__init__()
        self.self_attention = self_attention
        self.cross_attention = cross_attention
        self.feed_forward = feed_forward
        # Create 3 residual connections for self-attention, cross-attention, and feed-forward
        self.residual_connections = nn.ModuleList([ResidualConnection(features,dropout) for _ in range(3)])
    

    def forward(self, x , encoder_output, src_mask, tgt_mask):
        """
        Forward pass of the decoder block.

        Args:
            x (torch.Tensor): Decoder input, shape (batch_size, seq_len, features).
            encoder_output (torch.Tensor): Encoder output, shape (batch_size, src_seq_len, features).
            src_mask (torch.Tensor): Source mask, shape (batch_size, src_seq_len).
            tgt_mask (torch.Tensor): Target mask, shape (batch_size, tgt_seq_len).
        Returns:
            torch.Tensor: Decoder output, shape (batch_size, seq_len, features).
        """
        # Apply self-attention with residual connection
        x = self.residual_connections[0](x, lambda x: self.self_attention(x,x,x, tgt_mask))
        # Apply cross-attention with residual connection
        x = self.residual_connections[1](x, lambda x: self.cross_attention(x,encoder_output,encoder_output,src_mask))
        # Apply feed-forward with residual connection
        x = self.residual_connections[2](x, self.feed_forward)
        return x


In [16]:
class Decoder(nn.Module):
    """
    The decoder module for the Transformer model.

    Args:
        features (int): The number of features in the decoder.
        layers (nn.ModuleList): A list of decoder layers.

    """
    def __init__(self, features: int , layers: nn.ModuleList) -> None:
        super().__init__()

        # Initialize the decoder layers
        self.layers = layers
        # Initialize the layer normalization layer
        self.norm = LayerNormalization(features= features)

    def forward(self,x, encoder_output, src_mask, tgt_mask):
        """
        Forward pass of the decoder.

        Args:
            x (torch.Tensor): The decoder input.
            encoder_output (torch.Tensor): The encoder output.
            src_mask (torch.Tensor): The source mask.
            tgt_mask (torch.Tensor): The target mask.

        Returns:
            torch.Tensor: The decoder output.
        """
        # Iterate over the decoder layers
        for layer in self.layers:
            # Apply the current layer to the input
            x = layer(x, encoder_output, src_mask, tgt_mask)
        
        # Apply layer normalization to the output
        return self.norm(x)

## Projection Layer

In [17]:
class ProjectionLayer(nn.Module):
    """
    This class defines a projection layer that maps a hidden state representation 
    to a vocabulary space. 
    """

    def __init__(self, model_dim: int, vocab_size: int) -> None:
        """
        Initializes the ProjectionLayer.

        Args:
            model_dim (int): The dimensionality of the hidden state.
            vocab_size (int): The size of the vocabulary.
        """
        super().__init__()
        # Define a linear layer to project the hidden state to the vocabulary size
        self.proj = nn.Linear(model_dim, vocab_size)

    def forward(self, x) -> None:
        """
        Performs the forward pass of the projection layer.

        Args:
            x (torch.Tensor): The hidden state representation.

        Returns:
            torch.Tensor: The projected output in the vocabulary space.
        """
        # Apply the linear projection to the input
        return self.proj(x)

## Transformer

In [18]:
class Transformer(nn.Module):
    def __init__(self, encoder: Encoder, decoder: Decoder, src_embed: InputEmbeddings, tgt_embed: InputEmbeddings,src_pos: PositionalEncoding, tgt_pos: PositionalEncoding,projection_layer:ProjectionLayer):
        """
        Initializes the Transformer model.

        Args:
            encoder (Encoder): The encoder module.
            decoder (Decoder): The decoder module.
            src_embed (InputEmbeddings): The source embedding layer.
            tgt_embed (InputEmbeddings): The target embedding layer.
            src_pos (PositionalEncoding): The source positional encoding layer.
            tgt_pos (PositionalEncoding): The target positional encoding layer.
            projection_layer (ProjectionLayer): The projection layer for the final output.
        """
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.src_pos = src_pos
        self.tgt_pos = tgt_pos
        self.projection_layer = projection_layer
    
    def encode(self, src, src_mask):
        """
        Encodes the source sequence.

        Args:
            src (torch.Tensor): The source sequence.
            src_mask (torch.Tensor): The source mask.

        Returns:
            torch.Tensor: The encoded source sequence.
        """
        src = self.src_embed(src)
        src = self.src_pos(src)
        return self.encoder(src, src_mask)
    
    def decode(self, encoder_output: torch.Tensor, src_mask: torch.Tensor, tgt: torch.Tensor, tgt_mask: torch.Tensor):
        """
        Decodes the target sequence given the encoded source sequence.

        Args:
            encoder_output (torch.Tensor): The encoded source sequence.
            src_mask (torch.Tensor): The source mask.
            tgt (torch.Tensor): The target sequence.
            tgt_mask (torch.Tensor): The target mask.

        Returns:
            torch.Tensor: The decoded target sequence.
        """
        tgt = self.tgt_embed(tgt)
        tgt = self.tgt_pos(tgt)
        return self.decoder(tgt,encoder_output,src_mask,tgt_mask)
    
    def project(self,x):
        """
        Projects the output of the decoder to the target vocabulary.

        Args:
            x (torch.Tensor): The output of the decoder.

        Returns:
            torch.Tensor: The projected output.
        """
        return self.projection_layer(x)


In [19]:
def build_transformer(src_vocab_size: int, tgt_vocab_size:int, src_seq_len:int, tgt_seq_len: int, model_dim: int = 512 , N: int = 6, h : int = 8, dropout: float= 0.1, d_ff: int = 2048):
    # Create Embedding Layers
    src_embed = InputEmbeddings(model_dim,vocab_size= src_vocab_size) # Embedding layer for source language
    tgt_embed = InputEmbeddings(model_dim,vocab_size= tgt_vocab_size) # Embedding layer for target language

    # Create Positional Encoding Layers
    src_pos = PositionalEncoding(model_dim, seq_len = src_seq_len,dropout=dropout) # Positional encoding for source language
    tgt_pos = PositionalEncoding(model_dim, seq_len = tgt_seq_len,dropout=dropout) # Positional encoding for target language

    # Create encoder blocks
    encoder_blocks = []
    for _ in range(N): # Create N encoder blocks
        encoder_self_attention_block = MultiHeadAttention(model_dim,h,dropout) # Multi-head self-attention block for encoder
        feed_forward_block = FeedForward(model_dim,d_ff,dropout) # Feed-forward network for encoder
        encoder_block = EncoderBlock(features=model_dim,self_attention=encoder_self_attention_block,feed_forward=feed_forward_block,dropout=dropout) # Combine attention and feedforward for an encoder block
        encoder_blocks.append(encoder_block)

    # Create decoder blocks
    decoder_blocks = []
    for _ in range(N): # Create N decoder blocks
        decoder_self_attention_block = MultiHeadAttention(model_dim,h,dropout) # Multi-head self-attention block for decoder
        decoder_cross_attention_block = MultiHeadAttention(model_dim,h, dropout) # Multi-head cross-attention block for decoder

        feed_forward_block = FeedForward(model_dim,d_ff,dropout) # Feed-forward network for decoder
        decoder_block = DecoderBlock(features=model_dim,self_attention=decoder_self_attention_block,cross_attention=decoder_cross_attention_block,feed_forward=feed_forward_block,dropout=dropout) # Combine attention and feedforward for a decoder block
        decoder_blocks.append(decoder_block)
        
    
    # Create the encoder and decoder block
    encoder = Encoder(model_dim, nn.ModuleList(encoder_blocks)) # Combine all encoder blocks
    decoder = Decoder(model_dim, nn.ModuleList(decoder_blocks)) # Combine all decoder blocks

    # Create the projection layer
    projection_layer = ProjectionLayer(model_dim=model_dim,vocab_size=tgt_vocab_size) # Layer to project decoder output to target language vocabulary

    # Create the Transformer
    transformer = Transformer(encoder,decoder,src_embed,tgt_embed,src_pos,tgt_pos,projection_layer) # Combine all components into the transformer

    # Intialize the parameters , rather than starting randomly
    for p in transformer.parameters(): 
        if p.dim() > 1:
            nn.init.xavier_uniform(p) # Initialize parameters using Xavier uniform initialization
    return transformer
    

# Training Transformer NN

In [20]:
%pip install datasets tokenizers -q


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [21]:
import torch
import torch.nn as nn

from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


##  DataLoader

In [22]:
def remove_inner_quotes(text):
  """
  Removes inner quotes (both single and double) if they form a pair.

  Args:
    text: The input string with potential inner quotes.

  Returns:
    The string with inner quotes removed, if present.
  """
  quote_type = None
  result = []
  for char in text:
    if char in ("'", '"'):
      if quote_type is None:
        quote_type = char
      elif char == quote_type:
        quote_type = None
      else:
        result.append(char)
    else:
      result.append(char)
  return "".join(result)

# Example usage:
text1 = "This is a test 'with' some inner quotes."
text2 = 'This is another test "with" more inner quotes.'
text3 = "This has 'single' and \"double\" quotes."

print(f"Text 1: {remove_inner_quotes(text1)}")
print(f"Text 2: {remove_inner_quotes(text2)}")
print(f"Text 3: {remove_inner_quotes(text3)}")


Text 1: This is a test with some inner quotes.
Text 2: This is another test with more inner quotes.
Text 3: This has single and double quotes.


In [23]:

def remove_trailing_newline(text):
  """
  Removes trailing newline characters (\n) from a string.

  Args:
    text: The input string.
  Returns:
    The string with trailing newlines removed.
  """
  return text.rstrip('\n').strip()




def preprocess(text: str):
  """
  Preprocesses a string by removing inner quotes and trailing newlines.

  Args:
    text: The input string.

  Returns:
    The preprocessed string.
  """
  text = remove_inner_quotes(text) # Remove inner quotes
  text = remove_trailing_newline(text) # Remove trailing newlines
  return text


In [None]:
# import langdetect

# def detect_text_language(text):
#   """
#   Detects the language of the given text using the langdetect library.

#   Args:
#     text: The text string to analyze.

#   Returns:
#     A string representing the detected language code (e.g., 'en', 'fr', 'es')
#     or 'Unknown' if no language could be confidently detected.
#   """
#   try:
#     return langdetect.detect(text)
#   except langdetect.LangDetectException:
#     return 'Unknown'




ModuleNotFoundError: No module named 'langdetect'

#### Dataset Creation

In [25]:
from tqdm import tqdm 

directory = Path.cwd().parents[1]
data_dir = os.path.join(directory,"data")
filepath = os.path.join(data_dir,"x00.txt")

# with open(filepath,"r",encoding="utf-8") as file:
#     data = file.readlines()


# def create_dataset(data):
#     dataset = []
    
#     for record in tqdm(data):
#         fields = record.split("\t")
#         src_text = preprocess(fields[2])
#         tgt_text = preprocess(fields[1])
        
#         dataset.append({
#             "src": src_text,
#             "tgt": tgt_text,
#             "src_lang": detect_text_language(src_text),
#             "tgt_lang": detect_text_language(tgt_text),
#             "src_len": len(src_text),
#             "tgt_len":  len(tgt_text)
#         })
    
#     return dataset

# dataset = create_dataset(data)

# with open(os.path.join(data_dir,'translation_dataset_01.json'), 'w',encoding="utf-8") as json_file:
#     json.dump(dataset, json_file, indent=4, ensure_ascii=False)

In [26]:
dataset_path = os.path.join(data_dir,"translation_dataset_01.json")

raw_dataset = json.load(open(dataset_path,encoding="utf-8"))

In [27]:
# Function to filter records by src_lang and tgt_lang
def filter_records(data, src_lang, tgt_lang):
    return [record for record in tqdm(data) if record['src_lang'] == src_lang and record['tgt_lang'] == tgt_lang]


# Call the function to filter records where src_lang is 'en' and tgt_lang is 'bn'
filtered_data = filter_records(raw_dataset, src_lang='en', tgt_lang='bn')

# Print the filtered data


100%|██████████| 100000/100000 [00:00<00:00, 5041049.00it/s]


In [28]:
tgt_sentences = [ data.get("tgt") for data in filtered_data]
src_sentences = [ data.get("src") for data in filtered_data]

In [29]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders, processors
# tokenizer_path = os.path(data_dir,'bengali_bpe_tokenizer.json')

def create_or_load_tokenizer(sentences: str , tokenizer_path: os.path, vocab_size:int =30000 , min_frequency: int =2):
    # Check if the tokenizer file already exists
    if os.path.exists(tokenizer_path):
        # Load the existing tokenizer
        tokenizer = Tokenizer.from_file(tokenizer_path)
        print(f"Tokenizer loaded from {tokenizer_path}")
    else:
        # Initialize a BPE tokenizer
        tokenizer = Tokenizer(models.BPE())

        # Define a pre-tokenizer (to split on whitespace and punctuation)
        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

        # Define the BPE trainer
        trainer = trainers.BpeTrainer(vocab_size=vocab_size, min_frequency=min_frequency, 
                                      special_tokens=["[PAD]", "[UNK]", "[SOS]", "[EOS]"])

        # Train the tokenizer on your Bengali dataset
        tokenizer.train_from_iterator(sentences, trainer)

        # Save the tokenizer for future use
        tokenizer.save(tokenizer_path)
        print(f"Tokenizer trained and saved to {tokenizer_path}")

    return tokenizer


In [30]:
tokenizer_src = create_or_load_tokenizer(sentences=src_sentences, tokenizer_path= os.path.join(data_dir,"en_src_bpe_tokenizer_01.json"))
tokenizer_tgt = create_or_load_tokenizer(sentences=tgt_sentences, tokenizer_path= os.path.join(data_dir,"bn_tgt_bpe_tokenizer_01.json"))


Tokenizer loaded from /Users/nityavg/Labs/ML4Interviews/data/en_src_bpe_tokenizer_01.json
Tokenizer loaded from /Users/nityavg/Labs/ML4Interviews/data/bn_tgt_bpe_tokenizer_01.json


In [31]:
import torch

def casual_mask(size):
    """
    Creates a casual mask for a sequence of length `size`.

    Args:
        size: The length of the sequence.

    Returns:
        A PyTorch tensor of shape (1, size, size) where:
        - 1s represent valid connections (i.e., elements can see themselves and elements before them)
        - 0s represent invalid connections (i.e., elements cannot see elements after them)

    Example:
        >>> casual_mask(5)
        tensor([[[ True,  True,  True,  True,  True],
                [False,  True,  True,  True,  True],
                [False, False,  True,  True,  True],
                [False, False, False,  True,  True],
                [False, False, False, False,  True]]])
    """
    mask  = torch.triu(torch.ones((1,size, size)),diagonal=1).type(torch.int)
    return mask == 0

In [62]:
import torch
from torch.utils.data import Dataset


# # It should contain both src and tgt sentences,
# class MultilingualDataset(Dataset):
#     def __init__(self, src_sentences,tgt_sentences, tokenizer_src, tokenizer_tgt, seq_len, dtype = torch.int64) -> None:
#         super().__init__()
#         self.seq_len = seq_len
#         self.dtype = dtype

#         self.src_sentences = src_sentences
#         self.tgt_sentences = tgt_sentences
#         self.tokenizer_src = tokenizer_src
#         self.tokenizer_tgt = tokenizer_tgt

#         self.sos_token = torch.tensor([tokenizer_tgt.token_to_id("[SOS]")], dtype=self.dtype)
#         self.eos_token = torch.tensor([tokenizer_tgt.token_to_id("[EOS]")], dtype=self.dtype)
#         self.pad_token = torch.tensor([tokenizer_tgt.token_to_id("[PAD]")], dtype=self.dtype)

#     def __len__(self):
#         return len(self.src_sentences)
    
#     def __getitem__(self, index):
#         src_text = self.src_sentences[index]
#         tgt_text = self.tgt_sentences[index]

#         # Transform the text into tokens
#         encoded_input_tokens = self.tokenizer_src.encode(src_text).ids
#         decoded_input_tokens = self.tokenizer_tgt.encode(tgt_text).ids
       
#         # Add [SOS],[EOS] and padding to each sentence
#         enc_num_padding_tokens = self.seq_len - len(encoded_input_tokens) - 2 # for [SOS] and [EOS] token

#         # For decoder add just [SOS]
#         dec_num_padding_tokens = self.seq_len - len(decoded_input_tokens) - 1

#         # Make sure the number of padding is not negative.
#         if enc_num_padding_tokens < 0 or dec_num_padding_tokens < 0:
#             raise ValueError(f"sentence is more than seq_len:{self.seq_len}")
        

#         # Add [SOS],[EOS] token
#         # Concatenate the [SOS], encoded tokens, [EOS] and padding tokens for the encoder input
#         encoder_input = torch.cat(
#             [
#                 self.sos_token,  # Start of sentence token
#                 torch.tensor(encoded_input_tokens,dtype=self.dtype),  # Encoded source sentence tokens
#                 self.eos_token,  # End of sentence token
#                 torch.tensor([self.pad_token]* enc_num_padding_tokens,dtype=self.dtype)  # Padding tokens to reach the desired sequence length
#             ],
#             dim= 0 
#         )
#         # Add only [S0S] token
#         decoder_input = torch.cat(
#             [
#                 self.sos_token,
#                 torch.tensor(decoded_input_tokens,dtype=self.dtype),
#                 torch.tensor([self.pad_token] * dec_num_padding_tokens, dtype= self.dtype)
#             ],
#             dim = 0
#         )

#         label = torch.cat(
#             [
#                 torch.tensor(decoded_input_tokens,dtype=self.dtype),
#                 self.eos_token,
#                 torch.tensor([self.pad_token] * dec_num_padding_tokens, dtype=self.dtype)

#             ],
#             dim = 0,
#         )

#         return {
#             "encoder_input": encoder_input,
#             "decoder_input" :  decoder_input,
#             "encoder_mask" : (encoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int(),
#             "decoder_mask" : (decoder_input != self.pad_token).unsqueeze(0).int() & casual_mask(decoder_input.size(0)), # (1, seq_len) & (1, seq_len, seq_len),
#             "label": label,  # (seq_len)
#             "src_text": src_text,
#             "tgt_text" : tgt_text
            
#         }

def causal_mask(seq_len: int) -> torch.Tensor:
    """
    Creates a causal mask for self-attention.
    Shape: (1, seq_len, seq_len)
    """
    mask = torch.tril(torch.ones((seq_len, seq_len), dtype=torch.int))
    return mask.unsqueeze(0)  # Add batch dimension


class MultilingualDataset(Dataset):
    def __init__(self, src_sentences, tgt_sentences, tokenizer_src, tokenizer_tgt, seq_len, dtype=torch.int64) -> None:
        super().__init__()
        self.seq_len = seq_len
        self.dtype = dtype

        self.src_sentences = src_sentences
        self.tgt_sentences = tgt_sentences
        self.tokenizer_src = tokenizer_src
        self.tokenizer_tgt = tokenizer_tgt

        self.sos_token = torch.tensor([tokenizer_tgt.token_to_id("[SOS]")], dtype=self.dtype)
        self.eos_token = torch.tensor([tokenizer_tgt.token_to_id("[EOS]")], dtype=self.dtype)
        self.pad_token = torch.tensor([tokenizer_tgt.token_to_id("[PAD]")], dtype=self.dtype)

    def __len__(self):
        return len(self.src_sentences)
    
    def __getitem__(self, index):
        src_text = self.src_sentences[index]
        tgt_text = self.tgt_sentences[index]

        # Tokenize
        encoded_input_tokens = self.tokenizer_src.encode(src_text).ids
        decoded_input_tokens = self.tokenizer_tgt.encode(tgt_text).ids
       
        # Padding calculation
        enc_num_padding_tokens = self.seq_len - len(encoded_input_tokens) - 2  # [SOS], [EOS]
        dec_num_padding_tokens = self.seq_len - len(decoded_input_tokens) - 1  # [SOS]

        if enc_num_padding_tokens < 0 or dec_num_padding_tokens < 0:
            raise ValueError(f"Sentence is longer than seq_len: {self.seq_len}")
        
        # Encoder input: [SOS] ... [EOS] [PAD...]
        encoder_input = torch.cat(
            [self.sos_token,
             torch.tensor(encoded_input_tokens, dtype=self.dtype),
             self.eos_token,
             self.pad_token.repeat(enc_num_padding_tokens)],
            dim=0
        )

        # Decoder input: [SOS] ... [PAD...]
        decoder_input = torch.cat(
            [self.sos_token,
             torch.tensor(decoded_input_tokens, dtype=self.dtype),
             self.pad_token.repeat(dec_num_padding_tokens)],
            dim=0
        )

        # Labels: ... [EOS] [PAD...]
        label = torch.cat(
            [torch.tensor(decoded_input_tokens, dtype=self.dtype),
             self.eos_token,
             self.pad_token.repeat(dec_num_padding_tokens)],
            dim=0
        )

        return {
            "encoder_input": encoder_input,
            "decoder_input": decoder_input,
            "encoder_mask": (encoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int(),
            "decoder_mask": (decoder_input != self.pad_token).unsqueeze(0).unsqueeze(1).int() & causal_mask(decoder_input.size(0)),
            "label": label,
            "src_text": src_text,
            "tgt_text": tgt_text
        }    

In [63]:

dataset = MultilingualDataset(src_sentences=src_sentences,tgt_sentences=tgt_sentences,tokenizer_src=tokenizer_src,tokenizer_tgt=tokenizer_tgt,seq_len=512)

In [64]:
dataset[122]

{'encoder_input': tensor([    2,  1387,  2387,  2273,  3192,  1417,  1515,  1616,  1377,  1345,
         26048,  1355,  2273,  6177,    17,     3,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0

In [65]:
from torch.utils.data import Dataset, DataLoader, random_split

def test_train_split(dataset: Dataset,prob : float = 0.9):
    """Splits a dataset into training and validation sets.

    Args:
        dataset (Dataset): The dataset to split.
        prob (float, optional): The proportion of the dataset to use for training. Defaults to 0.9.

    Returns:
        tuple: A tuple containing the training and validation datasets.
    """
    # Keep 90% for training, 10% for validation
    train_ds_size = int(prob * len(dataset))
    val_ds_size = len(dataset) - train_ds_size
    train_ds, val_ds = random_split(dataset, [train_ds_size, val_ds_size])
    return train_ds, val_ds

def get_src_tgt_sentences(ds:Dataset):
    """Extracts source and target sentences from a dataset.

    Args:
        ds (Dataset): The dataset to extract sentences from.

    Returns:
        tuple: A tuple containing the source and target sentences.
    """
    src_sentences = [ sent[0] for sent in ds.dataset]
    tgt_sentences = [ sent[1] for sent in ds.dataset]
    return src_sentences, tgt_sentences

def get_dataset(dataset:Dataset,batch_size = 7):
    """Prepares the dataset for training and validation.

    Args:
        dataset (Dataset): The dataset to prepare.

    Returns:
        tuple: A tuple containing the training and validation datasets.
    """
    filtered_data = filter_records(dataset, src_lang='en', tgt_lang='bn')  # Filter records based on languages
    tgt_sentences = [ data.get("tgt") for data in filtered_data]
    src_sentences = [ data.get("src") for data in filtered_data]

    tokenizer_src = create_or_load_tokenizer(sentences=src_sentences, tokenizer_path= os.path.join(data_dir,"en_src_bpe_tokenizer_01.json"))
    tokenizer_tgt = create_or_load_tokenizer(sentences=tgt_sentences, tokenizer_path= os.path.join(data_dir,"bn_tgt_bpe_tokenizer_01.json"))

    train_ds, val_ds = test_train_split(list(zip(src_sentences,tgt_sentences)))  # Split into train and validation sets
    train_src_ds, train_tgt_ds = get_src_tgt_sentences(train_ds)  # Extract source and target sentences from train set
    val_src_ds, val_tgt_ds = get_src_tgt_sentences(val_ds)  # Extract source and target sentences from validation set
    train_ds = MultilingualDataset(src_sentences=train_src_ds,tgt_sentences=train_tgt_ds,tokenizer_src=tokenizer_src,tokenizer_tgt=tokenizer_tgt,seq_len=512)
    val_ds = MultilingualDataset(src_sentences=val_src_ds,tgt_sentences=val_tgt_ds,tokenizer_src=tokenizer_src,tokenizer_tgt=tokenizer_tgt,seq_len=512)
    
    # Find the max len of each sentence in source and target sentences
    max_len_src = 0
    max_len_tgt = 0

    for item in list(zip(src_sentences,tgt_sentences)):
        src_ids = tokenizer_src.encode(item[0]).ids
        tgt_ids = tokenizer_tgt.encode(item[1]).ids
        max_len_src = max(max_len_src,len(src_ids))
        max_len_tgt = max(max_len_tgt,len(tgt_ids))
    
    print(f'Max length of source sentence: {max_len_src}')
    print(f'Max length of target sentence: {max_len_tgt}')

    train_dataloader = DataLoader(train_ds,batch_size=batch_size,shuffle=True)
    val_dataloader = DataLoader(val_ds,batch_size=1, shuffle=True)

    return train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt




In [66]:
train_dl, val_dl, tokenizer_src, tokenizer_tgt = get_dataset(raw_dataset)

100%|██████████| 100000/100000 [00:00<00:00, 4103533.83it/s]


Tokenizer loaded from /Users/nityavg/Labs/ML4Interviews/data/en_src_bpe_tokenizer_01.json
Tokenizer loaded from /Users/nityavg/Labs/ML4Interviews/data/bn_tgt_bpe_tokenizer_01.json
Max length of source sentence: 2603
Max length of target sentence: 1813


In [67]:
train_dl.dataset.__getitem__(1)

{'encoder_input': tensor([    2, 14182,  1614,  1417,  8131,  1424,  1546,  4204,    17,     3,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0

In [68]:
def get_model(vocab_src_len, vocab_tgt_len):
    model = build_transformer(src_vocab_size=vocab_src_len,
                              tgt_vocab_size=vocab_tgt_len,
                              src_seq_len= 512,
                              tgt_seq_len=512,
                              model_dim= 512)
    return model

In [69]:
from pathlib import Path

def get_config():
    return {
        "batch_size": 4,
        "num_epochs": 20,
        "lr": 10**-4,
        "seq_len": 350,
        "model_dim": 512,
        "datasource" : "en_bn_open",
        "model_folder": "weights",
        "model_basename": "tmodel_",
        "preload": "latest",
        "tokenizer_file": "tokenizer_{0}.json",
        "experiment_name": "runs/tmodel"
    }

def get_weights_file_path(config, epoch: str):
    model_folder = f"{config['datasource']}_{config['model_folder']}"
    model_filename = f"{config['model_basename']}{epoch}.pt"
    return str(Path('.') / model_folder / model_filename)

# Find the latest weights file in the weights folder
def latest_weights_file_path(config):
    model_folder = f"{config['datasource']}_{config['model_folder']}"
    model_filename = f"{config['model_basename']}*"
    weights_files = list(Path(model_folder).glob(model_filename))
    if len(weights_files) == 0:
        return None
    weights_files.sort()
    return str(weights_files[-1])

In [70]:
def run_validation(model, validation_ds, tokenizer_src, tokenizer_tgt, max_len, device, print_msg, global_step, writer, num_examples=2):
    model.eval()
    count = 0

    source_texts = []
    expected = []
    predicted = []

    try:
        # get the console window width
        with os.popen('stty size', 'r') as console:
            _, console_width = console.read().split()
            console_width = int(console_width)
    except:
        # If we can't get the console width, use 80 as default
        console_width = 80

    with torch.no_grad():
        for batch in validation_ds:
            count += 1
            encoder_input = batch["encoder_input"].to(device) # (b, seq_len)
            encoder_mask = batch["encoder_mask"].to(device) # (b, 1, 1, seq_len)

            # check that the batch size is 1
            assert encoder_input.size(
                0) == 1, "Batch size must be 1 for validation"

            model_out = greedy_decode(model, encoder_input, encoder_mask, tokenizer_src, tokenizer_tgt, max_len, device)

            source_text = batch["src_text"][0]
            target_text = batch["tgt_text"][0]
            model_out_text = tokenizer_tgt.decode(model_out.detach().cpu().numpy())

            source_texts.append(source_text)
            expected.append(target_text)
            predicted.append(model_out_text)
            
            # Print the source, target and model output
            print_msg('-'*console_width)
            print_msg(f"{f'SOURCE: ':>12}{source_text}")
            print_msg(f"{f'TARGET: ':>12}{target_text}")
            print_msg(f"{f'PREDICTED: ':>12}{model_out_text}")

            if count == num_examples:
                print_msg('-'*console_width)
                break
    
    if writer:
        # Evaluate the character error rate
        # Compute the char error rate 
        metric = torchmetrics.CharErrorRate()
        cer = metric(predicted, expected)
        writer.add_scalar('validation cer', cer, global_step)
        writer.flush()

        # Compute the word error rate
        metric = torchmetrics.WordErrorRate()
        wer = metric(predicted, expected)
        writer.add_scalar('validation wer', wer, global_step)
        writer.flush()

        # Compute the BLEU metric
        metric = torchmetrics.BLEUScore()
        bleu = metric(predicted, expected)
        writer.add_scalar('validation BLEU', bleu, global_step)
        writer.flush()

In [71]:
import torchmetrics
from torch.utils.tensorboard import SummaryWriter

def train_model(config,dataset):
    # Define the device
    device = "cuda" if torch.cuda.is_available() else "mps" if torch.has_mps or torch.backends.mps.is_available() else "cpu"
    print("Using device:", device)
    if (device == 'cuda'):
        print(f"Device name: {torch.cuda.get_device_name(device.index)}")
        print(f"Device memory: {torch.cuda.get_device_properties(device.index).total_memory / 1024 ** 3} GB")
    elif (device == 'mps'):
        print(f"Device name: <mps>")
    else:
        print("NOTE: If you have a GPU, consider using it for training.")
        print("      On a Windows machine with NVidia GPU, check this video: https://www.youtube.com/watch?v=GMSjDTU8Zlc")
        print("      On a Mac machine, run: pip3 install --pre torch torchvision torchaudio torchtext --index-url https://download.pytorch.org/whl/nightly/cpu")
    device = torch.device(device)

    # Make sure the weights folder exists
    Path(f"{config['datasource']}_{config['model_folder']}").mkdir(parents=True, exist_ok=True)

    train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_dataset(dataset)
    model = get_model(tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size()).to(device)
    # Tensorboard
    writer = SummaryWriter(config['experiment_name'])

    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], eps=1e-9)

    # If the user specified a model to preload before training, load it
    initial_epoch = 0
    global_step = 0
    preload = config['preload']
    model_filename = latest_weights_file_path(config) if preload == 'latest' else get_weights_file_path(config, preload) if preload else None
    if model_filename:
        print(f'Preloading model {model_filename}')
        state = torch.load(model_filename)
        model.load_state_dict(state['model_state_dict'])
        initial_epoch = state['epoch'] + 1
        optimizer.load_state_dict(state['optimizer_state_dict'])
        global_step = state['global_step']
    else:
        print('No model to preload, starting from scratch')

    loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer_src.token_to_id('[PAD]'), label_smoothing=0.1).to(device)

    for epoch in range(initial_epoch, config['num_epochs']):
        torch.cuda.empty_cache()
        model.train()
        batch_iterator = tqdm(train_dataloader, desc=f"Processing Epoch {epoch:02d}")
        for batch in batch_iterator:

            encoder_input = batch['encoder_input'].to(device) # (b, seq_len)
            decoder_input = batch['decoder_input'].to(device) # (B, seq_len)
            encoder_mask = batch['encoder_mask'].to(device) # (B, 1, 1, seq_len)
            decoder_mask = batch['decoder_mask'].to(device) # (B, 1, seq_len, seq_len)

            # Run the tensors through the encoder, decoder and the projection layer
            encoder_output = model.encode(encoder_input, encoder_mask) # (B, seq_len, d_model)
            decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask) # (B, seq_len, d_model)
            proj_output = model.project(decoder_output) # (B, seq_len, vocab_size)

            # Compare the output with the label
            label = batch['label'].to(device) # (B, seq_len)

            # Compute the loss using a simple cross entropy
            loss = loss_fn(proj_output.view(-1, tokenizer_tgt.get_vocab_size()), label.view(-1))
            batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}"})

            # Log the loss
            writer.add_scalar('train loss', loss.item(), global_step)
            writer.flush()

            # Backpropagate the loss
            loss.backward()

            # Update the weights
            optimizer.step()
            optimizer.zero_grad(set_to_none=True)

            global_step += 1

        # Run validation at the end of every epoch
        run_validation(model, val_dataloader, tokenizer_src, tokenizer_tgt, config['seq_len'], device, lambda msg: batch_iterator.write(msg), global_step, writer)

        # Save the model at the end of every epoch
        model_filename = get_weights_file_path(config, f"{epoch:02d}")
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'global_step': global_step
        }, model_filename)



In [72]:
import warnings
# warnings.filterwarnings("ignore")
                                   
if __name__ == '__main__':
    config = get_config()
    dataset = raw_dataset
    train_model(config=config,dataset=dataset)

  device = "cuda" if torch.cuda.is_available() else "mps" if torch.has_mps or torch.backends.mps.is_available() else "cpu"


Using device: mps
Device name: <mps>


100%|██████████| 100000/100000 [00:00<00:00, 3873393.36it/s]

Tokenizer loaded from /Users/nityavg/Labs/ML4Interviews/data/en_src_bpe_tokenizer_01.json
Tokenizer loaded from /Users/nityavg/Labs/ML4Interviews/data/bn_tgt_bpe_tokenizer_01.json





Max length of source sentence: 2603
Max length of target sentence: 1813


  nn.init.xavier_uniform(p) # Initialize parameters using Xavier uniform initialization


No model to preload, starting from scratch


Processing Epoch 00:  32%|███▏      | 4121/12990 [1:13:36<2:38:24,  1.07s/it, loss=7.669]


ValueError: Sentence is longer than seq_len: 512