# A Gentle Introduction to Attention and Transformer Models

In [1]:
# Importing torch for tensor creation and manipulation
import torch

# Importing torch.nn for building neural network modules
import torch.nn as nn


# Defining a TransformerEncoderLayer class to model one layer of the Transformer encoder
class TransformerEncoderLayer(nn.Module):
    """One post-norm Transformer encoder layer.

    The layer chains two sublayers: multi-head self-attention and a
    position-wise feed-forward network. Each sublayer's output is added to
    its input (residual connection) and the sum is layer-normalized.

    Args:
        d_model: Embedding size of every sequence position (input and output).
        d_ff: Hidden size of the feed-forward network.
        num_heads: Number of attention heads; ``nn.MultiheadAttention``
            requires ``d_model`` to be divisible by it.
    """

    def __init__(self, d_model: int, d_ff: int, num_heads: int) -> None:
        super().__init__()

        # Attribute names and registration order determine the state_dict
        # keys, so keep them stable for anything that saves or loads weights.

        # Multi-head self-attention; batch_first=True means tensors are laid out as (batch, seq, d_model)
        self.attention = nn.MultiheadAttention(d_model, num_heads, batch_first=True)

        # Feed-forward block: expand d_model -> d_ff, then project d_ff -> d_model
        self.ff_proj = nn.Linear(d_model, d_ff)
        self.output_proj = nn.Linear(d_ff, d_model)

        # Layer normalization applied to (input + self-attention output)
        self.norm1 = nn.LayerNorm(d_model)

        # Layer normalization applied to (input + feed-forward output)
        self.norm2 = nn.LayerNorm(d_model)

        # Non-linearity used between the two feed-forward projections
        self.act = nn.ReLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Transform a batch of sequences.

        Args:
            x: Tensor of shape ``(batch, seq_len, d_model)``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Self-attention sublayer: every position attends to every position of the same sequence.
        # The averaged attention weights are never used here, so skip computing them.
        attn_out, _ = self.attention(x, x, x, need_weights=False)
        x = self.norm1(attn_out + x)

        # Feed-forward sublayer, applied to each position independently: Linear -> ReLU -> Linear.
        # No activation follows the second projection: a trailing ReLU would clamp the sublayer
        # output to be non-negative, so the residual update could never decrease a feature.
        ff_out = self.output_proj(self.act(self.ff_proj(x)))
        return self.norm2(ff_out + x)


# Random batch laid out as (batch=3, sequence length=7, features=16)
seq = torch.rand((3, 7, 16))

# Encoder layer with a 16-wide embedding, a 32-wide feed-forward block and 4 attention heads
layer = TransformerEncoderLayer(d_model=16, d_ff=32, num_heads=4)

# Run the batch through the layer
out_seq = layer(seq)

# Map every state_dict entry name to the shape of its tensor, then display the mapping
param_shapes = {key: tensor.shape for key, tensor in layer.state_dict().items()}
print(param_shapes)

# Display the shape of the layer's output
print(out_seq.size())

{'attention.in_proj_weight': torch.Size([48, 16]), 'attention.in_proj_bias': torch.Size([48]), 'attention.out_proj.weight': torch.Size([16, 16]), 'attention.out_proj.bias': torch.Size([16]), 'ff_proj.weight': torch.Size([32, 16]), 'ff_proj.bias': torch.Size([32]), 'output_proj.weight': torch.Size([16, 32]), 'output_proj.bias': torch.Size([16]), 'norm1.weight': torch.Size([16]), 'norm1.bias': torch.Size([16]), 'norm2.weight': torch.Size([16]), 'norm2.bias': torch.Size([16])}
torch.Size([3, 7, 16])


In [3]:
# Importing the PyTorch library for tensor operations and deep learning utilities
import torch
# Importing torch.nn, which contains modules and classes for building neural networks
import torch.nn as nn


# Defining the TransformerEncoderLayer2 class that inherits from PyTorch's nn.Module
# This models a single layer of the Transformer encoder block
class TransformerEncoderLayer2(nn.Module):
    """One pre-norm Transformer encoder layer.

    Each sublayer (multi-head self-attention, then a position-wise
    feed-forward network) receives a layer-normalized copy of its input, and
    the sublayer output is added back to the un-normalized input. The final
    output is therefore not normalized.

    Args:
        d_model: Embedding size of every sequence position (input and output).
        d_ff: Hidden size of the feed-forward network.
        num_heads: Number of attention heads; ``nn.MultiheadAttention``
            requires ``d_model`` to be divisible by it.
    """

    def __init__(self, d_model: int, d_ff: int, num_heads: int) -> None:
        super().__init__()

        # Attribute names and registration order determine the state_dict
        # keys, so keep them stable for anything that saves or loads weights.

        # Multi-head self-attention; batch_first=True means tensors are laid out as (batch, seq, d_model)
        self.attention = nn.MultiheadAttention(d_model, num_heads, batch_first=True)

        # Feed-forward block: expand d_model -> d_ff, then project d_ff -> d_model
        self.ff_proj = nn.Linear(d_model, d_ff)
        self.output_proj = nn.Linear(d_ff, d_model)

        # LayerNorm applied to the input of the self-attention sublayer
        self.norm1 = nn.LayerNorm(d_model)
        # LayerNorm applied to the input of the feed-forward sublayer
        self.norm2 = nn.LayerNorm(d_model)

        # Non-linearity used between the two feed-forward projections
        self.act = nn.ReLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Transform a batch of sequences.

        Args:
            x: Tensor of shape ``(batch, seq_len, d_model)``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Self-attention sublayer: normalize first, attend, then add the un-normalized input back.
        # The averaged attention weights are never used here, so skip computing them.
        normed = self.norm1(x)
        attn_out, _ = self.attention(normed, normed, normed, need_weights=False)
        x = attn_out + x

        # Feed-forward sublayer, applied to each position independently: Linear -> ReLU -> Linear.
        # No activation follows the second projection: a trailing ReLU would clamp the sublayer
        # output to be non-negative, so the residual stream could only ever grow.
        ff_out = self.output_proj(self.act(self.ff_proj(self.norm2(x))))
        return ff_out + x


# Random batch of sequences, laid out as (batch size, sequence length, embedding size)
batch_size, seq_len, embed_dim = 3, 7, 16
seq = torch.rand(batch_size, seq_len, embed_dim)

# Build the layer: d_model = 16 (input size), d_ff = 32 (feed-forward size), num_heads = 4 (attention heads)
layer = TransformerEncoderLayer2(d_model=16, d_ff=32, num_heads=4)

# Feed the batch through the layer to obtain the output sequence
out_seq = layer(seq)

# Show the shape of each state_dict entry (weights and biases), keyed by entry name
print({key: tensor.shape for key, tensor in layer.state_dict().items()})

# Show the shape of the tensor produced by the layer
print(out_seq.size())

{'attention.in_proj_weight': torch.Size([48, 16]), 'attention.in_proj_bias': torch.Size([48]), 'attention.out_proj.weight': torch.Size([16, 16]), 'attention.out_proj.bias': torch.Size([16]), 'ff_proj.weight': torch.Size([32, 16]), 'ff_proj.bias': torch.Size([32]), 'output_proj.weight': torch.Size([16, 32]), 'output_proj.bias': torch.Size([16]), 'norm1.weight': torch.Size([16]), 'norm1.bias': torch.Size([16]), 'norm2.weight': torch.Size([16]), 'norm2.bias': torch.Size([16])}
torch.Size([3, 7, 16])


In [4]:
# This code defines and demonstrates the usage of a TransformerDecoderLayer class.

import torch  # Import PyTorch for tensor and deep learning operations
import torch.nn as nn  # Import neural network functionality from PyTorch


# Define the TransformerDecoderLayer class, modeling a single layer of the Transformer decoder
class TransformerDecoderLayer(nn.Module):
    """One pre-norm Transformer decoder layer.

    The layer chains three sublayers: self-attention over the decoder
    sequence, cross-attention from the decoder sequence to the encoder
    output, and a position-wise feed-forward network. Each sublayer receives
    a layer-normalized copy of its input, and its output is added back to the
    un-normalized input.

    Args:
        d_model: Embedding size of every sequence position (decoder and encoder).
        d_ff: Hidden size of the feed-forward network.
        num_heads: Number of attention heads; ``nn.MultiheadAttention``
            requires ``d_model`` to be divisible by it.
    """

    def __init__(self, d_model: int, d_ff: int, num_heads: int) -> None:
        super().__init__()

        # Attribute names and registration order determine the state_dict
        # keys, so keep them stable for anything that saves or loads weights.

        # Self-attention over the decoder sequence; batch_first=True means (batch, seq, d_model) layout
        self.attention = nn.MultiheadAttention(d_model, num_heads, batch_first=True)
        # Cross-attention: queries come from the decoder, keys and values from the encoder output
        self.xattention = nn.MultiheadAttention(d_model, num_heads, batch_first=True)
        # Feed-forward block: expand d_model -> d_ff, then project d_ff -> d_model
        self.ff_proj = nn.Linear(d_model, d_ff)
        self.output_proj = nn.Linear(d_ff, d_model)
        # LayerNorm applied to the input of the self-attention sublayer
        self.norm1 = nn.LayerNorm(d_model)
        # LayerNorm applied to the input (query side) of the cross-attention sublayer
        self.norm2 = nn.LayerNorm(d_model)
        # LayerNorm applied to the input of the feed-forward sublayer
        self.norm3 = nn.LayerNorm(d_model)
        # Non-linearity used between the two feed-forward projections
        self.act = nn.ReLU()

    def forward(self, x: torch.Tensor, y: torch.Tensor, tgt_mask=None) -> torch.Tensor:
        """Transform a batch of decoder sequences using the encoder output as context.

        Args:
            x: Decoder sequence of shape ``(batch, tgt_len, d_model)``.
            y: Encoder output of shape ``(batch, src_len, d_model)``. It is used
                as-is for the cross-attention keys and values (not normalized here).
            tgt_mask: Optional mask forwarded to the self-attention as
                ``attn_mask``, e.g. a ``(tgt_len, tgt_len)`` boolean matrix in
                which ``True`` marks positions that may NOT be attended to.
                ``None`` (the default) lets every position attend to every
                other position.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Self-attention sublayer: normalize, attend within the decoder sequence, add the input back.
        # The averaged attention weights are never used here, so skip computing them.
        normed = self.norm1(x)
        attn_out, _ = self.attention(normed, normed, normed, attn_mask=tgt_mask, need_weights=False)
        x = attn_out + x

        # Cross-attention sublayer: normalized decoder states query the encoder output
        xattn_out, _ = self.xattention(self.norm2(x), y, y, need_weights=False)
        x = xattn_out + x

        # Feed-forward sublayer, applied to each position independently: Linear -> ReLU -> Linear.
        # No activation follows the second projection: a trailing ReLU would clamp the sublayer
        # output to be non-negative, so the residual stream could only ever grow.
        ff_out = self.output_proj(self.act(self.ff_proj(self.norm3(x))))
        return ff_out + x


# Random decoder-side batch: 3 sequences, 7 positions each, 16 features per position
dec_seq = torch.rand((3, 7, 16))

# Random stand-in for the encoder output: 3 sequences, 11 positions each, 16 features per position
enc_seq = torch.rand((3, 11, 16))

# Decoder layer with model dim 16, feed-forward dim 32 and 4 attention heads
layer = TransformerDecoderLayer(d_model=16, d_ff=32, num_heads=4)

# Run the decoder sequence through the layer, using the encoder sequence as context
out_seq = layer(dec_seq, enc_seq)

# Map every state_dict entry name to the shape of its tensor, then display the mapping
param_shapes = {key: tensor.shape for key, tensor in layer.state_dict().items()}
print(param_shapes)

# Display the shape of the layer's output
print(out_seq.size())

{'attention.in_proj_weight': torch.Size([48, 16]), 'attention.in_proj_bias': torch.Size([48]), 'attention.out_proj.weight': torch.Size([16, 16]), 'attention.out_proj.bias': torch.Size([16]), 'xattention.in_proj_weight': torch.Size([48, 16]), 'xattention.in_proj_bias': torch.Size([48]), 'xattention.out_proj.weight': torch.Size([16, 16]), 'xattention.out_proj.bias': torch.Size([16]), 'ff_proj.weight': torch.Size([32, 16]), 'ff_proj.bias': torch.Size([32]), 'output_proj.weight': torch.Size([16, 32]), 'output_proj.bias': torch.Size([16]), 'norm1.weight': torch.Size([16]), 'norm1.bias': torch.Size([16]), 'norm2.weight': torch.Size([16]), 'norm2.bias': torch.Size([16]), 'norm3.weight': torch.Size([16]), 'norm3.bias': torch.Size([16])}
torch.Size([3, 7, 16])
