In [1]:
!pip3 install torch torchvision torchaudio



# Import Libraries

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy

# 🔹 Understanding Self-Attention in Multi-Head Attention


## **Step 1: Tokenization and Embedding**  
Before attention is applied, the sentence is **tokenized**, each token is mapped to an **embedding** vector, and its **positional encoding** is added to that vector, so every word ends up represented as a single numerical vector.

- In the *"Attention Is All You Need"* paper, `d_model = 512`. Each word is represented as a **1D vector of 512 dimensions (512D)**.

### **Example Sentence:**
> **"Hi how are you"**

Each word is converted into a vector:

- **"Hi"** → `[0.1, 0.3, 0.7, ...]`  (Size: `d_model`)
- **"how"** → `[0.2, 0.6, 0.5, ...]`  (Size: `d_model`)
- **"are"** → `[0.4, 0.2, 0.8, ...]`  (Size: `d_model`)
- **"you"** → `[0.9, 0.1, 0.3, ...]`  (Size: `d_model`)

At this point, each word is just an **embedding vector of size `d_model`**.

> **Note:** Positional encoding is **added** to each embedding to retain word order information.

---

## **Step 2: Creating Query (Q), Key (K), and Value (V)**
For **each word**, we create three separate vectors:

- **Query (Q)** → Represents what the current word is looking for in the other words.
- **Key (K)** → Represents what each word offers; it is matched against the queries to decide how much attention that word receives.
- **Value (V)** → Contains the actual word information to be passed on.

These vectors are computed using linear transformations:

Q = W_q * X


K = W_k * X


V = W_v * X

where $W_q, W_k, W_v$ are **learnable weight matrices** and $X$ holds the input embeddings of the words/tokens in the sentence.

For example, for the word **"Hi"**, we get:
- Query vector **Q_hi** (size: `d_model` = 512D)
- Key vector **K_hi** (size: `d_model` = 512D)
- Value vector **V_hi** (size: `d_model` = 512D)

This same process applies to all other words.

---

## **Step 3: Compute Attention Scores in One Head**
Now, we compare how much each word should **attend to** every other word in the sentence. This is done by computing the **dot product** between the **query of one word** and the **keys of all words**.

Since **multi-head attention** is used, each head works with a smaller subspace:
- `d_k = d_model / num_heads`
- If `d_model = 512` and `num_heads = 8`, then `d_k = 512 / 8 = 64`.
- Each head gets **Q, K, V vectors of size 64D**.

| Word  | Query (Q)  | Key (K)  | Value (V)  |
|-------|-----------|----------|------------|
| Hi    | Q_hi (64D) | K_hi (64D) | V_hi (64D) |
| How   | Q_how (64D) | K_how (64D) | V_how (64D) |
| Are   | Q_are (64D) | K_are (64D) | V_are (64D) |
| You   | Q_you (64D) | K_you (64D) | V_you (64D) |

### **Step 3.1: Compute Raw Scores**
For word **Hi**, we compute the dot product of its query with the keys of all words:

Score1= Q_hi.K_hi

Score2= Q_hi.K_how

Score3= Q_hi.K_are

Score4= Q_hi.K_you

### **Step 3.2: Apply Softmax**
The scores are **scaled** to avoid large gradients by dividing by $\sqrt{d_k}$ and then passed through **softmax** to get probabilities:

$$
\text{Attention Weight} = \text{softmax} \left( \frac{Q \cdot K^T}{\sqrt{d_k}} \right)
$$

$$
\text{Attention Weight}_i = \text{softmax} \left(Score_i\right)
$$

Each word now has an **attention score** that tells how much focus it should give to the other words.

### **Step 3.3: Compute Final Weighted Sum**
Multiply each attention weight by the corresponding **Value (V) vector**:

Output1= Weight1*V_hi

Output2= Weight2*V_how

Output3= Weight3*V_are

Output4= Weight4*V_you

Final embedding for the word **Hi** is: Output1 + Output2 + Output3 + Output4 

Each output is **64D**, so we get **one 64D vector per word per attention head**.

> **Note:** We have calculated the final contextual embedding of the word **Hi**. The same steps are repeated for the other words (how, are, you): take the **dot product** between the **query of that word** and the **keys of all words**, apply softmax, and sum the weighted value vectors.

---

## **Step 4: Multi-Head Attention**
Since we have **8 heads**, each computes **separate self-attention** and gives an output of size **(batch, seq_length, d_k) = (1, 4, 64)**.

After processing all heads, their outputs are **concatenated** to restore the original `d_model = 512`: **(1, 4, 512)**

The final **multi-head attention output** has the same size as the input embeddings (`d_model = 512`).



In [3]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    The query, key and value inputs are each projected by a learnable linear
    layer, split into ``num_heads`` heads of size ``d_k = d_model // num_heads``,
    attended independently per head, concatenated back to ``d_model`` and passed
    through a final output projection.

    Args:
        d_model: Embedding size of every token (512 in "Attention Is All You Need").
        num_heads: Number of attention heads (8 in the paper); must divide ``d_model``.

    Raises:
        AssertionError: If ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        # Every head must receive an equal slice of the embedding space.
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        # Dimensions
        self.d_model = d_model            # embedding size of the model
        self.num_heads = num_heads        # number of attention heads
        self.d_k = d_model // num_heads   # per-head size of query/key/value vectors

        # Learnable projections (each maps d_model -> d_model)
        self.W_q = nn.Linear(d_model, d_model)  # query projection
        self.W_k = nn.Linear(d_model, d_model)  # key projection
        self.W_v = nn.Linear(d_model, d_model)  # value projection
        self.W_o = nn.Linear(d_model, d_model)  # output projection applied after the heads are merged

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Compute ``softmax(Q K^T / sqrt(d_k)) V``.

        Args:
            Q, K, V: Per-head tensors as produced by ``split_heads``; the last
                two dimensions are ``(seq_length, d_k)``.
            mask: Optional tensor broadcastable to the score tensor. Positions
                where ``mask == 0`` are excluded from attention (used for
                padding masks and for the decoder's causal mask).

        Returns:
            The attention-weighted sum of ``V``, with the same leading
            dimensions as ``Q``.
        """
        # Similarity of every query with every key, scaled by sqrt(d_k).
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)

        if mask is not None:
            # Masked positions get a very negative score so that their softmax
            # weight is (numerically) zero. -1e9 is outside the float16 range,
            # so the value is clamped to the most negative finite number of the
            # score dtype; float32/float64/bfloat16 still use exactly -1e9.
            fill_value = max(-1e9, torch.finfo(attn_scores.dtype).min)
            attn_scores = attn_scores.masked_fill(mask == 0, fill_value)

        # Normalise the scores over the key dimension to get attention weights.
        attn_probs = torch.softmax(attn_scores, dim=-1)

        # Weighted sum of the value vectors.
        output = torch.matmul(attn_probs, V)
        return output

    def split_heads(self, x):
        """Reshape ``(batch, seq_length, d_model)`` to ``(batch, num_heads, seq_length, d_k)``.

        This lets all heads be processed in parallel by a single batched matmul.
        """
        batch_size, seq_length, _ = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)

    def combine_heads(self, x):
        """Inverse of ``split_heads``: merge the heads back into ``(batch, seq_length, d_model)``."""
        batch_size, _, seq_length, _ = x.size()
        # transpose() returns a non-contiguous view, so contiguous() is needed before view().
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)

    def forward(self, Q, K, V, mask=None):
        """Run multi-head attention.

        Args:
            Q: Query input of shape ``(batch, query_length, d_model)``.
            K: Key input of shape ``(batch, key_length, d_model)``.
            V: Value input of shape ``(batch, key_length, d_model)``.
            mask: Optional mask forwarded to ``scaled_dot_product_attention``.

        Returns:
            Tensor of shape ``(batch, query_length, d_model)``.
        """
        # Project the inputs and split them into heads.
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))

        # Attention is computed independently for every head.
        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)

        # Concatenate the heads and apply the output projection.
        output = self.W_o(self.combine_heads(attn_output))
        return output

# 🔹Understanding FeedForward In Transformer

The Position-Wise Feed Forward Network (FFN) is a key component of the Transformer architecture.
It is applied independently to each token in the sequence after multi-head attention.

- Multi-head attention captures **relationships between words**, but it does **not change individual word representations much**.  
- The **FFN introduces non-linearity and richer transformations** to enhance each token’s representation.  
- It consists of **two linear transformations** with a **ReLU activation** in between.


In [4]:
class PositionWiseFeedForward(nn.Module):
    """Two-layer feed-forward network applied to the last dimension of its input.

    The input is expanded from ``d_model`` to ``d_ff`` features, passed through
    ReLU and projected back to ``d_model`` (e.g. 512 -> 2048 -> 512 in the
    original Transformer).

    Args:
        d_model: Size of the input and output features.
        d_ff: Size of the hidden layer (usually larger than ``d_model``).
    """

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)   # expansion: d_model -> d_ff
        self.fc2 = nn.Linear(d_ff, d_model)   # projection: d_ff -> d_model
        self.relu = nn.ReLU()                 # non-linearity between the two layers

    def forward(self, x):
        """Return ``fc2(relu(fc1(x)))``; the output has the same shape as ``x``."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)

# 🔹 Positional Encoding

##  How Positional Encoding Works?
Since self-attention has no built-in notion of position (shuffling the words would simply shuffle its outputs in the same way), it doesn't understand word order on its own. Positional encoding assigns each position a unique vector, ensuring the model understands **word order**.

### **Step 1: Convert Words to Word Embeddings**
Before adding positional encoding, each word gets converted into a **512-dimensional vector** using an embedding layer.

Let's assume our embedding model has an **embedding size (`d_model`) of 512**.

| Token  | Word Embedding (Simplified: 3D instead of 512D) |
|--------|--------------------------------|
| **Hi**  | `[0.3, 0.5, -0.2]`  |
| **How** | `[0.7, -0.1, 0.9]`  |
| **Are** | `[-0.5, 0.3, 0.6]`  |
| **You** | `[0.1, -0.4, 0.8]`  |

---

### **Step 2: Generate Unique Positional Encoding**
Each **position** (0, 1, 2, 3) is assigned a **unique vector** using a combination of **sine and cosine functions** at different frequencies.

#### **Formula:**
Each position `p` (word index) is assigned a **512-dimensional vector** using:

$$
PE(p, 2i) = \sin\left(\frac{p}{10000^{\frac{2i}{d_{\text{model}}}}}\right)
$$

$$
PE(p, 2i+1) = \cos\left(\frac{p}{10000^{\frac{2i}{d_{\text{model}}}}}\right)
$$


where:
- **`p`** = Position index (0 for "Hi", 1 for "How", etc.)
- **`i`** = Index of the sine/cosine pair, `i = 0, 1, …, d_model/2 − 1` (even dimension `2i` uses `sin`, odd dimension `2i+1` uses `cos`)
- **`d_model`** = Embedding size (e.g., 512)
- **10000** = A constant to control frequency scaling

For **simplicity**, let's assume `d_model = 6` instead of 512:

| Position `p` | PE(0) (sin) | PE(1) (cos) | PE(2) (sin) | PE(3) (cos) | PE(4) (sin) | PE(5) (cos) |
|-------------|------------|------------|------------|------------|------------|------------|
| **0** (Hi)  | `0.0000`  | `1.0000`  | `0.0000`  | `1.0000`  | `0.0000`  | `1.0000`  |
| **1** (How) | `0.8415`  | `0.5403`  | `0.0464`  | `0.9989`  | `0.0022`  | `1.0000`  |
| **2** (Are) | `0.9093`  | `-0.4161` | `0.0927`  | `0.9957`  | `0.0043`  | `1.0000`  |
| **3** (You) | `0.1411`  | `-0.9900` | `0.1388`  | `0.9903`  | `0.0065`  | `1.0000`  |

Each position receives **a unique vector**, ensuring that different words have different encodings.

---

### **Step 3: Add Positional Encoding to Word Embeddings**
Each word’s embedding is **element-wise added** to its corresponding positional encoding.

| Token  | Word Embedding | Positional Encoding (first 3 dims) | **Final Embedding (Word + PE)** |
|--------|-----------------|-----------------|------------------|
| **Hi**  | `[0.3, 0.5, -0.2]`  | `[0.00, 1.00, 0.00]`  | `[0.3, 1.5, -0.2]` |
| **How** | `[0.7, -0.1, 0.9]`  | `[0.84, 0.54, 0.05]`  | `[1.54, 0.44, 0.95]` |
| **Are** | `[-0.5, 0.3, 0.6]`  | `[0.91, -0.42, 0.09]`  | `[0.41, -0.12, 0.69]` |
| **You** | `[0.1, -0.4, 0.8]`  | `[0.14, -0.99, 0.14]`  | `[0.24, -1.39, 0.94]` |

---


In [4]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to a batch of embeddings.

    For position ``p`` and pair index ``i`` the encoding is
    ``PE(p, 2i) = sin(p / 10000^(2i / d_model))`` and
    ``PE(p, 2i+1) = cos(p / 10000^(2i / d_model))``.

    Args:
        d_model: Size of every embedding vector. Both even and odd sizes are
            supported; for an odd size the last dimension is a sine term.
        max_seq_length: Number of positions for which encodings are precomputed
            (the longest sequence the model will process).
    """

    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()

        # Table of encodings, one row per position: (max_seq_length, d_model).
        pe = torch.zeros(max_seq_length, d_model)
        # Column vector of positions 0 .. max_seq_length - 1: (max_seq_length, 1).
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        # 1 / 10000^(2i / d_model) for every even dimension 2i, computed in log space.
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))

        # Angle p / 10000^(2i / d_model) for every (position, pair) combination.
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so the angle table is trimmed to the number of cosine slots.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # A buffer is saved with the module and moved by .to()/.cuda(), but it
        # is not a trainable parameter. Leading dimension of 1 broadcasts over the batch.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        Args:
            x: Tensor whose dimension 1 is the sequence dimension and whose last
                dimension is ``d_model``.
        """
        return x + self.pe[:, :x.size(1)]

# 🔹 Encoder Block

<p align="center">
  <img src="https://miro.medium.com/v2/resize:fit:640/format:webp/1*7sjcgd_nyODdLbZSxyxz_g.png" width="300"/>
</p>



In [5]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention and a feed-forward network.

    Each sub-layer is wrapped as ``LayerNorm(x + Dropout(sublayer(x)))``.

    Args:
        d_model: Size of the input and output vectors (the embedding size).
        num_heads: Number of heads in the multi-head self-attention.
        d_ff: Hidden size of the position-wise feed-forward network.
        dropout: Dropout rate applied to each sub-layer output.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        # Sub-layers
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        # One LayerNorm per residual connection
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        # The same dropout module is reused for both sub-layers
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Apply the block to ``x``; ``mask`` is handed to the self-attention.

        ``x`` serves as query, key and value input at the same time. A
        step-by-step trace of this method is shown in "Dummy Encoder Run".
        """
        # Sub-layer 1: self-attention + residual + norm
        attended = self.self_attn(x, x, x, mask)
        after_attention = self.norm1(x + self.dropout(attended))

        # Sub-layer 2: feed-forward + residual + norm
        transformed = self.feed_forward(after_attention)
        return self.norm2(after_attention + self.dropout(transformed))

### Dummy Encoder Run

In [6]:
# 🚀 Step 1: Create Encoder Layer
encoder_layer = EncoderLayer(d_model=512, num_heads=8, d_ff=2048, dropout=0.1)
print("Self Attention Module:\n", encoder_layer.self_attn)

# 🚀 Step 2: Create dummy input data
# x stands in for the embedded input sequence. The same tensor is later passed as
# the query, key and value input; the actual Q, K, V are computed inside
# MultiHeadAttention.forward by projecting it with W_q, W_k and W_v.
batch_size = 2
seq_length = 10
d_model = 512

x = torch.randn(batch_size, seq_length, d_model)  # Random input tensor
mask = torch.ones(batch_size, 1, 1, seq_length)  # Example mask (all ones means nothing is masked)
print("Random Tenser X:", x)
print("Mask:", mask)

# 🚀 Step 3: Call the self-attention module exactly as EncoderLayer.forward does
# Calling the module instance runs MultiHeadAttention.forward (via nn.Module.__call__).
attn_output = encoder_layer.self_attn(x, x, x, mask)
print("Attention Output Shape:", attn_output.shape)  # Expected: (batch_size, seq_length, d_model)

# 🚀 Step 4: Apply Dropout on attn_output
dropout_output = encoder_layer.dropout(attn_output)

# 🚀 Step 5: Residual Connection (x + dropout_output)
residual_output = x + dropout_output

# 🚀 Step 6: Apply Layer Normalization -> self.norm1(x + self.dropout(attn_output))
normalized_output = encoder_layer.norm1(residual_output)
print("Final Output Shape After Norm1:", normalized_output.shape)  # Expected: (batch_size, seq_length, d_model)

# 🚀 Step 7: Take the layer's own Position-Wise Feed-Forward Network
# (the same sub-module EncoderLayer.forward uses, not a freshly initialised copy)
PositionWise_FFN = encoder_layer.feed_forward
print("\nFeed Forward Network:\n", PositionWise_FFN)

# 🚀 Step 8: Forward pass through the Feed-Forward Network
# Call the module instead of .forward() so that registered hooks are run as well.
ffn_output = PositionWise_FFN(normalized_output)
print("Feed Forward Output Shape:", ffn_output.shape)  # Expected: (batch_size, seq_length, d_model)

# 🚀 Step 9: Residual Connection After FFN
residual_output = normalized_output + encoder_layer.dropout(ffn_output)

# 🚀 Step 10: Final Layer Normalization -> self.norm2(x + self.dropout(ff_output))
final_output = encoder_layer.norm2(residual_output)
print("Final Output Shape After Norm2:", final_output.shape)  # Expected: (batch_size, seq_length, d_model)

Self Attention Module:
 MultiHeadAttention(
  (W_q): Linear(in_features=512, out_features=512, bias=True)
  (W_k): Linear(in_features=512, out_features=512, bias=True)
  (W_v): Linear(in_features=512, out_features=512, bias=True)
  (W_o): Linear(in_features=512, out_features=512, bias=True)
)
Random Tenser X: tensor([[[ 1.3470,  0.8165,  0.8658,  ...,  0.4236,  0.4099, -0.2743],
         [ 1.2221, -0.4489, -1.1829,  ...,  0.0461,  0.9481, -0.2679],
         [ 0.4355,  0.2548,  1.1793,  ..., -1.2696, -1.1451, -1.1789],
         ...,
         [ 0.8899, -1.7102, -1.1989,  ..., -0.6538,  1.1413,  0.5932],
         [-2.0163, -0.8217, -0.2244,  ..., -0.6873,  0.3968, -0.9653],
         [ 0.7070, -1.2205, -1.8267,  ...,  1.0535, -0.1410,  0.6317]],

        [[-0.2732,  1.2016, -0.1432,  ...,  0.0168,  0.2845,  1.0413],
         [-0.1246,  0.6747, -1.3493,  ..., -0.9134, -0.6291, -0.7386],
         [-0.5668,  0.0650, -0.0444,  ..., -0.1624,  1.0665, -0.3356],
         ...,
         [ 0.4902,  

# 🔹 Decoder Block

<p align="center">
  <img src="https://miro.medium.com/v2/resize:fit:640/format:webp/1*vYgZyhNOoPKdeSEnN1i9Kg.png">
</p>



# 🔹 Understanding Masked Self-Attention in Multi-Head Attention


## 1. Translation and Tokens

- **English:** “Hi how are you”  [this we pass from encoder block]
- **Hindi:** “हाय कैसे हो तुम”  [this is passed to the decoder block all at once during training, so training is non-autoregressive]

Token sequence (4 tokens):
```text
["हाय", "कैसे", "हो", "तुम"]
```

---

## 2. Q, K, V Matrices

We stack each token’s Q/K/V vectors into 4×4 matrices (rows = tokens, cols = d_model = 4): [in attention paper we have d_model 512]

Q = 
\begin{bmatrix}
0.1 & 0.2 & 0.3 & 0.4 \\
0.2 & 0.1 & 0.4 & 0.3 \\
0.3 & 0.4 & 0.2 & 0.1 \\
0.4 & 0.3 & 0.1 & 0.2
\end{bmatrix},

K = 
\begin{bmatrix}
0.4 & 0.3 & 0.2 & 0.1 \\
0.5 & 0.3 & 0.6 & 0.1 \\
0.6 & 0.4 & 0.5 & 0.2 \\
0.1 & 0.2 & 0.3 & 0.5
\end{bmatrix}
V = 
\begin{bmatrix}
0.1 & 0.5 & 0.2 & 0.4 \\
0.3 & 0.7 & 0.4 & 0.1 \\
0.2 & 0.3 & 0.5 & 0.3 \\
0.6 & 0.4 & 0.3 & 0.2
\end{bmatrix}

1st row of Q,K,V → हाय  
2nd row of Q,K,V → कैसे  
3rd row of Q,K,V → हो  
4th row of Q,K,V → तुम  




## 3. Raw Attention Scores  
Compute  

$$
S = \left( \frac{Q \cdot K^T}{\sqrt{d_k}} \right)
$$

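so that each row *i* contains the dot-products of token *i*’s Q with every token’s K. (To keep the arithmetic easy to follow, the numbers below are the plain dot products $Q \cdot K^T$; the division by $\sqrt{d_k}$ is left out of this toy example.)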


S = 
\begin{bmatrix}
0.20 & 0.33 & 0.37 & 0.34 \\   
0.22 & 0.40 & 0.42 & 0.31 \\   
0.29 & 0.40 & 0.46 & 0.22 \\  
0.29 & 0.37 & 0.45 & 0.23     
\end{bmatrix}

1st row has scores for → हाय  
2nd row has scores for → कैसे  
3rd row has scores for → हो  
4th row has scores for → तुम 

## 4. Causal Mask  
Enforce autoregressive order by masking out future positions (setting them to –∞). For example, at हाय the later words are not known yet, so their scores are replaced with –∞, whose softmax is 0; the same applies to every other position:


Mask =
\begin{bmatrix}
0      & -\infty & -\infty & -\infty \\  % हाय (i=0)
0      & 0       & -\infty & -\infty \\  % कैसे (i=1)
0      & 0       & 0       & -\infty \\  % हो   (i=2)
0      & 0       & 0       & 0        % तुम  (i=3)
\end{bmatrix}


Add to *S* to get masked scores *S′*:


S' = S + Mask =
\begin{bmatrix}
0.20 & -\infty & -\infty & -\infty \\
0.22 & 0.40    & -\infty & -\infty \\
0.29 & 0.40    & 0.46    & -\infty \\
0.29 & 0.37    & 0.45    & 0.23
\end{bmatrix}

## 5. Softmax → Attention Weights  
Apply softmax **row‑wise** (ignore –∞ entries, which become zero):


W =
\begin{bmatrix}
1.000 & 0     & 0     & 0     \\[6pt]
0.455 & 0.545 & 0     & 0     \\[6pt]
0.303 & 0.338 & 0.359 & 0     \\[6pt]
0.238 & 0.258 & 0.280 & 0.224
\end{bmatrix}


- **Row “हाय”**: attends only to itself → `[1, 0, 0, 0]`  
- **Row “कैसे”**: $\text{softmax}(0.22, 0.40) \approx (0.455, 0.545)$  
- **Row “हो”**: $\text{softmax}(0.29, 0.40, 0.46) \approx (0.303, 0.338, 0.359)$  
- **Row “तुम”**: $\text{softmax}(0.29, 0.37, 0.45, 0.23) \approx (0.238, 0.258, 0.280, 0.224)$



## 6. Final Output  
Compute the contextualized vectors by multiplying the attention weights by the Value vectors:


O = W \times V
=
\begin{bmatrix}
1\cdot V_{\text{हाय}} \\[4pt]
0.455\,V_{\text{हाय}} + 0.545\,V_{\text{कैसे}} \\[4pt]
0.303\,V_{\text{हाय}} + 0.338\,V_{\text{कैसे}} + 0.359\,V_{\text{हो}} \\[4pt]
0.238\,V_{\text{हाय}} + 0.258\,V_{\text{कैसे}} + 0.280\,V_{\text{हो}} + 0.224\,V_{\text{तुम}}
\end{bmatrix}
=
\begin{bmatrix}
0.10   & 0.50   & 0.20   & 0.40   \\[4pt]
0.209  & 0.609  & 0.309  & 0.237  \\[4pt]
0.2035 & 0.4958 & 0.3753 & 0.2627 \\[4pt]
0.2916 & 0.4732 & 0.3580 & 0.2498
\end{bmatrix}


- **Output “हाय”** = `[0.10, 0.50, 0.20, 0.40]`  
- **Output “कैसे”** ≈ `[0.209, 0.609, 0.309, 0.237]`  
- **Output “हो”**   ≈ `[0.2035, 0.4958, 0.3753, 0.2627]`  
- **Output “तुम”**  ≈ `[0.2916, 0.4732, 0.3580, 0.2498]`



# 🔹 Understanding Cross‑Attention in Encoder–Decoder Attention

Below is a step‑by‑step worked example of **cross‑attention** between an English source (“Hi how are you”) and its Hindi translation (“हाय कैसे हो तुम”), using toy matrices with model dimension d_model = 4 (the Attention paper uses 512‑dimensional vectors).


## 1. Source & Target Tokens

- **Encoder (English):**  
  “Hi how are you”  
  → Tokens:  
  ```text
  ["Hi", "how", "are", "you"]
  ```

- **Decoder (Hindi):**  
  “हाय कैसे हो तुम”  
  → Tokens (fed in all at once during training):  
  ```text
  ["हाय", "कैसे", "हो", "तुम"]
  ```


## 2. Q, K, V Matrices

- **Queries** come from the **decoder** hidden states (one per Hindi token).  
  Q (from decoder) =
  \begin{bmatrix}
    0.1 & 0.2 & 0.3 & 0.4 \\ 
    0.2 & 0.1 & 0.4 & 0.3 \\ 
    0.3 & 0.4 & 0.2 & 0.1 \\ 
    0.4 & 0.3 & 0.1 & 0.2
  \end{bmatrix}
  – Row 1 → हाय  
  – Row 2 → कैसे  
  – Row 3 → हो  
  – Row 4 → तुम  

- **Keys** and **Values** come from the **encoder** outputs (one per English token).  
  K (from encoder) =
  \begin{bmatrix}
    0.4 & 0.3 & 0.2 & 0.1 \\ 
    0.5 & 0.3 & 0.6 & 0.1 \\ 
    0.6 & 0.4 & 0.5 & 0.2 \\ 
    0.1 & 0.2 & 0.3 & 0.5
  \end{bmatrix},
  V (from encoder) =
  \begin{bmatrix}
    0.1 & 0.5 & 0.2 & 0.4 \\ 
    0.3 & 0.7 & 0.4 & 0.1 \\ 
    0.2 & 0.3 & 0.5 & 0.3 \\ 
    0.6 & 0.4 & 0.3 & 0.2
  \end{bmatrix}
  – Row 1 → “Hi”  
  – Row 2 → “how”  
  – Row 3 → “are”  
  – Row 4 → “you”  


## 3. Raw Cross‑Attention Scores

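Compute the scores (to keep the arithmetic simple, the toy numbers below are the plain dot products $Q \cdot K^T$; the division by $\sqrt{d_k}$ is omitted):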
$$
S = \left( \frac{Q \cdot K^T}{\sqrt{d_k}} \right)
$$

S =
\begin{bmatrix}
0.20 & 0.33 & 0.37 & 0.34 \\  
0.22 & 0.40 & 0.42 & 0.31 \\  
0.29 & 0.40 & 0.46 & 0.22 \\  
0.29 & 0.37 & 0.45 & 0.23
\end{bmatrix}

- Row 1 (“हाय”) scores: `[0.20, 0.33, 0.37, 0.34]`  
- Row 2 (“कैसे”) scores: `[0.22, 0.40, 0.42, 0.31]`  
- Row 3 (“हो”)   scores: `[0.29, 0.40, 0.46, 0.22]`  
- Row 4 (“तुम”)  scores: `[0.29, 0.37, 0.45, 0.23]`  


## 4. Softmax → Attention Weights

$$
\text{Attention Weight}(W) = \text{softmax} \left( \frac{Q \cdot K^T}{\sqrt{d_k}} \right)
$$

$$
\text{Attention Weight}_i(W) = \text{softmax} \left(Score_i\right)
$$

\begin{bmatrix}
0.223 & 0.254 & 0.265 & 0.259 \\[4pt]
0.222 & 0.265 & 0.271 & 0.242 \\[4pt]
0.236 & 0.264 & 0.279 & 0.221 \\[4pt]
0.238 & 0.258 & 0.280 & 0.224
\end{bmatrix}

- **Row “हाय”**: attends most to “are” (0.265) and “you” (0.259)  
- **Row “तुम”**: attends most to “are” (0.280)  


## 5. Contextualized Outputs (with explicit weighted sums)

W =
\begin{bmatrix}
0.223 & 0.254 & 0.265 & 0.259 \\[4pt]
0.222 & 0.265 & 0.271 & 0.242 \\[4pt]
0.236 & 0.264 & 0.279 & 0.221 \\[4pt]
0.238 & 0.258 & 0.280 & 0.224
\end{bmatrix},
V (from encoder)=
\begin{bmatrix}
V_{\text{Hi}}  = [0.1,\,0.5,\,0.2,\,0.4] \\[3pt]
V_{\text{how}} = [0.3,\,0.7,\,0.4,\,0.1] \\[3pt]
V_{\text{are}} = [0.2,\,0.3,\,0.5,\,0.3] \\[3pt]
V_{\text{you}} = [0.6,\,0.4,\,0.3,\,0.2]
\end{bmatrix}

Each output row is a weighted sum of the encoder values:

1. **“हाय”**  
   \begin{aligned}
   O_{\text{हाय}}
   &= 0.223\,V_{\text{Hi}}
     + 0.254\,V_{\text{how}}
     + 0.265\,V_{\text{are}}
     + 0.259\,V_{\text{you}} \\[4pt]
   &\approx [0.307,\,0.472,\,0.356,\,0.246]
   \end{aligned}

2. **“कैसे”**  
   \begin{aligned}
   O_{\text{कैसे}}
   &= 0.222\,V_{\text{Hi}}
     + 0.265\,V_{\text{how}}
     + 0.271\,V_{\text{are}}
     + 0.242\,V_{\text{you}} \\[4pt]
   &\approx [0.301,\,0.475,\,0.359,\,0.245]
   \end{aligned}

3. **“हो”**  
   \begin{aligned}
   O_{\text{हो}}
   &= 0.236\,V_{\text{Hi}}
     + 0.264\,V_{\text{how}}
     + 0.279\,V_{\text{are}}
     + 0.221\,V_{\text{you}} \\[4pt]
   &\approx [0.291,\,0.475,\,0.359,\,0.249]
   \end{aligned}

4. **“तुम”**  
   \begin{aligned}
   O_{\text{तुम}}
   &= 0.238\,V_{\text{Hi}}
     + 0.258\,V_{\text{how}}
     + 0.280\,V_{\text{are}}
     + 0.224\,V_{\text{you}} \\[4pt]
   &\approx [0.292,\,0.473,\,0.358,\,0.250]
   \end{aligned}

O= 
\begin{bmatrix}
0.307 & 0.472 & 0.356 & 0.246 \\[4pt]
0.301 & 0.475 & 0.359 & 0.245 \\[4pt]
0.291 & 0.475 & 0.359 & 0.249 \\[4pt]
0.292 & 0.473 & 0.358 & 0.250
\end{bmatrix}

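- Row 1 (“हाय”): `[0.307, 0.472, 0.356, 0.246]`  
- Row 2 (“कैसे”): `[0.301, 0.475, 0.359, 0.245]`  
- Row 3 (“हो”): `[0.291, 0.475, 0.359, 0.249]`  
- Row 4 (“तुम”): `[0.292, 0.473, 0.358, 0.250]`  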

In [6]:
class DecoderLayer(nn.Module):
    """One decoder block: masked self-attention, cross-attention and feed-forward.

    Each of the three sub-layers is wrapped as
    ``LayerNorm(x + Dropout(sublayer(x)))``.

    Args:
        d_model: Size of the input and output vectors (the embedding size).
        num_heads: Number of heads in both attention modules.
        d_ff: Hidden size of the position-wise feed-forward network.
        dropout: Dropout rate applied to each sub-layer output.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        # Attention over the decoder's own sequence
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        # Attention from the decoder sequence to the encoder output
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        # One LayerNorm per residual connection
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        # The same dropout module is reused for all three sub-layers
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        """Apply the block to the decoder input ``x``.

        Args:
            x: Decoder-side input; used as query, key and value of the self-attention.
            enc_output: Encoder output; used as key and value of the cross-attention.
            src_mask: Mask handed to the cross-attention.
            tgt_mask: Mask handed to the self-attention (this is what makes it
                "masked" multi-head attention).
        """
        # Sub-layer 1: masked self-attention
        self_attended = self.self_attn(x, x, x, tgt_mask)
        hidden = self.norm1(x + self.dropout(self_attended))

        # Sub-layer 2: cross-attention (queries from the decoder, keys/values from the encoder)
        cross_attended = self.cross_attn(hidden, enc_output, enc_output, src_mask)
        hidden = self.norm2(hidden + self.dropout(cross_attended))

        # Sub-layer 3: position-wise feed-forward network
        transformed = self.feed_forward(hidden)
        return self.norm3(hidden + self.dropout(transformed))

In [7]:
# import torch

# batch_size = 2
# seq_len = 10
# d_model = 512

# x = torch.randn(batch_size, seq_len, d_model)  # Random dummy input which will be actually Q,k,V

# obj = MultiHeadAttention(d_model=512, num_heads=8)
# output = obj.forward(x, x, x, mask=None)

# print(output.shape)  # Should be (2, 10, 512)


### Dummy Decoder Run

In [8]:
# 🚀 Step 1: Build a single decoder layer and show its two attention modules
decoder_layer = DecoderLayer(d_model=512, num_heads=8, d_ff=2048, dropout=0.1)
print("Self Attention Module:\n", decoder_layer.self_attn)
print("Cross Attention Module:\n", decoder_layer.cross_attn)

# 🚀 Step 2: Dummy inputs
batch_size, seq_length, d_model = 2, 10, 512

# Decoder-side input: passed as the query, key and value input of the self-attention
x = torch.randn((batch_size, seq_length, d_model))

# Stand-in for the encoder output (keys/values of the cross-attention)
enc_output = torch.randn((batch_size, seq_length, d_model))

# Source mask: all ones, i.e. no source position is hidden.
# With padding it would contain zeros, e.g. for "I am learning transformers . [PAD] ..."
# the padded positions would be 0.
src_mask = torch.ones((batch_size, 1, 1, seq_length))
# Target (causal) mask: lower-triangular, so position i only sees positions <= i;
# the two leading axes of size 1 broadcast over batch and heads.
tgt_mask = torch.ones(seq_length, seq_length).tril()[None, None]

print("x:", x)
print("enc_output:", enc_output)
print("src_mask:", src_mask)
print("tgt_mask:", tgt_mask)

# 🚀 Step 3: Masked self-attention on the decoder input
attn_output = decoder_layer.self_attn(x, x, x, tgt_mask)
print("Self Attention Output Shape:", attn_output.shape)

# 🚀 Step 4: Dropout on the self-attention output
dropout_output = decoder_layer.dropout(attn_output)

# 🚀 Step 5: First residual connection
residual_output = x + dropout_output

# 🚀 Step 6: LayerNorm after self-attention
normalized_output = decoder_layer.norm1(residual_output)
print("Output After Norm1:", normalized_output.shape)

# 🚀 Step 7: Cross-attention, queries from the decoder and keys/values from the encoder
attn_output = decoder_layer.cross_attn(normalized_output, enc_output, enc_output, src_mask)
print("Cross Attention Output Shape:", attn_output.shape)

# 🚀 Step 8: Second residual connection (with dropout)
residual_output = normalized_output + decoder_layer.dropout(attn_output)

# 🚀 Step 9: LayerNorm after cross-attention
normalized_output = decoder_layer.norm2(residual_output)
print("Output After Norm2:", normalized_output.shape)

# 🚀 Step 10: Position-wise feed-forward network
ffn_output = decoder_layer.feed_forward(normalized_output)
print("Feed Forward Output Shape:", ffn_output.shape)

# 🚀 Step 11: Third residual connection (with dropout)
residual_output = normalized_output + decoder_layer.dropout(ffn_output)

# 🚀 Step 12: Final LayerNorm
final_output = decoder_layer.norm3(residual_output)
print("Final Output Shape After Norm3:", final_output.shape)


Self Attention Module:
 MultiHeadAttention(
  (W_q): Linear(in_features=512, out_features=512, bias=True)
  (W_k): Linear(in_features=512, out_features=512, bias=True)
  (W_v): Linear(in_features=512, out_features=512, bias=True)
  (W_o): Linear(in_features=512, out_features=512, bias=True)
)
Cross Attention Module:
 MultiHeadAttention(
  (W_q): Linear(in_features=512, out_features=512, bias=True)
  (W_k): Linear(in_features=512, out_features=512, bias=True)
  (W_v): Linear(in_features=512, out_features=512, bias=True)
  (W_o): Linear(in_features=512, out_features=512, bias=True)
)
x: tensor([[[-0.7171,  0.3262, -1.0513,  ..., -0.0124,  0.4913, -0.0346],
         [ 1.5408,  1.4473,  0.8066,  ...,  0.9618,  1.1064, -1.0759],
         [ 0.6266,  1.2028, -1.7829,  ...,  0.6074,  0.4523, -0.3445],
         ...,
         [ 1.0011,  1.0257, -0.9379,  ...,  0.5158,  1.7313,  0.4191],
         [ 1.1177,  0.0554, -0.8258,  ...,  0.1114, -1.8023,  0.7312],
         [-0.9025,  0.2066,  1.6075,  .

# 🔹 Combining the Encoder and Decoder layers to create the complete Transformer network

<p align="center">
  <img src="https://cdn.prod.website-files.com/62c4a9809a85693c49c4674f/6580badb5a5031ccde99abac_transformer-model.png" width="600"/>
</p>



In [10]:
class Transformer(nn.Module):
    """Encoder-decoder Transformer assembled from the notebook's building blocks.

    Token ids are embedded, passed through ``PositionalEncoding`` and dropout,
    run through a stack of encoder layers and a stack of decoder layers, and
    finally projected to one logit per target-vocabulary entry.

    Args:
        src_vocab_size: Number of entries in the source embedding table.
        tgt_vocab_size: Number of entries in the target embedding table; also
            the size of the output projection.
        d_model: Embedding / hidden dimension used throughout the model.
        num_heads: Passed to every ``EncoderLayer`` and ``DecoderLayer``.
        num_layers: Number of encoder layers and (separately) decoder layers.
        d_ff: Passed to every ``EncoderLayer`` and ``DecoderLayer``.
        max_seq_length: Passed to ``PositionalEncoding``.
        dropout: Dropout probability for the embedding dropout; also passed
            to every encoder/decoder layer.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super(Transformer, self).__init__()
        # Embeddings + positional encoding. Source and target have their own
        # embedding tables but share a single PositionalEncoding module.
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        # Stacks of identical layers, as in the paper. ModuleList registers
        # every layer's parameters with this module.
        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        # Final projection from d_model to target-vocabulary logits.
        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        """Build the boolean source padding mask and target padding + no-peek mask.

        Token id 0 is treated as padding. ``src`` and ``tgt`` are indexed as
        ``(batch, seq_len)`` tensors of token ids.

        Returns:
            (src_mask, tgt_mask) where
            - src_mask has shape (batch, 1, 1, src_len) and is True at
              non-padding source positions;
            - tgt_mask has shape (batch, 1, tgt_len, tgt_len) and is True at
              [i, j] when target position i is not padding and j <= i.
        """
        # src_mask: hides padding tokens in the input.
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        # tgt_mask: starts as a per-row padding mask of shape (batch, 1, tgt_len, 1).
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)
        seq_length = tgt.size(1)
        # Lower-triangular (incl. diagonal) mask that blocks future positions.
        # It is allocated on tgt's device so the `&` below also works when the
        # inputs live on an accelerator instead of the CPU.
        nopeak_mask = (
            1 - torch.triu(torch.ones(1, seq_length, seq_length, device=tgt.device), diagonal=1)
        ).bool()
        # Broadcasts (batch, 1, tgt_len, 1) & (1, tgt_len, tgt_len) -> (batch, 1, tgt_len, tgt_len).
        tgt_mask = tgt_mask & nopeak_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        """Run the full encoder-decoder pass.

        Args:
            src: Source token ids, indexed as (batch, src_len).
            tgt: Target token ids fed to the decoder, indexed as (batch, tgt_len).

        Returns:
            Unnormalised logits produced by the final linear layer; the last
            dimension has size ``tgt_vocab_size``.
        """
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        # Encoder stack: each layer consumes the previous layer's output.
        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        # Decoder stack: every layer receives the final encoder output.
        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        output = self.fc(dec_output)
        return output

In [12]:
# 🚀 Step 1: Build a small Transformer (2 layers per stack, 10 token ids per vocabulary)
model = Transformer(
    src_vocab_size=10, tgt_vocab_size=10,
    d_model=512, num_heads=8, num_layers=2, d_ff=2048,
    max_seq_length=10, dropout=0.1,
)

# 🚀 Step 2: Dummy token-id batches, laid out as (batch_size, seq_length)
src = torch.tensor([[1, 2, 3, 4]])   # (1, 4)
tgt = torch.tensor([[1, 2, 3]])      # (1, 3)
print(src, tgt)

# 🚀 Step 3: Padding mask for the source, padding + no-peek mask for the target
src_mask, tgt_mask = model.generate_mask(src, tgt)
print("SRC Mask:", src_mask.shape)   # (1, 1, 1, 4)
print("TGT Mask:", tgt_mask.shape)   # (1, 1, 3, 3)

# 🚀 Step 4: Embedding -> positional encoding -> dropout, source first, then target
src_emb = model.dropout(model.positional_encoding(model.encoder_embedding(src)))   # (1, 4, 512)
tgt_emb = model.dropout(model.positional_encoding(model.decoder_embedding(tgt)))   # (1, 3, 512)

# 🚀 Step 5: Thread the source through every encoder layer in order
enc_output = src_emb
for encoder_block in model.encoder_layers:
    enc_output = encoder_block(enc_output, src_mask)
print("Encoder Output:", enc_output.shape)  # (1, 4, 512)

# 🚀 Step 6: Thread the target through every decoder layer; each one also sees enc_output
dec_output = tgt_emb
for decoder_block in model.decoder_layers:
    dec_output = decoder_block(dec_output, enc_output, src_mask, tgt_mask)
print("Decoder Output:", dec_output.shape)  # (1, 3, 512)

# 🚀 Step 7: Project decoder states to vocabulary logits
output = model.fc(dec_output)               # (1, 3, 10)
print("Final Output (Logits):", output.shape)


tensor([[1, 2, 3, 4]]) tensor([[1, 2, 3]])
SRC Mask: torch.Size([1, 1, 1, 4])
TGT Mask: torch.Size([1, 1, 3, 3])
Encoder Output: torch.Size([1, 4, 512])
Decoder Output: torch.Size([1, 3, 512])
Final Output (Logits): torch.Size([1, 3, 10])


# 🔹 Training the PyTorch Transformer Model



In [37]:
# Seed PyTorch's global RNG so the model initialisation and the random sample
# data below are identical on every Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

src_vocab_size = 5000  # Size of the source vocabulary: number of distinct token ids in the source language.
tgt_vocab_size = 5000  # Size of the target vocabulary: number of distinct token ids in the target language.
d_model = 512  # Dimension of each embedding vector.
num_heads = 8  # Number of attention heads in multi-head attention.
num_layers = 6  # Number of encoder blocks and of decoder blocks in the Transformer.
d_ff = 2048  # Hidden-layer dimension of the feed-forward network (FFN).
max_seq_length = 100  # Sequence-length limit handed to PositionalEncoding; also the length of the sample sequences below. Nothing in this cell truncates longer inputs.
dropout = 0.1  # Dropout rate used to reduce overfitting.
batch_size = 64  # Number of sequences in the random sample batch.

transformer = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout)

# Generate random sample data. Ids are drawn from [1, vocab_size), so id 0
# (treated as padding by generate_mask) never appears.
src_data = torch.randint(1, src_vocab_size, (batch_size, max_seq_length))  # (batch_size, seq_length)
tgt_data = torch.randint(1, tgt_vocab_size, (batch_size, max_seq_length))  # (batch_size, seq_length)

In [39]:
# Inspect the random token-id batches (integer ids, not embeddings): shapes first, then the tensors
src_data.shape, tgt_data.shape, src_data, tgt_data

(torch.Size([64, 100]),
 torch.Size([64, 100]),
 tensor([[3118, 1677, 3564,  ..., 1398, 1970, 1946],
         [3280,  810, 1202,  ..., 4571, 1018, 4718],
         [4039, 1184, 3386,  ..., 4016, 4265, 2708],
         ...,
         [3910, 4409, 1089,  ..., 4814, 3007,  793],
         [2734,  111, 4036,  ..., 4992, 3873, 1282],
         [1951, 3727, 1519,  ..., 2099, 1435,  657]]),
 tensor([[1355,  652, 1298,  ..., 2618, 1007, 3646],
         [1696, 1638, 2502,  ..., 4756, 4015, 2533],
         [1137,  769, 2567,  ..., 2456,  253, 3202],
         ...,
         [2415,  145,  423,  ..., 3233, 4082,   63],
         [4612, 4396, 4301,  ...,  329, 3771, 1609],
         [3613,  421, 4539,  ..., 3359, 1831, 3654]]))

# 🔹 Training the Model

In [None]:
# Loss and optimiser. ignore_index=0 makes the loss skip targets whose id is 0.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

# Put the model in training mode (dropout active).
transformer.train()

# The decoder is fed every target token except the last and is scored against
# every target token except the first, i.e. the same sequence shifted by one.
# Neither slice changes between iterations, so both are prepared once.
decoder_input = tgt_data[:, :-1]
expected_ids = tgt_data[:, 1:].contiguous().view(-1)

# Each "epoch" is a single optimisation step on the same fixed batch.
for epoch in range(100):
    optimizer.zero_grad()
    output = transformer(src_data, decoder_input)
    # Flatten (batch, seq, vocab) logits to (batch * seq, vocab) for the loss.
    flat_logits = output.contiguous().view(-1, tgt_vocab_size)
    loss = criterion(flat_logits, expected_ids)
    loss.backward()
    optimizer.step()
    print(f"Epoch: {epoch+1}, Loss: {loss.item()}")