### Coding Attention Mechanisms

In [57]:
# A compact self-attention class. It uses nn.Parameters to define the weight matrices Wk Wq Wv.

import torch.nn as nn
import torch

class SelfAttentionV1(nn.Module):
    """Single-head scaled dot-product self-attention using raw weight matrices.

    The key, query and value projections are stored as ``nn.Parameter``
    matrices of shape ``(dIn, dOut)`` and applied as ``x @ W``.
    """

    def __init__(self, dIn, dOut):
        """Create the three projection matrices.

        Args:
            dIn: Size of the last dimension of the input.
            dOut: Size of the last dimension of the keys, queries and values.
        """
        super().__init__()
        # The creation order (key, query, value) determines how the random
        # generator is consumed, and therefore the initial weights for a seed.
        self.wKey = nn.Parameter(torch.rand(dIn, dOut))
        self.wQuery = nn.Parameter(torch.rand(dIn, dOut))
        self.wValue = nn.Parameter(torch.rand(dIn, dOut))

    def forward(self, x):
        """Compute the context vectors for ``x``.

        Args:
            x: Tensor of shape ``(numTokens, dIn)``; leading batch dimensions
                ``(..., numTokens, dIn)`` are supported as well.

        Returns:
            Tensor with the same leading dimensions as ``x`` and ``dOut`` as
            its last dimension.
        """
        K = x @ self.wKey
        Q = x @ self.wQuery
        V = x @ self.wValue

        # Swap only the last two dimensions: unlike `.T`, this also works
        # when the input carries leading batch dimensions.
        attnScores = Q @ K.transpose(-2, -1)
        # Scale by sqrt(dOut) before the softmax over the key axis.
        attnWeights = torch.softmax(attnScores / K.shape[-1] ** 0.5, dim=-1)
        return attnWeights @ V

    def updateWeights(self, wKey, wQuery, wValue):
        """Replace the three projection matrices.

        Each argument is wrapped in a new ``nn.Parameter``. Since ``forward``
        computes ``x @ W``, each tensor must be laid out as ``(dIn, dOut)``.

        Args:
            wKey: New key projection matrix.
            wQuery: New query projection matrix.
            wValue: New value projection matrix.
        """
        self.wKey = nn.Parameter(wKey)
        self.wQuery = nn.Parameter(wQuery)
        self.wValue = nn.Parameter(wValue)

In [62]:
torch.manual_seed(123)

# Toy input: five rows with three features each.
inputs = torch.tensor([
    [0.43, 0.15, 0.89],
    [0.57, 0.85, 0.64],
    [0.22, 0.58, 0.33],
    [0.77, 0.25, 0.10],
    [0.05, 0.80, 0.55],
])

# Project the 3-feature rows to 2-feature context vectors.
dIn = 3
dOut = 2
print(SelfAttentionV1(dIn, dOut)(inputs))
# Remove the temporaries from the namespace again.
del inputs, dIn, dOut

tensor([[0.2685, 0.7413],
        [0.2738, 0.7564],
        [0.2668, 0.7366],
        [0.2618, 0.7218],
        [0.2712, 0.7495]], grad_fn=<MmBackward0>)


In [2]:
# Self-attention V2. It uses nn.Linear to define the matrices. nn.Linear is preferred as it has efficient initialization as well as optimized matrix multiplication.

class SelfAttentionV2(nn.Module):
    """Single-head scaled dot-product self-attention built on ``nn.Linear``."""

    def __init__(self, dIn, dOut, qkv_bias=False):
        """Create the key, query and value projection layers.

        Args:
            dIn: Size of the last dimension of the input.
            dOut: Size of the last dimension of the keys, queries and values.
            qkv_bias: Whether the three projection layers use a bias term.
        """
        super().__init__()
        # The creation order (key, query, value) determines how the random
        # generator is consumed, and therefore the initial weights for a seed.
        self.W_key = nn.Linear(dIn, dOut, bias=qkv_bias)
        self.W_query = nn.Linear(dIn, dOut, bias=qkv_bias)
        self.W_value = nn.Linear(dIn, dOut, bias=qkv_bias)

    def forward(self, x):
        """Compute the context vectors for ``x``.

        Args:
            x: Tensor of shape ``(numTokens, dIn)``; leading batch dimensions
                ``(..., numTokens, dIn)`` are supported as well.

        Returns:
            Tensor with the same leading dimensions as ``x`` and ``dOut`` as
            its last dimension.
        """
        # Compute keys, queries and values.
        K = self.W_key(x)
        Q = self.W_query(x)
        V = self.W_value(x)
        # Swap only the last two dimensions: unlike `.t()`, this also works
        # when the input carries leading batch dimensions.
        attnScores = Q @ K.transpose(-2, -1)
        # Scale by sqrt(dOut) before the softmax over the key axis.
        attnWeights = torch.softmax(attnScores / K.shape[-1] ** 0.5, dim=-1)
        return attnWeights @ V

NameError: name 'nn' is not defined

In [3]:
torch.manual_seed(123)

# Input and output feature sizes for the layer.
dIn, dOut = 3, 2

# Toy input: five rows with three features each.
inputs = torch.tensor([
    [0.43, 0.15, 0.89],
    [0.57, 0.85, 0.64],
    [0.22, 0.58, 0.33],
    [0.77, 0.25, 0.10],
    [0.05, 0.80, 0.55],
])

print(SelfAttentionV2(dIn, dOut)(inputs))
# Remove the temporaries from the namespace again.
del inputs, dIn, dOut

NameError: name 'torch' is not defined

In [None]:
# Validate that both attention mechanisms output the same values.

torch.manual_seed(123)

dIn = 3
dOut = 2
# Example input values to try the self-attention layer.
inputs = torch.tensor(
[[0.43, 0.15, 0.89],
 [0.57, 0.85, 0.64], 
 [0.22, 0.58, 0.33],
 [0.77, 0.25, 0.10],
 [0.05, 0.80, 0.55]]
)

sav2 = SelfAttentionV2(dIn, dOut)
sav1 = SelfAttentionV1(dIn, dOut)
# Give SelfAttentionV1 the same weights as SelfAttentionV2.
# nn.Linear stores its weight as (dOut, dIn) while SelfAttentionV1 computes
# x @ W with W shaped (dIn, dOut), hence the transposes.
sav1.updateWeights(sav2.W_key.weight.t(), sav2.W_query.weight.t(), sav2.W_value.weight.t())

outV2 = sav2(inputs)
outV1 = sav1(inputs)
print("Self Attention V2 output: \n", outV2)
print("Self Attention V1 output: \n", outV1)
# Check the equality programmatically instead of relying on eyeballing the
# two printouts; the tolerance absorbs float32 rounding differences.
assert torch.allclose(outV1, outV2, atol=1e-6), "SelfAttentionV1 and SelfAttentionV2 outputs differ"
del inputs, dIn, dOut, outV1, outV2

Self Attention V2 output: 
 tensor([[-0.4927, -0.0791],
        [-0.4938, -0.0806],
        [-0.4924, -0.0851],
        [-0.4923, -0.0819],
        [-0.4928, -0.0853]], grad_fn=<MmBackward0>)
Self Attention V1 output: 
 tensor([[-0.4927, -0.0791],
        [-0.4938, -0.0806],
        [-0.4924, -0.0851],
        [-0.4923, -0.0819],
        [-0.4928, -0.0853]], grad_fn=<MmBackward0>)


#### Causal Self Attention

In causal self-attention / masked attention, the model is restricted to the previous and current elements in a sequence when computing attention scores.  This is in contrast to standard self-attention which considers all elements in the sequence.

It allows LLMs to learn to predict the next token in a sequence.

The following implementation builds on SelfAttentionV2, with these changes:
- A mask is applied to the elements above the diagonal of the attention scores, so a token cannot attend to later tokens.
- Dropout is applied to reduce model overfitting.

Dropout is typically applied at one of two points: after computing the attention weights, or after applying the attention weights to the value vectors.

Here we apply dropout right after computing the attention weights, before they are applied to the value vectors.

In [None]:
class CausalAttention(nn.Module):
    """Single-head causal (masked) self-attention with dropout.

    Each token attends only to itself and to earlier tokens in the sequence.
    """

    def __init__(self, dIn, dOut, context_length, dropout, qkv_bias=False):
        """Create the projection layers, the dropout layer and the causal mask.

        Args:
            dIn: Size of the last dimension of the input.
            dOut: Size of the last dimension of the keys, queries and values.
            context_length: Maximum number of tokens per sequence; it sets
                the side length of the stored mask.
            dropout: Dropout probability applied to the attention weights.
            qkv_bias: Whether the three projection layers use a bias term.
        """
        super().__init__()
        self.W_key = nn.Linear(dIn, dOut, bias=qkv_bias)
        self.W_query = nn.Linear(dIn, dOut, bias=qkv_bias)
        self.W_value = nn.Linear(dIn, dOut, bias=qkv_bias)
        self.dropout = nn.Dropout(p=dropout)
        # `register_buffer` stores the mask as a non-trainable tensor within
        # the model, so it never receives gradients or weight updates.
        self.register_buffer(
            'mask',
            # diagonal=1 keeps the main diagonal out of the mask, so a token
            # can still attend to itself.
            torch.triu(torch.ones(context_length, context_length), diagonal=1),
        )

    def forward(self, x):
        """Compute the causally masked context vectors for a batch.

        Args:
            x: Tensor of shape ``(b, numTokens, dIn)`` with ``numTokens`` no
                larger than ``context_length``.

        Returns:
            Tensor of shape ``(b, numTokens, dOut)``.
        """
        _, numTokens, _ = x.shape
        K = self.W_key(x)  # [b, numTokens, dIn] x [dIn, dOut] = [b, numTokens, dOut]
        Q = self.W_query(x)
        V = self.W_value(x)

        attnScores = Q @ K.transpose(1, 2)  # [b, numTokens, dOut] x [b, dOut, numTokens]
        # Slice rows AND columns in one index. Chaining `[:n][:n]` would slice
        # the rows twice and leave all `context_length` columns in place,
        # which breaks for sequences shorter than `context_length`.
        causalMask = self.mask.bool()[:numTokens, :numTokens]
        # -inf scores become zero weights after the softmax.
        attnScores = attnScores.masked_fill(causalMask, -torch.inf)
        attnWeights = torch.softmax(attnScores / K.shape[-1] ** 0.5, dim=-1)
        # Apply dropout after computing the weights, before applying them.
        attnWeights = self.dropout(attnWeights)
        return attnWeights @ V


In [83]:
torch.manual_seed(123)

# Input and output feature sizes for the layer.
dIn, dOut = 3, 2

# Toy input: five rows with three features each.
inputs = torch.tensor([
    [0.43, 0.15, 0.89],
    [0.57, 0.85, 0.64],
    [0.22, 0.58, 0.33],
    [0.77, 0.25, 0.10],
    [0.05, 0.80, 0.55],
])
# Duplicate the sequence to form a batch of two.
batch = torch.stack([inputs, inputs])

print("batch.shape:", batch.shape)
# The context length is the batch's sequence length; dropout is disabled (0.0).
print(
    "contextVecs.shape:",
    CausalAttention(dIn, dOut, batch.shape[1], 0.0)(batch).shape,
)

# Remove the temporaries from the namespace again.
del dIn, dOut, inputs, batch

batch.shape: torch.Size([2, 5, 3])
contextVecs.shape: torch.Size([2, 5, 2])


#### Multi-head Attention

Multi-head attention means computing attention multiple times using different matrices. This allows the model to learn different relationships between the tokens, and attend to different parts of the sequence.

As a first approach, we implement multi-head attention in the most straightforward way.
It is not the most efficient implementation, but allows us to try this idea.


In [None]:
class MultiHeadAttentionWrapper(nn.Module):
    """Multi-head attention built by running several ``CausalAttention`` heads.

    Every head processes the same input independently; the head outputs are
    concatenated along the last dimension.
    """

    def __init__(self, dIn, dOut, context_length, droput, n_heads):
        """Create ``n_heads`` independent causal attention heads.

        Args:
            dIn: Size of the last dimension of the input.
            dOut: Output size of each individual head.
            context_length: Maximum number of tokens per sequence.
            droput: Dropout probability passed to every head. The parameter
                name keeps its original spelling so keyword callers still work.
            n_heads: Number of attention heads.
        """
        super().__init__()
        # nn.ModuleList registers each head as a submodule. With a plain
        # Python list the heads' parameters would be invisible to
        # `parameters()`, `state_dict()`, `.to(device)` and `train()`/`eval()`.
        self.heads = nn.ModuleList(
            CausalAttention(dIn, dOut, context_length, droput) for _ in range(n_heads)
        )

    def forward(self, x):
        """Run every head on ``x`` and concatenate the results.

        Args:
            x: Input tensor, passed unchanged to each head.

        Returns:
            The head outputs concatenated along the last dimension.
        """
        return torch.cat([head(x) for head in self.heads], dim=-1)

    


In [None]:
torch.manual_seed(123)

# Input size and per-head output size.
dIn, dOut = 3, 2

# Toy input: five rows with three features each.
inputs = torch.tensor([
    [0.43, 0.15, 0.89],
    [0.57, 0.85, 0.64],
    [0.22, 0.58, 0.33],
    [0.77, 0.25, 0.10],
    [0.05, 0.80, 0.55],
])
# Duplicate the sequence to form a batch of two.
batch = torch.stack([inputs, inputs])

print("batch.shape:", batch.shape)
# Two heads, with the context length taken from the batch and dropout disabled.
print(
    "contextVecs.shape:",
    MultiHeadAttentionWrapper(dIn, dOut, batch.shape[1], 0.0, n_heads=2)(batch).shape,
)

# Remove the temporaries from the namespace again.
del dIn, dOut, inputs, batch

batch.shape: torch.Size([2, 5, 3])
contextVecs.shape: torch.Size([2, 5, 4])


### Efficient Multi-Head Attention

Let's implement multi-head attention in a more efficient manner: instead of running a separate attention module for each head, all heads are computed together with a single set of batched matrix multiplications.

It consists in splitting the Keys, Queries, Values matrices into several submatrices, one per head.
This is done by splitting the "d_out" dimension into "n_heads", "d_head". 

i.e. `(batch_size, context_size, d_out)` --> `(batch_size, context_size, n_heads, d_head)`

This reshaping allows us to have a K,Q,V sub-matrix for each head using a single matrix. 

The next step is to transpose the matrix so we can do the matrix multiplication per head:

i.e.  `(batch_size, context_size, n_heads, d_head)` --> `(batch_size, n_heads, context_size, d_head)`

We can then use this matrix as usual to compute the attention weights. 

Once the outputs are computed, we must reshape the matrix to restore the original layout:

`(batch_size, n_heads, context_size, d_head)` --> `(batch_size, context_size, d_out)`

In [1]:
class MultiHeadAttention(nn.Module):
    """Causal multi-head attention using one shared set of projections.

    The ``dOut`` features of the keys, queries and values are split into
    ``n_heads`` groups of ``dHead`` features, attention is computed for all
    heads at once, and the merged result goes through an output projection.
    """

    def __init__(self, dIn, dOut, n_heads, context_length, dropout=0.0, qkv_bias=False):
        """Build the projection layers, the dropout layer and the causal mask.

        Args:
            dIn: Size of the last dimension of the input.
            dOut: Total output size; must be divisible by ``n_heads``.
            n_heads: Number of attention heads.
            context_length: Side length of the stored causal mask.
            dropout: Dropout probability applied to the attention weights.
            qkv_bias: Whether the key/query/value projections use a bias term.
        """
        super().__init__()
        assert (dOut % n_heads == 0), "dOut must be divisible by n_heads"

        self.dOut = dOut
        self.n_heads = n_heads
        self.dHead = dOut // n_heads

        # Layers are created in the order key, query, value, output projection.
        self.W_key = nn.Linear(dIn, dOut, bias=qkv_bias)
        self.W_query = nn.Linear(dIn, dOut, bias=qkv_bias)
        self.W_value = nn.Linear(dIn, dOut, bias=qkv_bias)
        self.dropout = nn.Dropout(p=dropout)

        # Ones strictly above the diagonal mark the positions to hide.
        upperTriangle = torch.triu(torch.ones(context_length, context_length), diagonal=1)
        self.register_buffer('mask', upperTriangle)

        self.out_proj = nn.Linear(dOut, dOut)

    def forward(self, x):
        """Compute the multi-head context vectors for a batch.

        Args:
            x: Tensor of shape ``(b, numTokens, dIn)``.

        Returns:
            Tensor of shape ``(b, numTokens, dOut)``.
        """
        b, numTokens, _ = x.shape
        perHeadShape = (b, numTokens, self.n_heads, self.dHead)

        # Project, split the feature dimension into heads, then move the head
        # axis forward: (b, numTokens, dOut) -> (b, n_heads, numTokens, dHead).
        keys = self.W_key(x).view(perHeadShape).transpose(1, 2)
        queries = self.W_query(x).view(perHeadShape).transpose(1, 2)
        values = self.W_value(x).view(perHeadShape).transpose(1, 2)

        # Pairwise scores per head: (b, n_heads, numTokens, numTokens).
        scores = queries @ keys.transpose(2, 3)
        hidden = self.mask.bool()[:numTokens, :numTokens]
        scores = scores.masked_fill(hidden, -torch.inf)

        # Scaled softmax over the key axis, followed by dropout.
        weights = torch.softmax(scores / keys.shape[-1] ** 0.5, dim=-1)
        weights = self.dropout(weights)

        # Weighted sum of values, then merge the heads back together:
        # (b, n_heads, numTokens, dHead) -> (b, numTokens, dOut).
        perHead = (weights @ values).transpose(1, 2)
        merged = perHead.contiguous().view(b, numTokens, self.dOut)
        return self.out_proj(merged)

NameError: name 'nn' is not defined

In [None]:
torch.manual_seed(123)

# Input size and total output size of the layer.
dIn, dOut = 3, 2

# Toy input: five rows with three features each.
inputs = torch.tensor([
    [0.43, 0.15, 0.89],
    [0.57, 0.85, 0.64],
    [0.22, 0.58, 0.33],
    [0.77, 0.25, 0.10],
    [0.05, 0.80, 0.55],
])
# Duplicate the sequence to form a batch of two.
batch = torch.stack([inputs, inputs])

print("batch.shape:", batch.shape)
# Two heads over a context of five tokens.
print(
    "contextVecs.shape:",
    MultiHeadAttention(dIn, dOut, context_length=5, n_heads=2)(batch).shape,
)

# Remove the temporaries from the namespace again.
del dIn, dOut, inputs, batch

batch.shape: torch.Size([2, 5, 3])
contextVecs.shape: torch.Size([2, 5, 2])


### Exercise 3.3 Initializing GPT-2 size attention modules

Initialize a multi-head attention module that has the same number of attention heads as the smallest GPT-2 model (12 attention heads). Also ensure that you use the respective input and output embedding sizes similar to GPT-2 (768 dimensions). Note that the smallest GPT-2 model supports a context length of 1,024 tokens.

In [None]:

torch.manual_seed(123)

# Input and output embedding sizes both set to 768.
dIn, dOut = 768, 768

# All-ones input: 1024 rows with 768 features each.
inputs = torch.ones(1024, 768)
# Duplicate the sequence to form a batch of two.
batch = torch.stack([inputs, inputs])

print("batch.shape:", batch.shape)
# Twelve heads over a context of 1024 tokens.
print(
    "contextVecs.shape:",
    MultiHeadAttention(dIn, dOut, context_length=1024, n_heads=12)(batch).shape,
)

# Remove the temporaries from the namespace again.
del dIn, dOut, inputs, batch

batch.shape: torch.Size([2, 1024, 768])
contextVecs.shape: torch.Size([2, 1024, 768])
