In [32]:
from importlib.metadata import version

print("torch version:", version("torch"))

torch version: 2.6.0+cu124


#### -----
#### Build a Large Language Model
#### Sebastian Raschka
#### -----

# Coding attention mechanisms

  - This chapter will focus on coding the remaining parts of the LLMs surrounding the self-attention mechanism to see it in action and to create a model to generate text.
  - Different types of attention mechanisms
    - Simplified self-attention:
    - Self-attention: Simplified self-attention with trainable weights
    - Casual attention: Type of self-attention used in LLMs that allows a model to consider on previous and current inputs in a sequence
    - Multi-head attention:

## 3.1 - The problem with modeling long sequences

  - You can't simply translate a text word into another language due to the grammatical structures in source and target language.
  - Deep neural network with two submodules, encoder and decoder:
    - encode : first read in and process the entire text
    - decoder : then produces the translated text.
  - Before transformers, recurrent neural networks (RNNs) were the most popular encoder-decoder architecture for language translation.

## 3.2 - Capturing data dependencies with attention mechanisms

  - Bahdanau attention mechanism : modifies the encoder-decoder RNN such that the decoder can selectively access different parts of the input sequence at each decoding step.
  - Self-attention is a mechanism that allows each position in the input sequence when computing the representation of a sequence.

## 3.3 - Attending to different parts of the input with self-attention

  - In self-attention, the "self" refers to the mechanism's ability to compute attention weights by relating different positions within a single input sequence.

### 3.3.1 - A simple self-attention mechanism without trainable weights

  - In self-attention, our goal is to calculate context vectors for each element x in the input sequence.  A context vector can be interpreted as an enrighted embedding vector.
  - Context vectors play a crucial role in self-attention.  Their purpose is to create enriched representations of each element in an input sequence by incorporating information from all other elsements in the sequence.
  - The first step of implementing self-attention is to compute the intermediate values w, referred to as attention scores.

In [1]:
# small embedding dimension
import torch
inputs = torch.tensor(
    [[0.43, 0.15, 0.89],  # Your       (x^1)
     [0.55, 0.87, 0.66],  # journey    (x^2)
     [0.57, 0.85, 0.64],  # starts     (x^3)
     [0.22, 0.58, 0.33],  # with       (x^4)
     [0.77, 0.25, 0.10],  # one        (x^5)
     [0.05, 0.80, 0.55]]  # step       (x^6)
)

query = inputs[1]
attn_scores_2 = torch.empty(inputs.shape[0])
for i, x_i in enumerate(inputs):
    attn_scores_2[i] = torch.dot(x_i, query)
attn_scores_2

tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])

In [2]:
# Next step is to normalize each of the attention scores we computed previously
attn_weights_2_tmp = attn_scores_2 / attn_scores_2.sum()
print("Attention weights: ", attn_weights_2_tmp)
print("Sum: ", attn_weights_2_tmp.sum())

Attention weights:  tensor([0.1455, 0.2278, 0.2249, 0.1285, 0.1077, 0.1656])
Sum:  tensor(1.0000)


In [3]:
#   More common to use the softmax function for normalization.
#   Additionally, softmax function ensures that the attention weights are always
# positive.
def softmax_naive(x):
  return torch.exp(x) / torch.exp(x).sum(dim = 0)

attn_weights_2_naive = softmax_naive(attn_scores_2)
print("Attention weights: ", attn_weights_2_naive)
print("Sum: ", attn_weights_2_naive.sum())

Attention weights:  tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
Sum:  tensor(1.)


In [4]:
#   It's advisable to use PyTorch of softmax to prevent overflow and underflow
att_weights_2 = torch.softmax(attn_scores_2, dim = 0)
print("Attention weights: ", att_weights_2)
print("Sum: ", att_weights_2.sum())

Attention weights:  tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
Sum:  tensor(1.)


In [5]:
#   After we computed the normalized attention weights, we are ready to
# calculate the context vector z, by multiplying the embedded input tokens, x
query = inputs[1]
context_vec_2 = torch.zeros(query.shape)
for i, x_i in enumerate(inputs):
    context_vec_2 += attn_weights_2_naive[i] * x_i
print(context_vec_2)

tensor([0.4419, 0.6515, 0.5683])


### 3.3.2 - Computing attention weights for all input tokens

In [6]:
#   Let's extend this computation to calculate attention weights and context
# vectors for all inputs
attn_scores = torch.empty(6, 6)
for i, x_i in enumerate(inputs):
  for j, x_j in enumerate(inputs):
    attn_scores[i, j] = torch.dot(x_i, x_j)
print(attn_scores)

tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])


In [7]:
#   Each element in the tensor represents an attention score beween each pair
# of inputs.  The values are normalized, which is why they differ from
# unnormalized attention scores in the preceding tensor.
attn_scores = inputs @ inputs.T
print(attn_scores)

tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])


In [8]:
attn_weights = torch.softmax(attn_scores, dim = 1)
print(attn_weights)

tensor([[0.2098, 0.2006, 0.1981, 0.1242, 0.1220, 0.1452],
        [0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581],
        [0.1390, 0.2369, 0.2326, 0.1242, 0.1108, 0.1565],
        [0.1435, 0.2074, 0.2046, 0.1462, 0.1263, 0.1720],
        [0.1526, 0.1958, 0.1975, 0.1367, 0.1879, 0.1295],
        [0.1385, 0.2184, 0.2128, 0.1420, 0.0988, 0.1896]])


In [9]:
#   By setting dim = -1, we are instructing the softmax function to apply
# normalization along the last dimension of the attn_scores tensor.

# Verify that the rows indeed sum to 1:
row_2_sum = sum([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
print("Row 2 sum: ", row_2_sum)
print("All row sums: ", attn_weights.sum(dim = -1))

Row 2 sum:  1.0
All row sums:  tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000])


In [10]:
#   In the third and final step we use these attention weights to comput all
# context vectors via matrix multiplication
all_context_vectors = attn_weights @ inputs
print(all_context_vectors)

tensor([[0.4421, 0.5931, 0.5790],
        [0.4419, 0.6515, 0.5683],
        [0.4431, 0.6496, 0.5671],
        [0.4304, 0.6298, 0.5510],
        [0.4671, 0.5910, 0.5266],
        [0.4177, 0.6503, 0.5645]])


In [11]:
# We can doubl-check that the code is correct
print("Previous 2nd context vector: ", context_vec_2)

Previous 2nd context vector:  tensor([0.4419, 0.6515, 0.5683])


## 3.4 - Implementing self-attention with trainable weights

  - Next implement the self-attention mechanism called <i>scaled dot-product</i>.
  - Self-attention mechanism with trainable weights builds on the previous concepts: we want to compute context vectors as weighted sums over the input vectors specific to a certain input elements.
  - The most notable difference is the introduction of weight matrices that are updated during model training.  These trainable weight matrices are crucial so that the model (specifically, the attention module inside the model) can learn to produce "good" context vectors.

### 3.4.1 - Computing the attention weights step by step

  - Implement the self-attention mechanism step by step by introducing the tree trainable weight matrices wq, wk, and wv.  These 3 matrices are used to project the embedded input tokens x(i), into query, key and value vectors.

In [12]:
#   We start by computing only one context vector, z(2)
x_2 = inputs[1]
d_in = inputs.shape[1]
d_out = 2

#   Next initialize the three weight matrices wq, wk, wv
torch.manual_seed(123)
W_query = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
W_key   = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
W_value = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)

#   Set requires_grad = False, to reduce clutter in the outputs, but if we were
# to use the weight matrices for model training, we would set requires_grad = True

#   Compute the query, key, and value vectors
query_2 = x_2 @ W_query
key_2   = x_2 @ W_key
value_2 = x_2 @ W_value
print(query_2)

tensor([0.4306, 1.4551])


In [13]:
#  We can obtain all keys and values via matrix multiplication:
keys = inputs @ W_key
values = inputs @ W_value
print("keys.shape: ", keys.shape)
print("values.shape: ", values.shape)

keys.shape:  torch.Size([6, 2])
values.shape:  torch.Size([6, 2])


In [14]:
# The next step is to comput the attention score w22:
keys_2 = keys[1]
attn_score_22 = query_2.dot(keys_2)
print(attn_score_22)

tensor(1.8524)


In [15]:
#   We can generalize this computation to all attention scores via
# matrix multiplication
attn_scores_2 = query_2 @ keys.T
print(attn_scores_2)

tensor([1.2705, 1.8524, 1.8111, 1.0795, 0.5577, 1.5440])


In [16]:
#   We can compute the attention weights by scaling the attention scores and
# using the softmax function.
d_k = keys.shape[-1]
attn_weights_2 = torch.softmax(attn_scores_2 / d_k**0.5, dim = -1)
print(attn_weights_2)

tensor([0.1500, 0.2264, 0.2199, 0.1311, 0.0906, 0.1820])


In [17]:
#   The final step is to comput the context vectors
#   We compute the context vector as weighted sum over the input vectors
context_vec_2 = attn_weights_2 @ values
print(context_vec_2)

tensor([0.3061, 0.8210])


### 3.4.2 - Implementing a compact self-attention Python class

In [18]:
# A compact self-attention class
import torch.nn as nn

class SelfAttention_v1(nn.Module):
  def __init__(self, d_in, d_out):
    super().__init__()
    self.W_query = nn.Parameter(torch.rand(d_in, d_out))
    self.W_key   = nn.Parameter(torch.rand(d_in, d_out))
    self.W_value = nn.Parameter(torch.rand(d_in, d_out))

  def forward(self, x):
    keys        = x @ self.W_key
    queries     = x @ self.W_query
    values      = x @ self.W_value
    attn_scores = queries @ keys.T #omega
    attn_weights = torch.softmax(
        attn_scores / keys.shape[-1]**0.5, dim = -1
    )
    context_vec = attn_weights @ values
    return context_vec

In [19]:
# We can use this class as follows:
torch.manual_seed(123)
sa_v1 = SelfAttention_v1(d_in, d_out)
print(sa_v1(inputs))

tensor([[0.2996, 0.8053],
        [0.3061, 0.8210],
        [0.3058, 0.8203],
        [0.2948, 0.7939],
        [0.2927, 0.7891],
        [0.2990, 0.8040]], grad_fn=<MmBackward0>)


  - Self-attention involves the trainable weight matrices wq, wk, and wv.
  - These matrices transform input data into queries, keys, and values, repectively, which are crucial components of the attention mechanism.
  - We can improve the SelfAttention_v1 implementation further by utilizing PyTorch's nn.Linear layers, which effectively perform matrix multiplication when the bias unites are disabled.  
  - Additionall, a significant advantage of using nn.Linear instead of manually implementing nn.Parameter(torch.rand(...)) is that nn.Linear has an optimized weight initialization scheme, contributing to more stable and effective model training.

In [20]:
#### TO-DO : ERROR STARTS HERE

class SelfAttention_v2(nn.Module):
  def __init__(self, d_in, d_out, qkv_bias=False):
    super().__init__()
    self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
    self.W_key   = nn.Linear(d_in, d_out, bias=qkv_bias)
    self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)

  def forward(self, x):
    keys        = self.W_key(x)
    queries     = self.W_query(x)
    values      = self.W_value(x)
    attn_scores = queries @ keys.T
    attn_weights = torch.softmax(
        attn_scores / keys.shape[-1]**0.5, dim = -1
    )
    context_vec = attn_weights @ values
    return context_vec

torch.manual_seed(123)
sa_v2 = SelfAttention_v2(d_in, d_out)
print(sa_v2(inputs))

tensor([[-0.5337, -0.1051],
        [-0.5323, -0.1080],
        [-0.5323, -0.1079],
        [-0.5297, -0.1076],
        [-0.5311, -0.1066],
        [-0.5299, -0.1081]], grad_fn=<MmBackward0>)


  - v1 and v2 give different outputs because they use different initial weights.

## 3.5 - Hiding future words with causal attention

  - For many LLM tasks, you will want the self-attention mechanism to consider only tokens that appear prior to the current position when predicting the next token in a sequence.
  - Casual attention (masked attention) is specialized form of self-attention.
    - It restricts a model to only consider previous and current inputs in a sequence when processing any given token when computing attention scores.

### 3.5.1 - Applying a casual attention mask

In [21]:
# Compute the attentoion weights using the softmax function as we have done before
queries = sa_v2.W_query(inputs)
keys = sa_v2.W_key(inputs)
attn_scores = queries @ keys.T
attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim = -1)
print(attn_weights)

tensor([[0.1717, 0.1762, 0.1761, 0.1555, 0.1627, 0.1579],
        [0.1636, 0.1749, 0.1746, 0.1612, 0.1605, 0.1652],
        [0.1637, 0.1749, 0.1746, 0.1611, 0.1606, 0.1651],
        [0.1636, 0.1704, 0.1702, 0.1652, 0.1632, 0.1674],
        [0.1667, 0.1722, 0.1721, 0.1618, 0.1633, 0.1639],
        [0.1624, 0.1709, 0.1706, 0.1654, 0.1625, 0.1682]],
       grad_fn=<SoftmaxBackward0>)


In [22]:
# Use PyTorch's tril function to create a mask where the values above the diagonal are zero
context_length = attn_scores.shape[0]
mask_simple = torch.tril(torch.ones(context_length, context_length))
print(mask_simple)

tensor([[1., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1.]])


In [23]:
# Now multiply this mask with the attention weights to zero-out the values above the diagonal:
mask_simple = attn_weights * mask_simple
print(mask_simple)

tensor([[0.1717, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1636, 0.1749, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1637, 0.1749, 0.1746, 0.0000, 0.0000, 0.0000],
        [0.1636, 0.1704, 0.1702, 0.1652, 0.0000, 0.0000],
        [0.1667, 0.1722, 0.1721, 0.1618, 0.1633, 0.0000],
        [0.1624, 0.1709, 0.1706, 0.1654, 0.1625, 0.1682]],
       grad_fn=<MulBackward0>)


In [24]:
# The next step is to renormalize the attention weights to sum up to 1 again in each row.
row_sums = mask_simple.sum(dim = -1, keepdim = True)
masked_simple_norm = mask_simple / row_sums
print(masked_simple_norm)

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4833, 0.5167, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3190, 0.3408, 0.3402, 0.0000, 0.0000, 0.0000],
        [0.2445, 0.2545, 0.2542, 0.2468, 0.0000, 0.0000],
        [0.1994, 0.2060, 0.2058, 0.1935, 0.1953, 0.0000],
        [0.1624, 0.1709, 0.1706, 0.1654, 0.1625, 0.1682]],
       grad_fn=<DivBackward0>)


In [25]:
#   To improve our casual attention, let's take a mathematical property of the
# softmax function and implement the computation of the masked attention weights
# more effivciently in fewer steps
#   The softmax function converts into an probablility distribution.  When
# negative infinity values are preseent in a row, it treats them as zer.
#   Implement this by creating a mask with 1s above the diagonal the then
# replacing these 1s with negative infinity
mask = torch.triu(torch.ones(context_length, context_length), diagonal=1)
masked = attn_scores.masked_fill(mask.bool(), -torch.inf)
print(masked)

tensor([[0.3111,   -inf,   -inf,   -inf,   -inf,   -inf],
        [0.1655, 0.2602,   -inf,   -inf,   -inf,   -inf],
        [0.1667, 0.2602, 0.2577,   -inf,   -inf,   -inf],
        [0.0510, 0.1080, 0.1064, 0.0643,   -inf,   -inf],
        [0.1415, 0.1875, 0.1863, 0.0987, 0.1121,   -inf],
        [0.0476, 0.1192, 0.1171, 0.0731, 0.0477, 0.0966]],
       grad_fn=<MaskedFillBackward0>)


In [26]:
#   No apply the softmax function
attn_weights = torch.softmax(masked / keys.shape[-1]**0.5, dim = -1)
print(attn_weights)

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4833, 0.5167, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3190, 0.3408, 0.3402, 0.0000, 0.0000, 0.0000],
        [0.2445, 0.2545, 0.2542, 0.2468, 0.0000, 0.0000],
        [0.1994, 0.2060, 0.2058, 0.1935, 0.1953, 0.0000],
        [0.1624, 0.1709, 0.1706, 0.1654, 0.1625, 0.1682]],
       grad_fn=<SoftmaxBackward0>)


### 3.5.2 - Masking additional attention weights with dropout

  - Dropout is deep learning is a technique where randomly selected hidden layer units are ingnored during training.  
  - This method helps prevent overfitting by ensuring that the model does not become overly reliant on any specific set of hidden layer units.

In [27]:
#   We use a dropout rate of 50%, which means masking out half of the attention
# weights.
torch.manual_seed(123)
droupout = torch.nn.Dropout(0.5)
example = torch.ones(6, 6)
print(droupout(example))

#   our example doesn't match the one from the book.  Are they using a differnt
# seed, or did torch change their algorithm

tensor([[2., 2., 2., 2., 2., 2.],
        [0., 2., 0., 0., 0., 0.],
        [0., 0., 2., 0., 2., 0.],
        [2., 2., 0., 0., 0., 2.],
        [2., 0., 0., 0., 0., 2.],
        [0., 2., 0., 0., 0., 0.]])


In [28]:
#   To compensate for the reduction in active elements, the values of the
# remaining elements in the matrix are scaled up by a factor of 1/0.5 = 2.
# Allpy dropout to the attention weight matrix itself:
torch.manual_seed(123)
print(droupout(attn_weights))

tensor([[2.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 1.0335, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.6804, 0.0000, 0.0000, 0.0000],
        [0.4889, 0.5090, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3988, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.3418, 0.0000, 0.0000, 0.0000, 0.0000]],
       grad_fn=<MulBackward0>)


### 3.5.3 - Implementing a compact casual attention class

In [29]:
#   Incorporate the causal attention and dropout modications into the SelfAttention
# python class.
batch = torch.stack((inputs, inputs), dim = 0)
print(batch.shape)

torch.Size([2, 6, 3])


In [31]:
#   The Casual Attention class is similar to the SelfAttention class

class CausalAttention(nn.Module):
  def __init__(self, d_in, d_out, context_length,
               droupout, qkv_bias=False):
    super().__init__()
    self.d_out   = d_out
    self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
    self.W_key   = nn.Linear(d_in, d_out, bias=qkv_bias)
    self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
    self.dropout  = nn.Dropout(droupout)
    self.register_buffer(
        'mask',
        torch.triu(torch.ones(context_length, context_length)),
        diagonal = 1
    )

  def forward(self, x):
    b, num_tokens, d_in = x.shape
    keys        = self.W_key(x)
    queries     = self.W_query(x)
    values      = self.W_value(x)

    attn_scores = queries @ keys.transpose(1, 2)
    attn_scores.masked_fill_(
        self.mask.bool()[:num_tokens, :num_tokens], -torch.inf)

    attn_weights = torch.softmax(
        attn_scores / keys.shape[-1]**0.5, dim = -1
    )
    attn_weights = self.dropout(attn_weights)
    context_vec = attn_weights @ values

    return context_vec

#   We now added a self.register_buffer() call in the __init__ method.  The use of
# register_buffer in PyTorch is not strictly necessary for all use cases but
# offers several advantages here.

#   Instantiate and use our CausalAttention class
torch.manual_seed(123)
context_length = batch.shape[1]
ca = CausalAttention(d_in, d_out, context_length, 0.0)
context_vecs = ca(batch)
print("context_vecs.shape: ", context_vecs.shape)

TypeError: Module.register_buffer() got an unexpected keyword argument 'diagonal'