### Requiriments

In [5]:
# pip install tiktoken
# import torch
# print("PyTorch version:", torch.__version__)

### Vocabulary

- **vocab_size:** es el tamaño del vocabulario en tokens únicos disponibles+extensiones. En nuestro caso dado por el tokenizador creado con BPE.
- **output_dim:** el número de dimensiones de cada token. Las dimensiones describen a una palabra o concepto. Más dimensiones capturan más detalles.
- **max_length:** es la máxima logitud de tokens por secuencia.
- **batch_size:** es cuantas secuencias tiene cada batch.
- **stride:** el tamaño de la zancada en tokens, cuantos tokens salta para la siguiente secuencia, esto determina que tanto se empalma una secuencia con otra.
- **shuffle:** determina si le da un orden aleatorio a las secuencias.

### Load the text for training (our corpus)

In [4]:
import os
import urllib.request

if not os.path.exists("the-verdict.txt"):
    url = ("https://raw.githubusercontent.com/rasbt/"
           "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
           "the-verdict.txt")
    file_path = "the-verdict.txt"
    urllib.request.urlretrieve(url, file_path)

with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

print("Total number of character:", len(raw_text))
print(raw_text[:99])

Total number of character: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


### BytePair Encoding (BPE)


In [None]:
import importlib
import tiktoken

print("tiktoken version:", importlib.metadata.version("tiktoken"))

tiktoken version: 0.9.0


In [None]:
tokenizer = tiktoken.get_encoding("gpt2")
enc_text = tokenizer.encode(raw_text, allowed_special={"<|endoftext|>"})
enc_text.append(tokenizer.eot_token)

# First 10 tokens from raw_text
first_10_token_ids = enc_text[:10]
decoded_tokens = [tokenizer.decode([token_id]) for token_id in first_10_token_ids]
delimited_tokens = ' |-|'.join(decoded_tokens)
print(delimited_tokens)
print(enc_text[:10])
print(f'\n Total of tokens: {len(enc_text)}')

I |-| H |-|AD |-| always |-| thought |-| Jack |-| G |-|is |-|burn |-| rather
[40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138]

 Total of tokens: 5146


### Dataset loader (creating tokenIDs for inputs and targets)

In [None]:
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
        token_ids.append(tokenizer.eot_token)

        # Use a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

# dataset = GPTDatasetV1(raw_text, tokenizer, max_length=4, stride=1)
# print(dataset.input_ids)
# print(dataset.target_ids)

In [None]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):

    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Create dataloader
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader

### Use DataLoader

In [None]:
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=1, shuffle=False
)

# for batch in dataloader:
#     input, target = batch
#     print(input, target)

data_iter = iter(dataloader)

first_batch = next(data_iter)
print(first_batch) # input and target
second_batch = next(data_iter)
print(second_batch) # input and target

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]
[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


In [None]:
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)

data_iter = iter(dataloader)
# First batch
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)
print("Inputs:\n", tokenizer.decode(inputs[0].tolist()))
print("\nTargets:\n", tokenizer.decode(targets[0].tolist()))

# Second batch
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)
print("Inputs:\n", tokenizer.decode(inputs[0].tolist()))
print("\nTargets:\n", tokenizer.decode(targets[0].tolist()))

Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])
Inputs:
 I HAD always

Targets:
  HAD always thought
Inputs:
 tensor([[  287,   262,  6001,   286],
        [  465, 13476,    11,   339],
        [  550,  5710,   465, 12036],
        [   11,  6405,   257,  5527],
        [27075,    11,   290,  4920],
        [ 2241,   287,   257,  4489],
        [   64,   319,   262, 34686],
        [41976,    13,   357, 10915]])

Ta

### Create our token embedding layer

In [None]:
vocab_size = 50257
output_dim = 3 # 256 is a more common starting point

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
print(token_embedding_layer.weight.shape)
print(token_embedding_layer.weight)

torch.Size([50257, 3])
Parameter containing:
tensor([[-0.3140,  0.3158, -0.0184],
        [-0.6753, -0.3501,  0.3179],
        [ 0.9742,  0.9866, -0.2060],
        ...,
        [-0.3815, -0.2799, -1.2723],
        [-0.1445, -0.1240,  0.9424],
        [ 0.0474,  2.2421,  1.2822]], requires_grad=True)


#### Load our dataset to get the inputs

In [None]:
max_length = 4
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=max_length,
    stride=max_length, shuffle=False
)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)

print("Token IDs:\n",  inputs) # we take the first batch and  ignore the targets for now
print("\nInputs shape:\n", inputs.shape)

Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs shape:
 torch.Size([8, 4])


### Create token embeddings

In [None]:
token_embeddings = token_embedding_layer(inputs)
# each token now has the assigned number of dimentions instead of being a single token ID
print(token_embeddings.shape)

# uncomment & execute the following line to see how the embeddings look like
print(token_embeddings)

torch.Size([8, 4, 3])
tensor([[[-1.5667, -0.4043,  2.0194],
         [ 0.0318, -0.5878,  0.1383],
         [ 0.1082,  0.3448,  0.5661],
         [ 1.2075,  0.2069, -0.4459]],

        [[ 0.4012,  0.3681, -0.5622],
         [-1.4374, -0.3197, -0.8440],
         [ 0.1240, -0.2647,  0.1389],
         [-0.1040, -0.3369,  0.9580]],

        [[-1.7125, -1.2589,  0.1095],
         [-0.0731, -0.5805, -1.4300],
         [ 0.3627,  1.1747,  2.5600],
         [-0.1531, -0.6818, -1.6566]],

        [[ 1.1510,  0.6730, -0.4737],
         [-0.0960, -0.2659,  0.4583],
         [-2.5486, -0.4815,  0.5106],
         [ 0.3627,  1.1747,  2.5600]],

        [[ 1.0877,  0.6921, -0.9737],
         [-1.9039,  1.7196,  0.4717],
         [-0.6503, -0.6275, -1.3836],
         [-0.0960, -0.2659,  0.4583]],

        [[-0.9901, -1.3036,  1.1135],
         [ 0.2733,  0.4451,  0.7787],
         [-0.1911,  0.2768,  0.1496],
         [-0.7183, -0.1906, -0.6420]],

        [[-0.0503,  1.5289,  1.3821],
         [ 1.300

### Create absolute positional embeddings

In [None]:
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

# # [0, 1, 2, 3] "column" position is the position of each word on each sequence of 4 context_length
pos_embeddings = pos_embedding_layer(torch.arange(max_length))
print(pos_embeddings.shape)

# # uncomment & execute the following line to see how the embeddings look like
print(pos_embeddings)

torch.Size([4, 3])
tensor([[ 0.2898,  1.3335, -0.0266],
        [ 0.1003, -0.6332, -1.2612],
        [ 0.2369, -0.6738,  1.5943],
        [-0.1716,  0.6951,  0.1599]], grad_fn=<EmbeddingBackward0>)


### Create input embeddings

In [None]:
input_embeddings = token_embeddings + pos_embeddings
print(f'{token_embeddings.shape} + {pos_embeddings.shape} = {input_embeddings.shape}')

# uncomment & execute the following line to see how the embeddings look like
print(input_embeddings)

torch.Size([8, 4, 3]) + torch.Size([4, 3]) = torch.Size([8, 4, 3])
tensor([[[-1.2770,  0.9292,  1.9928],
         [ 0.1321, -1.2209, -1.1229],
         [ 0.3451, -0.3290,  2.1604],
         [ 1.0359,  0.9019, -0.2860]],

        [[ 0.6909,  1.7016, -0.5888],
         [-1.3371, -0.9529, -2.1052],
         [ 0.3609, -0.9385,  1.7333],
         [-0.2756,  0.3582,  1.1179]],

        [[-1.4228,  0.0746,  0.0829],
         [ 0.0272, -1.2137, -2.6912],
         [ 0.5995,  0.5009,  4.1544],
         [-0.3246,  0.0133, -1.4967]],

        [[ 1.4407,  2.0065, -0.5003],
         [ 0.0043, -0.8991, -0.8029],
         [-2.3118, -1.1553,  2.1049],
         [ 0.1911,  1.8698,  2.7199]],

        [[ 1.3775,  2.0256, -1.0003],
         [-1.8036,  1.0864, -0.7895],
         [-0.4134, -1.3013,  0.2108],
         [-0.2675,  0.4292,  0.6182]],

        [[-0.7004,  0.0298,  1.0869],
         [ 0.3736, -0.1881, -0.4825],
         [ 0.0458, -0.3970,  1.7439],
         [-0.8899,  0.5044, -0.4821]],

        [

### What about more dimmension?

In [None]:
# vocab_size = 50257
# output_dim = 256 # 256 is a more common starting point

# token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
# token_embeddings = token_embedding_layer(inputs)
# print(token_embeddings.shape)
# print(token_embeddings)

# context_length = max_length
# pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
# pos_embeddings = pos_embedding_layer(torch.arange(max_length))
# print(pos_embeddings.shape)
# print(pos_embeddings)


# input_embeddings = token_embeddings + pos_embeddings
# print(f'{token_embeddings.shape} + {pos_embeddings.shape} = {input_embeddings.shape}')
# print(input_embeddings)

# Chapter 3

## Preparing data to work with

### Get input embeddings

In [None]:
def get_input_embeddings(my_raw_text):
  dataloader = create_dataloader_v1(my_raw_text, batch_size=1, max_length=6, stride=6, shuffle=False)

  data_iter = iter(dataloader)
  # First and only batch
  inputs, targets = next(data_iter)
  # print(inputs)
  # print(targets)  # ignore the targets

  vocab_size = 50257
  output_dim = 3

  token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
  token_embeddings = token_embedding_layer(inputs)
  context_length = 6
  pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
  pos_embeddings = pos_embedding_layer(torch.arange(context_length))

  input_embeddings = token_embeddings + pos_embeddings
  # print(f'{token_embeddings.shape} + {pos_embeddings.shape} = {input_embeddings.shape}')
  # print(input_embeddings)
  return input_embeddings

In [None]:
my_raw_text = "Your journey starts with one step"
small_input_embeddings = get_input_embeddings(my_raw_text)[0]
print(small_input_embeddings)

tensor([[-0.2882, -1.0678, -1.1739],
        [-0.6651, -3.8029,  0.4468],
        [-1.1316,  1.3744, -0.2364],
        [-1.0242,  0.8816,  1.2310],
        [-2.4379, -2.3635, -1.9857],
        [ 1.2236,  2.3880,  1.7501]], grad_fn=<SelectBackward0>)


In [None]:
# tensor([
#     [ 0.4113,  1.3397, -1.2234], Your    (x^1)
#     [-1.8881, -0.0679, -1.1267], journey (x^2)
#     [-0.2323, -2.2089, -1.6685], starts  (x^3)
#     [ 0.5615,  1.2698,  2.5768], with    (x^4)
#     [-0.9290, -0.0227,  0.6467], one     (x^5)
#     [ 0.5691, -2.0627, -3.2411]  step    (x^6)
# ])

### Forcing values to fit between 0 and 1

In [None]:
import decimal
min_val = small_input_embeddings.min()
max_val = small_input_embeddings.max()
scaled_embeddings = (small_input_embeddings - min_val) / (max_val - min_val)
rounded_embeddings = torch.round(scaled_embeddings * 100) / 100
print(rounded_embeddings)

tensor([[1.0000, 0.0000, 0.1700],
        [0.5800, 0.4600, 0.0300],
        [0.4500, 0.4500, 0.0600],
        [0.0100, 0.6600, 0.5700],
        [0.4600, 0.6800, 0.4400],
        [0.2600, 0.0700, 0.2200]], grad_fn=<DivBackward0>)


## Simple self-attention

### Step 1 - Compute unormalized attention scores

In [None]:
query = rounded_embeddings[1]  # 2nd input token is the query)
# just allocate a tensor in memory with 6 spaces
attn_scores_2 = torch.empty(rounded_embeddings.shape[0])

#fill the tensor with the dot products which multiply and sum
for i, x_i in enumerate(rounded_embeddings):
    print(f'dot product of {x_i} against journey {query}')
    attn_scores_2[i] = torch.dot(x_i, query) # dot product (transpose not necessary here since they are 1-dim vectors)

print(attn_scores_2)

dot product of tensor([1.0000, 0.0000, 0.1700], grad_fn=<UnbindBackward0>) against journey tensor([0.5800, 0.4600, 0.0300], grad_fn=<SelectBackward0>)
dot product of tensor([0.5800, 0.4600, 0.0300], grad_fn=<UnbindBackward0>) against journey tensor([0.5800, 0.4600, 0.0300], grad_fn=<SelectBackward0>)
dot product of tensor([0.4500, 0.4500, 0.0600], grad_fn=<UnbindBackward0>) against journey tensor([0.5800, 0.4600, 0.0300], grad_fn=<SelectBackward0>)
dot product of tensor([0.0100, 0.6600, 0.5700], grad_fn=<UnbindBackward0>) against journey tensor([0.5800, 0.4600, 0.0300], grad_fn=<SelectBackward0>)
dot product of tensor([0.4600, 0.6800, 0.4400], grad_fn=<UnbindBackward0>) against journey tensor([0.5800, 0.4600, 0.0300], grad_fn=<SelectBackward0>)
dot product of tensor([0.2600, 0.0700, 0.2200], grad_fn=<UnbindBackward0>) against journey tensor([0.5800, 0.4600, 0.0300], grad_fn=<SelectBackward0>)
tensor([0.5851, 0.5489, 0.4698, 0.3265, 0.5928, 0.1896], grad_fn=<CopySlices>)


### Step 2 - Normalize the attenton scores to sum up to 1

In [None]:
def softmax_naive(x):
    return torch.exp(x) / torch.exp(x).sum(dim=0)

attn_weights_2_naive = softmax_naive(attn_scores_2)

print("Attention weights:", attn_weights_2_naive)
print("Sum:", attn_weights_2_naive.sum())

Attention weights: tensor([0.1884, 0.1817, 0.1679, 0.1454, 0.1898, 0.1268],
       grad_fn=<DivBackward0>)
Sum: tensor(1., grad_fn=<SumBackward0>)


In [None]:
# Fooling around with dimensions
my_tensor = torch.ones(2,3,6,4)
# print(my_tensor)
print(my_tensor.shape)
print(my_tensor.shape[-1])
print(my_tensor.shape[0])
print(my_tensor.shape[1])
print(my_tensor.shape[2])

torch.Size([2, 3, 6, 4])
4
2
3
6


In [None]:
# using pytorch softmax fucntion
attn_weights_2 = torch.softmax(attn_scores_2, dim=0)

print("Attention weights:", attn_weights_2)
print("Sum:", attn_weights_2.sum())


Attention weights: tensor([0.1884, 0.1817, 0.1679, 0.1454, 0.1898, 0.1268],
       grad_fn=<SoftmaxBackward0>)
Sum: tensor(1., grad_fn=<SumBackward0>)


### Step 3 - Compute the context vector $z^{(2)}$

In [None]:
query = rounded_embeddings[1] # 2nd input token is the query

context_vec_2 = torch.zeros(query.shape)
attn_weights_sum = 0
for i,x_i in enumerate(rounded_embeddings):
    print(f'{attn_weights_2[i]} * {x_i}')
    context_vec_2 += attn_weights_2[i]*x_i
    attn_weights_sum += attn_weights_2[i]

print(attn_weights_sum)
print(context_vec_2)

0.18836823105812073 * tensor([1.0000, 0.0000, 0.1700], grad_fn=<UnbindBackward0>)
0.18167123198509216 * tensor([0.5800, 0.4600, 0.0300], grad_fn=<UnbindBackward0>)
0.1678546965122223 * tensor([0.4500, 0.4500, 0.0600], grad_fn=<UnbindBackward0>)
0.1454450935125351 * tensor([0.0100, 0.6600, 0.5700], grad_fn=<UnbindBackward0>)
0.18982425332069397 * tensor([0.4600, 0.6800, 0.4400], grad_fn=<UnbindBackward0>)
0.12683647871017456 * tensor([0.2600, 0.0700, 0.2200], grad_fn=<UnbindBackward0>)
tensor(1.0000, grad_fn=<AddBackward0>)
tensor([0.4910, 0.3931, 0.2419], grad_fn=<AddBackward0>)


### Get All attention weights

In [None]:
attn_scores = torch.empty(6, 6)

for i, x_i in enumerate(rounded_embeddings):
    for j, x_j in enumerate(rounded_embeddings):
        attn_scores[i, j] = torch.dot(x_i, x_j)

print(attn_scores)

tensor([[1.0289, 0.5851, 0.4602, 0.1069, 0.5348, 0.2974],
        [0.5851, 0.5489, 0.4698, 0.3265, 0.5928, 0.1896],
        [0.4602, 0.4698, 0.4086, 0.3357, 0.5394, 0.1617],
        [0.1069, 0.3265, 0.3357, 0.7606, 0.7042, 0.1742],
        [0.5348, 0.5928, 0.5394, 0.7042, 0.8676, 0.2640],
        [0.2974, 0.1896, 0.1617, 0.1742, 0.2640, 0.1209]],
       grad_fn=<CopySlices>)


We can achive the same but more efficiently via matrix multiplication

In [None]:
# Using matrix transpose (remember is row * column)
print(rounded_embeddings.shape)
print(rounded_embeddings.T.shape)
print('-------------------')
print(rounded_embeddings)
print('-------------------')
print(rounded_embeddings.T)

torch.Size([6, 3])
torch.Size([3, 6])
-------------------
tensor([[1.0000, 0.0000, 0.1700],
        [0.5800, 0.4600, 0.0300],
        [0.4500, 0.4500, 0.0600],
        [0.0100, 0.6600, 0.5700],
        [0.4600, 0.6800, 0.4400],
        [0.2600, 0.0700, 0.2200]], grad_fn=<DivBackward0>)
-------------------
tensor([[1.0000, 0.5800, 0.4500, 0.0100, 0.4600, 0.2600],
        [0.0000, 0.4600, 0.4500, 0.6600, 0.6800, 0.0700],
        [0.1700, 0.0300, 0.0600, 0.5700, 0.4400, 0.2200]],
       grad_fn=<PermuteBackward0>)


In [None]:
attn_scores = rounded_embeddings @ rounded_embeddings.T
print(attn_scores)

tensor([[1.0289, 0.5851, 0.4602, 0.1069, 0.5348, 0.2974],
        [0.5851, 0.5489, 0.4698, 0.3265, 0.5928, 0.1896],
        [0.4602, 0.4698, 0.4086, 0.3357, 0.5394, 0.1617],
        [0.1069, 0.3265, 0.3357, 0.7606, 0.7042, 0.1742],
        [0.5348, 0.5928, 0.5394, 0.7042, 0.8676, 0.2640],
        [0.2974, 0.1896, 0.1617, 0.1742, 0.2640, 0.1209]],
       grad_fn=<MmBackward0>)


Apply softmax

In [None]:
# dim=-1 is to apply softmax to the last dimenison, in this case rows
attn_weights = torch.softmax(attn_scores, dim=-1)
print(attn_weights)

tensor([[0.2705, 0.1735, 0.1532, 0.1076, 0.1650, 0.1302],
        [0.1884, 0.1817, 0.1679, 0.1454, 0.1898, 0.1268],
        [0.1765, 0.1782, 0.1676, 0.1558, 0.1910, 0.1309],
        [0.1203, 0.1499, 0.1512, 0.2313, 0.2186, 0.1287],
        [0.1561, 0.1654, 0.1568, 0.1849, 0.2177, 0.1191],
        [0.1831, 0.1644, 0.1599, 0.1619, 0.1771, 0.1535]],
       grad_fn=<SoftmaxBackward0>)


In [None]:
row_0_sum = sum([0.1403, 0.1365, 0.1915, 0.1552, 0.1659, 0.2106])
print(row_0_sum)
print("All row sums:", attn_weights.sum(dim=-1))

1.0
All row sums: tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
       grad_fn=<SumBackward1>)


### Compute All context vectors

In [None]:
all_context_vecs = attn_weights @ rounded_embeddings
print(all_context_vecs)
print("Previous 2nd context vector:", context_vec_2)

tensor([[0.5509, 0.3411, 0.2230],
        [0.4910, 0.3931, 0.2419],
        [0.4787, 0.3993, 0.2471],
        [0.4116, 0.4473, 0.2904],
        [0.4556, 0.4251, 0.2683],
        [0.4735, 0.3856, 0.2497]], grad_fn=<MmBackward0>)
Previous 2nd context vector: tensor([0.4910, 0.3931, 0.2419], grad_fn=<AddBackward0>)


## Self Attention

**Weight parameters** are learned coefficients that define the network connections, while **attention weights** are dynamic, context-specific values.

In [None]:
x_2 = rounded_embeddings[1] # second input element
d_in = rounded_embeddings.shape[1] # the input embedding size, d=3
d_out = 2 # the output embedding size, d=2

torch.manual_seed(123)

W_query = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
W_key   = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
W_value = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)

print(W_query)
print(W_key)
print(W_value)

Parameter containing:
tensor([[0.2961, 0.5166],
        [0.2517, 0.6886],
        [0.0740, 0.8665]])
Parameter containing:
tensor([[0.1366, 0.1025],
        [0.1841, 0.7264],
        [0.3153, 0.6871]])
Parameter containing:
tensor([[0.0756, 0.1966],
        [0.3164, 0.4017],
        [0.1186, 0.8274]])


In [None]:
query_2 = x_2 @ W_query # _2 because it's with respect to the 2nd input element
key_2 = x_2 @ W_key
value_2 = x_2 @ W_value

print(query_2)

# calculate keys and values vector for all inputs
keys = rounded_embeddings @ W_key
values = rounded_embeddings @ W_value

print("keys.shape:", keys.shape)
print("values.shape:", values.shape)

tensor([0.2897, 0.6423], grad_fn=<SqueezeBackward4>)
keys.shape: torch.Size([6, 2])
values.shape: torch.Size([6, 2])


In [None]:
keys_2 = keys[1] # Python starts index at 0
attn_score_22 = query_2.dot(keys_2)
print(attn_score_22)

tensor(0.3163, grad_fn=<DotBackward0>)


In [None]:
attn_scores_2 = query_2 @ keys.T # All attention scores for given query
print(attn_scores_2)

tensor([0.1960, 0.3163, 0.3134, 0.6479, 0.6364, 0.1810],
       grad_fn=<SqueezeBackward4>)


The difference to earlier is that we now scale the attention scores by dividing them by the square root of the embedding dimension,  𝑑𝑘‾‾‾√  (i.e., d_k**0.5):

Imagine you have two vectors, and their dot product results in a large value. When this large value is passed through the softmax function, it might dominate the probabilities, making the attention mechanism less sensitive to other relevant parts of the input. Scaling helps to mitigate this issue by preventing any single dot product from becoming overly influential.

In [None]:
d_k = keys.shape[1] # dimension of keys
attn_weights_2 = torch.softmax(attn_scores_2 / d_k**0.5, dim=-1)
print(attn_weights_2)

tensor([0.1448, 0.1576, 0.1573, 0.1993, 0.1977, 0.1433],
       grad_fn=<SoftmaxBackward0>)


In [None]:
context_vec_2 = attn_weights_2 @ values
print(context_vec_2)

tensor([0.1979, 0.4785], grad_fn=<SqueezeBackward4>)


### Compact SelfAttention Class

<img src="https://sebastianraschka.com/images/LLMs-from-scratch-images/ch03_compressed/18.webp" width="400px">

- We can streamline the implementation above using PyTorch's Linear layers, which are equivalent to a matrix multiplication if we disable the bias units
- Another big advantage of using `nn.Linear` over our manual `nn.Parameter(torch.rand(...)` approach is that `nn.Linear` has a preferred weight initialization scheme, which leads to more stable model training

In [None]:
import torch.nn as nn

In [None]:
class SelfAttention_v2(nn.Module):

    def __init__(self, d_in, d_out, qkv_bias=False):
        super().__init__()
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key   = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)

    def forward(self, x):
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)

        attn_scores = queries @ keys.T
        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)

        context_vec = attn_weights @ values
        return context_vec

In [None]:
torch.manual_seed(789)
sa_v2 = SelfAttention_v2(d_in, d_out)
print(sa_v2(rounded_embeddings))

<class 'torch.nn.modules.linear.Linear'>
<class 'torch.nn.modules.linear.Linear'>
tensor([[0.0160, 0.1600],
        [0.0148, 0.1583],
        [0.0147, 0.1582],
        [0.0175, 0.1624],
        [0.0176, 0.1627],
        [0.0148, 0.1582]], grad_fn=<MmBackward0>)


### Hiding futer words with causal attention (one step back)

In [None]:
# Reuse data from previous section
queries = sa_v2.W_query(rounded_embeddings)
keys = sa_v2.W_key(rounded_embeddings)
attn_scores = queries @ keys.T

attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
print(attn_weights)

tensor([[0.1797, 0.1623, 0.1612, 0.1626, 0.1659, 0.1683],
        [0.1776, 0.1613, 0.1606, 0.1652, 0.1674, 0.1679],
        [0.1769, 0.1617, 0.1611, 0.1652, 0.1672, 0.1678],
        [0.1860, 0.1601, 0.1586, 0.1608, 0.1656, 0.1690],
        [0.1878, 0.1587, 0.1572, 0.1612, 0.1661, 0.1691],
        [0.1744, 0.1644, 0.1638, 0.1638, 0.1660, 0.1677]],
       grad_fn=<SoftmaxBackward0>)


Applying negative infinity effectively zeros out the probabilities for these future tokens in the subsequent softmax calculation.

In [None]:
context_length = attn_scores.shape[-1]
mask = torch.triu(torch.ones(context_length, context_length), diagonal=1)
masked = attn_scores.masked_fill(mask.bool(), -torch.inf)
print(masked)

tensor([[0.1565,   -inf,   -inf,   -inf,   -inf,   -inf],
        [0.1500, 0.0135,   -inf,   -inf,   -inf,   -inf],
        [0.1393, 0.0125, 0.0064,   -inf,   -inf,   -inf],
        [0.2310, 0.0187, 0.0054, 0.0253,   -inf,   -inf],
        [0.2599, 0.0217, 0.0078, 0.0433, 0.0857,   -inf],
        [0.0901, 0.0069, 0.0012, 0.0015, 0.0200, 0.0348]],
       grad_fn=<MaskedFillBackward0>)


In [None]:
attn_weights = torch.softmax(masked / keys.shape[-1]**0.5, dim=-1)
print(attn_weights)

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5241, 0.4759, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3540, 0.3237, 0.3223, 0.0000, 0.0000, 0.0000],
        [0.2795, 0.2405, 0.2383, 0.2417, 0.0000, 0.0000],
        [0.2261, 0.1910, 0.1891, 0.1939, 0.1998, 0.0000],
        [0.1744, 0.1644, 0.1638, 0.1638, 0.1660, 0.1677]],
       grad_fn=<SoftmaxBackward0>)


### Masking additional attention weights with dropout

In [None]:
torch.manual_seed(123)
dropout = torch.nn.Dropout(0.5) # dropout rate of 50%
example = torch.ones(6, 6) # create a matrix of ones

print(dropout(example))
print(dropout(attn_weights))

tensor([[2., 2., 2., 2., 2., 2.],
        [0., 2., 0., 0., 0., 0.],
        [0., 0., 2., 0., 2., 0.],
        [2., 2., 0., 0., 0., 2.],
        [2., 0., 0., 0., 0., 2.],
        [0., 2., 0., 0., 0., 0.]])
tensor([[2.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.6446, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.4811, 0.0000, 0.4833, 0.0000, 0.0000],
        [0.0000, 0.3820, 0.3783, 0.3879, 0.3997, 0.0000],
        [0.3488, 0.3288, 0.0000, 0.0000, 0.3319, 0.3354]],
       grad_fn=<MulBackward0>)


### Causal Attention Class

In [None]:
class CausalAttention(nn.Module):

    def __init__(self, d_in, d_out, context_length,
                 dropout, qkv_bias=False):
        super().__init__()
        self.d_out = d_out
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key   = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.dropout = nn.Dropout(dropout) # New
        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1)) # New

    def forward(self, x):
        b, num_tokens, d_in = x.shape # New batch dimension b
        print(b)
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)

        attn_scores = queries @ keys.transpose(1, 2) # Changed transpose
        attn_scores.masked_fill_(  # New, _ ops are in-place
            self.mask.bool()[:num_tokens, :num_tokens], -torch.inf)  # `:num_tokens` to account for cases where the number of tokens in the batch is smaller than the supported context_size
        attn_weights = torch.softmax(
            attn_scores / keys.shape[-1]**0.5, dim=-1
        )
        attn_weights = self.dropout(attn_weights) # New

        context_vec = attn_weights @ values
        return context_vec

In [None]:
torch.manual_seed(123)

batch = torch.stack((rounded_embeddings, rounded_embeddings), dim=0)
print(batch.shape) # 2 inputs with 6 tokens each, and each token has embedding dimension 3

context_length = batch.shape[1]
ca = CausalAttention(d_in, d_out, context_length, 0.0)

context_vecs = ca(batch)

print(context_vecs)
print("context_vecs.shape:", context_vecs.shape)

torch.Size([2, 6, 3])
tensor([[[-0.5261, -0.0492],
         [-0.4886, -0.1538],
         [-0.4559, -0.1782],
         [-0.4310, -0.1523],
         [-0.4569, -0.1589],
         [-0.4150, -0.1289]],

        [[-0.5261, -0.0492],
         [-0.4886, -0.1538],
         [-0.4559, -0.1782],
         [-0.4310, -0.1523],
         [-0.4569, -0.1589],
         [-0.4150, -0.1289]]], grad_fn=<UnsafeViewBackward0>)
context_vecs.shape: torch.Size([2, 6, 2])


## Self Attention multi-head

<img src="https://sebastianraschka.com/images/LLMs-from-scratch-images/ch03_compressed/26.webp" width="400px">

In [None]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads == 0), \
            "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads # Reduce the projection dim to match desired output dim

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
        self.dropout = nn.Dropout(dropout)
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(context_length, context_length),
                       diagonal=1)
        )

    def forward(self, x):
        b, num_tokens, d_in = x.shape #b is for batches
        keys = self.W_key(x) # Shape: (b, num_tokens, d_out)
        queries = self.W_query(x)
        values = self.W_value(x)

        # We implicitly split the matrix by adding a `num_heads` dimension
        # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)

        # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
        print('--------------------')
        print(f'{keys.shape} vs {keys.transpose(1,2).shape}')
        print('--------------------')
        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        # Compute scaled dot-product attention (aka self-attention) with a causal mask
        attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head

        # Original mask truncated to the number of tokens and converted to boolean
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]

        # Use the mask to fill attention scores
        attn_scores.masked_fill_(mask_bool, -torch.inf)

        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Shape: (b, num_tokens, num_heads, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2)

        # Combine heads, where self.d_out = self.num_heads * self.head_dim
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec) # optional projection

        return context_vec

torch.manual_seed(123)

batch_size, context_length, d_in = batch.shape
d_out = 2
mha = MultiHeadAttention(d_in, d_out, context_length, 0.0, num_heads=2)

context_vecs = mha(batch)

print(context_vecs)
print("context_vecs.shape:", context_vecs.shape)

--------------------
torch.Size([2, 6, 2, 1]) vs torch.Size([2, 2, 6, 1])
--------------------
tensor([[[0.2699, 0.4130],
         [0.2399, 0.4178],
         [0.2289, 0.4308],
         [0.2306, 0.4467],
         [0.2335, 0.4329],
         [0.2335, 0.4581]],

        [[0.2699, 0.4130],
         [0.2399, 0.4178],
         [0.2289, 0.4308],
         [0.2306, 0.4467],
         [0.2335, 0.4329],
         [0.2335, 0.4581]]], grad_fn=<ViewBackward0>)
context_vecs.shape: torch.Size([2, 6, 2])
