# Concept Questions

1. What is the dot product between vectors `<1, 2, 3>` and `<4, 5, 6>`?
2. Given information about sizes of matrices `A` and `B`, find the size of `A @ B` if it exists.

    a. `A` is 8x5 and `B` is 5x7

    b. `A` is 8x6 and `B` is 3x4

    c. `A` is 8x8 and `B` is 2x8

    d. `A` is 2x8 and `B` is 8x8

    e. `A` is 4x6 and `B.T` is 4x6

3. True or False:

    a. Queries are only matched to one key.

    b. Query, Key, and Value vectors have the same size.

    c. We can obtain Q, K, and V matrices by multiplying an input matrix by trainable matrices.

4. What does $QK^T$ represent?

5. Explain the role of `__init__` and `forward` in a machine learning Module.

1. 1 * 4 + 2 * 5 + 3 * 6 = 32
2.

  a. 8x7

  b. Doesn't exist since 6 is different from 3

  c. Doesn't exist since 8 is different from 2

  d. 2x8

  e. Note that B.T being 4x6 means B is 6x4, which makes A@B 4x4

3.

  a. False. Each query is matched to every key in the context window.

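  b. True in this notebook, where all three have size `d_out`. (Strictly, only the query and key vectors must have the same size so that their dot product is defined; value vectors may have a different size.)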

  c. True.

4. Represents similarity scores between all pairs of query and key vectors. It allows tokens to communicate with each other.

5.
  `__init__` sets up the model: it creates all of the model's parameters and stores anything else that will be needed later on.

  `forward` computes the output of a machine learning model. We feed an input into `forward`, and it uses the parameters initialized in `__init__` to calculate the output.


# Attention

In [1]:
import torch
import torch.nn as nn

Matrix multiplication with Tensors:

In [8]:
# (2x3) @ (3x1) -> 2x1
a = torch.tensor([[1, 2, 3],
                  [4, 5, 6]])
b = torch.tensor([[3], [4], [1]])
c = torch.matmul(a, b)
print(c)

# Only the inner dimensions (75 and 75) have to agree: (40x75) @ (75x25) -> 40x25
d = torch.zeros((40, 75))
e = torch.zeros((75, 25))
f = torch.matmul(d, e)
print(f.shape)

# Chained products run left to right: ((40x75) @ (75x25)) @ (25x83) -> 40x83
g = torch.zeros((25, 83))
h = torch.matmul(torch.matmul(d, e), g)
print(h.shape)

# Transposing the second operand lines up the inner dimensions:
# (35x50) @ (50x35) -> 35x35
m1 = torch.zeros((35, 50))
m2 = torch.zeros((35, 50))
m3 = m1 @ m2.transpose(0, 1)
print(m3.shape)

tensor([[14],
        [38]])
torch.Size([40, 25])
torch.Size([40, 83])
torch.Size([35, 35])


In [27]:
print(torch.randn(4, 5))
print(torch.rand(4, 5))
# Compare the two printed tensors: torch.randn (standard normal samples)
# produces both negative and positive entries, while torch.rand (uniform
# samples from [0, 1)) produces only non-negative ones.
# We want torch.randn because we want negative numbers.
# If we restrict to positive values or use something like torch.zeros, then
# the result could be biased towards certain types of data.

tensor([[ 1.0720e+00,  1.5026e+00, -8.1899e-01,  2.6860e-01, -9.4053e-01],
        [-4.6806e-01,  1.0322e+00, -2.8300e-01,  4.9275e-01, -1.4078e-02],
        [-2.7466e-01, -7.6409e-01,  1.3966e+00, -9.9491e-01, -1.5822e-03],
        [ 1.2471e+00, -7.7105e-02,  1.2774e+00, -1.4596e+00, -2.1595e+00]])
tensor([[0.6397, 0.8954, 0.2979, 0.6314, 0.5028],
        [0.1239, 0.3786, 0.1661, 0.7211, 0.5449],
        [0.5490, 0.3483, 0.5024, 0.3445, 0.6437],
        [0.9856, 0.5757, 0.2785, 0.1946, 0.5382]])


In [29]:
# Toy input: 6 tokens (rows), each with a 3-dimensional embedding (columns).
X = torch.tensor(
    [
        [0.43, 0.15, 0.89],
        [0.55, 0.87, 0.66],
        [0.57, 0.85, 0.64],
        [0.22, 0.58, 0.33],
        [0.77, 0.25, 0.10],
        [0.05, 0.80, 0.55],
    ]
)
n, d_in = X.shape  # 6, 3
d_out = 2
torch.manual_seed(123)

# Trainable projection matrices, each d_in x d_out, drawn in
# query -> key -> value order right after seeding.
W_q, W_k, W_v = (nn.Parameter(torch.randn(d_in, d_out)) for _ in range(3))

# (n x d_in) @ (d_in x d_out) -> n x d_out for every projection.
Q, K, V = X @ W_q, X @ W_k, X @ W_v
print(Q, K, V, sep="\n")

# (n x d_out) @ (d_out x n) -> n x n: one score per (query, key) pair.
QKT = torch.matmul(Q, K.transpose(0, 1))
print(QKT)

# Scale by sqrt(d_out), then normalize each row (each query) with softmax.
A = (QKT / d_out ** 0.5).softmax(dim=1)
print(A)
print(sum(A[0])) # 1 because softmax makes the entries of each row sum to 1

# (n x n) @ (n x d_out) -> n x d_out.
# Despite the name, this is a matrix: row i is the context vector of token i.
context_vector = torch.matmul(A, V)
print(context_vector)

tensor([[-1.1686,  0.2019],
        [-1.1729, -0.0048],
        [-1.1438, -0.0018],
        [-0.6339, -0.0439],
        [-0.2979,  0.0535],
        [-0.9596, -0.0712]], grad_fn=<MmBackward0>)
tensor([[-0.1823, -0.6888],
        [-0.1142, -0.7676],
        [-0.1443, -0.7728],
        [ 0.0434, -0.3580],
        [-0.6467, -0.6476],
        [ 0.3262, -0.3395]], grad_fn=<MmBackward0>)
tensor([[ 0.1196, -0.3566],
        [ 0.4107,  0.6274],
        [ 0.4091,  0.6390],
        [ 0.2436,  0.4182],
        [ 0.2653,  0.6668],
        [ 0.2728,  0.3242]], grad_fn=<MmBackward0>)
tensor([[ 0.0740, -0.0216,  0.0126, -0.1230,  0.6250, -0.4498],
        [ 0.2172,  0.1376,  0.1730, -0.0491,  0.7616, -0.3809],
        [ 0.2098,  0.1320,  0.1665, -0.0489,  0.7408, -0.3725],
        [ 0.1458,  0.1061,  0.1254, -0.0118,  0.4384, -0.1919],
        [ 0.0175, -0.0071,  0.0017, -0.0321,  0.1580, -0.1153],
        [ 0.2240,  0.1642,  0.1935, -0.0161,  0.6667, -0.2888]],
       grad_fn=<MmBackward0>)
tensor([[

**Exercise 1:** Fill out missing or incomplete parts of the following attention module.

In [28]:
class SelfAttentionV1(nn.Module):
    """Single-head scaled dot-product self-attention using raw parameter matrices.

    Args:
        d_in: Size of each input token embedding.
        d_out: Size of the query, key, and value vectors.
    """

    def __init__(self, d_in, d_out):
        super().__init__()
        self.d_in = d_in
        self.d_out = d_out
        # Trainable projection matrices, each d_in x d_out.
        self.W_query = nn.Parameter(torch.randn(d_in, d_out))
        self.W_key = nn.Parameter(torch.randn(d_in, d_out))
        self.W_value = nn.Parameter(torch.randn(d_in, d_out))

    def forward(self, x):
        """Return the context vectors for one sequence of tokens.

        Args:
            x: Tensor of shape n x d_in (one row per token).

        Returns:
            Tensor of shape n x d_out; row i is the context vector of token i.
        """
        # Use this module's own parameters (not notebook-level globals) so the
        # weights created in __init__ are the ones that get used and trained.
        # (n x d_in) @ (d_in x d_out) -> n x d_out
        Q = x @ self.W_query
        K = x @ self.W_key
        V = x @ self.W_value

        # (n x d_out) @ (d_out x n) -> n x n similarity scores.
        QKT = Q @ K.T
        # Scale by sqrt(d_out), then normalize each query's row of scores.
        A = torch.softmax(QKT / (self.d_out ** 0.5), dim=1)

        # (n x n) @ (n x d_out) -> n x d_out
        context_vector = A @ V
        return context_vector

Using the self-attention module:

In [31]:
torch.manual_seed(123)
# X is 6 x 3, so d_in is X.shape[1] == 3.
# Constructing the module runs __init__; calling the instance runs forward,
# i.e. it is shorthand for self_attention.forward(X).
self_attention = SelfAttentionV1(X.shape[1], 2)
context_vector = self_attention(X)
print(context_vector)  # 6 x 2

tensor([[0.2845, 0.4071],
        [0.2854, 0.4081],
        [0.2854, 0.4075],
        [0.2864, 0.3974],
        [0.2863, 0.3910],
        [0.2860, 0.4039]], grad_fn=<MmBackward0>)


Generally, `nn.Linear` is used instead of `nn.Parameter`.

In [33]:
# This linear layer transforms a vector of size 10 into a vector of size 20.
linear_layer = nn.Linear(in_features=10, out_features=20)

# Single vector: shape (10,) -> (20,).
# Math-wise, this is a 1x10 row vector multiplied by a 10x20 parameter matrix.
a = torch.zeros(10)
b = linear_layer(a)
print(b.shape)

# Linear also works well with batches.
# Batch size 8: the 8x10 input times the 10x20 parameter matrix gives 8x20.
c = torch.zeros((8, 10))
d = linear_layer(c)
print(d.shape)

# Linear works with even more dimensions: only the last dimension is
# transformed, and all leading dimensions are kept as they are.
e = torch.zeros((2, 4, 10))
f = linear_layer(e)  # 2x4x20
print(f.shape)

g = torch.zeros((5, 5, 5, 5, 5, 5, 10))
h = linear_layer(g)  # 5x5x5x5x5x5x20
print(h.shape)

torch.Size([20])
torch.Size([8, 20])
torch.Size([2, 4, 20])
torch.Size([5, 5, 5, 5, 5, 5, 20])


**Exercise 2:** Fill out missing or incomplete parts of the following module while utilizing `nn.Linear`.

In [40]:
class SelfAttentionV2(nn.Module):
    """Single-head scaled dot-product self-attention built from nn.Linear layers.

    Args:
        d_in: Size of each input token embedding.
        d_out: Size of the query, key, and value vectors.
    """

    def __init__(self, d_in, d_out):
        super().__init__()
        self.d_in = d_in
        self.d_out = d_out
        # Each projection is a linear function that maps a d_in dimensional
        # vector to a d_out dimensional vector. Mathematically, it plays the
        # role of a d_in by d_out matrix.
        self.W_query = nn.Linear(d_in, d_out)
        self.W_key = nn.Linear(d_in, d_out)
        self.W_value = nn.Linear(d_in, d_out)

    def forward(self, x):
        """Return the context vectors for every token in every sequence.

        Args:
            x: Tensor of shape ... x N x d_in. The usual case is B x N x d_in,
                but a single N x d_in sequence or extra leading batch
                dimensions work as well.

        Returns:
            Tensor of shape ... x N x d_out.
        """
        # Q, K, and V are ... x N x d_out
        Q = self.W_query(x)
        K = self.W_key(x)
        V = self.W_value(x)

        # Swap the last two dimensions (N and d_out) of K. Indexing them from
        # the end keeps this correct no matter how many leading batch
        # dimensions x has.
        QKT = Q @ K.transpose(-2, -1)  # ... x N x N
        A = torch.softmax(QKT / (self.d_out ** 0.5), dim=-1)

        # A is ... x N x N
        # V is ... x N x d_out
        # A @ V is ... x N x d_out
        context_vector = A @ V
        return context_vector

**Exercise 3:** Create a tensor representing a batch of 40 context windows, where each context window has 50 tokens and each token has embedding size 768. Pass it through the `SelfAttentionV2` module. Print the size of the output.

In [49]:
# Your code here
# 40 context windows x 50 tokens x embedding size 768.
X_batch = torch.randn((40, 50, 768))
self_attention = SelfAttentionV2(d_in=768, d_out=1024)  # calls __init__
context_batch = self_attention(X_batch)  # calls the forward function
print(context_batch.shape)

# forward is generally called many times, while __init__ is only called once:
# the same module handles a batch of a different size.
X_batch_2 = torch.randn((28, 50, 768))
context_batch_2 = self_attention(X_batch_2)
print(context_batch_2.shape)

# A different embedding size, e.g. torch.randn(40, 50, 512), raises an error.
# You cannot use a different embedding size with the same model because the
# parameters of the model were already defined with a fixed embedding size
# in mind.

# A different number of tokens, e.g. torch.randn(40, 35, 768), gives no
# error, but it may cause complications later if not careful, since QKT then
# has a different size.


torch.Size([40, 50, 1024])
torch.Size([28, 50, 1024])


**Exercise 4:** Create your own text (at least 10 words). Make a dataloader of the text with batch size 2, context size 4, and stride 1. Make input embeddings of size 768 (including both token and positional embeddings) and pass them through your SelfAttentionV2 module.

In [55]:
import tiktoken
from torch.utils.data import Dataset, DataLoader

# Dataset class
class MyData(Dataset):
    """Sliding-window dataset of (input, target) token-id pairs.

    Each target window is the matching input window shifted one token to the
    right. Example: ``dataset = MyData(text, tokenizer, context_length=4, stride=1)``.

    Args:
        text: Raw text to tokenize.
        tokenizer: Object with an ``encode(text)`` method returning token ids.
        context_length: Number of tokens in each window.
        stride: Distance between the starts of consecutive windows.
    """

    def __init__(self, text, tokenizer, context_length, stride=1):
        ids = tokenizer.encode(text)
        # Stop early enough that every window still has a next token for
        # its shifted target.
        starts = range(0, len(ids) - context_length, stride)
        self.input_ids = [
            torch.tensor(ids[begin : begin + context_length]) for begin in starts
        ]
        self.target_ids = [
            torch.tensor(ids[begin + 1 : begin + context_length + 1]) for begin in starts
        ]

    def __len__(self):
        """Return the number of windows; enables ``len(dataset)``."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair at ``idx``; enables ``dataset[idx]``."""
        return self.input_ids[idx], self.target_ids[idx]

# Dataloader
def my_batch(text, batch_size, context_length, stride, shuffle=True, drop_last=True, num_workers=0):
    """Build a DataLoader of (input, target) token windows over ``text``.

    The text is tokenized with the "gpt2" tiktoken encoding and cut into
    windows by ``MyData``.

    Args:
        text: Raw text to tokenize.
        batch_size: Number of windows per batch.
        context_length: Number of tokens per window.
        stride: Distance between the starts of consecutive windows.
        shuffle: Passed through to ``DataLoader``.
        drop_last: Passed through to ``DataLoader``.
        num_workers: Passed through to ``DataLoader``.

    Returns:
        A ``DataLoader`` over the windowed dataset.
    """
    windows = MyData(text, tiktoken.get_encoding("gpt2"), context_length, stride)
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windows, **loader_options)

# Your code here
my_text = "I am currently taking a class about learning how to build LLMs"
context_length = 4
emb_dim = 768
loader = my_batch(my_text, batch_size=2, context_length=context_length, stride=1)
vocab_size = 50257
token_embs = nn.Embedding(vocab_size, emb_dim)
# Positional embeddings are looked up by position in the context window, so
# the table needs one row per position, not one row per vocabulary entry.
pos_embs = nn.Embedding(context_length, emb_dim)
model = SelfAttentionV2(emb_dim, 1024)
context_embs = []
# Named input_ids rather than input so the builtin input() is not shadowed.
for input_ids, _target_ids in loader:
    # input_ids is B x N; the N position embeddings broadcast over the batch.
    positions = torch.arange(input_ids.shape[-1])
    embedding = token_embs(input_ids) + pos_embs(positions)  # B x N x emb_dim
    context_emb = model(embedding)  # B x N x 1024
    context_embs.append(context_emb)
    print(context_emb.shape)



torch.Size([2, 4, 1024])
torch.Size([2, 4, 1024])
torch.Size([2, 4, 1024])
torch.Size([2, 4, 1024])


**Optional: Building a bad minimal language model**

In [60]:
# Input: B x N
# After embedding: B x N x emb_dim
# After self-attention: B x N x att_dim
# What we want: B x N x vocab_size
# This is technically a complete language model,
# except that it is far too small and lacks many of the ideas
# that make GPT actually work.
# Later on, we will build a functional GPT.
class BadLM(nn.Module):
    """Minimal language model: embeddings -> one self-attention layer -> logits.

    Args:
        context_length: Maximum number of tokens in an input sequence.
        vocab_size: Number of distinct token ids.
        emb_dim: Size of the token and positional embeddings.
        att_dim: Size of the self-attention output vectors.
    """

    def __init__(self, context_length, vocab_size, emb_dim, att_dim):
        super().__init__()
        self.context_length = context_length
        self.emb_dim = emb_dim
        self.att_dim = att_dim
        self.vocab_size = vocab_size
        self.token_embs = nn.Embedding(vocab_size, emb_dim)
        # Positions range over 0..context_length-1, so the table needs one
        # row per position rather than one row per vocabulary entry.
        self.pos_embs = nn.Embedding(context_length, emb_dim)
        self.att = SelfAttentionV2(emb_dim, att_dim)
        self.prediction_layer = nn.Linear(att_dim, vocab_size)

    def forward(self, x):
        """Return next-token scores for every position.

        Args:
            x: Integer tensor of token ids, B x N with N <= context_length.

        Returns:
            Tensor of shape B x N x vocab_size.
        """
        # Take the positions from the actual sequence length and create them
        # on the input's device, so shorter sequences and non-CPU inputs work.
        positions = torch.arange(x.shape[-1], device=x.device)
        embedding = self.token_embs(x) + self.pos_embs(positions)  # B x N x emb_dim
        context_embedding = self.att(embedding)  # B x N x att_dim
        prediction = self.prediction_layer(context_embedding)  # B x N x vocab_size
        return prediction


In [61]:
my_text = "I am currently taking a class about learning how to build LLMs"
loader = my_batch(my_text, batch_size=2, context_length=4, stride=1)
vocab_size = 50257
model = BadLM(4, vocab_size, 768, 1024)
predictions = []
# This is inference only, so skip autograd bookkeeping.
with torch.no_grad():
    # Named input_ids rather than input so the builtin input() is not shadowed.
    for input_ids, _target_ids in loader:
        output = model(input_ids)  # B x N x 50257
        # Pick the highest-scoring token id at every position.
        tokens = torch.argmax(output, dim=-1)  # B x N
        predictions.append(tokens)
print(predictions)

[tensor([[40685, 40523, 44613, 26037],
        [ 6153, 18592, 18592, 18592]]), tensor([[27397, 10369, 17422, 17422],
        [38761, 23733, 23733, 43100]]), tensor([[39077, 38761, 19789, 40304],
        [ 9106, 26461, 18857, 26461]]), tensor([[20136, 38709, 47734, 50000],
        [ 4756, 25881,  4756, 40461]])]
