# Week 1: Attention

This week I'm working on pytorch basis and the attention mechanism commonly used in LLMs: causal attention.

Thank you, Sebastian Raschka, for the excellent examples and explanations in Build A Large Language Model.

In [2]:
# Test if MPS is available on the machine
import torch
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print (x)
else:
    print ("MPS device not found.")

tensor([1.], device='mps:0')


## 1. Pytorch Basics

In [29]:
# tensors come in all shapes and sizes in deep learning.

tensor_0d = torch.tensor(1)

tensor_1d = torch.tensor([1, 3, 3, 7])

tensor_2d = torch.tensor([
    [1, 2],
    [3, 4]])

tensor_3d = torch.tensor([
    [
        [1, 2],
        [3, 4]
    ],
    [
        [5, 6],
        [7, 8]
    ]
])

# print all the tensors
print(tensor_0d)
print(tensor_1d)
print(tensor_2d)
print(tensor_3d)

# hmmm... the 3d is hard to visualise looking at this example. Now, I see it.
# It's like 2 2d tensors stacked like plates on a dish rack lol.

print(tensor_3d.shape)
print(tensor_3d.dtype)

# Ok. I can see torch defaults to Int64. It'll default to float32 if we provide a float.

float_tensor_1d = torch.tensor([1.0, 2.0, 3.0, 4.0])
print(float_tensor_1d.dtype)

# Before we move on, most GPUs are optimised for float32. And generally speaking, float32
# is more than enough precision for most deep learning tasks. However, you'll see 
# a lot of folks are GPU poor, so they use all sorts of tricks to get by with less precision (e.g., bfloat16, int8, etc.)

tensor(1)
tensor([1, 3, 3, 7])
tensor([[1, 2],
        [3, 4]])
tensor([[[1, 2],
         [3, 4]],

        [[5, 6],
         [7, 8]]])
torch.Size([2, 2, 2])
torch.int64
torch.float32


In [28]:
# Like linear algebra, the torch provides common operations for tensors.

# 1) reshaping is important

print(tensor_2d.reshape(1, 4)) # tensor([[1, 2, 3, 4]])

# but view is the preferred method for reshaping tensors
# because it requires the tensor to be contiguous in memory
# and will fail if the tensor isn't. this sounds safer to me.

# let's make a tensor that isn't contiguous and demonstrate the difference
tensor_2d_non_contiguous = tensor_2d.t()
print(tensor_2d_non_contiguous.is_contiguous())
print(tensor_2d_non_contiguous.shape)
try:
    print(tensor_2d_non_contiguous.view(1, 4)) # tensor([[1, 2, 3, 4]])
except RuntimeError as e:
    print(e)

# 2) matrix multiplication is key

tensor_2d.matmul(tensor_2d.t()) 
print(tensor_2d.shape, tensor_2d.t().shape, tensor_2d.matmul(tensor_2d.t()).shape)

# more compactly we can use @)

print(tensor_2d @ tensor_2d.t())

# 3) broadcasting is a powerful feature

tensor_2d + 1

# 4) indexing and slicing

print(tensor_2d[0, 1])
print(tensor_2d[:, 1])



tensor([[1, 2, 3, 4]])
False
torch.Size([2, 2])
view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.
torch.Size([2, 2]) torch.Size([2, 2]) torch.Size([2, 2])
tensor([[ 5, 11],
        [11, 25]])
tensor(2)
tensor([2, 4])


In [70]:
import numpy as np

# creating matmul from scratch in numpy

def naive_matmul(A, B):
    m, n = A.shape
    p, q = B.shape

    if n != p: raise ValueError("Number of columns in A must match number of rows in B")

    C = np.zeros((m, q))

    for i in range(m):
        for j in range(q):
            for k in range(n):
                C[i, j] += A[i, k] * B[k, j] # pegs column of A to row of B and sums the products

    return C

A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])

print(naive_matmul(A, B))

assert np.allclose(naive_matmul(A, B), np.matmul(A, B))
assert np.allclose(naive_matmul(A, B), A @ B)

# 1) test identity matrix

I = np.eye(3)
assert np.allclose(naive_matmul(I, I), I)

# 2) test dimension mismatch

A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8], [9, 10]])

try:
    print(naive_matmul(A, B))
except ValueError as e:
    print(e)

# 3) test scalar multiplication

I = np.eye(A.shape[0])
assert np.allclose(naive_matmul(A, I), A)


[[19. 22.]
 [43. 50.]]
Number of columns in A must match number of rows in B


## 2. Models as Graphs

In [34]:
import torch.nn.functional as F

# define the inputs
y = torch.tensor([0.0]) # ground truth
x1 = torch.tensor([3.3]) # input
w1 = torch.tensor([1.1]) # weight
b = torch.tensor([0.0]) # bias unit

# forward pass
z = x1 * w1 + b # linear layer
print("output of z:", z)
a = torch.sigmoid(z) # activation
loss = F.binary_cross_entropy(a, y) # loss function
print("loss:", loss)

# torch goes ahead and builds computational graph in the background
# it does this when a terminal node has requires_grad=True
# This is crucial to the backpropagation algorithm, which is used 
# to update the weights and biases of the model during learning.

# backpropagation is basically the chain rule applied to the graph to
# compute the gradients of the loss with respect to the weights and biases.

output of z: tensor([3.6300])
loss: tensor(3.6562)


In [38]:
# let's run it back with backpropagation using autograd

from torch.autograd import grad

# inputs again
y = torch.tensor([0.0]) # ground truth
x1 = torch.tensor([3.3]) # input
w1 = torch.tensor([1.1], requires_grad=True) # weight
b = torch.tensor([0.0], requires_grad=True) # bias unit

# forward pass
z = x1 * w1 + b # linear layer
print("output of z:", z)
a = torch.sigmoid(z) # activation
loss = F.binary_cross_entropy(a, y) # loss function
print("loss:", loss)

# backpropagation
grad_loss_w1 = grad(loss, w1, retain_graph=True)
grad_loss_b = grad(loss, b, retain_graph=True)

print("gradient of loss with respect to w1:", grad_loss_w1)
print("gradient of loss with respect to b:", grad_loss_b)

# alternatively, we can use the backward method
# that also computes the gradients of the loss with respect
# to the weights and biases. this is the preferred method.

loss.backward()

print("weights after backpropagation:", w1.grad, b.grad)

output of z: tensor([3.6300], grad_fn=<AddBackward0>)
loss: tensor(3.6562, grad_fn=<BinaryCrossEntropyBackward0>)
gradient of loss with respect to w1: (tensor([3.2148]),)
gradient of loss with respect to b: (tensor([0.9742]),)
weights after backpropagation: tensor([3.2148]) tensor([0.9742])


## 3. Bringing it all together: Multi-Layer Perceptron

In [52]:
class MLP(torch.nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()

        self.layers = torch.nn.Sequential(
            
            # first hidden layer
            torch.nn.Linear(input_dim, 20),
            torch.nn.ReLU(),

            # second hidden layer
            torch.nn.Linear(20, 10),
            torch.nn.ReLU(),
            
            # logits layer
            torch.nn.Linear(10, output_dim)
        )

    def forward(self, x):
        return self.layers(x) # returns logits
    
torch.manual_seed(1337)
model = MLP(input_dim=50, output_dim=3)
print(model)

# ok.. so a few interesting things here. how does an MLP work?
# well in this scenario it routes information through a hierarchical set
# of "questions" such that the activations represent the "answers" to those questions.

# how does a MLP differ to say a CNN? MLPs process every input feature with 
# full connectivity, whereas CNNs use local spatial maps, often in parallel, 
# to process the input.

# what about RNNs? RNNs can be thought of as special MLPs that process sequential data recursively.

print("number of parameters in the model:", sum(p.numel() for p in model.parameters() if p.requires_grad))

# oh, it's a little cutie. 

print(model.layers[0].weight)

print(model.layers[0].weight.shape)

MLP(
  (layers): Sequential(
    (0): Linear(in_features=50, out_features=20, bias=True)
    (1): ReLU()
    (2): Linear(in_features=20, out_features=10, bias=True)
    (3): ReLU()
    (4): Linear(in_features=10, out_features=3, bias=True)
  )
)
number of parameters in the model: 1263
Parameter containing:
tensor([[-1.1927e-01, -1.2398e-03,  3.4817e-02, -2.1938e-02, -8.4743e-02,
         -1.3332e-01,  2.4077e-02,  5.5628e-02, -9.1602e-02, -6.8016e-02,
          5.8990e-02,  2.2892e-02, -1.2519e-01,  7.5482e-02,  1.0687e-01,
         -7.2587e-02,  2.8433e-02,  5.8810e-02,  2.8852e-03, -2.6434e-02,
          1.0930e-01, -4.0211e-02,  1.1209e-01,  1.3121e-01, -1.3723e-01,
          6.6699e-02,  8.1584e-02,  1.2639e-01, -9.7397e-03,  9.5536e-02,
         -1.0552e-01, -8.1127e-02, -5.8165e-02, -3.7421e-02, -4.8189e-03,
          4.9556e-02,  6.3948e-03,  1.5154e-02,  1.9822e-02, -7.1256e-02,
          2.9408e-02,  5.8757e-02, -1.1909e-01,  7.4363e-02,  7.6591e-02,
         -1.2040e-01,  2.1

In [56]:
# inference baby

torch.manual_seed(1337)

x = torch.randn(1, 50)

print(model(x))

# notice the AddmmBackward specifies how the output is computed.
# we don't actually need this extra computation and memory for inference
# so we can use the torch.no_grad() context manager to disable gradient computation.

with torch.no_grad():
    print(model(x))

# what about the probs? these are only logits.
# logits are effectively the raw output of the last layer
# they represent the unnormalised scores for each class

with torch.no_grad():
    print(torch.softmax(model(x), dim=1))

# now we have class membership probabilities
# we can use the argmax method to get the predicted class

with torch.no_grad():
    print(torch.argmax(torch.softmax(model(x), dim=1), dim=1))

# this is the predicted class.

tensor([[ 0.0340, -0.2001,  0.3014]], grad_fn=<AddmmBackward0>)
tensor([[ 0.0340, -0.2001,  0.3014]])
tensor([[0.3228, 0.2554, 0.4218]])
tensor([2])
