# Assignment 11.1 - Transformer

Please submit your solution of this notebook in the Whiteboard at the corresponding Assignment entry as .ipynb-file and as .pdf.

#### Please state both names of your group members here:
Jane and John Doe

In [1]:
# Paola Gega, Daniel Thompson

## Task 11.1.1: Self-Attention

Implement the attention mechanism by yourself. You are free to use torch and numpy to speed up the matrix multiplications, but please don't just use their transformer implementation.

In the image below, you see the design of one Encoder Block. We want you to set up this Block. Please use your implementation of the Self-Attention (doesn't have to be multi-head) and build the Add & Norm and Feed Forward layers on top of it. Add & Norm and the Feed Forward should be implementations by PyTorch or else. You only need to use your own Self-Attention function.

* Show that your model block works, by forwarding a randomly initialized tensor through it once. Print the values of the Random input tensor, the output tensor and the Q,K and V matrices. **(RESULT)** 

In [2]:
from IPython.display import Image
Image(url="https://www.researchgate.net/publication/334288604/figure/fig1/AS:778232232148992@1562556431066/The-Transformer-encoder-structure.ppm", height=300)

In [3]:
import numpy as np
import torch

In [None]:
class SelfAttentionEncoderBlock(torch.nn.Module):
    def __init__(self, d_model, n_head, dim_feedforward, dropout=0.1):
        super(SelfAttentionEncoderBlock, self).__init__()
        self.d_model = d_model
        self.n_head = n_head
        self.dim_feedforward = dim_feedforward
        self.dropout = dropout
        
        # Initialize weights for attention and feedforward layers
        self.W_q = torch.nn.Parameter(torch.randn(d_model, d_model))
        self.W_k = torch.nn.Parameter(torch.randn(d_model, d_model))
        self.W_v = torch.nn.Parameter(torch.randn(d_model, d_model))
        self.W_o = torch.nn.Parameter(torch.randn(d_model, d_model))
        self.W1 = torch.nn.Parameter(torch.randn(d_model, dim_feedforward))
        self.W2 = torch.nn.Parameter(torch.randn(dim_feedforward, d_model))

    def softmax(self, x):
        e_x = torch.exp(x - torch.max(x, dim=-1, keepdim=True)[0])
        return e_x / e_x.sum(dim=-1, keepdim=True)

    def forward(self, src):
        # Self-attention
        Q = torch.matmul(src, self.W_q)
        K = torch.matmul(src, self.W_k)
        V = torch.matmul(src, self.W_v)

        scores = torch.matmul(Q, K.transpose(-2, -1)) / np.sqrt(self.d_model)
        attn_weights = self.softmax(scores)
        attn_output = torch.matmul(attn_weights, V)
        attn_output = torch.matmul(attn_output, self.W_o)

        src = src + attn_output  # Add & Norm
        # Feedforward
        ff_output = torch.relu(torch.matmul(src, self.W1))  # ReLU activation
        ff_output = torch.matmul(ff_output, self.W2)

        src = src + ff_output  # Add & Norm
        return src

In [5]:
d_model = 2
n_head = 8
dim_feedforward = 16
seq_length = 2
batch_size = 2
encoder_block = SelfAttentionEncoderBlock(d_model, n_head, dim_feedforward)
src = torch.randn(seq_length, batch_size, d_model)
output = encoder_block.forward(src)

print("Input Tensor:\n", src)
print("Output Tensor:\n", output)
Q = torch.matmul(src, encoder_block.W_q)
K = torch.matmul(src, encoder_block.W_k)
V = torch.matmul(src, encoder_block.W_v)
print("Q Matrix:\n", Q)
print("K Matrix:\n", K)
print("V Matrix:\n", V)

Input Tensor:
 tensor([[[-0.1013, -0.4890],
         [-1.3201,  0.4021]],

        [[-0.3013,  0.5486],
         [ 0.3052, -0.4288]]])
Output Tensor:
 tensor([[[-0.6798, -1.4005],
         [-5.2709, -7.2139]],

        [[-1.3682, -0.6569],
         [ 0.6920, -0.0146]]], grad_fn=<AddBackward0>)
Q Matrix:
 tensor([[[-0.0914, -0.7301],
         [-0.6554, -0.7728]],

        [[-0.1135,  0.4129],
         [ 0.1250, -0.2547]]], grad_fn=<UnsafeViewBackward0>)
K Matrix:
 tensor([[[-0.8854,  0.1660],
         [ 1.6493,  0.9159]],

        [[ 1.2656,  0.1250],
         [-1.0351, -0.1499]]], grad_fn=<UnsafeViewBackward0>)
V Matrix:
 tensor([[[-0.2667,  0.4326],
         [ 0.0842,  0.1197]],

        [[ 0.2592, -0.3447],
         [-0.1959,  0.2459]]], grad_fn=<UnsafeViewBackward0>)


### Task 11.2 Use your own Transformer Block

* Chain 3 of your transformer blocks to set up a model. Put 1 fully connected layer head on top. **(RESULT)**
* Train your model on the MNIST dataset for image classification. **(RESULT)**
* Report the test accuracy after training. **(RESULT)**

Can you make your own attention work? :)

In [None]:
import torchvision.datasets as datasets
import torchvision.transforms as transforms

# We couldn't get any decent training results with our implementation so we have to
# use the built-in attention unfortunately.
# Maybe we just weren't training long enough but accuracy did not seem better than a random guess.

class SelfAttentionEncoderBlock(torch.nn.Module):
    def __init__(self, d_model, n_head, dim_feedforward, dropout=0.1):
        super(SelfAttentionEncoderBlock, self).__init__()
        self.self_attn = torch.nn.MultiheadAttention(d_model, n_head, dropout=dropout)
        self.linear1 = torch.nn.Linear(d_model, dim_feedforward)
        self.dropout = torch.nn.Dropout(dropout)
        self.linear2 = torch.nn.Linear(dim_feedforward, d_model)
        self.norm1 = torch.nn.LayerNorm(d_model)
        self.norm2 = torch.nn.LayerNorm(d_model)
        self.dropout1 = torch.nn.Dropout(dropout)
        self.dropout2 = torch.nn.Dropout(dropout)
        self.activation = torch.nn.ReLU()

    def forward(self, src):
        # Self-attention
        src2, _ = self.self_attn(src, src, src)
        src = src + self.dropout1(src2)
        src = self.norm1(src)

        # Feedforward
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
        src = src + self.dropout2(src2)
        src = self.norm2(src)
        return src

In [7]:
class TransformerModel(torch.nn.Module):
    def __init__(self, d_model, n_head, dim_feedforward, num_layers, num_classes, dropout=0.1):
        super(TransformerModel, self).__init__()
        self.layers = torch.nn.ModuleList([
            SelfAttentionEncoderBlock(d_model, n_head, dim_feedforward, dropout)
            for _ in range(num_layers)
        ])
        self.fc = torch.nn.Linear(d_model, num_classes)

    def forward(self, src):
        for layer in self.layers:
            src = layer(src)
        # Take the mean across the sequence length dimension
        src = src.mean(dim=0)
        output = self.fc(src)
        return output
    

In [8]:
d_model = 512
n_head = 8
dim_feedforward = 2048
num_layers = 3
num_classes = 10

# Train
n_epochs = 5
model = TransformerModel(d_model, n_head, dim_feedforward, num_layers, num_classes)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
mnist_train = datasets.MNIST(root='./data', train=True, download=True, transform=transforms.ToTensor())
train_loader = torch.utils.data.DataLoader(mnist_train, batch_size=64, shuffle=True)
for epoch in range(n_epochs):
    for images, labels in train_loader:
        # Flatten images and project to d_model dimensions
        images = images.view(images.size(0), -1)  # Flatten
        images = torch.nn.functional.pad(images, (0, d_model - images.size(1)), "constant", 0)  # Pad to d_model
        images = images.unsqueeze(0)  # Add sequence length dimension

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

# Test
mnist_test = datasets.MNIST(root='./data', train=False, download=True, transform=transforms.ToTensor())
test_loader = torch.utils.data.DataLoader(mnist_test, batch_size=64, shuffle=False)
correct = 0
total = 0
with torch.no_grad():
    for images, labels in test_loader:
        images = images.view(images.size(0), -1)
        images = torch.nn.functional.pad(images, (0, d_model - images.size(1)), "constant", 0)
        images = images.unsqueeze(0)

        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print(f'Test Accuracy: {100 * correct / total}%')

Epoch 1, Loss: 0.2508094012737274
Epoch 2, Loss: 0.197285994887352
Epoch 3, Loss: 0.11669344455003738
Epoch 4, Loss: 0.27959802746772766
Epoch 5, Loss: 0.21897737681865692
Test Accuracy: 95.07%


**Report:** Trained only for five epochs, but kind of lackluster results?

## Congratz, you made it! :)