In [2]:
# Just run this once if PyTorch is not already installed in the current Jupyter kernel
# !conda install pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch -y

In [1]:
import torch

# Pick the best available accelerator. `device` is always a ``torch.device``
# (never a plain string) so later cells can rely on a single type.
# Priority: CUDA first, then Apple MPS, then the CPU fallback.
_mps_backend = getattr(torch.backends, "mps", None)  # attribute is missing on older PyTorch builds
if torch.cuda.is_available():
    device = torch.device("cuda:0")
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print(device)
# Allocate a tiny tensor on the selected device as a smoke test.
x = torch.ones(1, device=device)
print(x)

mps
tensor([1.], device='mps:0')


PyTorch can be installed together with a copy of NVIDIA's CUDA toolkit, so it can access the machine's dedicated GPU if there is one.
The `device` variable holds the GPU availability information, so we know whether our code can run on the GPU instead of the CPU. For more information on install options go to: https://pytorch.org/get-started/locally/

# Transformers

In this notebook we will be implementing one of the most popular architectures that has taken over the DL community: Transformers!
They started with the 2017 paper by Google called "Attention is all you need", since the main idea behind transformers is to use the so-called "attention mechanism" as the core part of the architecture. This started as a sequence modeling architecture, more specifically for language modelling, but in recent years it has taken over almost every big field of Deep Learning: vision with ViT, RL with Decision Transformers, speech with the Conformer, and many more. One of my goals with this project is to implement all of these architectures and see how they compare with more traditional approaches like LSTM for language, CNN for vision and Encoder-Decoder for speech.

In particular in this notebook we will be implementing the core and basic transformer architecture.

## 1. The Attention Mechanism

Note: Here all vectors are always column vectors, so $x \in R^{n}$ is a column of size $n$ and $x^T$ a row of size $n$

The key idea behind transformers, as said above, is the attention mechanism. This can be represented as 3 matrices that act as parameters of the model. Query, Key, Value are the usual names given to the said matrices. 
Self-attention is a sequence-to-sequence operator, so it takes *t* vectors as input, each in $R^{n}$, and also outputs *t* vectors in $R^{n}$ (in the context of NLP each vector is a word embedding, and the *t* vectors represent a sentence of *t* words). Another way of seeing it is that it takes an element of $R^{n \times t}$ and outputs another element of $R^{n \times t}$. One way of doing this is to multiply the input matrix of vectors (let's call it $X \in R^{n \times t}$) by a matrix $W$ of size $t \times t$, so that the output is another matrix (let's call it $Y \in R^{n \times t}$). This is exactly what the basic attention mechanism does. Each output column vector $y_i \in R^{n}$ is calculated as:

$$ y_i = \sum_j w_{ij}x_j \quad \text{where} \quad w_{ij} = \text{softmax}(x_i^T \cdot X, \text{row wise})_j $$

That is, $w_{ij}$ is the $j$-th entry of the vector obtained by applying a softmax operation to $x_i^T \cdot X$.

Let $w_i \in R^t := softmax(x_i^T \cdot X, \text{row wise})^T$, then:

$$ y_i = X \cdot w_i$$

Moreover, let $W := softmax(X^T \cdot X, \text{row wise})^T = [w_1 | ... | w_i | ... | w_t]$, then:

$$ Y =  X \cdot W = [y_1 | ... | y_i | ... | y_t]$$

So we have that:

$$ Y = X \cdot softmax(X^T \cdot X, \text{row wise})^T$$

Note that $X^T \cdot X$ is a symmetric matrix (this is really easy to check), i.e. $(X^T \cdot X)^T = X^T \cdot X$, so the above formula can also be written as:

$$ Y = X \cdot softmax((X^T \cdot X)^T, \text{row wise})^T$$

But we also know that $softmax(B, \text{row wise})^T = softmax(B^T, \text{Column wise})$. Then:

$$ Y = X \cdot softmax(X^T \cdot X, \text{column wise})$$

Notice that in the formula above the input matrix $X$ appears 3 times. What self-attention does is replicate this behaviour with 3 different matrices, which are parameters that the model needs to optimize via backpropagation.

So, in self-attention the role of the first appearance of $X$ is played by the Value matrix, the second by the Query and the third by the Key. The formula for $Y$ becomes:

$$
Y = V \cdot softmax(Q^T \cdot K, \text{column wise})
$$

Where $Q, K, V \in R^{n \times t}$, so that $Q^T \cdot K \in R^{t \times t}$ and $Y \in R^{n \times t}$.




In [None]:
def basic_self_att_1():
    """Placeholder for the basic (parameter-free) self-attention example.

    TODO: implement Y = X · softmax(XᵀX, column wise) from the derivation in
    this section. The function currently takes no input, so the final
    signature still has to be decided.

    Raises:
        NotImplementedError: always, until the example is implemented.
    """
    raise NotImplementedError("basic_self_att_1 has not been implemented yet")

In [6]:
from torch.nn import Softmax
import numpy as np
# Small symmetric matrix used to sanity-check the softmax/transpose identity
# softmax(B, row wise)^T == softmax(B^T, column wise).
x = torch.tensor(
    [
        [1, 2, 3],
        [2, 2, 4],
        [3, 4, 5],
    ],
    dtype=torch.float32,
)

print(x.shape)
print(x)
# x is symmetric, so its transpose prints the same values as x itself.
print(x.transpose(1, 0))
# Softmax over dim 0 (each column sums to 1) ...
print(torch.softmax(x, dim=0))
# ... and the transposed softmax over dim 1; both prints agree because x is symmetric.
print(torch.softmax(x, dim=1).transpose(0, 1))

torch.Size([3, 3])
tensor([[1., 2., 3.],
        [2., 2., 4.],
        [3., 4., 5.]])
tensor([[1., 2., 3.],
        [2., 2., 4.],
        [3., 4., 5.]])
tensor([[0.0900, 0.1065, 0.0900],
        [0.2447, 0.1065, 0.2447],
        [0.6652, 0.7870, 0.6652]])
tensor([[0.0900, 0.1065, 0.0900],
        [0.2447, 0.1065, 0.2447],
        [0.6652, 0.7870, 0.6652]])


## Input Embedding

The first layer of the encoder part of the transformer architecture is the input embedding. With a fixed context length and trainable weights, the input embedding layer provides a vector-based representation for each word in the input sentence.

In [2]:
import torch.nn as nn
import math

class InputEmbedding(nn.Module):
    """Token embedding layer whose output is scaled by sqrt(embedding_size).

    Args:
        embedding_size: Dimension of each embedding vector.
        vocab_size: Number of distinct token ids the lookup table holds.
    """

    def __init__(self,
                 embedding_size: int = 256,
                 vocab_size: int = 20_000):

        super().__init__()
        # Stored because forward() needs it for the sqrt(embedding_size) scaling.
        self.embedding_size = embedding_size
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, embedding_size)

    def forward(self, x):
        """Look up the embeddings of the token ids in ``x`` and scale them.

        Args:
            x: Integer tensor of token ids accepted by ``nn.Embedding``.

        Returns:
            Tensor of shape ``x.shape + (embedding_size,)`` equal to the looked-up
            embeddings multiplied by ``sqrt(embedding_size)``.
        """
        return self.embedding(x) * math.sqrt(self.embedding_size)

    



### Positional encoding

In [3]:
import math
class PotitionalEncoding(nn.Module):
    """Adds fixed sinusoidal positional encodings to a batch of embeddings.

    The encoding table is precomputed once for ``context_length`` positions and
    stored as a (non-trainable) buffer named ``pe``.

    Args:
        embedding_size: Size of the last dimension of the inputs (even or odd).
        context_length: Maximum sequence length the table covers.
        dropout: Dropout probability applied after adding the encoding.
    """

    def __init__(self,
                 embedding_size: int = 256,
                 context_length: int = 256,
                 dropout: float = 0.2):

        super().__init__()
        self.embedding_size = embedding_size
        self.context_length = context_length
        self.dropout = nn.Dropout(dropout)

        # Encoding table of shape (context_length, embedding_size).
        pe = torch.zeros(context_length, embedding_size)
        # Column vector of positions, shape (context_length, 1).
        position = torch.arange(0, context_length, dtype=torch.float).unsqueeze(1)
        # One frequency per even column index, shape (ceil(embedding_size / 2),).
        div_term = torch.exp(torch.arange(0, embedding_size, 2).float() * (-math.log(10_000.0) / embedding_size))
        # Sine on the even columns.
        pe[:, 0::2] = torch.sin(position * div_term)
        # Cosine on the odd columns. For an odd embedding_size there is one fewer
        # odd column than even columns, so drop the surplus frequency.
        pe[:, 1::2] = torch.cos(position * div_term[: embedding_size // 2])

        pe = pe.unsqueeze(0)  # (1, context_length, embedding_size)

        # A buffer moves with the module across devices but is not a parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the positional encoding to ``x`` and apply dropout.

        Args:
            x: Tensor indexed as (batch, sequence, embedding_size).

        Returns:
            Tensor with the same shape as ``x``.

        Raises:
            ValueError: if the sequence dimension exceeds ``context_length``.
        """
        seq_len = x.shape[1]
        if seq_len > self.context_length:
            raise ValueError(
                f"sequence length {seq_len} exceeds context_length {self.context_length}"
            )
        # Buffers do not require grad, so no explicit requires_grad_(False) is needed.
        x = x + self.pe[:, :seq_len, :]
        return self.dropout(x)


# Correctly spelled alias; the original class name is kept for existing callers.
PositionalEncoding = PotitionalEncoding


## Layer Normalization

In [4]:
class LayerNormalization(nn.Module):
    """Normalizes the last dimension of its input.

    Each vector along the last dimension is shifted by its mean and divided by
    its standard deviation (plus ``epsilon``), then rescaled with one learnable
    scalar gain (``alpha``) and one learnable scalar shift (``bias``).

    Args:
        epsilon: Small constant added to the standard deviation for numerical
            stability.
    """

    def __init__(self, epsilon: float = 10**-6) -> None:
        super().__init__()
        # Keeps the denominator away from zero when the std is tiny.
        self.epsilon = epsilon

        # Learnable scalars: multiplicative gain starts at 1, additive shift at 0.
        self.alpha = nn.Parameter(torch.ones(1))
        self.bias = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        """Return ``bias + alpha * (x - mean) / (std + epsilon)`` over the last dim."""
        mu = torch.mean(x, dim=-1, keepdim=True)
        sigma = torch.std(x, dim=-1, keepdim=True)
        centered = x - mu
        denominator = sigma + self.epsilon
        return self.bias + self.alpha * centered / denominator


## Position-wise Feed-Forward Network

In [None]:
class FeedForward(nn.Module):
    """Two-layer position-wise MLP: Linear -> ReLU -> Dropout -> Linear.

    Args:
        embedding_size: Size of the input and output feature dimension.
        hidden_state_size: Size of the intermediate feature dimension.
        dropout: Dropout probability applied after the ReLU.
    """

    def __init__(self,
                 embedding_size: int = 256,
                 hidden_state_size:int = 1024,
                 dropout: float = 0.2) -> None:
        super().__init__()
        self.embedding_size, self.hidden_state_size = embedding_size, hidden_state_size
        # Expansion: embedding_size -> hidden_state_size.
        self.linear_1 = nn.Linear(embedding_size, hidden_state_size)
        self.dropout = nn.Dropout(dropout)
        # Projection back: hidden_state_size -> embedding_size.
        self.linear_2 = nn.Linear(hidden_state_size, embedding_size)

    def forward(self, x):
        """Apply the MLP to the last dimension of ``x``.

        ``nn.Linear`` acts on the last dimension only, so an input whose last
        dimension is ``embedding_size`` keeps all of its leading dimensions:
        (..., embedding_size) -> (..., hidden_state_size) -> (..., embedding_size).
        """
        hidden = torch.relu(self.linear_1(x))
        return self.linear_2(self.dropout(hidden))




## Multi-Head Attention

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention (no masking).

    The embedding dimension is split into ``heads`` equal chunks; each chunk
    attends independently and the results are concatenated and passed through
    a final linear layer.

    Args:
        embedding_size: Size of the feature dimension; must be divisible by
            ``heads``.
        context_length: Stored on the module; not used by ``forward``.
        heads: Number of attention heads.

    Raises:
        ValueError: if ``heads`` is not positive or does not divide
            ``embedding_size``.
    """

    def __init__(self, embedding_size: int = 256,
                 context_length: int = 256,
                 heads: int = 4) -> None:
        # Must run before any submodule is assigned, otherwise nn.Module raises.
        super().__init__()
        if heads <= 0 or embedding_size % heads != 0:
            raise ValueError(
                f"embedding_size ({embedding_size}) must be divisible by heads ({heads})"
            )
        self.embedding_size = embedding_size
        self.context_length = context_length
        self.heads = heads

        self.tokeys    = nn.Linear(embedding_size, embedding_size, bias=False)
        self.toqueries = nn.Linear(embedding_size, embedding_size, bias=False)
        self.tovalues  = nn.Linear(embedding_size, embedding_size, bias=False)
        self.unifyheads = nn.Linear(embedding_size, embedding_size)

    def _fold_heads(self, t, b, cl, s):
        """Reshape (b, cl, heads * s) into (b * heads, cl, s), one head per batch row."""
        # Split the feature axis into (heads, s) first, then move heads next to batch.
        return t.view(b, cl, self.heads, s).transpose(1, 2).reshape(b * self.heads, cl, s)

    def forward(self, x):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape (batch, sequence, embedding_size) with row-wise
                embeddings.

        Returns:
            Tensor of shape (batch, sequence, embedding_size).
        """
        b, cl, es = x.size()  # batch, sequence length, embedding size
        s = es // self.heads  # per-head feature size

        Q = self._fold_heads(self.toqueries(x), b, cl, s)
        K = self._fold_heads(self.tokeys(x), b, cl, s)
        V = self._fold_heads(self.tovalues(x), b, cl, s)

        # Scaled dot product per head: (b * heads, cl, cl).
        # The scale uses the per-head dimension and math.sqrt, since es and s are Python ints.
        scaled_dot = torch.bmm(Q, K.transpose(1, 2)) / math.sqrt(s)
        # Normalize over the key axis so each query's weights sum to 1.
        QK_attention = torch.softmax(scaled_dot, dim=2)

        # Weighted sum of values, then unfold the heads: (b, heads, cl, s).
        self_attention = torch.bmm(QK_attention, V).view(b, self.heads, cl, s)

        # Concatenate heads along the feature axis: (b, cl, heads * s).
        self_attention = self_attention.transpose(1, 2).contiguous().view(b, cl, s * self.heads)

        return self.unifyheads(self_attention)
        

## Transformer Block

In [None]:
class TransformerBlock(nn.Module):
    """One transformer block: self-attention and a feed-forward network.

    Each sub-layer is wrapped in a residual connection, and layer
    normalization is applied after each residual addition.

    Args:
        embedding_size: Size of the feature dimension of the inputs.
        context_length: Passed through to ``MultiHeadAttention``.
        heads: Number of attention heads.
    """

    def __init__(self, embedding_size: int = 256,
                 context_length: int = 256,
                 heads: int = 4) -> None:
        # Must run before any submodule is assigned, otherwise nn.Module raises.
        super().__init__()
        self.embedding_size = embedding_size
        self.context_length = context_length
        self.heads = heads

        self.multihead_attention = MultiHeadAttention(embedding_size, context_length, heads)
        self.layer_norm_1 = LayerNormalization()
        self.layer_norm_2 = LayerNormalization()
        self.mlp = FeedForward(embedding_size, hidden_state_size = 1024, dropout = 0.2)

    def forward(self, x):
        """Run the block on ``x`` and return a tensor of the same shape.

        No copy of the residual is needed: none of the operations below modify
        ``x`` in place, so keeping a reference is enough (tensors also have no
        ``.copy()`` method).
        """
        # Attention sub-layer with residual connection, then normalization.
        attended = self.multihead_attention(x)
        x = self.layer_norm_1(attended + x)

        # Feed-forward sub-layer with residual connection, then normalization.
        fed_forward = self.mlp(x)
        x = self.layer_norm_2(fed_forward + x)
        return x