# Decoder

In [1]:
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import dataset

import numpy as np
import matplotlib.pyplot as plt

In [2]:
class CausalSelfAttention(nn.Module):
    def __init__(self, d_k, d_model, n_heads, max_len):
        super().__init__()
        
        self.d_k = d_k
        self.n_heads = n_heads
        self.key = nn.Linear(d_model, d_k * n_heads)
        self.query = nn.Linear(d_model, d_k*n_heads)
        self.value = nn.Linear(d_model, d_k*n_heads)
        self.fc = nn.Linear(d_k * n_heads, d_model)
        
        # creiamo la causal mask, motivo per il quale passiamo anche la max_len nell'init
        # la causal mask ci permette di diagonalizzare la matrice dei pesi dell'attenzione
        # torch.tril è triangular lower (gli 1 stanno sotto)
        cm = torch.tril(torch.ones(max_len, max_len))
        # registriamo questa matrice con questa forma: 1 x 1 x max_len x max_len
        self.register_buffer("causal_mask", cm.view(1, 1, max_len, max_len))
        
    def forward(self, q, k, v, pad_mask=None):
        q = self.query(q)
        k = self.key(k)
        v = self.value(v)
        
        N = q.shape[0]
        T = q.shape[1]
        
        q = q.view(N, T, self.n_heads, self.d_k).transpose(1, 2)
        k = k.view(N, T, self.n_heads, self.d_k).transpose(1, 2)
        v = v.view(N, T, self.n_heads, self.d_k).transpose(1, 2)
        
        attn_scores = q @ k.transpose(-2, -1) / math.sqrt(self.d_k)
        if pad_mask is not None:
            attn_scores = attn_scores.masked_fill(pad_mask[:, None, None, :] == 0, float("-inf"))
        # Causal Mask si applica su tutte le dimensioni, fino alla lunghezza dell'input per velocizzare
        attn_scores = attn_scores.masked_fill(self.causal_mask[:, :, :T, :T] == 0, float("-inf"))
        attn_weights = F.softmax(attn_scores, dim=-1)
        
        A = attn_weights @ v
        
        A = A.transpose(1, 2)
        A = A.contiguous().view(N, T, self.d_k * self.n_heads)
        
        return self.fc(A)

In [3]:
class TransformerBlock(nn.Module):
    def __init__(self, d_k, d_model, n_heads, max_len, dropout_prob=0.1):
        super().__init__()
        
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)
        self.mha = CausalSelfAttention(d_k, d_model, n_heads, max_len)
        self.ann = nn.Sequential(
            nn.Linear(d_model, d_model * 4),
            nn.GELU(),
            nn.Linear(d_model * 4, d_model),
            nn.Dropout(p=dropout_prob)
        )
        self.dropout = nn.Dropout(p=dropout_prob)
       
    
    def forward(self, x, pad_mask=None):
        x = self.ln1(x + self.mha(x, x, x, pad_mask))
        x = self.ln2(x + self.ann(x))
        x = self.dropout(x)
        return x

In [4]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=2048, dropout_prob=0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout_prob)

        position = torch.arange(max_len).unsqueeze(1)
        exp_term = torch.arange(0, d_model, 2) 
        div_term = torch.exp(exp_term * (- math.log(10000) / d_model))

        pe = torch.zeros(1, max_len, d_model)
        pe[0,:,0::2] = torch.sin(position * div_term) 
        pe[0,:,1::2] = torch.cos(position * div_term)

        self.register_buffer("pe", pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1), :] 
        return self.dropout(x)

In [5]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, max_len, d_k, d_model, n_heads, n_layers, dropout_prob):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_len, dropout_prob)
        transformer_blocks = [TransformerBlock(d_k, d_model, n_heads, max_len, dropout_prob) for _ in range(n_layers)]
        self.transformer_blocks = nn.Sequential(*transformer_blocks)
        self.ln = nn.LayerNorm(d_model)
        self.fc = nn.Linear(d_model, vocab_size) # Qui cambia rispetto all'encoder! Abbiamo bisogno della vocab size
        
    def forward(self, x, pad_mask=None):
        x = self.embedding(x)
        x = self.pos_encoding(x)
        for block in self.transformer_blocks:
            x = block(x, pad_mask)
            
        # Qui c'è la differenza: prima noi prendevamo solo il primo hidden vector (x[:,0,:]) per il calcolo dei logits
        # Ora invece noi calcoliamo simultaneamente T risultati
        x = self.ln(x)
        x = self.fc(x) # many to many
        return x

### Testing

In [6]:
model = Decoder(20000, 1024, 16, 64, 4, 2, 0.1)

In [7]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)

cuda:0


Decoder(
  (embedding): Embedding(20000, 64)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (mha): CausalSelfAttention(
        (key): Linear(in_features=64, out_features=64, bias=True)
        (query): Linear(in_features=64, out_features=64, bias=True)
        (value): Linear(in_features=64, out_features=64, bias=True)
        (fc): Linear(in_features=64, out_features=64, bias=True)
      )
      (ann): Sequential(
        (0): Linear(in_features=64, out_features=256, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=256, out_features=64, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05,

In [8]:
x = np.random.randint(0, 20000, size=(8,512))
x_t = torch.tensor(x).to(device)

In [9]:
mask = np.ones((8, 512))
mask[:, 256:] = 0
mask_t = torch.tensor(mask).to(device)

In [10]:
y = model(x_t, mask_t)
y.shape

torch.Size([8, 512, 20000])

## Tokenizer & Data Collator

In [11]:
from transformers import AutoTokenizer, DataCollatorWithPadding
from datasets import load_dataset

In [12]:
checkpoint = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [16]:
# Siccome abbiamo bisogno solo di testi, usiamo lo stesso dataset del GLUE, ma eliminiamo le labels
raw_datasets = load_dataset("glue", "sst2")

In [17]:
def tokenize_fn(batch):
    return tokenizer(batch["sentence"], truncation=True)

In [18]:
tokenized_datasets = raw_datasets.map(tokenize_fn, batched=True).remove_columns(["sentence", "idx", "label"])

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [19]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Training

### Data Loading

In [20]:
from torch.utils.data import DataLoader

In [21]:
train_loader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    batch_size=32,
    collate_fn=data_collator
)

In [22]:
for batch in train_loader:
    for k, v in batch.items():
        print("k:", k, "v:", v)
    break

k: input_ids v: tensor([[  101,  1103,  1469,  ...,     0,     0,     0],
        [  101,  1157, 20197,  ...,     0,     0,     0],
        [  101,  1103,  1273,  ...,     0,     0,     0],
        ...,
        [  101,  7246,  1105,  ...,     0,     0,     0],
        [  101,  1110,  5098,  ...,     0,     0,     0],
        [  101,   176, 13356,  ...,     0,     0,     0]])
k: attention_mask v: tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])


In [23]:
tokenizer.pad_token_id

0

### Model Building

In [24]:
model = Decoder(
    vocab_size=tokenizer.vocab_size,
    max_len=tokenizer.model_max_length,
    d_k=16,
    d_model=64,
    n_heads=4,
    n_layers=2,
    dropout_prob=0.1
)
model.to(device)

Decoder(
  (embedding): Embedding(28996, 64)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (mha): CausalSelfAttention(
        (key): Linear(in_features=64, out_features=64, bias=True)
        (query): Linear(in_features=64, out_features=64, bias=True)
        (value): Linear(in_features=64, out_features=64, bias=True)
        (fc): Linear(in_features=64, out_features=64, bias=True)
      )
      (ann): Sequential(
        (0): Linear(in_features=64, out_features=256, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=256, out_features=64, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05,

### Loss & Optimizer

In [25]:
# Nel criterio, ignoriamo tutti i pad token, dato che stiamo facendo predizione "many to many"
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = torch.optim.Adam(model.parameters())

### Training Function

In [26]:
from datetime import datetime

In [27]:
def train(model, criterion, optimizer, train_loader, epochs):
    train_losses = np.zeros(epochs)
    
    for it in range(epochs):
        model.train()
        t0 = datetime.now()
        train_loss = [] # Questa volta la train loss sarà mediata per ogni campione
        for batch in train_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            optimizer.zero_grad()
            
            # shiftiamo i target indietro, cloniamo e stacchiamo i gradienti dal clone
            targets = batch["input_ids"].clone().detach()
            targets = torch.roll(targets, shifts=-1, dims=1) # rolliamo sulla dimensione T
            targets[:, -1] = tokenizer.pad_token_id # inseriamo un padding all'ultimo, dato lo shift
            
            # forward
            outputs = model(batch["input_ids"], batch["attention_mask"])
            # V è la vocabulary size
            # Gli output avranno una dimensione pari a N x T x V, ma pytorch si aspetta N x V x T nel criterion
            # Per questo motivo faremo una trasposizione degli output in questo modo:
            loss = criterion(outputs.transpose(2, 1), targets)
            loss.backward()
            optimizer.step()
            train_loss.append(loss.item()) # loss nel batch
        
        # semplificazione: facciamo la media nel batch, anche se il numero di esempi per batch (T) sarà diverso
        train_loss = np.mean(train_loss) 
        train_losses[it] = train_loss
        
        dt = datetime.now() - t0
        print(f"Epoch {it+1}/{epochs} - Train Loss: {train_loss:.4f}, Duration: {dt}")
    
    return train_losses
        
        

In [28]:
train_losses = train(model, criterion, optimizer, train_loader, epochs=15)

Epoch 1/15 - Train Loss: 5.9519, Duration: 0:20:35.220803


KeyboardInterrupt: 

In [29]:
valid_loader = DataLoader(
    tokenized_datasets["validation"],
    batch_size=1,
    collate_fn=data_collator
)

In [30]:
model.eval()
for batch in valid_loader:
    batch = {k:v.to(device) for k, v in batch.items()}
    outputs = model(batch["input_ids"], batch["attention_mask"])
    break

In [31]:
outputs.shape

torch.Size([1, 12, 28996])

In [32]:
torch.argmax(outputs, axis=-1)

tensor([[1103,  112,  188, 1136, 2523, 1104,  102,  170,  117,  102,  102,  117]],
       device='cuda:0')

In [33]:
prediction_ids = torch.argmax(outputs, axis=-1)

In [34]:
tokenizer.decode(batch["input_ids"][0])

"[CLS] it ' s a charming and often affecting journey. [SEP]"

In [35]:
tokenizer.decode(prediction_ids[0])

"the ' s not movie of [SEP] a, [SEP] [SEP],"

In [44]:
# Stiamo concatenando i primi cinque token dell'input con il risultato della predizione
# NB: siccome l'input include 0-1-2-3-4, noi attualmente vorremmo l'output del 4, che è esattamente prediction_ids[0,4]
# se usiamo lo 0, però, da problemi.
tokenizer.decode(torch.concat((batch["input_ids"][0, :5], prediction_ids[:, 4])))

"[CLS] it ' s a movie"

### Inferenza

In [45]:
# Generiamo qualcosa
prompt = "it's"
tokenized_prompt = tokenizer(prompt, return_tensors="pt")
tokenized_prompt

{'input_ids': tensor([[ 101, 1122,  112,  188,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}

In [49]:
# Gli leviamo il [SEP] token manualmente, che definisce la fine della frase
ouputs = model(
    tokenized_prompt["input_ids"][:, :-1].to(device),
    tokenized_prompt["attention_mask"][:, :-1].to(device)
)
outputs.shape

torch.Size([1, 12, 28996])

In [50]:
# Prendiamo unicamente l'ultimo timestep, cercando il massimo sulla dimensione del vocab
prediction_ids = torch.argmax(outputs[:,-1,:], axis=-1)

In [51]:
tokenizer.decode(prediction_ids[0])

','

In [66]:
# Generation Loop
prompt = "I think there"
tokenized_prompt = tokenizer(prompt, return_tensors="pt")
input_ids = tokenized_prompt["input_ids"][:, :-1].to(device)
mask = tokenized_prompt["attention_mask"][:, :-1].to(device)

for _ in range(100):
    outputs = model(input_ids, mask)
    prediction_id = torch.argmax(outputs[:, -1, :], axis=-1)
    
    # appendiamo il nuovo risultato (come token id). Hstack è horizontal stack (columnwise stack)
    input_ids = torch.hstack((input_ids, prediction_id.view(1, 1)))
    mask = torch.ones_like(input_ids) # genero la maschera nuova sul nuovo input, basandomi sulla shape
    
    if prediction_id == tokenizer.sep_token_id:
        break

In [67]:
tokenizer.decode(input_ids[0])

"[CLS] I think there ' s most of the film ' s most of the film ' s most of the film ' s most of the film ' s most of the film ' s most of the film [SEP]"