In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import dataset

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# implementation of the self-attention layer
class SelfAttention(nn.Module):

    def __init__(self, d_model, dk, n_heads, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.dk = dk
        self.n_heads = n_heads

        self.key = nn.Linear(d_model, dk * n_heads)  # input shape: (N, T, d_model), output shape: (N, T, dk * n_heads)
        self.query = nn.Linear(d_model, dk * n_heads)
        self.value = nn.Linear(d_model, dk * n_heads)

        self.fc = nn.Linear(dk * n_heads, d_model)
    
    def forward(self, q, k, v, mask=None):
        # the mask is a boolean tensor of shape (N, T) where 1 means that the token is not masked
        q = self.query(q)  # output shape: (N, T, dk * n_heads)
        k = self.key(k)
        v = self.value(v)

        N = q.shape[0]  # batch size
        T = q.shape[1]  # sequence length

        # change shapes from (N, T, dk * n_heads) to (N, T, n_heads, dk) then to (N, n_heads, T, dk)
        q = q.view(N, T, -1, self.dk).transpose(1, 2)
        k = k.view(N, T, -1, self.dk).transpose(1, 2)
        v = v.view(N, T, -1, self.dk).transpose(1, 2)

        # compute attention scores
        # the following line implements the algebraic multiplication of Q times K^T normalized by sqrt(dk)
        # ignoring batch size and heads, Q is shaped (T, dk) and K is shaped (T, dk), so output is (T, T)
        scores = q @ k.transpose(-2, -1) / np.sqrt(self.dk)  # output shape: (N, n_heads, T, T)

        if mask is not None:
            scores = scores.masked_fill(
                mask[:, None, None, :] == 0,  # the None's create two new dimensions of size 1 thus shaping it to (N, 1, 1, T) making it broadcastable
                float('-inf')  # this is the value that will be used to replace the masked values; -inf is used because it will be replaced by 0 after the softmax
                )

        # apply softmax to get attention probabilities
        attention = F.softmax(scores, dim=-1)  # output shape: (N, n_heads, T, T)

        # apply attention to values
        # ignoring batch size and heads, attention is shaped (T, T) and V is shaped (T, dk), so output is (T, dk)
        out = attention @ v  # output shape: (N, n_heads, T, dk)

        # change shape from (N, n_heads, T, dk) to (N, T, n_heads, dk) then to (N, T, dk * n_heads)
        out = out.transpose(1, 2).contiguous().view(N, T, -1)

        # apply linear layer
        return self.fc(out)  # output shape: (N, T, d_model)

In [3]:
# implementation of the transformer block
class TransformerBlock(nn.Module):
    def __init__(self, d_model, dk, n_heads, dropout_prob=0.1, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)
        self.mha = SelfAttention(d_model, dk, n_heads)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, 4 * d_model),
            nn.GELU(),
            nn.Linear(4 * d_model, d_model),
            nn.Dropout(dropout_prob)
        )
        self.dropout = nn.Dropout(dropout_prob)


    def forward(self, x, mask=None):
        x = self.ln1(x + self.mha(x, x, x, mask))
        x = self.ln2(x + self.ffn(x))
        x = self.dropout(x)
        return x

In [4]:
# implementation of positional encoding

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=2048, dropout_prob=0.1, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.dropout = nn.Dropout(dropout_prob)

        position = torch.arange(max_len).unsqueeze(1)  # shape: (max_len, 1)
        exp_term = torch.arange(0, d_model, 2)
        div_term = torch.exp(exp_term * (-np.log(10000.0) / d_model))
        pe = torch.zeros(1, max_len, d_model)
        pe[0, :, 0::2] = torch.sin(position * div_term)
        pe[0, :, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)


    def forward(self, x):
        # x is of shape (N, T, d_model)
        x = x + self.pe[:, :x.shape[1], :]
        return self.dropout(x)

In [5]:
# implementation of the encoder of the transformer model
class Encoder(nn.Module):

    def __init__(
            self,
            vocab_size,
            max_len,
            d_model,
            dk,
            n_heads,
            n_layers,
            n_classes,
            dropout_prob=0.1):
        super().__init__()

        self.embedding = nn.Embedding(
            num_embeddings=vocab_size, 
            embedding_dim=d_model)

        self.pos_encoding = PositionalEncoding(
            d_model=d_model, 
            max_len=max_len, 
            dropout_prob=dropout_prob)

        self.transformer_blocks = [TransformerBlock(
            d_model=d_model, 
            dk=dk, 
            n_heads=n_heads, 
            dropout_prob=dropout_prob
            ) for _ in range(n_layers)]

        self.ln = nn.LayerNorm(d_model)
        self.fc = nn.Linear(d_model, n_classes)

    def forward(self, x, mask=None):
        x = self.embedding(x)
        x = self.pos_encoding(x)
        for block in self.transformer_blocks:
            x = block(x, mask)

        # many to one: x is of shape (N, T, d_model) ...
        x = x[:, 0, :]  # ... but we want to get the first token of the sequence (N, d_model)
        # x = x.mean(dim=1)  # ... and we want to average over T to get (N, d_model)

        x = self.ln(x)
        x = self.fc(x)
        return x

In [6]:
model = Encoder(
    vocab_size=20_000,
    max_len=1024,
    d_model=64,
    dk=16,
    n_heads=4,
    n_layers=2,
    n_classes=5,
    dropout_prob=0.1
)

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device "{device}"')
model.to(device)

Using device "cpu"


Encoder(
  (embedding): Embedding(20000, 64)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (ln): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
  (fc): Linear(in_features=64, out_features=5, bias=True)
)

In [8]:
x = torch.randint(0, 20000, (8, 512)).to(device)
x_t = torch.tensor(x).to(device)
x_t

  x_t = torch.tensor(x).to(device)


tensor([[  882, 14118,  4074,  ...,   201,  1275,  2189],
        [ 1298, 17497, 18765,  ...,  5031, 15060,  7135],
        [16884,  3177, 16577,  ..., 18879,  5062, 13805],
        ...,
        [14096, 12631,  2245,  ..., 13232,  6078, 14346],
        [17593, 16965,  5821,  ...,  4822, 10690, 16190],
        [ 7157,  2234, 19530,  ...,  5718,  7174,   732]])

In [9]:
mask = torch.ones_like(x_t).to(device)
mask[:, 256:] = 0
mask_t = torch.tensor(mask).to(device)

  mask_t = torch.tensor(mask).to(device)


In [10]:
y = model(x_t, mask_t)

In [11]:
y.shape

torch.Size([8, 5])

In [12]:
from transformers import AutoTokenizer, DataCollatorWithPadding

In [13]:
checkpoint = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [14]:
from datasets import load_dataset

In [15]:
raw_datasets = load_dataset('glue', 'sst2')

Found cached dataset glue (C:/Users/fonta/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

In [16]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [17]:
def tokenize_fn(batch):
    return tokenizer(batch['sentence'], truncation=True)

In [18]:
tokenized_datasets = raw_datasets.map(tokenize_fn, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [19]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [20]:
data_collator

DataCollatorWithPadding(tokenizer=DistilBertTokenizerFast(name_or_path='distilbert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True), padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

In [21]:
tokenized_datasets = tokenized_datasets.remove_columns(['sentence', 'idx'])
tokenized_datasets = tokenized_datasets.rename_column('label', 'labels')
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [22]:
from torch.utils.data import DataLoader

train_loader = DataLoader(
    tokenized_datasets['train'],
    shuffle=True,
    batch_size=32,
    collate_fn=data_collator
)

val_loader = DataLoader(
    tokenized_datasets['validation'],
    batch_size=32,
    collate_fn=data_collator
)

In [23]:
for batch in train_loader:
    for k, v in batch.items():
        print(f'k: {k}', f'v shape: {v.shape}')
    break

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


k: labels v shape: torch.Size([32])
k: input_ids v shape: torch.Size([32, 41])
k: attention_mask v shape: torch.Size([32, 41])


In [24]:
set(tokenized_datasets['train']['labels'])

{0, 1}

In [25]:
tokenizer.vocab_size

28996

In [26]:
tokenizer.max_model_input_sizes

{'distilbert-base-uncased': 512,
 'distilbert-base-uncased-distilled-squad': 512,
 'distilbert-base-cased': 512,
 'distilbert-base-cased-distilled-squad': 512,
 'distilbert-base-german-cased': 512,
 'distilbert-base-multilingual-cased': 512}

In [27]:
model = Encoder(
    vocab_size=tokenizer.vocab_size,
    max_len=tokenizer.max_model_input_sizes[checkpoint],
    d_model=64,
    dk=16,
    n_heads=4,
    n_layers=2,
    n_classes=2,
    dropout_prob=0.1)

model.to(device)

Encoder(
  (embedding): Embedding(28996, 64)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (ln): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
  (fc): Linear(in_features=64, out_features=2, bias=True)
)

In [34]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [35]:
from datetime import datetime

In [36]:
def train(model, criterion, optimizer, train_loader, val_loader, epochs):
    train_losses = []
    test_losses = []

    for epoch in range(epochs):
        model.train()
        t0 = datetime.now()

        train_loss = 0
        n_train = 0

        for batch in train_loader:
            batch = {k: v.to(device) for k, v in batch.items()}	
            optimizer.zero_grad()

            y = model(batch['input_ids'], batch['attention_mask'])

            loss = criterion(y, batch['labels'])

            loss.backward()
            optimizer.step()

            train_loss += loss.item() * batch['input_ids'].shape[0]
            n_train += batch['input_ids'].shape[0]

        train_loss /= n_train

        model.eval()
        test_loss = 0
        n_test = 0
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}	

            y = model(batch['input_ids'], batch['attention_mask'])

            loss = criterion(y, batch['labels'])

            test_loss += loss.item() * batch['input_ids'].shape[0]
            n_test += batch['input_ids'].shape[0]
        test_loss /= n_test

        train_losses.append(train_loss)
        test_losses.append(test_loss)

        dt = datetime.now() - t0
        print(
            f'Epoch {epoch + 1}/{epochs} | ', 
            f'train_loss: {train_loss:.5f} | ',
            f'test_loss: {test_loss:.5f} | ',
            f'time: {dt}'
        )

    return train_losses, test_losses

train_losses, test_losses = train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=4
)


Epoch 1/4 |  train_loss: 0.57149 |  test_loss: 0.55224 |  time: 0:01:12.168597
Epoch 2/4 |  train_loss: 0.44175 |  test_loss: 0.45744 |  time: 0:01:10.180943
Epoch 3/4 |  train_loss: 0.36602 |  test_loss: 0.42810 |  time: 0:01:14.285955
Epoch 4/4 |  train_loss: 0.32105 |  test_loss: 0.42704 |  time: 0:01:18.730365


In [37]:
model.eval()

def get_loader_acc(model, loader):
    n_correct = 0
    n_total = 0

    for batch in loader:
        batch = {k: v.to(device) for k, v in batch.items()}	

        y = model(batch['input_ids'], batch['attention_mask'])

        n_correct += (y.argmax(dim=-1) == batch['labels']).sum().item()
        n_total += batch['input_ids'].shape[0]

    return n_correct / n_total

train_acc = get_loader_acc(model, train_loader)
test_acc = get_loader_acc(model, val_loader)

print(f'Train acc: {train_acc:.5f}')
print(f'Test acc: {test_acc:.5f}')



Train acc: 0.88907
Test acc: 0.80619
