In [1]:
!pip install datasets transformers==4.28.0 torchinfo

Collecting datasets
  Downloading datasets-2.13.1-py3-none-any.whl (486 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m52.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers==4.28.0)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.28.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m 

In [2]:
import numpy as np
from torch.utils.data import dataset
import torchinfo
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch import optim
import math
import datasets
from transformers import AutoTokenizer, DataCollatorWithPadding
from datetime import datetime

In [3]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_k, d_model, n_heads):
        super().__init__()

        self.d_k = d_k
        self.d_model = d_model
        self.n_heads = n_heads

        # generate matrices weights
        self.key = nn.Linear(d_model, d_k * n_heads) # (d_model x d_k)
        self.query = nn.Linear(d_model, d_k * n_heads) # (d_model x d_k)
        self.value = nn.Linear(d_model, d_k * n_heads) # (d_model x d_k)

        # final linear layer
        self.final_layer = nn.Linear(d_k * n_heads, d_model)


    def forward(self, x, mask=None):
        # x -> batch_size (N) x T x d_model
        k = self.key(x)   # N x T x h*d_k
        q = self.query(x) # N x T x h*d_k
        v = self.value(x) # N x T x h*d_v

        N = q.shape[0] # store batch size
        T = q.shape[1] # store sequence dimension

        # tranform to (N x T x h, d_k) -> (N x h x T x d_k)
        k = k.view(N, T, self.n_heads, self.d_k).transpose(1, 2)
        q = q.view(N, T, self.n_heads, self.d_k).transpose(1, 2)
        v = v.view(N, T, self.n_heads, self.d_k).transpose(1, 2)

        # (N x h x T x d_k) * (N x h x d_k x T) -> (N x h x T x T)
        attn_scores = q @ k.transpose(-1, -2) / np.sqrt(self.d_k)
        if mask is not None:
            # mask is vector size 1 x T
            attn_scores = attn_scores.masked_fill(
                mask[:, None, None, :] == 0, float('-inf')
            )
        attn_weights = F.softmax(attn_scores, dim=-1)

        # (N x h x T x T) * (N x h x T x d_v) -> (N x h x T x d_v)
        A = attn_weights @ v
        # reshape to (N x T x h*d_v)
        A = A.transpose(1, 2)
        A = A.contiguous().view(N, T, self.n_heads * self.d_k)

        # (N x T x d_k * h) -> (N x T x d_model)
        return self.final_layer(A)

In [4]:
class TransformerBlock(nn.Module):
    def __init__(self, d_k, d_model, n_heads, dropout_prob=0.1):
        super().__init__()

        self.d_k = d_k
        self.d_model = d_model
        self.n_heads = n_heads

        self.mha = MultiHeadAttention(d_k, d_model, n_heads)
        self.ln1 = nn.LayerNorm(d_model)
        # (N x T x d_model) -> (N x T)
        self.ann = nn.Sequential(
            nn.Linear(d_model, d_model * 4),
            nn.GELU(),
            nn.Linear(d_model * 4, d_model),
            nn.Dropout(p=dropout_prob)
        )
        self.ln2 = nn.LayerNorm(d_model)
        self.drop = nn.Dropout(p=dropout_prob)

    def forward(self, x, mask=None):
        # x = (N x T x d_model) -> (N x T x d_model)
        x = self.ln1(x + self.mha(x, mask=mask))
        x = self.ln2(x + self.ann(x))
        x = self.drop(x)
        return x


In [5]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_length=2048):
        super().__init__()

        self.dropout = nn.Dropout(p=dropout)

        # PE(pos, 2i) = sin(pos / 10000^(2i / d_model))
        # PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))
        position = torch.arange(max_length).unsqueeze(1)
        exp_term = torch.arange(0, d_model, 2)
        pe = torch.zeros(1, max_length, d_model)
        div_term = torch.exp(exp_term * (-np.log(10000.0)) / d_model)
        pe[:, :, 0::2] = torch.sin(position * div_term)
        pe[:, :, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)

In [6]:
class Encoder(nn.Module):
    def __init__(
            self,
            d_k,
            max_length,
            vocab_size,
            d_model,
            n_heads,
            n_layers,
            n_classes,
            dropout=0.1
            ):
        super().__init__()

        # after tokenization -> batch_size x max_length (N x T)

        self.embed = nn.Embedding(vocab_size, d_model)
        # after embedding -> batch_size x max_length x d_model (N x T x d_model)

        self.positional_encoding = PositionalEncoding(d_model, max_length=max_length)
        transformer_blocks = [
            TransformerBlock(
            d_k,
            d_model,
            n_heads,
            dropout_prob=dropout
            ) for _ in range(n_layers)
        ]
        self.transformer_blocks = nn.Sequential(*transformer_blocks)
        self.layer_norm = nn.LayerNorm(d_model)
        self.final_layer = nn.Linear(d_model, n_classes)


    def forward(self, x, mask=None):
        x = self.embed(x)
        x = self.positional_encoding(x)
        for block in self.transformer_blocks:
            x = block(x, mask=mask)


        # many-to-one (x has shape N x T x D)
        x = x[:, 0, :]

        x = self.layer_norm(x)
        x = self.final_layer(x)
        return x

In [7]:
# torchinfo.summary(model)

In [8]:
# load tokenizer
checkpoint = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# load dataset
raw_ds = datasets.load_dataset('glue', 'sst2')

vocab_size = tokenizer.vocab_size
max_length = tokenizer.max_model_input_sizes[checkpoint]
d_k = 16
d_model = 64
n_heads = 4
n_layers = 2
n_classes = 2
dropout = 0.1
num_epochs = 3
batch_size = 32

def tokenize_dataset(batch):
    return tokenizer(batch['sentence'], truncation=True, padding='max_length')

tokenized_ds = raw_ds.map(tokenize_dataset, batched=True)
torch_ds = tokenized_ds.with_format(type="torch", columns=["input_ids", "attention_mask", "label"])
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

torch_ds = torch_ds.remove_columns(['sentence', 'idx'])
torch_ds = torch_ds.rename_columns({'label': 'labels'})

train_dataloader = DataLoader(
    dataset=torch_ds['train'],
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator
)

eval_dataloader = DataLoader(
    torch_ds['validation'],
    batch_size=batch_size,
    collate_fn=data_collator
)

model = Encoder(
    d_k=d_k,
    vocab_size=vocab_size,
    max_length=max_length,
    d_model=d_model,
    n_heads=n_heads,
    n_layers=n_layers,
    n_classes=n_classes,
    dropout=dropout
)

model = model.to(device)

loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.00001)

train_losses = []
eval_losses = []

for epoch in range(1, num_epochs+1):
    torch.cuda.empty_cache()

    # train loop
    model.train()
    train_loss = 0
    n_train = 0
    model.train()
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()

        # forward pass
        outputs = model(batch['input_ids'], batch['attention_mask'])
        loss = loss_fn(outputs, batch['labels'])

        # backward step and optimization
        loss.backward()
        optimizer.step()

        # loss is the mean across whole batch
        train_loss += loss.item() * batch['input_ids'].size(0)
        n_train += batch['input_ids'].size(0)

    train_loss = train_loss / n_train
    train_losses.append(train_loss)

    # evaluation loop
    model.eval()
    eval_loss = 0
    n_eval = 0
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}

        # forward pass
        outputs = model(batch['input_ids'], batch['attention_mask'])
        loss = loss_fn(outputs, batch['labels'])
        eval_loss += loss * batch['input_ids'].size(0)
        n_eval = batch['input_ids'].size(0)

    eval_loss = eval_loss / n_eval
    eval_losses.append(eval_loss)

    print(f"Epoch: {epoch}/{num_epochs} \n Train loss: {train_loss:.4f} \n Eval loss: {eval_loss:.4f}")

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/28.8k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/27.9k [00:00<?, ?B/s]

Downloading and preparing dataset glue/sst2 to /root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading data:   0%|          | 0.00/7.44M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

Dataset glue downloaded and prepared to /root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch: 1/3 
 Train loss: 0.6948 
 Eval loss: 75.2725


OutOfMemoryError: ignored

In [None]:
# compute accuracy
model.eval()
n_correct = 0
n_total = 0
for batch in train_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    outputs = model(batch['input_ids'], batch['attention_masks'])
    _, predictions = torch.max(outputs, dim=1)
    acc_train = (predictions, batch['labels']).sum().item()
    n_correct += acc_train
    n_total += batch['input_ids'].size(0)
print(f"Train Accuracy: {(n_correct / n_total):.4f}")

n_correct = 0
n_total = 0
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    outputs = model(batch['input_ids'], batch['attention_masks'])
    _, predictions = torch.max(outputs, dim=1)
    acc_eval = (predictions, batch['labels']).sum().item()
    n_correct += acc_eval
    n_total += batch['input_ids'].size(0)
print(f"Eval Accuracy: {(n_correct / n_total):.4f}")