In [2]:
from IPython.display import Image, display
from IPython.core.display import HTML 

![Title](presentation/Title.png)

![Cover1](presentation/Cover1.png)

![Slide3](presentation/Slide3.png)

Example: "Mary gave roses to Susan."

![Slide4](presentation/Slide4.png)

![Slide5](presentation/Slide5.png)

![Slide6](presentation/Slide6.png)

![Slide7](presentation/Slide7.png)

In [2]:
import torch
from torch import nn
import torch.nn.functional as F

class SelfAttention(nn.Module):
    """
    Multi-head self-attention in which the embedding is split across the heads.

    The input is projected to keys, queries and values of size ``emb``; each
    projection is cut into ``heads`` chunks of size ``emb // heads`` that attend
    independently, after which the chunks are concatenated again and passed
    through a final linear layer.
    """

    def __init__(self, emb, heads=8, mask=False):
        """
        :param emb: Embedding dimension of the input; must be divisible by ``heads``.
        :param heads: Number of attention heads.
        :param mask: If true, every position may only attend to itself and to
                     earlier positions (causal mask for autoregressive models).
        """

        super().__init__()
        assert emb % heads == 0, f'Embedding dimension ({emb}) should be divisible by nr. of heads ({heads})'

        self.emb = emb
        self.heads = heads
        self.mask = mask

        # Wk, Wq and Wv params to learn
        self.tokeys = nn.Linear(emb, emb, bias=False)
        self.toqueries = nn.Linear(emb, emb, bias=False)
        self.tovalues = nn.Linear(emb, emb, bias=False)

        # Mixes the concatenated head outputs back into a single embedding
        self.unifyheads = nn.Linear(emb, emb)

    def forward(self, x):
        """
        :param x: Tensor of size (batch, sequence length, emb).
        :return: Tensor of the same size as ``x``.
        """

        # b = batch dimension, t = amount of inputs and e = embed size
        b, t, e = x.size()
        h = self.heads
        assert e == self.emb, f'Input embedding dim ({e}) should match layer embedding dim ({self.emb})'

        # size of the chunk of the embedding handled by each head
        s = e // h

        keys = self.tokeys(x)
        queries = self.toqueries(x)
        values = self.tovalues(x)

        # we separate in different heads
        keys = keys.view(b, t, h, s)
        queries = queries.view(b, t, h, s)
        values = values.view(b, t, h, s)

        # fold heads in batch dimension
        keys = keys.transpose(1, 2).contiguous().view(b * h, t, s)
        queries = queries.transpose(1, 2).contiguous().view(b * h, t, s)
        values = values.transpose(1, 2).contiguous().view(b * h, t, s)

        # compute scaled dot-product self-attention: scaling queries and keys by
        # e ** (1/4) each divides their dot product by sqrt(e)
        queries = queries / (e ** (1/4))
        keys = keys / (e ** (1/4))

        # dot product of queries and keys
        dot = torch.bmm(queries, keys.transpose(1, 2))

        assert dot.size() == (b * h, t, t)

        if self.mask:  # this mask is for text generation that uses an autoregressive model.
            # Entries strictly above the diagonal are the "future" positions; setting
            # them to -inf gives them zero weight after the softmax. The diagonal is
            # kept, so every row retains at least one finite entry.
            future = torch.triu(torch.ones(t, t, dtype=torch.bool, device=dot.device), diagonal=1)
            dot = dot.masked_fill(future, float('-inf'))

        dot = F.softmax(dot, dim=2)
        # dot now has row-wise self-attention probabilities

        # apply the attention weights to the values and unfold the heads again
        out = torch.bmm(dot, values).view(b, h, t, s)

        # move the head dimension next to the chunk dimension and concatenate the heads
        out = out.transpose(1, 2).contiguous().view(b, t, s * h)

        return self.unifyheads(out)
        

![Cover2](presentation/Cover2.png)

![Slide8](presentation/Slide8.png)

In [3]:
class TransformerBlock(nn.Module):
    """
    A single transformer layer: self-attention followed by a two-layer
    feed-forward network. Each sub-layer is added to its input and layer
    normalised; dropout is applied to the block output.
    """

    def __init__(self, emb, heads, mask, ff_hidden_mult=4, dropout=0.0, pos_embedding=None):
        """
        :param emb: Embedding dimension.
        :param heads: Number of attention heads.
        :param mask: Forwarded to the attention layer and stored on the block.
        :param ff_hidden_mult: The feed-forward hidden layer has ``ff_hidden_mult * emb`` units.
        :param dropout: Dropout probability applied to the block output.
        :param pos_embedding: Accepted but not used by this block.
        """
        super().__init__()

        hidden_units = ff_hidden_mult * emb

        self.attention = SelfAttention(emb, heads=heads, mask=mask)
        self.mask = mask

        self.norm1 = nn.LayerNorm(emb)
        self.norm2 = nn.LayerNorm(emb)

        feedforward_layers = [
            nn.Linear(emb, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, emb),
        ]
        self.ff = nn.Sequential(*feedforward_layers)

        self.do = nn.Dropout(dropout)

    def forward(self, x):
        """
        :param x: Input tensor accepted by the attention layer.
        :return: Tensor produced by attention, feed-forward, normalisation and dropout.
        """
        # residual connection around the attention layer, then normalise
        x = self.norm1(self.attention(x) + x)
        # residual connection around the feed-forward layers, then normalise
        x = self.norm2(self.ff(x) + x)
        return self.do(x)

![Slide9](presentation/Slide9.png)

In [4]:
def d(tensor=None):
    """
    Pick a device string.

    Without an argument the choice depends on whether CUDA is available;
    with an argument it depends on that object's ``is_cuda`` attribute.
    :param tensor: Optional tensor whose device type should be reported.
    :return: ``'cuda'`` or ``'cpu'``.
    """
    on_gpu = torch.cuda.is_available() if tensor is None else tensor.is_cuda
    if on_gpu:
        return 'cuda'
    return 'cpu'

class CTransformer(nn.Module):
    """
    Transformer for sentiment analysis: embeds a token sequence, runs it through
    a stack of transformer blocks, pools over the sequence and classifies.
    """

    def __init__(self, emb, heads, depth, seq_length, num_tokens, num_classes, max_pool=True, dropout=0.0):
        """
        :param emb: Embedding dimension
        :param heads: nr. of attention heads
        :param depth: Number of transformer blocks
        :param seq_length: Expected maximum sequence length
        :param num_tokens: Number of tokens (usually words) in the vocabulary
        :param num_classes: Number of classes.
        :param max_pool: If true, use global max pooling in the last layer. If false, use global
                         average pooling.
        :param dropout: Dropout probability, used on the embedded input and inside every block.
        """
        super().__init__()
        self.num_tokens, self.max_pool = num_tokens, max_pool

        self.token_embedding = nn.Embedding(embedding_dim=emb, num_embeddings=num_tokens)
        self.pos_embedding = nn.Embedding(embedding_dim=emb, num_embeddings=seq_length)

        tblocks = [
            TransformerBlock(emb=emb, heads=heads, mask=False, dropout=dropout)
            for _ in range(depth)
        ]

        self.tblocks = nn.Sequential(*tblocks)
        self.toprobs = nn.Linear(emb, num_classes)
        self.do = nn.Dropout(dropout)

    def forward(self, x):
        """
        :param x: A batch by sequence length integer tensor of token indices.
        :return: A batch by num_classes tensor of class log-probabilities (one vector per sequence).
        """

        tokens = self.token_embedding(x)
        b, t, e = tokens.size()

        # Create the position indices on the device the embedded tokens live on, so
        # the lookup also works when the model is kept on the CPU of a CUDA machine.
        position_ids = torch.arange(t, device=tokens.device)
        positions = self.pos_embedding(position_ids)[None, :, :].expand(b, t, e)

        x = tokens + positions
        x = self.do(x)
        x = self.tblocks(x)

        # pool over the sequence dimension to get one vector per sequence
        x = x.max(dim=1)[0] if self.max_pool else x.mean(dim=1)
        x = self.toprobs(x)

        return F.log_softmax(x, dim=1)

In [28]:
from experiments.classify import batch_sampler, collate_batch
from torchtext.datasets import IMDB
from torchtext.vocab import vocab
from collections import Counter
from torch.utils.data import DataLoader
from functools import partial
from torchtext.data.utils import get_tokenizer
from tqdm import tqdm
import random

# Tokenizer used for building the vocabulary and for turning reviews into token ids.
tokenizer = get_tokenizer('basic_english')
# Number of output classes of the classifier (labels are mapped to 0 and 1 in go()).
NUM_CLS = 2

def go(train_step=True, batch_size=4, max_length=512, embedding_size=128,
       num_heads=8, depth=6, vocab_size=50_000, max_pool=True,
       lr_warmup=10_000, num_epochs=80, gradient_clipping=1.0, lr=0.0001):
    """
    Creates and trains a basic Transformer for the IMDB sentiment classification task.

    :param train_step: If true, train on the IMDB train split and evaluate on the test
                       split. If false, shuffle the train split and hold out the last 20%
                       of it for validation.
    :param batch_size: Batch size handed to the batch sampler.
    :param max_length: Maximum sequence length; longer batches are truncated. A negative
                       value derives the limit from the training data instead.
    :param embedding_size: Embedding dimension of the model.
    :param num_heads: Number of attention heads.
    :param depth: Number of transformer blocks.
    :param vocab_size: Number of rows in the token embedding table.
    :param max_pool: Use global max pooling (true) or average pooling (false).
    :param lr_warmup: Warmup length; the learning rate ramps up linearly over
                      ``lr_warmup / batch_size`` optimizer steps.
    :param num_epochs: Number of passes over the training data.
    :param gradient_clipping: Maximum gradient norm; values <= 0 disable clipping.
    :param lr: Learning rate reached after warmup.
    :return: The trained model.
    """
    # Batches must live on the same device as the model, so fall back to the CPU
    # when no GPU is available instead of unconditionally using 'cuda'.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    if train_step:
        train, test = IMDB(split=('train', 'test'))

    else:
        tdata, _ = IMDB(split=('train', 'test'))
        tdata = list(tdata)
        random.shuffle(tdata)
        cut = int(len(tdata) * 0.8)
        train, test = tdata[:cut], tdata[cut:]

    # Materialise both splits once and reuse the lists everywhere below.
    train_list = list(train)
    test_list = list(test)

    counter = Counter()

    for (label, line) in train_list:
        counter.update(tokenizer(line))
    train_vocab = vocab(counter, min_freq=10, specials=('<unk>', '<BOS>', '<EOS>', '<PAD>'))
    train_vocab.set_default_index(train_vocab['<unk>'])

    label_transform = lambda x: 1 if x == 'pos' else 0
    text_transform = lambda x: [train_vocab['<BOS>']] + [train_vocab[token] for token in tokenizer(x)] + [train_vocab['<EOS>']]

    def make_loader(examples):
        # Build a fresh DataLoader (and a fresh batch sampler) over the given examples.
        return DataLoader(examples,
                          collate_fn=partial(collate_batch, label_transform, text_transform),
                          batch_sampler=batch_sampler(batch_size, tokenizer, examples))

    print(f'- nr. of training examples {len(train_list)}')
    print(f'- nr. of {"test" if train_step else "validation"} examples {len(test_list)}')

    if max_length < 0:
        # Twice the length of the longest raw training text (characters, not tokens).
        mx = max(len(text) for _, text in train_list)
        mx = mx * 2
        print(f'- maximum sequence length: {mx}')
    else:
        mx = max_length

    # create the model
    model = CTransformer(emb=embedding_size, heads=num_heads, depth=depth,
                         seq_length=mx, num_tokens=vocab_size, num_classes=NUM_CLS, max_pool=max_pool)
    model.to(device)

    opt = torch.optim.Adam(lr=lr, params=model.parameters())
    # linear learning-rate warmup, constant afterwards
    sch = torch.optim.lr_scheduler.LambdaLR(opt, lambda i: min(i / (lr_warmup / batch_size), 1.0))

    # training loop
    for e in range(num_epochs):
        # Dataloaders have to be created inside epoch loop so that generator starts back again from zero
        train_dataloader = make_loader(train_list)
        test_dataloader = make_loader(test_list)

        print(f'\n epoch {e}')
        print("Train Step")
        model.train(True)
        pbar_train = tqdm(total = len(train_list) / batch_size)
        for labels, tokens in train_dataloader:
            opt.zero_grad()
            labels = labels.to(device)
            tokens = tokens.to(device)

            # truncate sequences that are longer than the positional embedding table
            if tokens.size(1) > mx:
                tokens = tokens[:, :mx]
            out = model(tokens)
            loss = F.nll_loss(out, labels)

            loss.backward()

            # clip gradients
            # - If the total gradient vector has a length > gradient_clipping, we clip it back down.
            if gradient_clipping > 0.0:
                nn.utils.clip_grad_norm_(model.parameters(), gradient_clipping)

            opt.step()
            sch.step()

            pbar_train.update(1)
        pbar_train.close()

        print("Evaluation Step")
        pbar_eval = tqdm(total = len(test_list) / batch_size)
        with torch.no_grad():

            model.train(False)
            tot, cor = 0.0, 0.0
            for labels, tokens in test_dataloader:
                labels = labels.to(device)
                tokens = tokens.to(device)

                if tokens.size(1) > mx:
                    tokens = tokens[:, :mx]
                out = model(tokens).argmax(dim=1)

                tot += float(tokens.size(0))
                cor += float((labels == out).sum().item())
                pbar_eval.update(1)
            # an empty evaluation set yields nan instead of a ZeroDivisionError
            acc = cor / tot if tot else float('nan')
            print(f'-- {"test" if train_step else "validation"} accuracy {acc:.3}')
        pbar_eval.close()

    return model

In [38]:
# Train with batches of 8 for 10 epochs; every other setting keeps the default of go().
model = go(batch_size=8, num_epochs=10)

- nr. of training examples 25000
- nr. of test examples 25000

 epoch 0
Train Step


100%|██████████| 3125/3125.0 [01:15<00:00, 41.36it/s]


Evaluation Step


100%|██████████| 3125/3125.0 [00:20<00:00, 155.79it/s]


-- test accuracy 0.758

 epoch 1
Train Step


100%|██████████| 3125/3125.0 [01:17<00:00, 40.24it/s]


Evaluation Step


100%|██████████| 3125/3125.0 [00:23<00:00, 131.14it/s]


-- test accuracy 0.793

 epoch 2
Train Step


100%|██████████| 3125/3125.0 [01:28<00:00, 35.48it/s]


Evaluation Step


100%|██████████| 3125/3125.0 [00:23<00:00, 131.16it/s]


-- test accuracy 0.82

 epoch 3
Train Step


100%|██████████| 3125/3125.0 [01:28<00:00, 35.45it/s]


Evaluation Step


100%|██████████| 3125/3125.0 [00:24<00:00, 128.07it/s]


-- test accuracy 0.821

 epoch 4
Train Step


100%|██████████| 3125/3125.0 [01:28<00:00, 35.31it/s]


Evaluation Step


100%|██████████| 3125/3125.0 [00:23<00:00, 131.77it/s]


-- test accuracy 0.837

 epoch 5
Train Step


100%|██████████| 3125/3125.0 [01:28<00:00, 35.39it/s]


Evaluation Step


100%|██████████| 3125/3125.0 [00:23<00:00, 131.59it/s]


-- test accuracy 0.824

 epoch 6
Train Step


100%|██████████| 3125/3125.0 [01:29<00:00, 35.06it/s]


Evaluation Step


100%|██████████| 3125/3125.0 [00:23<00:00, 130.73it/s]


-- test accuracy 0.846

 epoch 7
Train Step


100%|██████████| 3125/3125.0 [01:28<00:00, 35.42it/s]


Evaluation Step


100%|██████████| 3125/3125.0 [00:23<00:00, 131.51it/s]


-- test accuracy 0.846

 epoch 8
Train Step


100%|██████████| 3125/3125.0 [01:28<00:00, 35.41it/s]


Evaluation Step


100%|██████████| 3125/3125.0 [00:23<00:00, 130.75it/s]


-- test accuracy 0.836

 epoch 9
Train Step


100%|██████████| 3125/3125.0 [01:28<00:00, 35.42it/s]


Evaluation Step


100%|██████████| 3125/3125.0 [00:23<00:00, 131.67it/s]


-- test accuracy 0.846


In [39]:
print("Model's state_dict:")
# One line per state_dict entry: its name followed by the size of its tensor.
for entry_name, entry_tensor in model.state_dict().items():
    print(entry_name, "\t", entry_tensor.size())

Model's state_dict:
token_embedding.weight 	 torch.Size([50000, 128])
pos_embedding.weight 	 torch.Size([512, 128])
tblocks.0.attention.tokeys.weight 	 torch.Size([128, 128])
tblocks.0.attention.toqueries.weight 	 torch.Size([128, 128])
tblocks.0.attention.tovalues.weight 	 torch.Size([128, 128])
tblocks.0.attention.unifyheads.weight 	 torch.Size([128, 128])
tblocks.0.attention.unifyheads.bias 	 torch.Size([128])
tblocks.0.norm1.weight 	 torch.Size([128])
tblocks.0.norm1.bias 	 torch.Size([128])
tblocks.0.norm2.weight 	 torch.Size([128])
tblocks.0.norm2.bias 	 torch.Size([128])
tblocks.0.ff.0.weight 	 torch.Size([512, 128])
tblocks.0.ff.0.bias 	 torch.Size([512])
tblocks.0.ff.2.weight 	 torch.Size([128, 512])
tblocks.0.ff.2.bias 	 torch.Size([128])
tblocks.1.attention.tokeys.weight 	 torch.Size([128, 128])
tblocks.1.attention.toqueries.weight 	 torch.Size([128, 128])
tblocks.1.attention.tovalues.weight 	 torch.Size([128, 128])
tblocks.1.attention.unifyheads.weight 	 torch.Size([128, 12

### Save model

In [35]:
import os
path_models = './models'
model_path = os.path.join(path_models, 'presentation_model_10epochs.pth')
# Make sure the target folder exists; saving fails if the directory is missing.
os.makedirs(path_models, exist_ok=True)
# Saves the whole module object (not only its state_dict), so loading it later
# requires the model classes to be defined under the same names.
torch.save(model, model_path)

### Load model

In [40]:
import os
path_models = './models'
model_path = os.path.join(path_models, 'presentation_model_10epochs.pth')

# NOTE(review): torch.load unpickles the file, which can execute arbitrary code;
# only load checkpoints from a trusted source.
# Without a GPU, remap the stored tensors to the CPU so that a checkpoint saved
# from a CUDA model can still be restored; with a GPU the default placement is kept.
load_location = None if torch.cuda.is_available() else 'cpu'
loaded_model = torch.load(model_path, map_location=load_location)
# switch to evaluation mode; as the last expression this also displays the model
loaded_model.eval()

CTransformer(
  (token_embedding): Embedding(50000, 128)
  (pos_embedding): Embedding(512, 128)
  (tblocks): Sequential(
    (0): TransformerBlock(
      (attention): SelfAttention(
        (tokeys): Linear(in_features=128, out_features=128, bias=False)
        (toqueries): Linear(in_features=128, out_features=128, bias=False)
        (tovalues): Linear(in_features=128, out_features=128, bias=False)
        (unifyheads): Linear(in_features=128, out_features=128, bias=True)
      )
      (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (ff): Sequential(
        (0): Linear(in_features=128, out_features=512, bias=True)
        (1): ReLU()
        (2): Linear(in_features=512, out_features=128, bias=True)
      )
      (do): Dropout(p=0.0, inplace=False)
    )
    (1): TransformerBlock(
      (attention): SelfAttention(
        (tokeys): Linear(in_features=128, out_features=128, bias=False)
        (to

In [41]:
print("Model's state_dict:")
# One line per state_dict entry of the restored model: name and tensor size.
for entry_name, entry_tensor in loaded_model.state_dict().items():
    print(entry_name, "\t", entry_tensor.size())

Model's state_dict:
token_embedding.weight 	 torch.Size([50000, 128])
pos_embedding.weight 	 torch.Size([512, 128])
tblocks.0.attention.tokeys.weight 	 torch.Size([128, 128])
tblocks.0.attention.toqueries.weight 	 torch.Size([128, 128])
tblocks.0.attention.tovalues.weight 	 torch.Size([128, 128])
tblocks.0.attention.unifyheads.weight 	 torch.Size([128, 128])
tblocks.0.attention.unifyheads.bias 	 torch.Size([128])
tblocks.0.norm1.weight 	 torch.Size([128])
tblocks.0.norm1.bias 	 torch.Size([128])
tblocks.0.norm2.weight 	 torch.Size([128])
tblocks.0.norm2.bias 	 torch.Size([128])
tblocks.0.ff.0.weight 	 torch.Size([512, 128])
tblocks.0.ff.0.bias 	 torch.Size([512])
tblocks.0.ff.2.weight 	 torch.Size([128, 512])
tblocks.0.ff.2.bias 	 torch.Size([128])
tblocks.1.attention.tokeys.weight 	 torch.Size([128, 128])
tblocks.1.attention.toqueries.weight 	 torch.Size([128, 128])
tblocks.1.attention.tovalues.weight 	 torch.Size([128, 128])
tblocks.1.attention.unifyheads.weight 	 torch.Size([128, 12

![Fin](https://i.imgflip.com/64xetv.gif "fin")