# Transformers
To follow what is going on below, first go through the slides / video lesson.

In [1]:
import torch 
from torch import nn
import torch.nn.functional as f
import numpy as np 

In [2]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Alias nn.Softmax under the "soft(arg)max" name used in this notebook.
nn_Softargmax = nn.Softmax  # fix wrong name

## Multi head attention

In [3]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are linearly projected to ``d_model`` features,
    split into ``num_heads`` heads of ``d_k = d_model // num_heads`` features
    each, attended over independently, concatenated again and passed through
    a final linear layer.

    Args:
        d_model: Feature size of the projections and of the output.
        num_heads: Number of attention heads; must divide ``d_model``.
        d_input: Optional ``(d_xq, d_xk, d_xv)`` feature sizes of the raw
            query / key / value inputs. When ``None`` all three are
            ``d_model``.
    """

    def __init__(self, d_model, num_heads, d_input=None):
        super().__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        if d_input is None:
            d_xq = d_xk = d_xv = d_model
        else:
            d_xq, d_xk, d_xv = d_input

        # The embedding dimension of the model must be a multiple of the
        # number of heads so that it can be split evenly
        assert d_model % self.num_heads == 0
        # Feature size of query and key inputs should match
        assert d_xq == d_xk
        self.d_k = d_model // self.num_heads

        # These still map to dimension d_model; the result is split into
        # num_heads chunks of size d_k afterwards
        self.W_q = nn.Linear(d_xq, d_model, bias=False)
        self.W_k = nn.Linear(d_xk, d_model, bias=False)
        self.W_v = nn.Linear(d_xv, d_model, bias=False)

        # Outputs of all sub-layers need to be of dimension d_model
        self.W_h = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V):
        """Attend from queries ``Q`` to keys ``K`` and average values ``V``.

        Args:
            Q: Queries, ``(..., q_length, dim)``.
            K: Keys, ``(..., k_length, dim)``.
            V: Values, ``(..., k_length, dim_v)``.

        Returns:
            ``(H, A)`` where ``A`` holds the attention weights
            ``(..., q_length, k_length)`` (each row sums to 1) and ``H`` is
            the weighted average of the values ``(..., q_length, dim_v)``.
        """
        # Scale by sqrt(d_k) so that the soft(arg)max doesn't saturate
        # (..., q_length, dim)
        Q = Q / np.sqrt(self.d_k)
        # Dot product of every query with every key
        # (..., q_length, k_length)
        scores = torch.matmul(Q, K.transpose(-2, -1))
        # Normalise over the keys
        # (..., q_length, k_length)
        A = torch.softmax(scores, dim=-1)

        # Get the weighted average of the values
        # (..., q_length, dim_v)
        H = torch.matmul(A, V)

        return H, A

    def split_heads(self, x, batch_size):
        """
        Split the last dimension (embedding dim) into (heads X depth)
        Return after transpose to put in shape
        (batch_size X num_heads X seq_length X d_k)
        """
        return x.view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)

    def group_heads(self, x, batch_size):
        """
        Combine the heads again to get
        (batch_size X seq_length X (num_heads * d_k))
        """
        return x.transpose(1, 2).contiguous().view(
            batch_size, -1, self.num_heads * self.d_k)

    def forward(self, X_q, X_k, X_v):
        """Apply multi-head attention.

        Args:
            X_q: Query input, ``(bs, q_length, d_xq)``.
            X_k: Key input, ``(bs, k_length, d_xk)``.
            X_v: Value input, ``(bs, k_length, d_xv)``.

        Returns:
            ``(H, A)``: the output ``(bs, q_length, d_model)`` and the
            per-head attention weights ``(bs, n_heads, q_length, k_length)``.
        """
        batch_size, _, _ = X_q.size()

        # After transforming, split into num_heads
        # (bs, n_heads, q_length, dim_per_head)
        Q = self.split_heads(self.W_q(X_q), batch_size)
        # (bs, n_heads, k_length, dim_per_head)
        K = self.split_heads(self.W_k(X_k), batch_size)
        # (bs, n_heads, k_length, dim_per_head)
        V = self.split_heads(self.W_v(X_v), batch_size)
        # Calculate the attention weights for each of the heads
        H_cat, A = self.scaled_dot_product_attention(Q, K, V)

        # Put all the heads back together by concat
        # (bs, q_length, dim)
        H_cat = self.group_heads(H_cat, batch_size)

        # Final linear layer
        # (bs, q_length, dim)
        H = self.W_h(H_cat)

        return H, A

### Some sanity checks:

In [4]:
# Attention module used only by the sanity checks below.
temp_mha = MultiHeadAttention(d_model=512, num_heads=8)


def print_out(Q, K, V):
    """Run scaled dot-product attention on (Q, K, V) and print the results."""
    output, weights = temp_mha.scaled_dot_product_attention(Q, K, V)
    print('Attention weights are:', weights.squeeze())
    print('Output is:', output.squeeze())

To check that our attention works: if the query matches exactly one of the keys, all the attention should be focused on that key, and the output should be the value stored at that index.

In [5]:
# Four keys of dimension 3; the third and fourth keys are identical.
# [None,None] prepends two singleton axes, giving a 4-D tensor.
test_K = torch.tensor(
    [[10, 0, 0],
     [ 0,10, 0],
     [ 0, 0,10],
     [ 0, 0,10]]
).float()[None,None]

# One value per key (same row order as test_K).
test_V = torch.tensor(
    [[   1,0,0],
     [  10,0,0],
     [ 100,5,0],
     [1000,6,0]]
).float()[None,None]

# A single query that is aligned with the second key only.
test_Q = torch.tensor(
    [[0, 10, 0]]
).float()[None,None]
print_out(test_Q, test_K, test_V)

Attention weights are: tensor([3.7266e-06, 9.9999e-01, 3.7266e-06, 3.7266e-06])
Output is: tensor([1.0004e+01, 4.0993e-05, 0.0000e+00])


Great! We can see that it focuses on the second key and returns the second value. 

If we give a query that matches two keys exactly, it should return the averaged value of the two values for those two keys. 

In [6]:
# A single query aligned with the (identical) third and fourth keys.
# [None,None] prepends two singleton axes so the query is 4-D like the
# keys and values, instead of relying on broadcasting of a 2-D tensor.
test_Q = torch.tensor(
    [[0, 0, 10]]
).float()[None,None]
print_out(test_Q, test_K, test_V)

Attention weights are: tensor([1.8633e-06, 1.8633e-06, 5.0000e-01, 5.0000e-01])
Output is: tensor([549.9979,   5.5000,   0.0000])


We see that it focuses equally on the third and fourth key and returns the average of their values.

Now giving all the queries at the same time:

In [7]:
# Three queries evaluated at once, one per row.
test_Q = torch.tensor(
    [[0, 0, 10], [0, 10, 0], [10, 10, 0]]
).float()[None,None]
print_out(test_Q, test_K, test_V)

Attention weights are: tensor([[1.8633e-06, 1.8633e-06, 5.0000e-01, 5.0000e-01],
        [3.7266e-06, 9.9999e-01, 3.7266e-06, 3.7266e-06],
        [5.0000e-01, 5.0000e-01, 1.8633e-06, 1.8633e-06]])
Output is: tensor([[5.5000e+02, 5.5000e+00, 0.0000e+00],
        [1.0004e+01, 4.0993e-05, 0.0000e+00],
        [5.5020e+00, 2.0497e-05, 0.0000e+00]])


## 1D convolution with `kernel_size = 1`

This is basically an MLP with one hidden layer and ReLU activation applied to each and every element in the set.

In [8]:
class CNN(nn.Module):
    """Two linear layers with a ReLU in between, applied to the last axis.

    Args:
        d_model: Size of the input and output features.
        hidden_dim: Size of the intermediate (hidden) features.
    """

    def __init__(self, d_model, hidden_dim):
        super().__init__()
        # d_model -> hidden_dim -> d_model
        self.k1convL1 = nn.Linear(d_model, hidden_dim)
        self.k1convL2 = nn.Linear(hidden_dim, d_model)
        self.activation = nn.ReLU()

    def forward(self, x):
        """Return ``k1convL2(ReLU(k1convL1(x)))``."""
        hidden = self.activation(self.k1convL1(x))
        return self.k1convL2(hidden)

## Transformer encoder

Now we have all the components needed for our Transformer encoder block!

In [9]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention and a feed-forward network.

    Each of the two sub-layers is wrapped in a residual connection followed
    by layer normalisation.

    Args:
        d_model: Feature size of the input and output.
        num_heads: Number of attention heads.
        conv_hidden_dim: Hidden size of the feed-forward network.
    """

    def __init__(self, d_model, num_heads, conv_hidden_dim):
        super().__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.cnn = CNN(d_model, conv_hidden_dim)

        self.layernorm1 = nn.LayerNorm(normalized_shape=d_model, eps=1e-6)
        self.layernorm2 = nn.LayerNorm(normalized_shape=d_model, eps=1e-6)

    def forward(self, x):
        """Map ``x`` of shape (batch_size, input_seq_len, d_model) to the same shape."""
        # Self-attention: x serves as query, key and value; the attention
        # weights are discarded
        attended, _ = self.mha(x, x, x)
        # Residual connection, then layer norm
        normed_attention = self.layernorm1(x + attended)

        # Feed forward, followed by the second residual + layer norm
        fed_forward = self.cnn(normed_attention)
        return self.layernorm2(normed_attention + fed_forward)

### Encoder 
#### Blocks of N Encoder Layers + Positional encoding + Input embedding

Self attention by itself does not have any recurrence or convolutions so to make it sensitive to position we must provide additional positional encodings. These are calculated as follows:

\begin{aligned}
E(p, 2i)    &= \sin(p / 10000^{2i / d}) \\
E(p, 2i+1) &= \cos(p / 10000^{2i / d})
\end{aligned}

In [10]:
def create_sinusoidal_embeddings(nb_p, dim, E):
    """Fill ``E`` in place with fixed sinusoidal positional encodings.

    For position ``p`` and column pair index ``i``::

        E[p, 2i]     = sin(p / 10000^(2i / dim))
        E[p, 2i + 1] = cos(p / 10000^(2i / dim))

    Args:
        nb_p: Number of positions (rows of ``E``).
        dim: Embedding dimension (columns of ``E``).
        E: Tensor of shape ``(nb_p, dim)`` to overwrite. Its
            ``requires_grad`` flag is switched off so the table stays fixed.

    Returns:
        None. ``E`` is modified in place and stays on its current device.
    """
    positions = np.arange(nb_p)[:, None]
    # Columns 2i and 2i+1 share the same frequency, hence the `// 2`.
    pair_index = np.arange(dim) // 2
    # (nb_p, dim) matrix of angles, computed in float64 like the reference
    # formula and only cast down when written into E.
    theta = positions / np.power(10000, 2 * pair_index / dim)

    # Must be disabled before the in-place writes: a leaf tensor that
    # requires grad cannot be modified in place.
    E.requires_grad = False
    E[:, 0::2] = torch.from_numpy(np.sin(theta[:, 0::2])).to(E.dtype)
    E[:, 1::2] = torch.from_numpy(np.cos(theta[:, 1::2])).to(E.dtype)

class Embeddings(nn.Module):
    """Sum of word and sinusoidal position embeddings, layer-normalised.

    Args:
        d_model: Embedding dimension.
        vocab_size: Number of rows of the word embedding table.
        max_position_embeddings: Number of rows of the position table.
    """

    def __init__(self, d_model, vocab_size, max_position_embeddings):
        super().__init__()
        # (vocab_size, d_model); index 1 is the padding index
        self.word_embeddings = nn.Embedding(vocab_size, d_model, padding_idx=1)
        # (max_position_embeddings, d_model); the weights are overwritten
        # with the sinusoidal table right below
        self.position_embeddings = nn.Embedding(
            max_position_embeddings, d_model)
        create_sinusoidal_embeddings(
            nb_p=max_position_embeddings,
            dim=d_model,
            E=self.position_embeddings.weight,
        )

        self.LayerNorm = nn.LayerNorm(d_model, eps=1e-12)

    def forward(self, input_ids):
        """Embed ``input_ids`` of shape (bs, seq_length) into (bs, seq_length, dim)."""
        n_positions = input_ids.size(1)
        # 0 .. seq_length-1, repeated for every row of the batch
        # (bs, seq_length)
        positions = torch.arange(
            n_positions, dtype=torch.long, device=input_ids.device)
        positions = positions.unsqueeze(0).expand_as(input_ids)

        # Word embedding plus position embedding for every token
        # (bs, seq_length, dim)
        summed = (
            self.word_embeddings(input_ids)
            + self.position_embeddings(positions)
        )

        # Layer norm keeps the shape
        return self.LayerNorm(summed)

In [11]:
class Encoder(nn.Module):
    """Input embeddings followed by a stack of ``num_layers`` encoder layers.

    Args:
        num_layers: Number of stacked ``EncoderLayer`` blocks.
        d_model: Feature size used throughout the encoder.
        num_heads: Number of attention heads per layer.
        ff_hidden_dim: Hidden size of each layer's feed-forward network.
        input_vocab_size: Size of the word embedding table.
        maximum_position_encoding: Size of the position embedding table.
    """

    def __init__(
            self, num_layers, d_model, num_heads, ff_hidden_dim,
            input_vocab_size, maximum_position_encoding):
        super().__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = Embeddings(
            d_model, input_vocab_size, maximum_position_encoding)

        self.enc_layers = nn.ModuleList(
            EncoderLayer(d_model, num_heads, ff_hidden_dim)
            for _ in range(num_layers)
        )

    def forward(self, x):
        """Encode token ids into (batch_size, input_seq_len, d_model) features."""
        # Transform to (batch_size, input_seq_length, d_model)
        hidden = self.embedding(x)

        # Apply the layers in order; each keeps the shape
        for layer in self.enc_layers:
            hidden = layer(hidden)
        return hidden

In [12]:
from collections import Counter
from functools import partial
from torchtext.datasets import IMDB, NUM_LINES
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

In [16]:
# Number of reviews per mini-batch.
batch_size = 20
# Vocabulary size cap (passed as max_tokens when the vocabulary is built).
num_words = 20_000
# Number of samples in the IMDB test split, as reported by torchtext.
num_test = NUM_LINES['IMDB']['test']
# Half of the test split. NOTE(review): not used in the cells shown here.
split_num =  num_test // 2

In [13]:
# NOTE(review): max_len is not used in the cells shown here.
max_len = 200
# Load the IMDB train and test splits, using the current directory as root.
ds_train_int, ds_test_int = IMDB(root='./', split=('train', 'test'))

In [42]:
# Shuffle the training stream so consecutive samples mix both labels.
ds_train = ds_train_int.shuffle(buffer_size=20000)

In [43]:
# Sanity check on the shuffled training stream: group consecutive labels into
# batches of `batch_size` and record how mixed each batch is.
sentiments = []      # mean label of every complete batch
zero = 0             # batches made up of negative labels only
one = 0              # batches made up of positive labels only
other = 0            # batches containing both labels
max_zero_run = 0     # longest streak of consecutive all-negative batches
max_one_run = 0      # longest streak of consecutive all-positive batches
zero_run = 0
one_run = 0

it_ = iter(ds_train)
batch_size = 20
train_len = NUM_LINES['IMDB']['train']
batch = []

for x, _ in it_:
    # Every label is collected; a batch is evaluated once it is full.
    batch.append(0 if x == 'neg' else 1)
    if len(batch) < batch_size:
        continue

    avg = np.mean(batch)
    # The mean of 0/1 labels must lie in [0, 1]; anything else is a bug.
    if not (0 <= avg <= 1):
        print(f'i: {len(batch)} batch: {batch}')
        break

    if avg == 0:
        zero += 1
        zero_run += 1
        one_run = 0
    elif avg == 1:
        one += 1
        one_run += 1
        zero_run = 0
    else:
        # A mixed batch ends any streak of single-label batches.
        other += 1
        zero_run = 0
        one_run = 0
    # Updating the maxima on every batch also covers a streak that is still
    # running when the data ends.
    max_zero_run = max(max_zero_run, zero_run)
    max_one_run = max(max_one_run, one_run)
    sentiments.append(avg)

    batch = []
# A trailing batch with fewer than `batch_size` labels is not evaluated.
print(f'Max Zero Run: {max_zero_run} Max One Run: {max_one_run} Other: {other}')
#print(f'Mean batch sentiment: {np.mean(sentiments)}')

Max Zero Run: 0 Max One Run: 0 Other: 1250
Mean batch sentiment: 0.4996631578947368


In [44]:
# Number of batches whose mean label was recorded by the check above.
len(sentiments)


1250

In [45]:
from torchdata.datapipes.iter import IterableWrapper

def my_ints():
    """Yield the integers 0 through 9 in increasing order."""
    yield from range(10)
# Small demo: wrap the generator in a datapipe and shuffle it.
dp = IterableWrapper(my_ints())
shuffle_dp = dp.shuffle()
# Materialise the shuffled datapipe to inspect the order.
list(shuffle_dp)



[1, 3, 8, 5, 0, 6, 7, 2, 4, 9]

In [46]:
from typing import Tuple

In [47]:
class Splitter:
    """Stateful classifier that labels elements by their arrival order.

    Every call increments an internal counter. The call returns 1 when the
    counter value before the increment, taken modulo ``num_classes``, equals
    ``inst_id``, and 0 otherwise.

    Args:
        total: Stored on the instance; not read by ``__call__``.
        num_classes: Modulus applied to the call counter.
    """

    def __init__(self, total : int, num_classes : int):
        self.count = 0
        self.total = total
        self.num_classes = num_classes

    def __call__(self, inst_id : int, elem : Tuple) -> int:
        """Return 1 if this call's slot matches ``inst_id``, else 0."""
        del elem  # the element's content plays no role in the decision
        slot = self.count % self.num_classes
        self.count += 1
        return 1 if slot == inst_id else 0

# Split the test stream into two child datapipes (validation and test).
# partial(splitter, 1) fixes inst_id=1, so the classifier returns 0 and 1
# alternately for successive elements.
splitter = Splitter(num_test, 2)
ds_valid, ds_test = ds_test_int.demux(
    num_instances=2, classifier_fn=partial(splitter, 1), buffer_size=20000)

In [48]:
# Inspect the type of the datapipe returned by demux.
type(ds_valid)

torch.utils.data.datapipes.iter.combining._ChildDataPipe

In [49]:
# spaCy-backed tokenizer using the small English model.
tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

In [50]:
# Token frequencies over the whole training stream.
counter = Counter(
    token
    for _, line in ds_train
    for token in tokenizer(line)
)

In [51]:
# Peek at the token ranked just past the num_words most frequent ones.
counter.most_common()[num_words:num_words+1]

[('woulda', 12)]

In [52]:
def yield_tokens(stream_iter):
    """Yield the tokenized text of every (label, text) pair in ``stream_iter``."""
    for _label, text in stream_iter:
        tokens = tokenizer(text)
        yield tokens

In [53]:
# Build the vocabulary from the training tokens, capped at num_words entries
# and including the four special tokens listed below.
vocab_obj = build_vocab_from_iterator(
    yield_tokens(stream_iter=ds_train), max_tokens=num_words,
    specials=['<unk>', '<pad>', '<bos>', '<eos>'])


In [54]:

# Tokens missing from the vocabulary map to the index of '<unk>'.
vocab_obj.set_default_index(vocab_obj["<unk>"])

In [55]:
# Quick inspection of the vocabulary: its size and both lookup tables.
print("The length of the new vocab is", len(vocab_obj))
# token -> index
new_stoi = vocab_obj.get_stoi()
#print("The index of '' is", new_stoi[''])
# index -> token
new_itos = vocab_obj.get_itos()
print("The token at index 2 is", new_itos[2])

The length of the new vocab is 20000
The token at index 2 is <bos>


In [57]:
# Indices of the special tokens, looked up once for the transforms below.
BOS_IDX = vocab_obj['<bos>']
EOS_IDX = vocab_obj['<eos>']
PAD_IDX = vocab_obj['<pad>']
UNK_IDX = vocab_obj['<unk>']

# Earlier BucketIterator-based loading, kept for reference only:
#train_loader, valid_loader, test_loader = data.BucketIterator.splits(
#    (ds_train, ds_valid, ds_test), batch_size=batch_size, sort_key=lambda x: len(x.text), repeat=False)

In [58]:
def text_transform(vocab, line):
    """Tokenize ``line`` and return its vocab ids wrapped in BOS ... EOS."""
    token_ids = [vocab[token] for token in tokenizer(line)]
    return [BOS_IDX, *token_ids, EOS_IDX]

def label_transform(label):
    """Map the label 'pos' to 1 and any other label to 0."""
    if label == 'pos':
        return 1
    return 0

def collate_batch(vocab, batch):
    """Turn a list of (label, text) pairs into padded tensors on ``device``.

    Returns:
        A pair ``(texts, labels)``: ``texts`` holds the token ids, padded
        with ``PAD_IDX`` to the longest sequence and laid out batch-first;
        ``labels`` is a float tensor with one row per sample and one column.
    """
    labels, sequences = [], []
    for raw_label, raw_text in batch:
        labels.append(label_transform(raw_label))
        sequences.append(torch.tensor(text_transform(vocab, raw_text)))
    padded = pad_sequence(sequences, padding_value=PAD_IDX, batch_first=True)
    # Column vector of float targets
    targets = torch.tensor(labels).unsqueeze(1).type(torch.FloatTensor)
    return padded.to(device), targets.to(device)

# Bind the vocabulary so the result takes only the batch, as DataLoader's
# collate_fn expects.
collate_batch_vocab = partial(collate_batch, vocab_obj)

In [89]:
# One loader per split; all three batch and collate the same way.
# The training datapipe was already shuffled when ds_train was created.
train_loader = DataLoader(
    ds_train, batch_size=batch_size, shuffle=None,
    collate_fn=collate_batch_vocab)
valid_loader = DataLoader(
    ds_valid, batch_size=batch_size, shuffle=False,
    collate_fn=collate_batch_vocab)
test_loader = DataLoader(
    ds_test, batch_size=batch_size, shuffle=False,
    collate_fn=collate_batch_vocab)

In [91]:
# Same label-balance check as before, this time on the batches actually
# produced by the training DataLoader.
sentiments = []      # mean label of every batch
zero = 0             # batches made up of negative labels only
one = 0              # batches made up of positive labels only
other = 0            # batches containing both labels
max_zero_run = 0     # longest streak of consecutive all-negative batches
max_one_run = 0      # longest streak of consecutive all-positive batches
zero_run = 0
one_run = 0

# next(iter(valid_loader))
for _, y in train_loader:
    avg = y.cpu().numpy().mean()
    if avg == 0:
        zero += 1
        zero_run += 1
        one_run = 0
    elif avg == 1:
        one += 1
        one_run += 1
        zero_run = 0
    else:
        # A mixed batch ends any streak of single-label batches.
        other += 1
        zero_run = 0
        one_run = 0
    # Updating the maxima on every batch also covers a streak that is still
    # running when the loader is exhausted.
    max_zero_run = max(max_zero_run, zero_run)
    max_one_run = max(max_one_run, one_run)
    sentiments.append(avg)
print(f'Max Zero Run: {max_zero_run} Max One Run: {max_one_run} Other: {other}')

Max Zero Run: 0 Max One Run: 0 Other: 1250


In [92]:
def count_batches(loader):
    """Return how many items a full pass over ``loader`` yields."""
    return sum(1 for _ in loader)

In [93]:
# Ceiling division: batches needed to cover the whole training split.
est_train_batches = (NUM_LINES['IMDB']['train'] + (batch_size-1)) // batch_size
# NOTE(review): the real count is commented out, so the "actual" figure
# printed below is simply the estimate again.
# train_batches = count_batches(train_loader)
train_batches = est_train_batches
print(f'Estimated training batches: {est_train_batches} '
    f'actual training batches {train_batches}')

Estimated training batches: 1250 actual training batches 1250


In [94]:
# Ceiling division over half of the IMDB test split.
est_test_batches = ((NUM_LINES['IMDB']['test'] // 2) + (batch_size-1)) \
    // batch_size
# NOTE(review): the real count is commented out, so the "actual" figure
# printed below is simply the estimate again.
#test_batches = count_batches(test_loader)
test_batches = est_test_batches
print(f'Estimated test batches: {est_test_batches} '
    f'actual test batches {test_batches}')

Estimated test batches: 625 actual test batches 625


In [95]:
# Ceiling division over half of the IMDB test split.
est_valid_batches = ((NUM_LINES['IMDB']['test'] // 2) + (batch_size-1)) \
    // batch_size
# NOTE(review): the real count is commented out, so the "actual" figure
# printed below is simply the estimate again.
# valid_batches = count_batches(valid_loader)
valid_batches = est_valid_batches
print(f'Estimated valid batches: {est_valid_batches} '
    f'actual valid batches {valid_batches}')

Estimated valid batches: 625 actual valid batches 625


In [96]:
class TransformerClassifier(nn.Module):
    """Transformer encoder, max-pooling over the sequence, and a linear head.

    Args:
        num_layers: Number of encoder layers.
        d_model: Feature size of the encoder.
        num_heads: Number of attention heads per layer.
        conv_hidden_dim: Hidden size of each feed-forward network.
        input_vocab_size: Size of the word embedding table.
        num_answers: Number of outputs of the final linear layer.
    """

    def __init__(
            self, num_layers, d_model, num_heads, conv_hidden_dim,
            input_vocab_size, num_answers):
        super().__init__()

        self.encoder = Encoder(
            num_layers, d_model, num_heads, conv_hidden_dim, input_vocab_size,
            maximum_position_encoding=10000)
        self.dense = nn.Linear(d_model, num_answers)

    def forward(self, x):
        """Return the output of the linear head for token ids ``x``."""
        encoded = self.encoder(x)
        # Element-wise maximum over dim 1; the argmax indices are dropped
        pooled = encoded.max(dim=1).values
        return self.dense(pooled)

In [97]:
# A small single-layer model with one output logit.
model = TransformerClassifier(
    num_layers=1, d_model=32, num_heads=2, conv_hidden_dim=128,
    input_vocab_size=num_words, num_answers=1)
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# Operates on raw logits, so the model itself applies no sigmoid.
criterion = torch.nn.BCEWithLogitsLoss()

In [108]:
# Number of epochs run by train().
epochs = 5

In [109]:
def evaluate(data_loader, nb_batches):
    """Print the mean per-batch accuracy of the global ``model``.

    Args:
        data_loader: Iterable of ``(x, y)`` batches; ``y`` holds 0/1 targets.
        nb_batches: Number of batches used to average the batch accuracies.

    Side effects:
        Puts ``model`` in evaluation mode and prints the accuracy.
    """
    model.eval()
    acc = 0
    # Inference only: without autograd no graph is built for each batch,
    # which saves memory and time.
    with torch.no_grad():
        for x, y in data_loader:
            out = model(x)
            # A sigmoid above 0.5 counts as a positive prediction
            acc += ((torch.sigmoid(out) > 0.5) == y).cpu().numpy().mean()

    print(f"Eval accuracy: {acc / nb_batches}")

In [110]:
def train(
    train_loader, nb_batches_train,
    valid_loader, nb_batches_valid):
    """Train the global ``model`` for ``epochs`` epochs.

    After every epoch the mean training loss and accuracy are printed and
    the model is evaluated on ``valid_loader``.

    Args:
        train_loader: Iterable of ``(x, y)`` training batches.
        nb_batches_train: Number of batches used to average loss / accuracy.
        valid_loader: Iterable of ``(x, y)`` validation batches.
        nb_batches_valid: Batch count passed on to ``evaluate``.
    """
    for epoch in range(epochs):
        batches = iter(train_loader)
        running_acc = 0
        model.train()
        running_loss = 0.0
        for inputs, targets in batches:
            logits = model(inputs)  # forward pass
            batch_loss = criterion(logits, targets)  # loss on raw logits
            model.zero_grad()  # clear gradients from the previous step
            batch_loss.backward()  # back-propagate
            running_loss += batch_loss.item()
            optimizer.step()  # update the parameters
            # Accuracy uses the logits computed before this update
            predictions = torch.sigmoid(logits) > 0.5
            running_acc += (predictions == targets).cpu().numpy().mean()

        print(f"Training loss at epoch {epoch} is {running_loss / nb_batches_train}")
        print(f"Training accuracy: {running_acc / nb_batches_train}")
        print('Evaluating on validation:')
        evaluate(valid_loader, nb_batches_valid)

In [111]:
# Run the training loop, validating after every epoch.
train(train_loader, train_batches, valid_loader, valid_batches)

Training loss at epoch 0 is 0.4321249868035317
Training accuracy: 0.8009599999999995
Evaluating on validation:
Eval accuracy: 0.8062400000000002
Training loss at epoch 1 is 0.33005270760655403
Training accuracy: 0.8579599999999998
Evaluating on validation:




Eval accuracy: 0.8219199999999999
Training loss at epoch 2 is 0.26211003826856616
Training accuracy: 0.8938000000000027
Evaluating on validation:
Eval accuracy: 0.8419199999999993
Training loss at epoch 3 is 0.2108477133527398
Training accuracy: 0.9172400000000058
Evaluating on validation:
Eval accuracy: 0.84304
Training loss at epoch 4 is 0.16937405742555856
Training accuracy: 0.9364800000000066
Evaluating on validation:
Eval accuracy: 0.8284799999999981


In [112]:
# Accuracy on the test loader, which was not used during training.
evaluate(test_loader, test_batches)

Eval accuracy: 0.8292799999999972


In [113]:
from itertools import islice

In [114]:
# Inspect one training batch: skip the first `idx` batches, take the next.
idx = 100
it_ = iter(train_loader)
it_ = islice(it_, idx, idx+1)
items = next(it_)
# Token ids of the first review in the batch, with padding removed.
toks = [t for t in items[0][0] if t != PAD_IDX]
# Map the ids back to their tokens.
toks = vocab_obj.lookup_tokens(toks)
print(' '.join(toks))
# Label of that same review.
print(f'\nReview: {items[1][0]}')
# review_sum = sum([i for _, y in iter(valid_loader) for i in y])
# print(f'Number positive reviews : {review_sum}')

<bos> I took this out <unk> from the library the other night , having no idea of the film 's cult , influence , or that it is currently being staged as a musical . ( ! ) Most of the comments here are on target , it 's moving , funny , sad , and yes , a tad exploitive despite the best intentions of the filmmakers . The expanded <unk> edition is a must for anyone who loved it when it came out . < br /><br />I think you can also see in little Edie the fall of a class that sort of disappeared , you can hear it in old films of Jackie O too ; people just do n't talk like that anymore . I think as a documentary , it would have been interesting to get more information about how the home fell into <unk> , Old Edie at least still seems aware of what 's going on to a certain degree ; could n't She see the once spectacular home <unk> ? < br /><br />Yet the film 's subject is the life the two women have constructed for themselves now , a real life <unk> Williams one act . Well worth your time . <eo

In [115]:
# items = next(iter(train_loader))
model.eval()
with torch.no_grad():
    out = model(items[0])
    sig_out = torch.sigmoid(out)
    print(torch.cat((sig_out, items[1]), dim=1))
    print(((sig_out > 0.5) == items[1]).cpu().numpy().mean())


tensor([[9.1716e-01, 1.0000e+00],
        [1.3316e-01, 0.0000e+00],
        [6.8777e-02, 0.0000e+00],
        [9.9864e-01, 1.0000e+00],
        [3.9816e-04, 0.0000e+00],
        [9.9287e-01, 1.0000e+00],
        [3.7836e-02, 0.0000e+00],
        [1.0492e-03, 0.0000e+00],
        [5.2196e-01, 1.0000e+00],
        [9.9052e-01, 1.0000e+00],
        [5.6175e-04, 0.0000e+00],
        [5.9922e-03, 0.0000e+00],
        [9.8571e-01, 1.0000e+00],
        [3.1491e-02, 0.0000e+00],
        [3.3671e-02, 0.0000e+00],
        [1.9651e-02, 0.0000e+00],
        [4.0941e-01, 0.0000e+00],
        [9.9138e-01, 1.0000e+00],
        [5.5386e-01, 1.0000e+00],
        [6.6777e-01, 1.0000e+00]], device='cuda:0')
1.0
