# Part 1 : Additive Attention from scratch

## Attention Mechanism Demo in PyTorch: Machine Translation Example (Many-to-Many, encoder-decoder)

In this demo, we show how to build a machine translator using PyTorch. The demo is inspired by Andrew Ng's deeplearning.ai course on sequence models (Programming Assignment: Neural Machine Translation with Attention). The translator converts dates written in various formats into dates in ISO format.

In [None]:
%matplotlib inline

import torchtext
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
!pip install pytorch_lightning
import pytorch_lightning as pl
from pytorch_lightning import Trainer

import random


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch_lightning
  Downloading pytorch_lightning-1.8.6-py3-none-any.whl (800 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.3/800.3 KB[0m [31m49.6 MB/s[0m eta [36m0:00:00[0m
Collecting torchmetrics>=0.7.0
  Downloading torchmetrics-0.11.0-py3-none-any.whl (512 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m512.4/512.4 KB[0m [31m44.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting lightning-utilities!=0.4.0,>=0.3.0
  Downloading lightning_utilities-0.5.0-py3-none-any.whl (18 kB)
Collecting tensorboardX>=2.2
  Downloading tensorboardX-2.5.1-py2.py3-none-any.whl (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.4/125.4 KB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorboardX, torchmetrics, lightning-utilities, pytorch_lightning
Successfully installed lightning-utilities-0.5.

## Generate Dataset
We generate a toy dataset using the `datetime` library. The target output always comes in a single format (ISO format), while an input can be written in any of three different date formats.

In [None]:
# Toy dataset: 15000 consecutive calendar days, counting backwards from today.
import datetime

base = datetime.date.today()
date_list = [base - datetime.timedelta(days=offset) for offset in range(15000)]

In [None]:
# Targets: every date rendered through date.isoformat(), e.g. "2023-01-06".
target_date_list = [day.isoformat() for day in date_list]
print(target_date_list[0])

2023-01-06


In [None]:
from random import randint

# Fixed seed so every run draws the same format for each date.
random.seed(42)

# strftime patterns, indexed by the number drawn for each date.
date_formats = (
    "%d/%m/%y",     # "11/03/02"
    "%A %d %B %Y",  # "Monday 11 March 2002"
    "%d %B %Y",     # "11 March 2002"
)
input_date_list = [day.strftime(date_formats[randint(0, 2)]) for day in date_list]

In [None]:
# Preview the first ten (input, target) pairs side by side.
preview = zip(input_date_list[:10], target_date_list[:10])
for source_text, iso_text in preview:
    print(source_text, iso_text)

06 January 2023 2023-01-06
05/01/23 2023-01-05
04/01/23 2023-01-04
03 January 2023 2023-01-03
Monday 02 January 2023 2023-01-02
01/01/23 2023-01-01
31/12/22 2022-12-31
30/12/22 2022-12-30
29 December 2022 2022-12-29
28/12/22 2022-12-28


In [None]:
# Preprocessing: character inventories of the source and target strings.
input_chars = [*set("".join(input_date_list))]
output_chars = [*set("".join(target_date_list))]

# Each vocabulary size reserves one extra slot for the padding token.
data_size = len(input_date_list)
vocab_size = len(input_chars) + 1
output_vocab_size = len(output_chars) + 1

print('There are %d lines and %d unique characters in your input data.' % (data_size, vocab_size))
maxlen = max(map(len, input_date_list))  # length of the longest input string

There are 15000 lines and 42 unique characters in your input data.


In [None]:
# Report the length of the longest input string (maxlen, computed during preprocessing).
print("Max input length:", maxlen)

Max input length: 27


In [None]:
# Index 0 of each vocabulary is reserved for "<PAD>"; the real characters
# follow in sorted order.
sorted_chars = ["<PAD>", *sorted(input_chars)]
sorted_output_chars = ["<PAD>", *sorted(output_chars)]

# Input-side vocabulary
input_vocab = torchtext.vocab.vocab({})
for token in sorted_chars:
    input_vocab.append_token(token)

# Output-side vocabulary
output_vocab = torchtext.vocab.vocab({})
for token in sorted_output_chars:
    output_vocab.append_token(token)

In [None]:
# Show each vocabulary as (token, index) pairs ordered by index.
for vocab in (input_vocab, output_vocab):
    print(sorted(vocab.get_stoi().items(), key=lambda pair: pair[1]))

[('<PAD>', 0), (' ', 1), ('/', 2), ('0', 3), ('1', 4), ('2', 5), ('3', 6), ('4', 7), ('5', 8), ('6', 9), ('7', 10), ('8', 11), ('9', 12), ('A', 13), ('D', 14), ('F', 15), ('J', 16), ('M', 17), ('N', 18), ('O', 19), ('S', 20), ('T', 21), ('W', 22), ('a', 23), ('b', 24), ('c', 25), ('d', 26), ('e', 27), ('g', 28), ('h', 29), ('i', 30), ('l', 31), ('m', 32), ('n', 33), ('o', 34), ('p', 35), ('r', 36), ('s', 37), ('t', 38), ('u', 39), ('v', 40), ('y', 41)]
[('<PAD>', 0), ('-', 1), ('0', 2), ('1', 3), ('2', 4), ('3', 5), ('4', 6), ('5', 7), ('6', 8), ('7', 9), ('8', 10), ('9', 11)]


In [None]:
# Dataset dimensions, derived from the data so they cannot drift out of sync
# with the generated lists.
m = len(input_date_list)       # number of examples
Tx = maxlen                    # input sequence length (longest input string)
Ty = len(target_date_list[0])  # output sequence length; date.isoformat() always yields "YYYY-MM-DD"

In [None]:
# Encode every string as a 1-D tensor of vocabulary indices, one per character.
X = [torch.tensor(input_vocab(list(text))) for text in input_date_list]
Y = [torch.tensor(output_vocab(list(text))) for text in target_date_list]

# Right-pad the inputs with 0 (the "<PAD>" index) up to a common length.
X = nn.utils.rnn.pad_sequence(X, batch_first=True)

In [None]:
# Padded input matrix: (number of examples, length of the longest input).
X.shape

torch.Size([15000, 27])

In [None]:
class DateDataset(Dataset):
    """Pairs each encoded input sequence with its encoded target sequence."""

    def __init__(self, X, y):
        # X is a tensor indexed per example; y is a sequence of equally
        # shaped tensors that get stacked into one tensor.
        self.encoded = X.to(torch.long)
        self.label = torch.stack(y).to(torch.long)

    def __len__(self):
        return len(self.encoded)

    def __getitem__(self, idx):
        sample = dict(x=self.encoded[idx], y=self.label[idx])
        return sample

In [None]:
class DateDataModule(pl.LightningDataModule):
    """Lightning data module that serves the date-translation training set."""

    def __init__(self, train_data, y, batch_size, num_workers=0):
        super().__init__()
        self.train_data, self.y = train_data, y
        self.batch_size, self.num_workers = batch_size, num_workers

    def setup(self, stage: str):
        # Nothing to prepare here: the dataset is built in train_dataloader.
        pass

    def collate_fn(self, batch):
        """Merge samples into one batch of one-hot float inputs and stacked targets."""
        width = len(input_vocab)
        inputs = [F.one_hot(sample["x"], num_classes=width) for sample in batch]
        targets = [sample["y"] for sample in batch]
        return {"x": torch.stack(inputs).float(), "y": torch.stack(targets)}

    def train_dataloader(self):
        """Return a shuffling DataLoader over a newly built DateDataset."""
        return DataLoader(
            DateDataset(self.train_data, self.y),
            batch_size=self.batch_size,
            shuffle=True,
            collate_fn=self.collate_fn,
            num_workers=self.num_workers,
        )


In [None]:
# Training batches hold 16 examples; num_workers=0 is forwarded to the DataLoader.
batch_size = 16
data_module = DateDataModule(X, Y, batch_size=batch_size,num_workers=0)

## Attention Mechanism
![attn_mech](https://raw.githubusercontent.com/ekapolc/nlp_2019/master/HW8/images/attn_mech.png)

In [None]:
def one_step_attention(h, s_prev, linear_1, linear_2):
    """Compute the additive-attention context vector for one decoding step.

    Args:
        h: Encoder outputs, shape (batch, seq_len, enc_dim).
        s_prev: Previous decoder hidden state, shape (batch, dec_dim).
        linear_1: Layer applied to the concatenation [h; s_prev], whose last
            dimension is enc_dim + dec_dim.
        linear_2: Layer mapping linear_1's output to one energy per position.

    Returns:
        Context vector of shape (batch, enc_dim): the attention-weighted sum
        of the encoder outputs over the sequence dimension.
    """
    seq_len = h.shape[1]
    # Broadcast the decoder state along the sequence axis. expand() returns a
    # view, so the state is not copied once per position as repeat() would do.
    s_prev = s_prev.unsqueeze(1).expand(-1, seq_len, -1)
    concat = torch.cat([h, s_prev], dim=-1)  # (batch, seq_len, enc_dim + dec_dim)

    # Attention function. torch.tanh replaces the deprecated F.tanh.
    e = torch.tanh(linear_1(concat))
    energies = F.relu(linear_2(e))  # (batch, seq_len, 1)
    # Normalise the energies over the sequence positions.
    attention_scores = F.softmax(energies, dim=1)
    # Weighted sum of encoder outputs; the scores broadcast over enc_dim.
    context = (attention_scores * h).sum(dim=1)

    return context


## The model
![rnn_model](https://raw.githubusercontent.com/ekapolc/nlp_2019/master/HW8/images/rnn_date.png)

In [None]:
class AttentionModel(pl.LightningModule):
    """LSTM encoder with an additive-attention LSTMCell decoder.

    The encoder reads the one-hot encoded input characters. At each of the Ty
    decoding steps an attention context vector is computed over all encoder
    outputs and fed to the decoder cell, whose hidden state is projected to
    logits over the output vocabulary.

    Args:
        learning_rate: Learning rate passed to the Adam optimizer.
        criterion: Loss called as criterion(logits, targets) in training_step.
    """

    def __init__(self, learning_rate, criterion):
        super().__init__()
        self.n_h = 32  # hidden dimensions for encoder (per direction)
        self.n_s = 64  # hidden dimensions for decoder

        self.learning_rate = learning_rate
        self.criterion = criterion

        # encoder
        bidirection = True
        self.num_directions = 2 if bidirection else 1
        # Width of each encoder output vector, and therefore of the context vector.
        encoder_out_dim = self.n_h * self.num_directions
        self.lstm = nn.LSTM(len(input_vocab), self.n_h, bidirectional=bidirection, batch_first=True)
        # decoder: its input is the context vector, so the input size is the
        # encoder output width (it only coincidentally equals n_s here).
        self.decoder_lstm_cell = nn.LSTMCell(encoder_out_dim, self.n_s)
        self.output_layer = nn.Linear(self.n_s, len(output_vocab))
        # attention: scores the concatenation [encoder output; decoder state]
        self.fc1 = nn.Linear(encoder_out_dim + self.n_s, self.n_h)
        self.fc2 = nn.Linear(self.n_h, 1)

    def forward(self, src):
        """Decode Ty output steps for a batch of inputs.

        Args:
            src: One-hot encoded inputs, shape (batch, seq_len, len(input_vocab)).

        Returns:
            Logits of shape (batch, Ty, len(output_vocab)).
        """
        lstm_out, _ = self.lstm(src)

        # Start the decoder from a zero state (same device and dtype as the
        # encoder output) so that the same input always gives the same output.
        batch_size = src.shape[0]
        decoder_s = lstm_out.new_zeros(batch_size, self.n_s)
        decoder_c = lstm_out.new_zeros(batch_size, self.n_s)

        step_logits = []
        # Iterate for Ty steps (decoding)
        for _ in range(Ty):
            # One step of the attention mechanism gives the context vector for this timestep
            context = one_step_attention(lstm_out, decoder_s, self.fc1, self.fc2)
            # Feed the context vector to the decoder LSTM cell
            decoder_s, decoder_c = self.decoder_lstm_cell(context, (decoder_s, decoder_c))
            # Project the decoder hidden state to output-vocabulary logits
            step_logits.append(self.output_layer(decoder_s))
        return torch.stack(step_logits, dim=1)

    def training_step(self, batch, batch_idx):
        """Compute and log the loss for one batch."""
        src = batch['x']
        target = batch['y']
        prediction = self(src)
        # Flatten (batch, Ty, vocab) and (batch, Ty) so the criterion sees one row per character.
        prediction = prediction.reshape(-1, len(output_vocab))
        target = target.reshape(-1)
        loss = self.criterion(prediction, target)
        self.log("train_loss", loss)
        return loss

    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        """Decode a batch greedily, print each result and return the strings.

        Returns:
            A list with one decoded string per example in the batch.
        """
        src = batch['x']
        with torch.no_grad():
            # argmax over logits picks the same index as argmax over softmax.
            token_ids = self(src).argmax(dim=-1)
        decoded = ["".join(output_vocab.lookup_tokens(ids.tolist())) for ids in token_ids]
        for text in decoded:
            print(text)
        return decoded

    def configure_optimizers(self):
        """Return an Adam optimizer over all model parameters."""
        return optim.Adam(self.parameters(), lr=self.learning_rate)

In [None]:
# Cross-entropy loss over output characters; 0.01 is the learning rate handed to the model's Adam optimizer.
criterion = nn.CrossEntropyLoss()
lr = 0.01
model = AttentionModel(lr, criterion)

In [None]:
# `gpus=` is deprecated (it raises the rank_zero_deprecation warning);
# accelerator/devices is the supported way to request a single GPU.
trainer = Trainer(
    max_epochs=10,
    accelerator="gpu",
    devices=1,
)

  rank_zero_deprecation(
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [None]:
# Train the model on the batches served by data_module.train_dataloader().
trainer.fit(model, data_module)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name              | Type             | Params
-------------------------------------------------------
0 | criterion         | CrossEntropyLoss | 0     
1 | lstm              | LSTM             | 19.5 K
2 | decoder_lstm_cell | LSTMCell         | 33.3 K
3 | output_layer      | Linear           | 780   
4 | fc1               | Linear           | 4.1 K 
5 | fc2               | Linear           | 33    
-------------------------------------------------------
57.7 K    Trainable params
0         Non-trainable params
57.7 K    Total params
0.231     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.


## Let's do some "translation"

In [None]:
EXAMPLES = ['Monday 15 March 2022', '3 May 1999', '05 October 2009', '30 August 2016', '11 July 2000', 'Saturday 19 May 2018', '3 March 2001', '1 March 2001']

# Encode each example as a tensor of input-vocabulary indices, one per character.
predict_data = [torch.tensor(input_vocab(list(text))) for text in EXAMPLES]
print(len(predict_data))


def collate_fn(batch):
    """Stack the samples' inputs as one-hot float tensors; targets are not included."""
    width = len(input_vocab)
    one_hot_rows = [F.one_hot(sample["x"], num_classes=width) for sample in batch]
    return {"x": torch.stack(one_hot_rows).float()}


# Pad to the longest example, then wrap everything in a dataset. DateDataset
# requires labels, so placeholders are supplied; collate_fn never reads them.
predict_data = nn.utils.rnn.pad_sequence(predict_data, batch_first=True)
placeholder_labels = [torch.tensor(0)] * len(predict_data)
predict_dataset = DateDataset(predict_data, placeholder_labels)
predict_loader = DataLoader(
    predict_dataset,
    batch_size=1,
    shuffle=False,
    collate_fn=collate_fn,
    num_workers=0,
)

8


In [None]:
# Run predict_step on every batch of predict_loader; predict_step prints each decoded date.
trainer.predict(model, predict_loader)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 938it [00:00, ?it/s]

2022-03-15
1999-05-33
2009-10-05
2016-08-30
2000-07-11
2011-05-19
2001-03-33
2001-03-11




[None, None, None, None, None, None, None, None]

# PART 2

In [None]:
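# TODO: add multi-head attention.
# Multi-head attention is most often used as self-attention, whereas this example
# uses encoder-decoder attention.
# NOTE(review): torch.nn.MultiheadAttention takes separate query, key and value
# inputs, so it appears usable for encoder-decoder attention as well -- verify.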