In [1]:
from data_loaders_transformer import train_iterator, test_iterator, valid_iterator
from bleu import bleu_ignore_eos

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import torchtext
from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator
from tensorboardX import SummaryWriter

import random
import math
import os
import time

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_loader, vocab = train_iterator(device=device, batch_size=32)
test_iterator, vocab_test = test_iterator(device=device, batch_size=32)

In [3]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, kernel_size, dropout, device):
        super().__init__()
        
        assert kernel_size % 2 == 1, "Kernel size must be odd!"
        
        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.kernel_size = kernel_size
        self.dropout = dropout
        self.device = device
        
        self.scale = torch.sqrt(torch.FloatTensor([0.5])).to(device)
        
        self.tok_embedding = nn.Embedding(input_dim, emb_dim)
        self.pos_embedding = nn.Embedding(100, emb_dim)
        
        self.emb2hid = nn.Linear(emb_dim, hid_dim)
        self.hid2emb = nn.Linear(hid_dim, emb_dim)
        
        self.convs = nn.ModuleList([nn.Conv1d(in_channels = hid_dim, 
                                              out_channels = 2 * hid_dim, 
                                              kernel_size = kernel_size, 
                                              padding = (kernel_size - 1) // 2)
                                    for _ in range(n_layers)])
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        
        #src = [batch size, src sent len]
        
        #create position tensor
        pos = torch.arange(0, src.shape[1]).unsqueeze(0).repeat(src.shape[0], 1).to(self.device)
        
        #pos = [batch size, src sent len]
        
        #embed tokens and positions
        tok_embedded = self.tok_embedding(src)
        pos_embedded = self.pos_embedding(pos)
        
        #tok_embedded = pos_embedded = [batch size, src sent len, emb dim]
        
        #combine embeddings by elementwise summing
        embedded = self.dropout(tok_embedded + pos_embedded)
        
        #embedded = [batch size, src sent len, emb dim]
        
        #pass embedded through linear layer to go through emb dim -> hid dim
        conv_input = self.emb2hid(embedded)
        
        #conv_input = [batch size, src sent len, hid dim]
        
        #permute for convolutional layer
        conv_input = conv_input.permute(0, 2, 1) 
        
        #conv_input = [batch size, hid dim, src sent len]
        
        for i, conv in enumerate(self.convs):
        
            #pass through convolutional layer
            conved = conv(self.dropout(conv_input))

            #conved = [batch size, 2*hid dim, src sent len]

            #pass through GLU activation function
            conved = F.glu(conved, dim = 1)

            #conved = [batch size, hid dim, src sent len]
            
            #apply residual connection
            conved = (conved + conv_input) * self.scale

            #conved = [batch size, hid dim, src sent len]
            
            #set conv_input to conved for next loop iteration
            conv_input = conved
        
        #permute and convert back to emb dim
        conved = self.hid2emb(conved.permute(0, 2, 1))
        
        #conved = [batch size, src sent len, emb dim]
        
        #elementwise sum output (conved) and input (embedded) to be used for attention
        combined = (conved + embedded) * self.scale
        
        #combined = [batch size, src sent len, emb dim]
        
        return conved, combined

In [4]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, kernel_size, dropout, pad_idx, device):
        super().__init__()
        
        self.output_dim = output_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.kernel_size = kernel_size
        self.dropout = dropout
        self.pad_idx = pad_idx
        self.device = device
        
        self.scale = torch.sqrt(torch.FloatTensor([0.5])).to(device)
        
        self.tok_embedding = nn.Embedding(output_dim, emb_dim)
        self.pos_embedding = nn.Embedding(100, emb_dim)
        
        self.emb2hid = nn.Linear(emb_dim, hid_dim)
        self.hid2emb = nn.Linear(hid_dim, emb_dim)
        
        self.attn_hid2emb = nn.Linear(hid_dim, emb_dim)
        self.attn_emb2hid = nn.Linear(emb_dim, hid_dim)
        
        self.out = nn.Linear(emb_dim, output_dim)
        
        self.convs = nn.ModuleList([nn.Conv1d(hid_dim, 2*hid_dim, kernel_size)
                                    for _ in range(n_layers)])
        
        self.dropout = nn.Dropout(dropout)
      
    def calculate_attention(self, embedded, conved, encoder_conved, encoder_combined):
        
        #embedded = [batch size, trg sent len, emb dim]
        #conved = [batch size, hid dim, trg sent len]
        #encoder_conved = encoder_combined = [batch size, src sent len, emb dim]
        
        #permute and convert back to emb dim
        conved_emb = self.attn_hid2emb(conved.permute(0, 2, 1))
        
        #conved_emb = [batch size, trg sent len, emb dim]
        
        combined = (embedded + conved_emb) * self.scale
        
        #combined = [batch size, trg sent len, emb dim]
                
        energy = torch.matmul(combined, encoder_conved.permute(0, 2, 1))
        
        #energy = [batch size, trg sent len, src sent len]
        
        attention = F.softmax(energy, dim=2)
        
        #attention = [batch size, trg sent len, src sent len]
            
        attended_encoding = torch.matmul(attention, (encoder_conved + encoder_combined))
        
        #attended_encoding = [batch size, trg sent len, emd dim]
        
        #convert from emb dim -> hid dim
        attended_encoding = self.attn_emb2hid(attended_encoding)
        
        #attended_encoding = [batch size, trg sent len, hid dim]
        
        attended_combined = (conved + attended_encoding.permute(0, 2, 1)) * self.scale
        
        #attended_combined = [batch size, hid dim, trg sent len]
        
        return attention, attended_combined
        
    def forward(self, trg, encoder_conved, encoder_combined):
        
        #trg = [batch size, trg sent len]
        #encoder_conved = encoder_combined = [batch size, src sent len, emb dim]
                
        #create position tensor
        pos = torch.arange(0, trg.shape[1]).unsqueeze(0).repeat(trg.shape[0], 1).to(device)
        
        #pos = [batch size, trg sent len]
        
        #embed tokens and positions
        tok_embedded = self.tok_embedding(trg)
        pos_embedded = self.pos_embedding(pos)
        
        #tok_embedded = [batch size, trg sent len, emb dim]
        #pos_embedded = [batch size, trg sent len, emb dim]
        
        #combine embeddings by elementwise summing
        embedded = self.dropout(tok_embedded + pos_embedded)
        
        #embedded = [batch size, trg sent len, emb dim]
        
        #pass embedded through linear layer to go through emb dim -> hid dim
        conv_input = self.emb2hid(embedded)
        
        #conv_input = [batch size, trg sent len, hid dim]
        
        #permute for convolutional layer
        conv_input = conv_input.permute(0, 2, 1) 
        
        #conv_input = [batch size, hid dim, trg sent len]
        
        for i, conv in enumerate(self.convs):
        
            #apply dropout
            conv_input = self.dropout(conv_input)
        
            #need to pad so decoder can't "cheat"
            padding = torch.zeros(conv_input.shape[0], conv_input.shape[1], self.kernel_size-1).fill_(self.pad_idx).to(device)
            padded_conv_input = torch.cat((padding, conv_input), dim=2)
        
            #padded_conv_input = [batch size, hid dim, trg sent len + kernel size - 1]
        
            #pass through convolutional layer
            conved = conv(padded_conv_input)

            #conved = [batch size, 2*hid dim, trg sent len]
            
            #pass through GLU activation function
            conved = F.glu(conved, dim=1)

            #conved = [batch size, hid dim, trg sent len]
            
            attention, conved = self.calculate_attention(embedded, conved, encoder_conved, encoder_combined)
            
            #attention = [batch size, trg sent len, src sent len]
            #conved = [batch size, hid dim, trg sent len]
            
            #apply residual connection
            conved = (conved + conv_input) * self.scale
            
            #conved = [batch size, hid dim, trg sent len]
            
            #set conv_input to conved for next loop iteration
            conv_input = conved
            
        conved = self.hid2emb(conved.permute(0, 2, 1))
         
        #conved = [batch size, trg sent len, hid dim]
            
        output = self.out(self.dropout(conved))
        
        #output = [batch size, trg sent len, output dim]
            
        return output, attention

In [5]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
    def forward(self, src, trg):
        
        #src = [batch size, src sent len]
        #trg = [batch size, trg sent len]
           
        #calculate z^u (encoder_conved) and e (encoder_combined)
        #encoder_conved is output from final encoder conv. block
        #encoder_combined is encoder_conved plus (elementwise) src embedding plus positional embeddings 
        encoder_conved, encoder_combined = self.encoder(src)
            
        #encoder_conved = [batch size, src sent len, emb dim]
        #encoder_combined = [batch size, src sent len, emb dim]
        
        #calculate predictions of next words
        #output is a batch of predictions for each word in the trg sentence
        #attention a batch of attention scores across the src sentence for each word in the trg sentence
        output, attention = self.decoder(trg, encoder_conved, encoder_combined)
        
        #output = [batch size, trg sent len, output dim]
        #attention = [batch size, trg sent len, src sent len]
        
        return output, attention

In [6]:
INPUT_DIM = len(vocab)
OUTPUT_DIM = len(vocab)
EMB_DIM = 256
HID_DIM = 512
ENC_LAYERS = 10
DEC_LAYERS = 10
ENC_KERNEL_SIZE = 3
DEC_KERNEL_SIZE = 3
ENC_DROPOUT = 0.25
DEC_DROPOUT = 0.25
PAD_IDX = vocab.stoi['<pad>']
EOS_TOKEN = vocab.stoi['<eos>']
    
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
enc = Encoder(INPUT_DIM, EMB_DIM, HID_DIM, ENC_LAYERS, ENC_KERNEL_SIZE, ENC_DROPOUT, device)
dec = Decoder(OUTPUT_DIM, EMB_DIM, HID_DIM, DEC_LAYERS, DEC_KERNEL_SIZE, DEC_DROPOUT, PAD_IDX, device)

model = Seq2Seq(enc, dec, device).to(device)

In [7]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 49,320,286 trainable parameters


In [8]:
optimizer = optim.Adam(model.parameters())

In [9]:
criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)

In [10]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        src = batch.sentence
        trg = batch.sentence
        
        optimizer.zero_grad()
        
        output, _ = model(src, trg[:,1:])
        
        #output = [batch size, trg sent len - 1, output dim]
        #trg = [batch size, trg sent len]
        
        output = output.contiguous().view(-1, output.shape[-1])
        trg = trg[:,1:].contiguous().view(-1)
        
        #output = [batch size * trg sent len - 1, output dim]
        #trg = [batch size * trg sent len - 1]
        
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [11]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    avg_bleu = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch.sentence
            trg = batch.sentence

            output, _ = model(src, trg[:,1:])
            
            avg_bleu += compute_bleu(output, trg[:,1:])
        
            #output = [batch size, trg sent len - 1, output dim]
            #trg = [batch size, trg sent len]

            output = output.contiguous().view(-1, output.shape[-1])
            trg = trg[:,1:].contiguous().view(-1)

            #output = [batch size * trg sent len - 1, output dim]
            #trg = [batch size * trg sent len - 1]
            
            loss = criterion(output, trg)

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator), avg_bleu / len(iterator)

In [12]:
def compute_bleu(hypothesis, reference):
    hypothesis = hypothesis.argmax(dim=2)
    
    hypothesis = hypothesis.detach().cpu().numpy()
    reference = reference.detach().cpu().numpy()
    scores = []
    for i in range(reference.shape[0]):
        a = reference[i]
        b = hypothesis[i]
        score = float(bleu_ignore_eos(reference=a.tolist(), hypothesis=b.tolist(), eos_token=EOS_TOKEN))
        scores.append(score)
    
    return torch.tensor(scores).mean()

In [13]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [14]:
from itertools import count

N_EPOCHS = 100
CLIP = 1

best_valid_loss = float('inf')
PLOT = True # Set to false to disable plotting to tensorboard
writer = SummaryWriter(comment='_cnn')

for epoch in count(1):
    
    start_time = time.time()
    
    train_loss = train(model, train_loader, optimizer, criterion, CLIP)
    valid_loss, avg_bleu = evaluate(model, test_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut5-model.pt')
    
    print(f'Epoch: {epoch:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f} | Avg Bleu: {avg_bleu:.6f}')
    if PLOT:
        writer.add_scalar('train/epoch_sec', epoch_mins * 60 + epoch_secs, epoch)
        writer.add_scalar('train/loss', train_loss, epoch)
        writer.add_scalar('train/ppl', math.exp(train_loss), epoch)
        writer.add_scalar('valid/loss', valid_loss, epoch)
        writer.add_scalar('valid/ppl', math.exp(valid_loss), epoch)
        writer.add_scalar('valid/avg_bleu', avg_bleu, epoch)

Epoch: 01 | Time: 0m 32s
	Train Loss: 9.652 | Train PPL: 15548.802
	 Val. Loss: 3.772 |  Val. PPL:  43.471 | Avg Bleu: 0.136464
Epoch: 02 | Time: 0m 31s
	Train Loss: 3.449 | Train PPL:  31.469
	 Val. Loss: 2.306 |  Val. PPL:  10.038 | Avg Bleu: 0.399130
Epoch: 03 | Time: 0m 31s
	Train Loss: 2.418 | Train PPL:  11.229
	 Val. Loss: 1.776 |  Val. PPL:   5.909 | Avg Bleu: 0.563227
Epoch: 04 | Time: 0m 30s
	Train Loss: 1.835 | Train PPL:   6.263
	 Val. Loss: 1.570 |  Val. PPL:   4.809 | Avg Bleu: 0.632465
Epoch: 05 | Time: 0m 30s
	Train Loss: 1.441 | Train PPL:   4.225
	 Val. Loss: 1.468 |  Val. PPL:   4.338 | Avg Bleu: 0.674440
Epoch: 06 | Time: 0m 31s
	Train Loss: 1.151 | Train PPL:   3.162
	 Val. Loss: 1.377 |  Val. PPL:   3.964 | Avg Bleu: 0.698035
Epoch: 07 | Time: 0m 31s
	Train Loss: 0.919 | Train PPL:   2.506
	 Val. Loss: 1.352 |  Val. PPL:   3.864 | Avg Bleu: 0.711923
Epoch: 08 | Time: 0m 31s
	Train Loss: 0.728 | Train PPL:   2.071
	 Val. Loss: 1.363 |  Val. PPL:   3.907 | Avg Bleu:

Exception in thread Thread-8:
Traceback (most recent call last):
  File "C:\Users\Andriy\Anaconda3\lib\threading.py", line 917, in _bootstrap_inner
    self.run()
  File "C:\Users\Andriy\Anaconda3\lib\site-packages\tensorboardX\event_file_writer.py", line 180, in run
    self._ev_writer.write_event(event)
  File "C:\Users\Andriy\Anaconda3\lib\site-packages\tensorboardX\event_file_writer.py", line 61, in write_event
    return self._write_serialized_event(event.SerializeToString())
  File "C:\Users\Andriy\AppData\Roaming\Python\Python37\site-packages\google\protobuf\internal\python_message.py", line 1042, in SerializeToString
    return self.SerializePartialToString(**kwargs)
  File "C:\Users\Andriy\AppData\Roaming\Python\Python37\site-packages\google\protobuf\internal\python_message.py", line 1051, in SerializePartialToString
    self._InternalSerialize(out.write, **kwargs)
  File "C:\Users\Andriy\AppData\Roaming\Python\Python37\site-packages\google\protobuf\internal\python_message.py"

Epoch: 52 | Time: 0m 30s


OverflowError: math range error