In [1]:
from data_loaders_transformer import train_iterator, test_iterator, valid_iterator
from bleu import bleu_ignore_eos

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import torchtext
from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator

import spacy

import random
import math
import os
import time

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_loader, vocab = train_iterator(device=device, batch_size=64)
test_iterator, vocab_test = test_iterator(device=device, batch_size=32)

In [3]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        
        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.dropout = dropout
        
        self.embedding = nn.Embedding(input_dim, emb_dim)
        
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        
        #src = [src sent len, batch size]
        
        embedded = self.dropout(self.embedding(src))
        
        #embedded = [src sent len, batch size, emb dim]
        
        outputs, (hidden, cell) = self.rnn(embedded)
        
        #outputs = [src sent len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #outputs are always from the top hidden layer
        
        return hidden, cell

In [4]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.output_dim = output_dim
        self.n_layers = n_layers
        self.dropout = dropout
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
        
        self.out = nn.Linear(hid_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, cell):
        
        #input = [batch size]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #n directions in the decoder will both always be 1, therefore:
        #hidden = [n layers, batch size, hid dim]
        #context = [n layers, batch size, hid dim]
        
        input = input.unsqueeze(0)
        
        #input = [1, batch size]
        
        embedded = self.dropout(self.embedding(input))
        
        #embedded = [1, batch size, emb dim]
                
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        
        #output = [sent len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #sent len and n directions will always be 1 in the decoder, therefore:
        #output = [1, batch size, hid dim]
        #hidden = [n layers, batch size, hid dim]
        #cell = [n layers, batch size, hid dim]
        
        prediction = self.out(output.squeeze(0))
        
        #prediction = [batch size, output dim]
        
        return prediction, hidden, cell

In [5]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"
        
    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        
        #src = [src sent len, batch size]
        #trg = [trg sent len, batch size]
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time
        
        batch_size = trg.shape[1]
        max_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        #tensor to store decoder outputs
        outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device)
        
        #last hidden state of the encoder is used as the initial hidden state of the decoder
        hidden, cell = self.encoder(src)
        
        #first input to the decoder is the <sos> tokens
        input = trg[0,:]
        
        for t in range(1, max_len):
            
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.max(1)[1]
            input = (trg[t] if teacher_force else top1)
        
        return outputs

In [6]:
INPUT_DIM = len(vocab)
OUTPUT_DIM = len(vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

model = Seq2Seq(enc, dec, device).to(device)

In [7]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(22110, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5)
  )
  (decoder): Decoder(
    (embedding): Embedding(22110, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (out): Linear(in_features=512, out_features=22110, bias=True)
    (dropout): Dropout(p=0.5)
  )
)

In [8]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 30,019,166 trainable parameters


In [9]:
optimizer = optim.Adam(model.parameters())

In [10]:
PAD_IDX = vocab.stoi['<pad>']
EOS_TOKEN = vocab.stoi['<eos>']

criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)

In [11]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        src = batch.sentence
        trg = batch.sentence
        
        optimizer.zero_grad()
        
        output = model(src, trg)
        
        #trg = [trg sent len, batch size]
        #output = [trg sent len, batch size, output dim]
        
        output = output[1:].view(-1, output.shape[-1])
        trg = trg[1:].view(-1)
        
        #trg = [(trg sent len - 1) * batch size]
        #output = [(trg sent len - 1) * batch size, output dim]
        
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [12]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    avg_bleu = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch.sentence
            trg = batch.sentence

            output = model(src, trg, 0) #turn off teacher forcing

            avg_bleu += compute_bleu(output, trg[:,1:])
            
            #trg = [trg sent len, batch size]
            #output = [trg sent len, batch size, output dim]

            output = output[1:].view(-1, output.shape[-1])
            trg = trg[1:].view(-1)

            #trg = [(trg sent len - 1) * batch size]
            #output = [(trg sent len - 1) * batch size, output dim]

            loss = criterion(output, trg)
            
            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator), avg_bleu / len(iterator)

In [13]:
def compute_bleu(hypothesis, reference):
    hypothesis = hypothesis.argmax(dim=2)
    
    hypothesis = hypothesis.detach().cpu().numpy()
    reference = reference.detach().cpu().numpy()
    scores = []
    for i in range(reference.shape[0]):
        a = reference[i]
        b = hypothesis[i]
        score = float(bleu_ignore_eos(reference=a.tolist(), hypothesis=b.tolist(), eos_token=EOS_TOKEN))
        scores.append(score)
    
    return torch.tensor(scores).mean()

In [14]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [15]:
from itertools import count
from tensorboardX import SummaryWriter
# N_EPOCHS = 10
CLIP = 1
PLOT = True # Set to false to disable plotting to tensorboard
best_valid_loss = float('inf')
writer = SummaryWriter(comment='_seq2seq_lstm')

for epoch in count(1): # run forever
    
    start_time = time.time()    
    train_loss = train(model, train_loader, optimizer, criterion, CLIP)
    end_time = time.time()
    
    valid_loss, avg_bleu = evaluate(model, test_iterator, criterion)
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f} | Avg Bleu: {avg_bleu:.6f}')
    if PLOT:
        writer.add_scalar("train/epoch_sec", epoch_mins * 60 + epoch_secs, epoch)
        writer.add_scalar("train/loss", train_loss, epoch)
        writer.add_scalar("train/ppl", math.exp(train_loss), epoch)
        writer.add_scalar("valid/loss", valid_loss, epoch)
        writer.add_scalar("valid/ppl", math.exp(valid_loss), epoch)
        writer.add_scalar("valid/avg_bleu", avg_bleu, epoch)

Epoch: 01 | Time: 0m 52s
	Train Loss: 7.083 | Train PPL: 1191.610
	 Val. Loss: 6.804 |  Val. PPL: 901.228 | Avg Bleu: 0.000438
Epoch: 02 | Time: 0m 51s
	Train Loss: 6.653 | Train PPL: 774.927
	 Val. Loss: 6.794 |  Val. PPL: 892.601 | Avg Bleu: 0.018385
Epoch: 03 | Time: 0m 52s
	Train Loss: 6.689 | Train PPL: 803.251
	 Val. Loss: 6.808 |  Val. PPL: 905.221 | Avg Bleu: 0.012830
Epoch: 04 | Time: 0m 53s
	Train Loss: 6.573 | Train PPL: 715.423
	 Val. Loss: 6.785 |  Val. PPL: 884.780 | Avg Bleu: 0.016022
Epoch: 05 | Time: 0m 54s
	Train Loss: 6.538 | Train PPL: 691.143
	 Val. Loss: 6.787 |  Val. PPL: 885.988 | Avg Bleu: 0.018261
Epoch: 06 | Time: 0m 54s
	Train Loss: 6.504 | Train PPL: 668.068
	 Val. Loss: 6.800 |  Val. PPL: 898.223 | Avg Bleu: 0.017743
Epoch: 07 | Time: 0m 54s
	Train Loss: 6.477 | Train PPL: 650.007
	 Val. Loss: 6.804 |  Val. PPL: 901.033 | Avg Bleu: 0.019136
Epoch: 08 | Time: 0m 54s
	Train Loss: 6.462 | Train PPL: 640.023
	 Val. Loss: 6.814 |  Val. PPL: 910.959 | Avg Bleu: 

Epoch: 66 | Time: 0m 51s
	Train Loss: 5.852 | Train PPL: 347.955
	 Val. Loss: 6.662 |  Val. PPL: 781.977 | Avg Bleu: 0.026117
Epoch: 67 | Time: 0m 51s
	Train Loss: 5.838 | Train PPL: 343.024
	 Val. Loss: 6.672 |  Val. PPL: 789.843 | Avg Bleu: 0.026233
Epoch: 68 | Time: 0m 51s
	Train Loss: 5.835 | Train PPL: 341.900
	 Val. Loss: 6.736 |  Val. PPL: 842.486 | Avg Bleu: 0.026808
Epoch: 69 | Time: 0m 52s
	Train Loss: 5.840 | Train PPL: 343.917
	 Val. Loss: 6.709 |  Val. PPL: 819.841 | Avg Bleu: 0.028180
Epoch: 70 | Time: 0m 52s
	Train Loss: 5.822 | Train PPL: 337.520
	 Val. Loss: 6.648 |  Val. PPL: 770.943 | Avg Bleu: 0.026758
Epoch: 71 | Time: 0m 52s
	Train Loss: 5.793 | Train PPL: 327.880
	 Val. Loss: 6.638 |  Val. PPL: 763.570 | Avg Bleu: 0.027020
Epoch: 72 | Time: 0m 52s
	Train Loss: 5.780 | Train PPL: 323.783
	 Val. Loss: 6.672 |  Val. PPL: 790.113 | Avg Bleu: 0.027497
Epoch: 73 | Time: 0m 51s
	Train Loss: 5.775 | Train PPL: 322.164
	 Val. Loss: 6.625 |  Val. PPL: 753.374 | Avg Bleu: 0

	 Val. Loss: 6.449 |  Val. PPL: 631.790 | Avg Bleu: 0.030292
Epoch: 131 | Time: 0m 51s
	Train Loss: 5.421 | Train PPL: 226.181
	 Val. Loss: 6.460 |  Val. PPL: 639.145 | Avg Bleu: 0.029582
Epoch: 132 | Time: 0m 51s
	Train Loss: 5.450 | Train PPL: 232.836
	 Val. Loss: 6.681 |  Val. PPL: 796.905 | Avg Bleu: 0.026915
Epoch: 133 | Time: 0m 50s
	Train Loss: 5.531 | Train PPL: 252.376
	 Val. Loss: 6.700 |  Val. PPL: 812.159 | Avg Bleu: 0.026513
Epoch: 134 | Time: 0m 51s
	Train Loss: 5.503 | Train PPL: 245.307
	 Val. Loss: 6.669 |  Val. PPL: 787.443 | Avg Bleu: 0.027381
Epoch: 135 | Time: 0m 51s
	Train Loss: 5.487 | Train PPL: 241.556
	 Val. Loss: 6.674 |  Val. PPL: 791.268 | Avg Bleu: 0.026591
Epoch: 136 | Time: 0m 51s
	Train Loss: 5.513 | Train PPL: 247.861
	 Val. Loss: 6.602 |  Val. PPL: 736.430 | Avg Bleu: 0.027899
Epoch: 137 | Time: 0m 51s
	Train Loss: 5.490 | Train PPL: 242.287
	 Val. Loss: 6.631 |  Val. PPL: 758.246 | Avg Bleu: 0.027329
Epoch: 138 | Time: 0m 51s
	Train Loss: 5.489 | Tra

Epoch: 195 | Time: 0m 50s
	Train Loss: 5.300 | Train PPL: 200.425
	 Val. Loss: 6.485 |  Val. PPL: 655.240 | Avg Bleu: 0.029191
Epoch: 196 | Time: 0m 51s
	Train Loss: 5.301 | Train PPL: 200.505
	 Val. Loss: 6.595 |  Val. PPL: 731.642 | Avg Bleu: 0.026601
Epoch: 197 | Time: 0m 50s
	Train Loss: 5.426 | Train PPL: 227.282
	 Val. Loss: 6.661 |  Val. PPL: 781.495 | Avg Bleu: 0.028129
Epoch: 198 | Time: 0m 50s
	Train Loss: 5.381 | Train PPL: 217.206
	 Val. Loss: 6.722 |  Val. PPL: 830.339 | Avg Bleu: 0.027024
Epoch: 199 | Time: 0m 51s
	Train Loss: 5.447 | Train PPL: 232.127
	 Val. Loss: 6.632 |  Val. PPL: 759.099 | Avg Bleu: 0.028710
Epoch: 200 | Time: 0m 50s
	Train Loss: 5.394 | Train PPL: 220.032
	 Val. Loss: 6.606 |  Val. PPL: 739.267 | Avg Bleu: 0.028634
Epoch: 201 | Time: 0m 51s
	Train Loss: 5.387 | Train PPL: 218.601
	 Val. Loss: 6.578 |  Val. PPL: 719.423 | Avg Bleu: 0.028921
Epoch: 202 | Time: 0m 51s
	Train Loss: 5.323 | Train PPL: 204.914
	 Val. Loss: 6.529 |  Val. PPL: 685.036 | Avg

	Train Loss: 5.213 | Train PPL: 183.578
	 Val. Loss: 6.467 |  Val. PPL: 643.857 | Avg Bleu: 0.030170
Epoch: 260 | Time: 0m 51s
	Train Loss: 5.206 | Train PPL: 182.403
	 Val. Loss: 6.484 |  Val. PPL: 654.506 | Avg Bleu: 0.029204
Epoch: 261 | Time: 0m 51s
	Train Loss: 5.171 | Train PPL: 176.019
	 Val. Loss: 6.490 |  Val. PPL: 658.487 | Avg Bleu: 0.028641
Epoch: 262 | Time: 0m 51s
	Train Loss: 5.240 | Train PPL: 188.722
	 Val. Loss: 6.475 |  Val. PPL: 648.790 | Avg Bleu: 0.029609
Epoch: 263 | Time: 0m 50s
	Train Loss: 5.296 | Train PPL: 199.523
	 Val. Loss: 6.455 |  Val. PPL: 635.853 | Avg Bleu: 0.030056
Epoch: 264 | Time: 0m 51s
	Train Loss: 5.229 | Train PPL: 186.545
	 Val. Loss: 6.432 |  Val. PPL: 621.565 | Avg Bleu: 0.029282
Epoch: 265 | Time: 0m 51s
	Train Loss: 5.176 | Train PPL: 177.056
	 Val. Loss: 6.456 |  Val. PPL: 636.438 | Avg Bleu: 0.030364
Epoch: 266 | Time: 0m 51s
	Train Loss: 5.174 | Train PPL: 176.571
	 Val. Loss: 6.478 |  Val. PPL: 650.590 | Avg Bleu: 0.029025
Epoch: 267

	 Val. Loss: 6.440 |  Val. PPL: 626.152 | Avg Bleu: 0.030816
Epoch: 324 | Time: 0m 51s
	Train Loss: 5.120 | Train PPL: 167.391
	 Val. Loss: 6.434 |  Val. PPL: 622.680 | Avg Bleu: 0.030536
Epoch: 325 | Time: 0m 51s
	Train Loss: 5.085 | Train PPL: 161.635
	 Val. Loss: 6.418 |  Val. PPL: 612.838 | Avg Bleu: 0.031314
Epoch: 326 | Time: 0m 51s
	Train Loss: 5.039 | Train PPL: 154.391
	 Val. Loss: 6.450 |  Val. PPL: 632.838 | Avg Bleu: 0.030636
Epoch: 327 | Time: 0m 51s
	Train Loss: 5.017 | Train PPL: 150.945
	 Val. Loss: 6.446 |  Val. PPL: 630.145 | Avg Bleu: 0.030966
Epoch: 328 | Time: 0m 51s
	Train Loss: 5.060 | Train PPL: 157.613
	 Val. Loss: 6.427 |  Val. PPL: 618.615 | Avg Bleu: 0.031321
Epoch: 329 | Time: 0m 51s
	Train Loss: 5.033 | Train PPL: 153.368
	 Val. Loss: 6.450 |  Val. PPL: 632.742 | Avg Bleu: 0.030447
Epoch: 330 | Time: 0m 51s
	Train Loss: 5.023 | Train PPL: 151.819
	 Val. Loss: 6.444 |  Val. PPL: 629.160 | Avg Bleu: 0.030777
Epoch: 331 | Time: 0m 51s
	Train Loss: 5.067 | Tra

Epoch: 388 | Time: 0m 50s
	Train Loss: 5.049 | Train PPL: 155.907
	 Val. Loss: 6.397 |  Val. PPL: 599.841 | Avg Bleu: 0.031720
Epoch: 389 | Time: 0m 50s
	Train Loss: 5.105 | Train PPL: 164.919
	 Val. Loss: 6.414 |  Val. PPL: 610.267 | Avg Bleu: 0.031593
Epoch: 390 | Time: 0m 51s
	Train Loss: 5.007 | Train PPL: 149.384
	 Val. Loss: 6.371 |  Val. PPL: 584.374 | Avg Bleu: 0.032295
Epoch: 391 | Time: 0m 51s
	Train Loss: 5.055 | Train PPL: 156.823
	 Val. Loss: 6.380 |  Val. PPL: 589.986 | Avg Bleu: 0.030689
Epoch: 392 | Time: 0m 51s
	Train Loss: 5.049 | Train PPL: 155.852
	 Val. Loss: 6.399 |  Val. PPL: 601.443 | Avg Bleu: 0.032261
Epoch: 393 | Time: 0m 51s
	Train Loss: 5.059 | Train PPL: 157.470
	 Val. Loss: 6.399 |  Val. PPL: 601.188 | Avg Bleu: 0.031803
Epoch: 394 | Time: 0m 51s
	Train Loss: 4.997 | Train PPL: 147.984
	 Val. Loss: 6.404 |  Val. PPL: 604.025 | Avg Bleu: 0.031299
Epoch: 395 | Time: 0m 51s
	Train Loss: 5.000 | Train PPL: 148.484
	 Val. Loss: 6.379 |  Val. PPL: 589.428 | Avg

	Train Loss: 5.032 | Train PPL: 153.241
	 Val. Loss: 6.365 |  Val. PPL: 581.228 | Avg Bleu: 0.032866
Epoch: 453 | Time: 0m 51s
	Train Loss: 4.926 | Train PPL: 137.807
	 Val. Loss: 6.351 |  Val. PPL: 573.081 | Avg Bleu: 0.032113
Epoch: 454 | Time: 0m 51s
	Train Loss: 4.835 | Train PPL: 125.805
	 Val. Loss: 6.422 |  Val. PPL: 614.945 | Avg Bleu: 0.030891
Epoch: 455 | Time: 0m 51s
	Train Loss: 4.840 | Train PPL: 126.495
	 Val. Loss: 6.318 |  Val. PPL: 554.732 | Avg Bleu: 0.032349
Epoch: 456 | Time: 0m 51s
	Train Loss: 4.900 | Train PPL: 134.302
	 Val. Loss: 6.318 |  Val. PPL: 554.265 | Avg Bleu: 0.032381
Epoch: 457 | Time: 0m 51s
	Train Loss: 4.918 | Train PPL: 136.708
	 Val. Loss: 6.333 |  Val. PPL: 563.036 | Avg Bleu: 0.032235
Epoch: 458 | Time: 0m 50s
	Train Loss: 4.953 | Train PPL: 141.586
	 Val. Loss: 6.299 |  Val. PPL: 543.967 | Avg Bleu: 0.033287
Epoch: 459 | Time: 0m 51s
	Train Loss: 4.913 | Train PPL: 136.073
	 Val. Loss: 6.313 |  Val. PPL: 551.796 | Avg Bleu: 0.033009
Epoch: 460

	 Val. Loss: 6.283 |  Val. PPL: 535.620 | Avg Bleu: 0.032296
Epoch: 517 | Time: 0m 51s
	Train Loss: 4.956 | Train PPL: 142.002
	 Val. Loss: 6.352 |  Val. PPL: 573.472 | Avg Bleu: 0.031957
Epoch: 518 | Time: 0m 51s
	Train Loss: 4.911 | Train PPL: 135.711
	 Val. Loss: 6.297 |  Val. PPL: 542.681 | Avg Bleu: 0.033092
Epoch: 519 | Time: 0m 51s
	Train Loss: 4.849 | Train PPL: 127.587
	 Val. Loss: 6.282 |  Val. PPL: 534.894 | Avg Bleu: 0.033429
Epoch: 520 | Time: 0m 51s
	Train Loss: 4.798 | Train PPL: 121.213
	 Val. Loss: 6.271 |  Val. PPL: 528.827 | Avg Bleu: 0.032468
Epoch: 521 | Time: 0m 51s
	Train Loss: 4.884 | Train PPL: 132.147
	 Val. Loss: 6.258 |  Val. PPL: 521.979 | Avg Bleu: 0.033386
Epoch: 522 | Time: 0m 51s
	Train Loss: 4.946 | Train PPL: 140.665
	 Val. Loss: 6.223 |  Val. PPL: 504.171 | Avg Bleu: 0.034391
Epoch: 523 | Time: 0m 51s
	Train Loss: 4.874 | Train PPL: 130.859
	 Val. Loss: 6.244 |  Val. PPL: 515.030 | Avg Bleu: 0.033887
Epoch: 524 | Time: 0m 51s
	Train Loss: 4.916 | Tra

Epoch: 581 | Time: 0m 51s
	Train Loss: 4.884 | Train PPL: 132.205
	 Val. Loss: 6.310 |  Val. PPL: 550.038 | Avg Bleu: 0.031755
Epoch: 582 | Time: 0m 51s
	Train Loss: 4.835 | Train PPL: 125.871
	 Val. Loss: 6.323 |  Val. PPL: 557.417 | Avg Bleu: 0.033239
Epoch: 583 | Time: 0m 51s
	Train Loss: 4.805 | Train PPL: 122.117
	 Val. Loss: 6.311 |  Val. PPL: 550.611 | Avg Bleu: 0.032045
Epoch: 584 | Time: 0m 51s
	Train Loss: 4.835 | Train PPL: 125.823
	 Val. Loss: 6.247 |  Val. PPL: 516.308 | Avg Bleu: 0.033566
Epoch: 585 | Time: 0m 51s
	Train Loss: 4.839 | Train PPL: 126.316
	 Val. Loss: 6.274 |  Val. PPL: 530.573 | Avg Bleu: 0.032972
Epoch: 586 | Time: 0m 51s
	Train Loss: 4.771 | Train PPL: 118.092
	 Val. Loss: 6.274 |  Val. PPL: 530.581 | Avg Bleu: 0.034318
Epoch: 587 | Time: 0m 51s
	Train Loss: 4.851 | Train PPL: 127.873
	 Val. Loss: 6.331 |  Val. PPL: 561.784 | Avg Bleu: 0.032041
Epoch: 588 | Time: 0m 51s
	Train Loss: 4.885 | Train PPL: 132.271
	 Val. Loss: 6.233 |  Val. PPL: 509.364 | Avg

	Train Loss: 4.745 | Train PPL: 114.974
	 Val. Loss: 6.301 |  Val. PPL: 545.289 | Avg Bleu: 0.032019
Epoch: 646 | Time: 0m 51s
	Train Loss: 4.796 | Train PPL: 121.080
	 Val. Loss: 6.315 |  Val. PPL: 552.628 | Avg Bleu: 0.032405
Epoch: 647 | Time: 0m 51s
	Train Loss: 4.808 | Train PPL: 122.523
	 Val. Loss: 6.253 |  Val. PPL: 519.684 | Avg Bleu: 0.032873
Epoch: 648 | Time: 0m 51s
	Train Loss: 4.838 | Train PPL: 126.200
	 Val. Loss: 6.313 |  Val. PPL: 551.543 | Avg Bleu: 0.032661
Epoch: 649 | Time: 0m 51s
	Train Loss: 4.807 | Train PPL: 122.340
	 Val. Loss: 6.264 |  Val. PPL: 525.374 | Avg Bleu: 0.032278
Epoch: 650 | Time: 0m 51s
	Train Loss: 4.769 | Train PPL: 117.845
	 Val. Loss: 6.255 |  Val. PPL: 520.534 | Avg Bleu: 0.032244
Epoch: 651 | Time: 0m 51s
	Train Loss: 4.806 | Train PPL: 122.202
	 Val. Loss: 6.286 |  Val. PPL: 536.925 | Avg Bleu: 0.032569
Epoch: 652 | Time: 0m 51s
	Train Loss: 4.781 | Train PPL: 119.278
	 Val. Loss: 6.309 |  Val. PPL: 549.413 | Avg Bleu: 0.031664
Epoch: 653

	 Val. Loss: 6.267 |  Val. PPL: 527.103 | Avg Bleu: 0.032350
Epoch: 710 | Time: 0m 51s
	Train Loss: 4.718 | Train PPL: 111.922
	 Val. Loss: 6.289 |  Val. PPL: 538.603 | Avg Bleu: 0.033068
Epoch: 711 | Time: 0m 51s
	Train Loss: 4.752 | Train PPL: 115.857
	 Val. Loss: 6.294 |  Val. PPL: 541.076 | Avg Bleu: 0.031654
Epoch: 712 | Time: 0m 50s
	Train Loss: 4.732 | Train PPL: 113.544
	 Val. Loss: 6.281 |  Val. PPL: 534.135 | Avg Bleu: 0.032320
Epoch: 713 | Time: 0m 51s
	Train Loss: 4.709 | Train PPL: 110.973
	 Val. Loss: 6.313 |  Val. PPL: 551.774 | Avg Bleu: 0.032725
Epoch: 714 | Time: 0m 51s
	Train Loss: 4.692 | Train PPL: 109.100
	 Val. Loss: 6.316 |  Val. PPL: 553.142 | Avg Bleu: 0.032340
Epoch: 715 | Time: 0m 54s
	Train Loss: 4.664 | Train PPL: 106.069
	 Val. Loss: 6.316 |  Val. PPL: 553.553 | Avg Bleu: 0.032523


KeyboardInterrupt: 