In [145]:
import random
import torch
SEED = 2222
random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x26b9cbf6810>

In [146]:
import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from vncorenlp import VnCoreNLP
annotator = VnCoreNLP("VnCoreNLP-master\VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size='-Xmx500m')
spacy_en = spacy.load('en_core_web_sm')

In [147]:
from iteration_utilities import deepflatten
def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]
def tokenize_vi(text):
    return [tok for tok in deepflatten(annotator.tokenize(text), depth=1)]

text_en = 'Please put the dustpan in the broom closet'
text_vi = 'Cuốn sách này là của tôi. Của bạn đâu?'
print(tokenize_en(text_en))
print(tokenize_vi(text_vi))


['Please', 'put', 'the', 'dustpan', 'in', 'the', 'broom', 'closet']
['Cuốn', 'sách', 'này', 'là', 'của', 'tôi', '.', 'Của', 'bạn', 'đâu', '?']


In [148]:
import pandas as pd

def create_raw_dataset():
    data_dir = ""
    en_sents = open(data_dir + 'english.txt', "r",encoding="utf-8" ).read().splitlines()
    vi_sents = open(data_dir + 'vietnamese.txt', "r" ,encoding="utf-8").read().splitlines()
    return {
        "English": [line for line in en_sents[:5000]],
        "Vietnamese": [line for line in vi_sents[:5000]],
    }
raw_data = create_raw_dataset()

from sklearn.model_selection import train_test_split

df = pd.DataFrame(raw_data, columns=["English", "Vietnamese"])
train, test = train_test_split(df, test_size=0.2)
train, val = train_test_split(train, test_size=0.125)

train.to_json("train.json", orient="records", lines=True)
test.to_json("test.json", orient="records", lines=True)
val.to_json("val.json", orient="records", lines=True)


In [142]:
# import json
# from collections import Counter
# from itertools import chain

# def load_data(filename, source_tokenizer, target_tokenizer):
#     examples = []
#     with open(filename, "r", encoding="utf-8") as f:
#         for line in f:
#             example = json.loads(line)
#             src = source_tokenizer(example["English"])
#             trg = target_tokenizer(example["Vietnamese"])
#             examples.append(src,trg)
#     return examples

# train_examples = load_data("train.json", tokenize_en, tokenize_vi)
# val_examples = load_data("val.json", tokenize_en, tokenize_vi)
# test_examples = load_data("test.json", tokenize_en, tokenize_vi)

# import re
# import collections

# def process(text: str) -> str:
#     """
#     Tokenizes and lowercases a text.
    
#     Args:
#         text (str): The input text.
    
#     Returns:
#         str: The processed text.
#     """
#     tokens = tokenize_vi(text.lower())
#     return " ".join(tokens)


# def build_vocab(texts, max_vocab_size=10000, min_freq=2):
#     """
#     Builds a vocabulary from a list of texts.
    
#     Args:
#         texts (list of str): The input texts.
#         max_vocab_size (int or None): The maximum size of the vocabulary. If None, use all words in the input.
#         min_freq (int): The minimum frequency for a word to be included in the vocabulary.
    
#     Returns:
#         tuple: A tuple containing the token-to-index mapping (a dictionary) and the index-to-token mapping (a list).
#     """
#     # Tokenize and lowercase the texts
#     processed_texts = [process(t) for t in texts]
    
#     # Flatten the tokenized texts
#     tokens = [token for text in processed_texts for token in text.split()]
    
#     # Count the token frequencies
#     counter = Counter(tokens)
    
#     # Select the most common tokens
#     if max_vocab_size is None:
#         max_vocab_size = len(counter)
#     most_common = counter.most_common(max_vocab_size)
#     most_common = [(token, freq) for token, freq in most_common if freq >= min_freq]
    
#     # Create the token-to-index mapping and the index-to-token mapping
#     token_to_idx = {"<pad>": 0, "<unk>": 1, "<bos>": 2, "<eos>": 3}
#     idx_to_token = ["<pad>", "<unk>", "<bos>", "<eos>"]
#     for token, freq in most_common:
#         token_to_idx[token] = len(idx_to_token)
#         idx_to_token.append(token)
    
#     return token_to_idx, idx_to_token



# source_sentences_train = [example["src"] for example in train_examples]
# target_sentences_train = [example["trg"] for example in train_examples]
# source_vocab, source_itos = build_vocab(source_sentences_train)
# target_vocab, target_itos = build_vocab(target_sentences_train)



AttributeError: 'list' object has no attribute 'lower'

In [461]:
from collections import Counter
from torch.nn.utils.rnn import pad_sequence

class Field:
    def __init__(self, tokenize_func=None, init_token=None, eos_token=None, pad_token='<pad>', lower=True, unk_token='<unk>'):
        self.tokenize_func = tokenize_func or (lambda x: x.split())
        self.init_token = init_token
        self.eos_token = eos_token
        self.pad_token = pad_token
        self.lower = lower
        self.vocab = None
        self.itos = None
        self.unk_token = unk_token
    
    def build_vocab(self, texts, max_vocab_size=10000, min_freq=2):
        # Tokenize the texts
        tokenized_texts = [self.tokenize_func(t.lower() if self.lower else t) for t in texts]
        
        # Flatten the tokenized texts
        tokens = [token for token_list in tokenized_texts for token in token_list]
        
        # Count the tokens
        counter = Counter(tokens)
        
        # Sort the tokens by frequency
        sorted_tokens = sorted(counter.items(), key=lambda x: x[1], reverse=True)
        
        # Truncate the sorted tokens by max_vocab_size
        if max_vocab_size is not None:
            sorted_tokens = sorted_tokens[:max_vocab_size]
        
        # Filter the tokens by min_freq
        filtered_tokens = [(token, freq) for token, freq in sorted_tokens if freq >= min_freq]
        
        # Add special tokens to the vocabulary
        if self.init_token is not None:
            filtered_tokens.insert(0, (self.init_token, float("inf")))
        if self.eos_token is not None:
            filtered_tokens.append((self.eos_token, float("inf")))
        if self.pad_token is not None:
            filtered_tokens.append((self.pad_token, float("inf")))
        filtered_tokens.append((self.unk_token, float("inf")))
        
        # Build the token-to-index mapping and index-to-token mapping
        self.vocab = {}
        self.itos = []
        for token, freq in filtered_tokens:
            self.vocab[token] = len(self.itos)
            self.itos.append(token)
    
    def numericalize(self, texts, device=None):
        # Convert the tokenized texts to numericalized sequences
        numerical_seqs = []
        for tokens in texts:
            numerical_seq = []
            if self.init_token is not None:
                numerical_seq.append(self.vocab.get(self.init_token, self.vocab[self.unk_token]))
            numerical_seq.extend(self.vocab.get(token, self.vocab[self.unk_token]) for token in tokens)
            if self.eos_token is not None:
                numerical_seq.append(self.vocab.get(self.eos_token, self.vocab[self.unk_token]))
            numerical_seqs.append(numerical_seq)
        
        # Convert the numericalized sequences to tensors and move to device
        tensor_seqs = [torch.LongTensor(seq) for seq in numerical_seqs]
        if device is not None:
            tensor_seqs = [seq.to(device) for seq in tensor_seqs]
        
        # Pad the tensor sequences
        padded_seqs = pad_sequence(tensor_seqs, padding_value=self.vocab[self.pad_token])
        
        return padded_seqs


    
source = Field(tokenize_func=tokenize_en, init_token='<sos>', eos_token='<eos>', lower=True)
target = Field(tokenize_func=tokenize_vi, init_token='<sos>', eos_token='<eos>', lower=True)
fields = {"English": source, "Vietnamese": target}



In [421]:
from datasets import load_dataset
dataset = load_dataset("json", data_files={"train": "train.json", "test": "test.json", "val": "val.json"})

Found cached dataset json (C:/Users/16262/.cache/huggingface/datasets/json/default-8d04fd48df9290ce/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/3 [00:00<?, ?it/s]

In [422]:
train_data = dataset["train"]
test_data = dataset["test"]
val_data = dataset["val"]
english_text=[]
vietnamese_text=[]
for data in train_data:
    english_text.append(data["English"])
    print(data["English"])
    vietnamese_text.append(data["Vietnamese"])

Tom doesn't have an Australian accent
An old woman limped along the street.
Tom recognized the old man, but didn't know his name.
Tom is the one who said he was too busy to help, not me
It's not the first time.
I saw a fight
That watch is very nice.
I don't believe we've been formally introduced.
Can you believe this?
Let's review Lesson 5
He is quick to adapt to new circumstances
You can't possibly really believe that
Would you mind if I speak to Tom alone for a sec?
I saw Tom yesterday morning.
Tom held up his wine glass for a toast
Tom heard some music coming from the next room
I have a slight headache
Ten years have passed since I came to Tokyo at the age of eighteen
You always destroy everything
How did you get there?
The pigeon has flown away.
It was too muggy for me to get to sleep last night.
You're not in this alone
He insulted our team
They just announced their engagement
Some of these young people have legs twice as long as mine.
The whole community is behind this plan.
Sinc

In [465]:
source.build_vocab(english_text)
target.build_vocab(vietnamese_text)


In [428]:
dataset = dataset.map(lambda example: {'English': tokenize_en(example['English']),
                                       'Vietnamese': tokenize_vi(example['Vietnamese'])})

Loading cached processed dataset at C:\Users\16262\.cache\huggingface\datasets\json\default-8d04fd48df9290ce\0.0.0\e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4\cache-ebe59e9e1cfc52d1.arrow
Loading cached processed dataset at C:\Users\16262\.cache\huggingface\datasets\json\default-8d04fd48df9290ce\0.0.0\e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4\cache-57e0fefc1446b5f0.arrow
Loading cached processed dataset at C:\Users\16262\.cache\huggingface\datasets\json\default-8d04fd48df9290ce\0.0.0\e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4\cache-805ce7de3ea080e5.arrow


In [429]:
print(f"Unique tokens in source (en) vocabulary: {len(source.vocab)}")
print(f"Unique tokens in target (vi) vocabulary: {len(target.vocab)}")
print(source.vocab)

Unique tokens in source (en) vocabulary: 1469
Unique tokens in target (vi) vocabulary: 1337
{'<sos>': 0, '.': 1, 'i': 2, 'the': 3, 'to': 4, 'tom': 5, 'you': 6, 'a': 7, '?': 8, "n't": 9, 'is': 10, 'do': 11, 'he': 12, 'it': 13, 'in': 14, 'that': 15, 'of': 16, "'s": 17, ',': 18, 'was': 19, 'for': 20, 'have': 21, 'me': 22, 'we': 23, 'this': 24, 'my': 25, 'what': 26, 'his': 27, 'and': 28, 'not': 29, 'are': 30, 'be': 31, 'mary': 32, 'did': 33, "'m": 34, 'with': 35, 'she': 36, 'on': 37, 'at': 38, 'want': 39, 'can': 40, 'know': 41, 'your': 42, "'ll": 43, 'him': 44, 'think': 45, 'they': 46, 'about': 47, 'as': 48, 'here': 49, 'there': 50, 'her': 51, 'would': 52, 'all': 53, 'time': 54, 'has': 55, "'re": 56, 'like': 57, 'tell': 58, 'go': 59, 'had': 60, 'up': 61, 'how': 62, 'were': 63, 'get': 64, "'ve": 65, 'by': 66, 'will': 67, 'ca': 68, 'does': 69, 'just': 70, 'one': 71, 'very': 72, 'when': 73, 'been': 74, 'some': 75, 'out': 76, 'good': 77, 'an': 78, 'us': 79, 'going': 80, 'could': 81, 'why': 82,

In [430]:
train_data = dataset["train"]
test_data = dataset["test"]
val_data = dataset["val"]
for x in train_data:
    print(x["English"])

['Tom', 'does', "n't", 'have', 'an', 'Australian', 'accent']
['An', 'old', 'woman', 'limped', 'along', 'the', 'street', '.']
['Tom', 'recognized', 'the', 'old', 'man', ',', 'but', 'did', "n't", 'know', 'his', 'name', '.']
['Tom', 'is', 'the', 'one', 'who', 'said', 'he', 'was', 'too', 'busy', 'to', 'help', ',', 'not', 'me']
['It', "'s", 'not', 'the', 'first', 'time', '.']
['I', 'saw', 'a', 'fight']
['That', 'watch', 'is', 'very', 'nice', '.']
['I', 'do', "n't", 'believe', 'we', "'ve", 'been', 'formally', 'introduced', '.']
['Can', 'you', 'believe', 'this', '?']
['Let', "'s", 'review', 'Lesson', '5']
['He', 'is', 'quick', 'to', 'adapt', 'to', 'new', 'circumstances']
['You', 'ca', "n't", 'possibly', 'really', 'believe', 'that']
['Would', 'you', 'mind', 'if', 'I', 'speak', 'to', 'Tom', 'alone', 'for', 'a', 'sec', '?']
['I', 'saw', 'Tom', 'yesterday', 'morning', '.']
['Tom', 'held', 'up', 'his', 'wine', 'glass', 'for', 'a', 'toast']
['Tom', 'heard', 'some', 'music', 'coming', 'from', 'the',

In [446]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
import numpy as np

class TranslationDataset(Dataset):
    def __init__(self, data, source_vocab, target_vocab):
        self.data = data
        self.source_vocab = source_vocab
        self.target_vocab = target_vocab
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        src_sent = [token.lower() for token in self.data[index]["English"]]
        trg_sent = [token.lower() for token in self.data[index]["Vietnamese"]]
        src_sent = [token if token in self.source_vocab else '<unk>' for token in src_sent]
        trg_sent = [token if token in self.target_vocab else '<unk>' for token in trg_sent]
        src_seq = np.array([self.source_vocab.get(token, self.source_vocab['<unk>']) for token in src_sent])
        trg_seq = np.array([self.target_vocab.get(token, self.target_vocab['<unk>']) for token in trg_sent])
        return src_seq, trg_seq


def get_iterator(data, source_vocab, target_vocab, device, batch_size=128, shuffle=True):
    dataset = TranslationDataset(data, source_vocab, target_vocab)
    sampler = None
    if shuffle:
        sampler = torch.utils.data.RandomSampler(dataset)
    batch_sampler = torch.utils.data.BatchSampler(torch.utils.data.sampler.SequentialSampler(dataset), batch_size=batch_size, drop_last=False)
    loader = DataLoader(dataset=dataset, batch_sampler=batch_sampler, num_workers=0, collate_fn=lambda x: tuple(zip(*x)))
    for batch in loader:
        src_seqs = pad_sequence([torch.LongTensor(seq) for seq in batch[0]], padding_value=source_vocab['<unk>'], batch_first=True).to(device)
        trg_seqs = pad_sequence([torch.LongTensor(seq) for seq in batch[1]], padding_value=target_vocab['<unk>'], batch_first=True).to(device)
        yield src_seqs.transpose(0, 1), trg_seqs.transpose(0, 1)


train_batches = get_iterator(train_data, source.vocab, target.vocab, device)
test_batches = get_iterator(test_data, source.vocab, target.vocab, device)
val_batches = get_iterator(val_data, source.vocab, target.vocab, device)

test_batch = next(iter(test_batches))
test_batch[1]


tensor([[  23,  237,    4,  ..., 1336,    4,   58],
        [  13, 1336,   38,  ...,    9,  161,   64],
        [  12,   25,   43,  ...,    4,  116,   22],
        ...,
        [1336, 1336, 1336,  ..., 1336, 1336, 1336],
        [1336, 1336, 1336,  ..., 1336, 1336, 1336],
        [1336, 1336, 1336,  ..., 1336, 1336, 1336]])

In [447]:
import torch
from torch import nn, optim

# adjustable parameters
INPUT_DIM = len(source.vocab)
OUTPUT_DIM = len(target.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
N_LAYERS = 1
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, n_layers, dropout):
        super().__init__()
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.input_dim = input_dim
        self.n_layers = n_layers
        self.dropout = dropout

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, n_layers, dropout=dropout,
                          bidirectional=True)
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)

    def forward(self, src_batch):
        # src [sent len, batch size]

        # [sent len, batch size, emb dim]
        embedded = self.embedding(src_batch)
        outputs, hidden = self.rnn(embedded)
        # outputs -> [sent len, batch size, hidden dim * n directions]
        # hidden -> [n layers * n directions, batch size, hidden dim]

        # initial decoder hidden is final hidden state of the forwards and
        # backwards encoder RNNs fed through a linear layer
        concated = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        hidden = torch.tanh(self.fc(concated))
        return outputs, hidden

encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, N_LAYERS, ENC_DROPOUT).to(device)
outputs, hidden = encoder(test_batch[0])

print(outputs.shape, hidden.shape)



torch.Size([16, 128, 1024]) torch.Size([128, 512])




In [448]:
class Attention(nn.Module):

    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim

        # enc_hid_dim multiply by 2 due to bidirectional
        self.fc1 = nn.Linear(enc_hid_dim * 2 + dec_hid_dim, dec_hid_dim)
        self.fc2 = nn.Linear(dec_hid_dim, 1, bias=False)

    def forward(self, encoder_outputs, hidden):
        src_len = encoder_outputs.shape[0]
        batch_size = encoder_outputs.shape[1]
        
        # repeat encoder hidden state src_len times [batch size, sent len, dec hid dim]
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        # reshape/permute the encoder output, so that the batch size comes first
        # [batch size, sent len, enc hid dim * 2], times 2 because of bidirectional
        outputs = encoder_outputs.permute(1, 0, 2)

        # the attention mechanism receives a concatenation of the hidden state
        # and the encoder output
        concat = torch.cat((hidden, outputs), dim=2)
        
        # fully connected layer and softmax layer to compute the attention weight
        # [batch size, sent len, dec hid dim]
        energy = torch.tanh(self.fc1(concat))
        # attention weight should be of [batch size, sent len]
        attention = self.fc2(energy).squeeze(dim=2)  
        attention_weight = torch.softmax(attention, dim=1)
        return attention_weight

    
attention = Attention(ENC_HID_DIM, DEC_HID_DIM).to(device)
attention_weight = attention(outputs, hidden)
attention_weight.shape

torch.Size([128, 16])

In [449]:
class Decoder(nn.Module):

    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, n_layers,
                 dropout, attention):
        super().__init__()
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.output_dim = output_dim
        self.n_layers = n_layers
        self.dropout = dropout
        self.attention = attention

        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU(enc_hid_dim * 2 + emb_dim, dec_hid_dim, n_layers, dropout=dropout)
        self.linear = nn.Linear(dec_hid_dim, output_dim)

    def forward(self, trg, encoder_outputs, hidden):
        # trg [batch size]
        # outputs [src sen len, batch size, enc hid dim * 2], times 2 due to bidirectional
        # hidden [batch size, dec hid dim]

        # [batch size, 1, sent len] 
        attention = self.attention(encoder_outputs, hidden).unsqueeze(1)

        # [batch size, sent len, enc hid dim * 2]
        outputs = encoder_outputs.permute(1, 0, 2)

        # [1, batch size, enc hid dim * 2]
        context = torch.bmm(attention, outputs).permute(1, 0, 2)

        # input sentence -> embedding
        # [1, batch size, emb dim]
        embedded = self.embedding(trg.unsqueeze(0))
        rnn_input = torch.cat((embedded, context), dim=2)

        outputs, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
        prediction = self.linear(outputs.squeeze(0))
        return prediction, hidden.squeeze(0)

decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, N_LAYERS, DEC_DROPOUT, attention).to(device)
prediction, decoder_hidden = decoder(test_batch[1][0], outputs, hidden)

# notice the decoder_hidden's shape should match the shape that's generated by
# the encoder
prediction.shape, decoder_hidden.shape

(torch.Size([128, 1337]), torch.Size([128, 512]))

In [450]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src_batch, trg_batch, teacher_forcing_ratio=0.5):
        max_len, batch_size = trg_batch.shape
        trg_vocab_size = self.decoder.output_dim

        # tensor to store decoder's output
        outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device)

        # encoder_outputs : all hidden states of the input sequence (forward and backward)
        # hidden : final forward and backward hidden states, passed through a linear layer
        encoder_outputs, hidden = self.encoder(src_batch)

        trg = trg_batch[0]
        for i in range(1, max_len):
            prediction, hidden = self.decoder(trg, encoder_outputs, hidden)
            outputs[i] = prediction

            if random.random() < teacher_forcing_ratio:
                trg = trg_batch[i]
            else:
                trg = prediction.argmax(1)

        return outputs

attention = Attention(ENC_HID_DIM, DEC_HID_DIM)
encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, N_LAYERS, ENC_DROPOUT)
decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, N_LAYERS, DEC_DROPOUT, attention)
seq2seq = Seq2Seq(encoder, decoder, device).to(device)
seq2seq

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(1469, 256)
    (rnn): GRU(256, 512, dropout=0.5, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
  )
  (decoder): Decoder(
    (attention): Attention(
      (fc1): Linear(in_features=1536, out_features=512, bias=True)
      (fc2): Linear(in_features=512, out_features=1, bias=False)
    )
    (embedding): Embedding(1337, 256)
    (rnn): GRU(1280, 512, dropout=0.5)
    (linear): Linear(in_features=512, out_features=1337, bias=True)
  )
)

In [451]:
outputs = seq2seq(test_batch[0],test_batch[1])
outputs.shape

torch.Size([14, 128, 1337])

In [452]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(seq2seq):,} trainable parameters')

The model has 7,837,497 trainable parameters


In [453]:
optimizer = optim.Adam(seq2seq.parameters())

# ignore the padding index when calculating the loss
PAD_IDX = target.vocab['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [454]:
from tqdm import tqdm
import math
import time

def train(seq2seq, iterator, optimizer, criterion):
    seq2seq.train()
    
    epoch_loss = 0
    
    for batch in tqdm(iterator):
        optimizer.zero_grad()
        outputs = seq2seq(test_batch[0], test_batch[1])

        # the loss function only works on 2d inputs
        # and 1d targets we need to flatten each of them
        outputs_flatten = outputs[1:].view(-1, outputs.shape[-1])
        trg_flatten = test_batch[1][1:].reshape(-1)
        loss = criterion(outputs_flatten, trg_flatten)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()

    return epoch_loss / 28


def evaluate(seq2seq, iterator, criterion):
    seq2seq.eval()

    epoch_loss = 0
    with torch.no_grad():
        for batch in tqdm(iterator):
            # turn off teacher forcing
            outputs = seq2seq(test_batch[0], test_batch[1], teacher_forcing_ratio=0) 

            # trg = [trg sent len, batch size]
            # output = [trg sent len, batch size, output dim]
            outputs_flatten = outputs[1:].view(-1, outputs.shape[-1])
            trg_flatten = test_batch[1][1:].reshape(-1)
            loss = criterion(outputs_flatten, trg_flatten)
            epoch_loss += loss.item()
        
    return epoch_loss / 4
  

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

N_EPOCHS = 30
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train(seq2seq, train_batches, optimizer, criterion)
    valid_loss = evaluate(seq2seq, val_batches, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(seq2seq.state_dict(), 'tut2-model.pt')

    print(f'Epoch: {epoch+1} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

28it [00:25,  1.09it/s]
4it [00:01,  3.34it/s]


Epoch: 1 | Time: 0m 26s
	Train Loss: 2.936 | Train PPL:  18.847
	 Val. Loss: 2.304 |  Val. PPL:  10.012


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 2 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 3 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 4 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 5 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 6 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 7 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 8 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 9 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 10 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 11 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 12 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 13 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 14 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 15 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 16 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 17 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 18 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 19 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 20 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 21 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 22 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 23 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 24 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 25 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 26 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 27 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 28 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 29 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]

Epoch: 30 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000





In [455]:
seq2seq.load_state_dict(torch.load('tut2-model.pt'))

test_loss = evaluate(seq2seq, test_batches, criterion)
print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

7it [00:02,  3.30it/s]

| Test Loss: 4.032 | Test PPL:  56.355 |





In [456]:
example_idx = 0
example = train_data[example_idx]
print('source sentence: ', ' '.join(example["English"]))
print('target sentence: ', ' '.join(example["Vietnamese"]))

source sentence:  Tom does n't have an Australian accent
target sentence:  tom không có giọng úc


In [466]:

src_tensor = source.numericalize([example["English"]], device=device)
trg_tensor = target.numericalize([example["Vietnamese"]], device=device)

print(trg_tensor.shape)

seq2seq.eval()
with torch.no_grad():
    outputs = seq2seq(src_tensor, trg_tensor, teacher_forcing_ratio=0)

print(outputs.shape)


torch.Size([7, 1])
torch.Size([7, 1, 1337])


In [467]:
output_idx = outputs[1:].squeeze(1).argmax(1).tolist()
print(output_idx)
' '.join([target.itos[idx] for idx in output_idx])


[6, 1336, 1336, 1336, 1336, 1336]


'đã <unk> <unk> <unk> <unk> <unk>'

In [471]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction
import sacrebleu

def evaluate(model, iterator, source_field, target_field, device):
    model.eval()

    with torch.no_grad():
        targets = []
        predictions = []
        for i, batch in enumerate(iterator):
            src = getattr(batch, source_field)
            trg = getattr(batch, target_field)
            output = model(src, trg, 0) # turn off teacher forcing
            output = output[1:].view(-1, output.shape[-1])
            trg = trg[1:].view(-1)

            # convert output indices to tokens
            predicted_tokens = []
            for i in range(output.shape[0]):
                predicted_tokens.append(target_field.vocab.itos[output[i]])

            targets.append(trg.tolist())
            predictions.append(predicted_tokens)

    # calculate BLEU score
    references = [[target_field.vocab.itos[token] for token in t if token != target_field.vocab.stoi[target_field.eos_token]] for t in targets]
    candidate_corpus = [[token for token in prediction if token != target_field.eos_token] for prediction in predictions]
    bleu_score = corpus_bleu(references, candidate_corpus, smoothing_function=SmoothingFunction().method1)

    # calculate METEOR score
    meteor_score = 0
    for i in range(len(targets)):
        meteor_score += nltk.translate.meteor_score(references[i], predictions[i])
    meteor_score /= len(targets)

    # calculate ROUGE score
    rouge = sacrebleu.corpus_rouge(predictions, references)

    # calculate TER score
    ter_score = 0
    for i in range(len(targets)):
        ter_score += nltk.translate.ter(references[i], predictions[i])
    ter_score /= len(targets)

    return bleu_score, meteor_score, rouge.score, ter_score

evaluate(seq2seq,train_batches,source,target,device=device)


ZeroDivisionError: Fraction(0, 0)