In [22]:
import random
import torch
SEED = 2222
random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x2048e6466b0>

In [23]:
from vncorenlp import VnCoreNLP
annotator = VnCoreNLP("VnCoreNLP-master\VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size='-Xmx500m') 

In [24]:
import nltk
import string
import itertools

def tokenize_en(text):
    tokens = nltk.word_tokenize(text)
    tokens = [token for token in tokens if token not in string.punctuation]
    return tokens

def tokenize_vi(text):
    return [tok for tok in itertools.chain.from_iterable(annotator.tokenize(text))]

text_en = 'Please put the dustpan in the broom closet'
text_vi = 'Cuốn sách này là của tôi. Của bạn đâu?'
print(tokenize_en(text_en))
print(tokenize_vi(text_vi))


['Please', 'put', 'the', 'dustpan', 'in', 'the', 'broom', 'closet']
['Cuốn', 'sách', 'này', 'là', 'của', 'tôi', '.', 'Của', 'bạn', 'đâu', '?']


In [25]:
import pandas as pd

def create_raw_dataset():
    data_dir = ""
    en_sents = open(data_dir + 'english.txt', "r",encoding="utf-8" ).read().splitlines()
    vi_sents = open(data_dir + 'vietnamese.txt', "r" ,encoding="utf-8").read().splitlines()
    return {
        "English": [line for line in en_sents[:5000]],
        "Vietnamese": [line for line in vi_sents[:5000]],
    }
raw_data = create_raw_dataset()

from sklearn.model_selection import train_test_split

df = pd.DataFrame(raw_data, columns=["English", "Vietnamese"])
train, test = train_test_split(df, test_size=0.2)
train, val = train_test_split(train, test_size=0.125)

train.to_json("train.json", orient="records", lines=True)
test.to_json("test.json", orient="records", lines=True)
val.to_json("val.json", orient="records", lines=True)


In [26]:
import json
from collections import Counter
from itertools import chain

source_tokenizer = tokenize_en
target_tokenizer = tokenize_vi

def load_data(filename, source_tokenizer, target_tokenizer):
    examples = []
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            example = json.loads(line)
            src = source_tokenizer(example["English"])
            trg = target_tokenizer(example["Vietnamese"])
            examples.append((src, trg))
    return examples

train_examples = load_data("train.json", source_tokenizer, target_tokenizer)
val_examples = load_data("val.json", source_tokenizer, target_tokenizer)
test_examples = load_data("test.json", source_tokenizer, target_tokenizer)

def build_vocab(tokenized_sentences, max_size=None, min_freq=1):
    word_counts = Counter(chain(*tokenized_sentences))
    sorted_words = sorted(word_counts.items(), key=lambda item: item[1], reverse=True)
    if max_size is not None:
        sorted_words = sorted_words[:max_size]
    vocabulary = {"<pad>": 0, "<unk>": 1, "<sos>": 2, "<eos>": 3}
    for word, count in sorted_words:
        if count >= min_freq and word not in vocabulary:
            vocabulary[word] = len(vocabulary)
    return vocabulary

source_sentences_train = [example[0] for example in train_examples]
target_sentences_train = [example[1] for example in train_examples]
source_vocab = build_vocab(source_sentences_train, max_size=10000, min_freq=2)
target_vocab = build_vocab(target_sentences_train, max_size=10000, min_freq=2)

print(f"Unique tokens in source (en) vocabulary: {len(source_vocab)}")
print(f"Unique tokens in target (vi) vocabulary: {len(target_vocab)}")

Unique tokens in source (en) vocabulary: 1535
Unique tokens in target (vi) vocabulary: 1359


In [70]:
BATCH_SIZE = 128
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def get_iterator(data, batch_size, source_vocab, target_vocab, device):
    src_sents = [x[0] for x in data]
    trg_sents = [x[1] for x in data]

    # Sort the sentences by length for efficiency
    sorted_indices = sorted(range(len(src_sents)), key=lambda i: len(src_sents[i]))

    # Split the data into batches
    batches = []
    for i in range(0, len(src_sents[:batch_size*128]), batch_size):
        indices = sorted_indices[i:i+batch_size]
        src_batch = [src_sents[j] for j in indices]
        trg_batch = [trg_sents[j] for j in indices]
        batch = {"src": src_batch, "trg": trg_batch}
        batches.append(batch)

    # Shuffle the batches
    random.shuffle(batches)

    # Iterate over the batches
    for batch in batches:
        # Convert the sentences to sequences of indices
        src_seqs = [torch.LongTensor([source_vocab[token] if token in source_vocab else source_vocab['<unk>'] for token in sent]) for sent in batch['src']]
        trg_seqs = [torch.LongTensor([target_vocab[token] if token in target_vocab else target_vocab['<unk>'] for token in sent]) for sent in batch['trg']]

        # Pad the sequences
        src_seqs = torch.nn.utils.rnn.pad_sequence(src_seqs, batch_first=True, padding_value=source_vocab['<pad>']).to(device)
        trg_seqs = torch.nn.utils.rnn.pad_sequence(trg_seqs, batch_first=True, padding_value=target_vocab['<pad>']).to(device)

        # Return the batch
        yield {"src": src_seqs, "trg": trg_seqs}





train_batches = get_iterator(train_examples, BATCH_SIZE, source_vocab, target_vocab, device)
valid_batches = get_iterator(val_examples, BATCH_SIZE, source_vocab, target_vocab, device)
test_batches = get_iterator(test_examples, BATCH_SIZE, source_vocab, target_vocab, device)

for i, batch in enumerate (test_batches):
    print(f"Batch {i}:")
    print("Source sequences:")
    print(batch["src"][i])
    print(batch["src"])
    print("Target sequences:")
    print(batch["trg"][i])
    print(batch["trg"])




Batch 0:
Source sequences:
tensor([148,  16,  10,   8,  74,  52,  70,   6,   0])
tensor([[148,  16,  10,  ...,  70,   6,   0],
        [  6,  56, 125,  ...,  34,   1,   0],
        [  4,  47, 121,  ...,  46,   1,   0],
        ...,
        [  4,  66, 117,  ..., 122,  67,  18],
        [  4,  33, 926,  ...,   1, 400, 203],
        [  4,  69,  25,  ...,  98,  39, 801]])
Target sequences:
tensor([251,   6,   7,  37, 168,   8,  14,   0,   0,   0,   0,   0,   0])
tensor([[251,   6,   7,  ...,   0,   0,   0],
        [  8,  15, 121,  ...,   0,   0,   0],
        [ 27,  17,   1,  ...,   0,   0,   0],
        ...,
        [  5,   9,  15,  ...,   0,   0,   0],
        [ 27,   1, 128,  ...,   0,   0,   0],
        [  5,  17,  99,  ...,   0,   0,   0]])
Batch 1:
Source sequences:
tensor([ 6, 11, 12,  1,  1,  0])
tensor([[  29,   47,   85,    8,  331,    0],
        [   6,   11,   12,    1,    1,    0],
        [   6,    1,   28,   14,  386,    0],
        [  80,   11,    9,  535,    1,    0],
   

In [103]:
import torch
from torch import nn, optim

# adjustable parameters
INPUT_DIM = len(source_vocab)
OUTPUT_DIM = len(target_vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
N_LAYERS = 1
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, n_layers, dropout):
        super().__init__()
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.input_dim = input_dim
        self.n_layers = n_layers
        self.dropout = dropout

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, n_layers, dropout=dropout,
                          bidirectional=True)
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)

    def forward(self, src_batch):
        # src [sent len, batch size]

        # [sent len, batch size, emb dim]
        embedded = self.embedding(src_batch)
        outputs, hidden = self.rnn(embedded)
        # outputs -> [sent len, batch size, hidden dim * n directions]
        # hidden -> [n layers * n directions, batch size, hidden dim]

        # initial decoder hidden is final hidden state of the forwards and
        # backwards encoder RNNs fed through a linear layer
        concated = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        hidden = torch.tanh(self.fc(concated))
        return outputs, hidden

encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, N_LAYERS, ENC_DROPOUT).to(device)
outputs, hidden = encoder(batch["src"])
outputs = outputs.permute(1, 0, 2)
hidden = hidden.repeat(32, 1)
# transpose the outputs tensor to have batch size as the first dimension
print(outputs.shape, hidden.shape)



torch.Size([4, 128, 1024]) torch.Size([128, 512])


In [104]:
class Attention(nn.Module):

    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim

        # enc_hid_dim multiply by 2 due to bidirectional
        self.fc1 = nn.Linear(enc_hid_dim * 2 + dec_hid_dim, dec_hid_dim)
        self.fc2 = nn.Linear(dec_hid_dim, 1, bias=False)

    def forward(self, encoder_outputs, hidden):
        src_len = encoder_outputs.shape[0]
        batch_size = encoder_outputs.shape[1]
        
        # repeat encoder hidden state src_len times [batch size, sent len, dec hid dim]
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        # reshape/permute the encoder output, so that the batch size comes first
        # [batch size, sent len, enc hid dim * 2], times 2 because of bidirectional
        outputs = encoder_outputs.permute(1, 0, 2)

        # the attention mechanism receives a concatenation of the hidden state
        # and the encoder output
        concat = torch.cat((hidden, outputs), dim=2)
        
        # fully connected layer and softmax layer to compute the attention weight
        # [batch size, sent len, dec hid dim]
        energy = torch.tanh(self.fc1(concat))
        # attention weight should be of [batch size, sent len]
        attention = self.fc2(energy).squeeze(dim=2)  
        attention_weight = torch.softmax(attention, dim=1)
        return attention_weight

    
attention = Attention(ENC_HID_DIM, DEC_HID_DIM).to(device)
attention_weight = attention(outputs, hidden)
attention_weight.shape

torch.Size([128, 4])

In [111]:
class Decoder(nn.Module):

    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, n_layers,
                 dropout, attention):
        super().__init__()
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.output_dim = output_dim
        self.n_layers = n_layers
        self.dropout = dropout
        self.attention = attention

        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU(enc_hid_dim * 2 + emb_dim, dec_hid_dim, n_layers, dropout=dropout)
        self.linear = nn.Linear(dec_hid_dim, output_dim)

    def forward(self, trg, encoder_outputs, hidden):
        # trg [batch size]
        # outputs [src sen len, batch size, enc hid dim * 2], times 2 due to bidirectional
        # hidden [batch size, dec hid dim]

        # [batch size, 1, sent len] 
        attention = self.attention(encoder_outputs, hidden).unsqueeze(1)

        # [batch size, sent len, enc hid dim * 2]
        outputs = encoder_outputs.permute(1, 0, 2)

        # [1, batch size, enc hid dim * 2]
        context = torch.bmm(attention, outputs).permute(1, 0, 2)

        # input sentence -> embedding
        # [1, batch size, emb dim]
        embedded = self.embedding(trg.unsqueeze(0))
        print('embedded shape:', embedded.shape)
        print('context shape:', context.shape)
        rnn_input = torch.cat((embedded, context), dim=2)
        

        outputs, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
        prediction = self.linear(outputs.squeeze(0))
        return prediction, hidden.squeeze(0)


decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, N_LAYERS, DEC_DROPOUT, attention).to(device)
prediction, decoder_hidden = decoder(batch["trg"][0], outputs, hidden)

# notice the decoder_hidden's shape should match the shape that's generated by
# the encoder
prediction.shape, decoder_hidden.shape

embedded shape: torch.Size([1, 8, 256])
context shape: torch.Size([1, 128, 1024])


RuntimeError: Sizes of tensors must match except in dimension 2. Expected size 8 but got size 128 for tensor number 1 in the list.

In [106]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src_batch, trg_batch, teacher_forcing_ratio=0.5):
        max_len, batch_size = trg_batch.shape
        trg_vocab_size = self.decoder.output_dim

        # tensor to store decoder's output
        outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device)

        # encoder_outputs : all hidden states of the input sequence (forward and backward)
        # hidden : final forward and backward hidden states, passed through a linear layer
        encoder_outputs, hidden = self.encoder(src_batch)

        trg = trg_batch[0]
        for i in range(1, max_len):
            prediction, hidden = self.decoder(trg, encoder_outputs, hidden)
            outputs[i] = prediction

            if random.random() < teacher_forcing_ratio:
                trg = trg_batch[i]
            else:
                trg = prediction.argmax(1)

        return outputs
    
attention = Attention(ENC_HID_DIM, DEC_HID_DIM)
encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, N_LAYERS, ENC_DROPOUT)
decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, N_LAYERS, DEC_DROPOUT, attention)
seq2seq = Seq2Seq(encoder, decoder, device).to(device)
seq2seq

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(1535, 256)
    (rnn): GRU(256, 512, dropout=0.5, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
  )
  (decoder): Decoder(
    (attention): Attention(
      (fc1): Linear(in_features=1536, out_features=512, bias=True)
      (fc2): Linear(in_features=512, out_features=1, bias=False)
    )
    (embedding): Embedding(1359, 256)
    (rnn): GRU(1280, 512, dropout=0.5)
    (linear): Linear(in_features=512, out_features=1359, bias=True)
  )
)

In [107]:
for batch in test_batches:
    src = batch["src"]
    trg = batch["trg"]
    outputs = seq2seq(src, trg)
    # do something with the outputs

outputs.shape

torch.Size([4, 128, 1024])

In [108]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(seq2seq):,} trainable parameters')

The model has 7,871,311 trainable parameters


In [109]:
optimizer = optim.Adam(seq2seq.parameters())

# ignore the padding index when calculating the loss
PAD_IDX = target_vocab['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [110]:
from tqdm import tqdm
import time
import math
def train(seq2seq, iterator, optimizer, criterion):
    seq2seq.train()
    
    epoch_loss = 0
    for batch in tqdm(iterator):
        optimizer.zero_grad()
        outputs = seq2seq(batch.src, batch.trg)

        # the loss function only works on 2d inputs
        # and 1d targets we need to flatten each of them
        outputs_flatten = outputs[1:].view(-1, outputs.shape[-1])
        trg_flatten = batch.trg[1:].view(-1)
        loss = criterion(outputs_flatten, trg_flatten)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

def evaluate(seq2seq, iterator, criterion):
    seq2seq.eval()

    epoch_loss = 0
    with torch.no_grad():
        for batch in tqdm(iterator):
            # turn off teacher forcing
            outputs = seq2seq(batch.src, batch.trg, teacher_forcing_ratio=0) 

            # trg = [trg sent len, batch size]
            # output = [trg sent len, batch size, output dim]
            outputs_flatten = outputs[1:].view(-1, outputs.shape[-1])
            trg_flatten = batch.trg[1:].view(-1)
            loss = criterion(outputs_flatten, trg_flatten)
            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

N_EPOCHS = 30
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train(seq2seq, train_iterator, optimizer, criterion)
    valid_loss = evaluate(seq2seq, valid_iterator, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(seq2seq.state_dict(), 'tut2-model.pt')

    print(f'Epoch: {epoch+1} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

  inputs = np.array(inputs)
  targets = np.array(targets)
0it [00:00, ?it/s]


AttributeError: 'tuple' object has no attribute 'src'