In [29]:
import random
import torch
SEED = 2222
random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x1ccfe7ca630>

In [30]:
from vncorenlp import VnCoreNLP
annotator = VnCoreNLP("VnCoreNLP-master\VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size='-Xmx500m') 

In [31]:
import nltk
import string
import itertools

def tokenize_en(text):
    tokens = nltk.word_tokenize(text)
    tokens = [token for token in tokens if token not in string.punctuation]
    return tokens

def tokenize_vi(text):
    return [tok for tok in itertools.chain.from_iterable(annotator.tokenize(text))]

text_en = 'Please put the dustpan in the broom closet'
text_vi = 'Cuốn sách này là của tôi. Của bạn đâu?'
print(tokenize_en(text_en))
print(tokenize_vi(text_vi))


['Please', 'put', 'the', 'dustpan', 'in', 'the', 'broom', 'closet']
['Cuốn', 'sách', 'này', 'là', 'của', 'tôi', '.', 'Của', 'bạn', 'đâu', '?']


In [32]:
import pandas as pd

def create_raw_dataset():
    data_dir = ""
    en_sents = open(data_dir + 'english.txt', "r",encoding="utf-8" ).read().splitlines()
    vi_sents = open(data_dir + 'vietnamese.txt', "r" ,encoding="utf-8").read().splitlines()
    return {
        "English": [line for line in en_sents[:5000]],
        "Vietnamese": [line for line in vi_sents[:5000]],
    }
raw_data = create_raw_dataset()

from sklearn.model_selection import train_test_split

df = pd.DataFrame(raw_data, columns=["English", "Vietnamese"])
train, test = train_test_split(df, test_size=0.2)
train, val = train_test_split(train, test_size=0.125)

train.to_json("train.json", orient="records", lines=True)
test.to_json("test.json", orient="records", lines=True)
val.to_json("val.json", orient="records", lines=True)


In [None]:
source = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>', lower=True)
target = Field(tokenize=tokenize_vi, init_token='<sos>', eos_token='<eos>', lower=True)
fields = {"English": ("src", source), "Vietnamese": ("trg", target)}
train_data, test_data, val_data = TabularDataset.splits(
    path="./", train="train.json", test="test.json", validation ="val.json", format="json", fields=fields
)
source.build_vocab(train_data, max_size=10000, min_freq=2)
target.build_vocab(train_data, max_size=10000, min_freq=2)
print(f"Unique tokens in source (en) vocabulary: {len(source.vocab)}")
print(f"Unique tokens in target (vi) vocabulary: {len(target.vocab)}")

BATCH_SIZE = 128
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, val_data, test_data), batch_size=BATCH_SIZE, sort_key = lambda x: len(x.src),
    sort_within_batch=True, device=device)
test_batch = next(iter(test_iterator))
test_batch.src

In [65]:
import json
from collections import Counter
from itertools import chain

source_tokenizer = tokenize_en
target_tokenizer = tokenize_vi

def load_data(filename, source_tokenizer, target_tokenizer):
    examples = []
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            example = json.loads(line)
            src = source_tokenizer(example["English"])
            trg = target_tokenizer(example["Vietnamese"])
            examples.append((src, trg))
    return examples

train_examples = load_data("train.json", source_tokenizer, target_tokenizer)
val_examples = load_data("val.json", source_tokenizer, target_tokenizer)
test_examples = load_data("test.json", source_tokenizer, target_tokenizer)

def build_vocab(tokenized_sentences, max_size=None, min_freq=1):
    word_counts = Counter(chain(*tokenized_sentences))
    sorted_words = sorted(word_counts.items(), key=lambda item: item[1], reverse=True)
    if max_size is not None:
        sorted_words = sorted_words[:max_size]
    vocabulary = {"<pad>": 0, "<unk>": 1, "<sos>": 2, "<eos>": 3}
    for word, count in sorted_words:
        if count >= min_freq and word not in vocabulary:
            vocabulary[word] = len(vocabulary)
    return vocabulary

source_sentences_train = [example[0] for example in train_examples]
target_sentences_train = [example[1] for example in train_examples]
source_vocab = build_vocab(source_sentences_train, max_size=10000, min_freq=2)
target_vocab = build_vocab(target_sentences_train, max_size=10000, min_freq=2)

print(f"Unique tokens in source (en) vocabulary: {len(source_vocab)}")
print(f"Unique tokens in target (vi) vocabulary: {len(target_vocab)}")

Unique tokens in source (en) vocabulary: 1529
Unique tokens in target (vi) vocabulary: 1343


In [84]:
for i, word in enumerate(source_vocab):
    if word == "stay":
        print("Index of 'stay' in source_vocab:", i)
        break

print (train_examples)

Index of 'stay' in source_vocab: 217


In [100]:
BATCH_SIZE = 128
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def get_iterator(data, batch_size, source_vocab, target_vocab, device):
    src_sents = [x[0] for x in data]
    trg_sents = [x[1] for x in data]
   
    # Sort the sentences by length for efficiency
    sorted_indices = sorted(range(len(src_sents)), key=lambda i: len(src_sents[i]))
    src_sents = [src_sents[i] for i in sorted_indices]
    trg_sents = [trg_sents[i] for i in sorted_indices]

    # Split the data into batches
    batches = []
    for i in range(0, len(src_sents), batch_size):
        src_batch = src_sents[i:i+batch_size]
        trg_batch = trg_sents[i:i+batch_size]
        batch = (src_batch, trg_batch)
        batches.append(batch)

    # Shuffle the batches
    random.shuffle(batches)

    # Iterate over the batches
    for batch in batches:
        # Convert the sentences to sequences of indices
        src_seqs = [torch.LongTensor([source_vocab[token] if token in source_vocab else source_vocab['<unk>'] for token in sent]) for sent in batch[0]]
        trg_seqs = [torch.LongTensor([target_vocab[token.split('_')[0]] if token.split('_')[0] in target_vocab else target_vocab['<unk>'] for token in sent]) for sent in batch[1]]

        # Pad the sequences
        src_seqs = torch.nn.utils.rnn.pad_sequence(src_seqs, batch_first=True, padding_value=source_vocab['<pad>']).to(device)
        trg_seqs = torch.nn.utils.rnn.pad_sequence(trg_seqs, batch_first=True, padding_value=target_vocab['<pad>']).to(device)

        # Return the batch
        yield (src_seqs, trg_seqs)


train_batches = get_iterator(train_examples, BATCH_SIZE, source_vocab, target_vocab, device)
valid_batches = get_iterator(val_examples, BATCH_SIZE, source_vocab, target_vocab, device)
test_batches = get_iterator(test_examples, BATCH_SIZE, source_vocab, target_vocab, device)

for i, test_batch in enumerate(test_batches):
    print(f"Batch {i}:")
    print(f"Source sequence: {test_batch[0]}")
    print(f"Target sequence: {test_batch[1]}")




Batch 0:
Source sequence: tensor([[  44,   10,    1,   56,    7,  853,    0],
        [   4,   38,   10,  221,    7,    1,    0],
        [   4,   38,   10,  363,  356,   87,    0],
        [   4,   16,   29,   21,  111,  133,    0],
        [  53,  181,    5,  292,   32,   47,    0],
        [   4,   58,    9,    1,    1,  298,    0],
        [ 106,   34,  102,  805,  464,    1,    0],
        [ 175,    1,  866,   74,  129,    1,    0],
        [  22,  758,    5,  158,   36,    1,    0],
        [   6,  798,   11,    9,  105,  567,    0],
        [  22,    1,    7,  644,   79,  216,    0],
        [   6,  411,  200,   10,  398,   51,    0],
        [ 123,    8,  301,  617,   17,  609,    0],
        [   4,   51,    5,  708,  148,  109,    0],
        [   6,   26,   31,   34,   10,    1,    0],
        [   4,   21,  451,   14,   84,  543,    0],
        [   1,    1,  259,    5,   28,  272,    0],
        [ 132,   16,   10,    8,   16,  366,    0],
        [   6,   26,    4,   21,  275,

In [102]:
# adjustable parameters
INPUT_DIM = len(source_vocab)
OUTPUT_DIM = len(target_vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
N_LAYERS = 1
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

In [104]:
from torch import nn, optim
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, n_layers, dropout):
        super().__init__()
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.input_dim = input_dim
        self.n_layers = n_layers
        self.dropout = dropout

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, n_layers, dropout=dropout,
                          bidirectional=True)
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)

    def forward(self, src_batch):
        # src [sent len, batch size]

        # [sent len, batch size, emb dim]
        embedded = self.embedding(src_batch)
        outputs, hidden = self.rnn(embedded)
        # outputs -> [sent len, batch size, hidden dim * n directions]
        # hidden -> [n layers * n directions, batch size, hidden dim]

        # initial decoder hidden is final hidden state of the forwards and
        # backwards encoder RNNs fed through a linear layer
        concated = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        hidden = torch.tanh(self.fc(concated))
        return outputs, hidden

In [108]:
# Rerun to reinitialize the iterator
test_batches = get_iterator(test_examples, BATCH_SIZE, source_vocab, target_vocab, device)

# Iterate over the batches
for i, test_batch in enumerate(test_batches):
    if i == 0:
        # Modify test_batch here if needed
        break


In [112]:
encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, N_LAYERS, ENC_DROPOUT).to(device)
test_batch = next(iter(test_batches))
outputs, hidden = encoder(test_batch[0].to(device))
print(outputs.shape, hidden.shape)


torch.Size([128, 6, 1024]) torch.Size([6, 512])
