In [29]:
import random
import torch
SEED = 2222
random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x1ccfe7ca630>

In [30]:
from vncorenlp import VnCoreNLP
annotator = VnCoreNLP("VnCoreNLP-master\VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size='-Xmx500m') 

In [31]:
import nltk
import string
import itertools

def tokenize_en(text):
    tokens = nltk.word_tokenize(text)
    tokens = [token for token in tokens if token not in string.punctuation]
    return tokens

def tokenize_vi(text):
    return [tok for tok in itertools.chain.from_iterable(annotator.tokenize(text))]

text_en = 'Please put the dustpan in the broom closet'
text_vi = 'Cuốn sách này là của tôi. Của bạn đâu?'
print(tokenize_en(text_en))
print(tokenize_vi(text_vi))


['Please', 'put', 'the', 'dustpan', 'in', 'the', 'broom', 'closet']
['Cuốn', 'sách', 'này', 'là', 'của', 'tôi', '.', 'Của', 'bạn', 'đâu', '?']


In [32]:
import pandas as pd

def create_raw_dataset():
    data_dir = ""
    en_sents = open(data_dir + 'english.txt', "r",encoding="utf-8" ).read().splitlines()
    vi_sents = open(data_dir + 'vietnamese.txt', "r" ,encoding="utf-8").read().splitlines()
    return {
        "English": [line for line in en_sents[:5000]],
        "Vietnamese": [line for line in vi_sents[:5000]],
    }
raw_data = create_raw_dataset()

from sklearn.model_selection import train_test_split

df = pd.DataFrame(raw_data, columns=["English", "Vietnamese"])
train, test = train_test_split(df, test_size=0.2)
train, val = train_test_split(train, test_size=0.125)

train.to_json("train.json", orient="records", lines=True)
test.to_json("test.json", orient="records", lines=True)
val.to_json("val.json", orient="records", lines=True)


In [None]:
source = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>', lower=True)
target = Field(tokenize=tokenize_vi, init_token='<sos>', eos_token='<eos>', lower=True)
fields = {"English": ("src", source), "Vietnamese": ("trg", target)}
train_data, test_data, val_data = TabularDataset.splits(
    path="./", train="train.json", test="test.json", validation ="val.json", format="json", fields=fields
)
source.build_vocab(train_data, max_size=10000, min_freq=2)
target.build_vocab(train_data, max_size=10000, min_freq=2)
print(f"Unique tokens in source (en) vocabulary: {len(source.vocab)}")
print(f"Unique tokens in target (vi) vocabulary: {len(target.vocab)}")

BATCH_SIZE = 128
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, val_data, test_data), batch_size=BATCH_SIZE, sort_key = lambda x: len(x.src),
    sort_within_batch=True, device=device)
test_batch = next(iter(test_iterator))
test_batch.src

In [65]:
import json
from collections import Counter
from itertools import chain

source_tokenizer = tokenize_en
target_tokenizer = tokenize_vi

def load_data(filename, source_tokenizer, target_tokenizer):
    examples = []
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            example = json.loads(line)
            src = source_tokenizer(example["English"])
            trg = target_tokenizer(example["Vietnamese"])
            examples.append((src, trg))
    return examples

train_examples = load_data("train.json", source_tokenizer, target_tokenizer)
val_examples = load_data("val.json", source_tokenizer, target_tokenizer)
test_examples = load_data("test.json", source_tokenizer, target_tokenizer)

def build_vocab(tokenized_sentences, max_size=None, min_freq=1):
    word_counts = Counter(chain(*tokenized_sentences))
    sorted_words = sorted(word_counts.items(), key=lambda item: item[1], reverse=True)
    if max_size is not None:
        sorted_words = sorted_words[:max_size]
    vocabulary = {"<pad>": 0, "<unk>": 1, "<sos>": 2, "<eos>": 3}
    for word, count in sorted_words:
        if count >= min_freq and word not in vocabulary:
            vocabulary[word] = len(vocabulary)
    return vocabulary

source_sentences_train = [example[0] for example in train_examples]
target_sentences_train = [example[1] for example in train_examples]
source_vocab = build_vocab(source_sentences_train, max_size=10000, min_freq=2)
target_vocab = build_vocab(target_sentences_train, max_size=10000, min_freq=2)

print(f"Unique tokens in source (en) vocabulary: {len(source_vocab)}")
print(f"Unique tokens in target (vi) vocabulary: {len(target_vocab)}")

Unique tokens in source (en) vocabulary: 1529
Unique tokens in target (vi) vocabulary: 1343


In [74]:
print(source_vocab)
print(target_vocab)


{'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3, 'I': 4, 'to': 5, 'Tom': 6, 'the': 7, 'you': 8, 'a': 9, "n't": 10, 'is': 11, "'s": 12, 'of': 13, 'in': 14, 'was': 15, 'do': 16, 'that': 17, 'for': 18, 'me': 19, 'The': 20, 'have': 21, 'He': 22, 'it': 23, 'You': 24, 'his': 25, 'and': 26, 'We': 27, 'be': 28, 'not': 29, "'m": 30, 'Mary': 31, 'with': 32, 'he': 33, 'are': 34, 'this': 35, 'on': 36, 'It': 37, 'did': 38, 'at': 39, 'my': 40, 'want': 41, 'your': 42, 'what': 43, 'Do': 44, "'ll": 45, 'know': 46, 'him': 47, 'about': 48, "'re": 49, 'think': 50, 'go': 51, 'What': 52, 'She': 53, 'as': 54, 'her': 55, 'up': 56, 'like': 57, 'had': 58, 'here': 59, 'time': 60, 'get': 61, 'all': 62, 'will': 63, 'has': 64, 'can': 65, "'ve": 66, 'one': 67, 'we': 68, 'been': 69, 'were': 70, 'ca': 71, 'just': 72, 'out': 73, 'by': 74, 'does': 75, 'could': 76, 'They': 77, 'going': 78, 'very': 79, 'That': 80, 'would': 81, 'tell': 82, 'us': 83, 'an': 84, 'This': 85, 'should': 86, 'there': 87, 'come': 88, 'from': 89, '

In [75]:
BATCH_SIZE = 128
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def get_iterator(data, batch_size, device):
    data.sort(key=lambda x: len(x[0]))
    batches = []
    for i in range(0, len(data), batch_size):
        batch = data[i:i+batch_size]
        try:
            src_seqs = [torch.LongTensor([source_vocab[t] for t in src]) for (src, trg) in batch]
        except KeyError as e:
            print("Error: missing key in source_vocab:", e)
            print("Batch:", batch)
            continue
        try:
            trg_seqs = [torch.LongTensor([target_vocab[t] for t in trg]) for (src, trg) in batch]
        except KeyError as e:
            print("Error: missing key in target_vocab:", e)
            print("Batch:", batch)
            continue
        src_seqs = torch.nn.utils.rnn.pad_sequence(src_seqs, batch_first=True).to(device)
        trg_seqs = torch.nn.utils.rnn.pad_sequence(trg_seqs, batch_first=True).to(device)
        batches.append((src_seqs, trg_seqs))
    return batches

train_batches = get_iterator(train_examples, BATCH_SIZE, device)
valid_batches = get_iterator(val_examples, BATCH_SIZE, device)
test_batches = get_iterator(test_examples, BATCH_SIZE, device)

test_batch = test_batches[0]
test_src = test_batch[0]

Error: missing key in source_vocab: 'Stay'
Batch: [(['Hi'], ['Chào', '.']), (['Bring', 'wine'], ['mang', 'rượu']), (['Stay', 'sharp'], ['Minh_mẫn']), (['Tom', 'knits'], ['vải', 'dệt_kim']), (['He', 'ran'], ['ông', 'chạy', '.']), (['No', 'kidding'], ['không', 'đua', 'đâu', '?']), (['She', 'runs'], ['Cô', 'ấy', 'chạy', '.']), (['Stop', 'yelling'], ['ngừng', 'la_hét']), (['What', 'stopped', 'Tom'], ['Điều', 'gì', 'đã', 'dừng', 'tom', '?']), (['Tom', 'rushed', 'upstairs'], ['Tom', 'vội_vã', 'lên', 'lầu', '.']), (['Tom', 'saw', 'somebody'], ['tom', 'thấy', 'ai', 'đó']), (['That', "'s", 'Saturn'], ['đó', 'là', 'saturn', '.']), (['Tom', 'remained', 'silent'], ['tom', 'vẫn', 'im_lặng']), (['Ann', 'came', 'downstairs'], ['ann', 'đi', 'xuống', 'cầu_thang', '.']), (['They', "'re", 'clean'], ['họ', 'sạch_sẽ']), (['Tom', 'seems', 'dazed'], ['tom', 'có_vẻ', 'choáng_váng']), (['We', 'saw', 'you'], ['chúng_tôi', 'đã', 'nhìn', 'thấy', 'bạn']), (['A', 'coke', 'please'], ['làm_ơn', 'cho', 'một', 'cốc', '

IndexError: list index out of range