In [2]:
import random
import torch
SEED = 2222
random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x210437cb2b0>

In [3]:
from vncorenlp import VnCoreNLP
annotator = VnCoreNLP("VnCoreNLP-master\VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size='-Xmx500m') 

In [4]:
import nltk
import string
import itertools

def tokenize_en(text):
    tokens = nltk.word_tokenize(text)
    tokens = [token for token in tokens if token not in string.punctuation]
    return tokens

def tokenize_vi(text):
    return [tok for tok in itertools.chain.from_iterable(annotator.tokenize(text))]

text_en = 'Please put the dustpan in the broom closet'
text_vi = 'Cuốn sách này là của tôi. Của bạn đâu?'
print(tokenize_en(text_en))
print(tokenize_vi(text_vi))


['Please', 'put', 'the', 'dustpan', 'in', 'the', 'broom', 'closet']
['Cuốn', 'sách', 'này', 'là', 'của', 'tôi', '.', 'Của', 'bạn', 'đâu', '?']


In [12]:
import pandas as pd

def create_raw_dataset():
    data_dir = ""
    en_sents = open(data_dir + 'english.txt', "r",encoding="utf-8" ).read().splitlines()
    vi_sents = open(data_dir + 'vietnamese.txt', "r" ,encoding="utf-8").read().splitlines()
    return {
        "English": [line for line in en_sents[:5000]],
        "Vietnamese": [line for line in vi_sents[:5000]],
    }
raw_data = create_raw_dataset()

from sklearn.model_selection import train_test_split

df = pd.DataFrame(raw_data, columns=["English", "Vietnamese"])
train, test = train_test_split(df, test_size=0.2)
train, val = train_test_split(train, test_size=0.125)

train.to_json("train.json", orient="records", lines=False)#line=true will cause json error 
test.to_json("test.json", orient="records", lines=False)
val.to_json("val.json", orient="records", lines=False)


In [39]:
from torch.utils.data import Dataset, DataLoader
import json
import torch

class TranslationDataset(Dataset):
    def __init__(self, data_path, src_field_idx, trg_field_idx):
        with open(data_path, "r") as f:
            data = json.load(f)
        self.src = [data[i][src_field_idx] for i in range(len(data))]
        self.trg = [data[i][trg_field_idx] for i in range(len(data))]
        self.length = len(self.src)
        
    def __len__(self):
        return self.length
    
    def __getitem__(self, idx):
        src_sent = self.src[idx]
        trg_sent = self.trg[idx]
        
        # convert to tensor
        src_tensor = torch.tensor(src_sent)
        trg_tensor = torch.tensor(trg_sent)
        
        return src_tensor, trg_tensor

train_data = TranslationDataset("train.json", "English", "Vietnamese")
val_data = TranslationDataset("val.json", "English", "Vietnamese")
test_data = TranslationDataset("test.json", "English", "Vietnamese")

batch_size = 128

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False )
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

In [53]:
class Field:
    def __init__(self, tokenize_fn, init_token=None, eos_token=None, lower=False):
        self.tokenize_fn = tokenize_fn
        self.init_token = init_token
        self.eos_token = eos_token
        self.lower = lower
        self.vocab = None

    def tokenize(self, text):
        tokens = self.tokenize_fn(text)
        if self.lower:
            tokens = [token.lower() for token in tokens]
        if self.init_token:
            tokens.insert(0, self.init_token)
        if self.eos_token:
            tokens.append(self.eos_token)
        return tokens

    def build_vocab(self, data, max_size=None, min_freq=1):
        word_freq = {}
        for example in data:
            for word in self.tokenize(example):
                if word in word_freq:
                    word_freq[word] += 1
                else:
                    word_freq[word] = 1
        sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
        most_common_words = [word[0] for word in sorted_words if word[1] >= min_freq]
        if max_size:
            most_common_words = most_common_words[:max_size]
        self.vocab = {word: i for i, word in enumerate(most_common_words)}

    def numericalize(self, text):
        return [self.vocab[token] if token in self.vocab else self.vocab['<unk>'] for token in text]

source = Field(tokenize_fn=tokenize_en, init_token='<sos>', eos_token='<eos>', lower=True)
target = Field(tokenize_fn=tokenize_vi, init_token='<sos>', eos_token='<eos>', lower=True)


In [54]:
source.build_vocab(train_data, max_size=10000, min_freq=2)
target.build_vocab(train_data, max_size=10000, min_freq=2)

TypeError: new(): invalid data type 'str'