In [22]:
import random
import torch
# Seed Python's `random` module and PyTorch's default generator with one shared
# value so that code drawing from these two sources is repeatable across runs.
# NOTE(review): NumPy's global RNG is not seeded here — confirm nothing
# downstream depends on it for reproducibility.
SEED = 2222
random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x18097cdfd10>

In [23]:
from vncorenlp import VnCoreNLP
# Word-segmentation ("wseg") annotator used by tokenize_vi.
# The path is a raw string: "\V" is not a valid escape sequence, so the non-raw
# literal raises an invalid-escape warning on recent Python versions. The
# resulting path value is exactly the same as before.
annotator = VnCoreNLP(r"VnCoreNLP-master\VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size='-Xmx500m')

In [24]:
import nltk
import string
import itertools

def tokenize_en(text):
    """Tokenize an English sentence with NLTK and drop punctuation tokens.

    A token is discarded when it is empty or consists only of characters from
    ``string.punctuation``. This also removes the multi-character punctuation
    tokens NLTK can emit (for example `` `` ``, ``''``, ``...`` and ``--``),
    which a substring test against ``string.punctuation`` lets through.

    Args:
        text: The sentence to tokenize.

    Returns:
        list[str]: The remaining tokens, in their original order.
    """
    punctuation_chars = set(string.punctuation)
    return [
        token
        for token in nltk.word_tokenize(text)
        # Keep a token only if at least one of its characters is not punctuation.
        if any(char not in punctuation_chars for char in token)
    ]

def tokenize_vi(text):
    """Tokenize Vietnamese text into a single flat list of tokens.

    The result of ``annotator.tokenize(text)`` is iterated as a sequence of
    token sequences, which are concatenated in order.
    """
    flat_tokens = []
    for sentence_tokens in annotator.tokenize(text):
        flat_tokens.extend(sentence_tokens)
    return flat_tokens

# Run each tokenizer on one sample sentence and print the resulting tokens.
text_en = 'Please put the dustpan in the broom closet'
text_vi = 'Cuốn sách này là của tôi. Của bạn đâu?'
for tokenize_fn, sample_text in ((tokenize_en, text_en), (tokenize_vi, text_vi)):
    print(tokenize_fn(sample_text))


['Please', 'put', 'the', 'dustpan', 'in', 'the', 'broom', 'closet']
['Cuốn', 'sách', 'này', 'là', 'của', 'tôi', '.', 'Của', 'bạn', 'đâu', '?']


In [61]:
import pandas as pd

# Maps each data column name to a (field name, tokenizer function) pair:
# "English" -> ("src", tokenize_en), "Vietnamese" -> ("trg", tokenize_vi).
# NOTE(review): `fields` is not referenced by any other cell visible here —
# confirm it is still needed.
fields = {"English": ("src", tokenize_en), "Vietnamese": ("trg", tokenize_vi)}

def create_raw_dataset(data_dir="", max_pairs=5000):
    """Load the parallel English/Vietnamese corpus as two aligned lists of lines.

    Args:
        data_dir: Directory containing ``english.txt`` and ``vietnamese.txt``.
            The default ``""`` resolves both names against the current
            working directory.
        max_pairs: Maximum number of leading lines kept from each file.
            ``None`` keeps every line.

    Returns:
        dict: ``{"English": [...], "Vietnamese": [...]}`` where each value is
        a list of lines without trailing newlines.

    Raises:
        ValueError: If, after truncation, the two files do not contribute the
            same number of lines (the sentence pairs would be misaligned).
        OSError: If either file cannot be opened.
    """
    import os

    def _read_lines(filename):
        # The context manager closes the handle even if reading fails.
        with open(os.path.join(data_dir, filename), "r", encoding="utf-8") as handle:
            return handle.read().splitlines()[:max_pairs]

    en_sents = _read_lines("english.txt")
    vi_sents = _read_lines("vietnamese.txt")

    # Fail here with a clear message instead of later inside pandas.
    if len(en_sents) != len(vi_sents):
        raise ValueError(
            "english.txt and vietnamese.txt are not aligned: "
            f"{len(en_sents)} vs {len(vi_sents)} lines"
        )

    return {"English": en_sents, "Vietnamese": vi_sents}
# Load the sentence pairs into memory as a dict with "English" and "Vietnamese" lists.
raw_data = create_raw_dataset()

from sklearn.model_selection import train_test_split

# Split the sentence pairs 70% / 10% / 20% into train / validation / test:
# 20% is held out for test, then 12.5% of the remaining 80% (10% overall)
# becomes the validation set.
# random_state pins both shuffles. Without it scikit-learn draws from NumPy's
# global RNG, which the seeding cell does not seed, so the splits differed on
# every run.
df = pd.DataFrame(raw_data, columns=["English", "Vietnamese"])
train, test = train_test_split(df, test_size=0.2, random_state=SEED)
train, val = train_test_split(train, test_size=0.125, random_state=SEED)

# Write each split as one JSON array of records. lines=True would produce JSON
# Lines (one object per line), which the json.load call in TranslationDataset
# cannot parse.
train.to_json("train.json", orient="records", lines=False)
test.to_json("test.json", orient="records", lines=False)
val.to_json("val.json", orient="records", lines=False)


In [62]:
from torch.utils.data import Dataset, DataLoader
import json

class TranslationDataset(Dataset):
    """In-memory dataset of (source, target) sentence pairs read from a JSON file.

    The file must hold a JSON array of records; every record is indexed with
    ``src_field_idx`` and ``trg_field_idx`` to obtain the two sentences.
    """

    def __init__(self, data_path, src_field_idx, trg_field_idx):
        """Read all records from ``data_path`` and keep both columns in memory.

        Args:
            data_path: Path of the JSON file to load.
            src_field_idx: Key of the source sentence in each record.
            trg_field_idx: Key of the target sentence in each record.
        """
        # Decode explicitly as UTF-8 rather than with the platform default
        # encoding, so non-ASCII text loads the same way on every system.
        with open(data_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        self.src = [record[src_field_idx] for record in data]
        self.trg = [record[trg_field_idx] for record in data]
        self.length = len(self.src)

    def __len__(self):
        """Return the number of sentence pairs."""
        return self.length

    def __getitem__(self, idx):
        """Return the ``(source, target)`` pair stored at position ``idx``."""
        src_sent = self.src[idx]
        trg_sent = self.trg[idx]
        return src_sent, trg_sent

def collate_fn(batch):
    """Regroup a batch of (source, target) pairs into two parallel tuples.

    Returns a dict whose "src" entry holds every source item and whose "trg"
    entry holds every target item, both in batch order.
    """
    sources, targets = zip(*batch)
    return dict(src=sources, trg=targets)

# Mini-batch size shared by all three loaders.
batch_size = 128

# One dataset per JSON split; "English" is the source column, "Vietnamese" the target.
train_data = TranslationDataset(
    "train.json", src_field_idx="English", trg_field_idx="Vietnamese"
)
val_data = TranslationDataset(
    "val.json", src_field_idx="English", trg_field_idx="Vietnamese"
)
test_data = TranslationDataset(
    "test.json", src_field_idx="English", trg_field_idx="Vietnamese"
)

# Only the training loader shuffles; validation and test are created with shuffle=False.
train_loader = DataLoader(
    train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_fn
)
val_loader = DataLoader(
    val_data, batch_size=batch_size, shuffle=False, collate_fn=collate_fn
)
test_loader = DataLoader(
    test_data, batch_size=batch_size, shuffle=False, collate_fn=collate_fn
)

In [60]:
# Print the source and target sentences of training example 2000, one per line.
src_sent, trg_sent = train_data[2000]
print(src_sent, trg_sent, sep="\n")


You look surprised to see me
bạn có vẻ ngạc nhiên khi thấy tôi
