In [22]:
import random
import torch
# One seed shared by the RNGs seeded below so that reruns draw the same numbers.
# NOTE(review): NumPy's global RNG is not seeded in this cell.
SEED = 2222
random.seed(SEED)  # Python's built-in `random` module
# PyTorch's default generator. Kept as the cell's last expression: the returned
# Generator object is what the notebook echoes as this cell's output.
torch.manual_seed(SEED)

<torch._C.Generator at 0x18097cdfd10>

In [23]:
import os

from vncorenlp import VnCoreNLP

# Build the jar path with os.path.join instead of a hard-coded "\" separator:
# the previous literal contained "\V", which is not a valid string escape
# (Python emits a warning for it) and only resolved to a real path on Windows.
# On Windows the joined path is the same string as before.
VNCORENLP_JAR = os.path.join("VnCoreNLP-master", "VnCoreNLP-1.1.1.jar")

# Only the "wseg" annotator is requested; the heap size flag caps the JVM at 500 MB.
annotator = VnCoreNLP(VNCORENLP_JAR, annotators="wseg", max_heap_size='-Xmx500m')

In [24]:
import nltk
import string
import itertools

def tokenize_en(text):
    """Tokenize English text with NLTK and drop punctuation-only tokens.

    Args:
        text: The English sentence to tokenize.

    Returns:
        A list of token strings in their original order, without tokens that
        consist solely of ASCII punctuation characters.
    """
    punctuation_chars = set(string.punctuation)
    # Test every character of the token against the punctuation set. A plain
    # `token in string.punctuation` is a substring test, so multi-character
    # punctuation tokens such as "``", "''", "..." or "--" would slip through.
    return [
        token
        for token in nltk.word_tokenize(text)
        if not all(char in punctuation_chars for char in token)
    ]

def tokenize_vi(text):
    """Tokenize Vietnamese text with the module-level ``annotator``.

    ``annotator.tokenize`` yields one group of tokens per sentence; the groups
    are flattened into a single list, preserving order.
    """
    return [
        word
        for sentence in annotator.tokenize(text)
        for word in sentence
    ]

# Smoke-test both tokenizers on one sample sentence per language.
text_en = 'Please put the dustpan in the broom closet'
text_vi = 'Cuốn sách này là của tôi. Của bạn đâu?'
print(tokenize_en(text_en), tokenize_vi(text_vi), sep="\n")


['Please', 'put', 'the', 'dustpan', 'in', 'the', 'broom', 'closet']
['Cuốn', 'sách', 'này', 'là', 'của', 'tôi', '.', 'Của', 'bạn', 'đâu', '?']


In [25]:
import pandas as pd
from nlp import load_dataset


# Per-column configuration: each data column name maps to a pair of
# (attribute name, tokenizer) -- English is exposed as `src`, Vietnamese as `trg`.
fields = dict(
    English=("src", tokenize_en),
    Vietnamese=("trg", tokenize_vi),
)

def create_raw_dataset(data_dir="", max_sentences=5000):
    """Load the parallel English/Vietnamese corpus as two lists of sentences.

    Reads ``english.txt`` and ``vietnamese.txt`` (UTF-8, one sentence per line)
    from ``data_dir`` and keeps at most ``max_sentences`` lines of each file.
    The two files are not checked against each other for equal length.

    Args:
        data_dir: Directory containing the two text files. The default ``""``
            reads them from the current working directory.
        max_sentences: Maximum number of leading lines kept per file; pass
            ``None`` to keep every line.

    Returns:
        A dict with keys ``"English"`` and ``"Vietnamese"``, each mapping to a
        list of sentence strings without line terminators.

    Raises:
        OSError: If either file cannot be opened.
    """
    import os

    def read_sentences(filename):
        # The context manager closes the handle even if reading fails;
        # previously both files were opened and never closed.
        with open(os.path.join(data_dir, filename), "r", encoding="utf-8") as handle:
            return handle.read().splitlines()[:max_sentences]

    return {
        "English": read_sentences("english.txt"),
        "Vietnamese": read_sentences("vietnamese.txt"),
    }
# Load the parallel corpus as a dict of two sentence lists.
raw_data = create_raw_dataset()

from sklearn.model_selection import train_test_split

df = pd.DataFrame(raw_data, columns=["English", "Vietnamese"])

# 80/10/10 split: hold out 20% for test, then 12.5% of the remaining 80%
# (= 10% of the total) for validation.
# random_state=SEED makes the split repeatable: scikit-learn draws from NumPy's
# RNG, which is not affected by random.seed() or torch.manual_seed().
train, test = train_test_split(df, test_size=0.2, random_state=SEED)
train, val = train_test_split(train, test_size=0.125, random_state=SEED)

# One JSON object per line, one file per split.
train.to_json("train.json", orient="records", lines=True)
test.to_json("test.json", orient="records", lines=True)
val.to_json("val.json", orient="records", lines=True)
