In [1]:
# !wget https://github.com/stefan-it/nmt-en-vi/raw/master/data/train-en-vi.tgz
# !tar -zxf train-en-vi.tgz
# !wget https://github.com/stefan-it/nmt-en-vi/raw/master/data/dev-2012-en-vi.tgz
# !tar -zxf dev-2012-en-vi.tgz
# !wget https://github.com/stefan-it/nmt-en-vi/raw/master/data/test-2013-en-vi.tgz
# !tar -zxf test-2013-en-vi.tgz

# !rm train-en-vi.tgz dev-2012-en-vi.tgz test-2013-en-vi.tgz

In [2]:
import malaya
import re
import os
from tqdm import tqdm

In [3]:
# Bound `tokenize` method of a freshly constructed malaya SocialTokenizer;
# `tokenizing` below calls it to split a raw sentence into tokens.
tokenizer = malaya.preprocessing.SocialTokenizer().tokenize

def is_number_regex(s):
    """Return True when the token ``s`` is a plain number.

    A token counts as a number when it is either a decimal of the form
    ``digits.digits`` (e.g. ``"3.14"``) or consists solely of digit
    characters according to ``str.isdigit`` (e.g. ``"42"``).

    Args:
        s: the token to classify (a ``str``).

    Returns:
        bool: True for numeric tokens, False otherwise (including ``""``).
    """
    # The pattern must be a raw string: in a normal literal "\d" and "\." are
    # invalid escape sequences, which Python reports as a warning at compile
    # time and is slated to reject outright.
    if re.match(r"^\d+?\.\d+?$", s) is not None:
        return True
    return s.isdigit()

def tokenizing(string):
    """Split ``string`` into tokens and mask every numeric token.

    The text is tokenized with the module-level ``tokenizer``; each token
    accepted by ``is_number_regex`` is replaced by the literal ``'<NUM>'``.

    Returns:
        list: the tokens, in order, with numbers replaced by ``'<NUM>'``.
    """
    masked = []
    for token in tokenizer(string):
        if is_number_regex(token):
            masked.append('<NUM>')
        else:
            masked.append(token)
    return masked

In [4]:
db_dir = os.getcwd()


def _read_lines(path):
    """Return the lines of the text file at ``path`` without line endings.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default locale encoding (the corpus contains Vietnamese
    text). Lines are split on ``'\\n'`` only, so the English and Vietnamese
    files are segmented by exactly the same rule and stay aligned.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        lines = handle.read().split('\n')
    # A file that ends with a newline yields one trailing empty string; drop
    # only that element so a final line without a newline is not lost.
    if lines and lines[-1] == '':
        lines.pop()
    return lines


# Training pairs: line i of train.en corresponds to line i of train.vi.
train_en = _read_lines('train.en')
train_vi = _read_lines('train.vi')

# The two tst* sets are concatenated into a single test set.
test_en = []
test_vi = []
for test_prefix in ('tst2012', 'tst2013'):
    test_en.extend(_read_lines(test_prefix + '.en'))
    test_vi.extend(_read_lines(test_prefix + '.vi'))

# The later cells index both sides with the same position, so a length
# mismatch means the parallel corpus is broken; fail here, not mid-loop.
assert len(train_en) == len(train_vi), 'train.en / train.vi line counts differ'
assert len(test_en) == len(test_vi), 'test .en / .vi line counts differ'

In [5]:
# Tokenize every training pair and store it back as a space-joined string.
# The lists are overwritten in place, so re-running this cell tokenizes the
# already-tokenized text again.
for idx in tqdm(range(len(train_en))):
    # Both sides are tokenized before either list is updated.
    train_en[idx], train_vi[idx] = (
        ' '.join(tokenizing(train_en[idx])),
        ' '.join(tokenizing(train_vi[idx])),
    )

100%|██████████| 133317/133317 [00:46<00:00, 2877.75it/s]


In [6]:
# Tokenize every test pair and store it back as a space-joined string.
# The lists are overwritten in place, so re-running this cell tokenizes the
# already-tokenized text again.
for idx in tqdm(range(len(test_en))):
    # Both sides are tokenized before either list is updated.
    test_en[idx], test_vi[idx] = (
        ' '.join(tokenizing(test_en[idx])),
        ' '.join(tokenizing(test_vi[idx])),
    )

100%|██████████| 2821/2821 [00:00<00:00, 2968.76it/s]


In [7]:
import collections
import json

def build_dataset(words, n_words, atleast=1):
    """Build a vocabulary from ``words`` and encode the sequence as ids.

    Ids 0-3 are reserved for the symbols ``'PAD'``, ``'GO'``, ``'EOS'`` and
    ``'UNK'``. The remaining ids are handed out in descending frequency order
    to at most ``n_words`` distinct tokens that occur at least ``atleast``
    times.

    Args:
        words: sequence of tokens; it is traversed twice, so pass a list
            rather than a one-shot iterator.
        n_words: maximum number of distinct tokens taken from the frequency
            ranking.
        atleast: minimum occurrence count for a token to enter the vocabulary.

    Returns:
        tuple ``(data, count, dictionary, reversed_dictionary)``:
            data: list of ids, one per element of ``words``; tokens missing
                from the vocabulary are encoded as the id of ``'UNK'``.
            count: the four reserved entries followed by ``(token, freq)``
                pairs; the ``'UNK'`` entry holds the number of
                out-of-vocabulary tokens found in ``words``.
            dictionary: token -> id.
            reversed_dictionary: id -> token.
    """
    count = [['PAD', 0], ['GO', 1], ['EOS', 2], ['UNK', 3]]
    ranked = collections.Counter(words).most_common(n_words)
    count.extend(pair for pair in ranked if pair[1] >= atleast)

    dictionary = {}
    for word, _ in count:
        # A corpus token spelled like a reserved symbol (e.g. "GO") must not
        # take over that symbol's id: reassigning an existing key would not
        # grow the dict, so the next token would receive the same id and the
        # reverse mapping would lose an entry.
        if word not in dictionary:
            dictionary[word] = len(dictionary)

    unk_id = dictionary['UNK']
    data = []
    unk_count = 0
    for word in words:
        index = dictionary.get(word)
        if index is None:
            # Out-of-vocabulary tokens map to UNK, never to PAD (id 0).
            index = unk_id
            unk_count += 1
        data.append(index)
    # Record the tally on the UNK entry (position == id for reserved symbols).
    count[unk_id][1] = unk_count

    reversed_dictionary = {index: word for word, index in dictionary.items()}
    return data, count, dictionary, reversed_dictionary

In [8]:
# Flatten the tokenized English training sentences into one token stream.
concat_from = [token for sentence in train_en for token in sentence.split()]
# Number of distinct source tokens, used as the cap passed to build_dataset.
vocabulary_size_from = len(set(concat_from))
(
    data_from,
    count_from,
    dictionary_from,
    rev_dictionary_from,
) = build_dataset(concat_from, vocabulary_size_from)
print('Vocab from size: %d'%(vocabulary_size_from))
# Entries 0-3 of the count list are the reserved symbols, so skip them.
print('Most common words', count_from[4:10])
print('Sample data', data_from[:10], [rev_dictionary_from[idx] for idx in data_from[:10]])

Vocab from size: 48111
Most common words [(',', 156175), ('.', 135024), ('the', 103138), ('to', 65797), ("'", 64433), ('of', 60341)]
Sample data [6531, 16858, 55, 58, 335, 593, 11, 731, 5477, 132] ['Rachel', 'Pike', ':', 'The', 'science', 'behind', 'a', 'climate', 'headline', 'In']


In [9]:
# Flatten the tokenized Vietnamese training sentences into one token stream.
concat_to = [token for sentence in train_vi for token in sentence.split()]
# Number of distinct target tokens, used as the cap passed to build_dataset.
vocabulary_size_to = len(set(concat_to))
(
    data_to,
    count_to,
    dictionary_to,
    rev_dictionary_to,
) = build_dataset(concat_to, vocabulary_size_to)
print('Vocab to size: %d'%(vocabulary_size_to))
# Entries 0-3 of the count list are the reserved symbols, so skip them.
print('Most common words', count_to[4:10])
print('Sample data', data_to[:10], [rev_dictionary_to[idx] for idx in data_to[:10]])

Vocab to size: 22465
Most common words [(',', 128672), ('.', 125418), ('là', 58046), ('tôi', 52058), ('một', 49025), ('có', 48322)]
Sample data [1915, 66, 1136, 128, 8, 372, 111, 38, 412, 724] ['Khoa', 'học', 'đằng', 'sau', 'một', 'tiêu', 'đề', 'về', 'khí', 'hậu']


In [10]:
# Persist the tokenized sentence pairs: X holds the English side and Y the
# Vietnamese side of each split.
with open('train-test.json', 'w') as out_file:
    json.dump(
        {
            'train_X': train_en,
            'train_Y': train_vi,
            'test_X': test_en,
            'test_Y': test_vi,
        },
        out_file,
    )

In [11]:
# Persist both vocabularies ('from' = English, 'to' = Vietnamese).
# Note: JSON object keys are always strings, so the integer ids that key the
# rev_dictionary maps are written as strings and come back as strings on load.
with open('dictionary.json', 'w') as out_file:
    json.dump(
        {
            'from': {
                'dictionary': dictionary_from,
                'rev_dictionary': rev_dictionary_from,
            },
            'to': {
                'dictionary': dictionary_to,
                'rev_dictionary': rev_dictionary_to,
            },
        },
        out_file,
    )