In [16]:
import csv
import os
import random

import pandas as pd
import youtokentome as yttm
from nltk import WordPunctTokenizer
from torchtext.data import Field, TabularDataset, BucketIterator

In [17]:
# Location of the raw tab-separated parallel corpus, plus the two per-language
# plain-text files that later cells write out for BPE training.
data_path = '../../datasets/Machine_translation_EN_RU/data.txt'
en_path = '../../datasets/Machine_translation_EN_RU/en_data.txt'
ru_path = '../../datasets/Machine_translation_EN_RU/ru_data.txt'

# The file has no header row, so columns get integer labels.
df = pd.read_csv(data_path, sep='\t', header=None)

In [18]:
# NLTK word/punctuation tokenizer instance.
# NOTE(review): the Field definitions visible in this notebook do not pass it
# as `tokenize=` -- confirm it is still needed.
tokenizer = WordPunctTokenizer()

In [19]:
def _make_text_field():
    """Build a Field that lowercases text and adds <sos>/<eos> markers.

    No ``tokenize`` callable is passed, so torchtext's default tokenizer is
    used. (A WordPunctTokenizer-based ``tokenize`` lambda was tried here
    earlier and is currently disabled.)
    """
    return Field(init_token='<sos>', eos_token='<eos>', lower=True)


# Both sides of the corpus share the same preprocessing settings, but each
# side gets its own Field instance.
target = _make_text_field()
source = _make_text_field()

In [20]:
# Map the TSV columns to fields: first column -> 'trg', second column -> 'src'.
column_fields = [('trg', target), ('src', source)]
dataset = TabularDataset(path=data_path, format='tsv', fields=column_fields)

In [21]:
# Seed the global RNG and pass its state to the split so the shuffle is
# reproducible across runs.
split_seed = 42
random.seed(split_seed)
random_state = random.getstate()

# NOTE(review): torchtext documents a list `split_ratio` as the relative sizes
# of train, test and valid (in that order), while the returned tuple is ordered
# (train, valid, test). If so, valid_data is the 0.05 share and test_data the
# 0.15 share -- confirm this is the intended allocation.
splits = dataset.split(split_ratio=[0.8, 0.15, 0.05], random_state=random_state)
train_data, valid_data, test_data = splits

In [22]:
# Take the first training example for a quick sanity check of the preprocessing.
a = train_data.examples[0]

In [23]:
# Display the `trg` tokens of the sample example `a`.
a.trg

['the', 'apartment', 'is', '1.9', 'km', 'from', 'trinity', 'cathedral.']

In [24]:
# Re-join each training example's tokens with single spaces, one sentence per
# list entry: `src` tokens feed the Russian list, `trg` tokens the English one.
ru_data = [' '.join(example.src) for example in train_data.examples]
en_data = [' '.join(example.trg) for example in train_data.examples]

In [25]:
# Write the training sentences to plain-text files, one sentence per line;
# these files are the BPE training corpora.
# The encoding is pinned to UTF-8 instead of relying on the platform default:
# a non-UTF-8 default (e.g. cp1252 on Windows) cannot encode the Cyrillic text
# and would raise UnicodeEncodeError or produce a file the trainer misreads.
with open(ru_path, 'w', encoding='utf-8') as fout:
    fout.write('\n'.join(ru_data))

with open(en_path, 'w', encoding='utf-8') as fout:
    fout.write('\n'.join(en_data))

In [26]:
vocab_size = 4000
en_model_path = f'configs/en_bpe_{vocab_size}'
ru_model_path = f'configs/ru_bpe_{vocab_size}'

# Both model files live in the same directory; create it up front so this cell
# also works on a fresh checkout where `configs/` does not exist yet.
os.makedirs(os.path.dirname(en_model_path), exist_ok=True)

# Train one BPE model per language on the text files written earlier. Each call
# is given the path to save the model to and returns the trained model object.
en_bpe = yttm.BPE.train(data=en_path, vocab_size=vocab_size, model=en_model_path)
ru_bpe = yttm.BPE.train(data=ru_path, vocab_size=vocab_size, model=ru_model_path)

In [27]:
ru_test = 'Отель Cordelia расположен в Тбилиси, в 3 минутах ходьбы от Свято-Троицкого собора.'
en_test = 'Cordelia Hotel is situated in Tbilisi, a 3-minute walk away from Saint Trinity Church.'

# The training text was lowercased by the Fields, so lowercase the probe too.
ru_batch = [ru_test.lower()]

# Encode the same sentence twice: as subword strings, then with the default output type.
print(ru_bpe.encode(ru_batch, output_type=yttm.OutputType.SUBWORD))
print(ru_bpe.encode(ru_batch))

[['▁отель', '▁cor', 'd', 'el', 'ia', '▁расположен', '▁в', '▁тбили', 'си', ',', '▁в', '▁3', '▁минутах', '▁ходьбы', '▁от', '▁свя', 'то', '-', 'тро', 'и', 'ц', 'ко', 'го', '▁собо', 'ра.']]
[[367, 3077, 50, 574, 934, 308, 175, 3723, 550, 30, 175, 426, 378, 532, 190, 1813, 226, 36, 1148, 11, 37, 203, 182, 1778, 1241]]


In [28]:
# Display the vocabulary size reported by the trained Russian BPE model.
ru_bpe.vocab_size()

4000

In [29]:
# Load a previously saved BPE model from disk.
# NOTE(review): this path is not written by the training cell in this notebook
# while vocab_size is 4000 (that cell saves configs/en_bpe_4000), so the file
# presumably comes from an earlier run -- confirm it exists before relying on
# this cell after a kernel restart.
smth = yttm.BPE('configs/en_bpe_2000')

In [30]:
# Display the vocabulary size of the model loaded from disk.
smth.vocab_size()

2000