### Задание

Разобраться с моделькой перевода как она устроена
запустить для перевода с русского на английский (при желании можно взять другие пары языков) два варианта с вниманием и без внимания
оценить качество насколько корректно переводит (для теста отобрать примеры с увеличением длины текста) (так как оценка визуальная достаточно 20-ти примеров в тестовой выборке)

In [44]:
import re
import io
import os
import tensorflow as tf
import time
import numpy as np

from sklearn.model_selection import train_test_split

In [11]:
path = "./rus-eng/rus.txt"

In [2]:
def preprocess_sentence(w):
    w = w.lower().strip()

    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ."
    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    w = re.sub(r"([?.!,])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)

    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    w = re.sub(r"[^a-zA-Zа-яА-Я?.!,']+", " ", w)

    w = w.strip()

    # adding a start and an end token to the sentence
    # so that the model know when to start and stop predicting.
    w = '<start> ' + w + ' <end>'
    return w

In [5]:
preprocess_sentence("I can't go.")

"<start> i can't go . <end>"

In [6]:
# 1. Remove the accents
# 2. Clean the sentences
# 3. Return word pairs in the format: [ENG, RUS]
def create_dataset(path, num_examples):
    lines = io.open(path, encoding='UTF-8').read().strip().split('\n')

    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')[:2]]  for l in lines[:num_examples]]

    return zip(*word_pairs)

In [12]:
en, ru = create_dataset(path, None)
print(en[0])
print(ru[0])

<start> go . <end>
<start> марш ! <end>


In [13]:
def tokenize(lang):
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(
        filters='')
    lang_tokenizer.fit_on_texts(lang)

    tensor = lang_tokenizer.texts_to_sequences(lang)

    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,
                                                         padding='post')

    return tensor, lang_tokenizer

In [14]:
def load_dataset(path, num_examples=None):
    # creating cleaned input, output pairs
    targ_lang, inp_lang = create_dataset(path, num_examples)

    input_tensor, inp_lang_tokenizer = tokenize(inp_lang)
    target_tensor, targ_lang_tokenizer = tokenize(targ_lang)

    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

In [19]:
# Try experimenting with the size of that dataset
num_examples = 100000
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(path, num_examples)

# Calculate max_length of the target tensors
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]

In [22]:
# Creating training and validation sets using an 80-20 split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

# Show length
print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

80000 80000 20000 20000


In [23]:
def convert(lang, tensor):
    for t in tensor:
        if t!=0:
            print ("%d ----> %s" % (t, lang.index_word[t]))

In [24]:
print ("Input Language; index to word mapping")
convert(inp_lang, input_tensor_train[0])
print ()
print ("Target Language; index to word mapping")
convert(targ_lang, target_tensor_train[0])

Input Language; index to word mapping
1 ----> <start>
14 ----> у
16 ----> меня
293 ----> руки
922 ----> устали
3 ----> .
2 ----> <end>

Target Language; index to word mapping
1 ----> <start>
21 ----> my
1352 ----> arms
20 ----> are
164 ----> tired
3 ----> .
2 ----> <end>


In [25]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 300
units = 1024
vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [26]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 15]), TensorShape([64, 11]))

In [27]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                   return_sequences=False,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)
        return state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

In [28]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_hidden = encoder(example_input_batch, sample_hidden)
# print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder Hidden state shape: (batch size, units) (64, 1024)


In [29]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)

    def call(self, x, hidden):
        # enc_output shape == (batch_size, max_length, hidden_size)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)

        # passing the concatenated vector to the GRU
        output, state = self.gru(x, initial_state=hidden)

        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size, vocab)
        # fc - fully connected слой
        x = self.fc(output)

        return x, state

In [30]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

decoder_sample_x, decoder_sample_h = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden)

In [31]:
optimizer = tf.keras.optimizers.Adam()

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

# Убираем паддиноговые нулевые токены чтоб не учитывались в лоссе
# когда получаем падинговые элементы (0), то зануляем лосс
def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

In [34]:
checkpoint_dir = './training_nmt_checkpoints'

checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")

checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [35]:
@tf.function
def train_step(inp, targ, enc_hidden):
    # inp - входная последовательность (русский)
    # target - английский
    # enc_hidden - вектор скрытого состояния энкодера
    loss = 0
    
    with tf.GradientTape() as tape
        enc_hidden = encoder(inp, enc_hidden)
        
        dec_hidden = enc_hidden
        dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

        # Teacher forcing - feeding the target as the next input
        for t in range(1, targ.shape[1]):
            # passing enc_output to the decoder
            predictions, dec_hidden = decoder(dec_input, dec_hidden)
            
            loss += loss_function(targ[:, t], predictions)

            # using teacher forcing
            dec_input = tf.expand_dims(targ[:, t], 1)

        batch_loss = (loss / int(targ.shape[1]))
        
        variables = encoder.trainable_variables + decoder.trainable_variables
        
        gradients = tape.gradient(loss, variables)

        optimizer.apply_gradients(zip(gradients, variables))
        
    return batch_loss


In [38]:
EPOCHS = 50

for epoch in range(EPOCHS):
    start = time.time()

    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))
    # saving (checkpoint) the model every 2 epochs
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)

    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 4.5393
Epoch 1 Batch 100 Loss 1.9611
Epoch 1 Batch 200 Loss 1.6905
Epoch 1 Batch 300 Loss 1.6273
Epoch 1 Batch 400 Loss 1.4904
Epoch 1 Batch 500 Loss 1.4276
Epoch 1 Batch 600 Loss 1.4086
Epoch 1 Batch 700 Loss 1.2387
Epoch 1 Batch 800 Loss 1.3205
Epoch 1 Batch 900 Loss 1.2519
Epoch 1 Batch 1000 Loss 1.1038
Epoch 1 Batch 1100 Loss 1.0596
Epoch 1 Batch 1200 Loss 1.1102
Epoch 1 Loss 1.4360
Time taken for 1 epoch 3925.748117685318 sec

Epoch 2 Batch 0 Loss 0.8758
Epoch 2 Batch 100 Loss 0.9703
Epoch 2 Batch 200 Loss 0.8944
Epoch 2 Batch 300 Loss 0.8035
Epoch 2 Batch 400 Loss 0.8996
Epoch 2 Batch 500 Loss 0.8842
Epoch 2 Batch 600 Loss 0.7046
Epoch 2 Batch 700 Loss 0.6875
Epoch 2 Batch 800 Loss 0.7051
Epoch 2 Batch 900 Loss 0.6485
Epoch 2 Batch 1000 Loss 0.6875
Epoch 2 Batch 1100 Loss 0.5901
Epoch 2 Batch 1200 Loss 0.6047
Epoch 2 Loss 0.7653
Time taken for 1 epoch 3745.6542394161224 sec

Epoch 3 Batch 0 Loss 0.4707
Epoch 3 Batch 100 Loss 0.4136
Epoch 3 Batch 200 Loss 0.51

Epoch 18 Batch 1000 Loss 0.0147
Epoch 18 Batch 1100 Loss 0.0485
Epoch 18 Batch 1200 Loss 0.0568
Epoch 18 Loss 0.0648
Time taken for 1 epoch 1942.0145058631897 sec

Epoch 19 Batch 0 Loss 0.0699
Epoch 19 Batch 100 Loss 0.0335
Epoch 19 Batch 200 Loss 0.0350
Epoch 19 Batch 300 Loss 0.0432
Epoch 19 Batch 400 Loss 0.0759
Epoch 19 Batch 500 Loss 0.0628
Epoch 19 Batch 600 Loss 0.0555
Epoch 19 Batch 700 Loss 0.0868
Epoch 19 Batch 800 Loss 0.0695
Epoch 19 Batch 900 Loss 0.0635
Epoch 19 Batch 1000 Loss 0.0546
Epoch 19 Batch 1100 Loss 0.0543
Epoch 19 Batch 1200 Loss 0.0691
Epoch 19 Loss 0.0636
Time taken for 1 epoch 1956.5356936454773 sec

Epoch 20 Batch 0 Loss 0.0724
Epoch 20 Batch 100 Loss 0.0570
Epoch 20 Batch 200 Loss 0.0632
Epoch 20 Batch 300 Loss 0.0906
Epoch 20 Batch 400 Loss 0.0544
Epoch 20 Batch 500 Loss 0.0490
Epoch 20 Batch 600 Loss 0.0482
Epoch 20 Batch 700 Loss 0.0541
Epoch 20 Batch 800 Loss 0.0863
Epoch 20 Batch 900 Loss 0.0532
Epoch 20 Batch 1000 Loss 0.0646
Epoch 20 Batch 1100 Loss

Epoch 36 Batch 100 Loss 0.0443
Epoch 36 Batch 200 Loss 0.0416
Epoch 36 Batch 300 Loss 0.0254
Epoch 36 Batch 400 Loss 0.0475
Epoch 36 Batch 500 Loss 0.0483
Epoch 36 Batch 600 Loss 0.0487
Epoch 36 Batch 700 Loss 0.0412
Epoch 36 Batch 800 Loss 0.0668
Epoch 36 Batch 900 Loss 0.0793
Epoch 36 Batch 1000 Loss 0.0396
Epoch 36 Batch 1100 Loss 0.0591
Epoch 36 Batch 1200 Loss 0.0852
Epoch 36 Loss 0.0499
Time taken for 1 epoch 1951.8575489521027 sec

Epoch 37 Batch 0 Loss 0.0384
Epoch 37 Batch 100 Loss 0.0543
Epoch 37 Batch 200 Loss 0.0400
Epoch 37 Batch 300 Loss 0.0397
Epoch 37 Batch 400 Loss 0.0348
Epoch 37 Batch 500 Loss 0.0481
Epoch 37 Batch 600 Loss 0.0268
Epoch 37 Batch 700 Loss 0.0267
Epoch 37 Batch 800 Loss 0.0945
Epoch 37 Batch 900 Loss 0.0483
Epoch 37 Batch 1000 Loss 0.0742
Epoch 37 Batch 1100 Loss 0.0640
Epoch 37 Batch 1200 Loss 0.0784
Epoch 37 Loss 0.0492
Time taken for 1 epoch 1957.3648438453674 sec

Epoch 38 Batch 0 Loss 0.0521
Epoch 38 Batch 100 Loss 0.0445
Epoch 38 Batch 200 Loss 0

In [39]:
# функция для оценки перевода
def evaluate(sentence):
    attention_plot = np.zeros((max_length_targ, max_length_inp))

    sentence = preprocess_sentence(sentence)
    # переводим каждый токен в индекс
    inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=max_length_inp,
                                                         padding='post')

    inputs = tf.convert_to_tensor(inputs)

    result = ''

    hidden = [tf.zeros((1, units))]
    
    enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

    for t in range(max_length_targ):
        predictions, dec_hidden = decoder(dec_input, dec_hidden)

        # storing the attention weights to plot later on
        predicted_id = tf.argmax(predictions[0]).numpy()
        # т.к. у нас нет таргета - передаём предсказание на следующий шаг
        result += targ_lang.index_word[predicted_id] + ' '
        # останавливаемся если встречаем токен конца
        if targ_lang.index_word[predicted_id] == '<end>':
            return result, sentence

    # the predicted ID is fed back into the model
    dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence


In [41]:
def translate(sentence):
    result, sentence = evaluate(sentence)

    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))

In [40]:
# restoring the latest checkpoint in checkpoint_dir
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x211c108e100>

In [45]:
translate('Здесь хорошо.')

Input: <start> здесь хорошо . <end>
Predicted translation: it's is here here . . . . . . . 


Моделька тренируется долго, переводит так себе