# Neural machine translation

In [1]:
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time

## Download and prepare the dataset

We'll use a language dataset provided by http://www.manythings.org/anki/

In [None]:
!wget http://www.manythings.org/anki/rus-eng.zip

--2022-08-03 06:21:41--  http://www.manythings.org/anki/rus-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14819554 (14M) [application/zip]
Saving to: ‘rus-eng.zip’


2022-08-03 06:21:42 (41.1 MB/s) - ‘rus-eng.zip’ saved [14819554/14819554]



In [None]:
!mkdir rus-eng
!unzip rus-eng.zip -d rus-eng/

Archive:  rus-eng.zip
  inflating: rus-eng/rus.txt         
  inflating: rus-eng/_about.txt      


In [None]:
!ls /content/rus-eng/ -lah

total 71M
drwxr-xr-x 2 root root 4.0K Aug  3 06:21 .
drwxr-xr-x 1 root root 4.0K Aug  3 06:21 ..
-rw-r--r-- 1 root root 1.5K May  2 01:29 _about.txt
-rw-r--r-- 1 root root  71M May  2 01:29 rus.txt


In [17]:
# Download the file
path_to_file = "rus-eng/rus.txt"

In [18]:
def preprocess_sentence(w):
  w = w.lower().strip()

  # creating a space between a word and the punctuation following it
  # eg: "he is a boy." => "he is a boy ."
  # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
  w = re.sub(r"([?.!,])", r" \1 ", w)
  w = re.sub(r'[" "]+', " ", w)

  # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
  w = re.sub(r"[^a-zA-Zа-яА-Я?.!,']+", " ", w)

  w = w.strip()

  # adding a start and an end token to the sentence
  # so that the model know when to start and stop predicting.
  w = '<start> ' + w + ' <end>'
  return w

In [19]:
preprocess_sentence("I can't go.")

"<start> i can't go . <end>"

In [20]:
# 1. Remove the accents
# 2. Clean the sentences
# 3. Return word pairs in the format: [ENG, RUS]
def create_dataset(path, num_examples):
  lines = io.open(path, encoding='UTF-8').read().strip().split('\n')

  word_pairs = [[preprocess_sentence(w) for w in l.split('\t')[:2]]  for l in lines[:num_examples]]

  return zip(*word_pairs)

In [21]:
en, ru = create_dataset(path_to_file, None)
print(en[20])
print(ru[20])

<start> duck ! <end>
<start> пригнись ! <end>


In [22]:
def tokenize(lang):
  lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(
      filters='')
  lang_tokenizer.fit_on_texts(lang)

  tensor = lang_tokenizer.texts_to_sequences(lang)

  tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,
                                                         padding='post')

  return tensor, lang_tokenizer

In [23]:
def load_dataset(path, num_examples=None):
  # creating cleaned input, output pairs
  targ_lang, inp_lang = create_dataset(path, num_examples)

  input_tensor, inp_lang_tokenizer = tokenize(inp_lang)
  target_tensor, targ_lang_tokenizer = tokenize(targ_lang)

  return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

### Limit the size of the dataset to experiment faster (optional)


In [24]:
len(en), len(ru)

(444587, 444587)

In [25]:
# Try experimenting with the size of that dataset
num_examples = 300000
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(path_to_file, num_examples)

# Calculate max_length of the target tensors
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]

In [26]:
# Creating training and validation sets using an 80-20 split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

# Show length
print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

240000 240000 60000 60000


In [27]:
def convert(lang, tensor):
  for t in tensor:
    if t!=0:
      print ("%d ----> %s" % (t, lang.index_word[t]))

In [28]:
print ("Input Language; index to word mapping")
convert(inp_lang, input_tensor_train[0])
print ()
print ("Target Language; index to word mapping")
convert(targ_lang, target_tensor_train[0])

Input Language; index to word mapping
1 ----> <start>
9 ----> это
13006 ----> типично
3 ----> .
2 ----> <end>

Target Language; index to word mapping
1 ----> <start>
85 ----> that's
2549 ----> typical
3 ----> .
2 ----> <end>


### Create a tf.data dataset

In [29]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 200
units = 1024
vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [30]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 18]), TensorShape([64, 14]))

In [31]:
class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    super(Encoder, self).__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.enc_units,
                                   return_sequences=False,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    

  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state = hidden)
    return state

  def initialize_hidden_state(self):
    return tf.zeros((self.batch_sz, self.enc_units))

In [32]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_hidden = encoder(example_input_batch, sample_hidden)
# print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder Hidden state shape: (batch size, units) (64, 1024)


In [33]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.dec_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)

  def call(self, x, hidden):
    # enc_output shape == (batch_size, max_length, hidden_size)

    # x shape after passing through embedding == (batch_size, 1, embedding_dim)
    x = self.embedding(x)

    # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)

    # passing the concatenated vector to the GRU
    output, state = self.gru(x, initial_state=hidden)

    # output shape == (batch_size * 1, hidden_size)
    output = tf.reshape(output, (-1, output.shape[2]))

    # output shape == (batch_size, vocab)
    x = self.fc(output)

    return x, state

In [34]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

decoder_sample_x, decoder_sample_h = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden)



In [35]:
decoder_sample_x.shape

TensorShape([64, 12605])

In [36]:
decoder_sample_h.shape

TensorShape([64, 1024])

## Define the optimizer and the loss function

In [37]:
optimizer = tf.keras.optimizers.Adam()

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0)) # маска будет зануляиь нулевые токены паддинга , 
            #через которые мы не хотим прокидывать градиент, 
            #поэтому мы их сразу в лоссе зануляем чтобы они на лосс никак не реагировали, 
            #чтобы не вносили джобавочные коэффициенты в наш лосс
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)

## Checkpoints (Object-based saving)

In [38]:
checkpoint_dir = './training_nmt_checkpoints'

checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")

checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [39]:
@tf.function
def train_step(inp, targ, enc_hidden):
  loss = 0

  with tf.GradientTape() as tape:
    enc_hidden = encoder(inp, enc_hidden)

    dec_hidden = enc_hidden

    dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

    # Teacher forcing - feeding the target as the next input
    for t in range(1, targ.shape[1]):
      # passing enc_output to the decoder
      predictions, dec_hidden = decoder(dec_input, dec_hidden)

      loss += loss_function(targ[:, t], predictions)

      # using teacher forcing
      dec_input = tf.expand_dims(targ[:, t], 1)

  batch_loss = (loss / int(targ.shape[1]))

  variables = encoder.trainable_variables + decoder.trainable_variables

  gradients = tape.gradient(loss, variables)

  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [40]:
%%time

EPOCHS = 50

for epoch in range(EPOCHS):
  start = time.time()

  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)): 
        # inp это текст на русском а targ это текст на англ - то что мы ходим получить
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    if batch % 100 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))
  # saving (checkpoint) the model every 2 epochs
  if (epoch + 1) % 2 == 0:
    checkpoint.save(file_prefix = checkpoint_prefix)

  print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 4.6892
Epoch 1 Batch 100 Loss 2.2355
Epoch 1 Batch 200 Loss 1.9385
Epoch 1 Batch 300 Loss 1.8131
Epoch 1 Batch 400 Loss 1.7556
Epoch 1 Batch 500 Loss 1.6755
Epoch 1 Batch 600 Loss 1.6880
Epoch 1 Batch 700 Loss 1.5366
Epoch 1 Batch 800 Loss 1.6738
Epoch 1 Batch 900 Loss 1.4137
Epoch 1 Batch 1000 Loss 1.2655
Epoch 1 Batch 1100 Loss 1.3175
Epoch 1 Batch 1200 Loss 1.3982
Epoch 1 Batch 1300 Loss 1.1530
Epoch 1 Batch 1400 Loss 1.2634
Epoch 1 Batch 1500 Loss 1.1518
Epoch 1 Batch 1600 Loss 1.0954
Epoch 1 Batch 1700 Loss 1.1171
Epoch 1 Batch 1800 Loss 1.0170
Epoch 1 Batch 1900 Loss 0.9484
Epoch 1 Batch 2000 Loss 0.9553
Epoch 1 Batch 2100 Loss 0.8893
Epoch 1 Batch 2200 Loss 0.9847
Epoch 1 Batch 2300 Loss 0.9165
Epoch 1 Batch 2400 Loss 0.9137
Epoch 1 Batch 2500 Loss 0.9785
Epoch 1 Batch 2600 Loss 0.8960
Epoch 1 Batch 2700 Loss 0.7773
Epoch 1 Batch 2800 Loss 0.7177
Epoch 1 Batch 2900 Loss 0.7743
Epoch 1 Batch 3000 Loss 0.7787
Epoch 1 Batch 3100 Loss 0.7220
Epoch 1 Batch 3200 L

Epoch 7 Batch 2700 Loss 0.1250
Epoch 7 Batch 2800 Loss 0.1231
Epoch 7 Batch 2900 Loss 0.1438
Epoch 7 Batch 3000 Loss 0.1455
Epoch 7 Batch 3100 Loss 0.1234
Epoch 7 Batch 3200 Loss 0.1058
Epoch 7 Batch 3300 Loss 0.1519
Epoch 7 Batch 3400 Loss 0.1833
Epoch 7 Batch 3500 Loss 0.1765
Epoch 7 Batch 3600 Loss 0.1616
Epoch 7 Batch 3700 Loss 0.1156
Epoch 7 Loss 0.1215
Time taken for 1 epoch 1217.4793915748596 sec

Epoch 8 Batch 0 Loss 0.0892
Epoch 8 Batch 100 Loss 0.0728
Epoch 8 Batch 200 Loss 0.0839
Epoch 8 Batch 300 Loss 0.0750
Epoch 8 Batch 400 Loss 0.0873
Epoch 8 Batch 500 Loss 0.0856
Epoch 8 Batch 600 Loss 0.0966
Epoch 8 Batch 700 Loss 0.1046
Epoch 8 Batch 800 Loss 0.0898
Epoch 8 Batch 900 Loss 0.1068
Epoch 8 Batch 1000 Loss 0.1281
Epoch 8 Batch 1100 Loss 0.1025
Epoch 8 Batch 1200 Loss 0.1248
Epoch 8 Batch 1300 Loss 0.1023
Epoch 8 Batch 1400 Loss 0.1045
Epoch 8 Batch 1500 Loss 0.1002
Epoch 8 Batch 1600 Loss 0.1132
Epoch 8 Batch 1700 Loss 0.0972
Epoch 8 Batch 1800 Loss 0.0827
Epoch 8 Batch 1

Epoch 14 Batch 800 Loss 0.0999
Epoch 14 Batch 900 Loss 0.0499
Epoch 14 Batch 1000 Loss 0.0828
Epoch 14 Batch 1100 Loss 0.0943
Epoch 14 Batch 1200 Loss 0.0862
Epoch 14 Batch 1300 Loss 0.1032
Epoch 14 Batch 1400 Loss 0.0684
Epoch 14 Batch 1500 Loss 0.1412
Epoch 14 Batch 1600 Loss 0.0681
Epoch 14 Batch 1700 Loss 0.0856
Epoch 14 Batch 1800 Loss 0.1056
Epoch 14 Batch 1900 Loss 0.0831
Epoch 14 Batch 2000 Loss 0.1147
Epoch 14 Batch 2100 Loss 0.0977
Epoch 14 Batch 2200 Loss 0.0884
Epoch 14 Batch 2300 Loss 0.1068
Epoch 14 Batch 2400 Loss 0.0949
Epoch 14 Batch 2500 Loss 0.0738
Epoch 14 Batch 2600 Loss 0.0991
Epoch 14 Batch 2700 Loss 0.0967
Epoch 14 Batch 2800 Loss 0.1090
Epoch 14 Batch 2900 Loss 0.0974
Epoch 14 Batch 3000 Loss 0.1380
Epoch 14 Batch 3100 Loss 0.1381
Epoch 14 Batch 3200 Loss 0.1086
Epoch 14 Batch 3300 Loss 0.0741
Epoch 14 Batch 3400 Loss 0.0949
Epoch 14 Batch 3500 Loss 0.1176
Epoch 14 Batch 3600 Loss 0.1182
Epoch 14 Batch 3700 Loss 0.1281
Epoch 14 Loss 0.0972
Time taken for 1 epoc

Epoch 20 Batch 2600 Loss 0.1017
Epoch 20 Batch 2700 Loss 0.1171
Epoch 20 Batch 2800 Loss 0.1146
Epoch 20 Batch 2900 Loss 0.1102
Epoch 20 Batch 3000 Loss 0.1285
Epoch 20 Batch 3100 Loss 0.0827
Epoch 20 Batch 3200 Loss 0.0812
Epoch 20 Batch 3300 Loss 0.0934
Epoch 20 Batch 3400 Loss 0.0682
Epoch 20 Batch 3500 Loss 0.0680
Epoch 20 Batch 3600 Loss 0.1253
Epoch 20 Batch 3700 Loss 0.0963
Epoch 20 Loss 0.0958
Time taken for 1 epoch 1210.3577735424042 sec

Epoch 21 Batch 0 Loss 0.0381
Epoch 21 Batch 100 Loss 0.0703
Epoch 21 Batch 200 Loss 0.0623
Epoch 21 Batch 300 Loss 0.0581
Epoch 21 Batch 400 Loss 0.0856
Epoch 21 Batch 500 Loss 0.0839
Epoch 21 Batch 600 Loss 0.0841
Epoch 21 Batch 700 Loss 0.0630
Epoch 21 Batch 800 Loss 0.0736
Epoch 21 Batch 900 Loss 0.0835
Epoch 21 Batch 1000 Loss 0.0869
Epoch 21 Batch 1100 Loss 0.0822
Epoch 21 Batch 1200 Loss 0.0841
Epoch 21 Batch 1300 Loss 0.1129
Epoch 21 Batch 1400 Loss 0.1015
Epoch 21 Batch 1500 Loss 0.0865
Epoch 21 Batch 1600 Loss 0.0948
Epoch 21 Batch 1

Epoch 27 Batch 400 Loss 0.0831
Epoch 27 Batch 500 Loss 0.0935
Epoch 27 Batch 600 Loss 0.0589
Epoch 27 Batch 700 Loss 0.0728
Epoch 27 Batch 800 Loss 0.0838
Epoch 27 Batch 900 Loss 0.0940
Epoch 27 Batch 1000 Loss 0.1242
Epoch 27 Batch 1100 Loss 0.0672
Epoch 27 Batch 1200 Loss 0.0710
Epoch 27 Batch 1300 Loss 0.0638
Epoch 27 Batch 1400 Loss 0.1112
Epoch 27 Batch 1500 Loss 0.1206
Epoch 27 Batch 1600 Loss 0.0674
Epoch 27 Batch 1700 Loss 0.1049
Epoch 27 Batch 1800 Loss 0.0880
Epoch 27 Batch 1900 Loss 0.0961
Epoch 27 Batch 2000 Loss 0.0890
Epoch 27 Batch 2100 Loss 0.1022
Epoch 27 Batch 2200 Loss 0.1272
Epoch 27 Batch 2300 Loss 0.1548
Epoch 27 Batch 2400 Loss 0.0798
Epoch 27 Batch 2500 Loss 0.1072
Epoch 27 Batch 2600 Loss 0.0946
Epoch 27 Batch 2700 Loss 0.1180
Epoch 27 Batch 2800 Loss 0.1347
Epoch 27 Batch 2900 Loss 0.1251
Epoch 27 Batch 3000 Loss 0.0748
Epoch 27 Batch 3100 Loss 0.1479
Epoch 27 Batch 3200 Loss 0.0897
Epoch 27 Batch 3300 Loss 0.1616
Epoch 27 Batch 3400 Loss 0.1110
Epoch 27 Batch

Epoch 33 Batch 2200 Loss 0.1617
Epoch 33 Batch 2300 Loss 0.0902
Epoch 33 Batch 2400 Loss 0.1374
Epoch 33 Batch 2500 Loss 0.0700
Epoch 33 Batch 2600 Loss 0.1093
Epoch 33 Batch 2700 Loss 0.0887
Epoch 33 Batch 2800 Loss 0.0850
Epoch 33 Batch 2900 Loss 0.1685
Epoch 33 Batch 3000 Loss 0.1273
Epoch 33 Batch 3100 Loss 0.1425
Epoch 33 Batch 3200 Loss 0.1356
Epoch 33 Batch 3300 Loss 0.1368
Epoch 33 Batch 3400 Loss 0.1244
Epoch 33 Batch 3500 Loss 0.1061
Epoch 33 Batch 3600 Loss 0.1328
Epoch 33 Batch 3700 Loss 0.1283
Epoch 33 Loss 0.1015
Time taken for 1 epoch 1207.6635932922363 sec

Epoch 34 Batch 0 Loss 0.0926
Epoch 34 Batch 100 Loss 0.0717
Epoch 34 Batch 200 Loss 0.0816
Epoch 34 Batch 300 Loss 0.0633
Epoch 34 Batch 400 Loss 0.0951
Epoch 34 Batch 500 Loss 0.0986
Epoch 34 Batch 600 Loss 0.1056
Epoch 34 Batch 700 Loss 0.0628
Epoch 34 Batch 800 Loss 0.0694
Epoch 34 Batch 900 Loss 0.0887
Epoch 34 Batch 1000 Loss 0.0713
Epoch 34 Batch 1100 Loss 0.0760
Epoch 34 Batch 1200 Loss 0.0648
Epoch 34 Batch 1

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xcd in position 133: invalid continuation byte

In [41]:
def evaluate(sentence):
  attention_plot = np.zeros((max_length_targ, max_length_inp))

  sentence = preprocess_sentence(sentence)

  inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=max_length_inp,
                                                         padding='post')
  inputs = tf.convert_to_tensor(inputs)

  result = ''

  hidden = [tf.zeros((1, units))]
  enc_hidden = encoder(inputs, hidden)

  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

  for t in range(max_length_targ):
    predictions, dec_hidden = decoder(dec_input, dec_hidden)

    # storing the attention weights to plot later on
    predicted_id = tf.argmax(predictions[0]).numpy()
    result += targ_lang.index_word[predicted_id] + ' '

    if targ_lang.index_word[predicted_id] == '<end>':
      return result, sentence

    # the predicted ID is fed back into the model
    dec_input = tf.expand_dims([predicted_id], 0)

  return result, sentence

In [42]:
def translate(sentence):
  result, sentence = evaluate(sentence)

  print('Input: %s' % (sentence))
  print('Predicted translation: {}'.format(result))

## Restore the latest checkpoint and test

In [43]:
# restoring the latest checkpoint in checkpoint_dir
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x19160e7beb0>

## Тестирование

In [44]:
# Тестируем

test_texts = ['Здесь хорошо.', 'Я не смогу поехать.', 'Когда нам воду дадут?',
              'Я делаю домашнюю работу.', 'Люби Родину, мать твою.',
              'Перевод без механизма внимания SEQ2SEQ модель', 'Выберите постельное белье по вашим параметрам.',
              ]

for text in test_texts:
    translate(text)

Input: <start> здесь хорошо . <end>
Predicted translation: it's good . <end> 
Input: <start> я не смогу поехать . <end>
Predicted translation: i can't go . <end> 
Input: <start> когда нам воду дадут ? <end>
Predicted translation: when will we give up the tv ? <end> 
Input: <start> я делаю домашнюю работу . <end>
Predicted translation: i'm doing my homework . <end> 
Input: <start> люби родину , мать твою . <end>
Predicted translation: love your mentioned to your mother . <end> 


KeyError: 'механизма'