<a href="https://colab.research.google.com/github/changsin/DeepLearning-101/blob/master/04_rnn_translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Translation using RNN
The data comes from [manythings.org](http://www.manythings.org/anki/). Each line of the file holds one sentence pair: the same sentence in two languages, delimited by a tab. (A third column contains attribution metadata, which is not needed for our purposes.)

```markdown
Run!	¡Corre!	
Run!	¡Corran!
```

In [None]:
import string
import numpy as np
import datetime
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Input, TimeDistributed, Dense, Activation, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy

print(tf.__version__)

2.3.0


In [None]:
# Path to the tab-separated translation file (one sentence pair per line)
path_to_data = 'spa-en-es.txt'

# Read the whole file; the context manager closes the handle even if read() raises
with open(path_to_data, "r", encoding='utf-8') as translation_file:
    raw_data = translation_file.read()

# Parse data: split into lines, then split every line into its tab-separated columns.
# Blank lines (e.g. the empty string left after the final newline) are filtered
# out explicitly. Slicing a fixed number of elements off the end removed two
# entries and could therefore also discard a real sentence pair.
raw_data = raw_data.split('\n')
pairs = [sentence.split('\t') for sentence in raw_data if sentence.strip()]

In [None]:
# Show one parsed entry to check the column layout: [English, Spanish, attribution]
print(pairs[1])

['Go.', 'Vete.', 'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #4986656 (cueyayotl)']


In [None]:
# Work on a subset of the corpus: entries 1000 up to (not including) 20000.
# Note that `pairs` is rebound, so re-running this cell slices the already
# reduced list again.
pairs = pairs[1000:20000]

# Preview pairs 6 to 10 (1-based numbering) of the reduced set
for pair_number in range(6, 11):
    sample = pairs[pair_number - 1]
    print('English example in pair {}:  {}'.format(pair_number, sample[0]))
    print('Spanish example in pair {}:  {}'.format(pair_number, sample[1]))

English example in pair 6:  Ask anyone.
Spanish example in pair 6:  Pregúntenle a cualquiera.
English example in pair 7:  Ask around.
Spanish example in pair 7:  Pregunta por aquí.
English example in pair 8:  Ask around.
Spanish example in pair 8:  Pregunta en los alrededores.
English example in pair 9:  Be careful.
Spanish example in pair 9:  ¡Ten cuidado!
English example in pair 10:  Be careful.
Spanish example in pair 10:  ¡Sé cuidadoso!


In [None]:
def clean_sentence(sentence):
    """Return ``sentence`` lower-cased with all punctuation removed.

    The characters removed are those in ``string.punctuation`` plus the
    inverted marks "¡" and "¿". Whitespace is left untouched, so a space that
    preceded removed punctuation remains in the result.
    """
    # ASCII punctuation plus the inverted exclamation/question marks
    unwanted_chars = string.punctuation + "¡" + '¿'
    # With empty from/to strings the table only deletes the listed characters
    deletion_table = str.maketrans('', '', unwanted_chars)
    return sentence.lower().translate(deletion_table)

print(clean_sentence("I will surf today !!"))

i will surf today 


In [None]:
# Three toy sentences used to illustrate what the tokenizer does
text_examples = [
    'i will surf today',
    'this week i will travel to the beach',
    'he went to his house by the beach',
]

# Build the word -> integer index from the toy sentences
exp_text_tokenizer = Tokenizer()
exp_text_tokenizer.fit_on_texts(text_examples)
for word, number in exp_text_tokenizer.word_index.items():
    print("Word: {} is converted to number {}".format(word, number))

# Replace every word of every sentence by its index
exp_text_tokenized = exp_text_tokenizer.texts_to_sequences(text_examples)
print ('\n')
# Show each sentence next to its integer encoding
for sent, token_sent in zip(text_examples, exp_text_tokenized):
    print('Input sentence:  {}'.format(sent))
    print('Output vector: {} \n'.format(token_sent))

Word: i is converted to number 1
Word: will is converted to number 2
Word: to is converted to number 3
Word: the is converted to number 4
Word: beach is converted to number 5
Word: surf is converted to number 6
Word: today is converted to number 7
Word: this is converted to number 8
Word: week is converted to number 9
Word: travel is converted to number 10
Word: he is converted to number 11
Word: went is converted to number 12
Word: his is converted to number 13
Word: house is converted to number 14
Word: by is converted to number 15


Input sentence:  i will surf today
Output vector: [1, 2, 6, 7] 

Input sentence:  this week i will travel to the beach
Output vector: [8, 9, 1, 2, 10, 3, 4, 5] 

Input sentence:  he went to his house by the beach
Output vector: [11, 12, 3, 13, 14, 15, 4, 5] 



In [None]:
def tokenize(sentences):
    """Fit a new ``Tokenizer`` on ``sentences`` and encode them with it.

    Returns a tuple ``(sequences, tokenizer)``: the result of
    ``texts_to_sequences`` for the given sentences, and the fitted tokenizer
    so that callers can access its ``word_index`` afterwards.
    """
    fitted_tokenizer = Tokenizer()
    fitted_tokenizer.fit_on_texts(sentences)
    sequences = fitted_tokenizer.texts_to_sequences(sentences)
    return sequences, fitted_tokenizer

In [None]:
# Clean sentences
english_sentences = [clean_sentence(pair[0]) for pair in pairs]
spanish_sentences = [clean_sentence(pair[1]) for pair in pairs]

# Tokenize words
spa_text_tokenized, spa_text_tokenizer = tokenize(spanish_sentences)
eng_text_tokenized, eng_text_tokenizer = tokenize(english_sentences)

print('Maximum length spanish sentence: {}'.format(len(max(spa_text_tokenized,key=len))))
print('Maximum length english sentence: {}'.format(len(max(eng_text_tokenized,key=len))))

# Check language length
spanish_vocab = len(spa_text_tokenizer.word_index) + 1
english_vocab = len(eng_text_tokenizer.word_index) + 1
print("Spanish vocabulary is of {} unique words".format(spanish_vocab))
print("English vocabulary is of {} unique words".format(english_vocab))

Maximum length spanish sentence: 12
Maximum length english sentence: 6
Spanish vocabulary is of 7198 unique words
English vocabulary is of 3738 unique words


**Padding**

In [None]:
# Length of the longest tokenized example sentence
exp_max_length = len(max(exp_text_tokenized, key=len))
print('Maximum length of example sentence: {}'.format(exp_max_length))
# Pad tokenized vectors with trailing zeros ("post") up to the longest sentence.
# The length is taken from the data instead of being hard-coded, so the demo
# stays correct if the example sentences change.
exp_pad_sentence = pad_sequences(exp_text_tokenized, exp_max_length, padding = "post")
for index, pad_sentence in enumerate(exp_pad_sentence):
    print("Example sentence {}:".format(index+1))
    print("  -Input:{}".format(exp_text_tokenized[index]))
    print("  -Output:{}".format(pad_sentence))


Maximum length of example sentence: 8
Example sentence 1:
  -Input:[1, 2, 6, 7]
  -Output:[1 2 6 7 0 0 0 0]
Example sentence 2:
  -Input:[8, 9, 1, 2, 10, 3, 4, 5]
  -Output:[ 8  9  1  2 10  3  4  5]
Example sentence 3:
  -Input:[11, 12, 3, 13, 14, 15, 4, 5]
  -Output:[11 12  3 13 14 15  4  5]


In [None]:
# Pad both languages to one common length: the longest tokenized sentence in
# either language. Deriving it from the data (rather than hard-coding it)
# matters because pad_sequences silently truncates any sequence longer than
# the requested length.
max_sentence_length = max(
    max(len(sequence) for sequence in spa_text_tokenized),
    max(len(sequence) for sequence in eng_text_tokenized),
)
spa_pad_sentence = pad_sequences(spa_text_tokenized, max_sentence_length, padding = "post")
eng_pad_sentence = pad_sequences(eng_text_tokenized, max_sentence_length, padding = "post")

# Reshape data: append a trailing axis of size 1 to each padded array
spa_pad_sentence = spa_pad_sentence.reshape(*spa_pad_sentence.shape, 1)
eng_pad_sentence = eng_pad_sentence.reshape(*eng_pad_sentence.shape, 1)

## Create the RNN Model

In [None]:
def logits_to_sentence(logits, tokenizer):
    """Decode per-position scores into a space-separated sentence.

    ``logits`` is a 2-D array with one row per position and one column per
    vocabulary index. For every row the highest-scoring index is looked up in
    the inverse of ``tokenizer.word_index``; index 0 is rendered as
    ``<empty>``.
    """
    # Invert the tokenizer mapping: index -> word
    vocabulary = {}
    for word, idx in tokenizer.word_index.items():
        vocabulary[idx] = word
    # Index 0 is the padding value, so give it a visible placeholder
    vocabulary[0] = '<empty>'

    best_indices = np.argmax(logits, axis=1)
    return ' '.join(vocabulary[token_id] for token_id in best_indices)

In [None]:
# One value per time step (the padded arrays were reshaped to a trailing axis of size 1).
# NOTE: the same assignment is repeated at the top of the model cell that follows.
input_shape = (max_sentence_length, 1)

In [None]:
# One value per time step: the padded arrays carry a trailing axis of size 1
input_shape = (max_sentence_length, 1)
input_sequence = Input(shape=input_shape, name='InputLayer')
# return_sequences=True keeps an output for every time step, not just the last
rnn = LSTM(units=256, return_sequences=True, dropout=0.5, name='RNNLayer')(input_sequence)
# Apply the same dense layer at every time step, one score per Spanish vocabulary index
logits = TimeDistributed(Dense(units=spanish_vocab), name='TimeDistributed')(rnn)
# Turn the scores into a probability distribution per time step
probabilities = Activation('softmax')(logits)

model = Model(inputs=input_sequence, outputs=probabilities)
model.compile(
    optimizer=Adam(learning_rate=1e-2),
    loss=sparse_categorical_crossentropy,
    metrics=['accuracy'],
)

In [None]:
# Print the layers with their output shapes and parameter counts
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
InputLayer (InputLayer)      [(None, 12, 1)]           0         
_________________________________________________________________
RNNLayer (LSTM)              (None, 12, 256)           264192    
_________________________________________________________________
TimeDistributed (TimeDistrib (None, 12, 7198)          1849886   
_________________________________________________________________
activation (Activation)      (None, 12, 7198)          0         
Total params: 2,114,078
Trainable params: 2,114,078
Non-trainable params: 0
_________________________________________________________________


In [None]:
%load_ext tensorboard

In [None]:
batch_size = 30

# Directory that receives the checkpoints, and the file name pattern inside it.
# The `{epoch:04d}` placeholder puts the epoch in the file name (uses `str.format`).
checkpoint_dir = "training_2"
checkpoint_path = checkpoint_dir + "/cp-{epoch:04d}.ckpt"
# A timestamped sub-directory keeps the TensorBoard logs of separate runs apart
log_dir = "logs/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

logs_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir,
                                               histogram_freq=1)
# NOTE: an integer `save_freq` is counted in batches, so the weights are written
# every 5 * batch_size = 150 batches. That is several times per epoch, and saves
# within one epoch reuse the same file name.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                         verbose=1,
                                                         save_weights_only=True,
                                                         save_freq=5*batch_size)

In [None]:
# Resume from previously saved weights when they exist. On a fresh run nothing
# has been saved yet (the weights are only written after training), so an
# unconditional load would fail; in that case training starts from scratch.
weights_path = 'rnn-translator-weights'
# A checkpoint in TensorFlow format is stored as `<prefix>.index` plus data shards
if tf.io.gfile.exists(weights_path + '.index'):
    model.load_weights(weights_path)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f9ae349c278>

In [None]:
# Train with English as input and Spanish as target, logging to TensorBoard
# and checkpointing the weights along the way
history = model.fit(
    x=eng_pad_sentence,
    y=spa_pad_sentence,
    batch_size=batch_size,
    epochs=10,
    callbacks=[logs_callback, checkpoint_callback],
)

Epoch 1/10
Instructions for updating:
use `tf.profiler.experimental.stop` instead.
Epoch 00001: saving model to training_2/cp-0001.ckpt
Epoch 00001: saving model to training_2/cp-0001.ckpt
Epoch 00001: saving model to training_2/cp-0001.ckpt
Epoch 00001: saving model to training_2/cp-0001.ckpt
Epoch 2/10
115/634 [====>.........................] - ETA: 1:49 - loss: 1.7001 - accuracy: 0.7433
Epoch 00002: saving model to training_2/cp-0002.ckpt
Epoch 00002: saving model to training_2/cp-0002.ckpt
Epoch 00002: saving model to training_2/cp-0002.ckpt
Epoch 00002: saving model to training_2/cp-0002.ckpt
Epoch 3/10
 81/634 [==>...........................] - ETA: 1:57 - loss: 1.6772 - accuracy: 0.7458
Epoch 00003: saving model to training_2/cp-0003.ckpt
Epoch 00003: saving model to training_2/cp-0003.ckpt
Epoch 00003: saving model to training_2/cp-0003.ckpt
Epoch 00003: saving model to training_2/cp-0003.ckpt
Epoch 4/10
 47/634 [=>............................] - ETA: 2:02 - loss: 1.7009 - accu

In [None]:
# Persist the current weights so that a later session can reload them
model.save_weights('rnn-translator-weights')

In [None]:
%tensorboard --logdir logs

In [None]:
# Compare the reference translation of one sentence with the model output
index = 10
print("The english sentence is: {}".format(english_sentences[index]))
print("The spanish sentence is: {}".format(spanish_sentences[index]))
print('The predicted sentence is :')
# Slice (rather than index) to keep the batch axis for predict(); [0] drops it again
sample_batch = eng_pad_sentence[index:index+1]
sample_logits = model.predict(sample_batch)[0]
print(logits_to_sentence(sample_logits, spa_text_tokenizer))

The english sentence is: be content
The spanish sentence is: estate contento
The predicted sentence is :
es se <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty>
