<a href="https://colab.research.google.com/github/goyetc/machine_translation/blob/master/HW5_rev3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Part 1: English -> Spanish

A minimal example based on the TensorFlow [tutorial](https://www.tensorflow.org/alpha/tutorials/sequences/nmt_with_attention) and this helpful [article](https://machinetalk.org/2019/03/29/neural-machine-translation-with-attention-mechanism/).

In [0]:
!pip install -q tensorflow-gpu==2.0.0-alpha0

In [0]:
!pip install sacrebleu # https://github.com/mjpost/sacreBLEU



In [0]:
import numpy as np
import re
import sacrebleu
import tensorflow as tf
import time
import unicodedata

In [0]:
import os

In [0]:
os.getcwd()

'/content'

####Prep the data

In [0]:
file = open('spa.txt','r')

In [0]:
#eng = list()
#spa = list()
data = list()
for line in file:
  set = re.split(r'\t+', line)
  set[1] = set[1].rstrip()
  data.append(set)
  
file.close()
  

In [0]:
import random
random.seed(42)

In [0]:
len(data)

118964

In [0]:
sample = random.randint(0,len(data))
data[sample]

['There is no doubt about his ability.',
 'No hay ninguna duda sobre su habilidad.']

use this to test the model, then save it and move on

In [0]:
data_sample = random.sample(data,5000)
train = data_sample[:4000]
test = data_sample[4000:]


In [0]:
#run the test code through the model immediately after it finishes training. do not look at it. 

In [0]:
len(train), len(test)

(4000, 1000)

In [0]:
sentences = train

In [0]:
def preprocess(s):
  # for details, see https://www.tensorflow.org/alpha/tutorials/sequences/nmt_with_attention
  s = ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')
  s = re.sub(r"([?.!,¿])", r" \1 ", s)
  s = re.sub(r'[" "]+', " ", s)
  s = re.sub(r"[^a-zA-Z?.!,¿]+", " ", s)
  s = s.strip()
  s = '<start> ' + s + ' <end>'
  return s

In [0]:
print("Original:", sentences[0])
sentences = [(preprocess(source), preprocess(target)) for (source, target) in sentences]
print("Preprocessed:", sentences[0])

Original: ['They were amazing.', 'Fueron maravillosos.']
Preprocessed: ('<start> They were amazing . <end>', '<start> Fueron maravillosos . <end>')


In [0]:
source_sentences, target_sentences = list(zip(*sentences))

In [0]:
type(target_sentences)

tuple

In [0]:
type(target_sentences[0])

str

In [0]:
type(source_sentences)

tuple

In [0]:
source_sentences[0]

'<start> They were amazing . <end>'

In [0]:
source_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
source_tokenizer.fit_on_texts(source_sentences)
source_data = source_tokenizer.texts_to_sequences(source_sentences)
print("Sequence:", source_data[0])
source_data = tf.keras.preprocessing.sequence.pad_sequences(source_data, padding='post')
print("Padded:", source_data[0])

Sequence: [1, 48, 56, 1539, 3, 2]
Padded: [   1   48   56 1539    3    2    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0]


In [0]:
source_data.shape

(4000, 28)

In [0]:
target_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
target_tokenizer.fit_on_texts(target_sentences)
target_data = target_tokenizer.texts_to_sequences(target_sentences)
print("Sequence:", target_data[0])
target_data = tf.keras.preprocessing.sequence.pad_sequences(target_data, padding='post')
print("Padded:", target_data[0])

Sequence: [1, 316, 1779, 3, 2]
Padded: [   1  316 1779    3    2    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0]


In [0]:
type(target_data)

numpy.ndarray

In [0]:
target_data.shape

(4000, 27)

In [0]:
# Create labels for the decoder by shifting the target sequence
# one to the right.
target_labels = np.zeros(target_data.shape)
target_labels[:,0:target_data.shape[1] -1] = target_data[:,1:]

print("Target sequence", target_data[0])
print("Target label", target_labels[0])

Target sequence [   1  316 1779    3    2    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0]
Target label [ 316. 1779.    3.    2.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.]


In [0]:
source_vocab_size = len(source_tokenizer.word_index) + 1
print("source vocab size: " + str(source_vocab_size))
target_vocab_size = len(target_tokenizer.word_index) + 1
print("target vocab size: " + str(target_vocab_size))


source vocab size: 3156
target vocab size: 4467


Interesting that the spanish vocabulary in this sample is ~50% larger than that of english

#### define functions..encoder and decoder

In [0]:
def decode(encoded, tokenizer):
  for number in encoded:
    if number !=0:
      print ("%d -> %s" % (number, tokenizer.index_word[number]))
      
decode(source_data[0], source_tokenizer)

1 -> <start>
48 -> they
56 -> were
1539 -> amazing
3 -> .
2 -> <end>


In [0]:
batch_size = 5
dataset = tf.data.Dataset.from_tensor_slices((source_data, target_data, target_labels)).batch(batch_size)

In [0]:
example_batch = next(iter(dataset))
source, target, taget_labels = example_batch
print("Shapes:", source.shape, target.shape, taget_labels.shape)

Shapes: (5, 28) (5, 27) (5, 27)


embedding size
* chose an rnn with 64 units, and an embedding depth of 32

In [0]:
embedding_size = 32
rnn_size = 64

In [0]:
class Encoder(tf.keras.Model):
  def __init__(self):
    super(Encoder, self).__init__()
    
    self.embedding = tf.keras.layers.Embedding(source_vocab_size,
                                               embedding_size)
    self.gru = tf.keras.layers.GRU(rnn_size, 
                                   return_sequences=True, 
                                   return_state=True)
    
  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state=hidden)        
    return output, state
  
  def init_state(self, batch_size):
    return tf.zeros((batch_size, rnn_size))

Demonstrate calling the encoder.
* Note that I create a unique instance of encoder and decoder for part 1

In [0]:
# Create a batch of one sentence

s = random.randint(0,len(sentences))

ex_sentence = tf.expand_dims(source_data[s], axis=0)
ex_translation = tf.expand_dims(target_data[s], axis=0)
ex_labels = tf.expand_dims(target_labels[s], axis=0)
print(ex_sentence.shape)

encoder = Encoder()
hidden_state = encoder.init_state(batch_size=1)
print(hidden_state.shape)

output, hidden_state = encoder(ex_sentence, hidden_state)
print(output.shape)

(1, 28)
(1, 64)
(1, 28, 64)


In [0]:
class Decoder(tf.keras.Model):
  def __init__(self):
    super(Decoder, self).__init__()
    self.embedding = tf.keras.layers.Embedding(target_vocab_size, 
                                               embedding_size)
    self.gru = tf.keras.layers.GRU(rnn_size, 
                                   return_sequences=True, 
                                   return_state=True)

    self.dense = tf.keras.layers.Dense(target_vocab_size)


  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state=hidden)
    logits = self.dense(output)
    return logits, state

Demonstrate calling the decoder.

In [0]:
decoder = Decoder()
decoder_output, decoder_state1 = decoder(ex_labels, hidden_state)
print(decoder_output.shape)

(1, 27, 4467)


In [0]:
crossentropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

def calc_loss(targets, logits):
  mask = tf.math.logical_not(tf.math.equal(targets, 0))
  mask = tf.cast(mask, dtype=tf.int64)
  return crossentropy(targets, logits, sample_weight=mask)

print("Loss", calc_loss(ex_labels, decoder_output))

Loss tf.Tensor(1.8678355, shape=(), dtype=float32)


#### Translate function for part 1

In [0]:
def translate_eng_to_esp(idx=None):
  
    if idx == None: 
      idx = np.random.choice(len(sentences))
    
    input_sent = source_data[idx]
    input_sent = tf.expand_dims(input_sent, axis=0)
    
    hidden_state = encoder.init_state(batch_size=1)
    output, hidden_state = encoder(input_sent, hidden_state)
    
    decoder_input = tf.expand_dims([target_tokenizer.word_index['<start>']], 0)
    out_words = []
    
    decoder_state = hidden_state

    while True:
      
        decoder_output, decoder_state = decoder(decoder_input, decoder_state)
        decoder_input = tf.argmax(decoder_output, -1)
        word_idx = decoder_input.numpy()[0][0]
        # if we've predicted 0 (which is reserved, usually this will only happen
        # before the decoder is trained, just stop translating and return
        # what we have)
        if word_idx == 0: 
          out_words.append('<end>')
        else:
          out_words.append(target_tokenizer.index_word[word_idx])

        if out_words[-1] == '<end>' or len(out_words) >= 20:
          break
          
    translation = ' '.join(out_words)    
    return sentences[idx][0], sentences[idx][1], translation

In [0]:
input_sent, target_sent, translation = translate_eng_to_esp()
print("Input: %s\nTarget: %s\nTranslation: %s\n" % (input_sent, target_sent, translation))

Input: <start> I did so at his request . <end>
Target: <start> Yo lo hice a peticion suya . <end>
Translation: quedaran malgastado escribire considerate sereno recordaron llamara fijare taxi diversiones propuesta adorable heroe llaves queria llegada tren sentados olvidado vive



* gibberish, as expected

#### Train model

In [0]:
optimizer = tf.keras.optimizers.Adam()

In [0]:
@tf.function # remove this annotation when debugging
def train_step_1(source_seq, target_seq, target_labels, initial_state):
  
  with tf.GradientTape() as tape:
    encoder_output, encoder_state = encoder(source_seq, initial_state)
    logits, decoder_state = decoder(target_seq, encoder_state)
    loss = calc_loss(target_labels, logits)

  variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  return loss

In [0]:
EPOCHS = 200

for epoch in range(EPOCHS):
    start = time.time()
  
    en_initial_states = encoder.init_state(batch_size)
    
    for batch, (source_seq, target_seq, target_labels) in enumerate(dataset):
      loss = train_step_1(source_seq, target_seq, target_labels, en_initial_states)
      elapsed = time.time() - start
    
    if epoch % 10 == 0:
      print("Epoch #%d, Loss %.4f, Time %.2f sec" % (epoch, loss, elapsed))
      input_sent, target_sent, translation = translate_eng_to_esp()
      print("Input: %s\nTarget: %s\nTranslation: %s\n" % (input_sent, target_sent, translation))

Epoch #0, Loss 1.1651, Time 18.16 sec
Input: <start> You don t have to get married if you don t want to . <end>
Target: <start> No tenes que casarte si no queres . <end>
Translation: ¿ no no no no la . <end>

Epoch #10, Loss 0.7453, Time 14.05 sec
Input: <start> Don t worry . He may look intimidating at first glance , but he s actually a very friendly person . <end>
Target: <start> No temas , al principio puede parecer intimidante , pero el es realmente una persona muy amigable . <end>
Translation: no me gusta una carta . <end>

Epoch #20, Loss 0.5470, Time 7.27 sec
Input: <start> What s that for ? <end>
Target: <start> ¿ Para que vale eso ? <end>
Translation: ¿ que te gusta el ? <end>

Epoch #30, Loss 0.4423, Time 7.42 sec
Input: <start> I ll drive to Boston . <end>
Target: <start> Conducire hasta Boston . <end>
Translation: yo renuncie a la estacion . <end>

Epoch #40, Loss 0.3688, Time 7.18 sec
Input: <start> It is necessary for you to go there . <end>
Target: <start> Es necesario q

Loss < 0.10 after 100 epochs, looks pretty good.

But clearly still some funny quirks, a la epoch 130:
* Input: <start> I like English , too . <end>
* Target: <start> A mi tambien me gusta el ingles . <end>
* Translation: me gusta jugar al beisbol . <end>
  
Which translates to.. "I like to play baseball". The english prefer cricket..

Loss appears to flatten out around 150 epochs (more like 200), and we're likely overfitting to the sample set after this point. 

Calculate BLEU Score.

In [0]:
original1, references1, hypotheses1 = [], [], []


for i in range(len(sentences)):
#for i in range(10):
  input_sent, target_sent, translation = translate_eng_to_esp()
  original1.append(input_sent)
  references1.append(target_sent)
  hypotheses1.append("<start> " + translation)
  
results1 = sacrebleu.raw_corpus_bleu(hypotheses1, [references1])
print(results1)

BLEU(score=20.61060634910397, counts=[18554, 8683, 3912, 2928], totals=[38115, 34115, 30115, 26115], precisions=[48.678997769906864, 25.452147149347795, 12.990204217167525, 11.211947156806433], bp=1.0, sys_len=38115, ref_len=36978)


despite minimal loss, I achieved BLEU scores of 50 or below on multiple runs.. 
* except one, where I achieved 60:
* BLEU(score=59.856526611808704, counts=[30867, 21448, 16608, 12947], totals=[38537, 34537, 30537, 26537], precisions=[80.09704958870695, 62.10151431797782, 54.38648197268887, 48.788484003466856], bp=0.9930697700178548, sys_len=38537, ref_len=38805)

translate a sentence:

In [0]:
input_sent1, target_sent1, translation1 = translate_eng_to_esp(10)

In [0]:
print("Input: %s\nTarget: %s\nTranslation: %s\n" % (input_sent1, target_sent1, translation1))

Input: <start> I don t think there s anything in the box . <end>
Target: <start> No creo que haya nada en la caja . <end>
Translation: no creo que haya nada en la caja . <end>



#Part 2: Spanish -> English

In [0]:
train[0]

['They were amazing.', 'Fueron maravillosos.']

In [0]:
sentences_swapped = [(t[1], t[0]) for t in train]

In [0]:
sentences_swapped[0]

('Fueron maravillosos.', 'They were amazing.')

In [0]:
print("Original:", sentences_swapped[0])
sentences_swapped = [(preprocess(source), preprocess(target)) for (source, target) in sentences_swapped]
print("Preprocessed:", sentences_swapped[0])

Original: ('Fueron maravillosos.', 'They were amazing.')
Preprocessed: ('<start> Fueron maravillosos . <end>', '<start> They were amazing . <end>')


In [0]:
source_sentences2, target_sentences2 = list(zip(*sentences_swapped))

In [0]:
source_sentences2[0], target_sentences2[0]

('<start> Fueron maravillosos . <end>', '<start> They were amazing . <end>')

In [0]:
hypotheses1[0], original1[0]

('<start> estoy ansioso por escuchar lo que piensas de este asunto . <end>',
 '<start> I m sure Tom will succeed . <end>')

### process data for second model

In [0]:
source_tokenizer2 = tf.keras.preprocessing.text.Tokenizer(filters='')
source_tokenizer2.fit_on_texts(source_sentences2)
source_data2 = source_tokenizer2.texts_to_sequences(source_sentences2)
print("Sequence:", source_data2[0])
source_data2 = tf.keras.preprocessing.sequence.pad_sequences(source_data2, padding='post')
print("Padded:", source_data2[0])

Sequence: [1, 316, 1779, 3, 2]
Padded: [   1  316 1779    3    2    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0]


In [0]:
target_tokenizer2 = tf.keras.preprocessing.text.Tokenizer(filters='')
target_tokenizer2.fit_on_texts(target_sentences2)
target_data2 = target_tokenizer2.texts_to_sequences(target_sentences2)
print("Sequence:", target_data2[0])
target_data2 = tf.keras.preprocessing.sequence.pad_sequences(target_data2, padding='post')
print("Padded:", target_data2[0])

Sequence: [1, 48, 56, 1539, 3, 2]
Padded: [   1   48   56 1539    3    2    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0]


In [0]:
# Create labels for the decoder by shifting the target sequence
# one to the right.
target_labels2 = np.zeros(target_data2.shape)
target_labels2[:,0:target_data2.shape[1] -1] = target_data2[:,1:]

print("Target sequence", target_data2[0])
print("Target label", target_labels2[0])

Target sequence [   1   48   56 1539    3    2    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0]
Target label [  48.   56. 1539.    3.    2.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.]


In [0]:
source_vocab_size2 = len(source_tokenizer2.word_index) + 1
print("source vocab size: " + str(source_vocab_size2))
target_vocab_size2 = len(target_tokenizer2.word_index) + 1
print("target vocab size: " + str(target_vocab_size2))


source vocab size: 4467
target vocab size: 3156


Note that here, we have a compressed vocab size from source to target. Suspect this might produce higher loss, but actually a better BLEU score

In [0]:
decode(source_data2[0], source_tokenizer2)

1 -> <start>
316 -> fueron
1779 -> maravillosos
3 -> .
2 -> <end>


In [0]:
#batch_size = 5
dataset2 = tf.data.Dataset.from_tensor_slices((source_data2, target_data2, target_labels2)).batch(batch_size)

In [0]:
example_batch2 = next(iter(dataset2))
source2, target2, taget_labels2 = example_batch2
print("Shapes:", source2.shape, target2.shape, taget_labels2.shape)

Shapes: (5, 27) (5, 28) (5, 28)


In [0]:
#same as before
#embedding_size = 32
#rnn_size = 64

In [0]:
class Encoder2(tf.keras.Model):
  def __init__(self):
    super(Encoder2, self).__init__()
    
    self.embedding = tf.keras.layers.Embedding(source_vocab_size2,
                                               embedding_size)
    self.gru = tf.keras.layers.GRU(rnn_size, 
                                   return_sequences=True, 
                                   return_state=True)
    
  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state=hidden)        
    return output, state
  
  def init_state(self, batch_size):
    return tf.zeros((batch_size, rnn_size))

Demonstrate calling the encoder.

In [0]:
# Create a batch of one sentence

s2 = random.randint(0,len(sentences_swapped))

ex_sentence2 = tf.expand_dims(source_data2[s], axis=0)
ex_translation2 = tf.expand_dims(target_data2[s], axis=0)
ex_labels2 = tf.expand_dims(target_labels2[s], axis=0)
print(ex_sentence2.shape)

encoder2 = Encoder2()
hidden_state2 = encoder2.init_state(batch_size=1)
print(hidden_state2.shape)

output2, hidden_state2 = encoder2(ex_sentence2, hidden_state2)
print(output2.shape)

(1, 27)
(1, 64)
(1, 27, 64)


In [0]:
class Decoder2(tf.keras.Model):
  def __init__(self):
    super(Decoder2, self).__init__()
    self.embedding = tf.keras.layers.Embedding(target_vocab_size2, 
                                               embedding_size)
    self.gru = tf.keras.layers.GRU(rnn_size, 
                                   return_sequences=True, 
                                   return_state=True)

    self.dense = tf.keras.layers.Dense(target_vocab_size2)


  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state=hidden)
    logits = self.dense(output)
    return logits, state

Demonstrate calling the decoder.

In [0]:
decoder2 = Decoder2()
decoder2_output, decoder2_state = decoder2(ex_labels2, hidden_state2)
print(decoder2_output.shape)

(1, 28, 3156)


In [0]:
#crossentropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

'''def calc_loss(targets, logits):
  mask = tf.math.logical_not(tf.math.equal(targets, 0))
  mask = tf.cast(mask, dtype=tf.int64)
  return crossentropy(targets, logits, sample_weight=mask)'''

print("Loss", calc_loss(ex_labels2, decoder2_output))

Loss tf.Tensor(1.4386643, shape=(), dtype=float32)


### Esp to eng

In [0]:
def translate_esp_to_eng(idx=None):
  
    if idx == None: 
      idx = np.random.choice(len(sentences_swapped))
    
    input_sent = source_data2[idx]
    input_sent = tf.expand_dims(input_sent, axis=0)
    
    hidden_state = encoder2.init_state(batch_size=1)
    output, hidden_state = encoder2(input_sent, hidden_state)
    
    decoder2_input = tf.expand_dims([target_tokenizer2.word_index['<start>']], 0)
    out_words = []
    
    decoder2_state = hidden_state

    while True:
      
        decoder2_output, decoder2_state = decoder2(decoder2_input, decoder2_state)
        decoder2_input = tf.argmax(decoder2_output, -1)
        word_idx = decoder2_input.numpy()[0][0]
        # if we've predicted 0 (which is reserved, usually this will only happen
        # before the decoder is trained, just stop translating and return
        # what we have)
        if word_idx == 0: 
          out_words.append('<end>')
        else:
          out_words.append(target_tokenizer2.index_word[word_idx])

        if out_words[-1] == '<end>' or len(out_words) >= 20:
          break
          
    translation = ' '.join(out_words)    
    return sentences_swapped[idx][0], sentences_swapped[idx][1], translation

In [0]:
input_sent, target_sent, translation = translate_esp_to_eng()
print("Input: %s\nTarget: %s\nTranslation: %s\n" % (input_sent, target_sent, translation))

Input: <start> No puedo ayudar a Tom con su tarea . <end>
Target: <start> I can t help Tom with his homework . <end>
Translation: five take aboard hawaii stairs ears chose bitten unlocked proper awful anyone sleeping cousin awful guarantee programs visit blaming chose



gibberish here again, as expected prior to training

In [0]:
optimizer = tf.keras.optimizers.Adam()

In [0]:
@tf.function # remove this annotation when debugging
def train_step_2(source_seq, target_seq, target_labels2, initial_state):
  
  with tf.GradientTape() as tape:
    encoder2_output, encoder2_state = encoder2(source_seq, initial_state)
    logits, decoder2_state = decoder2(target_seq, encoder2_state)
    loss = calc_loss(target_labels2, logits)

  variables = encoder2.trainable_variables + decoder2.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  return loss

200 Epochs were initially used as I observed overfitting in part 1 near this #, but I found 200 epochs to produce inferior results for a re-do of english -> spanish. 
* Sticking with 300

In [0]:
EPOCHS = 200

for epoch in range(EPOCHS):
    start = time.time()
  
    en_initial_states2 = encoder2.init_state(batch_size)
    
    for batch, (source_seq, target_seq, target_labels2) in enumerate(dataset2):
      loss2 = train_step_2(source_seq, target_seq, target_labels2, en_initial_states2)
      elapsed = time.time() - start
    
    if epoch % 10 == 0:
      print("Epoch #%d, Loss %.4f, Time %.2f sec" % (epoch, loss2, elapsed))
      input_sent, target_sent, translation = translate_esp_to_eng()
      print("Input: %s\nTarget: %s\nTranslation: %s\n" % (input_sent, target_sent, translation))

Epoch #0, Loss 1.0993, Time 9.20 sec
Input: <start> Tom mando a Mary a casa . <end>
Target: <start> Tom sent Mary home . <end>
Translation: i is to the lot . <end>

Epoch #10, Loss 0.6501, Time 7.25 sec
Input: <start> He estado leyendo este libro . <end>
Target: <start> I have been reading this book . <end>
Translation: i don t know what you want to be here . <end>

Epoch #20, Loss 0.4849, Time 7.23 sec
Input: <start> Me dormi escuchando la radio . <end>
Target: <start> I fell asleep while listening to the radio . <end>
Translation: i didn t know if you were . <end>

Epoch #30, Loss 0.3511, Time 7.24 sec
Input: <start> Quieto . <end>
Target: <start> Stay still . <end>
Translation: leave your house . <end>

Epoch #40, Loss 0.2692, Time 7.24 sec
Input: <start> Fueron al festival de musica . <end>
Target: <start> They went to the music festival . <end>
Translation: they went to school . <end>

Epoch #50, Loss 0.2288, Time 7.28 sec
Input: <start> El esta con otro telefono . <end>
Target: <

swap source and target location to preserve rest of script
* Note that the same dataset was used as part 1 for a controlled comparison of performance

Loss is higher than the equivalent english -> spanish results, using the same sample of data.
* Wonder if this has to do with the 'compressed' vocab size of english vs spanish. 
* Still, results look good, and the BLEU score is not significantly different
* In fact, the BLEU score is higher for the Spanish -> English translation, despite greater loss
* - again, this may have to do with a smaller vocabulary in the English language portion of the sample

Calculate BLEU Score.

In [0]:
references2, hypotheses2 = [], []


for i in range(len(sentences_swapped)):
  input_sent, target_sent, translation = translate_esp_to_eng()
  references2.append(target_sent)
  hypotheses2.append("<start> " + translation)
  
results2 = sacrebleu.raw_corpus_bleu(hypotheses2, [references2])
print(results2)

BLEU(score=56.27093543007827, counts=[29793, 20184, 15353, 11890], totals=[38657, 34657, 30657, 26657], precisions=[77.07012960136586, 58.23931673255042, 50.07991649541703, 44.603668829950855], bp=1.0, sys_len=38657, ref_len=38442)


the bleu score is consistently higher for spanish to english, despite higher loss. 

In [0]:
input_sent2, target_sent2, translation2 = translate_esp_to_eng(10)

In [0]:
print("Input: %s\nTarget: %s\nTranslation: %s\n" % (input_sent2, target_sent2, translation2))

Input: <start> No creo que haya nada en la caja . <end>
Target: <start> I don t think there s anything in the box . <end>
Translation: i don t think there s anything in so those went shopping elsewhere . <end>



In [0]:
input_sent2, target_sent2, translation2 = translate_esp_to_eng(random.randint(0,len(test)))

In [0]:
print("Input: %s\nTarget: %s\nTranslation: %s\n" % (input_sent2, target_sent2, translation2))

Input: <start> Ella es buena esquiando . <end>
Target: <start> She is good at skiing . <end>
Translation: she is good at skiing . <end>



# Part 3

## Part 3
* Use test set from beginning -  created true/unobserved test set by pulling a random sample of 5000 sets and then partitioning that sample into train and test sets
* We will extract the translations after the first english -> spanish training steps, and then use those translations as the source for a spanish to english translation. 
* The tokenizations will also be used from the second model, as we are treating the translated words from the original model in part 1 as the ground trouth in part 3. 

In [0]:
for i in range(len(test)):
  input_sent, target_sent, translation = translate_esp_to_eng(i)

In [0]:
hypotheses1[:10]

['<start> estoy ansioso por escuchar lo que piensas de este asunto . <end>',
 '<start> el alcanzare pronto . <end>',
 '<start> parece estar muy de clase . <end>',
 '<start> ¿ no lo mas este de cinco ? <end>',
 '<start> ya tienes edad para mantenerte solo . <end>',
 '<start> se durmio en clase de historia . <end>',
 '<start> ¿ cual es tu talla de correos he estado fumando tomar ? <end>',
 '<start> no quiero esperar hoy . <end>',
 '<start> una mirada contenta aparecio en su rostro . <end>',
 '<start> que me gustaria estar bien con mi . <end>']

In [0]:
original1[:10]

['<start> I m sure Tom will succeed . <end>',
 '<start> You were lucky . <end>',
 '<start> He seems quite happy . <end>',
 '<start> Don t you think this computer game may be a little too difficult for Tom ? <end>',
 '<start> You are now old enough to support yourself . <end>',
 '<start> I know that would make me happy . <end>',
 '<start> What did you buy her for Christmas ? <end>',
 '<start> I don t want any distractions . <end>',
 '<start> A look of contentment appeared on his face . <end>',
 '<start> She was poor , but she was honest . <end>']

In [0]:
source_data3 = source_tokenizer2.texts_to_sequences(hypotheses1)
print("Sequence:", source_data3[9])
source_data3 = tf.keras.preprocessing.sequence.pad_sequences(source_data3, padding='post')
print("Padded:", source_data3[9])

Sequence: [1, 4, 17, 171, 130, 115, 26, 24, 3, 2]
Padded: [  1   4  17 171 130 115  26  24   3   2   0   0   0   0   0   0   0   0
   0   0   0]


In [0]:
target_data3 = target_tokenizer2.texts_to_sequences(original1)
print("Sequence:", target_data3[9])
target_data3 = tf.keras.preprocessing.sequence.pad_sequences(target_data3, padding='post')
print("Padded:", target_data3[9])

Sequence: [1, 26, 21, 980, 19, 88, 26, 21, 703, 3, 2]
Padded: [  1  26  21 980  19  88  26  21 703   3   2   0   0   0   0   0   0   0
   0   0   0   0   0   0]


In [0]:
# Create labels for the decoder by shifting the target sequence
# one to the right.
target_labels3 = np.zeros(target_data3.shape)
target_labels3[:,0:target_data3.shape[1] -1] = target_data3[:,1:]

print("Target sequence", target_data3[0])
print("Target label", target_labels3[0])

Target sequence [  1   4  46 215   8  60 754   3   2   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0]
Target label [  4.  46. 215.   8.  60. 754.   3.   2.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.]


In [0]:
source_vocab_size3 = len(source_tokenizer2.word_index) + 1
print("source vocab size: " + str(source_vocab_size3))
target_vocab_size3 = len(target_tokenizer2.word_index) + 1
print("target vocab size: " + str(target_vocab_size3))


source vocab size: 4467
target vocab size: 3156


In [0]:
hypotheses1[0]

'<start> estoy ansioso por escuchar lo que piensas de este asunto . <end>'

In [0]:
decode(source_data3[0], source_tokenizer2)

1 -> <start>
51 -> estoy
4360 -> ansioso
22 -> por
1012 -> escuchar
21 -> lo
4 -> que
345 -> piensas
5 -> de
39 -> este
328 -> asunto
3 -> .
2 -> <end>


In [0]:
original1[0]

'<start> I m sure Tom will succeed . <end>'

In [0]:
decode(target_data3[0], target_tokenizer2)

1 -> <start>
4 -> i
46 -> m
215 -> sure
8 -> tom
60 -> will
754 -> succeed
3 -> .
2 -> <end>


In [0]:
#batch_size = 5
dataset3 = tf.data.Dataset.from_tensor_slices((source_data3, target_data3, target_labels3)).batch(batch_size)

In [0]:
def translate_esp_to_eng_2(idx=None):
  
    if idx == None: 
      idx = np.random.choice(len(test))
    
    input_sent = source_data3[idx]
    #decode(input_sent, source_tokenizer2)
    input_sent = tf.expand_dims(input_sent, axis=0)
    
    hidden_state = encoder2.init_state(batch_size=1)
    output, hidden_state = encoder2(input_sent, hidden_state)
    
    decoder2_input = tf.expand_dims([target_tokenizer2.word_index['<start>']], 0)
    out_words = []
    
    decoder2_state = hidden_state

    while True:
      
        decoder2_output, decoder2_state = decoder2(decoder2_input, decoder2_state)
        decoder2_input = tf.argmax(decoder2_output, -1)
        word_idx = decoder2_input.numpy()[0][0]
        # if we've predicted 0 (which is reserved, usually this will only happen
        # before the decoder is trained, just stop translating and return
        # what we have)
        if word_idx == 0: 
          out_words.append('<end>')
        else:
          out_words.append(target_tokenizer2.index_word[word_idx])

        if out_words[-1] == '<end>' or len(out_words) >= 20:
          break
          
    translation = ' '.join(out_words)    
    return hypotheses1[idx], original1[idx], translation

In [0]:
input_sent_test, target_sent_test, translation_test = translate_esp_to_eng_2(9)
print("Input: %s\nTarget: %s\nTranslation: %s\n" % (input_sent_test, target_sent_test, translation_test))

1 -> <start>
4 -> que
17 -> me
171 -> gustaria
130 -> estar
115 -> bien
26 -> con
24 -> mi
3 -> .
2 -> <end>
Input: <start> que me gustaria estar bien con mi . <end>
Target: <start> She was poor , but she was honest . <end>
Translation: i d like you to attend something ? <end>



In [0]:
references3, hypotheses3 = [], []
  
for i in range(len(test)):
  input_sent, target_sent, translation = translate_esp_to_eng_2(i)
  references3.append(target_sent)
  hypotheses3.append("<start> " + translation)
  
results3 = sacrebleu.raw_corpus_bleu(hypotheses3, [references3])
print(results3)


BLEU(score=19.511622655125215, counts=[4533, 2028, 967, 750], totals=[9749, 8749, 7749, 6749], precisions=[46.49707662324341, 23.179791976225854, 12.479029552200284, 11.112757445547489], bp=0.9923364194021157, sys_len=9749, ref_len=9824)


Bleu score not great.. 19.5 vs ~60 for the original ground truth spanish examples

In [0]:
input_sent_test, target_sent_test, translation_test = translate_esp_to_eng_2(8)
print("Input: %s\nTarget: %s\nTranslation: %s\n" % (input_sent_test, target_sent_test, translation_test))

Input: <start> una mirada contenta aparecio en su rostro . <end>
Target: <start> A look of contentment appeared on his face . <end>
Translation: a look of contentment appeared on his face . <end>

