### 2022-1 Artificial Intelligence (01)
## Live Session #4-1: Machine Translation (seq2seq) with RNN
---
Copyright (c) Prof. Jaehyeong Sim 

Department of Computer Science and Engineering

Ewha Womans University

ref: https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html

In [39]:
import re
import numpy as np
import pandas as pd
import tensorflow as tf
import unicodedata

from tensorflow.keras.layers import Embedding, GRU, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [40]:
# Upper bound on the number of sentence pairs read from the corpus file.
num_samples = 33_000

In [41]:
def to_ascii(s):
  """Return `s` with accents removed, e.g. 'déjà diné' -> 'deja dine'.

  The string is decomposed with Unicode NFD normalisation and every
  character whose category is 'Mn' (nonspacing mark) is discarded.
  Characters that do not decompose are passed through unchanged.
  """
  decomposed = unicodedata.normalize('NFD', s)
  base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
  return ''.join(base_chars)

def preprocess_sentence(sent):
  """Normalise a raw sentence into space-separated lowercase tokens.

  Steps: lowercase, strip accents via `to_ascii`, detach punctuation from
  the preceding word, replace everything except letters and ". ? !" with a
  space, then collapse whitespace.

  Args:
    sent: Raw sentence string.

  Returns:
    The cleaned sentence with single spaces between tokens and no leading
    or trailing whitespace.
  """
  sent = to_ascii(sent.lower())

  # Insert whitespace between words and punctuation
  # e.g. "I am a student." => "I am a student ."
  sent = re.sub(r"([?.!,¿])", r" \1", sent)

  # Replace every run of characters other than (a-z, A-Z, ".", "?", "!")
  # with a single space. Note that "," and "¿" are removed by this step.
  sent = re.sub(r"[^a-zA-Z!.?]+", r" ", sent)

  # Collapse repeated whitespace and trim the ends, so inputs that start or
  # end with a removed character (e.g. "(hello)") do not keep stray spaces.
  sent = re.sub(r"\s+", " ", sent).strip()
  return sent

In [42]:
# Sanity check: show one English and one French sentence before and after cleaning.
en_sent = u"Have you had dinner?"
fr_sent = u"Avez-vous déjà diné?"

en_clean = preprocess_sentence(en_sent)
fr_clean = preprocess_sentence(fr_sent)

print('English sentence before preprocessing :', en_sent)
print('English sentence after preprocessing :', en_clean)
print('French sentence before preprocessing :', fr_sent)
print('French sentence after preprocessing :', fr_clean)

English sentence before preprocessing : Have you had dinner?
English sentence after preprocessing : have you had dinner ?
French sentence before preprocessing : Avez-vous déjà diné?
French sentence after preprocessing : avez vous deja dine ?


In [43]:
def load_preprocessed_data(path="fra.txt", max_samples=None):
  """Read tab-separated sentence pairs and build teacher-forcing token lists.

  Each line must contain the source sentence and the target sentence as its
  first two tab-separated fields; any further fields are ignored.

  Args:
    path: Corpus file to read. Defaults to "fra.txt".
    max_samples: Stop after this many lines. When None, the notebook-level
      `num_samples` is used.

  Returns:
    A tuple (encoder_input, decoder_input, decoder_target) of lists of token
    lists. decoder_input is the target prefixed with "<sos>" and
    decoder_target is the target suffixed with "<eos>".

  Raises:
    ValueError: If a line has fewer than two tab-separated fields.
  """
  if max_samples is None:
    max_samples = num_samples

  encoder_input, decoder_input, decoder_target = [], [], []

  # Read as UTF-8 explicitly: the platform default encoding is not UTF-8
  # everywhere and would corrupt (or fail on) accented characters.
  with open(path, "r", encoding="utf-8") as lines:
    for i, line in enumerate(lines):
      # Split source and target data; extra columns are dropped
      src_line, tar_line = line.strip().split('\t')[:2]

      # Preprocess source
      src_line = preprocess_sentence(src_line).split()

      # Preprocess target
      tar_line = preprocess_sentence(tar_line)
      tar_line_in = ("<sos> " + tar_line).split()
      tar_line_out = (tar_line + " <eos>").split()

      encoder_input.append(src_line)
      decoder_input.append(tar_line_in)
      decoder_target.append(tar_line_out)

      if i == max_samples - 1:
        break

  return encoder_input, decoder_input, decoder_target

In [44]:
# Load the corpus and preview the first five entries of each token list.
corpus = load_preprocessed_data()
sents_en_in, sents_fra_in, sents_fra_out = corpus

captions = ('Encoder input :', 'Decoder input :', 'Decoder label:')
for caption, sents in zip(captions, corpus):
  print(caption, sents[:5])

Encoder input : [['go', '.'], ['go', '.'], ['go', '.'], ['hi', '.'], ['hi', '.']]
Decoder input : [['<sos>', 'va', '!'], ['<sos>', 'marche', '.'], ['<sos>', 'bouge', '!'], ['<sos>', 'salut', '!'], ['<sos>', 'salut', '.']]
Decoder label: [['va', '!', '<eos>'], ['marche', '.', '<eos>'], ['bouge', '!', '<eos>'], ['salut', '!', '<eos>'], ['salut', '.', '<eos>']]


In [45]:
# English side: fit a vocabulary on the source tokens, then convert each
# sentence to integer ids and pad at the end of the sequence.
tokenizer_en = Tokenizer(filters="", lower=False)
tokenizer_en.fit_on_texts(sents_en_in)
encoder_input = pad_sequences(
    tokenizer_en.texts_to_sequences(sents_en_in), padding="post")

# French side: a single vocabulary fitted on both decoder views, so that
# "<sos>" and "<eos>" are both included.
tokenizer_fra = Tokenizer(filters="", lower=False)
for fra_sents in (sents_fra_in, sents_fra_out):
  tokenizer_fra.fit_on_texts(fra_sents)

decoder_input = pad_sequences(
    tokenizer_fra.texts_to_sequences(sents_fra_in), padding="post")
decoder_target = pad_sequences(
    tokenizer_fra.texts_to_sequences(sents_fra_out), padding="post")

In [46]:
# Report the padded array shapes.
for caption, arr in (('Shape of encoder input :', encoder_input),
                     ('Shape of decoder input :', decoder_input),
                     ('Shape of decoder label :', decoder_target)):
  print(caption, arr.shape)

Shape of encoder input : (33000, 8)
Shape of decoder input : (33000, 16)
Shape of decoder label : (33000, 16)


In [47]:
# Vocabulary sizes are one larger than the number of known words.
# NOTE(review): presumably the extra slot is index 0, used for padding — confirm.
src_vocab_size = 1 + len(tokenizer_en.word_index)
tar_vocab_size = 1 + len(tokenizer_fra.word_index)
print("English vocabulary size : {:d}, French vocabulary size : {:d}".format(src_vocab_size, tar_vocab_size))

English vocabulary size : 4672, French vocabulary size : 8153


In [48]:
# Word <-> index lookup tables for both languages.
src_to_index, index_to_src = tokenizer_en.word_index, tokenizer_en.index_word
tar_to_index, index_to_tar = tokenizer_fra.word_index, tokenizer_fra.index_word

In [49]:
# Draw a random permutation of the row indices from a seeded generator so
# the train/validation split is the same on every Restart & Run All.
SEED = 42
rng = np.random.default_rng(SEED)
indices = rng.permutation(encoder_input.shape[0])
print('Random sequence :',indices)

Random sequence : [28638 19499  4419 ... 15933  9795 20858]


In [50]:
# Reorder all three arrays with the same permutation so rows stay aligned.
encoder_input, decoder_input, decoder_target = (
    arr[indices] for arr in (encoder_input, decoder_input, decoder_target))

In [51]:
# Hold out 10% of the rows that were actually loaded, rather than a
# hard-coded count that goes stale when the sample limit or file changes.
val_ratio = 0.1
n_of_val = int(encoder_input.shape[0] * val_ratio)
print('Validation data size :',n_of_val)

Validation data size : 3300


In [52]:
# Split on an explicit boundary index. Slicing with a negative count
# (`[:-n_of_val]` / `[-n_of_val:]`) breaks when n_of_val is 0: the training
# set becomes empty and the held-out set becomes the whole array.
split_at = max(encoder_input.shape[0] - n_of_val, 0)

encoder_input_train = encoder_input[:split_at]
decoder_input_train = decoder_input[:split_at]
decoder_target_train = decoder_target[:split_at]

# The last n_of_val rows are held out; they are passed as validation data
# during training.
encoder_input_test = encoder_input[split_at:]
decoder_input_test = decoder_input[split_at:]
decoder_target_test = decoder_target[split_at:]

In [53]:
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Masking
from tensorflow.keras.models import Model

In [54]:
# Model hyperparameters: embedding width and LSTM state size.
embedding_dim, hidden_units = 64, 64

In [55]:
# Encoder
encoder_inputs = Input(shape=(None,))

# mask_zero=True makes the embedding emit a mask for padding id 0, which the
# LSTM then uses to skip padded steps. A Masking(mask_value=0.0) layer placed
# after the embedding does not achieve this: it compares the embedding
# *vectors* with 0.0, and the vector looked up for id 0 is not all zeros.
enc_emb = Embedding(src_vocab_size, embedding_dim, mask_zero=True)(encoder_inputs)

# Only the final hidden/cell states are needed; they initialise the decoder.
encoder_lstm = LSTM(hidden_units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]

In [56]:
# Decoder
decoder_inputs = Input(shape=(None,))

# The embedding width is embedding_dim (not the LSTM state size), and
# mask_zero=True lets the layer emit a mask for padding id 0. A
# Masking(mask_value=0.0) layer applied to the embedding output would not
# mask anything, because the vector looked up for id 0 is not all zeros.
dec_emb_layer = Embedding(tar_vocab_size, embedding_dim, mask_zero=True)
dec_emb = dec_emb_layer(decoder_inputs)

# return_sequences=True: one output per target position.
# return_state=True: the states are needed again for step-by-step inference.
decoder_lstm = LSTM(hidden_units, return_sequences=True, return_state=True)

# During training the decoder starts from the encoder's final states.
decoder_outputs, _, _ = decoder_lstm(dec_emb,
                                     initial_state=encoder_states)

# Per-position probability distribution over the target vocabulary.
decoder_dense = Dense(tar_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [57]:
# Training model: (source ids, decoder input ids) -> per-position
# distribution over the target vocabulary.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)

# Sparse loss: the labels are integer ids rather than one-hot vectors.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['acc'],
)

In [58]:
# Train on the training split; the held-out "*_test" arrays serve as
# validation data.
model.fit(
    x=[encoder_input_train, decoder_input_train],
    y=decoder_target_train,
    validation_data=(
        [encoder_input_test, decoder_input_test],
        decoder_target_test,
    ),
    batch_size=128,
    epochs=50,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fa84dcad490>

In [59]:
# Inference encoder: source ids -> [hidden state, cell state]
encoder_model = Model(inputs=encoder_inputs, outputs=encoder_states)

# The inference decoder receives its LSTM states as explicit inputs so it can
# be advanced one token at a time.
decoder_state_input_h, decoder_state_input_c = (
    Input(shape=(hidden_units,)),
    Input(shape=(hidden_units,)),
)
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# The trained embedding, LSTM and output layers are reused, so the inference
# graph shares its weights with the training graph.
dec_emb2 = dec_emb_layer(decoder_inputs)

decoder_outputs2, state_h2, state_c2 = decoder_lstm(
    dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]

# Next word prediction
decoder_outputs2 = decoder_dense(decoder_outputs2)

# Step decoder: (token ids, h, c) -> (token distribution, new h, new c)
decoder_model = Model(
    inputs=[decoder_inputs, *decoder_states_inputs],
    outputs=[decoder_outputs2, *decoder_states2],
)

In [60]:
def decode_sequence(input_seq, max_chars=50):
  """Greedily translate one encoded source sentence.

  The source is encoded once; the decoder is then run one token at a time,
  feeding back the most probable token and the updated LSTM states.

  Args:
    input_seq: Encoder input passed to `encoder_model.predict`; the callers
      in this notebook pass a single-row slice of the padded id array.
    max_chars: Decoding stops once the accumulated output string is longer
      than this many characters. Defaults to 50.

  Returns:
    The decoded tokens as one string in which every token is preceded by a
    single space. The string ends with ' <eos>' only when the model emitted
    the end token before another stop condition was reached.
  """
  # verbose=0 keeps Keras from printing a progress bar for every token.
  states_value = encoder_model.predict(input_seq, verbose=0)

  # Start decoding from the <sos> token
  target_seq = np.zeros((1, 1))
  target_seq[0, 0] = tar_to_index['<sos>']

  decoded_sentence = ''

  while True:
    output_tokens, h, c = decoder_model.predict(
        [target_seq] + states_value, verbose=0)

    # Greedy choice: most probable token at the last (only) time step
    sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
    sampled_word = index_to_tar.get(sampled_token_index)

    # An id with no vocabulary entry (e.g. the padding id 0) cannot be turned
    # into a word; stop here instead of failing with a KeyError.
    if sampled_word is None:
      break

    decoded_sentence += ' ' + sampled_word

    if sampled_word == '<eos>' or len(decoded_sentence) > max_chars:
      break

    # Feed the chosen token and the new states into the next step
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = sampled_token_index
    states_value = [h, c]

  return decoded_sentence

In [61]:
def seq_to_src(input_seq):
  """Turn a sequence of source ids into text, skipping id 0.

  Every word is followed by one space, so a non-empty result ends with a
  trailing space.
  """
  return ''.join(index_to_src[word_id] + ' '
                 for word_id in input_seq
                 if word_id != 0)

def seq_to_tar(input_seq):
  """Turn a sequence of target ids into text.

  Id 0 and the ids of '<sos>' and '<eos>' are skipped. Every remaining word
  is followed by one space, so a non-empty result ends with a trailing space.
  """
  pieces = []
  for word_id in input_seq:
    is_special = (word_id == 0
                  or word_id == tar_to_index['<sos>']
                  or word_id == tar_to_index['<eos>'])
    if is_special:
      continue
    pieces.append(index_to_tar[word_id] + ' ')
  return ''.join(pieces)

In [62]:
# Translate a few training sentences and compare with the reference.
for seq_index in [3, 50, 100, 300, 1001]:
  input_seq = encoder_input_train[seq_index: seq_index + 1]
  decoded_sentence = decode_sequence(input_seq)

  # Drop the leading space, and the end marker only when it is really there.
  # A fixed [1:-5] slice would cut five real characters whenever decoding
  # stopped on the length limit instead of on '<eos>'.
  output_text = decoded_sentence[1:]
  if output_text.endswith('<eos>'):
    output_text = output_text[:-len('<eos>')]

  print("Input :",seq_to_src(encoder_input_train[seq_index]))
  print("Label :",seq_to_tar(decoder_input_train[seq_index]))
  print("Output :",output_text)
  print("-"*50)

Input : get away . 
Label : partez . 
Output : degage . 
--------------------------------------------------
Input : can you try ? 
Label : pouvez vous essayer ? 
Output : peux tu nous ? 
--------------------------------------------------
Input : i struggled . 
Label : je me suis debattue . 
Output : je me suis debattue . 
--------------------------------------------------
Input : go home now . 
Label : va a la maison maintenant . 
Output : va chez vous . 
--------------------------------------------------
Input : cash is better . 
Label : de l argent liquide c est mieux . 
Output : le vois c est la voiture ! 
--------------------------------------------------


In [64]:
# Translate a few held-out sentences and compare with the reference.
for seq_index in [3, 50, 100, 300, 1001]:
  input_seq = encoder_input_test[seq_index: seq_index + 1]
  decoded_sentence = decode_sequence(input_seq)

  # Drop the leading space, and the end marker only when it is really there.
  # A fixed [1:-5] slice would cut five real characters whenever decoding
  # stopped on the length limit instead of on '<eos>'.
  output_text = decoded_sentence[1:]
  if output_text.endswith('<eos>'):
    output_text = output_text[:-len('<eos>')]

  print("Input :",seq_to_src(encoder_input_test[seq_index]))
  print("Label :",seq_to_tar(decoder_input_test[seq_index]))
  print("Output :",output_text)
  print("-"*50)

Input : i couldn t find it . 
Label : je ne pus le trouver . 
Output : je ne l ai pas fait . 
--------------------------------------------------
Input : i m thorough . 
Label : je suis consciencieuse . 
Output : je suis attentif . 
--------------------------------------------------
Input : i woke you up . 
Label : je vous ai reveille . 
Output : je t ai reveille . 
--------------------------------------------------
Input : you can come . 
Label : vous pouvez venir . 
Output : tu peux venir . 
--------------------------------------------------
Input : they re twins . 
Label : ils sont jumeaux . 
Output : elles sont jumelles . 
--------------------------------------------------
