## Encoder-Decoder model for Japanese-to-English Translation
inspired by: https://www.tensorflow.org/tutorials/text/nmt_with_attention

In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import io
import os
import time
import unicodedata
import re

!pip install tqdm
!pip install mojimoji
!pip install spacy
# used by Spacy Japanese tokenizer
!pip install sudachipy sudachidict_core
!pip install nltk
!pip install sklearn

from tqdm import tqdm
import mojimoji
from spacy.lang.ja import Japanese
from nltk.translate.meteor_score import single_meteor_score
from sklearn.model_selection import train_test_split

In [None]:
# test if Tensorflow running against GPU
tf.debugging.set_log_device_placement(True)

# Create some tensors
a = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
b = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
c = tf.matmul(a, b)

print(c)

### Import dataset

In [2]:
def create_dataset(path):
    # two line returns \u2028 cause saved files of processed text to reload with extra lines
    lines = io.open(path, encoding='UTF-8').read().strip().replace('\u2028',' ').split('\n')

    word_pairs = [[w for w in l.split('\t')]  for l in lines]

    return zip(*word_pairs)

In [3]:
en, jp = create_dataset('./jesc-corpus.txt')

### Preprocess Text

In [8]:
# Tokenize Japanese text (since Japanese doesn't naturally put spaces between words)

# reference: https://github.com/WorksApplications/SudachiPy
# Load SudachiPy with split mode B: "国家公務員" => ['国家', '公務員']
# default is split mode A: "国家公務員" => ['国家公務員']
# NOTE: this may be worth adjusting in future training
jcfg = {"split_mode": "B"}
j_tokenizer = Japanese(meta={"tokenizer": {"config": jcfg}})

def tokenize_jp_sentence(text):
    return " ".join([i.text for i in j_tokenizer(text)])

In [9]:
print(jp[0])
print(tokenize_jp_sentence(jp[0]))

あなたは戻ったのね ハロルド?
あなた は 戻っ た の ね ハロルド ?


In [10]:
# convert any half-width katakana to normal-width katakana using mojimoji library
def norm_kt(text):
    return mojimoji.han_to_zen(text)

In [11]:
print("ﾆｭｰﾗﾙﾈｯﾄﾜｰｸ: " + norm_kt("ﾆｭｰﾗﾙﾈｯﾄﾜｰｸ"))

ﾆｭｰﾗﾙﾈｯﾄﾜｰｸ: ニューラルネットワーク


In [12]:
# convert unicode to ascii
def jp_unicode_to_ascii(text):
    return ''.join(ascii_text for ascii_text in unicodedata.normalize('NFKD', text))

# remove any accented characters for English-language text
def en_unicode_to_ascii(text):
    return ''.join(ascii_text for ascii_text in unicodedata.normalize('NFKD', text)
                   .encode('ascii', 'ignore').decode('utf-8', 'ignore'))

In [13]:
print(en_unicode_to_ascii("It's in my résumé."))
print(jp_unicode_to_ascii("それは履歴書にあります。"))

It's in my resume.
それは履歴書にあります。


In [14]:
# keep only Kanji, Hiragana, Katakana, numerals, and common punctuation: ("。", "、", "?", "？", "!", "！"))
def jp_preprocessing_and_spacing(text):
    text = re.sub(r"([。、?？!！])", r" \1", text)
    pattern = r"[^\u3041-\u309F\u30A1-\u30FF\uFF66-\uFF9F\u4E00-\u9FD0\u309B\u3099\uFF9E\u309C\u309A\uFF9F?!！\s、。.,0-9]+"
    text = re.sub(pattern, '', text).rstrip().strip()

    # add spaces between words and punctuation
    text = re.sub(r'[" "]+', " ", text)
    # remove interpunct (黒丸)
    text = text.replace("・" , "")

    text = text.lower()

    return text

# remove special characters and place spaces between words and punctuation
def en_preprocessing_and_spacing(text):
    text = en_unicode_to_ascii(text.lower().strip())

    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    text = re.sub(r"([?.!,])", r" \1 ", text)
    text = re.sub(r'[" "]+', " ", text)

    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    text = re.sub(r"[^a-zA-Z?.!,]+", " ", text)
    text = text.strip()

    return text

In [15]:
print(en_preprocessing_and_spacing('Hello, email@world!'))
print(jp_preprocessing_and_spacing('こんにちは、エメール＠世界！'))

hello , email world !
こんにちは 、エメール世界 ！


### Normalize Text

In [16]:
# utilize preprocessing functions and mark start and end of sentences
def normalize_text(japanese_text, english_text):
    
    inputs = []
    targets = []
    
    for jp_text, en_text in tqdm(zip(japanese_text, english_text)):

        # normalize Japanese
        jp_text = jp_preprocessing_and_spacing(jp_text)
        jp_text = tokenize_jp_sentence(jp_text)
        jp_text = norm_kt(jp_text)
        jp_text = jp_unicode_to_ascii(jp_text)
        
        jp_text = "<start> " + jp_text + " <end>"
        
        inputs.append(jp_text)
        
        # normalize English
        en_text = en_unicode_to_ascii(en_text)
        en_text = en_preprocessing_and_spacing(en_text)

        en_text = "<start> " + en_text + " <end>"
        targets.append(en_text)

    return inputs, targets

In [17]:
inputs, targets = normalize_text(jp, en)

2801388it [54:40, 853.96it/s]


In [18]:
for i in range(5):
    print(inputs[i])
    print(targets[i])

<start> あなた は 戻っ た の ね ハロルド ? <end>
<start> you are back , aren t you , harold ? <end>
<start> 俺 の 相手 は シャーク だ 。 <end>
<start> my opponent is shark . <end>
<start> 引き換え だ ある 事 と ある 物 の <end>
<start> this is one thing in exchange for another . <end>
<start> もう いい よ ごちそう さま ううん <end>
<start> yeah , i m fine . <end>
<start> もう 会社 に は 来 ない で くれ 電話 も する な <end>
<start> don t come to the office anymore . don t call me either . <end>


In [19]:
with open('jp_normalized.txt', 'w') as filehandle:
    for jp_sentence in jp:
        filehandle.write('%s\n' % jp_sentence)

with open('en_normalized.txt', 'w') as filehandle:
    for en_sentence in en:
        filehandle.write('%s\n' % en_sentence)

In [None]:
with open('jp_normalized.txt') as f:
    inputs = f.read().splitlines()
    f.close()
with open('en_normalized.txt') as f:
    targets = f.read().splitlines()
    f.close()

In [None]:
print(len(inputs), len(targets))

### Tokenize text

In [20]:
def tokenize(lang):
    # vectorize a text corpus
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(
        filters=' ')

    # updates internal vocabulary based on a corpus
    lang_tokenizer.fit_on_texts(lang)

    # Transforms each text in texts to a sequence of integers.
    tensor = lang_tokenizer.texts_to_sequences(lang)

    # Pads sequences to the same length.
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,
                                                          padding='post')
    return tensor, lang_tokenizer

In [21]:
print(jp[9])
print(en[9])
tokenize([inputs[9], targets[9]])

カンパニーの元社員が
it seems a former employee...


(array([[ 2,  4,  5,  6,  7,  8,  3,  0,  0,  0],
        [ 2,  9, 10, 11, 12, 13,  1,  1,  1,  3]], dtype=int32),
 <keras_preprocessing.text.Tokenizer at 0x7fb9794e3b50>)

In [22]:
input_tensor, input_lang_tokenizer = tokenize(inputs)
target_tensor, target_lang_tokenizer = tokenize(targets)

### Create input and target datasets

In [23]:
# Calculate max_length of the target tensors
max_length_target, max_length_input = target_tensor.shape[1], input_tensor.shape[1]

In [24]:
print(max_length_target)
print(max_length_input)

83
80


In [25]:
# Creating train-test-validation splits
# Reference: https://datascience.stackexchange.com/questions/15135/train-test-validation-set-splitting-in-sklearn
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10

# train is 75% of the entire data set
input_tensor_train, input_tensor_test, \
    target_tensor_train, target_tensor_test = train_test_split(input_tensor, target_tensor,
                                                               test_size=1 - train_ratio,
                                                               random_state=1)

# test is 10% of the initial data set
# validation is 15% of the initial data set
input_tensor_val, input_tensor_test, \
    target_tensor_val, target_tensor_test = train_test_split(input_tensor_test, target_tensor_test,
                                                             test_size=test_ratio/(test_ratio + validation_ratio)) 

# Show length
print(len(input_tensor_train), len(target_tensor_train), 
      len(input_tensor_val), len(target_tensor_val), 
      len(input_tensor_test), len(target_tensor_test))

2101041 2101041 420208 420208 280139 280139


In [26]:
def convert(lang, tensor):
  for t in tensor:
    if t!=0:
      print ("%d ----> %s" % (t, lang.index_word[t]))

In [27]:
print ("Input Language; index to word mapping")
convert(input_lang_tokenizer, input_tensor_train[1])
print ()
print ("Target Language; index to word mapping")
convert(target_lang_tokenizer, target_tensor_train[1])

Input Language; index to word mapping
1 ----> <start>
782 ----> おはよう
8702 ----> ジョシュ
1069 ----> やあ
4371 ----> ローズ
2 ----> <end>

Target Language; index to word mapping
1 ----> <start>
86 ----> good
305 ----> morning
4 ----> ,
4941 ----> josh
3 ----> .
2 ----> <end>


### Profile memory

In [None]:
!pip install memory_profiler

In [None]:
%load_ext memory_profiler

In [None]:
%memit

### Create a tf.dataset

In [28]:
BUFFER_SIZE = 15000
BATCH_SIZE = 16
steps_per_epoch = BUFFER_SIZE // BATCH_SIZE
embedding_dim = 32
units = 128
vocab_inp_size = len(input_lang_tokenizer.word_index) + 1
vocab_tar_size = len(target_lang_tokenizer.word_index) + 1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [29]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 80]), TensorShape([64, 83]))

### Create Encoder and Decoder Models

In [30]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

In [31]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (64, 80, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)


In [32]:
# reference for Bahdanau Attention Encoder: https://arxiv.org/pdf/1409.0473.pdf
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        # query hidden state shape == (batch_size, hidden size)
        # query_with_time_axis shape == (batch_size, 1, hidden size)
        # values shape == (batch_size, max_len, hidden size)
        # we are doing this to broadcast addition along the time axis to calculate the score
        query_with_time_axis = tf.expand_dims(query, 1)

        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        # the shape of the tensor before applying self.V is (batch_size, max_length, units)
        score = self.V(tf.nn.tanh(
            self.W1(query_with_time_axis) + self.W2(values)))

        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [33]:
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (64, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (64, 80, 1)


In [34]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)

        # used for attention
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        # enc_output shape == (batch_size, max_length, hidden_size)
        context_vector, attention_weights = self.attention(hidden, enc_output)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # passing the concatenated vector to the GRU
        output, state = self.gru(x)

        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size, vocab)
        x = self.fc(output)

        return x, state, attention_weights

In [35]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden, sample_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (64, 131612)


In [36]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

In [37]:
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

### Model training

In [None]:
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)

        dec_hidden = enc_hidden

        dec_input = tf.expand_dims([target_lang_tokenizer.word_index['<start>']] * BATCH_SIZE, 1)

        # Teacher forcing - feeding the target as the next input
        for t in range(1, targ.shape[1]):
            # passing enc_output to the decoder
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

            loss += loss_function(targ[:, t], predictions)

            # using teacher forcing
            dec_input = tf.expand_dims(targ[:, t], 1)

    batch_loss = (loss / int(targ.shape[1]))

    variables = encoder.trainable_variables + decoder.trainable_variables

    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [39]:
EPOCHS = 100

for epoch in range(EPOCHS):
    start = time.time()

    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                        batch,
                                                        batch_loss.numpy()))
    # saving (checkpoint) the model every 2 epochs
    if (epoch + 1) % 10 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)

    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 1.3692


KeyboardInterrupt: 

### Evaluate translation

In [112]:
# restoring the latest checkpoint in checkpoint_dir
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fa9f76ef050>

In [114]:
# move checkpoints into base model
encoder = checkpoint.encoder
decoder = checkpoint.decoder

In [155]:
def predict(sentence):
    inputs = tf.convert_to_tensor(sentence)
    inputs = tf.expand_dims(inputs, axis=0)
    result = ''
    hidden = [tf.zeros((1, units))]
    enc_out, state = encoder(inputs, hidden)
    hidden_state = state
    dec_input = tf.expand_dims([target_lang_tokenizer.word_index['<start>']], 0)
    for t in range(max_length_target):
        predictions, hidden_state, _ = decoder(dec_input,
                                            hidden_state,
                                            enc_out)

        predicted_id = tf.argmax(predictions[0]).numpy()

        result += target_lang_tokenizer.index_word[predicted_id] + ' '
        if target_lang_tokenizer.index_word[predicted_id] == '<end>' or len(result) > max_length_target:
            return result

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)
    return result

In [156]:
def create_reference(lang, tensor):
    all_sentence_list = []

    for word_list in tensor:
        sentence_list = []

        for t in word_list:
            if not t == 0:
                # Index number assigned to each word
                sentence_list.append(lang.index_word[t])
        all_sentence_list.append(sentence_list)
    return all_sentence_list

In [157]:
reference = create_reference(target_lang_tokenizer, target_tensor_test.tolist())

In [160]:
# create predictions
predictions = []
for test in tqdm(input_tensor_test):
    predictions.append(predict(test))

100%|██████████| 3000/3000 [09:12<00:00,  5.43it/s]


In [188]:
score = 0
for i in range(len(reference)):
    score += single_meteor_score(" ".join(reference[i][1:-1]), predictions[i][:-5])

score /= len(reference)
print("The average meteor score per sentence is: {:1.5f}".format(score))

The average meteor score per sentence is: 0.06839


### Translation examples

In [226]:
from itertools import islice

def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))

take(10, input_lang_tokenizer.word_index.items())

[('<start>', 1),
 ('<end>', 2),
 ('ありがとう', 3),
 ('！', 4),
 ('了解', 5),
 ('ああ', 6),
 ('．', 7),
 ('．\u3000．\u3000．', 8),
 ('どうぞ', 9),
 ('その\u3000通り', 10)]

In [274]:
# debugging normalize Japanese for evaluate method
# sentence = jp_unicode_to_ascii("いい天気ですね")
# sentence = jp_preprocessing_and_spacing(sentence)
# sentence = tokenize_jp_sentence(sentence)
# sentence = norm_kt(sentence)

# sentence = "<start>\u3000" + sentence + "\u3000<end>"
# # [input_lang_tokenizer.word_index[i] for i in sentence.split('\u3000')]
# # input_lang_tokenizer.word_index['いい']

1: いい天気ですね
2: いい天気ですね
3: いい 天気 です ね
4: いい　天気　です　ね
5: <start>　いい　天気　です　ね　<end>
6: ['<start>', 'いい', '天気', 'です', 'ね', '<end>']


KeyError: 'いい'

In [279]:
def evaluate(sentence):
    # normalize Japanese
    sentence = jp_unicode_to_ascii(sentence)
    sentence = jp_preprocessing_and_spacing(sentence)
    sentence = tokenize_jp_sentence(sentence)
    sentence = norm_kt(sentence)

    sentence = "<start>\u3000" + sentence + "\u3000<end>"
    print(sentence)
    inputs = [input_lang_tokenizer.word_index[i] for i in sentence.split('\u3000')]

    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                           maxlen=max_length_input,
                                                           padding='post')
    
    inputs = tf.convert_to_tensor(inputs)
    result = ''
    hidden = [tf.zeros((1, units))]
    enc_out, state = encoder(inputs, hidden)
    hidden_state = state
    dec_input = tf.expand_dims([target_lang_tokenizer.word_index['<start>']], 0)
    for t in range(max_length_target):
        predictions, hidden_state, _ = decoder(dec_input,
                                                hidden_state,
                                                enc_out)
        predicted_id = tf.argmax(predictions[0]).numpy()

        result += target_lang_tokenizer.index_word[predicted_id] + ' '
        if target_lang_tokenizer.index_word[predicted_id] == '<end>' or len(result) > max_length_target:
            return result, sentence

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)
    return result, sentence

In [277]:
result, sentence = evaluate("ありがとう") # ありがとう = "thank you"
print('Input: %s' % (sentence))
print('Predicted translation: {}'.format(result))

<start>　ありがとう　<end>
Input: <start>　ありがとう　<end>
Predicted translation: thanks for good . thanks for good . thanks for good . thanks for good 


In [285]:
result, sentence = evaluate("ああ") # ああ = "ah"
print('Input: %s' % (sentence))
print('Predicted translation: {}'.format(result))

<start>　ああ　<end>
Input: <start>　ああ　<end>
Predicted translation: ah , then . <end> 


In [286]:
result, sentence = evaluate("これは何？") # これは何？ = "what is this?"
print('Input: %s' % (sentence))
print('Predicted translation: {}'.format(result))

<start>　これ　は　何　？　<end>


KeyError: 'これ'