In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd
tf.enable_eager_execution()
import importlib
import os

from matplotlib import pyplot as plt
%matplotlib notebook

from transliteration import data, train, model_one, script, decode, evaluate

In [3]:
# Reload the data module so that edits made to it on disk are picked up by this kernel.
importlib.reload(data)
batch_size = 128

# Keyword arguments shared by every split of a corpus; only the TFRecord path differs.
cmu_options = dict(from_script='en',
                   to_script='cmu',
                   combine_words_proportion=.3,
                   batch_size=batch_size)
eob_options = dict(from_script='en',
                   to_script='ja',
                   batch_size=batch_size)

# en -> cmu train / valid / test splits.
# NOTE(review): combine_words_proportion is also applied to the valid and test
# splits here -- confirm that is intended for evaluation data.
cmu_train_dataset = data.make_dataset('../data/tfrecord/cmu_train.tfrecord', **cmu_options)
cmu_valid_dataset = data.make_dataset('../data/tfrecord/cmu_valid.tfrecord', **cmu_options)
cmu_test_dataset = data.make_dataset('../data/tfrecord/cmu_test.tfrecord', **cmu_options)

# en -> ja train / valid / test splits.
eob_train_dataset = data.make_dataset('../data/tfrecord/eob_train.tfrecord', **eob_options)
eob_valid_dataset = data.make_dataset('../data/tfrecord/eob_valid.tfrecord', **eob_options)
eob_test_dataset = data.make_dataset('../data/tfrecord/eob_test.tfrecord', **eob_options)

Instructions for updating:
Colocations handled automatically by placer.


In [4]:
# One Adam optimizer (constructor defaults); the same instance is passed to both
# the en->cmu and the en->ja training calls and is tracked by the checkpoint.
optimizer = tf.train.AdamOptimizer()

def loss_function(real, pred):
    """Padding-masked sparse softmax cross-entropy.

    Args:
        real: Integer class ids. Id 0 is treated as padding and contributes
            zero loss.
        pred: Unnormalized logits for `real`, with the class dimension last.

    Returns:
        Scalar tensor holding the mean of the masked per-position losses.
        The mean is taken over every position, padded ones included (they
        count as zeros), exactly as before.
    """
    loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred)
    # Build the mask with TensorFlow ops rather than np.equal: this avoids
    # converting the label tensor to a NumPy array, does not depend on eager
    # execution, and makes the mask dtype match the loss explicitly.
    mask = tf.cast(tf.not_equal(real, 0), loss_.dtype)
    return tf.reduce_mean(loss_ * mask)

# LSTM and embedding sizes are identical for every encoder/decoder config below.
shared_dims = dict(lstm_size=240, embedding_size=30)

# Encoder over the 'en' vocabulary; configured without attention.
cmu_encoder_config = model_one.Config(attention_size=None,
                                      vocab_size=script.SCRIPTS['en'].vocab_size,
                                      **shared_dims)
# Decoder over the 'cmu' vocabulary with a single monotonic Bahdanau attention.
cmu_decoder_config = model_one.Config(attention_size=120,
                                      attention='monotonic_bahdanau',
                                      vocab_size=script.SCRIPTS['cmu'].vocab_size,
                                      **shared_dims)
# Decoder over the 'ja' vocabulary; its attention spec names two monotonic
# Bahdanau attentions.
ja_decoder_config = model_one.Config(attention_size=120,
                                     attention='multiple:monotonic_bahdanau,monotonic_bahdanau',
                                     vocab_size=script.SCRIPTS['ja'].vocab_size,
                                     **shared_dims)

cmu_encoder = model_one.Encoder(cmu_encoder_config)
cmu_decoder = model_one.Decoder(cmu_decoder_config)
# The en->ja encoder is built from the en->cmu encoder and decoder.
ja_encoder = model_one.StackedEncoderDecoderEncoder(cmu_encoder, cmu_decoder, 'cmu')
ja_decoder = model_one.Decoder(ja_decoder_config)

# Object-based checkpoint tracking the optimizer and the three trainable models.
# NOTE(review): ja_encoder itself is not tracked -- fine if it owns no variables
# beyond those of cmu_encoder / cmu_decoder; confirm.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(
    optimizer=optimizer,
    cmu_encoder=cmu_encoder,
    ja_decoder=ja_decoder,
    cmu_decoder=cmu_decoder,
)

In [None]:
# Best-so-far trackers for the en->cmu training cells: nothing has been
# validated or saved yet, so both start out as None.
cmu_best_val_loss = cmu_checkpoint = None

In [5]:
# Keyword arguments common to every en->cmu train / validate / decode call.
cmu_pair = dict(from_script='en',
                to_script='cmu',
                encoder=cmu_encoder,
                decoder=cmu_decoder)

for e in range(10):
    # One pass over the training split (second argument True), then one over
    # the validation split (False, no optimizer passed).
    loss = train.run_one_epoch(cmu_train_dataset, True,
                               optimizer=optimizer,
                               loss_function=loss_function,
                               **cmu_pair)
    valid_loss = train.run_one_epoch(cmu_valid_dataset, False,
                                     loss_function=loss_function,
                                     **cmu_pair)

    # Save a checkpoint only when validation loss reaches a new minimum.
    is_new_best = cmu_best_val_loss is None or valid_loss < cmu_best_val_loss
    if is_new_best:
        cmu_best_val_loss = valid_loss
        cmu_checkpoint = checkpoint.save(file_prefix=checkpoint_prefix)

    print("Epoch {}: Train Loss {:.3f}, Valid Loss {:.3f}".format(e, loss, valid_loss))
    # Quick qualitative check: two best beam-search outputs for one sample word.
    print(decode.transliterate(input_strs=['derick'],
                               k_best=2,
                               decoding_method=decode.beam_search_decode,
                               **cmu_pair))

Epoch 0: Train Loss 24.578, Valid Loss 16.087
([['D EH1 R K K K AA1 R IY0', 'D EH1 R K K K AA1 K AH0 N']], array([[-13.55177731, -14.8608727 ]]))


Epoch 1: Train Loss 10.358, Valid Loss 8.736
([['D EH1 R IH0 K S AH0 K', 'D EH1 R IH0 K S AH2 K IH0 NG']], array([[ -8.86743026, -11.32839058]]))


Epoch 2: Train Loss 5.679, Valid Loss 4.720


([['D EH1 R IH0 K S AH2 K IH0 NG', 'D EH1 R IH0 K S AH2 K AH0 NG']], array([[-10.32414979, -10.9477067 ]]))


Epoch 3: Train Loss 4.227, Valid Loss 4.193


([['D EH1 R IH0 K AH0 S', 'D EH1 R IH0 K IH0 NG']], array([[-5.48341189, -5.516672  ]]))


Epoch 4: Train Loss 3.562, Valid Loss 3.476


([['D EH1 R IH0 K IH0 NG', 'D EH1 R IH0 K IH0 NG K IH2 NG']], array([[-5.03364553, -8.83807178]]))


Epoch 5: Train Loss 3.220, Valid Loss 3.165


([['D EH1 R IH0 K AH0 L IH0 NG', 'D IH0 R IH1 K AH0 L IH0 NG']], array([[-7.50752225, -7.99510864]]))


Epoch 6: Train Loss 2.947, Valid Loss 3.067


([['D EH1 R IH0 K AH0 K IH0 NG', 'D EH1 R IH0 K AH0 K IH0 NG K IH2 NG']], array([[ -6.98905387, -11.11280253]]))


Epoch 7: Train Loss 2.751, Valid Loss 2.976


([['D EH1 R IH0 K AH0 Z', 'D EH1 R IH0 K AH0 N Z']], array([[-4.36395468, -4.78826316]]))


Epoch 8: Train Loss 2.610, Valid Loss 2.990
([['D EH1 R IH0 K AH0 Z', 'D IH0 R IH1 K AH0 V IH0 NG']], array([[-4.28846049, -5.81977644]]))


Epoch 9: Train Loss 2.488, Valid Loss 2.791


([['D EH1 R IH0 K AH0 L Z', 'D EH1 R IH0 K AH0 L AE0 K IH0 NG']], array([[-5.59058979, -9.17627224]]))


In [9]:
# Continue en->cmu training for epochs 10..14, reusing the best-loss tracker and
# checkpoint path left by the earlier training cell.
cmu_pair = dict(from_script='en',
                to_script='cmu',
                encoder=cmu_encoder,
                decoder=cmu_decoder)

for e in range(10, 15):
    loss = train.run_one_epoch(cmu_train_dataset, True,
                               optimizer=optimizer,
                               loss_function=loss_function,
                               **cmu_pair)
    valid_loss = train.run_one_epoch(cmu_valid_dataset, False,
                                     loss_function=loss_function,
                                     **cmu_pair)

    # Only overwrite the remembered checkpoint on a strictly lower validation loss.
    is_new_best = cmu_best_val_loss is None or valid_loss < cmu_best_val_loss
    if is_new_best:
        cmu_best_val_loss = valid_loss
        cmu_checkpoint = checkpoint.save(file_prefix=checkpoint_prefix)

    print("Epoch {}: Train Loss {:.3f}, Valid Loss {:.3f}".format(e, loss, valid_loss))
    # Two best beam-search outputs for one sample word, as a sanity check.
    print(decode.transliterate(input_strs=['derick'],
                               k_best=2,
                               decoding_method=decode.beam_search_decode,
                               **cmu_pair))

Epoch 10: Train Loss 2.345, Valid Loss 2.696


([['D IH0 R IH1 K IH0 NG', 'D IH0 R IH1 K AH0 V IH0 NG']], array([[-4.6448728 , -6.31911044]]))


Epoch 11: Train Loss 2.273, Valid Loss 2.769
([['D EH1 R IH0 K AH0 Z', 'D EH1 R IH0 K AH0 L Z']], array([[-3.77462789, -4.33016792]]))


KeyboardInterrupt: 

In [10]:
# Roll the tracked objects back to the best en->cmu checkpoint and fail loudly
# if anything in the checkpoint was left unmatched.
restore_status = checkpoint.restore(cmu_checkpoint)
restore_status.assert_consumed()

# Re-measure validation loss with the restored weights.
restored_valid_loss = train.run_one_epoch(cmu_valid_dataset, False,
                                          from_script='en',
                                          to_script='cmu',
                                          encoder=cmu_encoder,
                                          decoder=cmu_decoder,
                                          loss_function=loss_function)
print(restored_valid_loss)

tf.Tensor(2.7325873, shape=(), dtype=float32)


In [11]:
def run_some_epochs(epochs):
    """Train the en->ja model, stopping at the first epoch that fails to improve.

    Each epoch trains on `eob_train_dataset`, validates on `eob_valid_dataset`,
    prints both losses and prints the two best beam-search outputs for the
    sample word 'derick'. A checkpoint is saved whenever the validation loss is
    strictly lower than every earlier one; otherwise the loop ends immediately.

    Args:
        epochs: Maximum number of epochs to run.

    Returns:
        Path returned by the last `checkpoint.save` call, or None if no epoch
        was run.
    """
    ja_pair = dict(from_script='en',
                   to_script='ja',
                   encoder=ja_encoder,
                   decoder=ja_decoder)
    saved_path = None
    lowest_valid_loss = None

    for epoch in range(epochs):
        train_loss = train.run_one_epoch(eob_train_dataset, True,
                                         optimizer=optimizer,
                                         loss_function=loss_function,
                                         **ja_pair)
        valid_loss = train.run_one_epoch(eob_valid_dataset, False,
                                         loss_function=loss_function,
                                         **ja_pair)
        print("Epoch {}: Train Loss {:.3f}, Valid Loss {:.3f}".format(epoch, train_loss, valid_loss))
        print(decode.transliterate(input_strs=['derick'],
                                   k_best=2,
                                   decoding_method=decode.beam_search_decode,
                                   **ja_pair))

        # Early stop: leave as soon as validation loss stops strictly improving.
        improved = lowest_valid_loss is None or valid_loss < lowest_valid_loss
        if not improved:
            break
        lowest_valid_loss = valid_loss
        saved_path = checkpoint.save(file_prefix=checkpoint_prefix)

    return saved_path

In [12]:
# Mark every layer of the en->cmu encoder and decoder as non-trainable.
# NOTE(review): presumably so that only ja_decoder is updated in this stage --
# confirm that train.run_one_epoch honours `trainable`.
for frozen_model in (cmu_encoder, cmu_decoder):
    for layer in frozen_model.layers:
        layer.trainable = False

# Train en->ja for at most 10 epochs, then roll back to the best checkpoint.
checkpoint_path = run_some_epochs(10)
restore_status = checkpoint.restore(checkpoint_path)
restore_status.assert_consumed()

# Validation loss of the restored model (left as the cell's displayed value).
train.run_one_epoch(eob_valid_dataset, False,
                    loss_function=loss_function,
                    decoder=ja_decoder,
                    encoder=ja_encoder,
                    to_script='ja',
                    from_script='en')

Epoch 0: Train Loss 13.320, Valid Loss 7.738


([['デリック', 'ディリック']], array([[-3.76657595, -4.79967488]]))


Epoch 1: Train Loss 6.186, Valid Loss 5.539


([['ディリック', 'デリック']], array([[-3.14992185, -3.73725431]]))


Epoch 2: Train Loss 4.774, Valid Loss 5.115


([['デリック', 'デリクカット']], array([[-3.67511961, -6.25354449]]))


Epoch 3: Train Loss 3.904, Valid Loss 4.839


([['ディリック', 'デリック']], array([[-2.69615537, -3.46091083]]))


Epoch 4: Train Loss 3.300, Valid Loss 4.845


([['デリック', 'ディリック']], array([[-2.30153008, -2.4553171 ]]))


<tf.Tensor: id=306351252, shape=(), dtype=float32, numpy=4.9945216>

In [13]:
# Validation pairs for eob; keep_default_na=False stops pandas from turning
# strings such as "NA" or "null" into NaN.
eob_valid_csv = '../data/split/eob_pairs_valid.csv'
valid_df = pd.read_csv(eob_valid_csv, keep_default_na=False)

In [14]:
# Beam-search decode every 'en' entry of the eob validation frame (20 beams,
# 10 best kept), then score the result against the 'ja' column with k=1.
eob_inputs = valid_df['en'].values
eob_references = valid_df['ja'].values
tr = decode.transliterate(input_strs=eob_inputs,
                          from_script='en',
                          to_script='ja',
                          encoder=ja_encoder,
                          decoder=ja_decoder,
                          decoding_method=decode.beam_search_decode,
                          num_beams=20,
                          k_best=10)
evaluate.top_k_accuracy(eob_references, tr, k=1)

0.49123767798466594

In [15]:
# Validation pairs for muse; keep_default_na=False stops pandas from turning
# strings such as "NA" or "null" into NaN.
muse_valid_csv = '../data/split/muse_pairs_valid.csv'
muse_valid_df = pd.read_csv(muse_valid_csv, keep_default_na=False)

In [16]:
# Same evaluation as for eob, on the muse validation frame: beam-search decode
# the 'en' column (20 beams, 10 best kept) and score against 'ja' with k=1.
muse_inputs = muse_valid_df['en'].values
muse_references = muse_valid_df['ja'].values
tr = decode.transliterate(input_strs=muse_inputs,
                          from_script='en',
                          to_script='ja',
                          encoder=ja_encoder,
                          decoder=ja_decoder,
                          decoding_method=decode.beam_search_decode,
                          num_beams=20,
                          k_best=10)
evaluate.top_k_accuracy(muse_references, tr, k=1)

0.37720488466757124