In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
tf.enable_eager_execution()
import importlib
import os

from matplotlib import pyplot as plt
%matplotlib notebook

from transliteration import data, train, model_one, script, decode, evaluate

In [2]:
# Reload the project data module before building the input pipelines.
importlib.reload(data)
batch_size = 128

# Keyword arguments shared by every split of a corpus; only the TFRecord path
# differs between the train / valid / test datasets.
cmu_dataset_kwargs = dict(from_script='en', to_script='cmu', batch_size=batch_size)
eob_dataset_kwargs = dict(from_script='en', to_script='ja', batch_size=batch_size)

# English -> 'cmu' script splits.
cmu_train_dataset = data.make_dataset('../data/tfrecord/cmu_train.tfrecord',
                                      **cmu_dataset_kwargs)
cmu_valid_dataset = data.make_dataset('../data/tfrecord/cmu_valid.tfrecord',
                                      **cmu_dataset_kwargs)
cmu_test_dataset = data.make_dataset('../data/tfrecord/cmu_test.tfrecord',
                                     **cmu_dataset_kwargs)

# English -> Japanese ('ja') splits.
eob_train_dataset = data.make_dataset('../data/tfrecord/eob_train.tfrecord',
                                      **eob_dataset_kwargs)
eob_valid_dataset = data.make_dataset('../data/tfrecord/eob_valid.tfrecord',
                                      **eob_dataset_kwargs)
eob_test_dataset = data.make_dataset('../data/tfrecord/eob_test.tfrecord',
                                     **eob_dataset_kwargs)

In [3]:
# A single Adam optimizer instance is shared by both training stages below.
optimizer = tf.train.AdamOptimizer()

def loss_function(real, pred):
    """Masked sparse softmax cross-entropy.

    Positions whose target id is 0 contribute zero loss (id 0 is treated as
    padding). The result is the mean over *all* positions, masked ones
    included, so the value depends on how much padding a batch carries.

    Args:
        real: integer target ids, passed as `labels`.
        pred: unnormalised logits, passed as `logits`.
            (assumes one more trailing dimension than `real` -- TODO confirm
            against train.run_one_epoch)

    Returns:
        Scalar tensor with the mean masked loss.
    """
    loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred)
    # Build the mask with TensorFlow ops and an explicit dtype instead of NumPy,
    # so the loss does not rely on implicit int->float conversion and does not
    # require eager tensors.
    mask = tf.cast(tf.not_equal(real, 0), loss_.dtype)
    return tf.reduce_mean(loss_ * mask)

# Hyper-parameters shared by the encoder and both decoders.
LSTM_SIZE = 240
EMBEDDING_SIZE = 30
ATTENTION_SIZE = 120

# English encoder: configured without attention.
cmu_encoder_config = model_one.Config(
    lstm_size=LSTM_SIZE,
    embedding_size=EMBEDDING_SIZE,
    attention_size=None,
    vocab_size=script.SCRIPTS['en'].vocab_size)

# 'cmu' decoder: a single monotonic Bahdanau attention.
cmu_decoder_config = model_one.Config(
    lstm_size=LSTM_SIZE,
    embedding_size=EMBEDDING_SIZE,
    attention_size=ATTENTION_SIZE,
    attention='monotonic_bahdanau',
    vocab_size=script.SCRIPTS['cmu'].vocab_size)

# Japanese decoder: configured with two monotonic Bahdanau attentions.
ja_decoder_config = model_one.Config(
    lstm_size=LSTM_SIZE,
    embedding_size=EMBEDDING_SIZE,
    attention_size=ATTENTION_SIZE,
    attention='multiple:monotonic_bahdanau,monotonic_bahdanau',
    vocab_size=script.SCRIPTS['ja'].vocab_size)

cmu_encoder = model_one.Encoder(cmu_encoder_config)
cmu_decoder = model_one.Decoder(cmu_decoder_config)
# The Japanese model's encoder is built from the CMU encoder and decoder.
ja_encoder = model_one.StackedEncoderDecoderEncoder(cmu_encoder, cmu_decoder, 'cmu')
ja_decoder = model_one.Decoder(ja_decoder_config)

# One checkpoint object tracks the optimizer and all three trainable models.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(
    optimizer=optimizer,
    cmu_encoder=cmu_encoder,
    ja_decoder=ja_decoder,
    cmu_decoder=cmu_decoder)

In [4]:
# Stage 1: train the English -> 'cmu' model for at most 10 epochs, with early
# stopping on the validation loss. `cmu_checkpoint` ends up holding the path of
# the checkpoint written at the best (lowest validation loss) epoch.
cmu_best_val_loss = None
cmu_checkpoint = None
for e in range(10):
    # Second positional argument is True for the training pass and False for
    # the validation pass (presumably an is-training flag -- see train.run_one_epoch).
    loss = train.run_one_epoch(cmu_train_dataset,
                               True,
                               from_script='en',
                               to_script='cmu',
                               encoder=cmu_encoder,
                               decoder=cmu_decoder,
                               optimizer=optimizer,
                               loss_function=loss_function)
    valid_loss = train.run_one_epoch(cmu_valid_dataset,
                                     False,
                                     from_script='en',
                                     to_script='cmu',
                                     encoder=cmu_encoder,
                                     decoder=cmu_decoder,
                                     loss_function=loss_function)
    # Report BEFORE the early-stopping check so that the epoch which triggers
    # the stop is still logged (matches the ordering used in run_some_epochs).
    print("Epoch {}: Train Loss {:.3f}, Valid Loss {:.3f}".format(e, loss, valid_loss))
    # Qualitative sanity check: 2-best beam-search output for a fixed word.
    print(decode.transliterate(input_strs=['derick'],
                               from_script='en',
                               to_script='cmu',
                               encoder=cmu_encoder,
                               decoder=cmu_decoder,
                               k_best=2,
                               decoding_method=decode.beam_search_decode))
    if cmu_best_val_loss is None or valid_loss < cmu_best_val_loss:
        # New best epoch: remember its loss and snapshot the weights.
        cmu_best_val_loss = valid_loss
        cmu_checkpoint = checkpoint.save(file_prefix=checkpoint_prefix)
    else:
        # Validation loss stopped improving: stop training.
        break

Instructions for updating:
Colocations handled automatically by placer.


Epoch 0: Train Loss 15.451, Valid Loss 8.827
([['D EH1 R K IH0 K ER0 Z', 'D EH1 R K W EH2 K ER0 Z']], array([[-6.8596259 , -8.28446002]]))


Epoch 1: Train Loss 6.500, Valid Loss 5.189
([['D ER1 K IH0 K', 'D EH1 R CH IH0 K']], array([[-3.74216951, -3.85924087]]))


Epoch 2: Train Loss 4.291, Valid Loss 3.858
([['D EH1 R IH0 K', 'D ER1 IH0 K']], array([[-2.75024256, -2.84142637]]))


Epoch 3: Train Loss 3.260, Valid Loss 3.151
([['D ER1 IH0 K', 'D EH1 R IH0 K']], array([[-2.53842798, -2.94311621]]))


Epoch 4: Train Loss 2.629, Valid Loss 2.674
([['D ER1 IH0 K', 'D EH1 R IH0 K']], array([[-1.57642895, -1.91252233]]))


Epoch 5: Train Loss 2.209, Valid Loss 2.397
([['D EH1 R IH0 K T', 'D ER0 IH1 K T']], array([[-1.66566963, -2.03239874]]))


Epoch 6: Train Loss 1.933, Valid Loss 2.263
([['D EH1 R IH0 K T ER0', 'D EH1 R IH0 K T']], array([[-1.96778044, -2.28633122]]))


Epoch 7: Train Loss 1.740, Valid Loss 2.184
([['D EH1 R IH0 K T', 'D EH1 R IH0 K S']], array([[-2.44807088, -2.75266555]]))


Epoch 8: Train Loss 1.582, Valid Loss 2.146
([['D EH1 R IH0 K AH0 L T', 'D EH1 R IH0 K IH0 V']], array([[-1.64011403, -2.67278628]]))


Epoch 9: Train Loss 1.448, Valid Loss 2.126
([['D EH1 R IH0 K AH0 N', 'D EH1 R IH0 K AH0 L']], array([[-2.50181594, -2.5455018 ]]))


In [5]:
# Roll back to the best CMU checkpoint and fail loudly if anything in the
# checkpoint was left unmatched.
cmu_restore_status = checkpoint.restore(cmu_checkpoint)
cmu_restore_status.assert_consumed()

# Re-evaluate the restored weights on the CMU validation split.
cmu_restored_valid_loss = train.run_one_epoch(
    cmu_valid_dataset,
    False,
    from_script='en',
    to_script='cmu',
    encoder=cmu_encoder,
    decoder=cmu_decoder,
    loss_function=loss_function)
print(cmu_restored_valid_loss)

tf.Tensor(2.1078532, shape=(), dtype=float32)


In [6]:
def run_some_epochs(epochs):
    """Train the English -> Japanese model with early stopping.

    For up to `epochs` epochs this runs one pass over `eob_train_dataset`
    and one over `eob_valid_dataset`, prints both losses together with the
    2-best beam-search output for 'derick', and saves a checkpoint whenever
    the validation loss improves. The loop ends at the first epoch whose
    validation loss does not improve.

    Uses the notebook-level `ja_encoder`, `ja_decoder`, `optimizer`,
    `loss_function`, `checkpoint` and `checkpoint_prefix`.

    Args:
        epochs: maximum number of epochs to run.

    Returns:
        The path returned by `checkpoint.save` for the best epoch, or None
        if the loop body never ran.
    """
    # Arguments common to the training pass, the validation pass and decoding.
    shared_args = dict(from_script='en',
                       to_script='ja',
                       encoder=ja_encoder,
                       decoder=ja_decoder)
    best_path = None
    lowest_valid_loss = None
    for epoch in range(epochs):
        train_loss = train.run_one_epoch(eob_train_dataset,
                                         True,
                                         optimizer=optimizer,
                                         loss_function=loss_function,
                                         **shared_args)
        valid_loss = train.run_one_epoch(eob_valid_dataset,
                                         False,
                                         loss_function=loss_function,
                                         **shared_args)
        print("Epoch {}: Train Loss {:.3f}, Valid Loss {:.3f}".format(epoch, train_loss, valid_loss))
        sample = decode.transliterate(input_strs=['derick'],
                                      k_best=2,
                                      decoding_method=decode.beam_search_decode,
                                      **shared_args)
        print(sample)
        # Early stopping: bail out as soon as validation loss fails to improve.
        improved = lowest_valid_loss is None or valid_loss < lowest_valid_loss
        if not improved:
            break
        lowest_valid_loss = valid_loss
        best_path = checkpoint.save(file_prefix=checkpoint_prefix)
    return best_path

In [7]:
# Mark every layer of the pretrained CMU encoder and decoder as non-trainable
# before training the Japanese model (presumably so only the remaining
# weights are updated -- depends on how train.run_one_epoch picks variables).
for pretrained_model in (cmu_encoder, cmu_decoder):
    for layer in pretrained_model.layers:
        layer.trainable = False

# Stage 2: train English -> Japanese, then roll back to the best checkpoint.
checkpoint_path = run_some_epochs(10)
ja_restore_status = checkpoint.restore(checkpoint_path)
ja_restore_status.assert_consumed()

# Validation loss of the restored weights (displayed as the cell output).
train.run_one_epoch(
    eob_valid_dataset,
    False,
    from_script='en',
    to_script='ja',
    encoder=ja_encoder,
    decoder=ja_decoder,
    loss_function=loss_function)

Epoch 0: Train Loss 12.920, Valid Loss 7.402


([['ディリク', 'ディリック']], array([[-4.04340054, -4.97185265]]))


Epoch 1: Train Loss 6.220, Valid Loss 5.784


([['デリック', 'ディリック']], array([[-2.83951345, -4.08381903]]))


Epoch 2: Train Loss 4.678, Valid Loss 5.330


([['デリック', 'ディリック']], array([[-2.64089123, -4.6104294 ]]))


Epoch 3: Train Loss 3.754, Valid Loss 5.169


([['デリック', 'ダリック']], array([[-2.58163609, -3.48164397]]))


Epoch 4: Train Loss 3.063, Valid Loss 5.032


([['デリック', 'デリ']], array([[-2.65950389, -2.77438992]]))


Epoch 5: Train Loss 2.493, Valid Loss 5.030


([['デリカック', 'ディリック']], array([[-3.54263781, -3.6919361 ]]))


Epoch 6: Train Loss 2.022, Valid Loss 5.180


([['デリ', 'デリカック']], array([[-2.51001856, -2.77032691]]))


<tf.Tensor: id=307138649, shape=(), dtype=float32, numpy=5.048813>

In [8]:
# EOB validation pairs. keep_default_na=False makes pandas keep every cell as
# the literal string instead of parsing its default NA markers as NaN.
eob_valid_csv_path = '../data/split/eob_pairs_valid.csv'
valid_df = pd.read_csv(eob_valid_csv_path, keep_default_na=False)

In [9]:
# Beam-search decode every English entry in `valid_df` (20 beams, keep the 10
# best candidates), then report top-1 accuracy against the Japanese column.
source_words = valid_df['en'].values
reference_words = valid_df['ja'].values
tr = decode.transliterate(
    input_strs=source_words,
    from_script='en',
    to_script='ja',
    encoder=ja_encoder,
    decoder=ja_decoder,
    k_best=10,
    num_beams=20,
    decoding_method=decode.beam_search_decode)
evaluate.top_k_accuracy(reference_words, tr, k=1)

0.4375684556407448

In [10]:
# MUSE validation pairs (replaces the previously loaded `valid_df`).
# keep_default_na=False makes pandas keep every cell as the literal string
# instead of parsing its default NA markers as NaN.
muse_valid_csv_path = '../data/split/muse_pairs_valid.csv'
valid_df = pd.read_csv(muse_valid_csv_path, keep_default_na=False)

In [11]:
# Same evaluation as for the EOB split, applied to whatever `valid_df`
# currently holds: 20-beam search keeping 10 candidates, scored by top-1
# accuracy against the Japanese column.
source_words = valid_df['en'].values
reference_words = valid_df['ja'].values
tr = decode.transliterate(
    input_strs=source_words,
    from_script='en',
    to_script='ja',
    encoder=ja_encoder,
    decoder=ja_decoder,
    k_best=10,
    num_beams=20,
    decoding_method=decode.beam_search_decode)
evaluate.top_k_accuracy(reference_words, tr, k=1)

0.3066485753052917