In [1]:
import os 
import time
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from pprint import pprint
from IPython.display import clear_output
import tensorflow as tf
import tensorflow_datasets as tfds
import logging

logging.basicConfig(level=logging.ERROR)
np.set_printoptions(suppress=True)

In [2]:
output_dir = 'nmt'
en_vocab_file = os.path.join(output_dir, 'en_vocab')
zh_vocab_file = os.path.join(output_dir, 'zh_vocab')
checkpoint_path = os.path.join(output_dir, 'checkpoints')
log_dir = os.path.join(output_dir, 'logs')

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

In [3]:
tmp_builder = tfds.builder('wmt19_translate/zh-en')
pprint(tmp_builder.subsets)

{Split('train'): ['newscommentary_v14',
                  'wikititles_v1',
                  'uncorpus_v1',
                  'casia2015',
                  'casict2011',
                  'casict2015',
                  'datum2015',
                  'datum2017',
                  'neu2017'],
 Split('validation'): ['newstest2018']}


In [4]:
config = tfds.translate.wmt.WmtConfig(
    version='1.0.0',
    language_pair=('zh', 'en'),
    subsets={
        tfds.Split.TRAIN: ['newscommentary_v14']
    }
)
builder = tfds.builder('wmt_translate', config=config)
builder.download_and_prepare()

In [5]:
train_examples, val_examples = builder.as_dataset(split=['train[:20%]', 'train[20%:21%]'], as_supervised=True)
print(train_examples)
print(val_examples)

<PrefetchDataset shapes: ((), ()), types: (tf.string, tf.string)>
<PrefetchDataset shapes: ((), ()), types: (tf.string, tf.string)>


In [6]:
for en, zh in train_examples.take(3):
    print(en)
    print(zh)
    print('-' * 10)

tf.Tensor(b'The fear is real and visceral, and politicians ignore it at their peril.', shape=(), dtype=string)
tf.Tensor(b'\xe8\xbf\x99\xe7\xa7\x8d\xe6\x81\x90\xe6\x83\xa7\xe6\x98\xaf\xe7\x9c\x9f\xe5\xae\x9e\xe8\x80\x8c\xe5\x86\x85\xe5\x9c\xa8\xe7\x9a\x84\xe3\x80\x82 \xe5\xbf\xbd\xe8\xa7\x86\xe5\xae\x83\xe7\x9a\x84\xe6\x94\xbf\xe6\xb2\xbb\xe5\xae\xb6\xe4\xbb\xac\xe5\x89\x8d\xe9\x80\x94\xe5\xa0\xaa\xe5\xbf\xa7\xe3\x80\x82', shape=(), dtype=string)
----------
tf.Tensor(b'In fact, the German political landscape needs nothing more than a truly liberal party, in the US sense of the word \xe2\x80\x9cliberal\xe2\x80\x9d \xe2\x80\x93 a champion of the cause of individual freedom.', shape=(), dtype=string)
tf.Tensor(b'\xe4\xba\x8b\xe5\xae\x9e\xe4\xb8\x8a\xef\xbc\x8c\xe5\xbe\xb7\xe5\x9b\xbd\xe6\x94\xbf\xe6\xb2\xbb\xe5\xb1\x80\xe5\x8a\xbf\xe9\x9c\x80\xe8\xa6\x81\xe7\x9a\x84\xe4\xb8\x8d\xe8\xbf\x87\xe6\x98\xaf\xe4\xb8\x80\xe4\xb8\xaa\xe7\xac\xa6\xe5\x90\x88\xe7\xbe\x8e\xe5\x9b\xbd\xe6\x89\x80\xe8\

In [7]:
sample_examples = []
num_samples = 10

for en_t, zh_t in train_examples.take(num_samples):
    en = en_t.numpy().decode('utf-8')
    zh = zh_t.numpy().decode('utf-8')

    print(en)
    print(zh)
    print('-' * 10)

    sample_examples.append((en, zh))

The fear is real and visceral, and politicians ignore it at their peril.
这种恐惧是真实而内在的。 忽视它的政治家们前途堪忧。
----------
In fact, the German political landscape needs nothing more than a truly liberal party, in the US sense of the word “liberal” – a champion of the cause of individual freedom.
事实上，德国政治局势需要的不过是一个符合美国所谓“自由”定义的真正的自由党派，也就是个人自由事业的倡导者。
----------
Shifting to renewable-energy sources will require enormous effort and major infrastructure investment.
必须付出巨大的努力和基础设施投资才能完成向可再生能源的过渡。
----------
In this sense, it is critical to recognize the fundamental difference between “urban villages” and their rural counterparts.
在这方面，关键在于认识到“城市村落”和农村村落之间的根本区别。
----------
A strong European voice, such as Nicolas Sarkozy’s during the French presidency of the EU, may make a difference, but only for six months, and at the cost of reinforcing other European countries’ nationalist feelings in reaction to the expression of “Gallic pride.”
法国担任轮值主席国期间尼古拉·萨科奇统一的欧洲声音可能让人耳目一新，但这种声音却只持续了短短六个月，而且付出了让其他欧洲国家在面对“高卢人的骄

In [8]:
%%time
try:
    subword_encoder_en = tfds.features.text.SubwordTextEncoder.load_from_file(en_vocab_file)
    print(f'Load builded corpus: {en_vocab_file}')
except:
    print(f'Build corpus: {en_vocab_file}')
    subword_encoder_en = tfds.features.text.SubwordTextEncoder.build_from_corpus(
        (en.numpy() for en, _ in train_examples),
        target_vocab_size=2**13
    )
    subword_encoder_en.save_to_file(en_vocab_file)

print(f'Size of corpus: {subword_encoder_en.vocab_size}')
print(f'First 10 subwords: {subword_encoder_en.subwords[:10]}')

Load builded corpus: nmt\en_vocab
Size of corpus: 8113
First 10 subwords: [', ', 'the_', 'of_', 'to_', 'and_', 's_', 'in_', 'a_', 'is_', 'that_']
Wall time: 31.9 ms


In [9]:
sample_string = 'Taiwan is beautiful.'
indices = subword_encoder_en.encode(sample_string)
print(indices)
print([subword_encoder_en.decode([idx]) for idx in indices])

[3461, 7889, 9, 3502, 4379, 1134, 7903]
['Taiwan', ' ', 'is ', 'bea', 'uti', 'ful', '.']


In [10]:
%%time
try:
    subword_encoder_zh = tfds.features.text.SubwordTextEncoder.load_from_file(zh_vocab_file)
    print(f'Load builded corpus: {zh_vocab_file}')
except:
    print(f'Build corpus: {zh_vocab_file}')
    subword_encoder_zh = tfds.features.text.SubwordTextEncoder.build_from_corpus(
        (zh.numpy() for _, zh in train_examples),
        target_vocab_size=2**13,
        max_subword_length=1
    )
    subword_encoder_zh.save_to_file(zh_vocab_file)

print(f'Size of corpus: {subword_encoder_zh.vocab_size}')
print(f'First 10 subwords: {subword_encoder_zh.subwords[:10]}')

Load builded corpus: nmt\zh_vocab
Size of corpus: 4205
First 10 subwords: ['的', '，', '。', '国', '在', '是', '一', '和', '不', '这']
Wall time: 17 ms


In [11]:
def encode(en_t, zh_t):
    en_vocab_size = subword_encoder_en.vocab_size
    zh_vocab_size = subword_encoder_zh.vocab_size
    en_indices = [en_vocab_size] + subword_encoder_en.encode(en_t.numpy()) + [en_vocab_size + 1]
    zh_indices = [zh_vocab_size] + subword_encoder_zh.encode(zh_t.numpy()) + [zh_vocab_size + 1]
    return en_indices, zh_indices

In [12]:
en_t, zh_t = next(iter(train_examples))
en_indices, zh_indices = encode(en_t, zh_t)
print('Index of BOS in en: ', subword_encoder_en.vocab_size)
print('Index of EOS in en: ', subword_encoder_en.vocab_size + 1)
print('Index of BOS in zh: ', subword_encoder_zh.vocab_size)
print('Index of EOS in zh: ', subword_encoder_zh.vocab_size + 1)

print('\nTensor:')
pprint((en_t, zh_t))
print('-' * 15)
print('Indices:')
print((en_indices, zh_indices))

Index of BOS in en:  8113
Index of EOS in en:  8114
Index of BOS in zh:  4205
Index of EOS in zh:  4206

Tensor:
(<tf.Tensor: shape=(), dtype=string, numpy=b'The fear is real and visceral, and politicians ignore it at their peril.'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'\xe8\xbf\x99\xe7\xa7\x8d\xe6\x81\x90\xe6\x83\xa7\xe6\x98\xaf\xe7\x9c\x9f\xe5\xae\x9e\xe8\x80\x8c\xe5\x86\x85\xe5\x9c\xa8\xe7\x9a\x84\xe3\x80\x82 \xe5\xbf\xbd\xe8\xa7\x86\xe5\xae\x83\xe7\x9a\x84\xe6\x94\xbf\xe6\xb2\xbb\xe5\xae\xb6\xe4\xbb\xac\xe5\x89\x8d\xe9\x80\x94\xe5\xa0\xaa\xe5\xbf\xa7\xe3\x80\x82'>)
---------------
Indices:
([8113, 16, 1284, 9, 243, 5, 1275, 1756, 156, 1, 5, 1016, 5566, 21, 38, 33, 2982, 7965, 7903, 8114], [4205, 10, 151, 574, 1298, 6, 374, 55, 29, 193, 5, 1, 3, 3981, 931, 431, 125, 1, 17, 124, 33, 20, 97, 1089, 1247, 861, 3, 4206])


In [13]:
tf_encode = lambda en_t, zh_t: tf.py_function(encode, [en_t, zh_t], [tf.int64, tf.int64])

tmp_dataset = train_examples.map(tf_encode)
en_indices, zh_indices = next(iter(tmp_dataset))
print(en_indices)
print(zh_indices)

tf.Tensor(
[8113   16 1284    9  243    5 1275 1756  156    1    5 1016 5566   21
   38   33 2982 7965 7903 8114], shape=(20,), dtype=int64)
tf.Tensor(
[4205   10  151  574 1298    6  374   55   29  193    5    1    3 3981
  931  431  125    1   17  124   33   20   97 1089 1247  861    3 4206], shape=(28,), dtype=int64)


In [14]:
MAX_LENGTH = 100

def filter_max_length(en, zh, max_length = MAX_LENGTH):
    return tf.logical_and(tf.size(en) <= max_length, tf.size(zh) <= max_length)
tmp_dataset = tmp_dataset.filter(filter_max_length)

In [15]:
BATCH_SIZE = 64
tmp_dataset = tmp_dataset.padded_batch(BATCH_SIZE, padded_shapes=([-1], [-1]))
en_batch, zh_batch = next(iter(tmp_dataset))
print('en_batch:')
print(en_batch)
print('-' * 15)
print('zh_batch:')
print(zh_batch)

en_batch:
tf.Tensor(
[[8113   16 1284 ...    0    0    0]
 [8113   44  369 ...    0    0    0]
 [8113 1894 1302 ...    0    0    0]
 ...
 [8113 1809 5706 ...    0    0    0]
 [8113 1634    1 ...    0    0    0]
 [8113  100 2542 ...    0    0    0]], shape=(64, 71), dtype=int64)
---------------
zh_batch:
tf.Tensor(
[[4205   10  151 ...    0    0    0]
 [4205  109   55 ...    0    0    0]
 [4205  206  275 ...    0    0    0]
 ...
 [4205    9  270 ...    0    0    0]
 [4205  327  363 ...    0    0    0]
 [4205   16    4 ...    0    0    0]], shape=(64, 99), dtype=int64)


In [16]:
MAX_LENGTH = 100
BATCH_SIZE = 128
BUFFER_SIZE = 10000

train_dataset = (train_examples
                 .map(tf_encode)
                 .filter(filter_max_length)
                 .cache()
                 .shuffle(BUFFER_SIZE)
                 .padded_batch(BATCH_SIZE, padded_shapes=(100, 100))
                 .prefetch(tf.data.experimental.AUTOTUNE))

val_dataset = (val_examples
               .map(tf_encode)
               .filter(filter_max_length)
               .cache()
               .shuffle(BUFFER_SIZE)
               .padded_batch(BATCH_SIZE, padded_shapes=(100, 100))
               .prefetch(tf.data.experimental.AUTOTUNE))

In [17]:
en_batch, zh_batch = next(iter(train_dataset))
print(en_batch)
print(zh_batch)

tf.Tensor(
[[8113 1633   24 ...    0    0    0]
 [8113  100   24 ...    0    0    0]
 [8113  559 1063 ...    0    0    0]
 ...
 [8113  214  911 ...    0    0    0]
 [8113 3509   11 ...    0    0    0]
 [8113   87 4462 ...    0    0    0]], shape=(128, 100), dtype=int64)
tf.Tensor(
[[4205  699  178 ...    0    0    0]
 [4205   29   16 ...    0    0    0]
 [4205   81  286 ...    0    0    0]
 ...
 [4205   17  123 ...    0    0    0]
 [4205   10    6 ...    0    0    0]
 [4205  241   53 ...    0    0    0]], shape=(128, 100), dtype=int64)


In [18]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,reduction='none')

def loss_function(real, pred):
    mask = tf.logical_not(tf.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)

train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')
val_loss = tf.keras.metrics.Mean(name='val_loss')
val_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='val_accuracy')

In [19]:
num_layers = 4 
d_model = 128
dff = 512
num_heads = 8

input_vocab_size = subword_encoder_en.vocab_size + 2
target_vocab_size = subword_encoder_zh.vocab_size + 2
dropout_rate = 0.1

print("input_vocab_size:", input_vocab_size)
print("target_vocab_size:", target_vocab_size)

input_vocab_size: 8115
target_vocab_size: 4207


In [20]:
from learningrate import CustomSchedule

learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

In [21]:
from model import Transformer

transformer = Transformer(num_layers, d_model, num_heads, dff,
                          input_vocab_size, target_vocab_size, dropout_rate)

print(f"""這個 Transformer 有 {num_layers} 層 Encoder / Decoder layers
d_model: {d_model}
num_heads: {num_heads}
dff: {dff}
input_vocab_size: {input_vocab_size}
target_vocab_size: {target_vocab_size}
dropout_rate: {dropout_rate}
""")

這個 Transformer 有 4 層 Encoder / Decoder layers
d_model: 128
num_heads: 8
dff: 512
input_vocab_size: 8115
target_vocab_size: 4207
dropout_rate: 0.1



In [22]:
def create_look_ahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask

def create_padding_mask(seq):
    mask = tf.cast(tf.equal(seq, 0), tf.float32)
    return mask[:, tf.newaxis, tf.newaxis, :]

In [23]:
def create_masks(inp, tar):
    enc_padding_mask = create_padding_mask(inp)
    dec_padding_mask = create_padding_mask(inp)

    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    dec_target_padding_mask = create_padding_mask(tar)
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
    
    return enc_padding_mask, combined_mask, dec_padding_mask    

In [24]:
@tf.function
def train_step(inp, tar):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]

    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)

    with tf.GradientTape() as tape:
        predictions, _ = transformer(inp, tar_inp, True, enc_padding_mask, combined_mask, dec_padding_mask)
        loss = loss_function(tar_real, predictions)
    
    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    train_loss(loss)
    train_accuracy(tar_real, predictions)

In [25]:
@tf.function
def val_step(inp, tar):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]

    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)

    predictions, _ = transformer(inp, tar_inp, False, enc_padding_mask, combined_mask, dec_padding_mask)
    loss = loss_function(tar_real, predictions)

    val_loss(loss)
    val_accuracy(tar_real, predictions)

In [27]:
(inp, tar) = next(iter(train_dataset))
train_step(inp, tar)

transformer.summary()

Model: "transformer"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
encoder (Encoder)            multiple                  1831808   
_________________________________________________________________
decoder (Decoder)            multiple                  1596800   
_________________________________________________________________
dense_64 (Dense)             multiple                  542703    
Total params: 3,971,311
Trainable params: 3,971,311
Non-trainable params: 0
_________________________________________________________________


In [28]:
run_id = f"{num_layers}layers_{d_model}d_{num_heads}heads_{dff}dff"
checkpoint_path = os.path.join(checkpoint_path, run_id)
log_dir = os.path.join(log_dir, run_id)

In [29]:
ckpt = tf.train.Checkpoint(model=transformer, optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    last_epoch = int(ckpt_manager.latest_checkpoint.split('-')[-1]) * 5
    print(f'Read checkpoint, {last_epoch - 1} epoches have beed trained.')
else:
    last_epoch = 0
    print('Checkpoint not found.')

summary_writer = tf.summary.create_file_writer(log_dir)
tf.summary.trace_on(graph=True, profiler=False)

Read checkpoint, 29 epoches have beed trained.


In [30]:
EPOCHS = 40

for epoch in range(last_epoch, EPOCHS):
    start = time.time()
  
    train_loss.reset_states()
    train_accuracy.reset_states()
    val_loss.reset_states()
    val_accuracy.reset_states()
    
    probar = tf.keras.utils.Progbar(478)
    for (step_idx, (inp, tar)) in enumerate(train_dataset):
        train_step(inp, tar)
        probar.add(1)

    for (step_idx, (inp, tar)) in enumerate(val_dataset):
        val_step(inp, tar)

    if (epoch + 1) % 5 == 0:
      ckpt_save_path = ckpt_manager.save()
      print ('Saving checkpoint for epoch {} at {}'.format(epoch, ckpt_save_path))

    with summary_writer.as_default():
      tf.summary.scalar("train_loss", train_loss.result(), step=epoch)
      tf.summary.scalar("train_acc", train_accuracy.result(), step=epoch)
      tf.summary.scalar("val_loss", val_loss.result(), step=epoch)
      tf.summary.scalar("val_acc", val_accuracy.result(), step=epoch)
    
    print(f'Epoch {epoch}\
         Loss {train_loss.result():.4f}\
         Accuracy {train_accuracy.result():.4f}\
         Val_Loss {val_loss.result():.4f}\
         Val_Accuracy {val_accuracy.result():.4f}')
    
    print('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

Epoch 30         Loss 0.7710         Accuracy 0.2621         Val_Loss 0.9092         Val_Accuracy 0.2515
Time taken for 1 epoch: 204.60754680633545 secs

Epoch 31         Loss 0.7650         Accuracy 0.2631         Val_Loss 0.9055         Val_Accuracy 0.2528
Time taken for 1 epoch: 217.48237371444702 secs

Epoch 32         Loss 0.7595         Accuracy 0.2638         Val_Loss 0.9048         Val_Accuracy 0.2529
Time taken for 1 epoch: 234.59940481185913 secs

Epoch 33         Loss 0.7546         Accuracy 0.2646         Val_Loss 0.9045         Val_Accuracy 0.2528
Time taken for 1 epoch: 232.9268627166748 secs

Saving checkpoint for epoch 34 at nmt\checkpoints\4layers_128d_8heads_512dff\ckpt-7
Epoch 34         Loss 0.7495         Accuracy 0.2655         Val_Loss 0.9065         Val_Accuracy 0.2528
Time taken for 1 epoch: 233.20344400405884 secs

Epoch 35         Loss 0.7452         Accuracy 0.2662         Val_Loss 0.9027         Val_Accuracy 0.2536
Time taken for 1 epoch: 233.5184998512268 