In [1]:
import pandas as pd
import numpy as np
import warnings
import tensorflow as tf
from matplotlib import pyplot as plt
import time

In [2]:
df = pd.read_hdf('/home/work/nlp/tokenized_10thousand.hdf', stop=10)

In [3]:
df.head()

Unnamed: 0,original,translation
0,"[○, 丙, 申, /, 十, 七, 日, 丙, 申, ,, 太, 祖, 卽, 位, 于, ...","[태조/NNP, 가/JKS, 수창/NNP, 궁/NNG, 에서/JKB, 왕위/NNP,..."
1,"[○, 上, 在, 潛, 邸, ,, 夢, 有, 神, 人, 執, 金, 尺, 自, 天, ...","[임금/NNG, 이/JKS, 잠/NNG, 저/NNG, 에/JKB, 있/VV, 을/E..."
2,"[○, 丁, 酉, /, 雨, 。, 前, 此, 久, 旱, ,, 及, 上, 卽, 位, ...","[비/NNG, 가/JKS, 내리/VV, 었/EP, 다/EF, ./SF, 이/NP, ..."
3,"[竊, 謂, 小, 邦, ,, 至, 恭, 愍, 王, 薨, 無, 嗣, ,, 逆, 臣, ...","[도평의사사/NNP, 및/MAJ, 대소/NNG, 신료와/NA, 한량/NNP, ·/S..."
4,"[○, 立, 義, 興, 親, 軍, 衛, ,, 罷, 都, 摠, 中, 外, 諸, 軍, ...","[의/NNG, 흥/NNG, 친군위/NNG, 를/JKO, 설치/NNG, 하/XSV, ..."


In [5]:
df.original[4]

array(['○', '立', '義', '興', '親', '軍', '衛', ',', '罷', '都', '摠', '中', '外',
       '諸', '軍', '事', '府', '。'], dtype='<U1')

# 전처리

## id부여
- `START`: `vocab_size`
- `END` : `vocab_size`+1
- `OOV`: 1
- `PAD`: 0

In [109]:
# json으로 저장한 단어 사전 불러오기
import json

with open('./id_dict/input_id.json', 'r') as fp:
    input_id = json.load(fp)

with open('./id_dict/target_id.json', 'r') as fp:
    target_id = json.load(fp)


In [70]:
#  id->원래 단어 역변환
reverse_input_id = {i:char for char, i in input_id.items()}
reverse_input_id[1] = '_'
reverse_target_id = {i:char for char, i in target_id.items()}
reverse_target_id[1] = '_'

In [71]:
def encode_original(original):
    original = np.array([len(input_id)] +\
                list(map(lambda x:input_id.setdefault(x,1), original))\
                + [len(input_id)+1], dtype=np.int)
    
    return original

In [72]:
def encode_translation(translation):
    translation = np.array([len(target_id)] +\
                    list(map(lambda x:target_id.setdefault(x,1), translation))\
                    + [len(target_id)+1], dtype=np.int)
    
    return translation

In [73]:
origin, trans = df.original.apply(encode_original), df.translation.apply(encode_translation)

# Padding
최대 문장의 길이(== 모델 인풋)를 `200`으로 설정해서 문장 길이가 `200`이 되지 않으면 padding 한다

`200`넘으면 잘라버린다.

`<PAD>`의 id는 `0`으로 부여한다.


In [74]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [75]:
encoder_input_data = tf.convert_to_tensor(pad_sequences(origin, maxlen=200, padding='post', truncating='post'),dtype=tf.int64)
decoder_target_data = tf.convert_to_tensor(pad_sequences(trans, maxlen=200, padding='post', truncating='post'),dtype=tf.int64)

In [76]:
train_dataset = tf.data.Dataset.from_tensor_slices((encoder_input_data, decoder_target_data))

In [77]:
BUFFER_SIZE = 20000
BATCH_SIZE = 32

In [78]:
# train_dataset = train_dataset.cache()
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)

# Positional Encoding
토큰의 위치정보를 담는 positional encoding

In [79]:
def get_angles(pos, i, d_model):
    angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model))
    return pos * angle_rates

In [80]:
def positional_encoding(position, d_model):
    angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                            np.arange(d_model)[np.newaxis, :],
                            d_model)
  
    # apply sin to even indices in the array; 2i
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])

    # apply cos to odd indices in the array; 2i+1
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

    pos_encoding = angle_rads[np.newaxis, ...]

    return tf.cast(pos_encoding, dtype=tf.float32)

# Masking
`PAD` 토큰을 입력으로 다루지 않도록 마스킹합니다.

mask가 1이면 해당 토큰이 `PAD`라는 뜻.

In [81]:
def create_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)

    # add extra dimensions to add the padding
    # to the attention logits.
    return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

look-ahead mask는 이후 토큰을 mask합니다. mask된 토큰은 쓰이지 않는 토큰을 가리킵니다.

세번째 단어를 예측하는데, 첫번째와 두번째 단어만 사용하기 위해서 처리하는 것!

In [82]:
def create_look_ahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask  # (seq_len, seq_len)

# Scaled Dot-product Attention
![image.png](https://www.tensorflow.org/images/tutorials/transformer/scaled_attention.png)

In [83]:
def scaled_dot_product_attention(q, k, v, mask):
    """Calculate the attention weights.
    q, k, v must have matching leading dimensions.
    k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
    The mask has different shapes depending on its type(padding or look ahead) 
    but it must be broadcastable for addition.

    Args:
    q: query shape == (..., seq_len_q, depth)
    k: key shape == (..., seq_len_k, depth)
    v: value shape == (..., seq_len_v, depth_v)
    mask: Float tensor with shape broadcastable 
          to (..., seq_len_q, seq_len_k). Defaults to None.

    Returns:
    output, attention_weights
    """

    matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)

    # scale matmul_qk
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

    # add the mask to the scaled tensor.
    if mask is not None:
        scaled_attention_logits += (mask * -1e9)  

    # softmax is normalized on the last axis (seq_len_k) so that the scores
    # add up to 1.
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)

    output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)

    return output, attention_weights

In [84]:
def print_out(q, k, v):
    temp_out, temp_attn = scaled_dot_product_attention(
      q, k, v, None)
    print ('Attention weights are:')
    print (temp_attn)
    print ('Output is:')
    print (temp_out)

# Multi-head Attention

![image.png](https://www.tensorflow.org/images/tutorials/transformer/multi_head_attention.png)

In [85]:
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % self.num_heads == 0

        self.depth = d_model // self.num_heads

        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """Split the last dimension into (num_heads, depth).
        Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)
        """
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        batch_size = tf.shape(q)[0]

        q = self.wq(q)  # (batch_size, seq_len, d_model)
        k = self.wk(k)  # (batch_size, seq_len, d_model)
        v = self.wv(v)  # (batch_size, seq_len, d_model)

        q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
        k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
        v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)

        # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
        # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
        scaled_attention, attention_weights = scaled_dot_product_attention(
            q, k, v, mask)

        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)

        concat_attention = tf.reshape(scaled_attention, 
                                      (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)

        output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)

        return output, attention_weights

## Position wise FF network 

In [86]:
def point_wise_feed_forward_network(d_model, dff):
    return tf.keras.Sequential([
      tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
      tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
    ])

# Encoder And Decoder

![](https://www.tensorflow.org/images/tutorials/transformer/transformer.png)

In [87]:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(EncoderLayer, self).__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):

        attn_output, _ = self.mha(x, x, x, mask)  # (batch_size, input_seq_len, d_model)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)

        ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)  # (batch_size, input_seq_len, d_model)

        return out2

In [88]:
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(DecoderLayer, self).__init__()

        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)

        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)


    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        # enc_output.shape == (batch_size, input_seq_len, d_model)

        attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)  # (batch_size, target_seq_len, d_model)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(attn1 + x)

        attn2, attn_weights_block2 = self.mha2(
            enc_output, enc_output, out1, padding_mask)  # (batch_size, target_seq_len, d_model)
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(attn2 + out1)  # (batch_size, target_seq_len, d_model)

        ffn_output = self.ffn(out2)  # (batch_size, target_seq_len, d_model)
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layernorm3(ffn_output + out2)  # (batch_size, target_seq_len, d_model)

        return out3, attn_weights_block1, attn_weights_block2

# Encoder

In [89]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
               maximum_position_encoding, rate=0.1):
        super(Encoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, 
                                                self.d_model)


        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) 
                           for _ in range(num_layers)]

        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):

        seq_len = tf.shape(x)[1]

        # adding embedding and position encoding.
        x = self.embedding(x)  # (batch_size, input_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training, mask)

        return x  # (batch_size, input_seq_len, d_model)

# Decoder

In [90]:
class Decoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size,
               maximum_position_encoding, rate=0.1):
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)

        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate) 
                           for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training, 
           look_ahead_mask, padding_mask):

        seq_len = tf.shape(x)[1]
        attention_weights = {}

        x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x, block1, block2 = self.dec_layers[i](x, enc_output, training,
                                                 look_ahead_mask, padding_mask)

            attention_weights['decoder_layer{}_block1'.format(i+1)] = block1
            attention_weights['decoder_layer{}_block2'.format(i+1)] = block2

        # x.shape == (batch_size, target_seq_len, d_model)
        return x, attention_weights

# Create Transformer

In [91]:
class Transformer(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, 
               target_vocab_size, pe_input, pe_target, rate=0.1):
        super(Transformer, self).__init__()

        self.encoder = Encoder(num_layers, d_model, num_heads, dff, 
                               input_vocab_size, pe_input, rate)

        self.decoder = Decoder(num_layers, d_model, num_heads, dff, 
                               target_vocab_size, pe_target, rate)

        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def call(self, inp, tar, training, enc_padding_mask, 
               look_ahead_mask, dec_padding_mask):

        enc_output = self.encoder(inp, training, enc_padding_mask)  # (batch_size, inp_seq_len, d_model)

        # dec_output.shape == (batch_size, tar_seq_len, d_model)
        dec_output, attention_weights = self.decoder(
            tar, enc_output, training, look_ahead_mask, dec_padding_mask)

        final_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)

        return final_output, attention_weights

## Hyperparameters

In [92]:
num_layers = 4
d_model = 128
dff = 512
num_heads = 8

input_vocab_size = len(input_id) + 2
target_vocab_size = len(target_id) + 2
dropout_rate = 0.1

In [93]:
input_vocab_size, target_vocab_size

(7051, 10364)

In [111]:
len(input_id), len(target_id)

(7049, 10003)

## Optimizer
커스텀 learning rate scheduler를 더한 Adam Optimizer를 사용

In [95]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()

        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)

        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [96]:
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, 
                                     epsilon=1e-9)

In [97]:
# temp_learning_rate_schedule = CustomSchedule(d_model)

# plt.plot(temp_learning_rate_schedule(tf.range(40000, dtype=tf.float32)))
# plt.ylabel("Learning Rate")
# plt.xlabel("Train Step")

## Loss and Metrics
target sequence가 padded 되었기 때문에, loss 계산 시에 padding mask를 적용해야 한다.

In [98]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

In [99]:
def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

In [100]:
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
    name='train_accuracy')

# Training and Checkpointing

In [101]:
transformer = Transformer(num_layers, d_model, num_heads, dff,
                          input_vocab_size, target_vocab_size, 
                          pe_input=input_vocab_size, 
                          pe_target=target_vocab_size,
                          rate=dropout_rate)

In [102]:
def create_masks(inp, tar):
    # Encoder padding mask
    enc_padding_mask = create_padding_mask(inp)

    # Used in the 2nd attention block in the decoder.
    # This padding mask is used to mask the encoder outputs.
    dec_padding_mask = create_padding_mask(inp)

    # Used in the 1st attention block in the decoder.
    # It is used to pad and mask future tokens in the input received by 
    # the decoder.
    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    dec_target_padding_mask = create_padding_mask(tar)
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)

    return enc_padding_mask, combined_mask, dec_padding_mask

In [103]:
checkpoint_path = "/home/work/nlp/checkpoints/train"

ckpt = tf.train.Checkpoint(transformer=transformer,
                           optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# if a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print ('Latest checkpoint restored!!')

Latest checkpoint restored!!


In [52]:
# EPOCHS = 20

In [53]:
# # The @tf.function trace-compiles train_step into a TF graph for faster
# # execution. The function specializes to the precise shape of the argument
# # tensors. To avoid re-tracing due to the variable sequence lengths or variable
# # batch sizes (the last batch is smaller), use input_signature to specify
# # more generic shapes.

# train_step_signature = [
#     tf.TensorSpec(shape=(None, None), dtype=tf.int64),
#     tf.TensorSpec(shape=(None, None), dtype=tf.int64),
# ]

# @tf.function(input_signature=train_step_signature)
# def train_step(inp, tar):
#     tar_inp = tar[:, :-1]
#     tar_real = tar[:, 1:]

#     enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)

#     with tf.GradientTape() as tape:
#         predictions, _ = transformer(inp, tar_inp, 
#                                      True, 
#                                      enc_padding_mask, 
#                                      combined_mask, 
#                                      dec_padding_mask)
#         loss = loss_function(tar_real, predictions)

#     gradients = tape.gradient(loss, transformer.trainable_variables)    
#     optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

#     train_loss(loss)
#     train_accuracy(tar_real, predictions)

# Evaluate

In [164]:
def evaluate(inp_sentence):

    # inp sentence is portuguese, hence adding the start and end token
#     inp_sentence = tf.convert_to_tensor(inp_sentence.apply(encode_original))
    encoder_input = inp_sentence

    # as the target is english, the first word to the transformer should be the
    # english start token.
    decoder_input = [[len(target_id)]*32]
    output = tf.convert_to_tensor(pad_sequences(decoder_input, maxlen=200, padding='post', truncating='post'),dtype=tf.int64)

    for i in range(200):
        print(encoder_input)
        print(output)
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
        encoder_input, output)
        
        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions, attention_weights = transformer(encoder_input, 
                                                 output,
                                                 False,
                                                 enc_padding_mask,
                                                 combined_mask,
                                                 dec_padding_mask)

        # select the last word from the seq_len dimension
        predictions = predictions[: ,-1:, :]  # (batch_size, 1, vocab_size)

        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

        # return the result if the predicted_id is equal to the end token
        if predicted_id == len(target_id)+1:
            return tf.squeeze(output, axis=0), attention_weights

        # concatentate the predicted_id to the output which is given to the decoder
        # as its input.
        output = tf.concat([output, predicted_id], axis=-1)

    return tf.squeeze(output, axis=0), attention_weights

In [165]:
origin, trans = df.original.apply(encode_original), df.translation.apply(encode_translation)
encoder_input_data = tf.convert_to_tensor(pad_sequences(origin, maxlen=200, padding='post', truncating='post'),dtype=tf.int64)


In [166]:
evaluate(encoder_input_data)

tf.Tensor(
[[7049   11  516 ...    2  713  604]
 [7049   11   14 ...    5  178   24]
 [7049   11  345 ...    0    0    0]
 ...
 [7049   11  610 ...    0    0    0]
 [7049   11  324 ...    0    0    0]
 [7049   11  345 ...    0    0    0]], shape=(32, 200), dtype=int64)
tf.Tensor(
[[10362 10362 10362 10362 10362 10362 10362 10362 10362 10362 10362 10362
  10362 10362 10362 10362 10362 10362 10362 10362 10362 10362 10362 10362
  10362 10362 10362 10362 10362 10362 10362 10362     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0    

ValueError: Tensor's shape (51112, 128) is not compatible with supplied shape (10364, 128)

In [106]:
len(input_id), len(target_id)

(7049, 10362)

In [167]:
tf.train.list_variables(tf.train.latest_checkpoint('/home/work/nlp/checkpoints/train'))

[('_CHECKPOINTABLE_OBJECT_GRAPH', []),
 ('optimizer/beta_1/.ATTRIBUTES/VARIABLE_VALUE', []),
 ('optimizer/beta_2/.ATTRIBUTES/VARIABLE_VALUE', []),
 ('optimizer/decay/.ATTRIBUTES/VARIABLE_VALUE', []),
 ('optimizer/iter/.ATTRIBUTES/VARIABLE_VALUE', []),
 ('save_counter/.ATTRIBUTES/VARIABLE_VALUE', []),
 ('transformer/decoder/dec_layers/0/ffn/layer-0/bias/.ATTRIBUTES/VARIABLE_VALUE',
  [512]),
 ('transformer/decoder/dec_layers/0/ffn/layer-0/bias/.OPTIMIZER_SLOT/optimizer/m/.ATTRIBUTES/VARIABLE_VALUE',
  [512]),
 ('transformer/decoder/dec_layers/0/ffn/layer-0/bias/.OPTIMIZER_SLOT/optimizer/v/.ATTRIBUTES/VARIABLE_VALUE',
  [512]),
 ('transformer/decoder/dec_layers/0/ffn/layer-0/kernel/.ATTRIBUTES/VARIABLE_VALUE',
  [128, 512]),
 ('transformer/decoder/dec_layers/0/ffn/layer-0/kernel/.OPTIMIZER_SLOT/optimizer/m/.ATTRIBUTES/VARIABLE_VALUE',
  [128, 512]),
 ('transformer/decoder/dec_layers/0/ffn/layer-0/kernel/.OPTIMIZER_SLOT/optimizer/v/.ATTRIBUTES/VARIABLE_VALUE',
  [128, 512]),
 ('transforme