# Translation with Transformer (TensorFlow Tutorial)

Most of this code is provided by The TensorFlow Authors through this tutorial: https://www.tensorflow.org/tutorials/text/transformer

Main changes:

- Loading a custom dataset
- Adjusting parameters
- Walking through latent space (last part)
- Deleting all code that is not necessary to run the script (explanations).

In [None]:
''' Install libraries. '''
# !python3 -m pip install tensorflow-datasets pandas scikit-learn

In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds

import time
import numpy as np

## Load custom dataset through pandas

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

txt = open('dataset.txt', encoding='utf8').readlines()
txt = [sentence.replace('\n','') for sentence in txt]

# Next sentence as target sequence
raw_data = {'src': [line for line in txt[:-1]],
           'trg': [line for line in txt[1:]]}

df = pd.DataFrame(raw_data, columns=['src', 'trg'])

# Split training and validation data
df_train, df_val = train_test_split(df, test_size=0.15)

# Create dataset objects
train_examples = tf.data.Dataset.from_tensor_slices((df_train['src'], df_train['trg']))
val_examples = tf.data.Dataset.from_tensor_slices((df_val['src'], df_val['trg']))

dataset shape (`next(iter(dataset))`):

(<tf.Tensor: shape=(), dtype=string, numpy=b'Paul Val\xc3\xa9rys Form: Sie wird gespeist von seinem unerm\xc3\xbcdlichen Drang zum Objektivieren und, mit C\xc3\xa9zannes Wort, Realisieren, der kein Dunkles, Unaufgehelltes, Ungel\xc3\xb6stes duldet; dem die Transparenz nach aussen zum Mass des Gelingens im Innern selbst wird.\n'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'Grosse Einsichten in die Kunst geraten \xc3\xbcberhaupt entweder in absoluter Distanz, aus der Konsequenz des Begriffs, ungest\xc3\xb6rt vom sogenannten Kunstverst\xc3\xa4ndnis, wie bei Kant oder auch Hegel, oder in solcher absoluten N\xc3\xa4he, der Haltung dessen, der hinter den Kulissen steht, der nicht Publikum ist, sondern das Kunstwerk mitvollzieht unter dem Aspekt des Machens, der Technik.\n'>)

## Tokenizer

In [3]:
''' Run tokenizer. '''

dataset = tf.data.Dataset.from_tensor_slices((df['src'], df['trg']))

tokenizer_src = tfds.features.text.SubwordTextEncoder.build_from_corpus(
   (src.numpy() for src, trg in dataset), 
   target_vocab_size=2**12) #4096
tokenizer_src.save_to_file('Seq2Seq/tokenizer_src_training')

tokenizer_trg = tfds.features.text.SubwordTextEncoder.build_from_corpus(
   (trg.numpy() for src, trg in dataset), 
   target_vocab_size=2**12) #4096
tokenizer_trg.save_to_file('Seq2Seq/tokenizer_trg_training')

In [4]:
''' Load tokenizer. '''

# tokenizer_src = tfds.features.text.SubwordTextEncoder.load_from_file('Seq2Seq/tokenizer_src')
# tokenizer_trg = tfds.features.text.SubwordTextEncoder.load_from_file('Seq2Seqtokenizer_trg')

In [5]:
''' Test tokenizer. '''

sample_string = 'Wie du weisst, sage ich immer zu mir, ist immer alles und alles immer in deinem Kopf.'

encoded_string = tokenizer_src.encode(sample_string)
print(encoded_string)

decoded_string = tokenizer_src.decode(encoded_string)
print(decoded_string)

[521, 2631, 1515, 276, 1, 3093, 33, 70, 6, 1979, 1, 8, 70, 130, 4, 130, 70, 5, 213, 854, 110, 2800, 3822]
Wie du weisst, sage ich immer zu mir, ist immer alles und alles immer in deinem Kopf.


In [6]:
''' Print subwords. '''

for ts in encoded_string:
    print('{:5} ----> {}'.format(ts, tokenizer_src.decode([ts])))

  521 ----> Wie 
 2631 ----> du 
 1515 ----> weis
  276 ----> st
    1 ----> , 
 3093 ----> sage 
   33 ----> ich 
   70 ----> immer 
    6 ----> zu 
 1979 ----> mir
    1 ----> , 
    8 ----> ist 
   70 ----> immer 
  130 ----> alles 
    4 ----> und 
  130 ----> alles 
   70 ----> immer 
    5 ----> in 
  213 ----> de
  854 ----> ine
  110 ----> m 
 2800 ----> Kopf
 3822 ----> .


## Prepare dataset

In [7]:
BUFFER_SIZE = 20000
BATCH_SIZE = 64

In [8]:
# Add start and end token to input and target
def encode(src, trg):
    src = [tokenizer_src.vocab_size] + tokenizer_src.encode(
      src.numpy()) + [tokenizer_src.vocab_size+1]

    trg = [tokenizer_trg.vocab_size] + tokenizer_trg.encode(
      trg.numpy()) + [tokenizer_trg.vocab_size+1]
  
    return src, trg

In [9]:
# Wrap encoding in a tf.py_function to access it through map
def tf_encode(src, trg):
    result_src, result_trg = tf.py_function(encode, [src, trg], [tf.int64, tf.int64])
    result_src.set_shape([None])
    result_trg.set_shape([None])

    return result_src, result_trg

In [10]:
# Reduce to samples with MAX_LENGTH tokens.
MAX_LENGTH = 40

def filter_max_length(x, y, max_length=MAX_LENGTH):
      return tf.logical_and(tf.size(x) <= max_length,
                            tf.size(y) <= max_length)

In [11]:
train_dataset = train_examples.map(tf_encode)
train_dataset = train_dataset.filter(filter_max_length)

# cache the dataset to memory to get a speedup while reading from it.
train_dataset = train_dataset.cache()
train_dataset = train_dataset.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE)
train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)


val_dataset = val_examples.map(tf_encode)
val_dataset = val_dataset.padded_batch(BATCH_SIZE)

In [12]:
src_batch, trg_batch = next(iter(val_dataset))
src_batch, trg_batch

(<tf.Tensor: shape=(64, 67), dtype=int64, numpy=
 array([[4032,  389, 1457, ...,    0,    0,    0],
        [4032,  521,   26, ...,    0,    0,    0],
        [4032,  113, 2721, ...,    0,    0,    0],
        ...,
        [4032,   28, 1245, ...,    0,    0,    0],
        [4032, 3692, 3808, ...,    0,    0,    0],
        [4032,   79,    3, ...,    0,    0,    0]])>,
 <tf.Tensor: shape=(64, 125), dtype=int64, numpy=
 array([[4033,  388,  505, ...,    0,    0,    0],
        [4033, 2873,    8, ...,    0,    0,    0],
        [4033,  360,   48, ...,    0,    0,    0],
        ...,
        [4033, 1209,   96, ...,    0,    0,    0],
        [4033, 3749, 2352, ...,    0,    0,    0],
        [4033, 1042,    3, ...,    0,    0,    0]])>)

## Positional encoding

[Notebook about positional encoding](https://github.com/tensorflow/examples/blob/master/community/en/position_encoding.ipynb)

In [13]:
def get_angles(pos, i, d_model):
    angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model))
    return pos * angle_rates

In [14]:
def positional_encoding(position, d_model):
    angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                            np.arange(d_model)[np.newaxis, :],
                            d_model)
    
    # apply sin to even indices in the array; 2i
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
    
    # apply cos to odd indices in the array; 2i+1
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
    
    pos_encoding = angle_rads[np.newaxis, ...]
    
    return tf.cast(pos_encoding, dtype=tf.float32)

## Masking

In [15]:
def create_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)

    # add extra dimensions to add the padding
    # to the attention logits.
    return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

In [16]:
'''This look-ahead mask masks future tokens in a sequence.'''
def create_look_ahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask  # (seq_len, seq_len)

## Scaled dot product attention

In [17]:
def scaled_dot_product_attention(q, k, v, mask):
    """Calculate the attention weights.
    q, k, v must have matching leading dimensions.
    k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
    The mask has different shapes depending on its type(padding or look ahead) 
    but it must be broadcastable for addition.
    
    Args:
    q: query shape == (..., seq_len_q, depth)
    k: key shape == (..., seq_len_k, depth)
    v: value shape == (..., seq_len_v, depth_v)
    mask: Float tensor with shape broadcastable 
    to (..., seq_len_q, seq_len_k). Defaults to None.
    
    Returns:
    output, attention_weights
    """
    
    matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)
    
    # scale matmul_qk
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
    
    # add the mask to the scaled tensor.
    if mask is not None:
        scaled_attention_logits += (mask * -1e9)  
        
    # softmax is normalized on the last axis (seq_len_k) so that the scores
    # add up to 1.
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)
    
    output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)
    
    return output, attention_weights


## Multi-head attention

In [18]:
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        
        assert d_model % self.num_heads == 0
        
        self.depth = d_model // self.num_heads
        
        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)
        
        self.dense = tf.keras.layers.Dense(d_model)
        
    def split_heads(self, x, batch_size):
        """Split the last dimension into (num_heads, depth).
        Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)
        """
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])
        
    def call(self, v, k, q, mask):
        batch_size = tf.shape(q)[0]
        
        q = self.wq(q)  # (batch_size, seq_len, d_model)
        k = self.wk(k)  # (batch_size, seq_len, d_model)
        v = self.wv(v)  # (batch_size, seq_len, d_model)
        
        q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
        k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
        v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)
        
        # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
        # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
        scaled_attention, attention_weights = scaled_dot_product_attention(q, k, v, mask)
        
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)
        
        concat_attention = tf.reshape(scaled_attention, 
                                      (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)
        
        output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)
        
        return output, attention_weights


## Point wise feed forward network

This network consists of two fully-connected layers with a ReLU activation in between.

In [19]:
def point_wise_feed_forward_network(d_model, dff):
    return tf.keras.Sequential([
        tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
        tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
    ])

## Encoder and decoder

In [20]:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(EncoderLayer, self).__init__()
        
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)
        
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        
    def call(self, x, training, mask):
        
        attn_output, _ = self.mha(x, x, x, mask)  # (batch_size, input_seq_len, d_model)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)
        
        ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)  # (batch_size, input_seq_len, d_model)
        
        return out2

In [21]:
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(DecoderLayer, self).__init__()
        
        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)
        
        self.ffn = point_wise_feed_forward_network(d_model, dff)
        
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)
        
        
    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
    # enc_output.shape == (batch_size, input_seq_len, d_model)
    
        attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)  # (batch_size, target_seq_len, d_model)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(attn1 + x)
        
        attn2, attn_weights_block2 = self.mha2(enc_output, enc_output, out1, padding_mask)  # (batch_size, target_seq_len, d_model)
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(attn2 + out1)  # (batch_size, target_seq_len, d_model)
        
        ffn_output = self.ffn(out2)  # (batch_size, target_seq_len, d_model)
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layernorm3(ffn_output + out2)  # (batch_size, target_seq_len, d_model)
        
        return out3, attn_weights_block1, attn_weights_block2


## Encoder

Consists of:

1. Input Embedding
2. Positional Encoding
3. N encoder layers

In [22]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
                 maximum_position_encoding, rate=0.1):
        super(Encoder, self).__init__()
        
        self.d_model = d_model
        self.num_layers = num_layers
        
        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, self.d_model)
        
        
        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) 
                           for _ in range(num_layers)]
        
        self.dropout = tf.keras.layers.Dropout(rate)
    
    def call(self, x, training, mask):
        
        seq_len = tf.shape(x)[1]
        
        # adding embedding and position encoding.
        x = self.embedding(x)  # (batch_size, input_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]
        
        x = self.dropout(x, training=training)
        
        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training, mask)
            
        return x  # (batch_size, input_seq_len, d_model)

## Decoder

Consists of:

1. Output Embedding
2. Positional Embedding
3. N decoder layers

In [23]:
class Decoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size,
                 maximum_position_encoding, rate=0.1):
        super(Decoder, self).__init__()
        
        self.d_model = d_model
        self.num_layers = num_layers
        
        self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)
        
        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate) 
                           for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)
    
    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
    
        seq_len = tf.shape(x)[1]
        attention_weights = {}
        
        x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]
        
        x = self.dropout(x, training=training)
        
        for i in range(self.num_layers):
            x, block1, block2 = self.dec_layers[i](x, enc_output, training,
                                                   look_ahead_mask, padding_mask)
            
            attention_weights['decoder_layer{}_block1'.format(i+1)] = block1
            attention_weights['decoder_layer{}_block2'.format(i+1)] = block2
        
        # x.shape == (batch_size, target_seq_len, d_model)
        return x, attention_weights

## Transformer

The Transformer consists of encoder, decoder and a final linear layer. 

In [24]:
class Transformer(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, 
                 target_vocab_size, pe_input, pe_target, rate=0.1):
        super(Transformer, self).__init__()
        
        self.encoder = Encoder(num_layers, d_model, num_heads, dff, 
                               input_vocab_size, pe_input, rate)
        
        self.decoder = Decoder(num_layers, d_model, num_heads, dff, 
                               target_vocab_size, pe_target, rate)
        
        self.final_layer = tf.keras.layers.Dense(target_vocab_size)
    
    def call(self, inp, tar, training, enc_padding_mask, 
             look_ahead_mask, dec_padding_mask):
    
        enc_output = self.encoder(inp, training, enc_padding_mask)  # (batch_size, inp_seq_len, d_model)
        
        # dec_output.shape == (batch_size, tar_seq_len, d_model)
        dec_output, attention_weights = self.decoder(
            tar, enc_output, training, look_ahead_mask, dec_padding_mask)
        
        final_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)
        
        return final_output, attention_weights

In [25]:
num_layers=2
d_model=128
dff=256
num_heads=4
EPOCHS=150

input_vocab_size = tokenizer_src.vocab_size + 2
target_vocab_size = tokenizer_trg.vocab_size + 2
dropout_rate = 0.1

## Optimizer

Adam with a custom learning rate scheduler (from the paper).

In [26]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        
        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)
        
        self.warmup_steps = warmup_steps
        
    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)
        
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [27]:
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, 
                                     epsilon=1e-9)

## Loss and metrics

In [28]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
  
    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

## Training and checkpoints

In [29]:
transformer = Transformer(num_layers, d_model, num_heads, dff,
                          input_vocab_size, target_vocab_size, 
                          pe_input=input_vocab_size, 
                          pe_target=target_vocab_size,
                          rate=dropout_rate)

In [30]:
def create_masks(inp, tar):
    # Encoder padding mask
    enc_padding_mask = create_padding_mask(inp)

    # Used in the 2nd attention block in the decoder.
    # This padding mask is used to mask the encoder outputs.
    dec_padding_mask = create_padding_mask(inp)

    # Used in the 1st attention block in the decoder.
    # It is used to pad and mask future tokens in the input received by 
    # the decoder.
    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    dec_target_padding_mask = create_padding_mask(tar)
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)

    return enc_padding_mask, combined_mask, dec_padding_mask

In [None]:
''' For training: remove old checkpoints. '''
!rm -r 'Seq2Seq/checkpoints/' # for training: remove existing checkpoints

In [32]:
checkpoint_path = "Seq2Seq/checkpoints/train"

ckpt = tf.train.Checkpoint(transformer=transformer,
                           optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# if a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print ('Latest checkpoint restored!!')

In [33]:
# The @tf.function trace-compiles train_step into a TF graph for faster
# execution. The function specializes to the precise shape of the argument
# tensors. To avoid re-tracing due to the variable sequence lengths or variable
# batch sizes (the last batch is smaller), use input_signature to specify
# more generic shapes.

train_step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.int64),
    tf.TensorSpec(shape=(None, None), dtype=tf.int64),
]

@tf.function(input_signature=train_step_signature)
def train_step(inp, tar):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]
    
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)
    
    with tf.GradientTape() as tape:
        predictions, _ = transformer(inp, tar_inp, 
                                     True, 
                                     enc_padding_mask, 
                                     combined_mask, 
                                     dec_padding_mask)
        loss = loss_function(tar_real, predictions)
        
    gradients = tape.gradient(loss, transformer.trainable_variables)    
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
    
    train_loss(loss)
    train_accuracy(tar_real, predictions)

In [34]:
for epoch in range(EPOCHS):
    start = time.time()
    
    train_loss.reset_states()
    train_accuracy.reset_states()
    
    # inp -> src, tar -> trg
    for (batch, (inp, tar)) in enumerate(train_dataset):
        train_step(inp, tar)
        
            
    if (epoch + 1) % 50 == 0:
        ckpt_save_path = ckpt_manager.save()
        print ('Saving checkpoint for epoch {} at {}'.format(epoch+1,
                                                             ckpt_save_path))
        
        print ('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1, 
                                                             train_loss.result(), 
                                                             train_accuracy.result()))

Saving checkpoint for epoch 50 at ./checkpoints/train/ckpt-1
Epoch 50 Loss 4.0500 Accuracy 0.1966
Saving checkpoint for epoch 100 at ./checkpoints/train/ckpt-2
Epoch 100 Loss 0.1977 Accuracy 0.5755
Saving checkpoint for epoch 150 at ./checkpoints/train/ckpt-3
Epoch 150 Loss 0.0432 Accuracy 0.5848


## Evaluate

In [35]:
def evaluate(inp_sentence):
    start_token = [tokenizer_src.vocab_size]
    end_token = [tokenizer_src.vocab_size + 1]
    
    inp_sentence = start_token + tokenizer_src.encode(inp_sentence) + end_token
    encoder_input = tf.expand_dims(inp_sentence, 0)
    
    decoder_input = [tokenizer_trg.vocab_size]
    output = tf.expand_dims(decoder_input, 0)
    
    for i in range(MAX_LENGTH):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            encoder_input, output)
        
        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions, attention_weights = transformer(encoder_input, 
                                                     output,
                                                     False,
                                                     enc_padding_mask,
                                                     combined_mask,
                                                     dec_padding_mask)
        
        # select the last word from the seq_len dimension
        predictions = predictions[: ,-1:, :]  # (batch_size, 1, vocab_size)
        
        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)
        
        # return the result if the predicted_id is equal to the end token
        if predicted_id == tokenizer_trg.vocab_size+1:
            return tf.squeeze(output, axis=0), attention_weights
            
        # concatentate the predicted_id to the output which is given to the decoder
        # as its input.
        output = tf.concat([output, predicted_id], axis=-1)
            
    return tf.squeeze(output, axis=0), attention_weights

In [36]:
def translate(sentence, target='-', plot=''):
    result, attention_weights = evaluate(sentence)
  
    predicted_sentence = tokenizer_trg.decode([i for i in result if i < tokenizer_trg.vocab_size])  

    return predicted_sentence

In [60]:
inp = "Kunst lebt von den Fehlern der Welt."
translate(inp)

'Ob sie uns lachen oder weinen macht, wir belachen oder beweinen Abschaffenswertes.'

## Walking through latent space (word embeddings)

In [61]:
# ''' Encode a sentence to retrieve ids. '''
print(tokenizer_src.encode('Aufwertung des Selbstmodells durch dessen Formalisierung.'))

[309, 640, 41, 18, 3581, 1388, 3180, 31, 358, 2834, 3822]


In [63]:
print(tokenizer_src.decode([2834]))

Formalisierung


In [64]:
tok_id = 2834

In [66]:
''' Retrieve learned embeddings. '''
# it is a matrix of shape (1, vocab_size, embedding-dimension)
e = transformer.encoder.embedding
weights = e.get_weights().copy()

In [68]:
''' Weights of tok_id. '''
weights_backup = weights[0][tok_id].copy()
# used as backup for restoring
print(weights_backup)

[ 0.04585315 -0.05057897  0.0362612   0.04257246 -0.07136862  0.05591803
  0.03278057 -0.06143778  0.00618251 -0.03838155 -0.10599931  0.10329395
 -0.05787181  0.08018018  0.00537649  0.08309967  0.106056    0.07382646
 -0.0269549  -0.05182509 -0.03455293 -0.00505163 -0.00166758  0.02991542
  0.02471358 -0.06029985 -0.04304826  0.06371844  0.01733958  0.0279441
 -0.02159692 -0.08514401 -0.05884919  0.03114592  0.07437395 -0.04945388
 -0.10289568  0.0309912  -0.0358796   0.10355988 -0.00079994 -0.02894607
 -0.04770349 -0.07436524 -0.02258739  0.01571286  0.10603911 -0.09613569
 -0.13266215 -0.10592959 -0.0160829  -0.01319074  0.0384906  -0.0509856
  0.01403771  0.01202052  0.02398745  0.00178476  0.05008786 -0.07558156
 -0.02897525  0.00308615 -0.01544227  0.0696001  -0.00254706  0.0116569
  0.0878296   0.02139591 -0.01359293  0.06344606  0.02299168 -0.05707199
  0.04110349  0.05555265  0.05628436 -0.02213254  0.02659026 -0.01314045
 -0.03082482 -0.05694765 -0.01538857  0.05024926 -0.02

In [43]:
''' Restore weights. '''
# weights[0][tok_id] = weights_backup

In [69]:
''' Iterate through the vector. Set each time one value of the vector to 0. '''
translations = []
for i in range(128):
    # restore weights
    weights[0][tok_id] = weights_backup
    # change i
    weights[0][tok_id][i] = 0.0
    # set weights
    e.set_weights(weights)
    translations.append(translate('Aufwertung des Selbstmodells durch dessen Formalisierung.'))    

In [70]:
''' Reduce to unique sentences. '''
translations_set = set(translations)

In [71]:
for t in translations_set:
    print(t)

Es blieb jedoch nicht mehr nur bei der Vorstellung ermöglicht eine Maschine, die Neverarbeitung von sich selbst und Maschine, sondern eine Maschine.
Es findet eine Nervennetzen, sich nicht mehr repräsentiert sich präferiert und ihm bewusst von sich nicht fest, sondern eine Maschine.
Es blieb jedoch nicht mehr nur bei der Vorstellung ermöglicht eine Maschine, löst eine Spiegel diese selbst hat, sondern eine Maschine.
Es blieb jedoch nicht mehr nur bei der Vorstellung ermöglicht eine Maschine, die Newtopologische Kunst, braucht, sondern eine Spiegel keines zur Kunst ist.
Es gibt es nur eine Person von unserem Gehirn kontinuierlichen Spektrums an von sich selbst produziert.
Es blieb jedoch nicht mehr nur bei der Vorstellung ermöglicht eine Maschine, löst eine Spiegel diese selbst hat, d.
Es gibt es nur eine Person von sich selbst zuteilen, d.
Es gibt es nur ein Werk nur zusammen mit der Kunst oder absprechen.
Es findet eine Nervennetzen, sich nicht direkt mit einer Person von sich selbst 