# Import and Setting

### import

In [1]:
import sys; sys.path.append('../')
import os; os.environ['TF_CPP_MIN_LOG_LEVEL']='3'
import random

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Embedding, LayerNormalization, Dropout

from Preprocess import Preprocess
from Transformer.Transformer import Transformer

# Functions

In [2]:
def set_gpu(): # gpu setting
    gpus = tf.config.list_physical_devices(device_type="GPU")
    tf.config.experimental.set_memory_growth(gpus[0], True)
    tf.config.experimental.set_memory_growth(gpus[1], True)

def set_random_state(): # random_state
    SEED = 1
    os.environ["PYTHONASHSEED"]= str(SEED)
    os.environ["TF_DETERMINISTIC_OPS"]= "1"
    tf.random.set_seed(SEED)
    np.random.seed(SEED)
    random.seed(SEED)

def get_hyperparameter(preprocessed, BATCH_SIZE):
    BATCH_SIZE = BATCH_SIZE
    ENC_VOCAB_SIZE = preprocessed.fr_tokenizer.GetPieceSize()
    DEC_VOCAB_SIZE = preprocessed.en_tokenizer.GetPieceSize()
    MAX_SEQ_LEN = preprocessed.get_shape("fr_train")[-1]

    D_MODEL = 128 # 512
    RATE = 0.1
    NUM_LAYERS = 4 # 6
    NUM_HEADS = 4 # 8
    EPSILON = 1e-6
    D_PFF = 512 # 2048

    # Original from the papaer
    # D_MODEL = 512 # 128
    # RATE = 0.1
    # NUM_LAYERS = 6 # 4
    # NUM_HEADS = 8 # 4
    # EPSILON = 1e-6
    # D_PFF = 2048 # 512

    return {"batch_size":BATCH_SIZE, "enc_vocab_size":ENC_VOCAB_SIZE, "dec_vocab_size":DEC_VOCAB_SIZE, "max_seq_len":MAX_SEQ_LEN, "d_model":D_MODEL, "rate":RATE, "num_layers":NUM_LAYERS, "num_heads":NUM_HEADS, "epsilon":EPSILON, "d_pff":D_PFF}

# Main

### parameter setting and model preparation

In [156]:
# Initialize
BATCH_SIZE = 64
set_gpu(); set_random_state()

# Preprocess
data = pd.read_csv("./eng_-french.csv")
train, test = train_test_split(data, test_size=0.2, random_state=0)
valid, test = train_test_split(train, test_size=0.1, random_state=0)
preprocessed = Preprocess(BATCH_SIZE, train, valid, test, training=False)

# Hyperparameter Setting
params = get_hyperparameter(preprocessed, BATCH_SIZE)

In [157]:
# Any length of sequence can be input
enc_input = Input(shape=(None,), name="enc_input")
dec_input = Input(shape=(None,), name="dec_input")

transformer = Transformer(**params, name="dec_output")
dec_output = transformer(enc_input, dec_input, training=True)

model = tf.keras.models.Model(inputs=(enc_input, dec_input), outputs=dec_output)

### loss Function & CustomLearningRateScheduler
$learning\_rate = d_{model}^{-0.5} \times min(step\_num^{-0.5}, step\_num \times warmup\_steps^{-1.5})$

In [5]:
def loss_function(y, pred):
    mask = 1 - tf.cast(tf.equal(0., y), tf.float32) # Zero for padding, one for else

    # SparseCategoricalCrossentropy: label can stay int (no need to be one-hot encoded form)
    # from_logits: automatically takes softmax to predicted value,
    # reduction: can be none or mean or so
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none")(y, pred)
    loss *= mask # losses of paddings are zero now

    return tf.reduce_sum(loss) / tf.reduce_sum(mask) # Calculate mean only for valid losses

class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, d_model, warmup_steps=4000):
    super().__init__()
    self.d_model = tf.cast(d_model, tf.float32)
    self.warmup_steps = warmup_steps
    
  def __call__(self, step):
    arg1 = tf.pow(step, tf.cast(-0.5, tf.float32))
    arg2 = step * tf.pow(self.warmup_steps, tf.cast(-1.5, tf.float32))
    
    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

learning_rate = CustomSchedule(params["d_model"])
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
total_batch_size = preprocessed.get_shape("fr_train")[0]

### train

In [6]:
@tf.function
def train_step(input, target):
    with tf.GradientTape() as tape:
        pred = model(input, target)
        loss = loss_function(target, pred)
    
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    pred = tf.argmax(pred[-1], axis=-1)
    
    return loss, target, pred

for e in range(20):
    for n_batch, ((train_input, train_target),(valid_input, valid_target)) in enumerate(zip(preprocessed.train_dataset, preprocessed.valid_dataset)):
        enc_input_train, dec_input_train, dec_output_train = train_input["enc_input"], train_input["dec_input"], train_target["dec_output"]
        enc_input_valid, dec_input_valid, dec_output_valid = valid_input["enc_input"], valid_input["dec_input"], valid_target["dec_output"]
        
        loss, target, pred = train_step((enc_input_train, dec_input_train), dec_output_train)
        pred_valid = tf.argmax(model((enc_input_valid, dec_input_valid), dec_output_valid)[-1], axis=-1)

        if n_batch % 500 == 0:
            input_train_cd = preprocessed.fr_tokenizer.IdToPiece([int(i) for i in enc_input_train[-1].numpy()])
            target_train_cd = preprocessed.en_tokenizer.IdToPiece([int(i) for i in dec_input_train[-1].numpy()])
            pred_train_cd = preprocessed.en_tokenizer.IdToPiece([int(i) for i in pred.numpy()])

            input_valid_str = preprocessed.fr_tokenizer.DecodeIds([int(i) for i in enc_input_valid[-1].numpy()])
            target_valid_str = preprocessed.en_tokenizer.DecodeIds([int(i) for i in dec_input_valid[-1].numpy()])
            pred_valid_str = preprocessed.en_tokenizer.DecodeIds([int(i) for i in pred_valid.numpy()])

            print(f"{e}: {n_batch+1}/{total_batch_size}: {loss}")
            print("input_train:", input_train_cd)
            print("target_train:", target_train_cd)
            print("pred_train:", pred_train_cd)
            print("-"*50)
            print("input_valid:", input_valid_str)
            print("target_valid:", target_valid_str)
            print("pred_valid:", pred_valid_str)
            print("_"*100)

0: 1/140496: 9.016409873962402
input_train: ['<SOS>', '▁C', "'", 'est', '▁Tom', '▁qui', '▁a', '▁dit', '▁qu', "'", 'il', '▁était', '▁trop', '▁occupé', '▁pour', '▁aider', ',', '▁pas', '▁moi', '.', '<EOS>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
target_train: ['<SOS>', '▁Tom', '▁is', '▁the', '▁one', '▁who', '▁said', '▁he', '▁was', '▁too', '▁busy', '▁to', '▁help', ',', '▁not', '▁me', '.', '<EOS>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<P

### evaluate

In [155]:
def evaluate(model, preprocessed, s):
    enc_input = tf.constant([[preprocessed.fr_tokenizer.bos_id()] + preprocessed.fr_tokenizer.EncodeAsIds(s) + [preprocessed.fr_tokenizer.eos_id()]], tf.float32)
    dec_input = tf.constant([[preprocessed.fr_tokenizer.bos_id()]], tf.float32)

    for i in range(100):
        pred = tf.cast(tf.argmax(model((enc_input, dec_input)), axis=-1), tf.float32)
        dec_input = tf.concat((dec_input, pred[:, -1:]), axis=-1)
        
        output = dec_input.numpy().reshape(-1)
        output = [int(i) for i in output]
        output = preprocessed.en_tokenizer.IdToPiece(output)
        print(output)

        if int(pred.numpy().reshape(-1)[-1]) == preprocessed.en_tokenizer.eos_id():
            return preprocessed.en_tokenizer.DecodePieces(output)

# test individual strings
idx = 10 # 2, 4, 10
s = test["French words/sentences"].iloc[idx]; print(f"French: {s}"); print("_"*100)
answer = test["English words/sentences"].iloc[idx]
pred = evaluate(model, preprocessed, s); print("_"*100)
print(f"English taught: {answer}\n→ Predicted: {pred}")

French: Il ne put que regarder.
____________________________________________________________________________________________________
['<SOS>', '▁He']
['<SOS>', '▁He', '▁could']
['<SOS>', '▁He', '▁could', '▁only']
['<SOS>', '▁He', '▁could', '▁only', '▁look']
['<SOS>', '▁He', '▁could', '▁only', '▁look', '.']
['<SOS>', '▁He', '▁could', '▁only', '▁look', '.', '<EOS>']
____________________________________________________________________________________________________
English taught: He could do nothing but watch.
→ Predicted: He could only look.
