<a href="https://colab.research.google.com/github/dsanroman96/Machine-Learning-Projects/blob/main/Transformer_Network.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
import numpy as np
import unicodedata
import re
import time
from tensorflow.keras.layers import Dense, Embedding, BatchNormalization, LayerNormalization
from tensorflow.keras.losses import SparseCategoricalCrossentropy
import IPython

## Data Preprocessing

In [None]:
# Toy parallel corpus: each entry is an (English sentence, French translation) pair.
# The pairs are split into two parallel lists and normalized further down in this cell.
raw_data = (
    ('What a ridiculous concept!', 'Quel concept ridicule !'),
    ('Your idea is not entirely crazy.', "Votre idée n'est pas complètement folle."),
    ("A man's worth lies in what he is.", "La valeur d'un homme réside dans ce qu'il est."),
    ('What he did is very wrong.', "Ce qu'il a fait est très mal."),
    ("All three of you need to do that.", "Vous avez besoin de faire cela, tous les trois."),
    ("Are you giving me another chance?", "Me donnez-vous une autre chance ?"),
    ("Both Tom and Mary work as models.", "Tom et Mary travaillent tous les deux comme mannequins."),
    ("Can I have a few minutes, please?", "Puis-je avoir quelques minutes, je vous prie ?"),
    ("Could you close the door, please?", "Pourriez-vous fermer la porte, s'il vous plaît ?"),
    ("Did you plant pumpkins this year?", "Cette année, avez-vous planté des citrouilles ?"),
    ("Do you ever study in the library?", "Est-ce que vous étudiez à la bibliothèque des fois ?"),
    ("Don't be deceived by appearances.", "Ne vous laissez pas abuser par les apparences."),
    ("Excuse me. Can you speak English?", "Je vous prie de m'excuser ! Savez-vous parler anglais ?"),
    ("Few people know the true meaning.", "Peu de gens savent ce que cela veut réellement dire."),
    ("Germany produced many scientists.", "L'Allemagne a produit beaucoup de scientifiques."),
    ("Guess whose birthday it is today.", "Devine de qui c'est l'anniversaire, aujourd'hui !"),
    ("He acted like he owned the place.", "Il s'est comporté comme s'il possédait l'endroit."),
    ("Honesty will pay in the long run.", "L'honnêteté paye à la longue."),
    ("How do we know this isn't a trap?", "Comment savez-vous qu'il ne s'agit pas d'un piège ?"),
    ("I can't believe you're giving up.", "Je n'arrive pas à croire que vous abandonniez."),
)

def unicode_to_ascii(s):
    """Return ``s`` with combining marks (accents, cedillas, ...) removed.

    The string is first decomposed with NFD normalization, which splits each
    accented letter into a base character followed by its combining marks;
    every character whose Unicode category is ``Mn`` is then dropped.
    Characters that have no such decomposition are returned unchanged.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)


def normalize_string(s):
    """Clean a sentence so it can be split into tokens on single spaces.

    Steps, applied in order:
      1. strip accents via ``unicode_to_ascii``;
      2. put a space in front of every ``!``, ``.`` and ``?`` so punctuation
         becomes its own token;
      3. replace every run of characters other than ASCII letters and
         ``.!?`` (digits, commas, apostrophes, hyphens, spaces, ...) with one space;
      4. collapse any remaining whitespace runs into one space.

    Leading/trailing spaces are not stripped and case is left unchanged.
    """
    substitutions = (
        (r'([!.?])', r' \1'),
        (r'[^a-zA-Z.!?]+', r' '),
        (r'\s+', r' '),
    )
    text = unicode_to_ascii(s)
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text


# Split the (English, French) pairs into two parallel lists.
raw_data_en, raw_data_fr = (list(column) for column in zip(*raw_data))
raw_data_en = [normalize_string(data) for data in raw_data_en]
# The decoder input is the French sentence prefixed with '<start>'; the
# expected output is the same sentence suffixed with '<end>', i.e. the input
# shifted left by one token.
raw_data_fr_in = ['<start> ' + normalize_string(data) for data in raw_data_fr]
raw_data_fr_out = [normalize_string(data) + ' <end>' for data in raw_data_fr]


# filters='' keeps the punctuation tokens and the '<start>'/'<end>' markers,
# which the tokenizer's default filter set would otherwise strip.
en_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
en_tokenizer.fit_on_texts(raw_data_en)
data_en = en_tokenizer.texts_to_sequences(raw_data_en)
data_en = tf.keras.preprocessing.sequence.pad_sequences(data_en,
                                                        padding='post')

# One French vocabulary is fitted on both variants so that it contains
# '<start>' as well as '<end>'.
fr_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
fr_tokenizer.fit_on_texts(raw_data_fr_in)
fr_tokenizer.fit_on_texts(raw_data_fr_out)
data_fr_in = fr_tokenizer.texts_to_sequences(raw_data_fr_in)
data_fr_in = tf.keras.preprocessing.sequence.pad_sequences(data_fr_in,
                                                           padding='post')

data_fr_out = fr_tokenizer.texts_to_sequences(raw_data_fr_out)
data_fr_out = tf.keras.preprocessing.sequence.pad_sequences(data_fr_out,
                                                            padding='post')


BATCH_SIZE = 1
dataset = tf.data.Dataset.from_tensor_slices(
    (data_en, data_fr_in, data_fr_out))
# Size the shuffle buffer from the data instead of a hard-coded 20, so the
# whole dataset keeps being shuffled if sentences are added to raw_data.
dataset = dataset.shuffle(len(data_en)).batch(BATCH_SIZE)
dataset

<BatchDataset shapes: ((None, 10), (None, 14), (None, 14)), types: (tf.int32, tf.int32, tf.int32)>

## Positional Embedding

In [None]:
def positional_embedding(max_len, model_size):
  """Build the sinusoidal positional-encoding table.

  For position ``pos`` and feature index ``i``:
    * even ``i``: ``sin(pos / 10000 ** (i / model_size))``
    * odd ``i``:  ``cos(pos / 10000 ** ((i - 1) / model_size))``

  Args:
    max_len: number of positions (rows) to generate.
    model_size: number of features (columns) per position.

  Returns:
    A ``tf.float32`` constant of shape ``(max_len, model_size)``. With
    ``max_len == 0`` the result is an empty ``(0, model_size)`` tensor.
  """
  positions = np.arange(max_len, dtype=np.float64)[:, np.newaxis]
  dims = np.arange(model_size)
  # Each sin/cos pair shares the exponent of its even index, so an odd column
  # i uses (i - 1) / model_size.
  pair_exponent = (dims - dims % 2) / model_size
  angles = positions / np.power(10000.0, pair_exponent)
  # Broadcasting the column parity over all rows selects sin for even columns
  # and cos for odd columns.
  embedding = np.where(dims % 2 == 0, np.sin(angles), np.cos(angles))
  return tf.constant(embedding, dtype=tf.float32)

## Multi-Head Attention

In [None]:
class MultiHeadAttention(tf.keras.Model):
  """Scaled dot-product attention computed in parallel over several heads.

  Queries are projected from ``target_seq``; keys and values are projected
  from ``input_seq``. Each projection has ``model_size`` features, which are
  split into ``num_heads`` heads of ``model_size // num_heads`` features.
  """

  def __init__(self, model_size, num_heads):
    """Create the projection layers.

    Args:
      model_size: feature size of the query/key/value/output projections.
      num_heads: number of attention heads; must divide ``model_size``.

    Raises:
      ValueError: if ``model_size`` is not a multiple of ``num_heads`` (the
        per-head reshape in ``create_head`` would not line up otherwise).
    """
    super(MultiHeadAttention, self).__init__()
    if model_size % num_heads != 0:
      raise ValueError(
          'model_size ({}) must be divisible by num_heads ({})'.format(
              model_size, num_heads))
    self.num_heads = num_heads
    self.heads_size = model_size // num_heads
    self.querys = Dense(model_size)
    self.keys = Dense(model_size)
    self.values = Dense(model_size)
    self.output_layer = Dense(model_size)

  def create_head(self, head, batch_size):
    """Split the last axis into heads.

    Reshapes ``head`` to ``(batch_size, -1, num_heads, heads_size)`` and
    swaps axes 1 and 2, giving ``(batch_size, num_heads, length, heads_size)``.
    """
    head = tf.reshape(head, [batch_size, -1, self.num_heads, self.heads_size])
    head = tf.transpose(head, [0, 2, 1, 3])
    return head

  def call(self, target_seq, input_seq, mask=None):
    """Attend from ``target_seq`` (queries) to ``input_seq`` (keys/values).

    Args:
      target_seq: tensor the queries are computed from.
      input_seq: tensor the keys and values are computed from.
      mask: optional tensor broadcastable to the score tensor
        ``(batch, num_heads, target_len, input_len)``; 1 marks positions that
        may be attended to, 0 marks positions to block.

    Returns:
      Tensor whose last axis has ``heads_size * num_heads`` features, one row
      per query position.
    """
    # The batch size must come from an input: `q` does not exist yet at this
    # point. tf.shape also works when the static batch dimension is unknown.
    batch_size = tf.shape(target_seq)[0]
    q = self.create_head(self.querys(target_seq), batch_size)
    k = self.create_head(self.keys(input_seq), batch_size)
    v = self.create_head(self.values(input_seq), batch_size)

    score = tf.matmul(q, k, transpose_b=True)
    score /= tf.math.sqrt(tf.dtypes.cast(self.heads_size, dtype=tf.float32))

    if mask is not None:
      # Additive masking: blocked positions are pushed to about -1e9 so the
      # softmax gives them ~0 weight. Unlike "multiply, then replace zeros",
      # this never masks a permitted position whose score happens to be 0.
      score += (1.0 - tf.cast(mask, score.dtype)) * -1e9

    alignment = tf.nn.softmax(score, axis=-1)
    context = tf.matmul(alignment, v)
    # Undo create_head: move the head axis back next to the features and merge.
    context = tf.transpose(context, [0, 2, 1, 3])
    context = tf.reshape(context, [batch_size, -1, self.heads_size * self.num_heads])

    heads = self.output_layer(context)

    return heads

## Encoder

In [None]:
class Encoder(tf.keras.Model):
  """Stack of ``num_layers`` encoder layers over an embedded token sequence.

  Each layer is: multi-head self-attention -> residual add -> LayerNorm,
  followed by a two-layer feed-forward block (ReLU, ``4 * model_size`` hidden
  units) -> residual add -> LayerNorm.
  """

  def __init__(self, vocab_size, model_size, num_layers, num_heads, pes):
    """Create the embedding and the per-layer sub-modules.

    Args:
      vocab_size: number of rows in the token embedding table.
      model_size: feature size used throughout the encoder.
      num_layers: number of stacked encoder layers.
      num_heads: number of heads in every attention module.
      pes: positional-encoding table; its first ``seq_len`` rows are added to
        the embeddings, so it needs at least as many rows as the longest
        input sequence.
    """
    super(Encoder, self).__init__()
    self.model_size = model_size
    self.num_layers = num_layers
    self.num_heads = num_heads
    self.pes = pes

    self.embedding = Embedding(vocab_size, model_size)

    self.attention = [MultiHeadAttention(model_size, num_heads) for _ in range(num_layers)]
    self.attention_norm = [LayerNormalization() for _ in range(num_layers)]

    self.dense_1 = [Dense(model_size * 4, activation="relu") for _ in range(num_layers)]
    self.dense_2 = [Dense(model_size) for _ in range(num_layers)]
    self.dense_norm = [LayerNormalization() for _ in range(num_layers)]

  def call(self, sequence, padding_mask=None):
    """Encode ``sequence``.

    Args:
      sequence: integer token ids; axis 1 is treated as the sequence axis.
      padding_mask: optional mask forwarded to every self-attention module
        (1 = attend, 0 = block); ``None`` disables masking.

    Returns:
      The output of the last encoder layer, or the position-encoded
      embeddings if ``num_layers`` is 0.
    """
    embed = self.embedding(sequence)
    embed += self.pes[:sequence.shape[1], :]

    layer_input = embed

    for i in range(self.num_layers):
      # Forward the caller's padding mask so padded source positions are not
      # attended to (previously the argument was accepted but never used).
      attended = self.attention[i](layer_input, layer_input, mask=padding_mask)
      attended = layer_input + attended
      attended = self.attention_norm[i](attended)

      feed_forward = self.dense_1[i](attended)
      feed_forward = self.dense_2[i](feed_forward)
      feed_forward = feed_forward + attended
      feed_forward = self.dense_norm[i](feed_forward)

      layer_input = feed_forward

    # `layer_input` holds the last layer's output and stays defined even
    # when there are no layers.
    return layer_input


## Decoder

In [None]:
class Decoder(tf.keras.Model):
  """Stack of ``num_layers`` decoder layers followed by a vocabulary projection.

  Each layer is: masked multi-head self-attention -> residual -> LayerNorm,
  multi-head attention over the encoder output -> residual -> LayerNorm, and
  a two-layer feed-forward block (ReLU, ``4 * model_size`` hidden units) ->
  residual -> LayerNorm.
  """

  def __init__(self, vocab_size, model_size, num_layers, num_heads, pes):
    """Create the embedding, the per-layer sub-modules and the output layer.

    Args:
      vocab_size: size of the target vocabulary; used for both the embedding
        table and the width of the output logits.
      model_size: feature size used throughout the decoder.
      num_layers: number of stacked decoder layers.
      num_heads: number of heads in every attention module.
      pes: positional-encoding table; its first ``seq_len`` rows are added to
        the embeddings, so it needs at least as many rows as the longest
        decoder input.
    """
    super(Decoder, self).__init__()
    self.model_size = model_size
    self.num_layers = num_layers
    self.num_heads = num_heads

    self.pes = pes
    self.embedding = Embedding(vocab_size, model_size)

    self.attention_1 = [MultiHeadAttention(model_size, num_heads) for _ in range(num_layers)]
    self.attention_norm_1 = [LayerNormalization() for _ in range(num_layers)]

    self.attention_2 = [MultiHeadAttention(model_size, num_heads) for _ in range(num_layers)]
    self.attention_norm_2 = [LayerNormalization() for _ in range(num_layers)]

    self.dense_1 = [Dense(model_size * 4, activation="relu") for _ in range(num_layers)]
    self.dense_2 = [Dense(model_size) for _ in range(num_layers)]
    self.dense_norm = [LayerNormalization() for _ in range(num_layers)]

    self.dense_out = Dense(vocab_size)

  def call(self, sequence, encoder_output, padding_mask=None):
    """Compute next-token logits for every position of ``sequence``.

    Args:
      sequence: integer target token ids; axis 1 is treated as the sequence axis.
      encoder_output: encoder states attended to by the second attention
        module of every layer.
      padding_mask: optional mask applied to the attention over
        ``encoder_output`` (1 = attend, 0 = block).

    Returns:
      Logits whose last axis has ``vocab_size`` entries.
    """
    seq_len = sequence.shape[1]
    embed = self.embedding(sequence)
    embed += self.pes[:seq_len, :]
    layer_input = embed

    # Lower-triangular matrix of ones: position t may only attend to
    # positions <= t. It depends only on the sequence length, so it is built
    # once rather than once per layer.
    look_left_only_mask = tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)

    for i in range(self.num_layers):
      self_attended = self.attention_1[i](layer_input, layer_input, look_left_only_mask)
      self_attended = self_attended + layer_input
      self_attended = self.attention_norm_1[i](self_attended)

      cross_attended = self.attention_2[i](self_attended, encoder_output, padding_mask)
      cross_attended = cross_attended + self_attended
      cross_attended = self.attention_norm_2[i](cross_attended)

      feed_forward = self.dense_1[i](cross_attended)
      feed_forward = self.dense_2[i](feed_forward)
      feed_forward = feed_forward + cross_attended
      feed_forward = self.dense_norm[i](feed_forward)

      layer_input = feed_forward

    # `layer_input` holds the last layer's output and stays defined even
    # when there are no layers.
    logits = self.dense_out(layer_input)
    return logits


## Loss Function & Optimizer

In [None]:
def my_loss_func(targets, logits):
    """Sparse categorical cross-entropy that gives zero weight to id-0 targets.

    Args:
        targets: integer target token ids; positions equal to 0 (the value
            this notebook treats as padding) are weighted 0, all others 1.
        logits: unnormalized scores (``from_logits=True``).

    Returns:
        The loss value produced by ``SparseCategoricalCrossentropy`` with its
        default reduction, using the 0/1 weights as ``sample_weight``.
    """
    is_real_token = tf.math.not_equal(targets, 0)
    token_weights = tf.cast(is_real_token, dtype=tf.int64)
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    return loss_fn(targets, logits, sample_weight=token_weights)

# Adam constructed with no arguments, i.e. with the library's default
# hyperparameters. train_step uses this single optimizer to update the
# encoder and decoder variables together.
optimizer = tf.keras.optimizers.Adam()

## Train Function

In [None]:
@tf.function
def train_step(original_seq, target_seq_in, target_seq_out):
  """Run one optimization step on a batch and return its loss.

  Args:
    original_seq: source token ids; id 0 is treated as padding.
    target_seq_in: decoder input token ids.
    target_seq_out: token ids the decoder is expected to predict.

  Returns:
    The loss computed by ``my_loss_func`` for this batch.
  """
  # 1.0 for real tokens, 0.0 for padding; two singleton axes are inserted
  # after the batch axis so the mask broadcasts against the attention scores.
  is_token = tf.cast(tf.math.not_equal(original_seq, 0), dtype=tf.float32)
  padding_mask = is_token[:, tf.newaxis, tf.newaxis, :]

  with tf.GradientTape() as tape:
    encoded = encoder(original_seq, padding_mask)
    logits = decoder(target_seq_in, encoded, padding_mask)
    loss = my_loss_func(target_seq_out, logits)

  # Encoder and decoder are optimized jointly.
  trainable = encoder.trainable_variables + decoder.trainable_variables
  grads = tape.gradient(loss, trainable)
  optimizer.apply_gradients(zip(grads, trainable))
  return loss
    

## Translate Function

In [None]:
def translate(original_text=None):
  """Greedily translate an English sentence to French and print both.

  Args:
    original_text: English sentence. When ``None``, a random sentence from
      ``raw_data_en`` is used.

  Raises:
    ValueError: if no word of the sentence is in the English vocabulary.
  """
  if original_text is None:
    original_text = raw_data_en[np.random.choice(len(raw_data_en))]
  print("ENGLISH: " + original_text)

  # Tokenize with the same normalization as the training sentences, so that
  # punctuation and accents in user-supplied text map to known tokens.
  # Sentences taken from raw_data_en are already normalized; normalizing them
  # again leaves them unchanged.
  original_seq = en_tokenizer.texts_to_sequences([normalize_string(original_text)])
  if not original_seq[0]:
    raise ValueError(
        'No known English tokens in: {!r}'.format(original_text))

  en_output = encoder(tf.constant(original_seq))
  de_input = tf.constant([[fr_tokenizer.word_index['<start>']]], dtype=tf.int64)

  # The decoder adds one positional-encoding row per input token, so its
  # input cannot be longer than the table; cap the output length accordingly
  # (this replaces the hard-coded 14).
  max_output_len = int(decoder.pes.shape[0])

  out_words = []
  while True:
    de_output = decoder(de_input, en_output)

    # Greedy decoding: keep the highest-scoring token at the last position.
    new_token = tf.expand_dims(tf.argmax(de_output, -1)[:, -1], axis=1)
    token_id = int(new_token.numpy()[0][0])
    # NOTE(review): index_word presumably has no entry for id 0 (padding);
    # fall back to a placeholder instead of raising a bare KeyError.
    out_words.append(fr_tokenizer.index_word.get(token_id, '<pad>'))

    de_input = tf.concat((de_input, new_token), axis=-1)

    if out_words[-1] == '<end>' or len(out_words) >= max_output_len:
      break

  print("FRENCH: " + ' '.join(out_words))
  print()



## Model Setup, Training & Testing

In [None]:
# Model hyperparameters.
NUM_HEADS = 2
NUM_LAYERS = 2
MODEL_SIZE = 64
# Length of the longer of the two padded sequence sets; the positional
# encoding table gets this many rows.
MAX_LEN = max(len(padded[0]) for padded in (data_en, data_fr_in))

# One more than the number of known words.
# NOTE(review): presumably the extra slot is for id 0 (padding) — confirm.
en_vocab_size = 1 + len(en_tokenizer.word_index)
fr_vocab_size = 1 + len(fr_tokenizer.word_index)
pes = positional_embedding(MAX_LEN, MODEL_SIZE)

# Encoder and decoder share the hyperparameters and the positional table and
# differ only in vocabulary size.
encoder, decoder = (
    model_cls(vocab_size, MODEL_SIZE, NUM_LAYERS, NUM_HEADS, pes)
    for model_cls, vocab_size in ((Encoder, en_vocab_size), (Decoder, fr_vocab_size))
)

In [None]:
NUM_EPOCHS = 100

start_time = time.time()
for epoch in range(NUM_EPOCHS):
  # Iterating the dataset directly visits every batch (same as take(-1)).
  for source_seq, target_seq_in, target_seq_out in dataset:
    loss = train_step(source_seq, target_seq_in, target_seq_out)

  # `loss` is the loss of the last batch of the epoch, not an epoch average.
  print('Epoch {} Loss {:.4f}'.format(
    epoch + 1, loss.numpy()))

  if (epoch + 1) % 10 == 0:
    end_time = time.time()
    print('Average elapsed time: {:.2f}s'.format((end_time - start_time) / (epoch + 1)))
    try:
      translate()
    except Exception as err:
      # A failed sample translation must not abort training: report it and
      # carry on. The exception gets its own name so it no longer rebinds
      # (and, on leaving the handler, deletes) the epoch counter.
      print(err)

Epoch 1 Loss 3.2858
Epoch 2 Loss 3.3032
Epoch 3 Loss 3.2410
Epoch 4 Loss 3.8250
Epoch 5 Loss 2.8367
Epoch 6 Loss 4.1642
Epoch 7 Loss 4.2485
Epoch 8 Loss 4.1234
Epoch 9 Loss 2.2368
Epoch 10 Loss 2.3509
Average elapsed time: 0.61s
ENGLISH: He acted like he owned the place .
FRENCH: vous vous vous a a a s . <end>

Epoch 11 Loss 2.4507
Epoch 12 Loss 3.0400
Epoch 13 Loss 1.4478
Epoch 14 Loss 2.1693
Epoch 15 Loss 2.7361
Epoch 16 Loss 1.7082
Epoch 17 Loss 1.2900
Epoch 18 Loss 1.4886
Epoch 19 Loss 1.1630
Epoch 20 Loss 1.0029
Average elapsed time: 0.41s
ENGLISH: What he did is very wrong .
FRENCH: ce qu il a la s il vous plait ? <end>

Epoch 21 Loss 0.9549
Epoch 22 Loss 0.4730
Epoch 23 Loss 0.8532
Epoch 24 Loss 0.7908
Epoch 25 Loss 0.5397
Epoch 26 Loss 0.5489
Epoch 27 Loss 0.4877
Epoch 28 Loss 0.4394
Epoch 29 Loss 0.3401
Epoch 30 Loss 0.2855
Average elapsed time: 0.35s
ENGLISH: How do we know this isn t a trap ?
FRENCH: l allemagne a produit beaucoup de scientifiques . <end>

Epoch 31 Loss 0.29