<a href="https://colab.research.google.com/github/emrllh/My_works/blob/main/The_Transformer_Architecture.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import os
import tensorflow_hub as hub

from pathlib import Path

url ='https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip'

path = tf.keras.utils.get_file('spa-eng.zip', origin=url, cache_dir='datasets',
                               extract=True)


Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip
[1m2638744/2638744[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [2]:
text = (Path(path).with_name('spa-eng')/'spa.txt').read_text()

In [3]:
text = text.replace('i', '').replace('¿', '')
pairs = [line.split('\t') for line in text.splitlines()]
np.random.seed(42)
np.random.shuffle(pairs)
pairs[:5]

[['How borng!', '¡Qué aburrmento!'],
 ['I love sports.', 'Adoro el deporte.'],
 ['Would you lke to swap jobs?', 'Te gustaría que ntercambemos los trabajos?'],
 ['My mother dd nothng but weep.', 'M madre no hzo nada sno llorar.'],
 ['Croata s n the southeastern part of Europe.',
  'Croaca está en el sudeste de Europa.']]

In [4]:
sentences_en, sentences_es = zip(*pairs)

for i in range(3):
  print(sentences_en[i], '>>>',  sentences_es[i])

How borng! >>> ¡Qué aburrmento!
I love sports. >>> Adoro el deporte.
Would you lke to swap jobs? >>> Te gustaría que ntercambemos los trabajos?


In [5]:
vocab_size = 1000 # most freq words
max_length = 50 #lenght of output sentence

text_vec_layer_en = tf.keras.layers.TextVectorization(
    vocab_size, output_sequence_length=max_length
)

text_vec_layer_es = tf.keras.layers.TextVectorization(
    vocab_size, output_sequence_length=max_length
)

text_vec_layer_en.adapt(sentences_en)
text_vec_layer_es.adapt([f'SOS {s} EOS' for s in sentences_es])

In [6]:
text_vec_layer_es.get_vocabulary()[:5]

['', '[UNK]', 'sos', 'eos', 'de']

In [7]:
text_vec_layer_en.get_vocabulary()[:5]

['', '[UNK]', 'the', 'i', 'to']

In [8]:
X_train = tf.constant(sentences_en[:100_000]) #constant tensors are fixed not changed
#print(X_train[:5])
X_valid = tf.constant(sentences_en[100_000:])

X_train_dec = tf.constant([f'SOS {s}' for s in sentences_es[:100_000]])
X_valid_dec = tf.constant([f'SOS {s}' for s in sentences_es[100_000:]])
#print(X_train_dec[:5])

Y_train = text_vec_layer_es([f'EOS {s}' for s in sentences_es[:100_000]])
Y_valid = text_vec_layer_es([f'EOS {s}' for s in sentences_es[100_000:]])
#print(Y_valid[:5])

In [9]:
print(X_train.shape, X_valid.shape)
print(X_train_dec.shape, X_valid_dec.shape)
print(Y_train.shape, Y_valid.shape)

(100000,) (18964,)
(100000,) (18964,)
(100000, 50) (18964, 50)


In [11]:
vocab_size=1000
embed_size=128
max_length=50

encoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)
decoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)

class TransporterLayers(keras.layers.Layer):
  def __init__(self,vocab_size=vocab_size, embed_size=embed_size, max_length=max_length, **kwargs):
    super().__init__(**kwargs)
    self.vocab_size = vocab_size
    self.embed_size = embed_size
    self.max_length = max_length

    #self.encoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)
    #self.decoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)

    #self.encoder_input_ids = text_vec_layer_en(self.encoder_inputs)
    #self.decoder_input_ids = text_vec_layer_es(self.decoder_inputs)

    self.encoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size)
    self.decoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size)
    self.pos_embed_layer = tf.keras.layers.Embedding(max_length, embed_size) # Moved pos_embed_layer here

  def call(self, encoder_inputs, decoder_inputs):

    with tf.device("/CPU:0"):  # or "/CPU:0" if you don't have a GPU
      encoder_input_ids = text_vec_layer_en(encoder_inputs)
      decoder_input_ids = text_vec_layer_es(decoder_inputs)

    #encoder_input_ids = text_vec_layer_en(encoder_inputs)
    #decoder_input_ids = text_vec_layer_es(decoder_inputs)

    #encoder_input_ids = text_vec_layer_en(encoder_inputs)
    #decoder_input_ids = text_vec_layer_es(decoder_inputs)

    encoder_embeddings = self.encoder_embedding_layer(encoder_input_ids)
    decoder_embeddings = self.decoder_embedding_layer(decoder_input_ids)

    # Get shape within call method
    batch_max_len_enc = tf.shape(encoder_embeddings)[1]
    batch_max_len_dec = tf.shape(decoder_embeddings)[1]

    with tf.device("/CPU:0"):  # or "/CPU:0" if you don't have a GPU
        encoder_in = encoder_embeddings + self.pos_embed_layer(tf.range(batch_max_len_enc))
        decoder_in = decoder_embeddings + self.pos_embed_layer(tf.range(batch_max_len_dec))

    # Apply positional encoding within call method
    #encoder_in = encoder_embeddings + self.pos_embed_layer(tf.range(batch_max_len_enc))
    #decoder_in = decoder_embeddings + self.pos_embed_layer(tf.range(batch_max_len_dec))
    #encoder_pad_mask = tf.math.not_equal(encoder_input_ids, 0)[:, tf.newaxis]


    return batch_max_len_dec, encoder_input_ids,decoder_input_ids, encoder_in, decoder_in


In [12]:
transporter_layers=TransporterLayers()

In [13]:
batch_max_len_dec,encoder_input_ids,decoder_input_ids,encoder_in, decoder_in = transporter_layers(encoder_inputs, decoder_inputs)

In [14]:
N=2
num_heads = 8
dropout_rate = 0.1
n_units = 128 # for the first Dense layer in each Feed Forwarded block

encoder_pad_mask =tf.keras.layers.Lambda(lambda x: tf.math.not_equal(x, 0)[:, tf.newaxis])(encoder_input_ids) # it is a boolen tensor
                                                                          #It is reshaped to make it compatible with attention mechanisms

Z = encoder_in

for _ in range(N):

  skip = Z
  attn_layer = tf.keras.layers.MultiHeadAttention(
      num_heads=num_heads, key_dim=embed_size, dropout=dropout_rate
  )

  Z = attn_layer(Z, value=Z, attention_mask= encoder_pad_mask)
  Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))

  skip = Z

  Z = tf.keras.layers.Dense(n_units, activation = 'relu')(Z)
  Z = tf.keras.layers.Dense(embed_size)(Z)

  Z= tf.keras.layers.Dropout(dropout_rate)(Z)
  Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))

In [15]:
decoder_pad_mask = tf.keras.layers.Lambda(lambda x: tf.math.not_equal(x, 0)[:, tf.newaxis])(decoder_input_ids)


In [16]:
casual_mask = tf.keras.layers.Lambda(lambda x: tf.linalg.band_part(  # creates a lower triangular matrix
    tf.ones((x, x), tf.bool), -1, 0))(batch_max_len_dec)


In [17]:
encoder_outputs = Z
Z = decoder_in

for _ in range(N):
  skip = Z
  attn_layer = tf.keras.layers.MultiHeadAttention(
      num_heads=num_heads, key_dim=embed_size, dropout=dropout_rate
  )
  Z = attn_layer(Z, value = Z, attention_mask=casual_mask & decoder_pad_mask)
  Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))
  skip = Z
  attn_layer = tf.keras.layers.MultiHeadAttention(
      num_heads=num_heads, key_dim=embed_size, dropout=dropout_rate
  )

  Z = attn_layer(Z, value=encoder_outputs, attention_mask= encoder_pad_mask)
  Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))
  skip = Z

  Z = tf.keras.layers.Dense(n_units, activation='relu')(Z)
  Z = tf.keras.layers.Dense(embed_size)(Z)
  Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))

In [18]:
Y_proba = tf.keras.layers.Dense(vocab_size, activation='softmax')(Z)
model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs],
                       outputs = Y_proba)

model.compile(loss= 'sparse_categorical_crossentropy', optimizer ='nadam',
              metrics = ['accuracy'])

model.fit((X_train, X_train_dec), Y_train, epochs=10,
          validation_data = ((X_valid, X_valid_dec), Y_valid))

Epoch 1/10
[1m 456/3125[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m59:53[0m 1s/step - accuracy: 0.9515 - loss: 0.6000

KeyboardInterrupt: 

In [19]:
Epoch 1/10
3125/3125 [==============================] - 828s 263ms/step - loss: 0.2982 - accuracy: 0.5545 - val_loss: 0.2105 - val_accuracy: 0.6476
Epoch 2/10
3125/3125 [==============================] - 820s 262ms/step - loss: 0.2006 - accuracy: 0.6601 - val_loss: 0.1876 - val_accuracy: 0.6802
Epoch 3/10
3125/3125 [==============================] - 820s 263ms/step - loss: 0.1842 - accuracy: 0.6816 - val_loss: 0.1766 - val_accuracy: 0.6975
Epoch 4/10
3125/3125 [==============================] - 820s 262ms/step - loss: 0.1748 - accuracy: 0.6942 - val_loss: 0.1704 - val_accuracy: 0.7055
Epoch 5/10
3125/3125 [==============================] - 820s 262ms/step - loss: 0.1683 - accuracy: 0.7021 - val_loss: 0.1657 - val_accuracy: 0.7102
Epoch 6/10
3125/3125 [==============================] - 821s 263ms/step - loss: 0.1628 - accuracy: 0.7096 - val_loss: 0.1628 - val_accuracy: 0.7130
Epoch 7/10
3125/3125 [==============================] - 826s 264ms/step - loss: 0.1588 - accuracy: 0.7154 - val_loss: 0.1595 - val_accuracy: 0.7205
Epoch 8/10
3125/3125 [==============================] - 822s 263ms/step - loss: 0.1550 - accuracy: 0.7205 - val_loss: 0.1590 - val_accuracy: 0.7199
Epoch 9/10
3125/3125 [==============================] - 821s 263ms/step - loss: 0.1518 - accuracy: 0.7249 - val_loss: 0.1547 - val_accuracy: 0.7258
Epoch 10/10
3125/3125 [==============================] - 821s 263ms/step - loss: 0.1492 - accuracy: 0.7279 - val_loss: 0.1538 - val_accuracy: 0.7281
<keras.callbacks.History at 0x7f8946cdf9a0>

SyntaxError: invalid decimal literal (<ipython-input-19-1cef646df4a2>, line 2)