Install Packages - I should probably think of better names to explain doing something so simple.

In [None]:
!pip install tensorflow keras numpy
# that apex update gonna take forever to download!!

Import Packages!!!!!!

In [2]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, MultiHeadAttention
from tensorflow.keras.layers import LayerNormalization, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.utils import to_categorical
import random

A small dummy dataset for German, Spanish, Greek to English translation - each language has 10 basic sentences, lowkey starting to see the value of my duo lessons!

In [3]:
# English sentences (target)
english_sentences = [
    "Hello", "Good morning", "How are you", "I am fine", "Thank you",
    "Goodbye", "What is your name", "My name is John", "I love you", "Please"
]

# German translations
german_sentences = [
    "Hallo", "Guten Morgen", "Wie geht es dir", "Mir geht es gut", "Danke",
    "Auf Wiedersehen", "Wie heißt du", "Mein Name ist John", "Ich liebe dich", "Bitte"
]

# Spanish translations
spanish_sentences = [
    "Hola", "Buenos días", "Cómo estás", "Estoy bien", "Gracias",
    "Adiós", "Cómo te llamas", "Me llamo John", "Te quiero", "Por favor"
]

# Greek translations
greek_sentences = [
    "Γειά σου", "Καλημέρα", "Πώς είσαι", "Είμαι καλά", "Ευχαριστώ",
    "Αντίο", "Πώς σε λένε", "Με λένε John", "Σ' αγαπώ", "Παρακαλώ"
]

# Combine all source languages
source_sentences = german_sentences + spanish_sentences + greek_sentences
target_sentences = english_sentences * 3  # Repeat English for each source language

# Shuffle the dataset
combined = list(zip(source_sentences, target_sentences))
random.shuffle(combined)
source_sentences, target_sentences = zip(*combined)
source_sentences, target_sentences = list(source_sentences), list(target_sentences)

print(f"Total training samples: {len(source_sentences)}")
print("Sample pairs:")
for i in range(3):
    print(f"{source_sentences[i]} -> {target_sentences[i]}")

Total training samples: 30
Sample pairs:
Mir geht es gut -> I am fine
Cómo te llamas -> What is your name
Mein Name ist John -> My name is John


Create tokenizers for source multi-language and target English

In [4]:
source_tokenizer = keras.preprocessing.text.Tokenizer(filters='')
source_tokenizer.fit_on_texts(source_sentences)

target_tokenizer = keras.preprocessing.text.Tokenizer(filters='')
target_tokenizer.fit_on_texts(target_sentences)
target_tokenizer.fit_on_texts(['sos', 'eos'])  # start and end tokens

# Vocabulary sizes
source_vocab_size = len(source_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1

# Max lengths
source_max_length = max(len(s.split()) for s in source_sentences)
target_max_length = max(len(s.split()) for s in target_sentences) + 2

print(f"Source vocab size: {source_vocab_size}")
print(f"Target vocab size: {target_vocab_size}")
print(f"Source max length: {source_max_length}")
print(f"Target max length: {target_max_length}")

Source vocab size: 54
Target vocab size: 22
Source max length: 4
Target max length: 6


Convert texts to sequences and pad them

In [5]:
source_sequences = source_tokenizer.texts_to_sequences(source_sentences)
source_sequences = keras.preprocessing.sequence.pad_sequences(
    source_sequences, maxlen=source_max_length, padding='post')

target_sequences = target_tokenizer.texts_to_sequences(target_sentences)
target_sequences = keras.preprocessing.sequence.pad_sequences(
    target_sequences, maxlen=target_max_length, padding='post')

# Add start (sos) and end (eos) tokens to target sequences
sos_token = target_tokenizer.word_index['sos']
eos_token = target_tokenizer.word_index['eos']

target_input_sequences = []
target_output_sequences = []
for seq in target_sequences:
    # Input sequence starts with sos and ends with the sentence (no eos)
    input_seq = [sos_token] + list(seq[seq != 0])
    # Output sequence starts with the sentence and ends with eos (no sos)
    output_seq = list(seq[seq != 0]) + [eos_token]

    # Pad them
    input_seq = keras.preprocessing.sequence.pad_sequences(
        [input_seq], maxlen=target_max_length, padding='post')[0]
    output_seq = keras.preprocessing.sequence.pad_sequences(
        [output_seq], maxlen=target_max_length, padding='post')[0]

    target_input_sequences.append(input_seq)
    target_output_sequences.append(output_seq)

target_input_sequences = np.array(target_input_sequences)
target_output_sequences = np.array(target_output_sequences)

# Convert to categorical for the output (one-hot encoding)
target_output_sequences = to_categorical(target_output_sequences, num_classes=target_vocab_size)

print("Source sequence shape:", source_sequences.shape)
print("Target input shape:", target_input_sequences.shape)
print("Target output shape:", target_output_sequences.shape)

Source sequence shape: (30, 4)
Target input shape: (30, 6)
Target output shape: (30, 6, 22)


In [6]:
class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim)
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=embed_dim)
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        return inputs != 0

Transformer Encoder

In [7]:
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, dropout=0.1, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = keras.Sequential([
            Dense(dense_dim, activation="relu"),
            Dense(embed_dim)
        ])
        self.layernorm_1 = LayerNormalization()
        self.layernorm_2 = LayerNormalization()
        self.dropout_1 = Dropout(dropout)
        self.dropout_2 = Dropout(dropout)

    def call(self, inputs, mask=None):
        if mask is not None:
            mask = mask[:, tf.newaxis, :]

        attention_output = self.attention(
            query=inputs, value=inputs, key=inputs, attention_mask=mask)
        attention_output = self.dropout_1(attention_output)
        proj_input = self.layernorm_1(inputs + attention_output)

        proj_output = self.dense_proj(proj_input)
        proj_output = self.dropout_2(proj_output)
        return self.layernorm_2(proj_input + proj_output)

Tranformer Decoder - I broke this part a few times

In [8]:
class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, dropout=0.1, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention_1 = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.attention_2 = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = keras.Sequential([
            Dense(dense_dim, activation="relu"),
            Dense(embed_dim)
        ])
        self.layernorm_1 = LayerNormalization()
        self.layernorm_2 = LayerNormalization()
        self.layernorm_3 = LayerNormalization()
        self.dropout_1 = Dropout(dropout)
        self.dropout_2 = Dropout(dropout)
        self.dropout_3 = Dropout(dropout)
        self.supports_masking = True

    def get_causal_attention_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, sequence_length, sequence_length))
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
            axis=0)
        return tf.tile(mask, mult)

    def call(self, inputs, encoder_outputs, mask=None):
        causal_mask = self.get_causal_attention_mask(inputs)

        # First attention block (self-attention)
        attention_output_1 = self.attention_1(
            query=inputs,
            value=inputs,
            key=inputs,
            attention_mask=causal_mask
        )
        attention_output_1 = self.dropout_1(attention_output_1)
        attention_output_1 = self.layernorm_1(inputs + attention_output_1)

        # Second attention block (encoder-decoder attention)
        if mask is not None:
            # need to adjust the mask shape
            encoder_mask = tf.ones(
                (tf.shape(encoder_outputs)[0], tf.shape(encoder_outputs)[1]),
                dtype=tf.int32
            )
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype=tf.int32)
            padding_mask = tf.minimum(padding_mask, causal_mask)
        else:
            padding_mask = None

        attention_output_2 = self.attention_2(
            query=attention_output_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask
        )
        attention_output_2 = self.dropout_2(attention_output_2)
        attention_output_2 = self.layernorm_2(attention_output_1 + attention_output_2)

        proj_output = self.dense_proj(attention_output_2)
        proj_output = self.dropout_3(proj_output)
        return self.layernorm_3(attention_output_2 + proj_output)

In [9]:
print("Source sequences shape:", source_sequences.shape)
print("Target input sequences shape:", target_input_sequences.shape)
print("Target output sequences shape:", target_output_sequences.shape)

Source sequences shape: (30, 4)
Target input sequences shape: (30, 6)
Target output sequences shape: (30, 6, 22)


The transformer model!!!

In [10]:
def build_transformer_model(
    source_vocab_size,
    target_vocab_size,
    source_max_length,
    target_max_length,
    embed_dim=64,
    dense_dim=128,
    num_heads=4,
    num_encoder_layers=2,
    num_decoder_layers=2,
    dropout_rate=0.1
):
    # Encoder
    encoder_inputs = Input(shape=(None,), dtype="int64", name="encoder_inputs")
    encoder_embedding = PositionalEmbedding(
        source_max_length, source_vocab_size, embed_dim)(encoder_inputs)

    encoder_outputs = encoder_embedding
    for _ in range(num_encoder_layers):
        encoder_outputs = TransformerEncoder(
            embed_dim, dense_dim, num_heads, dropout_rate)(encoder_outputs)

    # Decoder
    decoder_inputs = Input(shape=(None,), dtype="int64", name="decoder_inputs")
    decoder_embedding = PositionalEmbedding(
        target_max_length, target_vocab_size, embed_dim)(decoder_inputs)

    decoder_outputs = decoder_embedding
    for _ in range(num_decoder_layers):
        decoder_outputs = TransformerDecoder(
            embed_dim, dense_dim, num_heads, dropout_rate)(
                decoder_outputs, encoder_outputs)

    # Output
    decoder_dense = Dense(target_vocab_size, activation="softmax")
    decoder_outputs = decoder_dense(decoder_outputs)

    # Model
    transformer = Model(
        [encoder_inputs, decoder_inputs], decoder_outputs,
        name="transformer")

    return transformer

Rebuilt the model because the decoder broke so many times!!

In [11]:
transformer = build_transformer_model(
    source_vocab_size=source_vocab_size,
    target_vocab_size=target_vocab_size,
    source_max_length=source_max_length,
    target_max_length=target_max_length,
    embed_dim=64,
    dense_dim=128,
    num_heads=4,
    num_encoder_layers=2,
    num_decoder_layers=2
)

transformer.compile(
    optimizer=Adam(learning_rate=0.001),
    loss="categorical_crossentropy",
    metrics=["accuracy"]
)

transformer.summary()



Train the model with small batch size because things broke, wasn't a smooth road!... also to help with memory constraints

In [12]:
history = transformer.fit(
    [source_sequences, target_input_sequences],
    target_output_sequences,
    batch_size=4,  # Reduced from 8 to 4
    epochs=50,
    validation_split=0.2
)

Epoch 1/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 240ms/step - accuracy: 0.1064 - loss: 3.2602 - val_accuracy: 0.1667 - val_loss: 2.8361
Epoch 2/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.3184 - loss: 2.0806 - val_accuracy: 0.1389 - val_loss: 2.4594
Epoch 3/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.3073 - loss: 1.4601 - val_accuracy: 0.2778 - val_loss: 2.1200
Epoch 4/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.3990 - loss: 0.9386 - val_accuracy: 0.3056 - val_loss: 1.9968
Epoch 5/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.4688 - loss: 0.7211 - val_accuracy: 0.3056 - val_loss: 1.9911
Epoch 6/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.5014 - loss: 0.5397 - val_accuracy: 0.3333 - val_loss: 1.8623
Epoch 7/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━

Translation Inference

In [13]:
def decode_sequence(input_sentence):
    # Tokenize and pad the input sentence
    input_seq = source_tokenizer.texts_to_sequences([input_sentence])
    input_seq = keras.preprocessing.sequence.pad_sequences(
        input_seq, maxlen=source_max_length, padding='post')

    # Initialize target sequence with 'sos'
    target_seq = np.zeros((1, target_max_length))
    target_seq[0, 0] = sos_token

    # Generate translation word by word
    for i in range(target_max_length - 1):
        predictions = transformer.predict([input_seq, target_seq], verbose=0)
        next_token = np.argmax(predictions[0, i, :])
        target_seq[0, i+1] = next_token

        if next_token == eos_token:
            break

    # Convert sequence back to text
    output_sentence = []
    for token in target_seq[0]:
        if token == 0 or token == eos_token:
            continue
        word = target_tokenizer.index_word.get(token, '')
        output_sentence.append(word)

    return ' '.join(output_sentence)

Test the translation b*tch!

In [31]:
test_sentences = [
    "Hallo",  # German
    "Buenos días",  # Spanish
    "Καλημέρα",  # Greek
    "Wie geht es dir",  # German
    "Te quiero"  # Spanish
]

for sentence in test_sentences:
    translation = decode_sequence(sentence)
    print(f"{sentence} -> {translation}")

Hallo -> sos hello
Buenos días -> sos good morning
Καλημέρα -> sos good morning
Wie geht es dir -> sos i am fine
Te quiero -> sos i love you


hmmmm... NAS framework?? okay

Neural Architecture Search (NAS) framework

In [18]:
class NASearch:
    def __init__(self, source_vocab_size, target_vocab_size,
                 source_max_length, target_max_length):
        self.source_vocab_size = source_vocab_size
        self.target_vocab_size = target_vocab_size
        self.source_max_length = source_max_length
        self.target_max_length = target_max_length
        self.best_model = None
        self.best_score = float('-inf')
        self.best_params = None

    def generate_random_architecture(self):
        params = {
            'embed_dim': np.random.choice([32, 64, 128]),
            'dense_dim': np.random.choice([64, 128, 256]),
            'num_heads': np.random.choice([2, 4, 8]),
            'num_encoder_layers': np.random.randint(1, 4),
            'num_decoder_layers': np.random.randint(1, 4),
            'dropout_rate': np.random.uniform(0.0, 0.3)
        }
        return params

    def evaluate_architecture(self, params, epochs=10, verbose=0):
        try:
            model = build_transformer_model(
                source_vocab_size=self.source_vocab_size,
                target_vocab_size=self.target_vocab_size,
                source_max_length=self.source_max_length,
                target_max_length=self.target_max_length,
                **params
            )

            model.compile(
                optimizer=Adam(learning_rate=0.001),
                loss="categorical_crossentropy",
                metrics=["accuracy"]
            )

            history = model.fit(
                [source_sequences, target_input_sequences],
                target_output_sequences,
                batch_size=8,
                epochs=epochs,
                validation_split=0.2,
                verbose=verbose
            )

            # Return validation accuracy
            val_acc = max(history.history['val_accuracy'])
            return val_acc, model
        except:
            # If model fails to build/train, return very low score
            return 0.0, None

    def search(self, num_trials=10, epochs_per_trial=15):
        for trial in range(num_trials):
            params = self.generate_random_architecture()
            print(f"\nTrial {trial + 1}/{num_trials}")
            print("Testing architecture with params:", params)

            score, model = self.evaluate_architecture(
                params, epochs=epochs_per_trial, verbose=1)

            print(f"Validation accuracy: {score:.4f}")

            if score > self.best_score and model is not None:
                self.best_score = score
                self.best_model = model
                self.best_params = params
                print("New best model found!")

        print("\nNAS completed!")
        print(f"Best validation accuracy: {self.best_score:.4f}")
        print("Best parameters:", self.best_params)

        return self.best_model, self.best_params

Run Neural Architecture Search!!!!!!!!!!

In [23]:
nas = NASearch(
    source_vocab_size=source_vocab_size,
    target_vocab_size=target_vocab_size,
    source_max_length=source_max_length,
    target_max_length=target_max_length
)

# Run with a small number of trials
best_model, best_params = nas.search(num_trials=5, epochs_per_trial=10)

# results are wishy washy!!... they weren't for the other NAS framework, because this NTM features a more robust architecture?


Trial 1/5
Testing architecture with params: {'embed_dim': np.int64(32), 'dense_dim': np.int64(256), 'num_heads': np.int64(4), 'num_encoder_layers': 1, 'num_decoder_layers': 1, 'dropout_rate': 0.06115466607640904}




Epoch 1/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 332ms/step - accuracy: 0.0642 - loss: 3.3360 - val_accuracy: 0.1667 - val_loss: 2.4480
Epoch 2/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.1554 - loss: 2.4369 - val_accuracy: 0.1667 - val_loss: 2.2355
Epoch 3/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.2378 - loss: 2.0597 - val_accuracy: 0.2500 - val_loss: 2.1378
Epoch 4/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.2986 - loss: 1.7572 - val_accuracy: 0.3056 - val_loss: 2.1088
Epoch 5/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - accuracy: 0.3394 - loss: 1.4493 - val_accuracy: 0.3611 - val_loss: 2.0362
Epoch 6/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.3637 - loss: 1.2680 - val_accuracy: 0.3889 - val_loss: 1.9355
Epoch 7/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━



Epoch 1/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 578ms/step - accuracy: 0.0642 - loss: 3.3777 - val_accuracy: 0.1667 - val_loss: 2.7438
Epoch 2/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - accuracy: 0.2153 - loss: 2.1577 - val_accuracy: 0.1667 - val_loss: 2.5414
Epoch 3/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - accuracy: 0.3767 - loss: 1.4917 - val_accuracy: 0.1944 - val_loss: 2.0717
Epoch 4/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - accuracy: 0.4210 - loss: 1.1454 - val_accuracy: 0.3056 - val_loss: 1.8923
Epoch 5/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - accuracy: 0.4714 - loss: 0.8579 - val_accuracy: 0.3056 - val_loss: 1.9413
Epoch 6/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - accuracy: 0.4427 - loss: 0.7605 - val_accuracy: 0.3056 - val_loss: 1.6826
Epoch 7/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━



Epoch 1/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 426ms/step - accuracy: 0.0868 - loss: 3.6242 - val_accuracy: 0.1667 - val_loss: 2.4097
Epoch 2/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 55ms/step - accuracy: 0.2205 - loss: 2.1843 - val_accuracy: 0.2778 - val_loss: 1.8648
Epoch 3/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step - accuracy: 0.3351 - loss: 1.5078 - val_accuracy: 0.3611 - val_loss: 1.6710
Epoch 4/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step - accuracy: 0.4297 - loss: 1.1026 - val_accuracy: 0.3889 - val_loss: 1.4906
Epoch 5/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 0.4453 - loss: 0.8397 - val_accuracy: 0.3889 - val_loss: 1.4201
Epoch 6/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step - accuracy: 0.4627 - loss: 0.6505 - val_accuracy: 0.3889 - val_loss: 1.3927
Epoch 7/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━



Epoch 1/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 594ms/step - accuracy: 0.0851 - loss: 3.2612 - val_accuracy: 0.1667 - val_loss: 2.8431
Epoch 2/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step - accuracy: 0.2040 - loss: 2.1081 - val_accuracy: 0.3056 - val_loss: 2.2515
Epoch 3/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 92ms/step - accuracy: 0.3212 - loss: 1.4573 - val_accuracy: 0.3056 - val_loss: 1.8715
Epoch 4/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step - accuracy: 0.3125 - loss: 1.1101 - val_accuracy: 0.3611 - val_loss: 1.7101
Epoch 5/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step - accuracy: 0.4253 - loss: 0.7868 - val_accuracy: 0.3889 - val_loss: 1.6882
Epoch 6/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step - accuracy: 0.4835 - loss: 0.6400 - val_accuracy: 0.4167 - val_loss: 1.5814
Epoch 7/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━



Epoch 1/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 523ms/step - accuracy: 0.1050 - loss: 2.9664 - val_accuracy: 0.1667 - val_loss: 2.8817
Epoch 2/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - accuracy: 0.1832 - loss: 2.4632 - val_accuracy: 0.1667 - val_loss: 2.5973
Epoch 3/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - accuracy: 0.2873 - loss: 1.7372 - val_accuracy: 0.2778 - val_loss: 2.1082
Epoch 4/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - accuracy: 0.3438 - loss: 1.3458 - val_accuracy: 0.3333 - val_loss: 1.8202
Epoch 5/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - accuracy: 0.3906 - loss: 1.2542 - val_accuracy: 0.3889 - val_loss: 1.6818
Epoch 6/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step - accuracy: 0.3967 - loss: 1.0508 - val_accuracy: 0.3889 - val_loss: 1.5781
Epoch 7/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━

Test the best model found by NAS

In [25]:
def decode_sequence_with_model(model, input_sentence):
    input_seq = source_tokenizer.texts_to_sequences([input_sentence])
    input_seq = keras.preprocessing.sequence.pad_sequences(
        input_seq, maxlen=source_max_length, padding='post')

    target_seq = np.zeros((1, target_max_length))
    target_seq[0, 0] = sos_token

    for i in range(target_max_length - 1):
        predictions = model.predict([input_seq, target_seq], verbose=0)
        next_token = np.argmax(predictions[0, i, :])
        target_seq[0, i+1] = next_token

        if next_token == eos_token:
            break

    output_sentence = []
    for token in target_seq[0]:
        if token == 0 or token == eos_token:
            continue
        word = target_tokenizer.index_word.get(token, '')
        output_sentence.append(word)

    return ' '.join(output_sentence)

# Test the best model
if nas.best_model is not None:
    print("\nTesting best model found by NAS:")
    for sentence in test_sentences:
        translation = decode_sequence_with_model(nas.best_model, sentence)
        print(f"{sentence} -> {translation}")


Testing best model found by NAS:
Hallo -> sos hello
Buenos días -> sos good morning
Καλημέρα -> sos good morning
Wie geht es dir -> sos what is your name
Te quiero -> sos i love you
