# Name: Fares Mohamed Salah
# ID: 22011614

## Data Preprocessing

In [2]:
# Import necessary libraries
import pathlib
import random
import string
import re
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.layers import Bidirectional,GRU,LSTM,Embedding
from tensorflow.keras.layers import Dense,MultiHeadAttention,LayerNormalization,Embedding,Dropout,Layer
from tensorflow.keras import Sequential,Input
from tensorflow.keras.callbacks import ModelCheckpoint

from nltk.translate.bleu_score import sentence_bleu

In [3]:
# Load the dataset from the provided file
text_file = '/kaggle/input/french/fra.txt'

In [4]:
with open(text_file) as f:
    lines = f.read().split("\n")[:-1]

# Prepare text pairs (English and French sentences)
text_pairs = []
for line in lines:
    english, french = line.split("\t")
    french = "[start] " + french + " [end]"
    text_pairs.append((english, french))

In [5]:
# Display a random text pair
import random
print(random.choice(text_pairs))

('What did Tom order?', "[start] Qu'a commandé Tom\xa0? [end]")


In [6]:
# Shuffle text pairs and split into training, validation, and test datasets
import random
random.shuffle(text_pairs)
num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples:num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples:]

In [7]:
# Define characters to strip from the text
strip_chars = string.punctuation + "¿"
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")

In [8]:
# Custom standardization function for text preprocessing
def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(
        lowercase, f"[{re.escape(strip_chars)}]", "")

In [9]:
# Set parameters for text vectorization
vocab_size = 15000
sequence_length = 20

# Initialize TextVectorization layers for source and target languages
source_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length,
)
target_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length + 1,
    standardize=custom_standardization,
)

# Adapt the vectorization layers using the training data
train_english_texts = [pair[0] for pair in train_pairs]
train_french_texts = [pair[1] for pair in train_pairs]
source_vectorization.adapt(train_english_texts)
target_vectorization.adapt(train_french_texts)

In [10]:
# Set batch size for training
batch_size = 64

# Function to format dataset for training
def format_dataset(eng, fre):
    eng = source_vectorization(eng)
    fre = target_vectorization(fre)
    return ({
        "english": eng,
        "french": fre[:, :-1],
    }, fre[:, 1:])

# Function to create a TensorFlow dataset from text pairs
def make_dataset(pairs):
    eng_texts, fre_texts = zip(*pairs)
    eng_texts = list(eng_texts)
    fre_texts = list(fre_texts)
    dataset = tf.data.Dataset.from_tensor_slices((eng_texts, fre_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset, num_parallel_calls=4)
    return dataset.shuffle(2048).prefetch(16).cache()

# Create training and validation datasets
train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

## LSTM Model

### Imports

In [11]:
# Import necessary libraries
import random
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import ModelCheckpoint
from nltk.translate.bleu_score import sentence_bleu

### Model Definition

In [12]:
# Define the LSTM model architecture
embed_dim = 256
lstm_units = 512

encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="english")
x = Embedding(vocab_size, embed_dim, mask_zero=True)(encoder_inputs)
encoder_outputs, state_h, state_c = LSTM(lstm_units, return_state=True)(x)
encoder_states = [state_h, state_c]

decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="french")
x = Embedding(vocab_size, embed_dim, mask_zero=True)(decoder_inputs)
x = LSTM(lstm_units, return_sequences=True, return_state=True)(x, initial_state=encoder_states)
decoder_outputs = Dense(vocab_size, activation="softmax")(x[0])

lstm_model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

### Model Training

In [13]:
# Training parameters and setup
epochs = 10

checkpoint = ModelCheckpoint(filepath='lstm_language_translation_checkpoint.hdf5', save_weights_only=True, verbose=1, monitor='val_accuracy')

lstm_model.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# Train the model
lstm_model.fit(train_ds, epochs=epochs, callbacks=[checkpoint], validation_data=val_ds)

# Saving model weights
lstm_model.save_weights("lstm_translator.h5")
load_status = lstm_model.load_weights("lstm_translator.h5")

Epoch 1/10

Epoch 00001: saving model to lstm_language_translation_checkpoint.hdf5
Epoch 2/10

Epoch 00002: saving model to lstm_language_translation_checkpoint.hdf5
Epoch 3/10

Epoch 00003: saving model to lstm_language_translation_checkpoint.hdf5
Epoch 4/10

Epoch 00004: saving model to lstm_language_translation_checkpoint.hdf5
Epoch 5/10

Epoch 00005: saving model to lstm_language_translation_checkpoint.hdf5
Epoch 6/10

Epoch 00006: saving model to lstm_language_translation_checkpoint.hdf5
Epoch 7/10

Epoch 00007: saving model to lstm_language_translation_checkpoint.hdf5
Epoch 8/10

Epoch 00008: saving model to lstm_language_translation_checkpoint.hdf5
Epoch 9/10

Epoch 00009: saving model to lstm_language_translation_checkpoint.hdf5
Epoch 10/10

Epoch 00010: saving model to lstm_language_translation_checkpoint.hdf5


### Model Evaluation

In [14]:
# Evaluate the model on the test data
fra_vocab = target_vectorization.get_vocabulary()
fra_index_lookup = dict(zip(range(len(fra_vocab)), fra_vocab))
max_decoded_sentence_length = 20

def decode_sequence_lstm(input_sentence):
    tokenized_input_sentence = source_vectorization([input_sentence])
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = target_vectorization([decoded_sentence])[:, :-1]
        predictions = lstm_model([tokenized_input_sentence, tokenized_target_sentence])
        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = fra_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token
        if sampled_token == "[end]":
            break
    return decoded_sentence

test_eng_texts = [pair[0] for pair in test_pairs]
for _ in range(5):
    input_sentence = random.choice(test_eng_texts)
    print("-")
    print(input_sentence)
    print(decode_sequence_lstm(input_sentence))

-
They don't serve that here.
[start] ils ne vont pas ici [end]
-
We will have little snow this winter.
[start] nous peu de neige cet hiver [end]
-
She was on the verge of tears.
[start] elle était en colère de [UNK] [end]
-
I had a headache, and I took the day off today.
[start] jai eu un rêve du jour aujourdhui je suis déjà allé à mois [end]
-
Tom knew that he'd been tricked.
[start] tom savait quil était [UNK] [end]


In [15]:
# Evaluation using the BLEU score
test_eng_texts = [pair[0] for pair in test_pairs]
test_fra_texts = [pair[1] for pair in test_pairs]
score = 0
bleu = 0
for i in range(20):
    candidate = decode_sequence_lstm(test_eng_texts[i])
    reference = test_fra_texts[i].lower()
    print(candidate, reference)
    score = sentence_bleu(reference, candidate, weights=(1, 0, 0, 0))
    bleu += score
    print(f"Score: {score}")
print(f"\nBLEU score : {round(bleu, 2)}/20")

[start] mon petit de largent est à clé [end] [start] mon petit doigt est enflé. [end]
Score: 0.38636363636363635
[start] tu vas avoir du temps [end] [start] tu vas passer un sale quart d'heure. [end]
Score: 0.37142857142857144


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


[start] il est impatient de lire le livre [end] [start] il est impatient de lire le livre. [end]
Score: 0.31914893617021284
[start] elles ne peuvent pas te virer [end] [start] ils ne peuvent pas vous virer. [end]
Score: 0.3488372093023256
[start] ne me dites pas si vous ne voulez pas [end] [start] ne me le dis pas si tu ne veux pas. [end]
Score: 0.3137254901960784
[start] combien dargent astu dépensé pour votre voiture [end] [start] combien d'argent as-tu claqué pour ta voiture ? [end]
Score: 0.32786885245901637
[start] nous serons à la manière de garder cela [end] [start] nous essaierons de ne pas laisser cela se reproduire. [end]
Score: 0.28301886792452824
[start] nous navons pas pu le droit de résoudre le monde avait pu nous en aller [end] [start] nous n'avons pas pu acheter de places, nous ne sommes donc pas allé au concert. [end]
Score: 0.2
[start] Êtesvous pour ou contre notre proposition [end] [start] es-tu pour ou contre la proposition ? [end]
Score: 0.2727272727272727
[start] 

## Transformer Model

### Imports

In [16]:
# Import necessary libraries
import random
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Embedding, Dense, LayerNormalization, MultiHeadAttention, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint
from nltk.translate.bleu_score import sentence_bleu

### Model Definition

In [17]:
# Define the Transformer Encoder class
class TransformerEncoder(Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super(TransformerEncoder, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [Dense(dense_dim, activation="relu"), Dense(embed_dim),]
        )
        self.layernorm_1 = LayerNormalization()
        self.layernorm_2 = LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, mask=None):
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, tf.newaxis, :], dtype="int32")
        attention_output = self.attention(
            query=inputs, value=inputs, key=inputs, attention_mask=padding_mask
        )

        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)

    def get_config(self):
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "dense_dim": self.dense_dim,
            "num_heads": self.num_heads,
        })
        return config

In [18]:
# Define the Transformer Decoder class
class TransformerDecoder(Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention_1 = MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        self.attention_2 = MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = keras.Sequential(
            [Dense(dense_dim, activation="relu"),
             Dense(embed_dim),]
        )
        self.layernorm_1 = LayerNormalization()
        self.layernorm_2 = LayerNormalization()
        self.layernorm_3 = LayerNormalization()
        self.supports_masking = True

    def get_config(self):
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "dense_dim": self.dense_dim,
        })
        return config

    def get_causal_attention_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1),
             tf.constant([1, 1], dtype=tf.int32)], axis=0)
        return tf.tile(mask, mult)

    def call(self, inputs, encoder_outputs, mask=None):
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            padding_mask = tf.cast(
                mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)
        attention_output_1 = self.attention_1(
            query=inputs,
            value=inputs,
            key=inputs,
            attention_mask=causal_mask)
        attention_output_1 = self.layernorm_1(inputs + attention_output_1)
        attention_output_2 = self.attention_2(
            query=attention_output_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
        )
        attention_output_2 = self.layernorm_2(
            attention_output_1 + attention_output_2)
        proj_output = self.dense_proj(attention_output_2)
        return self.layernorm_3(attention_output_2 + proj_output)

In [19]:
# Define the Positional Embedding class
class PositionalEmbedding(Layer):
    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = Embedding(
            input_dim=input_dim, output_dim=output_dim)
        self.position_embeddings = Embedding(input_dim=input_dim, output_dim=output_dim)
        self.sequence_length = sequence_length
        self.input_dim = input_dim
        self.output_dim = output_dim

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        return tf.math.not_equal(inputs, 0)

    def get_config(self):
        config = super(PositionalEmbedding, self).get_config()
        config.update({
            "output_dim": self.output_dim,
            "sequence_length": self.sequence_length,
            "input_dim": self.input_dim,
        })
        return config

In [20]:
# Build the Transformer model architecture
embed_dim = 256
dense_dim = 2048
num_heads = 8

encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="english")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)

decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="french")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, dense_dim, num_heads)(x, encoder_outputs)
x = Dropout(0.5)(x)
decoder_outputs = Dense(vocab_size, activation="softmax")(x)
transformer_model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

### Model Training

In [21]:
# Training parameters and setup
epochs = 10

checkpoint = ModelCheckpoint(filepath='transformer_language_translation_checkpoint.hdf5', save_weights_only=True, verbose=1, monitor='val_accuracy')

transformer_model.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# Train the model
transformer_model.fit(train_ds, epochs=epochs, callbacks=[checkpoint], validation_data=val_ds)

# Saving model weights
transformer_model.save_weights("transformer_translator.h5")
load_status = transformer_model.load_weights("transformer_translator.h5")

Epoch 1/10

Epoch 00001: saving model to transformer_language_translation_checkpoint.hdf5
Epoch 2/10

Epoch 00002: saving model to transformer_language_translation_checkpoint.hdf5
Epoch 3/10

Epoch 00003: saving model to transformer_language_translation_checkpoint.hdf5
Epoch 4/10

Epoch 00004: saving model to transformer_language_translation_checkpoint.hdf5
Epoch 5/10

Epoch 00005: saving model to transformer_language_translation_checkpoint.hdf5
Epoch 6/10

Epoch 00006: saving model to transformer_language_translation_checkpoint.hdf5
Epoch 7/10

Epoch 00007: saving model to transformer_language_translation_checkpoint.hdf5
Epoch 8/10

Epoch 00008: saving model to transformer_language_translation_checkpoint.hdf5
Epoch 9/10

Epoch 00009: saving model to transformer_language_translation_checkpoint.hdf5
Epoch 10/10

Epoch 00010: saving model to transformer_language_translation_checkpoint.hdf5


### Model Evaluation

In [22]:
# Evaluate the model on the test data
fra_vocab = target_vectorization.get_vocabulary()
fra_index_lookup = dict(zip(range(len(fra_vocab)), fra_vocab))
max_decoded_sentence_length = 20

def decode_sequence_transformer(input_sentence):
    tokenized_input_sentence = source_vectorization([input_sentence])
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = target_vectorization([decoded_sentence])[:, :-1]
        predictions = transformer_model([tokenized_input_sentence, tokenized_target_sentence])
        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = fra_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token
        if sampled_token == "[end]":
            break
    return decoded_sentence

test_eng_texts = [pair[0] for pair in test_pairs]
for _ in range(5):
    input_sentence = random.choice(test_eng_texts)
    print("-")
    print(input_sentence)
    print(decode_sequence_transformer(input_sentence))

-
We all have different abilities.
[start] nous avons tous des [UNK] [end]
-
Have you called him yet?
[start] lavezvous déjà appelé [end]
-
You're hurting him.
[start] tu lui as fait mal [end]
-
I don't have a cellphone.
[start] je nai pas de téléphone [UNK] [end]
-
Can you repair this?
[start] pouvezvous réparer ceci [end]


In [23]:
# Evaluation using the BLEU score
test_eng_texts = [pair[0] for pair in test_pairs]
test_fra_texts = [pair[1] for pair in test_pairs]
score = 0
bleu = 0
for i in range(20):
    candidate = decode_sequence_transformer(test_eng_texts[i])
    reference = test_fra_texts[i].lower()
    print(candidate, reference)
    score = sentence_bleu(reference, candidate, weights=(1, 0, 0, 0))
    bleu += score
    print(f"Score: {score}")
print(f"\nBLEU score : {round(bleu, 2)}/20")

[start] mon peu du le est [UNK] [end] [start] mon petit doigt est enflé. [end]
Score: 0.3783783783783784
[start] tu vas avoir du mal à avoir des choses [end] [start] tu vas passer un sale quart d'heure. [end]
Score: 0.2692307692307693
[start] il est [UNK] de lire le livre [end] [start] il est impatient de lire le livre. [end]
Score: 0.3023255813953489
[start] ils ne peuvent pas vous feu [end] [start] ils ne peuvent pas vous virer. [end]
Score: 0.3902439024390244
[start] ne me dis pas si tu ne veux pas [end] [start] ne me le dis pas si tu ne veux pas. [end]
Score: 0.35555555555555557
[start] combien dargent astu fait la voiture sur ta voiture [end] [start] combien d'argent as-tu claqué pour ta voiture ? [end]
Score: 0.2923076923076923
[start] nous [UNK] pas de ne pas le quitter à nouveau [end] [start] nous essaierons de ne pas laisser cela se reproduire. [end]
Score: 0.2542372881355932
[start] nous navons pas pu acheter des [UNK] pour ne pas le film [end] [start] nous n'avons pas pu ach

## Pre-trained model from Hugging Face

### Model Loading

In [24]:
from transformers import MarianMTModel, MarianTokenizer

# Load pretrained model and tokenizer for English to French translation
model_name = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

### Inference

In [25]:
def translate_with_huggingface(input_sentence):
    # Tokenize the input sentence
    tokenized_input_sentence = tokenizer.encode(input_sentence, return_tensors="pt", padding=True, truncation=True)
    
    # Get the prediction from the model
    translated = model.generate(tokenized_input_sentence, max_length=50, num_beams=4, early_stopping=True)
    
    # Decode the translated sentence
    translated_sentence = tokenizer.decode(translated[0], skip_special_tokens=True)
    return translated_sentence

### Test and Comparison

In [26]:
# Test model on test data
test_eng_texts = [pair[0] for pair in test_pairs]
for input_sentence in test_eng_texts[:5]:  # Adjust to the desired number of sentences
    print("Original English sentence:", input_sentence)
    
    # Translate with Hugging Face model
    hf_translation = translate_with_huggingface(input_sentence)
    print("Hugging Face Translation:", hf_translation)
    print("-" * 50)

Original English sentence: My little finger is swollen.
Hugging Face Translation: Mon petit doigt est gonflé.
--------------------------------------------------
Original English sentence: You'll have a rough time.
Hugging Face Translation: Vous passerez un moment difficile.
--------------------------------------------------
Original English sentence: He is anxious to read the book.
Hugging Face Translation: Il est impatient de lire le livre.
--------------------------------------------------
Original English sentence: They can't fire you.
Hugging Face Translation: Ils ne peuvent pas te virer.
--------------------------------------------------
Original English sentence: Don't tell me if you don't want to.
Hugging Face Translation: Ne me dis pas si tu ne veux pas.
--------------------------------------------------
