---
**Dylan Govender - 221040222 - Assignment III - COMP703**
---





## **1. Importing the Data**

In [1]:
!wget "http://www.manythings.org/anki/fra-eng.zip"
!unzip fra-eng.zip

--2024-05-24 18:08:22--  http://www.manythings.org/anki/fra-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7943074 (7.6M) [application/zip]
Saving to: ‘fra-eng.zip’


2024-05-24 18:08:25 (3.77 MB/s) - ‘fra-eng.zip’ saved [7943074/7943074]

Archive:  fra-eng.zip
  inflating: _about.txt              
  inflating: fra.txt                 


## **2. Data Preprocessing**

In [2]:
import numpy as np

# Configuration
MAX_SAMPLES = 10000  # Number of samples to train on.

input_texts = [] # saves first language sentences (input)
target_texts = [] # saves second language sentences (output)
input_characters = set() # saves unique first language words
target_characters = set() # saves unique second language words

with open("//content//fra.txt", "r", encoding="utf-8") as file:
    lines = file.read().split("\n")
    for line in lines[:min(MAX_SAMPLES, len(lines)-1)]:
        input_text, target_text, _ = line.split("\t")

        # "\t" is start sequence character for target sentence
        # "\n" is the end sequence character for target sentence
        target_text = "\t" + target_text + "\n"
        input_texts.append(input_text)
        target_texts.append(target_text)

        for char in input_text:
            if char not in input_characters:
                input_characters.add(char)

        for char in target_text:
            if char not in target_characters:
                target_characters.add(char)

input_characters = sorted(list(input_characters))
target_characters = sorted(list(target_characters))
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
max_encoder_seq_length = max([len(text) for text in input_texts])
max_decoder_seq_length = max([len(text) for text in target_texts])

print("Number of samples:", len(input_texts))
print("Number of unique input tokens:", num_encoder_tokens)
print("Number of unique output tokens:", num_decoder_tokens)
print("Max sequence length for inputs:", max_encoder_seq_length)
print("Max sequence length for outputs:", max_decoder_seq_length)

input_token_index = dict([(char, i) for i, char in enumerate(input_characters)])
target_token_index = dict([(char, i) for i, char in enumerate(target_characters)])

encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
    dtype="float32",
)

decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype="float32",
)

decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype="float32",
)

for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.0
    encoder_input_data[i, t + 1 :, input_token_index[" "]] = 1.0

    for t, char in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t, target_token_index[char]] = 1.0

        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.0

    decoder_input_data[i, t + 1 :, target_token_index[" "]] = 1.0
    decoder_target_data[i, t:, target_token_index[" "]] = 1.0

Number of samples: 10000
Number of unique input tokens: 70
Number of unique output tokens: 91
Max sequence length for inputs: 14
Max sequence length for outputs: 59


## **3. Building the Model: Bi-GRU Encoder & GRU Decoder**

In [3]:
import tensorflow, keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Reshape, Input, Bidirectional, GRU, Embedding, Dense, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

In [4]:
# Define an input sequence and process it.
LATENT_DIM = 256

# Encoder
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = Bidirectional(GRU(LATENT_DIM, return_state=True))
encoder_outputs, forward_h, backward_h = encoder(encoder_inputs)
state_h = Concatenate()([forward_h, backward_h])

# Decoder
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, num_decoder_tokens))

# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_gru = GRU(2*LATENT_DIM, return_sequences=True, return_state=True)
decoder_outputs, _ = decoder_gru(decoder_inputs, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs,  name="BiGRU_GRU_model")
model.summary()

Model: "BiGRU_GRU_model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, None, 70)]           0         []                            
                                                                                                  
 bidirectional (Bidirection  [(None, 512),                503808    ['input_1[0][0]']             
 al)                          (None, 256),                                                        
                              (None, 256)]                                                        
                                                                                                  
 input_2 (InputLayer)        [(None, None, 91)]           0         []                            
                                                                                    

## **4. Training the Model: Bi-GRU Encoder & GRU Decoder**


In [5]:
BATCH_SIZE = 64  # batch size for training
EPOCHS = 100  # number of epochs to train for

model.compile(
    optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"]
)

model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.2,
)

# save model
model.save("BiGRU_GRU_model.keras")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

## **5. Evaluating the Model: Bi-GRU Encoder & GRU Decoder**

In [6]:
# restore the model and reconstruct the encoder and decoder
model = keras.models.load_model("BiGRU_GRU_model.keras")

# print detailed information about each layer
for index, layer in enumerate(model.layers):
    print(f"Layer index: {index}")
    print(f"Layer name: {layer.name}")
    print(f"Layer type: {type(layer).__name__}")
    print(f"Layer config: {layer.get_config()}")
    print(f"Layer output shape: {layer.output_shape}")
    print()

encoder_inputs = model.input[0]  # input_1
encoder_outputs, forward_h, backward_h = model.layers[1].output  # bidirectional
state_enc_h = Concatenate()([forward_h, backward_h])
encoder_model = Model(encoder_inputs, state_enc_h)

decoder_inputs = model.input[1]  # input_2
decoder_state_input_h = Input(shape=(2 * LATENT_DIM,))
decoder_states_inputs = [decoder_state_input_h]

decoder_gru = model.layers[4] # gru
decoder_outputs, state_dec_h = decoder_gru(
    decoder_inputs, initial_state=decoder_states_inputs
)

decoder_states = [state_dec_h]
decoder_dense = model.layers[5] # dense
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states
)

# Reverse-lookup token index to decode sequences back to something readable.
reverse_input_char_index = dict((i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict((i, char) for char, i in target_token_index.items())

Layer index: 0
Layer name: input_1
Layer type: InputLayer
Layer config: {'batch_input_shape': (None, None, 70), 'dtype': 'float32', 'sparse': False, 'ragged': False, 'name': 'input_1'}
Layer output shape: [(None, None, 70)]

Layer index: 1
Layer name: bidirectional
Layer type: Bidirectional
Layer config: {'name': 'bidirectional', 'trainable': True, 'dtype': 'float32', 'layer': {'module': 'keras.layers', 'class_name': 'GRU', 'config': {'name': 'gru', 'trainable': True, 'dtype': 'float32', 'return_sequences': False, 'return_state': True, 'go_backwards': False, 'stateful': False, 'unroll': False, 'time_major': False, 'units': 256, 'activation': 'tanh', 'recurrent_activation': 'sigmoid', 'use_bias': True, 'kernel_initializer': {'module': 'keras.initializers', 'class_name': 'GlorotUniform', 'config': {'seed': None}, 'registered_name': None}, 'recurrent_initializer': {'module': 'keras.initializers', 'class_name': 'Orthogonal', 'config': {'gain': 1.0, 'seed': None}, 'registered_name': None}, 

In [7]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq, verbose=0)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_decoder_tokens))

    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, target_token_index["\t"]] = 1.0

    # Sampling loop for a batch of sequences
    stop_condition = False
    decoded_sentence = ""
    while not stop_condition:
        output_tokens, h = decoder_model.predict([target_seq] + [states_value], verbose=0)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length or find stop character.
        if sampled_char == "\n" or len(decoded_sentence) > max_decoder_seq_length:
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.0

        # Update states
        states_value = h

    return decoded_sentence

for seq_index in range(20):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index : seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)

    print("Input sentence:", input_texts[seq_index])
    print("Decoded sentence:", decoded_sentence)
    print()

Input sentence: Go.
Decoded sentence: Sartez !


Input sentence: Go.
Decoded sentence: Sartez !


Input sentence: Go.
Decoded sentence: Sartez !


Input sentence: Go.
Decoded sentence: Sartez !


Input sentence: Hi.
Decoded sentence: C'est partit !


Input sentence: Hi.
Decoded sentence: C'est partit !


Input sentence: Run!
Decoded sentence: File !


Input sentence: Run!
Decoded sentence: File !


Input sentence: Run!
Decoded sentence: File !


Input sentence: Run!
Decoded sentence: File !


Input sentence: Run!
Decoded sentence: File !


Input sentence: Run!
Decoded sentence: File !


Input sentence: Run!
Decoded sentence: File !


Input sentence: Run!
Decoded sentence: File !


Input sentence: Run.
Decoded sentence: File !


Input sentence: Run.
Decoded sentence: File !


Input sentence: Run.
Decoded sentence: File !


Input sentence: Run.
Decoded sentence: File !


Input sentence: Run.
Decoded sentence: File !


Input sentence: Run.
Decoded sentence: File !




In [8]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Calculate BLEU score
def bleu_score(TEST_SIZE):
    smoothie = SmoothingFunction().method4
    bleu_scores = []
    unused_chars = ['\t', '\n', ' ', '\u202f']

    for seq_index in range(TEST_SIZE):
        input_seq = encoder_input_data[seq_index:seq_index + 1]
        decoded_sentence = decode_sequence(input_seq)
        reference = list(filter(lambda x:x not in unused_chars, target_texts[seq_index][1:-1]))
        candidate = list(filter(lambda x:x not in unused_chars, decoded_sentence[:-1]))
        score = sentence_bleu(reference, candidate, smoothing_function=smoothie, weights=(1, 0, 0, 0))
        bleu_scores.append(score)

    return np.mean(bleu_scores)

# Evaluate BLEU score on 100/1000 - 1%/10% of the corpus
print("Average BLEU score on 100 samples:", bleu_score(100))
print("Average BLEU score on 1000 samples:", bleu_score(1000))

Average BLEU score on 100 samples: 0.5422193362193363
Average BLEU score on 1000 samples: 0.5871321965722542
