In [1]:
import tarfile

# Step 1: Extract the bz2 file
with tarfile.open("sentences.tar.bz2", "r:bz2") as tar:
    tar.extractall()

EOFError: Compressed file ended before the end-of-stream marker was reached

In [2]:
with tarfile.open("links.tar.bz2", "r:bz2") as tar:
    tar.extractall()

EOFError: Compressed file ended before the end-of-stream marker was reached

In [3]:
# Import modules and libraries
import pandas as pd
import re
import numpy as np


from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, TimeDistributed

## Dataset

In [4]:
# Load the full file
data = pd.read_csv('sentences.csv', sep='\t', names=['id', 'lang', 'sentence'], quoting=3)

# Filter for English and Italian
eng = data[data['lang'] == 'eng']
ita = data[data['lang'] == 'ita']

# Display the lenght file
print("English:", len(eng))
print("Italian:", len(ita))

# Load the links to join sentences
links = pd.read_csv("links.csv", sep="\t", names=["source_id", "target_id"])
print("Links shape:", links.shape)
links.head()


English: 165895
Italian: 14723
Links shape: (3483638, 2)


Unnamed: 0,source_id,target_id
0,1,2481
1,1,5350
2,1,180624
3,1,344899
4,1,380381


In [5]:
eng.head()

Unnamed: 0,id,lang,sentence
1232,1276,eng,Let's try something.
1233,1277,eng,I have to go to sleep.
1235,1280,eng,Today is June 18th and it is Muiriel's birthday!
1236,1282,eng,Muiriel is 20 now.
1237,1283,eng,"The password is ""Muiriel""."


In [6]:
ita.head()

Unnamed: 0,id,lang,sentence
4253,4369,ita,Devo andare a dormire.
4254,4371,ita,Che cos'è?
4255,4373,ita,"La parola d'accesso è ""Muiriel""."
4256,4375,ita,Non cambierà niente.
4257,4376,ita,Costerà trenta euro.


In [7]:
# Keep only sentence ID and sentence text
eng_df = eng[['id', 'sentence']].rename(columns={'id': 'source_id', 'sentence': 'english'})
ita_df = ita[['id', 'sentence']].rename(columns={'id': 'target_id', 'sentence': 'italian'})

print(eng_df.shape, ita_df.shape)

(165895, 2) (14723, 2)


In [8]:
links.head()

Unnamed: 0,source_id,target_id
0,1,2481
1,1,5350
2,1,180624
3,1,344899
4,1,380381


In [9]:
# Merge English sentences with links
eng_links = pd.merge(links, eng_df, on='source_id')

# Merge with Italian sentences
translation_pairs = pd.merge(eng_links, ita_df, on='target_id')

translation_pairs.head()

Unnamed: 0,source_id,target_id,english,italian
0,1276,565618,Let's try something.,Proviamo qualcosa!
1,1277,4369,I have to go to sleep.,Devo andare a dormire.
2,1280,383739,Today is June 18th and it is Muiriel's birthday!,Oggi è il 18 giugno ed è il compleanno di Muir...
3,1280,565612,Today is June 18th and it is Muiriel's birthday!,Oggi è il 18 di giugno ed è il compleanno di M...
4,1282,565608,Muiriel is 20 now.,Muiriel ha 20 anni adesso.


In [10]:
# Keep only the sentence columns
translation_pairs = translation_pairs[['english', 'italian']]

translation_pairs.head()

Unnamed: 0,english,italian
0,Let's try something.,Proviamo qualcosa!
1,I have to go to sleep.,Devo andare a dormire.
2,Today is June 18th and it is Muiriel's birthday!,Oggi è il 18 giugno ed è il compleanno di Muir...
3,Today is June 18th and it is Muiriel's birthday!,Oggi è il 18 di giugno ed è il compleanno di M...
4,Muiriel is 20 now.,Muiriel ha 20 anni adesso.


In [11]:
# Check if there are missing values
translation_pairs.isnull().sum()

Unnamed: 0,0
english,0
italian,0


In [12]:
# See the shape
translation_pairs.shape

(12168, 2)

In [13]:
# Limit our dataset to a sample size like 10000 (apply also the shuffle)
translation_pairs = translation_pairs.sample(n=10000, random_state=42).reset_index(drop=True)
print('The size now is: ', translation_pairs.shape)
translation_pairs.head()

The size now is:  (10000, 2)


Unnamed: 0,english,italian
0,I am afraid to go.,Ho paura ad andare.
1,There are movements to try to ban TV advertising.,Ci sono movimenti che provano a bandire la pub...
2,Don't apply that nickname to me.,Non darmi quel soprannome.
3,The patient was in danger.,Il paziente era in pericolo.
4,He refused to take the bribe.,Rifiutò di prendere la bustarella.


## Preprocessing
Prepare text data so it can be used in a Neural Machine Translation (NMT) model


In [14]:
# Define the function to clean sentences
def clean_sentence(sentence):
    sentence = sentence.lower().strip()                     # Convert to Lowercase plus remove spaces at the beginning and at the end
    sentence = re.sub(r"[^a-zA-Zàèéìòùç' ]", "", sentence)  # remove numbers, symbols
    return sentence

# Apply function
translation_pairs['english'] = translation_pairs['english'].apply(clean_sentence)
translation_pairs['italian'] = translation_pairs['italian'].apply(clean_sentence)

translation_pairs.head()

Unnamed: 0,english,italian
0,i am afraid to go,ho paura ad andare
1,there are movements to try to ban tv advertising,ci sono movimenti che provano a bandire la pub...
2,don't apply that nickname to me,non darmi quel soprannome
3,the patient was in danger,il paziente era in pericolo
4,he refused to take the bribe,rifiutò di prendere la bustarella


In [15]:
# Add <start> and <end> tokens to Italian sentences (target). This helps the decoder know where to start and stop translation
# The decoder is trained to predict the next word step by step.
translation_pairs['italian_input'] = '<start> ' + translation_pairs['italian']
translation_pairs['italian_output'] = translation_pairs['italian'] + ' <end>'

translation_pairs.head()

Unnamed: 0,english,italian,italian_input,italian_output
0,i am afraid to go,ho paura ad andare,<start> ho paura ad andare,ho paura ad andare <end>
1,there are movements to try to ban tv advertising,ci sono movimenti che provano a bandire la pub...,<start> ci sono movimenti che provano a bandir...,ci sono movimenti che provano a bandire la pub...
2,don't apply that nickname to me,non darmi quel soprannome,<start> non darmi quel soprannome,non darmi quel soprannome <end>
3,the patient was in danger,il paziente era in pericolo,<start> il paziente era in pericolo,il paziente era in pericolo <end>
4,he refused to take the bribe,rifiutò di prendere la bustarella,<start> rifiutò di prendere la bustarella,rifiutò di prendere la bustarella <end>


### Tokenizer
Encoder Input	--> in this case English sentence	(What the model uses to understand the meaning)

Decoder Input	--> in this case Italian with <start>	(What the model sees to predict the next word)

Decoder Output --> in this case Italian with <end>	(What the model tries to predict one word at a time)

In [16]:
# Create tokenizers
eng_tokenizer = Tokenizer(filters='') # filters='' -->  Don't remove anything
ita_tokenizer = Tokenizer(filters='')

# Fit on english and italian input
eng_tokenizer.fit_on_texts(translation_pairs['english'])
ita_tokenizer.fit_on_texts(translation_pairs['italian_input'])

# Convert text to sequences of all
eng_sequences = eng_tokenizer.texts_to_sequences(translation_pairs['english'])
ita_input_sequences = ita_tokenizer.texts_to_sequences(translation_pairs['italian_input'])
ita_output_sequences = ita_tokenizer.texts_to_sequences(translation_pairs['italian_output'])

In [17]:
print("The length of all sequences are: \nEng ", len(eng_sequences), "\nIta input ", len(ita_input_sequences), "\nIta Output ",len(ita_output_sequences))
print("They are lists of list: \n", "First sentence length", len(ita_output_sequences[0]), "\n", "Second sentence length" , len(ita_output_sequences[1]))
print(ita_output_sequences[:3])

The length of all sequences are: 
Eng  10000 
Ita input  10000 
Ita Output  10000
They are lists of list: 
 First sentence length 4 
 Second sentence length 10
[[16, 220, 98, 55], [44, 11, 3676, 9, 3677, 7, 3678, 3, 2503, 2504], [6, 920, 140, 2505]]


In [18]:
# Analyze tokenizer
print("English --> Type: ", type(eng_tokenizer.word_index), "length: ", len(eng_tokenizer.word_index))
print("English --> Type: ", type(ita_tokenizer.word_index), "length: ", len(ita_tokenizer.word_index))

English --> Type:  <class 'dict'> length:  5235
English --> Type:  <class 'dict'> length:  7368


In [19]:
# If they are dictionaries we can convert into list and print them

# English words and values
en_words_list = list(eng_tokenizer.word_index.keys())
en_values_list = list(eng_tokenizer.word_index.values())
print("Enslish words: ", en_words_list[:10])
print("Respective values: ", en_values_list[:10])

# Italian words and values
ita_words_list = list(ita_tokenizer.word_index.keys())
ita_values_list = list(ita_tokenizer.word_index.values())
print("Italian words: ", ita_words_list[:10])
print("Respective values: ", ita_values_list[:10])

Enslish words:  ['the', 'i', 'you', 'to', 'a', 'is', 'he', 'in', 'of', 'my']
Respective values:  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
Italian words:  ['<start>', 'è', 'la', 'il', 'di', 'non', 'a', 'un', 'che', 'ha']
Respective values:  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [20]:
# Create the vocabulary size
# In Keras, token indices start at 1 while python start at 0. Add 1 so that index 0 is reserved for padding
eng_vocab_size = len(eng_tokenizer.word_index) + 1
ita_vocab_size = len(ita_tokenizer.word_index) + 1

print("English vocab size:", eng_vocab_size)
print("Italian vocab size:", ita_vocab_size)

English vocab size: 5236
Italian vocab size: 7369


In [21]:
eng_sequences[0]

[2, 61, 256, 4, 41]

In [22]:
ita_input_sequences[0]

[1, 16, 220, 98, 55]

In [23]:
ita_output_sequences[0]

[16, 220, 98, 55]

In [24]:
# We need lead all sentences with the same length

# Find the max length for english and italian sentences
max_eng_len = max([len(seq) for seq in eng_sequences])
max_ita_len = max([len(seq) for seq in ita_input_sequences])
print(f"The max length of a phrase of english is {max_eng_len}")
print(f"The max length of a phrase of italian is {max_ita_len}")

# Apply padding to lead all with the same length
eng_sequences = pad_sequences(eng_sequences, maxlen=max_eng_len, padding='post')         # padding='post' --> Add zeros after the sentence
ita_input_sequences = pad_sequences(ita_input_sequences, maxlen=max_ita_len, padding='post')
ita_output_sequences = pad_sequences(ita_output_sequences, maxlen=max_ita_len, padding='post')


The max length of a phrase of english is 67
The max length of a phrase of italian is 74


In [25]:
print(f"Sample English input --> Length {len(eng_sequences[0])}\n {eng_sequences[0]} \n")
print(f"Sample Italian input (decoder input) --> Length {len(ita_input_sequences[0])}\n {ita_input_sequences[0]} \n")
print(f"Sample Italian output (decoder output) --> Length {len(ita_output_sequences[0])}\n {ita_output_sequences[0]} \n")

Sample English input --> Length 67
 [  2  61 256   4  41   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0] 

Sample Italian input (decoder input) --> Length 74
 [  1  16 220  98  55   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0] 

Sample Italian output (decoder output) --> Length 74
 [ 16 220  98  55   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0

## Model

In [26]:
# Create the encoder LSTM

# Encoder input (Accepts sequences of any length)
encoder_inputs = Input(shape=(None,), name="encoder_inputs")

# Embedding layer --> Transforms words (integers) into dense vectors of meaning (word embeddings)
# The Embedding layer takes each word (represented as an integer from the Tokenizer) and maps it to a vector of 256 real numbers
encoder_embedding = Embedding(input_dim=eng_vocab_size, output_dim=256, name="encoder_embedding")(encoder_inputs)

# LSTM layer -->  keeps the final state (summary of the sentence)
encoder_lstm, state_h, state_c = LSTM(256, return_state=True, name="encoder_lstm")(encoder_embedding)

# encoder_lstm --> is the output at each time step from the LSTM and we can ignore it in encoder

# These two vectors state_h (hidden) and state_c (cell) represent the context vector to pass to the decoder
encoder_states = [state_h, state_c]

In [27]:
# Create the decoder LSTM

# Decoder input (Italian with <start>)
decoder_inputs = Input(shape=(None,), name="decoder_inputs")

# Decoder embedding (same as encoder, convert word tokens into dense vectors)
decoder_embedding_layer = Embedding(input_dim=ita_vocab_size, output_dim=256, name="decoder_embedding")
decoder_embedding = decoder_embedding_layer(decoder_inputs)

# LSTM: we initialize with encoder's final states (state_h, state_c)
decoder_lstm, _, _ = LSTM(256, return_sequences=True, return_state=True, name="decoder_lstm")(decoder_embedding, initial_state=encoder_states)

# Output layer: converts LSTM output into predicted words (using softmax)
# TimeDistributed() --> Apply the dense layer to each time step individually (each word in output sentence)
decoder_dense = TimeDistributed(Dense(ita_vocab_size, activation="softmax"), name="decoder_output")
decoder_outputs = decoder_dense(decoder_lstm)


In [28]:
# Build the full model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile it
model.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# Check model structure
model.summary()

In [29]:
eng_sequences.shape

(10000, 67)

In [30]:
ita_input_sequences.shape

(10000, 74)

In [31]:
ita_output_sequences.shape

(10000, 74)

In [32]:
# Prepare Training Data

# Convert sequences to NumPy arrays
encoder_input_data = np.array(eng_sequences)
decoder_input_data = np.array(ita_input_sequences)

# Reshape decoder target data to be compatible with sparse_categorical_crossentropy
decoder_target_data = np.expand_dims(ita_output_sequences, -1)

print("Encoder input shape:", encoder_input_data.shape)
print("Decoder input shape:", decoder_input_data.shape)
print("Decoder target shape:", decoder_target_data.shape)


Encoder input shape: (10000, 67)
Decoder input shape: (10000, 74)
Decoder target shape: (10000, 74, 1)


In [33]:
# Train the model
history = model.fit(
    [encoder_input_data, decoder_input_data],        # Inputs: English + Italian <start>
    decoder_target_data,                             # 	Target: Italian <end> (the words we want the model to predict)
    batch_size=64,
    epochs=15,
    validation_split=0.2
)

Epoch 1/15
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m621s[0m 5s/step - accuracy: 0.8800 - loss: 2.2379 - val_accuracy: 0.9198 - val_loss: 0.6023
Epoch 2/15
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m561s[0m 4s/step - accuracy: 0.9213 - loss: 0.5868 - val_accuracy: 0.9205 - val_loss: 0.5939
Epoch 3/15
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m520s[0m 4s/step - accuracy: 0.9208 - loss: 0.5832 - val_accuracy: 0.9205 - val_loss: 0.5901
Epoch 4/15
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m600s[0m 4s/step - accuracy: 0.9216 - loss: 0.5731 - val_accuracy: 0.9204 - val_loss: 0.5884
Epoch 5/15
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m550s[0m 4s/step - accuracy: 0.9214 - loss: 0.5745 - val_accuracy: 0.9205 - val_loss: 0.5878
Epoch 6/15
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m557s[0m 4s/step - accuracy: 0.9209 - loss: 0.5756 - val_accuracy: 0.9205 - val_loss: 0.5874
Epoch 7/15
[1m125/125

In [39]:
history.history

{'accuracy': [0.9124780893325806,
  0.921221137046814,
  0.9213108420372009,
  0.9212617874145508,
  0.921302318572998,
  0.9213293790817261,
  0.9212837815284729,
  0.9212886691093445,
  0.9213125705718994,
  0.9213818907737732,
  0.9214849472045898,
  0.9215676784515381,
  0.9216150045394897,
  0.9217128157615662,
  0.9218074083328247],
 'loss': [1.0678175687789917,
  0.5860112905502319,
  0.5789924263954163,
  0.5760493278503418,
  0.5740000009536743,
  0.5722640156745911,
  0.5704774856567383,
  0.5676262378692627,
  0.5649533867835999,
  0.5616636872291565,
  0.5583800673484802,
  0.5550853610038757,
  0.5520541667938232,
  0.5494454503059387,
  0.546970546245575],
 'val_accuracy': [0.9198377728462219,
  0.9204662442207336,
  0.9204662442207336,
  0.9204054474830627,
  0.9204662442207336,
  0.9205135703086853,
  0.920473039150238,
  0.9205541014671326,
  0.9206081032752991,
  0.9205608367919922,
  0.9207634329795837,
  0.9207838177680969,
  0.9209932684898376,
  0.9207972884178162

### Model for inference

In [47]:
# Encoder model (for inference).
# It takes the English input, returns [state_h, state_c].
encoder_model = Model(encoder_inputs, encoder_states)

In [50]:
# Inputs at inference time
decoder_state_input_h = Input(shape=(256,))
decoder_state_input_c = Input(shape=(256,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Reuse decoder embedding layer
decoder_emb_infer = decoder_embedding_layer(decoder_inputs)

# Run decoder with passed states
decoder_lstm_out, state_h, state_c = decoder_lstm(decoder_emb_infer, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]

# Final dense layer (same as training)
decoder_outputs_infer = decoder_dense(decoder_lstm_out)

# Final decoder model
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs_infer] + decoder_states
)


NameError: name 'decoder_embedding_layer' is not defined

In [None]:
def translate_sentence(input_seq):
    # Tokenize the English sentence
    seq = eng_tokenizer.texts_to_sequences([input_seq.lower()])
    seq = pad_sequences(seq, maxlen=max_eng_len, padding='post')

    # Encode input sentence
    states_value = encoder_model.predict(seq)

    # Start decoding with "<start>"
    target_seq = np.array([[ita_tokenizer.word_index['<start>']]])

    stop_condition = False
    decoded_sentence = ''

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Get most likely next word
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = ita_tokenizer.index_word.get(sampled_token_index, '')

        if (sampled_word == '<end>' or len(decoded_sentence.split()) > max_ita_len):
            stop_condition = True
        else:
            decoded_sentence += ' ' + sampled_word

            # Update target sequence
            target_seq = np.array([[sampled_token_index]])
            states_value = [h, c]

    return decoded_sentence.strip()


In [None]:
print("EN: How are you?")
print("IT:", translate_sentence("How are you?"))

print("\nEN: I like football.")
print("IT:", translate_sentence("I like football."))

print("\nEN: I am tired today.")
print("IT:", translate_sentence("I am tired today."))


In [None]:
from nltk.translate.bleu_score import sentence_bleu

# Example:
reference = [['sto', 'bene']]  # True translation
candidate = ['sto', 'ok']     # Model translation
score = sentence_bleu(reference, candidate)
print("BLEU score:", score)
