**This code is Seq2Seq (Machine Translation) for English to French language**

**Encoder(LSTM) - Decoder(LSTM)**

**BLEU, ROUGE, METEOR**

**Mount Drive**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Import Library**

In [2]:
# !python -m spacy download en_core_web_sm
# !python -m spacy download fr_core_news_sm
# !pip install datasets
# !pip install torchtext==0.14.0
# !pip install rouge-score

In [3]:
import os, sys, re
import time, math
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import spacy, random
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import tqdm
import torch
import torchtext
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence
import torch.optim as optim
import torch.nn.functional as F
from torchsummary import summary
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

**Step 2: Load Dataset**

In [5]:
# Load your dataset from CSV
data = pd.read_csv('/content/drive/MyDrive/Interview/eng-french.csv')
data.rename(columns={'English words/sentences': 'en', 'French words/sentences': 'fr'}, inplace=True)
data.head(5)

Unnamed: 0,en,fr
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [6]:
data.shape

(175621, 2)

In [7]:
train_df, temp_df = train_test_split(data, test_size=0.3, random_state=42)
valid_df, test_df = train_test_split(temp_df, test_size=0.33, random_state=42)  # 0.33 * 0.3 ≈ 0.1

In [8]:
from datasets import Dataset

train_dataset = Dataset.from_pandas(train_df)
valid_dataset = Dataset.from_pandas(valid_df)
test_dataset = Dataset.from_pandas(test_df)

In [9]:
train_dataset = train_dataset.remove_columns("__index_level_0__")
valid_dataset = valid_dataset.remove_columns("__index_level_0__")
test_dataset = test_dataset.remove_columns("__index_level_0__")

In [10]:
from datasets import DatasetDict

In [11]:
# Create a DatasetDict
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': valid_dataset,
    'test': test_dataset
})

In [12]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['en', 'fr'],
        num_rows: 122934
    })
    validation: Dataset({
        features: ['en', 'fr'],
        num_rows: 35300
    })
    test: Dataset({
        features: ['en', 'fr'],
        num_rows: 17387
    })
})

In [13]:
train_data, valid_data, test_data = (
    dataset_dict["train"],
    dataset_dict["validation"],
    dataset_dict["test"],
)

In [14]:
train_data[0]

{'en': "We've been spotted.", 'fr': 'Nous avons été repérés.'}

In [15]:
train_data.shape, valid_data.shape, test_data.shape

((122934, 2), (35300, 2), (17387, 2))

In [16]:
# Load SpaCy models
en_nlp = spacy.load('en_core_web_sm')
fr_nlp = spacy.load('fr_core_news_sm')

In [17]:
string = "What a lovely day it is today!"

[token.text for token in en_nlp.tokenizer(string)]

['What', 'a', 'lovely', 'day', 'it', 'is', 'today', '!']

In [18]:
def tokenize_example(example, en_nlp, fr_nlp, max_length, lower, sos_token, eos_token):
    en_tokens = [token.text for token in en_nlp.tokenizer(example["en"])][:max_length]
    fr_tokens = [token.text for token in fr_nlp.tokenizer(example["fr"])][:max_length]
    if lower:
        en_tokens = [token.lower() for token in en_tokens]
        fr_tokens = [token.lower() for token in fr_tokens]
    en_tokens = [sos_token] + en_tokens + [eos_token]
    fr_tokens = [sos_token] + fr_tokens + [eos_token]
    return {"en_tokens": en_tokens, "fr_tokens": fr_tokens}

In [19]:
max_length = 1_000
lower = True
sos_token = "<sos>"
eos_token = "<eos>"

fn_kwargs = {
    "en_nlp": en_nlp,
    "fr_nlp": fr_nlp,
    "max_length": max_length,
    "lower": lower,
    "sos_token": sos_token,
    "eos_token": eos_token,
}

train_data = train_data.map(tokenize_example, fn_kwargs=fn_kwargs)
valid_data = valid_data.map(tokenize_example, fn_kwargs=fn_kwargs)
test_data = test_data.map(tokenize_example, fn_kwargs=fn_kwargs)

Map:   0%|          | 0/122934 [00:00<?, ? examples/s]

Map:   0%|          | 0/35300 [00:00<?, ? examples/s]

Map:   0%|          | 0/17387 [00:00<?, ? examples/s]

In [20]:
train_data[0]

{'en': "We've been spotted.",
 'fr': 'Nous avons été repérés.',
 'en_tokens': ['<sos>', 'we', "'ve", 'been', 'spotted', '.', '<eos>'],
 'fr_tokens': ['<sos>', 'nous', 'avons', 'été', 'repérés', '.', '<eos>']}

In [21]:
min_freq = 2
unk_token = "<unk>"
pad_token = "<pad>"

special_tokens = [
    unk_token,
    pad_token,
    sos_token,
    eos_token,
]

en_vocab = torchtext.vocab.build_vocab_from_iterator(
    train_data["en_tokens"],
    min_freq=min_freq,
    specials=special_tokens,
)

fr_vocab = torchtext.vocab.build_vocab_from_iterator(
    train_data["fr_tokens"],
    min_freq=min_freq,
    specials=special_tokens,
)

In [22]:
en_vocab.get_itos()[:10]

['<unk>', '<pad>', '<sos>', '<eos>', '.', 'i', 'you', 'to', 'the', '?']

In [23]:
en_vocab.get_itos()[9]

'?'

In [24]:
fr_vocab.get_itos()[:10]

['<unk>', '<pad>', '<sos>', '<eos>', '.', 'je', 'de', '?', 'pas', 'est']

In [25]:
en_vocab.get_stoi()["the"]

8

In [26]:
en_vocab["the"]

8

In [27]:
len(en_vocab), len(fr_vocab)

(8513, 13285)

In [28]:
"the" in en_vocab

True

In [29]:
"The" in en_vocab

False

In [30]:
assert en_vocab[unk_token] == fr_vocab[unk_token]
assert en_vocab[pad_token] == fr_vocab[pad_token]

unk_index = en_vocab[unk_token]
pad_index = en_vocab[pad_token]

In [31]:
en_vocab.set_default_index(unk_index)
fr_vocab.set_default_index(unk_index)

In [32]:
en_vocab["The"]

0

In [33]:
en_vocab.get_itos()[0]

'<unk>'

In [34]:
tokens = ["i", "love", "watching", "crime", "shows"]

In [35]:
en_vocab.lookup_indices(tokens)

[5, 139, 609, 1176, 2684]

In [36]:
en_vocab.lookup_tokens(en_vocab.lookup_indices(tokens))

['i', 'love', 'watching', 'crime', 'shows']

In [37]:
def numericalize_example(example, en_vocab, fr_vocab):
    en_ids = en_vocab.lookup_indices(example["en_tokens"])
    fr_ids = fr_vocab.lookup_indices(example["fr_tokens"])
    return {"en_ids": en_ids, "fr_ids": fr_ids}

In [38]:
fn_kwargs = {"en_vocab": en_vocab, "fr_vocab": fr_vocab}

train_data = train_data.map(numericalize_example, fn_kwargs=fn_kwargs)
valid_data = valid_data.map(numericalize_example, fn_kwargs=fn_kwargs)
test_data = test_data.map(numericalize_example, fn_kwargs=fn_kwargs)

Map:   0%|          | 0/122934 [00:00<?, ? examples/s]

Map:   0%|          | 0/35300 [00:00<?, ? examples/s]

Map:   0%|          | 0/17387 [00:00<?, ? examples/s]

In [39]:
train_data[0]

{'en': "We've been spotted.",
 'fr': 'Nous avons été repérés.',
 'en_tokens': ['<sos>', 'we', "'ve", 'been', 'spotted', '.', '<eos>'],
 'fr_tokens': ['<sos>', 'nous', 'avons', 'été', 'repérés', '.', '<eos>'],
 'en_ids': [2, 24, 54, 99, 3510, 4, 3],
 'fr_ids': [2, 27, 98, 80, 0, 4, 3]}

In [40]:
en_vocab.lookup_tokens(train_data[0]["en_ids"])

['<sos>', 'we', "'ve", 'been', 'spotted', '.', '<eos>']

In [41]:
data_type = "torch"
format_columns = ["en_ids", "fr_ids"]

train_data = train_data.with_format(
    type=data_type, columns=format_columns, output_all_columns=True
)

valid_data = valid_data.with_format(
    type=data_type,
    columns=format_columns,
    output_all_columns=True,
)

test_data = test_data.with_format(
    type=data_type,
    columns=format_columns,
    output_all_columns=True,
)

In [42]:
train_data[0]

{'en_ids': tensor([   2,   24,   54,   99, 3510,    4,    3]),
 'fr_ids': tensor([ 2, 27, 98, 80,  0,  4,  3]),
 'en': "We've been spotted.",
 'fr': 'Nous avons été repérés.',
 'en_tokens': ['<sos>', 'we', "'ve", 'been', 'spotted', '.', '<eos>'],
 'fr_tokens': ['<sos>', 'nous', 'avons', 'été', 'repérés', '.', '<eos>']}

In [43]:
type(train_data[0]["en_ids"])

torch.Tensor

In [44]:
def get_collate_fn(pad_index):
    def collate_fn(batch):
        batch_en_ids = [example["en_ids"] for example in batch]
        batch_fr_ids = [example["fr_ids"] for example in batch]
        batch_en_ids = nn.utils.rnn.pad_sequence(batch_en_ids, padding_value=pad_index)
        batch_fr_ids = nn.utils.rnn.pad_sequence(batch_fr_ids, padding_value=pad_index)
        batch = {
            "en_ids": batch_en_ids,
            "fr_ids": batch_fr_ids,
        }
        return batch

    return collate_fn

In [45]:
def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    collate_fn = get_collate_fn(pad_index)
    data_loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
        shuffle=shuffle,
    )
    return data_loader

In [46]:
batch_size = 128

train_data_loader = get_data_loader(train_data, batch_size, pad_index, shuffle=True)
valid_data_loader = get_data_loader(valid_data, batch_size, pad_index)
test_data_loader = get_data_loader(test_data, batch_size, pad_index)

In [47]:
class Encoder(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        # src = [src length, batch size]
        embedded = self.dropout(self.embedding(src))
        # embedded = [src length, batch size, embedding dim]
        outputs, (hidden, cell) = self.rnn(embedded)
        # outputs = [src length, batch size, hidden dim * n directions]
        # hidden = [n layers * n directions, batch size, hidden dim]
        # cell = [n layers * n directions, batch size, hidden dim]
        # outputs are always from the top hidden layer
        return hidden, cell

In [48]:
class Decoder(nn.Module):
    def __init__(self, output_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout)
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        # input = [batch size]
        # hidden = [n layers * n directions, batch size, hidden dim]
        # cell = [n layers * n directions, batch size, hidden dim]
        # n directions in the decoder will both always be 1, therefore:
        # hidden = [n layers, batch size, hidden dim]
        # context = [n layers, batch size, hidden dim]
        input = input.unsqueeze(0)
        # input = [1, batch size]
        embedded = self.dropout(self.embedding(input))
        # embedded = [1, batch size, embedding dim]
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        # output = [seq length, batch size, hidden dim * n directions]
        # hidden = [n layers * n directions, batch size, hidden dim]
        # cell = [n layers * n directions, batch size, hidden dim]
        # seq length and n directions will always be 1 in this decoder, therefore:
        # output = [1, batch size, hidden dim]
        # hidden = [n layers, batch size, hidden dim]
        # cell = [n layers, batch size, hidden dim]
        prediction = self.fc_out(output.squeeze(0))
        # prediction = [batch size, output dim]
        return prediction, hidden, cell

In [49]:
"""
import torch
import torch.nn as nn

class Decoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout):
        super(Decoder, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # Embedding layer for decoder
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # LSTM layer for decoder
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, dropout=dropout)

        # Fully connected layer for output
        self.fc_out = nn.Linear(hidden_dim, vocab_size)

        # Dropout layer
        self.dropout = nn.Dropout(dropout)

    def init_hidden_state(self, batch_size, device):
        # Initialize the hidden state (h) and cell state (c) to zeros for multi-layer LSTM
        h_0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device)
        c_0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device)
        return h_0, c_0

    def forward(self, input, hidden, encoder_outputs=None):
        # Pass input through embedding layer
        embedded = self.dropout(self.embedding(input))

        # Pass through LSTM layer
        output, hidden = self.rnn(embedded, hidden)

        # Pass output through fully connected layer
        output = self.fc_out(output)

        return output, hidden
"""

'\nimport torch\nimport torch.nn as nn\n\nclass Decoder(nn.Module):\n    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout):\n        super(Decoder, self).__init__()\n        self.hidden_dim = hidden_dim\n        self.num_layers = num_layers\n\n        # Embedding layer for decoder\n        self.embedding = nn.Embedding(vocab_size, embedding_dim)\n\n        # LSTM layer for decoder\n        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, dropout=dropout)\n\n        # Fully connected layer for output\n        self.fc_out = nn.Linear(hidden_dim, vocab_size)\n\n        # Dropout layer\n        self.dropout = nn.Dropout(dropout)\n\n    def init_hidden_state(self, batch_size, device):\n        # Initialize the hidden state (h) and cell state (c) to zeros for multi-layer LSTM\n        h_0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device)\n        c_0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device)\n 

In [50]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        assert (
            encoder.hidden_dim == decoder.hidden_dim
        ), "Hidden dimensions of encoder and decoder must be equal!"
        assert (
            encoder.n_layers == decoder.n_layers
        ), "Encoder and decoder must have equal number of layers!"

    def forward(self, src, trg, teacher_forcing_ratio):
        # src = [src length, batch size]
        # trg = [trg length, batch size]
        # teacher_forcing_ratio is probability to use teacher forcing
        # e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time
        batch_size = trg.shape[1]
        trg_length = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        # tensor to store decoder outputs
        outputs = torch.zeros(trg_length, batch_size, trg_vocab_size).to(self.device)
        # last hidden state of the encoder is used as the initial hidden state of the decoder
        hidden, cell = self.encoder(src)
        # hidden = [n layers * n directions, batch size, hidden dim]
        # cell = [n layers * n directions, batch size, hidden dim]
        # first input to the decoder is the <sos> tokens
        input = trg[0, :]
        # input = [batch size]
        for t in range(1, trg_length):
            # insert input token embedding, previous hidden and previous cell states
            # receive output tensor (predictions) and new hidden and cell states
            output, hidden, cell = self.decoder(input, hidden, cell)
            # output = [batch size, output dim]
            # hidden = [n layers, batch size, hidden dim]
            # cell = [n layers, batch size, hidden dim]
            # place predictions in a tensor holding predictions for each token
            outputs[t] = output
            # decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            # get the highest predicted token from our predictions
            top1 = output.argmax(1)
            # if teacher forcing, use actual next token as next input
            # if not, use predicted token
            input = trg[t] if teacher_force else top1
            # input = [batch size]
        return outputs

In [51]:
input_dim = len(fr_vocab)
output_dim = len(en_vocab)
encoder_embedding_dim = 256
decoder_embedding_dim = 256
hidden_dim = 512
n_layers = 2
encoder_dropout = 0.5
decoder_dropout = 0.5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

encoder = Encoder(
    input_dim,
    encoder_embedding_dim,
    hidden_dim,
    n_layers,
    encoder_dropout,
)

decoder = Decoder(
    output_dim,
    decoder_embedding_dim,
    hidden_dim,
    n_layers,
    decoder_dropout,
)

model = Seq2Seq(encoder, decoder, device).to(device)
print(model)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(13285, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(8513, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=8513, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)


In [52]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)


model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(13285, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(8513, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=8513, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [53]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 17,303,873 trainable parameters


In [54]:
optimizer = optim.Adam(model.parameters())

In [55]:
criterion = nn.CrossEntropyLoss(ignore_index=pad_index)

In [56]:
def train_fn(
    model, data_loader, optimizer, criterion, clip, teacher_forcing_ratio, device
):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(data_loader):
        src = batch["fr_ids"].to(device)
        trg = batch["en_ids"].to(device)
        # src = [src length, batch size]
        # trg = [trg length, batch size]
        optimizer.zero_grad()
        output = model(src, trg, teacher_forcing_ratio)
        # output = [trg length, batch size, trg vocab size]
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        # output = [(trg length - 1) * batch size, trg vocab size]
        trg = trg[1:].view(-1)
        # trg = [(trg length - 1) * batch size]
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(data_loader)

In [57]:
def evaluate_fn(model, data_loader, criterion, device):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for i, batch in enumerate(data_loader):
            src = batch["fr_ids"].to(device)
            trg = batch["en_ids"].to(device)
            # src = [src length, batch size]
            # trg = [trg length, batch size]
            output = model(src, trg, 0)  # turn off teacher forcing
            # output = [trg length, batch size, trg vocab size]
            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            # output = [(trg length - 1) * batch size, trg vocab size]
            trg = trg[1:].view(-1)
            # trg = [(trg length - 1) * batch size]
            loss = criterion(output, trg)
            epoch_loss += loss.item()
    return epoch_loss / len(data_loader)

In [58]:
n_epochs = 10
clip = 1.0
teacher_forcing_ratio = 0.5

best_valid_loss = float("inf")

for epoch in tqdm.tqdm(range(n_epochs)):

  start_time = time.time()

  train_loss = train_fn(
      model,
      train_data_loader,
      optimizer,
      criterion,
      clip,
      teacher_forcing_ratio,
      device,
  )
  valid_loss = evaluate_fn(
      model,
      valid_data_loader,
      criterion,
      device,
  )

  end_time = time.time()
  epoch_mins, epoch_secs = divmod(end_time - start_time, 60)

  if valid_loss < best_valid_loss:
      best_valid_loss = valid_loss
      torch.save(model.state_dict(), "/content/drive/MyDrive/Interview/seq2seq.pt")
  print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s', flush=True)
  print(f"\tTrain Loss: {train_loss:7.3f} | Train PPL: {np.exp(train_loss):7.3f}")
  print(f"\tValid Loss: {valid_loss:7.3f} | Valid PPL: {np.exp(valid_loss):7.3f}")
  # Additional print statement to show loss of each epoch
  print(f'Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Val Loss: {valid_loss:.3f}')

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch: 01 | Time: 3.0m 29.23480463027954s


 10%|█         | 1/10 [03:33<31:58, 213.14s/it]

	Train Loss:   4.368 | Train PPL:  78.900
	Valid Loss:   3.998 | Valid PPL:  54.515
Epoch: 01 | Train Loss: 4.368 | Val Loss: 3.998
Epoch: 02 | Time: 3.0m 28.83294105529785s


 20%|██        | 2/10 [07:02<28:05, 210.72s/it]

	Train Loss:   3.290 | Train PPL:  26.856
	Valid Loss:   3.338 | Valid PPL:  28.151
Epoch: 02 | Train Loss: 3.290 | Val Loss: 3.338
Epoch: 03 | Time: 3.0m 26.149043083190918s


 30%|███       | 3/10 [10:28<24:21, 208.78s/it]

	Train Loss:   2.682 | Train PPL:  14.616
	Valid Loss:   2.861 | Valid PPL:  17.472
Epoch: 03 | Train Loss: 2.682 | Val Loss: 2.861
Epoch: 04 | Time: 3.0m 27.628522634506226s


 40%|████      | 4/10 [13:56<20:50, 208.40s/it]

	Train Loss:   2.258 | Train PPL:   9.563
	Valid Loss:   2.589 | Valid PPL:  13.320
Epoch: 04 | Train Loss: 2.258 | Val Loss: 2.589
Epoch: 05 | Time: 3.0m 27.748502254486084s


 50%|█████     | 5/10 [17:24<17:21, 208.23s/it]

	Train Loss:   1.954 | Train PPL:   7.056
	Valid Loss:   2.393 | Valid PPL:  10.946
Epoch: 05 | Train Loss: 1.954 | Val Loss: 2.393
Epoch: 06 | Time: 3.0m 36.78524613380432s


 60%|██████    | 6/10 [21:01<14:04, 211.21s/it]

	Train Loss:   1.731 | Train PPL:   5.645
	Valid Loss:   2.288 | Valid PPL:   9.852
Epoch: 06 | Train Loss: 1.731 | Val Loss: 2.288
Epoch: 07 | Time: 3.0m 25.604087591171265s


 70%|███████   | 7/10 [24:27<10:28, 209.45s/it]

	Train Loss:   1.553 | Train PPL:   4.726
	Valid Loss:   2.177 | Valid PPL:   8.819
Epoch: 07 | Train Loss: 1.553 | Val Loss: 2.177
Epoch: 08 | Time: 3.0m 25.952284574508667s


 80%|████████  | 8/10 [27:53<06:56, 208.42s/it]

	Train Loss:   1.421 | Train PPL:   4.140
	Valid Loss:   2.124 | Valid PPL:   8.368
Epoch: 08 | Train Loss: 1.421 | Val Loss: 2.124
Epoch: 09 | Time: 3.0m 27.102211236953735s


 90%|█████████ | 9/10 [31:20<03:28, 208.07s/it]

	Train Loss:   1.310 | Train PPL:   3.705
	Valid Loss:   2.065 | Valid PPL:   7.888
Epoch: 09 | Train Loss: 1.310 | Val Loss: 2.065
Epoch: 10 | Time: 3.0m 24.176708221435547s


100%|██████████| 10/10 [34:45<00:00, 208.52s/it]

	Train Loss:   1.212 | Train PPL:   3.362
	Valid Loss:   2.030 | Valid PPL:   7.615
Epoch: 10 | Train Loss: 1.212 | Val Loss: 2.030





**Test Accuracy**

In [59]:
def evaluate_fn(model, data_loader, criterion, device):
    model.eval()  # Set the model to evaluation mode
    epoch_loss = 0
    correct_preds = 0
    total_preds = 0

    with torch.no_grad():  # No need to compute gradients during evaluation
        for batch in data_loader:
            src = batch["fr_ids"].to(device)  # French sentences
            trg = batch["en_ids"].to(device)  # English sentences (target)

            # Get model predictions
            output = model(src, trg, teacher_forcing_ratio=0)  # No teacher forcing during evaluation

            # Calculate the loss
            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)  # Remove <sos> token
            trg = trg[1:].view(-1)  # Remove <sos> token from target

            loss = criterion(output, trg)
            epoch_loss += loss.item()

            # Calculate accuracy
            pred = output.argmax(1)  # Get the index of the highest prediction
            correct_preds += (pred == trg).sum().item()
            total_preds += trg.size(0)

    return epoch_loss / len(data_loader), correct_preds / total_preds
test_loss, test_accuracy = evaluate_fn(model, test_data_loader, criterion, device)
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

Test Loss: 2.0223
Test Accuracy: 26.04%


**Bleu, ROUGE, METEOR Score**

In [74]:
import nltk
nltk.download('wordnet')
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
import torch

# Function to calculate BLEU score
def calculate_bleu_score(references, hypotheses):
    return corpus_bleu(references, hypotheses)

# Function to calculate ROUGE score
def calculate_rouge_score(references, hypotheses):
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    rouge_scores = []
    for ref, hyp in zip(references, hypotheses):
        # Ensure both are joined into strings of space-separated tokens
        ref_str = ' '.join(ref)  # Join reference tokens
        hyp_str = ' '.join(hyp)  # Join hypothesis tokens
        score = scorer.score(ref_str, hyp_str)
        rouge_scores.append({
            'rouge1': score['rouge1'].fmeasure,
            'rouge2': score['rouge2'].fmeasure,
            'rougeL': score['rougeL'].fmeasure
        })
    return rouge_scores

# Function to calculate METEOR score
def calculate_meteor_score(references, hypotheses):
    meteor_scores = []
    for ref, hyp in zip(references, hypotheses):
        meteor_scores.append(meteor_score([ref], hyp))
    return meteor_scores

# Define evaluation function for 10 samples
def evaluate_fn(model, data_loader, device, en_vocab, max_samples=10):
    model.eval()
    references = []
    hypotheses = []
    samples_processed = 0

    with torch.no_grad():
        for batch in data_loader:
            if samples_processed >= max_samples:
                break  # Stop after processing 10 samples

            src = batch["fr_ids"].to(device)
            trg = batch["en_ids"].to(device)
            output = model(src, trg, teacher_forcing_ratio=0)  # No teacher forcing during evaluation

            # Output contains [trg_length, batch_size, output_dim]
            # Get the predictions for each timestep
            for i in range(output.shape[1]):
                prediction = output[:, i, :].argmax(dim=1)

                # Lookup tokens for predicted and target sentences using en_vocab
                predicted_sentence = en_vocab.lookup_tokens(prediction.tolist())  # Convert IDs to words
                target_sentence = en_vocab.lookup_tokens(trg[:, i].tolist())  # Convert IDs to words

                hypotheses.append(predicted_sentence)
                references.append(target_sentence)  # No need to wrap in list here

                samples_processed += 1
                if samples_processed >= max_samples:
                    break  # Stop after processing the required number of samples

    # Calculate BLEU score
    bleu = calculate_bleu_score(references, hypotheses)

    # Calculate ROUGE score
    rouge = calculate_rouge_score(references, hypotheses)

    # Calculate METEOR score
    meteor = calculate_meteor_score(references, hypotheses)

    return bleu, rouge, meteor

# Example usage (with max_samples set to 10)
bleu_score, rouge_scores, meteor_score = evaluate_fn(model, test_data_loader, device, en_vocab, max_samples=10)

# Print the scores
print(f"BLEU score: {bleu_score}")
for i, rouge_score in enumerate(rouge_scores):
    print(f"ROUGE scores for sample {i + 1}: {rouge_score}")
print(f"METEOR scores: {meteor_score}")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


BLEU score: 9.309480961664223e-232
ROUGE scores for sample 1: {'rouge1': 0.38095238095238093, 'rouge2': 0.10000000000000002, 'rougeL': 0.3333333333333333}
ROUGE scores for sample 2: {'rouge1': 0.24390243902439024, 'rouge2': 0.15384615384615385, 'rougeL': 0.24390243902439024}
ROUGE scores for sample 3: {'rouge1': 0.2857142857142857, 'rouge2': 0.15, 'rougeL': 0.2857142857142857}
ROUGE scores for sample 4: {'rouge1': 0.3181818181818182, 'rouge2': 0.14285714285714285, 'rougeL': 0.3181818181818182}
ROUGE scores for sample 5: {'rouge1': 0.3333333333333333, 'rouge2': 0.3, 'rougeL': 0.3333333333333333}
ROUGE scores for sample 6: {'rouge1': 0.14285714285714285, 'rouge2': 0.05000000000000001, 'rougeL': 0.14285714285714285}
ROUGE scores for sample 7: {'rouge1': 0.1951219512195122, 'rouge2': 0.15384615384615385, 'rougeL': 0.1951219512195122}
ROUGE scores for sample 8: {'rouge1': 0.19047619047619047, 'rouge2': 0.05000000000000001, 'rougeL': 0.19047619047619047}
ROUGE scores for sample 9: {'rouge1':

In [73]:
import nltk
nltk.download('wordnet')
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
import torch

# Function to calculate BLEU score
def calculate_bleu_score(references, hypotheses):
    return corpus_bleu(references, hypotheses)

# Function to calculate ROUGE score
def calculate_rouge_score(references, hypotheses):
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    rouge_scores = []
    for ref, hyp in zip(references, hypotheses):
        # Ensure both are joined into strings of space-separated tokens
        ref_str = ' '.join(ref)  # Join reference tokens
        hyp_str = ' '.join(hyp)  # Join hypothesis tokens
        score = scorer.score(ref_str, hyp_str)
        rouge_scores.append({
            'rouge1': score['rouge1'].fmeasure,
            'rouge2': score['rouge2'].fmeasure,
            'rougeL': score['rougeL'].fmeasure
        })
    return rouge_scores

# Function to calculate METEOR score
def calculate_meteor_score(references, hypotheses):
    meteor_scores = []
    for ref, hyp in zip(references, hypotheses):
        meteor_scores.append(meteor_score([ref], hyp))
    return meteor_scores

# Define evaluation function
def evaluate_fn(model, data_loader, device, en_vocab):
    model.eval()
    references = []
    hypotheses = []
    with torch.no_grad():
        for batch in data_loader:
            src = batch["fr_ids"].to(device)
            trg = batch["en_ids"].to(device)
            output = model(src, trg, teacher_forcing_ratio=0)  # No teacher forcing during evaluation

            # Output contains [trg_length, batch_size, output_dim]
            # Get the predictions for each timestep
            for i in range(output.shape[1]):
                prediction = output[:, i, :].argmax(dim=1)

                # Lookup tokens for predicted and target sentences using en_vocab
                predicted_sentence = en_vocab.lookup_tokens(prediction.tolist())  # Convert IDs to words
                target_sentence = en_vocab.lookup_tokens(trg[:, i].tolist())  # Convert IDs to words

                hypotheses.append(predicted_sentence)
                references.append(target_sentence)  # No need to wrap in list here

    # Calculate BLEU score
    bleu = calculate_bleu_score(references, hypotheses)

    # Calculate ROUGE score
    rouge = calculate_rouge_score(references, hypotheses)

    # Calculate METEOR score
    meteor = calculate_meteor_score(references, hypotheses)

    return bleu, rouge, meteor

# Example usage
bleu_score, rouge_scores, meteor_score = evaluate_fn(model, test_data_loader, device, en_vocab)

# Print the scores
print(f"BLEU score: {bleu_score}")
for i, rouge_score in enumerate(rouge_scores):
    print(f"ROUGE scores for sample {i + 1}: {rouge_score}")
print(f"METEOR scores: {meteor_score}")

[nltk_data] Downloading package wordnet to /root/nltk_data...


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
ROUGE scores for sample 12390: {'rouge1': 0.34782608695652173, 'rouge2': 0.2727272727272727, 'rougeL': 0.34782608695652173}
ROUGE scores for sample 12391: {'rouge1': 0.21739130434782608, 'rouge2': 0.18181818181818182, 'rougeL': 0.21739130434782608}
ROUGE scores for sample 12392: {'rouge1': 0.17391304347826086, 'rouge2': 0.09090909090909091, 'rougeL': 0.17391304347826086}
ROUGE scores for sample 12393: {'rouge1': 0.2608695652173913, 'rouge2': 0.22727272727272727, 'rougeL': 0.2608695652173913}
ROUGE scores for sample 12394: {'rouge1': 0.2608695652173913, 'rouge2': 0.22727272727272727, 'rougeL': 0.2608695652173913}
ROUGE scores for sample 12395: {'rouge1': 0.2608695652173913, 'rouge2': 0.09090909090909091, 'rougeL': 0.21739130434782608}
ROUGE scores for sample 12396: {'rouge1': 0.13043478260869565, 'rouge2': 0.045454545454545456, 'rougeL': 0.13043478260869565}
ROUGE scores for sample 12397: {'rouge1': 0.2608695652173913, 'ro

**Save and Load Model**

In [None]:
# Save model weights
def save_model(model, filepath):
    torch.save(model.state_dict(), filepath)
    print(f"Model saved to {filepath}")

# Example usage after training
save_model(model, "model_weights.pth")

# Load the model's weights
def load_model(model, filepath, device):
    model.load_state_dict(torch.load(filepath, map_location=device))
    model.to(device)  # Ensure model is on the right device (CPU or GPU)
    model.eval()  # Set the model to evaluation mode
    print(f"Model loaded from {filepath}")

# Example usage
load_model(model, "model_weights.pth", device)

**Inference Testing**

In [81]:
def infer_english_to_french(model, sentence, en_vocab, fr_vocab, device):
    model.eval()

    # Tokenize and numericalize the input sentence (English)
    tokenized_sentence = sentence.split()  # Use the same tokenization method as in training (or use spacy/nltk)
    print(f"Tokenized input (English): {tokenized_sentence}")  # Check tokenization

    # Convert sentence to tensor (use en_vocab to map words to indices)
    sentence_tensor = torch.tensor([en_vocab[token] for token in tokenized_sentence if token in en_vocab]).unsqueeze(0).to(device)

    # Handle any out-of-vocabulary words in the tokenized sentence
    oov_tokens = [token for token in tokenized_sentence if token not in en_vocab]
    if oov_tokens:
        print(f"Out of vocabulary tokens: {oov_tokens}")

    # Generate predictions using the trained model
    with torch.no_grad():
        output = model(sentence_tensor, sentence_tensor, teacher_forcing_ratio=0)  # No teacher forcing during inference

    # Convert predicted tokens (French) from the model's output
    predicted_tokens = []
    for i in range(output.shape[1]):  # Iterate through the timesteps
        prediction = output[:, i, :].argmax(dim=1)  # Get the predicted word ID
        predicted_word = fr_vocab.lookup_token(prediction.item())  # Convert ID to word (French)
        predicted_tokens.append(predicted_word)

    # Join tokens into a string (French translation)
    predicted_sentence = ' '.join(predicted_tokens)
    return predicted_sentence


# Example usage
sentence = "Hello, today it will rain heavily"  # English sentence
predicted_translation = infer_english_to_french(model, sentence, en_vocab, fr_vocab, device)

print(f"Predicted French translation: {predicted_translation}")

Tokenized input (English): ['Hello,', 'today', 'it', 'will', 'rain', 'heavily']
Out of vocabulary tokens: ['Hello,']
Predicted French translation: <unk> <unk> <unk> <unk> <unk>
