# Exercise 2
## Neural machine translation with attention
### By: Daniel Mehta

---

## Imports and Config

In [27]:
import os
import re
import random
from pathlib import Path
from collections import Counter
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import numpy as np
import matplotlib.pyplot as plt

In [4]:
# Seed every random number generator this notebook relies on so runs are repeatable
SEED = 5501
torch.manual_seed(SEED)       # PyTorch default generator
torch.cuda.manual_seed(SEED)  # CUDA generator of the current device
np.random.seed(SEED)          # NumPy legacy global RNG
random.seed(SEED)             # Python standard-library RNG

In [5]:
# Run on the GPU when PyTorch reports one as available, otherwise fall back to the CPU
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print(f"Using device: {DEVICE}")

Using device: cuda


---

## Dataset Path Setup

In [7]:
# Location of the sentence-pair file, given relative to the current working directory
data_dir = Path("spa-eng")
data_path = data_dir.joinpath("spa.txt")

In [8]:
# Stop right away with a readable error when the dataset file is missing
if data_path.exists():
    print(f"Dataset located at: {data_path}")
else:
    raise FileNotFoundError(f"Dataset not found at {data_path}")

Dataset located at: spa-eng\spa.txt


---

## Data Exploration and Cleaning

In [10]:
# Load the whole corpus as UTF-8 text and break it into one entry per line
# (surrounding whitespace is stripped first so no empty entry appears at either end)
with data_path.open("r", encoding="utf-8") as f:
    lines = f.read().strip().split("\n")

In [11]:
# Report the corpus size and preview the first raw lines.
# A slice is used instead of indexing 0..4 so the preview also works on a file
# with fewer than five lines (indexing would raise IndexError there).
print(f"Total sentence pairs in file: {len(lines)}")
print("Sample lines:")
for sample_line in lines[:5]:
    print(sample_line)

Total sentence pairs in file: 142511
Sample lines:
Go.	Ve.	CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #4986655 (cueyayotl)
Go.	Vete.	CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #4986656 (cueyayotl)
Go.	Vaya.	CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #4986657 (cueyayotl)
Go.	Váyase.	CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #6586271 (arh)
Hi.	Hola.	CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #431975 (Leono)


In [12]:
# Every line is tab-separated: English sentence, Spanish sentence, then attribution text.
# Only the first two fields are kept.
pairs = [line.split("\t") for line in lines]
english_sentences = [fields[0] for fields in pairs]  # English (target)
spanish_sentences = [fields[1] for fields in pairs]  # Spanish (source)

# Show the first pair as a sanity check
print("\nExample pair:")
print("EN:", english_sentences[0])
print("ES:", spanish_sentences[0])


Example pair:
EN: Go.
ES: Ve.


---

## Tokenization & vocab building

In [14]:
# Wrap every English (target) sentence in start/end markers.
# This cell overwrites english_sentences with a transformed copy of itself, so it is
# guarded: a sentence that already carries both markers is left untouched. Without the
# guard, running the cell a second time would stack a duplicate pair of markers.
START_TOKEN = "<start>"
END_TOKEN = "<end>"

english_sentences = [
    s if s.startswith(f"{START_TOKEN} ") and s.endswith(f" {END_TOKEN}")
    else f"{START_TOKEN} {s} {END_TOKEN}"
    for s in english_sentences
]

In [15]:
# Basic whitespace tokenization
def tokenize(text):
    """Lowercase ``text`` and split it on runs of whitespace.

    Punctuation is neither removed nor separated: it stays attached to the
    neighbouring word, so "Go." yields the single token "go.".

    Returns:
        list[str]: the tokens in order; an empty list for blank input.
    """
    lowered = text.lower()
    return lowered.strip().split()

In [17]:
# Tokenize both sides of the corpus: one list of tokens per sentence
tokenized_es = list(map(tokenize, spanish_sentences))
tokenized_en = list(map(tokenize, english_sentences))

In [18]:
# Build vocabularies
def build_vocab(tokenized_sents, min_freq=1):
    """Map every sufficiently frequent token to a unique integer id.

    Ids 0 and 1 are reserved for "<pad>" and "<unk>". The remaining tokens get
    consecutive ids starting at 2, in order of first appearance, so the ids
    always form the contiguous range 0 .. len(vocab) - 1. That makes
    ``len(vocab)`` a valid size for an embedding table indexed by these ids.

    Args:
        tokenized_sents: iterable of token lists.
        min_freq: tokens occurring fewer than this many times are left out.

    Returns:
        dict[str, int]: token -> id.
    """
    counter = Counter(token for sent in tokenized_sents for token in sent)
    vocab = {"<pad>": 0, "<unk>": 1}
    for token, freq in counter.items():
        # Ids are assigned only after filtering, so dropping rare tokens leaves no
        # gaps. A corpus token that collides with a reserved name keeps the
        # reserved id instead of consuming a second one.
        if freq >= min_freq and token not in vocab:
            vocab[token] = len(vocab)
    return vocab

In [21]:
# One vocabulary per language, built from the tokenized corpus with the default min_freq
src_vocab = build_vocab(tokenized_es)
tgt_vocab = build_vocab(tokenized_en)

In [22]:
# Inverse mappings (id -> token), used to turn predicted ids back into words
src_idx2word = dict(zip(src_vocab.values(), src_vocab.keys()))
tgt_idx2word = dict(zip(tgt_vocab.values(), tgt_vocab.keys()))

In [23]:
# Vocabulary sizes include the two special entries <pad> and <unk>
print(f"Source vocab size (ES): {len(src_vocab)}")
print(f"Target vocab size (EN): {len(tgt_vocab)}")

Source vocab size (ES): 46045
Target vocab size (EN): 25767


---

## Convert sentences to index tensors

In [24]:
# Numericalize tokenized sentences
def numericalize(tokenized_sents, vocab):
    """Convert token lists into 1-D ``torch.long`` tensors of vocabulary ids.

    Tokens missing from ``vocab`` are mapped to the id stored under "<unk>".

    Args:
        tokenized_sents: iterable of token lists.
        vocab: dict mapping token -> id; must contain "<unk>".

    Returns:
        list[torch.Tensor]: one tensor per sentence, in input order.
    """
    id_tensors = []
    for sentence in tokenized_sents:
        ids = []
        for token in sentence:
            ids.append(vocab.get(token, vocab["<unk>"]))
        id_tensors.append(torch.tensor(ids, dtype=torch.long))
    return id_tensors

In [25]:
# Source sentences are converted as-is. Each target sentence yields two sequences:
# the input drops the last token and the target drops the first one, so the target
# is the input shifted left by one position.
src_tensors =numericalize(tokenized_es, src_vocab)
tgt_input_tensors = numericalize([sent[:-1] for sent in tokenized_en], tgt_vocab)# drops the final token (<end>)
tgt_target_tensors = numericalize([sent[1:] for sent in tokenized_en], tgt_vocab)# drops the first token (<start>)

In [28]:
# Pad the sequences
# pad_sequence stacks each list into a single (num_sentences, max_len) tensor, filling
# the tail of shorter sentences with the <pad> id. max_len is the longest sentence in
# the whole list, so every row is padded to that length.
# NOTE: each name is rebound from a list of 1-D tensors to one 2-D tensor.
src_tensors = pad_sequence(src_tensors,batch_first=True, padding_value=src_vocab["<pad>"])
tgt_input_tensors = pad_sequence(tgt_input_tensors,batch_first=True, padding_value=tgt_vocab["<pad>"])
tgt_target_tensors = pad_sequence(tgt_target_tensors,batch_first=True, padding_value=tgt_vocab["<pad>"])

In [29]:
# Show the first padded row of each tensor as a sanity check
print(f"Example ES tensor: {src_tensors[0]}")
print(f"Example EN input tensor: {tgt_input_tensors[0]}")
print(f"Example EN target tensor: {tgt_target_tensors[0]}")

Example ES tensor: tensor([2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
Example EN input tensor: tensor([2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
Example EN target tensor: tensor([3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])


---

## Dataloader Setup

In [30]:
# Combine the three aligned tensors into a list of per-sentence triples:
# (source ids, decoder input ids, decoder target ids)
dataset = [
    (src_row, tgt_in_row, tgt_out_row)
    for src_row, tgt_in_row, tgt_out_row in zip(src_tensors, tgt_input_tensors, tgt_target_tensors)
]

In [31]:
# Train/validation split (80/20)
# The split is taken over a seeded random permutation instead of a contiguous slice.
# The sample lines printed earlier suggest the corpus is ordered shortest-first, so
# slicing would put only the tail of the file in validation and give the two sets
# different sentence-length distributions.
# A dedicated generator seeded with SEED keeps the split identical across runs,
# regardless of how much of the global RNG state earlier cells consumed.
split_generator = torch.Generator().manual_seed(SEED)
shuffled_indices = torch.randperm(len(dataset), generator=split_generator).tolist()

split_idx=int(len(dataset)*0.8)
train_data=[dataset[i] for i in shuffled_indices[:split_idx]]
val_data=[dataset[i] for i in shuffled_indices[split_idx:]]

BATCH_SIZE =64

# Training batches are reshuffled every epoch; validation order stays fixed
train_loader = DataLoader(train_data,batch_size=BATCH_SIZE,shuffle=True)
val_loader = DataLoader(val_data,batch_size=BATCH_SIZE)

print(f"Train batches: {len(train_loader)}, Val batches: {len(val_loader)}")

Train batches: 1782, Val batches: 446


---

## Model Architecture

In [None]:
class Encoder(nn.Module):
    """Bidirectional GRU encoder.

    Embeds the source token ids, runs them through a bidirectional GRU and returns
    per-position outputs plus a single initial hidden state for a decoder.

    Both return values use ``hidden_dim`` as their feature size. The forward and
    backward GRU outputs are summed rather than left concatenated, because a consumer
    built with ``hidden_dim``-sized layers cannot take the raw ``2 * hidden_dim``
    output of the bidirectional GRU.

    Args:
        input_vocab_size: number of rows in the embedding table.
        embed_dim: size of each token embedding.
        hidden_dim: GRU hidden size per direction, and the output feature size.
    """

    def __init__(self, input_vocab_size, embed_dim, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embedding =nn.Embedding(input_vocab_size,embed_dim)
        self.gru =nn.GRU(embed_dim, hidden_dim, batch_first=True,bidirectional=True)
        # Projects the concatenated final forward/backward hidden states to hidden_dim
        self.fc =nn.Linear(hidden_dim*2, hidden_dim)

    def forward(self, src_idxs):
        """Encode a batch of source sentences.

        Args:
            src_idxs: integer tensor of shape (batch, src_len).

        Returns:
            tuple: ``outputs`` of shape (batch, src_len, hidden_dim) and ``hidden``
            of shape (1, batch, hidden_dim).
        """
        embedded = self.embedding(src_idxs) # (batch, src_len, embed_dim)
        outputs, hidden = self.gru(embedded) # outputs: (batch, src_len, hidden_dim*2); hidden: (2, batch, hidden_dim)

        # The last dimension holds the forward features followed by the backward
        # features; adding the halves gives (batch, src_len, hidden_dim) without
        # introducing new parameters.
        outputs = outputs[:, :, :self.hidden_dim] + outputs[:, :, self.hidden_dim:]

        # Merge the final forward (hidden[-2]) and backward (hidden[-1]) states
        hidden=torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))) # (batch, hidden_dim)
        hidden=hidden.unsqueeze(0) # (1, batch, hidden_dim)

        return outputs, hidden # outputs for attention, hidden for decoder init


class LuongAttention(nn.Module):
    """Multiplicative attention with a learned linear map on the encoder side.

    The score of a source position is the dot product between the linearly
    transformed encoder output at that position and the decoder hidden state.

    Args:
        hidden_dim: feature size of both the decoder hidden state and the encoder
            outputs (``self.attn`` maps hidden_dim -> hidden_dim).
    """

    def __init__(self, hidden_dim):
        super().__init__()
        self.attn = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, decoder_hidden, encoder_outputs):
        """Compute the context vector for one decoding step.

        Args:
            decoder_hidden: tensor of shape (1, batch, hidden_dim).
            encoder_outputs: tensor of shape (batch, src_len, hidden_dim).

        Returns:
            tuple: ``context`` of shape (batch, 1, hidden_dim) and ``attn_weights``
            of shape (batch, src_len, 1), normalised over the src_len dimension.
        """
        # Put the batch dimension first: (batch, 1, hidden_dim)
        query = decoder_hidden.transpose(0, 1)

        # Transformed encoder states, one per source position: (batch, src_len, hidden_dim)
        keys = self.attn(encoder_outputs)

        # Dot product of every key with the query: (batch, src_len, 1)
        scores = torch.bmm(keys, query.transpose(1, 2))
        attn_weights = scores.softmax(dim=1)

        # Weighted sum of the (untransformed) encoder outputs: (batch, 1, hidden_dim)
        context = torch.bmm(attn_weights.transpose(1, 2), encoder_outputs)
        return context, attn_weights


class Decoder(nn.Module):
    """GRU decoder that attends over the encoder outputs at every step.

    The layer sizes require the attention context to have ``hidden_dim`` features:
    the GRU input is ``embed_dim + hidden_dim`` wide and the output layer takes
    ``hidden_dim * 2`` (GRU output concatenated with the context).

    Args:
        output_vocab_size: size of the target vocabulary (embedding rows and logits).
        embed_dim: size of each target token embedding.
        hidden_dim: GRU hidden size.
    """

    def __init__(self, output_vocab_size, embed_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(output_vocab_size, embed_dim)
        self.gru = nn.GRU(embed_dim + hidden_dim, hidden_dim, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim * 2, output_vocab_size)
        self.attention = LuongAttention(hidden_dim)

    def forward(self, tgt_input_idxs, hidden, encoder_outputs):
        """Produce vocabulary logits for every position of the given target input.

        Args:
            tgt_input_idxs: integer tensor of shape (batch, tgt_len); the token fed
                to the decoder at each step.
            hidden: initial hidden state of shape (1, batch, hidden_dim).
            encoder_outputs: tensor of shape (batch, src_len, hidden_dim).

        Returns:
            Tensor of shape (batch, tgt_len, output_vocab_size).
        """
        embedded = self.embedding(tgt_input_idxs)  # (batch, tgt_len, embed_dim)

        step_logits = []
        for t in range(embedded.size(1)):
            # Embedding of the token fed at step t, time dimension kept: (batch, 1, embed_dim)
            token_embed = embedded[:, t:t + 1, :]

            # The context is computed from the hidden state *before* this step's GRU update
            context, _ = self.attention(hidden, encoder_outputs)  # (batch, 1, hidden_dim)

            # One GRU step on [token embedding ; context]
            gru_in = torch.cat((token_embed, context), dim=2)  # (batch, 1, embed_dim + hidden_dim)
            gru_out, hidden = self.gru(gru_in, hidden)  # gru_out: (batch, 1, hidden_dim)

            # Logits from [GRU output ; context]
            logits = self.fc_out(torch.cat((gru_out, context), dim=2))  # (batch, 1, output_vocab_size)
            step_logits.append(logits)

        # Join the per-step logits along the time dimension
        return torch.cat(step_logits, dim=1)


class Seq2Seq(nn.Module):
    """Thin wrapper that chains an encoder and a decoder.

    Args:
        encoder: module called as ``encoder(src_idxs)``; must return
            ``(encoder_outputs, hidden)``.
        decoder: module called as ``decoder(tgt_input_idxs, hidden, encoder_outputs)``.
        device: stored on the instance as ``self.device``; not used by ``forward``.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src_idxs, tgt_input_idxs):
        """Encode the source batch, then decode against the given target inputs.

        Returns whatever the decoder returns.
        """
        memory, init_hidden = self.encoder(src_idxs)
        return self.decoder(tgt_input_idxs, init_hidden, memory)
