# Machine Translation with Sequence-to-Sequence and RNNs

This lab demonstrates the fundamentals of a sequence-to-sequence (seq2seq) RNN model for translation. It uses the following libraries: `pandas`, `numpy`, `sklearn`, `seaborn`, `matplotlib`, `torch`, and `spacy`. (NOTE: we will use `spacy` for natural language processing (NLP).)

We will use the Multi30K dataset (a large machine translation dataset with extensive English-to-German sentence pairs).


In [22]:
import os
import subprocess
import sys
import warnings

warnings.filterwarnings('ignore')
os.environ['PYTHONWARNINGS'] = 'ignore'


def _run_module(*args):
    """Run ``python -m <args>`` with the current interpreter.

    stderr is discarded to keep the notebook output quiet, but a non-zero
    exit status is reported instead of being silently ignored, so a failed
    install is visible here rather than as a confusing ImportError later.
    Returns True when the command exited with status 0.
    """
    # An argument list (no shell) avoids quoting issues with specs such as
    # 'numpy<2.0' and works on platforms without /dev/null.
    result = subprocess.run(
        [sys.executable, "-m", *args],
        stderr=subprocess.DEVNULL,
        check=False,  # best-effort: report the failure, do not abort the cell
    )
    if result.returncode != 0:
        print(f"✗ Command failed (exit {result.returncode}): python -m {' '.join(args)}")
    return result.returncode == 0


# Install packages (versions are pinned so torch/torchtext/torchdata stay compatible)
_run_module("pip", "install", "-qq", "numpy<2.0")
_run_module("pip", "install", "-qq", "torch==2.2.2", "torchvision==0.17.2", "torchtext==0.17.2")
_run_module("pip", "install", "-qq", "spacy<3.8", "thinc<8.3")
_run_module("pip", "install", "-qq", "pandas", "matplotlib", "seaborn", "scikit-learn", "portalocker")
_run_module("pip", "install", "-qq", "torchdata==0.7.1", "nltk")

# Download the spaCy language models for English and German
_run_module("spacy", "download", "en_core_web_sm")
_run_module("spacy", "download", "de_core_news_sm")

# Verify
print("Installation complete! Testing imports...")
import torch
import spacy
import numpy as np
print(f"✓ NumPy {np.__version__}")
print(f"✓ PyTorch {torch.__version__}")
print(f"✓ spaCy {spacy.__version__}")

Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m34.2 MB/s[0m  [33m0:00:00[0mm0:00:01[0m0:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
Defaulting to user installation because normal site-packages is not writeable
Collecting de-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m29.7 MB/s[0m  [33m0:00:00[0m eta [36m0:00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('d

## Import required libraries

In [26]:
from torchtext.data.utils import get_tokenizer #import get_tokenizer function from torchtext
from torchtext.vocab import build_vocab_from_iterator #import function that builds vocabulary from tokenized text
from nltk.translate.bleu_score import sentence_bleu #Used for evals/BLEU score metrics
from torchtext.datasets import multi30k, Multi30k #Import the utilities/functions to access Multi30k dataset
from typing import Iterable, List # For type hints
from torch.nn.utils.rnn import pad_sequence #for batch sequence length noramlization
from torch.utils.data import DataLoader #for creation of data batches
from torchdata.datapipes.iter import IterableWrapper, Mapper
import torchtext #Torchtext for NLP tasks

import torch #Main PyTorch library for tensor operations
import torch.nn as nn #utilities for building blocks of the neural network (e.g. layers, loss functions)
import torch.optim as optim #optimization algo for model weight updates during training

# Suppress warnings: swap warnings.warn for a no-op and ignore everything
# that still goes through the warnings filter machinery.
import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing (stand-in for ``warnings.warn``)."""
    return None


warnings.filterwarnings('ignore')
warnings.warn = warn

# Sequence to Sequence Model

We will implement a Seq2Seq model in PyTorch. Sequence-to-sequence (`seq2seq`) models are useful for:
- Translation from a source language to a target language
- Chatbots (e.g. answering questions or generating natural-language responses to input sequences)
- Summarization

## Sequence-to-sequence (seq2seq) architecture
- Seq2seq models use an encoder-decoder structure: (1) the encoder encodes the input sequence into a fixed-dimensional context vector ($h_t$); (2) the decoder generates the output sequence based on that encoded context vector.
- At each decoding step, the decoder takes an input token $x_t$, and an embedding layer converts its word ID into a dense vector. The RNN decoder takes the embedded word and the previous hidden state $h_t$ and outputs a new hidden state $h_{t+1}$. A linear layer projects the new hidden state to the vocabulary size and outputs a probability distribution for the next-word prediction. This process repeats, with each predicted word becoming the next input $x_t$.

In [38]:
#Define the encoder class

class Encoder(nn.Module): #Encoder class inherits from PyTorch nn.Module base class
    """LSTM encoder that turns a batch of source token IDs into context states.

    Args:
        vocab_len: Size of the source vocabulary (rows in the embedding table).
        emb_dim: Dimension of each token embedding.
        hid_dim: Hidden-state size of the LSTM.
        n_layers: Number of stacked LSTM layers.
        dropout_prob: Dropout probability, applied to the embeddings and
            passed to the LSTM.
    """

    def __init__(self, vocab_len, emb_dim, hid_dim, n_layers, dropout_prob):
        super().__init__() #run initialization code from parent class

        self.hid_dim = hid_dim # Save hidden size (for decoder)
        self.n_layers = n_layers # Save layer count (for decoder)

        #Create the three layers
        self.embedding = nn.Embedding(vocab_len, emb_dim) #Word to vec lookup
        self.lstm = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout_prob)
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, input_batch):
        """Encode a batch of source sentences.

        Args:
            input_batch: Tensor of token IDs in sequence-first layout
                ``[src_len, batch_size]`` (the LSTM is built with the default
                ``batch_first=False``). It must live on the same device as
                this module.

        Returns:
            Tuple ``(hidden, cell)`` with the final LSTM states, each of shape
            ``[n_layers, batch_size, hid_dim]``.
        """
        # The embedding output is already on the module's device, so no explicit
        # transfer is needed. The previous `.to(device)` relied on a global
        # `device` defined outside the class, which raised NameError when the
        # encoder was used on its own.
        embed = self.dropout(self.embedding(input_batch)) #Store word vectors in embed + dropout
        # Per-step outputs are not needed; only the final states are passed on.
        _, (hidden, cell) = self.lstm(embed)
        return hidden, cell #return encoded representation

In [39]:
# Optional but recommended: smoke-test the encoder on a tiny hand-made batch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Toy hyper-parameters, small enough to read the printed tensors by eye
vocab_len, emb_dim, hid_dim = 8, 10, 8
n_layers, dropout_prob = 1, 0.5

encoder_t = Encoder(vocab_len, emb_dim, hid_dim, n_layers, dropout_prob)
encoder_t = encoder_t.to(device)

# One sentence of five token IDs, built as [batch, seq] and transposed to
# [seq, batch] because the encoder LSTM is sequence-first by default
src_batch = torch.tensor([[0,3,4,2,1]]).t().to(device)
print("Shape of input(src) tensor:", src_batch.shape)

hidden_t, cell_t = encoder_t(src_batch)
print("Hidden tensor from encoder:",hidden_t ,"\nCell tensor from encoder:", cell_t)

Shape of input(src) tensor: torch.Size([5, 1])
Hidden tensor from encoder: tensor([[[-0.0548, -0.1163, -0.1314, -0.2997,  0.2321,  0.0804,  0.0700,
          -0.2902]]], grad_fn=<StackBackward0>) 
Cell tensor from encoder: tensor([[[-0.1117, -0.2588, -0.4679, -0.7646,  0.4135,  0.2224,  0.4433,
          -0.6059]]], grad_fn=<StackBackward0>)


In [40]:
#Implement decoder class that inherits from nn.Module
class Decoder(nn.Module):
    """Single-step LSTM decoder.

    Each call to ``forward`` consumes one target token per batch element plus
    the previous LSTM states, and returns log-probabilities over the target
    vocabulary together with the updated states.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        # Sub-modules: token embedding -> LSTM -> projection onto the vocabulary
        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=emb_dim)
        self.lstm = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.fc_out = nn.Linear(in_features=hid_dim, out_features=output_dim)
        self.softmax = nn.LogSoftmax(dim=1)  # normalises over the vocabulary axis
        self.dropout = nn.Dropout(p=dropout)

        # Hyper-parameters kept on the instance
        self.output_dim = output_dim  # target vocabulary size
        self.hid_dim = hid_dim
        self.n_layers = n_layers

    def forward(self, input, hidden, cell):
        """Predict the next token for every sequence in the batch.

        Args:
            input: Token IDs for the current step, shape ``[batch_size]``.
            hidden: Previous hidden state, ``[n_layers, batch_size, hid_dim]``.
            cell: Previous cell state, ``[n_layers, batch_size, hid_dim]``.

        Returns:
            ``(prediction, hidden, cell)`` where ``prediction`` holds
            log-probabilities of shape ``[batch_size, output_dim]`` and the
            states have the same shapes as the inputs.
        """
        # The LSTM is sequence-first, so give the single step a length-1 time axis.
        step_tokens = input.unsqueeze(0)                # [1, batch_size]

        token_vectors = self.embedding(step_tokens)     # [1, batch_size, emb_dim]
        token_vectors = self.dropout(token_vectors)

        lstm_out, states = self.lstm(token_vectors, (hidden, cell))
        next_hidden, next_cell = states                 # [n_layers, batch_size, hid_dim] each

        # Drop the time axis again before projecting to vocabulary size.
        logits = self.fc_out(lstm_out.squeeze(0))       # [batch_size, output_dim]

        return self.softmax(logits), next_hidden, next_cell

In [None]:
#Test decoder by creating an instance