In [None]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
nltk.download('punkt_tab')

In [None]:
df = pd.read_csv("Datasets/eng_-french.csv")

In [None]:
def clean_text(text):
    text = text.lower()                          # Lowercase
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)   # Remove punctuation
    text = re.sub(r"\s+", " ", text).strip()     # Remove extra spaces
    tokens = word_tokenize(text)                 # Tokenize
    return " ".join(tokens)

In [7]:
df['English words/sentences'] = df['English words/sentences'].apply(clean_text)
df['French words/sentences'] = df['French words/sentences'].apply(clean_text)

In [9]:
print(df[['English words/sentences', 'French words/sentences']].head())

  English words/sentences French words/sentences
0                      hi                  salut
1                     run                  cours
2                     run                 courez
3                     who                    qui
4                     wow                a alors


In [11]:
eng_sentences = df['English words/sentences'].tolist()
fr_sentences = df['French words/sentences'].tolist()

eng_tokens = [sentence.split() for sentence in eng_sentences]
fr_tokens = [sentence.split() for sentence in fr_sentences]


In [13]:
def build_vocab(tokenized_sentences, min_freq=2):
    counter = Counter()
    for tokens in tokenized_sentences:
        counter.update(tokens)

    vocab = {'<PAD>': 0, '<SOS>': 1, '<EOS>': 2, '<UNK>': 3}
    index = 4
    for word, freq in counter.items():
        if freq >= min_freq:
            vocab[word] = index
            index += 1
    return vocab


In [15]:
eng_vocab = build_vocab(eng_tokens)
fr_vocab = build_vocab(fr_tokens)

In [17]:
def encode(sentence, vocab):
    return [vocab.get(word, vocab['<UNK>']) for word in sentence] + [vocab['<EOS>']]

encoded_eng = [torch.tensor(encode(tokens, eng_vocab)) for tokens in eng_tokens]
encoded_fr = [torch.tensor([fr_vocab['<SOS>']] + encode(tokens, fr_vocab)) for tokens in fr_tokens]

padded_eng = pad_sequence(encoded_eng, batch_first=True, padding_value=eng_vocab['<PAD>'])
padded_fr = pad_sequence(encoded_fr, batch_first=True, padding_value=fr_vocab['<PAD>'])

print("Padded English shape:", padded_eng.shape)
print("Padded French shape:", padded_fr.shape)


Padded English shape: torch.Size([175621, 45])
Padded French shape: torch.Size([175621, 56])


In [19]:
class TranslationDataset(Dataset):
    def __init__(self, source_tensor, target_tensor):
        self.source = source_tensor
        self.target = target_tensor

    def __len__(self):
        return len(self.source)

    def __getitem__(self, idx):
        return self.source[idx], self.target[idx]


In [21]:
dataset = TranslationDataset(padded_eng, padded_fr)

In [23]:
BATCH_SIZE = 32

dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

In [25]:
#Encoder RNN
class EncoderRNN(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim, num_layers=1):
        super(EncoderRNN, self).__init__()
        
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.gru = nn.GRU(emb_dim, hidden_dim, num_layers, batch_first=True)
        
    def forward(self, src):
        # src: [batch_size, seq_len]
        embedded = self.embedding(src)  # [batch_size, seq_len, emb_dim]
        outputs, hidden = self.gru(embedded)  # outputs: [batch_size, seq_len, hidden_dim], hidden: [num_layers, batch_size, hidden_dim]
        return hidden


In [27]:
#Decoder RNN
class DecoderRNN(nn.Module):
    def __init__(self, output_dim, embed_dim, hidden_dim, num_layers=1):
        super(DecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_dim, embed_dim)
        self.rnn = nn.GRU(embed_dim, hidden_dim, num_layers, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim, output_dim)

    def forward(self, input, hidden):
        # input shape: (batch_size) — single token input
        input = input.unsqueeze(1)  # shape -> (batch_size, 1)
        embedded = self.embedding(input)  # (batch_size, 1, embed_dim)
        output, hidden = self.rnn(embedded, hidden)  # output: (batch_size, 1, hidden_dim)
        prediction = self.fc_out(output.squeeze(1))  # (batch_size, output_dim)
        return prediction, hidden


In [29]:
import random

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = src.size(0)
        trg_len = trg.size(1)
        trg_vocab_size = self.decoder.fc_out.out_features

        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)

        # Step 1: Encode the source sentence
        hidden = self.encoder(src)

        # Step 2: First input to the decoder is <SOS>
        input = trg[:, 0]

        for t in range(1, trg_len):
            output, hidden = self.decoder(input, hidden)  # One step
            outputs[:, t] = output

            # Decide: use teacher forcing or predicted word?
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)  # Get index of highest probability

            input = trg[:, t] if teacher_force else top1

        return outputs


In [31]:
import torch
import torch.nn as nn
import torch.optim as optim

# Set hyperparameters
INPUT_DIM = len(eng_vocab)
OUTPUT_DIM = len(fr_vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Instantiate encoder, decoder, and Seq2Seq model
encoder = EncoderRNN(INPUT_DIM, ENC_EMB_DIM, HID_DIM)
decoder = DecoderRNN(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM)
seq2seq = Seq2Seq(encoder, decoder, device).to(device)

# Define optimizer and loss function
optimizer = optim.Adam(seq2seq.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss(ignore_index=fr_vocab['<PAD>'])

# Define training loop
def train(model, dataloader, optimizer, criterion, device, num_epochs=5):
    model.train()
    for epoch in range(num_epochs):
        epoch_loss = 0
        for src, tgt in dataloader:
            src, tgt = src.to(device), tgt.to(device)
            optimizer.zero_grad()
            output = model(src, tgt)
            
            # Reshape output and target to compute loss
            output = output[:, 1:].reshape(-1, output.shape[-1])
            tgt = tgt[:, 1:].reshape(-1)
            
            loss = criterion(output, tgt)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        print(f"Epoch {epoch+1}/{num_epochs} | Loss: {epoch_loss/len(dataloader):.4f}")


In [None]:
train(seq2seq, dataloader, optimizer, criterion, device, num_epochs=5)


In [None]:
def translate_sentence(model, sentence, eng_vocab, fr_vocab, device, max_len=20):
    model.eval()

    # Tokenize and encode input sentence
    tokens = sentence.lower().split()
    encoded = [eng_vocab.get(token, eng_vocab['<UNK>']) for token in tokens]
    input_tensor = torch.tensor(encoded).unsqueeze(0).to(device)  # shape: (1, seq_len)

    # Pass through encoder
    with torch.no_grad():
        hidden = model.encoder(input_tensor)

    # Decoder start with <SOS>
    outputs = [fr_vocab['<SOS>']]
    
    for _ in range(max_len):
        prev_word = torch.tensor([outputs[-1]]).unsqueeze(0).to(device)  # shape: (1, 1)

        with torch.no_grad():
            output, hidden = model.decoder(prev_word, hidden)
            pred_token = output.argmax(2).item()  # Get the most likely word

        outputs.append(pred_token)

        if pred_token == fr_vocab['<EOS>']:
            break

    # Invert fr_vocab to convert ids back to words
    idx2word = {idx: word for word, idx in fr_vocab.items()}
    translated = [idx2word.get(idx, '<UNK>') for idx in outputs[1:]]  # Skip <SOS>

    return ' '.join(translated)


In [None]:
example = "i like cats"
translation = translate_sentence(seq2seq, example, eng_vocab, fr_vocab, device)
print(f"English: {example}")
print(f"French : {translation}")