In [42]:
!pip install torchsummary
!pip install richprint

from richprint import RichPrint
rprint = RichPrint()

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
from torchvision import datasets, models
from torch.utils.data import DataLoader

import os
import re
import sys
import math
import random
from tqdm.auto import tqdm
from collections import Counter

import warnings
warnings.filterwarnings("ignore")



In [43]:
# Select the compute device: CUDA when PyTorch reports it as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Echo the chosen device so the notebook output records it.
print("[device]:", device)

[device]: cuda


In [45]:
# Path of the Hindi-English parallel corpus CSV under the Kaggle input directory.
root = '/kaggle/input/hindi-english-parallel-corpus/hindi_english_parallel.csv'
# Load the whole CSV into a DataFrame.
sentences = pd.read_csv(root)
# Show the first five rows as the cell output.
sentences.head(5)

Unnamed: 0,hindi,english
0,अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें,Give your application an accessibility workout
1,एक्सेर्साइसर पहुंचनीयता अन्वेषक,Accerciser Accessibility Explorer
2,निचले पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the bottom panel
3,ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the top panel
4,उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से नि...,A list of plugins that are disabled by default


In [46]:
def preprocess(sentence):
    """Normalise one raw sentence for whitespace tokenisation.

    The text is lower-cased, and every run of characters outside the allowed
    set (ASCII letters, the Devanagari block U+0900-U+097F, digits and the
    punctuation ``? . ! , ¿``) is replaced by a single space.

    Args:
        sentence: The raw sentence as a ``str``.

    Returns:
        The cleaned sentence with no leading or trailing whitespace and at
        most one space between kept fragments.
    """
    sentence = sentence.lower().strip()
    # Whitespace is itself outside the allowed set, so each maximal run of
    # disallowed characters (spaces included) collapses to exactly one space.
    # No separate whitespace-squeezing pass is needed afterwards.
    sentence = re.sub(r"[^a-zA-Z\u0900-\u097F0-9?.!,¿]+", " ", sentence)
    # A disallowed run at either end (e.g. a closing bracket) leaves a space
    # behind, so strip once more after the substitution.
    return sentence.strip()

# Discard pairs where either language is missing.
sentences = sentences.dropna(subset = ['english', 'hindi'])
# Normalise both columns; the Hindi side is additionally wrapped in <sos>/<eos> markers.
sentences = sentences.assign(
    english = sentences['english'].map(preprocess),
    hindi = sentences['hindi'].map(lambda text: '<sos> ' + preprocess(text) + ' <eos>'),
)
# Keep pairs whose whitespace-token count lies in 6..30 (inclusive), English first and
# then Hindi. The Hindi count includes the two marker tokens added above.
sentences = sentences[sentences['english'].str.split().str.len().between(6, 30)]
sentences = sentences[sentences['hindi'].str.split().str.len().between(6, 30)]

In [47]:
# Cap the corpus at the first 20,000 remaining pairs (positional slice).
sentences = sentences.iloc[:20000]

In [48]:
def find_threshold(column = "hindi", data = None):
    """Return the largest whitespace-token count found in one column.

    Args:
        column: Key of the column to scan.
        data: Object indexable by ``column`` that yields an iterable of
            sentences (a DataFrame or a plain dict of lists both work).
            When omitted, the module-level ``sentences`` frame is used.

    Returns:
        The maximum number of whitespace-separated tokens as an ``int``;
        ``0`` when the column holds no sentences.
    """
    if data is None:
        data = sentences
    # Each entry is coerced with str() before splitting, so non-string values
    # are measured by their string form instead of raising.
    return max((len(str(sentence).split()) for sentence in data[column]), default = 0)

In [49]:
def create_vocab(sentences):
    """Build a word -> integer-id vocabulary from an iterable of sentences.

    The four special tokens always occupy fixed ids: ``<pad>`` = 0,
    ``<sos>`` = 1, ``<eos>`` = 2, ``<unk>`` = 3. Every other distinct
    whitespace-separated word gets the next free id in first-seen order,
    starting at 4.

    Special tokens that already occur in the text (for example sentences
    wrapped in ``<sos> ... <eos>``) do not consume a regular id. The ids are
    therefore contiguous and ``len(vocab) == max(vocab.values()) + 1``.

    Args:
        sentences: Iterable of sentences; each item is coerced with ``str``.

    Returns:
        Dict mapping each token to its integer id.
    """
    vocab = {"<pad>": 0, "<sos>": 1, "<eos>": 2, "<unk>": 3}

    counter = Counter()
    for sentence in sentences:
        counter.update(str(sentence).split())

    # Counter keeps first-insertion order, so ids follow first appearance.
    for word in counter:
        if word not in vocab:
            vocab[word] = len(vocab)
    return vocab

In [50]:
def tokenize(sentence, vocab):
    """Convert a sentence into a list of vocabulary ids.

    The sentence is coerced with ``str`` and split on whitespace; words
    missing from ``vocab`` are mapped to the id stored under ``"<unk>"``.
    """
    ids = []
    for word in str(sentence).split():
        ids.append(vocab.get(word, vocab["<unk>"]))
    return ids

def pad_sequence(tokens, limit, pad_value = 0):
    """Force a token-id sequence to exactly ``limit`` entries.

    Shorter sequences are right-padded with ``pad_value``; longer ones are
    truncated so every result has the same length and can be stacked into a
    batch.

    Args:
        tokens: Sequence of integer token ids.
        limit: Desired output length (negative values are treated as 0).
        pad_value: Id used for padding positions; defaults to 0.

    Returns:
        1-D ``numpy`` array of dtype ``int64`` and length ``limit``.
    """
    limit = max(int(limit), 0)
    kept = list(tokens)[:limit]
    padding = [pad_value] * (limit - len(kept))
    # An explicit dtype keeps the array integral even when it is empty
    # (numpy would otherwise infer float64 for an empty list).
    return np.array(kept + padding, dtype = np.int64)

In [51]:
# Build one vocabulary per language from the cleaned corpus columns:
# English is the source side, Hindi the target side.
source_vocab = create_vocab(sentences["english"])
target_vocab = create_vocab(sentences["hindi"])

# Report the number of entries in each vocabulary.
print(len(source_vocab))
print(len(target_vocab))

3934
4268


In [52]:
class Configure(torch.utils.data.Dataset):
    """Dataset that yields (source, target) id tensors for one sentence pair.

    Each item reads the ``"english"`` (source) and ``"hindi"`` (target)
    columns of one row, tokenises them with the vocabularies given to the
    constructor, pads both to ``max_len`` and returns them as ``long``
    tensors on the module-level ``device``.

    Args:
        data: DataFrame with ``"english"`` and ``"hindi"`` columns.
        max_len: Length every sequence is padded to.
        source_vocab: Token -> id mapping for the English side.
        target_vocab: Token -> id mapping for the Hindi side.
    """
    def __init__(self, data, max_len, source_vocab, target_vocab):
        self.data = data
        self.max_len = max_len
        # Keep the vocabularies on the instance so items are encoded with the
        # mappings that were passed in, not with same-named module globals.
        self.source_vocab = source_vocab
        self.target_vocab = target_vocab

    def __len__(self):
        """Number of sentence pairs in the underlying frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(source_tensor, target_tensor)`` for the row at position ``index``."""
        row = self.data.iloc[index]
        source_tensor = self._encode(row["english"], self.source_vocab)
        target_tensor = self._encode(row["hindi"], self.target_vocab)
        return source_tensor, target_tensor

    def _encode(self, sentence, vocab):
        """Tokenise, pad to ``max_len`` and move one sentence onto ``device``."""
        padded = pad_sequence(tokenize(sentence, vocab), self.max_len)
        # NOTE(review): moving tensors to the device per item matches the
        # original behaviour, which the training loop relies on.
        return torch.from_numpy(padded).long().to(device)

In [53]:
# Batching configuration.
batch_size = 20
num_workers = 0
max_sequence_length = 40

# Wrap the cleaned frame so each item is a padded (source, target) pair of id tensors.
dataset = Configure(sentences, max_sequence_length, source_vocab, target_vocab)

# Serve the dataset in shuffled mini-batches.
dataloader = DataLoader(
    dataset,
    batch_size = batch_size,
    shuffle = True,
    num_workers = num_workers,
)

In [54]:
# Pull a single batch from the loader and print the source/target tensor shapes as a sanity check.
X, y = next(iter(dataloader))
print(X.shape, y.shape)

torch.Size([20, 40]) torch.Size([20, 40])


In [55]:
class Encoder(nn.Module):
    """Embedding layer followed by a batch-first GRU.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector (GRU input size).
        hidden_dim: GRU hidden size.
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.lookup = nn.Embedding(num_embeddings = vocab_size, embedding_dim = embedding_dim)
        self.gru = nn.GRU(input_size = embedding_dim, hidden_size = hidden_dim, batch_first = True)

    def forward(self, x):
        """Embed the token ids ``x`` and return the GRU's ``(outputs, hidden)`` pair."""
        embedded = self.lookup(x)
        outputs, final_hidden = self.gru(embedded)
        return outputs, final_hidden

In [56]:
class Attention(nn.Module):
    """Additive attention that scores every step of ``output`` against ``hidden``.

    ``fc1`` maps the concatenated (hidden, output) features from
    ``2 * hidden_dim`` to ``hidden_dim``; ``fc2`` reduces them to one
    bias-free score per step.
    """
    def __init__(self, hidden_dim):
        super().__init__()
        self.fc1 = nn.Linear(hidden_dim * 2, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1, bias = False)

    def forward(self, hidden, output):
        """Return softmax-normalised weights over dimension 1 of ``output``.

        ``hidden`` has its first two axes swapped and is then tiled along
        dimension 1 to match ``output.shape[1]`` before concatenation.
        """
        steps = output.shape[1]
        query = hidden.permute(1, 0, 2).repeat(1, steps, 1)
        features = torch.cat((query, output), dim = 2)
        scores = self.fc2(torch.tanh(self.fc1(features))).squeeze(2)
        return torch.softmax(scores, dim = 1)

In [57]:
class Decoder(nn.Module):
    """Single-step GRU decoder that attends over the encoder outputs.

    The GRU consumes the token embedding concatenated with the attention
    context (``embedding_dim + hidden_dim`` features); the output layer maps
    the concatenated GRU output and context (``2 * hidden_dim``) to
    vocabulary logits.
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.lookup = nn.Embedding(vocab_size, embedding_dim)
        self.attention = Attention(hidden_dim)
        self.gru = nn.GRU(embedding_dim + hidden_dim, hidden_dim, batch_first = True)
        self.fc = nn.Linear(hidden_dim * 2, vocab_size)

    def forward(self, x, hidden, encoder_outputs):
        """Run one decoding step.

        Returns:
            ``(logits, hidden, attention_weights)``: the output-layer result,
            the updated GRU hidden state, and the attention weights with
            their singleton dimension 1 removed.
        """
        embedded = self.lookup(x)

        # Attention weights get a singleton middle axis so bmm yields one context vector per batch row.
        weights = self.attention(hidden, encoder_outputs).unsqueeze(1)
        context = torch.bmm(weights, encoder_outputs)

        # Insert a step axis at position 1 when either tensor arrives as 2-D.
        if embedded.dim() == 2:
            embedded = embedded.unsqueeze(1)
        if context.dim() == 2:
            context = context.unsqueeze(1)

        gru_input = torch.cat((embedded, context), dim = 2)
        gru_out, hidden = self.gru(gru_input, hidden)

        # Drop the step axis again before the final projection.
        features = torch.cat((gru_out.squeeze(1), context.squeeze(1)), dim = 1)
        logits = self.fc(features)
        return logits, hidden, weights.squeeze(1)

In [58]:
class PositionalEncoding(nn.Module):
    """
    Sinusoidal positional encoding from the paper 'Attention Is All You Need'.

    The encoding gives the model a notion of where each token sits in the
    sequence. It is built from sine and cosine, which are bounded functions
    (values stay within [-1, 1]).
    Read: https://en.wikipedia.org/wiki/Bounded_function
    Read more on: https://en.wikipedia.org/wiki/Transformer_(deep_learning_architecture)

    A 2-D table of shape (max_length, d) is precomputed: row ``p`` is the
    encoding of position ``p``. Even columns hold ``sin(p * w_i)`` and odd
    columns hold ``cos(p * w_i)``, with ``w_i = exp(-ln(10000) * 2i / d)``.

    The table is stored as a buffer (non-trainable) with a leading axis of
    size 1, so it broadcasts over the batch dimension in ``forward``.

    Args:
        d: Embedding width. Odd values are supported; the cosine half then
            has one column fewer than the sine half.
        dropout: Dropout probability applied after adding the encoding.
        max_length: Number of positions to precompute.
    """
    def __init__(self, d, dropout, max_length = 200):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(dropout)

        pe = torch.zeros(max_length, d)
        pos = torch.arange(0, max_length).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d, 2).float() * (-math.log(10000.0) / d))
        angles = pos * div_term

        pe[:, 0::2] = torch.sin(angles)
        # For odd d there are only d // 2 odd columns, one fewer than the
        # number of frequencies, so trim the cosine block to fit.
        pe[:, 1::2] = torch.cos(angles[:, : d // 2])
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Add the first ``x.size(1)`` position rows to ``x`` and apply dropout."""
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

In [59]:
class TokenEmbedding(nn.Module):
    """Embedding lookup whose vectors are scaled by ``sqrt(emb_size)``."""
    def __init__(self, vocab_size: int, emb_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: torch.Tensor):
        """Cast ``tokens`` to ``long``, look them up and multiply by ``sqrt(emb_size)``."""
        scale = math.sqrt(self.emb_size)
        vectors = self.embedding(tokens.long())
        return vectors * scale

In [60]:
class Seq2Seq(nn.Module):
    """Encoder-decoder translation model built on ``nn.Transformer`` (batch-first).

    Source and target ids are embedded by separate ``TokenEmbedding`` tables,
    pass through one shared ``PositionalEncoding`` module, and go through the
    transformer; a linear ``generator`` maps decoder states to
    target-vocabulary logits.

    Args:
        num_encoder_layers: Number of transformer encoder layers.
        num_decoder_layers: Number of transformer decoder layers.
        embedding_dim: Model width (``d_model``).
        n_heads: Number of attention heads.
        source_vocab_size: Rows in the source embedding table.
        target_vocab_size: Rows in the target embedding table / output logits.
        dim_feedforward: Width of the feed-forward sublayers.
        dropout: Dropout probability for the transformer and positional encoding.
        pad_idx: Token id treated as padding when masks are built.
    """
    def __init__(self,
                 num_encoder_layers,
                 num_decoder_layers,
                 embedding_dim,
                 n_heads,
                 source_vocab_size,
                 target_vocab_size,
                 dim_feedforward = 512,
                 dropout = 0.1,
                 pad_idx = 0
                ):
        super().__init__()
        self.pad_idx = pad_idx

        transformer_options = dict(
            d_model = embedding_dim,
            nhead = n_heads,
            num_encoder_layers = num_encoder_layers,
            num_decoder_layers = num_decoder_layers,
            dim_feedforward = dim_feedforward,
            dropout = dropout,
            batch_first = True,
        )
        self.transformer = nn.Transformer(**transformer_options)

        self.generator = nn.Linear(embedding_dim, target_vocab_size)
        self.source_token_embedding = TokenEmbedding(source_vocab_size, embedding_dim)
        self.target_token_embedding = TokenEmbedding(target_vocab_size, embedding_dim)
        self.positional_encoding = PositionalEncoding(embedding_dim, dropout)

    def make_pad_mask(self, seq):
        """Boolean mask that is True wherever ``seq`` equals ``pad_idx``."""
        return seq.eq(self.pad_idx)

    def make_causal_mask(self, size, device):
        """Square float mask: ``-inf`` strictly above the diagonal, 0 elsewhere."""
        blocked = torch.full((size, size), float("-inf"), device = device)
        return blocked.triu(diagonal = 1)

    def forward(self, src, trg):
        """Return target-vocabulary logits for every position of ``trg``."""
        src_emb = self.positional_encoding(self.source_token_embedding(src))
        trg_emb = self.positional_encoding(self.target_token_embedding(trg))

        src_pad_mask = self.make_pad_mask(src)
        trg_pad_mask = self.make_pad_mask(trg)
        # The causal mask is sized by the target's dimension 1 and created on the target's device.
        causal_mask = self.make_causal_mask(trg.size(1), trg.device)

        decoded = self.transformer(
            src_emb,
            trg_emb,
            tgt_mask = causal_mask,
            src_key_padding_mask = src_pad_mask,
            tgt_key_padding_mask = trg_pad_mask,
            memory_key_padding_mask = src_pad_mask,
        )
        return self.generator(decoded)

In [61]:
# Transformer hyperparameters.
embedding_dim = 256
nhead = 8
num_encoder_layers = 3
num_decoder_layers = 3
dim_feedforward = 512
dropout = 0.1
pad_idx = 0

# Optimisation settings.
learning_rate = 1e-4
epochs = 20

# Embedding tables and the output layer must cover the largest token id, so
# both sizes are derived the same way: highest id in the vocabulary plus one.
source_vocab_size = max(source_vocab.values()) + 1
target_vocab_size = max(target_vocab.values()) + 1

# NOTE(review): these two values are not referenced by the cells visible here
# (they look like leftovers from the GRU encoder/decoder) - confirm before removing.
hidden_dim = 512
teacher_forcing_ratio = 0.5

# Build the model with keyword arguments so each value is bound to the intended parameter.
model = Seq2Seq(
    num_encoder_layers = num_encoder_layers,
    num_decoder_layers = num_decoder_layers,
    embedding_dim = embedding_dim,
    n_heads = nhead,
    source_vocab_size = source_vocab_size,
    target_vocab_size = target_vocab_size,
    dim_feedforward = dim_feedforward,
    dropout = dropout,
    pad_idx = pad_idx,
).to(device)

In [62]:
# Cross-entropy over the target vocabulary; positions whose label equals pad_idx are ignored.
criterion = nn.CrossEntropyLoss(ignore_index = pad_idx)

# Adam over all model parameters with explicitly set betas and epsilon.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr = learning_rate,
    betas = (0.9, 0.98),
    eps = 1e-9
)

In [63]:
# Mean training loss of every epoch, in order.
losses = []

for epoch in range(epochs):
    running_loss = []
    model.train()
    progress_bar = tqdm(dataloader, desc = f"Epoch {epoch + 1}/{epochs}", leave = False)

    for source, target in progress_bar:
        optimizer.zero_grad()

        # The decoder is fed the target without its last position and is
        # scored against the target without its first position (shift by one).
        decoder_input = target[:, :-1]
        expected = target[:, 1:].reshape(-1)

        logits = model(source, decoder_input)
        loss = criterion(logits.reshape(-1, logits.shape[-1]), expected)

        loss.backward()
        optimizer.step()

        # Read the scalar once and reuse it for both the log and the progress bar.
        batch_loss = loss.item()
        running_loss.append(batch_loss)
        progress_bar.set_postfix({"Loss": batch_loss})

    # An empty loader yields NaN instead of raising ZeroDivisionError.
    avg_loss = sum(running_loss) / len(running_loss) if running_loss else float("nan")
    losses.append(avg_loss)
    rprint.color("white").style('bold').show(f"[Epoch {epoch + 1}/{epochs}] - Mean Loss: {avg_loss: .4f}")

Epoch 1/20:   0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 2/20:   0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 3/20:   0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 4/20:   0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 5/20:   0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 6/20:   0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 7/20:   0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 8/20:   0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 9/20:   0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 10/20:   0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 11/20:   0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 12/20:   0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 13/20:   0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 14/20:   0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 15/20:   0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 16/20:   0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 17/20:   0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 18/20:   0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 19/20:   0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 20/20:   0%|          | 0/1000 [00:00<?, ?it/s]

In [64]:
def translate_sentence(sentence, model, source_vocab, target_vocab, max_len = 50):
    """Greedily decode a translation for one source sentence.

    The sentence is normalised with ``preprocess`` (the same cleaning applied
    to the training corpus), tokenised with ``source_vocab`` and padded to
    the module-level ``max_sequence_length``. Starting from ``<sos>``, the
    highest-scoring next token is chosen repeatedly until ``<eos>`` is
    produced or ``max_len`` tokens have been generated.

    Args:
        sentence: Source-language text.
        model: Callable as ``model(source_ids, target_ids)`` returning logits
            indexed as (batch, position, vocabulary).
        source_vocab: Token -> id mapping for the source language.
        target_vocab: Token -> id mapping for the target language; must
            contain ``"<sos>"`` and ``"<eos>"``.
        max_len: Maximum number of tokens to generate.

    Returns:
        The generated tokens joined by single spaces, without the ``<sos>``
        and ``<eos>`` markers. An input with no tokens yields ``""``.

    Note:
        The model is switched to evaluation mode and left in it.
    """
    model.eval()
    idx2word = {i: w for w, i in target_vocab.items()}
    sos_idx = target_vocab["<sos>"]
    eos_idx = target_vocab["<eos>"]

    source_tokens = tokenize(preprocess(str(sentence)), source_vocab)
    if not source_tokens:
        # Nothing to translate; an all-padding source would be fully masked.
        return ""
    padded_source = pad_sequence(source_tokens, max_sequence_length)
    source_tensor = torch.from_numpy(padded_source).long().to(device).unsqueeze(0)

    target_indices = [sos_idx]
    with torch.no_grad():
        for _ in range(max_len):
            target_tensor = torch.tensor(target_indices, dtype = torch.long, device = device).unsqueeze(0)
            outputs = model(source_tensor, target_tensor)
            next_token = outputs[:, -1, :].argmax(dim = -1).item()

            # Stop before storing <eos>, so the result never needs its last
            # element trimmed. Trimming would drop a real token whenever
            # decoding ends because max_len was reached.
            if next_token == eos_idx:
                break
            target_indices.append(next_token)

    # Ids with no vocabulary entry are rendered as <unk> instead of raising KeyError.
    return " ".join(idx2word.get(i, "<unk>") for i in target_indices[1:])

In [78]:
# Translate one English sample with the trained model; the returned string is shown as the cell output.
translate_sentence("positioning and plugin and toolbars", model = model, source_vocab = source_vocab, target_vocab = target_vocab)

'प्लगइन और उपकरण पट्टियाँ'