The dataset was added to this notebook manually. If you want to download it yourself, it is available at https://github.com/google-research-datasets/dakshina. This notebook uses the Tamil portion of the dataset.

# Question 1

In [1]:
# Importing libraries

import torch
import torch.nn as nn

In [2]:
# Encoder RNN

class EncoderRNN(nn.Module):
    """Character-level encoder: an embedding layer followed by an RNN, LSTM or GRU stack.

    Args:
        input_size: number of rows in the embedding table (source vocabulary size).
        embed_size: embedding dimension fed to the recurrent layer.
        hidden_size: hidden dimension of the recurrent layer.
        cell_type: 'RNN', 'LSTM' or 'GRU' (case-insensitive); anything else
            raises KeyError.
        num_layers: number of stacked recurrent layers.
        dropout: dropout between recurrent layers; only passed on when
            ``num_layers > 1``.
    """

    def __init__(self, input_size, embed_size, hidden_size, cell_type='RNN', num_layers=1, dropout=0.0):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_size, embed_size)
        self.cell_type = cell_type.upper()

        recurrent_modules = {'RNN': nn.RNN, 'LSTM': nn.LSTM, 'GRU': nn.GRU}
        # A single-layer stack gets 0.0 here, so the requested dropout is simply not used.
        stacked_dropout = dropout if num_layers > 1 else 0.0
        self.rnn = recurrent_modules[self.cell_type](
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=stacked_dropout,
        )

    def forward(self, input_seq):
        """Encode a batch of token ids laid out as (batch, seq_len).

        Returns:
            ``(outputs, state)`` exactly as produced by the recurrent module:
            ``state`` is the ``(hidden, cell)`` tuple for an LSTM and the
            hidden tensor for RNN/GRU.
        """
        outputs, state = self.rnn(self.embedding(input_seq))
        return outputs, state

In [3]:
# Decoder RNN

class DecoderRNN(nn.Module):
    """Single-step character decoder: embedding -> dropout -> RNN/LSTM/GRU -> linear projection.

    Args:
        output_size: target vocabulary size (embedding rows and projection width).
        embed_size: embedding dimension fed to the recurrent layer.
        hidden_size: hidden dimension of the recurrent layer.
        cell_type: 'RNN', 'LSTM' or 'GRU' (case-insensitive); anything else
            raises KeyError.
        num_layers: number of stacked recurrent layers.
        dropout: applied to the embedded input, and between recurrent layers
            when ``num_layers > 1``.
    """

    def __init__(self, output_size, embed_size, hidden_size, cell_type='RNN', num_layers=1, dropout=0.0):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(output_size, embed_size)
        self.dropout = nn.Dropout(dropout)
        self.cell_type = cell_type.upper()

        recurrent_modules = {'RNN': nn.RNN, 'LSTM': nn.LSTM, 'GRU': nn.GRU}
        # A single-layer stack gets 0.0 here; the embedding dropout above still applies.
        stacked_dropout = dropout if num_layers > 1 else 0.0
        self.rnn = recurrent_modules[self.cell_type](
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=stacked_dropout,
        )
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input_char, hidden):
        """Advance the decoder by one time step.

        Args:
            input_char: token ids of shape (batch,); a length-1 time axis is
                inserted before the recurrent layer.
            hidden: previous recurrent state (a ``(hidden, cell)`` tuple for LSTM).

        Returns:
            ``(logits, state)`` where ``logits`` has shape (batch, output_size)
            and ``state`` has the same structure as ``hidden``.
        """
        step_input = self.dropout(self.embedding(input_char).unsqueeze(1))
        step_output, state = self.rnn(step_input, hidden)
        logits = self.out(step_output.squeeze(1))
        return logits, state

In [4]:
# Seq2Seq model

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with optional teacher forcing."""

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def _match_decoder_layers(self, state):
        """Resize the layer axis (dim 0) of an encoder state to the decoder's layer count.

        With more decoder layers than encoder layers the encoder layers are
        tiled and truncated; with fewer, only the last encoder layers are kept.
        """
        enc_layers = self.encoder.num_layers
        dec_layers = self.decoder.num_layers
        if enc_layers == dec_layers:
            return state
        if dec_layers > enc_layers:
            repeats = dec_layers // enc_layers + 1
            return state.repeat(repeats, 1, 1)[:dec_layers]
        return state[-dec_layers:]

    def forward(self, source, target, teacher_forcing_ratio=0.5):
        """Run the encoder once, then decode ``target.size(1) - 1`` steps.

        Args:
            source: source token ids, batch on dim 0.
            target: target token ids of shape (batch, target_len); column 0 is
                the first decoder input.
            teacher_forcing_ratio: per-step probability of feeding the gold
                token instead of the decoder's own argmax (one draw per step,
                shared by the whole batch).

        Returns:
            Tensor of shape (batch, target_len, vocab) holding the decoder
            logits; position 0 is never written and stays zero.
        """
        batch_size, target_len = source.size(0), target.size(1)
        vocab_size = self.decoder.out.out_features
        outputs = torch.zeros(batch_size, target_len, vocab_size).to(source.device)

        _, state = self.encoder(source)
        if self.encoder.cell_type == 'LSTM':
            # LSTM state is a (hidden, cell) pair; both parts are resized alike.
            state = tuple(self._match_decoder_layers(part) for part in state)
        else:
            state = self._match_decoder_layers(state)

        decoder_input = target[:, 0]
        for step in range(1, target_len):
            step_logits, state = self.decoder(decoder_input, state)
            outputs[:, step, :] = step_logits
            use_gold = torch.rand(1).item() < teacher_forcing_ratio
            greedy_choice = step_logits.argmax(1)
            decoder_input = target[:, step] if use_gold else greedy_choice

        return outputs

In [5]:
# Wrapper function to create model

def create_model(input_vocab_size, output_vocab_size, embed_size=256, hidden_size=512,
                 cell_type='RNN', encoder_layers=1, decoder_layers=1, dropout=0.0):
    """Build a Seq2Seq model whose encoder and decoder share all hyperparameters except depth.

    Args:
        input_vocab_size: source vocabulary size (encoder embedding rows).
        output_vocab_size: target vocabulary size (decoder embedding rows / output width).
        embed_size: embedding dimension used on both sides.
        hidden_size: recurrent hidden dimension used on both sides.
        cell_type: 'RNN', 'LSTM' or 'GRU', used on both sides.
        encoder_layers: number of recurrent layers in the encoder.
        decoder_layers: number of recurrent layers in the decoder.
        dropout: dropout rate passed to both encoder and decoder.

    Returns:
        The assembled ``Seq2Seq`` module (not moved to any device).
    """
    return Seq2Seq(
        EncoderRNN(
            input_vocab_size, embed_size, hidden_size,
            cell_type=cell_type, num_layers=encoder_layers, dropout=dropout,
        ),
        DecoderRNN(
            output_vocab_size, embed_size, hidden_size,
            cell_type=cell_type, num_layers=decoder_layers, dropout=dropout,
        ),
    )

# Question 2

## Preparing dataset for training

In [6]:
# Importing Libraries

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch import optim
from collections import Counter
import os
import math

In [7]:
# Vocabulary class to handle character-to-index mapping
class Vocabulary:
    """Bidirectional character <-> index mapping with four reserved special tokens.

    Indices 0-3 are always <PAD>, <SOS>, <EOS>, <UNK>; characters added later
    receive consecutive indices starting at 4.
    """

    def __init__(self):
        self.char2idx = {}
        self.idx2char = {}
        for index, token in enumerate(('<PAD>', '<SOS>', '<EOS>', '<UNK>')):
            self.char2idx[token] = index
            self.idx2char[index] = token
        self.size = 4

    def add_sequence(self, sequence):
        """Register every not-yet-seen character of ``sequence`` under the next free index."""
        for symbol in sequence:
            if symbol in self.char2idx:
                continue
            next_index = self.size
            self.char2idx[symbol] = next_index
            self.idx2char[next_index] = symbol
            self.size = next_index + 1

    def get_indices(self, sequence):
        """Return the index of each character in ``sequence``; unknown characters map to <UNK>."""
        unknown = self.char2idx['<UNK>']
        return [self.char2idx.get(symbol, unknown) for symbol in sequence]

In [8]:
# Custom Dataset class

class DakshinaDataset(Dataset):
    """Dataset of (Latin, Tamil) word pairs encoded as index tensors.

    ``data`` is indexed positionally: column 0 holds the Tamil word (target)
    and column 1 the Latin-script word (source).
    """

    def __init__(self, data, src_vocab, tgt_vocab):
        self.data = data
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab

    def __len__(self):
        return len(self.data)

    @staticmethod
    def _encode(word, vocab):
        """Wrap the character indices of ``word`` in <SOS> ... <EOS> and return a long tensor."""
        indices = [vocab.char2idx['<SOS>'], *vocab.get_indices(word), vocab.char2idx['<EOS>']]
        return torch.tensor(indices, dtype=torch.long)

    def __getitem__(self, idx):
        """Return ``(source_tensor, target_tensor)`` for row ``idx``."""
        latin_word = self.data.iloc[idx, 1]
        tamil_word = self.data.iloc[idx, 0]
        return self._encode(latin_word, self.src_vocab), self._encode(tamil_word, self.tgt_vocab)

In [9]:
# Function to load and preprocess data
def load_dakshina_data(train_path, val_path, test_path):
    """Load the three Dakshina lexicon TSVs and build vocabularies from the training split.

    Each file is read without a header; column 0 is the Tamil word and
    column 1 its Latin-script romanization (any further columns are ignored).

    Args:
        train_path: path to the training TSV.
        val_path: path to the validation (dev) TSV.
        test_path: path to the test TSV.

    Returns:
        ``(train_dataset, val_dataset, test_dataset, src_vocab, tgt_vocab)``.
        Vocabularies contain only characters seen in the training split, so
        unseen characters in dev/test map to <UNK>.
    """
    def _read_pairs(path):
        # dtype=str + na_filter=False keep every cell as the literal text in the
        # file. With pandas' default NA detection, romanizations such as "null"
        # or "NA" are parsed as NaN and would later become the string "nan".
        return pd.read_csv(
            path, sep='\t', header=None, usecols=[0, 1], dtype=str, na_filter=False
        )

    train_df = _read_pairs(train_path)
    val_df = _read_pairs(val_path)
    test_df = _read_pairs(test_path)

    src_vocab = Vocabulary()  # English (Latin)
    tgt_vocab = Vocabulary()  # Tamil

    # The two vocabularies are independent, so each column is scanned on its own
    # (avoids the per-row overhead of DataFrame.iterrows).
    for latin_word in train_df[1]:
        src_vocab.add_sequence(latin_word)
    for tamil_word in train_df[0]:
        tgt_vocab.add_sequence(tamil_word)

    train_dataset = DakshinaDataset(train_df, src_vocab, tgt_vocab)
    val_dataset = DakshinaDataset(val_df, src_vocab, tgt_vocab)
    test_dataset = DakshinaDataset(test_df, src_vocab, tgt_vocab)

    return train_dataset, val_dataset, test_dataset, src_vocab, tgt_vocab


In [10]:
# Collate function for DataLoader
def collate_fn(batch):
    """Pad a list of (source, target) tensor pairs into two batch-first tensors.

    Shorter sequences are right-padded with 0 (the <PAD> index) up to the
    longest sequence on their own side of the batch.

    Returns:
        ``(src_padded, tgt_padded)``, each of shape (batch, max_len_on_that_side).
    """
    sources, targets = zip(*batch)
    pad = torch.nn.utils.rnn.pad_sequence
    return (
        pad(sources, batch_first=True, padding_value=0),
        pad(targets, batch_first=True, padding_value=0),
    )

In [11]:
# Wrapper function for easier access

def prepare_data_loaders(train_path, val_path, test_path, batch_size=32):
    """Load the three splits and wrap each in a DataLoader.

    Only the training loader shuffles; all three pad batches with
    ``collate_fn`` and use pinned memory.

    Returns:
        ``(train_loader, val_loader, test_loader, src_vocab, tgt_vocab)``.
    """
    train_set, val_set, test_set, src_vocab, tgt_vocab = load_dakshina_data(
        train_path, val_path, test_path
    )

    def _make_loader(dataset, shuffle):
        return DataLoader(
            dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn, pin_memory=True
        )

    return (
        _make_loader(train_set, True),
        _make_loader(val_set, False),
        _make_loader(test_set, False),
        src_vocab,
        tgt_vocab,
    )

In [12]:
# Locations of the Tamil Dakshina lexicon splits (train / dev / test TSVs).
# These point at a Kaggle input dataset; update them when running elsewhere.
train_path = '/kaggle/input/dakshina-tamil/ta.translit.sampled.train.tsv'
val_path = '/kaggle/input/dakshina-tamil/ta.translit.sampled.dev.tsv'
test_path = '/kaggle/input/dakshina-tamil/ta.translit.sampled.test.tsv'

In [13]:
# Build the train/val/test loaders and both vocabularies (default batch size of 32).
# These module-level names are read later by train_and_evaluate.
train_loader, val_loader, test_loader, src_vocab, tgt_vocab = prepare_data_loaders(train_path, val_path, test_path)

In [14]:
# Print vocabulary sizes (each count includes the four special tokens
# <PAD>, <SOS>, <EOS>, <UNK>)
print(f"Source (English) vocabulary size: {src_vocab.size}")
print(f"Target (Tamil) vocabulary size: {tgt_vocab.size}")

Source (English) vocabulary size: 30
Target (Tamil) vocabulary size: 50


## Setting up wandb

In [15]:
# Install the Weights & Biases client quietly (-q) through a shell command.
# NOTE(review): no version is pinned; consider `%pip install` with a pinned version for reproducibility.
!pip install wandb -q

In [16]:
import wandb

In [17]:
# Read the W&B API key from the Kaggle secret named "wandb" instead of
# hard-coding it in the notebook.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("wandb")

In [18]:
# Authenticate this session with W&B using the API key held in secret_value_0.
wandb.login(key=secret_value_0)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mda24m007[0m ([33mda24m007-iit-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

## Running wandb sweep

In [19]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [20]:
# Beam search decoding
def _fit_decoder_layers(state, num_layers):
    """Resize the layer axis (dim 0) of a recurrent state to ``num_layers``.

    Uses the same rule as ``Seq2Seq.forward`` so decoding sees the state layout
    the model was trained with: too few layers are tiled and truncated, too
    many keep only the last ``num_layers``.
    """
    available = state.size(0)
    if available == num_layers:
        return state
    if available < num_layers:
        repeats = num_layers // available + 1
        return state.repeat(repeats, 1, 1)[:num_layers]
    return state[-num_layers:]


def beam_search_decode(model, src, max_len, beam_width, sos_idx, eos_idx):
    """Decode one source sequence with beam search.

    The model is switched to eval mode and decoding runs without gradient
    tracking. Hypotheses are ranked by summed negative log-probability
    (lower is better, no length normalization).

    Args:
        model: Seq2Seq-style module exposing ``encoder`` and ``decoder``.
        src: source token ids of shape (1, src_len); only batch size 1 is
            supported because each hypothesis is tracked as a 1-D sequence.
        max_len: maximum number of decoding steps.
        beam_width: number of hypotheses kept after every step.
        sos_idx: index of the start-of-sequence token.
        eos_idx: index of the end-of-sequence token.

    Returns:
        1-D tensor of token ids starting with ``sos_idx``: the best finished
        hypothesis, or the best unfinished one if none reached ``eos_idx``.
    """
    model.eval()
    # Decode on the device the model lives on; fall back to src's device for a
    # parameter-free model.
    first_param = next(model.parameters(), None)
    dev = first_param.device if first_param is not None else src.device
    src = src.to(dev)

    with torch.no_grad():
        _, state = model.encoder(src)

        # Adapt the encoder state to the decoder depth. An LSTM encoder returns
        # a (hidden, cell) tuple and both parts are resized the same way.
        dec_layers = model.decoder.num_layers
        if isinstance(state, tuple):
            state = tuple(_fit_decoder_layers(part, dec_layers) for part in state)
        else:
            state = _fit_decoder_layers(state, dec_layers)

        beams = [(torch.tensor([sos_idx], device=dev), state, 0.0)]  # (sequence, state, score)
        completed = []

        for _ in range(max_len):
            candidates = []
            for seq, hid, score in beams:
                if seq[-1].item() == eos_idx:
                    completed.append((seq, score))
                    continue
                logits, new_hid = model.decoder(seq[-1].unsqueeze(0), hid)
                # log_softmax avoids log(0) when a probability underflows.
                log_probs = torch.log_softmax(logits, dim=-1)
                k = min(beam_width, log_probs.size(-1))
                top_log_probs, top_idx = log_probs.topk(k)
                for i in range(k):
                    candidates.append((
                        torch.cat([seq, top_idx[:, i]]),
                        new_hid,
                        score - top_log_probs[0, i].item(),
                    ))

            # Keep the beam_width lowest-cost hypotheses.
            beams = sorted(candidates, key=lambda beam: beam[2])[:beam_width]

            if len(completed) >= beam_width:
                break

        # Hypotheses that produced EOS on the final step are still in `beams`;
        # include them so a finished, better-scoring sequence is not ignored.
        completed.extend((seq, score) for seq, _, score in beams if seq[-1].item() == eos_idx)
        completed.sort(key=lambda item: item[1])
        if completed:
            return completed[0][0]
        return beams[0][0]


In [21]:
# Training and evaluation function
def _ids_to_text(token_ids, vocab):
    """Join the characters for ``token_ids``, skipping ids 0, 1 and 2 (<PAD>, <SOS>, <EOS>)."""
    return ''.join(vocab.idx2char[i] for i in token_ids if i not in (0, 1, 2))


def _first_batch_exact_match(model, loader, beam_width):
    """Count exact word matches of beam-search output on the first batch of ``loader``.

    Every example of that batch is decoded on its own with
    ``beam_search_decode`` (max 50 steps) and compared, as a string without
    special tokens, to its reference.

    Returns:
        ``(correct, total)`` for that single batch; ``(0, 0)`` if the loader is empty.
    """
    sos_idx = tgt_vocab.char2idx['<SOS>']
    eos_idx = tgt_vocab.char2idx['<EOS>']
    correct = 0
    total = 0
    with torch.no_grad():
        for src, tgt in loader:
            src, tgt = src.to(device), tgt.to(device)
            for i in range(src.size(0)):
                pred = beam_search_decode(
                    model, src[i:i+1], max_len=50,
                    beam_width=beam_width,
                    sos_idx=sos_idx,
                    eos_idx=eos_idx
                )
                pred_str = _ids_to_text(pred.tolist(), tgt_vocab)
                tgt_str = _ids_to_text(tgt[i, 1:].tolist(), tgt_vocab)
                if pred_str == tgt_str:
                    correct += 1
                total += 1
            break  # only the first batch is scored
    return correct, total


def train_and_evaluate():
    """Train one sweep configuration for 10 epochs and log per-epoch metrics to W&B.

    Reads hyperparameters from ``wandb.config`` and the module-level
    ``train_loader``, ``val_loader``, ``src_vocab``, ``tgt_vocab`` and
    ``device``. Logged metrics per epoch: mean train loss (teacher forcing
    0.5), mean validation loss (no teacher forcing), and exact-match accuracy
    on one training batch and one validation batch.
    """
    wandb.init()
    config = wandb.config

    # Create model with sweep parameters
    model = create_model(
        input_vocab_size=src_vocab.size,
        output_vocab_size=tgt_vocab.size,
        embed_size=config.embed_size,
        hidden_size=config.hidden_size,
        cell_type=config.cell_type,
        encoder_layers=config.encoder_layers,
        decoder_layers=config.decoder_layers,
        dropout=config.dropout
    ).to(device)

    optimizer = optim.Adam(model.parameters(), lr=0.001)
    # Index 0 is <PAD>; padded target positions do not contribute to the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    for epoch in range(10):
        model.train()
        train_loss = 0
        for src, tgt in train_loader:
            src, tgt = src.to(device), tgt.to(device)
            optimizer.zero_grad()
            output = model(src, tgt, teacher_forcing_ratio=0.5)
            # Position 0 is the <SOS> slot, which the model never predicts.
            output = output[:, 1:].reshape(-1, output.size(-1))
            tgt = tgt[:, 1:].reshape(-1)
            loss = criterion(output, tgt)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # Training accuracy on one batch
        model.eval()
        train_correct, train_total = _first_batch_exact_match(model, train_loader, config.beam_width)

        # Validation loss over the whole validation set
        val_loss = 0
        with torch.no_grad():
            for src, tgt in val_loader:
                src, tgt = src.to(device), tgt.to(device)
                output = model(src, tgt, teacher_forcing_ratio=0.0)
                output = output[:, 1:].reshape(-1, output.size(-1))
                tgt = tgt[:, 1:].reshape(-1)
                loss = criterion(output, tgt)
                val_loss += loss.item()

        # Validation accuracy on one batch
        val_correct, val_total = _first_batch_exact_match(model, val_loader, config.beam_width)

        # Log metrics to WandB; max(..., 1) guards against empty loaders.
        wandb.log({
            "epoch": epoch,
            "train_loss": train_loss / max(len(train_loader), 1),
            "val_loss": val_loss / max(len(val_loader), 1),
            "train_accuracy": train_correct / max(train_total, 1),
            "val_accuracy": val_correct / max(val_total, 1)
        })

In [22]:
# WandB sweep configuration
# Bayesian search that maximizes the logged `val_accuracy`; every
# hyperparameter is drawn from a discrete list of candidate values.
sweep_config = {
    'method': 'bayes',
    'metric': {'name': 'val_accuracy', 'goal': 'maximize'},
    'parameters': {
        name: {'values': candidates}
        for name, candidates in (
            ('embed_size', [16, 32, 64, 128, 256]),
            ('hidden_size', [16, 32, 128, 256, 512]),
            ('encoder_layers', [1, 2, 3]),
            ('decoder_layers', [1, 2, 3]),
            ('cell_type', ['RNN', 'GRU', 'LSTM']),
            ('dropout', [0.2, 0.3]),
            ('beam_width', [1, 3, 5]),
        )
    },
}

In [23]:
# Register the sweep under the "DL-A3" project, then have the agent call
# train_and_evaluate for 20 configurations chosen by the sweep.
sweep_id = wandb.sweep(sweep_config, project="DL-A3")
wandb.agent(sweep_id, function=train_and_evaluate, count=20)

Create sweep with ID: 6krpo8ms
Sweep URL: https://wandb.ai/da24m007-iit-madras/DL-A3/sweeps/6krpo8ms


[34m[1mwandb[0m: Agent Starting Run: efutol5l with config:
[34m[1mwandb[0m: 	beam_width: 1
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	decoder_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embed_size: 16
[34m[1mwandb[0m: 	encoder_layers: 3
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: Tracking run with wandb version 0.19.9
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20250515_070224-efutol5l[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mswift-sweep-1[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/da24m007-iit-madras/DL-A3[0m
[34m[1mwandb[0m: 🧹 View sweep at [34m[4mhttps://wandb.ai/da24m007-iit-madras/DL-A3/sweeps/6krpo8ms[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/da24m007-iit-madras/DL-A3/runs/efutol5l[0m
[34m[1mwandb[0m:                                                            