The dataset was added manually. If you want to download it yourself, it is available from the Dakshina repository (https://github.com/google-research-datasets/dakshina). This notebook uses the Tamil portion of the dataset.

# Question 1

In [1]:
# Importing libraries

import torch
import torch.nn as nn

In [3]:
# Encoder RNN

class EncoderRNN(nn.Module):
    """Embed a batch of token ids and run them through a recurrent stack.

    Args:
        input_size: Number of rows in the embedding table.
        embed_size: Width of each embedding vector (also the RNN input width).
        hidden_size: Width of the recurrent hidden state.
        cell_type: 'RNN', 'LSTM' or 'GRU'; matched case-insensitively.
        num_layers: Number of stacked recurrent layers.
        dropout: Dropout between recurrent layers. It is forced to 0.0 when
            there is only one layer.

    Raises:
        KeyError: If ``cell_type`` is not one of the three supported names.
    """

    def __init__(self, input_size, embed_size, hidden_size, cell_type='RNN', num_layers=1, dropout=0.0):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_size, embed_size)
        self.cell_type = cell_type.upper()

        # Map the normalised cell name onto the matching torch recurrent layer.
        recurrent_layers = {'RNN': nn.RNN, 'LSTM': nn.LSTM, 'GRU': nn.GRU}
        layer_cls = recurrent_layers[self.cell_type]
        # Inter-layer dropout is only passed through for stacked configurations.
        inter_layer_dropout = dropout if num_layers > 1 else 0.0
        self.rnn = layer_cls(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
        )

    def forward(self, input_seq):
        """Encode ``input_seq`` and return ``(outputs, state)``.

        ``state`` is the hidden tensor for RNN/GRU cells and the
        ``(hidden, cell)`` pair for LSTM cells, exactly as the wrapped
        recurrent layer produces it.
        """
        # batch_first=True, so the embedded batch is [batch_size, seq_len, embed_size].
        token_vectors = self.embedding(input_seq)
        outputs, state = self.rnn(token_vectors)
        return outputs, state

In [4]:
# Decoder RNN

class DecoderRNN(nn.Module):
    """Single-step decoder: embedding, recurrent stack, then a linear projection.

    Args:
        output_size: Number of rows in the embedding table and width of the
            projected scores.
        embed_size: Width of each embedding vector (also the RNN input width).
        hidden_size: Width of the recurrent hidden state.
        cell_type: 'RNN', 'LSTM' or 'GRU'; matched case-insensitively.
        num_layers: Number of stacked recurrent layers.
        dropout: Dropout between recurrent layers. It is forced to 0.0 when
            there is only one layer.

    Raises:
        KeyError: If ``cell_type`` is not one of the three supported names.
    """

    def __init__(self, output_size, embed_size, hidden_size, cell_type='RNN', num_layers=1, dropout=0.0):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(output_size, embed_size)
        self.cell_type = cell_type.upper()

        # Map the normalised cell name onto the matching torch recurrent layer.
        recurrent_layers = {'RNN': nn.RNN, 'LSTM': nn.LSTM, 'GRU': nn.GRU}
        layer_cls = recurrent_layers[self.cell_type]
        # Inter-layer dropout is only passed through for stacked configurations.
        inter_layer_dropout = dropout if num_layers > 1 else 0.0
        self.rnn = layer_cls(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
        )
        # Projects each hidden vector onto one score per output symbol.
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input_char, hidden):
        """Advance the decoder by one time step.

        Args:
            input_char: Token ids for the current step, one per batch element.
            hidden: Recurrent state to continue from (a tensor for RNN/GRU,
                a ``(hidden, cell)`` pair for LSTM).

        Returns:
            ``(scores, new_state)`` where ``scores`` is the linear projection
            of the step output and ``new_state`` has the same structure as
            ``hidden``.
        """
        # Insert a length-1 time axis: [batch_size, 1, embed_size].
        step_input = self.embedding(input_char).unsqueeze(1)
        step_output, new_state = self.rnn(step_input, hidden)
        scores = self.out(step_output.squeeze(1))
        return scores, new_state


In [5]:
# Seq2Seq model

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes one target position at a time.

    Args:
        encoder: Callable module returning ``(outputs, state)`` for a source
            batch. Only ``state`` is used here.
        decoder: Callable module mapping ``(tokens, state)`` to
            ``(scores, new_state)``. It must expose ``out.out_features``, the
            number of scores produced per step.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_forcing_ratio=0.5):
        """Run the encoder once, then unroll the decoder over ``target``.

        Args:
            source: Batch of source token ids, batch dimension first.
            target: Batch of target token ids, ``[batch_size, target_len]``.
                Column 0 is fed to the decoder as its first input, so it is
                expected to hold the start-of-sequence token.
            teacher_forcing_ratio: Probability, drawn once per time step for
                the whole batch, of feeding the ground-truth token instead of
                the decoder's own argmax prediction.

        Returns:
            Tensor ``[batch_size, target_len, num_scores]`` on ``source``'s
            device. Position 0 is never written and stays all zeros, because
            decoding starts at position 1.
        """
        batch_size = source.size(0)
        target_len = target.size(1)
        num_scores = self.decoder.out.out_features
        # Allocate directly on the right device instead of CPU-then-copy.
        outputs = torch.zeros(batch_size, target_len, num_scores, device=source.device)

        # The encoder's final state (tensor, or (hidden, cell) for LSTM) is
        # handed to the decoder unchanged; per-step encoder outputs are unused.
        _, hidden = self.encoder(source)

        # First decoder input is the first target column (the start token),
        # whatever id the vocabulary assigns to it.
        decoder_input = target[:, 0]

        for t in range(1, target_len):
            output, hidden = self.decoder(decoder_input, hidden)
            outputs[:, t, :] = output
            # One coin flip per step decides the next input for the whole batch.
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            decoder_input = target[:, t] if teacher_force else output.argmax(1)

        return outputs

In [6]:
# Wrapper function to create model

def create_model(input_vocab_size, output_vocab_size, embed_size=256, hidden_size=512, 
                 cell_type='RNN', num_layers=1, dropout=0.0):
    """Build a Seq2Seq model whose encoder and decoder share one configuration.

    Args:
        input_vocab_size: Vocabulary size given to the encoder's embedding.
        output_vocab_size: Vocabulary size given to the decoder's embedding
            and output projection.
        embed_size: Embedding width used on both sides.
        hidden_size: Recurrent hidden width used on both sides.
        cell_type: 'RNN', 'LSTM' or 'GRU', used on both sides.
        num_layers: Number of recurrent layers used on both sides.
        dropout: Inter-layer dropout used on both sides.

    Returns:
        A ``Seq2Seq`` instance wrapping the freshly built encoder and decoder.
    """
    # Settings common to both halves; only the vocabulary size differs.
    shared_config = dict(
        embed_size=embed_size,
        hidden_size=hidden_size,
        cell_type=cell_type,
        num_layers=num_layers,
        dropout=dropout,
    )
    return Seq2Seq(
        EncoderRNN(input_size=input_vocab_size, **shared_config),
        DecoderRNN(output_size=output_vocab_size, **shared_config),
    )

# Question 2

## Preparing dataset for training

In [7]:
# Importing Libraries

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import os

In [8]:
# Vocabulary class to handle character-to-index mapping
class Vocabulary:
    """Two-way mapping between symbols and integer ids.

    Ids 0-3 are reserved for the special tokens ``<PAD>``, ``<SOS>``,
    ``<EOS>`` and ``<UNK>``. Every other symbol receives the next free id
    the first time it is added.
    """

    def __init__(self):
        reserved = ('<PAD>', '<SOS>', '<EOS>', '<UNK>')
        self.char2idx = {token: position for position, token in enumerate(reserved)}
        self.idx2char = dict(enumerate(reserved))
        # Number of known symbols; also the id the next new symbol will get.
        self.size = len(reserved)

    def add_sequence(self, sequence):
        """Register every symbol of ``sequence`` that is not yet known."""
        for symbol in sequence:
            if symbol in self.char2idx:
                continue
            new_id = self.size
            self.char2idx[symbol] = new_id
            self.idx2char[new_id] = symbol
            self.size = new_id + 1

    def get_indices(self, sequence):
        """Return the ids for ``sequence``, using the ``<UNK>`` id for unknown symbols."""
        encoded = []
        for symbol in sequence:
            encoded.append(self.char2idx.get(symbol, self.char2idx['<UNK>']))
        return encoded

In [9]:
# Custom Dataset class

class DakshinaDataset(Dataset):
    """Dataset yielding ``(source_ids, target_ids)`` tensor pairs.

    Args:
        data: Table indexed positionally via ``.iloc``; column 1 holds the
            source (Latin-script) word and column 0 the target (Tamil) word.
        src_vocab: Vocabulary used to encode the source word.
        tgt_vocab: Vocabulary used to encode the target word.
    """

    def __init__(self, data, src_vocab, tgt_vocab):
        self.data = data
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab

    def __len__(self):
        """Number of rows in the underlying table."""
        return len(self.data)

    @staticmethod
    def _encode(vocab, word):
        """Wrap the ids of ``word`` in ``<SOS>`` ... ``<EOS>`` as a long tensor."""
        token_ids = [
            vocab.char2idx['<SOS>'],
            *vocab.get_indices(word),
            vocab.char2idx['<EOS>'],
        ]
        return torch.tensor(token_ids, dtype=torch.long)

    def __getitem__(self, idx):
        """Return the encoded ``(source, target)`` pair stored at row ``idx``."""
        latin_word = self.data.iloc[idx, 1]  # English (Latin)
        tamil_word = self.data.iloc[idx, 0]  # Tamil
        return (
            self._encode(self.src_vocab, latin_word),
            self._encode(self.tgt_vocab, tamil_word),
        )

In [10]:
# Function to load and preprocess data
def load_dakshina_data(train_path, val_path, test_path):
    """Load the three TSV splits and build vocabularies from the training split.

    Each file is read without a header row. Column 0 is treated as the target
    (Tamil) word and column 1 as the source (Latin-script) word; any further
    columns are ignored.

    Args:
        train_path: Path of the training TSV.
        val_path: Path of the validation TSV.
        test_path: Path of the test TSV.

    Returns:
        ``(train_dataset, val_dataset, test_dataset, src_vocab, tgt_vocab)``.
        Both vocabularies contain only characters seen in the training split,
        so unseen validation/test characters encode as ``<UNK>``.
    """
    train_df = _read_translit_pairs(train_path)
    val_df = _read_translit_pairs(val_path)
    test_df = _read_translit_pairs(test_path)

    # Build vocabularies
    src_vocab = Vocabulary()  # English (Latin)
    tgt_vocab = Vocabulary()  # Tamil

    # Add characters to vocab from training data only. The two vocabularies
    # are independent, so each column can be walked on its own (much cheaper
    # than DataFrame.iterrows, which builds a Series per row).
    for latin_word in train_df[1]:
        src_vocab.add_sequence(latin_word)
    for tamil_word in train_df[0]:
        tgt_vocab.add_sequence(tamil_word)

    # Create datasets; all three share the training vocabularies.
    train_dataset = DakshinaDataset(train_df, src_vocab, tgt_vocab)
    val_dataset = DakshinaDataset(val_df, src_vocab, tgt_vocab)
    test_dataset = DakshinaDataset(test_df, src_vocab, tgt_vocab)

    return train_dataset, val_dataset, test_dataset, src_vocab, tgt_vocab


def _read_translit_pairs(path):
    """Read the first two columns of a headerless TSV as literal strings."""
    # na_filter=False stops pandas from turning real words such as "null",
    # "NA" or "None" into NaN, which a later str() conversion would silently
    # rewrite as the different word "nan". dtype=str keeps digit-only fields
    # as text as well.
    frame = pd.read_csv(
        path, sep='\t', header=None, usecols=[0, 1], dtype=str, na_filter=False
    )
    # Ensure strings: guard so downstream character iteration always sees str.
    return frame.astype(str)


In [11]:
# Collate function for DataLoader
def collate_fn(batch):
    """Pad a list of ``(source, target)`` tensor pairs into two batch tensors.

    Sources and targets are padded independently, each to the longest
    sequence on its own side, using 0 as the fill value and putting the
    batch dimension first.

    Returns:
        ``(src_padded, tgt_padded)``.
    """
    sources, targets = zip(*batch)
    padded = tuple(
        torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=0)
        for sequences in (sources, targets)
    )
    return padded

In [12]:
# Wrapper function for easier access

def prepare_data_loaders(train_path, val_path, test_path, batch_size=32):
    """Load the three splits and wrap each one in a DataLoader.

    Only the training loader shuffles. All loaders use ``collate_fn`` for
    padding and request pinned memory.

    Args:
        train_path: Path of the training TSV.
        val_path: Path of the validation TSV.
        test_path: Path of the test TSV.
        batch_size: Number of pairs per batch for every loader.

    Returns:
        ``(train_loader, val_loader, test_loader, src_vocab, tgt_vocab)``.
    """
    train_set, val_set, test_set, src_vocab, tgt_vocab = load_dakshina_data(
        train_path, val_path, test_path
    )

    # (dataset, shuffle?) in train / validation / test order.
    split_plan = ((train_set, True), (val_set, False), (test_set, False))
    train_loader, val_loader, test_loader = [
        DataLoader(
            split,
            batch_size=batch_size,
            shuffle=reshuffle,
            collate_fn=collate_fn,
            pin_memory=True,
        )
        for split, reshuffle in split_plan
    ]

    return train_loader, val_loader, test_loader, src_vocab, tgt_vocab

In [13]:
# Locations of the Tamil ("ta") transliteration TSV splits: train, dev (used
# as validation) and test. These point at a Kaggle input directory; update
# them to wherever the files live on your machine.
train_path = '/kaggle/input/dakshina-tamil/ta.translit.sampled.train.tsv'
val_path = '/kaggle/input/dakshina-tamil/ta.translit.sampled.dev.tsv'
test_path = '/kaggle/input/dakshina-tamil/ta.translit.sampled.test.tsv'

In [14]:
# Create the train/validation/test data loaders and both vocabularies in one
# call; batch_size is left at prepare_data_loaders' default.
train_loader, val_loader, test_loader, src_vocab, tgt_vocab = prepare_data_loaders(train_path, val_path, test_path)

In [15]:
# Print vocabulary sizes. Each count includes the four special tokens
# (<PAD>, <SOS>, <EOS>, <UNK>) plus the characters seen in the training split.
print(f"Source (English) vocabulary size: {src_vocab.size}")
print(f"Target (Tamil) vocabulary size: {tgt_vocab.size}")

Source (English) vocabulary size: 30
Target (Tamil) vocabulary size: 50
