<a href="https://colab.research.google.com/github/gowthambangaru/MNIST-/blob/main/Untitled49.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import random

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Hyperparameters
EMBED_DIM = 64               # width of each character embedding
HIDDEN_DIM = 128             # hidden-state width of the recurrent layers
NUM_LAYERS = 1               # number of stacked recurrent layers
RNN_TYPE = 'LSTM'            # name of the torch.nn recurrent class to instantiate
BATCH_SIZE = 32              # examples per mini-batch
EPOCHS = 10                  # full passes over the training data
TEACHER_FORCING_RATIO = 0.5  # intended probability of feeding the gold token to the decoder

class TransliterationDataset(Dataset):
    """Character-level dataset of (source, target) word pairs read from a TSV file.

    Each item is a pair of 1-D ``torch.long`` tensors: the source word encoded
    character by character, and the target word encoded the same way but
    wrapped in ``<SOS>`` ... ``<EOS>``.

    By default the vocabularies are built from the file itself.  Two datasets
    built from different files therefore get *independent* id mappings; pass
    the training vocabularies via ``src_vocab`` / ``tgt_vocab`` when building
    a validation or test set so that its ids agree with the trained model.

    Args:
        filepath: Path of a tab-separated file; columns are named
            ``src`` and ``tgt`` (no header row is expected).
        src_vocab: Optional existing ``char -> id`` mapping for the source
            side.  Must contain every source character in the file.
        tgt_vocab: Optional existing ``char -> id`` mapping for the target
            side.  Must contain ``<PAD>``, ``<SOS>``, ``<EOS>`` and every
            target character in the file.

    Raises:
        KeyError: If a supplied vocabulary lacks a character (or special
            token) that occurs in the data.
    """

    SPECIAL_TOKENS = ("<PAD>", "<SOS>", "<EOS>")

    def __init__(self, filepath, *, src_vocab=None, tgt_vocab=None):
        # Read every field as a literal string.  With pandas' default NA
        # handling, words such as "null", "NA" or "nan" are parsed as missing
        # values and would later be stringified to "nan", corrupting the pair.
        data = pd.read_csv(
            filepath,
            sep="\t",
            names=["src", "tgt"],
            dtype=str,
            keep_default_na=False,
            na_filter=False,
        )
        self.src_texts = data["src"].astype(str)
        self.tgt_texts = data["tgt"].astype(str)

        # Build vocabularies for source and target unless the caller shares them
        if src_vocab is None:
            src_vocab = self.build_vocab(self.src_texts, special=self.SPECIAL_TOKENS)
        if tgt_vocab is None:
            tgt_vocab = self.build_vocab(self.tgt_texts, special=self.SPECIAL_TOKENS)
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab

        # Store important token IDs (taken from the target vocabulary)
        self.pad_idx = self.tgt_vocab["<PAD>"]
        self.sos_idx = self.tgt_vocab["<SOS>"]
        self.eos_idx = self.tgt_vocab["<EOS>"]

        # Preprocess all data once so __getitem__ is a plain list lookup
        self.data = [
            (self.encode(src, self.src_vocab),
             self.encode(tgt, self.tgt_vocab, add_special=True))
            for src, tgt in zip(self.src_texts, self.tgt_texts)
        ]

    def build_vocab(self, texts, special=()):
        """Return a ``token -> id`` dict for all characters in *texts*.

        Special tokens take ids ``0 .. len(special) - 1`` in the given order;
        the distinct characters follow in sorted order.
        """
        special = tuple(special)
        offset = len(special)
        chars = sorted(set("".join(texts)))
        vocab = {ch: i + offset for i, ch in enumerate(chars)}
        # Specials are written last so they win over an identical character key.
        vocab.update((token, i) for i, token in enumerate(special))
        return vocab

    def encode(self, text, vocab, add_special=False):
        """Encode *text* as a 1-D long tensor of ids looked up in *vocab*.

        With ``add_special=True`` the ids are wrapped in ``<SOS>`` / ``<EOS>``.
        Raises ``KeyError`` for a character that is not in *vocab*.
        """
        ids = [vocab[char] for char in text]
        if add_special:
            ids = [vocab["<SOS>"]] + ids + [vocab["<EOS>"]]
        return torch.tensor(ids, dtype=torch.long)

    def decode(self, ids, vocab):
        """Turn a sequence of ids back into a string.

        ``<PAD>`` and ``<SOS>`` are skipped, ids missing from *vocab* are
        ignored, and decoding stops at the first ``<EOS>`` so that anything
        a model emits after end-of-sequence is not part of the result.
        """
        inv_vocab = {idx: token for token, idx in vocab.items()}
        chars = []
        for i in ids:
            token = inv_vocab.get(i, "")
            if token == "<EOS>":
                break
            if token not in ("<PAD>", "<SOS>"):
                chars.append(token)
        return "".join(chars)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

def pad_collate(batch):
    """Collate a list of (src, tgt) tensor pairs into two padded batch tensors.

    Every sequence is right-padded with id 0 up to the longest sequence on its
    side of the batch; both results are laid out batch-first.

    Returns:
        A ``(src_padded, tgt_padded)`` tuple.
    """
    sources, targets = map(list, zip(*batch))
    pad = nn.utils.rnn.pad_sequence
    return (
        pad(sources, batch_first=True, padding_value=0),
        pad(targets, batch_first=True, padding_value=0),
    )

class Encoder(nn.Module):
    """Embed a batch of token ids and run them through a recurrent layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding width, also the RNN input size.
        hidden_dim: RNN hidden-state size.
        num_layers: Number of stacked recurrent layers.
        rnn_type: Name of the recurrent class to look up on ``torch.nn``.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers, rnn_type):
        super().__init__()
        rnn_cls = getattr(nn, rnn_type)
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn = rnn_cls(embed_dim, hidden_dim, num_layers, batch_first=True)

    def forward(self, x):
        """Return only the recurrent layer's final state; per-step outputs are dropped."""
        embedded = self.embedding(x)
        final_state = self.rnn(embedded)[1]
        return final_state


class Decoder(nn.Module):
    """Single-step recurrent decoder: one token in, one row of vocabulary scores out.

    Args:
        vocab_size: Size of the embedding table and of the output layer.
        embed_dim: Embedding width, also the RNN input size.
        hidden_dim: RNN hidden-state size.
        num_layers: Number of stacked recurrent layers.
        rnn_type: Name of the recurrent class to look up on ``torch.nn``.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers, rnn_type):
        super().__init__()
        rnn_cls = getattr(nn, rnn_type)
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn = rnn_cls(embed_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden):
        """Advance the decoder by one step.

        Args:
            x: Token ids for the current step, one per batch element.
            hidden: Recurrent state to continue from.

        Returns:
            ``(prediction, hidden)``: the output-layer scores for this step
            and the updated recurrent state.
        """
        # Insert a length-1 time axis so the batch-first RNN sees one step.
        step_tokens = x.unsqueeze(1)
        embedded = self.embedding(step_tokens)
        rnn_out, next_hidden = self.rnn(embedded, hidden)
        # Drop the time axis again before projecting onto the vocabulary.
        logits = self.fc(rnn_out.squeeze(1))
        return logits, next_hidden

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes one token per step.

    Args:
        encoder: Module mapping a source batch to the initial decoder state.
        decoder: Module with signature ``decoder(tokens, hidden) ->
            (scores, hidden)`` and an ``fc`` output layer whose
            ``out_features`` is the target vocabulary size.
        sos_idx: Id of the start-of-sequence token fed at the first step.
    """

    def __init__(self, encoder, decoder, sos_idx):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.sos_idx = sos_idx

    def forward(self, src, tgt=None, teacher_forcing_ratio=0.5, max_len=20):
        """Decode a batch and return the per-step vocabulary scores.

        Args:
            src: Batch-first tensor of source token ids.
            tgt: Optional batch-first tensor of decoder *input* ids, i.e.
                ``tgt[:, t]`` is the ground-truth input of step ``t`` and
                column 0 is ``<SOS>``.  Its width fixes the number of
                decoding steps.
            teacher_forcing_ratio: Per-step probability of feeding the
                ground-truth token instead of the model's own prediction.
                Only used when *tgt* is given.
            max_len: Number of decoding steps when *tgt* is ``None``.

        Returns:
            Tensor of shape ``(batch, steps, target_vocab_size)``.
        """
        batch_size = src.size(0)
        steps = tgt.size(1) if tgt is not None else max_len
        tgt_vocab_size = self.decoder.fc.out_features

        # Allocate on the input's device rather than a module-level global so
        # the model works wherever its inputs live.
        outputs = torch.zeros(batch_size, steps, tgt_vocab_size, device=src.device)
        hidden = self.encoder(src)
        decoder_input = torch.full(
            (batch_size,), self.sos_idx, dtype=torch.long, device=src.device
        )

        for t in range(steps):
            output, hidden = self.decoder(decoder_input, hidden)
            outputs[:, t, :] = output
            if t + 1 == steps:
                break
            teacher_force = tgt is not None and random.random() < teacher_forcing_ratio
            # The next step's gold input is tgt[:, t + 1]; tgt[:, t] was the
            # input of the step just taken, so feeding it again would repeat
            # <SOS> and lag every later step by one position.
            decoder_input = tgt[:, t + 1] if teacher_force else output.argmax(1)

        return outputs

# Training
def train(model, dataloader, optimizer, loss_fn, pad_idx, teacher_forcing_ratio=None):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: Module called as ``model(src, decoder_inputs,
            teacher_forcing_ratio=...)`` and returning per-step scores.
        dataloader: Iterable of ``(src, tgt)`` batch-first id tensors where
            each target row starts with ``<SOS>``.
        optimizer: Optimizer over the model's parameters.
        loss_fn: Callable ``loss_fn(scores, target_ids)`` returning a scalar.
        pad_idx: Padding id.  Unused here (padding is expected to be ignored
            by *loss_fn*); kept for interface compatibility.
        teacher_forcing_ratio: Probability of teacher forcing per step.
            ``None`` selects the module-level ``TEACHER_FORCING_RATIO``.

    Returns:
        Mean loss over the batches as a float, or ``0.0`` if *dataloader*
        yields no batches.
    """
    if teacher_forcing_ratio is None:
        # Honour the configured hyperparameter instead of the model's default.
        teacher_forcing_ratio = TEACHER_FORCING_RATIO

    model.train()
    # Move batches to wherever the model's weights actually live.
    model_device = next(model.parameters()).device
    total_loss = 0.0
    num_batches = 0

    for src, tgt in dataloader:
        src, tgt = src.to(model_device), tgt.to(model_device)

        optimizer.zero_grad()
        # Decoder inputs drop the last column; targets drop the leading <SOS>,
        # so output step t is scored against the token that follows input t.
        output = model(src, tgt[:, :-1], teacher_forcing_ratio=teacher_forcing_ratio)
        output = output.reshape(-1, output.shape[-1])
        target = tgt[:, 1:].reshape(-1)

        loss = loss_fn(output, target)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    # Guard against an empty loader instead of raising ZeroDivisionError.
    return total_loss / num_batches if num_batches else 0.0

# Evaluation
def evaluate(model, dataloader, dataset):
    """Compute exact-match accuracy of greedy decoding and collect a few examples.

    Args:
        model: Module called as ``model(src, teacher_forcing_ratio=0)`` and
            returning per-step vocabulary scores.
        dataloader: Iterable of ``(src, tgt)`` batch-first id tensors.
        dataset: Object providing ``decode(ids, vocab)``, ``src_vocab``,
            ``tgt_vocab`` and ``eos_idx``.

    Returns:
        ``(accuracy, examples)`` where *accuracy* is the fraction of rows
        whose decoded prediction equals the decoded target (``0.0`` when the
        loader is empty) and *examples* holds up to five
        ``(source, prediction, target)`` string triples.
    """
    model.eval()
    # Follow the model's own device; fall back to CPU for parameter-free models.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")
    eos_idx = dataset.eos_idx

    correct = 0
    total = 0
    examples = []

    with torch.no_grad():
        for src, tgt in dataloader:
            src, tgt = src.to(model_device), tgt.to(model_device)
            output = model(src, teacher_forcing_ratio=0)

            # Convert each tensor to Python lists once per batch rather than per row.
            pred_rows = output.argmax(-1).tolist()
            src_rows = src.tolist()
            tgt_rows = tgt.tolist()

            for src_ids, pred_ids, tgt_ids in zip(src_rows, pred_rows, tgt_rows):
                # The model decodes a fixed number of steps, so whatever it
                # emits after <EOS> is noise and must not count against it.
                if eos_idx in pred_ids:
                    pred_ids = pred_ids[:pred_ids.index(eos_idx)]

                pred_str = dataset.decode(pred_ids, dataset.tgt_vocab)
                tgt_str = dataset.decode(tgt_ids, dataset.tgt_vocab)
                if pred_str == tgt_str:
                    correct += 1
                total += 1
                if len(examples) < 5:
                    src_str = dataset.decode(src_ids, dataset.src_vocab)
                    examples.append((src_str, pred_str, tgt_str))

    # Guard against an empty loader instead of raising ZeroDivisionError.
    accuracy = correct / total if total else 0.0
    return accuracy, examples

def main():

    train_file = "/content/hi.translit.sampled.train.tsv"
    dev_file = "/content/hi.translit.sampled.dev.tsv"

    train_dataset = TransliterationDataset(train_file)
    dev_dataset = TransliterationDataset(dev_file)

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=pad_collate)
    dev_loader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=pad_collate)

    encoder = Encoder(len(train_dataset.src_vocab), EMBED_DIM, HIDDEN_DIM, NUM_LAYERS, RNN_TYPE)
    decoder = Decoder(len(train_dataset.tgt_vocab), EMBED_DIM, HIDDEN_DIM, NUM_LAYERS, RNN_TYPE)
    model = Seq2Seq(encoder, decoder, train_dataset.sos_idx).to(device)

    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss(ignore_index=train_dataset.pad_idx)

    for epoch in range(EPOCHS):
        loss = train(model, train_loader, optimizer, criterion, train_dataset.pad_idx)
        acc, examples = evaluate(model, dev_loader, dev_dataset)

        print(f"\nEpoch {epoch+1}/{EPOCHS}")
        print(f"Training Loss: {loss:.4f}")
        print(f"Validation Accuracy: {acc:.4f}")
        for src, pred, tgt in
