In [1]:
%cd /content/drive/MyDrive/SanskritToEnglish

/content/drive/MyDrive/SanskritToEnglish


In [2]:
!pip install datasets tokenizers torch

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.

In [3]:
from datasets import load_dataset

dataset = load_dataset("rahular/itihasa")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.61M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/75162 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/6149 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11722 [00:00<?, ? examples/s]

In [None]:
import datasets
import tokenizers
import torch
import os

# Create a cache folder, used to speed up expensive operations between runs
from pathlib import Path
CACHE_FOLDER = "./cache"
Path(CACHE_FOLDER).mkdir(parents=True, exist_ok=True)

def train_itihasa_tokenizers(merged_data):

    # Using Byte-Pair Encoding for tokenization
    en_bpe = tokenizers.Tokenizer(tokenizers.models.BPE())
    sn_bpe = tokenizers.Tokenizer(tokenizers.models.BPE())

    en_bpe_cache_file, sn_bpe_cache_file = CACHE_FOLDER + "/en_bpe", CACHE_FOLDER + "/sn_bpe"
    if os.path.isfile(en_bpe_cache_file) and os.path.isfile(sn_bpe_cache_file):
        return (en_bpe.from_file(en_bpe_cache_file), sn_bpe.from_file(sn_bpe_cache_file))

    # Use whitespace as a word delimiter
    en_bpe.pre_tokenizer = tokenizers.pre_tokenizers.Whitespace()
    sn_bpe.pre_tokenizer = tokenizers.pre_tokenizers.Whitespace()

    english_corpus_iter =  map(lambda x: x['en'], merged_data['translation'])
    sanskrit_corpus_iter =  map(lambda x: x['sn'], merged_data['translation'])

    corpus_length = merged_data.num_rows

    trainer = tokenizers.trainers.BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    en_bpe.train_from_iterator(english_corpus_iter, length=corpus_length, trainer=trainer)
    sn_bpe.train_from_iterator(sanskrit_corpus_iter, length=corpus_length)

    en_bpe.save(CACHE_FOLDER + "/en_bpe")
    sn_bpe.save(CACHE_FOLDER + "/sn_bpe")

    return (en_bpe, sn_bpe)


def load_itihasa():
    dataset = datasets.load_dataset("rahular/itihasa")

    training_data = dataset['train']
    validation_data = dataset['validation']
    test_data = dataset['test']
    return (training_data, validation_data, test_data)


def preprocess_parallel_pair(
    raw_english_sentence, raw_sanskrit_sentence, tokenizers):
    # Tokenize sentence
    en_tokenizer, sn_tokenizer = tokenizers
    tokenized_en = en_tokenizer.encode(raw_english_sentence)
    tokenized_sn = sn_tokenizer.encode(raw_sanskrit_sentence)

    return (torch.LongTensor(tokenized_en.ids),
            torch.LongTensor(tokenized_sn.ids))


class ItihasaDataset(torch.utils.data.Dataset):

    def __init__(self, parallel_text, tokenizer):
        self._parallel_text = parallel_text
        self._tokenizer = tokenizer

    def __getitem__(self, idx):

        parallel_pair = self._parallel_text['translation'][idx]
        en_sentence, sn_sentence = parallel_pair['en'], parallel_pair['sn']

        preprocessed_english, preprocessed_sanskrit = preprocess_parallel_pair(
            en_sentence, sn_sentence, self._tokenizer)

        source, target = preprocessed_sanskrit, preprocessed_english
        return (source, target)


    def __len__(self):
        return self._parallel_text.num_rows

if __name__ == "__main__":

    # Download the Itihasa dataset
    training_data, validation_data, test_data = load_itihasa()

    # Combine all data splits for tokenizer training
    merged_data = datasets.concatenate_datasets(
        (training_data, validation_data, test_data))

    # Train the tokenizers on the existing corpora
    tokenizers = train_itihasa_tokenizers(merged_data)

    # Create Torch Datasets for each split of the Itihasa dataset
    itihasa_dataset_train = ItihasaDataset(training_data, tokenizers)
    itihasa_dataset_val = ItihasaDataset(validation_data, tokenizers)
    itihasa_dataset_test = ItihasaDataset(test_data, tokenizers)

    # Create a torch Dataset from the full Itihasa dataset
    itihasa_dataset_full = ItihasaDataset(merged_data, tokenizers)


In [None]:
import itihasa
import torch
import datasets
import torch.nn as nn
import torch.optim as optim
from torch.cuda.amp import GradScaler, autocast

class TransformerBaseline(nn.Module):
    def __init__(self, d_model, src_vocab_size, tgt_vocab_size, num_encoder_layers=6, num_decoder_layers=6, nhead=8, dim_feedforward=2048, dropout=0.1):
        super(TransformerBaseline, self).__init__()
        self.src_tok_emb = nn.Embedding(src_vocab_size, d_model)
        self.tgt_tok_emb = nn.Embedding(tgt_vocab_size, d_model)
        self.transformer = nn.Transformer(d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, dropout, batch_first=True)
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, src, tgt, src_mask, tgt_mask):
        src_emb = self.src_tok_emb(src)
        tgt_emb = self.tgt_tok_emb(tgt)
        transformer_output = self.transformer(src_emb, tgt_emb, src_mask=src_mask, tgt_mask=tgt_mask)
        out = self.fc_out(transformer_output)
        return self.softmax(out)

def generate_square_subsequent_mask(size):
    mask = torch.tril(torch.ones(size, size) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask

def train_model(model, train_dataloader, val_dataloader, vocab_sizes, optimizer, criterion, scheduler, num_epochs, device):
    model = model.to(device)
    scaler = GradScaler()  # For mixed precision training

    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0
        for batch_idx, (sn_train, en_train) in enumerate(train_dataloader):
            sn_train, en_train = sn_train.to(device), en_train.to(device)
            optimizer.zero_grad()
            tgt_input = en_train[:, :-1]
            tgt_output = en_train[:, 1:]
            src_mask = generate_square_subsequent_mask(sn_train.size(1)).to(device)
            tgt_mask = generate_square_subsequent_mask(tgt_input.size(1)).to(device)

            with autocast():
                output = model(sn_train, tgt_input, src_mask, tgt_mask)
                loss = criterion(output.view(-1, output.shape[-1]), tgt_output.reshape(-1))

            scaler.scale(loss).backward()
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
            scaler.step(optimizer)
            scaler.update()

            epoch_loss += loss.item()

            if batch_idx % 10 == 0:
                print(f"Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx+1}/{len(train_dataloader)}], Loss: {loss.item():.4f}")

        scheduler.step()
        print(f"Epoch [{epoch+1}/{num_epochs}] completed. Average Loss: {epoch_loss / len(train_dataloader):.4f}")

        # Validation step
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for sn_val, en_val in val_dataloader:
                sn_val, en_val = sn_val.to(device), en_val.to(device)
                tgt_input = en_val[:, :-1]
                tgt_output = en_val[:, 1:]
                src_mask = generate_square_subsequent_mask(sn_val.size(1)).to(device)
                tgt_mask = generate_square_subsequent_mask(tgt_input.size(1)).to(device)

                with autocast():
                    output = model(sn_val, tgt_input, src_mask, tgt_mask)
                    loss = criterion(output.view(-1, output.shape[-1]), tgt_output.reshape(-1))

                val_loss += loss.item()

        print(f"Validation Loss after Epoch [{epoch+1}/{num_epochs}]: {val_loss / len(val_dataloader):.4f}")

        # Save checkpoint
        torch.save(model.state_dict(), f'model_epoch_{epoch+1}.pth')

def build_itihasa_datasets_for_training(batch_size, num_workers):
    # Download the Itihasa dataset
    training_data, validation_data, test_data = itihasa.load_itihasa()

    # Combine all data splits for tokenizer training
    merged_data = datasets.concatenate_datasets((training_data, validation_data, test_data))

    # Train the tokenizers on the existing corpora
    tokenizers = itihasa.train_itihasa_tokenizers(merged_data)

    # Create Torch Datasets for each split of the Itihasa dataset
    itihasa_dataset_train = itihasa.ItihasaDataset(training_data, tokenizers)
    itihasa_dataset_val = itihasa.ItihasaDataset(validation_data, tokenizers)
    itihasa_dataset_test = itihasa.ItihasaDataset(test_data, tokenizers)

    # Pads all sentences in a batch. Note that source and target batches are padded separately.
    def pad_parallel_pair(batch_of_parallel_pairs):
        batch_of_source_sentences = [src for src, tgt in batch_of_parallel_pairs]
        batch_of_target_sentences = [tgt for src, tgt in batch_of_parallel_pairs]

        return (torch.nn.utils.rnn.pad_sequence(batch_of_source_sentences, batch_first=True),
                torch.nn.utils.rnn.pad_sequence(batch_of_target_sentences, batch_first=True))

    # Create dataloaders for batching
    train_dataloader = torch.utils.data.DataLoader(
        itihasa_dataset_train, batch_size=batch_size, shuffle=True, collate_fn=pad_parallel_pair, num_workers=num_workers, pin_memory=True)
    val_dataloader = torch.utils.data.DataLoader(
        itihasa_dataset_val, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True)
    test_dataloader = torch.utils.data.DataLoader(
        itihasa_dataset_test, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True)

    dataloaders = {'train': train_dataloader, 'val': val_dataloader, 'test': test_dataloader}

    # Get tokenizers + vocab size from dataset
    sn_tokenizer, en_tokenizer = tokenizers
    en_vocab_size = en_tokenizer.get_vocab_size()
    sn_vocab_size = sn_tokenizer.get_vocab_size()

    vocab_sizes = {'en': en_vocab_size, 'sn': sn_vocab_size}

    return dataloaders, vocab_sizes

if __name__ == "__main__":
    # Check if GPU is available and set device accordingly
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    dataloaders, vocab_sizes = build_itihasa_datasets_for_training(batch_size=16, num_workers=4)  # Increased batch size and workers

    # Instantiate model
    baseline_model = TransformerBaseline(d_model=512, src_vocab_size=vocab_sizes['sn'], tgt_vocab_size=vocab_sizes['en'], num_encoder_layers=6, num_decoder_layers=6, nhead=8, dim_feedforward=2048, dropout=0.1)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(baseline_model.parameters(), lr=0.0001)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

    # Perform training
    train_model(baseline_model, dataloaders['train'], dataloaders['val'], vocab_sizes, optimizer, criterion, scheduler, num_epochs=20, device=device)


In [16]:
import itihasa
import torch
import datasets
from torch.utils.data import DataLoader
from nltk.translate.bleu_score import sentence_bleu
import numpy as np

def build_itihasa_datasets_for_testing(batch_size, num_workers):
    # Load the Itihasa dataset
    training_data, validation_data, test_data = itihasa.load_itihasa()

    # Train the tokenizers on the existing corpora
    tokenizers = itihasa.train_itihasa_tokenizers(training_data)

    # Create Torch Datasets for each split of the Itihasa dataset
    itihasa_dataset_test = itihasa.ItihasaDataset(test_data, tokenizers)

    # Pads all sentences in a batch. Note that source and target batches are padded separately.
    def pad_parallel_pair(batch_of_parallel_pairs):
        batch_of_source_sentences = [src for src, tgt in batch_of_parallel_pairs]
        batch_of_target_sentences = [tgt for src, tgt in batch_of_parallel_pairs]

        return (torch.nn.utils.rnn.pad_sequence(batch_of_source_sentences, batch_first=True),
                torch.nn.utils.rnn.pad_sequence(batch_of_target_sentences, batch_first=True))

    # Create dataloaders for batching
    test_dataloader = DataLoader(
        itihasa_dataset_test, batch_size=batch_size, shuffle=False, collate_fn=pad_parallel_pair, num_workers=num_workers, pin_memory=True)

    return test_dataloader, tokenizers

def evaluate_model(model, dataloader, device):
    model.eval()
    total_bleu_score = 0
    num_sentences = 0

    with torch.no_grad():
        for sn_test, en_test in dataloader:
            sn_test, en_test = sn_test.to(device), en_test.to(device)
            tgt_input = en_test[:, :-1]
            tgt_output = en_test[:, 1:]
            src_mask = generate_square_subsequent_mask(sn_test.size(1)).to(device)
            tgt_mask = generate_square_subsequent_mask(tgt_input.size(1)).to(device)

            output = model(sn_test, tgt_input, src_mask, tgt_mask)
            output = torch.argmax(output, dim=-1)

            for i in range(en_test.size(0)):
                reference = en_test[i].cpu().numpy()
                hypothesis = output[i].cpu().numpy()
                total_bleu_score += sentence_bleu([reference], hypothesis)
                num_sentences += 1

    avg_bleu_score = total_bleu_score / num_sentences
    return avg_bleu_score

if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    test_dataloader, tokenizers = build_itihasa_datasets_for_testing(batch_size=16, num_workers=4)

    baseline_model = TransformerBaseline(d_model=512, src_vocab_size=tokenizers[1].get_vocab_size(), tgt_vocab_size=tokenizers[0].get_vocab_size(), num_encoder_layers=6, num_decoder_layers=6, nhead=8, dim_feedforward=2048, dropout=0.1)
    baseline_model.load_state_dict(torch.load('model_epoch_20.pth'))
    baseline_model.to(device)

    avg_bleu_score = evaluate_model(baseline_model, test_dataloader, device)
    print(f"Average BLEU score on the test set: {avg_bleu_score:.2f}")


Average BLEU score on the test set: 23.19


In [14]:
import datasets
import torch
from torch.utils.data import DataLoader

def load_itihasa_test_split():
    # Load the Itihasa dataset
    dataset = datasets.load_dataset("rahular/itihasa")
    test_data = dataset['test']
    return test_data

def tokenize_sentence(tokenizer, sentence):
    return torch.LongTensor(tokenizer.encode(sentence).ids)

def detokenize_sentence(tokenizer, tokens):
    return tokenizer.decode(tokens.tolist(), skip_special_tokens=True)

def generate_translation(model, tokenizer_sn, tokenizer_en, sentence, device):
    model.eval()
    with torch.no_grad():
        src = tokenize_sentence(tokenizer_sn, sentence).unsqueeze(0).to(device)
        tgt = torch.tensor([[tokenizer_en.token_to_id("[CLS]")]]).to(device)

        for i in range(100):  # Limit the length of the generated sequence
            src_mask = generate_square_subsequent_mask(src.size(1)).to(device)
            tgt_mask = generate_square_subsequent_mask(tgt.size(1)).to(device)
            output = model(src, tgt, src_mask, tgt_mask)
            next_token = output.argmax(dim=-1)[:, -1].unsqueeze(-1)
            tgt = torch.cat([tgt, next_token], dim=1)
            if next_token.item() == tokenizer_en.token_to_id("[SEP]"):
                break

        translated_sentence = detokenize_sentence(tokenizer_en, tgt.squeeze().cpu())
        return translated_sentence

def generate_translations(test_data, model, tokenizers, device):
    translations = []

    for example in test_data:
        translation = example['translation']
        sanskrit_shloka = translation['sn']
        actual_translation = translation['en']
        predicted_translation = generate_translation(model, tokenizers[1], tokenizers[0], sanskrit_shloka, device)
        translations.append({
            'sanskrit_shloka': sanskrit_shloka,
            'actual_translation': actual_translation,
            'predicted_translation': predicted_translation
        })

    return translations

if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Load the test data
    test_data = load_itihasa_test_split()

    # Load tokenizers
    tokenizers = (tokenizers.Tokenizer.from_file("./cache/en_bpe"), tokenizers.Tokenizer.from_file("./cache/sn_bpe"))

    # Load the trained model
    model = TransformerBaseline(d_model=512, src_vocab_size=tokenizers[1].get_vocab_size(), tgt_vocab_size=tokenizers[0].get_vocab_size(), num_encoder_layers=6, num_decoder_layers=6, nhead=8, dim_feedforward=2048, dropout=0.1)
    model.load_state_dict(torch.load('model_epoch_20.pth'))
    model.to(device)

    # Generate translations
    translations = generate_translations(test_data, model, tokenizers, device)

    # Print a few examples
    for i in range(5):  # Print the first 5 examples
        print(f"Sanskrit Shloka: {translations[i]['sanskrit_shloka']}")
        print(f"Actual Translation: {translations[i]['actual_translation']}")
        print(f"Predicted Translation: {translations[i]['predicted_translation']}")
        print()


Sanskrit Shloka: विश्वामित्रवचः श्रुत्वा राघवः सहलक्ष्मणः। विस्मयं परमं गत्वा विश्वामित्रमथाब्रवीत्॥
Actual Translation: Hearing the words of Viśvāmitra, Rāghava, together with Laksmana, was struck with amazement, and spoke to Viśvāmitra, saying,
Predicted Translation: the words of Viśvāmitra, Rāghava, together with Laksmana, was extra struck with amazement, and spoke to saying,

Sanskrit Shloka: अत्यद्भुतमिदं ब्रह्मन् कथितं परमं त्वया। गङ्गावतरणं पुण्यं सागरस्यापि पूरणम्॥
Actual Translation: O Brāhmaṇa, wonderful is the story that you have recited to us, viz; that of Ganga's sacred dissension and the replenishing of the Ocean.
Predicted Translation: O is wonderful the story that you have recited to us, viz; that of Ganga's sacred dissension and the replenishing of the Ocean.

Sanskrit Shloka: क्षणभूतेव नौ रात्रिः संवृत्तेयं परंतप। इमां चिन्तयतोः सर्वा निखिलेन कथां तव॥
Actual Translation: And, O afflicter of foes, as we had been reflecting upon all this at length, the night has passed 