In [None]:
from datasets import load_dataset

ds = load_dataset("Hemanth-thunder/en_ta")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/70.0 [00:00<?, ?B/s]

corpus.bcn.train 166k.csv:   0%|          | 0.00/99.2M [00:00<?, ?B/s]

en-ta train 118k.csv:   0%|          | 0.00/75.8M [00:00<?, ?B/s]

corpus.bcn.dev 1k.csv:   0%|          | 0.00/589k [00:00<?, ?B/s]

corpus.bcn.test 2k.csv:   0%|          | 0.00/1.19M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/285630 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
import numpy as np
import torch
import math
from torch import nn
import torch.nn.functional as F

def get_device():
    return torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

def scaled_dot_product(q, k, v, mask=None):
    d_k = q.size()[-1]
    scaled = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(d_k)
    if mask is not None:
        scaled = scaled.permute(1, 0, 2, 3) + mask
        scaled = scaled.permute(1, 0, 2, 3)
    attention = F.softmax(scaled, dim=-1)
    values = torch.matmul(attention, v)
    return values, attention

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_sequence_length):
        super().__init__()
        self.max_sequence_length = max_sequence_length
        self.d_model = d_model

    def forward(self):
        even_i = torch.arange(0, self.d_model, 2).float()
        denominator = torch.pow(10000, even_i/self.d_model)
        position = (torch.arange(self.max_sequence_length)
                          .reshape(self.max_sequence_length, 1))
        even_PE = torch.sin(position / denominator)
        odd_PE = torch.cos(position / denominator)
        stacked = torch.stack([even_PE, odd_PE], dim=2)
        PE = torch.flatten(stacked, start_dim=1, end_dim=2)
        return PE

class SentenceEmbedding(nn.Module):
    "For a given sentence, create an embedding"
    def __init__(self, max_sequence_length, d_model, language_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN):
        super().__init__()
        self.vocab_size = len(language_to_index)
        self.max_sequence_length = max_sequence_length
        self.embedding = nn.Embedding(self.vocab_size, d_model)
        self.language_to_index = language_to_index
        self.position_encoder = PositionalEncoding(d_model, max_sequence_length)
        self.dropout = nn.Dropout(p=0.1)
        self.START_TOKEN = START_TOKEN
        self.END_TOKEN = END_TOKEN
        self.PADDING_TOKEN = PADDING_TOKEN

    def batch_tokenize(self, batch, start_token, end_token):

        def tokenize(sentence, start_token, end_token):
            sentence_word_indicies = [self.language_to_index[token] for token in list(sentence)]
            if start_token:
                sentence_word_indicies.insert(0, self.language_to_index[self.START_TOKEN])
            if end_token:
                sentence_word_indicies.append(self.language_to_index[self.END_TOKEN])
            for _ in range(len(sentence_word_indicies), self.max_sequence_length):
                sentence_word_indicies.append(self.language_to_index[self.PADDING_TOKEN])
            return torch.tensor(sentence_word_indicies)

        tokenized = []
        for sentence_num in range(len(batch)):
           tokenized.append( tokenize(batch[sentence_num], start_token, end_token) )
        tokenized = torch.stack(tokenized)
        return tokenized.to(get_device())

    def forward(self, x, start_token, end_token): # sentence
        x = self.batch_tokenize(x, start_token, end_token)
        x = self.embedding(x)
        pos = self.position_encoder().to(get_device())
        x = self.dropout(x + pos)
        return x


class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.qkv_layer = nn.Linear(d_model , 3 * d_model)
        self.linear_layer = nn.Linear(d_model, d_model)

    def forward(self, x, mask):
        batch_size, sequence_length, d_model = x.size()
        qkv = self.qkv_layer(x)
        qkv = qkv.reshape(batch_size, sequence_length, self.num_heads, 3 * self.head_dim)
        qkv = qkv.permute(0, 2, 1, 3)
        q, k, v = qkv.chunk(3, dim=-1)
        values, attention = scaled_dot_product(q, k, v, mask)
        values = values.permute(0, 2, 1, 3).reshape(batch_size, sequence_length, self.num_heads * self.head_dim)
        out = self.linear_layer(values)
        return out


class LayerNormalization(nn.Module):
    def __init__(self, parameters_shape, eps=1e-5):
        super().__init__()
        self.parameters_shape=parameters_shape
        self.eps=eps
        self.gamma = nn.Parameter(torch.ones(parameters_shape))
        self.beta =  nn.Parameter(torch.zeros(parameters_shape))

    def forward(self, inputs):
        dims = [-(i + 1) for i in range(len(self.parameters_shape))]
        mean = inputs.mean(dim=dims, keepdim=True)
        var = ((inputs - mean) ** 2).mean(dim=dims, keepdim=True)
        std = (var + self.eps).sqrt()
        y = (inputs - mean) / std
        out = self.gamma * y + self.beta
        return out


class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, hidden, drop_prob=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, hidden)
        self.linear2 = nn.Linear(hidden, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.linear2(x)
        return x


class EncoderLayer(nn.Module):
    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob):
        super(EncoderLayer, self).__init__()
        self.attention = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.norm1 = LayerNormalization(parameters_shape=[d_model])
        self.dropout1 = nn.Dropout(p=drop_prob)
        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm2 = LayerNormalization(parameters_shape=[d_model])
        self.dropout2 = nn.Dropout(p=drop_prob)

    def forward(self, x, self_attention_mask):
        residual_x = x.clone()
        x = self.attention(x, mask=self_attention_mask)
        x = self.dropout1(x)
        x = self.norm1(x + residual_x)
        residual_x = x.clone()
        x = self.ffn(x)
        x = self.dropout2(x)
        x = self.norm2(x + residual_x)
        return x

class SequentialEncoder(nn.Sequential):
    def forward(self, *inputs):
        x, self_attention_mask  = inputs
        for module in self._modules.values():
            x = module(x, self_attention_mask)
        return x

class Encoder(nn.Module):
    def __init__(self,
                 d_model,
                 ffn_hidden,
                 num_heads,
                 drop_prob,
                 num_layers,
                 max_sequence_length,
                 language_to_index,
                 START_TOKEN,
                 END_TOKEN,
                 PADDING_TOKEN):
        super().__init__()
        self.sentence_embedding = SentenceEmbedding(max_sequence_length, d_model, language_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN)
        self.layers = SequentialEncoder(*[EncoderLayer(d_model, ffn_hidden, num_heads, drop_prob)
                                      for _ in range(num_layers)])

    def forward(self, x, self_attention_mask, start_token, end_token):
        x = self.sentence_embedding(x, start_token, end_token)
        x = self.layers(x, self_attention_mask)
        return x


class MultiHeadCrossAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.kv_layer = nn.Linear(d_model , 2 * d_model)
        self.q_layer = nn.Linear(d_model , d_model)
        self.linear_layer = nn.Linear(d_model, d_model)

    def forward(self, x, y, mask):
        batch_size, sequence_length, d_model = x.size() # in practice, this is the same for both languages...so we can technically combine with normal attention
        kv = self.kv_layer(x)
        q = self.q_layer(y)
        kv = kv.reshape(batch_size, sequence_length, self.num_heads, 2 * self.head_dim)
        q = q.reshape(batch_size, sequence_length, self.num_heads, self.head_dim)
        kv = kv.permute(0, 2, 1, 3)
        q = q.permute(0, 2, 1, 3)
        k, v = kv.chunk(2, dim=-1)
        values, attention = scaled_dot_product(q, k, v, mask) # We don't need the mask for cross attention, removing in outer function!
        values = values.permute(0, 2, 1, 3).reshape(batch_size, sequence_length, d_model)
        out = self.linear_layer(values)
        return out


class DecoderLayer(nn.Module):
    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob):
        super(DecoderLayer, self).__init__()
        self.self_attention = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.layer_norm1 = LayerNormalization(parameters_shape=[d_model])
        self.dropout1 = nn.Dropout(p=drop_prob)

        self.encoder_decoder_attention = MultiHeadCrossAttention(d_model=d_model, num_heads=num_heads)
        self.layer_norm2 = LayerNormalization(parameters_shape=[d_model])
        self.dropout2 = nn.Dropout(p=drop_prob)

        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.layer_norm3 = LayerNormalization(parameters_shape=[d_model])
        self.dropout3 = nn.Dropout(p=drop_prob)

    def forward(self, x, y, self_attention_mask, cross_attention_mask):
        _y = y.clone()
        y = self.self_attention(y, mask=self_attention_mask)
        y = self.dropout1(y)
        y = self.layer_norm1(y + _y)

        _y = y.clone()
        y = self.encoder_decoder_attention(x, y, mask=cross_attention_mask)
        y = self.dropout2(y)
        y = self.layer_norm2(y + _y)

        _y = y.clone()
        y = self.ffn(y)
        y = self.dropout3(y)
        y = self.layer_norm3(y + _y)
        return y


class SequentialDecoder(nn.Sequential):
    def forward(self, *inputs):
        x, y, self_attention_mask, cross_attention_mask = inputs
        for module in self._modules.values():
            y = module(x, y, self_attention_mask, cross_attention_mask)
        return y

class Decoder(nn.Module):
    def __init__(self,
                 d_model,
                 ffn_hidden,
                 num_heads,
                 drop_prob,
                 num_layers,
                 max_sequence_length,
                 language_to_index,
                 START_TOKEN,
                 END_TOKEN,
                 PADDING_TOKEN):
        super().__init__()
        self.sentence_embedding = SentenceEmbedding(max_sequence_length, d_model, language_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN)
        self.layers = SequentialDecoder(*[DecoderLayer(d_model, ffn_hidden, num_heads, drop_prob) for _ in range(num_layers)])

    def forward(self, x, y, self_attention_mask, cross_attention_mask, start_token, end_token):
        y = self.sentence_embedding(y, start_token, end_token)
        y = self.layers(x, y, self_attention_mask, cross_attention_mask)
        return y


class Transformer(nn.Module):
    def __init__(self,
                d_model,
                ffn_hidden,
                num_heads,
                drop_prob,
                num_layers,
                max_sequence_length,
                kn_vocab_size,
                english_to_index,
                kannada_to_index,
                START_TOKEN,
                END_TOKEN,
                PADDING_TOKEN
                ):
        super().__init__()
        self.encoder = Encoder(d_model, ffn_hidden, num_heads, drop_prob, num_layers, max_sequence_length, english_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN)
        self.decoder = Decoder(d_model, ffn_hidden, num_heads, drop_prob, num_layers, max_sequence_length, kannada_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN)
        self.linear = nn.Linear(d_model, kn_vocab_size)
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    def forward(self,
                x,
                y,
                encoder_self_attention_mask=None,
                decoder_self_attention_mask=None,
                decoder_cross_attention_mask=None,
                enc_start_token=False,
                enc_end_token=False,
                dec_start_token=False, # We should make this true
                dec_end_token=False): # x, y are batch of sentences
        x = self.encoder(x, encoder_self_attention_mask, start_token=enc_start_token, end_token=enc_end_token)
        out = self.decoder(x, y, decoder_self_attention_mask, decoder_cross_attention_mask, start_token=dec_start_token, end_token=dec_end_token)
        out = self.linear(out)
        return out

In [None]:
eng=ds['train']['en']
tam=ds['train']['ta']

In [None]:
eng[0],tam[0]

("MMA vice president Qazi Hussain Ahmad declared last month: 'We are not extremists.\n",
 'MMA கட்சியின் துணைத்தலைவர் க்வாஸி ஹுசேன் அகமத் சென்ற மாதம் பின்வருமாறு அறிவித்தார்: ``நாங்கள் தீவிரவாதிகள் அல்ல.\n')

In [None]:
# Step 1: File path to the parallel dataset
file_path = "/content/parallel_dataset.txt"

# Step 2: Lists to hold Tamil and English sentences
tamil_sentences = []
english_sentences = []

# Step 3: Read the file and split Tamil and English sentences
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # Split the line by tab ('\t')
        parts = line.strip().split('\t')
        if len(parts) == 2:  # Ensure valid format
            tamil_sentences.append(parts[0])
            english_sentences.append(parts[1])

# Step 4: Print samples of the loaded data
print("Sample Tamil Sentences:")
for i, sentence in enumerate(tamil_sentences[:5]):  # Print first 5 Tamil sentences
    print(f"{i+1}: {sentence}")

print("\nSample English Sentences:")
for i, sentence in enumerate(english_sentences[:5]):  # Print first 5 English sentences
    print(f"{i+1}: {sentence}")

# Step 5: Verify dataset length
print(f"\nTotal Tamil Sentences: {len(tamil_sentences)}")
print(f"Total English Sentences: {len(english_sentences)}")


Sample Tamil Sentences:
1: ராஜாவாகிய ஆகாஸ் அரசாளும்போது தம்முடைய பாதகத்தினால் எறிந்துபோட்ட சகல பணிமுட்டுகளையும் முஸ்திப்பாக்கிப் பரிசுத்தம்பண்ணினோம்; இதோ , அவைகள் கர்த்தரின் ஆலயத்திற்கு முன்பாக இருக்கிறது என்றார்கள் .
2: சர்வதேச நாணய நிதியம் இலங்கைக்கு கடன் வழங்கினால் இதே போன்ற நிபந்தனைகள் திணிக்கப்படும் .
3: தற்போது அதற்கு எதிராக வாதாடுகிறார் சர்வதேச சட்டத்தை செயல்படுத்துவதற்குப் பதிலாக புதிய சட்டம் உருவாக்கப்பட்டு நிறுவப்பட வேண்டும் என்று எழுதுகிறார் .
4: அமெரிக்காவின் மூன்றாம் பெரிய கார் தயாரிப்பு நிறுவனமான கிறைஸ்லர் வியாழனன்று நியூ யோர்க்கில் திவாலடைந்ததற்காக மனு செய்தது; அத்தியாயம் 11 ன் படி மறு சீரமைத்து வெளிவரும் வரை அது தன்னுடைய உற்பத்தி நிலையங்களை மூடும் என்றும் அறிவித்துள்ளது .
5: மேலும் இனைவிட்டு தலிபானால் வெளியேற்றப்பட்ட 1995 இல் இருந்து ஈரானில் கூடுதலாக வாழ்ந்துவந்துள்ளார் .

Sample English Sentences:
1: moreover all the vessels , which king ahaz in his reign did cast away in his transgression , have we prepared and sanctified , and , behold , they are before the altar 

In [None]:
with open('/content/En-Ta English.txt', 'r', encoding='utf-8') as eng_file, \
     open('/content/En-Ta Tamil.txt', 'r', encoding='utf-8') as tam_file:
    english_lines_1 = eng_file.readlines()
    tamil_lines_1= tam_file.readlines()

In [None]:
english_lines_1 = english_lines_1[3:]
tamil_lines_1 = tamil_lines_1[3:]

In [None]:
eng=ds['train']['en']
tam=ds['train']['ta']

In [None]:
len(english_lines_1), len(tamil_lines_1)

(8946, 8946)

In [None]:
len(eng), len(tam)

(285630, 285630)

In [None]:
tamil_sentences.extend(tam)
english_sentences.extend(eng)

In [None]:
len(english_sentences), len(tamil_sentences)

(575081, 575081)

In [None]:
tamil_sentences.extend(tamil_lines_1)
english_sentences.extend(english_lines_1)

In [None]:
len(english_sentences), len(tamil_sentences)

(584027, 584027)

In [None]:
english_lines = english_sentences
tamil_lines = tamil_sentences

In [None]:
import re
import unicodedata
from collections import Counter

# Initialize special tokens
START_TOKEN = '<START>'
END_TOKEN = '<END>'
PADDING_TOKEN = '<PAD>'

# Clean the data (handle diacritics more efficiently using NFD normalization)
def clean_line(line):
    # Normalize using NFD to decompose characters and diacritics
    line = unicodedata.normalize("NFD", line)
    line = line.strip()

    # Remove unwanted characters, keeping only Tamil and English letters and necessary punctuation
    line = re.sub(r'\s+', ' ', line)  # Replace multiple spaces with a single space
    line = re.sub(r'[^a-zA-Z\u0B80-\u0BFF0-9.,;!?\'"()\s]', '', line)  # Remove non-alphanumeric characters
    line = re.sub(r'\s*([.,;!?\'"()])\s*', r'\1 ', line)  # Normalize spaces around punctuation

    # Normalize Tamil text further by removing unnecessary spaces or marks
    line = unicodedata.normalize("NFC", line)  # Reapply NFC normalization after cleaning
    return line.strip()

# Example of loading and cleaning your lines
english_lines = [clean_line(line.lower()) for line in english_lines]
tamil_lines = [clean_line(line) for line in tamil_lines]

# Ensure line count matches
assert len(english_lines) == len(tamil_lines), (
    f"Line mismatch: {len(english_lines)} English lines and {len(tamil_lines)} Tamil lines."
)

# Generate Vocabulary
def build_vocabulary(lines, special_tokens=None):
    special_tokens = special_tokens or []
    vocab = set(special_tokens)
    for line in lines:
        vocab.update(line)  # Add each character in the line
    return sorted(vocab, key=lambda x: (x not in special_tokens, x))

# Creating vocab for English and Tamil with special tokens
english_vocab = build_vocabulary(english_lines, [START_TOKEN, END_TOKEN, PADDING_TOKEN])
tamil_vocab = build_vocabulary(tamil_lines, [START_TOKEN, END_TOKEN, PADDING_TOKEN])

# Save vocabularies
with open('english_vocab.txt', 'w', encoding='utf-8') as eng_vocab_file, \
     open('tamil_vocab.txt', 'w', encoding='utf-8') as tam_vocab_file:
    eng_vocab_file.write('\n'.join(english_vocab))
    tam_vocab_file.write('\n'.join(tamil_vocab))

# Debugging Output for Vocabulary sizes
print("English Vocabulary Size:", len(english_vocab))
print("Tamil Vocabulary Size:", len(tamil_vocab))
print("Sample English Vocab:", english_vocab[:10])
print("Sample Tamil Vocab:", tamil_vocab[:10])

# Mapping indexes
index_to_tamil = {k: v for k, v in enumerate(tamil_vocab)}
tamil_to_index = {v: k for k, v in enumerate(tamil_vocab)}

index_to_english = {k: v for k, v in enumerate(english_vocab)}
english_to_index = {v: k for k, v in enumerate(english_vocab)}

print("Index to Tamil Mappings:", index_to_tamil)
print("Tamil to Index Mappings:", tamil_to_index)
print("Index to English Mappings:", index_to_english)
print("English to Index Mappings:", english_to_index)

# Check sentence lengths for filtering purposes
import numpy as np

PERCENTILE = 97
print(f"{PERCENTILE}th percentile length Tamil: {np.percentile([len(x) for x in tamil_lines], PERCENTILE)}")
print(f"{PERCENTILE}th percentile length English: {np.percentile([len(x) for x in english_lines], PERCENTILE)}")

# Sentence validation functions
max_sequence_length = 200

def is_valid_tokens(sentence, vocab):
    for token in list(set(sentence)):
        if token not in vocab:
            return False
    return True

def is_valid_length(sentence, max_sequence_length):
    return len(list(sentence)) < (max_sequence_length - 1)  # leaving 1 space for the end token

valid_sentence_indicies = []
for index in range(len(tamil_lines)):
    tamil_sentence, english_sentence = tamil_lines[index], english_lines[index]
    if is_valid_length(tamil_sentence, max_sequence_length) \
      and is_valid_length(english_sentence, max_sequence_length) \
      and is_valid_tokens(tamil_sentence, tamil_vocab):
        valid_sentence_indicies.append(index)

# Output number of valid sentences
print(f"Number of sentences: {len(tamil_lines)}")
print(f"Number of valid sentences: {len(valid_sentence_indicies)}")

# Filter out invalid sentences
tamil_lines = [tamil_lines[i] for i in valid_sentence_indicies]
english_lines = [english_lines[i] for i in valid_sentence_indicies]

# Now the `tamil_lines` and `english_lines` are ready for further processing


English Vocabulary Size: 49
Tamil Vocabulary Size: 123
Sample English Vocab: ['<END>', '<PAD>', '<START>', ' ', '!', '"', "'", '(', ')', ',']
Sample Tamil Vocab: ['<END>', '<PAD>', '<START>', ' ', '!', '"', "'", '(', ')', ',']
Index to Tamil Mappings: {0: '<END>', 1: '<PAD>', 2: '<START>', 3: ' ', 4: '!', 5: '"', 6: "'", 7: '(', 8: ')', 9: ',', 10: '.', 11: '0', 12: '1', 13: '2', 14: '3', 15: '4', 16: '5', 17: '6', 18: '7', 19: '8', 20: '9', 21: ';', 22: '?', 23: 'A', 24: 'B', 25: 'C', 26: 'D', 27: 'E', 28: 'F', 29: 'G', 30: 'H', 31: 'I', 32: 'J', 33: 'K', 34: 'L', 35: 'M', 36: 'N', 37: 'O', 38: 'P', 39: 'Q', 40: 'R', 41: 'S', 42: 'T', 43: 'U', 44: 'V', 45: 'W', 46: 'X', 47: 'Y', 48: 'Z', 49: 'a', 50: 'b', 51: 'c', 52: 'd', 53: 'e', 54: 'f', 55: 'g', 56: 'h', 57: 'i', 58: 'j', 59: 'k', 60: 'l', 61: 'm', 62: 'n', 63: 'o', 64: 'p', 65: 'q', 66: 'r', 67: 's', 68: 't', 69: 'u', 70: 'v', 71: 'w', 72: 'x', 73: 'y', 74: 'z', 75: 'ஃ', 76: 'அ', 77: 'ஆ', 78: 'இ', 79: 'ஈ', 80: 'உ', 81: 'ஊ', 82: '

In [None]:
import torch

d_model = 512
batch_size = 32
ffn_hidden = 2048
num_heads = 8
drop_prob = 0.1
num_layers = 2
max_sequence_length = 200
kn_vocab_size = len(tamil_vocab)

transformer = Transformer(d_model,
                          ffn_hidden,
                          num_heads,
                          drop_prob,
                          num_layers,
                          max_sequence_length,
                          kn_vocab_size,
                          english_to_index,
                          tamil_to_index,
                          START_TOKEN,
                          END_TOKEN,
                          PADDING_TOKEN)

In [None]:
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):

    def __init__(self, english_sentences, tamil_sentences):
        self.english_sentences = english_sentences
        self.tamil_sentences = tamil_sentences

    def __len__(self):
        return len(self.english_sentences)

    def __getitem__(self, idx):
        return self.english_sentences[idx], self.tamil_sentences[idx]
dataset = TextDataset(english_lines, tamil_lines)

In [None]:
train_loader = DataLoader(dataset, batch_size)
iterator = iter(train_loader)

In [None]:
for batch_num, batch in enumerate(iterator):
    print(batch)
    if batch_num > 3:
        break

[('moreover all the vessels, which king ahaz in his reign did cast away in his transgression, have we prepared and sanctified, and, behold, they are before the altar of the lord.', 'similar conditions will be imposed if the sri lankan government is given an imf loan.', 'now kornelius argues the opposite instead of enforcing the adherence to international law, new laws would now have to be devised and established.', 'moreover, khan has been in exile in iran for most of the time since he was ousted from herat by the taliban in 1995.', 'you have to take your next promotion.', 'for since the beginning of the world men have not heard, nor perceived by the ear, neither has the eye seen, o god, beside you, what he has prepared for him that waits for him.', 'if you love these people so much you can mourn for them.', 'i have made the earth, and created man on it i, even my hands, have stretched out the heavens, and all their host have i commanded.', "their latest film' enga rasi nalla rasi' is 

In [None]:
# Change the loss function to handle Tamil
from torch import nn
criterian = nn.CrossEntropyLoss(ignore_index=tamil_to_index[PADDING_TOKEN],
                                reduction='none')

# Initialize the transformer parameters using Xavier uniform initialization
for params in transformer.parameters():
    if params.dim() > 1:
        nn.init.xavier_uniform_(params)

# Set up the optimizer
optim = torch.optim.Adam(transformer.parameters(), lr=1e-4)

# Set the device
# device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


In [None]:
NEG_INFTY = -1e9

def create_masks(ta_batch, en_batch):
    num_sentences = len(ta_batch)
    look_ahead_mask = torch.full([max_sequence_length, max_sequence_length], True)
    look_ahead_mask = torch.triu(look_ahead_mask, diagonal=1)
    encoder_padding_mask = torch.full([num_sentences, max_sequence_length, max_sequence_length], False)
    decoder_padding_mask_self_attention = torch.full([num_sentences, max_sequence_length, max_sequence_length], False)
    decoder_padding_mask_cross_attention = torch.full([num_sentences, max_sequence_length, max_sequence_length], False)

    for idx in range(num_sentences):
        ta_sentence_length, en_sentence_length = len(ta_batch[idx]), len(en_batch[idx])
        ta_chars_to_padding_mask = np.arange(ta_sentence_length + 1, max_sequence_length)
        en_chars_to_padding_mask = np.arange(en_sentence_length + 1, max_sequence_length)
        encoder_padding_mask[idx, :, ta_chars_to_padding_mask] = True
        encoder_padding_mask[idx, ta_chars_to_padding_mask, :] = True
        decoder_padding_mask_self_attention[idx, :, en_chars_to_padding_mask] = True
        decoder_padding_mask_self_attention[idx, en_chars_to_padding_mask, :] = True
        decoder_padding_mask_cross_attention[idx, :, ta_chars_to_padding_mask] = True
        decoder_padding_mask_cross_attention[idx, en_chars_to_padding_mask, :] = True

    encoder_self_attention_mask = torch.where(encoder_padding_mask, NEG_INFTY, 0)
    decoder_self_attention_mask = torch.where(look_ahead_mask + decoder_padding_mask_self_attention, NEG_INFTY, 0)
    decoder_cross_attention_mask = torch.where(decoder_padding_mask_cross_attention, NEG_INFTY, 0)

    return encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask

In [None]:
import torch.nn as nn

# Initialize the loss function
criterion = nn.CrossEntropyLoss(ignore_index=tamil_to_index[PADDING_TOKEN])
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
transformer.train()
transformer.to(device)
total_loss = 0
num_epochs = 10

for epoch in range(num_epochs):
    print(f"Epoch {epoch}")
    iterator = iter(train_loader)
    for batch_num, batch in enumerate(iterator):
        transformer.train()
        en_batch, ta_batch = batch
        encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = create_masks(en_batch, ta_batch)
        optim.zero_grad()
        ta_predictions = transformer(en_batch,
                                     ta_batch,
                                     encoder_self_attention_mask.to(device),
                                     decoder_self_attention_mask.to(device),
                                     decoder_cross_attention_mask.to(device),
                                     enc_start_token=False,
                                     enc_end_token=False,
                                     dec_start_token=True,
                                     dec_end_token=True)
        labels = transformer.decoder.sentence_embedding.batch_tokenize(ta_batch, start_token=False, end_token=True)
        loss = criterion(
            ta_predictions.view(-1, len(tamil_vocab)).to(device),
            labels.view(-1).to(device)
        ).to(device)
        valid_indices = torch.where(labels.view(-1) == tamil_to_index[PADDING_TOKEN], False, True)
        loss = loss.sum() / valid_indices.sum()
        loss.backward()
        optim.step()
        if batch_num % 100 == 0:
            print(f"Iteration {batch_num} : {loss.item()}")
            print(f"English: {en_batch[0]}")
            print(f"Tamil Translation: {ta_batch[0]}")
            ta_sentence_predicted = torch.argmax(ta_predictions[0], axis=1)
            predicted_sentence = ""
            for idx in ta_sentence_predicted:
                if idx == tamil_to_index[END_TOKEN]:
                    break
                predicted_sentence += index_to_tamil[idx.item()]
            print(f"Tamil Prediction: {predicted_sentence}")

    # Evaluation step
    transformer.eval()
    ta_sentence = ("",)
    en_sentence = ("my birthday is on august 29.",)
    for word_counter in range(max_sequence_length):
        encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = create_masks(en_sentence, ta_sentence)
        predictions = transformer(en_sentence,
                                  ta_sentence,
                                  encoder_self_attention_mask.to(device),
                                  decoder_self_attention_mask.to(device),
                                  decoder_cross_attention_mask.to(device),
                                  enc_start_token=False,
                                  enc_end_token=False,
                                  dec_start_token=True,
                                  dec_end_token=False)
        next_token_prob_distribution = predictions[0][word_counter]
        next_token_index = torch.argmax(next_token_prob_distribution).item()
        next_token = index_to_tamil[next_token_index]
        ta_sentence = (ta_sentence[0] + next_token, )
        if next_token == END_TOKEN:
            break

    print(f"Evaluation translation (should we go to the mall?) : {ta_sentence}")
    print("-------------------------------------------")


Epoch 0
Iteration 0 : 0.001508654560893774
English: moreover all the vessels, which king ahaz in his reign did cast away in his transgression, have we prepared and sanctified, and, behold, they are before the altar of the lord.
Tamil Translation: ராஜாவாகிய ஆகாஸ் அரசாளும்போது தம்முடைய பாதகத்தினால் எறிந்துபோட்ட சகல பணிமுட்டுகளையும் முஸ்திப்பாக்கிப் பரிசுத்தம்பண்ணினோம்; இதோ, அவைகள் கர்த்தரின் ஆலயத்திற்கு முன்பாக இருக்கிறது என்றார்கள்.
Tamil Prediction: ஆJஆஆzஆஆ.ணv1TqHஆ1ஆ1ஆஉ1ஐஉஆஆஐஐஐஐஐஐJHஆஆஆஐஐஐஐஆஐஆஆJJஆJஜJஐஐஐ1ஐJஆJ1(J1ே(ஆஐHஐஐJ1(ஆண1mஆ(ஆHq(ஆஆேஆஜஜஆJ(HJஐஆஐஆHஐஐqஐணிJqXே1உXq(ஆqணHணண1((qJ(ண1ேஐணணணணqணஐ(ணJணஸqqஐsணqஸJq.ணணJணJJJஐஐஐXணணஐJணஐணJJணJJJணஐJஐஆJHJJJJJணஆணqJJJJHேHqH
Iteration 100 : 0.0010143734980374575
English: psychology is taught by a visiting lecturer' .
Tamil Translation: அவர்கள் ஆறு பாடங்களை கற்பிக்கின்றனர்.
Tamil Prediction: ்்்்்் ்்்்்்்் ்்்்்்்்்் ்்்்்க்க்்்்்்்்்்்்்்் ்்த்்்்்்்்்்்்்்்ு ்்்்்்்்்்்்்்்்த ்்்்்் ்்்்்்்்்்்்்்்்்்்்்்்கக்்்்்்்ி்்ு ் ்்்்்்்்்்்் ்்்்்்்்்்்்்க்க்்்்க்்்்்்்

KeyboardInterrupt: 

In [None]:
transformer.eval()

def translate(eng_sentence):
    # Convert the English sentence to a tuple
    eng_sentence = (eng_sentence,)
    # Initialize the Tamil translation with an empty string
    tn_sentence = ("",)
    # Loop through each word in the maximum sequence length
    for word_counter in range(max_sequence_length):
        # Create masks for the encoder and decoder
        encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = create_masks(eng_sentence, tn_sentence)
        # Get predictions from the model
        predictions = transformer(eng_sentence,
                                  tn_sentence,
                                  encoder_self_attention_mask.to(device),
                                  decoder_self_attention_mask.to(device),
                                  decoder_cross_attention_mask.to(device),
                                  enc_start_token=False,
                                  enc_end_token=False,
                                  dec_start_token=True,
                                  dec_end_token=False)
        # Get the predicted next token's probability distribution
        next_token_prob_distribution = predictions[0][word_counter]
        # Choose the token with the highest probability
        next_token_index = torch.argmax(next_token_prob_distribution).item()
        next_token = index_to_tamil[next_token_index]  # Update this line to map to your Tamil vocabulary
        # Append the predicted token to the sentence
        tn_sentence = (tn_sentence[0] + next_token,)
        # Break if the END token is reached
        if next_token == END_TOKEN:
            break
    return tn_sentence[0]


In [None]:
translation = translate("shall we go")
print(translation)

இந்த நிகழ்ச்சியில் இருக்கும் என்று அவர் கூறினார்.<END>
