In [28]:
import tensorflow as tf
from datasets import load_dataset
import re
import numpy as np
from collections import Counter

# Parameters
MAX_LEN_EN = 40
MAX_LEN_HI = 40

BATCH_SIZE = 64
EMBEDDING_DIM = 512
ENC_UNITS = 512
DEC_UNITS = ENC_UNITS * 2 
EPOCHS = 30
LEARNING_RATE = 0.001

# Special tokens
PAD_TOKEN = "<pad>"
UNK_TOKEN = "<unk>"
START_TOKEN = "<start>"
END_TOKEN = "<end>"
special_tokens = [PAD_TOKEN, UNK_TOKEN, START_TOKEN, END_TOKEN]

# 1. Load dataset
from datasets import load_dataset

dataset = load_dataset("cfilt/iitb-english-hindi", split="train[:2%]")



In [29]:
dataset

Dataset({
    features: ['translation'],
    num_rows: 33182
})

In [30]:
for i in range(5):
  print(f"English: {dataset[i]['translation']['en']}")
  print(f"Hindi: {dataset[i]['translation']['hi']}")
  print("="*30)


English: Give your application an accessibility workout
Hindi: अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें
English: Accerciser Accessibility Explorer
Hindi: एक्सेर्साइसर पहुंचनीयता अन्वेषक
English: The default plugin layout for the bottom panel
Hindi: निचले पटल के लिए डिफोल्ट प्लग-इन खाका
English: The default plugin layout for the top panel
Hindi: ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका
English: A list of plugins that are disabled by default
Hindi: उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से निष्क्रिय किया गया है


In [31]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s,.!?'-]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

def tokenize(text):
    return text.split()

def preprocess_sentence(sentence, language='hi'):
    sentence = clean_text(sentence)
    tokens = tokenize(sentence)
    if language == 'en':
        tokens = [START_TOKEN] + tokens + [END_TOKEN]
    return tokens

def build_vocab(tokenized_sentences, vocab_size=None):
    counter = Counter()
    for sent in tokenized_sentences:
        counter.update(sent)
    vocab = special_tokens + [word for word, freq in counter.most_common() if word not in special_tokens]
    if vocab_size:
        vocab = vocab[:vocab_size]
    word2idx = {word: idx for idx, word in enumerate(vocab)}
    return word2idx

def encode_sentence(tokens, word2idx, max_len):
    encoded = [word2idx.get(token, word2idx[UNK_TOKEN]) for token in tokens]
    if len(encoded) > max_len:
        encoded = encoded[:max_len]
    else:
        encoded += [word2idx[PAD_TOKEN]] * (max_len - len(encoded))
    return encoded

# Prepare sentences
english_sentences = [example['translation']['en'] for example in dataset]
hindi_sentences = [example['translation']['hi'] for example in dataset]

tokenized_en = [preprocess_sentence(s, 'en') for s in english_sentences]
tokenized_hi = [preprocess_sentence(s, 'hi') for s in hindi_sentences]

en_vocab = build_vocab(tokenized_en)
hi_vocab = build_vocab(tokenized_hi)

encoded_en = np.array([encode_sentence(s, en_vocab, MAX_LEN_EN) for s in tokenized_en])
encoded_hi = np.array([encode_sentence(s, hi_vocab, MAX_LEN_HI) for s in tokenized_hi])

# Prepare tf.data.Dataset
dataset = tf.data.Dataset.from_tensor_slices((encoded_en, encoded_hi))
dataset = dataset.shuffle(10000).batch(BATCH_SIZE, drop_remainder=True)

# Indices of special tokens for loss masking
PAD_IDX = en_vocab[PAD_TOKEN]


In [32]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        query_with_time_axis = tf.expand_dims(query, 1)
        score = self.V(tf.nn.tanh(self.W1(values) + self.W2(query_with_time_axis)))
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights

class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units):
        super().__init__()
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.bi_lstm = tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(enc_units, return_sequences=True, return_state=True)
        )

    def call(self, x):
        x = self.embedding(x)
        output, forward_h, forward_c, backward_h, backward_c = self.bi_lstm(x)
        state_h = tf.keras.layers.Concatenate()([forward_h, backward_h])
        state_c = tf.keras.layers.Concatenate()([forward_c, backward_c])
        return output, state_h, state_c

class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units):
        super().__init__()
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.lstm = tf.keras.layers.LSTM(dec_units, return_sequences=True, return_state=True)
        self.attention = BahdanauAttention(dec_units)
        self.fc = tf.keras.layers.Dense(vocab_size)

    def call(self, x, hidden, cell, enc_output):
        x = self.embedding(x)
        context_vector, attention_weights = self.attention(hidden, enc_output)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        output, state_h, state_c = self.lstm(x, initial_state=[hidden, cell])
        output = tf.reshape(output, (-1, output.shape[2]))
        x = self.fc(output)
        return x, state_h, state_c, attention_weights

In [33]:
encoder = Encoder(len(hi_vocab), EMBEDDING_DIM, ENC_UNITS)
decoder = Decoder(len(en_vocab), EMBEDDING_DIM, DEC_UNITS)

optimizer = tf.keras.optimizers.Adam(LEARNING_RATE)
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, PAD_IDX))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)

@tf.function
def train_step(enc_input, dec_target):
    loss = 0
    batch_size = enc_input.shape[0]

    with tf.GradientTape() as tape:
        enc_output, enc_hidden, enc_cell = encoder(enc_input)
        dec_hidden, dec_cell = enc_hidden, enc_cell

        dec_input = tf.expand_dims([en_vocab[START_TOKEN]] * batch_size, 1)

        for t in range(1, dec_target.shape[1]):
            predictions, dec_hidden, dec_cell, _ = decoder(dec_input, dec_hidden, dec_cell, enc_output)
            loss += loss_function(dec_target[:, t], predictions)
            dec_input = tf.expand_dims(dec_target[:, t], 1)

    batch_loss = loss / int(dec_target.shape[1] - 1)
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

# Training loop
for epoch in range(EPOCHS):
    total_loss = 0
    for batch, (enc_batch, dec_batch) in enumerate(dataset):
        batch_loss = train_step(enc_batch, dec_batch)
        total_loss += batch_loss

    print(f"Epoch {epoch+1} Loss {total_loss/(batch+1):.4f}")

InvalidArgumentError: Graph execution error:

Detected at node encoder_3_1/embedding_6_1/GatherV2 defined at (most recent call last):
  File "C:\Users\EDWIN\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 196, in _run_module_as_main

  File "C:\Users\EDWIN\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 86, in _run_code

  File "C:\Users\EDWIN\AppData\Roaming\Python\Python310\site-packages\ipykernel_launcher.py", line 18, in <module>

  File "C:\Users\EDWIN\AppData\Roaming\Python\Python310\site-packages\traitlets\config\application.py", line 1075, in launch_instance

  File "C:\Users\EDWIN\AppData\Roaming\Python\Python310\site-packages\ipykernel\kernelapp.py", line 739, in start

  File "C:\Users\EDWIN\AppData\Roaming\Python\Python310\site-packages\tornado\platform\asyncio.py", line 211, in start

  File "C:\Users\EDWIN\AppData\Local\Programs\Python\Python310\lib\asyncio\base_events.py", line 600, in run_forever

  File "C:\Users\EDWIN\AppData\Local\Programs\Python\Python310\lib\asyncio\base_events.py", line 1896, in _run_once

  File "C:\Users\EDWIN\AppData\Local\Programs\Python\Python310\lib\asyncio\events.py", line 80, in _run

  File "C:\Users\EDWIN\AppData\Roaming\Python\Python310\site-packages\ipykernel\kernelbase.py", line 519, in dispatch_queue

  File "C:\Users\EDWIN\AppData\Roaming\Python\Python310\site-packages\ipykernel\kernelbase.py", line 508, in process_one

  File "C:\Users\EDWIN\AppData\Roaming\Python\Python310\site-packages\ipykernel\kernelbase.py", line 400, in dispatch_shell

  File "C:\Users\EDWIN\AppData\Roaming\Python\Python310\site-packages\ipykernel\ipkernel.py", line 368, in execute_request

  File "C:\Users\EDWIN\AppData\Roaming\Python\Python310\site-packages\ipykernel\kernelbase.py", line 767, in execute_request

  File "C:\Users\EDWIN\AppData\Roaming\Python\Python310\site-packages\ipykernel\ipkernel.py", line 455, in do_execute

  File "C:\Users\EDWIN\AppData\Roaming\Python\Python310\site-packages\ipykernel\zmqshell.py", line 577, in run_cell

  File "C:\Users\EDWIN\AppData\Roaming\Python\Python310\site-packages\IPython\core\interactiveshell.py", line 3077, in run_cell

  File "C:\Users\EDWIN\AppData\Roaming\Python\Python310\site-packages\IPython\core\interactiveshell.py", line 3132, in _run_cell

  File "C:\Users\EDWIN\AppData\Roaming\Python\Python310\site-packages\IPython\core\async_helpers.py", line 128, in _pseudo_sync_runner

  File "C:\Users\EDWIN\AppData\Roaming\Python\Python310\site-packages\IPython\core\interactiveshell.py", line 3336, in run_cell_async

  File "C:\Users\EDWIN\AppData\Roaming\Python\Python310\site-packages\IPython\core\interactiveshell.py", line 3519, in run_ast_nodes

  File "C:\Users\EDWIN\AppData\Roaming\Python\Python310\site-packages\IPython\core\interactiveshell.py", line 3579, in run_code

  File "C:\Users\EDWIN\AppData\Local\Temp\ipykernel_21708\715884765.py", line 41, in <module>

  File "C:\Users\EDWIN\AppData\Local\Temp\ipykernel_21708\715884765.py", line 20, in train_step

  File "c:\Users\EDWIN\OneDrive\Documents\GitHub\Multilingual-ASR\.venv\lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "c:\Users\EDWIN\OneDrive\Documents\GitHub\Multilingual-ASR\.venv\lib\site-packages\keras\src\layers\layer.py", line 941, in __call__

  File "c:\Users\EDWIN\OneDrive\Documents\GitHub\Multilingual-ASR\.venv\lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "c:\Users\EDWIN\OneDrive\Documents\GitHub\Multilingual-ASR\.venv\lib\site-packages\keras\src\ops\operation.py", line 59, in __call__

  File "c:\Users\EDWIN\OneDrive\Documents\GitHub\Multilingual-ASR\.venv\lib\site-packages\keras\src\utils\traceback_utils.py", line 156, in error_handler

  File "C:\Users\EDWIN\AppData\Local\Temp\ipykernel_21708\1409500457.py", line 26, in call

  File "c:\Users\EDWIN\OneDrive\Documents\GitHub\Multilingual-ASR\.venv\lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "c:\Users\EDWIN\OneDrive\Documents\GitHub\Multilingual-ASR\.venv\lib\site-packages\keras\src\layers\layer.py", line 941, in __call__

  File "c:\Users\EDWIN\OneDrive\Documents\GitHub\Multilingual-ASR\.venv\lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "c:\Users\EDWIN\OneDrive\Documents\GitHub\Multilingual-ASR\.venv\lib\site-packages\keras\src\ops\operation.py", line 59, in __call__

  File "c:\Users\EDWIN\OneDrive\Documents\GitHub\Multilingual-ASR\.venv\lib\site-packages\keras\src\utils\traceback_utils.py", line 156, in error_handler

  File "c:\Users\EDWIN\OneDrive\Documents\GitHub\Multilingual-ASR\.venv\lib\site-packages\keras\src\layers\core\embedding.py", line 150, in call

  File "c:\Users\EDWIN\OneDrive\Documents\GitHub\Multilingual-ASR\.venv\lib\site-packages\keras\src\ops\numpy.py", line 5795, in take

  File "c:\Users\EDWIN\OneDrive\Documents\GitHub\Multilingual-ASR\.venv\lib\site-packages\keras\src\backend\tensorflow\numpy.py", line 2340, in take

indices[62,3] = 870 is not in [0, 728)
	 [[{{node encoder_3_1/embedding_6_1/GatherV2}}]] [Op:__inference_train_step_60822]

In [6]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units):
        super().__init__()
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.bi_lstm = tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(enc_units, return_sequences=True, return_state=True)
        )

    def call(self, x):
        x = self.embedding(x)
        output, forward_h, forward_c, backward_h, backward_c = self.bi_lstm(x)
        state_h = tf.keras.layers.Concatenate()([forward_h, backward_h])
        state_c = tf.keras.layers.Concatenate()([forward_c, backward_c])
        return output, state_h, state_c


In [7]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units):
        super().__init__()
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.lstm = tf.keras.layers.LSTM(dec_units, return_sequences=True, return_state=True)
        self.attention = BahdanauAttention(dec_units)
        self.fc = tf.keras.layers.Dense(vocab_size)

    def call(self, x, hidden, cell, enc_output):
        x = self.embedding(x)
        context_vector, attention_weights = self.attention(hidden, enc_output)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        output, state_h, state_c = self.lstm(x, initial_state=[hidden, cell])
        output = tf.reshape(output, (-1, output.shape[2]))
        x = self.fc(output)
        return x, state_h, state_c, attention_weights


In [8]:
encoder = Encoder(len(hi_vocab), EMBEDDING_DIM, ENC_UNITS)
decoder = Decoder(len(en_vocab), EMBEDDING_DIM, DEC_UNITS)

optimizer = tf.keras.optimizers.Adam(LEARNING_RATE)
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

PAD_IDX = en_vocab[PAD_TOKEN]


In [9]:
@tf.function
def train_step(enc_input, dec_target):
    loss = 0
    batch_size = enc_input.shape[0]

    with tf.GradientTape() as tape:
        enc_output, enc_hidden, enc_cell = encoder(enc_input)
        dec_hidden, dec_cell = enc_hidden, enc_cell

        # Decoder input starts with English <START> token
        dec_input = tf.expand_dims([en_vocab[START_TOKEN]] * batch_size, 1)

        for t in range(1, dec_target.shape[1]):
            predictions, dec_hidden, dec_cell, _ = decoder(dec_input, dec_hidden, dec_cell, enc_output)
            loss += loss_function(dec_target[:, t], predictions)
            dec_input = tf.expand_dims(dec_target[:, t], 1)

    batch_loss = loss / int(dec_target.shape[1] - 1)
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss


In [10]:
def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, PAD_IDX))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)


In [11]:
for epoch in range(EPOCHS):
    total_loss = 0
    for batch, (enc_batch, dec_batch) in enumerate(dataset):
        batch_loss = train_step(enc_batch, dec_batch)
        total_loss += batch_loss

    print(f"Epoch {epoch+1} Loss {total_loss/(batch+1):.4f}")


ValueError: in user code:

    File "C:\Users\EDWIN\AppData\Local\Temp\ipykernel_21708\3939651672.py", line 14, in train_step  *
        predictions, dec_hidden, dec_cell, _ = decoder(dec_input, dec_hidden, dec_cell, enc_output)
    File "c:\Users\EDWIN\OneDrive\Documents\GitHub\Multilingual-ASR\.venv\lib\site-packages\keras\src\utils\traceback_utils.py", line 122, in error_handler  **
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\EDWIN\AppData\Local\Temp\ipykernel_21708\1823708170.py", line 14, in call
        output, state_h, state_c = self.lstm(x, initial_state=[hidden, cell])

    ValueError: Exception encountered when calling LSTMCell.call().
    
    [1mDimensions must be equal, but are 1024 and 512 for '{{node decoder_1/lstm_1_1/lstm_cell_1/MatMul_1}} = MatMul[T=DT_FLOAT, grad_a=false, grad_b=false, transpose_a=false, transpose_b=false](encoder_1/concatenate_2_1/concat, decoder_1/lstm_1_1/lstm_cell_1/Cast_1/ReadVariableOp)' with input shapes: [64,1024], [512,2048].[0m
    
    Arguments received by LSTMCell.call():
      • inputs=tf.Tensor(shape=(64, 1536), dtype=float32)
      • states=('tf.Tensor(shape=(64, 1024), dtype=float32)', 'tf.Tensor(shape=(64, 1024), dtype=float32)')
      • training=False


In [15]:
import tensorflow as tf
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer

# 1. Load Dataset
dataset = load_dataset("cfilt/iitb-english-hindi", split="train[:2%]")
src_texts = [x['translation']['en'] for x in dataset]
tgt_texts = [x['translation']['hi'] for x in dataset]

# 2. Tokenization
src_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
tgt_tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [16]:
max_src_len, max_tgt_len = 40, 40

def encode(texts, tokenizer, max_len):
    tokens = tokenizer(texts, padding="max_length", truncation=True, max_length=max_len, return_tensors="np")
    return tokens['input_ids']

src_sequences = encode(src_texts, src_tokenizer, max_src_len)
tgt_sequences = encode(tgt_texts, tgt_tokenizer, max_tgt_len)

# Prepare decoder input (shifted right)
def shift_right(arr):
    pad_id = tgt_tokenizer.pad_token_id
    sos_id = tgt_tokenizer.cls_token_id if tgt_tokenizer.cls_token_id else tgt_tokenizer.bos_token_id
    shifted = np.full_like(arr, pad_id)
    shifted[:, 1:] = arr[:, :-1]
    shifted[:, 0] = sos_id
    return shifted

decoder_input = shift_right(tgt_sequences)
dataset = tf.data.Dataset.from_tensor_slices(((src_sequences, decoder_input), tgt_sequences)).batch(32)

# 3. Model hyperparameters
ENC_UNITS = 256
DEC_UNITS = 256
VOCAB_SIZE_SRC = src_tokenizer.vocab_size
VOCAB_SIZE_TGT = tgt_tokenizer.vocab_size

In [17]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.bilstm = tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(enc_units, return_sequences=True, return_state=True)
        )

    def call(self, x):
        x = self.embedding(x)
        output, forward_h, forward_c, backward_h, backward_c = self.bilstm(x)
        state_h = tf.keras.layers.Concatenate()([forward_h, backward_h])
        state_c = tf.keras.layers.Concatenate()([forward_c, backward_c])
        return output, state_h, state_c

# 5. Attention
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        query_with_time_axis = tf.expand_dims(query, 1)
        score = self.V(tf.nn.tanh(self.W1(query_with_time_axis) + self.W2(values)))
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)
        return context_vector, tf.squeeze(attention_weights, -1)

# 6. Decoder
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, enc_units):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.attention = BahdanauAttention(dec_units)
        self.lstm = tf.keras.layers.LSTM(dec_units * 2, return_sequences=True, return_state=True)
        self.fc = tf.keras.layers.Dense(vocab_size)

        # Project encoder states to correct decoder size
        self.state_proj_h = tf.keras.layers.Dense(dec_units * 2)
        self.state_proj_c = tf.keras.layers.Dense(dec_units * 2)

    def call(self, x, hidden, cell, enc_output):
        context_vector, attn_weights = self.attention(hidden, enc_output)
        x = self.embedding(x)
        context_vector = tf.expand_dims(context_vector, 1)
        x = tf.concat([context_vector, x], axis=-1)
        output, state_h, state_c = self.lstm(x, initial_state=[
            self.state_proj_h(hidden), self.state_proj_c(cell)
        ])
        output = self.fc(output)
        return output, state_h, state_c, attn_weights


In [18]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(y_true, y_pred):
    mask = tf.cast(tf.math.not_equal(y_true, tgt_tokenizer.pad_token_id), dtype=tf.float32)
    loss_ = loss_object(y_true, y_pred)
    loss_ *= mask
    return tf.reduce_mean(loss_)

optimizer = tf.keras.optimizers.Adam()

# 8. Training step
@tf.function
def train_step(enc_in, dec_in, dec_true, encoder, decoder):
    with tf.GradientTape() as tape:
        enc_output, enc_h, enc_c = encoder(enc_in)
        dec_h, dec_c = enc_h, enc_c
        outputs, dec_h, dec_c, attn = decoder(dec_in, dec_h, dec_c, enc_output)
        loss = loss_function(dec_true, outputs)
    variables = encoder.trainable_variables + decoder.trainable_variables
    tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(variables, tape.gradient(loss, variables)))
    return loss

# 9. Training loop
encoder = Encoder(VOCAB_SIZE_SRC, 128, ENC_UNITS)
decoder = Decoder(VOCAB_SIZE_TGT, 128, DEC_UNITS, ENC_UNITS)

for epoch in range(10):
    total_loss = 0
    for batch, ((enc_batch, dec_batch), dec_true) in enumerate(dataset):
        batch_loss = train_step(enc_batch, dec_batch, dec_true, encoder, decoder)
        total_loss += batch_loss
        if batch % 20 == 0:
            print(f"Epoch {epoch+1}, Batch {batch}, Loss {batch_loss:.4f}")
    print(f"Epoch {epoch+1} Loss {total_loss/(batch+1):.4f}")


ValueError: in user code:

    File "C:\Users\EDWIN\AppData\Local\Temp\ipykernel_21708\3142741670.py", line 17, in train_step  *
        outputs, dec_h, dec_c, attn = decoder(dec_in, dec_h, dec_c, enc_output)
    File "c:\Users\EDWIN\OneDrive\Documents\GitHub\Multilingual-ASR\.venv\lib\site-packages\keras\src\utils\traceback_utils.py", line 122, in error_handler  **
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\EDWIN\AppData\Local\Temp\ipykernel_21708\3481671349.py", line 48, in call
        x = tf.concat([context_vector, x], axis=-1)

    ValueError: Exception encountered when calling Decoder.call().
    
    [1mDimension 1 in both shapes must be equal, but are 1 and 40. Shapes are [32,1] and [32,40]. for '{{node decoder_1_1/concat}} = ConcatV2[N=2, T=DT_FLOAT, Tidx=DT_INT32](decoder_1_1/ExpandDims, decoder_1_1/embedding_3_1/GatherV2, decoder_1_1/concat/axis)' with input shapes: [32,1,512], [32,40,128], [] and with computed input tensors: input[2] = <-1>.[0m
    
    Arguments received by Decoder.call():
      • x=tf.Tensor(shape=(32, 40), dtype=int64)
      • hidden=tf.Tensor(shape=(32, 512), dtype=float32)
      • cell=tf.Tensor(shape=(32, 512), dtype=float32)
      • enc_output=tf.Tensor(shape=(32, 40, 512), dtype=float32)


In [None]:
def translate(sentence):
    # Preprocess Hindi input
    sentence = preprocess_sentence(sentence, 'hi')
    encoded = encode_sentence(sentence, hi_vocab, MAX_LEN_HI)
    inputs = tf.expand_dims(encoded, 0)

    # Encoder
    enc_output, enc_hidden, enc_cell = encoder(inputs)
    dec_hidden, dec_cell = enc_hidden, enc_cell

    # Decoder starts with <START> in English vocab
    dec_input = tf.expand_dims([en_vocab[START_TOKEN]], 0)
    result = []

    for t in range(MAX_LEN_EN):
        predictions, dec_hidden, dec_cell, attention_weights = decoder(
            dec_input, dec_hidden, dec_cell, enc_output
        )
        predicted_id = tf.argmax(predictions[0]).numpy()

        if predicted_id == en_vocab[END_TOKEN]:
            break

        result.append(predicted_id)
        dec_input = tf.expand_dims([predicted_id], 0)

    # Convert back to words
    inv_en_vocab = {v: k for k, v in en_vocab.items()}
    translated_sentence = ' '.join(inv_en_vocab.get(idx, UNK_TOKEN) for idx in result)
    return translated_sentence


# Example translation
test_sentence = "धन्यवाद"   # Hindi input
print("Hindi:", test_sentence)
print("English:", translate(test_sentence))


In [None]:
from transformers import MarianMTModel, MarianTokenizer

model_name = "Helsinki-NLP/opus-mt-hi-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)



In [None]:
text = ["उसका कहा हुआ एक शब्द भी सुनने लायक नहीं है"]
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
translated = model.generate(**inputs)
print(tokenizer.decode(translated[0], skip_special_tokens=True))


In [20]:
from datasets import load_dataset
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

# 1. Load dataset (Hindi to English)
dataset = load_dataset('cfilt/iitb-english-hindi', split='train')
train_texts = [x['translation']['hi'] for x in dataset]
target_texts = [x['translation']['en'] for x in dataset]










In [21]:
model_name = "Helsinki-NLP/opus-mt-hi-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# 3. Tokenize the data
max_source_length = 64
max_target_length = 64

def preprocess_function(batch):
    inputs = tokenizer(batch['translation']['hi'], max_length=max_source_length, truncation=True, padding="max_length")
    targets = tokenizer(batch['translation']['en'], max_length=max_target_length, truncation=True, padding="max_length")
    inputs["labels"] = targets["input_ids"]
    return inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)

# 4. Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Map:   0%|          | 0/1659083 [00:00<?, ? examples/s]


TypeError: list indices must be integers or slices, not str

In [None]:
# 5. Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./mt-hi-en",
    per_device_train_batch_size=8,
    num_train_epochs=2,
    save_steps=500,
    logging_steps=50,
    learning_rate=2e-5,
    save_total_limit=2,
    fp16=True,
)

# 6. Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# 7. Training
trainer.train()



In [None]:
# 8. Saving the model
trainer.save_model("./mt-hi-en-finetuned")
tokenizer.save_pretrained("./mt-hi-en-finetuned")