<a href="https://colab.research.google.com/github/fazal735/DL_A3/blob/main/DL_A3_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, RNN, LSTM, GRU, SimpleRNN


class TransliterationSeq2Seq(Model):
    def __init__(self,
                 input_vocab_size,
                 target_vocab_size,
                 embedding_dim,
                 encoder_units,
                 decoder_units,
                 cell_type='lstm',
                 encoder_layers=1,
                 decoder_layers=1):
        """
        Initialize the Seq2Seq model for transliteration.

        Args:
            input_vocab_size: Size of the input language vocabulary
            target_vocab_size: Size of the target language vocabulary
            embedding_dim: Dimension of character embeddings
            encoder_units: Number of units in encoder cell
            decoder_units: Number of units in decoder cell
            cell_type: Type of RNN cell ('rnn', 'lstm', or 'gru')
            encoder_layers: Number of layers in the encoder
            decoder_layers: Number of layers in the decoder
        """
        super(TransliterationSeq2Seq, self).__init__()

        # Store model parameters
        self.encoder_units = encoder_units
        self.decoder_units = decoder_units
        self.embedding_dim = embedding_dim
        self.input_vocab_size = input_vocab_size
        self.target_vocab_size = target_vocab_size

        # Create embeddings for input characters
        self.embedding = Embedding(input_vocab_size, embedding_dim)

        # Setup the cell type based on user preference
        self.cell_type = cell_type.lower()
        if self.cell_type == 'lstm':
            rnn_cell = LSTM
        elif self.cell_type == 'gru':
            rnn_cell = GRU
        elif self.cell_type == 'rnn':
            rnn_cell = SimpleRNN
        else:
            raise ValueError(f"Unsupported cell type: {cell_type}")

        # Create encoder layers
        self.encoder_layers = []
        for i in range(encoder_layers):
            input_size = embedding_dim if i == 0 else encoder_units
            return_sequences = (i < encoder_layers - 1)
            self.encoder_layers.append(
                rnn_cell(encoder_units,
                         return_sequences=return_sequences,
                         return_state=True,
                         name=f'encoder_layer_{i+1}')
            )

        # Create decoder layers
        self.decoder_layers = []
        for i in range(decoder_layers):
            input_size = embedding_dim if i == 0 else decoder_units
            return_sequences = True
            self.decoder_layers.append(
                rnn_cell(decoder_units,
                         return_sequences=return_sequences,
                         return_state=True,
                         name=f'decoder_layer_{i+1}')
            )

        # Output layer to convert decoder output to character probabilities
        self.fc = Dense(target_vocab_size, activation='softmax')

        if encoder_units != decoder_units:
            if self.cell_type == 'lstm':
                self.state_h_adapter = tf.keras.layers.Dense(decoder_units, name='state_h_adapter')
                self.state_c_adapter = tf.keras.layers.Dense(decoder_units, name='state_c_adapter')
            else:
                self.state_adapter = tf.keras.layers.Dense(decoder_units, name='state_adapter')

    def encode(self, input_seq):
        """
        Encode the input sequence.

        Args:
            input_seq: Input sequence tensor of shape (batch_size, seq_length)

        Returns:
            encoder_outputs: Outputs from the encoder
            encoder_states: Final states from the encoder (to initialize decoder)
        """
        # Embed input sequence
        x = self.embedding(input_seq)

        # Process through encoder layers
        encoder_states = []
        for i, encoder_layer in enumerate(self.encoder_layers):
            if i == 0:
                outputs = x

            if self.cell_type == 'lstm':
                outputs, state_h, state_c = encoder_layer(outputs)
                encoder_states.extend([state_h, state_c])
            else:  # RNN or GRU
                outputs, state = encoder_layer(outputs)
                encoder_states.append(state)

        return outputs, encoder_states

    def adapt_encoder_states_for_decoder(self, encoder_states):
        """Adapt encoder states to be compatible with decoder dimensions"""
        decoder_states = []

        for state in encoder_states:
            if self.cell_type == 'lstm':
                state_h, state_c = state
                if hasattr(self, 'state_h_adapter'):
                    adapted_h = self.state_h_adapter(state_h)
                    adapted_c = self.state_c_adapter(state_c)
                    decoder_states.append((adapted_h, adapted_c))
                else:
                    decoder_states.append((state_h, state_c))
            else:
                if hasattr(self, 'state_adapter'):
                    adapted_state = self.state_adapter(state)
                    decoder_states.append(adapted_state)
                else:
                    decoder_states.append(state)

        return decoder_states

    def decode_step(self, x, states):
        """
        Perform one decoding step.

        Args:
            x: Input character tensor of shape (batch_size, 1)
            states: Previous states from the decoder

        Returns:
            output: Output probabilities
            new_states: Updated states
        """
        # Embed input character
        x = self.embedding(x)

        # Process through decoder layers
        all_new_states = []
        for i, decoder_layer in enumerate(self.decoder_layers):
            if i == 0:
                layer_input = x
            else:
                layer_input = outputs

            # Extract the states for this layer
            if self.cell_type == 'lstm':
                layer_states = [states[i*2], states[i*2+1]]
            else:  # RNN or GRU
                layer_states = [states[i]]

            # Process through the layer
            if self.cell_type == 'lstm':
                outputs, state_h, state_c = decoder_layer(layer_input, initial_state=layer_states)
                all_new_states.extend([state_h, state_c])
            else:  # RNN or GRU
                outputs, state = decoder_layer(layer_input, initial_state=layer_states)
                all_new_states.append(state)

        # Generate output probabilities
        output = self.fc(outputs)

        return output, all_new_states

    def call(self, inputs, training=None):
        """
        Forward pass through the model.

        Args:
            inputs: Tuple of (input_seq, target_seq)
            training: Whether the model is in training mode

        Returns:
            outputs: Sequence of output probabilities
        """
        input_seq, target_seq = inputs

        # Encode the input sequence
        encoder_outputs, encoder_states = self.encode(input_seq)

        # Initialize decoder states with encoder states
        decoder_states = encoder_states

        # Teacher forcing: feeding the target as the next input
        decoder_inputs = target_seq[:, :-1]  # exclude last character

        # Initialize list to store outputs
        outputs = []

        # Process each character in the target sequence
        for t in range(decoder_inputs.shape[1]):
            # Extract input for this timestep
            decoder_input = decoder_inputs[:, t:t+1]

            # Perform one decoding step
            output, decoder_states = self.decode_step(decoder_input, decoder_states)

            # Store output
            outputs.append(output)

        # Concatenate outputs along time axis
        return tf.concat(outputs, axis=1)

    def predict(self, input_seq, max_length=100, start_token=1, end_token=2):
        """
        Generate a transliteration for an input sequence.

        Args:
            input_seq: Input sequence tensor of shape (batch_size, seq_length)
            max_length: Maximum length of the output sequence
            start_token: ID of the start token
            end_token: ID of the end token

        Returns:
            outputs: Generated output sequence
        """
        # Encode the input sequence
        _, encoder_states = self.encode(input_seq)

        # Initialize decoder states with encoder states
        decoder_states = encoder_states

        # Start with the start token
        batch_size = input_seq.shape[0]
        decoder_input = tf.ones((batch_size, 1), dtype=tf.int32) * start_token

        # Initialize list to store outputs
        outputs = []

        # Generate characters until max_length or end token
        for t in range(max_length):
            # Perform one decoding step
            output, decoder_states = self.decode_step(decoder_input, decoder_states)

            # Get the most likely character
            predicted_id = tf.argmax(output, axis=-1)

            # Store output
            outputs.append(predicted_id)

            # Break if end token is predicted
            if predicted_id == end_token:
                break

            # Use predicted ID as next input
            decoder_input = predicted_id

        # Concatenate outputs along time axis
        return tf.concat(outputs, axis=1)


# Example usage
def create_model(input_vocab_size=1000,
                target_vocab_size=1000,
                embedding_dim=256,
                encoder_units=512,
                decoder_units=512,
                cell_type='lstm',
                encoder_layers=1,
                decoder_layers=1):
    """
    Create and compile a transliteration model.

    Args:
        input_vocab_size: Size of the input vocabulary
        target_vocab_size: Size of the target vocabulary
        embedding_dim: Dimension of character embeddings
        encoder_units: Number of units in encoder cell
        decoder_units: Number of units in decoder cell
        cell_type: Type of RNN cell ('rnn', 'lstm', or 'gru')
        encoder_layers: Number of layers in the encoder
        decoder_layers: Number of layers in the decoder

    Returns:
        model: Compiled transliteration model
    """
    # Create inputs
    input_seq = Input(shape=(None,), name='input_sequence')
    target_seq = Input(shape=(None,), name='target_sequence')

    # Create model
    model = TransliterationSeq2Seq(
        input_vocab_size=input_vocab_size,
        target_vocab_size=target_vocab_size,
        embedding_dim=embedding_dim,
        encoder_units=encoder_units,
        decoder_units=decoder_units,
        cell_type=cell_type,
        encoder_layers=encoder_layers,
        decoder_layers=decoder_layers
    )

    # Compile model
    model.compile(
        optimizer='adam',
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        metrics=['accuracy']
    )

    return model


# Training example
def train_model(model, input_sequences, target_sequences, epochs=10, batch_size=64):
    """
    Train the transliteration model.

    Args:
        model: Compiled transliteration model
        input_sequences: Input sequences (Latin script)
        target_sequences: Target sequences (Devanagari script)
        epochs: Number of epochs to train
        batch_size: Batch size for training

    Returns:
        history: Training history
    """
    # Prepare target sequences for training (shifted by 1)
    shifted_target_sequences = target_sequences[:, 1:]

    # Train the model
    history = model.fit(
        [input_sequences, target_sequences],
        shifted_target_sequences,
        epochs=epochs,
        batch_size=batch_size,
        validation_split=0.2
    )

    return history


In [None]:

!pip uninstall -y wandb
!pip uninstall -y wandb-sdk
!pip uninstall -y wandb-core
!pip install --upgrade pip

Found existing installation: wandb 0.19.11
Uninstalling wandb-0.19.11:
  Successfully uninstalled wandb-0.19.11


In [None]:
!pip install wandb
!pip install tensorflow

Collecting wandb
  Using cached wandb-0.19.11-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Using cached wandb-0.19.11-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (21.4 MB)
Installing collected packages: wandb
Successfully installed wandb-0.19.11


In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import os
import wandb
# from wandb.keras import WandbCallback
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, RNN, LSTM, GRU, SimpleRNN
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
import matplotlib.pyplot as plt
from google.colab import drive


drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Data loading and preprocessing


def load_dakshina_data(language='ur', data_dir='/content/drive/MyDrive/dakshina_dataset_v1.0/'):

    data_path = os.path.join(data_dir, language, 'lexicons')

    train_file = os.path.join(data_path, f'{language}.translit.sampled.train.tsv')
    dev_file = os.path.join(data_path, f'{language}.translit.sampled.dev.tsv')
    test_file = os.path.join(data_path, f'{language}.translit.sampled.test.tsv')

    train_data = pd.read_csv(train_file, sep='\t', header=None, names=['native', 'latin','n'])
    dev_data = pd.read_csv(dev_file, sep='\t', header=None, names=['native', 'latin','n'])
    test_data = pd.read_csv(test_file, sep='\t', header=None, names=['native', 'latin','n'])


    print(f"Loaded {len(train_data)} training, {len(dev_data)} dev, and {len(test_data)} test examples.")

    return train_data, dev_data, test_data

In [3]:


def preprocess_data(train_data, dev_data, test_data):
    """
    Preprocess the data: tokenize, create vocabulary, and convert to sequences.

    Args:
        train_data, dev_data, test_data: DataFrames with native and latin script pairs

    Returns:
        Dictionary containing preprocessed data and tokenizers
    """

    print(train_data.head())

    # Clean the 'latin' column: remove NaNs and convert all to strings
    train_data['latin'] = train_data['latin'].astype(str).fillna('')

    # Or, to be safer and remove completely invalid rows:
    train_data = train_data.dropna(subset=['latin'])
    train_data['latin'] = train_data['latin'].astype(str)


    # Create character-level tokenizers
    latin_tokenizer = Tokenizer(char_level=True)
    native_tokenizer = Tokenizer(char_level=True)

    # Fit tokenizers on training data
    latin_tokenizer.fit_on_texts((train_data['latin']).tolist())
    native_tokenizer.fit_on_texts((train_data['native']).tolist())

    # Get vocabulary sizes
    latin_vocab_size = len(latin_tokenizer.word_index) + 1  # +1 for padding
    native_vocab_size = len(native_tokenizer.word_index) + 1  # +1 for padding

    # Convert text to sequences
    train_latin_seq = latin_tokenizer.texts_to_sequences(list(train_data['latin']))
    train_native_seq = native_tokenizer.texts_to_sequences(list(train_data['native']))

    dev_latin_seq = latin_tokenizer.texts_to_sequences(list(dev_data['latin']))
    dev_native_seq = native_tokenizer.texts_to_sequences(list(dev_data['native']))

    test_latin_seq = latin_tokenizer.texts_to_sequences(list(test_data['latin']))
    test_native_seq = native_tokenizer.texts_to_sequences(list(test_data['native']))

    # Find maximum sequence lengths
    max_latin_len = max(max(len(seq) for seq in train_latin_seq),
                        max(len(seq) for seq in dev_latin_seq),
                        max(len(seq) for seq in test_latin_seq))

    max_native_len = max(max(len(seq) for seq in train_native_seq),
                         max(len(seq) for seq in dev_native_seq),
                         max(len(seq) for seq in test_native_seq))

    # Add 2 for start and end tokens
    max_latin_len += 2
    max_native_len += 2

    # Add start (<s>) and end (</s>) tokens to target sequences
    start_token = native_vocab_size  # Use vocab_size as start token
    end_token = native_vocab_size + 1  # Use vocab_size+1 as end token
    native_vocab_size += 2  # Increase vocab size for start and end tokens

    # Add start and end tokens to native sequences
    def add_start_end(sequences, max_len, start_token, end_token):
        new_sequences = []
        for seq in sequences:
            new_seq = [start_token] + seq + [end_token]
            new_sequences.append(new_seq)
        return pad_sequences(new_sequences, maxlen=max_len, padding='post')

    # Pad sequences
    train_latin_padded = pad_sequences(train_latin_seq, maxlen=max_latin_len, padding='post')
    train_native_padded = add_start_end(train_native_seq, max_native_len, start_token, end_token)

    dev_latin_padded = pad_sequences(dev_latin_seq, maxlen=max_latin_len, padding='post')
    dev_native_padded = add_start_end(dev_native_seq, max_native_len, start_token, end_token)

    test_latin_padded = pad_sequences(test_latin_seq, maxlen=max_latin_len, padding='post')
    test_native_padded = add_start_end(test_native_seq, max_native_len, start_token, end_token)

    # Target for teacher forcing (shifted right by one position)
    train_native_target = train_native_padded[:, 1:]  # Remove start token
    dev_native_target = dev_native_padded[:, 1:]  # Remove start token

    return {
        'latin_tokenizer': latin_tokenizer,
        'native_tokenizer': native_tokenizer,
        'latin_vocab_size': latin_vocab_size,
        'native_vocab_size': native_vocab_size,
        'max_latin_len': max_latin_len,
        'max_native_len': max_native_len,
        'start_token': start_token,
        'end_token': end_token,
        'train_latin': train_latin_padded,
        'train_native': train_native_padded,
        'train_native_target': train_native_target,
        'dev_latin': dev_latin_padded,
        'dev_native': dev_native_padded,
        'dev_native_target': dev_native_target,
        'test_latin': test_latin_padded,
        'test_native': test_native_padded
    }


In [4]:

# Enhanced Seq2Seq model with dropout and beam search
class TransliterationSeq2Seq(Model):
    def __init__(self,
                 input_vocab_size,
                 target_vocab_size,
                 embedding_dim,
                 decoder_hidden_dim,
                 encoder_units,
                 decoder_units,
                 cell_type='lstm',
                 encoder_layers=1,
                 decoder_layers=1,
                 dropout_rate=0.0,


                 recurrent_dropout_rate=0.0):
        """
        Initialize the Seq2Seq model for transliteration with dropout support.

        Args:
            input_vocab_size: Size of the input language vocabulary
            target_vocab_size: Size of the target language vocabulary
            embedding_dim: Dimension of character embeddings
            encoder_units: Number of units in encoder cell
            decoder_units: Number of units in decoder cell
            cell_type: Type of RNN cell ('rnn', 'lstm', or 'gru')
            encoder_layers: Number of layers in the encoder
            decoder_layers: Number of layers in the decoder
            dropout_rate: Dropout rate after RNN layers
            recurrent_dropout_rate: Dropout rate inside RNN cells
        """
        super(TransliterationSeq2Seq, self).__init__()

        # Store model parameters
        self.encoder_units = encoder_units
        self.decoder_units = decoder_units
        self.embedding_dim = embedding_dim
        self.input_vocab_size = input_vocab_size
        self.target_vocab_size = target_vocab_size
        self.dropout_rate = dropout_rate
        self.target_embedding = tf.keras.layers.Embedding(input_dim=self.target_vocab_size,
                                                          output_dim=self.embedding_dim )

        self.embedding_projection = tf.keras.layers.Dense(decoder_hidden_dim)

        # Create embeddings for input and target characters
        self.input_embedding = Embedding(input_vocab_size, embedding_dim)
        self.target_embedding = Embedding(target_vocab_size, embedding_dim)

        # Setup the cell type based on user preference
        self.cell_type = cell_type.lower()
        rnn_kwargs = {'return_state': True}

        # Add recurrent dropout if specified
        if recurrent_dropout_rate > 0:
            rnn_kwargs['recurrent_dropout'] = recurrent_dropout_rate

        if self.cell_type == 'lstm':
            rnn_cell = LSTM
        elif self.cell_type == 'gru':
            rnn_cell = GRU
        elif self.cell_type == 'rnn':
            rnn_cell = SimpleRNN
        else:
            raise ValueError(f"Unsupported cell type: {cell_type}")

        # Create encoder layers
        self.encoder_layers = []
        for i in range(encoder_layers):
            return_sequences = (i < encoder_layers - 1)
            layer_kwargs = rnn_kwargs.copy()
            layer_kwargs['return_sequences'] = return_sequences
            layer_kwargs['name'] = f'encoder_layer_{i+1}'

            self.encoder_layers.append(rnn_cell(encoder_units, **layer_kwargs))

            # Add dropout between layers if specified
            if dropout_rate > 0 and i < encoder_layers - 1:
                self.encoder_layers.append(Dropout(dropout_rate))

        # Create decoder layers
        self.decoder_layers = []
        for i in range(decoder_layers):
            return_sequences = True
            layer_kwargs = rnn_kwargs.copy()
            layer_kwargs['return_sequences'] = return_sequences
            layer_kwargs['name'] = f'decoder_layer_{i+1}'

            self.decoder_layers.append(rnn_cell(decoder_units, **layer_kwargs))

            # Add dropout between layers if specified
            if dropout_rate > 0 and i < decoder_layers - 1:
                self.decoder_layers.append(Dropout(dropout_rate))

        # Final dropout before output layer
        if dropout_rate > 0:
            self.final_dropout = Dropout(dropout_rate)

        # Output layer to convert decoder output to character probabilities
        self.fc = Dense(target_vocab_size, activation='softmax')

    def encode(self, input_seq):
        """
        Encode the input sequence.

        Args:
            input_seq: Input sequence tensor of shape (batch_size, seq_length)

        Returns:
            encoder_outputs: Outputs from the encoder
            encoder_states: Final states from the encoder (to initialize decoder)
        """
        # Embed input sequence
        x = self.input_embedding(input_seq)

        # Process through encoder layers
        encoder_states = []
        layer_output = x

        for layer in self.encoder_layers:
            if isinstance(layer, Dropout):
                layer_output = layer(layer_output)
            else:
                if self.cell_type == 'lstm':
                    layer_output, state_h, state_c = layer(layer_output)
                    encoder_states.extend([state_h, state_c])
                else:  # RNN or GRU
                    layer_output, state = layer(layer_output)
                    encoder_states.append(state)

        return layer_output, encoder_states
    def adapt_encoder_states_for_decoder(self, encoder_states):
        """Adapt encoder states to be compatible with decoder dimensions"""
        decoder_states = []

        for state in encoder_states:
            if self.cell_type == 'lstm':
                state_h, state_c = state
                if hasattr(self, 'state_h_adapter'):
                    adapted_h = self.state_h_adapter(state_h)
                    adapted_c = self.state_c_adapter(state_c)
                    decoder_states.append((adapted_h, adapted_c))
                else:
                    decoder_states.append((state_h, state_c))
            else:
                if hasattr(self, 'state_adapter'):
                    adapted_state = self.state_adapter(state)
                    decoder_states.append(adapted_state)
                else:
                    decoder_states.append(state)

        return decoder_states

    def decode_step(self, x, states,training=None):
        """
        Perform one decoding step.

        Args:
            x: Input character tensor of shape (batch_size, 1)
            states: Previous states from the decoder

        Returns:
            output: Output probabilities
            new_states: Updated states
        """
        # Embed input character

        x = self.target_embedding(x)
        x = self.embedding_projection(x)

        # Process through decoder layers
        all_new_states = []
        layer_output = x

        # Track which state to use for each RNN layer
        state_idx = 0
        layer_idx = 0

        for layer in self.decoder_layers:
            if isinstance(layer, Dropout):
                layer_output = layer(layer_output,training=False)
            else:
                # Extract the states for this layer
                if self.cell_type == 'lstm':
                    layer_states = [states[state_idx], states[state_idx+1]]
                    state_idx += 2
                else:  # RNN or GRU
                    layer_states = [states[state_idx]]
                    state_idx += 1

                # Process through the layer
                if self.cell_type == 'lstm':
                    layer_output, state_h, state_c = layer(layer_output, initial_state=layer_states)
                    all_new_states.extend([state_h, state_c])
                else:  # RNN or GRU
                    layer_output, state = layer(layer_output, initial_state=layer_states)
                    all_new_states.append(state)

                layer_idx += 1

        # Apply final dropout if specified
        if hasattr(self, 'final_dropout'):
            layer_output = self.final_dropout(layer_output)

        # Generate output probabilities
        output = self.fc(layer_output)

        return output, all_new_states

    def call(self, inputs, training=None):
        """
        Forward pass through the model.

        Args:
            inputs: Tuple of (input_seq, target_seq)
            training: Whether the model is in training mode

        Returns:
            outputs: Sequence of output probabilities
        """
        input_seq, target_seq = inputs

        # Encode the input sequence
        encoder_outputs, encoder_states = self.encode(input_seq)



        # Initialize decoder states with encoder states
        decoder_states = encoder_states

        # Initialize decoder states to zeros
        decoder_states = []
        for layer in self.decoder_layers:
            if isinstance(layer, tf.keras.layers.RNN):  # Could be LSTM, GRU, or SimpleRNN
                units = layer.cell.units if hasattr(layer, "cell") else layer.units
                batch_size = tf.shape(input_seq)[0]
                if isinstance(layer.cell, tf.keras.layers.LSTMCell):
                    decoder_states.extend([
                        tf.zeros((batch_size, units)),  # h
                        tf.zeros((batch_size, units))   # c
                     ])
                else:
                    decoder_states.append(tf.zeros((batch_size, units)))


        # Teacher forcing: feeding the target as the next input
        decoder_inputs = target_seq[:, :-1]  # exclude last character

        # Initialize list to store outputs
        outputs = []

        # Process each character in the target sequence
        for t in range(decoder_inputs.shape[1]):
            # Extract input for this timestep
            decoder_input = decoder_inputs[:, t:t+1]

            # Perform one decoding step
            output, decoder_states = self.decode_step(decoder_input, decoder_states,training=training)

            # Store output
            outputs.append(output)
        output, decoder_states = self.decode_step(decoder_input, decoder_states, training=training)

        def build(self, input_shape):
          # Example: input_shape = (batch_size, timesteps)
            dummy_input = tf.random.uniform((1, input_shape[1]))
            dummy_embedding = self.embedding(dummy_input)

           # Pass through encoder
            _= self.encoder(dummy_embedding)

            # Create dummy decoder input with correct shape
            decoder_input = tf.random.uniform((1, input_shape[1], self.embedding_dim))
            decoder_state = [tf.zeros((1, self.decoder_units)) for _ in range(self.decoder_layers)]

            # Pass through decoder (for LSTM, you need two states per layer: h and c)
            for layer in self.decoder_layers_list:
                if isinstance(layer.cell, tf.keras.layers.LSTMCell):
                    state = [tf.zeros((1, self.decoder_units)), tf.zeros((1, self.decoder_units))]
                else:
                    state = [tf.zeros((1, self.decoder_units))]
                _ = layer(decoder_input, initial_state=state)

            super().build(input_shape)

        # Concatenate outputs along time axis
        return tf.concat(outputs, axis=1)

    def beam_search_decode(self, input_seq, max_length, beam_size=3, start_token=None, end_token=None):
        """
        Generate a transliteration using beam search.

        Args:
            input_seq: Input sequence tensor of shape (batch_size, seq_length)
            max_length: Maximum length of the output sequence
            beam_size: Number of beams to track
            start_token: ID of the start token
            end_token: ID of the end token

        Returns:
            best_sequences: List of best sequences found (one per input sequence)
        """
        batch_size = input_seq.shape[0]

        # Default tokens if not provided
        if start_token is None:
            start_token = self.target_vocab_size - 2
        if end_token is None:
            end_token = self.target_vocab_size - 1

        # Encode input sequence
        _, encoder_states = self.encode(input_seq)

        # Process each input sequence separately (beam search isn't batch-friendly)
        best_sequences = []

        for b in range(batch_size):
            # Extract states for this batch item
            if self.cell_type == 'lstm':
                batch_states = []
                for i in range(0, len(encoder_states), 2):
                    batch_states.append(encoder_states[i][b:b+1])
                    batch_states.append(encoder_states[i+1][b:b+1])
            else:
                batch_states = [state[b:b+1] for state in encoder_states]

            # Initialize beam with start token
            beams = [(0.0, [start_token], batch_states)]
            complete_beams = []

            # Generate sequence
            for t in range(max_length):
                new_beams = []

                # Process each beam
                for score, sequence, states in beams:
                    # Check if sequence is complete
                    if sequence[-1] == end_token:
                        complete_beams.append((score, sequence))
                        continue

                    # Get next token predictions
                    decoder_input = tf.constant([[sequence[-1]]], dtype=tf.int32)
                    output, new_states = self.decode_step(decoder_input, states)

                    # Get top k predictions
                    log_probs = tf.math.log(output[0, 0])
                    top_k_probs, top_k_indices = tf.math.top_k(log_probs, k=beam_size)

                    # Create new beams
                    for i in range(beam_size):
                        new_score = score + top_k_probs[i].numpy()
                        new_sequence = sequence + [top_k_indices[i].numpy()]
                        new_beams.append((new_score, new_sequence, new_states))

                # Keep only the best beams
                beams = sorted(new_beams, key=lambda x: x[0], reverse=True)[:beam_size]

                # Early stopping if all beams are complete
                if all(beam[1][-1] == end_token for beam in beams):
                    complete_beams.extend(beams)
                    break

            # Add incomplete beams to complete ones
            complete_beams.extend([(score, sequence) for score, sequence, _ in beams if sequence[-1] != end_token])

            # Sort and select the best sequence
            if complete_beams:
                best_sequence = sorted(complete_beams, key=lambda x: x[0], reverse=True)[0][1]
                # Remove start and end tokens
                if best_sequence[-1] == end_token:
                    best_sequence = best_sequence[1:-1]
                else:
                    best_sequence = best_sequence[1:]
            else:
                best_sequence = []

            best_sequences.append(best_sequence)

        return best_sequences

# Decoding and evaluation functions
def decode_sequences(model, sequences, start_token, end_token, max_length, beam_size=1):
    """
    Decode sequences using the model.

    Args:
        model: Trained model
        sequences: Input sequences to decode
        start_token, end_token: Tokens to mark sequence start/end
        max_length: Maximum output sequence length
        beam_size: Beam size for beam search (1 = greedy)

    Returns:
        predicted_sequences: List of predicted token sequences
    """
    if beam_size > 1:
        return model.beam_search_decode(
            sequences,
            max_length=max_length,
            beam_size=beam_size,
            start_token=start_token,
            end_token=end_token
        )
    else:
        # Initialize with start token
        batch_size = sequences.shape[0]
        current_tokens = tf.ones((batch_size, 1), dtype=tf.int32) * start_token
        predicted_sequences = []

        # Encode input
        _, encoder_states = model.encode(sequences)
        decoder_states = encoder_states

        # Generate tokens one by one
        for t in range(max_length):
            # Predict next token
            output, decoder_states = model.decode_step(current_tokens, decoder_states)
            predicted_token = tf.argmax(output, axis=-1)

            # Store prediction
            predicted_sequences.append(predicted_token)

            # Check for end tokens
            if tf.reduce_all(predicted_token == end_token):
                break

            # Update current token
            current_tokens = predicted_token

        # Concatenate and convert to list
        predicted_sequences = tf.concat(predicted_sequences, axis=1).numpy()

        # Remove end tokens and convert to list
        result = []
        for seq in predicted_sequences:
            # Find position of end token
            try:
                end_pos = list(seq).index(end_token)
                result.append(list(seq[:end_pos]))
            except ValueError:
                # No end token found
                result.append(list(seq))

        return result

def calculate_accuracy(true_seqs, pred_seqs):
    """
    Calculate character and word accuracy between true and predicted sequences.

    Args:
        true_seqs: List of true sequences
        pred_seqs: List of predicted sequences

    Returns:
        Dict with character and word level accuracy
    """
    char_correct = 0
    char_total = 0
    word_correct = 0
    word_total = len(true_seqs)

    for true_seq, pred_seq in zip(true_seqs, pred_seqs):
        # Word is correct if all characters match
        word_is_correct = True

        # Count character matches
        for i in range(min(len(true_seq), len(pred_seq))):
            if true_seq[i] == pred_seq[i]:
                char_correct += 1
            else:
                word_is_correct = False

        # Add extra characters as errors
        if len(true_seq) != len(pred_seq):
            word_is_correct = False

        # Count total characters
        char_total += max(len(true_seq), len(pred_seq))

        # Update word accuracy
        if word_is_correct:
            word_correct += 1

    return {
        'char_accuracy': char_correct / char_total if char_total > 0 else 0,
        'word_accuracy': word_correct / word_total if word_total > 0 else 0
    }

def tokens_to_text(sequences, tokenizer):
    """
    Convert token sequences back to text.

    Args:
        sequences: List of token sequences
        tokenizer: Keras tokenizer

    Returns:
        List of text sequences
    """
    # Reverse the word index
    index_word = {v: k for k, v in tokenizer.word_index.items()}

    # Convert sequences to text
    texts = []
    for seq in sequences:
        text = ''.join(index_word.get(token, '') for token in seq if token in index_word)
        texts.append(text)

    return texts


In [None]:
# Model creation with dropout support
def create_model(input_vocab_size,
                target_vocab_size,
                embedding_dim,
                decoder_hidden_dim,
                encoder_units,
                decoder_units,
                cell_type='lstm',
                encoder_layers=1,
                decoder_layers=1,
                dropout_rate=0.0,
                recurrent_dropout_rate=0.0):
    """
    Create and compile a transliteration model.

    Args:
        input_vocab_size: Size of the input vocabulary
        target_vocab_size: Size of the target vocabulary
        embedding_dim: Dimension of character embeddings
        encoder_units: Number of units in encoder cell
        decoder_units: Number of units in decoder cell
        cell_type: Type of RNN cell ('rnn', 'lstm', or 'gru')
        encoder_layers: Number of layers in the encoder
        decoder_layers: Number of layers in the decoder
        dropout_rate: Dropout rate after RNN layers
        recurrent_dropout_rate: Dropout rate inside RNN cells

    Returns:
        model: Compiled transliteration model
    """
    # Create inputs
    input_seq = Input(shape=(None,), name='input_sequence')
    target_seq = Input(shape=(None,), name='target_sequence')

    # Create model
    model = TransliterationSeq2Seq(
        input_vocab_size=input_vocab_size,
        target_vocab_size=target_vocab_size,
        embedding_dim=embedding_dim,
        decoder_hidden_dim=decoder_hidden_dim,
        encoder_units=encoder_units,
        decoder_units=decoder_units,
        cell_type=cell_type,
        encoder_layers=encoder_layers,
        decoder_layers=decoder_layers,
        dropout_rate=dropout_rate,
        recurrent_dropout_rate=recurrent_dropout_rate
    )

    # Compile model
    model.compile(
        optimizer='adam',
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        metrics=['accuracy']
    )

    return model


# WandB setup and hyperparameter sweep
def setup_wandb_sweep():
    """
    Setup wandb hyperparameter sweep.

    Returns:
        sweep_id: ID of the created sweep
    """
    sweep_config = {
        'method': 'bayes',  # Bayesian optimization
        'metric': {
            'name': 'val_accuracy',
            'goal': 'maximize'
        },
        'parameters': {
            'embedding_dim': {
                'values': [16, 32, 64]
            },
            'decoder_hidden_dim': {
                'values': [16, 32, 64]
            },
            'encoder_units': {
                'values': [32, 64, 128, 256]
            },
            'decoder_units': {
                'values': [32, 64, 128, 256]
            },

            "encoder_decoder_units_pair": {
            "values": [
                {"encoder_units": 32, "decoder_units": 32},
                {"encoder_units": 64, "decoder_units": 64},
                {"encoder_units": 128, "decoder_units": 128},
                {"encoder_units": 256, "decoder_units": 256}
            ]},
            'encoder_layers': {
                'values': [1, 2, 3]
            },
            'decoder_layers': {
                'values': [1, 2, 3]
            },
            'cell_type': {
                'values': ['lstm', 'gru', 'rnn']
            },
            'dropout_rate': {
                'values': [0.0, 0.2, 0.3]
            },
            'recurrent_dropout_rate': {
                'values': [0.0, 0.1, 0.2]
            },
            'beam_size': {
                'values': [1, 3, 5]
            }
        }
    }

    # Create sweep
    sweep_id = wandb.sweep(sweep_config, project="dakshina-transliteration")

    return sweep_id


# Training function for wandb sweep
def train_sweep():
    """
    Training function for wandb sweep.
    """
    # Initialize wandb
    run = wandb.init()

    # Get wandb config
    config = wandb.config

    # Load data
    train_data, dev_data, test_data = load_dakshina_data()
    preprocessed_data = preprocess_data(train_data, dev_data, test_data)

    # Create model with config parameters
    model = create_model(
        input_vocab_size=preprocessed_data['latin_vocab_size'],
        target_vocab_size=preprocessed_data['native_vocab_size'],
        embedding_dim=config.embedding_dim,
        decoder_hidden_dim=config.decoder_hidden_dim,
        encoder_units=config.encoder_units,
        decoder_units=config.decoder_units,
        cell_type=config.cell_type,
        encoder_layers=config.encoder_layers,
        decoder_layers=config.decoder_layers,
        dropout_rate=config.dropout_rate,
        recurrent_dropout_rate=config.recurrent_dropout_rate
    )

    # # Setup callbacks
    # callbacks = [
    #     WandbCallback(),
    #     EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
    #     ModelCheckpoint('best_model.h5', save_best_only=True)
    # ]

    # Train model
    history = model.fit(
        [preprocessed_data['train_latin'], preprocessed_data['train_native']],
        preprocessed_data['train_native_target'],
        validation_data=(
            [preprocessed_data['dev_latin'], preprocessed_data['dev_native']],
            preprocessed_data['dev_native_target']
        ),
        epochs=20,
        batch_size=64,
        # callbacks=callbacks
    )

    # Evaluate with beam search
    beam_size = config.beam_size

    # Decode dev set with beam search
    pred_dev_sequences = decode_sequences(
        model,
        preprocessed_data['dev_latin'],
        preprocessed_data['start_token'],
        preprocessed_data['end_token'],
        preprocessed_data['max_native_len'],
        beam_size=beam_size
    )

    # Remove start/end tokens from the target sequences
    true_dev_sequences = [seq[1:-1] for seq in preprocessed_data['dev_native'].tolist()]

    # Calculate accuracy
    accuracy = calculate_accuracy(true_dev_sequences, pred_dev_sequences)

    # Log metrics
    wandb.log({
        'dev_char_accuracy': accuracy['char_accuracy'],
        'dev_word_accuracy': accuracy['word_accuracy']
    })

    # Convert some examples to text for visual inspection
    native_tokenizer = preprocessed_data['native_tokenizer']
    latin_tokenizer = preprocessed_data['latin_tokenizer']

    # Get sample predictions
    sample_idx = np.random.choice(len(pred_dev_sequences), min(10, len(pred_dev_sequences)), replace=False)
    samples = []

    for idx in sample_idx:
        input_text = tokens_to_text([preprocessed_data['dev_latin'][idx].tolist()], latin_tokenizer)[0]
        true_text = tokens_to_text([true_dev_sequences[idx]], native_tokenizer)[0]
        pred_text = tokens_to_text([pred_dev_sequences[idx]], native_tokenizer)[0]

        samples.append({
            'input': input_text,
            'true': true_text,
            'pred': pred_text
        })

    # Log sample predictions
    wandb.log({'samples': wandb.Table(
        columns=['Input', 'True', 'Predicted'],
        data=[[s['input'], s['true'], s['pred']] for s in samples]
    )})

    # Clean up
    wandb.finish()


# Main script for running the sweep
def main():
    """Main function to run the sweep."""
    sweep_id = setup_wandb_sweep()
    wandb.agent(sweep_id, train_sweep, count=20)  # Run 20 experiments


if __name__ == '__main__':
    main()




Create sweep with ID: 0ftoj7ow
Sweep URL: https://wandb.ai/mfazal735-iit-madras-foundation/dakshina-transliteration/sweeps/0ftoj7ow


[34m[1mwandb[0m: Agent Starting Run: jxnu41s9 with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: gru
[34m[1mwandb[0m: 	decoder_hidden_dim: 16
[34m[1mwandb[0m: 	decoder_layers: 1
[34m[1mwandb[0m: 	decoder_units: 32
[34m[1mwandb[0m: 	dropout_rate: 0
[34m[1mwandb[0m: 	embedding_dim: 64
[34m[1mwandb[0m: 	encoder_decoder_units_pair: {'decoder_units': 256, 'encoder_units': 256}
[34m[1mwandb[0m: 	encoder_layers: 3
[34m[1mwandb[0m: 	encoder_units: 32
[34m[1mwandb[0m: 	recurrent_dropout_rate: 0


Loaded 106260 training, 10424 dev, and 10517 test examples.
  native latin  n
0     آؤ   aao  3
1     آؤ  aaoo  1
2     آؤ   aau  1
3     آؤ   aaw  1
4     آؤ  aawo  1
Epoch 1/20


1. The `call()` method of your layer may be crashing. Try to `__call__()` the layer eagerly on some test input first to see if it works. E.g. `x = np.random.random((3, 4)); y = layer(x)`
2. If the `call()` method is correct, then you may need to implement the `def build(self, input_shape)` method on your layer. It should create all variables used by the layer (e.g. by calling `layer.build()` on all its children layers).
Exception encountered: ''Exception encountered when calling GRU.call().

[1mIterating over a symbolic `tf.Tensor` is not allowed. You can attempt the following resolutions to the problem: If you are running in Graph mode, use Eager execution mode or decorate this function with @tf.function. If you are using AutoGraph, you can try decorating this function with @tf.function. If that does not work, then you may be using an unsupported feature or your source code may not be visible to AutoGraph. See https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/aut

[34m[1mwandb[0m: [32m[41mERROR[0m Run jxnu41s9 errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 306, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "<ipython-input-6-368e14052b3f>", line 160, in train_sweep
[34m[1mwandb[0m: [32m[41mERROR[0m     history = model.fit(
[34m[1mwandb[0m: [32m[41mERROR[0m               ^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py", line 122, in error_handler
[34m[1mwandb[0m: [32m[41mERROR[0m     raise e.with_traceback(filtered_tb) from None
[34m[1mwandb[0m: [32m[41mERROR[0m   File "<ipython-input-4-d138a4a49ecf>", line 224, in call
[34m[1mwandb[0m: [32m[41mERROR[0m     encoder_outputs, encoder_states = self.encode

Loaded 106260 training, 10424 dev, and 10517 test examples.
  native latin  n
0     آؤ   aao  3
1     آؤ  aaoo  1
2     آؤ   aau  1
3     آؤ   aaw  1
4     آؤ  aawo  1
[1m1661/1661[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.6742 - loss: 1.2800Epoch 1/20
[1m1661/1661[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 31ms/step - accuracy: 0.6742 - loss: 1.2799 - val_accuracy: 0.7059 - val_loss: 1.0623
Epoch 2/20
[1m 419/1661[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m12s[0m 10ms/step - accuracy: 0.7003 - loss: 1.0783



[1m1661/1661[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 12ms/step - accuracy: 0.7010 - loss: 1.0740 - val_accuracy: 0.7078 - val_loss: 1.0464
Epoch 3/20
[1m1661/1661[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 12ms/step - accuracy: 0.7027 - loss: 1.0579 - val_accuracy: 0.7095 - val_loss: 1.0401
Epoch 4/20
[1m1661/1661[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 10ms/step - accuracy: 0.7047 - loss: 1.0485 - val_accuracy: 0.7093 - val_loss: 1.0331
Epoch 5/20
[1m 731/1661[0m [32m━━━━━━━━[0m[37m━━━━━━━━━━━━[0m [1m8s[0m 9ms/step - accuracy: 0.7055 - loss: 1.0425



[1m1661/1661[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 11ms/step - accuracy: 0.7055 - loss: 1.0419 - val_accuracy: 0.7120 - val_loss: 1.0270
Epoch 6/20
[1m1661/1661[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 12ms/step - accuracy: 0.7061 - loss: 1.0365 - val_accuracy: 0.7113 - val_loss: 1.0236
Epoch 7/20
[1m1661/1661[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 12ms/step - accuracy: 0.7068 - loss: 1.0320 - val_accuracy: 0.7124 - val_loss: 1.0204
Epoch 8/20
[1m1661/1661[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 10ms/step - accuracy: 0.7074 - loss: 1.0298 - val_accuracy: 0.7111 - val_loss: 1.0198
Epoch 9/20
[1m1661/1661[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 7ms/step - accuracy: 0.7079 - loss: 1.0275 - val_accuracy: 0.7116 - val_loss: 1.0169
Epoch 10/20
[1m1661/1661[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 7ms/step - accuracy: 0.7086 - loss: 1.0259 - val_accuracy: 0.7122 - val_loss: 1.0172
Epoch 11/20
[1m

[1m1427/1661[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m1:17[0m 332ms/step - accuracy: 0.6691 - loss: 1.3375

[1m1661/1661[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m795s[0m 348ms/step - accuracy: 0.6723 - loss: 1.3112 - val_accuracy: 0.7035 - val_loss: 1.0661
Epoch 2/20
[1m1517/1661[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m42s[0m 293ms/step - accuracy: 0.7052 - loss: 1.0501

e4d0a8c3ccaf2534e9ab91c659e420ba5114533f