# **Importing required libraries**

In [1]:
import numpy as np
import csv
import pandas as pd
import random
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import LSTM, SimpleRNN, GRU, Embedding, Dense, TimeDistributed, Concatenate, AdditiveAttention

2025-05-19 06:46:21.428799: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747637181.655470      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747637181.717806      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
!pip install wandb
import wandb

try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    api_key = user_secrets.get_secret("wandb_api")
    wandb.login(key=api_key)
    anony = None
except:
    anony = "must"
    print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')



[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mduenchombo1[0m ([33mduenchombo1-indian-institute-of-technology-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


# **Train,Test and Dev Data uploaded in kaggle**

In [None]:
tsv_file = open("/kaggle/input/hi-translit/hi.translit.sampled.train.tsv")
read_tsv = csv.reader(tsv_file, delimiter="\t")

In [None]:
val_tsv_file = open("/kaggle/input/hi-translit/hi.translit.sampled.dev.tsv")
val_read_tsv = csv.reader(val_tsv_file, delimiter="\t")

# **Processing training and validation data**

In [5]:
# Training data
devnagri = []
english = []

for i in read_tsv:
    devnagri.append(i[0])
    english.append(i[1])

devnagri = np.array(devnagri)
english = np.array(english)

# Validation data
val_devnagri = []
val_english = []

for i in val_read_tsv:
    val_devnagri.append(i[0])
    val_english.append(i[1])

val_devnagri = np.array(val_devnagri)
val_english = np.array(val_english)

In [6]:
for i in range(devnagri.shape[0]):
    devnagri[i] = "\t" + devnagri[i] + "\n"

for i in range(val_devnagri.shape[0]):
    val_devnagri[i] = "\t" + val_devnagri[i] + "\n"

In [7]:
# Getting input and target language characters

# Training set
english_characters = set()
devnagri_characters = set()

for word in english:
    for char in word:
        if char not in english_characters:
            english_characters.add(char)

for word in devnagri:
    for char in word:
        if char not in devnagri_characters:
            devnagri_characters.add(char)

# Validation set
v_english_characters = set()
v_devnagri_characters = set()

for word in val_english:
    for char in word:
        if char not in v_english_characters:
            v_english_characters.add(char)

for word in val_devnagri:
    for char in word:
        if char not in v_devnagri_characters:
            v_devnagri_characters.add(char)

In [8]:
english_characters = sorted(list(english_characters))
devnagri_characters = sorted(list(devnagri_characters))

num_encoder_tokens = len(english_characters)
num_decoder_tokens = len(devnagri_characters)

max_encoder_seq_length = max([len(txt) for txt in english])
max_decoder_seq_length = max([len(txt) for txt in devnagri])

print("Number of samples:", len(english))
print("Number of unique input tokens:", num_encoder_tokens)
print("Number of unique output tokens:", num_decoder_tokens)
print("Max sequence length for inputs:", max_encoder_seq_length)
print("Max sequence length for outputs:", max_decoder_seq_length)

Number of samples: 44204
Number of unique input tokens: 26
Number of unique output tokens: 65
Max sequence length for inputs: 20
Max sequence length for outputs: 19


# **Preparing Encoder and Decoder Inputs**

In [9]:
# Preparing train encoder and decoder inputs

input_token_index = dict([(char, i) for i, char in enumerate(english_characters)])
target_token_index = dict([(char, i) for i, char in enumerate(devnagri_characters)])

reverse_input_char_index = dict((i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict((i, char) for char, i in target_token_index.items())

encoder_input_data = np.zeros((len(english), max_encoder_seq_length), dtype="float32")
decoder_input_data = np.zeros((len(english), max_decoder_seq_length), dtype="float32")
decoder_target_data = np.zeros((len(english), max_decoder_seq_length, num_decoder_tokens), dtype="float32")

for i, (english, devnagri) in enumerate(zip(english, devnagri)):
    for t, char in enumerate(english):
        encoder_input_data[i, t] = input_token_index[char]

    for t, char in enumerate(devnagri):
        decoder_input_data[i, t] = target_token_index[char]
        if t > 0:
            # decoder_target_data will be ahead by one timestep and will not include the start character.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.0

In [10]:
# Preparing validation encoder and decoder inputs

encoder_val_input_data = np.zeros((len(val_english), max_encoder_seq_length), dtype="float32")
decoder_val_input_data = np.zeros((len(val_english), max_decoder_seq_length), dtype="float32")
decoder_val_target_data = np.zeros((len(val_english), max_decoder_seq_length, num_decoder_tokens), dtype="float32")

for i, (e, d) in enumerate(zip(val_english, val_devnagri)):
    for t, char in enumerate(e):
        encoder_val_input_data[i, t] = input_token_index[char]

    for t, char in enumerate(d):
        decoder_val_input_data[i, t] =  target_token_index[char]
        if t > 0:
            # decoder_target_data will be ahead by one timestep and will not include the start character.
            decoder_val_target_data[i, t - 1, target_token_index[char]] = 1.0

# **Defining Seq2Seq Model**

In [11]:
def training(input_embedding_size, dp, cell_type, hidden_layer_size, num_encoder_layers, num_decoder_layers):

    # ENCODER

    encoder_inputs = Input(shape=(max_encoder_seq_length,))
    encoder_embedding = Embedding(num_encoder_tokens, input_embedding_size, trainable=True)(encoder_inputs)

    encoder_layers = []
    encoder_states = []
    if cell_type == 'RNN':
        encoder = SimpleRNN(hidden_layer_size, return_sequences=True, return_state=True, dropout = dp)
        encoder_layers.append(encoder)
        encoder_outputs, state_h = encoder(encoder_embedding)
        encoder_states.append([state_h])
        if num_encoder_layers > 1:
            encoder = SimpleRNN(hidden_layer_size,return_sequences=True,return_state=True, dropout = dp)
            encoder_layers.append(encoder)
            encoder_outputs, state_h2 = encoder(encoder_outputs)
            encoder_states.append([state_h2])
        if num_encoder_layers > 2:
            encoder = SimpleRNN(hidden_layer_size,return_sequences=True,return_state=True, dropout = dp)
            encoder_layers.append(encoder)
            encoder_outputs, state_h3 = encoder(encoder_outputs)
            encoder_states.append([state_h3])

    elif cell_type == 'GRU':
        encoder = GRU(hidden_layer_size, return_sequences=True, return_state=True, dropout = dp)
        encoder_layers.append(encoder)
        encoder_outputs, state_h = encoder(encoder_embedding)
        encoder_states.append([state_h])
        if num_encoder_layers > 1:
            encoder = GRU(hidden_layer_size,return_sequences=True,return_state=True, dropout = dp)
            encoder_layers.append(encoder)
            encoder_outputs, state_h2 = encoder(encoder_outputs)
            encoder_states.append([state_h2])
        if num_encoder_layers > 2:
            encoder = GRU(hidden_layer_size,return_sequences=True,return_state=True, dropout = dp)
            encoder_layers.append(encoder)
            encoder_outputs, state_h3 = encoder(encoder_outputs)
            encoder_states.append([state_h3])

    else:
        encoder = LSTM(hidden_layer_size, return_sequences=True, return_state=True, dropout = dp)
        encoder_layers.append(encoder)
        encoder_outputs, state_h, state_c = encoder(encoder_embedding)
        encoder_states.append([state_h, state_c])
        if num_encoder_layers > 1:
            encoder = LSTM(hidden_layer_size,return_sequences=True,return_state=True, dropout = dp)
            encoder_layers.append(encoder)
            encoder_outputs, state_h2, state_c2 = encoder(encoder_outputs)
            encoder_states.append([state_h2, state_c2])
        if num_encoder_layers > 2:
            encoder = LSTM(hidden_layer_size,return_sequences=True,return_state=True, dropout = dp)
            encoder_layers.append(encoder)
            encoder_outputs, state_h3, state_c3 = encoder(encoder_outputs)
            encoder_states.append([state_h3, state_c3])


    # DECODER

    decoder_inputs = Input(shape=(max_decoder_seq_length,))
    decoder_embedding = Embedding(num_decoder_tokens, input_embedding_size, trainable=True)(decoder_inputs)

    # We set up our decoder to return full output sequences, and to return internal states as well.
    # We don't use the return states in the training model, but we will use them in inference.

    decoder_layers = []
    if cell_type == 'RNN':
        decoder_RNN = SimpleRNN(hidden_layer_size, return_sequences=True, return_state=True, dropout = dp)
        decoder_layers.append(decoder_RNN)
        decoder_outputs, _ = decoder_RNN(decoder_embedding, initial_state=encoder_states[0])
        if num_decoder_layers > 1:
            decoder_RNN = SimpleRNN(hidden_layer_size, return_sequences=True, return_state=True, dropout = dp)
            decoder_layers.append(decoder_RNN)
            decoder_outputs, _  = decoder_RNN(decoder_outputs, initial_state=encoder_states[1])
        if num_decoder_layers > 2:
            decoder_RNN = SimpleRNN(hidden_layer_size, return_sequences=True, return_state=True, dropout = dp)
            decoder_layers.append(decoder_RNN)
            decoder_outputs, _  = decoder_RNN(decoder_outputs, initial_state=encoder_states[2])

    elif cell_type == 'GRU':
        decoder_GRU = GRU(hidden_layer_size, return_sequences=True, return_state=True, dropout = dp)
        decoder_layers.append(decoder_GRU)
        decoder_outputs, _ = decoder_GRU(decoder_embedding, initial_state=encoder_states[0])
        if num_decoder_layers > 1:
            decoder_GRU = GRU(hidden_layer_size, return_sequences=True, return_state=True, dropout = dp)
            decoder_layers.append(decoder_GRU)
            decoder_outputs, _  = decoder_GRU(decoder_outputs, initial_state=encoder_states[1])
        if num_decoder_layers > 2:
            decoder_GRU = GRU(hidden_layer_size, return_sequences=True, return_state=True, dropout = dp)
            decoder_layers.append(decoder_GRU)
            decoder_outputs, _  = decoder_GRU(decoder_outputs, initial_state=encoder_states[2])

    else:
        decoder_lstm = LSTM(hidden_layer_size, return_sequences=True, return_state=True, dropout = dp)
        decoder_layers.append(decoder_lstm)
        decoder_outputs, _ , _ = decoder_lstm(decoder_embedding, initial_state=encoder_states[0])
        if num_decoder_layers > 1:
            decoder_lstm = LSTM(hidden_layer_size, return_sequences=True, return_state=True, dropout = dp)
            decoder_layers.append(decoder_lstm)
            decoder_outputs, _ , _  = decoder_lstm(decoder_outputs, initial_state=encoder_states[1])
        if num_decoder_layers > 2:
            decoder_lstm = LSTM(hidden_layer_size, return_sequences=True, return_state=True, dropout = dp)
            decoder_layers.append(decoder_lstm)
            decoder_outputs, _ , _  = decoder_lstm(decoder_outputs, initial_state=encoder_states[2])

    decoder_dense = TimeDistributed(Dense(num_decoder_tokens, activation="softmax"))
    decoder_outputs = decoder_dense(decoder_outputs)

    # MODEL
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

    return model, encoder_layers, decoder_layers

# **Inference model**

In [12]:
def inferencing(model,num_encoder_layers,num_decoder_layers,encoder_layers,decoder_layers,cell_type, hidden_layer_size):

    # ENCODER MODEL RECONSTRUCTION
    encoder_inputs = model.input[0]  # input_1
    encoder_states = []
    enc_emb = model.layers[2]     # embedding 1
    encoder_outputs = enc_emb(encoder_inputs)

    if cell_type == 'RNN' or cell_type =="GRU":
        for i in range(num_encoder_layers):
            encoder_outputs, state_h_enc = encoder_layers[i](encoder_outputs)
            encoder_states += [state_h_enc]
    else:
        for i in range(num_encoder_layers):
            encoder_outputs, state_h_enc, state_c_enc = encoder_layers[i](encoder_outputs)
            encoder_states += [state_h_enc, state_c_enc]

    encoder_model = Model(encoder_inputs, encoder_states + [encoder_outputs])


    # DECODER MODEL RECONSTRUCTION
    input_names = [["input_100","input_101"],["input_102","input_103"],["input_104","input_105"],"input_106"]

    decoder_inputs = model.input[1]       # input_2
    decoder_embedding = model.layers[3]   # embedding 2
    decoder_outputs = decoder_embedding(decoder_inputs)
    decoder_states = []
    decoder_states_inputs = []

    if cell_type == 'RNN' or cell_type =="GRU":
        for i in range(num_decoder_layers):
            decoder_states_inputs += [Input(shape=(hidden_layer_size,), name=input_names[i][0])]
        for i in range(num_decoder_layers):
            decoder_outputs, state_h_dec = decoder_layers[i](decoder_outputs, initial_state=decoder_states_inputs[i])
            decoder_states += [state_h_dec]
    else:
        for i in range(num_decoder_layers):
            decoder_states_inputs += [Input(shape=(hidden_layer_size,), name=input_names[i][0]), Input(shape=(hidden_layer_size,), name=input_names[i][1])]
        j = 0
        for i in range(num_decoder_layers):
            decoder_outputs, state_h_dec, state_c_dec = decoder_layers[i](decoder_outputs, initial_state=decoder_states_inputs[i+j:i+j+2])
            decoder_states += [state_h_dec , state_c_dec]
            j += 1

    decoder_dense = model.layers[4+2*num_encoder_layers]
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

    return encoder_model, decoder_model

In [13]:
def decode_sequence(input_seq,encoder_model,decoder_model):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    states_value = states_value[:-1]
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_token_index["\t"]
    stop_condition = False
    decoded_sentence = ""
    while not stop_condition:
        dec_ip = [target_seq]+states_value
        output_tokens = decoder_model.predict(dec_ip)
        sampled_token_index = np.argmax(output_tokens[0][0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char
        if sampled_char == "\n" or len(decoded_sentence) > max_decoder_seq_length:
            stop_condition = True

        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = output_tokens[1:]

    return decoded_sentence

# **Fitting the model**

In [14]:
# batch_size = 128
# epochs = 7
# input_embedding_size = 512
# hidden_layer_size = 256
# num_layers = 3
# num_encoder_layers = num_layers
# num_decoder_layers = num_layers
# dropout = 0.2
# cell_type = 'LSTM'

# # TRAIN
# model, encoder_layers, decoder_layers = training(input_embedding_size, dropout, cell_type, hidden_layer_size, num_encoder_layers, num_decoder_layers)

# # COMPILE
# model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# # FIT
# model.fit(
#     [encoder_input_data, decoder_input_data],
#     decoder_target_data,
#     batch_size=batch_size,
#     epochs=epochs,
#     shuffle = True,
#     # validation_data= ([encoder_val_input_data, decoder_val_input_data], decoder_val_target_data)
# )

# encoder_model, decoder_model = inferencing(model, num_encoder_layers, num_decoder_layers, encoder_layers, decoder_layers, cell_type, hidden_layer_size)
# correct = 0
# n = val_devnagri.shape[0]
# for i in range(n):
#     input = encoder_val_input_data[i:i+1]
#     output = decode_sequence(input,encoder_model, decoder_model)
#     if output.strip() == val_devnagri[i].strip():
#         correct += 1
# print("Validation accuracy : ", correct*100/n)

# **Hyperparameter Tuning**

In [15]:
sweep_config = {
    'method': 'bayes',
    'metric': {'goal': 'maximize', 'name': 'val_accuracy'},
    'parameters': {'input_embedding_size': {'values': [128, 256, 512]},
                   'hidden_layer_size': {'values': [128, 256, 512]},
                   'cell_type': {'values': ['LSTM', 'RNN', 'GRU']},
                   'num_layers': {'values': [1,2,3]},
                   'batch_size': {'values': [128,256,512]},
                   'dropout': {'values': [0.1, 0.2, 0.3, 0.4]}
                }}

In [16]:
def train():
    var1 = wandb.init()
    var2 = var1.config
    epochs = 10

    model, encoder_layers, decoder_layers = training(var2.input_embedding_size, var2.dropout, var2.cell_type , var2.hidden_layer_size, var2.num_layers, var2.num_layers)
    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
    model.fit(
        [encoder_input_data, decoder_input_data],
        decoder_target_data,
        batch_size=var2.batch_size,
        epochs=epochs,
        # callbacks=[WandbCallback()]
    )

    encoder_model, decoder_model = inferencing(model,var2.num_layers, var2.num_layers,encoder_layers,decoder_layers,var2.cell_type,var2.hidden_layer_size)
    correct = 0
    n = val_devnagri.shape[0]
    for i in range(n):
        input = encoder_val_input_data[i:i+1]
        output = decode_sequence(input,encoder_model, decoder_model)
        if output.strip() == val_devnagri[i].strip():
            correct += 1
    wandb.log({'val_accuracy' : correct*100/n})

In [None]:
sweep_id = wandb.sweep(sweep_config, project="Assignment_3")
# wandb.agent(sweep_id, train, count=28)
wandb.agent(sweep_id, train)

Create sweep with ID: 0wozbn0s
Sweep URL: https://wandb.ai/duenchombo1-indian-institute-of-technology-madras/Assignment_3/sweeps/0wozbn0s


[34m[1mwandb[0m: Agent Starting Run: li9v6102 with config:
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	hidden_layer_size: 128
[34m[1mwandb[0m: 	input_embedding_size: 256
[34m[1mwandb[0m: 	num_layers: 3


I0000 00:00:1747637225.264088      89 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


Epoch 1/10


I0000 00:00:1747637238.015178     124 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 28ms/step - accuracy: 0.0684 - loss: 1.2879
Epoch 2/10
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 28ms/step - accuracy: 0.0975 - loss: 1.0373
Epoch 3/10
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 28ms/step - accuracy: 0.1196 - loss: 0.9442
Epoch 4/10
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 28ms/step - accuracy: 0.1564 - loss: 0.8042
Epoch 5/10
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 28ms/step - accuracy: 0.1906 - loss: 0.6652
Epoch 6/10
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 28ms/step - accuracy: 0.2205 - loss: 0.5477
Epoch 7/10
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 28ms/step - accuracy: 0.2471 - loss: 0.4457
Epoch 8/10
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 28ms/step - accuracy: 0.2671 - loss: 0.3758
Epoch 9/10
[1m173/173[0m [32m━━━━━━━━━━