In [3]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input
from tensorflow.keras.models import Model

# Load the dataset.
# Each TSV row is: Devanagari word, Latin romanisation, count (no header row).
# keep_default_na=False stops pandas from parsing literal words such as
# "nan", "null" or "NA" as missing values; without it those words would be
# replaced by NaN and later stringified to 'nan', corrupting the data.
train_data = pd.read_csv('/content/hi.translit.sampled.train.tsv', sep='\t', header=None,
                         names=['devanagari', 'latin', 'count'], keep_default_na=False)
dev_data = pd.read_csv('/content/hi.translit.sampled.dev.tsv', sep='\t', header=None,
                       names=['devanagari', 'latin', 'count'], keep_default_na=False)
test_data = pd.read_csv('/content/hi.translit.sampled.test.tsv', sep='\t', header=None,
                        names=['devanagari', 'latin', 'count'], keep_default_na=False)

# Preprocess: force both text columns to str in every split (test included),
# so cells that pandas inferred as numeric can still be iterated character by
# character downstream.
for _split in (train_data, dev_data, test_data):
    for _column in ('latin', 'devanagari'):
        _split[_column] = _split[_column].astype(str)

# Character inventories, taken from the training split only and sorted so the
# index assignment is reproducible between runs.
latin_vocab = sorted({char for word in train_data['latin'] for char in word})
devanagari_vocab = sorted({char for word in train_data['devanagari'] for char in word})

# Character <-> index tables. Indices start at 1; index 0 is never assigned
# to a character.
latin_char_to_idx = {char: idx for idx, char in enumerate(latin_vocab, start=1)}
devanagari_char_to_idx = {char: idx for idx, char in enumerate(devanagari_vocab, start=1)}
idx_to_devanagari = {idx: char for char, idx in devanagari_char_to_idx.items()}

# One extra slot in each vocabulary size accounts for index 0.
latin_vocab_size = len(latin_char_to_idx) + 1
devanagari_vocab_size = len(devanagari_char_to_idx) + 1

# Longest training word in either script, plus one position.
longest_latin = train_data['latin'].map(len).max()
longest_devanagari = train_data['devanagari'].map(len).max()
max_seq_length = max(longest_latin, longest_devanagari) + 1

def text_to_indices(text, char_to_idx, max_length):
    """Encode *text* as a list of exactly *max_length* integer indices.

    Args:
        text: The string to encode, one index per character.
        char_to_idx: Mapping from character to its integer index.
            Characters missing from the mapping are encoded as 0.
        max_length: Length of the returned list.

    Returns:
        A list of ``max_length`` ints. Shorter texts are right-padded with
        0; longer texts are truncated to their first ``max_length``
        characters so that every encoded row has the same length and can
        be stacked into a rectangular array.
    """
    # Truncate first: without this, a text longer than max_length produced an
    # over-long row (the pad count went negative and added nothing).
    indices = [char_to_idx.get(char, 0) for char in text[:max_length]]
    indices.extend([0] * (max_length - len(indices)))
    return indices

def prepare_data(data):
    """Turn a split's text columns into the three arrays the model consumes.

    Args:
        data: Frame with 'latin' and 'devanagari' text columns.

    Returns:
        A tuple ``(encoder_input, decoder_input, decoder_target)``.
        ``encoder_input`` holds the encoded 'latin' words and
        ``decoder_target`` the encoded 'devanagari' words, each padded to
        ``max_seq_length``. ``decoder_input`` is ``decoder_target`` shifted
        one position to the right with a 0 in the first column.
    """
    def encode_column(words, char_to_idx):
        # One fixed-length row of indices per word.
        return np.array([text_to_indices(word, char_to_idx, max_seq_length) for word in words])

    encoder_input = encode_column(data['latin'], latin_char_to_idx)
    decoder_target = encode_column(data['devanagari'], devanagari_char_to_idx)

    # Teacher forcing: at step t the decoder is fed the target of step t-1.
    decoder_input = np.zeros_like(decoder_target)
    decoder_input[:, 1:] = decoder_target[:, :-1]

    return encoder_input, decoder_input, decoder_target

# Encode the train, dev and test splits (in that order) into model arrays.
train_arrays, dev_arrays, test_arrays = (
    prepare_data(split) for split in (train_data, dev_data, test_data)
)
X_train, y_train_input, y_train_target = train_arrays
X_dev, y_dev_input, y_dev_target = dev_arrays
X_test, y_test_input, y_test_target = test_arrays

# Model parameters
embedding_dim = 256  # size of each character embedding vector
hidden_dim = 512     # number of LSTM units in both encoder and decoder

# Build training model
# Encoder: embed the Latin character indices and run an LSTM. Only the final
# hidden and cell states are kept; encoder_outputs is not used below.
encoder_inputs = Input(shape=(None,))
x = Embedding(latin_vocab_size, embedding_dim)(encoder_inputs)
encoder_outputs, state_h, state_c = LSTM(hidden_dim, return_state=True)(x)
encoder_states = [state_h, state_c]

# Decoder: embed the (shifted) Devanagari indices and run an LSTM that starts
# from the encoder's final states. decoder_lstm and decoder_dense are kept as
# layer objects so the inference graph can reuse their trained weights.
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(devanagari_vocab_size, embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(hidden_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
# Per-timestep softmax over the Devanagari vocabulary (including index 0).
decoder_dense = Dense(devanagari_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# NOTE(review): neither Embedding sets mask_zero, so padded positions
# presumably count toward the reported loss and accuracy — confirm before
# comparing the accuracy figures with other models.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Train with teacher forcing: the decoder sees the shifted targets as input.
# The dev split is evaluated at the end of every epoch.
model.fit(
    x=[X_train, y_train_input],
    y=y_train_target,
    validation_data=([X_dev, y_dev_input], y_dev_target),
    epochs=10,
    batch_size=64,
)

# Build inference models
# Encoder model: maps an encoded Latin sequence to the LSTM's final [h, c].
encoder_model = Model(encoder_inputs, encoder_states)

# At inference time the decoder's recurrent state is passed in explicitly so
# it can be carried from one decoding step to the next.
decoder_state_input_h = Input(shape=(hidden_dim,))
decoder_state_input_c = Input(shape=(hidden_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# decoder_embedding is the embedded-output *tensor* built from decoder_inputs
# in the training graph (not a layer object), so it is reused as-is here
# rather than being called on decoder_inputs again.
decoder_embedding2 = decoder_embedding
# Same trained decoder_lstm / decoder_dense layers, now seeded from the
# state inputs instead of the encoder states.
decoder_outputs2, state_h2, state_c2 = decoder_lstm(decoder_embedding2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)


# Decoder model: (token indices, h, c) -> (token probabilities, new h, new c).
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2
)

# Greedy decoding
def decode_sequence(input_seq):
    """Greedily decode one encoded Latin word into a Devanagari string.

    Args:
        input_seq: Array holding a single encoded input sequence, as fed to
            ``encoder_model`` (one row of character indices).

    Returns:
        The decoded string. Decoding stops at the first step whose most
        probable index is 0 or has no entry in ``idx_to_devanagari``, or
        after ``max_seq_length`` steps, whichever comes first.
    """
    # verbose=0: predict() otherwise prints a progress bar on every call,
    # i.e. once per decoded character.
    # Unpacking (instead of list concatenation) works whether predict()
    # returns the two state arrays as a list or a tuple.
    h, c = encoder_model.predict(input_seq, verbose=0)

    # The first decoder input is index 0, matching the leading 0 column of
    # the shifted decoder inputs used during training.
    target_seq = np.zeros((1, 1))
    decoded_chars = []

    for _ in range(max_seq_length):
        output_tokens, h, c = decoder_model.predict([target_seq, h, c], verbose=0)
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_char = idx_to_devanagari.get(sampled_token_index, '')

        if sampled_token_index == 0 or not sampled_char:
            break

        decoded_chars.append(sampled_char)
        # Feed the chosen character back in as the next decoder input.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

    return ''.join(decoded_chars)

# Predict on test: transliterate the first ten Latin words and print them.
for latin in test_data['latin'][:10]:
    encoded_word = np.array(text_to_indices(latin, latin_char_to_idx, max_seq_length))
    # Add the leading batch axis: one sequence per call.
    input_seq = np.expand_dims(encoded_word, axis=0)
    decoded = decode_sequence(input_seq)
    print(f"Input: {latin} -> Predicted: {decoded}")


Epoch 1/10
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 17ms/step - accuracy: 0.7296 - loss: 1.0974 - val_accuracy: 0.8114 - val_loss: 0.6642
Epoch 2/10
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 16ms/step - accuracy: 0.8287 - loss: 0.5897 - val_accuracy: 0.9042 - val_loss: 0.3156
Epoch 3/10
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 17ms/step - accuracy: 0.9170 - loss: 0.2680 - val_accuracy: 0.9362 - val_loss: 0.2033
Epoch 4/10
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 17ms/step - accuracy: 0.9503 - loss: 0.1592 - val_accuracy: 0.9449 - val_loss: 0.1766
Epoch 5/10
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 17ms/step - accuracy: 0.9639 - loss: 0.1145 - val_accuracy: 0.9478 - val_loss: 0.1673
Epoch 6/10
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 17ms/step - accuracy: 0.9727 - loss: 0.0875 - val_accuracy: 0.9498 - val_loss: 0.1634
Epoch 7/10
[1m6