In [None]:
# Install required packages
# Only tensorflow is pinned (2.12.0); pandas and gdown are installed at
# whatever version pip resolves when this cell runs.
# NOTE(review): gdown is not used by any cell visible in this section - confirm
# it is still needed further down the notebook.
!pip install tensorflow==2.12.0
!pip install pandas
!pip install gdown

Collecting tensorflow==2.12.0
  Downloading tensorflow-2.12.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting gast<=0.4.0,>=0.2.1 (from tensorflow==2.12.0)
  Downloading gast-0.4.0-py3-none-any.whl.metadata (1.1 kB)
Collecting keras<2.13,>=2.12.0 (from tensorflow==2.12.0)
  Downloading keras-2.12.0-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting numpy<1.24,>=1.22 (from tensorflow==2.12.0)
  Downloading numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Collecting protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3 (from tensorflow==2.12.0)
  Downloading protobuf-4.25.6-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting tensorboard<2.13,>=2.12 (from tensorflow==2.12.0)
  Downloading tensorboard-2.12.3-py3-none-any.whl.metadata (1.8 kB)
Collecting tensorflow-estimator<2.13,>=2.12.0 (from tensorflow==2.12.0)
  Downloading tensorflow_estimator-2.12.0-py2.py3-none-an



Question 1

In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, GRU, SimpleRNN, Embedding, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [None]:
# Load Hindi data
def load_hindi_data(data_dir="/content"):
    """Read the Hindi transliteration train/dev/test TSV files.

    Each file is tab-separated without a header row and is read into the
    columns ``devanagari``, ``latin`` and ``count``.

    Args:
        data_dir: Directory containing ``hi.translit.sampled.{train,dev,test}.tsv``.
            Defaults to ``/content``, the location the notebook originally
            hard-coded.

    Returns:
        Tuple ``(train_data, dev_data, test_data)`` of pandas DataFrames.
    """
    columns = ['devanagari', 'latin', 'count']

    def read_split(split):
        path = os.path.join(data_dir, f"hi.translit.sampled.{split}.tsv")
        return pd.read_csv(
            path, sep='\t', header=None, names=columns,
            # pandas' default NA sentinels include strings such as "nan",
            # "null" and "NA". In a transliteration corpus those are ordinary
            # words, so only a genuinely empty field is treated as missing.
            keep_default_na=False, na_values=[''])

    return read_split('train'), read_split('dev'), read_split('test')

# Read the three splits into DataFrames
(train_data,
 dev_data,
 test_data) = load_hindi_data()

# Preview the first rows of the training split
print("Sample training data:", train_data.head(), sep="\n")

Sample training data:
  devanagari     latin  count
0         अं        an      3
1    अंकगणित  ankganit      3
2       अंकल     uncle      4
3      अंकुर     ankur      4
4     अंकुरण   ankuran      3


Preprocess the data

In [None]:
def preprocess_data(train_data, dev_data, test_data, max_sequence_length=20):
    """Convert the transliteration frames into padded arrays for a seq2seq model.

    Args:
        train_data, dev_data, test_data: DataFrames with 'latin' (source) and
            'devanagari' (target) columns. Rows containing a missing value in
            any column are dropped.
        max_sequence_length: Maximum number of characters kept per word. Longer
            words are cut at the end, so the start of the word (and the
            '<start>'/'<end>' markers on the target side) always survive.

    Returns:
        An 11-tuple
        ``(X_train, decoder_input_train, decoder_output_train,
           X_dev, decoder_input_dev, decoder_output_dev,
           X_test, decoder_input_test, decoder_output_test,
           latin_tokenizer, devanagari_tokenizer)`` where

        * ``X_*`` has shape ``(n, max_sequence_length)``: Latin character ids,
          zero-padded on the right.
        * ``decoder_input_*`` has shape ``(n, max_sequence_length + 1)``:
          target ids beginning with '<start>'.
        * ``decoder_output_*`` has shape
          ``(n, max_sequence_length + 1, vocab_size)``: float32 one-hot
          encoding of the same target shifted left by one step.
    """
    # Drop incomplete rows. Only the local names are rebound; the caller's
    # frames are left untouched.
    train_data = train_data.dropna()
    dev_data = dev_data.dropna()
    test_data = test_data.dropna()

    # The vocabulary is built over all three splits so that every character
    # seen anywhere has an id. Cast to str in case a column was parsed as numeric.
    all_latin = pd.concat(
        [train_data['latin'], dev_data['latin'], test_data['latin']]).astype(str)
    all_devanagari = pd.concat(
        [train_data['devanagari'], dev_data['devanagari'], test_data['devanagari']]).astype(str)

    # Character-level tokenizers; ids start at 1, 0 is reserved for padding
    latin_tokenizer = Tokenizer(char_level=True, lower=False)
    latin_tokenizer.fit_on_texts(all_latin)

    devanagari_tokenizer = Tokenizer(char_level=True, lower=False)
    devanagari_tokenizer.fit_on_texts(all_devanagari)

    # Append the decoder markers after the real characters. Only word_index is
    # extended (index_word is not), so Tokenizer.sequences_to_texts will not
    # render these two ids.
    start_id = len(devanagari_tokenizer.word_index) + 1
    devanagari_tokenizer.word_index['<start>'] = start_id
    devanagari_tokenizer.word_index['<end>'] = start_id + 1

    def process_sequences(texts, tokenizer, max_len):
        """Encode source words as right-padded id rows of length max_len."""
        texts = [str(text) for text in texts]
        seq = tokenizer.texts_to_sequences(texts)
        # pad_sequences truncates from the FRONT by default, which would drop
        # the first characters of a long word; cut the tail instead.
        return pad_sequences(seq, maxlen=max_len, padding='post', truncating='post')

    def process_target_sequences(texts, tokenizer, max_len):
        """Encode target words as <start> + chars + <end>, right-padded to max_len + 2."""
        texts = [str(text) for text in texts]
        seq = tokenizer.texts_to_sequences(texts)
        start = tokenizer.word_index['<start>']
        end = tokenizer.word_index['<end>']
        # Clip the characters BEFORE wrapping so an over-long word can never
        # push the <start> (or <end>) marker out of the fixed-size window.
        seq = [[start] + s[:max_len] + [end] for s in seq]
        return pad_sequences(seq, maxlen=max_len + 2, padding='post')  # +2 for start/end tokens

    # Input (Latin) sequences
    X_train = process_sequences(train_data['latin'], latin_tokenizer, max_sequence_length)
    X_dev = process_sequences(dev_data['latin'], latin_tokenizer, max_sequence_length)
    X_test = process_sequences(test_data['latin'], latin_tokenizer, max_sequence_length)

    # Target (Devanagari) sequences with start/end tokens
    y_train = process_target_sequences(train_data['devanagari'], devanagari_tokenizer, max_sequence_length)
    y_dev = process_target_sequences(dev_data['devanagari'], devanagari_tokenizer, max_sequence_length)
    y_test = process_target_sequences(test_data['devanagari'], devanagari_tokenizer, max_sequence_length)

    # Teacher forcing: the decoder sees positions [0, T-1) and must predict
    # positions [1, T), i.e. the same sequence shifted by one step.
    decoder_input_train, decoder_output_train = y_train[:, :-1], y_train[:, 1:]
    decoder_input_dev, decoder_output_dev = y_dev[:, :-1], y_dev[:, 1:]
    decoder_input_test, decoder_output_test = y_test[:, :-1], y_test[:, 1:]

    vocab_size = len(devanagari_tokenizer.word_index) + 1  # +1 for 0 padding

    def one_hot_encode(sequences, vocab_size):
        """One-hot encode an (n, T) id array into (n, T, vocab_size) float32."""
        # Row lookup in an identity matrix encodes the whole array at once
        # instead of looping over rows in Python.
        return np.eye(vocab_size, dtype='float32')[sequences]

    decoder_output_train = one_hot_encode(decoder_output_train, vocab_size)
    decoder_output_dev = one_hot_encode(decoder_output_dev, vocab_size)
    decoder_output_test = one_hot_encode(decoder_output_test, vocab_size)

    return (X_train, decoder_input_train, decoder_output_train,
            X_dev, decoder_input_dev, decoder_output_dev,
            X_test, decoder_input_test, decoder_output_test,
            latin_tokenizer, devanagari_tokenizer)

# Build the model-ready arrays and both tokenizers from the three splits
(
    X_train, decoder_input_train, decoder_output_train,
    X_dev, decoder_input_dev, decoder_output_dev,
    X_test, decoder_input_test, decoder_output_test,
    latin_tokenizer, devanagari_tokenizer,
) = preprocess_data(train_data=train_data, dev_data=dev_data, test_data=test_data)

Build and train the model

In [None]:
def build_seq2seq_model(input_vocab_size, target_vocab_size, embedding_dim=64,
                       hidden_units=128, cell_type='lstm'):
    """Assemble an encoder-decoder network plus its step-wise inference models.

    Args:
        input_vocab_size: Size of the source embedding table.
        target_vocab_size: Size of the target embedding table and of the
            softmax output layer.
        embedding_dim: Width of both embedding layers.
        hidden_units: Number of units in the encoder and decoder RNN.
        cell_type: 'lstm' or 'gru'; any other value selects SimpleRNN.

    Returns:
        Tuple ``(model, encoder_model, decoder_model)``:
        * ``model`` maps [encoder ids, decoder ids] to per-step softmax outputs.
        * ``encoder_model`` maps encoder ids to the encoder's final state(s).
        * ``decoder_model`` maps [decoder ids, *states] to
          [softmax outputs, *new states] and shares the decoder layers of ``model``.
    """
    # Pick the recurrent layer class once; everything that is not 'lstm'/'gru'
    # falls back to SimpleRNN.
    rnn_class = LSTM if cell_type == 'lstm' else GRU if cell_type == 'gru' else SimpleRNN

    # ----- Encoder: only the final state(s) are kept -----
    encoder_inputs = Input(shape=(None,))
    encoder_embedded = Embedding(input_vocab_size, embedding_dim)(encoder_inputs)
    encoder_rnn = rnn_class(hidden_units, return_state=True)
    # LSTM yields (output, h, c); GRU and SimpleRNN yield (output, h)
    _, *encoder_states = encoder_rnn(encoder_embedded)

    # ----- Decoder: initialised from the encoder state(s) -----
    decoder_inputs = Input(shape=(None,))
    decoder_embedded = Embedding(target_vocab_size, embedding_dim)(decoder_inputs)
    decoder_rnn = rnn_class(hidden_units, return_sequences=True, return_state=True)
    training_sequence, *_ = decoder_rnn(decoder_embedded, initial_state=encoder_states)

    decoder_dense = Dense(target_vocab_size, activation='softmax')
    training_probs = decoder_dense(training_sequence)

    # ----- Training model -----
    model = Model([encoder_inputs, decoder_inputs], training_probs)

    # ----- Inference models -----
    encoder_model = Model(encoder_inputs, encoder_states)

    # One placeholder per recurrent state so the states can be fed back in
    # between decoding steps.
    decoder_state_inputs = [Input(shape=(hidden_units,)) for _ in encoder_states]
    step_sequence, *decoder_states = decoder_rnn(
        decoder_embedded, initial_state=decoder_state_inputs)
    step_probs = decoder_dense(step_sequence)
    decoder_model = Model(
        [decoder_inputs] + decoder_state_inputs,
        [step_probs] + decoder_states)

    return model, encoder_model, decoder_model

# Vocabulary sizes: one extra slot because id 0 is the padding value
input_vocab_size = 1 + len(latin_tokenizer.word_index)
target_vocab_size = 1 + len(devanagari_tokenizer.word_index)

# Create the training model together with its inference encoder/decoder
model, encoder_model, decoder_model = build_seq2seq_model(
    input_vocab_size=input_vocab_size,
    target_vocab_size=target_vocab_size,
    embedding_dim=64,
    hidden_units=128,
    cell_type='lstm',
)

# Targets are one-hot encoded, hence categorical (not sparse) cross-entropy
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
# Fit with teacher forcing: the decoder is fed the gold target shifted by one
# step, and the dev split is scored after every epoch.
history = model.fit(
    x=[X_train, decoder_input_train],
    y=decoder_output_train,
    validation_data=([X_dev, decoder_input_dev], decoder_output_dev),
    epochs=30,
    batch_size=64,
    verbose=1,
)

Epoch 1/30
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 10ms/step - accuracy: 0.7084 - loss: 1.2910 - val_accuracy: 0.7643 - val_loss: 0.8593
Epoch 2/30
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.7631 - loss: 0.8641 - val_accuracy: 0.7876 - val_loss: 0.7434
Epoch 3/30
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 8ms/step - accuracy: 0.7955 - loss: 0.7181 - val_accuracy: 0.8392 - val_loss: 0.5507
Epoch 4/30
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - accuracy: 0.8445 - loss: 0.5300 - val_accuracy: 0.8764 - val_loss: 0.4120
Epoch 5/30
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 9ms/step - accuracy: 0.8813 - loss: 0.3946 - val_accuracy: 0.8999 - val_loss: 0.3268
Epoch 6/30
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 9ms/step - accuracy: 0.9044 - loss: 0.3134 - val_accuracy: 0.9146 - val_loss: 0.2823
Epoch 7/30
[1m691/691

In [None]:
def decode_sequence(input_seq, encoder_model, decoder_model,
                   latin_tokenizer, devanagari_tokenizer, max_length=20):
    """Greedily decode one encoded source word into a target string.

    Args:
        input_seq: Array of source character ids for a single example
            (batch dimension of 1).
        encoder_model: Model whose ``predict`` returns the initial decoder state(s).
        decoder_model: Model whose ``predict`` takes ``[token, *states]`` and
            returns ``[token_probabilities, *new_states]``.
        latin_tokenizer: Unused; kept so existing callers keep working.
        devanagari_tokenizer: Tokenizer whose ``word_index`` contains the target
            characters plus '<start>' and '<end>'.
        max_length: Maximum number of decoding steps.

    Returns:
        The decoded characters joined into one string, without the
        '<start>'/'<end>' markers.
    """
    # Encode input. A single-state encoder may hand back a bare array rather
    # than a list, so normalise before concatenating with the token input.
    states_value = encoder_model.predict(input_seq, verbose=0)
    if isinstance(states_value, (list, tuple)):
        states_value = list(states_value)
    else:
        states_value = [states_value]

    # Seed the decoder with the <start> token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = devanagari_tokenizer.word_index['<start>']

    reverse_target_char_index = {i: char for char, i in devanagari_tokenizer.word_index.items()}

    decoded_chars = []
    for _ in range(max_length):
        # verbose=0: one progress bar per generated character would otherwise
        # flood the cell output.
        output_tokens, *states_value = decoder_model.predict(
            [target_seq] + states_value, verbose=0)

        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        # Id 0 is padding and has no entry in word_index; a plain dict lookup
        # would raise KeyError, so treat it like <end> and stop.
        sampled_char = reverse_target_char_index.get(sampled_token_index)
        if sampled_char is None or sampled_char == '<end>':
            break

        # A stray <start> in the middle of the output is fed back but not emitted
        if sampled_char != '<start>':
            decoded_chars.append(sampled_char)

        # Feed the chosen token back in as the next decoder input
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

    return ''.join(decoded_chars)

# Score the test split with teacher forcing. The 'accuracy' metric is averaged
# over every decoder position, zero-padded ones included, so it is a
# per-position figure rather than a whole-word score.
test_loss, test_acc = model.evaluate(
    x=[X_test, decoder_input_test], y=decoder_output_test, verbose=0)
print(f"Test Accuracy: {test_acc:.4f}")

# Greedy-decode the first five test words and print them next to the reference
for i in range(5):
    input_seq = X_test[i:i+1]  # slice (not index) keeps the batch dimension
    decoded = decode_sequence(
        input_seq, encoder_model, decoder_model,
        latin_tokenizer, devanagari_tokenizer)

    # sequences_to_texts joins the characters with spaces
    original_input = latin_tokenizer.sequences_to_texts([X_test[i]])[0]
    original_target = devanagari_tokenizer.sequences_to_texts([decoder_input_test[i]])[0]

    print(
        f"\nSample {i+1}:",
        f"Input (Latin): {original_input}",
        f"Target (Devanagari): {original_target}",
        f"Predicted (Devanagari): {decoded}",
        sep="\n",
    )

Test Accuracy: 0.9457
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 157ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 120ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step

Sample 1:
Input (Latin): a n k
Target (Devanagari): अ ं क
Predicted (Devanagari): ऐंक
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step

Sample 2:
Input (Latin): a n k a
Target (Devanagari): अ ं क
Predicted (Devanagari): अंका
[1m1/1

Question 2