<a href="https://colab.research.google.com/github/deepthidornala/DL-Assignment-2/blob/main/Question_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, GRU, SimpleRNN, Dense, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Configuration class for flexible model architecture
class ModelConfig:
    """Hyperparameters for the character-level seq2seq model and its training.

    All arguments are keyword-only and default to the values that used to be
    hard-coded, so ``ModelConfig()`` yields the same configuration as before
    while ``ModelConfig(rnn_type='gru', hidden_size=128)`` no longer requires
    mutating attributes after construction.

    Args:
        embedding_dim: Size of the character embedding vectors.
        rnn_type: Recurrent cell to use: 'lstm', 'gru', or 'rnn'.
        hidden_size: Number of units in each recurrent layer.
        num_layers: Number of recurrent layers requested.
        batch_size: Mini-batch size used for training.
        epochs: Number of training epochs.
        validation_split: Fraction of the training data meant for validation.
    """

    def __init__(self, *, embedding_dim=128, rnn_type='lstm', hidden_size=256,
                 num_layers=1, batch_size=64, epochs=30, validation_split=0.2):
        # Embedding dimensions
        self.embedding_dim = embedding_dim

        # RNN layer configuration
        self.rnn_type = rnn_type  # 'lstm', 'gru', or 'rnn'
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Training parameters
        self.batch_size = batch_size
        self.epochs = epochs
        self.validation_split = validation_split

# Load and preprocess data
def load_data(path):
    """Read a tab-separated transliteration file into (Latin, Devanagari) pairs.

    The file has no header row; column 0 is used as the Devanagari string and
    column 1 as the Latin string.

    Only genuinely empty fields are treated as missing. pandas' default NA
    parsing would also turn ordinary words such as "nan", "null" or "na" into
    NaN, and the subsequent ``dropna`` would silently discard those pairs.
    Every column is read as ``str`` so no value is coerced to a number.

    Args:
        path: Path of the TSV file to read.

    Returns:
        A list of ``(latin, devanagari)`` tuples, one per row that has no
        empty field.
    """
    df = pd.read_csv(
        path,
        sep='\t',
        header=None,
        dtype=str,
        keep_default_na=False,  # do not interpret "nan"/"null"/"NA" as missing
        na_values=[''],         # only an empty field counts as missing
    )
    df = df.dropna()
    return list(zip(df[1], df[0]))  # Latin, Devanagari

# Load train and test data
train_pairs = load_data('/content/hi.translit.sampled.train.tsv')
test_pairs = load_data('/content/hi.translit.sampled.test.tsv')

# Source side is lower-cased Latin text; target side is wrapped in a
# start marker ('\t') and an end marker ('\n') for the decoder.
input_texts = [latin.lower() for latin, _ in train_pairs]
target_texts = [''.join(('\t', native, '\n')) for _, native in train_pairs]

# Character vocabularies, sorted so the index assignment is deterministic
input_chars = sorted({ch for text in input_texts for ch in text})
target_chars = sorted({ch for text in target_texts for ch in text})

# Ids start at 1 because id 0 is reserved for padding
input_token_index = {ch: idx for idx, ch in enumerate(input_chars, start=1)}
target_token_index = {ch: idx for idx, ch in enumerate(target_chars, start=1)}

# Inverse lookups (id -> character)
reverse_input_char_index = {idx: ch for ch, idx in input_token_index.items()}
reverse_target_char_index = {idx: ch for ch, idx in target_token_index.items()}

# Longest sequence on each side determines the padded width
max_encoder_seq_length = max(map(len, input_texts))
max_decoder_seq_length = max(map(len, target_texts))

# Tokenize and pad sequences
def tokenize(texts, token_index, max_len):
    """Convert strings to fixed-width arrays of character ids.

    Each character is looked up in ``token_index``; characters that are not
    in the mapping become 0, the same id used for padding. Shorter sequences
    are padded with zeros at the end up to ``max_len``. Truncation of longer
    sequences is left to ``pad_sequences``' default behaviour.

    Args:
        texts: Iterable of strings to encode.
        token_index: Mapping from character to integer id.
        max_len: Width of the returned array.

    Returns:
        The result of ``pad_sequences`` over the encoded texts.
    """
    lookup = token_index.get
    encoded = []
    for text in texts:
        encoded.append([lookup(ch, 0) for ch in text])
    return pad_sequences(encoded, maxlen=max_len, padding='post')

encoder_input_data = tokenize(
    input_texts, input_token_index, max_encoder_seq_length
)
decoder_input_data = tokenize(
    target_texts, target_token_index, max_decoder_seq_length
)

# Teacher-forcing targets: the decoder input shifted left by one step,
# with a zero (padding id) appended in the last position.
decoder_target_data = np.pad(
    decoder_input_data[:, 1:], ((0, 0), (0, 1)),
    mode='constant', constant_values=0
)

# Hold out 20% of the examples for validation (fixed seed for repeatability)
enc_train, enc_val, dec_in_train, dec_in_val, dec_tgt_train, dec_tgt_val = train_test_split(
    encoder_input_data,
    decoder_input_data,
    decoder_target_data,
    random_state=42,
    test_size=0.2,
)

# Build the model
def build_model(config, num_encoder_tokens, num_decoder_tokens):
    """Assemble and compile the encoder-decoder training model.

    The encoder embeds the source ids and runs a single recurrent layer whose
    final state(s) seed the decoder's recurrent layer. The decoder output at
    each step is projected through a softmax over the target vocabulary.

    Args:
        config: Object providing ``embedding_dim``, ``rnn_type`` and
            ``hidden_size``.
        num_encoder_tokens: Input dimension of the encoder embedding.
        num_decoder_tokens: Input dimension of the decoder embedding and
            width of the softmax output.

    Returns:
        A compiled Keras ``Model`` taking ``[encoder_inputs, decoder_inputs]``.
    """
    # 'lstm' and 'gru' select their layers; any other value uses SimpleRNN.
    recurrent_layers = {'lstm': LSTM, 'gru': GRU}
    rnn_cls = recurrent_layers.get(config.rnn_type.lower(), SimpleRNN)

    # --- Encoder: embedding -> RNN, keeping only the final state(s) ---
    source_ids = Input(shape=(None,))
    source_embedded = Embedding(num_encoder_tokens, config.embedding_dim)(source_ids)
    encoder_layer = rnn_cls(config.hidden_size, return_state=True)
    encoder_result = encoder_layer(source_embedded)
    encoder_states = list(encoder_result[1:])

    # --- Decoder: embedding -> RNN initialised from the encoder state(s) ---
    target_ids = Input(shape=(None,))
    target_embedded = Embedding(num_decoder_tokens, config.embedding_dim)(target_ids)
    decoder_layer = rnn_cls(
        config.hidden_size, return_sequences=True, return_state=True
    )
    decoder_result = decoder_layer(target_embedded, initial_state=encoder_states)
    decoder_sequence = decoder_result[0]

    # --- Per-step distribution over target characters ---
    projection = Dense(num_decoder_tokens, activation='softmax')
    token_probabilities = projection(decoder_sequence)

    # Targets are integer ids, hence the sparse cross-entropy loss.
    seq2seq = Model([source_ids, target_ids], token_probabilities)
    seq2seq.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return seq2seq

# Initialize configuration
config = ModelConfig()

# Vocabulary sizes include one extra slot for the padding id 0
num_encoder_tokens = 1 + len(input_token_index)
num_decoder_tokens = 1 + len(target_token_index)

# Build and train model
model = build_model(config, num_encoder_tokens, num_decoder_tokens)
model.summary()

# Targets get a trailing axis of size 1 before being passed to fit()
history = model.fit(
    x=[enc_train, dec_in_train],
    y=dec_tgt_train[..., np.newaxis],
    batch_size=config.batch_size,
    epochs=config.epochs,
    validation_data=([enc_val, dec_in_val], dec_tgt_val[..., np.newaxis]),
)

# Build inference models
def build_inference_models(model, config):
    """Split a trained seq2seq model into encoder and decoder inference models.

    Layers are fetched from ``model.layers`` by position, so this relies on
    the layer ordering of the model passed in.
    NOTE(review): indices 3/4/5/6 are taken to be decoder embedding, encoder
    RNN, decoder RNN and dense layer respectively - confirm if the
    architecture changes.

    Args:
        model: The training model with two inputs (encoder ids, decoder ids).
        config: Object providing ``hidden_size`` for the state inputs.

    Returns:
        ``(encoder_model, decoder_model)``. The encoder model maps source ids
        to the recurrent state(s); the decoder model maps
        ``[decoder ids] + states`` to ``[token probabilities] + new states``.
    """
    decoder_embedding, encoder_rnn, decoder_rnn, decoder_dense = (
        model.layers[position] for position in (3, 4, 5, 6)
    )
    source_ids = model.input[0]
    target_ids = model.input[1]

    # Encoder: everything after the first output of the RNN layer is state.
    encoder_states = list(encoder_rnn.output[1:])
    encoder_model = Model(source_ids, encoder_states)

    # One placeholder per state tensor, so states can be fed back step by step.
    state_placeholders = [
        Input(shape=(config.hidden_size,)) for _ in encoder_states
    ]

    # Decoder: reuse the trained embedding, RNN and dense layers.
    target_embedded = decoder_embedding(target_ids)
    decoder_result = decoder_rnn(
        target_embedded, initial_state=state_placeholders
    )
    next_states = list(decoder_result[1:])
    token_probabilities = decoder_dense(decoder_result[0])

    decoder_model = Model(
        [target_ids] + state_placeholders,
        [token_probabilities] + next_states,
    )
    return encoder_model, decoder_model

# Derive the encoder and decoder inference models from the model fitted above;
# decode_sequence() consumes them to generate output one character at a time.
encoder_model, decoder_model = build_inference_models(model, config)

# Decode sequence function
def decode_sequence(input_seq, encoder_model, decoder_model,
                   target_token_index, reverse_target_char_index,
                   max_decoder_seq_length):
    """Greedily decode one input sequence into a target string.

    The encoder state(s) seed the decoder, which is fed the start marker
    ('\\t') and then its own most probable prediction at every step until it
    emits the end marker ('\\n') or the step budget is exhausted.

    Args:
        input_seq: Encoder input for a single example (batch of size 1).
        encoder_model: Object whose ``predict`` returns the encoder state(s),
            either as a list/tuple of arrays or as a single array.
        decoder_model: Object whose ``predict`` takes ``[target_seq] + states``
            and returns ``[token_probabilities, *new_states]``.
        target_token_index: Mapping from target character to id; must contain
            the start marker ``'\\t'``.
        reverse_target_char_index: Mapping from id to target character. Ids
            missing from it (such as the padding id) contribute no character.
        max_decoder_seq_length: Maximum number of decoding steps.

    Returns:
        The decoded string, without the start and end markers.
    """
    # Encode the input sequence. A single-state encoder may hand back a bare
    # array rather than a list, so normalise to a list before concatenating.
    states_value = encoder_model.predict(input_seq, verbose=0)
    if isinstance(states_value, (list, tuple)):
        states_value = list(states_value)
    else:
        states_value = [states_value]

    # Start decoding from the start-of-sequence marker
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_token_index['\t']

    decoded_chars = []
    # Bound the loop by step count rather than by output length: an id that
    # maps to no character adds nothing to the output, so a length-based
    # check could never trigger and the loop would not terminate.
    for _ in range(max_decoder_seq_length):
        # verbose=0 avoids printing a progress bar for every decoding step
        output_tokens, *states_value = decoder_model.predict(
            [target_seq] + states_value, verbose=0
        )

        # Greedy choice: most probable token at the last time step
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_char = reverse_target_char_index.get(sampled_token_index, '')

        # The end marker terminates decoding and is not part of the result
        if sampled_char == '\n':
            break
        decoded_chars.append(sampled_char)

        # Feed the prediction back in as the next decoder input
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

    return ''.join(decoded_chars)

# Prepare test data
test_input_texts = [inp.lower() for inp, _ in test_pairs]
test_target_texts = [tgt for _, tgt in test_pairs]

test_encoder_input = tokenize(test_input_texts, input_token_index, max_encoder_seq_length)

# Evaluate on test set: a prediction counts as correct only when the whole
# decoded word matches the reference exactly.
correct = 0
total = len(test_input_texts)
predictions = []

for i, (source_text, expected) in enumerate(zip(test_input_texts, test_target_texts)):
    # Slice (rather than index) to keep the leading batch dimension of 1
    input_seq = test_encoder_input[i:i+1]
    decoded = decode_sequence(
        input_seq, encoder_model, decoder_model,
        target_token_index, reverse_target_char_index,
        max_decoder_seq_length
    )
    predictions.append((source_text, expected, decoded))
    if decoded == expected:
        correct += 1

# Guard against an empty test set instead of raising ZeroDivisionError
accuracy = correct / total if total else 0.0
print(f"\nTest Accuracy: {accuracy:.2%}")

# Show sample predictions
print("\nSample Predictions:")
for source_text, expected, decoded in predictions[:10]:
    print(f"Input: {source_text}")
    print(f"Target: {expected}")
    print(f"Predicted: {decoded}")
    print()

Epoch 1/30
130/553 ━━━━━━━━━━━━━━━━━━━━ 1:37 231ms/step - accuracy: 0.6584 - loss: 1.6900