<a href="https://colab.research.google.com/github/ganesh-codes404/DL-Assignment-2/blob/main/Seq2Seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, GRU, SimpleRNN, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import tarfile

In [None]:
from google.colab import drive
drive.mount('/content/drive')

file_path = '/content/drive/MyDrive/dakshina_dataset_v1.0.tar'


In [None]:
def load_data(path, max_samples=5000):
    input_texts, target_texts = [], []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            if '\t' not in line:
                continue
            parts = line.strip().split('\t')
            if len(parts) < 2:
                continue
            src, tgt = parts[0], parts[1]
            if src and tgt:
                input_texts.append(src)
                target_texts.append('\t' + tgt + '\n')
            if len(input_texts) >= max_samples:
                break
    return input_texts, target_texts

In [6]:
input_texts, target_texts = load_data(file_path)
print(f"Loaded {len(input_texts)} samples.")

# Vocab and preprocessing
input_chars = sorted(set("".join(input_texts)))
target_chars = sorted(set("".join(target_texts)))

input_token_index = {char: i for i, char in enumerate(input_chars)}
target_token_index = {char: i for i, char in enumerate(target_chars)}
reverse_target_index = {i: char for char, i in target_token_index.items()}

max_encoder_seq_length = max(len(txt) for txt in input_texts)
max_decoder_seq_length = max(len(txt) for txt in target_texts)

num_encoder_tokens = len(input_chars)
num_decoder_tokens = len(target_chars)

encoder_input_data = np.zeros((len(input_texts), max_encoder_seq_length), dtype='int32')
decoder_input_data = np.zeros((len(input_texts), max_decoder_seq_length), dtype='int32')
decoder_target_data = np.zeros((len(input_texts), max_decoder_seq_length, num_decoder_tokens), dtype='float32')

for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t] = input_token_index[char]
    for t, char in enumerate(target_text):
        decoder_input_data[i, t] = target_token_index[char]
        if t > 0:
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.0


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loaded 5000 samples.


In [7]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, GRU, SimpleRNN, Dense

def build_model(cell_type='LSTM', embedding_dim=64, hidden_dim=128):
    encoder_inputs = Input(shape=(None,), name="encoder_inputs")
    encoder_embed = Embedding(input_dim=num_encoder_tokens, output_dim=embedding_dim, name="encoder_embedding")(encoder_inputs)

    if cell_type == 'LSTM':
        encoder_outputs, state_h, state_c = LSTM(hidden_dim, return_state=True, name="encoder_lstm")(encoder_embed)
        encoder_states = [state_h, state_c]
    elif cell_type == 'GRU':
        encoder_outputs, state_h = GRU(hidden_dim, return_state=True, name="encoder_gru")(encoder_embed)
        encoder_states = [state_h]
    else:
        encoder_outputs, state_h = SimpleRNN(hidden_dim, return_state=True, name="encoder_rnn")(encoder_embed)
        encoder_states = [state_h]

    decoder_inputs = Input(shape=(None,), name="decoder_inputs")
    decoder_embed = Embedding(input_dim=num_decoder_tokens, output_dim=embedding_dim, name="decoder_embedding")(decoder_inputs)

    if cell_type == 'LSTM':
        decoder_lstm = LSTM(hidden_dim, return_sequences=True, return_state=True, name="decoder_lstm")
        decoder_outputs, _, _ = decoder_lstm(decoder_embed, initial_state=encoder_states)
    elif cell_type == 'GRU':
        decoder_gru = GRU(hidden_dim, return_sequences=True, return_state=True, name="decoder_gru")
        decoder_outputs, _ = decoder_gru(decoder_embed, initial_state=encoder_states)
    else:
        decoder_rnn = SimpleRNN(hidden_dim, return_sequences=True, return_state=True, name="decoder_rnn")
        decoder_outputs, _ = decoder_rnn(decoder_embed, initial_state=encoder_states)

    decoder_dense = Dense(num_decoder_tokens, activation='softmax', name="output_dense")
    decoder_outputs = decoder_dense(decoder_outputs)

    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    return model

model = build_model(cell_type='LSTM', embedding_dim=64, hidden_dim=128)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()


In [8]:
from sklearn.model_selection import train_test_split
enc_in_train, enc_in_val, dec_in_train, dec_in_val, dec_tgt_train, dec_tgt_val = train_test_split(
    encoder_input_data, decoder_input_data, decoder_target_data, test_size=0.2
)

# Train the model
history = model.fit(
    [enc_in_train, dec_in_train],
    dec_tgt_train,
    batch_size=64,
    epochs=50,
    validation_data=([enc_in_val, dec_in_val], dec_tgt_val)
)


Epoch 1/50
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m315s[0m 4s/step - accuracy: 0.0657 - loss: 1.2239 - val_accuracy: 0.0810 - val_loss: 1.1284
Epoch 2/50
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m260s[0m 4s/step - accuracy: 0.0789 - loss: 1.1225 - val_accuracy: 0.0812 - val_loss: 1.1021
Epoch 3/50
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 4s/step - accuracy: 0.0792 - loss: 1.0984 - val_accuracy: 0.0902 - val_loss: 1.0600
Epoch 4/50
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m260s[0m 4s/step - accuracy: 0.0920 - loss: 1.0616 - val_accuracy: 0.1006 - val_loss: 1.0258
Epoch 5/50
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m263s[0m 4s/step - accuracy: 0.1015 - loss: 1.0350 - val_accuracy: 0.1032 - val_loss: 1.0098
Epoch 6/50
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 4s/step - accuracy: 0.1011 - loss: 1.0231 - val_accuracy: 0.1030 - val_loss: 0.9978
Epoch 7/50
[1m63/63[0m [32m━━━━

In [9]:
encoder_model = Model(model.input[0], model.get_layer("encoder_lstm").output[1:])

decoder_state_input_h = Input(shape=(128,))
decoder_state_input_c = Input(shape=(128,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_inputs = Input(shape=(1,), name="decoder_input_token")
decoder_embed = model.get_layer("decoder_embedding")(decoder_inputs)
decoder_lstm = model.get_layer("decoder_lstm")
decoder_outputs, state_h, state_c = decoder_lstm(decoder_embed, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_dense = model.get_layer("output_dense")
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)

    target_seq = np.zeros((1, 1), dtype='int32')
    target_seq[0, 0] = target_token_index['\t']

    stop = False
    decoded_sentence = ''
    while not stop:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_index[sampled_token_index]
        decoded_sentence += sampled_char
        if sampled_char == '\n' or len(decoded_sentence) > max_decoder_seq_length:
            stop = True
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return decoded_sentence.strip()
print("Sample predictions:\n")
for i in range(5):
    input_seq = encoder_input_data[i:i+1]
    decoded = decode_sequence(input_seq)
    print(f"Latin Input      : {input_texts[i]}")
    print(f"Ground Truth     : {target_texts[i].strip()}")
    print(f"Predicted Output : {decoded}")
    print("-" * 40)


Sample predictions:

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 439ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 302ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
Latin Input      : dakshina_dataset_v1.0/bn/                                                                           0000755 0606127 0257523 00000000000 13614417114 016176  5                                                                                                    ustar   roark                           primarygroup                                                                         