<a href="https://colab.research.google.com/github/deepraj16/100-days-of-deep-learning/blob/main/day_51Understanding_uncoder_decoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
!pip install -q kaggle


In [None]:

import kagglehub

# Fetch the latest version of the corpus and report where the files landed.
DATASET_HANDLE = "umasrikakollu72/hindi-english-truncated-corpus"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

In [None]:
import os

# Prefer the directory reported by kagglehub.dataset_download(...) so the
# notebook works wherever the files were cached. Only fall back to the fixed
# Kaggle mount point when no usable download path is available.
KAGGLE_MOUNT_DIR = "/kaggle/input/hindi-english-truncated-corpus"
_downloaded_dir = globals().get("path")
if _downloaded_dir and os.path.isdir(_downloaded_dir):
    path = _downloaded_dir
else:
    path = KAGGLE_MOUNT_DIR
print(os.listdir(path))

In [None]:
# Load the parallel corpus CSV from the dataset directory.
csv_file = os.path.join(path, "Hindi_English_Truncated_Corpus.csv")
df = pd.read_csv(csv_file)

In [None]:
# Preview the first five rows (head() defaults to n=5).
df.head()

In [None]:
# English side of the row labelled 0.
df.loc[0, 'english_sentence']

In [None]:
# Hindi side of the row labelled 0.
df.loc[0, 'hindi_sentence']

In [None]:
# Display-only peek at the first 30,000 rows (all columns); the result is not stored.
df.iloc[:30000]

In [None]:
# Keep only rows whose source is 'ted', restricted to the two sentence columns,
# drop incomplete and duplicate pairs, then draw a reproducible 25,000-row sample.
ted_pairs = df.loc[df['source'] == 'ted', ['english_sentence', 'hindi_sentence']]
ted_pairs = ted_pairs.dropna().drop_duplicates()
lines = ted_pairs.sample(n=25000, random_state=42)
print(f"Filtered dataset shape: {lines.shape}")

In [None]:
# `lines` keeps the original row labels after filtering and random sampling, so
# label 0 is usually absent; use positional access to show the first sampled row.
lines['english_sentence'].iloc[0]

In [None]:
import string
def clean_text(text):
    """Normalise a sentence for tokenization.

    Drops every ASCII punctuation character and ASCII digit, trims leading and
    trailing whitespace, and lower-cases the result. Characters outside those
    two ASCII sets are left untouched.
    """
    banned = frozenset(string.punctuation + string.digits)
    kept = [ch for ch in text if ch not in banned]
    return ''.join(kept).strip().lower()

In [None]:
# Normalise both languages, then wrap every Hindi sentence in the start_/_end
# markers used as decoder boundary tokens.
lines['english_sentence'] = lines['english_sentence'].map(clean_text)
cleaned_hindi = lines['hindi_sentence'].map(clean_text)
lines['hindi_sentence'] = cleaned_hindi.map(lambda sentence: 'start_ ' + sentence + ' _end')

In [None]:
# One word-level vocabulary per language. The Hindi tokenizer turns character
# filtering off (filters='') — presumably so the start_/_end markers survive intact.
eng_tokenizer = Tokenizer()
hin_tokenizer = Tokenizer(filters='')

# Learn the vocabularies from the cleaned sentences.
eng_tokenizer.fit_on_texts(lines['english_sentence'])
hin_tokenizer.fit_on_texts(lines['hindi_sentence'])

# Convert every sentence to its list of integer token ids.
eng_seq = eng_tokenizer.texts_to_sequences(lines['english_sentence'])
hin_seq = hin_tokenizer.texts_to_sequences(lines['hindi_sentence'])

In [None]:
# Report how many distinct words each tokenizer learned.
for language, tokenizer in (("English", eng_tokenizer), ("Hindi", hin_tokenizer)):
    print(f"{language} vocabulary size: {len(tokenizer.word_index)}")

In [None]:
# Longest sentence (in tokens) on each side determines the padded width.
max_eng_len = max(map(len, eng_seq))
max_hin_len = max(map(len, hin_seq))

# Right-pad every sequence to the longest one in its language.
encoder_input = pad_sequences(eng_seq, padding='post', maxlen=max_eng_len)
decoder_input = pad_sequences(hin_seq, padding='post', maxlen=max_hin_len)
print(f"Encoder input shape: {encoder_input.shape}")
print(f"Decoder input shape: {decoder_input.shape}")

In [None]:
# Teacher-forcing target: the decoder input shifted one step to the left, with
# a trailing singleton axis; the final time step stays zero.
n_samples, n_steps = decoder_input.shape
decoder_target = np.zeros((n_samples, n_steps, 1))
decoder_target[:, :-1, 0] = decoder_input[:, 1:]
print(f"Decoder target shape: {decoder_target.shape}")

In [None]:
# Cell 14: hold out 20% of the sentence pairs for validation (fixed seed so the
# split is repeatable); the three arrays are split with the same row assignment.
(
    train_encoder_input, val_encoder_input,
    train_decoder_input, val_decoder_input,
    train_decoder_target, val_decoder_target,
) = train_test_split(
    encoder_input,
    decoder_input,
    decoder_target,
    test_size=0.2,
    random_state=42,
)

print(f"Training samples: {train_encoder_input.shape[0]}")
print(f"Validation samples: {val_encoder_input.shape[0]}")

In [None]:
# Layer sizes shared by encoder and decoder.
embedding_dim = 256
hidden_units = 512

# One more than the number of known words — presumably to leave room for
# index 0, which the padded sequences use as filler.
eng_vocab_size = 1 + len(eng_tokenizer.word_index)
hin_vocab_size = 1 + len(hin_tokenizer.word_index)

In [None]:
# Show the embedding input sizes that the model will be built with.
for language, vocab_size in (("English", eng_vocab_size), ("Hindi", hin_vocab_size)):
    print(f"{language} vocab size: {vocab_size}")

In [None]:
# Cell 16: Build the encoder-decoder model
# Encoder: embeds the padded English token ids and runs them through an LSTM.
# Only the LSTM's final hidden/cell states are kept; `encoder_outputs` is not
# used again in this cell.
encoder_inputs = Input(shape=(max_eng_len,))
# mask_zero=True marks token id 0 as padding for the layers downstream.
encoder_embedding = Embedding(eng_vocab_size, embedding_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(hidden_units, return_state=True, use_cudnn=False)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder: embeds the Hindi token ids and runs an LSTM seeded with the encoder
# states, emitting a softmax over the Hindi vocabulary at every time step.
decoder_inputs = Input(shape=(max_hin_len,))
# NOTE: `decoder_embedding` is the layer's output tensor; the Embedding layer
# object itself is not bound to a name here.
decoder_embedding = Embedding(hin_vocab_size, embedding_dim, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(hidden_units, return_sequences=True, return_state=True, use_cudnn=False)
# The decoder's own final states are discarded in the training graph.
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(hin_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)


In [None]:
# Training model: (English tokens, Hindi decoder-input tokens) -> per-step
# probability distribution over the Hindi vocabulary.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)

# Cell 17: Compile the model. The sparse loss takes integer token ids as
# targets rather than one-hot vectors.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

print("Model compiled successfully!")
print("Model summary:")
model.summary()

In [None]:
# Sanity-check the array shapes that will be fed to model.fit.
print("\nInput shapes:")
for label, array in (
    ("Encoder input", train_encoder_input),
    ("Decoder input", train_decoder_input),
    ("Decoder target", train_decoder_target),
):
    print(f"{label}: {array.shape}")


In [None]:
print("Starting training...")

# First pass: small batches and only a handful of epochs as an initial test.
train_inputs = [train_encoder_input, train_decoder_input]
val_data = ([val_encoder_input, val_decoder_input], val_decoder_target)
history = model.fit(
    x=train_inputs,
    y=train_decoder_target,
    validation_data=val_data,
    epochs=5,
    batch_size=32,
    verbose=1,
)


In [None]:
# Resume from the current weights for another 5 epochs.
print("Continuing training for 5 more epochs...")
train_inputs = [train_encoder_input, train_decoder_input]
val_data = ([val_encoder_input, val_decoder_input], val_decoder_target)
history_continued = model.fit(
    x=train_inputs,
    y=train_decoder_target,
    validation_data=val_data,
    epochs=5,
    batch_size=32,
    verbose=1,
)

In [None]:
# Resume from the current weights for another 5 epochs
# (this rebinds `history_continued` to the newest run).
print("Continuing training for 5 more epochs...")
train_inputs = [train_encoder_input, train_decoder_input]
val_data = ([val_encoder_input, val_decoder_input], val_decoder_target)
history_continued = model.fit(
    x=train_inputs,
    y=train_decoder_target,
    validation_data=val_data,
    epochs=5,
    batch_size=32,
    verbose=1,
)

In [None]:
# Continue training for 10 more epochs (the message is derived from the same
# constant passed to fit, so the two can no longer disagree).
extra_epochs = 10
print(f"Continuing training for {extra_epochs} more epochs...")
history_continued = model.fit(
    [train_encoder_input, train_decoder_input],
    train_decoder_target,
    batch_size=32,
    epochs=extra_epochs,
    validation_data=([val_encoder_input, val_decoder_input], val_decoder_target),
    verbose=1
)

In [None]:
# Cell 20: Build inference models
# Encoder model for inference: maps a padded English sequence to the final
# LSTM states [h, c] that seed the decoder.
encoder_model = Model(encoder_inputs, encoder_states)

# The inference decoder must run with the *trained* weights. Creating fresh
# Embedding/LSTM layers here would leave them randomly initialised, so the
# layers that were built for training are looked up and reused instead.
# The training cell only kept the embedding's output tensor, so the layer
# object is recovered from the model by matching that tensor.
trained_decoder_embedding = next(
    (
        layer
        for layer in model.layers
        if isinstance(layer, Embedding) and layer.output is decoder_embedding
    ),
    None,
)
if trained_decoder_embedding is None:
    raise RuntimeError(
        "Could not locate the trained decoder Embedding layer in `model`; "
        "re-run the model-building cell before building the inference models."
    )

# Decoder model for inference: one time step at a time, with the LSTM states
# passed in and handed back explicitly.
decoder_state_input_h = Input(shape=(hidden_units,))
decoder_state_input_c = Input(shape=(hidden_units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_inputs_single = Input(shape=(1,))
decoder_embedding_inf = trained_decoder_embedding(decoder_inputs_single)
# Same layer object as in training; the *_inf name is kept for later cells.
decoder_lstm_inf = decoder_lstm
decoder_outputs_inf, state_h_inf, state_c_inf = decoder_lstm_inf(
    decoder_embedding_inf, initial_state=decoder_states_inputs
)
decoder_states_inf = [state_h_inf, state_c_inf]
decoder_outputs_inf = decoder_dense(decoder_outputs_inf)

decoder_model = Model(
    [decoder_inputs_single] + decoder_states_inputs,
    [decoder_outputs_inf] + decoder_states_inf
)

print("Inference models created successfully!")


In [None]:

# Cell 21: Define translation function
def translate_sentence(input_sentence):
    """Translate one English sentence to Hindi with greedy decoding.

    The sentence is cleaned with ``clean_text``, tokenized and padded like the
    training data, and encoded into LSTM states by ``encoder_model``. The
    decoder is then stepped one token at a time, starting from ``start_`` and
    always taking the highest-probability token.

    Args:
        input_sentence: Raw English text.

    Returns:
        The decoded Hindi words joined by single spaces, without the
        ``start_``/``_end`` markers. May be an empty string.
    """
    # Clean and tokenize the input sentence exactly like the training data.
    input_sentence = clean_text(input_sentence)
    input_seq = eng_tokenizer.texts_to_sequences([input_sentence])
    input_seq = pad_sequences(input_seq, maxlen=max_eng_len, padding='post')

    # Encode the input; list() keeps the concatenation below valid whether
    # predict returns a list or a tuple of state arrays.
    states_value = list(encoder_model.predict(input_seq, verbose=0))

    # Length-1 target sequence holding the start marker.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = hin_tokenizer.word_index.get('start_', 1)

    # Direct id -> word lookup instead of scanning word_index on every step.
    index_to_word = hin_tokenizer.index_word
    decoded_words = []

    # Bound the number of decoder steps rather than the number of emitted
    # words: ids with no printable word (padding id 0, or `start_`) never
    # lengthen the output, so a word-count limit alone could loop forever.
    for _ in range(max_hin_len):
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)

        # Greedy choice: most probable token at the last (only) time step.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = index_to_word.get(sampled_token_index)

        if sampled_word == '_end':
            break
        if sampled_word is not None and sampled_word != 'start_':
            decoded_words.append(sampled_word)

        # Feed the chosen token and the updated states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ' '.join(decoded_words)



In [None]:
# Cell 22: Smoke-test the translation function
print("Testing translation function:")
print("-" * 50)

# A few short, hand-written English prompts.
test_sentences = [
    "hello how are you",
    "what is your name",
    "i am fine thank you",
    "where are you from",
    "good morning",
]

for sentence in test_sentences:
    translation = translate_sentence(sentence)
    print(f"English: {sentence}", f"Hindi: {translation}", "-" * 30, sep="\n")


In [None]:

# Cell 23: Score the trained model on the held-out validation pairs
print("Evaluating model on validation set...")
val_metrics = model.evaluate(
    x=[val_encoder_input, val_decoder_input],
    y=val_decoder_target,
    verbose=0,
)
# evaluate() yields the loss followed by the compiled metric (accuracy).
val_loss, val_accuracy = val_metrics

print(f"Validation Loss: {val_loss:.4f}")
print(f"Validation Accuracy: {val_accuracy:.4f}")

# Cell 24: Persist the model (the tokenizers are saved in the next cell)
print("Saving model and tokenizers...")

# Write the trained training-time model to an HDF5 file.
model.save('translation_model.h5')



In [None]:
# Pickle both fitted tokenizers so inference can rebuild the same vocabularies.
import pickle

for pickle_name, fitted_tokenizer in (
    ('eng_tokenizer.pkl', eng_tokenizer),
    ('hin_tokenizer.pkl', hin_tokenizer),
):
    with open(pickle_name, 'wb') as f:
        pickle.dump(fitted_tokenizer, f)

print("Model and tokenizers saved successfully!")

# Cell 25: Interactive translation function
def interactive_translation():
    """Run a prompt loop that translates typed English sentences to Hindi.

    Reads sentences with ``input`` until the user types ``quit`` (any case,
    surrounding whitespace ignored) or the input stream ends or is interrupted.
    Blank lines are skipped. Each sentence is passed to ``translate_sentence``;
    a failure is printed and the loop carries on with the next prompt.

    Returns:
        None.
    """
    print("Interactive Translation System")
    print("Type 'quit' to exit")
    print("-" * 40)

    while True:
        try:
            english_sentence = input("Enter English sentence: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Input stream closed or prompt interrupted: leave the loop cleanly.
            print()
            break

        if english_sentence.lower() == 'quit':
            break
        if not english_sentence:
            # Nothing to translate; ask again without calling the model.
            continue

        try:
            hindi_translation = translate_sentence(english_sentence)
            print(f"Hindi translation: {hindi_translation}")
        except Exception as e:
            # Best-effort session: report the failure and keep prompting.
            print(f"Error in translation: {e}")

        print("-" * 40)

# Uncomment the line below to start interactive translation
# interactive_translation()