In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model
import tensorflow as tf

# Load dataset
data = pd.read_csv("English_Hindi_Clean_New.csv", encoding='utf-8')

# A missing sentence is read as NaN (a float), which has no .split();
# drop such rows before any tokenisation happens.
data = data.dropna(subset=['English', 'Hindi'])

# Filter long sentences. Length is the number of whitespace-separated tokens.
MAX_SENTENCE_WORDS = 20
data = data.assign(
    len_eng_sen=data['English'].apply(lambda x: len(x.split())),
    len_hin_sen=data['Hindi'].apply(lambda x: len(x.split())),
)
data = data[
    (data['len_eng_sen'] <= MAX_SENTENCE_WORDS)
    & (data['len_hin_sen'] <= MAX_SENTENCE_WORDS)
]

# Max lengths (in tokens) of the sentences that survived the filter
max_len_src = max(data['len_eng_sen'])
max_len_tar = max(data['len_hin_sen'])

# Preprocess: vocabulary creation.
# Built AFTER filtering so that words occurring only in discarded sentences
# do not inflate the vocabulary sizes (and everything sized from them).
all_eng_words = {word for sentence in data['English'] for word in sentence.split()}
all_hin_words = {word for sentence in data['Hindi'] for word in sentence.split()}

# Vocabulary (sorted so that token ids are reproducible between runs)
inp_words = sorted(all_eng_words)
tar_words = sorted(all_hin_words)
# +1 because id 0 is never assigned to a word (ids start at 1 below).
num_enc_toks = len(inp_words) + 1
num_dec_toks = len(tar_words) + 1

# Token indices: word -> id (starting at 1) and the reverse id -> word maps
inp_tok_idx = {word: i + 1 for i, word in enumerate(inp_words)}
tar_tok_idx = {word: i + 1 for i, word in enumerate(tar_words)}
rev_inp_tok_idx = {i: word for word, i in inp_tok_idx.items()}
rev_tar_tok_idx = {i: word for word, i in tar_tok_idx.items()}

# Split data: 80% train / 20% test, fixed seed for a reproducible split
X, y = data['English'], data['Hindi']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Generator function
def generate_batch(X, y, batch_size):
    """Yield training batches for the encoder-decoder model, indefinitely.

    Each batch is ``([enc_input, dec_input], dec_target)`` where

    * ``enc_input``  has shape ``(rows, max_len_src)`` and holds source token
      ids from ``inp_tok_idx``,
    * ``dec_input``  has shape ``(rows, max_len_tar)`` and holds target token
      ids from ``tar_tok_idx``,
    * ``dec_target`` has shape ``(rows, max_len_tar, num_dec_toks)`` and is
      the one-hot encoding of ``dec_input`` shifted one step to the left
      (position ``t - 1`` of the target encodes token ``t`` of the input).

    ``rows`` equals ``batch_size`` except for a trailing partial batch, which
    contains only the remaining sentence pairs (no all-zero filler rows).
    Unused positions stay 0; words missing from a lookup table also map to 0.
    Sentences longer than ``max_len_src`` / ``max_len_tar`` are truncated.

    Args:
        X: Sequence of source sentences (sliceable, e.g. list or Series).
        y: Sequence of target sentences, same length as ``X``.
        batch_size: Maximum number of sentence pairs per batch.

    Raises:
        ValueError: On first iteration, if ``batch_size`` is not positive,
            if ``X`` is empty, or if ``X`` and ``y`` differ in length.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if len(X) == 0:
        # Without this guard the endless loop below would never yield.
        raise ValueError("generate_batch needs at least one sentence pair")
    if len(X) != len(y):
        raise ValueError("X and y must contain the same number of sentences")

    while True:
        for j in range(0, len(X), batch_size):
            pairs = list(zip(X[j:j + batch_size], y[j:j + batch_size]))
            n_rows = len(pairs)  # smaller than batch_size for the last batch

            enc_input = np.zeros((n_rows, max_len_src), dtype='float32')
            dec_input = np.zeros((n_rows, max_len_tar), dtype='float32')
            dec_target = np.zeros((n_rows, max_len_tar, num_dec_toks), dtype='float32')

            for i, (inp, tar) in enumerate(pairs):
                # Truncate so an over-long sentence cannot index past the
                # pre-allocated arrays.
                for t, word in enumerate(inp.split()[:max_len_src]):
                    enc_input[i, t] = inp_tok_idx.get(word, 0)
                for t, word in enumerate(tar.split()[:max_len_tar]):
                    tok_id = tar_tok_idx.get(word, 0)
                    dec_input[i, t] = tok_id
                    if t > 0:
                        # The target is one step ahead of the decoder input,
                        # so the first token is never a target.
                        dec_target[i, t - 1, tok_id] = 1.0

            yield [enc_input, dec_input], dec_target

# Model parameters
latent_dim = 250   # size of both the embeddings and the LSTM state
batch_size = 256
epochs = 50

# Encoder: embed the source token ids and keep only the final LSTM state,
# which is handed to the decoder as its initial state.
encoder_inputs = Input(shape=(None,))
# mask_zero=True: id 0 is treated as padding and skipped by the LSTM.
enc_emb = Embedding(num_enc_toks, latent_dim, mask_zero=True)(encoder_inputs)
_, state_h, state_c = LSTM(latent_dim, return_state=True)(enc_emb)
encoder_states = [state_h, state_c]

# Decoder: reads the target sequence, starts from the encoder state and
# emits a probability distribution over the target vocabulary at every step.
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(num_dec_toks, latent_dim, mask_zero=True)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(num_dec_toks, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Model: [source ids, target ids] -> per-step target-token probabilities
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='categorical_crossentropy')

# Train
# Round the step counts up. Floor division would give 0 steps for a split
# smaller than batch_size, and would leave the trailing partial batch out of
# the epoch, so that successive epochs / validation passes would drift over
# different windows of the endlessly cycling generator.
train_steps = (len(X_train) + batch_size - 1) // batch_size
val_steps = (len(X_test) + batch_size - 1) // batch_size

model.fit(
    generate_batch(X_train, y_train, batch_size),
    steps_per_epoch=train_steps,
    epochs=epochs,
    validation_data=generate_batch(X_test, y_test, batch_size),
    validation_steps=val_steps,
)
