In [None]:
# Import the libraries used in this notebook
import os
import re
import string
import sys
import unicodedata

import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [None]:
# Raise NumPy's summarisation threshold to sys.maxsize so arrays are printed
# in full instead of being elided with "...".
np.set_printoptions(threshold=sys.maxsize)

In [None]:
# Load the parallel corpus from the working directory. Later cells read its
# "source", "english_sentence" and "hindi_sentence" columns.
data = pd.read_csv("./Hindi_English_Truncated_Corpus.csv", encoding = "UTF-8")

In [None]:
# Preview the first rows of the raw frame.
data.head()

In [None]:
# Summary statistics for each column.
data.describe()

In [None]:
# Number of non-null entries per column.
data.count()

In [None]:
# Number of missing values per column before cleaning.
pd.isnull(data).sum()

In [None]:
# Drop every row that has a missing value in any column.
data = data.dropna()

In [None]:
# Re-check the missing-value counts after dropping incomplete rows.
pd.isnull(data).sum()

In [None]:
# Remove exact duplicate rows. Rebinding the result (instead of inplace=True)
# keeps the cell chainable and avoids mutating a frame shown by earlier cells.
data = data.drop_duplicates()

In [None]:
# Keep only the rows whose "source" column equals "ted".
data = data[data["source"] == "ted"]

In [None]:
# Non-null count per column after the source filter.
data.count()

In [None]:
# Work on a reproducible random subset of at most 25000 sentence pairs.
# Capping n at the frame length avoids the ValueError that `sample` raises
# when fewer rows than requested remain after the filters above.
data = data.sample(n=min(25000, len(data)), random_state=42)
data.shape

In [None]:
# Preview the sampled rows.
data.head()

In [None]:
def preprocessing(data_input, column_name):
    """Normalise a Series of sentences for the translation model.

    Each sentence is lower-cased, stripped of apostrophes, punctuation/symbols
    and ASCII digits, trimmed, and has runs of spaces collapsed to one space.
    When ``column_name`` is ``"hindi_sentence"`` the Devanagari digits are
    removed as well and the sentence is wrapped as ``START_ ... _END``.

    Parameters
    ----------
    data_input : pandas.Series of str
        The sentences to clean.
    column_name : str
        Name of the column being processed; only the value
        ``"hindi_sentence"`` triggers the Hindi-specific steps.

    Returns
    -------
    pandas.Series
        A new Series with the cleaned sentences, same index as the input.
    """
    is_hindi = column_name == "hindi_sentence"

    def _keep(ch):
        # Equivalent to the character class [\w\s] plus Unicode combining
        # marks (general category M*). Python's `\w` does not match combining
        # marks, so a plain `[^\w\s]` substitution deletes Devanagari vowel
        # signs, virama and anusvara and corrupts the Hindi words.
        return (
            ch.isalnum()
            or ch == "_"
            or ch.isspace()
            or unicodedata.category(ch).startswith("M")
        )

    def _clean(sentence):
        sentence = sentence.lower().replace("'", "")
        sentence = "".join(ch for ch in sentence if _keep(ch))
        sentence = re.sub(r"[0-9]", "", sentence)
        if is_hindi:
            sentence = re.sub("[२३०८१५७९४६]", "", sentence)
            # Markers are added after punctuation removal and before the
            # final strip/space collapse, as in the original pipeline.
            sentence = "START_ " + sentence + " _END"
        return re.sub(r" +", " ", sentence.strip())

    # One pass over the Series instead of one pass per cleaning step.
    return data_input.apply(_clean)

In [None]:
# Clean the English column; no START_/_END markers are added for this column.
data["english_sentence"] = preprocessing(data["english_sentence"], "english_sentence")

In [None]:
# Clean the Hindi column; this call also wraps each sentence in START_ ... _END.
data["hindi_sentence"]   = preprocessing(data["hindi_sentence"], "hindi_sentence")

In [None]:
# Preview the cleaned sentences.
data.head()

In [None]:
# Collect the distinct whitespace-separated tokens of each language.
# `set.update` already ignores tokens that are present, so no membership
# test is needed.
all_eng_words=set()
for eng in data['english_sentence']:
    all_eng_words.update(eng.split())

all_hindi_words=set()
for hin in data['hindi_sentence']:
    all_hindi_words.update(hin.split())

In [None]:
# Number of distinct English tokens.
len(all_eng_words)

In [None]:
# Number of distinct Hindi tokens (this includes the START_ and _END markers).
len(all_hindi_words)

In [None]:
# Token count per sentence. Split on any whitespace (`.split()` with no
# argument), the same tokenisation used to build the vocabulary and to fill
# the batch arrays, so the length filter and the array widths agree even when
# a sentence contains tabs or other non-space whitespace.
data['length_eng_sentence']=data['english_sentence'].apply(lambda x:len(x.split()))
data['length_hin_sentence']=data['hindi_sentence'].apply(lambda x:len(x.split()))

In [None]:
# Preview the frame with the two new length columns.
data.head()

In [None]:
# Keep only the sentence pairs in which both sides have at most 20 tokens.
within_limit = (data['length_eng_sentence'] <= 20) & (data['length_hin_sentence'] <= 20)
data = data[within_limit]

In [None]:
# Report the longest remaining sentence (in tokens) for each language.
print("maximum length of Hindi Sentence ",max(data['length_hin_sentence']))
print("maximum length of English Sentence ",max(data['length_eng_sentence']))

In [None]:
# English is the source language (encoder input) and Hindi the target
# (decoder input/output), so the source width must come from the English
# lengths and the target width from the Hindi lengths. The batch generator
# sizes the encoder array with max_length_src and the decoder arrays with
# max_length_tar.
max_length_src=max(data['length_eng_sentence'])
max_length_tar=max(data['length_hin_sentence'])

In [None]:
# Sorted vocabularies give every token a stable position.
input_words = sorted(all_eng_words)
target_words = sorted(all_hindi_words)
# Token ids start at 1 because id 0 is reserved for padding (the embeddings
# use mask_zero=True), so the encoder embedding needs len(vocabulary) + 1 rows;
# otherwise the largest token id falls outside the embedding table.
num_encoder_tokens = len(all_eng_words) + 1
# The decoder size gets its padding slot in the following cell.
num_decoder_tokens = len(all_hindi_words)
num_encoder_tokens, num_decoder_tokens

In [None]:
# Reserve id 0 for padding. Assigning from the vocabulary size (rather than
# incrementing) keeps the value correct when this cell is re-run.
num_decoder_tokens = len(all_hindi_words) + 1

In [None]:
# Show the encoder and decoder vocabulary sizes used by the embedding layers.
num_encoder_tokens, num_decoder_tokens

In [None]:
# Map each token to an integer id starting at 1 (0 is left for padding).
input_token_index = {word: idx for idx, word in enumerate(input_words, start=1)}
target_token_index = {word: idx for idx, word in enumerate(target_words, start=1)}

In [None]:
# Inverse lookups: integer id -> token, for decoding model output.
reverse_input_index = {idx: word for word, idx in input_token_index.items()}
reverse_target_index = {idx: word for word, idx in target_token_index.items()}

In [None]:
# Show a short preview of both English lookups; displaying the complete
# dictionaries prints one entry per vocabulary token and buries the notebook.
list(input_token_index.items())[:10], list(reverse_input_index.items())[:10]

In [None]:
# Show a short preview of both Hindi lookups; displaying the complete
# dictionaries prints one entry per vocabulary token and buries the notebook.
list(target_token_index.items())[:10], list(reverse_target_index.items())[:10]

In [None]:
# English sentences are the model inputs (X); Hindi sentences are the targets (y).
X, y = data['english_sentence'], data['hindi_sentence']
# Hold out 20% of the pairs; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,random_state=42)
X_train.shape, X_test.shape

In [None]:
def generate_batch(X = X_train, y = y_train, batch_size = 128):
    ''' Endlessly yield ``([encoder_input, decoder_input], decoder_target)`` batches.

    X          : sequence of source sentences (whitespace-separated tokens).
    y          : sequence of target sentences, aligned with ``X``.
    batch_size : maximum number of sentence pairs per batch.

    Each batch holds three float32 arrays:
      * encoder input  -- (rows, max_length_src) source token ids, 0-padded
      * decoder input  -- (rows, max_length_tar) target token ids without the
                          final token, 0-padded
      * decoder target -- (rows, max_length_tar, num_decoder_tokens) one-hot
                          target tokens shifted one step left (the first token
                          is never a target)

    ``rows`` equals ``batch_size`` except for the last slice of a pass, which
    only holds the sentences that are left, so no all-zero dummy samples are
    fed to the model.

    Reads the module-level ``max_length_src``, ``max_length_tar``,
    ``num_decoder_tokens``, ``input_token_index`` and ``target_token_index``.
    Raises ValueError on first iteration if ``X`` is empty, and KeyError for a
    token missing from the index dictionaries.
    '''
    if len(X) == 0:
        # Without this guard the loop below would spin forever without yielding.
        raise ValueError("generate_batch needs at least one sentence pair")
    while True:
        for start in range(0, len(X), batch_size):
            batch_inputs = X[start:start + batch_size]
            batch_targets = y[start:start + batch_size]
            n_rows = len(batch_inputs)
            encoder_input_data = np.zeros((n_rows, max_length_src),dtype='float32')
            decoder_input_data = np.zeros((n_rows, max_length_tar),dtype='float32')
            decoder_target_data = np.zeros((n_rows, max_length_tar, num_decoder_tokens),dtype='float32')
            for i, (input_text, target_text) in enumerate(zip(batch_inputs, batch_targets)):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = input_token_index[word] # encoder input seq
                # Split once per sentence instead of once per token.
                target_tokens = target_text.split()
                last_position = len(target_tokens) - 1
                for t, word in enumerate(target_tokens):
                    token_id = target_token_index[word]
                    if t < last_position:
                        decoder_input_data[i, t] = token_id # decoder input seq
                    if t > 0:
                        # decoder target sequence (one hot encoded)
                        # does not include the START_ token
                        # Offset by one timestep
                        decoder_target_data[i, t - 1, token_id] = 1.
            yield([encoder_input_data, decoder_input_data], decoder_target_data)

In [None]:
# Width shared by the embedding vectors and the LSTM layers below.
latent_dim = 300

In [None]:
# Encoder
# The time dimension is left as None (as for the decoder input) so the layer
# accepts whatever padded width the batch generator produces, instead of a
# hard-coded 20 that can disagree with it.
encoder_inputs = tf.keras.layers.Input(shape=(None,))
enc_emb =  tf.keras.layers.Embedding(num_encoder_tokens, latent_dim, mask_zero = True)(encoder_inputs)
# return_state=True makes the LSTM return its output plus the final hidden and cell states.
encoder_lstm = tf.keras.layers.LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

In [None]:
# Probe model that exposes every intermediate encoder tensor for inspection.
temp_model = tf.keras.Model(inputs = encoder_inputs, outputs = [encoder_inputs, enc_emb,encoder_outputs, state_h, state_c])

In [None]:
# Layer-by-layer summary of the probe model (wider lines so columns are not cut off).
temp_model.summary(line_length =  120)

In [None]:
# Encoder (this definition replaces the one above and is the one used by the
# training model).
# The time dimension is left as None (as for the decoder input) so the layer
# accepts whatever padded width the batch generator produces, instead of a
# hard-coded 20 that can disagree with it.
encoder_inputs = tf.keras.layers.Input(shape=(None,))
enc_emb =  tf.keras.layers.Embedding(num_encoder_tokens, latent_dim, mask_zero = True)(encoder_inputs)
# return_sequences=True additionally returns the output of every timestep.
encoder_lstm = tf.keras.layers.LSTM(latent_dim, return_state=True, return_sequences = True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

In [None]:
# Rebuild the probe model on top of the redefined encoder tensors.
temp_model = tf.keras.Model(inputs = encoder_inputs, outputs = [encoder_inputs, enc_emb,encoder_outputs, state_h, state_c])

In [None]:
# Layer-by-layer summary of the rebuilt probe model.
temp_model.summary(line_length=120)

In [None]:
# Run the probe model on one dummy sequence of token ids 1..20. The five
# results follow the model's output order: input, embedding, LSTM output,
# state_h, state_c.
# NOTE(review): this rebinds the global `y`, which until now held the Hindi
# target sentences assigned in the train/test split cell.
a, e, x, y , z = temp_model.predict([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]])

In [None]:
# Shapes of the five arrays returned by the probe model.
a.shape, e.shape, x.shape, y.shape, z.shape

In [None]:
# Encoder LSTM output (third output of the probe model).
x

In [None]:
# Final hidden state, state_h (fourth output of the probe model).
y

In [None]:
# Final cell state, state_c (fifth output of the probe model).
z

In [None]:
# Set up the decoder, using `encoder_states` as initial state.
# The decoder input has a variable time dimension (shape=(None,)).
decoder_inputs = tf.keras.layers.Input(shape=(None,))
# The embedding layer is kept in its own variable, separate from its output tensor.
dec_emb_layer = tf.keras.layers.Embedding(num_decoder_tokens, latent_dim, mask_zero = True)
dec_emb = dec_emb_layer(decoder_inputs)
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = tf.keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb,
                                     initial_state=encoder_states)
# Per-timestep softmax over the decoder vocabulary.
decoder_dense = tf.keras.layers.Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = tf.keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [None]:
# Write a diagram of the model, including tensor shapes, to a PNG file.
# NOTE(review): plot_model presumably needs pydot and Graphviz installed — confirm for this environment.
tf.keras.utils.plot_model(model, "my_first_model_with_shape_info.png", show_shapes=True)

In [None]:
# Categorical cross-entropy pairs with the one-hot decoder targets built by generate_batch.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [None]:
# Layer-by-layer summary of the full training model.
model.summary()

In [None]:
# Training configuration: the sample counts are used to derive the number of
# generator steps per epoch for training and validation.
train_samples = len(X_train)
val_samples = len(X_test)
batch_size = 400
epochs = 10

In [None]:
# `Model.fit` accepts Python generators directly; `fit_generator` is deprecated
# in TensorFlow 2.x. Both generators loop forever, so the step counts define
# how many batches make up one training pass and one validation pass.
model.fit(generate_batch(X_train, y_train, batch_size = batch_size),
          steps_per_epoch = train_samples//batch_size,
          epochs=epochs,
          validation_data = generate_batch(X_test, y_test, batch_size = batch_size),
          validation_steps = val_samples//batch_size)