In [262]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import numpy as np
import pandas as pd
import tensorflow as tf
from collections import Counter
import matplotlib.pyplot as plt
import re
from string import digits
from keras.models import Model
import string

In [263]:
lines=pd.read_csv(r"E:\Hindi_English_Truncated_Corpus.csv",encoding='utf-8')

In [264]:
lines.head()

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,indic2012,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,indic2012,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


In [265]:
lines.shape

(127607, 3)

In [266]:
lines[lines['english_sentence'].isnull()]

Unnamed: 0,source,english_sentence,hindi_sentence
37554,indic2012,,सन् 330 ईसापूर्व में मकदूनिया (यूनान) के विजेत...
59804,indic2012,,लेकिन उस समय इस्लाम का उदय नहीं हुआ था; ईरान क...


In [267]:
lines['hindi_sentence'].isnull().sum()

0

In [268]:
lines=lines.dropna()

In [269]:
lines.shape

(127605, 3)

In [270]:
lines['source'].value_counts()

source
tides        50000
ted          39881
indic2012    37724
Name: count, dtype: int64

In [271]:
lines.drop_duplicates(inplace=True)

In [272]:
lines.shape #after dropping duplicate values

(124827, 3)

In [273]:
lines=lines.sample(n=25000,random_state=42)

In [274]:
lines['english_sentence']=lines['english_sentence'].apply(lambda x: x.lower())
lines['hindi_sentence']=lines['hindi_sentence'].apply(lambda x: x.lower())

In [275]:
lines['english_sentence']=lines['english_sentence'].apply(lambda x: re.sub("'", '', x))
lines['hindi_sentence']=lines['hindi_sentence'].apply(lambda x: re.sub("'", '', x))

In [276]:
exclude=set(string.punctuation)

In [277]:
exclude.add('“')
exclude.add('”')

In [278]:
exclude

{'!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 '{',
 '|',
 '}',
 '~',
 '“',
 '”'}

In [279]:
#need to remove all of the special characters

In [280]:
lines['english_sentence']=lines['english_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
lines['hindi_sentence']=lines['hindi_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

In [281]:
lines.head()

Unnamed: 0,source,english_sentence,hindi_sentence
25520,indic2012,islam is word from arabic and it full word is ...,इस्लाम शब्द अरबी भाषा का शब्द है जिसका मूल शब्...
118633,ted,everything is reliant on these computers working,इन कंप्यूटरों पर सब कुछ निर्भर है
113495,tides,parliament does not control the government,संसद का सरकार पपर नियंत्रण नपहीं रहता
29783,tides,race equality new laws,नये कानून नस्ली समानता
111804,tides,the provision would not affect the power of pa...,व्यवसायों आदि से होने वाली आय के बारे में विधि...


In [282]:
# removing extra space
remove_digits = str.maketrans('', '', digits)
lines['english_sentence']=lines['english_sentence'].apply(lambda x: x.translate(remove_digits))
lines['hindi_sentence']=lines['hindi_sentence'].apply(lambda x: x.translate(remove_digits))

lines['hindi_sentence'] = lines['hindi_sentence'].apply(lambda x: re.sub("[२३०८१५७९४६]", "", x))

# Remove extra spaces
lines['english_sentence']=lines['english_sentence'].apply(lambda x: x.strip())
lines['hindi_sentence']=lines['hindi_sentence'].apply(lambda x: x.strip())
lines['english_sentence']=lines['english_sentence'].apply(lambda x: re.sub(" +", " ", x))
lines['hindi_sentence']=lines['hindi_sentence'].apply(lambda x: re.sub(" +", " ", x))

In [283]:
lines['hindi_sentence'] = lines['hindi_sentence'].apply(lambda x : 'START_ '+ x + ' _END')


In [284]:
lines.head()

Unnamed: 0,source,english_sentence,hindi_sentence
25520,indic2012,islam is word from arabic and it full word is ...,START_ इस्लाम शब्द अरबी भाषा का शब्द है जिसका ...
118633,ted,everything is reliant on these computers working,START_ इन कंप्यूटरों पर सब कुछ निर्भर है _END
113495,tides,parliament does not control the government,START_ संसद का सरकार पपर नियंत्रण नपहीं रहता _END
29783,tides,race equality new laws,START_ नये कानून नस्ली समानता _END
111804,tides,the provision would not affect the power of pa...,START_ व्यवसायों आदि से होने वाली आय के बारे म...


In [285]:
lines['length_eng_sentence']=lines['english_sentence'].apply(lambda x:len(x.split(" ")))
lines['length_hin_sentence']=lines['hindi_sentence'].apply(lambda x:len(x.split(" ")))

In [286]:
lines.head()

Unnamed: 0,source,english_sentence,hindi_sentence,length_eng_sentence,length_hin_sentence
25520,indic2012,islam is word from arabic and it full word is ...,START_ इस्लाम शब्द अरबी भाषा का शब्द है जिसका ...,14,21
118633,ted,everything is reliant on these computers working,START_ इन कंप्यूटरों पर सब कुछ निर्भर है _END,7,9
113495,tides,parliament does not control the government,START_ संसद का सरकार पपर नियंत्रण नपहीं रहता _END,6,9
29783,tides,race equality new laws,START_ नये कानून नस्ली समानता _END,4,6
111804,tides,the provision would not affect the power of pa...,START_ व्यवसायों आदि से होने वाली आय के बारे म...,22,24


In [287]:
lines[lines['length_eng_sentence']>30].shape

(2435, 5)

In [288]:
print("maximum length of Hindi Sentence:",max(lines['length_hin_sentence']))
print("maximum length of English Sentence ",max(lines['length_eng_sentence']))

maximum length of Hindi Sentence: 314
maximum length of English Sentence  348


In [289]:
list(lines[lines['length_eng_sentence']==348]['english_sentence'])

['as with terrorism the concept of state terrorism is controversial the chairman of the united nations counterterrorism committee has stated that the committee was conscious of international conventions on the subject and none of them referred to state terrorism which was not an international legal concept if states abused their power they should be judged against international conventions dealing with war crimes international human rights and international humanitarian law former united nations secretarygeneral kofi annan has said that it is time to set aside debates on socalled state terrorism the use of force by states is already thoroughly regulated under international law however he also made clear that regardless of the differences between governments on the question of definition of terrorism what is clear and what we can all agree on is any deliberate attack on innocent civilians regardless of ones cause is unacceptable and fits into the definition of terrorismstate terrorism h

In [290]:
list(lines[lines['length_eng_sentence']==348]['hindi_sentence'])

['START_ राज्य आतंकवाद की अवधारणा विवादास्पद है राज्यों द्वारा सैन्य कार्रवाई के दौरान युद्ध आम तौर पर आतंकवाद तब भी जब वे महत्वपूर्ण नागरिक हताहत शामिल विचार नहीं कर रहे हैंअध्यक्ष ने संयुक्त राष्ट्र काउंटर के आतंकवाद समिति का मानना है कि इस समिति इस विषय पर अंतरराष्ट्रीय समझौतों के प्रति जागरूक किया गया हैं और उनमें से कोई भी नहीं है जो एक अंतरराष्ट्रीय कानूनी अवधारणा नहीं थी राज्य आतंकवाद को भेजायदि राज्यों को उनकी सत्ता का दुरुपयोग वे अंतरराष्ट्रीय सम्मेलनों से निपटने के खिलाफ न्याय किया जाना चाहिए युद्ध अपराधों war crimes अंतरराष्ट्रीय मानव अधिकार और अंतर्राष्ट्रीय मानवीय कानून international humanitarian lawपूर्व संयुक्त राष्ट्रमहासचिव secretarygeneralकोफी अन्नान कि यह कथित पर बहस अलग सेट करने के लिए समय है ने कहा है कि राज्य के आतंकवाद इस राज्यों द्वारा बल का प्रयोग use of force by states पहले से ही पूरी तरह अंतरराष्ट्रीय कानून के तहत विनियमित है हालांकि उन्होंने यह भी कहा कि चाहे आतंकवाद की परिभाषा के प्रश्न पर सरकारों के बीच के अंतर के क्या है और स्पष्ट है हम सब पर क्या सहमत कर

In [291]:
lines=lines[lines['length_eng_sentence']<=20]
lines=lines[lines['length_hin_sentence']<=20]

In [292]:
lines.shape

(16520, 5)

In [293]:
max_length_src=max(lines['length_hin_sentence'])
max_length_tar=max(lines['length_eng_sentence'])

In [294]:
# creating hindi and english vocabulary
all_eng_words=set()
for english in lines['english_sentence']:
    for word in english.split():
        if word not in all_eng_words:
            all_eng_words.add(word)
all_hindi_words=set()
for hin in lines['hindi_sentence']:
    for word in hin.split():
        if word not in all_hindi_words:
            all_hindi_words.add(word)

In [295]:
len(all_eng_words)

17047

In [296]:
len(all_hindi_words)

19333

In [386]:
input_words = sorted(list(all_eng_words))
target_words = sorted(list(all_hindi_words))

In [387]:
num_encoder_tokens = len(all_eng_words)
num_decoder_tokens = len(all_hindi_words)
num_encoder_tokens, num_decoder_tokens

(17047, 19333)

In [388]:
num_decoder_tokens+=1 #for zero padding

In [389]:
input_token_index = dict([(word, i+1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])


In [390]:
reverse_input_char_index = dict((i, word) for word, i in input_token_index.items())
reverse_target_char_index = dict((i, word) for word, i in target_token_index.items())

In [392]:
# lines = shuffle(lines)
lines.head(10)


Unnamed: 0,source,english_sentence,hindi_sentence,length_eng_sentence,length_hin_sentence
118633,ted,everything is reliant on these computers working,START_ इन कंप्यूटरों पर सब कुछ निर्भर है _END,7,9
113495,tides,parliament does not control the government,START_ संसद का सरकार पपर नियंत्रण नपहीं रहता _END,6,9
29783,tides,race equality new laws,START_ नये कानून नस्ली समानता _END,4,6
57202,ted,there was lasagna there was casseroles,START_ वहां लाजान्या था कैसेरोल थे _END,6,7
107821,indic2012,super power india source google writer vedprat...,START_ महाशक्ति भारत गूगल पुस्तक लेखक वेदप्रता...,8,9
85746,tides,each was a blow to conservatism,START_ इनमें से प्रत्येक यथास्थितिवादियों पर च...,6,10
55848,tides,the colour of the drake is black at the neck a...,START_ नर बतख का रंग गर्दन और पीठ पर काला होता...,12,13
103460,ted,in the mathare valley slums,START_ माथेरा घाटी की झुग्गियों में। _END,5,7
91981,ted,the second time was a procedure that involved ...,START_ दूसरी बार के उपचार में बेहोश करने की आव...,10,12
101360,indic2012,uttarpradesh a mirrorlive hindustan,START_ उत्तर प्रदेश एक आईना लाइव हिन्दुस्तान _END,4,8


In [303]:
# encoding_data=list(lines['english_sentence'])
# decoding_data=list(lines['hindi_sentence'])

In [304]:
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences

In [305]:
# max_len=20
# trunc_dim='post'
# oov_token="<OOV>"

In [306]:
# tokenizer_eng=Tokenizer(num_words=num_encoder_tokens+1)
# tokenizer_eng.fit_on_texts(encoding_data)
# word_index_eng=tokenizer_eng.word_index
# input_token=tokenizer_eng.texts_to_sequences(encoding_data)
# padded_input_token=pad_sequences(sequences=input_token,maxlen=max_len,padding=trunc_dim)

In [393]:
# padded_input_token.shape

In [394]:
# (word_index_eng)

In [395]:
# tokenizer_hindi=Tokenizer(num_words=num_decoder_tokens+1)
# tokenizer_hindi.fit_on_texts(decoding_data)
# word_index_hindi=tokenizer_hindi.word_index
# target_token=tokenizer_hindi.texts_to_sequences(decoding_data)
# padded_output_token=pad_sequences(sequences=target_token,maxlen=max_len,padding=trunc_dim)

In [396]:
# padded_output_token[0]

In [397]:
# len(word_index_hindi)

In [398]:
# index_to_word_hindi[19333]

In [399]:
# index_to_word_hindi=tokenizer_hindi.index_word
# index_to_word_eng=tokenizer_eng.index_word

In [401]:
from sklearn.model_selection import train_test_split
X, y =lines['english_sentence'],lines['hindi_sentence']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,random_state=42)
X_train.shape, X_test.shape

((13216,), (3304,))

In [402]:
X_train

39389          and im earning more than the footballers wow
124244    yamuna joins ganga at allahabad from the left ...
114346                          i have spent my entire life
88938     to some extent farooq is nervous about the ong...
2249      he had all the weaknesses of a normal human be...
                                ...                        
19102      okay so lets have a look at a little bit of data
51342     it has also been recommended as a core element...
20089                darwin also has a lot of other talents
40877                                the human being played
44343                             i want to understand them
Name: english_sentence, Length: 13216, dtype: object

In [383]:
# def genrate_batch(X=X_train,y=y_train,batch_size=128):
#     while True:
#         for j in range(0, len(X), batch_size):
#             input_batch=[]
#             output_batch=[]
#             decoder_target_data = np.zeros((batch_size, 20, num_decoder_tokens+1),dtype='float32')
#             for i, (input_text, output_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
#                 output_t=output_text.copy()
#                 for t,token in enumerate(output_text):
#                     if token==0.0:
#                         output_t[t-1]=0.0
#                     if t>0:
#                         decoder_target_data[i,t-1,token]=1
#                 input_batch.append(list(input_text))
#                 output_batch.append(list(output_t))
#             yield ([np.asarray(input_batch),np.asarray(output_batch)],decoder_target_data)

In [404]:
X_train.to_pickle('X_train.pkl')
X_test.to_pickle('X_test.pkl')

In [451]:
def generate_batch(X = X_train, y = y_train, batch_size = 128):
    ''' Generate a batch of data '''
    while True:
        for j in range(0, len(X), batch_size):
            encoder_input_data = np.zeros((batch_size, max_length_src),dtype='float32')
            decoder_input_data = np.zeros((batch_size, max_length_tar),dtype='float32')
            decoder_target_data = np.zeros((batch_size, max_length_tar, num_decoder_tokens),dtype='float32')
            for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = input_token_index[word] # encoder input seq
                for t, word in enumerate(target_text.split()):
                    if t<len(target_text.split())-1:
                        decoder_input_data[i, t] = target_token_index[word] # decoder input seq
                    if t>0:
                        # decoder target sequence (one hot encoded)
                        # does not include the START_ token
                        # Offset by one timestep
                        decoder_target_data[i, t - 1, target_token_index[word]] = 1.
            yield([encoder_input_data, decoder_input_data], decoder_target_data)

In [426]:
from tensorflow.keras.layers import Input,LSTM,Embedding,Dense

In [427]:
latent_dim=300

In [428]:
encoder_inputs=Input(shape=(None,))
enc_emb =  Embedding(num_encoder_tokens, latent_dim, mask_zero = True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]

In [429]:
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(num_decoder_tokens, latent_dim, mask_zero = True)
dec_emb = dec_emb_layer(decoder_inputs)
#
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [430]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [431]:
model.summary()

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_9 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_10 (InputLayer)          [(None, None)]       0           []                               
                                                                                                  
 embedding_8 (Embedding)        (None, None, 300)    5114100     ['input_9[0][0]']                
                                                                                                  
 embedding_9 (Embedding)        (None, None, 300)    5800200     ['input_10[0][0]']               
                                                                                            

In [435]:
train_samples = len(X_train)
val_samples = len(X_test)
batch_size = 128
epochs = 100
# len(X_train)/128

In [452]:
model.fit_generator(generator = generate_batch(X_train, y_train, batch_size = batch_size),
                    steps_per_epoch = train_samples//batch_size,
                    epochs=epochs,
                    validation_data = generate_batch(X_test, y_test, batch_size = batch_size),
                    validation_steps = val_samples//batch_size)

Epoch 1/100


  model.fit_generator(generator = generate_batch(X_train, y_train, batch_size = batch_size),


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x268706b0940>

In [453]:
# model.save_weights('nmt_weights.h5')
model.load_weights('nmt_weights.h5')

In [454]:
encoder_model = Model(encoder_inputs, encoder_states)
# Decoder setup
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2= dec_emb_layer(decoder_inputs) # Get the embeddings of the decoder sequence

# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2) # A dense softmax layer to generate prob dist. over the target vocabulary

# Final decoder model
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)


In [455]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0] = target_token_index['START_']

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += ' '+sampled_char
        if (sampled_char == '_END' or
           len(decoded_sentence) > 50):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence

In [476]:
train_gen = generate_batch(X_train, y_train, batch_size = 1)
k=-1
test_gen=generate_batch(np.asarray(["everything in these computers are really good"]),y_train,batch_size=1)
(q,_),_=next(test_gen)
q

array([[ 5086.,  7232., 15303.,  3035.,   866., 12354.,  6222.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.]], dtype=float32)

In [457]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sentence[:-4])

Input English sentence: and im earning more than the footballers wow
Actual Hindi Translation:  और मैं फूटबाल खिलाडियों से ज्यादा कम रहा हूँ वाह 
Predicted Hindi Translation:  और मैं फूटबाल खिलाडियों से ज्यादा कम रहा हूँ वाह 


In [458]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sentence[:-4])

Input English sentence: yamuna joins ganga at allahabad from the left side
Actual Hindi Translation:  यमुना इलाहाबाद के निकट बायीं ओर से गंगा नदी में जा मिलती है। 
Predicted Hindi Translation:  यमुना इलाहाबाद के निकट बायीं ओर से गंगा नदी मे


In [459]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sentence[:-4])

Input English sentence: i have spent my entire life
Actual Hindi Translation:  मेरी सारी उम्र बीती है 
Predicted Hindi Translation:  मेरी सारी उम्र बीती है 


In [460]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sentence[:-4])

Input English sentence: to some extent farooq is nervous about the ongoing secret talks between the hurriyat and delhi s nonofficial negotiators
Actual Hindi Translation:  फारूक ह्र्रियत और दिल्ली के गैरसरकारी वार्ताकारों के बीच चलती बातचीत से कुछ बेचैन हैं 
Predicted Hindi Translation:  फारूक ह्र्रियत और दिल्ली के गैरसरकारी वार्ताकारो


In [486]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sentence[:])

Input English sentence: it was chic to be left wear kurtas and sport angst as an unshaven commitment
Actual Hindi Translation:  कुर्ता पहनना दाढी रखना और आक्रोश जताना पक्की प्रतिबद्धता की निशानी थी 
Predicted Hindi Translation:  कुर्ता पहनना दाढी रखना और आक्रोश जताना पक्की प्रतिबद्धता


In [485]:

(input_seq, actual_output), _ = next(test_gen)
decoded_sentence = decode_sequence(input_seq)
# print('Input English sentence:', X_train[k:k+1].values[0])
# print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sentence[:-4])

Predicted Hindi Translation:  ये प्रचार है जैसे कि हर क्षेत्रों में क्या ले 
