In [1]:
# importing the library
import re
import os
import sys
import string
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [2]:
np.set_printoptions(threshold=sys.maxsize)

In [3]:
data = pd.read_csv("./Hindi_English_Truncated_Corpus.csv", encoding = "UTF-8")

In [4]:
data.head()

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,indic2012,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,indic2012,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


In [5]:
data.describe()

Unnamed: 0,source,english_sentence,hindi_sentence
count,127607,127605,127607
unique,3,124317,97662
top,tides,(Laughter),(हँसी)
freq,50000,555,212


In [6]:
data.count()

source              127607
english_sentence    127605
hindi_sentence      127607
dtype: int64

In [7]:
pd.isnull(data).sum()

source              0
english_sentence    2
hindi_sentence      0
dtype: int64

In [8]:
data = data.dropna()

In [9]:
pd.isnull(data).sum()

source              0
english_sentence    0
hindi_sentence      0
dtype: int64

In [10]:
data.drop_duplicates(inplace=True)

In [11]:
data = data[data['source']=='ted']

In [12]:
data = data.sample(n=25000,random_state=42)
data.shape

(25000, 3)

In [13]:
data.head()

Unnamed: 0,source,english_sentence,hindi_sentence
82040,ted,"We still don't know who her parents are, who s...",हम अभी तक नहीं जानते हैं कि उसके माता-पिता कौन...
85038,ted,"no keyboard,","कोई कुंजीपटल नहीं,"
58018,ted,"But as far as being a performer,",लेकिन एक कलाकार होने के साथ
74470,ted,"And this particular balloon,","और यह खास गुब्बारा,"
122330,ted,and it's not as hard as you think. Integrate c...,"और जितना आपको लगता है, यह उतना कठिन नहीं है.अप..."


In [14]:
data["english_sentence"] = data["english_sentence"].apply(lambda x : x.lower())

In [15]:
data["hindi_sentence"]   = data["hindi_sentence"].apply(lambda x : x.lower())

In [16]:
data["english_sentence"] = data["english_sentence"].apply(lambda x : re.sub("'", '', x))

In [17]:
data["hindi_sentence"]   = data["hindi_sentence"].apply(lambda x :  re.sub("'", '', x))

In [18]:
exclude = set(string.punctuation)
data['english_sentence'] = data['english_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
data['hindi_sentence']   = data['hindi_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

In [19]:
data["english_sentence"] = data["english_sentence"].apply(lambda x : re.sub("\d+", "", x))

In [20]:
data["hindi_sentence"]   = data["hindi_sentence"].apply(lambda x : re.sub("\d+", "", x))

In [21]:
data['hindi_sentence'] = data['hindi_sentence'].apply(lambda x: re.sub("[२३०८१५७९४६]", "", x))

In [22]:
data['english_sentence'] = data['english_sentence'].apply(lambda x: x.strip())
data['hindi_sentence']   = data['hindi_sentence'].apply(lambda x: x.strip())
data['english_sentence'] = data['english_sentence'].apply(lambda x: re.sub(" +", " ", x))
data['hindi_sentence']   = data['hindi_sentence'].apply(lambda x: re.sub(" +", " ", x))

In [23]:
data["english_sentence"]

82040     we still dont know who her parents are who she is
85038                                           no keyboard
58018                       but as far as being a performer
74470                           and this particular balloon
122330    and its not as hard as you think integrate cli...
                                ...                        
49566     using either image recognition or marker techn...
118399           and theyve started doing dna tests on kids
20473                  so there is not a lot of competition
20729                    a woman with indefatigable stamina
91889                  and you say “how about eight oclock”
Name: english_sentence, Length: 25000, dtype: object

In [24]:
data["hindi_sentence"]

82040     हम अभी तक नहीं जानते हैं कि उसके मातापिता कौन ...
85038                                     कोई कुंजीपटल नहीं
58018                           लेकिन एक कलाकार होने के साथ
74470                                    और यह खास गुब्बारा
122330    और जितना आपको लगता है यह उतना कठिन नहीं हैअपने...
                                ...                        
49566     छवि मान्यता या मार्कर प्रौद्योगिकी का इस्तेमाल...
118399     और उन्होंने बच्चो पर dna परीक्षण शुरू कर दिये है
20473                         तो ज्यादा प्रतियोगिता नहीं है
20729                                  एक अजेय बलवाली महिला
91889                            और पूछें आठ बजे कैसा रहेगा
Name: hindi_sentence, Length: 25000, dtype: object

In [25]:
data['hindi_sentence'] = data['hindi_sentence'].apply(lambda x : 'START_ '+ x + ' _END')

In [26]:
data.head()

Unnamed: 0,source,english_sentence,hindi_sentence
82040,ted,we still dont know who her parents are who she is,START_ हम अभी तक नहीं जानते हैं कि उसके मातापि...
85038,ted,no keyboard,START_ कोई कुंजीपटल नहीं _END
58018,ted,but as far as being a performer,START_ लेकिन एक कलाकार होने के साथ _END
74470,ted,and this particular balloon,START_ और यह खास गुब्बारा _END
122330,ted,and its not as hard as you think integrate cli...,START_ और जितना आपको लगता है यह उतना कठिन नहीं...


In [27]:
data.to_excel("pre-processed_data.xls", encoding="UTF-8")

In [28]:
### Get English and Hindi Vocabulary
all_eng_words=set()
for eng in data['english_sentence']:
    for word in eng.split():
        if word not in all_eng_words:
            all_eng_words.add(word)

all_hindi_words=set()
for hin in data['hindi_sentence']:
    for word in hin.split():
        if word not in all_hindi_words:
            all_hindi_words.add(word)

In [29]:
len(all_eng_words)

14030

In [30]:
len(all_hindi_words)

17540

In [31]:
data['length_eng_sentence']=data['english_sentence'].apply(lambda x:len(x.split(" ")))
data['length_hin_sentence']=data['hindi_sentence'].apply(lambda x:len(x.split(" ")))

In [32]:
data.head()

Unnamed: 0,source,english_sentence,hindi_sentence,length_eng_sentence,length_hin_sentence
82040,ted,we still dont know who her parents are who she is,START_ हम अभी तक नहीं जानते हैं कि उसके मातापि...,11,16
85038,ted,no keyboard,START_ कोई कुंजीपटल नहीं _END,2,5
58018,ted,but as far as being a performer,START_ लेकिन एक कलाकार होने के साथ _END,7,8
74470,ted,and this particular balloon,START_ और यह खास गुब्बारा _END,4,6
122330,ted,and its not as hard as you think integrate cli...,START_ और जितना आपको लगता है यह उतना कठिन नहीं...,16,20


In [33]:
data=data[data['length_eng_sentence']<=20]
data=data[data['length_hin_sentence']<=20]

In [34]:
print("maximum length of Hindi Sentence ",max(data['length_hin_sentence']))
print("maximum length of English Sentence ",max(data['length_eng_sentence']))

maximum length of Hindi Sentence  20
maximum length of English Sentence  20


In [35]:
max_length_src=max(data['length_hin_sentence'])
max_length_tar=max(data['length_eng_sentence'])

In [36]:
input_words = sorted(list(all_eng_words))
target_words = sorted(list(all_hindi_words))
num_encoder_tokens = len(all_eng_words)
num_decoder_tokens = len(all_hindi_words)
num_encoder_tokens, num_decoder_tokens

(14030, 17540)

In [37]:
input_token_index = dict([(word, i+1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])

In [38]:
reverse_input_index = dict((i, word) for word, i in input_token_index.items())
reverse_target_index = dict((i, word) for word, i in target_token_index.items())

In [39]:
input_token_index, reverse_input_index

({'a': 1,
  'aaaahhh': 2,
  'aaron': 3,
  'ab': 4,
  'abacha': 5,
  'abandoned': 6,
  'abc': 7,
  'abducting': 8,
  'abdul': 9,
  'abe': 10,
  'abhishek': 11,
  'abilities': 12,
  'ability': 13,
  'abject': 14,
  'able': 15,
  'abnormality': 16,
  'abode': 17,
  'abolished': 18,
  'abololo': 19,
  'about': 20,
  'about”': 21,
  'above': 22,
  'abraham': 23,
  'abrahams': 24,
  'abroad': 25,
  'absence': 26,
  'absent': 27,
  'absolute': 28,
  'absolutely': 29,
  'absoluteness': 30,
  'absolution': 31,
  'absorb': 32,
  'absorbing': 33,
  'abstract': 34,
  'absurd': 35,
  'abu': 36,
  'abundance': 37,
  'abundant': 38,
  'abuse': 39,
  'abused': 40,
  'abusers': 41,
  'abusing': 42,
  'academic': 43,
  'academically': 44,
  'academy': 45,
  'accede': 46,
  'accelerate': 47,
  'accelerating': 48,
  'acceleration': 49,
  'accelerator': 50,
  'accelerometers': 51,
  'accent': 52,
  'accentuate': 53,
  'accenture': 54,
  'accept': 55,
  'acceptable': 56,
  'acceptance': 57,
  'accepted': 58

In [40]:
target_token_index, reverse_target_index

({'START_': 1,
  '_END': 2,
  'a': 3,
  'ab': 4,
  'abololo': 5,
  'accelerometer': 6,
  'ackermann': 7,
  'actuated': 8,
  'africa': 9,
  'aids': 10,
  'air': 11,
  'alain': 12,
  'and': 13,
  'ap': 14,
  'aproch': 15,
  'argentina': 16,
  'arthur': 17,
  'articulated': 18,
  'as': 19,
  'atm': 20,
  'atomic': 21,
  'attention': 22,
  'augustine': 23,
  'authenticity': 24,
  'awesomethingscom': 25,
  'b': 26,
  'ban': 27,
  'batman': 28,
  'behavior': 29,
  'bg': 30,
  'bhabha': 31,
  'biotechnology': 32,
  'blackberry': 33,
  'blind': 34,
  'blinding”': 35,
  'bollingbroke': 36,
  'botton': 37,
  'bovary': 38,
  'box': 39,
  'bridge': 40,
  'browse': 41,
  'buggy': 42,
  'buick': 43,
  'burg': 44,
  'c': 45,
  'ca': 46,
  'cablesuspended': 47,
  'cdc': 48,
  'centre': 49,
  'character': 50,
  'chimera': 51,
  'christ': 52,
  'cindy': 53,
  'clarke': 54,
  'click': 55,
  'climatecrisisnet': 56,
  'climber': 57,
  'cloud': 58,
  'co': 59,
  'commons': 60,
  'corporation': 61,
  'd': 62

In [41]:
data_new = data[["english_sentence", "hindi_sentence"]]

In [42]:
data_new

Unnamed: 0,english_sentence,hindi_sentence
82040,we still dont know who her parents are who she is,START_ हम अभी तक नहीं जानते हैं कि उसके मातापि...
85038,no keyboard,START_ कोई कुंजीपटल नहीं _END
58018,but as far as being a performer,START_ लेकिन एक कलाकार होने के साथ _END
74470,and this particular balloon,START_ और यह खास गुब्बारा _END
122330,and its not as hard as you think integrate cli...,START_ और जितना आपको लगता है यह उतना कठिन नहीं...
...,...,...
49566,using either image recognition or marker techn...,START_ छवि मान्यता या मार्कर प्रौद्योगिकी का इ...
118399,and theyve started doing dna tests on kids,START_ और उन्होंने बच्चो पर dna परीक्षण शुरू क...
20473,so there is not a lot of competition,START_ तो ज्यादा प्रतियोगिता नहीं है _END
20729,a woman with indefatigable stamina,START_ एक अजेय बलवाली महिला _END


In [43]:
X, y = data['english_sentence'], data['hindi_sentence']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,random_state=42)
X_train.shape, X_test.shape

((19819,), (4955,))

In [44]:
# BUFFER_SIZE = len(X_train)
# BATCH_SIZE = 64
# dataset_train = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(BUFFER_SIZE)
# dataset_train = dataset_train.batch(BATCH_SIZE, drop_remainder=True)

In [45]:
# BUFFER_SIZE = len(X_test)
# BATCH_SIZE = 64
# dataset_test = tf.data.Dataset.from_tensor_slices((X_test, y_test)).shuffle(BUFFER_SIZE)
# dataset_test = dataset_test.batch(BATCH_SIZE, drop_remainder=True)

In [46]:
def generate_batch(X = X_train, y = y_train, batch_size = 128):
    ''' Generate a batch of data '''
    while True:
        for j in range(0, len(X), batch_size):
            encoder_input_data = np.zeros((batch_size, max_length_src),dtype='float32')
            decoder_input_data = np.zeros((batch_size, max_length_tar),dtype='float32')
            decoder_target_data = np.zeros((batch_size, max_length_tar, num_decoder_tokens),dtype='float32')
            for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = input_token_index[word] # encoder input seq
                for t, word in enumerate(target_text.split()):
                    if t<len(target_text.split())-1:
                        decoder_input_data[i, t] = target_token_index[word] # decoder input seq
                    if t>0:
                        # decoder target sequence (one hot encoded)
                        # does not include the START_ token
                        # Offset by one timestep
                        decoder_target_data[i, t - 1, target_token_index[word]] = 1.
            yield([encoder_input_data, decoder_input_data], decoder_target_data)

In [47]:
latent_dim=300

In [48]:
# Encoder
encoder_inputs = tf.keras.layers.Input(shape=(None,))
enc_emb =  tf.keras.layers.Embedding(num_encoder_tokens, latent_dim, mask_zero = True)(encoder_inputs)
encoder_lstm = tf.keras.layers.LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

In [51]:
# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = tf.keras.layers.Input(shape=(None,))
dec_emb_layer = tf.keras.layers.Embedding(num_decoder_tokens, latent_dim, mask_zero = True)
dec_emb = dec_emb_layer(decoder_inputs)
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = tf.keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb,
                                     initial_state=encoder_states)
decoder_dense = tf.keras.layers.Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = tf.keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [52]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [53]:
model.summary()

Model: "functional_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 300)    4209000     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 300)    5262000     input_3[0][0]                    
_______________________________________________________________________________________

In [54]:
train_samples = len(X_train)
val_samples = len(X_test)
batch_size = 128
epochs = 10

In [55]:
model.fit_generator(generator = generate_batch(X_train, y_train, batch_size = batch_size),
                    steps_per_epoch = train_samples//batch_size,
                    epochs=epochs,
                    validation_data = generate_batch(X_test, y_test, batch_size = batch_size),
                    validation_steps = val_samples//batch_size)

Instructions for updating:
Please use Model.fit, which supports generators.
Epoch 1/10
  1/154 [..............................] - ETA: 0s - loss: 4.8136

InvalidArgumentError:  indices[22,0] = 14030 is not in [0, 14030)
	 [[node functional_5/embedding/embedding_lookup (defined at <ipython-input-55-b22c3ec5e69e>:1) ]] [Op:__inference_train_function_15706]

Errors may have originated from an input operation.
Input Source operations connected to node functional_5/embedding/embedding_lookup:
 functional_5/embedding/embedding_lookup/10564 (defined at C:\ProgramData\Anaconda3\lib\contextlib.py:113)

Function call stack:
train_function
