In [1]:
import pandas as pd
import numpy as np

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import ModelCheckpoint

import re
import string
from string import digits

In [2]:
# Parallel English/Hindi corpus.
# NOTE(review): absolute, machine-specific path; the notebook cannot run elsewhere without editing it.
DATA_PATH = r"C:\Users\gufra\Desktop\Work\Projects\NLP\MachineTranslation\data\Hindi_English_Truncated_Corpus.csv"

df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,indic2012,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,indic2012,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


In [3]:
# Drop rows missing either side of the sentence pair (later cells call str
# methods on both columns), then remove exact duplicate rows.
# Chaining returns a fresh frame instead of mutating a filtered slice in place.
df = (
    df.dropna(subset=['english_sentence', 'hindi_sentence'])
      .drop_duplicates()
)

In [4]:
# Reproducible 30,000-row subsample. reset_index() renumbers the rows and
# keeps the previous labels in a new "index" column.
df = df.sample(n=30000, random_state=42).reset_index()
df.head()

Unnamed: 0,index,source,english_sentence,hindi_sentence
0,25520,indic2012,Islam is word from arabic and it full word is ...,इस्लाम शब्द अरबी भाषा का शब्द है जिसका मूल शब्...
1,118633,ted,Everything is reliant on these computers working.,इन कंप्यूटरों पर सब कुछ निर्भर है .
2,113495,tides,Parliament does not control the government .,संसद का सरकार पपर नियंत्रण नपहीं रहता .
3,29783,tides,Race equality New laws,नये कानून नस्ली समानता
4,111804,tides,The provision would not affect the power of Pa...,व्यवसायों आदि से होने वाली आय के बारे में विधि...


In [5]:
# Keep only the two sentence columns.
df = df.drop(columns=["index", "source"])

In [6]:
# Preview the frame after dropping the "index" and "source" columns.
df.head()

Unnamed: 0,english_sentence,hindi_sentence
0,Islam is word from arabic and it full word is ...,इस्लाम शब्द अरबी भाषा का शब्द है जिसका मूल शब्...
1,Everything is reliant on these computers working.,इन कंप्यूटरों पर सब कुछ निर्भर है .
2,Parliament does not control the government .,संसद का सरकार पपर नियंत्रण नपहीं रहता .
3,Race equality New laws,नये कानून नस्ली समानता
4,The provision would not affect the power of Pa...,व्यवसायों आदि से होने वाली आय के बारे में विधि...


In [7]:
# Lower-case every sentence on both sides of the corpus.
for column in ('english_sentence', 'hindi_sentence'):
    df[column] = df[column].map(str.lower)

In [8]:
# Delete apostrophes from both columns.
apostrophe = re.compile("'")
for column in ('english_sentence', 'hindi_sentence'):
    df[column] = df[column].map(lambda text: apostrophe.sub('', text))

In [9]:
# Characters to strip: the ASCII punctuation listed in string.punctuation.
# NOTE(review): this set is ASCII-only, so non-ASCII punctuation in the Hindi
# text is left in place.
exclude = set(string.punctuation)


def _drop_excluded(text):
    """Return `text` with every character found in `exclude` removed."""
    return ''.join(filter(lambda ch: ch not in exclude, text))


for column in ('english_sentence', 'hindi_sentence'):
    df[column] = df[column].map(_drop_excluded)

In [10]:
# Translation table that deletes the ASCII digits 0-9.
remove_digits = str.maketrans('', '', digits)
for column in ('english_sentence', 'hindi_sentence'):
    df[column] = df[column].map(lambda text: text.translate(remove_digits))

# Devanagari digits are not covered by the ASCII table above; remove them from the Hindi side.
df['hindi_sentence'] = df['hindi_sentence'].map(lambda text: re.sub("[२३०८१५७९४६]", "", text))

# Trim surrounding whitespace, then collapse runs of spaces into a single space.
for column in ('english_sentence', 'hindi_sentence'):
    df[column] = df[column].map(lambda text: re.sub(" +", " ", text.strip()))

In [11]:
# Wrap every target sentence in START_ / _END marker words.
df['hindi_sentence'] = df['hindi_sentence'].map(lambda sentence: f'START_ {sentence} _END')

In [12]:
# Preview the cleaned sentences with START_/_END markers on the Hindi side.
df.head()

Unnamed: 0,english_sentence,hindi_sentence
0,islam is word from arabic and it full word is ...,START_ इस्लाम शब्द अरबी भाषा का शब्द है जिसका ...
1,everything is reliant on these computers working,START_ इन कंप्यूटरों पर सब कुछ निर्भर है _END
2,parliament does not control the government,START_ संसद का सरकार पपर नियंत्रण नपहीं रहता _END
3,race equality new laws,START_ नये कानून नस्ली समानता _END
4,the provision would not affect the power of pa...,START_ व्यवसायों आदि से होने वाली आय के बारे म...


In [13]:
# Collect the distinct space-separated words on each side of the corpus.
eng_vocab = set()
hindi_vocab = set()

for eng_text, hindi_text in zip(df.english_sentence, df.hindi_sentence):
    eng_vocab.update(eng_text.split(" "))
    hindi_vocab.update(hindi_text.split(" "))

len(eng_vocab), len(hindi_vocab)

(34219, 40734)

In [14]:
def _token_count(sentence):
    """Number of pieces obtained by splitting `sentence` on single spaces."""
    return len(sentence.split(" "))


# Hindi counts include the START_ and _END markers added earlier.
df['length_eng_sentence'] = df['english_sentence'].map(_token_count)
df['length_hin_sentence'] = df['hindi_sentence'].map(_token_count)

In [15]:
# Preview the frame with the two new length columns.
df.head()

Unnamed: 0,english_sentence,hindi_sentence,length_eng_sentence,length_hin_sentence
0,islam is word from arabic and it full word is ...,START_ इस्लाम शब्द अरबी भाषा का शब्द है जिसका ...,14,21
1,everything is reliant on these computers working,START_ इन कंप्यूटरों पर सब कुछ निर्भर है _END,7,9
2,parliament does not control the government,START_ संसद का सरकार पपर नियंत्रण नपहीं रहता _END,6,9
3,race equality new laws,START_ नये कानून नस्ली समानता _END,4,6
4,the provision would not affect the power of pa...,START_ व्यवसायों आदि से होने वाली आय के बारे म...,22,24


In [16]:
# Shape of the subset whose English sentence has more than 30 words.
df.loc[df['length_eng_sentence'].gt(30)].shape

(2924, 4)

In [17]:
# Size of the frame before filtering by sentence length.
df.shape

(30000, 4)

In [18]:
# Upper bound on words per sentence; used by the next cell to filter both languages.
max_sentence_length = 20

In [19]:
# Keep only pairs where BOTH sentences are at most max_sentence_length words.
within_limit = (
    (df['length_eng_sentence'] <= max_sentence_length)
    & (df['length_hin_sentence'] <= max_sentence_length)
)
# .copy() makes the result an independent frame: later cells assign new
# values into its columns, which must not be writes into a slice of the
# unfiltered frame.
df = df[within_limit].copy()

In [20]:
# Longest remaining sentence on each side; these set the padded widths of the
# encoder/decoder arrays built in generate_batch. Each maximum is read from
# its own language's length column (the Hindi length includes START_/_END).
max_length_eng = int(df['length_eng_sentence'].max())
max_length_hindi = int(df['length_hin_sentence'].max())

In [21]:
# Size of the frame after the length filter.
df.shape

(19829, 4)

In [22]:
# Fit a Keras Tokenizer on the English sentences and replace each sentence
# with its list of integer word ids.
# NOTE: this overwrites the text column, so the cell is not idempotent —
# re-running it requires re-running the cleaning cells first.
tokenizere = Tokenizer()
tokenizere.fit_on_texts(df['english_sentence'])
df['english_sentence'] = tokenizere.texts_to_sequences(df['english_sentence'])

In [23]:
# Same procedure for the Hindi sentences, with a separate tokenizer.
# NOTE(review): with default Tokenizer settings the text is lower-cased and
# "_" is filtered, so the START_/_END markers are presumably indexed as
# "start"/"end" — confirm via tokenizerh.word_index before decoding.
tokenizerh = Tokenizer()
tokenizerh.fit_on_texts(df['hindi_sentence'])
df['hindi_sentence'] = tokenizerh.texts_to_sequences(df['hindi_sentence'])

In [24]:
# Preview: both sentence columns now hold lists of integer token ids.
df.head()

Unnamed: 0,english_sentence,hindi_sentence,length_eng_sentence,length_hin_sentence
1,"[426, 7, 8444, 16, 42, 1415, 292]","[1, 73, 5013, 15, 102, 34, 630, 5, 2]",7,9
2,"[310, 166, 24, 460, 1, 106]","[1, 398, 9, 129, 10260, 545, 10261, 611, 2]",6,9
3,"[1670, 2685, 117, 1322]","[1, 631, 834, 6720, 3484, 2]",4,6
6,"[28, 11, 8445, 28, 11, 8446]","[1, 351, 10262, 27, 10263, 41, 2]",6,7
7,"[2686, 192, 46, 614, 337, 642, 8447, 8448]","[1, 4101, 44, 340, 434, 1068, 6721, 674, 2]",8,9


In [25]:
# Turn each per-row list of token ids into a NumPy array.
for column in ('english_sentence', 'hindi_sentence'):
    df[column] = df[column].apply(np.array)

In [26]:
# Preview after converting the id lists to NumPy arrays.
df.head()

Unnamed: 0,english_sentence,hindi_sentence,length_eng_sentence,length_hin_sentence
1,"[426, 7, 8444, 16, 42, 1415, 292]","[1, 73, 5013, 15, 102, 34, 630, 5, 2]",7,9
2,"[310, 166, 24, 460, 1, 106]","[1, 398, 9, 129, 10260, 545, 10261, 611, 2]",6,9
3,"[1670, 2685, 117, 1322]","[1, 631, 834, 6720, 3484, 2]",4,6
6,"[28, 11, 8445, 28, 11, 8446]","[1, 351, 10262, 27, 10263, 41, 2]",6,7
7,"[2686, 192, 46, 614, 337, 642, 8447, 8448]","[1, 4101, 44, 340, 434, 1068, 6721, 674, 2]",8,9


In [27]:
# Source sequences are the inputs, target sequences the labels; hold out 20% for evaluation.
X = df['english_sentence']
y = df['hindi_sentence']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train.shape, X_test.shape

((15863,), (3966,))

In [28]:
# reset_index() on each Series yields a DataFrame with the old labels in an
# "index" column next to the sequence column.
X_train, X_test, y_train, y_test = (
    split.reset_index() for split in (X_train, X_test, y_train, y_test)
)

In [29]:
# Discard the old row labels, leaving single-column frames of sequences.
X_train = X_train.drop(columns=["index"])
X_test = X_test.drop(columns=["index"])
y_train = y_train.drop(columns=["index"])
y_test = y_test.drop(columns=["index"])

In [30]:
# Convert each single-column frame to a NumPy array (one row per sample).
X_train, y_train, X_test, y_test = map(
    np.array, (X_train, y_train, X_test, y_test)
)

In [31]:
batch_size = 32

# Trim every split to a whole number of batches.
# The cut-off is computed as an explicit length: slicing with
# `[:-(n % batch_size)]` turns into `[:-0]` (an EMPTY array) whenever the
# split is already an exact multiple of batch_size.
n_train = len(X_train) - len(X_train) % batch_size
n_test = len(X_test) - len(X_test) % batch_size

X_train, y_train = X_train[:n_train], y_train[:n_train]
X_test, y_test = X_test[:n_test], y_test[:n_test]

In [32]:
# Sanity check: the training set should now divide evenly into batches (expect 0).
len(X_train)%batch_size

0

In [33]:
def generate_batch(X_train, y_train, batch_size = 8,
                   max_len_src=None, max_len_tgt=None, num_target_tokens=None):
    """Endlessly yield padded batches for teacher-forced seq2seq training.

    Parameters
    ----------
    X_train, y_train : array-like, indexed as ``arr[i][0]``
        Row ``i`` holds one integer token-id sequence in its first column
        (source sequences in ``X_train``, target sequences in ``y_train``).
    batch_size : int
        Number of samples per yielded batch. The final batch of a pass may be
        smaller when the data length is not a multiple of ``batch_size``.
    max_len_src, max_len_tgt : int, optional
        Padded widths of the encoder / decoder inputs. Default to the
        notebook-level ``max_length_eng`` / ``max_length_hindi``.
    num_target_tokens : int, optional
        Size of the one-hot axis of the decoder target. Defaults to
        ``len(hindi_vocab)``.

    Yields
    ------
    ([encoder_x, decoder_x], decoder_y)
        ``encoder_x``: (rows, max_len_src) zero-padded source ids.
        ``decoder_x``: (rows, max_len_tgt) zero-padded target ids.
        ``decoder_y``: (rows, max_len_tgt, num_target_tokens) one-hot targets,
        shifted one step ahead of ``decoder_x``.

    Raises
    ------
    ValueError
        If ``X_train`` is empty (the loop would otherwise never yield), or if
        a sequence is longer than its padded width.
    """
    # Fall back to the notebook globals only when a size is not given, so the
    # function can also be used (and tested) without them.
    if max_len_src is None:
        max_len_src = max_length_eng
    if max_len_tgt is None:
        max_len_tgt = max_length_hindi
    if num_target_tokens is None:
        num_target_tokens = len(hindi_vocab)

    n_samples = len(X_train)
    if n_samples == 0:
        raise ValueError("generate_batch needs at least one sample")

    while True:
        for start in range(0, n_samples, batch_size):
            batchx = X_train[start:start + batch_size]
            batchy = y_train[start:start + batch_size]
            # Size the arrays by the rows actually present so a trailing
            # partial batch does not index past the end of the slice.
            rows = len(batchx)

            encoder_x = np.zeros((rows, max_len_src), dtype='float32')
            decoder_x = np.zeros((rows, max_len_tgt), dtype='float32')
            decoder_y = np.zeros((rows, max_len_tgt, num_target_tokens), dtype='float32')

            for j in range(rows):
                source = batchx[j][0]
                encoder_x[j, :len(source)] = source

                target = batchy[j][0]
                decoder_x[j, :len(target)] = target

                # Target at step k-1 is the token the decoder receives at step
                # k: the decoder learns to predict the next token.
                for k in range(1, len(target)):
                    decoder_y[j, k-1, int(target[k])] = 1
            yield([encoder_x, decoder_x], decoder_y)

----
Model

In [34]:
# Width shared by the embedding vectors and the LSTM state.
latent_dim=300

# Encoder: variable-length sequence of token ids -> embeddings -> LSTM.
# mask_zero=True makes id 0 (the padding value written by generate_batch) a masked position.
# NOTE(review): the embedding's input_dim comes from the whitespace-split
# vocabulary set, not from tokenizere.word_index — confirm every token id is
# smaller than len(eng_vocab).
encoder_input = Input(shape=[None,])
encoder_embeddings = Embedding(len(eng_vocab), latent_dim, mask_zero = True)(encoder_input)
encoder_outputs, eh, ec = LSTM(latent_dim, return_state=True)(encoder_embeddings)

# Only the final hidden/cell states are handed to the decoder; encoder_outputs is not used in this notebook section.
encoder_states = [eh,ec]

In [35]:
# Decoder input: variable-length sequence of target token ids.
decoder_input = Input(shape=[None,])

# The embedding, LSTM and dense layers are kept in named variables because
# the inference-time decoder model reuses the same layer objects.
decoder_embedder = Embedding(len(hindi_vocab), latent_dim, mask_zero = True)
decoder_embeddings = decoder_embedder(decoder_input)

# The decoder LSTM starts from the encoder's final states and returns an output at every time step.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embeddings,initial_state=encoder_states)

# Per-step softmax over len(hindi_vocab) classes, matching the one-hot width used in generate_batch.
decoder_dense = Dense(len(hindi_vocab), activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [36]:
# Training model: (source ids, target ids) -> per-step target-token probabilities.
model = Model([encoder_input, decoder_input], decoder_outputs)

In [37]:
# categorical_crossentropy pairs with the one-hot decoder targets produced by generate_batch.
model.compile(optimizer='adam', loss='categorical_crossentropy')
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, None, 300)    10265700    ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, None, 300)    12220200    ['input_2[0][0]']                
                                                                                              

In [38]:
# Keep only the model with the lowest validation loss seen so far.
# NOTE(review): absolute, machine-specific output path.
checkpoint = ModelCheckpoint(
    r'C:\Users\gufra\Desktop\Work\Projects\NLP\MachineTranslation\models\nmt.h5',
    save_best_only=True,
    monitor='val_loss',
    mode='min'
)

# The generators must emit `batch_size` samples per step because the step
# counts below are computed with `batch_size`. Relying on generate_batch's
# own default (8) would make each epoch cover only a fraction of the data.
model.fit(
    generate_batch(X_train, y_train, batch_size=batch_size),
    steps_per_epoch = len(X_train)//batch_size,
    validation_data = generate_batch(X_test, y_test, batch_size=batch_size),
    validation_steps = len(X_test)//batch_size,
    epochs=25,
    callbacks=[checkpoint]
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x231ea935270>

In [43]:
# Inference-time encoder: source ids -> final [hidden, cell] states.
encoder_model = Model(encoder_input, encoder_states)

# The decoder states are fed in explicitly at inference time.
# shape must be a tuple: `(latent_dim)` is just the int `latent_dim`, which
# not every Keras version accepts as a shape.
h_input = Input(shape=(latent_dim,))
c_input = Input(shape=(latent_dim,))
state_inputs = [h_input, c_input]

# Reuse the trained decoder layers so the inference model shares their weights.
decoder_embeddings_test = decoder_embedder(decoder_input)

decoder_outputs_test, dh, dc = decoder_lstm(decoder_embeddings_test, initial_state=state_inputs)
decoder_outputs_test = decoder_dense(decoder_outputs_test)

# Inference-time decoder: (target ids so far, states) -> (token probabilities, updated states).
decoder_model = Model(
    [decoder_input] + state_inputs,
    [decoder_outputs_test] + [dh, dc]
)