In [None]:
import os
import re
import string 
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from keras.layers import Input, LSTM, Embedding, Dense,TimeDistributed
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from tqdm import tqdm
from keras.preprocessing.sequence import pad_sequences
%matplotlib inline

In [None]:
# Get Glove Vector
!wget http://nlp.stanford.edu/data/glove.840B.300d.zip
!unzip glove.840B.300d.zip

# using 300-dim Glove word embeddings

glove_embeddings = {}
f = open('glove.840B.300d.txt')
for line in f:
    values = line.split(' ')
    word = values[0] ## The first entry is the word
    embedding = np.asarray(values[1:], dtype='float32') ## These are the vecotrs representing the embedding for the word
    glove_embeddings[word] = embedding 
f.close()

# initializing unknown token (in case a word is not found, we'll assign it an unknown token)
UNK = np.random.random(300)
SOS = np.random.random(300)
EOS = np.random.random(300)

glove_embeddings['<sos>'] = SOS
glove_embeddings['<eos>'] = EOS

print('GloVe data loaded')

--2021-05-09 13:18:49--  http://nlp.stanford.edu/data/glove.840B.300d.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.840B.300d.zip [following]
--2021-05-09 13:18:49--  https://nlp.stanford.edu/data/glove.840B.300d.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip [following]
--2021-05-09 13:18:49--  http://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2176768927 (2.0G) [application/zip

In [None]:
# Get word2vec
# !wget http://vectors.nlpl.eu/repository/20/5.zip
# !unzip 5.zip

In [None]:
# Get training dataset
# The inner `wget` requests the Google Drive download page, saving its cookies to
# /tmp/cookies.txt, and `sed` pulls the `confirm=` token out of the response.
# The outer `wget` reuses those cookies plus the token to save the file as mt.zip;
# the cookie file is deleted afterwards.
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1jvZxoMsfVDvupZMqTMx11aHmMQFPyWG4' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1jvZxoMsfVDvupZMqTMx11aHmMQFPyWG4" -O mt.zip && rm -rf /tmp/cookies.txt
# Extract the archive; later cells read the corpus files from the MT/ directory.
! unzip mt.zip



--2021-04-14 12:18:29--  https://docs.google.com/uc?export=download&confirm=&id=1jvZxoMsfVDvupZMqTMx11aHmMQFPyWG4
Resolving docs.google.com (docs.google.com)... 216.58.192.78, 2607:f8b0:4026:803::200e
Connecting to docs.google.com (docs.google.com)|216.58.192.78|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://doc-0k-34-docs.googleusercontent.com/docs/securesc/efdsn1opk214p4lbgqckr21cs10mirg2/16bu55l0bgjp7tmukin8508tn4hkoeba/1618402650000/02653466601279893693/11144349674830726180Z/1jvZxoMsfVDvupZMqTMx11aHmMQFPyWG4?e=download [following]
--2021-04-14 12:18:31--  https://doc-0k-34-docs.googleusercontent.com/docs/securesc/efdsn1opk214p4lbgqckr21cs10mirg2/16bu55l0bgjp7tmukin8508tn4hkoeba/1618402650000/02653466601279893693/11144349674830726180Z/1jvZxoMsfVDvupZMqTMx11aHmMQFPyWG4?e=download
Resolving doc-0k-34-docs.googleusercontent.com (doc-0k-34-docs.googleusercontent.com)... 216.58.192.65, 2607:f8b0:4026:802::2001
Connecting to doc-0k-34-do

In [None]:
def read_file(file_name):
  """Read a UTF-8 text file and return its lines.

  Args:
    file_name: path of the file to read.

  Returns:
    A list with one string per line of the file, each with leading and
    trailing whitespace (including the newline) removed. Blank lines are
    kept as empty strings so the line count is preserved.
  """
  # The context manager closes the handle even if reading or decoding raises.
  with open(file_name, 'r', encoding="utf8") as f:
    return [line.strip() for line in f]
  
# Load the parallel corpus: one list of sentences per language and split.
eng_train_list, eng_test_list = read_file('MT/english.train'), read_file('MT/english.test')
hindi_train_list, hindi_test_list = read_file('MT/hindi.train'), read_file('MT/hindi.test')

In [None]:
# Line i of the English file and line i of the Hindi file form one training row.
train_columns = {'eng_sent': eng_train_list, 'hindi_sent': hindi_train_list}
train_df = pd.DataFrame(train_columns)
train_df.head()

Unnamed: 0,eng_sent,hindi_sent
0,Give your application an accessibility workout,अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें
1,Accerciser Accessibility Explorer,एक्सेर्साइसर पहुंचनीयता अन्वेषक
2,The default plugin layout for the bottom panel,निचले पटल के लिए डिफोल्ट प्लग-इन खाका
3,The default plugin layout for the top panel,ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका
4,A list of plugins that are disabled by default,उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से नि...


In [None]:
# Same layout as the training frame, built from the held-out files.
test_columns = {'eng_sent': eng_test_list, 'hindi_sent': hindi_test_list}
test_df = pd.DataFrame(test_columns)
test_df.head()

Unnamed: 0,eng_sent,hindi_sent
0,A black box in your car?,आपकी कार में ब्लैक बॉक्स?
1,As America's road planners struggle to find th...,"जबकि अमेरिका के सड़क योजनाकार, ध्वस्त होते हुए..."
2,"The devices, which track every mile a motorist...","यह डिवाइस, जो मोटर-चालक द्वारा वाहन चलाए गए प्..."
3,The usually dull arena of highway planning has...,आम तौर पर हाईवे नियोजन जैसा उबाऊ काम भी अचानक ...
4,Libertarians have joined environmental groups ...,"आपने द्वारा ड्राइव किए गए मील, तथा संभवतः ड्रा..."


In [None]:
# Characters dropped by `preprocess`: ASCII punctuation and ASCII digits.
exclude = set(string.punctuation)
remove_digits = str.maketrans('', '', string.digits)

def preprocess(sent):
  """Normalise one sentence and wrap it in start/end markers.

  Steps, in order: trim and lowercase, drop apostrophes, drop ASCII
  punctuation, drop ASCII and Devanagari digits, trim again and collapse
  runs of spaces. The cleaned text is returned between '<sos> ' and ' <eos>'.
  """
  text = sent.strip().lower()

  # Apostrophes go first, then every remaining punctuation character.
  text = text.replace("'", '')
  text = ''.join(filter(lambda ch: ch not in exclude, text))

  # Digits: ASCII via the translation table, Devanagari via the character class.
  text = re.sub("[२३०८१५७९४६]", "", text.translate(remove_digits))

  # Removing characters can leave stray or doubled spaces behind.
  text = re.sub(" +", " ", text.strip())
  return ''.join(('<sos> ', text, ' <eos>'))


# Clean both language columns of both frames, overwriting them in place.
# NOTE(review): the columns are overwritten, so running this cell a second time
# feeds already-processed text through `preprocess` again.
for frame in (train_df, test_df):
  for column in ('eng_sent', 'hindi_sent'):
    frame[column] = frame[column].apply(preprocess)

In [None]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,eng_sent,hindi_sent
0,<sos> give your application an accessibility w...,<sos> अपने अनुप्रयोग को पहुंचनीयता व्यायाम का ...
1,<sos> accerciser accessibility explorer <eos>,<sos> एक्सेर्साइसर पहुंचनीयता अन्वेषक <eos>
2,<sos> the default plugin layout for the bottom...,<sos> निचले पटल के लिए डिफोल्ट प्लगइन खाका <eos>
3,<sos> the default plugin layout for the top pa...,<sos> ऊपरी पटल के लिए डिफोल्ट प्लगइन खाका <eos>
4,<sos> a list of plugins that are disabled by d...,<sos> उन प्लगइनों की सूची जिन्हें डिफोल्ट रूप ...


In [None]:
# Preview the first rows of the test frame.
test_df.head()

Unnamed: 0,eng_sent,hindi_sent
0,<sos> a black box in your car <eos>,<sos> आपकी कार में ब्लैक बॉक्स <eos>
1,<sos> as americas road planners struggle to fi...,<sos> जबकि अमेरिका के सड़क योजनाकार ध्वस्त होत...
2,<sos> the devices which track every mile a mot...,<sos> यह डिवाइस जो मोटरचालक द्वारा वाहन चलाए ग...
3,<sos> the usually dull arena of highway planni...,<sos> आम तौर पर हाईवे नियोजन जैसा उबाऊ काम भी ...
4,<sos> libertarians have joined environmental g...,<sos> आपने द्वारा ड्राइव किए गए मील तथा संभवतः...


In [None]:

# One tokenizer per language. Fitting assigns every distinct word an integer id,
# so each sentence can afterwards be represented as a sequence of ids.
# NOTE(review): the vocabulary is fitted on train AND test sentences together.
eng_corpus = train_df['eng_sent'].tolist() + test_df['eng_sent'].tolist()
tokenizer_eng = Tokenizer()
tokenizer_eng.fit_on_texts(eng_corpus)

# One slot more than the number of words: the ids are used directly as row
# numbers of the embedding matrices, which need a row 0 as well
# (presumably the padding id, which is never assigned to a word — confirm
# against the Keras Tokenizer documentation).
eng_vocab_size = 1 + len(tokenizer_eng.word_index)


hindi_corpus = train_df['hindi_sent'].tolist() + test_df['hindi_sent'].tolist()
tokenizer_hindi = Tokenizer()
tokenizer_hindi.fit_on_texts(hindi_corpus)

hindi_vocab_size = 1 + len(tokenizer_hindi.word_index)


In [None]:
# create a matrix that contains only the words present in our vocabulary and their corresponding embedding vector

def _embedding_matrix_for(word_index, vocab_size):
    """Build a (vocab_size, 300) matrix whose row i is the vector of the word with id i.

    Words missing from `glove_embeddings` receive the shared `UNK` vector;
    rows whose id is not in `word_index` stay all-zero.
    """
    matrix = np.zeros((vocab_size, 300))
    for token, row in tqdm(word_index.items()):
        matrix[row] = glove_embeddings.get(token, UNK)
    return matrix

embedding_matrix_eng = _embedding_matrix_for(tokenizer_eng.word_index, eng_vocab_size)

# NOTE(review): the Hindi vocabulary is looked up in the same table that was
# loaded from glove.840B.300d.txt; every Hindi word absent from it ends up
# with the identical UNK vector.
embedding_matrix_hindi = _embedding_matrix_for(tokenizer_hindi.word_index, hindi_vocab_size)

100%|██████████| 10406/10406 [00:00<00:00, 344316.69it/s]
100%|██████████| 11973/11973 [00:00<00:00, 458602.98it/s]


In [None]:
# converts each sentence into a sequence of numbers
MAX_SEQ_LEN = 300
EMBEDDING_DIM = 300

# Every padded array uses the same settings: fixed length MAX_SEQ_LEN, with
# padding and truncation both applied at the end of the sequence.
pad_options = dict(maxlen=MAX_SEQ_LEN, padding='post', truncating='post')

# Step 1: words -> integer ids.
eng_train_seq = tokenizer_eng.texts_to_sequences(train_df['eng_sent'])
hindi_train_seq = tokenizer_hindi.texts_to_sequences(train_df['hindi_sent'])
eng_test_seq = tokenizer_eng.texts_to_sequences(test_df['eng_sent'])
hindi_test_seq = tokenizer_hindi.texts_to_sequences(test_df['hindi_sent'])

# Step 2: ragged id lists -> rectangular arrays.
pad_eng_train_seq = pad_sequences(eng_train_seq, **pad_options)
pad_hindi_train_seq = pad_sequences(hindi_train_seq, **pad_options)
pad_eng_test_seq = pad_sequences(eng_test_seq, **pad_options)
pad_hindi_test_seq = pad_sequences(hindi_test_seq, **pad_options)

# Neural Network

In [None]:
MAX_SEQ_LEN = 300
EMBEDDING_DIM = 300

def generate_batch(X, y , batch_size = 128, target_embeddings=None):
    '''Endlessly yield training batches for the encoder-decoder model.

    Args:
        X: padded encoder id sequences, one row of MAX_SEQ_LEN ids per sample.
        y: padded decoder id sequences, aligned row-by-row with `X`.
        batch_size: maximum number of samples per batch.
        target_embeddings: matrix whose row i is the vector for target id i.
            Defaults to the notebook-level `embedding_matrix_hindi`.

    Yields:
        ([encoder_input_data, decoder_input_data], decoder_target_data) where
        decoder_target_data[i, t-1] is the embedding of y[i][t], i.e. the
        decoder input shifted one step to the left; the last step stays zero.
        The final batch of each pass holds only the remaining samples instead
        of being filled up with all-zero rows.

    Raises:
        ValueError: if `X` is empty or `X` and `y` differ in length.
    '''
    if target_embeddings is None:
        target_embeddings = embedding_matrix_hindi
    # Cast once up front; the target array is float32 anyway.
    target_embeddings = np.asarray(target_embeddings, dtype='float32')

    n_samples = len(X)
    if n_samples == 0:
        # Without this guard the loop below would spin forever without yielding.
        raise ValueError('generate_batch needs at least one sample')
    if len(y) != n_samples:
        raise ValueError('X and y must contain the same number of samples')

    while True:
        for start in range(0, n_samples, batch_size):
            stop = min(start + batch_size, n_samples)
            rows = stop - start

            encoder_input_data = np.zeros((rows, MAX_SEQ_LEN), dtype='int')
            decoder_input_data = np.zeros((rows, MAX_SEQ_LEN), dtype='int')
            decoder_target_data = np.zeros((rows, MAX_SEQ_LEN, EMBEDDING_DIM), dtype='float32')

            encoder_input_data[:] = X[start:stop]
            decoder_input_data[:] = y[start:stop]
            # One vectorised lookup per sample keeps the temporary small while
            # replacing the per-timestep Python loop.
            for i in range(rows):
                decoder_target_data[i, :-1] = target_embeddings[decoder_input_data[i, 1:]]

            yield([encoder_input_data, decoder_input_data], decoder_target_data)


In [None]:
# Encoder
# Takes one padded sequence of MAX_SEQ_LEN English token ids per sample.
encoder_inputs = Input(shape=(MAX_SEQ_LEN,))
# Frozen embedding layer initialised from `embedding_matrix_eng` (trainable=False).
# mask_zero=True marks id 0 as padding for the layers that follow
# (presumably matching the zeros added by pad_sequences — TODO confirm).
enc_emb =  Embedding(output_dim=EMBEDDING_DIM, input_dim=eng_vocab_size, weights=[embedding_matrix_eng], input_length=MAX_SEQ_LEN, trainable = False, mask_zero = True)(encoder_inputs)

print(enc_emb.shape)
# NOTE(review): the LSTM width reuses MAX_SEQ_LEN (300) as the number of units;
# it looks like hidden size and sequence length merely share a value — confirm.
encoder_lstm = LSTM(MAX_SEQ_LEN, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

(None, 300, 300)


In [None]:
# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(MAX_SEQ_LEN,))
# Frozen embedding layer initialised from `embedding_matrix_hindi`.
# NOTE(review): unlike the encoder embedding, no mask_zero=True is set here.
dec_emb_layer = Embedding(output_dim=EMBEDDING_DIM, input_dim=hindi_vocab_size,weights = [embedding_matrix_hindi],input_length=MAX_SEQ_LEN,trainable = False)
dec_emb = dec_emb_layer(decoder_inputs)
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.

decoder_lstm = LSTM(MAX_SEQ_LEN, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb,
                                     initial_state=encoder_states)
print(decoder_outputs.shape)
# NOTE(review): the output layer has EMBEDDING_DIM units with a softmax and is
# trained with categorical cross-entropy against the embedding rows produced by
# `generate_batch`. `decode_sequence` later takes the argmax over this axis as a
# token id, so predicted ids can only range over EMBEDDING_DIM values. Confirm
# whether a vocabulary-sized output was intended; changing it requires changing
# the targets in `generate_batch` at the same time.
decoder_dense = Dense(EMBEDDING_DIM, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

# `summary()` prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

(None, 300, 300)
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 300)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 300)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 300, 300)     3122100     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 300, 300)     3592200     input_2[0][0]                    
_____________________________________________________________________________

In [None]:
train_samples = train_df.shape[0]
val_samples = test_df.shape[0]
batch_size = 512
epochs = 2

# `Model.fit` consumes Python generators directly; `fit_generator` is the
# deprecated spelling of the same call. Because the generators never end, the
# number of batches per epoch has to be given explicitly. Both step counts use
# integer division and are kept at >= 1 so a data set smaller than one batch
# still yields a step.
model.fit(generate_batch(pad_eng_train_seq, pad_hindi_train_seq, batch_size),
          steps_per_epoch=max(1, train_samples // batch_size),
          epochs=epochs,
          validation_data=generate_batch(pad_eng_test_seq, pad_hindi_test_seq, batch_size),
          validation_steps=max(1, val_samples // batch_size))




Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7fc5ccbca110>

# Evaluation

In [None]:
# Encode the input sequence to get the "thought vectors"
# (the encoder LSTM's final hidden and cell state).
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder setup
# Below tensors will hold the states of the previous time step.
# Their width is MAX_SEQ_LEN because the LSTMs were created with MAX_SEQ_LEN units.
decoder_state_input_h = Input(shape=(MAX_SEQ_LEN,))
decoder_state_input_c = Input(shape=(MAX_SEQ_LEN,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Reuses the trained embedding, LSTM and dense layers, so no weights are re-learned here.
dec_emb2 = dec_emb_layer(decoder_inputs) # Get the embeddings of the decoder sequence

# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
# NOTE(review): `decoder_dense` was built with EMBEDDING_DIM units, so this softmax
# spans EMBEDDING_DIM values, not the target vocabulary.
decoder_outputs2 = decoder_dense(decoder_outputs2) # Dense softmax layer applied at every time step

# Final decoder model: (token ids, previous h, previous c) -> (softmax output, new h, new c)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)

In [None]:
# word -> id lookups: plain copies of the tokenizers' vocabularies.
eng_token_index = dict(tokenizer_eng.word_index)
hindi_token_index = dict(tokenizer_hindi.word_index)

# id -> word lookups: the same vocabularies with keys and values swapped.
reverse_eng_word_map = {idx: word for word, idx in tokenizer_eng.word_index.items()}
reverse_hindi_word_map = {idx: word for word, idx in tokenizer_hindi.word_index.items()}

In [None]:
def decode_sequence(input_seq):
    """Greedily decode one encoder input into a list of Hindi words.

    Args:
        input_seq: encoder input for a single sample (batch of size 1), as
            accepted by `encoder_model.predict`.

    Returns:
        The decoded words in order. Decoding stops after the 'eos' marker
        (which is included in the result), after MAX_SEQ_LEN words, or as
        soon as the predicted id has no entry in `reverse_hindi_word_map`.
    """
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    # Length-1 target sequence holding the start marker.
    target_seq = np.zeros((1,1))
    target_seq[0, 0] = hindi_token_index['sos']

    decoded_sentence = []
    # Sampling loop (to simplify, a batch of size 1 is assumed).
    for _ in range(MAX_SEQ_LEN):
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Greedy choice: position of the largest output value at the last step.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_char = reverse_hindi_word_map.get(sampled_token_index)
        if sampled_char is None:
            # The id has no word (e.g. 0, which the id -> word map never
            # contains). This used to raise KeyError; treat it as the end.
            break
        decoded_sentence.append(sampled_char)

        if sampled_char == 'eos':
            break

        # Feed the chosen token and the new states into the next step.
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return decoded_sentence

In [None]:
def convert_tokens_to_words(tokens, lang='english', word_map=None):
    """Turn a sequence of token ids back into a list of words.

    The start marker ('sos') is skipped and conversion stops at the end
    marker ('eos'). Both are recognised by their word instead of by the
    hard-coded ids 1 and 2, so the result no longer depends on how the
    tokenizer happened to number them. An id without an entry in the map
    (such as padding id 0) also ends the sentence instead of raising KeyError.

    Args:
        tokens: iterable of integer token ids.
        lang: 'english' or 'hindi'; selects the notebook-level id -> word map
            when `word_map` is not given. Any other value yields [].
        word_map: optional id -> word dict to use instead of the
            notebook-level maps.

    Returns:
        The words of the sentence, without the start/end markers.
    """
    if word_map is None:
        if lang == 'english':
            word_map = reverse_eng_word_map
        elif lang == 'hindi':
            word_map = reverse_hindi_word_map
        else:
            # Unchanged behaviour: an unknown language gives an empty sentence.
            return []

    sentence = []
    for i in tokens:
        word = word_map.get(int(i))
        if word is None or word == 'eos':
            break
        if word == 'sos':
            continue
        sentence.append(word)
    return sentence

In [None]:
# One-sample-at-a-time generator over the test set. `k` is the index of the
# sample most recently drawn from it (-1 = nothing drawn yet); the next cell
# advances both together, so re-run this cell to start over from the first sample.
test_gen = generate_batch(pad_eng_test_seq, pad_hindi_test_seq, batch_size = 1)
k=-1

In [None]:
# Advance to the next test sample; `k` and the generator move in lock-step.
k += 1
batch_inputs, _ = next(test_gen)  # the target array is not needed for decoding
input_seq, actual_output = batch_inputs
decoded_sentence = decode_sequence(input_seq)

# Recover readable sentences from the padded id rows of the same sample.
eng_sent = convert_tokens_to_words(pad_eng_test_seq[k:k+1][0], lang='english')
hindi_sent = convert_tokens_to_words(pad_hindi_test_seq[k:k+1][0], lang='hindi')

# The prediction is shown cut to the length of the reference translation.
predicted = decoded_sentence[:len(hindi_sent)]
print("Input English sentence:", eng_sent)
print("Actual Hindi translation:", hindi_sent)
print("Predicted Hindi Translation:", predicted)

Input English sentence: ['a', 'black', 'box', 'in', 'your', 'car']
Actual Hindi translation: ['आपकी', 'कार', 'में', 'ब्लैक', 'बॉक्स']
Predicted Hindi Translation: ['सका', 'सका', 'सका', 'सका', 'सका']


In [None]:
# Word counts for the current sample: source sentence, reference translation,
# and the full (untruncated) decoder output.
print(len(convert_tokens_to_words(pad_eng_test_seq[k:k+1][0], 'english')))
print(len(convert_tokens_to_words(pad_hindi_test_seq[k:k+1][0], 'hindi')))
print(len(decoded_sentence))

6
5
300


In [None]:
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu

# Fresh generator so that sample k of the loop is row k of the test arrays.
test_gen = generate_batch(pad_eng_test_seq, pad_hindi_test_seq, batch_size = 1)
max_num = 100
# Never evaluate more sentences than the test set contains.
num_eval = min(max_num, len(pad_hindi_test_seq))

total_bleu = 0.0
# range() visits exactly `num_eval` samples; the previous `while k < 100` loop
# incremented before use and therefore scored 101 sentences. tqdm replaces the
# one-line-per-sample progress prints.
for k in tqdm(range(num_eval), desc="BLEU"):
    (input_seq, actual_output), _ = next(test_gen)
    decoded_sentence = decode_sequence(input_seq)
    hindi_sent = convert_tokens_to_words(pad_hindi_test_seq[k:k+1][0], 'hindi')
    reference = [hindi_sent]
    # Unigram BLEU; the hypothesis is cut to the length of the reference.
    total_bleu += sentence_bleu(reference, decoded_sentence[:len(hindi_sent)], weights=(1, 0, 0, 0))

# Report the mean sentence-level score rather than the raw sum, so the value
# stays within BLEU's [0, 1] range and does not grow with the sample count.
bleu_score = total_bleu / num_eval if num_eval else 0.0


Test:  0 out of 100
Test:  1 out of 100
Test:  2 out of 100
Test:  3 out of 100
Test:  4 out of 100
Test:  5 out of 100
Test:  6 out of 100
Test:  7 out of 100
Test:  8 out of 100
Test:  9 out of 100
Test:  10 out of 100
Test:  11 out of 100
Test:  12 out of 100
Test:  13 out of 100
Test:  14 out of 100
Test:  15 out of 100
Test:  16 out of 100
Test:  17 out of 100
Test:  18 out of 100
Test:  19 out of 100
Test:  20 out of 100
Test:  21 out of 100
Test:  22 out of 100
Test:  23 out of 100
Test:  24 out of 100
Test:  25 out of 100
Test:  26 out of 100
Test:  27 out of 100
Test:  28 out of 100
Test:  29 out of 100
Test:  30 out of 100
Test:  31 out of 100
Test:  32 out of 100
Test:  33 out of 100
Test:  34 out of 100
Test:  35 out of 100
Test:  36 out of 100
Test:  37 out of 100
Test:  38 out of 100
Test:  39 out of 100
Test:  40 out of 100
Test:  41 out of 100
Test:  42 out of 100
Test:  43 out of 100
Test:  44 out of 100
Test:  45 out of 100
Test:  46 out of 100
Test:  47 out of 100
Te

In [None]:
# Report the BLEU value computed by the evaluation cell.
print("Bleu Score:", bleu_score)

Bleu Score: 0.0248202
