In [1]:
#Set Seed
import numpy as np
np.random.seed(42)

## 1. Prepare data
Data for this exercise can be downloaded from http://www.manythings.org/anki/

### 1.1 Download and extract sentence pairs

In [4]:
!wget http://www.manythings.org/anki/hin-eng.zip --quiet

'wget' is not recognized as an internal or external command,
operable program or batch file.


In [3]:
import zipfile
import io

In [4]:
zf = zipfile.ZipFile('hin-eng.zip', 'r')

#Read all Sentences
data = ''
with zf.open('hin.txt') as readfile:
  for line in io.TextIOWrapper(readfile, 'utf-8'):
    data += line

#Split sentences
data =  data.split('\n')

#lets review the data
print('Number of sentences: ', len(data))
data[100:105]

Number of sentences:  2868


['I have a dog.\tमेरे पास एक कुत्ता है।',
 'I understand.\tमैं समझता हूँ।',
 "I'm a doctor.\tमैं डॉक्टर हूँ।",
 "I'm starving!\tमैं भूख से मरा जा रहा हूँ।",
 'It is a book.\tयह किताब है।']

### 1.2 Separate out Encoder and Decoder input data

In [5]:
encoder_text = []
decoder_text = []

for line in data:
    try:
        in_txt, out_txt = line.split('\t')
        encoder_text.append(in_txt)
        
        # Add tab '<start>' as 'start sequence in target
        # And '<end>' as End
        decoder_text.append('<start> ' + out_txt + ' <end>')
    except:
        pass #ignore data which goes into error

In [6]:
encoder_text[100:105]

['I have a dog.',
 'I understand.',
 "I'm a doctor.",
 "I'm starving!",
 'It is a book.']

In [7]:
decoder_text[100:105]

['<start> मेरे पास एक कुत्ता है। <end>',
 '<start> मैं समझता हूँ। <end>',
 '<start> मैं डॉक्टर हूँ। <end>',
 '<start> मैं भूख से मरा जा रहा हूँ। <end>',
 '<start> यह किताब है। <end>']

### 1.3 Build Sequences for Encoder and Decoder Input

In [8]:
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


Encoder tokenizer

In [9]:
encoder_t = Tokenizer()
encoder_t.fit_on_texts(encoder_text)
encoder_seq = encoder_t.texts_to_sequences(encoder_text)
max_encoder_seq_length = max([len(txt) for txt in encoder_seq])
encoder_vocab_size = len(encoder_t.word_index)
print('Max words in input sentence: ', max_encoder_seq_length)
print('Input vocablury: ', encoder_vocab_size)

Max words in input sentence:  22
Input vocablury:  2402


In [10]:
encoder_text[100:105]

['I have a dog.',
 'I understand.',
 "I'm a doctor.",
 "I'm starving!",
 'It is a book.']

In [11]:
encoder_seq[100:105]

[[2, 12, 6, 130], [2, 213], [38, 6, 153], [38, 1289], [11, 5, 6, 70]]

Decoder tokenizer

In [12]:
decoder_t = Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')
decoder_t.fit_on_texts(decoder_text)
decoder_seq = decoder_t.texts_to_sequences(decoder_text)
max_decoder_seq_length = max([len(txt) for txt in decoder_seq])
decoder_vocab_size = len(decoder_t.word_index)

print('Max words in output sentence: ', max_decoder_seq_length)
print('Output vocablury: ', decoder_vocab_size)

Max words in output sentence:  27
Output vocablury:  3009


In [13]:
decoder_text[100:105]

['<start> मेरे पास एक कुत्ता है। <end>',
 '<start> मैं समझता हूँ। <end>',
 '<start> मैं डॉक्टर हूँ। <end>',
 '<start> मैं भूख से मरा जा रहा हूँ। <end>',
 '<start> यह किताब है। <end>']

In [14]:
decoder_seq[100:105]

[[1, 28, 40, 21, 208, 3, 2],
 [1, 6, 778, 19, 2],
 [1, 6, 186, 19, 2],
 [1, 6, 359, 8, 1499, 120, 38, 19, 2],
 [1, 25, 78, 3, 2]]

### 1.4 Padding Sequences

In [16]:
from keras.preprocessing.sequence import pad_sequences

encoder_input_data = pad_sequences(encoder_seq, maxlen=max_encoder_seq_length, padding='pre')
decoder_input_data = pad_sequences(decoder_seq, maxlen=max_decoder_seq_length, padding='post')

print('Encoder input shape: ', encoder_input_data.shape)
print('Decoder input shape: ', decoder_input_data.shape)

Encoder input shape:  (2867, 22)
Decoder input shape:  (2867, 27)


Integer to Word converter for Decoder data

In [17]:
#This will be used to convert output back to word
int_to_word_decoder = dict((i,c) for c, i in decoder_t.word_index.items())
int_to_word_decoder[15]

'की'

### 1.5 Building Decoder Output data

In [19]:
decoder_target_data = np.zeros((decoder_input_data.shape[0], decoder_input_data.shape[1]))

for i in range(decoder_input_data.shape[0]):
    for j in range(1,decoder_input_data.shape[1]):
        decoder_target_data[i][j-1] = decoder_input_data[i][j]   

decoder_input_data[0]

array([   1, 1466,    2,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0])

Convert target data in one hot vector

In [20]:
from keras.utils import  to_categorical

#Initialize decoder output matrix to all zeros
decoder_target_one_hot = np.zeros((decoder_input_data.shape[0], 
                                   decoder_input_data.shape[1],
                                   len(decoder_t.word_index)+1))

#Populate the output matrix by shifting decoder input by 1 step
for i in range(decoder_target_data.shape[0]):
    for j in range(decoder_target_data.shape[1]):
        decoder_target_one_hot[i][j] = to_categorical(decoder_target_data[i][j],
                                                      num_classes=len(
                                                          decoder_t.word_index)+1) 
        
#Decoder Output size
decoder_target_one_hot.shape

(2867, 27, 3010)

## 2. Building the Training Model

In [21]:
from keras.layers import Input, LSTM, Dense, Embedding
from keras.layers import concatenate, dot, Permute, Average
from keras.layers import Multiply, Activation, Bidirectional
from keras.callbacks import ModelCheckpoint

Define config parameters

In [22]:
encoder_embedding_size = 50
decoder_embedding_size = 50
rnn_units = 256

### 2.1 Build Encoder layers

In [0]:
#Encoder Input
encoder_inputs = Input(shape=(None,))

#Embedding layer 
encoder_embedding = Embedding(encoder_vocab_size+1, encoder_embedding_size)

#Embeding layer output
encoder_embedding_output = (encoder_embedding(encoder_inputs))

#Define LSTM layer for encoder
#Get all hidden states (needed for attention) and last h and c
encoder_lstm = Bidirectional(LSTM(rnn_units,return_state=True,
                                 return_sequences=True))

#Bidirectional encoder will return 5 tensors
encoder_all_states, forward_h, forward_c, backward_h, backward_c = encoder_lstm(encoder_embedding_output)
#state_h = Average()([forward_h,backward_h])
#state_c = Average()([forward_c,backward_c])

#Create a list for hidden and cell state
encoder_states = [forward_h, forward_c, backward_h, backward_c ]

### 2.2 Build Decoder layers

In [0]:
#Decoder input
decoder_inputs = Input(shape=(None,))

#Embedding Layer
decoder_embedding = Embedding(decoder_vocab_size + 1, decoder_embedding_size)
decoder_embedding_output = decoder_embedding(decoder_inputs)

#Decoder LSTM
#Get all hidden states and last h, c
decoder_rnn = Bidirectional(LSTM(rnn_units, return_sequences=True, return_state=True))
decoder_all_states,_,_,_,_ = decoder_rnn(decoder_embedding_output, initial_state=encoder_states)

Add code for the Attention Layer - start with alignment matrix

In [0]:
#1. Dot Product between Decoder_all_states and encoder_all_hidden_states
#2. Apply softmax to get Alignment matrix

#Dimensions details
#decoder_all_states = batch_size x max_decoder_length x rnn_units
#encoder_all_states = batch_size x max_encoder_length x rnn_units
#score = batch_size x max_decoder_length x max_encoder_length
#alignment matrix = batch_size x max_decoder_length x max_encoder_length

score = dot([decoder_all_states, encoder_all_states], axes=2)
alignment_matrix = Activation('softmax')(score)

#Try general and concat approaches to alignment matrix

Build Context Vector

In [0]:
#Weighted sum of multiplication of Alignment matrix and encoder states
# Dimension of context_vector =  batch_size x max_decoder_length x rnn_units
context_vector = dot([alignment_matrix, encoder_all_states], axes=[2,1])


Build Attention Vector

In [0]:
#Concatenate context vector and decoder_all_states
#context_decoder_hidden = batch_size x max_decoder_length x 2*rnn_units
#attention_vector = batch_size x max_decoder_length x 128

context_decoder_hidden = concatenate([context_vector, decoder_all_states])
attention_dense_layer = Dense(128, use_bias=False, 
                         activation='tanh')
attention_vector = attention_dense_layer(context_decoder_hidden)

In [0]:
#Output layer
decoder_dense = Dense(decoder_vocab_size + 1, activation='softmax')

In [0]:
#With attention input will be attention_vector and not decoder_all_states
#decoder_outputs = decoder_dense(decoder_all_states)
decoder_outputs = decoder_dense(attention_vector)

### 2.3 Build Model using both Encoder and Decoder layers

In [0]:
from tensorflow.python.keras.models import Model

In [0]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [0]:
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [0]:
model.summary()

## 3. Train the model

In [0]:
model.fit([encoder_input_data, decoder_input_data], decoder_target_one_hot,
          batch_size=32,
          epochs=100,
          validation_split=0.2)

In [0]:
model.save('drive/AI-ML/models/seq2seq_enghin_trg_bi_attention.h5')

## 4. Building Model for Prediction

### 4.1 Build the Encoder Model to predict Encoder States

In [0]:
#Get both last c and h as well as encoder_all_states for Attention
encoder_model = Model(inputs=encoder_inputs, outputs=[encoder_all_states] + 
                      encoder_states)

### 4.2 Build the Decoder Model 

1. Define Input for both 'h' state and 'c' state initialization
2. Get RNN outputs along with h and c state
3. Define Decoder Output
4. Build Model

In [0]:
#State h and c from Encoder
initial_fwd_h = Input(shape=(rnn_units,))
initial_fwd_c = Input(shape=(rnn_units,))
initial_bkwd_h = Input(shape=(rnn_units,))
initial_bkwd_c = Input(shape=(rnn_units,))
#Build list of state inputs 
decoder_states_inputs = [initial_fwd_h, initial_fwd_c,
                        initial_bkwd_h, initial_bkwd_c]

#Input for Attention layer
encoder_outputs = Input(shape=(max_encoder_seq_length, 2*rnn_units,))

#Get RNN outputs and state(s) using trained layers
x = decoder_embedding(decoder_inputs)
rnn_outputs, f_state_h, f_state_c, b_state_h, b_state_c = decoder_rnn(x,
                                                                      initial_state=decoder_states_inputs)

#Why do we need this?
decoder_states = [f_state_h, f_state_c, b_state_h, b_state_c]

Build Attention Layer

In [0]:
p_score = dot([rnn_outputs, encoder_outputs], axes=2)
p_alignment_matrix = Activation('softmax')(p_score)
p_context_vector = dot([p_alignment_matrix, encoder_outputs], axes=[2,1])
p_context_decoder_hidden = concatenate([p_context_vector, rnn_outputs])
p_attention_vector = attention_dense_layer(p_context_decoder_hidden)


get Decoder output

In [0]:
#decoder_outputs = decoder_dense(rnn_outputs)
decoder_outputs = decoder_dense(p_attention_vector)

Build Decoder Model

In [0]:
decoder_model = Model([decoder_inputs] + decoder_states_inputs + [encoder_outputs],  #Model inputs
                     [decoder_outputs] + decoder_states + [p_alignment_matrix])

## 5.0 Predicting Output

Build a prediction function

In [0]:
def decode_sentence(input_sequence):
    
    #Get the encoder state values
    encoder_output =  encoder_model.predict(input_sequence)
    decoder_initial_states_value = encoder_output[1:]    
    encoded_seqs = encoder_output[0]
       
    
    #decoder_initial_states_value = [encoder_last_h, encoder_last_c]
    
    #Build a sequence with '<start>' - starting sequence for Decoder
    target_seq = np.zeros((1,1))    
    target_seq[0][0] = decoder_t.word_index['<start>']
    
    #flag to check if prediction should be stopped
    stop_loop = False
    
    #Initialize predicted sentence
    predicted_sentence = ''
    
    #start the loop
    while not stop_loop:
        
        predicted_outputs, f_h, f_c, b_h, b_c, a = decoder_model.predict([target_seq] +
                                                           decoder_initial_states_value +
                                                           [encoded_seqs])
        
        #Get the predicted output with highest probability
        predicted_output = np.argmax(predicted_outputs[0,-1,:])
        
        #Get the predicted word from predicter integer
        if (predicted_output == 0):
            predicted_word = ' '
        else:
            predicted_word = int_to_word_decoder[predicted_output]
        
        #Check if prediction should stop
        if(predicted_word == '<end>' or len(predicted_sentence) > max_decoder_seq_length):
            
            stop_loop = True
            continue
                    
        #Updated predicted sentence
        if (len(predicted_sentence) == 0):
            predicted_sentence = predicted_word
        else:
            predicted_sentence = predicted_sentence + ' ' + predicted_word
            
        #Update target_seq to be the predicted word index
        target_seq[0][0] = predicted_output
        
        #Update initial states value for decoder
        decoder_initial_states_value = [f_h,f_c,b_h,b_c]
        
        #Uncomment this line to print Alignment Matrix
        #print (a)
        
    
    return predicted_sentence

Call Prediction function above

In [0]:
#Get a random sentence
start_num = np.random.randint(0, high=len(encoder_text) - 10)
print(start_num)

for i in range(start_num, start_num + 10):
    input_seq = encoder_input_data[i : i+1]    
    predicted_sentence = decode_sentence(input_seq)
    print('--------')
    print ('Input sentence: ', encoder_text[i])
    print ('Predicted sentence: ', predicted_sentence )

## 6. Save Prediction models and tokenizers 

In [0]:
#Save encoder and decoder model for Prediction
encoder_model.compile(optimizer='adam', loss='mse')
decoder_model.compile(optimizer='adam', loss='categorical_crossentropy')
encoder_model.save('drive/AI-ML/models/seq2seq_encoder_eng_hin.hd5')
decoder_model.save('drive/AI-ML/models/seq2seq_decoder_eng_hin.hd5')

In [0]:
#Save tokenizers
import pickle

pickle.dump(encoder_t,open('drive/AI-ML/models/encoder_tokenizer_eng','wb'))
pickle.dump(decoder_t,open('drive/AI-ML/models/decoder_tokenizer_hin','wb'))

In [0]:
encoder_input_data[429]