<a href="https://colab.research.google.com/github/cnn22/SingerSongwriter/blob/main/NMT_Attention_Marathi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Dataset => http://www.manythings.org/anki/ (mar-eng.zip)

# Import Stuff

In [3]:
import tensorflow as tf

tf.compat.v1.enable_eager_execution()

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import time
import string

# import plotly
# import plotly.plotly as py
# from plotly.offline import init_notebook_mode, iplot
# plotly.offline.init_notebook_mode(connected=True)
# import plotly.graph_objs as go

# Preprocess
Removing digits, punctuation, and other non-related items from the sentences

In [4]:
file_path = './spa.txt' # please set the path according to your system

In [65]:
lines = open(file_path, encoding='UTF-8').read().strip().split('\n')
lines[0:2]

['Go.\tVe.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #4986655 (cueyayotl)',
 'Go.\tVete.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #4986656 (cueyayotl)']

In [6]:
len(lines)

139013

In [24]:
exclude = set(string.punctuation) # Set of all special characters to remove from lines
remove_digits = str.maketrans('', '', string.digits) # Set of all digits

In [26]:
def preprocess_sentence(sent):
    '''Function to preprocess English sentence'''
    sent = sent.lower() # lower casing
    sent = re.sub("'", '', sent) # remove the quotation marks if any
    sent = ''.join(ch for ch in sent if ch not in exclude)
    sent = sent.translate(remove_digits) # remove the digits
    sent = sent.strip()
    sent = re.sub(" +", " ", sent) # remove extra spaces
    sent = '<start> ' + sent + ' <end>' # add <start> and <end> tokens
    return sent

In [29]:
preprocess_sentence("fasadf```adf12()312341;23fafasdf`1231231")

'<start> fasadfadffafasdf <end>'

In [34]:
# Generate pairs of cleaned English and Marathi sentences
sent_pairs = []
for line in lines:
    sent_pair = []
    eng, spa, trash = line.split('\t')
    eng = preprocess_sentence(eng)
    sent_pair.append(eng)
    spa = preprocess_sentence(spa)
    sent_pair.append(spa)
    sent_pairs.append(sent_pair)
sent_pairs[5000:5010]

[['<start> hes no saint <end>', '<start> él no es un santo <end>'],
 ['<start> hes not home <end>', '<start> no está en casa <end>'],
 ['<start> hes not sick <end>', '<start> no está enfermo <end>'],
 ['<start> hes studying <end>', '<start> él está estudiando <end>'],
 ['<start> hes your son <end>', '<start> él es tu hijo <end>'],
 ['<start> heat the milk <end>', '<start> calienta la leche <end>'],
 ['<start> help yourself <end>', '<start> sírvase usted mismo <end>'],
 ['<start> help yourself <end>', '<start> sírvete tú mismo <end>'],
 ['<start> help yourself <end>', '<start> servíos vosotros mismos <end>'],
 ['<start> help yourself <end>', '<start> sírvanse ustedes mismos <end>']]

# Creating dataset

In [33]:
# This class creates a word -> index mapping (e.g,. "dad" -> 5) and vice-versa 
# (e.g., 5 -> "dad") for each language,
class LanguageIndex():
    def __init__(self, lang):
        self.lang = lang
        self.word2idx = {}
        self.idx2word = {}
        self.vocab = set()

        self.create_index()

    def create_index(self):
        for phrase in self.lang:
            self.vocab.update(phrase.split(' '))

        self.vocab = sorted(self.vocab)

        self.word2idx['<pad>'] = 0
        for index, word in enumerate(self.vocab):
            self.word2idx[word] = index + 1

        for word, index in self.word2idx.items():
            self.idx2word[index] = word

In [35]:
def max_length(tensors):
    return max(len(t) for t in tensors)

In [63]:
def load_dataset(pairs, num_examples):
    # pairs => already created cleaned input, output pairs

    # index language using the class defined above: Example word to index = "day": 5, index to word = 5:"day"    
    inp_lang = LanguageIndex(en for en, sp in pairs) #English
    targ_lang = LanguageIndex(sp for en, sp in pairs) #Spanish 
    
    # Vectorize the input and target languages
    # English sentences
    input_tensor = [[inp_lang.word2idx[s] for s in en.split(' ')] for en, sp in pairs]
    print("Input Tensors")
    print(input_tensor[0:2])
    # Spanish sentences
    target_tensor = [[targ_lang.word2idx[s] for s in sp.split(' ')] for en, sp in pairs]
    print("Targets Tensors")
    print(target_tensor[0:2])
    
    # Calculate max_length of input and output tensor
    # Here, we'll set those to the longest sentence in the dataset
    max_length_inp, max_length_tar = max_length(input_tensor), max_length(target_tensor)
    print("Input Tensor Max Length")
    print(max_length_inp)

    print("Target Tensor Max Length")
    print(max_length_tar)
    
    # Padding the input and output tensor to the maximum length
    # Making sure all of the items on the list are the same size as the max len
    # For example:  [1,2,3] becomes [1,2,3,0,0,0] if the max is 6
    input_tensor = tf.keras.preprocessing.sequence.pad_sequences(input_tensor, 
                                                                 maxlen=max_length_inp,
                                                                 padding='post')
    print("Input Tensors after Tensorflow Pad Sequence")
    print(input_tensor[0:2])
    
    target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target_tensor, 
                                                                  maxlen=max_length_tar, 
                                                                  padding='post')
    print("Targets Tensors after Tensorflow Pad Sequence")
    print(target_tensor[0:2])

    
    return input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_tar

In [64]:
input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_targ = load_dataset(sent_pairs, len(lines))

Input Tensors
[[2, 5454, 1], [2, 5454, 1]]
Targets Tensors
[[2, 26684, 1], [2, 26915, 1]]
Input Tensor Max Length
72
Target Tensor Max Length
70
Input Tensors after Tensorflow Pad Sequence
[[   2 5454    1    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]
 [   2 5454    1    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]]
Targets Tensors after Tensorflow Pad Sequence
[[    2 26684     

In [66]:
# Creating training and validation sets using an 80-20 split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.1, random_state = 101)

# Show length
len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val)

(125111, 125111, 13902, 13902)

In [67]:
BUFFER_SIZE = len(input_tensor_train) #traning dataset
BATCH_SIZE = 64 #how many of the BUFFER_SIZE it will take per batch
N_BATCH = BUFFER_SIZE//BATCH_SIZE #how many batch it will take: size of training divided by the batch size double // means no decimal
embedding_dim = 256
units = 1024
vocab_inp_size = len(inp_lang.word2idx) #size of english vocab
vocab_tar_size = len(targ_lang.word2idx) #size of spanish vocab

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

# It's learning time

In [69]:
def gru(units):
  # If you have a GPU, we recommend using CuDNNGRU(provides a 3x speedup than GRU)
  # the code automatically does that.
    if tf.test.is_gpu_available():
        return tf.keras.layers.CuDNNGRU(units, 
                                        return_sequences=True, 
                                        return_state=True, 
                                        recurrent_initializer='glorot_uniform')
    else:
        return tf.keras.layers.GRU(units, 
                                   return_sequences=True, 
                                   return_state=True, 
                                   recurrent_activation='sigmoid', 
                                   recurrent_initializer='glorot_uniform')

In [None]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz #how many items per batch
        self.enc_units = enc_units #ENCODER UNITS not incremental units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim) #Embedding layer based on embedding_dim
        self.gru = gru(self.enc_units) #Creating GRU for the Encoder Unit
        
    def call(self, x, hidden):
        x = self.embedding(x) #run the embedding layer on x
        output, state = self.gru(x, initial_state = hidden) #run gru on x        
        return output, state
    
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units)) #init the starting hidden state mostly zeros

In [None]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz #how many items per batch
        self.dec_units = dec_units #DECODER UNITS not decremental units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim) #Embedding layer based on embedding_dim
        self.gru = gru(self.dec_units) #Creating GRU for the Encoder Unit
        self.fc = tf.keras.layers.Dense(vocab_size) #Creating a Full Connected layer that does linear and non-linear trasnformation
        #https://towardsdatascience.com/convolutional-layers-vs-fully-connected-layers-364f05ab460b#:~:text=Fully%20Connected%20Layers%20(FC%20Layers,vector%20through%20a%20weights%20matrix
        
        # used for attention: Bahdanau's Attention Linear Algebra
        self.W1 = tf.keras.layers.Dense(self.dec_units) 
        self.W2 = tf.keras.layers.Dense(self.dec_units)
        self.V = tf.keras.layers.Dense(1) #v is weight vector 
        
    #https://blog.floydhub.com/attention-mechanism/
    def call(self, x, hidden, enc_output):
        # enc_output shape == (batch_size, max_length, hidden_size)
        
        # hidden shape == (batch_size, hidden size)
        # hidden_with_time_axis shape == (batch_size, 1, hidden size)
        # we are doing this to perform addition to calculate the score
        hidden_with_time_axis = tf.expand_dims(hidden, 1)
        
        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying tanh(FC(EO) + FC(H)) to self.V
        score = self.V(tf.nn.tanh(self.W1(enc_output) + self.W2(hidden_with_time_axis)))
        
        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)
        
        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * enc_output
        context_vector = tf.reduce_sum(context_vector, axis=1)
        
        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)
        
        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        
        # passing the concatenated vector to the GRU
        output, state = self.gru(x)
        
        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))
        
        # output shape == (batch_size * 1, vocab)
        x = self.fc(output)
        
        return x, state, attention_weights
        
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.dec_units))

In [None]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

In [None]:
optimizer = tf.train.AdamOptimizer()


def loss_function(real, pred):
    mask = 1 - np.equal(real, 0)
    loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred) * mask
    return tf.reduce_mean(loss_)

In [None]:
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [None]:
EPOCHS = 10

for epoch in range(EPOCHS):
    start = time.time()
    
    hidden = encoder.initialize_hidden_state()
    total_loss = 0
    
    for (batch, (inp, targ)) in enumerate(dataset):
        loss = 0
        
        with tf.GradientTape() as tape:
            enc_output, enc_hidden = encoder(inp, hidden)
            
            dec_hidden = enc_hidden
            
            dec_input = tf.expand_dims([targ_lang.word2idx['<start>']] * BATCH_SIZE, 1)       
            
            # Teacher forcing - feeding the target as the next input
            for t in range(1, targ.shape[1]):
                # passing enc_output to the decoder
                predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
                
                loss += loss_function(targ[:, t], predictions)
                
                # using teacher forcing
                dec_input = tf.expand_dims(targ[:, t], 1)
        
        batch_loss = (loss / int(targ.shape[1]))
        
        total_loss += batch_loss
        
        variables = encoder.variables + decoder.variables
        
        gradients = tape.gradient(loss, variables)
        
        optimizer.apply_gradients(zip(gradients, variables))
        
        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                         batch,
                                                         batch_loss.numpy()))
    # saving (checkpoint) the model every epoch
    checkpoint.save(file_prefix = checkpoint_prefix)
    
    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                        total_loss / N_BATCH))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 1.5206
Epoch 1 Batch 100 Loss 1.0305
Epoch 1 Batch 200 Loss 0.7886
Epoch 1 Batch 300 Loss 0.7676
Epoch 1 Batch 400 Loss 0.7481
Epoch 1 Loss 0.8671
Time taken for 1 epoch 352.7763113975525 sec

Epoch 2 Batch 0 Loss 0.7193
Epoch 2 Batch 100 Loss 0.6355
Epoch 2 Batch 200 Loss 0.6897
Epoch 2 Batch 300 Loss 0.5928
Epoch 2 Batch 400 Loss 0.5776
Epoch 2 Loss 0.6535
Time taken for 1 epoch 351.2787010669708 sec

Epoch 3 Batch 0 Loss 0.5011
Epoch 3 Batch 100 Loss 0.4631
Epoch 3 Batch 200 Loss 0.5464
Epoch 3 Batch 300 Loss 0.5196
Epoch 3 Batch 400 Loss 0.4962
Epoch 3 Loss 0.4708
Time taken for 1 epoch 353.4211723804474 sec

Epoch 4 Batch 0 Loss 0.3234
Epoch 4 Batch 100 Loss 0.3257
Epoch 4 Batch 200 Loss 0.3345
Epoch 4 Batch 300 Loss 0.3141
Epoch 4 Batch 400 Loss 0.3244
Epoch 4 Loss 0.3200
Time taken for 1 epoch 353.1946771144867 sec

Epoch 5 Batch 0 Loss 0.2030
Epoch 5 Batch 100 Loss 0.2257
Epoch 5 Batch 200 Loss 0.1945
Epoch 5 Batch 300 Loss 0.2250
Epoch 5 Batch 400 Loss 0.2

In [None]:
# restoring the latest checkpoint in checkpoint_dir
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.checkpointable.util.CheckpointLoadStatus at 0x7fe3b1759be0>

#### Inference Setup

In [None]:
def evaluate(inputs, encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ):
    
    attention_plot = np.zeros((max_length_targ, max_length_inp))
    sentence = ''
    for i in inputs[0]:
        if i == 0:
            break
        sentence = sentence + inp_lang.idx2word[i] + ' '
    sentence = sentence[:-1]
    
    inputs = tf.convert_to_tensor(inputs)
    
    result = ''

    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word2idx['<start>']], 0)

    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)
        
        # storing the attention weights to plot later on
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()

        predicted_id = tf.argmax(predictions[0]).numpy()

        result += targ_lang.idx2word[predicted_id] + ' '

        if targ_lang.idx2word[predicted_id] == '<end>':
            return result, sentence, attention_plot
        
        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence, attention_plot

#### Function to predict (translate) a randomly selected test point

In [None]:
def predict_random_val_sentence():
    actual_sent = ''
    k = np.random.randint(len(input_tensor_val))
    random_input = input_tensor_val[k]
    random_output = target_tensor_val[k]
    random_input = np.expand_dims(random_input,0)
    result, sentence, attention_plot = evaluate(random_input, encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)
    print('Input: {}'.format(sentence[8:-6]))
    print('Predicted translation: {}'.format(result[:-6]))
    for i in random_output:
        if i == 0:
            break
        actual_sent = actual_sent + targ_lang.idx2word[i] + ' '
    actual_sent = actual_sent[8:-7]
    print('Actual translation: {}'.format(actual_sent))
    attention_plot = attention_plot[:len(result.split(' '))-2, 1:len(sentence.split(' '))-1]
    sentence, result = sentence.split(' '), result.split(' ')
    sentence = sentence[1:-1]
    result = result[:-2]
    
    # use plotly to generate the heat map
    trace = go.Heatmap(z = attention_plot, x = sentence, y = result, colorscale='Reds')
    data=[trace]
    iplot(data)

#### Call the function to visualize outputs

In [None]:
predict_random_val_sentence()

In [None]:
predict_random_val_sentence()

# TODO Section

## TODO 08/05 
* Reverse engineer this dude's work and understand how to calculate the scores (s1,s2, etc.) thats used to calculate the attention weights

## TODO 09/23
* On the 16th we walked through the dude's code up until the decoder layer where we learn that he used Bahdanau's paper as reference. We also learn about Full Connected Layer where it does a linear and non-linear activation on the input. 
* Learn Bahdanau Attention Algorithm more 
* Go through the rest of the dude's code 

* https://blog.floydhub.com/attention-mechanism/#bahdanau-att-step4
* 