In [None]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
import string
from string import digits
import re
from sklearn.utils import shuffle
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM, Input, Dense,Embedding, Concatenate, TimeDistributed
from tensorflow.keras.models import Model,load_model, model_from_json
from tensorflow.keras.utils import plot_model
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.callbacks import EarlyStopping
import pickle as pkl
import numpy as np
import time
import unicodedata

In [None]:
# Read the whole tab-separated English/Marathi corpus into one string.
# The encoding is passed explicitly because the Marathi column is Devanagari
# text and open() otherwise falls back to a platform-dependent default.
# NOTE(review): assumes the file is stored as UTF-8 - confirm for your copy.
with open('/content/mar.txt', 'r', encoding='utf-8') as f:
  data = f.read()

In [None]:
# Upper bound on the number of lines taken from the raw file.
# NOTE(review): the reason for this particular cap is not visible here.
MAX_SENTENCE_PAIRS = 38695

# One entry per line of the raw file, truncated to the cap above.
uncleaned_data_list = data.split('\n')[:MAX_SENTENCE_PAIRS]

english_word = []
marathi_word = []
cleaned_data_list = []
for line in uncleaned_data_list:
  # Split once per line: field 0 is the English sentence, field 1 the Marathi
  # one; any further fields are ignored.
  fields = line.split('\t')
  if len(fields) < 2:
    # Blank or malformed line (e.g. the empty string left after a trailing
    # newline): skip it instead of failing with an IndexError.
    continue
  english_word.append(fields[0])
  marathi_word.append(fields[1])

# Two-column frame of sentence pairs, also saved to disk for reuse.
language_data = pd.DataFrame({'English': english_word, 'Marathi': marathi_word})
language_data.to_csv('language_data.csv', index=False)

In [None]:
# Quick look at the parsed sentence pairs. Only the value of the last
# expression in a cell is rendered, so the first three lines produce no
# visible output when the cell is run as a whole.
language_data.shape
language_data.head()
language_data['English'].values
language_data['Marathi'].values

array(['जा.', 'पळ!', 'धाव!', ..., 'मला ऑस्ट्रेलियात राहायचं नाहीये.',
       'मला स्वतःबद्दल बोलायचं नाहीये.',
       'मी चुकून माझी हार्ड डिस्क पुसून टाकली.'], dtype=object)

In [None]:
# Pull each language column out of the frame as an array of sentences,
# then show how many sentences each side holds.
english_text, marathi_text = (
    language_data[column].values for column in ('English', 'Marathi')
)
len(english_text), len(marathi_text)

(38695, 38695)

In [None]:
# Lower-case every sentence and delete apostrophes, in one pass per sentence.
english_text_ = [sentence.lower().replace("'", '') for sentence in english_text]
marathi_text_ = [sentence.lower().replace("'", '') for sentence in marathi_text]

def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted.

    Only the ASCII punctuation listed in ``string.punctuation`` is removed;
    all other characters are kept in their original order.
    """
    deletion_table = str.maketrans('', '', string.punctuation)
    return text.translate(deletion_table)

def remove_punctuation_from_list(text_list):
    """Apply ``remove_punctuation`` to each item of ``text_list``; return a new list."""
    return list(map(remove_punctuation, text_list))

# Strip ASCII punctuation from both corpora
english_text_ = remove_punctuation_from_list(english_text_)
marathi_text_ = remove_punctuation_from_list(marathi_text_)

# Delete ASCII digits from the English sentences. Translating a whole sentence
# gives the same string as translating each space-separated word and re-joining.
remove_digits = str.maketrans('', '', digits)
removed_digits_text = [sentence.translate(remove_digits) for sentence in english_text_]
english_text_ = removed_digits_text

# Marathi side: delete Devanagari digits, then zero-width joiners (U+200D),
# then trim surrounding whitespace
marathi_text_ = [
    re.sub("[\u200d]", "", re.sub("[२३०८१५७९४६]", "", sentence)).strip()
    for sentence in marathi_text_
]
english_text_ = [sentence.strip() for sentence in english_text_]

# Wrap every Marathi sentence in the "start" / "end" marker words
marathi_text_ = ["start " + sentence + " end" for sentence in marathi_text_]
marathi_text_[0], english_text_[0]

('start जा end', 'go')

In [None]:
def tokenize_sent(text):
  '''
  Fit a fresh Keras ``Tokenizer`` on a list of sentences.

  Returns a ``(tokenizer, sequences)`` pair, where ``sequences`` is ``text``
  converted by that tokenizer with ``texts_to_sequences``.
  '''
  fitted_tokenizer = Tokenizer()
  fitted_tokenizer.fit_on_texts(text)
  encoded_text = fitted_tokenizer.texts_to_sequences(text)
  return fitted_tokenizer, encoded_text

# Tokenize english and marathi sentences
eng_tokenizer, eng_encoded = tokenize_sent(text=english_text_)
mar_tokenizer, mar_encoded = tokenize_sent(text=marathi_text_)

# English index --> word dictionary
eng_index_word = eng_tokenizer.index_word

# English word --> index dictionary
eng_word_index = eng_tokenizer.word_index

# Size of the English vocabulary for the encoder input.
# +1 because index 0 is used for zero padding.
ENG_VOCAB_SIZE = len(eng_tokenizer.word_counts) + 1 #4494

# Marathi word --> index dict
mar_word_index = mar_tokenizer.word_index

# Marathi index --> word dict
mar_index_word = mar_tokenizer.index_word
# Marathi vocab size for the decoder output (+1 for the padding index)
MAR_VOCAB_SIZE = len(mar_tokenizer.word_counts) + 1 #10642

# Length of the longest encoded sentence in each language.
# Each maximum must be measured on its own language: pad_sequences truncates
# every sequence longer than maxlen (from the front by default), which for the
# Marathi side would cut off the leading "start" token.
max_eng_len = max((len(seq) for seq in eng_encoded), default=0) #9
max_mar_len = max((len(seq) for seq in mar_encoded), default=0)

# Zero-pad both sides at the end, up to the longest sentence
eng_padded = pad_sequences(eng_encoded, maxlen=max_eng_len, padding='post')
mar_padded = pad_sequences(mar_encoded, maxlen=max_mar_len, padding='post')

# Convert to array
eng_padded = np.array(eng_padded)
mar_padded = np.array(mar_padded)

# Split data into train and test set (90% / 10%, fixed seed for repeatability).
# train_test_split is already imported in the first cell.
X_train, X_test, y_train, y_test = train_test_split(eng_padded, mar_padded, test_size=0.1, random_state=0)


In [None]:
EPOCHS = 25
# BUFFER_SIZE is the shuffle buffer size; it is set to the number of training
# examples so the buffer spans the whole training set
BUFFER_SIZE = len(X_train)

# Training and gradient descent happen in batches of BATCH_SIZE examples
BATCH_SIZE = 64

# Number of full batches in one epoch (also the number of training steps per epoch)
steps_per_epoch = len(X_train)//BATCH_SIZE

# Length of the embedding vector
embedding_dim = 256

# Number of recurrent units. The Encoder/Decoder in this notebook use LSTM
# layers sized by hidden_dim; `units` is not referenced by the cells shown here.
units = 1024

# Hidden dimension
hidden_dim = 1024

# Shuffle the training pairs and split them into batches of BATCH_SIZE;
# the incomplete final batch is dropped
dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

print(BUFFER_SIZE)
# Derived from BATCH_SIZE rather than a literal so it cannot drift from the
# batch size actually used above
print(BUFFER_SIZE//BATCH_SIZE)
print(steps_per_epoch)

34825
544
544


In [None]:
class LuongAttention(tf.keras.Model):
    """Attention module with a selectable score function.

    ``attention_func`` chooses how decoder and encoder outputs are scored:
    ``'dot'``, ``'general'`` or ``'concat'``. ``rnn_size`` is the width of the
    Dense layers built for the ``'general'`` and ``'concat'`` scores; ``'dot'``
    builds no layers.

    Raises:
        ValueError: if ``attention_func`` is not one of the three names above.
    """

    def __init__(self, rnn_size, attention_func):
        super().__init__()
        self.attention_func = attention_func

        if attention_func not in ('dot', 'general', 'concat'):
            raise ValueError(
                'Attention score must be either dot, general or concat.')

        # Only the parametrised score functions need trainable layers.
        if attention_func == 'general':
            self.wa = tf.keras.layers.Dense(rnn_size)
        elif attention_func == 'concat':
            self.wa = tf.keras.layers.Dense(rnn_size, activation='tanh')
            self.va = tf.keras.layers.Dense(1)

    def _score(self, decoder_output, encoder_output):
        """Return the raw attention scores, expected shape (batch_size, 1, max_len)."""
        # Shapes expected by the three variants:
        #   decoder_output: (batch_size, 1, rnn_size)
        #   encoder_output: (batch_size, max_len, rnn_size)
        if self.attention_func == 'dot':
            # decoder_output . encoder_output^T
            return tf.matmul(decoder_output, encoder_output, transpose_b=True)

        if self.attention_func == 'general':
            # decoder_output . (Wa encoder_output)^T
            projected_encoder = self.wa(encoder_output)
            return tf.matmul(decoder_output, projected_encoder, transpose_b=True)

        if self.attention_func == 'concat':
            # va(tanh(Wa [decoder_output ; encoder_output]))
            # Repeat the single decoder step along the time axis so it can be
            # concatenated with every encoder position.
            tiled_decoder = tf.tile(
                decoder_output, [1, encoder_output.shape[1], 1])
            joined = tf.concat((tiled_decoder, encoder_output), axis=-1)
            # (batch_size, max_len, 1) after Wa then va
            per_position = self.va(self.wa(joined))
            # Swap the last two axes to match the other two score functions
            return tf.transpose(per_position, [0, 2, 1])

    def call(self, decoder_output, encoder_output):
        """Return ``(context, alignment)`` for one decoder step."""
        score = self._score(decoder_output, encoder_output)

        # Alignment weights: softmax of the scores over the last axis
        alignment = tf.keras.activations.softmax(score, axis=-1)

        # Context vector: alignment-weighted sum of the encoder outputs
        context = tf.matmul(alignment, encoder_output)

        return context, alignment

In [None]:
class Encoder(tf.keras.Model):
    """Encoder made of an embedding layer followed by a single LSTM.

    The LSTM returns the full output sequence as well as its final hidden and
    cell states.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        # Token ids -> dense vectors
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # Recurrent layer
        self.lstm = tf.keras.layers.LSTM(
            hidden_dim, return_sequences=True, return_state=True)

    def call(self, input_sequence, states):
        """Embed ``input_sequence`` and run the LSTM starting from ``states``.

        Returns ``(output, state_h, state_c)`` as produced by the LSTM.
        """
        embedded = self.embedding(input_sequence)
        sequence_output, final_h, final_c = self.lstm(
            embedded, initial_state=states)
        return sequence_output, final_h, final_c

    def init_states(self, batch_size):
        """Return a pair of all-zero tensors of shape (batch_size, hidden_dim)."""
        state_shape = [batch_size, self.hidden_dim]
        return (tf.zeros(state_shape), tf.zeros(state_shape))

In [None]:
class Decoder(tf.keras.Model):
    """Single-step LSTM decoder that attends over the encoder outputs.

    ``attention_func`` is forwarded to ``LuongAttention``.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, attention_func):
        super().__init__()
        self.attention = LuongAttention(hidden_dim, attention_func)
        self.hidden_dim = hidden_dim
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.lstm = tf.keras.layers.LSTM(
            hidden_dim, return_sequences=True, return_state=True)
        # Mixes the context vector with the LSTM output
        self.wc = tf.keras.layers.Dense(hidden_dim, activation='tanh')
        # Projects to vocabulary-sized logits
        self.ws = tf.keras.layers.Dense(vocab_size)

    def call(self, input_sequence, state, encoder_output):
        """Decode one step.

        ``input_sequence`` is expected to be a batch of one-token sequences,
        i.e. shape (batch_size, 1); axis 1 is squeezed below, which only works
        when it has length 1.

        Returns ``(logits, state_h, state_c, alignment)``.
        """
        embedded = self.embedding(input_sequence)

        # step_output is expected to be (batch_size, 1, hidden_dim)
        step_output, state_h, state_c = self.lstm(embedded, initial_state=state)

        # context:   expected (batch_size, 1, hidden_dim)
        # alignment: expected (batch_size, 1, source_length)
        context, alignment = self.attention(step_output, encoder_output)

        # Drop the length-1 time axis from both tensors and join them along
        # the feature axis: (batch_size, 2 * hidden_dim)
        combined = tf.concat(
            [tf.squeeze(context, 1), tf.squeeze(step_output, 1)], 1)

        # Back down to (batch_size, hidden_dim)
        attentional_hidden = self.wc(combined)

        # Vocabulary space: (batch_size, vocab_size)
        logits = self.ws(attentional_hidden)

        return logits, state_h, state_c, alignment

In [None]:
# Encoder sized for the English vocabulary
encoder = Encoder(
    vocab_size=ENG_VOCAB_SIZE,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
)

In [None]:
# Decoder sized for the Marathi vocabulary, using the 'concat' attention score
decoder = Decoder(
    vocab_size=MAR_VOCAB_SIZE,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    attention_func='concat',
)

In [None]:
from tensorflow.keras import backend as K

def loss_func(targets, logits):
    """Sparse categorical cross-entropy on logits, with padding targets weighted 0.

    Positions where ``targets`` equals 0 (the padding index) get a sample
    weight of 0; all other positions get a weight of 1.
    """
    # 1 for real tokens, 0 for padding
    non_padding = tf.cast(tf.math.not_equal(targets, 0), dtype=tf.int64)
    crossentropy = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True)
    return crossentropy(targets, logits, sample_weight=non_padding)

def accuracy_fn(y_true, y_pred):
    """Fraction of non-padding positions whose arg-max prediction is correct.

    Args:
        y_true: integer token ids, expected shape (batch_size, seq_length);
            id 0 is treated as padding and excluded.
        y_pred: scores over the vocabulary, expected shape
            (batch_size, seq_length, vocab_size).

    Returns:
        Scalar float tensor: correct non-padding positions divided by the
        number of non-padding positions, or 0 when there are none.
    """
    pred_values = K.cast(K.argmax(y_pred, axis=-1), dtype='int32')
    correct = K.cast(K.equal(y_true, pred_values), dtype='float32')

    # 0 is padding, don't include those
    mask = K.cast(K.greater(y_true, 0), dtype='float32')
    n_correct = K.sum(mask * correct)
    n_total = K.sum(mask)

    # A batch made only of padding would otherwise give 0 / 0 = NaN
    return tf.math.divide_no_nan(n_correct, n_total)

In [None]:
@tf.function
def train_step(input_seq, target_seq_in, target_seq_out, en_initial_states, optimizer):
    ''' Run one teacher-forced training step on a single batch.

        Uses the module-level ``encoder`` and ``decoder`` models.

        Input:
        - input_seq: integer token ids of the source sentences,
            expected shape [batch_size, source_len].
        - target_seq_in: integer token ids fed to the decoder one position at
            a time (teacher forcing), expected shape [batch_size, target_len].
        - target_seq_out: integer token ids the decoder should predict at each
            position, expected shape [batch_size, target_len].
        - en_initial_states: pair of tensors used as the encoder's initial state.
        - optimizer: optimizer whose ``apply_gradients`` is called with the
            gradients of the encoder and decoder trainable variables.
        Output:
        - (loss, acc): the per-position losses summed over the target length
            and divided by that length, and the batch accuracy from ``accuracy_fn``.
    '''
    n_steps = target_seq_out.shape[1]
    loss = 0.
    step_logits = []

    with tf.GradientTape() as tape:
        en_outputs = encoder(input_seq, en_initial_states)
        # The decoder starts from the encoder's final hidden and cell states
        de_state_h, de_state_c = en_outputs[1:]

        for step in range(n_steps):
            # Feed a single target position, shaped (batch_size, 1)
            decoder_in = tf.expand_dims(target_seq_in[:, step], 1)
            logit, de_state_h, de_state_c, _ = decoder(
                decoder_in, (de_state_h, de_state_c), en_outputs[0])

            # Accumulate the loss of this position
            loss += loss_func(target_seq_out[:, step], logit)
            # Keep the logits, with a time axis added, for the accuracy
            step_logits.append(K.expand_dims(logit, axis=1))

        # Join the per-position logits along the time axis and score the batch
        logits = K.concatenate(step_logits, axis=1)
        acc = accuracy_fn(target_seq_out, logits)

    # Back-propagate through both models and update their weights
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return loss / n_steps, acc

In [None]:
import os
def main_train(encoder, decoder, dataset, n_epochs, batch_size, optimizer, checkpoint, checkpoint_prefix):
    """Train for ``n_epochs`` passes over ``dataset``, logging and checkpointing.

    Args:
        encoder: model providing ``init_states(batch_size)``.
        decoder: accepted for symmetry but not referenced in this function.
            ``train_step`` is called without it and uses the module-level
            ``encoder`` / ``decoder`` objects.
        dataset: iterable of ``(input_seq, target_seq)`` batches.
        n_epochs: number of passes over ``dataset``.
        batch_size: batch size passed to ``encoder.init_states``.
        optimizer: forwarded to ``train_step``.
        checkpoint: object whose ``save(file_prefix=...)`` is called after
            every second epoch.
        checkpoint_prefix: path prefix passed to ``checkpoint.save``.

    Progress is printed every 100 batches. The per-epoch summary reports the
    mean loss and accuracy over all batches of the epoch.
    """
    for epoch in range(n_epochs):
        start = time.time()
        en_initial_states = encoder.init_states(batch_size)

        # Running totals for the epoch summary
        loss_sum = 0.0
        accuracy_sum = 0.0
        n_batches = 0

        for batch, (input_seq, target_seq) in enumerate(dataset):
            # Teacher forcing: the decoder sees the target shifted right by one
            target_seq_in = target_seq[:, :-1]
            target_seq_out = target_seq[:, 1:]
            loss, accuracy = train_step(input_seq, target_seq_in, target_seq_out, en_initial_states, optimizer)
            loss_sum += loss
            accuracy_sum += accuracy
            n_batches += 1
            if batch % 100 == 0:
                print('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1, batch, loss.numpy(), accuracy.numpy()))

        if (epoch + 1) % 2 == 0:
            checkpoint.save(file_prefix=checkpoint_prefix)

        if n_batches:
            print('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(
                epoch + 1, (loss_sum / n_batches).numpy(), (accuracy_sum / n_batches).numpy()))
        else:
            # Nothing was trained, so there are no metrics to report
            print('Epoch {} produced no batches'.format(epoch + 1))
        print('Time taken for 1 epoch {:.2f} sec\n'.format(time.time() - start))
# Adam optimizer with gradient clipping by norm (threshold 5.0)
optimizer = tf.keras.optimizers.Adam(clipnorm=5.0)

# Checkpoint object tracking the optimizer and both models, and where it saves
checkpoint_dir = './training_ckpt_seq2seq'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(
    optimizer=optimizer, encoder=encoder, decoder=decoder)

# Run the training loop
main_train(
    encoder=encoder,
    decoder=decoder,
    dataset=dataset,
    n_epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    optimizer=optimizer,
    checkpoint=checkpoint,
    checkpoint_prefix=checkpoint_prefix,
)


Epoch 1 Batch 0 Loss 6.1031 Accuracy 0.0000
Epoch 1 Batch 100 Loss 3.3490 Accuracy 0.2303
Epoch 1 Batch 200 Loss 3.0762 Accuracy 0.2708
Epoch 1 Batch 300 Loss 3.1131 Accuracy 0.3065
Epoch 1 Batch 400 Loss 2.7275 Accuracy 0.3396
Epoch 1 Batch 500 Loss 2.6202 Accuracy 0.3658
Epoch 1 Loss 2.5731 Accuracy 0.3780
Time taken for 1 epoch 71.05 sec

Epoch 2 Batch 0 Loss 2.3915 Accuracy 0.3940
Epoch 2 Batch 100 Loss 2.1228 Accuracy 0.4465
Epoch 2 Batch 200 Loss 1.9210 Accuracy 0.4247
Epoch 2 Batch 300 Loss 1.8332 Accuracy 0.5268
Epoch 2 Batch 400 Loss 1.6776 Accuracy 0.5539
Epoch 2 Batch 500 Loss 1.3972 Accuracy 0.6000
Epoch 2 Loss 1.4120 Accuracy 0.5868
Time taken for 1 epoch 47.86 sec

Epoch 3 Batch 0 Loss 1.2992 Accuracy 0.5920
Epoch 3 Batch 100 Loss 1.0727 Accuracy 0.6706
Epoch 3 Batch 200 Loss 1.1458 Accuracy 0.6626
Epoch 3 Batch 300 Loss 1.0825 Accuracy 0.6486
Epoch 3 Batch 400 Loss 0.8656 Accuracy 0.7273
Epoch 3 Batch 500 Loss 0.9004 Accuracy 0.7098
Epoch 3 Loss 0.9542 Accuracy 0.6754
Ti

In [None]:
def predict_seq2seq_att(input_sentence, encoder, max_input_len, tokenizer_inputs, word2idx_outputs, idx2word_outputs, max_output_len=20):
    """Greedily translate one sentence and print the input and the prediction.

    Decoding uses the module-level ``decoder`` object.

    Args:
        input_sentence: source sentence as a string. If ``None``, a random
            sentence is drawn from the cleaned English corpus ``english_text_``.
        encoder: encoder model providing ``init_states``.
        max_input_len: length the tokenised input is padded to.
        tokenizer_inputs: tokenizer providing ``texts_to_sequences`` for the
            source language.
        word2idx_outputs: target-language word -> index mapping; must contain
            the ``'start'`` marker.
        idx2word_outputs: target-language index -> word mapping.
        max_output_len: maximum number of words to generate (default 20).

    Returns:
        ``(alignments, input_words, out_words)``: the stacked attention
        alignments of the emitted words, the input sentence split on single
        spaces, and the list of predicted words.
    """
    if input_sentence is None:
        # Sample from the cleaned English sentences defined earlier in the notebook
        input_sentence = english_text_[np.random.choice(len(english_text_))]
    print("Input Sentence:", input_sentence)

    # Tokenize the input text and pad it to the encoder's input length
    input_seq = tokenizer_inputs.texts_to_sequences([input_sentence])
    input_seq = pad_sequences(input_seq, maxlen=max_input_len, padding='post')
    # Encode with all-zero initial states for a batch of one
    en_initial_states = encoder.init_states(1)
    en_outputs = encoder(tf.constant(input_seq), en_initial_states)
    # The first decoder input is the 'start' marker
    de_input = tf.constant([[word2idx_outputs['start']]])
    # The decoder starts from the encoder's final hidden and cell states
    de_state_h, de_state_c = en_outputs[1:]

    out_words = []
    alignments = []

    while len(out_words) < max_output_len:
        # One decoding step with attention over the encoder outputs
        de_output, de_state_h, de_state_c, alignment = decoder(
            de_input, (de_state_h, de_state_c), en_outputs[0])
        # The most likely token becomes the next decoder input, shape (1, 1)
        de_input = tf.expand_dims(tf.argmax(de_output, -1), 0)
        predicted_idx = int(de_input.numpy()[0][0])
        try:
            predicted_word = idx2word_outputs[predicted_idx]
        except (KeyError, IndexError):
            # An index with no word (e.g. the padding index 0) ends decoding
            # instead of raising
            break
        if predicted_word == 'end':
            break
        out_words.append(predicted_word)
        # Save the alignment of this emitted word
        alignments.append(alignment.numpy())

    # Join the output words
    predicted_sequence = ' '.join(out_words)
    print("Predicted Output Sequence:", predicted_sequence)

    return np.array(alignments), input_sentence.split(' '), out_words

# Translate one sample sentence. The call prints the prediction and returns a
# tuple of (alignments, input words, predicted words).
input_sentence = "i do not like my dress"
output_sequence = predict_seq2seq_att(
    input_sentence=input_sentence,
    encoder=encoder,
    max_input_len=max_eng_len,
    tokenizer_inputs=eng_tokenizer,
    word2idx_outputs=mar_word_index,
    idx2word_outputs=mar_index_word,
)


Input Sentence: i do not like my dress
Predicted Output Sequence: मला माझा ड्रेस आवडला नाही
