Ref:
https://www.tensorflow.org/text/tutorials/nmt_with_attention

In [None]:
!pip install contractions

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import re
import time

import contractions
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# All three splits live in the same folder; keeping the folder in a single
# constant lets the notebook be pointed at another location with one edit.
DATA_DIR = "/content/drive/MyDrive/Portfolio/Machine_Translation/Data"

train_data = pd.read_csv(DATA_DIR + "/train_data_max_len_30.csv", encoding="utf-8")
test_data = pd.read_csv(DATA_DIR + "/test_data_max_len_30.csv", encoding="utf-8")
val_data = pd.read_csv(DATA_DIR + "/val_data_max_len_30.csv", encoding="utf-8")
# train_data ,test_data = train_test_split(train_data,test_size = 0.7, random_state= 42)
# test_data ,val_data = train_test_split(test_data,test_size = 0.5, random_state= 42)

In [None]:
# Display the (rows, columns) shape of the train, test and validation frames.
train_data.shape, test_data.shape, val_data.shape

((383985, 2), (127995, 2), (127995, 2))

In [None]:
# Rough vocabulary-size check: flatten every whitespace-separated token of the
# training sentences into one list per language, then count the distinct ones.
en_words = [token for sentence in train_data['en_sentences'] for token in sentence.split()]
hi_words = [token for sentence in train_data['hi_sentences'] for token in sentence.split()]

print("number of unique words in english : ",len(set(en_words)))
print("number of unique words in hindi : ",len(set(hi_words)))

number of unique words in english :  85415
number of unique words in hindi :  124785


In [None]:
# data_folder_path = 
# NOTE(review): BATCH_SIZE is not referenced by the other cells visible in this
# notebook section — confirm whether it is still needed.
BATCH_SIZE = 64
# Passed as `num_words` to both Keras tokenizers.
MAX_VOCAB_SIZE = 30000
# Passed as `maxlen` to every pad_sequences call.
MAX_SENT_LEN = 30

In [None]:
# Character class of symbols and digits. Written as a raw string so the
# backslash escapes reach the regex engine unchanged instead of relying on
# Python leaving unknown string escapes alone (which emits a warning).
special_char_re = re.compile(r"[-@!#$%^&*<>\(\)\[\]{}?/\|;\+=}{~0-9]")
# One or more consecutive space characters.
multiple_space_re = re.compile(" +")

# https://www.geeksforgeeks.org/nlp-expand-contractions-in-text-processing/
def expand_contraction(sent):
    """Expand contractions in ``sent`` one whitespace-separated word at a time.

    Every word is passed through ``contractions.fix`` and the results are
    re-joined with single spaces, so any run of whitespace in the input
    collapses to one space in the returned string.
    """
    return ' '.join(contractions.fix(word) for word in sent.split())
    
    

def pre_process_text(sent):
    """Normalise one raw sentence and wrap it in ``[START]`` / ``[END]`` markers.

    Steps, in order:
      1. If the sentence contains at least one lowercase ASCII letter, expand
         its contractions with ``expand_contraction``.
      2. Apply a fixed, ordered list of literal replacements: most punctuation
         is deleted, while '।', '.' and '?' are padded with spaces so that
         they end up as separate whitespace-delimited tokens.
      3. Collapse runs of spaces and trim the ends, so the markers are always
         separated from the text by exactly one space.

    Letter case is left untouched.

    Args:
        sent: raw sentence (``str``).

    Returns:
        The cleaned sentence as ``"[START] <text> [END]"``.
    """
    if re.search("[a-z]",sent):
        sent = expand_contraction(sent)

    # Applied sequentially, so order matters: ' _ ' and '_ ' must be tried
    # before the bare '_', and '...' before the single '.'.
    replacements = (
        ('...', ''),
        (' _ ', ''),
        ('_ ', ''),
        ('_', ''),
        (',', ''),
        (':', ''),
        ('"', ""),
        ('”', ""),
        ('“', ""),
        (';', ''),
        ('/', ''),
        ('\\', ''),
        ("ः", ""),
        ("।", " । "),
        (".", " . "),
        ("`", ""),
        ("?", " ? "),
        ("'", " "),
        ("¨", " "),
    )
    for old, new in replacements:
        sent = sent.replace(old, new)

    # The padding added around '.', '?' and '।' can leave repeated or
    # leading/trailing spaces behind; normalise them before adding markers.
    sent = multiple_space_re.sub(" ",sent).strip()

    return "[START] " + sent + " [END]"

In [None]:
# Quick visual check of pre_process_text on one sentence per language.
en_example = "When without a stapler, a staple and a ruler will work"
hi_example = "जब व्यवसायी न हो तो हाट और शासक भी वह काम करते हैं"
for label, sample in (("en : ", en_example), ("hi : ", hi_example)):
    print(label, pre_process_text(sample))

en :  [START] When without a stapler a staple and a ruler will work [END]
hi :  [START] जब व्यवसायी न हो तो हाट और शासक भी वह काम करते हैं [END]


In [None]:
# Pre-process the English column of every split in place.
# Re-running this cell wraps the sentences in [START]/[END] a second time.
for split in (train_data, test_data, val_data):
    split['en_sentences'] = split['en_sentences'].apply(pre_process_text)

In [None]:
# Pre-process the Hindi column of every split in place.
# Re-running this cell wraps the sentences in [START]/[END] a second time.
for split in (train_data, test_data, val_data):
    split['hi_sentences'] = split['hi_sentences'].apply(pre_process_text)

# Tokenizer

### English tokenizer

In [None]:
# Word-level tokenizer for English, fitted on the training split only.
# The filter string leaves out '[', ']' and '.', so the [START]/[END] markers
# and the full stop are kept as tokens.
en_tokenizer = Tokenizer(
    num_words=MAX_VOCAB_SIZE,
    oov_token='[UNK]',
    filters='!"#$%&()*+,-/:;<=>?@^_`{|}~\t\n',
)
en_tokenizer.fit_on_texts(train_data['en_sentences'].values)

In [None]:
# Peek at the first ten (word, index) pairs of the fitted English vocabulary.
print(list(en_tokenizer.word_index.items())[:10])

[('[UNK]', 1), ('[start]', 2), ('[end]', 3), ('.', 4), ('the', 5), ('of', 6), ('and', 7), ('to', 8), ('a', 9), ('in', 10)]


In [None]:
# Sanity check: show the English training sentence in row 0 and the token ids
# the fitted tokenizer assigns to it. The column is selected by name (as the
# Hindi check does) so the result does not depend on column order.
en_sent = train_data.loc[0,'en_sentences']
print(en_sent)
print(en_tokenizer.texts_to_sequences([en_sent]))

[START] when he ran away to the laden ship [END]
[[2, 55, 20, 2665, 221, 8, 5, 6572, 2060, 3]]


In [None]:
# Convert the English sentences of each split to lists of token ids.
en_train_sequences, en_test_sequences, en_val_sequences = (
    en_tokenizer.texts_to_sequences(split['en_sentences'].values)
    for split in (train_data, test_data, val_data)
)

In [None]:
# Right-pad (with 0) or right-truncate every English sequence to MAX_SENT_LEN.
# NOTE(review): truncating="post" cuts from the end, so sequences longer than
# MAX_SENT_LEN lose their trailing [END] id.
en_train_padded_sequences, en_test_padded_sequences, en_val_padded_sequences = (
    pad_sequences(sequences, maxlen=MAX_SENT_LEN, padding='post', truncating="post")
    for sequences in (en_train_sequences, en_test_sequences, en_val_sequences)
)

### Hindi tokenizer

In [None]:
# Word-level tokenizer for Hindi, fitted on the training split only.
# The filter string leaves out '[', ']' and '.', so the [START]/[END] markers
# and the full stop are kept as tokens.
hi_tokenizer = Tokenizer(
    num_words=MAX_VOCAB_SIZE,
    oov_token='[UNK]',
    filters='!"#$%&()*+,-/:;<=>?@^_`{|}~\t\n',
)
hi_tokenizer.fit_on_texts(train_data['hi_sentences'].values)

In [None]:
# Sanity check: show the Hindi training sentence in row 0 and the token ids
# the fitted tokenizer assigns to it.
hi_sent = train_data.loc[0,'hi_sentences']
print(hi_sent)
print(hi_tokenizer.texts_to_sequences([hi_sent]))

[START] याद करो जब वह भरी नौका की ओर भाग निकला [END]
[[2, 395, 370, 66, 34, 1991, 4449, 8, 124, 187, 2915, 3]]


In [None]:
# Convert the Hindi sentences of each split to lists of token ids.
hi_train_sequences, hi_test_sequences, hi_val_sequences = (
    hi_tokenizer.texts_to_sequences(split['hi_sentences'].values)
    for split in (train_data, test_data, val_data)
)

In [None]:
# Right-pad (with 0) or right-truncate every Hindi sequence to MAX_SENT_LEN.
# NOTE(review): truncating="post" cuts from the end, so sequences longer than
# MAX_SENT_LEN lose their trailing [END] id.
hi_train_padded_sequences, hi_test_padded_sequences, hi_val_padded_sequences = (
    pad_sequences(sequences, maxlen=MAX_SENT_LEN, padding='post', truncating="post")
    for sequences in (hi_train_sequences, hi_test_sequences, hi_val_sequences)
)

In [None]:
# Both padded training matrices should have the same (rows, MAX_SENT_LEN) shape.
for label, padded in (
    ("hi_train_padded_sequences.shape : ", hi_train_padded_sequences),
    ("en_train_padded_sequences.shape : ", en_train_padded_sequences),
):
    print(label, padded.shape)

hi_train_padded_sequences.shape :  (383985, 30)
en_train_padded_sequences.shape :  (383985, 30)


In [None]:
# en_text_vectorizer = tf.keras.layers.TextVectorization(
#     standardize=None,
#     max_tokens=MAX_VOCAB_SIZE
#     )
# en_text_vectorizer.adapt(train_data['en_sentences'].values)

# # Here are the first 10 words from the vocabulary:
# en_text_vectorizer.get_vocabulary()[:10]

In [None]:
# hi_text_vectorizer = tf.keras.layers.TextVectorization(
#     standardize=None,
#     max_tokens=MAX_VOCAB_SIZE
#     )

# hi_text_vectorizer.adapt(train_data['hi_sentences'])

# # Here are the first 10 words from the vocabulary:
# hi_text_vectorizer.get_vocabulary()[:10]

In [None]:
# print(en_text_vectorizer.vocabulary_size())
# print(hi_text_vectorizer.vocabulary_size())

In [None]:
# en_text_vectorizer(train_data.loc[0.'en_sentences'])
# hi_text_vectorizer(train_data.loc[0.'en_sentences'])


# Modeling

## Encoder Decoder

In [None]:
# https://blog.paperspace.com/nlp-machine-translation-with-keras/
# https://machinelearningmastery.com/define-encoder-decoder-sequence-sequence-model-neural-machine-translation-keras/
# https://medium.com/analytics-vidhya/machine-translation-encoder-decoder-model-7e4867377161

class Encoder(tf.keras.Model):
    """Embedding layer followed by a single LSTM that encodes source token ids."""

    def __init__(self, inp_vocab_size,inp_embedding_size,lstm_size):
        """Create the layers.

        Args:
            inp_vocab_size: number of rows of the embedding table.
            inp_embedding_size: width of each embedding vector.
            lstm_size: number of LSTM units (also the size of both states).
        """
        super().__init__()
        self.lstm_size = lstm_size
        # mask_zero=True makes the embedding emit a mask for id 0 (the padding
        # value used by pad_sequences).
        self.embedding = tf.keras.layers.Embedding(
            input_dim=inp_vocab_size,
            output_dim=inp_embedding_size,
            mask_zero=True,
            name="encoder_embeddings",
        )
        self.lstm = tf.keras.layers.LSTM(
            units=lstm_size,
            return_sequences=True,
            return_state=True,
            name="encoder_lstm",
        )

    def call(self, input):
        """Encode a batch.

        Args:
            input: two-element sequence ``[token_ids, initial_states]`` where
                ``initial_states`` is what ``initialize_states`` returns.

        Returns:
            ``(per_step_outputs, final_hidden_state, final_cell_state)``.
        """
        token_ids, initial_states = input[0], input[1]
        embedded = self.embedding(token_ids)

        sequence_output, final_hidden, final_cell = self.lstm(embedded, initial_state=initial_states)

        return sequence_output, final_hidden, final_cell

    def initialize_states(self,batch_size):
        """Return all-zero ``(hidden, cell)`` states of shape ``[batch_size, lstm_size]``."""
        state_shape = [batch_size, self.lstm_size]
        return (tf.zeros(state_shape), tf.zeros(state_shape))
        
class Decoder(tf.keras.Model):
    """LSTM decoder with a teacher-forced training path and greedy inference.

    ``call(..., training=True)`` scores a given target sequence;
    ``call(..., training=False)`` and ``single_predict`` generate tokens
    greedily starting from the tokenizer's ``'[start]'`` id. The generation
    paths call ``.numpy()`` and therefore only work in eager mode.
    """

    def __init__(self,tar_vocab_size,tar_embedding_size,lstm_size,target_tokenizer,max_output_len=30):
        """Create the layers.

        Args:
            tar_vocab_size: size of the embedding table and of the softmax output.
            tar_embedding_size: width of each embedding vector.
            lstm_size: number of LSTM units; must match the encoder state size.
            target_tokenizer: fitted tokenizer whose ``word_index`` contains
                ``'[start]'`` and ``'[end]'``.
            max_output_len: maximum number of greedy decoding steps
                (default 30, the previously hard-coded limit).
        """
        super(Decoder,self).__init__()
        self.lstm_size = lstm_size
        self.target_tokenizer = target_tokenizer
        self.max_output_len = max_output_len
        self.decoder_embedding = tf.keras.layers.Embedding(tar_vocab_size, tar_embedding_size,name='decoder_embeddings',mask_zero=True)
        self.decoder_lstm = tf.keras.layers.LSTM(lstm_size, return_sequences=True, return_state=True,name="decoder_lstm")
        self.decoder_dense = tf.keras.layers.Dense(tar_vocab_size,activation='softmax',name="decoder_dense")

    def call(self,input,training=False):
        """Score a target sequence (training) or greedily decode (inference).

        Args:
            input: when ``training`` is true, ``[target_token_ids,
                [hidden_state, cell_state]]``; otherwise just
                ``[hidden_state, cell_state]`` from the encoder.
            training: selects the teacher-forced branch when true.

        Returns:
            Training: softmax probabilities with one distribution per target
            position. Inference: integer NumPy array of shape
            ``(batch, max_output_len + 1)``; column 0 is always 0 and every
            position after a row's ``'[end]'`` id is 0.
        """
        if training:
            target_sequence = input[0]
            encoder_states = input[1]

            decoder_embeddings = self.decoder_embedding(target_sequence)
            decoder_output, _, _ = self.decoder_lstm(decoder_embeddings,initial_state=encoder_states)
            return self.decoder_dense(decoder_output)

        return self._greedy_decode(input)

    def _greedy_decode(self, states):
        """Greedy batch decoding from the given ``[hidden, cell]`` states."""
        batch_size = 1 if states[0].shape[0] is None else states[0].shape[0]
        start_token_index = self.target_tokenizer.word_index['[start]']
        end_token_index = self.target_tokenizer.word_index['[end]']

        # Column 0 is a zero placeholder; predictions fill columns 1..max_output_len.
        translation_op = np.zeros((batch_size, self.max_output_len + 1), dtype=np.int64)
        # Remembers, for the whole loop, which rows have already emitted '[end]'.
        # Looking only at the previous column would mask a single step and then
        # let generation resume.
        finished = np.zeros(batch_size, dtype=bool)
        target_seq = np.full((batch_size, 1), start_token_index, dtype=np.int64)

        for step in range(1, self.max_output_len + 1):
            embeddings = self.decoder_embedding(target_seq)
            decoder_token, hidden_state, cell_state = self.decoder_lstm(embeddings,initial_state=states)
            output = self.decoder_dense(decoder_token)

            pred_token_index = tf.argmax(output,axis=-1).numpy().reshape(batch_size,1)
            # Rows that are already finished only ever receive padding (0).
            pred_token_index[finished,:] = 0

            translation_op[:, step] = pred_token_index[:, 0]
            finished |= (pred_token_index[:, 0] == end_token_index)
            if finished.all():
                # The remaining columns are already zero; no need to keep decoding.
                break

            target_seq = pred_token_index
            states = [hidden_state,cell_state]

        return translation_op

    def single_predict(self,input):
        """Greedily decode one sentence and return it as text.

        Args:
            input: ``[hidden_state, cell_state]`` for a batch of one.

        Returns:
            The predicted words, each preceded by a space. Decoding stops right
            after the ``'[end]'`` token is produced or after
            ``max_output_len + 1`` steps.
        """
        states = input
        translation_op = ""
        end_token_index = self.target_tokenizer.word_index['[end]']
        target_seq = np.array([[self.target_tokenizer.word_index['[start]']]], dtype=np.int64)

        for _ in range(self.max_output_len + 1):
            embeddings = self.decoder_embedding(target_seq)
            decoder_token, hidden_state, cell_state = self.decoder_lstm(embeddings,initial_state=states)
            output = self.decoder_dense(decoder_token)

            pred_token_index = tf.argmax(output,axis=-1).numpy()
            pred_word = self.target_tokenizer.sequences_to_texts(pred_token_index)[0]
            translation_op = translation_op + " " + pred_word

            # Compare ids rather than the decoded text against a literal.
            if int(pred_token_index[0, 0]) == end_token_index:
                break

            target_seq = pred_token_index
            states = [hidden_state,cell_state]

        return translation_op


In [None]:
# Smoke test: push the first few padded English training sequences through a
# freshly initialised Encoder.
# The vocabulary size comes from the same constant the tokenizers were built
# with instead of repeating the literal.
inp_vocab_size = MAX_VOCAB_SIZE
inp_embedding_size = 512
lstm_size = 256
encoder = Encoder(inp_vocab_size,inp_embedding_size,lstm_size)
batch_size = 8
encoder_input = [en_train_padded_sequences[:batch_size,:],encoder.initialize_states(batch_size=batch_size)]
output, hidden_state, cell_state = encoder(encoder_input)

In [None]:
# Shape of the slice that was fed to the encoder.
en_train_padded_sequences[:batch_size,:].shape

(8, 30)

In [None]:
# Shapes of the encoder's per-step outputs and of its final hidden / cell states.
for encoder_tensor in (output, hidden_state, cell_state):
    print(encoder_tensor.shape)

(8, 30, 256)
(8, 256)
(8, 256)


In [None]:
# Smoke test: run a freshly initialised Decoder through both of its branches,
# starting from the encoder states computed in the encoder smoke test.
tar_embedding_size = 512
lstm_size = 256
target_tokenizer = hi_tokenizer
decoder = Decoder(
    tar_vocab_size=MAX_VOCAB_SIZE,
    tar_embedding_size=tar_embedding_size,
    lstm_size=lstm_size,
    target_tokenizer=target_tokenizer,
)

# NOTE(review): the target slice comes from the *test* split while the encoder
# states were computed from the *train* split; harmless for a shape check only.
decoder_op_training_true = decoder(
    [hi_test_padded_sequences[:batch_size,:], [hidden_state, cell_state]],
    training=True,
)
decoder_op_training_false = decoder([hidden_state, cell_state], training=False)

In [None]:
# Output shapes of the decoder's training=True and training=False branches.
decoder_op_training_true.shape,decoder_op_training_false.shape

(TensorShape([8, 30, 30000]), (8, 31))

In [None]:
class NMT(tf.keras.Model):
    """Encoder-decoder translation model built from ``Encoder`` and ``Decoder``."""

    def __init__(self,
                 lstm_size,
                 inp_vocab_size,
                 inp_embedding_size,
                 tar_vocab_size,
                 tar_embedding_size,
                 target_tokenizer):
        """Build the encoder and decoder sub-models.

        Args:
            lstm_size: number of LSTM units shared by encoder and decoder.
            inp_vocab_size: source vocabulary size (encoder embedding rows).
            inp_embedding_size: width of the source embeddings.
            tar_vocab_size: target vocabulary size (decoder embedding rows and
                softmax width).
            tar_embedding_size: width of the target embeddings.
            target_tokenizer: fitted target-language tokenizer, handed to the
                decoder for its start/end ids.
        """
        super(NMT,self).__init__()
        self.encoder = Encoder(inp_vocab_size,inp_embedding_size,lstm_size)
        self.decoder = Decoder(tar_vocab_size,
                               tar_embedding_size,
                               lstm_size,
                               target_tokenizer
                               )
        self.target_tokenizer = target_tokenizer

    def _encode_source(self, input_sequence):
        """Encode source ids from zero initial states; return ``[hidden, cell]``.

        Uses this model's own encoder, so the class does not depend on any
        notebook-level ``encoder`` variable being defined.
        """
        batch_size = input_sequence.shape[0]
        if batch_size is None:
            # Static batch size unknown (e.g. while tracing): fall back to the
            # dynamic shape.
            batch_size = tf.shape(input_sequence)[0]
        initial_states = self.encoder.initialize_states(batch_size=batch_size)
        _, en_hidden_state, en_cell_state = self.encoder([input_sequence, initial_states])
        return [en_hidden_state, en_cell_state]

    def call(self,input,training=False):
        """Run the model.

        Args:
            input: ``[source_ids, target_ids]`` when ``training`` is true,
                otherwise just ``source_ids``.
            training: selects the teacher-forced branch when true.

        Returns:
            Whatever ``Decoder.call`` returns for the selected branch.
        """
        if training:
            input_sequence = input[0]
            target_sequence = input[1]
            decoder_input = [target_sequence, self._encode_source(input_sequence)]
            return self.decoder(decoder_input,training=training)

        return self.decoder(self._encode_source(input))

    def single_predict(self,input):
        """Translate a single encoded sentence and return the decoder's text output."""
        return self.decoder.single_predict(self._encode_source(input))
       

In [None]:
# Instantiate the full translation model with the sizes chosen in earlier cells.
tar_vocab_size = MAX_VOCAB_SIZE
nmt = NMT(
    lstm_size=lstm_size,
    inp_vocab_size=inp_vocab_size,
    inp_embedding_size=inp_embedding_size,
    tar_vocab_size=tar_vocab_size,
    tar_embedding_size=tar_embedding_size,
    target_tokenizer=target_tokenizer,
)

In [None]:
# Smoke test of the inference branch on the first two training sentences.
batch_size = 2
# op = nmt([en_train_padded_sequences[:batch_size,:],hi_train_padded_sequences[:batch_size,:]],training=True)
op = nmt(en_train_padded_sequences[:batch_size], training=False)

In [None]:
# Shape of the inference-branch output produced by the previous cell.
op.shape

(2, 31)

## Training

In [None]:
# One tf.data.Dataset of (english_ids, hindi_ids) pairs per split.
train_tf_dataset, test_tf_dataset, val_tf_dataset = (
    tf.data.Dataset.from_tensor_slices(pair)
    for pair in (
        (en_train_padded_sequences, hi_train_padded_sequences),
        (en_test_padded_sequences, hi_test_padded_sequences),
        (en_val_padded_sequences, hi_val_padded_sequences),
    )
)

In [None]:
# Shapes of the padded English matrices for the train, test and validation splits.
en_train_padded_sequences.shape, en_test_padded_sequences.shape, en_val_padded_sequences.shape

((383985, 30), (127995, 30), (127995, 30))

In [None]:
# Group examples into batches of 128.
# Re-running this cell batches the already-batched datasets again; re-create
# the datasets first if that happens.
train_tf_dataset = train_tf_dataset.batch(128)
test_tf_dataset = test_tf_dataset.batch(128)
# The validation split has to be batched too: NMT reads input.shape[0] as the
# batch size, so it needs 2-D (batch, time) inputs rather than single rows.
val_tf_dataset = val_tf_dataset.batch(128)



In [None]:
# loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
# optimizer = tf.keras.optimizers.SGD(learning_rate=1e-3)

In [None]:
# epochs = 2
# train_loss_at_each_epoch = []
# test_loss_at_each_epoch = []

# for epoch in range(epochs):
#     train_tf_dataset = train_tf_dataset.shuffle(buffer_size = en_train_padded_sequences.shape[0])
#     train_tf_dataset_ = train_tf_dataset.as_numpy_iterator()

#     print("\nStart of epoch %d" % (epoch,))

#     # Iterate over the batches of the dataset.
#     for step, (x_batch_train, y_batch_train) in enumerate(train_tf_dataset_):

#         # Open a GradientTape to record the operations run
#         # during the forward pass, which enables auto-differentiation.
#         with tf.GradientTape() as tape:

#             # Run the forward pass of the layer.
#             # The operations that the layer applies
#             # to its inputs are going to be recorded
#             # on the GradientTape.
#             logits = nmt([x_batch_train,y_batch_train], training=True)  # Logits for this minibatch

#             # Compute the loss value for this minibatch.
#             loss_value = loss_fn(y_batch_train, logits)

#         # Use the gradient tape to automatically retrieve
#         # the gradients of the trainable variables with respect to the loss.
#         grads = tape.gradient(loss_value, nmt.trainable_weights)

#         # Run one step of gradient descent by updating
#         # the value of the variables to minimize the loss.
#         optimizer.apply_gradients(zip(grads, nmt.trainable_weights))
#         break

#     # Log loss at each epoch
#     train_tf_dataset_ = train_tf_dataset.as_numpy_iterator()
#     test_tf_dataset_ = test_tf_dataset.as_numpy_iterator()

#     train_loss = []
#     test_loss = []
#     for (x_batch_train, y_batch_train) in train_tf_dataset_:
#         logits = nmt([x_batch_train,y_batch_train], training=True)  # Logits for this minibatch
#         # Compute the loss value for this minibatch.
#         loss_value = loss_fn(y_batch_train, logits)
#         train_loss.append(loss_value)
    
#     for (x_batch_test, y_batch_test) in test_tf_dataset_:
#         logits = nmt([x_batch_test,y_batch_test], training=True)  # Logits for this minibatch
#         # Compute the loss value for this minibatch.
#         loss_value = loss_fn(y_batch_test, logits)
#         test_loss.append(loss_value)
    
#     mean_train_loss = np.mean(train_loss)
#     mean_test_loss = np.mean(test_loss)
#     train_loss_at_each_epoch.append(mean_train_loss)
#     test_loss_at_each_epoch.append(mean_test_loss)

#     print("Train loss at epoch {} : {}".format(epoch+1,mean_train_loss))
#     print("Test loss at epoch {} : {}".format(epoch+1,mean_test_loss))


In [None]:
# Only `tf` is imported in this notebook, so Keras is reached through tf.keras.
# from_logits=False because the decoder's Dense layer already applies softmax.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)

# train_step calls `optimizer.apply_gradients`, but no live cell defines an
# optimizer, so create it here.
optimizer = tf.keras.optimizers.Adam()

# Prepare the metrics.
train_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
val_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()

In [None]:
@tf.function
def train_step(x, y):
    """Run one teacher-forced optimisation step on the notebook-level ``nmt`` model.

    Args:
        x: batch of padded source token ids, shape (batch, time).
        y: batch of padded target token ids, shape (batch, time).

    Returns:
        The scalar loss for this batch.
    """
    # Teacher forcing: the decoder reads tokens 0..T-2 and is scored on tokens
    # 1..T-1. Feeding and scoring the same positions would only teach it to
    # copy its input.
    decoder_input = y[:, :-1]
    decoder_target = y[:, 1:]
    # Padding positions (id 0) contribute to neither the loss nor the accuracy.
    padding_mask = tf.cast(tf.not_equal(decoder_target, 0), tf.float32)

    with tf.GradientTape() as tape:
        # NMT's training branch expects [source_ids, target_ids].
        probs = nmt([x, decoder_input], training=True)
        loss_value = loss_fn(decoder_target, probs, sample_weight=padding_mask)
    grads = tape.gradient(loss_value, nmt.trainable_weights)
    optimizer.apply_gradients(zip(grads, nmt.trainable_weights))
    train_acc_metric.update_state(decoder_target, probs, sample_weight=padding_mask)
    return loss_value

In [None]:
@tf.function
def test_step(x, y):
    """Update ``val_acc_metric`` with teacher-forced predictions for one batch.

    Args:
        x: batch of padded source token ids, shape (batch, time).
        y: batch of padded target token ids, shape (batch, time).
    """
    decoder_input = y[:, :-1]
    decoder_target = y[:, 1:]
    padding_mask = tf.cast(tf.not_equal(decoder_target, 0), tf.float32)
    # training=True is used only to select NMT's teacher-forced branch, which
    # returns per-token probabilities. The training=False branch decodes
    # greedily with .numpy(), which cannot run inside tf.function and yields
    # token ids, not the probabilities the accuracy metric needs.
    val_probs = nmt([x, decoder_input], training=True)
    val_acc_metric.update_state(decoder_target, val_probs, sample_weight=padding_mask)

In [None]:
# Per-epoch accuracy history, stored as plain floats.
train_acc_history = []
val_acc_history = []

epochs = 20
for epoch in range(epochs):
    print("\nStart of epoch %d" % (epoch,))
    start_time = time.time()
    samples_seen = 0

    # Iterate over the batches of the dataset.
    for step, (x_batch_train, y_batch_train) in enumerate(train_tf_dataset):
        loss_value = train_step(x_batch_train, y_batch_train)
        # Count real rows; the last batch can be smaller than the others.
        samples_seen += int(x_batch_train.shape[0])

        # Log every 200 batches.
        if step % 200 == 0:
            print(
                "Training loss (for one batch) at step %d: %.4f"
                % (step, float(loss_value))
            )
            print("Seen so far: %d samples" % samples_seen)

    # Display metrics at the end of each epoch.
    train_acc = float(train_acc_metric.result())
    print("Training acc over epoch {} : {}".format(epoch, train_acc))

    # Reset training metrics at the end of each epoch
    train_acc_metric.reset_states()

    # Run a validation loop at the end of each epoch.
    for x_batch_val, y_batch_val in val_tf_dataset:
        test_step(x_batch_val, y_batch_val)

    val_acc = float(val_acc_metric.result())
    val_acc_metric.reset_states()
    print("Validation acc over epoch {} : {}".format(epoch, val_acc))

    train_acc_history.append(train_acc)
    val_acc_history.append(val_acc)

    # Early stopping: once at least four epochs have run, stop when this
    # epoch's validation accuracy is no better than either of the two previous
    # epochs (history[-1] is the current epoch).
    if len(val_acc_history) > 3 and val_acc <= val_acc_history[-2] and val_acc <= val_acc_history[-3]:
        print("Early stopping training...")
        nmt.save_weights('/content/NMT_weights/nmt_weights_{}'.format(epoch), save_format='tf')
        with open("/content/NMT_weights/epoch_{}.txt".format(epoch),'w') as f:
            f.write("train_accuracy : {} \t val_accuracy {}".format(str(train_acc),str(val_acc)))

        break

    print("Time taken: %.2fs" % (time.time() - start_time))

In [None]:
# Write the model weights in TensorFlow checkpoint format under the given prefix.
nmt.save_weights('/content/NMT_weights/nmt_weights', save_format='tf')

In [None]:
# Restore from the checkpoint prefix written by the save cell above, so the
# notebook does not depend on weights placed at another path by hand.
nmt.load_weights('/content/NMT_weights/nmt_weights')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f2188d9ed90>

In [None]:
# Translate the first training sentence; the [:1] slice keeps the batch axis.
op = nmt.single_predict(en_train_padded_sequences[:1])

In [None]:
# Decode the source sentence back to text. Padding ids (0) are dropped first;
# otherwise the tokenizer renders every padded position as '[UNK]'.
first_en_sequence = en_train_padded_sequences[0]
print(en_tokenizer.sequences_to_texts([first_en_sequence[first_en_sequence != 0]]))

['[start] when he ran away to the laden ship [end] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK]']


In [None]:
# Print the text returned by nmt.single_predict for the sentence above.
print(op)

 एकरसता विह्वल नियामकीय परछाई सचमुच सचमुच खिलाफ़ उतनी मांद उत्सवों अनैच्छिक एजोला मैप अतुल्यकालित अहम् अहम् पास्ट किरणों रखी चौंका अलिंद छोडने गवर्नर अविवाहित शिकारी फीरोजशाह रक्तस्रावण पातिव्रत घड़नेवाले रक्तस्रावण फीडबैक
