<a href="https://colab.research.google.com/github/enthusiastic2003/SummerTimeFun/blob/main/venus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import unicodedata

import re


# Convert the unicode sequence to ascii
def unicode_to_ascii(s):

  # Normalize the unicode string and remove the non-spacking mark
  return ''.join(c for c in unicodedata.normalize('NFD', s)
      if unicodedata.category(c) != 'Mn')

# Preprocess the sequence
def preprocess_sentence(w):

  # Clean the sequence
  w = unicode_to_ascii(w.lower().strip())

  # Create a space between a word and the punctuation following it also place a space between the punctuation and the following word. Note that punctuation also includes |

  w = re.sub(r"([?.!।])", r" \1 ", w)

  # Add a start and stop token to detect the start and end of the sequence
  w = '<start> ' + w + ' <end>'
  return w

In [2]:
import io

# Create the Dataset
def create_dataset(path):
  lines = io.open(path, encoding='UTF-8').read().strip().split('\n')

  # Loop through lines (sequences) and extract the English and French sequences. Store them as a word-pair
  word_pairs = [[preprocess_sentence(w) for w in l.split('\t', 2)[:-1]]  for l in lines]
  return zip(*word_pairs)

In [3]:
path_to_file='fra_new.txt'

In [4]:
lines = io.open(path_to_file, encoding='UTF-8').read().strip().split('\n')
print(lines[0])
print(preprocess_sentence(lines[0].split('\t', 2)[0]))
print(preprocess_sentence(lines[0].split('\t', 2)[1]))

Go.	Va !	CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)
<start> go .  <end>
<start> va  !  <end>


In [5]:
en, fra = create_dataset(path_to_file)
print(en[0])
print(fra[0])

<start> go .  <end>
<start> va  !  <end>


In [6]:
import tensorflow as tf

# Convert sequences to tokenizers
def tokenize(lang):
  lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(
      filters='')

  # Convert sequences into internal vocab
  lang_tokenizer.fit_on_texts(lang)

  # Convert internal vocab to numbers
  tensor = lang_tokenizer.texts_to_sequences(lang)

  # Pad the tensors to assign equal length to all the sequences
  tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,
                                                         padding='post')

  return tensor, lang_tokenizer

In [7]:
# Load the dataset
def load_dataset(path, num_examples=None):

  # Create dataset (targ_lan = English, inp_lang = French)
  inp_lang,targ_lang = create_dataset(path)

  # Tokenize the sequences
  input_tensor, inp_lang_tokenizer = tokenize(inp_lang)
  target_tensor, targ_lang_tokenizer = tokenize(targ_lang)

  return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

In [8]:
# Consider 50k examples
num_examples = 50000
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(path_to_file)

# Calculate max_length of the target tensors
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]

In [9]:
print(max_length_targ, max_length_inp)

17 11


In [10]:
print(input_tensor[0], target_tensor.shape)

[ 1 31  3  2  0  0  0  0  0  0  0] (58184, 17)


In [11]:
from sklearn.model_selection import train_test_split

# Create training and validation sets using an 80/20 split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

46547 46547 11637 11637


In [12]:
print(input_tensor_train)
print(inp_lang)

[[  1  53  32 ...   0   0   0]
 [  1   7  20 ...   0   0   0]
 [  1  49  64 ...   0   0   0]
 ...
 [  1  69  15 ...   0   0   0]
 [  1  21  52 ...   0   0   0]
 [  1 265   9 ...   0   0   0]]
<keras.src.preprocessing.text.Tokenizer object at 0x7f4d90108c40>


In [13]:
# Show the mapping b/w word index and language tokenizer
def convert(lang, tensor):
  for t in tensor:
    if t != 0:
      print ("%d ----> %s" % (t, lang.index_word[t]))

print ("Input Language; index to word mapping")
convert(inp_lang, input_tensor_train[0])
print ()
print ("Target Language; index to word mapping")
convert(targ_lang, target_tensor_train[0])

Input Language; index to word mapping
1 ----> <start>
53 ----> i'll
32 ----> be
60 ----> out
48 ----> of
614 ----> town
3 ----> .
2 ----> <end>

Target Language; index to word mapping
1 ----> <start>
4 ----> je
10 ----> ne
361 ----> serai
7 ----> pas
24 ----> en
442 ----> ville
3 ----> .
2 ----> <end>


In [14]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 256
units = 1024
vocab_inp_size = len(inp_lang.word_index) + 1
vocab_tar_size = len(targ_lang.word_index) + 1

In [34]:
!mkdir ./utils

In [1]:


import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time
from utils.dataset import NMTDataset
from tensorflow.python.ops import math_ops



ModuleNotFoundError: No module named 'utils.dataset'

In [15]:
BUFFER_SIZE = 32000
BATCH_SIZE = 64
num_examples = 30000

dataset_creator = NMTDataset('en-spa')
train_dataset, val_dataset, inp_lang, targ_lang = dataset_creator.call(num_examples, BUFFER_SIZE, BATCH_SIZE)

In [16]:
# Size of input and target batches
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 11]), TensorShape([64, 17]))

In [17]:
import tensorflow as tf

class Encoder(tf.keras.layers.Layer):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units

        # Embedding layer
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

        # GRU-bidirectional Layer
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')


    # Encoder network comprises an Embedding layer followed by an LSTM layer
    def call(self, x, hidden):
        x = self.embedding(x)
        output_all_cells, state = self.gru(x, initial_state=hidden)
        state = tf.expand_dims(state, 1)
        return output_all_cells, state

    # To initialize the hidden state
    def initialize_hidden_state(self):
      return tf.zeros((self.batch_sz, self.enc_units))



In [18]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

sample_hidden = encoder.initialize_hidden_state()
sample_output, enc_states = encoder(example_input_batch, sample_hidden)

print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print('Encoder state shape: {}'.format(enc_states.shape))



Encoder output shape: (batch size, sequence length, units) (64, 11, 1024)
Encoder state shape: (64, 1, 1024)


In [19]:
import tensorflow as tf

class PayAttention(tf.keras.layers.Layer):
    def __init__(self, units_enc, units_dec,enc_length):
        super(PayAttention, self).__init__() #Call initializer of the superclass
        self.units_enc=units_enc
        self.units_dec=units_dec
        self.enc_length=enc_length


    def call(self, enc_output, dec_states):


        # enc_output= (64, 23, 1024)
        # x=(1, units_dec)
        # x=(1,1)
        x=tf.matmul(dec_states,enc_output, transpose_b=True)
        #weights=(64,23,scalr)
        weights=x

        # softmaxed_weights = (64,1,23)
        softmaxed_weights = tf.nn.softmax(weights, axis=2)
        softmaxed_weights = tf.transpose(softmaxed_weights, perm=[0,2,1])
        context_vector = tf.reduce_sum(softmaxed_weights*enc_output, axis=1)
        return context_vector, softmaxed_weights






In [20]:
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz, enc_units,inp_length):
        super().__init__()
        self.batch_sz=batch_sz
        self.dec_units=dec_units
        self.enc_units=enc_units
        self.embedding_dim=embedding_dim
        self.vocab_size=vocab_size
        self.inp_length=inp_length

        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

        self.attention = PayAttention(self.enc_units , self.dec_units, self.inp_length)

        self.gru = tf.keras.layers.GRU(self.dec_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)

    def call(self, enc_output, dec_input, dec_internal_state=None):
        # hidden is the hidden states of all the units in the encoder
        # context_vector is the context vector from the attention layer
        # dec_input is the input to the decoder

        # Now, first embed the decoder input
        if dec_internal_state==None:
          dec_internal_state = self.gru.get_initial_state(inputs=dec_input)
          dec_internal_state = tf.cast(dec_internal_state, tf.float32)

        # Get Context for each target token generation
        context_vector, softmaxed_weights = self.attention(enc_output, dec_internal_state)

        # Embed the decoder input
        embed_dec_input=self.embedding(dec_input)

        # Context + input = input to the GRU
        x = tf.concat([tf.expand_dims(context_vector, 1), embed_dec_input], axis=-1)

        # Pass through a GRU layer
        output, state = self.gru(x)

        # Pass through a dense layer to get the probabilities distribution over the target vocabulary
        output = tf.reshape(output, (-1, output.shape[2]))
        x = self.fc(output)
        state = tf.expand_dims(state, 1)
        return x, state , softmaxed_weights

In [21]:
decoder = DecoderLayer(vocab_tar_size ,embedding_dim, units, BATCH_SIZE, units, max_length_inp)

In [22]:
dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)
prob_dist, states_decode, weights = decoder(sample_output, dec_input, enc_states )
#decoder(sample_output, dec_input, None )

In [23]:
print(states_decode.shape)

(64, 1, 1024)


In [24]:
print(weights[0])

tf.Tensor(
[[0.08968794]
 [0.08991764]
 [0.08947254]
 [0.08964249]
 [0.08912943]
 [0.08935861]
 [0.08961659]
 [0.09079557]
 [0.09077422]
 [0.09474929]
 [0.0968556 ]], shape=(11, 1), dtype=float32)


In [25]:
# Now we will define the Model using the layers we have already defined.
#class BenLish(tf.keras.Model):
#  def __init__(self, vocab_size_inp, vocab_size_tar, embedding_dim, dec_units, batch_sz, enc_units,inp_length):
#    super().__init__()
#    self.encoder = Encoder(vocab_size_inp, embedding_dim, enc_units, batch_sz)
#    self.decoder = DecoderLayer(vocab_size_tar ,embedding_dim, dec_units, batch_sz, dec_units, inp_length)

#  def call(self, sentence):


In [26]:
#Now we define the loss function. The loss function is the cross entropy loss function. The cross entropy loss function is defined as follows:
#-sum(y_true * log(y_pred), axis=-1)
import numpy as np

with tf.device('/gpu:0'):
  optimizer = tf.optimizers.Adam()

  def loss_function(real, pred):
      mask = 1 - np.equal(real, 0)
      loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred) * mask
      return tf.reduce_mean(loss_)

In [27]:

#Now we run the training loop
with tf.device('/gpu:0'):
  optimizer = tf.keras.optimizers.Adam()
  loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

  decoder = DecoderLayer(vocab_tar_size ,embedding_dim, units, BATCH_SIZE, units, max_length_inp)
  encoder=Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

  import os
  checkpoint_dir = './training_checkpoints'
  checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
  checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                  encoder=encoder,
                                  decoder=decoder)

  def train_step(inp, targ):
      loss=0

      with tf.GradientTape() as tape:
          print("INPUT: ", inp.shape)
          # Initialize the hidden state of the encoder and pass the input to the encoder
          hidden_initialize = encoder.initialize_hidden_state()
          enc_output, enc_hidden = encoder(inp, hidden_initialize)

          dec_hidd=enc_hidden

          # Initialize the hidden state of the decoder
          dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

          # Teacher forcing - feeding the target as the next input
          for t in range(1, targ.shape[1]):
              # passing enc_output to the decoder
              prob_dist, dec_hidd, weights = decoder(sample_output, dec_input, dec_hidd )
              loss += loss_function(targ[:, t], prob_dist)
              # using teacher forcing
              dec_input = tf.expand_dims(targ[:, t], 1)



      batch_loss = (loss / int(targ.shape[1]))
      variables = encoder.trainable_variables + decoder.trainable_variables
      gradients = tape.gradient(loss, variables)
      optimizer.apply_gradients(zip(gradients, variables))
      return batch_loss




In [28]:
print(steps_per_epoch)

727


In [29]:
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.checkpoint.checkpoint.InitializationOnlyStatus at 0x7f4d153d5270>

In [30]:
with tf.device('/gpu:0'):
  EPOCHS=30
  import time

  for epoch in range(EPOCHS):
      start = time.time()
      total_loss = 0

      for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
          batch_loss = train_step(inp, targ)
          total_loss += batch_loss
          print(targ.shape)
          print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                          batch,
                                                          batch_loss.numpy()))
      # saving (checkpoint) the model every 2 epochs
      if (epoch + 1) % 2 == 0:
          checkpoint.save(file_prefix = checkpoint_prefix+str(epoch))

      print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                        total_loss / steps_per_epoch))
      print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))



INPUT:  (64, 11)




(64, 17)
Epoch 1 Batch 0 Loss 3.2724
INPUT:  (64, 11)
(64, 17)
Epoch 1 Batch 1 Loss 3.1274
INPUT:  (64, 11)
(64, 17)
Epoch 1 Batch 2 Loss 3.2640
INPUT:  (64, 11)
(64, 17)
Epoch 1 Batch 3 Loss 3.3906
INPUT:  (64, 11)
(64, 17)
Epoch 1 Batch 4 Loss 3.4265
INPUT:  (64, 11)
(64, 17)
Epoch 1 Batch 5 Loss 3.3311
INPUT:  (64, 11)
(64, 17)
Epoch 1 Batch 6 Loss 3.3186
INPUT:  (64, 11)
(64, 17)
Epoch 1 Batch 7 Loss 3.2849
INPUT:  (64, 11)
(64, 17)
Epoch 1 Batch 8 Loss 3.3095
INPUT:  (64, 11)
(64, 17)
Epoch 1 Batch 9 Loss 3.2574
INPUT:  (64, 11)
(64, 17)
Epoch 1 Batch 10 Loss 3.3554
INPUT:  (64, 11)
(64, 17)
Epoch 1 Batch 11 Loss 3.3125
INPUT:  (64, 11)
(64, 17)
Epoch 1 Batch 12 Loss 3.3160
INPUT:  (64, 11)
(64, 17)
Epoch 1 Batch 13 Loss 3.2659
INPUT:  (64, 11)
(64, 17)
Epoch 1 Batch 14 Loss 2.9791
INPUT:  (64, 11)
(64, 17)
Epoch 1 Batch 15 Loss 2.9851
INPUT:  (64, 11)
(64, 17)
Epoch 1 Batch 16 Loss 2.9893
INPUT:  (64, 11)
(64, 17)
Epoch 1 Batch 17 Loss 2.8766
INPUT:  (64, 11)
(64, 17)
Epoch 1 Bat

KeyboardInterrupt: 

In [None]:
checkpoint.save(file_prefix = checkpoint_prefix)

In [None]:
max_seq_length = 20
#Now we will generate the translation of the input sentence

def evaluate(sentence):

    sentence = preprocess_sentence(sentence)
    print(sentence)
    sentence = inp_lang.texts_to_sequences([sentence])
    print(sentence)
    len_sent=len(sentence[0])
    # Pad the sentence to the max_length_inp

    sentence = tf.keras.preprocessing.sequence.pad_sequences(sentence,
                                                            maxlen=max_length_inp,
                                                            padding='post')

    sentence = tf.convert_to_tensor(sentence)

    hidden_initialize = encoder.initialize_hidden_state()

    sentence = tf.convert_to_tensor([sentence]*BATCH_SIZE)
    sentence = tf.squeeze(sentence)
    print(sentence.shape)

    enc_output, enc_hidden = encoder(sentence, hidden_initialize)
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']]* BATCH_SIZE, 1)
    dec_hidd = enc_hidden
    result = []
    #Disable teacher forcing
    attention_graph=[]
    for i in range(max_seq_length):
        prob_dist, dec_hidd, weights = decoder(sample_output, dec_input, dec_hidd )
        predicted_id = tf.argmax(prob_dist[0]).numpy()
        attention_graph.append(weights)
        result.append(targ_lang.index_word[predicted_id])

        if targ_lang.index_word[predicted_id] == '<end>':
            returnd=(attention_graph,result,len_sent)
            return returnd

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id] * BATCH_SIZE, 0)
        dec_input = tf.transpose(dec_input)
        print(dec_input.shape)

    returnd=(attention_graph,result,len_sent)
    return returnd


resultas = evaluate("Good.")

attend, result, len_inp = resultas[0], resultas[1], resultas[2]


In [None]:
print(len(result))

In [None]:
print(attend[0].shape)

In [None]:
new_attend=[]
for i in range(len(attend)):
    new_attend.append(attend[i][0][:len_inp])

In [None]:
print(len_inp)

In [None]:
print(len(new_attend))

In [None]:
print(new_attend[0].shape)

In [None]:
new_attend=np.array(new_attend)

In [None]:
print(new_attend)

In [None]:
print(result)