## Data Preprocessing 

First, let's start preprocessing the datasets. We will first remove the punctuation and unnecessary characters from the data. After that, we will convert each word into its corresponding index.

In [None]:
# Mount Google Drive (Colab only); the corpus paths used later live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import unicodedata
import re
import tensorflow as tf
from sklearn.model_selection import train_test_split
import numpy
import time


In [None]:

class Lang:
    """Vocabulary of one language: word<->index maps plus word frequencies.

    Index 0 is reserved for the ``<pad>`` token. The sequences in this notebook
    are zero-padded and the loss masks targets equal to 0, so no real token
    (in particular ``<start>``) may share index 0 with the padding value.

    Attributes:
        name: label of the language (e.g. "hindi").
        word2index: maps a word to its integer index.
        word2count: maps a word to the number of times it was added.
        index2word: inverse of ``word2index``.
        n_count: number of distinct entries, i.e. the next free index.
    """

    def __init__(self, name):
        self.name = name
        # Special tokens come first; real words are numbered from 3 upwards.
        self.word2index = {"<pad>": 0, "<start>": 1, "<end>": 2}
        self.word2count = {"<pad>": 0, "<start>": 0, "<end>": 0}
        self.index2word = {0: "<pad>", 1: "<start>", 2: "<end>"}
        self.n_count = 3

    def addsentence(self, sent):
        """Register every space-separated token of ``sent`` in the vocabulary."""
        # Split on a single space on purpose: the tokenizer in this notebook
        # splits the same way, so both must see identical tokens.
        for word in sent.split(" "):
            self.addword(word)

    def addword(self, word):
        """Add ``word`` to the vocabulary, or bump its count if already known."""
        if word in self.word2index:
            self.word2count[word] += 1
            return
        self.word2index[word] = self.n_count
        self.word2count[word] = 1
        self.index2word[self.n_count] = word
        self.n_count += 1
    

In [None]:
def unicode_to_ascii(s):
    """Strip Latin-style accents from ``s`` without damaging other scripts.

    The string is NFD-normalized so that accented letters split into a base
    letter plus combining marks, then only marks from the Combining
    Diacritical Marks block (U+0300-U+036F) are dropped.

    Dropping *every* nonspacing mark (category ``Mn``) would also delete
    Devanagari virama and vowel signs, which are part of the spelling of
    Hindi words, so those are deliberately kept. Despite the name, the result
    is therefore not guaranteed to be pure ASCII.

    Args:
        s: input text.

    Returns:
        The NFD-normalized text with accent marks removed.
    """
    decomposed = unicodedata.normalize('NFD', s)
    # Every code point in U+0300-U+036F is a combining accent, so a range
    # check is enough and no category lookup is needed.
    return ''.join(c for c in decomposed if not ('\u0300' <= c <= '\u036f'))

def clean(s):
    """Normalize one sentence and wrap it in ``<start>`` / ``<end>`` markers.

    Steps: lowercase and trim, pass through ``unicode_to_ascii``, replace runs
    of ``.``, ``!`` and ``?`` with a space, then collapse all whitespace so
    that tokens are separated by exactly one space.

    Collapsing matters because the rest of the notebook tokenizes with
    ``split(" ")``: a double space (e.g. left behind by a removed full stop)
    would otherwise become an empty-string "word" in the vocabulary.

    Args:
        s: raw sentence.

    Returns:
        ``"<start> tok1 tok2 ... <end>"``; ``"<start> <end>"`` for an empty
        sentence.
    """
    s = unicode_to_ascii(s.lower().strip())
    s = re.sub(r"[.!?]+", r" ", s)
    # str.split() with no argument drops empty fields, so joining the pieces
    # back guarantees single spaces even when the sentence is empty.
    return " ".join(["<start>", *s.split(), "<end>"])
def read(source,target,links,reverse=False):
    """Load a parallel corpus and build the vocabularies of both languages.

    Args:
        source: key in ``links`` (and vocabulary name) of the source language.
        target: key in ``links`` (and vocabulary name) of the target language.
        links: dict mapping a language name to the path of its text file; the
            two files are read line by line in lockstep.
        reverse: when True, swap the direction so that the target language
            becomes the model input and the source language the output.

    Returns:
        ``(pairs, input_lang, output_lang)`` where every pair is
        ``[input_sentence, output_sentence]`` (already cleaned) and the two
        ``Lang`` objects hold the vocabulary of the matching column.
    """
    # Upper bound on the raw line length, in characters. All sequences are
    # later padded to the longest one, so long lines are dropped.
    MAX_len = 50
    pairs = []
    # The source file must feed column 0 and the target file column 1, so that
    # each Lang object is filled with text of the language it is named after.
    # Explicit UTF-8 so non-ASCII text does not depend on the platform default.
    with open(links[source], encoding="utf-8") as src_file, \
            open(links[target], encoding="utf-8") as tgt_file:
        for src_line, tgt_line in zip(src_file, tgt_file):
            src_line, tgt_line = src_line.strip(), tgt_line.strip()
            if len(src_line) < MAX_len and len(tgt_line) < MAX_len:
                pairs.append([clean(src_line), clean(tgt_line)])

    input_lang = Lang(source)
    output_lang = Lang(target)
    if reverse:
        pairs = [list(reversed(pair)) for pair in pairs]
        input_lang, output_lang = output_lang, input_lang

    for pair in pairs:
        input_lang.addsentence(pair[0])
        output_lang.addsentence(pair[1])
    print(input_lang.n_count)
    return pairs, input_lang, output_lang

def tokenize(pairs,input_lang,output_lang):
    """Turn sentence pairs into two parallel lists of word-index lists.

    Column 0 of every pair is encoded with ``input_lang.word2index`` and
    column 1 with ``output_lang.word2index``. Sentences are split on single
    spaces. The result is not padded.
    """
    def encode(lang, column):
        # One list of indices per pair, in the order the pairs are given.
        encoded = []
        for pair in pairs:
            tokens = pair[column].split(" ")
            encoded.append([lang.word2index[token] for token in tokens])
        return encoded

    input_tensor = encode(input_lang, 0)
    output_tensor = encode(output_lang, 1)
    return input_tensor, output_tensor
def pad(tensor):
    """Pad the sequences in ``tensor`` with Keras ``pad_sequences``.

    ``padding='post'`` puts the padding after each sequence. The padding value
    and target length are the Keras defaults.
    """
    return tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')
  

In [None]:
# Corpus files on the mounted Drive, keyed by language name.
links = {
    "hindi": "/content/drive/My Drive/IITB_hindi.txt",
    "english": "/content/drive/My Drive/IITB_english.txt.en",
}
# Load the sentence pairs and both vocabularies, then report how many pairs were kept.
pairs, input_lang, output_lang = read("hindi", "english", links, reverse=False)
print(len(pairs))

125614
784916


In [None]:
# Encode every sentence pair as lists of vocabulary indices (still unpadded).
input_tensor,output_tensor=tokenize(pairs,input_lang,output_lang)

In [None]:
# Pad both sides (padding='post'); each side is padded independently of the other.
input_tensor=pad(input_tensor)
output_tensor=pad(output_tensor)

In [None]:
# Peek at the first six padded input sequences.
print(input_tensor[:6])
def convert(lang, tensor):
    """Print every index of ``tensor`` next to the word ``lang`` maps it to.

    One ``index ----> word`` line is printed per element, in order. Nothing is
    returned.
    """
    for index in tensor:
        word = lang.index2word[index]
        print("%d ----> %s" % (index, word))
# Sanity check: decode the second padded output sequence index by index.
convert(output_lang,output_tensor[1])
  

[[ 0  2  3  4  5  6  7  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  8  6  9  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 10 11 12 13 14 10 15 16  1  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 10 11 12 13 14 10 17 16  1  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 18 19  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 18 20 21  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]
0 ----> <start>
10 ----> एकसरसाइसर
5 ----> पहचनीयता
11 ----> अनवषक
1 ----> <end>
0 ----> <start>
0 ----> <start>
0 ----> <start>
0 ----> <start>
0 ----> <start>
0 ----> <start>
0 ----> <start>
0 ----> <start>
0 ----> <start>
0 ----> <start>
0 ----> <start>
0 ----> <start>
0 ----> <start>
0 ----> <start>
0 ----> <start>
0 ----> <start>
0 ----> <start>


In [None]:
# Train/validation split: 5% of the pairs are held out for validation.
# NOTE(review): no random_state is passed, so the split is presumably different on every run.
input_tensor_train, input_tensor_val, output_tensor_train, output_tensor_val = train_test_split(input_tensor, output_tensor, test_size=0.05)
# Report the sizes of the four resulting arrays.
print(len(input_tensor_train), len(output_tensor_train), len(input_tensor_val), len(output_tensor_val))

745670 745670 39246 39246


All right, we are now done with the preprocessing part. Next, we will move on to creating the model.

## The Attention model

In [None]:
# Hyperparameters and sizes derived from the training data.
# NOTE(review): the recorded cell outputs in this notebook show a batch size of 64 and
# 64 units, so they appear to come from a run with different values than the ones below.
BUFFER_SIZE = len(input_tensor_train)  # number of training examples
BATCH_SIZE = 128
iterations=int(BUFFER_SIZE/BATCH_SIZE)  # whole batches per epoch (float division, truncated)
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE  # whole batches per epoch (integer division)
embedding_dim = 256  # size of the word embedding vectors
units = 1000  # passed as the GRU size of the encoder and decoder
vocab_inp_size = len(input_lang.word2index)+1  # number of input vocabulary entries, plus one
vocab_tar_size = len(output_lang.word2index)+1  # number of output vocabulary entries, plus one

In [None]:
class Encoder(tf.keras.Model):
  """Embedding + GRU encoder.

  Args:
    vocab_l: number of rows of the embedding table (input vocabulary size).
    embeding_dim: size of each embedding vector.
    enc_units: number of GRU units.
    batch_size: batch size used to build the initial hidden state.
  """

  def __init__(self,vocab_l,embeding_dim,enc_units,batch_size):
    super(Encoder, self).__init__()
    self.batch_size=batch_size
    self.vocab_l=vocab_l
    self.embeding_dim=embeding_dim
    self.enc_units=enc_units
    # Size the embedding from the constructor argument. Reading the
    # notebook-level `embedding_dim` global here would silently ignore the
    # value the caller passed in.
    self.embedding=tf.keras.layers.Embedding(vocab_l,embeding_dim)
    # return_sequences / return_state: the GRU yields both its per-step
    # outputs and its final state.
    self.gru=tf.keras.layers.GRU(self.enc_units,return_sequences=True,return_state=True,recurrent_initializer='glorot_uniform')

  def call(self,x,hidden):
    """Embed the token ids `x` and run the GRU starting from `hidden`.

    Returns:
      (output, state): the GRU's per-step outputs and its final state.
    """
    x=self.embedding(x)
    output,state=self.gru(x,initial_state=hidden)
    return output,state

  def initialize_hidden_state(self):
    """Return an all-zero hidden state of shape (batch_size, enc_units)."""
    return tf.zeros((self.batch_size,self.enc_units))

In [None]:
# Build the training pipeline: one example per (input, output) row, grouped into batches.
dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train,output_tensor_train))
# drop_remainder=True discards a final batch that has fewer than BATCH_SIZE rows.
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
print(dataset)
# Smoke test: run the encoder on the first batch and print the resulting shapes.
example_input_batch,example_target_batch=next(iter(dataset))
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))



<BatchDataset shapes: ((64, 22), (64, 22)), types: (tf.int32, tf.int32)>
Encoder output shape: (batch size, sequence length, units) (64, 22, 64)
Encoder Hidden state shape: (batch size, units) (64, 64)


In [None]:
# Creates a new Encoder (replacing the one from the previous cell) and repeats the shape check.
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))


Encoder output shape: (batch size, sequence length, units) (64, 22, 64)
Encoder Hidden state shape: (batch size, units) (64, 64)


In [None]:
class BahdanauAttention(tf.keras.layers.Layer):
  """Additive attention: score = V(tanh(W1(query) + W2(values)))."""

  def __init__(self, units):
    super().__init__()
    # Two projections of width `units`, followed by a projection to one score.
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    """Return (context_vector, attention_weights) for `query` over `values`."""
    # Insert an axis at position 1 so the projected query broadcasts against
    # the projected values when the two are added.
    expanded_query = tf.expand_dims(query, 1)
    combined = self.W1(expanded_query) + self.W2(values)
    score = self.V(tf.nn.tanh(combined))

    # Normalize the scores along axis 1.
    attention_weights = tf.nn.softmax(score, axis=1)

    # Weighted sum of the values along axis 1.
    context_vector = tf.reduce_sum(attention_weights * values, axis=1)
    return context_vector, attention_weights

In [None]:
# Smoke test: attend over the sample encoder output, using the encoder state as the query.
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (64, 64)
Attention weights shape: (batch_size, sequence_length, 1) (64, 22, 1)


In [None]:
class Decoder(tf.keras.Model):
  """Single-step decoder: attention over the encoder output, then GRU and Dense.

  Args:
    vocab_size: size of the embedding table and of the output layer.
    embedding_dim: size of each embedding vector.
    dec_units: number of GRU units, also used as the attention width.
    batch_sz: stored on the instance; not read by `call`.
  """

  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    super().__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(
        dec_units,
        return_sequences=True,
        return_state=True,
        recurrent_initializer='glorot_uniform',
    )
    self.fc = tf.keras.layers.Dense(vocab_size)

    # Attention layer that scores the encoder output against the hidden state.
    self.attention = BahdanauAttention(dec_units)

  def call(self, x, hidden, enc_output):
    """Run one decoding step.

    `hidden` is used only as the attention query; the GRU itself is called
    without an explicit initial state.

    Returns:
      (logits, state, attention_weights)
    """
    context_vector, attention_weights = self.attention(hidden, enc_output)

    embedded = self.embedding(x)

    # Prepend the context vector to the embedding along the last axis.
    context_step = tf.expand_dims(context_vector, 1)
    gru_input = tf.concat([context_step, embedded], axis=-1)

    output, state = self.gru(gru_input)

    # Merge the leading axes so each row holds one GRU output vector.
    flat_output = tf.reshape(output, (-1, output.shape[2]))

    # Project every row to one score per vocabulary entry.
    logits = self.fc(flat_output)
    return logits, state, attention_weights

In [None]:
# Build the decoder used for training and check the shape of its output.
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

# Random values stand in for one decoder input step; only the output shape is inspected.
sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden, sample_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (64, 172334)


In [None]:
# Adam with its default settings.
optimizer = tf.keras.optimizers.Adam()
# from_logits=True: the predictions passed in are raw scores, not probabilities.
# reduction='none': keep one loss value per example so it can be masked before averaging.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
  """Cross-entropy of `pred` against `real`, ignoring targets equal to 0.

  Per-example losses at positions where `real` is 0 are zeroed out, then the
  mean is taken over all positions (the zeroed ones included).
  """
  per_example = loss_object(real, pred)

  # 1.0 where the target is non-zero, 0.0 where it is 0.
  keep = tf.math.logical_not(tf.math.equal(real, 0))
  weights = tf.cast(keep, dtype=per_example.dtype)

  masked = per_example * weights
  return tf.reduce_mean(masked)

In [None]:
@tf.function
def train_step(inp, targ, enc_hidden):
  """Run one optimization step on a batch and return its loss.

  Uses the notebook-level `encoder`, `decoder`, `optimizer`, `output_lang`
  and `BATCH_SIZE`.

  Args:
    inp: batch of input sequences fed to the encoder.
    targ: batch of target sequences; column t is the target at step t.
    enc_hidden: initial hidden state for the encoder.

  Returns:
    The summed per-step loss divided by the number of target columns.
  """
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(inp, enc_hidden)

    # The decoder starts from the encoder's final state.
    dec_hidden = enc_hidden

    # First decoder input: the <start> index for each of the BATCH_SIZE rows.
    dec_input = tf.expand_dims([output_lang.word2index['<start>']] * BATCH_SIZE, 1)
    # Column 0 of targ is never used as a target; targets start at column 1.
    for t in range(1, targ.shape[1]):
      # passing enc_output to the decoder
      predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

      loss += loss_function(targ[:, t], predictions)

      # using teacher forcing
      dec_input = tf.expand_dims(targ[:, t], 1)

  # Reported loss: the sum over steps divided by the full number of target columns.
  batch_loss = (loss / int(targ.shape[1]))

  variables = encoder.trainable_variables + decoder.trainable_variables

  # Gradients are taken of the summed loss, not of the averaged batch_loss.
  gradients = tape.gradient(loss, variables)

  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [None]:
# Number of passes over the training dataset.
EPOCHS = 1

for epoch in range(EPOCHS):
  start = time.time()

  # The same zero state is passed to every batch; train_step does not return an updated state.
  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, output)) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp,output, enc_hidden)
    total_loss += batch_loss

    # Progress report every 100 batches.
    if batch % 100 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))
  # NOTE(review): this spot was labelled "saving (checkpoint) the model every 2 epochs",
  # but no checkpoint is written anywhere in this loop.


  # Mean batch loss over the epoch, and the wall-clock time it took.
  print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 2.7916
Epoch 1 Batch 20 Loss 2.4520
Epoch 1 Batch 40 Loss 1.8729
Epoch 1 Batch 60 Loss 1.8540
Epoch 1 Batch 80 Loss 1.5140
Epoch 1 Batch 100 Loss 1.3896
Epoch 1 Batch 120 Loss 1.3938
Epoch 1 Batch 140 Loss 1.5101
Epoch 1 Batch 160 Loss 1.5177
Epoch 1 Batch 180 Loss 1.5094
Epoch 1 Batch 200 Loss 1.4319
Epoch 1 Batch 220 Loss 1.5421
Epoch 1 Batch 240 Loss 1.3560
Epoch 1 Batch 260 Loss 1.5613
Epoch 1 Batch 280 Loss 1.5387
Epoch 1 Batch 300 Loss 1.3505
Epoch 1 Batch 320 Loss 1.3932
Epoch 1 Batch 340 Loss 1.4334
Epoch 1 Batch 360 Loss 1.4007
Epoch 1 Batch 380 Loss 1.5733
Epoch 1 Batch 400 Loss 1.4590
Epoch 1 Batch 420 Loss 1.5328
Epoch 1 Batch 440 Loss 1.4894
Epoch 1 Batch 460 Loss 1.4411
Epoch 1 Batch 480 Loss 1.3755
Epoch 1 Batch 500 Loss 1.3306
Epoch 1 Batch 520 Loss 1.3038
Epoch 1 Batch 540 Loss 1.3314
Epoch 1 Batch 560 Loss 1.3730
Epoch 1 Batch 580 Loss 1.5570
Epoch 1 Batch 600 Loss 1.5206
Epoch 1 Batch 620 Loss 1.5274
Epoch 1 Batch 640 Loss 1.5910
Epoch 1 Batch 66