<a href="https://colab.research.google.com/github/girish445ai/Recurrent_Neural_networks/blob/main/Transliteration_with_attention_FINAL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Importing Libraries 

In [None]:
import io
import numpy as np
import tensorflow 
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding, GRU, Dropout, SimpleRNN
from tensorflow.keras.optimizers import Adam, SGD, RMSprop, Nadam
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from math import log
from numpy import array
from numpy import argmax
from tensorflow import keras
import os
import tensorflow as tf
from tensorflow.keras.layers import Layer
from tensorflow.keras import backend as K

import keras
from keras.models import load_model
from math import log1p 

In [None]:
%pip install wandb -q
import wandb
from wandb.keras import WandbCallback

[K     |████████████████████████████████| 1.8 MB 5.3 MB/s 
[K     |████████████████████████████████| 181 kB 39.5 MB/s 
[K     |████████████████████████████████| 144 kB 36.1 MB/s 
[K     |████████████████████████████████| 63 kB 1.5 MB/s 
[?25h  Building wheel for pathtools (setup.py) ... [?25l[?25hdone


### Downloading and extracting the dataset

Lexicons for Latin-Telugu are taken from Google's Dakshina dataset.

In [None]:
# Downloading dakshina dataset
!yes | wget "https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar"

--2022-05-07 19:24:56--  https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar
Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.6.128, 142.250.152.128, 142.251.120.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.6.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2008340480 (1.9G) [application/x-tar]
Saving to: ‘dakshina_dataset_v1.0.tar’


2022-05-07 19:25:11 (127 MB/s) - ‘dakshina_dataset_v1.0.tar’ saved [2008340480/2008340480]



In [None]:
# Unzipping dataset
!yes | tar xopf dakshina_dataset_v1.0.tar

In [None]:
# The folder containing the datasets to be used in this program
!ls dakshina_dataset_v1.0/te/lexicons

te.translit.sampled.dev.tsv   te.translit.sampled.train.tsv
te.translit.sampled.test.tsv


In [None]:
# NOTE(review): presumably a switch for printing intermediate data; it is not
# referenced by any of the visible cells — confirm it is still needed.
print_data = True

## Reading the dataset


In [None]:
# Locations of the Latin-Telugu lexicon splits inside the extracted archive.
train_path = "./dakshina_dataset_v1.0/te/lexicons/te.translit.sampled.train.tsv"
dev_path = "./dakshina_dataset_v1.0/te/lexicons/te.translit.sampled.dev.tsv"
# The test split has its own file; pointing this at the dev file would make
# every "test" measurement a second pass over the validation data.
test_path = "./dakshina_dataset_v1.0/te/lexicons/te.translit.sampled.test.tsv"

def reading_data(corpus_file):
  """Read a tab-separated lexicon file and return its word pairs.

  The first column of each row is stored as the Telugu word and the second
  column as its Latin-script spelling; any further columns are ignored.
  Rows that do not provide both columns (blank lines, lines without a tab,
  or lines whose second column is empty) are skipped.

  Args:
    corpus_file: path to a UTF-8 encoded ``.tsv`` lexicon file.

  Returns:
    ``(latin_words, telugu_words)``: two lists of equal length, in file order.
  """
  telugu_words = []
  latin_words = []
  with io.open(corpus_file, encoding='utf-8') as f:
    for line in f:
      tokens = line.rstrip().split("\t")
      # rstrip() also removes a trailing tab, so a row such as "word\t\n"
      # yields a single token; checking the token count (rather than only
      # looking for a tab) keeps such rows from raising IndexError.
      if len(tokens) < 2:
        continue
      latin_words.append(tokens[1])
      telugu_words.append(tokens[0])
  return latin_words, telugu_words

# Load each lexicon split as (Latin source words, Telugu target words).
train_source, train_target = reading_data(train_path)
val_source, val_target = reading_data(dev_path)
test_source, test_target = reading_data(test_path)

# Report how many word pairs each split contains.
for split_label, split_words in (("training", train_source),
                                 ("validation", val_source),
                                 ("testing", test_source)):
    print("Number of %s samples: " % split_label, len(split_words))

Number of training samples:  58550
Number of validation samples:  5683
Number of testing samples:  5683


In [None]:
def _collect_pairs(sources, targets, input_vocab, target_vocab):
    """Wrap each target word in start/end markers and record the characters seen.

    "B" is prepended as the start-of-sequence marker and "E" is appended as the
    end-of-sequence marker of every target word. Characters of the source
    words are added to ``input_vocab`` and characters of the wrapped target
    words (markers included) to ``target_vocab``; both sets are modified in
    place.

    Returns:
        ``(source_words, wrapped_target_words)`` as two lists in input order.
    """
    source_words = []
    wrapped_targets = []
    for source_word, target_word in zip(sources, targets):
        wrapped = "B" + target_word + "E"
        source_words.append(source_word)
        wrapped_targets.append(wrapped)
        input_vocab.update(source_word)
        target_vocab.update(wrapped)
    return source_words, wrapped_targets


# Random permutations used below to shuffle the training and validation pairs.
arr = np.arange(len(train_source))
np.random.shuffle(arr)
arr1 = np.arange(len(val_source))
np.random.shuffle(arr1)

# Character vocabularies are built from the training AND validation words.
input_chars = set()
target_chars = set()

# "_ns" lists keep the original (not shuffled) file order.
input_texts_ns, target_texts_ns = _collect_pairs(
    train_source, train_target, input_chars, target_chars)
val_input_texts_ns, val_target_texts_ns = _collect_pairs(
    val_source, val_target, input_chars, target_chars)

# Apply the permutations so source/target pairs stay aligned after shuffling.
input_texts = [input_texts_ns[position] for position in arr]
target_texts = [target_texts_ns[position] for position in arr]

val_input_texts = [val_input_texts_ns[position] for position in arr1]
val_target_texts = [val_target_texts_ns[position] for position in arr1]

# The space character is the padding symbol of both vocabularies. In the
# printed vocabularies it sorts first and therefore receives index 0, which is
# the id the embedding layers treat as padding (mask_zero=True).
input_chars.add(" ")
target_chars.add(" ")

input_chars = sorted(list(input_chars))
target_chars = sorted(list(target_chars))


no_enc_tokens = len(input_chars)
no_dec_tokens = len(target_chars)
# Longest word per side; target lengths include the "B"/"E" markers.
enc_seq_length = max([len(txt) for txt in input_texts])
dec_seq_length = max([len(txt) for txt in target_texts])
val_max_encoder_seq_length = max([len(txt) for txt in val_input_texts])
val_max_decoder_seq_length = max([len(txt) for txt in val_target_texts])



print("Number of samples:", len(input_texts))
print("Number of unique input tokens:", no_enc_tokens)
print("Number of unique output tokens:", no_dec_tokens)
print("Max sequence length for inputs:", enc_seq_length)
print("Max sequence length for outputs:", dec_seq_length)
print("Max sequence length for val inputs:", val_max_encoder_seq_length)
print("Max sequence length for val outputs:", val_max_decoder_seq_length)

print(input_chars)
print(target_chars)

Number of samples: 58550
Number of unique input tokens: 27
Number of unique output tokens: 66
Max sequence length for inputs: 25
Max sequence length for outputs: 22
Max sequence length for val inputs: 21
Max sequence length for val outputs: 21
[' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
[' ', 'B', 'E', 'ం', 'ః', 'అ', 'ఆ', 'ఇ', 'ఈ', 'ఉ', 'ఊ', 'ఋ', 'ఎ', 'ఏ', 'ఐ', 'ఒ', 'ఓ', 'ఔ', 'క', 'ఖ', 'గ', 'ఘ', 'చ', 'ఛ', 'జ', 'ఝ', 'ఞ', 'ట', 'ఠ', 'డ', 'ఢ', 'ణ', 'త', 'థ', 'ద', 'ధ', 'న', 'ప', 'ఫ', 'బ', 'భ', 'మ', 'య', 'ర', 'ఱ', 'ల', 'ళ', 'వ', 'శ', 'ష', 'స', 'హ', 'ా', 'ి', 'ీ', 'ు', 'ూ', 'ృ', 'ె', 'ే', 'ై', 'ొ', 'ో', 'ౌ', '్', '\u200c']


In [None]:
# Spot-check a few shuffled pairs; the targets carry the "B"/"E" markers.
print(input_texts[123:130])
print(target_texts[123:130])

['soura', 'pramaadakaramiena', 'cheyistu', 'samvatsaaraalalo', 'naitikataku', 'vaarasulanu', 'droham']
['BసౌరE', 'Bప్రమాదకరమైనE', 'Bచేయిస్తూE', 'Bసంవత్సరాలలోE', 'BనైతికతకుE', 'BవారసులనుE', 'Bద్రోహంE']


**Encoding the training and validation data:**

In [None]:
# Character -> integer id lookup tables; ids follow the sorted vocabulary order.
input_token_index = {char: idx for idx, char in enumerate(input_chars)}
target_token_index = {char: idx for idx, char in enumerate(target_chars)}
print(input_token_index)
print(target_token_index)

{' ': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26}
{' ': 0, 'B': 1, 'E': 2, 'ం': 3, 'ః': 4, 'అ': 5, 'ఆ': 6, 'ఇ': 7, 'ఈ': 8, 'ఉ': 9, 'ఊ': 10, 'ఋ': 11, 'ఎ': 12, 'ఏ': 13, 'ఐ': 14, 'ఒ': 15, 'ఓ': 16, 'ఔ': 17, 'క': 18, 'ఖ': 19, 'గ': 20, 'ఘ': 21, 'చ': 22, 'ఛ': 23, 'జ': 24, 'ఝ': 25, 'ఞ': 26, 'ట': 27, 'ఠ': 28, 'డ': 29, 'ఢ': 30, 'ణ': 31, 'త': 32, 'థ': 33, 'ద': 34, 'ధ': 35, 'న': 36, 'ప': 37, 'ఫ': 38, 'బ': 39, 'భ': 40, 'మ': 41, 'య': 42, 'ర': 43, 'ఱ': 44, 'ల': 45, 'ళ': 46, 'వ': 47, 'శ': 48, 'ష': 49, 'స': 50, 'హ': 51, 'ా': 52, 'ి': 53, 'ీ': 54, 'ు': 55, 'ూ': 56, 'ృ': 57, 'ె': 58, 'ే': 59, 'ై': 60, 'ొ': 61, 'ో': 62, 'ౌ': 63, '్': 64, '\u200c': 65}


In [None]:
def _vectorize_pairs(sources, targets, enc_len, dec_len):
    """Convert word pairs into padded id arrays plus one-hot decoder targets.

    Args:
        sources: source (Latin) words.
        targets: target words already wrapped in the "B"/"E" markers.
        enc_len: number of time steps of the encoder array.
        dec_len: number of time steps of the decoder arrays.

    Returns:
        ``(enc_ids, dec_ids, dec_one_hot)`` as float32 arrays:
        * ``enc_ids``  (n, enc_len): source character ids, padded with the id of " ".
        * ``dec_ids``  (n, dec_len): target character ids, padded with the id of " ".
        * ``dec_one_hot`` (n, dec_len, no_dec_tokens): the target sequence
          shifted one step to the left (so the start marker is dropped),
          one-hot encoded, with every remaining step set to the " " class.
    """
    pad_in = input_token_index[" "]
    pad_out = target_token_index[" "]
    n_rows = len(sources)

    enc_ids = np.full((n_rows, enc_len), pad_in, dtype="float32")
    dec_ids = np.full((n_rows, dec_len), pad_out, dtype="float32")
    dec_one_hot = np.zeros((n_rows, dec_len, no_dec_tokens), dtype="float32")

    for row, (source_word, target_word) in enumerate(zip(sources, targets)):
        for pos, char in enumerate(source_word):
            enc_ids[row, pos] = input_token_index[char]

        for pos, char in enumerate(target_word):
            dec_ids[row, pos] = target_token_index[char]
            if pos > 0:
                # The target at step pos-1 is the decoder input of step pos.
                dec_one_hot[row, pos - 1, target_token_index[char]] = 1.0
        # Steps after the last real target character are labelled as padding.
        # Using the word length (not a leftover loop index) keeps this correct
        # even for an empty word.
        dec_one_hot[row, max(len(target_word) - 1, 0):, pad_out] = 1.0

    return enc_ids, dec_ids, dec_one_hot


# Training arrays, padded to the longest training source / target word.
enc_input_data, dec_input_data, dec_target_data = _vectorize_pairs(
    input_texts, target_texts, enc_seq_length, dec_seq_length
)

# Validation arrays, padded to the longest validation source / target word.
# They hold one row per validation pair; allocating them with the training-set
# size would leave tens of thousands of all-padding rows behind the real data.
val_enc_input_data, val_dec_input_data, val_dec_target_data = _vectorize_pairs(
    val_input_texts, val_target_texts,
    val_max_encoder_seq_length, val_max_decoder_seq_length
)




In [None]:
# Inverse lookups (integer id -> character) for turning id sequences back into text.
reverse_input_char_index = {idx: char for char, idx in input_token_index.items()}
reverse_target_char_index = {idx: char for char, idx in target_token_index.items()}
print(reverse_target_char_index)

{0: ' ', 1: 'B', 2: 'E', 3: 'ం', 4: 'ః', 5: 'అ', 6: 'ఆ', 7: 'ఇ', 8: 'ఈ', 9: 'ఉ', 10: 'ఊ', 11: 'ఋ', 12: 'ఎ', 13: 'ఏ', 14: 'ఐ', 15: 'ఒ', 16: 'ఓ', 17: 'ఔ', 18: 'క', 19: 'ఖ', 20: 'గ', 21: 'ఘ', 22: 'చ', 23: 'ఛ', 24: 'జ', 25: 'ఝ', 26: 'ఞ', 27: 'ట', 28: 'ఠ', 29: 'డ', 30: 'ఢ', 31: 'ణ', 32: 'త', 33: 'థ', 34: 'ద', 35: 'ధ', 36: 'న', 37: 'ప', 38: 'ఫ', 39: 'బ', 40: 'భ', 41: 'మ', 42: 'య', 43: 'ర', 44: 'ఱ', 45: 'ల', 46: 'ళ', 47: 'వ', 48: 'శ', 49: 'ష', 50: 'స', 51: 'హ', 52: 'ా', 53: 'ి', 54: 'ీ', 55: 'ు', 56: 'ూ', 57: 'ృ', 58: 'ె', 59: 'ే', 60: 'ై', 61: 'ొ', 62: 'ో', 63: 'ౌ', 64: '్', 65: '\u200c'}


In [None]:
# Inspect one encoded training example: encoder ids, decoder input ids and
# the one-hot decoder targets.
print(enc_input_data[1])
print(dec_input_data[1])
print(dec_target_data[1])

[16.  1.  5. 12. 21.  4. 21. 11. 21.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.]
[ 1. 37. 59. 45. 55. 29. 55. 18. 55.  2.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]


For Validation and testing:

In [None]:
# Inspect one encoded validation example: decoder input ids and one-hot targets.
print(val_dec_input_data[26])
print(val_dec_target_data[26])

[ 1. 37. 58. 43. 53. 20. 53. 37. 62. 32. 55. 36. 64. 36.  2.  0.  0.  0.
  0.  0.  0.]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]


In [None]:
# Despite the names, these alias the VALIDATION data: the encoded validation
# encoder inputs and the marker-wrapped validation target strings. train_sweep
# passes both to MyRNN.build_fit.
x_test = val_enc_input_data
y_test = val_target_texts

In [None]:
class AttentionLayer(tf.keras.layers.Layer):
  """Additive (tanh-scored) attention of one query vector over a value sequence.

  NOTE(review): a later cell defines another class that is also named
  ``AttentionLayer``; once that cell has run, the name refers to that class
  and this definition is shadowed.

  Args:
    units: width of the two projection layers applied to query and values.
    **kwargs: forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``).
  """
  def __init__(self, units, **kwargs):
    # `units` must not be forwarded positionally: the first positional
    # parameter of Layer.__init__ is `trainable`, so passing it there would
    # silently configure the layer's trainability instead.
    super(AttentionLayer, self).__init__(**kwargs)
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    """Return ``(context_vector, attention_weights)`` for ``query`` over ``values``.

    Assumes ``query`` is (batch, hidden) and ``values`` is
    (batch, time, hidden) — TODO confirm against the caller. The weights are
    normalised with a softmax over axis 1 and the context vector is the
    weighted sum of ``values`` over that same axis.
    """
    # Insert an axis at position 1 so the query broadcasts against `values`.
    query_with_time_axis = tf.expand_dims(query, 1)

    score = self.V(tf.nn.tanh(
        self.W1(query_with_time_axis) + self.W2(values)))

    attention_weights = tf.nn.softmax(score, axis=1)
    context_vector = attention_weights * values
    context_vector = tf.reduce_sum(context_vector, axis=1)

    return context_vector, attention_weights

In [None]:
class AttentionLayer(Layer):
    """Additive attention evaluated for every step of a decoder output sequence.

    The layer is called on ``[encoder_output_sequence, decoder_output_sequence]``
    and returns ``(context_vectors, attention_weights)``, one entry per decoder
    step. Shapes noted in the comments assume both inputs are
    (batch, time, features) — TODO confirm against the caller.
    """

    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the three weight matrices from the two input shapes."""
        # input_shape[0] describes the encoder sequence, input_shape[1] the
        # decoder sequence; index 2 is the last (feature) dimension.
        enc_dim = input_shape[0][2]
        dec_dim = input_shape[1][2]

        # Projects the encoder outputs.
        self.W_a = self.add_weight(name='W_a',
                                   shape=tf.TensorShape((enc_dim, enc_dim)),
                                   initializer='uniform',
                                   trainable=True)

        # Projects a decoder output into the encoder feature size.
        self.U_a = self.add_weight(name='U_a',
                                   shape=tf.TensorShape((dec_dim, enc_dim)),
                                   initializer='uniform',
                                   trainable=True)

        # Reduces each combined vector to a single score.
        self.V_a = self.add_weight(name='V_a',
                                   shape=tf.TensorShape((enc_dim, 1)),
                                   initializer='uniform',
                                   trainable=True)

        # Must stay last: marks the layer as built.
        super(AttentionLayer, self).build(input_shape)

    def call(self, inputs):
        """Compute context vectors and attention weights.

        inputs: [encoder_output_sequence, decoder_output_sequence]
        """
        encoder_out_seq, decoder_out_seq = inputs

        def energy_step(inputs, states):
            # Step function for K.rnn: attention weights of ONE decoder step
            # (`inputs`) over all encoder steps. `states` is ignored.

            # S.Wa for every encoder step  -> (batch, en_seq_len, enc_dim)
            projected_encoder = K.dot(encoder_out_seq, self.W_a)

            # hj.Ua with a broadcast axis  -> (batch, 1, enc_dim)
            projected_decoder = K.expand_dims(K.dot(inputs, self.U_a), 1)

            # tanh(S.Wa + hj.Ua)           -> (batch, en_seq_len, enc_dim)
            combined = K.tanh(projected_encoder + projected_decoder)

            # va.tanh(...) then softmax    -> (batch, en_seq_len)
            scores = K.squeeze(K.dot(combined, self.V_a), axis=-1)
            weights = K.softmax(scores)

            return weights, [weights]

        def context_step(inputs, states):
            # Step function for K.rnn: weighted sum of the encoder outputs
            # for ONE decoder step, where `inputs` holds that step's weights.
            context = K.sum(encoder_out_seq * K.expand_dims(inputs, -1), axis=1)

            return context, [context]

        # K.rnn needs initial states; these only provide placeholders and the
        # step functions never read them.
        fake_state_c = K.sum(encoder_out_seq, axis=1)  # encoder time axis summed away
        fake_state_e = K.sum(encoder_out_seq, axis=2)  # feature axis summed away

        # Attention weights for every decoder step:
        # e_outputs => (batch_size, de_seq_len, en_seq_len)
        _, e_outputs, _ = K.rnn(
            energy_step, decoder_out_seq, [fake_state_e],
        )

        # Context vector for every decoder step.
        _, c_outputs, _ = K.rnn(
            context_step, e_outputs, [fake_state_c],
        )

        return c_outputs, e_outputs



## MODEL TRAINING 

In [None]:
import tensorflow as tf

class MyRNN(object):
  """Configurable encoder-decoder transliteration model with attention.

  The object only stores hyperparameters; `build_fit` builds, trains and
  scores the Keras model. The class relies on names defined at notebook level:
  `no_enc_tokens`, `no_dec_tokens`, `AttentionLayer`, `wandb` /
  `WandbCallback` and the encoded validation arrays `val_enc_input_data`,
  `val_dec_input_data` and `val_dec_target_data`.

  Args:
    cell_type: recurrent cell to use: 'LSTM', 'GRU' or 'RNN' (SimpleRNN).
    in_emb: dimension of the encoder character embedding.
    hidden_size: units of every recurrent layer; also used as the dimension
      of the decoder character embedding.
    learning_rate: learning rate of the Adam optimizer.
    dropout: `dropout` argument given to every recurrent layer.
    epochs: number of training epochs.
    batch_size: batch size used for training and prediction.
    num_enc: number of stacked encoder layers (at least one is always built).
    num_dec: number of stacked decoder layers (at least one is always built).
  """

  # Values of `cell_type` that `build_fit` knows how to build.
  SUPPORTED_CELLS = ('LSTM', 'GRU', 'RNN')

  def __init__(self,cell_type = 'RNN',in_emb = 32, hidden_size=32, learning_rate= 1e-3, 
               dropout=0.4,epochs = 10, batch_size = 32,
               num_enc = 1,num_dec = 1):
    
    self.cell_type = cell_type
    self.in_emb = in_emb
    self.hidden_size = hidden_size
    self.learning_rate = learning_rate
    self.dropout = dropout
    self.epochs = epochs
    self.batch_size = batch_size
    self.num_enc = num_enc
    self.num_dec = num_dec

  def build_fit(self,enc_input_data,dec_input_data,dec_target_data,x_test, y_test):
    """Build the model, train it, and log word-level validation accuracy.

    Args:
      enc_input_data: encoder character ids of the training words.
      dec_input_data: decoder input character ids (teacher forcing).
      dec_target_data: one-hot decoder targets.
      x_test: currently unused; validation reads the notebook-level
        `val_enc_input_data` / `val_dec_input_data` / `val_dec_target_data`.
      y_test: currently unused.

    Returns:
      `(last_encoder_layer, attention_layer, last_decoder_layer, decoder_dense)`.

    Raises:
      ValueError: if `self.cell_type` is not one of `SUPPORTED_CELLS`.
    """
    if self.cell_type not in self.SUPPORTED_CELLS:
      raise ValueError("Unsupported cell_type %r; expected one of: %s"
                       % (self.cell_type, ', '.join(self.SUPPORTED_CELLS)))
    # One code path for all cell types: only the layer class differs. LSTM
    # layers return (outputs, h, c), the others (outputs, h); the star
    # unpacking below handles both.
    rnn_class = {'LSTM': LSTM, 'GRU': GRU, 'RNN': SimpleRNN}[self.cell_type]

    enc_inputs = Input(shape=(None, ),name = 'Enc_inputs')

    # Embedding over the no_enc_tokens input characters with in_emb
    # dimensions; id 0 is masked as padding.
    enc_emb =  Embedding(no_enc_tokens, self.in_emb , mask_zero = True,name = 'Enc_emb')(enc_inputs)

    # Encoder stack: every layer after the first starts from the final
    # state(s) of the layer below it.
    enc_outputs = enc_emb
    encoder_states = None
    encoder_layer = None
    for i in range(1, max(self.num_enc, 1) + 1):
      encoder_layer = rnn_class(self.hidden_size, return_state=True, dropout = self.dropout,
                                return_sequences=True, name='Enc_hidden_%d' % i)
      if encoder_states is None:
        enc_outputs, *encoder_states = encoder_layer(enc_outputs)
      else:
        enc_outputs, *encoder_states = encoder_layer(enc_outputs, initial_state = encoder_states)

    # Decoder stack: every layer is initialised with the final encoder
    # state(s). The embedding dimension equals hidden_size.
    dec_inputs = Input(shape=(None,), name = 'Dec_inputs')
    dec_emb_layer = Embedding(no_dec_tokens, self.hidden_size, mask_zero = True, name = 'Dec_emb')
    dec_outputs = dec_emb_layer(dec_inputs)
    decoder_layer = None
    for i in range(1, max(self.num_dec, 1) + 1):
      decoder_layer = rnn_class(self.hidden_size, return_sequences=True, return_state=True,
                                dropout = self.dropout, name='Dec_hidden_%d' % i)
      # The returned states are not needed by the training model.
      dec_outputs = decoder_layer(dec_outputs, initial_state = encoder_states)[0]

    attention_layer = AttentionLayer(name='attention_layer')
    attention_out, attention_states = attention_layer([enc_outputs, dec_outputs])

    decoder_concat_input = keras.layers.Concatenate(axis=-1, name='concat_layer')([dec_outputs, attention_out])

    decoder_dense = Dense(no_dec_tokens, activation='softmax', name = 'dense')
    # The classifier must read the concatenation of decoder output and
    # attention context; feeding it `dec_outputs` alone would leave the
    # attention layer disconnected from the model output.
    dec_outputs = decoder_dense(decoder_concat_input)

    # Model mapping [encoder ids, decoder input ids] -> per-step distributions.
    model = Model([enc_inputs, dec_inputs], dec_outputs)

    model.summary()

    # `learning_rate` is the supported argument name (`lr` is deprecated).
    optimizer = Adam(learning_rate=self.learning_rate, beta_1=0.9, beta_2=0.999)
    model.compile(loss = "categorical_crossentropy", optimizer = optimizer, metrics=['accuracy'])
  
    model.fit(
        [enc_input_data, dec_input_data],
        dec_target_data,
        batch_size=self.batch_size,
        epochs=self.epochs,
        callbacks = [WandbCallback()]
        )

    # Score at most the first 5500 validation words. The model has two
    # inputs, so the (teacher-forcing) decoder inputs must be supplied too.
    val_total = min(5500, len(val_dec_target_data))
    pred = model.predict(
        [val_enc_input_data[:val_total], val_dec_input_data[:val_total]],
        batch_size=self.batch_size)

    predicted_ids = np.argmax(pred, axis=-1)
    expected_ids = np.argmax(val_dec_target_data[:val_total], axis=-1)

    # A word counts as correct only if every time step matches.
    global_total = 0
    global_correct = 0
    for index in range(0, val_total):
      if (predicted_ids[index] == expected_ids[index]).all():
        global_correct = global_correct + 1

      global_total = global_total + 1
      if global_total % 50 == 0:
        # Running accuracy over the words scored so far.
        wandb.log({'epoch_accuracy' : global_correct/global_total})

    val_accuracy = global_correct/global_total if global_total else 0.0

    wandb.log({'val_accuracy' : val_accuracy})

    return encoder_layer, attention_layer, decoder_layer, decoder_dense

  def evaluate(self,seq_in):
    """Decode one input sequence step by step and collect attention weights.

    NOTE(review): this method reads `max_decoder_seq_length`,
    `max_encoder_seq_length`, `num_encoder_tokens`, `num_decoder_tokens`,
    `encoder`, `attention`, `decoder`, `decoder_dense` and `Lambda`, none of
    which are defined or imported in the visible cells; it also reshapes
    `seq_in` as if it were one-hot encoded. Confirm where these come from
    before relying on it.

    Returns:
      `(seq_in, seq_out, attention_plot, weigh_atten)`: arg-max ids of input
      and prediction, a (decoder steps x encoder steps) weight matrix, and
      the list of per-step attention weight tensors.
    """
    attention_plot = np.zeros((max_decoder_seq_length, max_encoder_seq_length))
    sequence = seq_in
    encoder_inputs=array(sequence).reshape(1,max_encoder_seq_length,num_encoder_tokens)
    
    encoder_inputs = tf.convert_to_tensor(encoder_inputs,dtype=tf.float32)
    
    if self.cell_type == 'LSTM':
      encoder_outputs, encoder_state_h, encoder_state_c = encoder(encoder_inputs)
      encoder_states = [encoder_state_h, encoder_state_c]
    elif self.cell_type == 'GRU':
      encoder_outputs, encoder_state_h = encoder(encoder_inputs)
      encoder_states = [encoder_state_h]
    elif self.cell_type == 'RNN':
      encoder_outputs, encoder_state_h = encoder(encoder_inputs)
      encoder_states = [encoder_state_h]

    all_outputs = []

    # First decoder input: a one-hot vector with class 0 set.
    decoder_input_data = np.zeros((1, 1, num_decoder_tokens))
    decoder_input_data[:, 0, 0] = 1 

    inputs = decoder_input_data
    decoder_outputs = encoder_state_h
    states = encoder_states

    weigh_atten =[]
    for t in range(max_decoder_seq_length):

      # Attend over the encoder outputs using the current decoder output.
      context_vector, attention_weights=attention(decoder_outputs, encoder_outputs)

      # Keep the flattened weights for plotting later on.
      attention_weights = tf.reshape(attention_weights, (-1, ))
      weigh_atten.append(attention_weights)
      
      attention_plot[t] = attention_weights.numpy()
      
      decoder_outputs=tf.expand_dims(decoder_outputs, 1)

      context_vector = tf.expand_dims(context_vector, 1)
      inputs = tf.concat([context_vector, inputs], axis=-1)

      if self.cell_type == 'LSTM':
        decoder_outputs, state_h, state_c = decoder(inputs, initial_state=states)
      if self.cell_type == 'GRU':
        decoder_outputs, state_h = decoder(inputs, initial_state=states)
      if self.cell_type == 'RNN':
        decoder_outputs, state_h = decoder(inputs, initial_state=states)
            
      outputs = decoder_dense(decoder_outputs)
      # Store this step's prediction; all steps are concatenated below.
      outputs = tf.expand_dims(outputs, 1)
      all_outputs.append(outputs)
      # The prediction becomes the next decoder input.
      inputs = outputs
      if self.cell_type == 'LSTM':
        states = [state_h, state_c]
      if self.cell_type == 'GRU' or self.cell_type == 'RNN':
        states = [state_h]
    
    decoder_outputs = Lambda(lambda x: K.concatenate(x, axis=1))(all_outputs)
    seq_outs = decoder_outputs[0]
    seq_out = tf.argmax(seq_outs, axis=1)
    seq_out = seq_out.numpy()
    seq_in = tf.argmax(seq_in, axis = 1)
    seq_in = seq_in.numpy()
    # Zero ids are NOT removed here; `translate` filters them itself.
    
    return seq_in, seq_out, attention_plot, weigh_atten

  def plot_attention(self,attention, sequence, predicted_sequence, idx,fig):
    """Draw one attention matrix into cell `idx` of a 4 x 3 subplot grid.

    NOTE(review): uses `reverse_source_char_index`, `tamil_font` and `ticker`,
    which are not defined in the visible cells (the visible inverse input
    lookup is named `reverse_input_char_index`) — confirm they exist.

    Args:
      attention: 2-D matrix of attention weights to display.
      sequence: input character ids used for the x tick labels.
      predicted_sequence: predicted character ids used for the y tick labels.
      idx: 1-based subplot position inside the 4 x 3 grid.
      fig: matplotlib figure that receives the subplot.
    """
    
    ax = fig.add_subplot(4, 3, idx)
    ax.matshow(attention, cmap='viridis')

    fontdict = {'fontsize': 16}
    # Decode the id sequences back into characters for the tick labels.
    seq = ''
    for i in range(len(sequence)):
      seq = seq + reverse_source_char_index[sequence[i]]
    
    pred = ''
    for i in range(len(predicted_sequence)):
      pred = pred + reverse_target_char_index[predicted_sequence[i]]

    ax.set_xticklabels(seq, fontdict=fontdict)
    ax.set_yticklabels(pred, fontdict=fontdict, fontproperties = tamil_font)

    # One tick per character on both axes.
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
    
    
  def translate(self,seq_in, idx,fig):
    """Decode `seq_in`, plot its attention map at grid position `idx`.

    Returns:
      The per-step attention weights produced by `evaluate`.
    """
    seq_in, seq_out, attention_plot, weigh_atten = self.evaluate(seq_in)

    # Input ids without zeros, preceded by a single leading 0.
    a = [0]
    for i in range(len(seq_in)):
      if seq_in[i] != 0:
        a.append(seq_in[i])

    # Predicted ids without zeros.
    b = []
    for i in range(len(seq_out)):
      if seq_out[i] != 0:
        b.append(seq_out[i])
  
    # Drop the last predicted id (presumably the end marker — TODO confirm).
    b = b[:len(b)-1]
    
    # Crop the weight matrix to the kept output rows / input columns.
    attention_plot = attention_plot[:len(b), :len(a)]
    self.plot_attention(attention_plot, a, b, idx,fig)  

    return weigh_atten

  def attention_plot(self,val_input):
    """Plot attention maps for 12 inputs (every 9th entry of `val_input`).

    NOTE(review): uses `plt`, which is not imported in the visible cells.

    Returns:
      A list with the attention weights of each plotted example.
    """
    w_a = []
    fig = plt.figure(figsize=(16,18))
    # Positions 1..12 fill the 4 x 3 grid used by `plot_attention`.
    for i in range(1,13,1): 
      seq_in = val_input[i*9]
      weigh_atten = self.translate(seq_in,i,fig)  
      w_a.append(weigh_atten)
    plt.show()
    return w_a

## Sweep

In [None]:
# W&B sweep definition: Bayesian search that maximizes 'val_accuracy', the
# metric MyRNN.build_fit logs with wandb.log after training.
# 'epochs' is not swept here; train_sweep's config_defaults supplies it.
sweep_config = {
    'method': 'bayes', 
    'metric': {
      'name': 'val_accuracy',
      'goal': 'maximize'   
    },
    # Candidate values per hyperparameter; the keys match the MyRNN
    # constructor arguments read in train_sweep.
    'parameters': {

        'dropout': {
            'values': [0.0, 0.1, 0.2]
        },
        'learning_rate': {
            'values': [1e-3, 1e-4]
        },
        'batch_size': {
            'values': [64, 128]
        },
        'in_emb': {
            'values': [32, 64, 128]
        },
        'num_enc': {
            'values': [1, 2, 3]
        },
        'num_dec': {
            'values': [1, 2, 3]
        },
        'hidden_size':{
            'values': [32, 64, 128]
        },
        # Only GRU cells are explored by this sweep.
        'cell_type': {
            'values': ['GRU']
        }
    }
}

In [None]:
def train_sweep():
  """Run a single W&B trial: start a run, build a MyRNN from its config, train it.

  The values below are only fallbacks; hyperparameters chosen by the sweep
  take their place in `wandb.config`. Training uses the notebook-level
  encoded arrays and the `x_test` / `y_test` aliases.
  """
  fallback_config = dict(
      dropout=0.4,
      learning_rate=1e-3,
      batch_size=32,
      epochs=10,
      in_emb=32,
      num_enc=2,
      num_dec=2,
      hidden_size=32,
      cell_type='RNN',
  )

  # Start a new run; wandb.config then holds the effective hyperparameters.
  wandb.init(config=fallback_config)
  cfg = wandb.config

  # Readable run name, e.g. "<cell>__bs_<batch size>_hs_<hidden size>".
  wandb.run.name = '%s__bs_%s_hs_%s' % (cfg.cell_type, cfg.batch_size, cfg.hidden_size)

  hyperparams = dict(
      cell_type=cfg.cell_type,
      in_emb=cfg.in_emb,
      hidden_size=cfg.hidden_size,
      learning_rate=cfg.learning_rate,
      dropout=cfg.dropout,
      epochs=cfg.epochs,
      batch_size=cfg.batch_size,
      num_enc=cfg.num_enc,
      num_dec=cfg.num_dec,
  )
  trainer = MyRNN(**hyperparams)

  trainer.build_fit(enc_input_data, dec_input_data, dec_target_data, x_test, y_test)

In [None]:
# Register the sweep with W&B, then run its trials on this kernel; the agent
# calls train_sweep once per trial.
sweep_id = wandb.sweep(sweep_config, entity="jyothiraditya", project="assignment3")
wandb.agent(sweep_id, train_sweep)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit: ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Create sweep with ID: kuatwm7t
Sweep URL: https://wandb.ai/jyothiraditya/assignment3/sweeps/kuatwm7t


[34m[1mwandb[0m: Agent Starting Run: s4myjtcz with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	in_emb: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec: 2
[34m[1mwandb[0m: 	num_enc: 2
[34m[1mwandb[0m: Currently logged in as: [33mjyothiraditya[0m. Use [1m`wandb login --relogin`[0m to force relogin


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Enc_inputs (InputLayer)        [(None, None)]       0           []                               
                                                                                                  
 Enc_emb (Embedding)            (None, None, 128)    3456        ['Enc_inputs[0][0]']             
                                                                                                  
 Dec_inputs (InputLayer)        [(None, None)]       0           []                               
                                                                                                  
 Enc_hidden_1 (LSTM)            [(None, None, 64),   49408       ['Enc_emb[0][0]']                
                                 (None, 64),                                                  

  super(Adam, self).__init__(name, **kwargs)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▄▆▆▇▇████
epoch,▁▂▃▃▄▅▆▆▇█
loss,█▅▃▂▂▂▁▁▁▁

0,1
accuracy,0.88906
epoch,9.0
loss,0.16955


Run s4myjtcz errored: ValueError('in user code:\n\n    File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1801, in predict_function  *\n        return step_function(self, iterator)\n    File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1790, in step_function  **\n        outputs = model.distribute_strategy.run(run_step, args=(data,))\n    File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1783, in run_step  **\n        outputs = model.predict_step(data)\n    File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1751, in predict_step\n        return self(x, training=False)\n    File "/usr/local/lib/python3.7/dist-packages/keras/utils/traceback_utils.py", line 67, in error_handler\n        raise e.with_traceback(filtered_tb) from None\n    File "/usr/local/lib/python3.7/dist-packages/keras/engine/input_spec.py", line 200, in assert_input_compatibility\n        raise ValueError(f\'Layer "

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Enc_inputs (InputLayer)        [(None, None)]       0           []                               
                                                                                                  
 Enc_emb (Embedding)            (None, None, 64)     1728        ['Enc_inputs[0][0]']             
                                                                                                  
 Enc_hidden_1 (SimpleRNN)       [(None, None, 128),  24704       ['Enc_emb[0][0]']                
                                 (None, 128)]                                                     
                                                                                                  
 Dec_inputs (InputLayer)        [(None, None)]       0           []                           