In [113]:
from random import randint
from numpy import array
from numpy import argmax
import keras.backend as K
from tensorflow.keras import models
from numpy import array_equal
import numpy as np
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import LSTM, Bidirectional, SimpleRNN, GRU
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras import Input
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.layers import RepeatVector
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import load_model
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from tensorflow.keras.optimizers import Adam

from tensorflow.keras.layers import Lambda
from tensorflow.keras import backend as K
import tensorflow as tf
import io

tf.keras.backend.set_floatx('float64')

In [114]:
!pip install wandb



In [115]:
import wandb 
from wandb.keras import WandbCallback

In [156]:
# Downloading dakshina dataset
!yes | wget "https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar"

--2022-05-07 11:27:37--  https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.157.128, 142.251.8.128, 74.125.23.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.157.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2008340480 (1.9G) [application/x-tar]
Saving to: ‘dakshina_dataset_v1.0.tar.2’


2022-05-07 11:27:48 (182 MB/s) - ‘dakshina_dataset_v1.0.tar.2’ saved [2008340480/2008340480]



In [157]:
# Unzipping dataset
!yes | tar xopf dakshina_dataset_v1.0.tar

In [159]:
# The folder containing the datasets to be used in this program
!ls dakshina_dataset_v1.0/te/lexicons

te.translit.sampled.dev.tsv   te.translit.sampled.train.tsv
te.translit.sampled.test.tsv


In [171]:
!git clone https://github.com/girish445ai/lexicons.git 

Cloning into 'lexicons'...
fatal: could not read Username for 'https://github.com': No such device or address


In [160]:
train_dir = "lexicon-dataset/te.translit.sampled.train.tsv"
dev_dir = "lexicon-dataset/te.translit.sampled.dev.tsv"
test_dir = "lexicon-dataset/te.translit.sampled.test.tsv"


def read_corpus(corpus_file):
  tamil_words = []
  latin_words = []
  with io.open(corpus_file, encoding ='utf-8') as f:
    for line in f:
      if '\t' not in line:
        continue
      tokens = line.rstrip().split("\t")
      latin_words.append(tokens[1])
      tamil_words.append(tokens[0])
  return latin_words, tamil_words

train_source, train_target = read_corpus(train_dir)
valid_source, valid_target = read_corpus(dev_dir)
test_source, test_target = read_corpus(test_dir)

print("Number of training samples: ", len(train_source))
print("Number of validation samples: ", len(valid_source))
print("Number of testing samples: ", len(test_source))

Number of training samples:  58550
Number of validation samples:  5683
Number of testing samples:  5747


In [161]:
arr = np.arange(len(train_source))
np.random.shuffle(arr)
arr1 = np.arange(len(valid_source))
np.random.shuffle(arr1)

input_characters = set()
target_characters = set()
input_texts_ns = []
target_texts_ns = []
val_input_texts_ns = []
val_target_texts_ns = []

for (input_text, target_text) in zip(train_source, train_target):
    # We use "tab" as the "start sequence" character
    # for the targets, and "\n" as "end sequence" character.
    target_text = "B" + target_text + "E"
    input_texts_ns.append(input_text)
    target_texts_ns.append(target_text)
    for char in input_text:
        if char not in input_characters:
            input_characters.add(char)
    for char in target_text:
        if char not in target_characters:
            target_characters.add(char)

for (input_text, target_text) in zip(valid_source, valid_target):
    # We use "tab" as the "start sequence" character
    # for the targets, and "\n" as "end sequence" character.
    target_text = "B" + target_text + "E"
    val_input_texts_ns.append(input_text)
    val_target_texts_ns.append(target_text)
    for char in input_text:
        if char not in input_characters:
            input_characters.add(char)
    for char in target_text:
      
        if char not in target_characters:
            target_characters.add(char)

input_texts = []
target_texts = []

for i in range(len(train_source)):
    input_texts.append(input_texts_ns[arr[i]])
    target_texts.append(target_texts_ns[arr[i]])

val_input_texts = []
val_target_texts = []

for i in range(len(valid_source)):
    val_input_texts.append(val_input_texts_ns[arr1[i]])
    val_target_texts.append(val_target_texts_ns[arr1[i]])

input_characters.add(" ")
target_characters.add(" ")

input_characters = sorted(list(input_characters))
target_characters = sorted(list(target_characters))



# Adding the padding character
#input_characters.append("P")
#target_characters.append("P")

num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
max_encoder_seq_length = max([len(txt) for txt in input_texts])
max_decoder_seq_length = max([len(txt) for txt in target_texts])
val_max_encoder_seq_length = max([len(txt) for txt in val_input_texts])
val_max_decoder_seq_length = max([len(txt) for txt in val_target_texts])



print("Number of samples:", len(input_texts))
print("Number of unique input tokens:", num_encoder_tokens)
print("Number of unique output tokens:", num_decoder_tokens)
print("Max sequence length for inputs:", max_encoder_seq_length)
print("Max sequence length for outputs:", max_decoder_seq_length)
print("Max sequence length for val inputs:", val_max_encoder_seq_length)
print("Max sequence length for val outputs:", val_max_decoder_seq_length)

print(input_characters)
print(target_characters)

Number of samples: 58550
Number of unique input tokens: 27
Number of unique output tokens: 66
Max sequence length for inputs: 25
Max sequence length for outputs: 22
Max sequence length for val inputs: 21
Max sequence length for val outputs: 21
[' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
[' ', 'B', 'E', 'ం', 'ః', 'అ', 'ఆ', 'ఇ', 'ఈ', 'ఉ', 'ఊ', 'ఋ', 'ఎ', 'ఏ', 'ఐ', 'ఒ', 'ఓ', 'ఔ', 'క', 'ఖ', 'గ', 'ఘ', 'చ', 'ఛ', 'జ', 'ఝ', 'ఞ', 'ట', 'ఠ', 'డ', 'ఢ', 'ణ', 'త', 'థ', 'ద', 'ధ', 'న', 'ప', 'ఫ', 'బ', 'భ', 'మ', 'య', 'ర', 'ఱ', 'ల', 'ళ', 'వ', 'శ', 'ష', 'స', 'హ', 'ా', 'ి', 'ీ', 'ు', 'ూ', 'ృ', 'ె', 'ే', 'ై', 'ొ', 'ో', 'ౌ', '్', '\u200c']


In [162]:
input_token_index = dict([(char, i) for i, char in enumerate(input_characters)])
target_token_index = dict([(char, i) for i, char in enumerate(target_characters)])
reverse_source_char_index = dict((i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict((i, char) for char, i in target_token_index.items())
print(input_token_index)
print(target_token_index)

{' ': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26}
{' ': 0, 'B': 1, 'E': 2, 'ం': 3, 'ః': 4, 'అ': 5, 'ఆ': 6, 'ఇ': 7, 'ఈ': 8, 'ఉ': 9, 'ఊ': 10, 'ఋ': 11, 'ఎ': 12, 'ఏ': 13, 'ఐ': 14, 'ఒ': 15, 'ఓ': 16, 'ఔ': 17, 'క': 18, 'ఖ': 19, 'గ': 20, 'ఘ': 21, 'చ': 22, 'ఛ': 23, 'జ': 24, 'ఝ': 25, 'ఞ': 26, 'ట': 27, 'ఠ': 28, 'డ': 29, 'ఢ': 30, 'ణ': 31, 'త': 32, 'థ': 33, 'ద': 34, 'ధ': 35, 'న': 36, 'ప': 37, 'ఫ': 38, 'బ': 39, 'భ': 40, 'మ': 41, 'య': 42, 'ర': 43, 'ఱ': 44, 'ల': 45, 'ళ': 46, 'వ': 47, 'శ': 48, 'ష': 49, 'స': 50, 'హ': 51, 'ా': 52, 'ి': 53, 'ీ': 54, 'ు': 55, 'ూ': 56, 'ృ': 57, 'ె': 58, 'ే': 59, 'ై': 60, 'ొ': 61, 'ో': 62, 'ౌ': 63, '్': 64, '\u200c': 65}


In [163]:
trunc_input_texts = input_texts[:68096]
trunc_target_texts = target_texts[:68096]

encoder_input_data = np.zeros(
    (len(trunc_input_texts), max_encoder_seq_length, num_encoder_tokens), dtype="float64"
)
decoder_target_data = np.zeros(
    (len(trunc_input_texts), max_decoder_seq_length, num_decoder_tokens), dtype="float64"
)

for i, (input_text, target_text) in enumerate(zip(trunc_input_texts, trunc_target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.0
    encoder_input_data[i, t + 1 :, input_token_index[" "]] = 1.0
    for t, char in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_target_data[i, t, target_token_index[char]] = 1.0
    decoder_target_data[i, t + 1 :, target_token_index[" "]] = 1.0
    
val_encoder_input_data = np.zeros(
    (len(val_input_texts), max_encoder_seq_length, num_encoder_tokens), dtype="float64"
)
val_decoder_target_data = np.zeros(
    (len(val_target_texts), max_decoder_seq_length, num_decoder_tokens), dtype="float64"
)

for i, (input_text, target_text) in enumerate(zip(val_input_texts, val_target_texts)):
    for t, char in enumerate(input_text):
        val_encoder_input_data[i, t, input_token_index[char]] = 1.0
    #encoder_input_data[i, t + 1 :] = input_token_index["P"]
    val_encoder_input_data[i, t + 1 :, input_token_index[" "]] = 1.0

    for t, char in enumerate(target_text):
      # decoder_target_data is ahead of decoder_input_data by one timestep
        val_decoder_target_data[i, t, target_token_index[char]] = 1.0
    val_decoder_target_data[i, t + 1: ,target_token_index[" "]] = 1.0

In [164]:
class BahdanauAttention(tf.keras.layers.Layer):
  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)
    
  def call(self, query, values):
    query_with_time_axis = tf.expand_dims(query, 1)
    
    score = self.V(tf.nn.tanh(
        self.W1(query_with_time_axis) + self.W2(values)))
    attention_weights = tf.nn.softmax(score, axis=1)
    context_vector = attention_weights * values
    context_vector = tf.reduce_sum(context_vector, axis=1)
    return context_vector, attention_weights

In [165]:
class LuongAttention(tf.keras.layers.Layer):
  def __init__(self, units):
    super(LuongAttention, self).__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
   
    query_with_time_axis = tf.expand_dims(query, 1)
    
    values_transposed = tf.transpose(values, perm=[0, 2, 1])
    
    score = tf.transpose(tf.matmul(query_with_time_axis, values_transposed) , perm=[0, 2, 1])

    attention_weights = tf.nn.softmax(score, axis=1)
    context_vector = attention_weights * values
    context_vector = tf.reduce_sum(context_vector, axis=1)

    return context_vector, attention_weights

In [166]:
class MyRNN_atten(object):
  def __init__(self,cell_type = 'RNN', hidden_size=32, 
               learning_rate= 1e-3,dropout=0.3,epochs = 10, batch_size = 32,
               attention = 'bahdanau'):
    
    self.cell_type = cell_type
    self.hidden_size = hidden_size
    self.learning_rate = learning_rate
    self.dropout = dropout
    self.epochs = epochs
    self.batch_size = batch_size
    self.attention = attention

  def build_fit(self,encoder_input_data,decoder_target_data):

    encoder_inputs = Input(shape=(max_encoder_seq_length, num_encoder_tokens), name='encoder_inputs')
    if self.cell_type == 'LSTM':
      encoder_lstm = LSTM(self.hidden_size,return_sequences=True, return_state=True, dropout = self.dropout, name='encoder_lstm')
      encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm(encoder_inputs)
      encoder_states = [encoder_state_h, encoder_state_c]
    elif self.cell_type == 'GRU':
      encoder_gru = GRU(self.hidden_size,return_sequences=True, return_state=True, dropout = self.dropout, name='encoder_gru')
      encoder_outputs, encoder_state_h = encoder_gru(encoder_inputs)
      encoder_states = [encoder_state_h]
    elif self.cell_type == 'RNN':
      encoder_rnn = SimpleRNN(self.hidden_size,return_sequences=True, return_state=True, dropout = self.dropout, name='encoder_rnn')
      encoder_outputs, encoder_state_h = encoder_rnn(encoder_inputs)
      encoder_states = [encoder_state_h]

    # Set up the attention layer
    if self.attention == 'bahdanau':
      attention= BahdanauAttention(self.hidden_size)
    elif self.attention == 'luong':
      attention= LuongAttention(self.hidden_size)

    # Set up the decoder layers
    decoder_inputs = Input(shape=(1, (num_decoder_tokens+self.hidden_size)),name='decoder_inputs')
    if self.cell_type == 'LSTM':
      decoder_lstm = LSTM(self.hidden_size, dropout = self.dropout, return_state=True, name='decoder_lstm')
    elif self.cell_type == 'GRU':
      decoder_gru = GRU(self.hidden_size, dropout = self.dropout, return_state=True, name='decoder_gru')
    elif self.cell_type == 'RNN':
      decoder_rnn = SimpleRNN(self.hidden_size, dropout = self.dropout, return_state=True, name='decoder_rnn')  
    
    decoder_dense = Dense(num_decoder_tokens, activation='softmax',  name='decoder_dense')

    all_outputs = []

    inputs = np.zeros((self.batch_size, 1, num_decoder_tokens))
    inputs[:, 0, 0] = 1 

    decoder_outputs = encoder_state_h
    states = encoder_states

    for _ in range(max_decoder_seq_length):

      context_vector, attention_weights=attention(decoder_outputs, encoder_outputs)
      
      context_vector = tf.expand_dims(context_vector, 1)
      
      inputs = tf.concat([context_vector, inputs], axis=-1)
      if self.cell_type == 'LSTM':
        decoder_outputs, state_h, state_c = decoder_lstm(inputs, initial_state=states)
      if self.cell_type == 'GRU':
        decoder_outputs, state_h = decoder_gru(inputs, initial_state=states)
      if self.cell_type == 'RNN':
        decoder_outputs, state_h = decoder_rnn(inputs, initial_state=states)
      
      outputs = decoder_dense(decoder_outputs)
      outputs = tf.expand_dims(outputs, 1)
      all_outputs.append(outputs)
      inputs = outputs
      if self.cell_type == 'LSTM':
        states = [state_h, state_c]
      if self.cell_type == 'GRU' or self.cell_type == 'RNN':
        states = [state_h]


    decoder_outputs = Lambda(lambda x: K.concatenate(x, axis=1))(all_outputs)
    #getindicelayer = Lambda(lambda x: x[:, -1, :]) 
    #decoder_outputs = getindicelayer(all_outputs)

    model = Model(encoder_inputs, decoder_outputs, name='model_encoder_decoder')
    
    optimizer = Adam(learning_rate=self.learning_rate, beta_1=0.9, beta_2=0.999)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    
    #model.summary()

    model.fit(encoder_input_data, decoder_target_data,
              batch_size=self.batch_size, 
              epochs=self.epochs,
              #callbacks = [WandbCallback()]
              )

    pred=model.predict(val_encoder_input_data[:6784], batch_size=self.batch_size)

    global_count = 0
    count = 0
    global_total = 0
    global_correct = 0
    val_total = 6784
    for index in range(0, val_total):
      one_hot_vector = pred[index]
      one_hot_vector1 = val_decoder_target_data[index]
      index2 = tf.argmax(one_hot_vector, axis=1)
      index1 = tf.argmax(one_hot_vector1, axis=1)
      #a = (index2-index1).numpy()
      if (index2.numpy() == index1.numpy()).all():
        global_correct = global_correct + 1
        
      global_total = global_total + 1
      accuracy_epoch = global_correct/global_total
      if global_total % 50 == 0:
        wandb.log({'epoch_accuracy' : accuracy_epoch})
      #print("Accuracy: %s" % (accuracy_epoch))
    
    val_accuracy = global_correct/global_total
    #print(val_accuracy)

    wandb.log({'val_accuracy' : val_accuracy})

In [167]:
sweep_config = {
    'method': 'bayes', 
    'metric': {
      'name': 'val_accuracy',
      'goal': 'maximize'   
    },
    'parameters': {

        'dropout': {
            'values': [0.0, 0.1, 0.2]
        },
        'learning_rate': {
            'values': [1e-3, 1e-4]
        },
        'batch_size': {
            'values': [64, 128]
        },
        'hidden_size':{
            'values': [32, 64, 128]
        },
        'cell_type': {
            'values': ['RNN', 'GRU', 'LSTM']
        },
        'attention': {
            'values': ['bahdanau', 'luong']
        }
    }
}

In [168]:
def train_sweep():
  config_defaults = {
        'dropout': 0.3,
        'learning_rate': 1e-3,
        'batch_size': 128,
        'epochs' : 10,
        'hidden_size': 128,
        'cell_type': 'LSTM',
        'attention': 'bahdanau'
        }

  # Initialize a new wandb run
  wandb.init(config = config_defaults)
  
  # Config is a variable that holds and saves hyperparameters and inputs
  config = wandb.config

  wandb.run.name = str(config.cell_type)+ '_' + config.attention +'_bs_'+str(config.batch_size)+'_bs_'+str(config.hidden_size)
  
  model_rnn = MyRNN_atten(cell_type = config.cell_type, hidden_size=config.hidden_size,
                learning_rate= config.learning_rate, dropout=config.dropout,epochs = config.epochs,
                batch_size = config.batch_size, attention = config.attention)
  
  model_rnn.build_fit(encoder_input_data,decoder_target_data)

In [169]:
sweep_id = wandb.sweep(sweep_config, entity="jyothiraditya", project="assignment3_attn")
wandb.agent(sweep_id, train_sweep, count=100)

Create sweep with ID: s3rw6dy2
Sweep URL: https://wandb.ai/jyothiraditya/assignment3_attn/sweeps/s3rw6dy2


[34m[1mwandb[0m: Agent Starting Run: ih27eeac with config:
[34m[1mwandb[0m: 	attention: luong
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001


Epoch 1/10

VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

[34m[1mwandb[0m: [32m[41mERROR[0m Run ih27eeac errored: InvalidArgumentError()
[34m[1mwandb[0m: Agent Starting Run: wh18zl2m with config:
[34m[1mwandb[0m: 	attention: luong
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1/10
 64/458 [===>..........................] - ETA: 3:49 - loss: 2.0566 - accuracy: 0.5766

[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.
