In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from keras.layers import Input,Dense,LSTM,GRU,RNN,SimpleRNN,Softmax,Dropout,Concatenate
from keras.layers import TimeDistributed
import matplotlib.pyplot as plt
from attention import AttentionLayer
import pandas as pd

In [None]:
!wget https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar
!tar -xvf '/content/dakshina_dataset_v1.0.tar'

--2022-05-15 12:06:07--  https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.99.128, 173.194.202.128, 74.125.199.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.99.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2008340480 (1.9G) [application/x-tar]
Saving to: ‘dakshina_dataset_v1.0.tar’


2022-05-15 12:06:15 (250 MB/s) - ‘dakshina_dataset_v1.0.tar’ saved [2008340480/2008340480]

dakshina_dataset_v1.0/bn/
dakshina_dataset_v1.0/bn/lexicons/
dakshina_dataset_v1.0/bn/lexicons/bn.translit.sampled.test.tsv
dakshina_dataset_v1.0/bn/lexicons/bn.translit.sampled.train.tsv
dakshina_dataset_v1.0/bn/lexicons/bn.translit.sampled.dev.tsv
dakshina_dataset_v1.0/bn/native_script_wikipedia/
dakshina_dataset_v1.0/bn/native_script_wikipedia/bn.wiki-filt.valid.text.shuf.txt.gz
dakshina_dataset_v1.0/bn/native_script_wikipedia/bn.wiki-full.info.sorted.tsv.

In [None]:
def preprocess_data(file_name):
    """Read a tab-separated lexicon file into parallel input/target word lists.

    Every non-empty line must hold exactly three tab-separated fields. The first
    field is taken as the target text, the second as the input text, and the third
    is ignored. Each target is wrapped in a leading tab (start marker) and a
    trailing newline (end marker).

    Args:
        file_name: Path of the UTF-8 encoded TSV file.

    Returns:
        Tuple ``(input_texts, target_texts)`` of equally long lists.

    Raises:
        ValueError: If a non-empty line does not split into exactly three fields.
    """
    input_texts = []
    target_texts = []

    with open(file_name, "r", encoding="utf-8") as f:
        # Iterating the file splits on "\n" only (like the former read().split("\n")),
        # but no longer assumes the file ends with a newline: previously the last
        # element was dropped unconditionally, losing the final record otherwise.
        for raw_line in f:
            line = raw_line.rstrip("\n")
            if not line:
                # Blank lines carry no record; skipping them avoids an unpack error.
                continue

            target_text, input_text, _attestation = line.split("\t")
            input_texts.append(input_text)
            target_texts.append("\t" + target_text + "\n")

    return (input_texts, target_texts)





In [None]:
# Training split: source words and tab/newline-wrapped target words.
input_words, target_words = preprocess_data("dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv")

# Longest source / target word (in characters) found in the training split.
max_encoder_seq_length = max(len(word) for word in input_words)
max_decoder_seq_length = max(len(word) for word in target_words)

# Sorted character vocabularies of the training split.
input_characters = sorted({symbol for word in input_words for symbol in word})
target_characters = sorted({symbol for word in target_words for symbol in word})

num_input_tokens = len(input_characters)
num_target_tokens = len(target_characters)

print(len(input_characters), len(target_characters))

# Character -> integer id; ids start at 1, leaving 0 unassigned.
input_char_map = {symbol: position for position, symbol in enumerate(input_characters, start=1)}
target_char_map = {symbol: position for position, symbol in enumerate(target_characters, start=1)}

print(len(input_words), len(target_words))
print(input_char_map)

# Validation and test splits go through the same loader.
val_input_words, val_target_words = preprocess_data("dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv")
test_input_words, test_target_words = preprocess_data("dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv")



26 65
44204 44204
{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26}


In [None]:


def one_hot_encoding(input_words, target_words):
    """One-hot encode paired source/target words into three float32 arrays.

    Relies on the module-level ``max_encoder_seq_length``, ``max_decoder_seq_length``,
    ``num_input_tokens``, ``num_target_tokens``, ``input_char_map`` and
    ``target_char_map``.

    Args:
        input_words: Sequence of source strings.
        target_words: Sequence of target strings, paired with ``input_words``.

    Returns:
        ``(encoder_input, decoder_input, decoder_output)``. The encoder array has shape
        ``(len(input_words), max_encoder_seq_length, num_input_tokens + 1)``; both
        decoder arrays have shape
        ``(len(input_words), max_decoder_seq_length, num_target_tokens + 1)``.
        ``decoder_output`` is ``decoder_input`` shifted one step to the left, i.e. it
        omits the first target character.
    """
    num_samples = len(input_words)
    decoder_shape = (num_samples, max_decoder_seq_length, num_target_tokens + 1)

    encoder_input_array = np.zeros(
        (num_samples, max_encoder_seq_length, num_input_tokens + 1), dtype="float32"
    )
    decoder_input_array = np.zeros(decoder_shape, dtype="float32")
    decoder_output_array = np.zeros(decoder_shape, dtype="float32")

    for row, (source_word, target_word) in enumerate(zip(input_words, target_words)):
        for step, symbol in enumerate(source_word):
            encoder_input_array[row, step, input_char_map[symbol]] = 1.0

        for step, symbol in enumerate(target_word):
            token = target_char_map[symbol]
            decoder_input_array[row, step, token] = 1.0
            # The expected output at step-1 is the character fed in at this step.
            if step > 0:
                decoder_output_array[row, step - 1, token] = 1.0

    return encoder_input_array, decoder_input_array, decoder_output_array

# One-hot encode every split.
encoder_input_array, decoder_input_array, decoder_output_array = one_hot_encoding(input_words, target_words)
val_encoder_input_array, val_decoder_input_array, val_decoder_output_array = one_hot_encoding(val_input_words, val_target_words)
test_encoder_input_array, test_decoder_input_array, test_decoder_output_array = one_hot_encoding(test_input_words, test_target_words)

print(decoder_input_array.shape)

# Collapse the model *inputs* back to integer ids along the token axis; all-zero
# (padding) steps become id 0. The decoder *output* arrays stay one-hot.
encoder_input_array = encoder_input_array.argmax(axis=2)
decoder_input_array = decoder_input_array.argmax(axis=2)

val_encoder_input_array = val_encoder_input_array.argmax(axis=2)
test_encoder_input_array = test_encoder_input_array.argmax(axis=2)

val_decoder_input_array = val_decoder_input_array.argmax(axis=2)
test_decoder_input_array = test_decoder_input_array.argmax(axis=2)

# Integer id -> character lookups.
reverse_input_char_map = {index: symbol for symbol, index in input_char_map.items()}
print(reverse_input_char_map)
reverse_target_char_map = {index: symbol for symbol, index in target_char_map.items()}
print(reverse_target_char_map)
# Id 0 has no character of its own; map it to the newline end marker.
reverse_target_char_map[0] = "\n"

(44204, 21, 66)
{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z'}
{1: '\t', 2: '\n', 3: 'ँ', 4: 'ं', 5: 'ः', 6: 'अ', 7: 'आ', 8: 'इ', 9: 'ई', 10: 'उ', 11: 'ऊ', 12: 'ऋ', 13: 'ए', 14: 'ऐ', 15: 'ऑ', 16: 'ओ', 17: 'औ', 18: 'क', 19: 'ख', 20: 'ग', 21: 'घ', 22: 'ङ', 23: 'च', 24: 'छ', 25: 'ज', 26: 'झ', 27: 'ञ', 28: 'ट', 29: 'ठ', 30: 'ड', 31: 'ढ', 32: 'ण', 33: 'त', 34: 'थ', 35: 'द', 36: 'ध', 37: 'न', 38: 'प', 39: 'फ', 40: 'ब', 41: 'भ', 42: 'म', 43: 'य', 44: 'र', 45: 'ल', 46: 'व', 47: 'श', 48: 'ष', 49: 'स', 50: 'ह', 51: '़', 52: 'ा', 53: 'ि', 54: 'ी', 55: 'ु', 56: 'ू', 57: 'ृ', 58: 'ॅ', 59: 'े', 60: 'ै', 61: 'ॉ', 62: 'ो', 63: 'ौ', 64: '्', 65: 'ॐ'}


In [None]:
def define_model(num_cells, cell_type, num_encoder_layers, num_decoder_layers, input_embedding_size, dropout_fraction, beam_size):
    """Build the encoder-decoder training model.

    The encoder is an embedding followed by ``num_encoder_layers`` stacked recurrent
    layers named ``enc_1``, ``enc_2``, ... The decoder is an embedding followed by
    ``num_decoder_layers`` stacked recurrent layers named ``dec_1``, ``dec_2``, ...,
    where decoder layer *i* is initialised with the final state(s) of encoder layer
    *i*, and a softmax ``Dense`` layer named ``dec_dense``. The layer names are kept
    stable because the inference builders locate layers by name.

    Relies on the module-level ``num_input_tokens`` and ``num_target_tokens``.

    Args:
        num_cells: Number of units in every recurrent layer.
        cell_type: One of ``"LSTM"``, ``"RNN"`` (SimpleRNN) or ``"GRU"``.
        num_encoder_layers: Number of stacked encoder layers (>= 1).
        num_decoder_layers: Number of stacked decoder layers (>= 1 and
            <= ``num_encoder_layers``).
        input_embedding_size: Output dimension of the encoder embedding.
        dropout_fraction: Used for both ``dropout`` and ``recurrent_dropout`` of
            every recurrent layer.
        beam_size: Accepted for interface compatibility; not used here.

    Returns:
        A ``keras.Model`` mapping ``[encoder_input, decoder_input]`` to per-step
        softmax distributions over ``num_target_tokens + 1`` classes.

    Raises:
        ValueError: If ``cell_type`` is unsupported, a layer count is below 1, or the
            decoder is deeper than the encoder.
    """
    # Validate before touching Keras so misconfiguration fails with a clear message
    # instead of a NameError/KeyError further down.
    if cell_type not in ("LSTM", "RNN", "GRU"):
        raise ValueError(
            "Unsupported cell_type %r; expected 'LSTM', 'RNN' or 'GRU'." % (cell_type,)
        )
    if num_encoder_layers < 1 or num_decoder_layers < 1:
        raise ValueError("num_encoder_layers and num_decoder_layers must both be >= 1.")
    if num_decoder_layers > num_encoder_layers:
        raise ValueError(
            "num_decoder_layers (%d) cannot exceed num_encoder_layers (%d): decoder layer i "
            "is initialised from the state of encoder layer i."
            % (num_decoder_layers, num_encoder_layers)
        )

    recurrent_class = {
        "LSTM": keras.layers.LSTM,
        "RNN": keras.layers.SimpleRNN,
        "GRU": keras.layers.GRU,
    }[cell_type]

    def make_recurrent_layer(name):
        # All encoder and decoder layers share the same configuration apart from the name.
        return recurrent_class(
            num_cells,
            return_state=True,
            return_sequences=True,
            name=name,
            dropout=dropout_fraction,
            recurrent_dropout=dropout_fraction,
        )

    # ----- Encoder -----
    encoder_input = keras.Input(shape=(None, ), name="enc_input")
    encoder_embedding = keras.layers.Embedding(num_input_tokens + 1, input_embedding_size, name="enc_embedding", mask_zero=True)(encoder_input)

    # encoder_states[i] holds the final state tensors of encoder layer i+1:
    # [h, c] for LSTM, a single-element list for SimpleRNN / GRU.
    encoder_states = []
    encoder_outputs = encoder_embedding
    for i in range(num_encoder_layers):
        encoder_outputs, *layer_states = make_recurrent_layer("enc_" + str(i + 1))(encoder_outputs)
        encoder_states.append(layer_states)

    # ----- Decoder -----
    decoder_input = keras.Input(shape=(None, ), name="dec_input")
    # NOTE(review): the decoder embedding size is fixed at 64 and does not follow
    # input_embedding_size -- confirm this is intended.
    decoder_embedding = keras.layers.Embedding(num_target_tokens + 1, 64, name="dec_embedding", mask_zero=True)(decoder_input)

    decoder_outputs = decoder_embedding
    for i in range(num_decoder_layers):
        decoder_outputs, *_unused_states = make_recurrent_layer("dec_" + str(i + 1))(
            decoder_outputs, initial_state=encoder_states[i]
        )

    # Softmax over the target vocabulary (plus the padding id 0) at every time step.
    decoder_dense = keras.layers.Dense(num_target_tokens + 1, activation="softmax", name="dec_dense")
    decoder_outputs = decoder_dense(decoder_outputs)

    model = keras.Model([encoder_input, decoder_input], decoder_outputs)

    return model

In [None]:
def inferenceLSTM(model, num_cells):
    """Derive separate encoder and decoder inference models from a trained LSTM model.

    Layers are located by name: a recurrent encoder layer is any layer whose name
    starts with ``'e'`` and ends in a numeric character, a recurrent decoder layer is
    any layer whose name starts with ``'d'`` and ends in a numeric character. The
    layers named ``dec_embedding`` and ``dec_dense`` are reused as-is.

    Args:
        model: Trained model whose ``model.input`` is ``[encoder_input, decoder_input]``.
        num_cells: Size of each recurrent state; used for the decoder state inputs.

    Returns:
        ``(enc_model, dec_model)``. ``enc_model`` maps the encoder input to the
        ``h`` and ``c`` states of every encoder layer. ``dec_model`` maps
        ``[decoder_input, h_1, c_1, h_2, c_2, ...]`` to
        ``[token_probabilities, h_1, c_1, h_2, c_2, ...]``.
    """
    enc_inputs = model.input[0]
    dec_inputs = model.input[1]

    # Encoder side: collect (h, c) of every recurrent encoder layer, in layer order.
    enc_states = []
    for layer in model.layers:
        layer_name = layer.name
        if layer_name[-1].isnumeric() and layer_name[0] == 'e':
            _, enc_h_state, enc_c_state = layer.output
            enc_states.extend([enc_h_state, enc_c_state])

    enc_model = keras.Model(enc_inputs, enc_states)

    # Decoder side: pick up the reusable layers; recurrent ones are keyed by name suffix.
    decoders = {}
    count = 0
    for layer in model.layers:
        layer_name = layer.name
        if layer_name == "dec_dense":
            dec_dense = layer
        if layer_name == "dec_embedding":
            dec_embedding = layer
        suffix = layer_name[-1]
        if suffix.isnumeric() and layer_name[0] == 'd':
            count += 1
            decoders[suffix] = layer

    # One (h, c) pair of state placeholders per decoder layer.
    layer_state_inputs = {}
    dec_states_inputs = []
    for index in range(1, count + 1):
        state_pair = [keras.Input(shape=(num_cells,)), keras.Input(shape=(num_cells,))]
        layer_state_inputs[index] = state_pair
        dec_states_inputs = dec_states_inputs + state_pair

    # Chain the decoder layers, feeding each its own state placeholders.
    dec_states = []
    for index in range(1, count + 1):
        layer_input = dec_embedding(dec_inputs) if index == 1 else dec_outputs
        dec_outputs, dec_h_state, dec_c_state = decoders[str(index)](layer_input, layer_state_inputs[index])
        dec_states.extend([dec_h_state, dec_c_state])

    dec_outputs = dec_dense(dec_outputs)

    dec_model = keras.Model([dec_inputs] + dec_states_inputs, [dec_outputs] + dec_states)

    return enc_model, dec_model


In [None]:
def inferenceOther(model, num_cells):
    """Derive encoder and decoder inference models from a trained single-state model.

    Counterpart of the LSTM variant for recurrent layers whose output unpacks into
    ``(sequence_output, state)``. Layers are located by name: names starting with
    ``'e'`` / ``'d'`` and ending in a numeric character are the recurrent encoder /
    decoder layers; ``dec_embedding`` and ``dec_dense`` are reused as-is.

    Args:
        model: Trained model whose ``model.input`` is ``[encoder_input, decoder_input]``.
        num_cells: Size of each recurrent state; used for the decoder state inputs.

    Returns:
        ``(enc_model, dec_model)``. ``enc_model`` maps the encoder input to the final
        state of every encoder layer. ``dec_model`` maps
        ``[decoder_input, state_1, state_2, ...]`` to
        ``[token_probabilities, state_1, state_2, ...]``.
    """
    enc_inputs = model.input[0]
    dec_inputs = model.input[1]

    # Encoder side: collect the final state of every recurrent encoder layer, in layer order.
    enc_states = []
    for layer in model.layers:
        layer_name = layer.name
        if layer_name[-1].isnumeric() and layer_name[0] == 'e':
            _, enc_state = layer.output
            enc_states.append(enc_state)

    enc_model = keras.Model(enc_inputs, enc_states)

    # Decoder side: pick up the reusable layers; recurrent ones are keyed by name suffix.
    decoders = {}
    count = 0
    for layer in model.layers:
        layer_name = layer.name
        if layer_name == "dec_dense":
            dec_dense = layer
        if layer_name == "dec_embedding":
            dec_embedding = layer
        suffix = layer_name[-1]
        if suffix.isnumeric() and layer_name[0] == 'd':
            count += 1
            decoders[suffix] = layer

    # One state placeholder per decoder layer, kept as a single-element list.
    layer_state_inputs = {}
    dec_states_inputs = []
    for index in range(1, count + 1):
        state_list = [keras.Input(shape=(num_cells,))]
        layer_state_inputs[index] = state_list
        dec_states_inputs = dec_states_inputs + state_list

    # Chain the decoder layers, feeding each its own state placeholder.
    dec_states = []
    for index in range(1, count + 1):
        layer_input = dec_embedding(dec_inputs) if index == 1 else dec_outputs
        dec_outputs, dec_state = decoders[str(index)](layer_input, layer_state_inputs[index])
        dec_states.append(dec_state)

    dec_outputs = dec_dense(dec_outputs)

    dec_model = keras.Model([dec_inputs] + dec_states_inputs, [dec_outputs] + dec_states)

    return enc_model, dec_model


In [None]:
def decode_words(input_words, enc_model, dec_model):
    """Greedily decode a batch of encoded source words into target strings.

    Relies on the module-level ``target_char_map``, ``reverse_target_char_map`` and
    ``max_decoder_seq_length``.

    Args:
        input_words: Array of encoded source words; its first dimension is the batch
            size and it is passed unchanged to ``enc_model.predict``.
        enc_model: Object whose ``predict`` returns the initial decoder state(s).
        dec_model: Object whose ``predict`` takes ``[tokens] + states`` and returns
            ``[token_probabilities] + new_states``.

    Returns:
        List with one decoded string per batch row, cut at the first generated
        newline. A word for which no newline was generated is returned in full.
    """
    batch_size = input_words.shape[0]

    states = enc_model.predict(input_words)
    # A bare array (rather than a list of arrays) would turn the list concatenation
    # below into an element-wise addition, so normalise to a list first.
    if not isinstance(states, (list, tuple)):
        states = [states]
    states = list(states)

    # Every sequence starts from the tab start-of-word token.
    target_sequence = np.full((batch_size, 1), target_char_map["\t"], dtype=np.int64)

    dec_words = [""] * batch_size

    for _ in range(max_decoder_seq_length):
        outputs = list(dec_model.predict([target_sequence] + states))
        output_tokens = outputs[0]

        # Greedy choice: most probable token at the last time step.
        sampled_char_indices = np.argmax(output_tokens[:, -1, :], axis=1)
        for j, ch_index in enumerate(sampled_char_indices):
            dec_words[j] += reverse_target_char_map[ch_index]

        # Feed the chosen tokens and the updated states back in for the next step.
        target_sequence = sampled_char_indices.reshape(batch_size, 1)
        states = outputs[1:]

        # Everything after the first newline is discarded below, so once every word
        # contains one, further steps cannot change the result.
        if all("\n" in word for word in dec_words):
            break

    # Cut at the first newline. str.find() returns -1 when there is none, which as a
    # slice bound used to silently drop the last character; split() keeps the word whole.
    return [word.split("\n", 1)[0] for word in dec_words]

In [None]:
# Sweep definition: Bayesian search maximising the 'accuracy' metric over the
# discrete hyper-parameter choices listed below.
sweep_config = dict(
    name='Attention',
    method='bayes',
    metric=dict(name='accuracy', goal='maximize'),
    parameters={
        parameter: {'values': choices}
        for parameter, choices in (
            ('input_embedding', [32, 64, 128]),
            ('enc_layers', [1, 2, 3]),
            ('dec_layers', [1, 2, 3]),
            ('hidden', [64, 128, 256]),
            ('cell_type', ['GRU', 'LSTM', 'RNN']),
            ('dropout', [0.0, 0.3]),
            ('epochs', [5, 10, 15, 20]),
            ('rec_dropout', [0.0, 0.3]),
            ('beam_size', [1, 3]),
        )
    },
)

# Register the sweep definition with Weights & Biases and keep the returned identifier.
sweep_id = wandb.sweep(
    sweep_config,
    project='CS6910 Assignment 3',
    entity='go4rav',
)

In [None]:
def train(num_cells, cell_type, num_layers, input_embedding_size, dropout_fraction, beam_size, recurrent_dropout, epochs):
    """Train the seq2seq model, save it, and measure word-level inference accuracy.

    Uses the module-level training / validation arrays and word lists, and logs
    training through ``WandbCallback``.

    Args:
        num_cells: Units per recurrent layer.
        cell_type: ``"LSTM"`` selects the LSTM inference builder; any other value
            selects the single-state builder.
        num_layers: Used as both the encoder and the decoder depth.
        input_embedding_size: Encoder embedding dimension.
        dropout_fraction: Dropout passed on to ``define_model``.
        beam_size: Passed on to ``define_model``.
        recurrent_dropout: Accepted for interface compatibility; not used by this
            function (``define_model`` takes no such argument).
        epochs: Number of training epochs.

    Returns:
        ``(model, history)``: the trained model and the object returned by ``model.fit``.
    """
    model = define_model(num_cells, cell_type, num_layers, num_layers, input_embedding_size, dropout_fraction, beam_size)
    # summary() prints by itself; wrapping it in print() only appended a stray "None".
    model.summary()

    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

    history = model.fit(
            [encoder_input_array, decoder_input_array],
            decoder_output_array,
            batch_size = 64,
            epochs = epochs,
            verbose = 1,
            validation_data = ([val_encoder_input_array, val_decoder_input_array], val_decoder_output_array),
            callbacks=[WandbCallback()]
            )

    model.save("best_model_without_attention.h5")

    # The builders are defined as inferenceLSTM / inferenceOther; the previously used
    # capitalised names do not exist and raised NameError after training had finished.
    if cell_type == "LSTM":
        encoder_model, decoder_model = inferenceLSTM(model, num_cells)
    else:
        encoder_model, decoder_model = inferenceOther(model, num_cells)

    def exact_match_accuracy(encoded_inputs, reference_words, batch_size=1000):
        # Fraction of words whose decoded string equals the reference exactly.
        predictions = []
        for start in range(0, encoded_inputs.shape[0], batch_size):
            query = encoded_inputs[start:start + batch_size]
            predictions.extend(decode_words(query, encoder_model, decoder_model))
        # References carry a leading tab and a trailing newline marker; strip both.
        ground_truths = [word[1:-1] for word in reference_words]
        return np.mean(np.array(predictions) == np.array(ground_truths))

    training_inference_accuracy = exact_match_accuracy(encoder_input_array, target_words)
    validation_inference_accuracy = exact_match_accuracy(val_encoder_input_array, val_target_words)

    # These word-level accuracies used to be computed and then discarded; record them
    # on the active W&B run (one is required by WandbCallback above anyway).
    wandb.log({
        "train_inference_accuracy": training_inference_accuracy,
        "val_inference_accuracy": validation_inference_accuracy,
    })

    return model, history

In [None]:
def model_train():
  """Run one sweep trial: start a W&B run, name it from its config, and train.

  Reads the hyper-parameters from the run's config. The encoder depth
  (``enc_layers``) is used for both the encoder and the decoder.
  """
  run = wandb.init()
  configuration=run.config

  # Keep the shared depth in a local instead of writing it back into the config:
  # the write happened after the run name was built, so the name could report a
  # decoder depth different from the one trained.
  # NOTE(review): W&B may also ignore writes to sweep-controlled config keys, in
  # which case the old assignment had no effect at all -- confirm against W&B docs.
  num_layers = configuration.enc_layers

  wandb.run.name = ('model='
        +str(configuration.cell_type)
        +'_embed_size='
        +str(configuration.input_embedding)
        + '_num_enc='
        + str(num_layers)
        + '_num_dec='
        + str(num_layers)
        + "_rec_drp_out="
        + str(configuration.rec_dropout) 
        + "_drp_out="
        + str(configuration.dropout) 
        + "_bm_size="
        + str(configuration.beam_size)
        + "_hid_size="
        + str(configuration.hidden)
        + "_epchs="
        + str(configuration.epochs)
    )

  model, history = train(configuration.hidden, configuration.cell_type, num_layers, configuration.input_embedding, configuration.dropout, configuration.beam_size, configuration.rec_dropout, configuration.epochs)
  
