<a href="https://colab.research.google.com/github/dipnarayan501/Assignment3/blob/main/attention_wandb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Seq2Seq model with attention

In [1]:
#import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle
import keras
import tensorflow as tf
from keras.layers import SimpleRNN,LSTM,GRU,Embedding,Dense,Dropout,Input,Concatenate
from tensorflow.keras.optimizers import Adam,Nadam
from keras import Model

In [2]:
# Mount Google Drive inside the Colab VM at /content/drive so that files kept
# in Drive (the zipped dataset copied in a later cell) are reachable as paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Configure Weights & Biases: install the client into the kernel's environment
# (-q keeps pip quiet), import it, then authenticate. wandb.login() prompts for
# an API key when none is stored yet.
%pip install wandb -q
import wandb
wandb.login()

[K     |████████████████████████████████| 1.8 MB 7.9 MB/s 
[K     |████████████████████████████████| 144 kB 52.7 MB/s 
[K     |████████████████████████████████| 181 kB 55.5 MB/s 
[K     |████████████████████████████████| 63 kB 1.6 MB/s 
[?25h  Building wheel for pathtools (setup.py) ... [?25l[?25hdone


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit: ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [4]:
# Copy the zipped dataset from the mounted Drive into the working directory
# and extract it (-q suppresses unzip's file listing).
# NOTE(review): zip_path is relative, so it only resolves under the Drive mount
# when the working directory is /content - confirm.
zip_path = "drive/MyDrive/hi.zip"
!cp "{zip_path}" .
!unzip -q hi.zip

In [7]:
#Reading dataset as dataframe and returning it
def load_data(path):
    """Read a tab-separated lexicon file into a two-column DataFrame.

    The file is read without a header row. Its first three columns are named
    ``hi``, ``en`` and ``""``; the third column is discarded.

    Args:
        path: Path of the ``.tsv`` file to read.

    Returns:
        pandas.DataFrame with the columns ``hi`` and ``en``, restricted to the
        rows in which both values are present (not NaN).
    """
    # Decode explicitly as UTF-8: the file holds non-ASCII (Hindi) text and the
    # platform default encoding is not guaranteed to be UTF-8.
    with open(path, encoding="utf-8") as file:
        data = pd.read_csv(file, sep='\t', header=None, names=["hi", "en", ""],
                           skip_blank_lines=True, index_col=None)
    # Keep only complete pairs, then drop the unused third column.
    data = data[data['hi'].notna() & data['en'].notna()]
    return data[['hi', 'en']]

In [8]:
# Load the train, dev (validation) and test splits from the unzipped lexicon
# files; each is a DataFrame with the columns 'hi' and 'en' (see load_data).
train = load_data("hi/lexicons/hi.translit.sampled.train.tsv")
dev = load_data("hi/lexicons/hi.translit.sampled.dev.tsv")
test = load_data("hi/lexicons/hi.translit.sampled.test.tsv")

In [9]:
# x: English training words; y: Hindi training words, each wrapped as
# "\t" + word + "\n" (presumably start/end-of-sequence markers - confirm).
# Both are used below only to measure the maximum sequence lengths.
x = train['en'].values
y = train['hi'].values
y = '\t'+y+'\n'

Getting the unique tokens (characters) for Hindi and English

In [10]:
def unique_tokenize(data):
    """Collect the sorted character vocabularies of a dataset.

    Args:
        data: DataFrame with the string columns ``en`` and ``hi``.

    Returns:
        ``(hindi_tokens, english_tokens)``: two sorted lists of the distinct
        characters found in each column. Every Hindi word is wrapped as
        ``"\\t" + word + "\\n"`` before its characters are collected, so those
        two marker characters are part of the Hindi vocabulary.
    """
    english_tokens = set()
    hindi_tokens = set()

    # Read from the `data` argument; the previous version ignored it and always
    # used the global `train` frame.
    for eng_word, hin_word in zip(data['en'].values, data['hi'].values):
        english_tokens.update(eng_word)
        hindi_tokens.update('\t' + hin_word + '\n')

    return sorted(hindi_tokens), sorted(english_tokens)
# Build both character vocabularies from the training split.
hindi_tokens , english_tokens = unique_tokenize(train)

Mapping the tokens for Hindi and English

In [11]:
def tokenize_map(hindi_tokens , english_tokens):
    """Build character -> integer id lookups for both vocabularies.

    Ids follow list order and start at 1. The space character " " is then
    assigned id 0 in both lookups (overriding any id it already had).

    Returns:
        ``(hin_token_map, eng_token_map)`` as two dicts.
    """
    def _index(tokens):
        # 1-based ids in list order, then pin " " to id 0.
        lookup = {ch: idx for idx, ch in enumerate(tokens, start=1)}
        lookup[" "] = 0
        return lookup

    return _index(hindi_tokens), _index(english_tokens)

# Character -> id lookups for both languages; id 0 is assigned to " ".
hin_token_map, eng_token_map = tokenize_map(hindi_tokens , english_tokens)

In [12]:
# Longest English word and longest Hindi word in the training data (y already
# includes the "\t"/"\n" wrappers); process() uses these as the padded widths.
max_eng_len = max(map(len, x))
max_hin_len = max(map(len, y))

Preprocessing the dataset

In [13]:
def process(data):
    """Encode a split as padded id arrays plus one-hot decoder targets.

    Relies on the notebook globals ``eng_token_map``, ``hin_token_map``,
    ``hindi_tokens``, ``max_eng_len`` and ``max_hin_len``.

    Args:
        data: DataFrame with the string columns ``en`` and ``hi``.

    Returns:
        A tuple ``(a, b, c)``:
          * ``a`` - float32, shape ``(n, max_eng_len)``: ids of the English
            characters, padded with the id of ``" "``.
          * ``b`` - float32, shape ``(n, max_hin_len)``: ids of
            ``"\\t" + hindi_word + "\\n"``, padded with the id of ``" "``.
          * ``c`` - int, shape ``(n, max_hin_len, len(hindi_tokens) + 1)``:
            one-hot encoding of ``b`` shifted one step to the left
            (``c[i, t]`` encodes character ``t + 1``); every remaining step is
            one-hot on the padding id.

    Raises:
        KeyError: if a word contains a character missing from the token maps.
        IndexError: if a word is longer than the corresponding maximum length.
    """
    x,y = data['en'].values, data['hi'].values
    y = "\t" + y + "\n"

    pad_eng = eng_token_map[" "]
    pad_hin = hin_token_map[" "]

    a = np.zeros((len(x),max_eng_len),dtype="float32")
    b = np.zeros((len(y),max_hin_len),dtype="float32")
    c = np.zeros((len(y),max_hin_len,len(hindi_tokens)+1),dtype="int")

    for i,(eng_word,hin_word) in enumerate(zip(x,y)):
        for j,ch in enumerate(eng_word):
            a[i,j] = eng_token_map[ch]
        # Pad from the word's own length instead of relying on the loop
        # variable surviving the loop (undefined or stale for an empty word).
        a[i,len(eng_word):] = pad_eng

        for j,ch in enumerate(hin_word):
            token_id = hin_token_map[ch]
            b[i,j] = token_id
            if j>0:
                # The target at step j-1 is the decoder input at step j.
                c[i,j-1,token_id] = 1

        b[i,len(hin_word):] = pad_hin
        # The last real target sits at len-2, so padding starts at len-1.
        c[i,len(hin_word)-1:,pad_hin] = 1

    return a,b,c

In [14]:
# Encode the train / validation / test splits. Each call returns
# (encoder input ids, decoder input ids, one-hot decoder targets).
trainx, trainxx, trainy = process(train)
valx, valxx, valy = process(dev)
testx,testxx,testy = process(test)

In [15]:
# Seed both RNGs the notebook draws from: seeding NumPy alone does not cover
# the TensorFlow weight initialisers and dropout used by the Keras model.
np.random.seed(42)
tf.random.set_seed(42)

# Inverse lookups (integer id -> character), presumably for turning predicted
# ids back into text.
reverse_eng_map = {idx: char for char, idx in eng_token_map.items()}
reverse_hin_map = {idx: char for char, idx in hin_token_map.items()}

Attention code

In [16]:
#returns context vector and attention weights
class Attention(tf.keras.layers.Layer):
    """Additive (Bahdanau-style) attention layer.

    Scores every position of ``values`` against ``query`` with
    ``V(tanh(W1(query) + W2(values)))``, normalises the scores with a softmax
    over axis 1, and returns the weighted sum of ``values`` together with the
    weights.

    Args:
        units: Width of the two projection layers ``W1`` and ``W2``.
        **kwargs: Standard ``tf.keras.layers.Layer`` arguments (``name``,
            ``trainable``, ``dtype``, ...), forwarded to the base class.
    """

    def __init__(self, units, **kwargs):
        # Forward the base-layer kwargs: get_config() includes them, so
        # from_config() can only rebuild the layer (e.g. when a saved model is
        # reloaded) if __init__ accepts them.
        super(Attention, self).__init__(**kwargs)
        self.units = units
        self.W1 = Dense(units)
        self.W2 = Dense(units)
        self.V = Dense(1)

    def get_config(self):
        """Return the base layer config extended with ``units``."""
        config = super().get_config().copy()
        config.update({
            'units': self.units,
        })
        return config

    def call(self, query, values):
        """Compute the context vector and the attention weights.

        Args:
            query: Decoder hidden state, expected as (batch_size, hidden size).
            values: Encoder outputs, expected as
                (batch_size, max_len, hidden size).

        Returns:
            ``(context_vector, attention_weights)``: ``values`` weighted by the
            attention weights and summed over axis 1, and the softmax weights
            themselves.
        """
        # Insert a length-1 axis at position 1 so the query broadcasts against
        # every position of `values`.
        query_with_time_axis = tf.expand_dims(query, 1)

        # One scalar score per position of `values`.
        score = self.V(tf.nn.tanh(
            self.W1(query_with_time_axis) + self.W2(values)))

        # Normalise the scores over axis 1.
        attention_weights = tf.nn.softmax(score, axis=1)

        # Weighted sum of `values` over axis 1.
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [17]:
#Generating encoder decoder models
def build_model(cell = "LSTM",units = 256, enc_layers = 1, dec_layers = 1,embedding_dim = 32,dense_size=32,dropout=None):
    """Build the attention encoder/decoder and its two inference sub-models.

    Relies on the notebook globals ``english_tokens``, ``hindi_tokens`` and
    ``max_hin_len`` and on the ``Attention`` layer defined above. The training
    graph is unrolled for exactly ``max_hin_len`` decoder steps.

    Args:
        cell: Recurrent cell type, ``"LSTM"`` or ``"GRU"``.
        units: Hidden size of every recurrent layer and of the attention
            projections.
        enc_layers: Number of stacked encoder layers.
        dec_layers: Number of stacked decoder layers.
        embedding_dim: Output width of both embedding layers.
        dense_size: Width of the ReLU layer placed before the softmax.
        dropout: Dropout rate applied after each non-final encoder layer;
            ``None`` adds no dropout layers.

    Returns:
        ``(train, encoder_model, decoder_model)``:
          * ``train`` maps ``[encoder_inputs, decoder_inputs]`` to the
            per-step softmax over the Hindi vocabulary.
          * ``encoder_model`` maps ``encoder_inputs`` to the final encoder
            layer's outputs (sequence output followed by its state(s)).
          * ``decoder_model`` maps ``[decoder_inputs, encoder_outputs] +
            states`` to ``[softmax, attention_weights] + new_states``; for
            LSTM the states are ordered ``h, c`` per layer.

    Raises:
        ValueError: if ``cell`` is neither ``"LSTM"`` nor ``"GRU"``.
    """
    # Fail early with a clear message; otherwise an unsupported cell type would
    # only surface later as a NameError on `outt`.
    if cell not in ("LSTM", "GRU"):
        raise ValueError("cell must be 'LSTM' or 'GRU', got {!r}".format(cell))

    keras.backend.clear_session()

    # Embedding front-ends; mask_zero=True treats id 0 as padding.
    encoder_inputs = Input(shape=(None,))
    encoder_embedding = Embedding(input_dim=len(english_tokens)+1,output_dim = embedding_dim,mask_zero=True)
    encoder_context = encoder_embedding(encoder_inputs)
    decoder_inputs = Input(shape=(None,))
    decoder_embedding = Embedding(input_dim = len(hindi_tokens)+1,output_dim = embedding_dim,mask_zero=True)
    decoder_context = decoder_embedding(decoder_inputs)
    attention = Attention(units)
    tot_out = []
    concat1 = Concatenate(axis=-1)   # joins context vector and embedded token
    concat2 = Concatenate(axis=1)    # stitches the per-step outputs along axis 1

    if cell == "LSTM":
        encoder_prev = [LSTM(units,return_sequences=True) for _ in range(enc_layers-1)]
        encoder_fin = LSTM(units,return_sequences=True,return_state=True)
        temp = encoder_context
        for lay in encoder_prev:
            temp = lay(temp)
            if dropout is not None:
                temp = Dropout(dropout)(temp)

        # enc_out = [sequence output, h, c]
        enc_out = encoder_fin(temp)
        dec_states = enc_out[1:]

        decoder = [LSTM(units,return_sequences=True,return_state=True) for _ in range(dec_layers)]
        # Every decoder layer starts from the final encoder state.
        temp_states = [dec_states]*dec_layers

        for step in range(max_hin_len):
            # Attend with the first decoder layer's current hidden state.
            context,att_wts = attention(temp_states[0][0],enc_out[0])
            temp = concat1([tf.expand_dims(context, 1), decoder_context[:,step:step+1,:]])

            for layer_idx in range(dec_layers):
                temp,sh,sc = decoder[layer_idx](temp,initial_state=temp_states[layer_idx])
                temp_states[layer_idx] = [sh,sc]
            tot_out.append(temp)

        outt = concat2(tot_out)

    elif cell == "GRU":
        encoder_prev = [GRU(units,return_sequences=True) for _ in range(enc_layers-1)]
        encoder_fin = GRU(units,return_sequences=True,return_state=True)
        temp = encoder_context
        for lay in encoder_prev:
            temp = lay(temp)
            if dropout is not None:
                temp = Dropout(dropout)(temp)

        # enc_out = [sequence output, state]
        enc_out = encoder_fin(temp)
        dec_states = enc_out[1:]

        decoder = [GRU(units,return_sequences=True,return_state=True) for _ in range(dec_layers)]
        # Every decoder layer starts from the final encoder state.
        temp_states = []
        for _ in range(dec_layers):
            temp_states += dec_states

        for step in range(max_hin_len):
            # Attend with the first decoder layer's current state.
            context,att_wts = attention(temp_states[0],enc_out[0])
            temp = concat1([tf.expand_dims(context, 1), decoder_context[:,step:step+1,:]])

            for layer_idx in range(dec_layers):
                temp,st = decoder[layer_idx](temp,initial_state=temp_states[layer_idx])
                temp_states[layer_idx] = st
            tot_out.append(temp)

        outt = concat2(tot_out)

    # Output head, shared by the training model and the inference decoder.
    dense_lay1 = Dense(dense_size,activation='relu')
    pre_out = dense_lay1(outt)
    dense_lay2 = Dense(len(hindi_tokens)+1,activation = 'softmax')
    final_output = dense_lay2(pre_out)

    train = Model([encoder_inputs,decoder_inputs],final_output)

    encoder_model = Model(encoder_inputs,enc_out)

    # Inference decoder: reuses the trained layers, but takes the encoder
    # outputs and the previous decoder states as explicit inputs.
    if cell == "LSTM":
        state_inputs = []
        state_outputs = []

        encout_input = Input(shape=(None,units))

        temp = decoder_context

        for i in range(dec_layers):
            decoder_input_h = Input(shape=(units,))
            decoder_input_c = Input(shape=(units,))

            if i==0:
                # As in training, attention is driven by the first layer's h.
                context,att_wts_out = attention(decoder_input_h,encout_input)
                temp = concat1([tf.expand_dims(context, 1), temp])

            temp,sh,sc = decoder[i](temp,initial_state = [decoder_input_h,decoder_input_c])
            state_inputs += [decoder_input_h,decoder_input_c]
            state_outputs += [sh,sc]

        decoder_input_pass = [decoder_inputs,encout_input] + state_inputs

    elif cell == "GRU":
        state_inputs = []
        state_outputs = []

        encout_input = Input(shape=(None,units))

        temp = decoder_context

        for i in range(dec_layers):
            state_input = Input(shape=(units,))

            if i==0:
                # As in training, attention is driven by the first layer's state.
                context,att_wts_out = attention(state_input,encout_input)
                temp = concat1([tf.expand_dims(context, 1), temp])

            temp,s = decoder[i](temp,initial_state = state_input)
            state_inputs.append(state_input)
            state_outputs.append(s)

        decoder_input_pass = [decoder_inputs,encout_input] + state_inputs

    pre_out = dense_lay1(temp)
    final_output = dense_lay2(pre_out)

    decoder_model = Model(decoder_input_pass, [final_output,att_wts_out]+state_outputs)

    return train,encoder_model,decoder_model

In [18]:
#configuring wandb
import wandb
from wandb.keras import WandbCallback
%pip install wandb -q
import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mdipnarayan501[0m ([33mfdl-moni_dip[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [25]:
from keras.callbacks import EarlyStopping, ModelCheckpoint
from wandb.keras import WandbCallback
units = 256
def train():
    """Train one model under a W&B run; used as the sweep agent's function.

    Hyper-parameters are read from ``wandb.config``, which starts from the
    defaults below and is overridden by whatever the active sweep supplies.
    The best weights (by validation accuracy) are written to ``bestmodel.h5``.

    NOTE: defining this function rebinds the module-level name ``train``, which
    earlier cells use for the training DataFrame. Re-running those cells after
    this one will fail until the data-loading cell is run again.

    Returns:
        The fitted Keras training model.
    """
    # Default values for the hyper-parameters we sweep over.
    config_defaults = {
        'learning_rate': 1e-2,
        'dense_size': 128,
        'cell': 'LSTM',
        'units' :256,
        'embedding_dim': 64,
        'enc_layers': 1,
        'dec_layers': 1,
        'dropout': 0.,
        'batch_size': 64
    }

    # Initialize a new wandb run
    wandb.init(config=config_defaults)

    # Config holds the hyper-parameters of this run.
    config = wandb.config

    cell = config.cell
    units = config.units
    enc_layers = config.enc_layers
    dec_layers = config.dec_layers
    embedding_dim = config.embedding_dim
    dense_size = config.dense_size
    dropout = config.dropout
    learning_rate = config.learning_rate
    batch_size = config.batch_size

    # Displaying hyperparameters
    run_name = "{}_enc_lay_{}_dec_lay_{}_embd_{}_dp_{}_lr_{}_ds_{}_bs_{}".format(cell, enc_layers, dec_layers,embedding_dim, dropout, learning_rate, dense_size,batch_size)
    print(run_name)

    # Build the model. `units` is passed through explicitly; it used to be
    # dropped, so build_model always fell back to its own default.
    model, _, _ = build_model(
        cell=cell,
        units=units,
        enc_layers=enc_layers,
        dec_layers=dec_layers,
        embedding_dim=embedding_dim,
        dense_size=dense_size,
        dropout=dropout)

    # Compiling model
    model.compile(optimizer = Adam(learning_rate= learning_rate),loss='categorical_crossentropy',metrics=['accuracy'])
    checkpoint = ModelCheckpoint('bestmodel.h5', monitor='val_accuracy', mode='max', verbose=0, save_best_only=True)

    # Fitting model
    model.fit([trainx,trainxx],trainy,
              batch_size=batch_size,
              validation_data = ([valx,valxx],valy),
              epochs=5,  #Change epochs here
              callbacks = [WandbCallback(), checkpoint])

    # Give the run a readable name built from its hyper-parameters.
    wandb.run.name = run_name
    wandb.run.save()
    return model

In [28]:
# Hyper-parameter search space for the W&B sweep. Every key under
# 'parameters' must match a key that train() reads from wandb.config.
sweep_config = {
    'method': 'random', #grid, random , bayes
    'metric': {
        'name': 'val_accuracy',
        'goal': 'maximize'
    },
    'parameters': {
        'learning_rate': {
            'values': [0.01, 0.001]
        },
        'dense_size': {
            'values': [64,128,512]
        },
        'dropout': {
            'values': [0.0,0.2,0.4]
        },
        'batch_size': {
            'values': [64,128,256]
        },
        'cell': {
            'values': ["LSTM","GRU"]
        },
        # Was 'embedding_size': train() reads config.embedding_dim, so the
        # swept value was never applied.
        'embedding_dim': {
            'values': [64,128,256]
        },
        'enc_layers': {
            'values': [1,2,3]
        },
        'units': {
            'values': [256]
        },
        'dec_layers': {
            'values': [1,2,3]
        },
    }
}

#sweep_id = wandb.sweep(sweep_config, entity="fdl-moni_dip", project="seq2seq_attention")



In [29]:
# Re-check the W&B login before launching the sweep agent. wandb is already
# imported and logged in by earlier cells, so on a top-to-bottom run this is
# expected to just report the existing login.
import wandb
wandb.login()

True

In [None]:
# `sweep_id` is not assigned in any visible cell (the wandb.sweep call above is
# commented out), so a fresh "Run All" would stop here with a NameError.
# To resume an existing sweep, assign its id (e.g. 'q28nbq5b') to `sweep_id`
# before running this cell; otherwise a new sweep is registered from
# sweep_config.
if 'sweep_id' not in globals():
    sweep_id = wandb.sweep(sweep_config, entity="fdl-moni_dip", project="seq2seq_attention")

# Run 5 trials of train() under the sweep.
wandb.agent(sweep_id, train, entity="fdl-moni_dip", project="seq2seq_attention", count=5)