<a href="https://colab.research.google.com/github/rabimist/Deep-Learning-for-Natural-Language-Processing/blob/main/Question_Generation_using_Seq2Seq_model_with_Attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Name:** Deen Mohammad Abdullah

Question Generation using Seq2Seq model with Attention.

**Dataset:** qa_Health_and_Personal_Care and qa_Home_and_Kitchen ([Amazon](https://jmcauley.ucsd.edu/data/amazon/qa/))

**Deep Learning for Natural Language Processing**


**The reason behind the higher loss value:**

Here, I have calculated the loss by comparing the prediction and the target at the same position, one time step at a time.

Example:  loss = criterion(prediction[i], target[i])

In [None]:
############################################ Required Packages #####################################################################
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import pandas as pd
import re
import random
import gzip
from tqdm.notebook import tqdm
######################################################################################################################################

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

############################# Text Processing ##################################
# Reserved vocabulary ids; they match the initial entries of Text.index2word.
# NOTE(review): SOS_token is not referenced anywhere else in the visible code.
SOS_token = 0
EOS_token = 1  # appended to every index sequence by tensorFromSentence
# Word-count bounds applied by process_data (both comparisons are strict).
MAX_LENGTH = 20
MIN_LENGTH = 5

# This class helps us to process the source and target documents
class Text:
    """Vocabulary built incrementally from whitespace-delimited sentences.

    Ids 0 and 1 are reserved for the "SOS" and "EOS" markers; every new word
    receives the next free id in order of first appearance.
    """

    def __init__(self):
        self.index2word = {0: "SOS", 1: "EOS"}
        self.word2index = {}
        self.word2count = {}
        # The two reserved markers are already counted.
        self.n_words = 2

    def addSentence(self, sentence):
        """Register every token obtained by splitting *sentence* on single spaces."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Bump the count of a known word, or assign the next id to a new one."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_id = self.n_words
        self.word2index[word] = new_id
        self.index2word[new_id] = word
        self.word2count[word] = 1
        self.n_words = new_id + 1
#-----------------------------------------------------------------------
def normalize_sentence(df, index):
    """Return column *index* of *df* lower-cased and reduced to ASCII letters and whitespace.

    Args:
        df: pandas DataFrame holding the raw text.
        index: label of the text column to normalise.

    Returns:
        A pandas Series of normalised strings, aligned with ``df``.
    """
    sentence = df[index].str.lower()
    # Fold accented characters to their ASCII base letter first; stripping
    # non-letters before this step would delete them outright and make the
    # folding a no-op.
    sentence = sentence.str.normalize('NFD')
    sentence = sentence.str.encode('ascii', errors='ignore').str.decode('utf-8')
    # regex=True must be explicit: since pandas 2.0 str.replace treats the
    # pattern as a literal string by default, which would strip nothing.
    sentence = sentence.str.replace(r'[^A-Za-z\s]+', '', regex=True)
    return sentence
#------------------------------------------------------------------------
def read_sentence(df, src, tgt):
    """Normalise the *src* and *tgt* columns of *df*.

    Prints a progress message, then returns the pair
    ``(normalised source column, normalised target column)``.
    """
    print ('Processing Text . . . ')
    return tuple(normalize_sentence(df, column) for column in (src, tgt))
#------------------------------------------------------------------------
def parse(path):
    """Yield one evaluated record per line of the gzip-compressed file at *path*.

    The file is opened inside a ``with`` block so the handle is released once
    the generator is exhausted or closed, instead of being leaked.
    """
    with gzip.open(path, 'rb') as g:
        for l in g:
            # SECURITY NOTE(review): eval() executes arbitrary Python found in
            # the file. Only feed trusted dumps to this function; consider
            # ast.literal_eval if every line is a plain literal.
            yield eval(l)
#------------------------------------------------------------------------
def getDF(path):
    """Load every record produced by ``parse(path)`` into a DataFrame.

    Rows are labelled 0, 1, 2, ... in the order the records are read.
    """
    records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records, orient='index')
#------------------------------------------------------------------------  
def _collect_pairs(src_sentences, tgt_sentences, source, target, pairs):
    """Append every length-compliant (source, target) pair to *pairs* and grow both vocabularies."""
    for src_sentence, tgt_sentence in zip(src_sentences, tgt_sentences):
        src_len = len(src_sentence.split(' '))
        # The source must have more than MIN_LENGTH and fewer than MAX_LENGTH words.
        if not MIN_LENGTH < src_len < MAX_LENGTH:
            continue
        # The target only has an upper bound.
        if len(tgt_sentence.split(' ')) >= MAX_LENGTH:
            continue
        source.addSentence(src_sentence)
        target.addSentence(tgt_sentence)
        pairs.append([src_sentence, tgt_sentence])


def process_data(src, tgt,
                 paths=('/content/drive/MyDrive/qa_Health_and_Personal_Care.json.gz',
                        '/content/drive/MyDrive/qa_Home_and_Kitchen.json.gz')):
    """Read the QA dumps and build vocabularies plus filtered sentence pairs.

    Args:
        src: column name used as the model input text.
        tgt: column name used as the model target text.
        paths: gzip files to load, processed in order. Defaults to the two
            files the original code hard-coded.

    Returns:
        ``(source, target, pairs)`` where ``source`` and ``target`` are
        ``Text`` vocabularies and ``pairs`` is a list of
        ``[source_sentence, target_sentence]`` lists that passed the
        MIN_LENGTH / MAX_LENGTH word-count filter.
    """
    frames = [getDF(path) for path in paths]
    print('Before Processing Data, the number of records: %d' % sum(len(frame) for frame in frames))
    print ('File reading complete')
    normalized = [read_sentence(frame, src, tgt) for frame in frames]

    source = Text()
    target = Text()
    pairs = []

    # Files are consumed in order so that word ids are assigned deterministically.
    for src_sentences, tgt_sentences in normalized:
        _collect_pairs(src_sentences, tgt_sentences, source, target, pairs)

    print ('Processing Complete')
    print ('After Processing Data, the number of records: %d' % len(pairs))

    return source, target, pairs
#------------------------------------------------------------------------
def indexesFromSentence(lang, sentence):
    """Map each space-separated token of *sentence* to its id in ``lang.word2index``.

    Raises ``KeyError`` for a token that is not in the vocabulary.
    """
    lookup = lang.word2index
    indexes = []
    for token in sentence.split(' '):
        indexes.append(lookup[token])
    return indexes
#------------------------------------------------------------------------
def tensorFromSentence(lang, sentence):
    """Encode *sentence* as a ``(length + 1, 1)`` long tensor terminated by EOS_token.

    The tensor is created on the module-level ``device``.
    """
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    column = torch.tensor(token_ids, dtype=torch.long, device=device)
    return column.view(-1, 1)
#------------------------------------------------------------------------
def tensorsFromPair(input_text, output_text, pair):
    """Convert ``pair[0]`` and ``pair[1]`` into tensors using their respective vocabularies.

    Returns the tuple ``(input_tensor, target_tensor)``.
    """
    src_sentence, tgt_sentence = pair[0], pair[1]
    return (
        tensorFromSentence(input_text, src_sentence),
        tensorFromSentence(output_text, tgt_sentence),
    )
################################################################################


############################### Our Model ######################################
class Encoder(nn.Module):
    """Bidirectional LSTM encoder with linear bridges for the final states.

    Args:
        input_size: number of rows in the embedding table (vocabulary size).
        embedding_size: width of each embedding vector.
        hidden_size: LSTM hidden width per direction.
        num_layers: number of stacked LSTM layers.
        p: dropout probability for the embedding dropout and the LSTM.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super().__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(
            embedding_size,
            hidden_size,
            num_layers,
            bidirectional=True,
            dropout=p,
        )

        # Project the concatenated pair of direction states (2 * hidden_size)
        # back down to hidden_size.
        self.fc_hidden = nn.Linear(hidden_size * 2, hidden_size)
        self.fc_cell = nn.Linear(hidden_size * 2, hidden_size)

    def forward(self, x):
        """Encode token ids *x* and return ``(encoder_states, hidden, cell)``.

        ``encoder_states`` is the raw LSTM output; ``hidden`` and ``cell`` are
        the first two slices of the final states concatenated on the feature
        axis and passed through ``fc_hidden`` / ``fc_cell``.
        NOTE(review): presumably x is (seq_len, batch) and num_layers == 1, so
        that slices 0 and 1 are the two directions - confirm against callers.
        """
        embedded = self.embedding(x)
        embedded = self.dropout(embedded)

        encoder_states, (h_n, c_n) = self.rnn(embedded)

        merged_hidden = torch.cat((h_n[0:1], h_n[1:2]), dim=2)
        merged_cell = torch.cat((c_n[0:1], c_n[1:2]), dim=2)

        hidden = self.fc_hidden(merged_hidden)
        cell = self.fc_cell(merged_cell)
        return encoder_states, hidden, cell

class Decoder(nn.Module):
    """Single-step LSTM decoder with additive attention over the encoder states.

    Args:
        input_size: number of rows in the embedding table.
        embedding_size: width of each embedding vector.
        hidden_size: LSTM hidden width; encoder states are expected to be
            twice this wide.
        output_size: number of logits produced per step.
        num_layers: number of stacked LSTM layers.
        p: dropout probability for the embedding dropout and the LSTM.
    """

    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, p):
        super().__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size

        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(input_size, embedding_size)
        # The LSTM consumes the context vector (2 * hidden_size) joined with the embedding.
        self.rnn = nn.LSTM(hidden_size * 2 + embedding_size, hidden_size, num_layers, dropout=p)

        # Scores one encoder position from [decoder hidden ; encoder state].
        self.energy = nn.Linear(hidden_size * 3, 1)
        self.softmax = nn.Softmax(dim=0)  # normalise over the sequence axis
        self.relu = nn.ReLU()
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, encoder_states, hidden, cell):
        """Run one decoding step and return ``(predictions, hidden, cell)``.

        NOTE(review): presumably x is (batch,), encoder_states is
        (seq_len, batch, hidden_size * 2) and hidden/cell are
        (1, batch, hidden_size) - confirm against callers.
        """
        # Add a leading length-1 axis: one token per sequence is decoded per call.
        step_input = x.unsqueeze(0)
        embedding = self.dropout(self.embedding(step_input))

        # Repeat the decoder state once per encoder position for scoring.
        n_positions = encoder_states.shape[0]
        tiled_hidden = hidden.repeat(n_positions, 1, 1)
        scores = self.energy(torch.cat((tiled_hidden, encoder_states), dim=2))
        weights = self.softmax(self.relu(scores))      # [sequence_len, batch_size, 1]

        weights = weights.permute(1, 2, 0)             # [batch_size, 1, sequence_len]
        states_bf = encoder_states.permute(1, 0, 2)    # [batch_size, sequence_len, hidden_size*2]

        # Weighted sum of encoder states, moved back to [1, batch_size, hidden_size*2].
        context_vector = torch.bmm(weights, states_bf).permute(1, 0, 2)

        # The context vector is fed to the LSTM alongside the token embedding.
        rnn_input = torch.cat((context_vector, embedding), dim=2)
        outputs, (hidden, cell) = self.rnn(rnn_input, (hidden, cell))

        # [1, batch_size, len_of_vocab] -> [batch_size, len_of_vocab]
        predictions = self.fc(outputs).squeeze(0)
        return predictions, hidden, cell

class Seq2Seq(nn.Module):
    """Couples an encoder and an attention decoder and unrolls the decoder over the target."""

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_forcing_ratio=0.5):
        """Decode ``target.shape[0] - 1`` steps and return the per-step logits.

        Args:
            source: input token ids; ``source.shape[1]`` is used as the batch size.
            target: target token ids; ``target[0]`` seeds the decoder and
                ``target[t]`` may be fed back at later steps.
            teacher_forcing_ratio: probability, drawn independently per step,
                of feeding the ground-truth token instead of the decoder's
                own argmax.

        Returns:
            Tensor of shape ``(target_length, batch_size, vocab_size)``. Row 0
            is left as zeros because decoding starts at ``t = 1``.
        """
        batch_size = source.shape[1]
        target_length = target.shape[0]
        vocab_size = self.decoder.output_size

        # Allocate on the input's device so the module does not depend on a
        # module-level global and works wherever the inputs live.
        outputs = torch.zeros(target_length, batch_size, vocab_size, device=source.device)

        encoder_states, hidden, cell = self.encoder(source)

        x = target[0]

        for t in range(1, target_length):
            output, hidden, cell = self.decoder(x, encoder_states, hidden, cell)
            outputs[t] = output

            best_guess = output.argmax(1)
            # Teacher forcing: sometimes feed the ground truth, otherwise the prediction.
            x = target[t] if random.random() < teacher_forcing_ratio else best_guess

        return outputs

#####################################################################################

def evaluate(model, input_text, output_text, sentences, criterion, max_length=MAX_LENGTH):
    """Score one sentence pair and decode the model's prediction for it.

    Args:
        model: sequence-to-sequence model called as
            ``model(input_tensor, target_tensor, teacher_forcing_ratio=...)``.
        input_text: vocabulary used to encode ``sentences[0]``.
        output_text: vocabulary used to encode ``sentences[1]`` and to decode
            predicted ids back to words.
        sentences: ``[source_sentence, target_sentence]``.
        criterion: loss applied to each step's logits and target id.
        max_length: unused; kept for interface compatibility.

    Returns:
        ``(mean_loss, decoded_words)`` where ``mean_loss`` is the loss averaged
        over every output position and ``decoded_words`` stops at, and
        includes, the first ``'<EOS>'``.
    """
    input_tensor = tensorFromSentence(input_text, sentences[0])
    target_tensor = tensorFromSentence(output_text, sentences[1])

    # No gradients are needed for evaluation, and teacher forcing is turned
    # off so ground-truth tokens are not fed back into the decoder.
    with torch.no_grad():
        output = model(input_tensor, target_tensor, teacher_forcing_ratio=0.0)

    num_iter = output.size(0)
    loss = 0
    decoded_words = []
    reached_eos = False

    for ot in range(num_iter):
        # Accumulate the loss over every position so that dividing by
        # num_iter is a true mean even when decoding stops early.
        loss += criterion(output[ot], target_tensor[ot])
        if reached_eos:
            continue
        _, topi = output[ot].topk(1)
        token_id = topi[0].item()
        if token_id == EOS_token:
            decoded_words.append('<EOS>')
            reached_eos = True
        else:
            decoded_words.append(output_text.index2word[token_id])
    return float(loss / num_iter), decoded_words

def evaluateRandomly(model, source, target, pairs, criterion, n=10):
    """Evaluate *n* randomly drawn pairs, print each result, and return the mean loss.

    Pairs are drawn with ``random.choice``, so the same pair can be picked
    more than once.
    """
    total_loss = 0.0
    for _ in range(n):
        sample = random.choice(pairs)
        sample_loss, words = evaluate(model, source, target, sample, criterion)
        total_loss += sample_loss
        predicted = ' '.join(words)

        print('=====')
        print('answer: {}'.format(sample[0]))
        print('target question: {}'.format(sample[1]))
        print('predicted question: {}'.format(predicted))
        print('=====')
    return float(total_loss / n)

def clacModel(model, input_tensor, target_tensor, model_optimizer, criterion):
    """Run one optimisation step on a single example and return its mean per-position loss.

    The loss is summed over every output position, back-propagated once, and
    the optimiser is stepped. The returned float is that sum divided by the
    number of output positions.
    """
    model_optimizer.zero_grad()

    output = model(input_tensor, target_tensor)
    steps = output.size(0)

    # Sum of the per-position losses (position t's logits vs position t's target).
    loss = sum(criterion(output[t], target_tensor[t]) for t in range(steps))

    loss.backward()
    model_optimizer.step()

    return loss.item() / steps

def trainModel(model, source, target, pairs, num_iteration, print_every=50000):
    """Train *model* for *num_iteration* single-example steps and return it.

    Args:
        model: network to optimise; switched to training mode here.
        source: vocabulary used to encode the input sentences.
        target: vocabulary used to encode the target sentences.
        pairs: list of ``[source_sentence, target_sentence]`` to sample from
            (with replacement).
        num_iteration: number of optimisation steps.
        print_every: number of steps between average-loss reports. Defaults to
            the previously hard-coded 50000.

    Returns:
        The same *model* instance after training.
    """
    model.train()

    optimizer = optim.Adam(model.parameters(), lr=3e-4)
    criterion = nn.CrossEntropyLoss()
    running_loss = 0

    # Sample the sentence pairs up front (same draw order as before), but
    # build the tensors lazily inside the loop so that num_iteration tensor
    # pairs are not all resident on the device at once.
    sampled_pairs = [random.choice(pairs) for _ in range(num_iteration)]

    check_point = 1
    for step, pair in enumerate(tqdm(sampled_pairs), start=1):
        input_tensor, target_tensor = tensorsFromPair(source, target, pair)

        running_loss += clacModel(model, input_tensor, target_tensor, optimizer, criterion)

        if print_every and step % print_every == 0:
            average_loss = running_loss / print_every
            running_loss = 0
            print('------------------------')
            print ('Check Point %d:' % check_point)
            check_point = check_point + 1
            print('Training Loss: %.4f' % (average_loss))

    return model

# ----------------------------- Driver script ---------------------------------
# Column names passed to process_data: the 'answer' text is the model input
# and the 'question' text is the generation target.
src_file = 'answer'
tgt_file = 'question'

# Build both vocabularies and the filtered list of [answer, question] pairs.
source, target, pairs = process_data(src_file, tgt_file)


# NOTE(review): this value is not used anywhere in the visible code; the call
# still advances the `random` module's state.
randomize = random.choice(pairs)

# Vocabulary sizes drive the embedding tables and the output projection.
input_size = source.n_words
output_size = target.n_words

# Hyper-parameters.
embed_size = 100
hidden_size = 256
num_layers = 1
num_iteration = 1000000  # number of single-example training steps
dropout = 0.0

encoder = Encoder(input_size, embed_size, hidden_size, num_layers, dropout)
# The decoder embeds target-side tokens, so its input size is the target vocabulary size.
decoder = Decoder(output_size, embed_size, hidden_size, output_size, num_layers, dropout)

model = Seq2Seq(encoder, decoder).to(device)
print ('Model is initialized')

print ('Training starts . . .')
model = trainModel(model, source, target, pairs, num_iteration)


print ('-----------Evaluation-------------------')
model.eval()
criterion = nn.CrossEntropyLoss()
# NOTE(review): evaluation samples from the same `pairs` list that was used
# for training, so the value printed as "Validation Loss" is not measured on
# held-out data.
eval_loss = evaluateRandomly(model, source, target, pairs, criterion)
print('Validation Loss: %.4f' % (eval_loss))
print('------------------------')


Before Processing Data, the number of records: 264935
File reading complete
Processing Text . . . 
Processing Text . . . 
Processing Complete
After Processing Data, the number of records: 72463
Model is initialized
Training starts . . .


HBox(children=(FloatProgress(value=0.0, max=1000000.0), HTML(value='')))

------------------------
Check Point 1:
Training Loss: 5.6293
------------------------
Check Point 2:
Training Loss: 5.2806
------------------------
Check Point 3:
Training Loss: 5.1545
------------------------
Check Point 4:
Training Loss: 5.0307
------------------------
Check Point 5:
Training Loss: 4.9080
------------------------
Check Point 6:
Training Loss: 4.8033
------------------------
Check Point 7:
Training Loss: 4.7423
------------------------
Check Point 8:
Training Loss: 4.6575
------------------------
Check Point 9:
Training Loss: 4.6020
------------------------
Check Point 10:
Training Loss: 4.5363
------------------------
Check Point 11:
Training Loss: 4.4807
------------------------
Check Point 12:
Training Loss: 4.4434
------------------------
Check Point 13:
Training Loss: 4.4046
------------------------
Check Point 14:
Training Loss: 4.3774
------------------------
Check Point 15:
Training Loss: 4.3293
------------------------
Check Point 16:
Training Loss: 4.2953
-