In [None]:
'''
mount google drive
'''
# Colab-only step: mounts Google Drive at /content/gdrive so that later cells
# can read and write files under that path.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
'''
install required libraries
'''
# Shell escape that installs the Hugging Face `transformers` package into the runtime.
# NOTE(review): the version is not pinned; the recorded run below resolved to transformers 4.8.2.
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/fd/1a/41c644c963249fd7f3836d926afa1e3f1cc234a1c40d80c5f03ad8f6f1b2/transformers-4.8.2-py3-none-any.whl (2.5MB)
[K     |████████████████████████████████| 2.5MB 6.2MB/s 
Collecting huggingface-hub==0.0.12
  Downloading https://files.pythonhosted.org/packages/2f/ee/97e253668fda9b17e968b3f97b2f8e53aa0127e8807d24a547687423fe0b/huggingface_hub-0.0.12-py3-none-any.whl
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 34.7MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |█████

In [None]:
'''
import required packages
'''
import unicodedata
import re
import os
import random
import string
import itertools
import pickle
import glob

from queue import PriorityQueue
import operator

import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import transformers

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords  

# Run on the GPU when the runtime exposes one, otherwise fall back to the CPU.
USE_CUDA = torch.cuda.is_available()
if USE_CUDA:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Reproducibility setup: seed every random number generator used in this notebook
# (PyTorch CPU/CUDA, NumPy, Python's `random`) and switch cuDNN to deterministic mode.
seed = 42

for apply_seed in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all,
                   np.random.seed, random.seed):
    apply_seed(seed)

# Disable cuDNN autotuning and request deterministic kernels.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

os.environ['PYTHONHASHSEED'] = f'{seed}'

In [None]:
# pandas display configuration: show every row/column and the full cell content
# when a record is rendered, and format floats with two decimals.
display_settings = {
    'display.max_rows': None,
    'display.max_columns': None,
    'display.width': 2000,
    'display.float_format': '{:20,.2f}'.format,
    'display.max_colwidth': None,
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

In [None]:
# Load the pickled news DataFrame from Drive, report how many similar headlines
# the articles carry (max / min), and preview two randomly sampled rows.
data_path = '/content/gdrive/My Drive/Capstone_Project/Data/News_Data/news_article_with_sim_score.df'
article_df = pd.read_pickle(data_path)

# One count per article: the length of its `similar_headline` list.
no_of_headlines = list(map(len, article_df['similar_headline']))
print('max no of similar headlines: ', max(no_of_headlines))
print('min no of similar headlines: ', min(no_of_headlines))
print('article_df shape:', article_df.shape)
article_df.sample(2)

max no of similar headlines:  13
min no of similar headlines:  0
article_df shape: (3000, 9)


Unnamed: 0,article_url,headline,content,author,published_date,read_more_source,similar_headline,similar_headline_url,similarity_scores
1801,https://inshorts.com/en/news/foot-of-missing-businesswoman-who-stole-₹74cr-from-clients-found-on-aus-beach-1614333830092,Foot of missing businesswoman who stole ₹74cr from clients found on Aus beach,"Australian police has said that campers have found the decomposed foot of missing businesswoman Melissa Caddick on a beach. Caddick, who allegedly stole A$13 million (over ₹74 crore) from her clients, disappeared on November 12 last year after federal police raided her home in Sydney. ""She may have taken her own life,"" police added.",,2021-02-26T10:03:50.000Z,Daily Mail,"[Melissa Caddick: Missing fraud suspect's foot found on Australian beach, Melissa Caddick: remains of missing businesswoman found months after disappearance, Melissa Caddick dead, police confirm, after campers find her foot on NSW South Coast, Remains of missing businesswoman and 'conwoman' Melissa Caddick have been found, NSW Health In Australia Orders Radiology Solution From Sectra For Enterprise Access To Images, Sexy Croc &dash Entry #1372 &dash Data Clustering Contest]","[https://www.bbc.com/news/world-australia-56205519, https://www.theguardian.com/australia-news/2021/feb/26/melissa-caddick-missing-financial-adviser-found-dead-months-after-disappearance, https://www.abc.net.au/news/2021-02-26/melissa-caddick-found-dead/13195242, https://www.dailymail.co.uk/news/article-9301259/Remains-missing-businesswoman-conwoman-Melissa-Caddick-found.html, https://www.medicalbuyer.co.in/nsw-health-in-australia-orders-radiology-solution-from-sectra-for-enterprise-access-to-images/, https://entry1372-dcround2.usercontent.dev/20200529/categories/en/economy.html]","[0.58, 0.51, 0.34, 0.49, 0.19, 0.16]"
1190,https://inshorts.com/en/news/trumps-gab-account-compromised-as-hackers-target-platform-1614608033197,Trump's Gab account compromised as hackers target platform,"Former US President Donald Trump's Gab account was compromised along with the social network Gab CEO Andrew Torba's account. Torba revealed that the platform is being attacked by hackers who had earlier targeted law enforcement officers. According to Wired, around 70 gigabytes of Gab data representing over 40 million posts has been stolen and includes passwords, group passwords and messages.",,2021-03-01T14:13:53.000Z,Business Insider India,"[Far-right social media Gab hacked, Trump's account targeted, Gab confirms it was hacked, Trump and Gab CEO accounts compromised during large-scale hack of alternative social media platform, Gab Hack Reveals Passwords And Private Messages, Hacktivists Attack Controversial Christian Conservative Social Media Site Gab, Leak 70 Gigabytes of Hacked Data Including Private Messages and Passwords, Gab: hack gives unprecedented look into platform used by far right, Gab Founder Andrew Torba Says Platform Was Hacked By Far-Left Activists : US : Christianity Daily, US Right-Wing Platform Gab Acknowledges it Was Hacked, Passwords, Private Posts Exposed in Hack of Gab Social Network]","[https://www.jpost.com/international/far-right-social-media-gab-hacked-trumps-account-targeted-660790, https://www.securitymagazine.com/articles/94733-gab-confirms-it-was-hacked, https://www.coloradopolitics.com/news/trump-and-gab-ceo-accounts-compromised-during-large-scale-hack-of-alternative-social-media-platform/article_379f06da-eb18-5226-b920-0833a591345f.html, https://www.forbes.com/sites/emmawoollacott/2021/03/02/gab-hack-reveals-passwords-and-private-posts/, https://www.cpomagazine.com/cyber-security/hacktivists-attack-controversial-christian-conservative-social-media-site-gab-leak-70-gigabytes-of-hacked-data-including-private-messages-and-passwords/, 
https://www.theguardian.com/world/2021/mar/11/gab-hack-neo-nazis-qanon-conspiracy-theories, http://www.christianitydaily.com/articles/11022/20210303/gab-founder-andrew-torba-says-platform-was-hacked-by-far-left-activists.htm, https://www.securityweek.com/us-right-wing-platform-gab-acknowledges-it-was-hacked, https://threatpost.com/hacktivists-gab-posts-passwords/164360/]","[0.76, 0.66, 0.86, 0.54, 0.47, 0.59, 0.43, 0.44, 0.54]"


In [None]:
# Keep only the articles whose second-best similar headline reaches the similarity
# threshold. For each kept article collect its content, its own headline, and the
# two most similar external headlines (best first).
similarity_scores_threshold = 0.50
contents, target_headlines_1, target_headlines_2, target_headlines_3 = [], [], [], []

for index, row in article_df.iterrows():
    similarity_scores = row['similarity_scores']
    # Positions of the scores ordered from highest to lowest.
    sorted_index = list(np.argsort(similarity_scores)[::-1])
    if len(sorted_index) < 2:
        continue  # fewer than two candidate headlines
    second_highest_sim_score = similarity_scores[sorted_index[1]]
    if not (second_highest_sim_score >= similarity_scores_threshold):
        continue  # second-best candidate is not similar enough
    similar_headlines = row['similar_headline']
    contents.append(row['content'])
    target_headlines_1.append(row['headline'])
    target_headlines_2.append(similar_headlines[sorted_index[0]])  # most similar headline
    target_headlines_3.append(similar_headlines[sorted_index[1]])  # second most similar headline

In [None]:
# Show the first selected record: its news summary and the three target headlines.
for field_label, field_values in (
    ('news-summary: ', contents),
    ('headlines1: ', target_headlines_1),
    ('headlines2: ', target_headlines_2),
    ('headlines3: ', target_headlines_3),
):
    print(field_label, field_values[0])


news-summary:  Taking to Instagram on Saturday, Arjun Kapoor posted a picture of himself with Janhvi Kapoor to wish the actress on her 24th birthday. In the picture, Arjun can be seen walking ahead while holding his sister's hand. "Happy birthday Janhvi...I can't promise much except like this picture you shall always have my support & hand wherever you go," Arjun wrote.
headlines1:  You shall always have my support: Arjun Kapoor on Janhvi's b'day
headlines2:  'You shall always have my support': Arjun Kapoor pens heart-warming birthday note for Janhvi
headlines3:  "You will always have my support," Arjun Kapoor writes a heartfelt birthday note for Janhvi Kapoor.


In [50]:
# Inspect the distribution of headline lengths (in space-separated words) across all
# three headline lists, to help pick min/max token thresholds for headlines.
headlines = list(target_headlines_1) + list(target_headlines_2) + list(target_headlines_3)
headline_len = [len(text.split(' ')) for text in headlines]

for quantile_label, quantile in (
    ('5th percentile length: ', 0.05),
    ('25th percentile length: ', 0.25),
    ('50th percentile length: ', 0.50),
    ('75th percentile length: ', 0.75),
    ('95th percentile length: ', 0.95),
    ('99th percentile length: ', 0.99),
):
    print(quantile_label, np.quantile(headline_len, quantile))

5th percentile length:  8.0
25th percentile length:  10.0
50th percentile length:  12.0
75th percentile length:  14.0
95th percentile length:  18.0
99th percentile length:  22.0


In [None]:
# Bundle every summary with its three target headlines into one tuple, then shuffle
# the tuples in place (uses Python's global `random` state) and show how many there are.
summary_headline_pairs = [
    (summary, headline_1, headline_2, headline_3)
    for summary, headline_1, headline_2, headline_3
    in zip(contents, target_headlines_1, target_headlines_2, target_headlines_3)
]
random.shuffle(summary_headline_pairs)
len(summary_headline_pairs)

2610

In [None]:
# Split the shuffled pairs: the first 2000 records are used for training, the rest for testing.
# (For a quick end-to-end check of the code flow, train on summary_headline_pairs[0:100] instead.)
train_summary_headline_pairs, test_summary_headline_pairs = (
    summary_headline_pairs[:2000],
    summary_headline_pairs[2000:],
)
no_of_training_records = len(train_summary_headline_pairs)

In [None]:
# Load the pretrained BERT model and its tokenizer from the "bert-base-uncased" checkpoint.
# The checkpoint is uncased: it does not distinguish "english" from "English".
pretrained_weights = "bert-base-uncased"  # alternative checkpoint: 'distilbert-base-uncased'
model_class = transformers.BertModel
tokenizer_class = transformers.BertTokenizer

tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Reuse the tokenizer's CLS and SEP tokens as the BOS and EOS tokens, then print their ids.
tokenizer.bos_token, tokenizer.eos_token = tokenizer.cls_token, tokenizer.sep_token

for token_label, token_id in (
    ('SOS token id: ', tokenizer.bos_token_id),
    ('EOS token id: ', tokenizer.eos_token_id),
):
    print(token_label, token_id)

SOS token id:  101
EOS token id:  102


In [69]:
# Tokenize the news summaries and the three headline lists. Every text is padded or
# truncated to a fixed length: max_encoder_len for summaries, max_decoder_len for headlines.
max_encoder_len = 125
max_decoder_len = 40


def _tokenize_to_fixed_length(texts, max_length):
    """Tokenize each text, padding/truncating it to exactly `max_length` tokens."""
    return [
        tokenizer(text, padding="max_length", truncation=True, max_length=max_length)
        for text in texts
    ]


tokenized_summaries = _tokenize_to_fixed_length(contents, max_encoder_len)
tokenized_headlines_1 = _tokenize_to_fixed_length(target_headlines_1, max_decoder_len)
tokenized_headlines_2 = _tokenize_to_fixed_length(target_headlines_2, max_decoder_len)
tokenized_headlines_3 = _tokenize_to_fixed_length(target_headlines_3, max_decoder_len)

In [None]:
def _token_counts(encodings):
    """Return the number of token ids held by each tokenizer output."""
    return [len(encoding.input_ids) for encoding in encodings]


# Token counts per record for the summaries and for each headline list.
summary_lengths = _token_counts(tokenized_summaries)
headline_lengths_1 = _token_counts(tokenized_headlines_1)
headline_lengths_2 = _token_counts(tokenized_headlines_2)
headline_lengths_3 = _token_counts(tokenized_headlines_3)

In [None]:
# Report the longest tokenized summary and the longest tokenized headline per list.
for length_label, lengths_per_record in (
    ('summary_lengths_max: ', summary_lengths),
    ('headline1_lengths_max: ', headline_lengths_1),
    ('headline2_lengths_max: ', headline_lengths_2),
    ('headline3_lengths_max: ', headline_lengths_3),
):
    print(length_label, max(lengths_per_record))

summary_lengths_max:  125
headline1_lengths_max:  32
headline2_lengths_max:  173
headline3_lengths_max:  51


In [None]:
'''
instance of class Batch_Data represent input to the encoder decoder model for a batch
'''
class Batch_Data:
  """Plain container holding the encoder/decoder inputs of one batch.

  The constructor stores its eleven arguments unchanged under attributes of the
  same name: the input vectors and lengths, then (for each of the three target
  headlines) the output vectors, the output token indices and the mask.
  Instances of this class are pickled elsewhere in the notebook, so attribute
  names and their assignment order are kept stable.
  """

  def __init__(self, batch_ip_vector, batch_ip_length, batch_op_vector_1, batch_op_vector_2, batch_op_vector_3,
               batch_op_token_idxs_1, batch_op_token_idxs_2, batch_op_token_idxs_3,
               batch_mask_1, batch_mask_2, batch_mask_3):
    # Encoder side.
    self.batch_ip_vector, self.batch_ip_length = batch_ip_vector, batch_ip_length
    # Decoder side: vector representation of each target headline.
    self.batch_op_vector_1, self.batch_op_vector_2, self.batch_op_vector_3 = (
        batch_op_vector_1, batch_op_vector_2, batch_op_vector_3)
    # Decoder side: token indices of each target headline.
    self.batch_op_token_idxs_1, self.batch_op_token_idxs_2, self.batch_op_token_idxs_3 = (
        batch_op_token_idxs_1, batch_op_token_idxs_2, batch_op_token_idxs_3)
    # Decoder side: mask of each target headline.
    self.batch_mask_1, self.batch_mask_2, self.batch_mask_3 = (
        batch_mask_1, batch_mask_2, batch_mask_3)

In [67]:
# Scratch check: stack the token ids of two tokenized summaries (records 2 and 3)
# into one NumPy array and display its shape. `a` is a tiny toy array built the same way.
a = np.array([[1, 2]] * 2)
tokenized_summaries_1 = tokenized_summaries[2:4]
b = np.array([list(encoding.input_ids) for encoding in tokenized_summaries_1])
b.shape

(2, 40)

In [76]:
# Run this cell only once.
# It computes the BERT representation of the news summaries and of the three target
# headlines, batch by batch, and pickles one Batch_Data object per batch to Drive.
# Training can then load the stored vectors instead of re-running BERT every iteration.
batch_start=0
batch_end=0
# NOTE(review): only the first 32 records are encoded here; use len(tokenized_summaries)
# to process the whole dataset. The min() keeps the slices valid for smaller datasets.
end = min(32, len(tokenized_summaries))
batch_size=16
summary_vector=None

# Make sure the target folder exists before the first batch is written.
output_dir='/content/gdrive/My Drive/Capstone_Project/Data/Bert_vectors/one_to_many_setup'
os.makedirs(output_dir, exist_ok=True)


def _stack_field(encodings, field):
    """Stack one field (e.g. 'input_ids') of every tokenizer output into a 2-D NumPy array."""
    return np.array([np.array(getattr(encoding, field)) for encoding in encodings])


def _encode_batch(token_ids, attention_mask, mask_dtype=None):
    """Turn one batch of ids/mask arrays into tensors and run BERT on them without gradients.

    Returns (token id tensor, attention mask tensor, first element of the model output).
    `mask_dtype` selects the dtype of the mask tensor (None lets torch infer it).
    """
    token_ids_t = torch.tensor(token_ids)
    attention_mask_t = torch.tensor(attention_mask, dtype=mask_dtype)
    with torch.no_grad():
        last_hidden_states = model(token_ids_t, attention_mask=attention_mask_t)
    return token_ids_t, attention_mask_t, last_hidden_states[0]


inputs_input_ids = _stack_field(tokenized_summaries, 'input_ids')
inputs_attention_mask = _stack_field(tokenized_summaries, 'attention_mask')

outputs1_input_ids = _stack_field(tokenized_headlines_1, 'input_ids')
outputs1_attention_mask = _stack_field(tokenized_headlines_1, 'attention_mask')

outputs2_input_ids = _stack_field(tokenized_headlines_2, 'input_ids')
outputs2_attention_mask = _stack_field(tokenized_headlines_2, 'attention_mask')

outputs3_input_ids = _stack_field(tokenized_headlines_3, 'input_ids')
outputs3_attention_mask = _stack_field(tokenized_headlines_3, 'attention_mask')


while batch_start<end:
  batch_end=min(batch_start+batch_size, end)
  print('batch_start: ',batch_start,' batch_end: ',batch_end)

  summary_batch=inputs_input_ids[batch_start:batch_end]
  attention_mask_summary = inputs_attention_mask[batch_start:batch_end]
  # Sequence length = number of real (non-padding) tokens, i.e. the attended positions.
  # The previous code counted the positions whose id is 0, which is the amount of padding.
  summary_length_batch=[int(np.count_nonzero(record_mask)) for record_mask in attention_mask_summary]

  headline1_batch=outputs1_input_ids[batch_start:batch_end]
  attention_mask_headline1 = outputs1_attention_mask[batch_start:batch_end]

  headline2_batch=outputs2_input_ids[batch_start:batch_end]
  attention_mask_headline2 = outputs2_attention_mask[batch_start:batch_end]

  headline3_batch=outputs3_input_ids[batch_start:batch_end]
  attention_mask_headline3 = outputs3_attention_mask[batch_start:batch_end]

  summary_batch_t, attention_mask_summary_t, summary_batch_vector = _encode_batch(
      summary_batch, attention_mask_summary)
  # Headline masks are stored as boolean tensors in the pickled batch.
  headline1_batch_t, attention_mask_headline1_t, headline1_batch_vector = _encode_batch(
      headline1_batch, attention_mask_headline1, mask_dtype=torch.bool)
  headline2_batch_t, attention_mask_headline2_t, headline2_batch_vector = _encode_batch(
      headline2_batch, attention_mask_headline2, mask_dtype=torch.bool)
  headline3_batch_t, attention_mask_headline3_t, headline3_batch_vector = _encode_batch(
      headline3_batch, attention_mask_headline3, mask_dtype=torch.bool)

  batch_data=Batch_Data(summary_batch_vector,summary_length_batch,headline1_batch_vector,headline2_batch_vector,headline3_batch_vector,
                        headline1_batch_t,headline2_batch_t,headline3_batch_t,attention_mask_headline1_t,attention_mask_headline2_t,attention_mask_headline3_t)
  batch_file_path=os.path.join(output_dir, 'batch_'+str(batch_start)+'_'+str(batch_end)+'.pickle')
  with open(batch_file_path, 'wb') as file_handle:
    pickle.dump(batch_data, file_handle, protocol=pickle.HIGHEST_PROTOCOL)
  batch_start=batch_end

batch_start:  0  batch_end:  16
batch_start:  16  batch_end:  32


In [None]:
def get_initial_decoder_ip(batch_size, verbose=True):
  """Build the decoder input for the first decoding timestep of a whole batch.

  The BOS token id is passed through the BERT `model` (without gradients) as a
  one-token sequence, and the resulting vector is repeated once per record.

  Args:
    batch_size: number of records in the batch.
    verbose: when True (default, matching the previous behaviour) the tensor shape
      is printed before and after the axes are swapped.

  Returns:
    Tensor of shape (1, batch_size, hidden) in which every record holds the same
    BOS vector.
  """
  sos_token_tensor=torch.tensor([[tokenizer.bos_token_id]])
  with torch.no_grad():
    last_hidden_states = model(sos_token_tensor)
  # First element of the model output: one sequence with one token -> (1, 1, hidden).
  SOS_token_bert_vector=last_hidden_states[0]
  # Repeat along the first axis directly on the tensor instead of going through a
  # Python list of NumPy arrays (slow, and recent PyTorch versions warn about it).
  decoder_input = SOS_token_bert_vector.repeat(batch_size, 1, 1)  # (batch_size, 1, hidden)
  if verbose:
    print('decoder_input shape: ',decoder_input.shape)
  # Swap the first two axes -> (1, batch_size, hidden).
  decoder_input=decoder_input.permute(1,0,2)
  if verbose:
    print('decoder_input shape: ',decoder_input.shape)
  return decoder_input

In [None]:
'''
just for testing
'''
# Smoke test: build the initial decoder input for a batch of 32 records and display the tensor.
get_initial_decoder_ip(32)

decoder_input shape:  torch.Size([32, 1, 768])
decoder_input shape:  torch.Size([1, 32, 768])


tensor([[[-0.7868,  0.3158,  0.1873,  ...,  0.1196,  0.4806,  0.2568],
         [-0.7868,  0.3158,  0.1873,  ...,  0.1196,  0.4806,  0.2568],
         [-0.7868,  0.3158,  0.1873,  ...,  0.1196,  0.4806,  0.2568],
         ...,
         [-0.7868,  0.3158,  0.1873,  ...,  0.1196,  0.4806,  0.2568],
         [-0.7868,  0.3158,  0.1873,  ...,  0.1196,  0.4806,  0.2568],
         [-0.7868,  0.3158,  0.1873,  ...,  0.1196,  0.4806,  0.2568]]])

In [None]:
class Encoder(nn.Module):
  """GRU based encoder without an embedding layer.

  The input is expected to be an already embedded sequence (the notebook feeds
  precomputed BERT vectors of the news data), so the module only wraps an nn.GRU.

  Args:
    embbed_dim: size of each input vector.
    hidden_dim: size of the GRU hidden state.
    num_layers: number of stacked GRU layers.
  """

  def __init__(self, embbed_dim, hidden_dim, num_layers):
    super(Encoder, self).__init__()
    self.hidden_dim = hidden_dim
    self.num_layers = num_layers
    self.embbed_dim = embbed_dim
    # GRU mapping embbed_dim-sized inputs to hidden_dim-sized states.
    self.gru = nn.GRU(self.embbed_dim, self.hidden_dim, num_layers=self.num_layers)

  def forward(self, input_seq, input_lengths, hidden=None):
    """Run the whole (padded) input sequence through the GRU.

    Args:
      input_seq: embedded input; nn.GRU is created without batch_first, so the
        expected layout is (seq_len, batch, embbed_dim).
      input_lengths: currently unused. It is kept in the signature for callers and
        for the case that packing with nn.utils.rnn.pack_padded_sequence is
        enabled later; any type is accepted.
      hidden: optional initial hidden state; None lets the GRU start from zeros.

    Returns:
      (outputs, hidden): the GRU output of every timestep and the final hidden state.
    """
    # Padded positions are processed like real tokens because the sequence is not packed.
    outputs, hidden = self.gru(input_seq, hidden)
    return outputs, hidden

In [None]:
class Attn(nn.Module):
    """Luong attention layer.

    Scores every encoder output against the current decoder output with one of
    three methods ('dot', 'general', 'concat') and returns softmax-normalised
    attention weights.

    Args:
        method: 'dot', 'general' or 'concat'.
        hidden_size: size of the hidden vectors being compared.

    Raises:
        ValueError: if `method` is not one of the supported names.
    """

    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        if self.method not in ['dot', 'general', 'concat']:
            raise ValueError(self.method, "is not an appropriate attention method.")
        self.hidden_size = hidden_size
        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            # torch.FloatTensor(n) returns uninitialised memory, which can contain
            # NaN/inf; start from small uniform values instead.
            bound = 1.0 / max(hidden_size, 1) ** 0.5
            self.v = nn.Parameter(torch.empty(hidden_size).uniform_(-bound, bound))

    def dot_score(self, hidden, encoder_output):
        """Dot product between the decoder state and each encoder output."""
        return torch.sum(hidden * encoder_output, dim=2)

    def general_score(self, hidden, encoder_output):
        """Dot product between the decoder state and linearly transformed encoder outputs."""
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)

    def concat_score(self, hidden, encoder_output):
        """Score from tanh(Linear([hidden; encoder_output])) weighted by the vector `v`."""
        energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh()
        return torch.sum(self.v * energy, dim=2)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights over the encoder outputs.

        The scores are reduced over dim 2, transposed, soft-maxed over dim 1 and
        given an extra middle dimension, so for `encoder_outputs` laid out as
        (max_length, batch, hidden) the result is (batch, 1, max_length).
        """
        # Calculate the attention energies with the configured method
        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden, encoder_outputs)
        elif self.method == 'dot':
            attn_energies = self.dot_score(hidden, encoder_outputs)

        # Transpose max_length and batch_size dimensions
        attn_energies = attn_energies.t()

        # Softmax-normalised probability scores (with added dimension)
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

In [None]:
class LuongAttnDecoderRNN(nn.Module):
    """GRU decoder with Luong attention; it has no embedding layer of its own.

    Args:
        attn_model: attention method name handed to `Attn`.
        embbed_dim: size of each decoder input vector.
        hidden_size: size of the GRU hidden state.
        output_size: number of output classes (vocabulary size).
        num_layers: number of stacked GRU layers.
    """

    def __init__(self, attn_model, embbed_dim, hidden_size, output_size, num_layers=1):
        super(LuongAttnDecoderRNN, self).__init__()

        # Configuration kept on the module for reference.
        self.embbed_dim = embbed_dim
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers

        # Sub-modules (created in this order so seeded initialisation is unchanged).
        self.gru = nn.GRU(self.embbed_dim, self.hidden_size, self.num_layers)
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_step, last_hidden, encoder_outputs):
        """Decode a single timestep.

        Args:
            input_step: already embedded input of the current timestep.
            last_hidden: GRU hidden state from the previous timestep.
            encoder_outputs: encoder outputs that are attended over.

        Returns:
            (output, hidden): softmax probabilities over the output classes and the
            updated GRU hidden state.
        """
        # One step through the unidirectional GRU.
        gru_output, hidden = self.gru(input_step, last_hidden)

        # Attention weights from the current GRU output, then the weighted sum of
        # the encoder outputs (the context vector).
        weights = self.attn(gru_output, encoder_outputs)
        context_vector = torch.bmm(weights, encoder_outputs.transpose(0, 1)).squeeze(1)

        # Luong eq. 5: combine the GRU output with the context vector.
        combined = torch.cat((gru_output.squeeze(0), context_vector), 1)
        attentional_state = self.concat(combined).tanh()

        # Luong eq. 6: distribution over the next token.
        probabilities = F.softmax(self.out(attentional_state), dim=1)
        return probabilities, hidden

In [None]:
def maskNLLLoss(inp, target, mask):
    """Average negative log likelihood over the positions selected by `mask`.

    Args:
        inp: probabilities, one row per record; column `target[i]` of row i is read.
        target: index of the correct class for every record.
        mask: boolean tensor; only records where it is True contribute to the mean.

    Returns:
        (loss, n_total): the mean loss moved to the global `device`, and the number
        of True entries in `mask` as a Python number.
    """
    picked_probabilities = inp.gather(1, target.view(-1, 1)).squeeze(1)
    negative_log_likelihood = torch.log(picked_probabilities).neg()
    masked_mean = negative_log_likelihood.masked_select(mask).mean()
    return masked_mean.to(device), mask.sum().item()

In [None]:
def _bert_vectors_for_greedy_tokens(decoder_output):
    """Embed the decoder's greedy token picks so they can be fed back as the next decoder input.

    The most probable token of every record is passed through the global BERT `model`
    as a one-token sequence, the same way get_initial_decoder_ip embeds the BOS token.
    NOTE(review): embedding each predicted token in isolation is an assumption about
    the intended non-teacher-forcing setup -- confirm before relying on it.

    Returns a tensor laid out as (1, batch, hidden) on the global `device`.
    """
    _, topi = decoder_output.topk(1)  # one token id per record
    bert_device = next(model.parameters()).device
    with torch.no_grad():
        token_vectors = model(topi.to(bert_device))[0]
    return token_vectors.permute(1, 0, 2).to(device)


def train(input_variable, lengths, target_variable, target_op_token_idxs, mask, max_target_len, encoder, decoder,
          encoder_optimizer, decoder_optimizer, batch_size, clip,decoder_ip_initial, teacher_forcing_ratio=1):
    """Perform a single training iteration (forward, backward, optimiser step) on one batch.

    Args:
        input_variable: embedded encoder input for the batch.
        lengths: tensor of input lengths, handed to the encoder (kept on the CPU).
        target_variable: embedded target sequence, indexed by timestep; used as the
            next decoder input under teacher forcing.
        target_op_token_idxs: target token indices, indexed by timestep; used by the loss.
        mask: boolean mask indexed by timestep; True marks non-padding target tokens.
        max_target_len: number of target timesteps to decode (timestep 0 is skipped
            because the decoder is started from `decoder_ip_initial`).
        encoder, decoder: the models being trained.
        encoder_optimizer, decoder_optimizer: their optimisers.
        batch_size: unused; kept for interface compatibility.
        clip: maximum gradient norm used for gradient clipping.
        decoder_ip_initial: decoder input for the first decoding step.
        teacher_forcing_ratio: probability of using teacher forcing for this batch.

    Returns:
        The loss averaged over all non-padding target tokens, or 0.0 when the batch
        contains no token to train on.
    """
    # Zero gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Move every tensor used below to the training device
    input_variable = input_variable.to(device)
    target_variable = target_variable.to(device)
    target_op_token_idxs = target_op_token_idxs.to(device)  # was left behind on its original device
    mask = mask.to(device)
    # Lengths for rnn packing should always be on the cpu
    lengths = lengths.to("cpu")

    loss = 0
    print_losses = []
    n_totals = 0

    # Forward pass through encoder
    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

    # First decoder input: the start-of-sequence vector for every record of the batch
    decoder_input = decoder_ip_initial.to(device)

    # Initial decoder hidden state = encoder's final hidden state, truncated to the
    # decoder's layer count so encoder and decoder may differ in depth
    decoder_hidden = encoder_hidden[:decoder.num_layers]

    # Decide once per batch whether the ground truth or the model's own prediction
    # is fed back as the next decoder input
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    # Never index past the rows that actually exist in the mask
    last_timestep = min(max_target_len, mask.size(0))
    for timestep in range(1, last_timestep):
        # Stop as soon as this timestep holds only padding for every record
        if mask[timestep].sum().item() == 0:
            break

        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_outputs)

        # Calculate and accumulate loss against the target token of this timestep
        mask_loss, nTotal = maskNLLLoss(decoder_output, target_op_token_idxs[timestep], mask[timestep])
        loss += mask_loss
        print_losses.append(mask_loss.item() * nTotal)
        n_totals += nTotal

        if use_teacher_forcing:
            # Teacher forcing: next input is the current target vector
            decoder_input = torch.unsqueeze(target_variable[timestep], 0)
        else:
            # No teacher forcing: next input is derived from the decoder's own prediction
            decoder_input = _bert_vectors_for_greedy_tokens(decoder_output)

    # Nothing was decoded (e.g. max_target_len <= 1 or an all-padding batch)
    if n_totals == 0:
        return 0.0

    # Perform backpropagation
    loss.backward()

    # Clip gradients: gradients are modified in place
    _ = nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = nn.utils.clip_grad_norm_(decoder.parameters(), clip)

    # Adjust model weights
    encoder_optimizer.step()
    decoder_optimizer.step()

    return sum(print_losses) / n_totals

In [None]:
'''
function for performing a single training iteration
'''
def train(input_variable, lengths, target_variable1, target_variable2, target_variable3, target_op_token_idxs1, target_op_token_idxs2, target_op_token_idxs3, mask1, mask2,mask3, max_target_len, encoder, decoder1, decoder2, decoder3,
          encoder_optimizer, decoder_optimizer, batch_size, clip,decoder_ip_initial, teacher_forcing_ratio=1):
    """Run one training iteration (forward, backward, optimizer step) on a single batch.

    One shared encoder feeds three decoders. Each decoder has its own target
    sequence, target token indices and padding mask; the masked losses of all
    three decoders are summed into one loss that is back-propagated once.

    Args:
        input_variable: encoder input batch (moved to ``device`` here).
        lengths: input sequence lengths for the encoder (kept on the CPU).
        target_variable1..3: per-decoder target sequences, indexed by timestep;
            used as the next decoder input under teacher forcing.
        target_op_token_idxs1..3: per-decoder target token indices, indexed by
            timestep; used as the loss target.
        mask1..3: per-decoder padding masks, indexed by timestep.
        max_target_len: upper bound on the number of decoding timesteps. It is
            clamped to the number of timesteps actually available; ``None``
            means "use everything available".
        encoder, decoder1, decoder2, decoder3: the models. Each decoder must
            expose ``num_layers``.
        encoder_optimizer, decoder_optimizer: optimizers that are zeroed and
            stepped here. NOTE(review): a single ``decoder_optimizer`` is
            received, so it presumably holds the parameters of all three
            decoders -- confirm at the call site.
        batch_size: number of samples in the batch (used when feeding back the
            decoder's own predictions).
        clip: max gradient norm passed to ``clip_grad_norm_``.
        decoder_ip_initial: initial decoder input for every sample in the batch
            (SOS tokens according to the original comments).
        teacher_forcing_ratio: probability of using teacher forcing for this batch.

    Returns:
        The token-weighted average loss over all decoders and timesteps, or
        ``0.0`` if no timestep contributed any non-padding token (in which case
        no backward pass / optimizer step is performed).
    """
    # Zero gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Set device options
    input_variable = input_variable.to(device)
    # Lengths for rnn packing should always be on the cpu
    lengths = lengths.to("cpu")

    # Group the per-decoder arguments so all three heads go through the same code path
    decoders = (decoder1, decoder2, decoder3)
    targets = [t.to(device) for t in (target_variable1, target_variable2, target_variable3)]
    target_idxs = (target_op_token_idxs1, target_op_token_idxs2, target_op_token_idxs3)
    masks = [m.to(device) for m in (mask1, mask2, mask3)]

    # Initialize variables
    loss = 0
    print_losses = []
    n_totals = 0

    # Forward pass through encoder
    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

    # Every decoder starts from the same initial input (never mutated in place, so it can be shared)
    initial_input = decoder_ip_initial.to(device)
    decoder_inputs = [initial_input for _ in decoders]

    # Initial hidden state of each decoder is the encoder's final hidden state, truncated to
    # that decoder's own layer count (handles encoder/decoder with different numbers of layers)
    decoder_hiddens = [encoder_hidden[:dec.num_layers] for dec in decoders]

    # Determine if we are using teacher forcing this iteration
    use_teacher_forcing = random.random() < teacher_forcing_ratio
    if use_teacher_forcing:
        print('teacher forcing will be used for this batch')
    else:
        print('teacher forcing won\'t be used for this batch')

    # Number of usable timesteps per decoder: never index past the shortest of the
    # target / target-index / mask sequences, and never exceed the caller's limit.
    step_limits = []
    for head in range(len(decoders)):
        available = min(len(targets[head]), len(target_idxs[head]), len(masks[head]))
        step_limits.append(available if max_target_len is None else min(max_target_len, available))

    # Forward batch of sequences through the decoders one timestep at a time.
    # Timestep 0 is skipped, matching the original loop which started at 1
    # (the decoders are fed decoder_ip_initial for the first step).
    for timestep in range(1, max(step_limits)):
        any_active = False
        for head, dec in enumerate(decoders):
            if timestep >= step_limits[head]:
                continue
            step_mask = masks[head][timestep]
            # Skip a head whose current step is padding for every sample, so that
            # maskNLLLoss is never asked to average over zero selected tokens.
            if step_mask.sum().item() == 0:
                continue
            any_active = True

            output, decoder_hiddens[head] = dec(
                decoder_inputs[head], decoder_hiddens[head], encoder_outputs
            )

            if use_teacher_forcing:
                # Teacher forcing: next input is the current target
                decoder_inputs[head] = torch.unsqueeze(targets[head][timestep], 0)
            else:
                # No teacher forcing: next input is the decoder's own top-1 prediction,
                # reshaped to (1, batch_size) like the teacher-forced input
                _, topi = output.topk(1)
                decoder_inputs[head] = topi[:batch_size, 0].unsqueeze(0)

            # Calculate and accumulate loss for this head
            step_target = target_idxs[head][timestep].to(device)
            mask_loss, nTotal = maskNLLLoss(output, step_target, step_mask)
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal

        # All heads have only padding left: nothing more to learn from this batch
        if not any_active:
            break

    # Nothing contributed to the loss (e.g. all-padding targets): `loss` is still the
    # int 0, so there is nothing to back-propagate and no average to compute.
    if not print_losses:
        return 0.0

    # Perform backpropagation
    loss.backward()

    # Clip gradients: gradients are modified in place
    _ = nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    for dec in decoders:
        _ = nn.utils.clip_grad_norm_(dec.parameters(), clip)

    # Adjust model weights
    encoder_optimizer.step()
    decoder_optimizer.step()

    return sum(print_losses) / n_totals