In [1]:
'''
mount google drive
'''
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
'''
install required libraries
'''
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/fd/1a/41c644c963249fd7f3836d926afa1e3f1cc234a1c40d80c5f03ad8f6f1b2/transformers-4.8.2-py3-none-any.whl (2.5MB)
[K     |▏                               | 10kB 16.3MB/s eta 0:00:01[K     |▎                               | 20kB 20.1MB/s eta 0:00:01[K     |▍                               | 30kB 23.4MB/s eta 0:00:01[K     |▌                               | 40kB 25.8MB/s eta 0:00:01[K     |▋                               | 51kB 26.6MB/s eta 0:00:01[K     |▉                               | 61kB 27.7MB/s eta 0:00:01[K     |█                               | 71kB 28.5MB/s eta 0:00:01[K     |█                               | 81kB 28.6MB/s eta 0:00:01[K     |█▏                              | 92kB 26.6MB/s eta 0:00:01[K     |█▎                              | 102kB 27.5MB/s eta 0:00:01[K     |█▍                              | 112kB 27.5MB/s eta 0:00:01[K     |█▋                              | 

In [3]:
'''
import required packages
'''
import unicodedata
import re
import os
import random
import string
import itertools
import pickle
import glob

from queue import PriorityQueue
import operator

import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import transformers

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords  

USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
'''
configuration for deterministic results with multiple run
'''
seed = 42
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed) 

np.random.seed(seed)  
random.seed(seed) 

torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

os.environ['PYTHONHASHSEED'] = str(seed)

In [5]:
'''
pandas configuration for showing complete content of record
'''
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 2000)
pd.set_option('display.float_format', '{:20,.2f}'.format)
pd.set_option('display.max_colwidth', None)

In [6]:
'''
load news data and show some sample 
'''
data_path='/content/gdrive/My Drive/Capstone_Project/Data/News_Data/news_article_with_sim_score.df'
article_df=pd.read_pickle(data_path)
no_of_headlines=[len(similar_headlines) for similar_headlines in article_df['similar_headline'].tolist()]
print('max no of similar headlines: ',max(no_of_headlines))
print('min no of similar headlines: ',min(no_of_headlines))
print('article_df shape:',article_df.shape)
article_df.sample(2)

max no of similar headlines:  13
min no of similar headlines:  0
article_df shape: (3000, 9)


Unnamed: 0,article_url,headline,content,author,published_date,read_more_source,similar_headline,similar_headline_url,similarity_scores
1801,https://inshorts.com/en/news/foot-of-missing-businesswoman-who-stole-₹74cr-from-clients-found-on-aus-beach-1614333830092,Foot of missing businesswoman who stole ₹74cr from clients found on Aus beach,"Australian police has said that campers have found the decomposed foot of missing businesswoman Melissa Caddick on a beach. Caddick, who allegedly stole A$13 million (over ₹74 crore) from her clients, disappeared on November 12 last year after federal police raided her home in Sydney. ""She may have taken her own life,"" police added.",,2021-02-26T10:03:50.000Z,Daily Mail,"[Melissa Caddick: Missing fraud suspect's foot found on Australian beach, Melissa Caddick: remains of missing businesswoman found months after disappearance, Melissa Caddick dead, police confirm, after campers find her foot on NSW South Coast, Remains of missing businesswoman and 'conwoman' Melissa Caddick have been found, NSW Health In Australia Orders Radiology Solution From Sectra For Enterprise Access To Images, Sexy Croc &dash Entry #1372 &dash Data Clustering Contest]","[https://www.bbc.com/news/world-australia-56205519, https://www.theguardian.com/australia-news/2021/feb/26/melissa-caddick-missing-financial-adviser-found-dead-months-after-disappearance, https://www.abc.net.au/news/2021-02-26/melissa-caddick-found-dead/13195242, https://www.dailymail.co.uk/news/article-9301259/Remains-missing-businesswoman-conwoman-Melissa-Caddick-found.html, https://www.medicalbuyer.co.in/nsw-health-in-australia-orders-radiology-solution-from-sectra-for-enterprise-access-to-images/, https://entry1372-dcround2.usercontent.dev/20200529/categories/en/economy.html]","[0.58, 0.51, 0.34, 0.49, 0.19, 0.16]"
1190,https://inshorts.com/en/news/trumps-gab-account-compromised-as-hackers-target-platform-1614608033197,Trump's Gab account compromised as hackers target platform,"Former US President Donald Trump's Gab account was compromised along with the social network Gab CEO Andrew Torba's account. Torba revealed that the platform is being attacked by hackers who had earlier targeted law enforcement officers. According to Wired, around 70 gigabytes of Gab data representing over 40 million posts has been stolen and includes passwords, group passwords and messages.",,2021-03-01T14:13:53.000Z,Business Insider India,"[Far-right social media Gab hacked, Trump's account targeted, Gab confirms it was hacked, Trump and Gab CEO accounts compromised during large-scale hack of alternative social media platform, Gab Hack Reveals Passwords And Private Messages, Hacktivists Attack Controversial Christian Conservative Social Media Site Gab, Leak 70 Gigabytes of Hacked Data Including Private Messages and Passwords, Gab: hack gives unprecedented look into platform used by far right, Gab Founder Andrew Torba Says Platform Was Hacked By Far-Left Activists : US : Christianity Daily, US Right-Wing Platform Gab Acknowledges it Was Hacked, Passwords, Private Posts Exposed in Hack of Gab Social Network]","[https://www.jpost.com/international/far-right-social-media-gab-hacked-trumps-account-targeted-660790, https://www.securitymagazine.com/articles/94733-gab-confirms-it-was-hacked, https://www.coloradopolitics.com/news/trump-and-gab-ceo-accounts-compromised-during-large-scale-hack-of-alternative-social-media-platform/article_379f06da-eb18-5226-b920-0833a591345f.html, https://www.forbes.com/sites/emmawoollacott/2021/03/02/gab-hack-reveals-passwords-and-private-posts/, https://www.cpomagazine.com/cyber-security/hacktivists-attack-controversial-christian-conservative-social-media-site-gab-leak-70-gigabytes-of-hacked-data-including-private-messages-and-passwords/, https://www.theguardian.com/world/2021/mar/11/gab-hack-neo-nazis-qanon-conspiracy-theories, http://www.christianitydaily.com/articles/11022/20210303/gab-founder-andrew-torba-says-platform-was-hacked-by-far-left-activists.htm, https://www.securityweek.com/us-right-wing-platform-gab-acknowledges-it-was-hacked, https://threatpost.com/hacktivists-gab-posts-passwords/164360/]","[0.76, 0.66, 0.86, 0.54, 0.47, 0.59, 0.43, 0.44, 0.54]"


In [7]:
contents=[]
target_headlines_1=[]
target_headlines_2=[]
target_headlines_3=[]
similarity_scores_threshold=0.50
for index, row in article_df.iterrows():
  similarity_scores=row['similarity_scores']
  #print(similarity_scores)
  #print(similar_headlines)
  sorted_index=list(np.argsort(similarity_scores)) # in ascending order
  sorted_index.reverse() # in descending order
  if len(sorted_index)>=2:
    second_highest_sim_score=similarity_scores[sorted_index[1]]
    #print(second_highest_sim_score)
    if (second_highest_sim_score >= similarity_scores_threshold):
      target_headlines_1.append(row['headline'])
      similar_headlines=row['similar_headline']
      target_headlines_2.append(similar_headlines[sorted_index[0]]) # first best similar
      target_headlines_3.append(similar_headlines[sorted_index[1]]) # second best similar
      contents.append(row['content'])

In [8]:
#sample record
print('news-summary: ',contents[0])
print('headlines1: ',target_headlines_1[0])
print('headlines2: ',target_headlines_2[0])
print('headlines3: ',target_headlines_3[0])


news-summary:  Taking to Instagram on Saturday, Arjun Kapoor posted a picture of himself with Janhvi Kapoor to wish the actress on her 24th birthday. In the picture, Arjun can be seen walking ahead while holding his sister's hand. "Happy birthday Janhvi...I can't promise much except like this picture you shall always have my support & hand wherever you go," Arjun wrote.
headlines1:  You shall always have my support: Arjun Kapoor on Janhvi's b'day
headlines2:  'You shall always have my support': Arjun Kapoor pens heart-warming birthday note for Janhvi
headlines3:  "You will always have my support," Arjun Kapoor writes a heartfelt birthday note for Janhvi Kapoor.


In [9]:
'''
decide threshold for min and max no-of-word-token in headline 
'''
headlines = [headline for headline in target_headlines_1] + [headline for headline in target_headlines_2] + [headline for headline in target_headlines_3]
headline_len=[len(headline.split(' '))for headline in headlines]
print('5th percentile length: ',np.quantile(headline_len, 0.05))
print('25th percentile length: ',np.quantile(headline_len, 0.25))
print('50th percentile length: ',np.quantile(headline_len, 0.50))
print('75th percentile length: ',np.quantile(headline_len, 0.75))
print('95th percentile length: ',np.quantile(headline_len, 0.95))
print('99th percentile length: ',np.quantile(headline_len, 0.99))

5th percentile length:  8.0
25th percentile length:  10.0
50th percentile length:  12.0
75th percentile length:  14.0
95th percentile length:  18.0
99th percentile length:  22.0


In [10]:
'''
creating summary-headline pair and then randomly shuffle them
'''
summary_headline_pairs=list(zip(contents,target_headlines_1, target_headlines_2, target_headlines_3))
random.shuffle(summary_headline_pairs)
len(summary_headline_pairs)

2610

In [11]:
'''
get train and test dataset
'''
train_summary_headline_pairs=summary_headline_pairs[0:2000]
#train_summary_headline_pairs=summary_headline_pairs[0:100]#just for faster testing if code flow is working fine
test_summary_headline_pairs=summary_headline_pairs[2000:]
no_of_training_records=len(train_summary_headline_pairs)

In [107]:
'''
load Bert-Model and Tokeninzer using predefined weights
distilbert-base-uncased' model is uncased: it does not make a difference between english and English. 
'''
model_class, tokenizer_class, pretrained_weights = (transformers.BertModel, transformers.BertTokenizer, "bert-base-uncased")# 'distilbert-base-uncased'

tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [141]:
'''
initialize BOS and EOS token
'''
tokenizer.bos_token = tokenizer.cls_token
tokenizer.eos_token = tokenizer.sep_token
print('SOS token id: ',tokenizer.bos_token_id)
print('EOS token id: ',tokenizer.eos_token_id)

SOS token id:  101
EOS token id:  102


In [14]:
'''
tokenize news summary and headline
'''
max_encoder_len=125
max_decoder_len=40 
tokenized_summaries = [tokenizer(summary, padding="max_length", truncation=True, max_length=max_encoder_len) for summary in contents]
tokenized_headlines_1 = [tokenizer(target_headlines, padding="max_length", truncation=True, max_length=max_decoder_len) for target_headlines in target_headlines_1]
tokenized_headlines_2 = [tokenizer(target_headlines, padding="max_length", truncation=True, max_length=max_decoder_len) for target_headlines in target_headlines_2]
tokenized_headlines_3 = [tokenizer(target_headlines, padding="max_length", truncation=True, max_length=max_decoder_len) for target_headlines in target_headlines_3]

In [15]:
summary_lengths = [len(tokenized_summary.input_ids) for tokenized_summary in tokenized_summaries]
headline_lengths_1 = [len(tokenized_headline.input_ids) for tokenized_headline in tokenized_headlines_1]
headline_lengths_2 = [len(tokenized_headline.input_ids) for tokenized_headline in tokenized_headlines_2]
headline_lengths_3 = [len(tokenized_headline.input_ids) for tokenized_headline in tokenized_headlines_3]

In [16]:
print('summary_lengths_max: ',max(summary_lengths))
print('headline1_lengths_max: ',max(headline_lengths_1))
print('headline2_lengths_max: ',max(headline_lengths_2))
print('headline3_lengths_max: ',max(headline_lengths_3))

summary_lengths_max:  125
headline1_lengths_max:  40
headline2_lengths_max:  40
headline3_lengths_max:  40


In [17]:
'''
instance of class Batch_Data represent input to the encoder decoder model for a batch
'''
class Batch_Data:
  def __init__(self, batch_ip_vector, batch_ip_length, batch_op_vector_1, batch_op_vector_2, batch_op_vector_3, batch_op_token_idxs_1, batch_op_token_idxs_2, 
               batch_op_token_idxs_3, batch_mask_1, batch_mask_2, batch_mask_3):
    self.batch_ip_vector=batch_ip_vector
    self.batch_ip_length=batch_ip_length
    self.batch_op_vector_1=batch_op_vector_1
    self.batch_op_vector_2=batch_op_vector_2
    self.batch_op_vector_3=batch_op_vector_3
    self.batch_op_token_idxs_1=batch_op_token_idxs_1
    self.batch_op_token_idxs_2=batch_op_token_idxs_2
    self.batch_op_token_idxs_3=batch_op_token_idxs_3
    self.batch_mask_1=batch_mask_1
    self.batch_mask_2=batch_mask_2
    self.batch_mask_3=batch_mask_3

In [18]:
#tokenized_summaries[0].input_ids
a=np.array([list([1,2]) for i in range(2)])
a.shape
tokenized_summaries_1=tokenized_summaries[2:4]
b=np.array([list(tokenized_summary.input_ids) for tokenized_summary in tokenized_summaries_1])
b.shape

(2, 125)

In [19]:
'''
rearrange numpy array based on input row index position
'''
def rearrange_numpy_array(np_array,pos_row_idxs):
  return np_array[pos_row_idxs, :]

In [20]:
np_array=np.array([[1,2,3],[5,6,7],[9,10,11]])
pos_row_idxs=[0,2,1]
rearrange_numpy_array(np_array,pos_row_idxs)

array([[ 1,  2,  3],
       [ 9, 10, 11],
       [ 5,  6,  7]])

In [21]:
ip_array=[42, 32, 41, 49, 46, 52, 19, 31, 44, 43, 38, 40, 53, 40, 48, 50]
sorted_idxs=np.argsort(ip_array)
print(type(sorted_idxs))
print(type(list(sorted_idxs)))
print('sorted_idxs: ',sorted_idxs)
sorted_array=[]
for indx in sorted_idxs:
  sorted_array.append(ip_array[indx])
print('sorted_array: ',sorted_array)
#sorted summary_length_batch index:  [ 6  7  1 10 11 13  2  0  9  8  4 14  3 15  5 12]

<class 'numpy.ndarray'>
<class 'list'>
sorted_idxs:  [ 6  7  1 10 11 13  2  0  9  8  4 14  3 15  5 12]
sorted_array:  [19, 31, 32, 38, 40, 40, 41, 42, 43, 44, 46, 48, 49, 50, 52, 53]


In [78]:
'''
run this cell only once.
compute BERT based representation of news summary and headline and store it on drive.
this will help in faster training, as we don't have to get bert based vector representation of headline and summary during every training iteration. 
'''
batch_start=0
batch_end=0
end = 32 #len(tokenized_summaries)#32
batch_size=16
summary_vector=None

inputs_input_ids = np.array([list(tokenized_summary.input_ids) for tokenized_summary in tokenized_summaries])
inputs_attention_mask = np.array([np.array(tokenized_summary.attention_mask) for tokenized_summary in tokenized_summaries])

outputs1_input_ids = np.array([np.array(tokenized_headline.input_ids) for tokenized_headline in tokenized_headlines_1])
outputs1_attention_mask = np.array([np.array(tokenized_headline.attention_mask) for tokenized_headline in tokenized_headlines_1])

outputs2_input_ids = np.array([np.array(tokenized_headline.input_ids) for tokenized_headline in tokenized_headlines_2])
outputs2_attention_mask = np.array([np.array(tokenized_headline.attention_mask) for tokenized_headline in tokenized_headlines_2])

outputs3_input_ids = np.array([np.array(tokenized_headline.input_ids) for tokenized_headline in tokenized_headlines_3])
outputs3_attention_mask = np.array([np.array(tokenized_headline.attention_mask) for tokenized_headline in tokenized_headlines_3])


while batch_end<end:
  batch_end=batch_start+batch_size
  if batch_end<end:
    pass #do nothing
  else:
    batch_end=end
  print('batch_start: ',batch_start,' batch_end: ',batch_end)
  summary_batch=inputs_input_ids[batch_start:batch_end]
  summary_length_batch=[np.count_nonzero(summary) for summary in summary_batch]
  #print('summary_length_batch: ',summary_length_batch)
  attention_mask_summary = inputs_attention_mask[batch_start:batch_end]

  sorted_idx_pos=list(np.argsort(summary_length_batch)) #index position of input seq length based ascending order
  sorted_idx_pos.reverse()

  summary_length_batch.sort(reverse=True)
  summary_batch=rearrange_numpy_array(summary_batch,sorted_idx_pos)
  attention_mask_summary=rearrange_numpy_array(attention_mask_summary,sorted_idx_pos)

  #print('**************summary_batch shape: ',summary_batch.shape)
  #print('**************summary_length_batch : ',summary_length_batch)
  #print('sorted summary_length_batch index: ',np.argsort(summary_length_batch))
  #break

  headline1_batch=outputs1_input_ids[batch_start:batch_end]
  headline1_batch=rearrange_numpy_array(headline1_batch,sorted_idx_pos)
  attention_mask_headline1 = outputs1_attention_mask[batch_start:batch_end]
  attention_mask_headline1=rearrange_numpy_array(attention_mask_headline1,sorted_idx_pos)

  headline2_batch=outputs2_input_ids[batch_start:batch_end]
  headline2_batch=rearrange_numpy_array(headline2_batch,sorted_idx_pos)
  attention_mask_headline2 = outputs2_attention_mask[batch_start:batch_end]
  attention_mask_headline2=rearrange_numpy_array(attention_mask_headline2,sorted_idx_pos)

  headline3_batch=outputs3_input_ids[batch_start:batch_end]
  headline3_batch=rearrange_numpy_array(headline3_batch,sorted_idx_pos)
  attention_mask_headline3 = outputs3_attention_mask[batch_start:batch_end]
  attention_mask_headline3=rearrange_numpy_array(attention_mask_headline3,sorted_idx_pos)

  summary_batch_t = torch.tensor(summary_batch) 
  attention_mask_summary_t = torch.tensor(attention_mask_summary)

  headline1_batch_t = torch.tensor(headline1_batch) 
  attention_mask_headline1_t = torch.BoolTensor(attention_mask_headline1)
  
  headline2_batch_t = torch.tensor(headline2_batch) 
  attention_mask_headline2_t = torch.BoolTensor(attention_mask_headline2)

  headline3_batch_t = torch.tensor(headline3_batch) 
  attention_mask_headline3_t = torch.BoolTensor(attention_mask_headline3)
   
  with torch.no_grad():
    last_hidden_states = model(summary_batch_t, attention_mask=attention_mask_summary_t)
  summary_batch_vector=last_hidden_states[0]
  with torch.no_grad():
    last_hidden_states = model(headline1_batch_t, attention_mask=attention_mask_headline1_t)
  headline1_batch_vector=last_hidden_states[0]
  with torch.no_grad():
    last_hidden_states = model(headline2_batch_t, attention_mask=attention_mask_headline2_t)
  headline2_batch_vector=last_hidden_states[0]
  with torch.no_grad():
    last_hidden_states = model(headline3_batch_t, attention_mask=attention_mask_headline3_t)
  headline3_batch_vector=last_hidden_states[0]

  batch_data=Batch_Data(summary_batch_vector,summary_length_batch,headline1_batch_vector,headline2_batch_vector,headline3_batch_vector,
                        headline1_batch_t,headline2_batch_t,headline3_batch_t,attention_mask_headline1_t,attention_mask_headline2_t,attention_mask_headline3_t)
  batch_file_path='/content/gdrive/My Drive/Capstone_Project/Data/Bert_vectors/one_to_many_setup/batch_'+str(batch_start)+'_'+str(batch_end)+'.pickle'
  with open(batch_file_path, 'wb') as file_handle:
    pickle.dump(batch_data, file_handle, protocol=pickle.HIGHEST_PROTOCOL)
  batch_start=batch_end

batch_start:  0  batch_end:  16
summary_length_batch:  [83, 93, 84, 76, 79, 73, 106, 94, 81, 82, 87, 85, 72, 85, 77, 75]
batch_start:  16  batch_end:  32
summary_length_batch:  [92, 88, 88, 103, 102, 102, 94, 85, 87, 66, 83, 73, 94, 99, 80, 70]


In [142]:
'''
function for providing emebded representation of BOS token at first timestep of decoding for complete batch
'''
def get_initial_decoder_ip(batch_size):
  sos_token_tensor=torch.tensor([[tokenizer.bos_token_id]])
  with torch.no_grad():
    last_hidden_states = model(sos_token_tensor)
  SOS_token_bert_vector=last_hidden_states[0]
  SOS_token_bert_vector=torch.squeeze(SOS_token_bert_vector, 0)
  decoder_input = torch.tensor([SOS_token_bert_vector.numpy() for _ in range(batch_size)])
  print('decoder_input shape: ',decoder_input.shape)
  decoder_input=decoder_input.permute(1,0,2)
  print('decoder_input shape: ',decoder_input.shape)
  return decoder_input

In [143]:
'''
just for testing
'''
get_initial_decoder_ip(32)

decoder_input shape:  torch.Size([32, 1, 768])
decoder_input shape:  torch.Size([1, 32, 768])


tensor([[[-0.7868,  0.3158,  0.1873,  ...,  0.1196,  0.4806,  0.2568],
         [-0.7868,  0.3158,  0.1873,  ...,  0.1196,  0.4806,  0.2568],
         [-0.7868,  0.3158,  0.1873,  ...,  0.1196,  0.4806,  0.2568],
         ...,
         [-0.7868,  0.3158,  0.1873,  ...,  0.1196,  0.4806,  0.2568],
         [-0.7868,  0.3158,  0.1873,  ...,  0.1196,  0.4806,  0.2568],
         [-0.7868,  0.3158,  0.1873,  ...,  0.1196,  0.4806,  0.2568]]])

In [90]:
'''
utility function for creating batch from stored BERT based news headline and summary vector 
'''
def prepare_batches():
  training_batch_location='/content/gdrive/My Drive/Capstone_Project/Data/Bert_vectors/one_to_many_setup/*'
  batch_files=glob.glob(training_batch_location)
  random.shuffle(batch_files)
  #print(batch_files)
  return batch_files 

def get_data_for_current_iteration(iteration_index,batch_files,batch_size=16):
  no_of_files_per_batch=int(batch_size/16) #every batch file on disk has 16 training records and batch size shalll be multiple of 16
  start_idx=iteration_index*no_of_files_per_batch
  end_idx=start_idx+no_of_files_per_batch
  files_for_this_iter=batch_files[start_idx:end_idx]#end_idx is exclusive and start_idx is inclusive
  print('files_for_this_iter: ',files_for_this_iter)
  batch_data=None
  for file in files_for_this_iter:
    #print('for loop')
    with open(file, 'rb') as file_handle:
      batch_data_loaded=pickle.load(file_handle)  
      #print('*************1')
    if (batch_data==None):
      #print('*************2')
      batch_data=batch_data_loaded
      #print('*************3')
    else:
      #print('*************4')
      
      batch_ip_vector=torch.vstack((batch_data.batch_ip_vector,batch_data_loaded.batch_ip_vector))
      batch_ip_length=batch_data.batch_ip_length + batch_data_loaded.batch_ip_length
      #
      batch_op_vector_1=torch.vstack((batch_data.batch_op_vector_1,batch_data_loaded.batch_op_vector_1))
      batch_op_token_idxs_1=torch.vstack((batch_data.batch_op_token_idxs_1,batch_data_loaded.batch_op_token_idxs_1))
      batch_mask_1=torch.vstack((batch_data.batch_mask_1,batch_data_loaded.batch_mask_1))
      #
      batch_op_vector_2=torch.vstack((batch_data.batch_op_vector_2,batch_data_loaded.batch_op_vector_2))
      batch_op_token_idxs_2=torch.vstack((batch_data.batch_op_token_idxs_2,batch_data_loaded.batch_op_token_idxs_2))
      batch_mask_2=torch.vstack((batch_data.batch_mask_2,batch_data_loaded.batch_mask_2))
      #
      batch_op_vector_3=torch.vstack((batch_data.batch_op_vector_3,batch_data_loaded.batch_op_vector_3))
      batch_op_token_idxs_3=torch.vstack((batch_data.batch_op_token_idxs_3,batch_data_loaded.batch_op_token_idxs_3))
      batch_mask_3=torch.vstack((batch_data.batch_mask_3,batch_data_loaded.batch_mask_3))
      #
      batch_data=Batch_Data(batch_ip_vector,batch_ip_length, batch_op_vector_1, batch_op_vector_2, batch_op_vector_3,batch_op_token_idxs_1, batch_op_token_idxs_2, 
               batch_op_token_idxs_3, batch_mask_1, batch_mask_2, batch_mask_3)
  #print('*************5')
  return batch_data

In [91]:
'''
test function for ip data generation for a given training iteration
'''
def trainIters_test():
  num_sample=32 #TODO need to be initialized prperly
  batch_size=16 #TODO need to be initialized prperly
  n_epoch=2 #TODO need to be initialized prperly
  max_target_len=44 #TODO need to be initialized prperly
  num_iteration=int(num_sample/batch_size)
  for epoch in range(n_epoch):
    print('epoch is in progress: ',epoch+1)  
    batch_files=prepare_batches()
    training_batches =[] 
    for iteration_index in range(num_iteration):
      print('*********************iteration index:',str(iteration_index),'*********************')
      # Run a training iteration with batch
      # Extract fields from batch
      batch_data=get_data_for_current_iteration(iteration_index,batch_files,batch_size)
      input_variable,lengths=batch_data.batch_ip_vector,batch_data.batch_ip_length
      target_variable_1,target_variable_2,target_variable_3=batch_data.batch_op_vector_1,batch_data.batch_op_vector_2,batch_data.batch_op_vector_3
      batch_op_token_idxs_1,batch_op_token_idxs_2,batch_op_token_idxs_3=batch_data.batch_op_token_idxs_1,batch_data.batch_op_token_idxs_2,batch_data.batch_op_token_idxs_3
      mask_1,mask_2,mask_3=batch_data.batch_mask_1,batch_data.batch_mask_2,batch_data.batch_mask_3

      input_variable=input_variable.permute(1,0,2)
      lengths = torch.tensor(lengths)
      print('input_variable: ',input_variable.shape, "type: ",type(input_variable))
      print('batch_ip_length: ',lengths.shape, "type: ",type(lengths))

      print('target_variable1: ',target_variable_1.shape, "type: ",type(target_variable_1))
      print('target_variable2: ',target_variable_2.shape, "type: ",type(target_variable_2))
      print('target_variable3: ',target_variable_3.shape, "type: ",type(target_variable_3))

      print('mask1: ',mask_1.shape, "type: ",type(mask_1))
      print('mask2: ',mask_2.shape, "type: ",type(mask_2))
      print('mask3: ',mask_3.shape, "type: ",type(mask_3))
      
      

In [92]:
'''
test ip data generation for a given training iteration
'''
trainIters_test()

epoch is in progress:  1
*********************iteration index: 0 *********************
files_for_this_iter:  ['/content/gdrive/My Drive/Capstone_Project/Data/Bert_vectors/one_to_many_setup/batch_0_16.pickle']
input_variable:  torch.Size([125, 16, 768]) type:  <class 'torch.Tensor'>
batch_ip_length:  torch.Size([16]) type:  <class 'torch.Tensor'>
target_variable1:  torch.Size([16, 40, 768]) type:  <class 'torch.Tensor'>
target_variable2:  torch.Size([16, 40, 768]) type:  <class 'torch.Tensor'>
target_variable3:  torch.Size([16, 40, 768]) type:  <class 'torch.Tensor'>
mask1:  torch.Size([16, 40]) type:  <class 'torch.Tensor'>
mask2:  torch.Size([16, 40]) type:  <class 'torch.Tensor'>
mask3:  torch.Size([16, 40]) type:  <class 'torch.Tensor'>
*********************iteration index: 1 *********************
files_for_this_iter:  ['/content/gdrive/My Drive/Capstone_Project/Data/Bert_vectors/one_to_many_setup/batch_16_32.pickle']
input_variable:  torch.Size([125, 16, 768]) type:  <class 'torch.

In [93]:
'''
GRU based encoder class without any embedding layer (as input will precomputed bert vector representation of news data)
'''
class Encoder(nn.Module):
  def __init__(self, embbed_dim, hidden_dim, num_layers):
       super(Encoder, self).__init__()
       #set the encoder input dimesion , embbed dimesion, hidden dimesion, and number of layers 
       self.hidden_dim = hidden_dim
       self.num_layers = num_layers
       self.embbed_dim=embbed_dim
       #intialize the GRU to take the input dimetion of embbed, and output dimention of hidden and
       #set the number of gru layers
       self.gru = nn.GRU(self.embbed_dim, self.hidden_dim, num_layers=self.num_layers)

  def forward(self, input_seq, input_lengths, hidden=None):
    print('inside encoder forward function || input_seq shape: ',input_seq.shape )
    print('inside encoder forward function || input_lengths shape: ',input_lengths.shape )
    if(hidden!=None):
      torch.set_printoptions(threshold=10000)
      print('inside encoder forward function || hidden shape: ',hidden )
    # Pack padded batch of sequences for RNN module
    
    # Forward pass through GRU
    outputs, hidden = self.gru(input_seq, hidden)
    '''
    packed = nn.utils.rnn.pack_padded_sequence(input_seq, input_lengths)
    outputs, hidden = self.gru(packed, hidden)##TODO 
    outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)
    '''
    # Unpack padding
    
    #print('inside encoder forward function || outputs shape: ',outputs.shape )
    #print('inside encoder forward function || hidden shape: ',hidden )
    #print('inside encoder forward function || outputs[:, : ,self.hidden_dim:] shape: ',outputs[:, : ,self.hidden_dim:].shape )
    # Sum bidirectional GRU outputs
    #outputs = outputs[:, :, :self.hidden_size] + outputs[:, : ,self.hidden_size:]
    # Return output and final hidden state
    print('inside encoder forward function || outputs shape: ',outputs.shape )
    print('inside encoder forward function || hidden shape: ',hidden.shape )
    return outputs, hidden   

In [94]:
'''
Luong attention layer
'''
class Attn(nn.Module):
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        if self.method not in ['dot', 'general', 'concat']:
            raise ValueError(self.method, "is not an appropriate attention method.")
        self.hidden_size = hidden_size
        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            self.v = nn.Parameter(torch.FloatTensor(hidden_size))

    def dot_score(self, hidden, encoder_output):
        return torch.sum(hidden * encoder_output, dim=2)

    def general_score(self, hidden, encoder_output):
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)

    def concat_score(self, hidden, encoder_output):
        energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh()
        return torch.sum(self.v * energy, dim=2)

    def forward(self, hidden, encoder_outputs):
        # Calculate the attention weights (energies) based on the given method
        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden, encoder_outputs)
        elif self.method == 'dot':
            attn_energies = self.dot_score(hidden, encoder_outputs)

        # Transpose max_length and batch_size dimensions
        attn_energies = attn_energies.t()

        # Return the softmax normalized probability scores (with added dimension)
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

In [95]:
'''
GRU with Luong Attn based Decoder class 
'''
class LuongAttnDecoderRNN(nn.Module):
    def __init__(self, attn_model, embbed_dim, hidden_size, output_size, num_layers=1 ):
        super(LuongAttnDecoderRNN, self).__init__()

        # Keep for reference
        self.embbed_dim=embbed_dim
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers

        # Define layers
        self.gru = nn.GRU(self.embbed_dim, self.hidden_size, self.num_layers)
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_step, last_hidden, encoder_outputs):
        # Note: we run this one step (word) at a time
        #print("decoder forward 1 embedded shape: ",embedded.shape)
        #embedded = self.embedding_dropout(embedded)
        # Forward through unidirectional GRU
        rnn_output, hidden = self.gru(input_step, last_hidden)
        #print("decoder forward 2 rnn_output shape: ",rnn_output.shape)
        #print("decoder forward 2 hidden shape: ",hidden.shape)
        # Calculate attention weights from the current GRU output
        attn_weights = self.attn(rnn_output, encoder_outputs)
        #print("decoder forward 3 attn_weights shape: ",attn_weights.shape)
        # Multiply attention weights to encoder outputs to get new "weighted sum" context vector
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))
        #print("decoder forward 4 context shape: ",context.shape)
        # Concatenate weighted context vector and GRU output using Luong eq. 5
        rnn_output = rnn_output.squeeze(0)
        context = context.squeeze(1)
        #print("decoder forward 5 rnn_output.squeeze shape: ",rnn_output.shape)
        #print("decoder forward 5 context.squeeze shape: ",context.shape)
        concat_input = torch.cat((rnn_output, context), 1)
        concat_output = torch.tanh(self.concat(concat_input))
        #print("decoder forward 6 concat_input shape: ",concat_input.shape)
        #print("decoder forward 6 concat_output shape: ",concat_output.shape)        
        # Predict next word using Luong eq. 6
        output = self.out(concat_output)
        #print("decoder forward 7 output shape: ",output.shape)
        output = F.softmax(output, dim=1)
        #print("decoder forward 8 output shape: ",output.shape)
        # {Return word2 output} {and output} {output} and final hidden state
        return output, hidden

In [96]:
'''
loss function that calculates the average negative log likelihood of the elements that correspond to a 1 in the mask tensor
'''
def maskNLLLoss(inp, target, mask):
    nTotal = mask.sum()
    crossEntropy = -torch.log(torch.gather(inp, 1, target.view(-1, 1)).squeeze(1))
    loss = crossEntropy.masked_select(mask).mean()
    loss = loss.to(device)
    return loss, nTotal.item()

In [97]:
'''
function for performing a single training iteration
'''
def train(input_variable, lengths, target_variable1, target_variable2, target_variable3, target_op_token_idxs1, target_op_token_idxs2, target_op_token_idxs3, mask1, mask2,mask3, 
          max_target_len, encoder, decoder1, decoder2, decoder3,
          encoder_optimizer, decoder_optimizer_1, decoder_optimizer_2, decoder_optimizer_3, batch_size, clip, decoder_ip_initial, teacher_forcing_ratio=1):
    # Zero gradients             
    encoder_optimizer.zero_grad()
    decoder_optimizer_1.zero_grad()
    decoder_optimizer_2.zero_grad()
    decoder_optimizer_3.zero_grad()

    # Set device options
    input_variable = input_variable.to(device)
    target_variable1 = target_variable1.to(device)
    target_variable2 = target_variable2.to(device)
    target_variable3 = target_variable3.to(device)
    mask1 = mask1.to(device)
    mask2 = mask2.to(device)
    mask3 = mask3.to(device)

    # Lengths for rnn packing should always be on the cpu
    lengths = lengths.to("cpu")

    # Initialize variables
    loss = 0
    print_losses = []
    n_totals = 0

    # Forward pass through encoder
    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

    # Create initial decoder input (start with SOS tokens for each sentence)
    print('inside train function decoder_ip_initial shape: ',decoder_ip_initial.shape)
    decoder_input = decoder_ip_initial#sos token for all the training sample in a given batch
    decoder1_input = decoder2_input = decoder3_input = decoder_input.to(device)

    # Set initial decoder hidden state to the encoder's final hidden state
    decoder1_hidden = decoder2_hidden = decoder3_hidden = encoder_hidden[:decoder1.num_layers] #this important and handle scenario where no of layer for GRU varries in encoder decoder
    
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False #TODO need to work on this
    
    max_target_len=10 # TODO we can get this value from actual op seq 
    
   
    if use_teacher_forcing:
        #iterate through timesteps for decoder 
        print('teacher forcing will be used for this batch')
        for timestep in range(1,max_target_len):            
            #print('before decoder forward pass: ')
            #print('decoder_input shape: ',decoder_input.shape)
            #print('decoder_hidden shape: ',decoder_hidden.shape)
            #print('encoder_outputs shape: ',encoder_outputs.shape)
            print('timestep: ',timestep,' inside train function1: ',decoder_input.shape)
            decoder1_output, decoder1_hidden = decoder1(decoder1_input, decoder1_hidden, encoder_outputs) #(decoder_input, decoder_hidden, encoder_outputs)
            decoder2_output, decoder2_hidden = decoder2(decoder2_input, decoder2_hidden, encoder_outputs)
            decoder3_output, decoder3_hidden = decoder3(decoder3_input, decoder3_hidden, encoder_outputs)

            # Teacher forcing: next input is current target
            # print('timestep: ',timestep,' inside train function2: ',decoder_input.shape)
            decoder1_input = torch.unsqueeze(target_variable1[timestep],0)#[1,64] use next timestamp token from target seq  as ip to decoder at nexe time step
            decoder2_input = torch.unsqueeze(target_variable2[timestep],0)
            decoder3_input = torch.unsqueeze(target_variable3[timestep],0)

            #print('target_variable[timestep] shape: ',target_variable[timestep].shape)#[64]
            #print('target_variable[timestep].view(1, -1) shape: ',target_variable[timestep].view(1, -1).shape)#[1,64]
            # Calculate and accumulate loss
            mask_loss1, nTotal1 = maskNLLLoss(decoder1_output, target_op_token_idxs1[timestep], mask1[timestep])
            mask_loss2, nTotal2 = maskNLLLoss(decoder2_output, target_op_token_idxs2[timestep], mask2[timestep])
            mask_loss3, nTotal3 = maskNLLLoss(decoder3_output, target_op_token_idxs3[timestep], mask3[timestep])

            mask_loss=mask_loss1 + mask_loss2 + mask_loss3
            nTotal=nTotal1+nTotal2+nTotal3
            loss += mask_loss
            #print('mask_loss1: ',mask_loss1)
            #print('mask_loss2: ',mask_loss2)
            #print('mask_loss3: ',mask_loss3)
            #print('type of mask_loss1: ',type(mask_loss1))
            #print('timestep : ',timestep,' mask_loss: ',mask_loss,' loss: ',loss)
            
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal
            non_padding_token_count=(mask1[timestep+1].sum()+mask1[timestep+1].sum()+mask1[timestep+1].sum())#non_padding_token_count for next timestep
            #print('non_padding_token_count: ',non_padding_token_count)
            if(non_padding_token_count==0):#all tokens are padding token for next timestep for all records in batches 
              break
    else:
        print('teacher forcing won\'t be used for this batch')
        for timestep in range(max_target_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # No teacher forcing: next input is decoder's own current output
            _, topi = decoder_output.topk(1)
            decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
            decoder_input = decoder_input.to(device)
            # Calculate and accumulate loss
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal

    # Perform backpropatation
    print('before calling backward pass')
    loss.backward()

    # Clip gradients: gradients are modified in place
    _ = nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = nn.utils.clip_grad_norm_(decoder1.parameters(), clip)
    _ = nn.utils.clip_grad_norm_(decoder2.parameters(), clip)
    _ = nn.utils.clip_grad_norm_(decoder3.parameters(), clip)

    # Adjust model weights
    encoder_optimizer.step()
    decoder_optimizer_1.step()
    decoder_optimizer_2.step()
    decoder_optimizer_3.step()

    return sum(print_losses) / n_totals

In [98]:
'''
function responsible for running n epoch of training given the passed models, optimizers, data etc.
'''
def trainIters(n_epoch, batch_size, num_sample, max_target_len, encoder, decoder1, decoder2, decoder3,
          encoder_optimizer, decoder_optimizer_1, decoder_optimizer_2, decoder_optimizer_3,  decoder_ip_initial, print_every, clip):
  num_iteration=int(num_sample/batch_size)    
  num_iteration=2 #TODO need to commented just for testing if complete training code is working fine
  for epoch in range(n_epoch):
    print('epoch is in progress: ',epoch+1)  
    batch_files=prepare_batches()
    print_loss=0
    for iteration_index in range(num_iteration):
      print('*********************iteration index:',str(iteration_index),'*********************')
      # Run a training iteration with batch
      # Extract fields from batch
      batch_data=get_data_for_current_iteration(iteration_index,batch_files,batch_size)
      print('1****************')
      input_variable,lengths=batch_data.batch_ip_vector,batch_data.batch_ip_length
      target_variable_1,target_variable_2,target_variable_3=batch_data.batch_op_vector_1,batch_data.batch_op_vector_2,batch_data.batch_op_vector_3
      target_op_token_idxs_1,target_op_token_idxs_2,target_op_token_idxs_3=batch_data.batch_op_token_idxs_1,batch_data.batch_op_token_idxs_2,batch_data.batch_op_token_idxs_3
      mask_1,mask_2,mask_3=batch_data.batch_mask_1,batch_data.batch_mask_2,batch_data.batch_mask_3

      input_variable=input_variable.permute(1,0,2)
      lengths = torch.tensor(lengths)
      print('input_variable shape: ',input_variable.shape)
      print('lengths: ',lengths)

      target_variable_1=target_variable_1.permute(1,0,2)
      target_variable_2=target_variable_2.permute(1,0,2)
      target_variable_3=target_variable_3.permute(1,0,2)
      print('target_variable shape: ',(target_variable_1.shape, target_variable_2.shape, target_variable_3.shape))


      target_op_token_idxs_1=target_op_token_idxs_1.permute(1,0)
      target_op_token_idxs_2=target_op_token_idxs_2.permute(1,0)
      target_op_token_idxs_3=target_op_token_idxs_3.permute(1,0)
      print('target_op_token_idxs shape: ',(target_op_token_idxs_1.shape, target_op_token_idxs_2.shape, target_op_token_idxs_3.shape))

      mask_1=mask_1.permute(1,0)
      mask_2=mask_2.permute(1,0)
      mask_3=mask_3.permute(1,0)
      print('mask_1 shape: ',(mask_1.shape, mask_2.shape, mask_3.shape))

      #print('input_variable: ',input_variable.shape, "type: ",type(input_variable))
      #print('target_variable: ',target_variable.shape, "type: ",type(target_variable))
      #print('target_op_token_idxs: ',target_op_token_idxs.shape, "type: ",type(target_op_token_idxs))
      #torch.set_printoptions(profile="full")
      #print('target_op_token_idxs: ',target_op_token_idxs)
      #torch.set_printoptions(profile="default") # reset#
      #break
      #print('mask: ',mask.shape, "type: ",type(mask))
      #print('batch_ip_length: ',lengths.shape, "type: ",type(lengths))
      
      print('inside trainIters before calling train')
      loss = train(input_variable, lengths, target_variable_1, target_variable_2, target_variable_3, target_op_token_idxs_1, target_op_token_idxs_2, target_op_token_idxs_3, mask_1, mask_2, mask_3, 
          max_target_len, encoder, decoder1, decoder2, decoder3,
          encoder_optimizer, decoder_optimizer_1, decoder_optimizer_2, decoder_optimizer_3, batch_size, clip, decoder_ip_initial, teacher_forcing_ratio=1)

      print('inside trainIters after calling train')
      print_loss += loss
      print_every=1 #print after every 10 iteration
      # Print training progress
      if iteration_index % print_every == 0: 
        print_loss_avg = print_loss / print_every
        print('Epoch: ',epoch+1,' Iteration: ',iteration_index,' avg_loss: ',print_loss_avg)
        print_loss = 0

In [99]:
'''
Configure training and optimization parameter
'''
encoder_n_layers=1 
decoder_n_layers=1

clip = 50.0
teacher_forcing_ratio = 1.0
learning_rate = 0.0001
decoder_learning_ratio = 5.0
n_epoch = 1 #10
print_every = 10
save_every = 500

##
num_sample = 32 #padded_summaries.shape
max_target_len = 40 #headline_max_len


##
vocab_len=30522 #
output_size=30521 # BERT vocab_size  30522
embed_size = 768
hidden_size = 256
batch_size = 16
#num_iteration = 100000


# Initialize encoder & decoder models
encoder = Encoder(embed_size, hidden_size, encoder_n_layers)
attn_model = 'dot'
decoder_1 = LuongAttnDecoderRNN(attn_model, embed_size, hidden_size, output_size, decoder_n_layers)
decoder_2 = LuongAttnDecoderRNN(attn_model, embed_size, hidden_size, output_size, decoder_n_layers)
decoder_3 = LuongAttnDecoderRNN(attn_model, embed_size, hidden_size, output_size, decoder_n_layers)

# Ensure dropout layers are in train mode
encoder.train()
decoder_1.train()
decoder_2.train()
decoder_3.train()

# Initialize optimizers
print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer_1 = optim.Adam(decoder_1.parameters(), lr=learning_rate * decoder_learning_ratio)
decoder_optimizer_2 = optim.Adam(decoder_2.parameters(), lr=learning_rate * decoder_learning_ratio)
decoder_optimizer_3 = optim.Adam(decoder_3.parameters(), lr=learning_rate * decoder_learning_ratio)

decoder_ip_initial=get_initial_decoder_ip(batch_size)

Building optimizers ...
decoder_input shape:  torch.Size([16, 1, 768])
decoder_input shape:  torch.Size([1, 16, 768])


In [100]:
'''
train the model for specified no of epoch
'''
trainIters(n_epoch, batch_size, num_sample, max_target_len, encoder, decoder_1, decoder_2, 
           decoder_3, encoder_optimizer, decoder_optimizer_1, decoder_optimizer_2, decoder_optimizer_3,  decoder_ip_initial, print_every, clip)

epoch is in progress:  1
*********************iteration index: 0 *********************
files_for_this_iter:  ['/content/gdrive/My Drive/Capstone_Project/Data/Bert_vectors/one_to_many_setup/batch_16_32.pickle']
1****************
input_variable shape:  torch.Size([125, 16, 768])
lengths:  tensor([103, 102, 102,  99,  94,  94,  92,  88,  88,  87,  85,  83,  80,  73,
         70,  66])
target_variable shape:  (torch.Size([40, 16, 768]), torch.Size([40, 16, 768]), torch.Size([40, 16, 768]))
target_op_token_idxs shape:  (torch.Size([40, 16]), torch.Size([40, 16]), torch.Size([40, 16]))
mask_1 shape:  (torch.Size([40, 16]), torch.Size([40, 16]), torch.Size([40, 16]))
inside trainIters before calling train
inside encoder forward function || input_seq shape:  torch.Size([125, 16, 768])
inside encoder forward function || input_lengths shape:  torch.Size([16])
inside encoder forward function || outputs shape:  torch.Size([125, 16, 256])
inside encoder forward function || hidden shape:  torch.Size

In [162]:
'''
greedy search decoder
'''
def greedy_decode(decoder1,decoder2, decoder3, decoder_hidden, encoder_outputs, max_seq_len):
        '''
        :param target_tensor: target indexes tensor of shape [B, T] where B is the batch size and T is the maximum length of the output sentence
        :param decoder_hidden: input tensor of shape [1, B, H] for start of the decoding
        :param encoder_outputs: if you are using attention mechanism you can pass encoder outputs, [T, B, H] where T is the maximum length of input sentence
        :return: decoded_batch
        '''
        batch_size = encoder_outputs.shape[1]

        ##decoded_batch = torch.zeros((batch_size, max_seq_len)).int()
        decoded_batch_1 = torch.zeros((batch_size, max_seq_len)).int()
        decoded_batch_2 = torch.zeros((batch_size, max_seq_len)).int()
        decoded_batch_3 = torch.zeros((batch_size, max_seq_len)).int()
        #print((decoded_batch.dtype))
        # decoder_input = torch.LongTensor([[EN.vocab.stoi['<sos>']] for _ in range(batch_size)]).cuda()
        #decoder_input = Variable(trg.data[0, :])  # sos
        decoder_input = get_initial_decoder_ip(batch_size)
        decoder1_input = decoder2_input =  decoder3_input = decoder_input
        decoder1_hidden = decoder2_hidden = decoder3_hidden = decoder_hidden
        print('1. inside greedy_decode decoder_input shape : ',decoder1_input.shape)
        for timestep in range(max_seq_len):
            ##decoder_output, decoder_hidden= decoder1(decoder_input, decoder_hidden, encoder_outputs)
            decoder1_output, decoder1_hidden= decoder2(decoder1_input, decoder1_hidden, encoder_outputs)
            decoder2_output, decoder2_hidden= decoder3(decoder2_input, decoder2_hidden, encoder_outputs)
            decoder3_output, decoder3_hidden= decoder3(decoder3_input, decoder3_hidden, encoder_outputs)
            print('2. inside greedy_decode decoder_output shape : ',decoder1_output.shape)# [1, 30521]
            print('3. inside greedy_decode decoder_hidden shape : ',decoder1_hidden.shape)
            ##topv, topi = decoder_output.data.topk(1)  
            d1_topv, d1_topi = decoder1_output.data.topk(1)  
            d2_topv, d2_topi = decoder2_output.data.topk(1)  
            d3_topv, d3_topi = decoder3_output.data.topk(1)  
            print('topv, topi: ',(d1_topv, d1_topi))
            ##topi = topi.view(-1)
            d1_topi =d1_topi.view(-1)
            d2_topi = d2_topi.view(-1)
            d3_topi = d3_topi.view(-1)
            ##decoded_batch[:, timestep] = topi   
            decoded_batch_1[:, timestep] = d1_topi   
            decoded_batch_2[:, timestep] = d2_topi   
            decoded_batch_3[:, timestep] = d3_topi   
            ##attention_mask_decoded_batch = torch.where(decoded_batch != 0, 1, 0)        
            d1_attention_mask_decoded_batch = torch.where(decoded_batch_1 != 0, 1, 0)   
            d2_attention_mask_decoded_batch = torch.where(decoded_batch_2 != 0, 1, 0)   
            d3_attention_mask_decoded_batch = torch.where(decoded_batch_3 != 0, 1, 0)   

            ##decoder_input = topi.detach().view(-1)
            decoder1_input = d1_topi.detach().view(-1)
            decoder2_input = d2_topi.detach().view(-1)
            decoder3_input = d3_topi.detach().view(-1)
            print('4 inside greedy_decode decoder_input 2 : ',decoder1_input)
            ##decoder_input = torch.unsqueeze(decoder_input, 0)
            decoder1_input = torch.unsqueeze(decoder1_input, 0)
            decoder2_input = torch.unsqueeze(decoder2_input, 0)
            decoder3_input = torch.unsqueeze(decoder3_input, 0)
            print('5 inside greedy_decode decoder_input 3 : ',decoder1_input)
            with torch.no_grad():
              ##last_hidden_states = model(decoded_batch, attention_mask=attention_mask_decoded_batch)
              d1_last_hidden_states = model(decoded_batch_1, attention_mask=d1_attention_mask_decoded_batch)
              d2_last_hidden_states = model(decoded_batch_2, attention_mask=d2_attention_mask_decoded_batch)
              d3_last_hidden_states = model(decoded_batch_3, attention_mask=d3_attention_mask_decoded_batch)
            ##decoded_batch_vector=last_hidden_states[0]#(batch_size*max_op_len*768)
            d1_decoded_batch_vector=d1_last_hidden_states[0]#(batch_size*max_op_len*768)
            d2_decoded_batch_vector=d2_last_hidden_states[0]#(batch_size*max_op_len*768)
            d3_decoded_batch_vector=d3_last_hidden_states[0]#(batch_size*max_op_len*768)

            ##decoder_input=torch.unsqueeze(decoded_batch_vector.permute(1,0,2)[timestep],0)
            decoder1_input=torch.unsqueeze(d1_decoded_batch_vector.permute(1,0,2)[timestep],0)
            decoder2_input=torch.unsqueeze(d2_decoded_batch_vector.permute(1,0,2)[timestep],0)
            decoder3_input=torch.unsqueeze(d3_decoded_batch_vector.permute(1,0,2)[timestep],0)
            print('6 inside greedy_decode decoder_input 5 : ',decoder1_input.shape)
        return (decoded_batch_1,decoded_batch_2,decoded_batch_3)

In [167]:
'''
function for generating op sequence for a given input seq
'''
MAX_LENGTH=10
def evaluate(summary_batch_vector, summary_length_batch, encoder, decoder1, decoder2, decoder3, max_length=MAX_LENGTH):
  input_variable=summary_batch_vector.permute(1,0,2)
  lengths = torch.tensor(summary_length_batch)
  encoder_outputs, encoder_hidden = encoder(input_variable, lengths)
  encoder_hidden = encoder_hidden[:decoder1.num_layers] 
  print('*************encoder forward pass complted***************')
  decoded_batch_1,decoded_batch_2,decoded_batch_3=greedy_decode(decoder1, decoder2, decoder3, encoder_hidden, encoder_outputs, max_length)

  d1_decoded_words = [tokenizer.decode(token_idx) for token_idx in decoded_batch_1[0]]
  d2_decoded_words = [tokenizer.decode(token_idx) for token_idx in decoded_batch_2[0]]
  d3_decoded_words = [tokenizer.decode(token_idx) for token_idx in decoded_batch_3[0]]
  print('d1_decoded_words: ',d1_decoded_words)
  print('d2_decoded_words: ',d2_decoded_words)
  print('d3_decoded_words: ',d3_decoded_words)
  return (d1_decoded_words,d2_decoded_words,d3_decoded_words)

In [168]:
'''
generate sequence for a given input news summary using 'greedy-search' 
'''
news_summary = contents[0]
ref_headline1 = target_headlines_1[0]
ref_headline2 = target_headlines_2[0]
ref_headline3 = target_headlines_3[0]

print("article content: ")
print(news_summary)
print("reference headline: ",)
print(ref_headline1)
print(ref_headline2)
print(ref_headline3)

test_contents=contents[0:5]
test_target_headlines_1=target_headlines_1[0:5]
test_target_headlines_2=target_headlines_2[0:5]
test_target_headlines_3=target_headlines_3[0:5]

tokenized_test_summaries = [tokenizer(summary, padding="max_length", truncation=True, max_length=max_encoder_len) for summary in test_contents]


summary_batch = np.array([list(tokenized_test_summary.input_ids) for tokenized_test_summary in tokenized_test_summaries])
summary_attention_mask = np.array([np.array(tokenized_test_summary.attention_mask) for tokenized_test_summary in tokenized_test_summaries])

summary_length_batch=[np.count_nonzero(summary) for summary in summary_batch]

#sort news summary based their no of tokens
sorted_idx_pos=list(np.argsort(summary_length_batch)) #index position of input seq length based ascending order
sorted_idx_pos.reverse() #to get descending order

summary_length_batch.sort(reverse=True)
summary_batch = rearrange_numpy_array(summary_batch,sorted_idx_pos)
summary_attention_mask = rearrange_numpy_array(summary_attention_mask,sorted_idx_pos)

'''
tokenized_test_headlines_1 = [tokenizer(target_headlines, padding="max_length", truncation=True, max_length=max_decoder_len) for target_headlines in test_target_headlines_1]
tokenized_test_headlines_2 = [tokenizer(target_headlines, padding="max_length", truncation=True, max_length=max_decoder_len) for target_headlines in test_target_headlines_2]
tokenized_test_headlines_3 = [tokenizer(target_headlines, padding="max_length", truncation=True, max_length=max_decoder_len) for target_headlines in test_target_headlines_3]

test_outputs1_input_ids = np.array([np.array(tokenized_headline.input_ids) for tokenized_headline in tokenized_test_headlines_1])
test_outputs1_attention_mask = np.array([np.array(tokenized_headline.attention_mask) for tokenized_headline in tokenized_test_headlines_1])
test_outputs1_input_ids=rearrange_numpy_array(test_outputs1_input_ids,sorted_idx_pos)
test_outputs1_attention_mask=rearrange_numpy_array(test_outputs1_attention_mask,sorted_idx_pos)

test_outputs2_input_ids = np.array([np.array(tokenized_headline.input_ids) for tokenized_headline in tokenized_test_headlines_2])
test_outputs2_attention_mask = np.array([np.array(tokenized_headline.attention_mask) for tokenized_headline in tokenized_test_headlines_2])
test_outputs2_input_ids=rearrange_numpy_array(test_outputs2_input_ids,sorted_idx_pos)
test_outputs2_attention_mask=rearrange_numpy_array(test_outputs2_attention_mask,sorted_idx_pos)

test_outputs3_input_ids = np.array([np.array(tokenized_headline.input_ids) for tokenized_headline in tokenized_test_headlines_3])
test_outputs3_attention_mask = np.array([np.array(tokenized_headline.attention_mask) for tokenized_headline in tokenized_test_headlines_3])
test_outputs3_input_ids=rearrange_numpy_array(test_outputs3_input_ids,sorted_idx_pos)
test_outputs3_attention_mask=rearrange_numpy_array(test_outputs3_attention_mask,sorted_idx_pos)
'''
###########

summary_batch_t = torch.tensor(summary_batch) 
summary_attention_mask_t = torch.tensor(summary_attention_mask)

with torch.no_grad():
  last_hidden_states = model(summary_batch_t, attention_mask=summary_attention_mask_t)
summary_batch_vector=last_hidden_states[0]

print('summary_batch_vector shape: ',summary_batch_vector.shape)
print('lengths: ',summary_length_batch)

'''
summary_batch=summary_batch.permute(1,0,2)
summary_length_batch = torch.tensor(summary_length_batch)
target_variable_1=target_variable_1.permute(1,0,2)
target_variable_2=target_variable_2.permute(1,0,2)
target_variable_3=target_variable_3.permute(1,0,2)
print('target_variable shape: ',(target_variable_1.shape, target_variable_2.shape, target_variable_3.shape))
'''
MAX_LENGTH=10 # max target length of generated target seq
evaluate(summary_batch_vector, summary_length_batch, encoder, decoder_1, decoder_2, decoder_3,MAX_LENGTH)
#



article content: 
Taking to Instagram on Saturday, Arjun Kapoor posted a picture of himself with Janhvi Kapoor to wish the actress on her 24th birthday. In the picture, Arjun can be seen walking ahead while holding his sister's hand. "Happy birthday Janhvi...I can't promise much except like this picture you shall always have my support & hand wherever you go," Arjun wrote.
reference headline: 
You shall always have my support: Arjun Kapoor on Janhvi's b'day
'You shall always have my support': Arjun Kapoor pens heart-warming birthday note for Janhvi
"You will always have my support," Arjun Kapoor writes a heartfelt birthday note for Janhvi Kapoor.
summary_batch_vector shape:  torch.Size([5, 125, 768])
lengths:  [93, 84, 83, 79, 76]
inside encoder forward function || input_seq shape:  torch.Size([125, 5, 768])
inside encoder forward function || input_lengths shape:  torch.Size([5])
inside encoder forward function || outputs shape:  torch.Size([125, 5, 256])
inside encoder forward functio

(['c h i l l y',
  'c o m m i t s',
  'm a n n e r s',
  'c o m m i t s',
  'c o m m i t s',
  'c o m m i t s',
  'c o m m i t s',
  'c o m m i t s',
  'p e r s o n s',
  'p e r s o n s'],
 ['# # a t i o n s',
  'd o m',
  'd o m',
  '☆',
  'd o m',
  'p d f',
  'p d f',
  '# # s a u',
  'i n t e l l e c t',
  'i n t e l l e c t'],
 ['# # a t i o n s',
  'd o m',
  'd o m',
  '☆',
  'd o m',
  'p d f',
  'p d f',
  '# # s a u',
  'i n t e l l e c t',
  'i n t e l l e c t'])