In [0]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as f
import csv
import random
import re
import os
import unicodedata
import codecs
import itertools

In [0]:
# Pick the GPU when PyTorch can see one; otherwise fall back to the CPU.
CUDA = torch.cuda.is_available()
device = torch.device("cuda") if CUDA else torch.device("cpu")

In [3]:
# Show which device was selected in the previous cell.
device

device(type='cuda')

In [7]:
from zipfile import ZipFile

file = "data.zip"

# The archive is bound to `archive`, not `zip`: using `zip` as the name would
# rebind the built-in zip() for the rest of the notebook session.
with ZipFile(file,'r') as archive:
    # unpack every member into the current working directory
    archive.extractall()
    print('done')

done


In [0]:
# Paths of the two corpus files used below: the utterance file (one line of
# dialogue per row) and the conversation file (one list of line ids per row).
lines_filepath, con_filepath = (
    os.path.join("cornell movie-dialogs corpus", filename)
    for filename in ("movie_lines.txt", "movie_conversations.txt")
)

In [9]:
#visualize some lines
# The file is opened in binary mode ('rb') so that no text decoding happens at all.
# Per the original note, a text-mode read failed with "can't decode byte 0xad",
# i.e. the file is presumably not valid UTF-8; the parsing cell below decodes it explicitly.
with open(lines_filepath,'rb') as file:
    lines =file.readlines()
# print the first 8 raw rows (bytes), with the trailing newline stripped
for line in lines[:8]:
    print(line.strip())

b'L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!'
b'L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!'
b'L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.'
b'L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?'
b"L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go."
b'L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow'
b"L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie."
b'L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No'


In [0]:
#split each line of the file into a dictionary of fields (lineId,characterId,movieId,character,text)

line_fields = ["lineId","characterId","movieId","character","text"]
lines ={}
# Decode as iso-8859-1 (latin-1): every byte value maps to a character, so decoding cannot fail.
# The handle is named `lines_file` rather than `f` because `f` is this notebook's alias for
# torch.nn.functional (see the import cell); rebinding it here would break later `f.softmax(...)`.
with open(lines_filepath,'r',encoding='iso-8859-1') as lines_file:
    for line in lines_file:
        # fields are separated by the literal marker ' +++$+++ '
        values = line.split(' +++$+++ ')
        #extract fields
        lineObj = {}
        for i, field in enumerate(line_fields):
            #lineObj maps each name in line_fields to the matching column of this row
            lineObj[field] = values[i]
        #key of the lines dictionary is the lineId, value is the whole lineObj
        # (the 'text' field keeps its trailing '\n'; it is stripped when pairs are built)
        lines[lineObj["lineId"]] = lineObj

In [11]:
#1st element of lines dictionary
l = lines
list(l.items())[0]

('L1045',
 {'character': 'BIANCA',
  'characterId': 'u0',
  'lineId': 'L1045',
  'movieId': 'm0',
  'text': 'They do not!\n'})

In [12]:
# Look up a single parsed utterance by its line id.
lines['L194']

{'character': 'BIANCA',
 'characterId': 'u0',
 'lineId': 'L194',
 'movieId': 'm0',
 'text': 'Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\n'}

In [13]:
#visualize some conversation file
# Decode with iso-8859-1, the same codec the parsing cells use, instead of the platform
# default, so this preview cannot fail on a machine whose default codec rejects a byte.
with open(con_filepath,'r',encoding='iso-8859-1') as file:
    convs =file.readlines()
# print the first 8 rows with the trailing newline stripped
for line in convs[:8]:
    print(line.strip())

u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L200', 'L201', 'L202', 'L203']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L204', 'L205', 'L206']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L207', 'L208']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L271', 'L272', 'L273', 'L274', 'L275']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L276', 'L277']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L280', 'L281']


In [0]:
#split each conversation of the file into a dictionary of fields ("character1Id","character2Id","movieId","utteranceIds")

conv_fields = ["character1Id","character2Id","movieId","utteranceIds"]
conversations =[]
# The handle is named `conv_file`, not `f`: `f` is this notebook's alias for
# torch.nn.functional and must not be rebound to a file object.
with open(con_filepath,'r',encoding='iso-8859-1') as conv_file:
    for line in conv_file:
        # fields are separated by the literal marker ' +++$+++ '
        values = line.split(' +++$+++ ')
        convObj = {}
        for i, field in enumerate(conv_fields):
            convObj[field] = values[i]
        # utteranceIds looks like "['L194', 'L195']\n". Extract the ids with a regex
        # instead of eval(), which would execute arbitrary code read from the file.
        lineIds = re.findall(r"L[0-9]+", convObj["utteranceIds"])
        # attach the full line dictionaries, in conversation order
        convObj["lines"] = [lines[lineId] for lineId in lineIds]
        conversations.append(convObj)


In [15]:
# Inspect the first parsed conversation, including its attached line dictionaries.
conversations[0]

{'character1Id': 'u0',
 'character2Id': 'u2',
 'lines': [{'character': 'BIANCA',
   'characterId': 'u0',
   'lineId': 'L194',
   'movieId': 'm0',
   'text': 'Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\n'},
  {'character': 'CAMERON',
   'characterId': 'u2',
   'lineId': 'L195',
   'movieId': 'm0',
   'text': "Well, I thought we'd start with pronunciation, if that's okay with you.\n"},
  {'character': 'BIANCA',
   'characterId': 'u0',
   'lineId': 'L196',
   'movieId': 'm0',
   'text': 'Not the hacking and gagging and spitting part.  Please.\n'},
  {'character': 'CAMERON',
   'characterId': 'u2',
   'lineId': 'L197',
   'movieId': 'm0',
   'text': "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?\n"}],
 'movieId': 'm0',
 'utteranceIds': "['L194', 'L195', 'L196', 'L197']\n"}

In [0]:
# Build [question, answer] pairs: every utterance is paired with the one that follows it
# inside the same conversation.
qa_pairs = []
for conversation in conversations:
  # stripped text of each utterance, in conversation order
  utterances = [entry['text'].strip() for entry in conversation['lines']]
  for idx in range(1, len(utterances)):
    question = utterances[idx - 1]
    answer = utterances[idx]
    # skip the pair when either side is empty after stripping
    if not question or not answer:
      continue
    qa_pairs.append([question, answer])

In [17]:
# first question/answer pair
print(qa_pairs[0])
# total number of question/answer pairs (the printed label says "coversations",
# but the value is len(qa_pairs))
print('length of coversations = {}'.format(len(qa_pairs)))

['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.', "Well, I thought we'd start with pronunciation, if that's okay with you."]
length of coversations = 221282


In [18]:
# Create a new file in which every row holds one question and its answer, separated by a tab.

datafile = os.path.join("cornell movie-dialogs corpus","formatted_movie_lines.txt")
# '\t' is already a real tab character, so no unicode_escape decoding is needed
delimiter = '\t' #tab

#now write into newly created file
print('start writing into the file...')

# newline='' is what the csv documentation prescribes for files handed to csv.writer:
# the writer emits its own '\r\n' row terminator and the file object must not translate
# it a second time (on Windows the default translation would produce '\r\r\n').
with open (datafile, 'w', encoding='utf-8', newline='') as outputfile:
  writer  = csv.writer(outputfile, delimiter=delimiter)
  # one row per [question, answer] pair
  writer.writerows(qa_pairs)
print('\ndone writing')


start writing into the file...

done writing


In [19]:
#visualize some lines inside 'formatted_movie_lines.txt' file

datafile = os.path.join("cornell movie-dialogs corpus","formatted_movie_lines.txt")

# binary mode shows the raw bytes, including the tab separator and the row terminator
with open(datafile,'rb') as file:
  lines = file.readlines()
# first 8 rows: each row holds a question and an answer separated by '\t',
# and each row ends with '\r\n' (the csv writer's default row terminator)
for line in lines[:8]:
  print(line)

b"Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\tWell, I thought we'd start with pronunciation, if that's okay with you.\r\n"
b"Well, I thought we'd start with pronunciation, if that's okay with you.\tNot the hacking and gagging and spitting part.  Please.\r\n"
b"Not the hacking and gagging and spitting part.  Please.\tOkay... then how 'bout we try out some French cuisine.  Saturday?  Night?\r\n"
b"You're asking me out.  That's so cute. What's your name again?\tForget it.\r\n"
b"No, no, it's my fault -- we didn't have a proper introduction ---\tCameron.\r\n"
b"Cameron.\tThe thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\r\n"
b"The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\tSeems like she could get a date easy enough...\r\n"
b'Why?\tUnsolved myster

# processing the words

Each sentence is encoded with three special tokens:

* PAD token: any sentence shorter than the longest one is padded with this token (index 0) so that all sentences have the same length
* Start-of-sentence token (SOS token): indicates the start of a sentence
* End-of-sentence token (EOS token): indicates the end of a sentence



NOTE:
The addword() method is called at two different stages:
>>> * 1st stage: from addSentence(), to register every word of a sentence and count its number of occurrences
>>> * 2nd stage: from trim(), to decide which words are important enough to stay in the dictionary: a word is kept if its number of occurrences is at least min_count (the threshold) and removed otherwise

In [0]:
PAD_token = 0  # index used to pad short sentences
SOS_token = 1  # index of the start-of-sentence marker
EOS_token = 2  # index of the end-of-sentence marker

class vocabulary:
  """Two-way mapping between words and integer indexes, with occurrence counts.

  Indexes 0, 1 and 2 are reserved for the PAD, SOS and EOS tokens, so the first
  real word that is added receives index 3.
  """
  def __init__(self,name):
    """Create an empty vocabulary labelled `name`."""
    self.name = name
    self.word2index = {} #contain word as key and index as value e.g {'hi':3,'he':4,'maybe':5}
    self.word2count = {} #contain word as key and number of occurrences as value
    #index2word is the opposite of word2index: index as key and word as value.
    #The keys are the integer constants themselves (not the strings 'SOS_token'/'EOS_token'),
    #so that index2word[1] and index2word[2] resolve to 'SOS' and 'EOS'.
    self.index2word = {PAD_token : 'PAD', SOS_token : 'SOS', EOS_token : 'EOS'}
    #number of entries in index2word (starts at 3 because 'PAD' = 0, 'SOS' = 1 & 'EOS' = 2 already exist)
    self.num_words = 3

  def addSentence(self, sentence):
    """
    Add every whitespace-separated word of `sentence` to the vocabulary.
    sentence.split() converts the sentence into a list of words,
    e.g. input: 'hi i am ankit' ; output: ['hi','i','am','ankit']
    """
    for word in sentence.split():
      self.addword(word)

  def addword(self,word):
    """Register `word` with the next free index, or bump its count if it is already known."""
    if word not in self.word2index:
      self.word2index[word] = self.num_words #new words start at index 3 because PAD, SOS, EOS are already present
      self.word2count[word] = 1 #start counting occurrences
      self.index2word[self.num_words] = word
      self.num_words += 1 #next free index
    else:
      self.word2count[word] += 1 #count one more occurrence

  def trim(self,min_count):
    """Drop every word seen fewer than `min_count` times and re-index the rest.

    The kept words are re-added through addword(), so after trimming each kept
    word has a count of 1 and indexes are contiguous again starting at 3.
    """
    keep_words = [word for word, count in self.word2count.items() if count >= min_count]
    total = len(self.word2index)
    #an empty vocabulary must not raise ZeroDivisionError; report a ratio of 0 instead
    ratio = len(keep_words) / total if total else 0.0
    print('keep words {} / {} = {:.4f}'.format(len(keep_words), total, ratio))
    #reinitialize the dictionaries so that only the kept words remain
    self.word2index = {}
    self.word2count = {}
    self.index2word = {PAD_token : 'PAD', SOS_token : 'SOS', EOS_token : 'EOS'}
    self.num_words = 3

    for word in keep_words:
      self.addword(word)

#Data preprocessing

In [0]:
#strip accents from a unicode string
def unicodeToAscii(string):
  """
  Return `string` with combining accent marks removed.

  The string is first decomposed with Unicode normalization form 'NFD', which splits an
  accented letter into the base letter followed by its combining mark. Characters whose
  category is 'Mn' (Mark, nonspacing) are then dropped and the rest is joined back together.
  Characters that have no such decomposition are left unchanged.
  """
  decomposed = unicodedata.normalize('NFD', string)
  kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
  return ''.join(kept)

In [22]:
#example of above method (it convert complex text into plain text by removing all special characctor)
unicodeToAscii('Jaimerais,bière,Où,es-tu ....')

'Jaimerais,biere,Ou,es-tu ....'

Now let's convert all characters in the string to lowercase, collapse multiple white spaces into a single white space, etc.,
and also remove non-letter characters (i.e. digits and special characters other than `.`, `!` and `?`).


In [0]:
def normalization(string):
  """
  Lower-case, strip and de-accent `string`, then clean it with three regex passes:
    1. put a space in front of every '.', '!' or '?'  (\\1 is the first bracketed group)
    2. replace every run of characters that are not letters or . ! ? with one space
       (e.g. input: aaa12a ; output: aaa a)
    3. collapse every run of white space into a single space
  Leading and trailing white space is removed from the result.
  """
  s = unicodeToAscii(string.lower().strip())
  substitutions = (
    (r"([.!?])", r" \1"),
    (r"[^a-zA-Z.!?]+", r" "),
    (r"\s+", r" "),
  )
  for pattern, replacement in substitutions:
    s = re.sub(pattern, replacement, s)
  return s.strip()

In [24]:
#test above function 
normalization("aa123aBc!s's    dd?")

'aa abc !s s dd ?'

#processing the text inside "formatted_mvie_lines.txt" file

In [25]:
datafile = os.path.join("cornell movie-dialogs corpus","formatted_movie_lines.txt")
#read file and split it into rows (i.e. on '\n')
print("reading file......")
# `with` closes the handle as soon as the text has been read; the bare
# open(...).read() used before never closed it explicitly.
with open(datafile, encoding = 'utf-8') as formatted_file:
  lines = formatted_file.read().strip().split('\n')
# split each row on '\t' into [question, answer] and normalize both elements
pairs = [[normalization(s) for s in pair.split('\t')] for pair in lines]
print('done reading!')

voc = vocabulary('cornell movie-dialogs corpus')

reading file......
done reading!


In [26]:
# Show the three stages for the first row: raw text, split on '\t', and normalized pair.
print("1. single line of conversation in file is as below : ")
print(lines[0])  
print("\n 2. after splitinig by '\t' :")
print(lines[0].split('\t'))
print("\n 3. after normalization of both the elements in list using we created 'normalization' method:")
print(pairs[0])

1. single line of conversation in file is as below : 
Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.	Well, I thought we'd start with pronunciation, if that's okay with you.

 2. after splitinig by '	' :
['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.', "Well, I thought we'd start with pronunciation, if that's okay with you."]

 3. after normalization of both the elements in list using we created 'normalization' method:
['can we make this quick ? roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad . again .', 'well i thought we d start with pronunciation if that s okay with you .']


#filtering the text

In [0]:
#a pair is removed when its question or its answer has more than 9 words (i.e. max_len or more)
max_len =10

def filterPair(p):
  """
  Return the pairs of `p` whose question and answer are both shorter than max_len words.

  p: list of [question, answer] lists; pair[0] is the question and pair[1] the answer.
  The filter works on the argument itself (not on the global `pairs`), so it can be
  applied to any list of pairs. `p` is not modified; a new list is returned.
  """
  return [pair for pair in p if len(pair[0].split()) < max_len and len(pair[1].split()) < max_len]

In [28]:
# Report the number of pairs before and after the length filter.
# NOTE: `pairs` is overwritten with the filtered list, so re-running this cell filters again.
print('before filteration of pairs , there are {} pairs/conversations \n'.format(len(pairs)))
pairs = filterPair(pairs)
print('after filteration of pairs , there are {} pairs/conversations'.format(len(pairs)))

before filteration of pairs , there are 221282 pairs/conversations 

after filteration of pairs , there are 64271 pairs/conversations


# Let's remove rarely use words

In [29]:
  # Build the vocabulary from the length-filtered pairs: every word of every question and
  # reply is registered (and counted) through vocabulary.addSentence().
  voc = vocabulary('cornell movie-dialogs corpus')
  for pair in pairs:
    voc.addSentence(pair[0]) #question
    voc.addSentence(pair[1]) #reply
  # num_words includes the 3 reserved tokens (PAD, SOS, EOS)
  print("number of words : ", voc.num_words)

  #visualize some of the pairs

  for pair in pairs[:10]:
    print(pair)

number of words :  18007
['there .', 'where ?']
['you have my word . as a gentleman', 'you re sweet .']
['hi .', 'looks like things worked out tonight huh ?']
['you know chastity ?', 'i believe we share an art instructor']
['have fun tonight ?', 'tons']
['well no . . .', 'then that s all you had to say .']
['then that s all you had to say .', 'but']
['but', 'you always been this selfish ?']
['do you listen to this crap ?', 'what crap ?']
['what good stuff ?', 'the real you .']


In [30]:
"""
loop through pairs and take only those pair which contain numbers which occures more than thershold 
if any of the word occurence is less than thershold than whole question and reply pair will remove
"""

min_count = 3 #threshold: a word must occur at least this many times to be kept
#voc : object of vocabulary class; pairs : output from above cell; min_count : threshold

def trimRareWords(voc, pairs, min_count):
  """
  Trim rare words from `voc`, then drop every pair that uses a trimmed word.

  voc: object providing trim(min_count) and a word2index dictionary
       (the vocabulary class of this notebook).
  pairs: list of [question, reply] strings.
  min_count: threshold passed to voc.trim().

  Returns a new list with the pairs in which every word of the question AND of
  the reply is still a key of voc.word2index after trimming. `pairs` itself is
  not modified. An empty `pairs` list returns [] and reports a ratio of 0.
  """
  voc.trim(min_count) # rebuilds voc.word2index so that it only holds the kept words

  def _all_words_known(sentence):
    # True when no word of the sentence was trimmed from the vocabulary
    return all(word in voc.word2index for word in sentence.split(' '))

  #keep a pair only when both the question (pair[0]) and the reply (pair[1]) pass
  keep_pairs = [pair for pair in pairs if _all_words_known(pair[0]) and _all_words_known(pair[1])]
  #guard the ratio so that an empty input does not raise ZeroDivisionError
  ratio = len(keep_pairs) / len(pairs) if pairs else 0.0
  print(" trimmed from {} pairs to {}, {:.4f} of total".format(len(pairs), len(keep_pairs), ratio))
  return keep_pairs

# Trim the vocabulary and keep only the pairs made entirely of surviving words.
pairs = trimRareWords(voc, pairs, min_count)

keep words 7822 / 18004 = 0.4345
 trimmed from 64271 pairs to 53125, 0.8266 of total


# preapering data for model

In [0]:
"""
get index of each word in sentence with end of sentence 
e.g index of 'hi' = 0, 'hello'=1, EOS(end f sentence) =2
input :'hi hello'; output:[0,1,2]
"""

def indexesFromSentence(voc,sentence):
  """
  Convert `sentence` into a list of word indexes terminated by EOS_token.

  The sentence is split on single spaces and each word is looked up in
  voc.word2index (word -> index); a word that is not a key raises KeyError.
  """
  indexes = []
  for word in sentence.split(' '):
    indexes.append(voc.word2index[word])
  indexes.append(EOS_token)
  return indexes

In [32]:
# Demo on the question of the second pair: the index list has one more element than the
# sentence has words, because EOS_token is appended at the end.
print('sentence : ',pairs[1][0])
print('index of each word in sentence :', indexesFromSentence(voc,pairs[1][0]))
print('number of words in sentence {} + 1(EOS_token) = {}'.format(len(pairs[1][0].split()), len(pairs[1][0].split())+1) )

sentence :  you have my word . as a gentleman
index of each word in sentence : [7, 8, 9, 10, 4, 11, 12, 13, 2]
number of words in sentence 8 + 1(EOS_token) = 9


In [33]:
#try the conversion on the first 10 pairs
questions = [pair[0] for pair in pairs[:10]]
replys = [pair[1] for pair in pairs[:10]]
#only the questions are printed
print(questions)
print(len(questions))
#index list (ending with EOS_token) of every question
indexes = []
for sentence in questions:
  indexes.append(indexesFromSentence(voc, sentence))
print(indexes)
print('each list contain 2 at end which shows end of sentence')

['there .', 'you have my word . as a gentleman', 'hi .', 'have fun tonight ?', 'well no . . .', 'then that s all you had to say .', 'but', 'do you listen to this crap ?', 'what good stuff ?', 'wow']
10
[[3, 4, 2], [7, 8, 9, 10, 4, 11, 12, 13, 2], [16, 4, 2], [8, 31, 22, 6, 2], [33, 34, 4, 4, 4, 2], [35, 36, 37, 38, 7, 39, 40, 41, 4, 2], [42, 2], [47, 7, 48, 40, 45, 49, 6, 2], [50, 51, 52, 6, 2], [58, 2]]
each list contain 2 at end which shows end of sentence


In [34]:
"""
 *there is difference between 'zip' & 'zip_longest'
 *'zip_longest' is a method of itertools class
 *we prefere 'zip_longest' insted of 'zip' because for 'zip' need to have all list/tuples same number of elements 
  else it willremove extra elements but in 'zip_longest' it will give 'None' value 
"""
a = [1,2,3,4]
b = [4,5,6]
# The shorter list is completed with the fill value:
#   None -> the default of zip_longest
#   0    -> if we want '0' instead of 'None'
#   'd'  -> any other value can be used as fillvalue
for fill in (None, 0, 'd'):
  print(list(itertools.zip_longest(a, b, fillvalue=fill)))


[(1, 4), (2, 5), (3, 6), (4, None)]
[(1, 4), (2, 5), (3, 6), (4, 0)]
[(1, 4), (2, 5), (3, 6), (4, 'd')]


In [35]:
#let's start with zeropadding i.e give 0 value if value is not present
"""
as we know max length of any nested list will be 10
we declare during 'filtering the text' in filterPair() method
so if length of any nested list is less than 10 than it will pad with 0 to make length eqqal to 10
"""
# index lists of the 10 sample questions printed above (each one ends with EOS = 2)
nested_list = [
  [3, 4, 2],
  [7, 8, 9, 10, 4, 11, 12, 13, 2],
  [16, 4, 2],
  [8, 31, 22, 6, 2],
  [33, 34, 4, 4, 4, 2],
  [35, 36, 37, 38, 7, 39, 40, 41, 4, 2],
  [42, 2],
  [47, 7, 48, 40, 45, 49, 6, 2],
  [50, 51, 52, 6, 2],
  [58, 2],
]

# we expect a maximum length of 10, but let's check it
leng = list(map(len, nested_list))
print("length of each list inside 'nested_list' : {}\n ".format(leng))
print('maximum length is', max(leng))

length of each list inside 'nested_list' : [3, 9, 3, 5, 6, 10, 2, 8, 5, 2]
 
maximum length is 10


In [0]:
def zerPadding(l,fillvalue = 0):
  """
  Transpose the list of sequences `l` and pad the short ones with `fillvalue`.

  Returns a list of tuples: the i-th tuple holds the i-th element of every
  sequence, with `fillvalue` wherever a sequence is shorter than the longest one.
  """
  time_steps = itertools.zip_longest(*l, fillvalue=fillvalue)
  return [step for step in time_steps]

In [37]:
# test the zerPadding() function on 'nested_list': the result is the transposed,
# zero-padded version (one tuple per position in the sentence)
test_result = zerPadding(nested_list)
print(test_result)

[(3, 7, 16, 8, 33, 35, 42, 47, 50, 58), (4, 8, 4, 31, 34, 36, 2, 7, 51, 2), (2, 9, 2, 22, 4, 37, 0, 48, 52, 0), (0, 10, 0, 6, 4, 38, 0, 40, 6, 0), (0, 4, 0, 2, 4, 7, 0, 45, 2, 0), (0, 11, 0, 0, 2, 39, 0, 49, 0, 0), (0, 12, 0, 0, 0, 40, 0, 6, 0, 0), (0, 13, 0, 0, 0, 41, 0, 2, 0, 0), (0, 2, 0, 0, 0, 4, 0, 0, 0, 0), (0, 0, 0, 0, 0, 2, 0, 0, 0, 0)]


test_result =  

[(3, 7, 16, 8, 33, 35, 42, 47, 50, 58),                                              
(4, 8, 4, 31, 34, 36, 2, 7, 51, 2),                             
(2, 9, 2, 22, 4, 37, 0, 48, 52, 0),                                
(0, 10, 0, 6, 4, 38, 0, 40, 6, 0),                                  
(0, 4, 0, 2, 4, 7, 0, 45, 2, 0),                                    
(0, 11, 0, 0, 2, 39, 0, 49, 0, 0),                                     
(0, 12, 0, 0, 0, 40, 0, 6, 0, 0),                                 
(0, 13, 0, 0, 0, 41, 0, 2, 0, 0),                                          
(0, 2, 0, 0, 0, 4, 0, 0, 0, 0),                                         
(0, 0, 0, 0, 0, 2, 0, 0, 0, 0)]                                                     
  
***
[1]. where 1st tuple contain 1st elements of each list inside 'nested_list', 2nd tuple contain 2nd elements of each list inside 'nested_list', 3rd tuple contain 3rd elements of each list inside 'nested_list', and so on .... 
***
[2].  and if element not prsesnt in list it will put 0 in tuple.
***
[3]. now max length is number of rows as we interchange rows and columns.
***
[4]. each element in list is nothing but token/index number of each word
***


#Prepare data for our Model(input data)  :

#BInary Matrix Formation

Let's create the binary matrix.

In [0]:
def binaryMatrix(l, value = 0):
  """
  Build a 0/1 mask with the same layout as the padded matrix `l`.

  l: iterable of rows (lists or tuples) of token indexes.
  value: the padding index; elements equal to it become 0, all others become 1.
         It defaults to 0, the index of the PAD token.

  Returns a list of lists (one per row of `l`).
  """
  return [[0 if elem == value else 1 for elem in row] for row in l]

In [39]:
#test binaryMatrix() with the 'test_result' matrix produced by zerPadding()
result = binaryMatrix(test_result)
# last expression of the cell: displays the mask
result

[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 0, 1, 1, 0],
 [0, 1, 0, 1, 1, 1, 0, 1, 1, 0],
 [0, 1, 0, 1, 1, 1, 0, 1, 1, 0],
 [0, 1, 0, 0, 1, 1, 0, 1, 0, 0],
 [0, 1, 0, 0, 0, 1, 0, 1, 0, 0],
 [0, 1, 0, 0, 0, 1, 0, 1, 0, 0],
 [0, 1, 0, 0, 0, 1, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]]

test_result =  

[(3, 7, 16, 8, 33, 35, 42, 47, 50, 58),                                              
(4, 8, 4, 31, 34, 36, 2, 7, 51, 2),                             
(2, 9, 2, 22, 4, 37, 0, 48, 52, 0),                                
(0, 10, 0, 6, 4, 38, 0, 40, 6, 0),                                  
(0, 4, 0, 2, 4, 7, 0, 45, 2, 0),                                    
(0, 11, 0, 0, 2, 39, 0, 49, 0, 0),                                     
(0, 12, 0, 0, 0, 40, 0, 6, 0, 0),                                 
(0, 13, 0, 0, 0, 41, 0, 2, 0, 0),                                          
(0, 2, 0, 0, 0, 4, 0, 0, 0, 0),                                         
(0, 0, 0, 0, 0, 2, 0, 0, 0, 0)]     

******
This shows that each tuple is converted to a list: a non-zero element becomes 1 and a zero (padding) element becomes 0.
***

resullt =                                                                     
[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],                                                      
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],                                                         
 [1, 1, 1, 1, 1, 1, 0, 1, 1, 0],                                                            
 [0, 1, 0, 1, 1, 1, 0, 1, 1, 0],                                      
 [0, 1, 0, 1, 1, 1, 0, 1, 1, 0],                                                           
 [0, 1, 0, 0, 1, 1, 0, 1, 0, 0],                                                        
 [0, 1, 0, 0, 0, 1, 0, 1, 0, 0],                                                         
 [0, 1, 0, 0, 0, 1, 0, 1, 0, 0],                                                            
 [0, 1, 0, 0, 0, 1, 0, 0, 0, 0],                                                         
 [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]]                                                                                                                                           

# Input data for model

In [0]:
"""
1.this function takes the list of all 'questions' in the conversation as input and returs padded input sequesce 
than we convert that padded input senquence into 'torch.tensor' format for model and also take the length of each
batch inside tensor
"""
#l = list of questions ; voc = vocabulary class object

def inputVar(l,voc):
  """
  Turn the list of questions `l` into a padded index tensor plus the sentence lengths.

  Returns (padVar, lengths):
    padVar  - torch.LongTensor built from the zero-padded, transposed index lists
              (one row per position in the sentence, one column per question)
    lengths - torch.tensor with the length of each index list (EOS included)
  """
  indexes_batch = []
  sentence_lengths = []
  for sentence in l:
    #list of word indexes for this question, terminated by EOS_token
    sentence_indexes = indexesFromSentence(voc, sentence)
    indexes_batch.append(sentence_indexes)
    sentence_lengths.append(len(sentence_indexes))
  lengths = torch.tensor(sentence_lengths)
  #pad with zerPadding() and convert the result to a tensor of long integers
  padVar = torch.LongTensor(zerPadding(indexes_batch))
  return padVar, lengths

In [0]:
"""
this function contain some operation similar as above function & some extra operations

for this fuction we take list of all 'reply' as input, convert each word of reply/sentence into list of indexes(i.e each index number for each words),
take the maximum length among all the list of indexes, do padding operation 

now use padding operation as input and perform 2 operations,
1. output of padding operation -> perform binary operation to get 1s and 0s -> convert into torch byte tensor
2. output of padding operation -> convert each integer into long type

3 OUTPUT of function:
maximum length, tensor with all long type, byte tensors
"""
#l = list of replies ; voc = vocabulary class object
def outputVar(l, voc):
  """
  Turn the list of replies `l` into a padded index tensor, a padding mask and the max length.

  Returns (padVar, mask, max_length):
    padVar     - torch.LongTensor built from the zero-padded, transposed index lists
    mask       - torch.ByteTensor of the same layout: 1 for a real token, 0 for padding
    max_length - length of the longest index list (EOS included)
  """
  #index list (terminated by EOS_token) of every reply
  indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
  max_length = max(map(len, indexes_batch))
  padList = zerPadding(indexes_batch)
  #1s and 0s from the padded list, stored as a byte tensor
  mask = torch.ByteTensor(binaryMatrix(padList))
  #the padded indexes themselves, stored as long integers
  padVar = torch.LongTensor(padList)
  return padVar, mask, max_length


In [0]:
#voc = object of vocabulary class; pair_batch = nested list, each inner list holds 2 elements: question, answer
def batch2TrainData(voc, pair_batch):
  """
  Convert a batch of [question, answer] pairs into the tensors used for training.

  The pairs are ordered by question length (number of space-separated words), longest
  first. The ordering is done on a sorted copy, so the caller's `pair_batch` list is
  left untouched (list.sort() would have reordered it in place).

  Returns (inp, lengths, output, mask, max_length): the two values of inputVar() for the
  questions followed by the three values of outputVar() for the answers.
  """
  ordered_batch = sorted(pair_batch, key = lambda pair: len(pair[0].split(' ')), reverse = True)
  input_batch = [pair[0] for pair in ordered_batch]
  output_batch = [pair[1] for pair in ordered_batch]
  #use the inputVar() & outputVar() functions created above
  inp, lengths = inputVar(input_batch, voc)
  output, mask, max_length = outputVar(output_batch, voc)
  return inp, lengths, output, mask, max_length
  

In [43]:
"""
pairs is a nested list, in which each list contain 2 elements 1st is question and 2nd is answer
"""
#take the first 5 pairs of the pairs list
lis = [pairs[i] for i in range(5)]

#test the batch2TrainData() function on this small batch
inp, lengths, output, mask, max_length = batch2TrainData(voc, lis)
print('input variables :')
print(inp)
print('\n list of lengths of each sentence : ', lengths)
print('\n target variables :')
print(output)
print('\n byte tensor :')
print(mask)
print('\n maximum length among output tensors :', max_length)

input variables :
tensor([[ 7, 33,  8,  3, 16],
        [ 8, 34, 31,  4,  4],
        [ 9,  4, 22,  2,  2],
        [10,  4,  6,  0,  0],
        [ 4,  4,  2,  0,  0],
        [11,  2,  0,  0,  0],
        [12,  0,  0,  0,  0],
        [13,  0,  0,  0,  0],
        [ 2,  0,  0,  0,  0]])

 list of lengths of each sentence :  tensor([9, 6, 5, 3, 3])

 target variables :
tensor([[ 7, 35, 32,  5, 17],
        [14, 36,  2,  6, 18],
        [15, 37,  0,  2, 19],
        [ 4, 38,  0,  0, 20],
        [ 2,  7,  0,  0, 21],
        [ 0, 39,  0,  0, 22],
        [ 0, 40,  0,  0, 23],
        [ 0, 41,  0,  0,  6],
        [ 0,  4,  0,  0,  2],
        [ 0,  2,  0,  0,  0]])

 byte tensor :
tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 0, 1, 1],
        [1, 1, 0, 0, 1],
        [1, 1, 0, 0, 1],
        [0, 1, 0, 0, 1],
        [0, 1, 0, 0, 1],
        [0, 1, 0, 0, 1],
        [0, 1, 0, 0, 1],
        [0, 1, 0, 0, 0]], dtype=torch.uint8)

 maximum length among output tensors : 1

# Start with Model

We are using an RNN model (GRU).

![alt text](https://miro.medium.com/max/3032/1*yBXV9o5q7L_CvY7quJt3WQ.png
)

#SEquence to Sequence GRU Model

It contains an 'Encoder' and a 'Decoder'.

![alt text](https://cntk.ai/jup/s2s.png)

We are using a "Bidirectional GRU", which means there are 2 independent RNNs:
1. one is fed the input sequence in normal sequential order
2. the other is fed the input sequence in reverse order

# define Encoder Class

In [0]:
#subclass of torch.nn.Module
class EncoderRNN(nn.Module):
  """
  Bidirectional GRU encoder.

  hidden_size = number of features in the GRU hidden state; it is also used as the GRU
                input size, so the embedding is presumably expected to output
                hidden_size features per token (TODO confirm where the embedding is built),
  embedding = module applied to the input indexes before the GRU,
  n_layers = number of stacked GRU layers,
  dropout = dropout passed to the GRU (forced to 0 when n_layers == 1)
  """
  def __init__(self, hidden_size, embedding, n_layers =1, dropout =0):
    super(EncoderRNN, self).__init__()
    self.hidden_size = hidden_size
    self.embedding = embedding
    self.n_layers = n_layers
    #'input_size' & 'hidden_size' of the GRU are both set to 'hidden_size'
    #nn.GRU(input_size, hidden_size, n_layers, dropout, bidirectional)
    self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers ==1 else dropout), bidirectional = True)

  def forward(self, input_seq, input_lengths, hidden = None):
    """
    input_seq = batch of input sentences, of shape=(max_length, batch_size),
    input_lengths = length of each sentence in the batch (list or tensor),
    hidden = optional initial hidden state, shape=(n_layers * num_directions, batch_size, hidden_size)

    Returns (outputs, hidden):
    outputs: output features of the last GRU layer for each timestep, with the forward and
             backward directions summed; shape = (max_length, batch_size, hidden_size)
    hidden: final hidden state, of shape=(n_layers * num_directions, batch_size, hidden_size)
    """
    embedded = self.embedding(input_seq)
    #pack_padded_sequence requires the lengths on the CPU when they are given as a tensor;
    #.cpu() is a no-op for a tensor that already lives there
    if torch.is_tensor(input_lengths):
      input_lengths = input_lengths.cpu()
    #pack the padded batch so that the GRU skips the padding positions
    packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
    #forward pass through GRU
    output, hidden = self.gru(packed,hidden)
    #unpack back to a padded tensor
    outputs,_ = torch.nn.utils.rnn.pad_packed_sequence(output)
    #sum the outputs of the two directions (first and second half of the last dimension)
    outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
    return outputs, hidden



# Define the Decoder with Attention mechanism

In [0]:
#Attention Class

#inherit from torch.nn.Module
class Attn(torch.nn.Module):
  """Dot-product attention between the decoder state and the encoder outputs.

  Args:
    method: name of the scoring method. It is only stored on the module; the
      dot-product score is the one implemented here.
    hidden_size: size of the hidden vectors; stored on the module.
  """
  def __init__(self, method, hidden_size):
    super(Attn, self).__init__()
    self.method = method
    self.hidden_size = hidden_size

  def dot_product(self, hidden, encode_output):
    """Score every encoder time step against the current decoder state.

    Args:
      hidden: current decoder GRU output, expected shape
        (1, batch_size, hidden_size).
      encode_output: encoder outputs, expected shape
        (max_length, batch_size, hidden_size).

    Returns:
      Tensor of shape (max_length, batch_size). The element-wise product
      broadcasts to (max_length, batch_size, hidden_size) and dim=2 sums over
      the hidden_size axis, which yields one dot product per time step and
      batch element.
    """
    return torch.sum(hidden * encode_output, dim = 2)

  def forward(self, hidden, encoder_outputs):
    """Return the attention weights over the encoder time steps.

    Args:
      hidden: current decoder GRU output, expected shape
        (1, batch_size, hidden_size).
      encoder_outputs: encoder outputs, expected shape
        (max_length, batch_size, hidden_size).

    Returns:
      Tensor of shape (batch_size, 1, max_length) whose last axis sums to 1.
    """
    attn_energies = self.dot_product(hidden, encoder_outputs) #(max_length, batch_size)
    #transpose max_length & batch_size dimensions
    attn_energies = attn_energies.t() #(batch_size, max_length)
    #softmax over the time steps, then add the middle dimension.
    #torch.softmax is used instead of the 'f' alias because the notebook
    #re-binds the name 'f' to a file handle when it reads the corpus
    return torch.softmax(attn_energies, dim = 1).unsqueeze(1) #(batch_size, 1, max_length)

In [0]:
#decoder class

#inherit from nn.Module
class LuongAttnDecoderRNN(nn.Module):
  """Single-step GRU decoder that attends over the encoder outputs.

  Args:
    attn_model: attention scoring method name, passed on to Attn.
    embedding: embedding layer applied to the input token indices.
    hidden_size: number of features in the GRU input and hidden state.
    output_size: number of features produced by the final linear layer
      (the vocabulary size, going by the notebook's notes).
    n_layers: number of stacked GRU layers.
    dropout: dropout probability for the embedded input and, when
      n_layers > 1, between the stacked GRU layers.
  """
  def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers =1, dropout =0.1):
    super(LuongAttnDecoderRNN, self).__init__()
    self.attn_model = attn_model
    self.hidden_size = hidden_size
    self.output_size = output_size
    self.n_layers = n_layers
    self.dropout = dropout

    #define layers
    self.embedding = embedding
    #honour the 'dropout' argument; a bare nn.Dropout() would always use p=0.5
    self.embedding_dropout = nn.Dropout(dropout)
    #unidirectional GRU; dropout between layers only makes sense for n_layers > 1
    self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers ==1 else dropout))
    #GRU output and context vector are concatenated to (batch_size, hidden_size * 2)
    #and projected back to (batch_size, hidden_size) before the tanh
    self.concat = nn.Linear(hidden_size * 2, hidden_size)
    #project (batch_size, hidden_size) to (batch_size, output_size) before the softmax
    self.out = nn.Linear(hidden_size, output_size)

    self.attn = Attn(attn_model, hidden_size)

  def forward(self, input_step, last_hidden, encoder_output):
    """Run one decoding step for the whole batch.

    Args:
      input_step: one time step of input token indices, expected shape
        (1, batch_size).
      last_hidden: previous GRU hidden state, expected shape
        (n_layers, batch_size, hidden_size).
      encoder_output: encoder outputs, expected shape
        (max_length, batch_size, hidden_size).

    Returns:
      output: softmax-normalised scores of shape (batch_size, output_size).
      hidden: new GRU hidden state, to be fed into the next decoding step.
    """
    # 1. embed the current input words (batch)
    embedded = self.embedding(input_step)
    # 2. dropout on the embedded input
    embedded = self.embedding_dropout(embedded)
    # 3. forward pass through the unidirectional GRU
    rnn_output, hidden = self.gru(embedded, last_hidden)
    # 4. attention weights from the current GRU output, (batch_size, 1, max_length)
    attn_weights = self.attn(rnn_output, encoder_output)
    # 5. context vector = attention-weighted sum of the encoder outputs:
    #    (batch_size, 1, max_length) bmm (batch_size, max_length, hidden) = (batch_size, 1, hidden)
    context = attn_weights.bmm(encoder_output.transpose(0,1))
    # 6. drop the singleton dimensions so both tensors are (batch_size, hidden_size)
    rnn_output = rnn_output.squeeze(0)
    context = context.squeeze(1)
    # 7. concatenate along the feature axis -> (batch_size, hidden_size * 2)
    concat_input = torch.cat((rnn_output, context), 1)
    # 8. linear layer back to (batch_size, hidden_size), then tanh
    concat_output = torch.tanh(self.concat(concat_input))
    # 9. second linear layer for the output prediction, (batch_size, output_size)
    output = self.out(concat_output)
    # 10. softmax so every row is a distribution in the range 0-1.
    #     torch.softmax is used instead of the 'f' alias because the notebook
    #     re-binds the name 'f' to a file handle when it reads the corpus
    output = torch.softmax(output, dim =1)
    return output, hidden

# Loss Function
* We use 'Teacher Forcing' for 0.5 of the time and 'No Teacher Forcing' for the remaining 0.5 during training
* 'No Teacher Forcing' at testing

In [0]:
"""
*decoder_output is getting from LuongAttnDecoderRNN lass
*mask and target we are getting from batch2TrainData() function which we created

mask is a matrix of 1s & 0s, we ignore 0s and calculate 'loss' only for 1s
"""
# maskKNLLLoss = mask Negartive Log LikeLihood Loss
def maskKNLLLoss(decoder_out, target, mask):
  """Masked negative log-likelihood loss for one decoder time step.

  Args:
    decoder_out: probabilities per batch element, shape (batch_size, voc_size).
    target: index of the correct word for every batch element; reshaped to
      (batch_size, 1) for the gather.
    mask: one entry per batch element; non-zero / True marks a real token,
      zero / False marks padding that must not contribute to the loss.

  Returns:
    (loss, nTotal): the mean negative log-likelihood over the unmasked
    elements, moved to 'device', and the number of unmasked elements as a
    Python number.
  """
  #flatten and cast to bool: masked_select expects a boolean mask
  mask = mask.reshape(-1).bool()
  nTotal = mask.sum() #number of unmasked (real) elements
  target = target.view(-1,1) #as many rows as needed, 1 column
  #pick the probability of the target word for every batch element.
  #squeeze(1) makes the result (batch_size,) like the mask; left as
  #(batch_size, 1) it would broadcast against the mask to
  #(batch_size, batch_size) and padded positions would leak into the mean
  gathered_tensor = torch.gather(decoder_out,1,target).squeeze(1)
  #negative log-likelihood of the target word
  crossEntrophy = -torch.log(gathered_tensor)
  #keep only the unmasked elements and average them
  loss = crossEntrophy.masked_select(mask).mean()
  loss = loss.to(device) #move to the device currently in use (CPU/GPU)
  return loss, nTotal.item()




# Start with Iteration 