# Sequence to Sequence model with attention 
# to design a fun chatbot on the Cornell movie conversations Dataset

In [1]:
#!pip install tensorflow-gpu==2.0.0-beta1
import tensorflow as tf
import re
import numpy as np
import pandas as pd
import time
import copy
from sklearn.model_selection import train_test_split

from distutils.version import LooseVersion
import warnings

# Check TensorFlow Version
print('TensorFlow Version: {}'.format(tf.__version__))

W0726 15:38:31.839067 19516 __init__.py:308] Limited tf.compat.v2.summary API due to missing TensorBoard installation.
W0726 15:38:32.023203 19516 __init__.py:335] Limited tf.summary API due to missing TensorBoard installation.


TensorFlow Version: 2.0.0-beta1


# Data Pre-processing steps:

Open and Load data

Extract the movie conversations as questions and answers

Clean the questions and answers for easier network training

Analyze sentences length

Analyze word frequency

Create dictionaries for integer and words mapping

Preprocessing targets by adding GO and EOS token

Convert the text to integers and replace words not present in vocabualarly with unknown

Sort the sentences to make training easier


In [2]:
#Open and load data

#line ids represent the specific character statement in the movie

liners = open("movie_lines.txt", encoding = "utf-8", mode = "r", errors = "ignore").read().split('\n')

#conversations represent sequence of interactions with line ids in movie

converse = open("movie_conversations.txt", encoding = "utf-8", mode = "r", errors = "ignore").read().split('\n')

In [3]:
## The sentences' ids, which will be processed to become our input and target data.
converse[:10]

#onvers = convers[]

["u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L200', 'L201', 'L202', 'L203']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L204', 'L205', 'L206']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L207', 'L208']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L271', 'L272', 'L273', 'L274', 'L275']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L276', 'L277']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L280', 'L281']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L363', 'L364']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L365', 'L366']"]

In [4]:
# The chatbot train statements

liners[:10]

['L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!',
 'L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!',
 'L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.',
 'L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?',
 "L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.",
 'L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow',
 "L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie.",
 'L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No',
 'L870 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I\'m kidding.  You know how sometimes you just become this "persona"?  And you don\'t know how to quit?',
 'L869 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Like my fear of wearing pastels?']

In [10]:
#Creating a dictionary to have line ids as keys, movie text as values

id_dict = {}


for line in liners:
    
    _line = line.split("+++$+++")
    
    if len(_line) == 5:
        
        id_dict[_line[0].replace(" ", "")] = _line[4]
           

In [11]:
#display extracted dictionary

id_dict

{'L1045': ' They do not!',
 'L1044': ' They do to!',
 'L985': ' I hope so.',
 'L984': ' She okay?',
 'L925': " Let's go.",
 'L924': ' Wow',
 'L872': " Okay -- you're gonna need to learn how to lie.",
 'L871': ' No',
 'L870': ' I\'m kidding.  You know how sometimes you just become this "persona"?  And you don\'t know how to quit?',
 'L869': ' Like my fear of wearing pastels?',
 'L868': ' The "real you".',
 'L867': ' What good stuff?',
 'L866': " I figured you'd get to the good stuff eventually.",
 'L865': ' Thank God!  If I had to hear one more story about your coiffure...',
 'L864': " Me.  This endless ...blonde babble. I'm like, boring myself.",
 'L863': ' What crap?',
 'L862': ' do you listen to this crap?',
 'L861': ' No...',
 'L860': ' Then Guillermo says, "If you go any lighter, you\'re gonna look like an extra on 90210."',
 'L699': ' You always been this selfish?',
 'L698': ' But',
 'L697': " Then that's all you had to say.",
 'L696': ' Well, no...',
 'L695': " You never wanted t

In [12]:
# Extracting conversation ids

convs = []

for c in converse[:-1]:
    
    extract= re.search("\[.+", c).group()[1:-1].replace("'",'').replace(" ", "").split(',')
    
    convs.append(extract)
    

In [13]:
convs

[['L194', 'L195', 'L196', 'L197'],
 ['L198', 'L199'],
 ['L200', 'L201', 'L202', 'L203'],
 ['L204', 'L205', 'L206'],
 ['L207', 'L208'],
 ['L271', 'L272', 'L273', 'L274', 'L275'],
 ['L276', 'L277'],
 ['L280', 'L281'],
 ['L363', 'L364'],
 ['L365', 'L366'],
 ['L367', 'L368'],
 ['L401', 'L402', 'L403'],
 ['L404', 'L405', 'L406', 'L407'],
 ['L575', 'L576'],
 ['L577', 'L578'],
 ['L662', 'L663'],
 ['L693', 'L694', 'L695'],
 ['L696', 'L697', 'L698', 'L699'],
 ['L860', 'L861'],
 ['L862', 'L863', 'L864', 'L865'],
 ['L866', 'L867', 'L868', 'L869'],
 ['L870', 'L871', 'L872'],
 ['L924', 'L925'],
 ['L984', 'L985'],
 ['L1044', 'L1045'],
 ['L49', 'L50', 'L51'],
 ['L571', 'L572', 'L573'],
 ['L579', 'L580'],
 ['L595', 'L596', 'L597'],
 ['L598', 'L599', 'L600'],
 ['L659', 'L660'],
 ['L952', 'L953'],
 ['L394', 'L395'],
 ['L396', 'L397'],
 ['L589', 'L590', 'L591'],
 ['L592', 'L593'],
 ['L756', 'L757', 'L758'],
 ['L759', 'L760'],
 ['L164', 'L165'],
 ['L319', 'L320'],
 ['L441', 'L442', 'L443', 'L444', 'L445']

In [14]:
#sorting training sets into questions as inputs, answers as targets
questions = []

answers = []

for conv in convs:
    
    for c in range(len(conv) -1):
        
        questions.append(id_dict[conv[c]])
        
        answers.append(id_dict[conv[c+1]])
        
        

In [15]:
# Check if we have loaded the data correctly
limit = 0
for i in range(limit, limit+100):
    print(questions[i].lstrip())
    print(answers[i].lstrip())
    print()

Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.
Well, I thought we'd start with pronunciation, if that's okay with you.

Well, I thought we'd start with pronunciation, if that's okay with you.
Not the hacking and gagging and spitting part.  Please.

Not the hacking and gagging and spitting part.  Please.
Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?

You're asking me out.  That's so cute. What's your name again?
Forget it.

No, no, it's my fault -- we didn't have a proper introduction ---
Cameron.

Cameron.
The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.

The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.
Seems like she could get a date easy enough...

Why?
Unsolved mystery.  She used to be really popular when she started h

In [16]:
# Compare lengths of questions and answers
print(len(questions))
print(len(answers))

221616
221616


In [17]:
#Clean unreasonable wordings

def cleaned_text(text):
    
    text = text.lower()
    
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "that is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"n'", "ng", text)
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)
    
    return text

In [18]:
#clean the data

clean_questions = []

clean_answers = []

for clean_q in questions:
    
    clean_questions.append(cleaned_text(clean_q.lstrip()))
    
for clean_a in answers:
    
     clean_answers.append(cleaned_text(clean_a.lstrip()))

In [19]:
# Check cleaned dataset
limit = 0
for i in range(limit, limit+100):
    print(clean_questions[i])
    print(clean_answers[i])
    print()

can we make this quick  roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad  again
well i thought we would start with pronunciation if that is okay with you

well i thought we would start with pronunciation if that is okay with you
not the hacking and gagging and spitting part  please

not the hacking and gagging and spitting part  please
okay then how about we try out some french cuisine  saturday  night

you are asking me out  that is so cute that is your name again
forget it

no no it is my fault  we did not have a proper introduction 
cameron

cameron
the thing is cameron  i am at the mercy of a particularly hideous breed of loser  my sister  i cannot date until she does

the thing is cameron  i am at the mercy of a particularly hideous breed of loser  my sister  i cannot date until she does
seems like she could get a date easy enough

why
unsolved mystery  she used to be really popular when she started high school then it was just lik

In [21]:
#analyzing sentence lengths for questions and answers

lengths = []

for c in clean_questions:

    lengths.append(len(c.split()))
    
for a in clean_answers:

    lengths.append(len(a.split()))
    



lengths_size = pd.DataFrame(lengths, columns = ['counts'])


In [23]:
#statistics of length sentences

lengths_size.describe()

Unnamed: 0,counts
count,443232.0
mean,10.872094
std,12.215895
min,0.0
25%,4.0
50%,7.0
75%,14.0
max,555.0


In [24]:
#examining the quartiles 

print(np.percentile(lengths, 20))
print(np.percentile(lengths, 50))
print(np.percentile(lengths, 85))
print(np.percentile(lengths, 90))
print(np.percentile(lengths, 95))


3.0
7.0
19.0
24.0
32.0


In [25]:
#selecting sentences with length greater than 2 and less than 32

stan_questions_temp = []

stan_answers_temp = []

max_length = 32

min_length = 3

i = 0

for question in clean_questions:
    
    if len(question.split()) >= min_length and len(question.split()) <= max_length:
        
        
        stan_questions_temp.append(question)
        
        stan_answers_temp.append(clean_answers[i])
        
    i += 1
        

        
        
stan_questions = []

stan_answers = []
        
        
i = 0
for answer in stan_answers_temp:
    
    if len(answer.split()) >= min_length and len(answer.split()) <= max_length:
        
        
        stan_answers.append(answer)
        
        stan_questions.append(stan_questions_temp[i])
        
        
    i += 1
        
        


In [26]:
# Compare the number of lines we will use with the total number of lines.
print("# of questions:", len(stan_questions))
print("# of answers:", len(stan_answers))
print("% of data used: {}%".format(round(len(stan_questions)/len(clean_questions), 5)*100))

# of questions: 144071
# of answers: 144071
% of data used: 65.009%


In [27]:
#frequency distribution of number of occurence of words

import nltk
#nltk.download('punkt')
from nltk.probability import FreqDist


fdist = FreqDist()
for sentence in stan_questions:
     for word in nltk.tokenize.word_tokenize(sentence):
        fdist[word] += 1
        
for sentence in stan_answers:
     for word in nltk.tokenize.word_tokenize(sentence):
        fdist[word] += 1

In [28]:
fdist

FreqDist({'you': 139680, 'i': 131790, 'the': 82403, 'not': 73258, 'is': 72316, 'to': 70025, 'a': 61257, 'it': 60928, 'that': 47418, 'do': 46775, ...})

In [29]:
# Remove rare words from the vocabulary.
# We will aim to replace fewer than 5% of words with <UNK>
# You will see this ratio soon.
threshold = 5

count = 0
for k,v in fdist.items():
    
    if v >= threshold:
        
        count += 1

In [30]:
#comparison with original daatset

print("vocab size for reduced dataset is {}".format(count))

print("original vocab size for dataset is {}".format(len(fdist)))

print("fraction of vocab size for selected dataset is {:%}".format(count/len(fdist)))

vocab size for reduced dataset is 15954
original vocab size for dataset is 50011
fraction of vocab size for selected dataset is 31.900982%


In [31]:
# questions_vocab_to_int = copy.copy(codes)

# answers_vocab_to_int = copy.copy(codes)

# word_num = len(codes)

# for word, count in fdist.items():
#     if count >= threshold:
#         questions_vocab_to_int[word] = word_num
#         word_num += 1
        

# word_num = len(codes)
# for word, count in fdist.items():
#     if count >= threshold:
#         answers_vocab_to_int[word] = word_num
#         word_num += 1

#imported created vocabulary of questions and answers integer mapping to words
import pickle
    
pickle_in = open("questions_int_to_vocab.pkl","rb")
questions_int_to_vocab = pickle.load(pickle_in)


pickle_in = open("answers_int_to_vocab.pkl","rb")
answers_int_to_vocab = pickle.load(pickle_in)


len(questions_int_to_vocab)

15967

In [32]:
questions_int_to_vocab

{0: '<PAD>',
 1: '<GO>',
 2: '<EOS>',
 3: '<UNK>',
 4: 'can',
 5: 'we',
 6: 'make',
 7: 'this',
 8: 'quick',
 9: 'and',
 10: 'andrew',
 11: 'barrett',
 12: 'are',
 13: 'having',
 14: 'an',
 15: 'incredibly',
 16: 'public',
 17: 'break',
 18: 'up',
 19: 'on',
 20: 'the',
 21: 'again',
 22: 'well',
 23: 'i',
 24: 'thought',
 25: 'would',
 26: 'start',
 27: 'with',
 28: 'if',
 29: 'that',
 30: 'is',
 31: 'okay',
 32: 'you',
 33: 'not',
 34: 'hacking',
 35: 'gagging',
 36: 'spitting',
 37: 'part',
 38: 'please',
 39: 'thing',
 40: 'cameron',
 41: 'am',
 42: 'at',
 43: 'mercy',
 44: 'of',
 45: 'a',
 46: 'particularly',
 47: 'hideous',
 48: 'breed',
 49: 'loser',
 50: 'my',
 51: 'sister',
 52: 'date',
 53: 'until',
 54: 'she',
 55: 'does',
 56: 'unsolved',
 57: 'mystery',
 58: 'used',
 59: 'to',
 60: 'be',
 61: 'really',
 62: 'popular',
 63: 'when',
 64: 'started',
 65: 'high',
 66: 'school',
 67: 'then',
 68: 'it',
 69: 'was',
 70: 'just',
 71: 'like',
 72: 'got',
 73: 'sick',
 74: 'or',
 7

In [34]:
#mapping of word and ids vocabularly for questions and answers

questions_vocab_to_int = {key :  value for value, key in questions_int_to_vocab.items()}

answers_vocab_to_int = {key :  value for value, key in answers_int_to_vocab.items()}

In [35]:
# Check the length of the dictionaries for consistency
print(len(questions_vocab_to_int))
print(len(questions_int_to_vocab))
print(len(answers_vocab_to_int))
print(len(answers_int_to_vocab))

15967
15967
15967
15967


In [36]:
# Add the end of sentence token to the end of every answer.
for i in range(len(stan_answers)):
    
    stan_answers[i] = '<GO> ' + stan_answers[i] + ' <EOS>' 
    

In [37]:
stan_answers[6]

'<GO> right  see  you are ready for the quiz <EOS>'

In [38]:
#encoding our sentences and replacing words not present in vocabularly with UNK
# Convert the text to integers. 
# Replace any words that are not in the respective vocabulary with <UNK> 

questions_int = []
for question in stan_questions:
    ints = []
    for word in question.split():
        if word not in questions_vocab_to_int:
            ints.append(questions_vocab_to_int['<UNK>'])
        else:
            ints.append(questions_vocab_to_int[word])
    questions_int.append(ints)
    
    
answers_int = []
for answer in stan_answers:
    ints = []
    for word in answer.split():
        if word not in answers_vocab_to_int:
            ints.append(answers_vocab_to_int['<UNK>'])
        else:
            ints.append(answers_vocab_to_int[word])
    answers_int.append(ints)

In [39]:
# Check the lengths
print(len(questions_int))
print(len(answers_int))

144071
144071


In [40]:
#To speed up training, remove sentences with unk token

questions_int_check = []

answers_int_check = []

for i in range(len(questions_int)):
  
  if (3 not in questions_int[i]) and (3 not in answers_int[i]):
    
    questions_int_check.append(questions_int[i])
    
    answers_int_check.append(answers_int[i])

In [41]:
# Check the lengths
print(len(questions_int_check))
print(len(answers_int_check))

answers_int_check[1]

84819
84819


[1, 29, 30, 45, 3807, 2]

In [42]:
# # Calculate what percentage of all words have been replaced with <UNK>
# word_count = 0
# unk_count = 0

# for question in questions_int:
#     for word in question:
#         if word == questions_vocab_to_int["<UNK>"]:
#             unk_count += 1
#         word_count += 1
    
# for answer in answers_int:
#     for word in answer:
#         if word == answers_vocab_to_int["<UNK>"]:
#             unk_count += 1
#         word_count += 1
    
# unk_ratio = round(unk_count/word_count,4)*100
    
# print("Total number of words:", word_count)
# print("Number of times <UNK> is used:", unk_count)
# print("Percent of words that are <UNK>: {}%".format(round(unk_ratio,3)))

In [0]:
# # Sort questions and answers by the length of questions.
# # This will reduce the amount of padding during training
# # Which should speed up training and help to reduce the loss

# sort_questions = []

# sort_answers = []

# for length in range(0, max_length + 1):
    
#     for i, _ in enumerate(answers_int_check):
        
        
#         if len(answers_int[i]) == length:
            
#             sort_questions.append(questions_int[i])
            
#             sort_answers.append(answers_int[i])
            
# print(len(sort_questions))
# print(len(sort_answers))
# print()
# for i in range(5):
#     print(sort_answers[i])
#     print(sort_questions[i])
#     print()

In [43]:
#implement keras padding on answers
answers_int_check = tf.keras.preprocessing.sequence.pad_sequences(answers_int_check, padding='post')

In [44]:
#implement keras padding on questions
questions_int_check = tf.keras.preprocessing.sequence.pad_sequences(questions_int_check, padding='post')

# Model Building Steps

Creating Train and Validation sets

Build the Encoder Network

Build the Attention layer

Build the Decoder

Define Optimizer and Loss Functions

Build training step of encoder-decoder

In [45]:
# Creating training and validation sets using an 80-20 split

input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(questions_int_check, answers_int_check, test_size=128, random_state = 42)

# Show length
print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

84691 84691 128 128


In [46]:
#Creating tf.data dataset

BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 128
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 256
enc_units = 256
dec_units = 512
vocab_inp_size = len(questions_int_to_vocab)
vocab_tar_size = len(answers_int_to_vocab)
learning_rate = 0.001

#shuffling dataset 
dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)

#extracting training data for batch size
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)


dataset_val = tf.data.Dataset.from_tensor_slices((input_tensor_val, target_tensor_val)).shuffle(len(input_tensor_val))


#extracting training data for batch size
dataset_val = dataset_val.batch(len(input_tensor_val), drop_remainder = False)




In [47]:
#Verifying generator object yielding dataset in batches
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

#val_inputs, val_targets = next(iter(dataset_val))

#val_inputs.shape

(TensorShape([128, 32]), TensorShape([128, 34]))

In [48]:
#Defining the Bi-directional Encoder network
#return sequences return hidden state output for each time step
#return state returns cell and hiddem state ouputs of last timestep

class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
        
        self.bigru =  tf.keras.layers.Bidirectional(self.gru)

    def call(self, x, states):
      
        x = self.embedding(x)
        
        concat_output, final_state_f, final_state_b  = self.bigru(x, initial_state = states)
        
        return concat_output, tf.keras.layers.concatenate([final_state_f, final_state_b], axis = -1)

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units)), tf.zeros((self.batch_sz, self.enc_units))
    
    
    

In [49]:
#verifying encoder component shapes

encoder = Encoder(vocab_inp_size, embedding_dim, enc_units, BATCH_SIZE)

# sample input
sample_hidden_f, sample_hidden_b = encoder.initialize_hidden_state()
sample_output, final_state = encoder(example_input_batch, [sample_hidden_f, sample_hidden_b])
                                        
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden_f.shape))
print ('Encoder Hidden bi-directional state shape: (batch size, units) {}'.format(sample_hidden_f.shape))

Encoder output shape: (batch size, sequence length, units) (128, 32, 512)
Encoder Hidden state shape: (batch size, units) (128, 256)
Encoder Hidden bi-directional state shape: (batch size, units) (128, 256)


In [50]:
#implement Bahdanau Attention to obtain context vector

class BahdanauAttention(tf.keras.Model):
    def __init__(self, units):
        
        #define weight matrix for Bahdanau Attention
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        # hidden shape == (batch_size, hidden size)
        # hidden_with_time_axis shape == (batch_size, 1, hidden size)
        # we are doing this to perform addition to calculate the score
        #insert a dimension of 1 at position 1
        hidden_with_time_axis = tf.expand_dims(query, 1)

        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        # the shape of the tensor before applying self.V is (batch_size, max_length, units)
        #apply the weight matrix on encoder inputs values and decoder hidden outputs
        score = self.V(tf.nn.tanh(
        self.W1(values) + self.W2(hidden_with_time_axis)))

        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [51]:
#verifying attention components shape

attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(final_state, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (128, 512)
Attention weights shape: (batch_size, sequence_length, 1) (128, 32, 1)


In [52]:
#Build the decoder LSTM model with attention step.

class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)

        # used for attention
        self.attention = BahdanauAttention(self.dec_units)
        
    

    def call(self, x, lstm_out, enc_output):
        
        
        #enc_output shape == (batch_size, max_length, hidden_size)
        context_vector, attention_weights = self.attention(lstm_out, enc_output)
        
        
        #x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)
        
        
        #embedding output` = The input to the decoder X is passed through an embedding layer.
        #merged vector = concat(embedding output, context vector)`
        #This merged vector is then given to the RNN

        #The shapes of all the vectors at each step have been specified in the comments in the code:
        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        ## Write the encoder and decoder model

        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # passing the concatenated vector to the lstm
        output, final_state = self.gru(x)


        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size, vocab)
        y = self.fc(output)

        return y, final_state, attention_weights

In [53]:
#verifying decoder output shape

decoder = Decoder(vocab_tar_size, embedding_dim, dec_units, BATCH_SIZE)


sample_decoder_output, _, _ = decoder(tf.random.uniform((128, 1)), final_state,  sample_output)
                                                       

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (128, 15967)


In [54]:
#defining optimizer and loss function

optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate, clipnorm = 1.0)

#providing labels as integers, use sparse
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

#mask to weight the padded outputs as 0 and important ones as 1

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

In [56]:
#checkpoint for saving model
# import os
# os.mkdir('checkpoints')
# checkpoint_dir = 'checkpoints/training_checkpoints'
# checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [0]:
#@tf.function
def train_step(inp, targ, enc_states):
    loss = 0
    
    #accurate_total = []

    with tf.GradientTape() as tape:
        
        #Running network computations in gradienttape to keep track of gradients
        
        enc_hidden = [sample_hidden_f, sample_hidden_b]
    
        enc_output, enc_final_state = encoder(inp, enc_hidden)
        
        dec_hidden = enc_final_state

        dec_input = tf.expand_dims([answers_vocab_to_int['<GO>']] * BATCH_SIZE, 1)

        # Teacher forcing - feeding the target as the next input
        for t in range(1, targ.shape[1]):
            # passing enc_output to the decoder
            predictions, dec_hidden,  _ = decoder(dec_input, dec_hidden, enc_output)
        
            # The loss is now accumulated through the whole batch
            loss += loss_function(targ[:, t], predictions)

            # using teacher forcing
            dec_input = tf.expand_dims(targ[:, t], 1)
                   

    batch_loss = (loss / int(targ.shape[1]))

    variables = encoder.trainable_variables + decoder.trainable_variables

    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [55]:
EPOCHS = 30

for epoch in range(EPOCHS):
  
    old_loss = []
    start = time.time()

    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        
        batch_loss  = train_step(inp, targ, enc_hidden)
        
        
        total_loss += batch_loss
        
   
        #val_loss = validation_step(input_tensor_val, target_tensor_val)
         
     
      
      
        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {: .4f}'.format(epoch + 1, batch, batch_loss.numpy()))
                                                     
                                                     
                                                                       
                                                                     
        # saving (checkpoint) the model every 2 epochs
    if (epoch + 1) % 30 == 0:
      checkpoint.save(file_prefix = checkpoint_prefix)

    print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / steps_per_epoch))
    
    
    
    
    
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss  1.7251
Epoch 1 Batch 100 Loss  1.7443
Epoch 1 Batch 200 Loss  1.4323
Epoch 1 Batch 300 Loss  1.6412
Epoch 1 Batch 400 Loss  1.4018
Epoch 1 Batch 500 Loss  1.6132
Epoch 1 Batch 600 Loss  1.4766
Epoch 1 Loss 1.5625
Time taken for 1 epoch 469.1668734550476 sec

Epoch 2 Batch 0 Loss  1.4128
Epoch 2 Batch 100 Loss  1.4645
Epoch 2 Batch 200 Loss  1.2584
Epoch 2 Batch 300 Loss  1.5100
Epoch 2 Batch 400 Loss  1.3032
Epoch 2 Batch 500 Loss  1.5217
Epoch 2 Batch 600 Loss  1.4163
Epoch 2 Loss 1.4143
Time taken for 1 epoch 468.8044230937958 sec

Epoch 3 Batch 0 Loss  1.3512
Epoch 3 Batch 100 Loss  1.4144
Epoch 3 Batch 200 Loss  1.2128
Epoch 3 Batch 300 Loss  1.4568
Epoch 3 Batch 400 Loss  1.2655
Epoch 3 Batch 500 Loss  1.4822
Epoch 3 Batch 600 Loss  1.3788
Epoch 3 Loss 1.3674
Time taken for 1 epoch 467.37792587280273 sec

Epoch 4 Batch 0 Loss  1.3198
Epoch 4 Batch 100 Loss  1.3876
Epoch 4 Batch 200 Loss  1.1834
Epoch 4 Batch 300 Loss  1.4244
Epoch 4 Batch 400 Loss  1.2346
Epo

In [77]:
def evaluate(sentence):
    attention_plot = np.zeros((34, 32))

    #sentence = preprocess_sentence(sentence)

    inputs = [questions_vocab_to_int[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                           maxlen= 32,
                                                           padding='post')
        
        
   # inputs = sentence
    
    inputs = tf.convert_to_tensor(inputs)
    
    hidden = [tf.zeros((1, enc_units)), tf.zeros((1, enc_units))]
    
    enc_output, enc_final_hidden = encoder(inputs, hidden)

    dec_hidden = enc_final_hidden
    
    dec_input = tf.expand_dims([answers_vocab_to_int['<GO>']], 0)
    
    result = ' '

    for t in range(32):
        predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                             dec_hidden,
                                                             enc_output)

        # storing the attention weights to plot later on
        attention_weights = tf.reshape(attention_weights, (-1, ))
        
        attention_plot[t] = attention_weights.numpy()

        predicted_id = tf.argmax(predictions[0]).numpy()
        
        if answers_int_to_vocab[predicted_id] == '<EOS>':
            return result, sentence, attention_plot
    
          
        result += answers_int_to_vocab[predicted_id] + ' '

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence, attention_plot


In [74]:
# function for plotting the attention weights
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker


def plot_attention(attention, sentence, predicted_sentence):
    fig = plt.figure(figsize=(10,10))
    ax = fig.add_subplot(1, 1, 1)
    
    ax.matshow(attention, cmap='viridis')

    fontdict = {'fontsize': 14}

    ax.set_xticklabels([''] + sentence, fontdict=fontdict, rotation=90)
    ax.set_yticklabels([''] + predicted_sentence, fontdict=fontdict)

    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()

In [75]:
def response(sentence):
  
    result, sentence, attention_plot = evaluate(sentence)

    print('Input sentence: %s' % (sentence))
    print('Chatbot response: {}'.format(result))
    #print(target_tensor_val[0])

#     attention_plot = attention_plot[:len(result.split(' ')), :len(sentence.split(' '))]
#     plot_attention(attention_plot, sentence.split(' '), result.split(' '))

In [61]:
# restoring the latest checkpoint of trained chatbot
checkpoint.restore('checkpoints/ckpt-1')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x19dc041e630>

In [97]:
#seeing some fun responses from chatbot

print('sentence 1:')

response('i am very upset')

print('\n')


print('sentence 2:')

response('what is your worst fear')


print('\n')


print('sentence 3:')

response('where are you going')


print('\n')


print('sentence 4:')

response('what is your favourite song')


print('\n')


print('sentence 5:')

response('did you like the cinema')

print('\n')


print('sentence 6:')

response('do you like meeting new people')





sentence 1:
Input sentence: i am very upset
Chatbot response:  i am sorry i am sorry i am sorry i am sorry i am sorry i am sorry i am sorry i am sorry i am sorry i am sorry i am 


sentence 2:
Input sentence: what is your worst fear
Chatbot response:  i am not going to be the same 


sentence 3:
Input sentence: where are you going
Chatbot response:  i am going to get a good idea of the locals 


sentence 4:
Input sentence: what is your favourite song
Chatbot response:  one of the morning 


sentence 5:
Input sentence: did you like the cinema
Chatbot response:  no i did not 


sentence 6:
Input sentence: do you like meeting new people
Chatbot response:  i am a hustler 
