# Deep Learning for NLP - Creating a chatbot

In [1]:
!git clone https://github.com/deepanrajm/deep_learning.git

Cloning into 'deep_learning'...
remote: Enumerating objects: 2564, done.[K
remote: Counting objects: 100% (76/76), done.[K
remote: Compressing objects: 100% (68/68), done.[K
remote: Total 2564 (delta 29), reused 42 (delta 8), pack-reused 2488[K
Receiving objects: 100% (2564/2564), 292.61 MiB | 16.93 MiB/s, done.
Resolving deltas: 100% (68/68), done.
Updating files: 100% (2420/2420), done.


In [2]:
#Library Imports
import pickle
import numpy as np

In [3]:
#retrieve training data
with open('deep_learning/Attention_model/train_qa.txt', 'rb') as f:
    train_data = pickle.load(f)

In [4]:
#retrieve test data
with open('deep_learning/Attention_model/test_qa.txt', 'rb') as f:
    test_data = pickle.load(f)

In [5]:
#Number of training instances
len(train_data)

10000

In [6]:
#Number of test instances
len(test_data)

1000

In [7]:
#Example of one of the instances
train_data[10]

(['Sandra',
  'went',
  'back',
  'to',
  'the',
  'hallway',
  '.',
  'Sandra',
  'moved',
  'to',
  'the',
  'office',
  '.'],
 ['Is', 'Sandra', 'in', 'the', 'office', '?'],
 'yes')

In [8]:
' '.join(train_data[10][0])

'Sandra went back to the hallway . Sandra moved to the office .'

In [9]:
' '.join(train_data[10][1])

'Is Sandra in the office ?'

In [10]:
train_data[10][2]

'yes'

In [11]:
#First we will build a set of all the words in the dataset:
vocab = set()
for story, question, answer in train_data:
    vocab = vocab.union(set(story)) #Set returns unique words in the sentence
                                    #Union returns the unique common elements from a two sets
    vocab = vocab.union(set(question))

In [12]:
vocab.add('no')
vocab.add('yes')

In [13]:
vocab

{'.',
 '?',
 'Daniel',
 'Is',
 'John',
 'Mary',
 'Sandra',
 'apple',
 'back',
 'bathroom',
 'bedroom',
 'discarded',
 'down',
 'dropped',
 'football',
 'garden',
 'got',
 'grabbed',
 'hallway',
 'in',
 'journeyed',
 'kitchen',
 'left',
 'milk',
 'moved',
 'no',
 'office',
 'picked',
 'put',
 'the',
 'there',
 'to',
 'took',
 'travelled',
 'up',
 'went',
 'yes'}

In [14]:
#Calculate len and add 1 for Keras placeholder - Placeholders are used to feed in the data to the network. 
#They need a data type, and have optional shape arguements.
#They will be empty at first, and then the data will get fed into the placeholder
vocab_len = len(vocab) + 1

In [15]:
vocab_len

38

In [16]:
#Now we are going to calculate the longest story and the longest question
#We need this for the Keras pad sequences. 
#Keras training layers expect all of the input to have the same length, so 
#we need to pad 
all_data = test_data + train_data

In [17]:
all_story_lens = [len(data[0]) for data in all_data]

In [18]:
max_story_len = (max(all_story_lens))

In [19]:
max_question_len = max([len(data[1]) for data in all_data])

## Vectorizing the data

In [20]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [21]:
#Create an instance of the tokenizer object:
tokenizer = Tokenizer(filters = [])
tokenizer.fit_on_texts(vocab)

In [22]:
#Dictionary that maps every word in our vocab to an index
# It has been automatically lowercased
#This tokenizer can give different indexes for different words depending on when we run it
tokenizer.word_index

{'football': 1,
 'dropped': 2,
 'moved': 3,
 'went': 4,
 'discarded': 5,
 'daniel': 6,
 'bathroom': 7,
 'back': 8,
 'down': 9,
 'sandra': 10,
 'yes': 11,
 'left': 12,
 'mary': 13,
 'in': 14,
 'kitchen': 15,
 'up': 16,
 'took': 17,
 'put': 18,
 'hallway': 19,
 'milk': 20,
 'the': 21,
 'is': 22,
 'garden': 23,
 '?': 24,
 'there': 25,
 'to': 26,
 '.': 27,
 'picked': 28,
 'bedroom': 29,
 'office': 30,
 'no': 31,
 'journeyed': 32,
 'travelled': 33,
 'apple': 34,
 'john': 35,
 'grabbed': 36,
 'got': 37}

In [23]:
#Tokenize the stories, questions and answers:
train_story_text = []
train_question_text = []
train_answers = []

In [24]:
#Separating each of the elements
for story,question,answer in train_data:
    train_story_text.append(story)
    train_question_text.append(question) 
    train_answers.append(answer)
    

In [25]:
#Coverting the text into the indexes 
train_story_seq = tokenizer.texts_to_sequences(train_story_text)

In [26]:
#Create a function for vectorizing the stories, questions and answers:
def vectorize_stories(data,word_index = tokenizer.word_index, max_story_len = max_story_len, max_question_len = max_question_len):
    #vectorized stories:
    X = []
    #vectorized questions:
    Xq = []
    #vectorized answers:
    Y = []
    
    for story, question, answer in data:
        #Getting indexes for each word in the story
        x = [word_index[word.lower()] for word in story]
        #Getting indexes for each word in the story
        xq = [word_index[word.lower()] for word in question]
        #For the answers
        y = np.zeros(len(word_index) + 1) #Index 0 Reserved when padding the sequences
        y[word_index[answer]] = 1
        
        X.append(x)
        Xq.append(xq)
        Y.append(y)
        
    #Now we have to pad these sequences:
    return(pad_sequences(X,maxlen=max_story_len), pad_sequences(Xq, maxlen=max_question_len), np.array(Y))
        

In [27]:
inputs_train, questions_train, answers_train = vectorize_stories(train_data)

In [28]:
inputs_test, questions_test, answers_test = vectorize_stories(test_data)

In [29]:
inputs_train[0]

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0, 13,  3, 26, 21,  7, 27, 10, 32, 26,
       21, 29, 27], dtype=int32)

In [30]:
train_story_text[0]

['Mary',
 'moved',
 'to',
 'the',
 'bathroom',
 '.',
 'Sandra',
 'journeyed',
 'to',
 'the',
 'bedroom',
 '.']

In [31]:
train_story_seq[0]

[13, 3, 26, 21, 7, 27, 10, 32, 26, 21, 29, 27]

## Building the Network

In [33]:
#Imports
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Input, Activation, Dense, Permute, Dropout, add, dot, concatenate, LSTM

In [34]:
# We need to create the placeholders 
#The Input function is used to create a keras tensor
#PLACEHOLDER shape = (max_story_len,batch_size)
#These are our placeholder for the inputs, ready to recieve batches of the stories and the questions
input_sequence = Input((max_story_len,)) #As we dont know batch size yet
question = Input((max_question_len,))

In [35]:
#Create input encoder M:
input_encoder_m = Sequential()
input_encoder_m.add(Embedding(input_dim=vocab_len,output_dim = 64)) #From paper
input_encoder_m.add(Dropout(0.3))

#Outputs: (Samples, story_maxlen,embedding_dim) -- Gives a list of the lenght of the samples where each item has the
#lenght of the max story lenght and every word is embedded in the embbeding dimension

In [36]:
#Create input encoder C:
input_encoder_c = Sequential()
input_encoder_c.add(Embedding(input_dim=vocab_len,output_dim = max_question_len)) #From paper
input_encoder_c.add(Dropout(0.3))

#Outputs: (samples, story_maxlen, max_question_len)

In [37]:
#Create question encoder:
question_encoder = Sequential()
question_encoder.add(Embedding(input_dim=vocab_len,output_dim = 64,input_length=max_question_len)) #From paper
question_encoder.add(Dropout(0.3))

#Outputs: (samples, question_maxlen, embedding_dim)

In [38]:
#Now lets encode the sequences, passing the placeholders into our encoders:
input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(question)

In [39]:
#Use dot product to compute similarity between input encoded m and question 
#Like in the paper:
match = dot([input_encoded_m,question_encoded], axes = (2,2))
match = Activation('softmax')(match)

In [40]:
#For the response we want to add this match with the ouput of input_encoded_c
response = add([match,input_encoded_c])
response = Permute((2,1))(response) #Permute Layer: permutes dimensions of input

In [41]:
#Once we have the response we can concatenate it with the question encoded:
answer = concatenate([response, question_encoded])

In [42]:
# Reduce the answer tensor with a RNN (LSTM)
answer = LSTM(32)(answer)

In [43]:
#Regularization with dropout:
answer = Dropout(0.5)(answer)
#Output layer:
answer = Dense(vocab_len)(answer) #Output shape: (Samples, Vocab_size) #Yes or no and all 0s

In [44]:
#Now we need to output a probability distribution for the vocab, using softmax:
answer = Activation('softmax')(answer)

In [45]:
#Now we build the final model:
model = Model([input_sequence,question], answer)

In [46]:
model.compile(optimizer='rmsprop', loss = 'categorical_crossentropy', metrics = ['accuracy'])
#Categorical instead of binary cross entropy as because of the way we are training
#we could actually see any of the words from the vocab as output
#however, we should only see yes or no

## Training and testing the model

In [47]:
history = model.fit([inputs_train,questions_train],answers_train, batch_size = 32, epochs = 10, validation_data = ([inputs_test,questions_test],answers_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [48]:
filename = 'Z_chatbot_100_epochs.h5'
model.save(filename)

In [49]:
#To load a model that we have already trained and saved:
model.load_weights('Z_chatbot_100_epochs.h5')

In [50]:
#Lets check out the predictions on the test set:
#These are just probabilities for every single word on the vocab
pred_results = model.predict(([inputs_test,questions_test]))



In [51]:
#First test data point
test_data[0]

(['Mary',
  'got',
  'the',
  'milk',
  'there',
  '.',
  'John',
  'moved',
  'to',
  'the',
  'bedroom',
  '.'],
 ['Is', 'John', 'in', 'the', 'kitchen', '?'],
 'no')

In [52]:
#These are the probabilities for the vocab words using the 1st sentence
pred_results[0]

array([2.2259530e-07, 1.8268750e-07, 2.1535004e-07, 1.9866428e-07,
       2.0431054e-07, 1.8660683e-07, 2.4095345e-07, 1.9344766e-07,
       1.7379344e-07, 1.7595177e-07, 2.3182170e-07, 4.6050417e-01,
       1.7740552e-07, 2.2139280e-07, 1.8970714e-07, 1.8913413e-07,
       1.9646330e-07, 2.3427356e-07, 1.7612267e-07, 2.0829370e-07,
       2.0681588e-07, 2.0721865e-07, 1.9322863e-07, 2.1326579e-07,
       1.9052156e-07, 2.0180472e-07, 2.0090334e-07, 1.9065679e-07,
       2.0848091e-07, 1.9787581e-07, 1.7418370e-07, 5.3948873e-01,
       2.1478600e-07, 2.1963641e-07, 1.7755548e-07, 1.7804926e-07,
       1.8592578e-07, 1.9482029e-07], dtype=float32)

In [53]:
val_max = np.argmax(pred_results[0])

In [54]:
for key,val in tokenizer.word_index.items():
    if val == val_max:
        k = key
print(k)

no


In [55]:
#See probability:
pred_results[0][val_max]

0.53948873

In [56]:
#Now, we can make our own questions using the vocabulary we have
vocab

{'.',
 '?',
 'Daniel',
 'Is',
 'John',
 'Mary',
 'Sandra',
 'apple',
 'back',
 'bathroom',
 'bedroom',
 'discarded',
 'down',
 'dropped',
 'football',
 'garden',
 'got',
 'grabbed',
 'hallway',
 'in',
 'journeyed',
 'kitchen',
 'left',
 'milk',
 'moved',
 'no',
 'office',
 'picked',
 'put',
 'the',
 'there',
 'to',
 'took',
 'travelled',
 'up',
 'went',
 'yes'}

In [57]:
my_story = 'Sandra picked up the milk . Mary travelled left . '

In [58]:
my_story.split()

['Sandra',
 'picked',
 'up',
 'the',
 'milk',
 '.',
 'Mary',
 'travelled',
 'left',
 '.']

In [59]:
my_question = 'Sandra got the milk ?'

In [60]:
my_question.split()

['Sandra', 'got', 'the', 'milk', '?']

In [61]:
#Put the data in the same format as before
my_data = [(my_story.split(), my_question.split(),'yes')]

In [62]:
#Vectorize this data
my_story, my_ques, my_ans = vectorize_stories(my_data)

In [63]:
#Make the prediction
pred_results = model.predict(([my_story,my_ques]))



In [64]:
val_max = np.argmax(pred_results[0])

In [None]:
#Correct prediction!
for key,val in tokenizer.word_index.items():
    if val == val_max:
        k = key
print(k)

In [None]:
#Confidence
pred_results[0][val_max]