#### Imports

In [2]:
import sys
import os
import pandas as pd
import numpy as np
import re
import nltk

from keras.layers import Input, Embedding, LSTM, TimeDistributed, Dense, Bidirectional
from keras.models import Model, load_model

INPUT_LENGTH = 20
OUTPUT_LENGTH = 20


Using TensorFlow backend.


### Reading data

In [5]:
lines = open('cornell-moviedialog-corpus/movie_lines.txt', encoding='utf-8', errors='ignore').read().split('\n')
conv_lines = open('cornell-moviedialog-corpus/movie_conversations.txt', encoding='utf-8', errors='ignore').read().split('\n')

---
Sample of the data

---

In [12]:
lines[1:10]

['L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!',
 'L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.',
 'L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?',
 "L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.",
 'L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow',
 "L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie.",
 'L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No',
 'L870 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I\'m kidding.  You know how sometimes you just become this "persona"?  And you don\'t know how to quit?',
 'L869 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Like my fear of wearing pastels?']

### Extract the conversations for POC 1 from the dataset

In [13]:
# Create a dictionary to map each line's id with its text
id2line = {}
for line in lines:
    _line = line.split(' +++$+++ ')
    if len(_line) == 5:
        if _line[2] == 'm159' :
            id2line[_line[0]] = _line[4]

In [17]:
len(id2line)

472

In [50]:
# Create a list of all of the conversations' lines' ids.
convs = []
for line in conv_lines[:-1]:
    _line = line.split(' +++$+++ ')
    if len(_line) >= 4:
        if _line[2] == 'm159':
            _line = _line[-1][1:-1].replace("'","").replace(" ","")
            convs.append(_line.split(','))

In [51]:
len(convs)

158

### Sample of conversation

In [52]:
for k in convs[100]:
    print (k, id2line[k])

L441948 Raise the sails.
L441949 The wind is quarter from astern ... by the time we're underway, we'll never catch them.
L441950 We need only to come about, to put them in range of the long nines.


---
Sort the sentences into questions (inputs) and answers (targets)

---

In [53]:
questions = []
answers = []
for conv in convs:
    for i in range(len(conv)-1):
        questions.append(id2line[conv[i]])
        answers.append(id2line[conv[i+1]])
        
# Compare lengths of questions and answers
print(len(questions))
print(len(answers))

314
314


---
### Handling Dataset size issue

Since this dataset is very small in size, we append the dialogs from the 4 other POC movies to this.
The scripts for these movies are publicly available from multiple sources. We scrape the movie scripts into text files and the extract question answer pairs from them.

---

In [74]:
ls -al POC*

-rw-r--r--@ 1 camelliadebnath  staff   76547 May 22 12:28 POC1.txt
-rw-r--r--@ 1 camelliadebnath  staff   97258 May 21 16:49 POC2.txt
-rw-r--r--@ 1 camelliadebnath  staff   79823 May 22 12:42 POC3.txt
-rw-r--r--@ 1 camelliadebnath  staff  111840 May 22 12:46 POC4.txt
-rw-r--r--@ 1 camelliadebnath  staff  120175 May 22 12:46 POC5.txt


In [134]:
def processScript(scriptFile, movie)->pd.DataFrame:
    lines = scriptFile.readlines()
    dialogs = [re.sub(r'[\(\[].*?[\)\]]', '', x) for x in lines]
    dialog_df = []
    scene = 0
    for l in dialogs:
        row = l.split(':')
        if len(row)>= 2:
            if row[0].startswith('SCENE') or row[0].startswith('Scene'):
                m = re.search(r'\d+$', row[0])
                scene = int(m.group())
            else:
                dialog_df.append({'Movie':movie, 'Scene':scene, 'Character':row[0],'Dialog':row[1].rstrip()})
    dialog_df = pd.DataFrame(dialog_df)
    return dialog_df

In [135]:
f2 = open("POC2.txt", "r")
poc2 = processScript(f2, 2)

f3 = open("POC3.txt", "r")
poc3 = processScript(f3, 3)

f4 = open("POC4.txt", "r")
poc4 = processScript(f4, 4)

f5 = open("POC5.txt", "r")
poc5 = processScript(f5, 5)

print(poc2.shape)
poc = poc2.append(poc3).append(poc4).append(poc5)

print(poc.shape)

(826, 4)
(2761, 4)


In [142]:
poc.iloc[0]

Movie                                     2
Scene                                     1
Character                         Elizabeth
Dialog         Will. Why is this happening?
Name: 0, dtype: object

In [143]:
for i in range(poc.shape[0]-1):
    questions.append(poc.iloc[i].Dialog)
    answers.append(poc.iloc[i+1].Dialog)
        
# Compare lengths of questions and answers
print(len(questions))
print(len(answers))

3074
3074


---
Text Cleaning

---

In [144]:
def clean_text(text):
    '''Clean text by removing unnecessary characters and altering the format of words.'''
    text = text.lower()
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "that is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"n'", "ng", text)
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|]", "", text)
#     text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)
    text = " ".join(text.split())
    return text

In [145]:
# Clean the data
clean_questions = []
for question in questions:
    clean_questions.append(clean_text(question))
clean_answers = []    
for answer in answers:
    clean_answers.append(clean_text(answer))

In [146]:
# Find the length of sentences (not using nltk due to processing speed)
lengths = []
# lengths.append([len(nltk.word_tokenize(sent)) for sent in clean_questions]) #nltk approach
for question in clean_questions:
    lengths.append(len(question.split()))
for answer in clean_answers:
    lengths.append(len(answer.split()))
# Create a dataframe so that the values can be inspected
lengths = pd.DataFrame(lengths, columns=['counts'])
print(np.percentile(lengths, 80))
print(np.percentile(lengths, 85))
print(np.percentile(lengths, 90))
print(np.percentile(lengths, 95))

15.0
18.0
22.0
30.0


In [148]:
# Remove questions and answers that are shorter than 1 word and longer than 20 words.
min_line_length = 1
max_line_length = 20

# Filter out the questions that are too short/long
short_questions_temp = []
short_answers_temp = []

for i, question in enumerate(clean_questions):
    if len(question.split()) >= min_line_length and len(question.split()) <= max_line_length:
        short_questions_temp.append(question)
        short_answers_temp.append(clean_answers[i])

# Filter out the answers that are too short/long
short_questions = []
short_answers = []

for i, answer in enumerate(short_answers_temp):
    if len(answer.split()) >= min_line_length and len(answer.split()) <= max_line_length:
        short_answers.append(answer)
        short_questions.append(short_questions_temp[i])
        
print(len(short_questions))
print(len(short_answers))

2408
2408


### Sample question-answer pairs

In [152]:
r = np.random.randint(1,len(short_questions))

for i in range(r, r+3):
    print(short_questions[i])
    print(short_answers[i])
    print()

anyone?
if we had something sharp, i could pick that lock.

if we had something sharp, i could pick that lock.
you are not going to find anything that sharp here.

you are not going to find anything that sharp here.
keep still, you bilge rat. captain!



### Train-Test Validation

In [153]:
#choosing number of samples
num_samples = 2200  # Number of samples to train on.
short_questions = short_questions[:num_samples]
short_answers = short_answers[:num_samples]
#tokenizing the qns and answers
short_questions_tok = [nltk.word_tokenize(sent) for sent in short_questions]
short_answers_tok = [nltk.word_tokenize(sent) for sent in short_answers]

In [154]:
#train-validation split
data_size = len(short_questions_tok)

# We will use the first 0-80th %-tile (80%) of data for the training
training_input  = short_questions_tok[:round(data_size*(80/100))]
training_input  = [tr_input[::-1] for tr_input in training_input] #reverseing input seq for better performance
training_output = short_answers_tok[:round(data_size*(80/100))]

# We will use the remaining for validation
validation_input = short_questions_tok[round(data_size*(80/100)):]
validation_input  = [val_input[::-1] for val_input in validation_input] #reverseing input seq for better performance
validation_output = short_answers_tok[round(data_size*(80/100)):]

print('training size', len(training_input))
print('validation size', len(validation_input))

training size 1760
validation size 440


### Word en/decoding dictionaries

In [155]:
# Create a dictionary for the frequency of the vocabulary
# Create 
vocab = {}
for question in short_questions_tok:
    for word in question:
        if word not in vocab:
            vocab[word] = 1
        else:
            vocab[word] += 1

for answer in short_answers_tok:
    for word in answer:
        if word not in vocab:
            vocab[word] = 1
        else:
            vocab[word] += 1 

In [157]:
# Remove rare words from the vocabulary.
# We will aim to replace fewer than 5% of words with <UNK>
# You will see this ratio soon.
threshold = 5
count = 0
for k,v in vocab.items():
    if v >= threshold:
        count += 1
        
        
print("Size of total vocab:", len(vocab))
print("Size of vocab we will use:", count)        

Size of total vocab: 2682
Size of vocab we will use: 796


In [158]:
#we will create dictionaries to provide a unique integer for each word.
WORD_CODE_START = 1
WORD_CODE_PADDING = 0


word_num  = 2 #number 1 is left for WORD_CODE_START for model decoder later
encoding = {}
decoding = {1: 'START'}
for word, count in vocab.items():
    if count >= threshold: #get vocabularies that appear above threshold count
        encoding[word] = word_num 
        decoding[word_num ] = word
        word_num += 1

print("No. of vocab used:", word_num)


No. of vocab used: 798


In [159]:
#include unknown token for words not in dictionary
decoding[len(encoding)+2] = '<UNK>'
encoding['<UNK>'] = len(encoding)+2

In [160]:
dict_size = word_num+1
dict_size

799

### Vectorization

In [161]:
def transform(encoding, data, vector_size=20):
    """
    :param encoding: encoding dict built by build_word_encoding()
    :param data: list of strings
    :param vector_size: size of each encoded vector
    """
    transformed_data = np.zeros(shape=(len(data), vector_size))
    for i in range(len(data)):
        for j in range(min(len(data[i]), vector_size)):
            try:
                transformed_data[i][j] = encoding[data[i][j]]
            except:
                transformed_data[i][j] = encoding['<UNK>']
    return transformed_data

In [162]:
#encoding training set
encoded_training_input = transform(
    encoding, training_input, vector_size=INPUT_LENGTH)
encoded_training_output = transform(
    encoding, training_output, vector_size=OUTPUT_LENGTH)

print('encoded_training_input', encoded_training_input.shape)
print('encoded_training_output', encoded_training_output.shape)


encoded_training_input (1760, 20)
encoded_training_output (1760, 20)


In [163]:
#encoding validation set
encoded_validation_input = transform(
    encoding, validation_input, vector_size=INPUT_LENGTH)
encoded_validation_output = transform(
    encoding, validation_output, vector_size=OUTPUT_LENGTH)

print('encoded_validation_input', encoded_validation_input.shape)
print('encoded_validation_output', encoded_validation_output.shape)

encoded_validation_input (440, 20)
encoded_validation_output (440, 20)


# Model Building

### Sequence-to-Sequence in Keras

In [164]:
import tensorflow as tf
tf.keras.backend.clear_session()

In [165]:
INPUT_LENGTH = 20
OUTPUT_LENGTH = 20

encoder_input = Input(shape=(INPUT_LENGTH,))
decoder_input = Input(shape=(OUTPUT_LENGTH,))

In [166]:
from keras.layers import SimpleRNN

encoder = Embedding(dict_size, 128, input_length=INPUT_LENGTH, mask_zero=True)(encoder_input)
encoder = LSTM(512, return_sequences=True, unroll=True)(encoder)
encoder_last = encoder[:,-1,:]

print('encoder', encoder)
print('encoder_last', encoder_last)

decoder = Embedding(dict_size, 128, input_length=OUTPUT_LENGTH, mask_zero=True)(decoder_input)
decoder = LSTM(512, return_sequences=True, unroll=True)(decoder, initial_state=[encoder_last, encoder_last])

print('decoder', decoder)

# For the plain Sequence-to-Sequence, we produced the output from directly from decoder
# output = TimeDistributed(Dense(output_dict_size, activation="softmax"))(decoder)

encoder Tensor("lstm_1/transpose_2:0", shape=(None, 20, 512), dtype=float32)
encoder_last Tensor("strided_slice:0", shape=(None, 512), dtype=float32)
decoder Tensor("lstm_2/transpose_2:0", shape=(None, 20, 512), dtype=float32)


## Attention Mechanism

In [167]:
from keras.layers import Activation, dot, concatenate

# Equation (7) with 'dot' score from Section 3.1 in the paper.
# Note that we reuse Softmax-activation layer instead of writing tensor calculation
attention = dot([decoder, encoder], axes=[2, 2])
attention = Activation('softmax', name='attention')(attention)
print('attention', attention)

context = dot([attention, encoder], axes=[2,1])
print('context', context)

decoder_combined_context = concatenate([context, decoder])
print('decoder_combined_context', decoder_combined_context)

# Has another weight + tanh layer as described in equation (5) of the paper
output = TimeDistributed(Dense(512, activation="tanh"))(decoder_combined_context)
output = TimeDistributed(Dense(dict_size, activation="softmax"))(output)
print('output', output)

attention Tensor("attention/truediv:0", shape=(None, 20, 20), dtype=float32)
context Tensor("dot_2/MatMul:0", shape=(None, 20, 512), dtype=float32)
decoder_combined_context Tensor("concatenate_1/concat:0", shape=(None, 20, 1024), dtype=float32)
output Tensor("time_distributed_2/Reshape_1:0", shape=(None, 20, 799), dtype=float32)


In [168]:
model = Model(inputs=[encoder_input, decoder_input], outputs=[output])
model.compile(optimizer='adam', loss='binary_crossentropy')
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 20, 128)      102272      input_2[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 20, 128)      102272      input_1[0][0]                    
____________________________________________________________________________________________

In [169]:
training_encoder_input = encoded_training_input
training_decoder_input = np.zeros_like(encoded_training_output)
training_decoder_input[:, 1:] = encoded_training_output[:,:-1]
training_decoder_input[:, 0] = WORD_CODE_START
training_decoder_output = np.eye(dict_size)[encoded_training_output.astype('int')]

validation_encoder_input = encoded_validation_input
validation_decoder_input = np.zeros_like(encoded_validation_output)
validation_decoder_input[:, 1:] = encoded_validation_output[:,:-1]
validation_decoder_input[:, 0] = WORD_CODE_START
validation_decoder_output = np.eye(dict_size)[encoded_validation_output.astype('int')]

In [170]:
model.fit(x=[training_encoder_input, training_decoder_input], y=[training_decoder_output],
          validation_data=([validation_encoder_input, validation_decoder_input], [validation_decoder_output]),
          #validation_split=0.05,
          batch_size=64, epochs=100)

model.save('model_attention.h5')

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1760 samples, validate on 440 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100


Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


  '. They will not be included '


### Model Testing

In [171]:
def prediction(raw_input):
    clean_input = clean_text(raw_input)
    input_tok = [nltk.word_tokenize(clean_input)]
    input_tok = [input_tok[0][::-1]]  #reverseing input seq
    encoder_input = transform(encoding, input_tok, 20)
    decoder_input = np.zeros(shape=(len(encoder_input), OUTPUT_LENGTH))
    decoder_input[:,0] = WORD_CODE_START
    for i in range(1, OUTPUT_LENGTH):
        output = model.predict([encoder_input, decoder_input]).argmax(axis=2)
        decoder_input[:,i] = output[:,i]
    return output

def decode(decoding, vector):
    """
    :param decoding: decoding dict built by word encoding
    :param vector: an encoded vector
    """
    text = ''
    for i in vector:
        if i == 0:
            break
        text += ' '
        text += decoding[i]
    return text

In [199]:
texts = ["tell me the weather", "I need help", "can you help me?", "where am I?",
        "who are you?", "where is the ship?", "who are you?", "who is a good boy?",
        "Am I a good boy?"]

for text in texts:
    output = prediction(text)
    print ('Q:', text)
    print ('A:', decode(decoding, output[0]))
    print()

Q: tell me the weather
A:  no .

Q: I need help
A:  you <UNK> me , <UNK> me , hurry !

Q: can you help me?
A:  no ... you are be my father .

Q: where am I?
A:  you are beautiful .

Q: who are you?
A:  no .

Q: where is the ship?
A:  and there were a very <UNK> . without his face familiar to <UNK> .

Q: who are you?
A:  no .

Q: who is a good boy?
A:  no .

Q: Am I a good boy?
A:  ah , that is a <UNK> and <UNK> <UNK> ... in <UNK> .

