In [2]:
import json
import os
import re

import numpy as np
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences as Pad
from keras.preprocessing.text import Tokenizer

In [1]:
# load the data in 'movie_lines.txt' and the data in 'movie_conversations.txt'
# we will use the lineID sets in movie_convs to reconstruct the real conversations in movie_lines

def _read_lines(path):
    """Read a corpus file and return its content split on newline characters.

    The file is decoded as UTF-8 with undecodable bytes ignored. It is opened
    in a `with` block so the handle is closed as soon as the content is read.
    """
    with open(path, encoding='utf-8', errors='ignore') as corpus_file:
        return corpus_file.read().split('\n')


movie_lines = _read_lines('movie_lines.txt')
movie_convs = _read_lines('movie_conversations.txt')

# quick look at the raw record format of both files
print(movie_lines[:3])
print(movie_convs[:3])

['L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!', 'L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!', 'L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.']
["u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']", "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']", "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L200', 'L201', 'L202', 'L203']"]


In [None]:
# map every lineID to the sentence that was spoken on that line
id2sentence = {}
for raw_line in movie_lines:
    # fields are separated by ' +++$+++ '; a complete record has exactly 5 of them
    fields = raw_line.split(' +++$+++ ')
    if len(fields) != 5:
        continue  # incomplete record (e.g. a blank line): nothing to store
    line_id, sentence = fields[0], fields[4]  # first field is the lineID, last one is the text
    id2sentence[line_id] = sentence

print(id2sentence['L1045'])
        

In [None]:
# build a list containing all lineID sets, one list of lineIDs per conversation
conv_ID = []
for conv_line in movie_convs:
    # movie_convs was produced with split('\n'), so a file that ends with a
    # newline contributes an empty string; skip it instead of storing ['']
    if not conv_line.strip():
        continue
    # the last field looks like "['L194', 'L195', ...]"
    id_field = conv_line.split(' +++$+++ ')[-1]
    # drop the surrounding brackets, then the spaces and quotes around each ID
    id_field = id_field[1:-1].replace(' ', '').replace("'", "")
    conv_ID.append(id_field.split(','))
print(conv_ID[:3])

In [None]:
# split every conversation into (question, answer) pairs:
# each line is the question for the line that directly follows it

questions, answers = [], []

for ids in conv_ID:
    # pair each lineID with its successor; conversations with fewer than 2 lines yield nothing
    for question_id, answer_id in zip(ids, ids[1:]):
        questions.append(id2sentence[question_id])
        answers.append(id2sentence[answer_id])

print(questions[0])
print(answers[0])

In [None]:
# Ordered (pattern, replacement) rules applied by clean_text. Order matters:
# specific contractions must run before the generic "n't" rule.
_CLEAN_TEXT_RULES = (
    (r"i'm", "i am"),
    (r"he's", "he is"),
    (r"she's", "she is"),
    (r"it's", "it is"),
    (r"that's", "that is"),
    (r"what's", "what is"),
    (r"where's", "where is"),
    (r"there's", "there is"),
    (r"how's", "how is"),
    (r"\'ll", " will"),
    (r"\'ve", " have"),
    (r"\'re", " are"),
    (r"\'d", " would"),
    (r"won't", "will not"),
    (r"can't", "cannot"),
    (r"n't", " not"),
    # dropped-g endings such as "goin'" -> "going"; restricted to a word-final
    # "in'" so that possessives like "john's" are not turned into "johngs"
    (r"(?<=i)n'(?!\w)", "ng"),
    (r"'bout", "about"),
    (r"'til", "until"),
    # runs of spaces and double quotes become a single space
    (r'[" "]+', " "),
    # strip the remaining punctuation
    (r"[-()\"#/@;:<>{}`+=~|.!?,]", ""),
)


def clean_text(text):
    """Normalise a raw sentence for tokenization.

    The text is lower-cased and stripped, common English contractions are
    expanded, punctuation is removed, and runs of spaces are collapsed.

    Args:
        text: the raw sentence (str).

    Returns:
        The cleaned sentence (str), without leading or trailing whitespace
        and without consecutive spaces.
    """
    text = text.lower().strip()
    for pattern, replacement in _CLEAN_TEXT_RULES:
        text = re.sub(pattern, replacement, text)

    # removing punctuation can leave double or trailing spaces
    # (e.g. "hello - world"), so collapse and strip once more at the end
    return re.sub(r" +", " ", text).strip()

In [None]:
# normalise every question
clean_questions = [clean_text(raw_question) for raw_question in questions]

# normalise every answer and wrap it in the markers
# bos (beginning of sentence) and eos (end of sentence)
clean_answers = ['bos ' + clean_text(raw_answer) + ' eos' for raw_answer in answers]

print(clean_questions[0])
print(clean_answers[0])

In [None]:
# report how many cleaned questions and answers we have
print(f'number of questions: {len(clean_questions)}')
print(f'number of answers: {len(clean_answers)}')

In [None]:
# to make the model easier to experiment, we remove the Q&A pairs whose
# question or answer is too long or too short

minlen = 2
maxlen = 20
# every cleaned answer is wrapped in 'bos' ... 'eos'; these two markers must not
# count as content, otherwise the minlen check can never reject an answer
n_markers = 2

filtered_questions = []
filtered_answers = []

# single pass over the aligned pairs: keep a pair only if BOTH sides pass
for question, answer in zip(clean_questions, clean_answers):
    question_len = len(question.split())
    answer_len = len(answer.split())  # includes the bos/eos markers

    question_ok = minlen <= question_len <= maxlen
    # lower bound on the content words only; the upper bound still includes the
    # markers, so the longest kept answer is at most maxlen tokens as before
    answer_ok = answer_len - n_markers >= minlen and answer_len <= maxlen

    if question_ok and answer_ok:
        filtered_questions.append(question)
        filtered_answers.append(answer)

print(len(filtered_answers))
print(len(filtered_questions))

In [None]:
# show the first surviving pair (answer first, then its question)
print(filtered_answers[0], filtered_questions[0], sep='\n')

In [None]:
# build the input index sequence & vocabulary; build the output index sequence & vocabulary

# we build 2 separate vocabularies because it is convenient (some other code uses only one);
# machine translation models do the same, since source and target are different languages

vocabsize = 2500


def _tokenize_and_pad(sentences):
    """Fit a tokenizer on `sentences` and turn them into padded index sequences.

    Returns a tuple (tokenizer, sequences, padded):
      - tokenizer: Tokenizer built with num_words=vocabsize+1 and 'unk' as the
        out-of-vocabulary token
      - sequences: the index sequences produced by texts_to_sequences
      - padded: the same sequences padded at the end ('post') to a common
        length so that they can be fed to the model
    """
    tokenizer = Tokenizer(num_words=vocabsize + 1, oov_token='unk')
    tokenizer.fit_on_texts(sentences)
    sequences = tokenizer.texts_to_sequences(sentences)
    return tokenizer, sequences, Pad(sequences, padding='post')


# questions are the model input, answers are the model output
question_tokenizer, q_sequences, q_pad = _tokenize_and_pad(filtered_questions)
answer_tokenizer, a_sequences, a_pad = _tokenize_and_pad(filtered_answers)

In [None]:
# serialise both tokenizers so that they can be written to disk
q_token_json, a_token_json = (
    tokenizer.to_json() for tokenizer in (question_tokenizer, answer_tokenizer)
)

In [None]:
# write the tokenizers and the padded sequences under 'preprocessed_data/'

# open(..., 'w') does not create missing parent directories, so make sure the
# output directory exists before writing anything into it
os.makedirs('preprocessed_data', exist_ok=True)

# NOTE: the to_json() result is passed through json.dumps exactly as before, so
# the on-disk format is unchanged for whatever code loads these files.
# The `with` block closes each file on exit; no explicit close() is needed.
with open('preprocessed_data/questions.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(q_token_json, ensure_ascii=False))

with open('preprocessed_data/answers.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(a_token_json, ensure_ascii=False))

# the arrays are passed positionally, so np.savez stores them under the
# default keys 'arr_0' (questions) and 'arr_1' (answers)
np.savez('preprocessed_data/data.npz', q_pad, a_pad)