In [1]:
import numpy as np
import tensorflow as tf
import re
import time
import os
from collections import defaultdict
from joblib import dump

#### Setting up working directory information

In [2]:
# All inputs and outputs live in sub-folders of the directory the notebook is started from.
working_directory = os.getcwd()
data_directory, models_directory = (
    os.path.join(working_directory, sub_folder) for sub_folder in ('data', 'models')
)

#### Load the data

In [3]:
# Read both corpus files fully into memory, one list entry per line.
# `with` guarantees the file handles are closed (the bare open(...).read() leaked them);
# errors='ignore' drops bytes that are not valid UTF-8 instead of raising.
with open(os.path.join(data_directory, 'movie_lines.txt'), encoding='utf-8', errors='ignore') as lines_file:
    lines = lines_file.read().split('\n')
with open(os.path.join(data_directory, 'movie_conversations.txt'), encoding='utf-8', errors='ignore') as conversations_file:
    conversations = conversations_file.read().split('\n')

# Path of the pre-trained word vectors; the file itself is only read later.
word2vec_file = os.path.join(data_directory, "glove.6B.100d.txt")

In [4]:
# Field delimiter used inside each record of movie_lines.txt and movie_conversations.txt.
data_separator = " +++$+++ "

In [5]:
# Peek at the first five raw records of the lines file
lines[:5]

['L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!',
 'L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!',
 'L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.',
 'L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?',
 "L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go."]

In [6]:
# Build a lookup from line id (first field) to the spoken text (last field).
# Records that do not split into exactly five fields are ignored.
id_to_line = {
    fields[0]: fields[-1].strip()
    for fields in (record.split(data_separator) for record in lines)
    if len(fields) == 5
}

In [7]:
# Peek at the first five raw conversation records
conversations[:5]

["u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L200', 'L201', 'L202', 'L203']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L204', 'L205', 'L206']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L207', 'L208']"]

In [8]:
# The last field of a conversation record is a stringified list such as
# "['L194', 'L195']". Removing quotes, whitespace and brackets leaves "L194,L195",
# which is then split into the individual line ids.
junk_characters = r"['\s\[\]]"
conversations_ids = [
    re.sub(junk_characters, "", conv.split(data_separator)[-1]).split(",")
    for conv in conversations
    # Skip blank entries (e.g. the empty string produced by the file's trailing
    # newline) instead of blindly dropping the last element, which would lose a
    # real conversation when the file does not end with a newline.
    if conv.strip()
]

In [9]:
# Line ids of the first conversation, in the order they appear in the record.
conversations_ids[0]

['L194', 'L195', 'L196', 'L197']

#### Create Questions and Answers data
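Each entry of `conversations_ids` is the ordered list of line ids in one conversation. Every pair of consecutive lines is used as a (question, answer) example, so a line in the middle of a conversation serves both as the answer to the previous line and as the question for the next one.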

In [10]:
# Parallel lists: answers[k] is the reply that directly follows questions[k].
questions = []
answers = []

for conv in conversations_ids:
    # Slide over consecutive utterances: (u0, u1), (u1, u2), ...
    for question_id, answer_id in zip(conv, conv[1:]):
        # id_to_line omits malformed records, so a conversation may reference an
        # id that has no text; skip that pair instead of failing with KeyError.
        if question_id not in id_to_line or answer_id not in id_to_line:
            continue
        questions.append(id_to_line[question_id])
        answers.append(id_to_line[answer_id])

##### Clean the text

In [11]:
# Ordered table of regex pattern -> replacement, applied top to bottom on
# lowercased text. Order matters: specific forms ("won't", "can't") must run
# before the generic "n't", and punctuation is stripped last so the apostrophes
# are still present while the contractions are expanded.
common_contractions = {
    r"i'm": "i am",
    r"he's": "he is",
    r"she's": "she is",
    r"that's": "that is",
    r"what's": "what is",
    r"where's": "where is",
    r"\'ll": " will",
    r"\'ve": " have",
    r"\'re": " are",
    r"\'d": " would",
    r"won't": "will not",
    r"can't": "can not",
    r"n't": " not",
    r"&": "and",
    r"it's": "it is",
    r"how's": "how is",
    # Punctuation to delete. Every character is listed explicitly and the hyphen
    # is escaped: the previous class contained "=-`", which the regex engine read
    # as the RANGE '=' .. '`' and therefore also matched A-Z, '\' and '^'.
    # '\' and '^' are kept here so lowercased input is cleaned exactly as before;
    # uppercase letters are no longer swallowed.
    r"[$()\"#/@;:<>{}+=\-`|.?,\'*%_\[\]\\^]": "",
}

In [12]:
def clean_text(text):
    """Lowercase `text` and apply every substitution in `common_contractions`.

    The input is converted with str() first, so non-string values are accepted.
    Substitutions run in the dictionary's iteration order; each one operates on
    the result of the previous one.
    """
    cleaned = str(text).lower()
    for pattern in common_contractions:
        cleaned = re.sub(pattern, common_contractions[pattern], cleaned)
    return cleaned

In [13]:
# Questions are only cleaned; answers are additionally wrapped in start/end markers.
cleaned_questions = list(map(clean_text, questions))
cleaned_answers = [f"<SOS> {cleaned} <EOS>" for cleaned in map(clean_text, answers)]

In [14]:
# Peek at the first five cleaned answers, including their <SOS>/<EOS> markers
cleaned_answers[:5]

['<SOS> well i thought we would start with pronunciation if that is okay with you <EOS>',
 '<SOS> not the hacking and gagging and spitting part  please <EOS>',
 '<SOS> okay then how bout we try out some french cuisine  saturday  night <EOS>',
 '<SOS> forget it <EOS>',
 '<SOS> cameron <EOS>']

#### Other processing - Tokenization, padding, and converting to index

In [15]:
def get_max_len(data, cap=512):
    """Return the whitespace-token length of the longest text in `data`, limited to `cap`.

    Args:
        data: iterable of strings.
        cap: upper bound for the returned length.

    Returns:
        min(longest token count, cap); 0 when `data` is empty (previously an
        empty input raised ValueError from max()).
    """
    longest = max((len(text.split()) for text in data), default=0)
    return min(longest, cap)

def create_tokenizer(data, vocab_size=5000):
    """Fit a Keras Tokenizer on `data` and return it with both vocabulary lookups.

    Args:
        data: texts passed to Tokenizer.fit_on_texts.
        vocab_size: forwarded to the Tokenizer as `num_words`.

    Returns:
        (tokenizer, word_idx_map, idx_word_map) where word_idx_map is the
        tokenizer's `word_index` and idx_word_map is its inverse.
    """
    # Imported lazily so the heavy Keras import only happens when a tokenizer is built.
    from keras.preprocessing.text import Tokenizer

    fitted = Tokenizer(num_words=vocab_size)
    fitted.fit_on_texts(data)

    word_to_index = fitted.word_index
    index_to_word = dict(zip(word_to_index.values(), word_to_index.keys()))
    return fitted, word_to_index, index_to_word

def tokenize_sentences_and_pad(data, tokenizer, max_len, padding='post', truncating='pre'):
    """Convert texts to index sequences and pad/truncate them to `max_len`.

    Args:
        data: texts passed to `tokenizer.texts_to_sequences`.
        tokenizer: a fitted tokenizer exposing `texts_to_sequences`.
        max_len: target length of every returned sequence.
        padding: 'post' (default, as before) pads at the end, 'pre' at the start.
        truncating: which side of an over-long sequence is cut. The default
            'pre' is the Keras default and matches the previous behaviour, but
            it removes the BEGINNING of the sequence (e.g. a leading start
            marker); pass 'post' to keep the start and drop the tail.

    Returns:
        The result of keras `pad_sequences` for the tokenized texts.
    """
    # Imported lazily so the heavy Keras import only happens when it is needed.
    from keras.preprocessing.sequence import pad_sequences

    tokenized_sents = tokenizer.texts_to_sequences(data)
    return pad_sequences(
        tokenized_sents, maxlen=max_len, padding=padding, truncating=truncating
    )


def w2v_create_embeddings_matrix(embeddings_file, word_index_mapping, emb_dim=100):
    """Build an embedding matrix for a vocabulary from a text file of word vectors.

    The file is expected to hold one entry per line: the word followed by its
    whitespace-separated vector components.

    Args:
        embeddings_file: path of the UTF-8 encoded vector file.
        word_index_mapping: dict mapping word -> row index in the matrix.
        emb_dim: number of columns; longer file vectors are truncated to it.

    Returns:
        (embedding_matrix, vocab_size): a float array of shape
        (vocab_size, emb_dim) and vocab_size = len(word_index_mapping) + 1.
        Rows of words that are absent from the file stay all-zero.
    """
    # One extra row because index 0 is not assigned to any word; it is the value
    # used for padding. (Assumes indices run 1..len(mapping) - confirm for other tokenizers.)
    vocab_size = len(word_index_mapping) + 1
    embedding_matrix = np.zeros((vocab_size, emb_dim))
    with open(embeddings_file, encoding="utf-8") as f:
        for line in f:
            fields = line.split()
            # Skip blank lines and entries with fewer than emb_dim components;
            # previously these crashed the unpacking / the row assignment.
            if len(fields) < emb_dim + 1:
                continue
            idx = word_index_mapping.get(fields[0])
            if idx is None:
                continue
            # Only the first emb_dim components are parsed and stored.
            embedding_matrix[idx] = np.asarray(
                fields[1:emb_dim + 1], dtype=np.float32)
    return embedding_matrix, vocab_size

In [16]:
# Sequence length and vocabulary are derived from questions and answers together,
# so both sides share one tokenizer and one padded length.
all_sentences = cleaned_questions + cleaned_answers
MAX_LEN = get_max_len(all_sentences)
tokenizer, word_idx_map, idx_word_map = create_tokenizer(all_sentences, vocab_size=20000)
questions_tokenized, answers_tokenized = (
    tokenize_sentences_and_pad(sentences, tokenizer, MAX_LEN)
    for sentences in (cleaned_questions, cleaned_answers)
)

Using TensorFlow backend.


In [17]:
# Look up a pre-trained vector for every vocabulary word; the returned
# vocabulary size is not needed here and is discarded.
embeddings_matrix, _ = w2v_create_embeddings_matrix(
    embeddings_file=word2vec_file,
    word_index_mapping=word_idx_map,
    emb_dim=100,
)

In [18]:
# Number of distinct words in the tokenizer's word index. This is the full
# vocabulary: it is larger than the vocab_size=20000 passed to create_tokenizer,
# and the embedding matrix built from word_idx_map has one row more than this.
len(idx_word_map)

64841

In [19]:
# First five padded question sequences (trailing zeros are padding)
print(questions_tokenized[:5])

[[  33   20  101 ...    0    0    0]
 [  58    4  134 ...    0    0    0]
 [   8    5 8938 ...    0    0    0]
 [   3   15  557 ...    0    0    0]
 [  30   30   10 ...    0    0    0]]


In [20]:
# First five padded answer sequences (trailing zeros are padding)
print(answers_tokenized[:5])

[[   1   58    4 ...    0    0    0]
 [   1    8    5 ...    0    0    0]
 [   1  105   86 ...    0    0    0]
 [   1  326   10 ...    0    0    0]
 [   1 5695    2 ...    0    0    0]]


In [21]:
# MAX_LEN is min(longest sentence in tokens, cap) with get_max_len's default cap of 512;
# every tokenized question and answer above is padded to this length.
print(MAX_LEN)

512


#### Saving outputs to file

In [22]:
# Persist all artifacts with joblib.dump. NOTE: the ".h5" suffix is kept so
# existing consumers still find the files, but they are written by joblib
# (not HDF5) and must be read back with joblib.load.

# The data directory already exists (the corpus was read from it), but the
# models directory may not on a fresh checkout; dump() would then fail.
os.makedirs(models_directory, exist_ok=True)

for _artifact, _folder, _filename in (
    (tokenizer, models_directory, "tokenizer.h5"),
    (word_idx_map, models_directory, "word_idx_map.h5"),
    (idx_word_map, models_directory, "idx_word_map.h5"),
    (embeddings_matrix, models_directory, "embeddings_matrix.h5"),
    (questions_tokenized, data_directory, "questions_tokenized.h5"),
    (answers_tokenized, data_directory, "answers_tokenized.h5"),
):
    dump(_artifact, os.path.join(_folder, _filename))

['/Users/ss_0002/Documents/work/other-repos/chatbot/Seq_to_Seq/data/answers_tokenized.h5']