In [1]:
import os
import numpy as np
import nltk
from keras.models import Sequential
from keras.layers.recurrent import LSTM, SimpleRNN
from keras.layers import Activation, TimeDistributed, Dense, RepeatVector, recurrent, Embedding
from keras.preprocessing.sequence import pad_sequences
import json

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def read_lines(file_path, separator, name_idx, content_idx, start_line = 0, limit = 10000):
    ln = 1
    line_added = 0
    prev_name = None
    sentences = []
    for line in open(file_path, 'r', encoding="utf-8"):

        if ln >= start_line + limit:
            if line_added % 2 == 0:
                break
            else:
                limit += 1
        elif ln < start_line:
            ln += 1
            continue
        ln += 1
        items = line.split(separator)
        name = items[name_idx].lower()
        content = items[content_idx].lower().replace('?', '').replace('!', '').replace( '.', '').replace( ',', '').replace( '-', '')
        words = content.split()
        if prev_name != name:
            sentences.append(words)
            line_added +=1
        else:
            #print(prev_name, "spoke again")
            sentences[-1].extend(words )
        prev_name = name
    #print(len(sentences))
    if len(sentences) % 2 != 0:
        sentences.pop();
    return sentences

In [3]:
#flatten_sentences = [word for sentence in sentences for word in sentence ]



In [4]:
def build_vocab_dict(sentences, padding_char, unknown_char):
    word_freq = nltk.FreqDist(np.hstack(sentences))
    word_freq.pop(padding_char, None)
    word_freq.pop(unknown_char, None)
    vocab = word_freq.most_common(VOCAB_SIZE - 2)

    vocab.insert(0, (padding_char,1))
    vocab.append( (unknown_char,1))
    
    vocab_dict = {pair[0]: id for id, pair in enumerate(vocab)}
    
    idx_dict = {idx:word for word, idx in vocab_dict.items()}
    print(idx_dict)
    return vocab_dict, idx_dict


#print(vocab_dict, idx_dict)

In [5]:
def save_vocab(vocab_dict):
    f = open("vocab_dict.json", 'w') 
    f.write(json.dumps(vocab_dict)) 
    f.close() 
def load_vocab():
    f = open("vocab_dict.json", 'r') 
    vocab_dict = json.loads(f.read() )
    f.close() 
    idx_dict = idx_dict = {idx:word for word, idx in vocab_dict.items()}
    return vocab_dict, idx_dict

In [6]:
def sentence_to_vec(sentences, vocab_dict, unknown_char, sentence_length):
    l = len(sentences)
    vec = []
    unk_idx = vocab_dict[unknown_char]

    for sen in sentences:
        vec.append( [vocab_dict[x] if x in vocab_dict else unk_idx for x in sen ][:20])

    padded = pad_sequences(vec, maxlen=sentence_length, dtype='int32')
    return padded

In [7]:
def to_one_hot(vec, sentence_length, vocab_length):
    print((len(vec), sentence_length, vocab_length))
    res = np.zeros((len(vec), sentence_length, vocab_length))
    for i, sen in enumerate(vec):
        for j, num in enumerate(sen):
            res[i, j, num] = 1
    return res

In [8]:
def create_model(x_vocab_len, x_max_len, y_vocab_len, y_max_len, hidden_size, num_layers):
    model = Sequential()

    # Creating encoder network
    model.add(Embedding(x_vocab_len, 1024, input_length=x_max_len, mask_zero=True))
    model.add(LSTM(hidden_size))
    model.add(RepeatVector(y_max_len))

    # Creating decoder network
    for _ in range(num_layers):
        model.add(LSTM(hidden_size, return_sequences=True))
    model.add(TimeDistributed(Dense(y_vocab_len)))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy',
            optimizer='rmsprop',
            metrics=['accuracy'])
    return model


In [9]:
SENTENCE_LENGTH = 20
VOCAB_SIZE = 10000
"""
sentences = read_lines("movie_lines.txt", " +++$+++ ", 3, 4, 0, 1000)

vocab_dict, idx_dict = build_vocab_dict(sentences, ' ', 'UNK')
#save_vocab(vocab_dict)
vocab_dict, idx_dict = load_vocab()
#print(vocab_dict, idx_dict)

#sentences=["hi","i'm good","hello","hello","hey","hey there", 'shit','i know right']
#sentences= sentences + sentences + sentences + sentences
#sentences= sentences + sentences + sentences + sentences
#sentences= sentences + sentences + sentences + sentences


x_sentences = [sentences[i] for i in range(len(sentences)) if i % 2 == 0]
y_sentences = [sentences[i] for i in range(len(sentences)) if i % 2 == 1]

x_vec = sentence_to_vec(x_sentences, vocab_dict, 'UNK', SENTENCE_LENGTH)
y_vec = sentence_to_vec(y_sentences, vocab_dict, 'UNK', SENTENCE_LENGTH)
y_vec = to_one_hot(y_vec, SENTENCE_LENGTH, VOCAB_SIZE)

print(x_vec.shape, y_vec.shape)

model = create_model(VOCAB_SIZE, SENTENCE_LENGTH, VOCAB_SIZE, SENTENCE_LENGTH, 1024, 3)
#model.summary()
model.load_weights("new_chatbot_model.h5")
"""

'\nsentences = read_lines("movie_lines.txt", " +++$+++ ", 3, 4, 0, 1000)\n\nvocab_dict, idx_dict = build_vocab_dict(sentences, \' \', \'UNK\')\n#save_vocab(vocab_dict)\nvocab_dict, idx_dict = load_vocab()\n#print(vocab_dict, idx_dict)\n\n#sentences=["hi","i\'m good","hello","hello","hey","hey there", \'shit\',\'i know right\']\n#sentences= sentences + sentences + sentences + sentences\n#sentences= sentences + sentences + sentences + sentences\n#sentences= sentences + sentences + sentences + sentences\n\n\nx_sentences = [sentences[i] for i in range(len(sentences)) if i % 2 == 0]\ny_sentences = [sentences[i] for i in range(len(sentences)) if i % 2 == 1]\n\nx_vec = sentence_to_vec(x_sentences, vocab_dict, \'UNK\', SENTENCE_LENGTH)\ny_vec = sentence_to_vec(y_sentences, vocab_dict, \'UNK\', SENTENCE_LENGTH)\ny_vec = to_one_hot(y_vec, SENTENCE_LENGTH, VOCAB_SIZE)\n\nprint(x_vec.shape, y_vec.shape)\n\nmodel = create_model(VOCAB_SIZE, SENTENCE_LENGTH, VOCAB_SIZE, SENTENCE_LENGTH, 1024, 3)\n#mo

In [10]:
def load_data(start_line, limit, vocab_dict):
    sentences = read_lines("movie_lines.txt", " +++$+++ ", 3, 4, start_line, limit)

    x_sentences = [sentences[i] for i in range(len(sentences)) if i % 2 == 0]
    y_sentences = [sentences[i] for i in range(len(sentences)) if i % 2 == 1]

    x_vec = sentence_to_vec(x_sentences, vocab_dict, 'UNK', SENTENCE_LENGTH)
    y_vec = sentence_to_vec(y_sentences, vocab_dict, 'UNK', SENTENCE_LENGTH)
    y_vec = to_one_hot(y_vec, SENTENCE_LENGTH, VOCAB_SIZE)

    print(x_vec.shape, y_vec.shape)
    return x_vec, y_vec

In [11]:
model = create_model(VOCAB_SIZE, SENTENCE_LENGTH, VOCAB_SIZE, SENTENCE_LENGTH, 1024, 3)
#model.summary()
model.load_weights("new_chatbot_model.h5")

In [12]:
vocab_dict, idx_dict = load_vocab()

In [58]:
batch_size = 1000
start_point = 216000
batch = 216
round = 0
while True:
    print("Getting data:", start_point, batch_size)
    x_vec, y_vec = load_data(start_point, batch_size, vocab_dict)
    if len(x_vec) == 0:
        start_point = 0
        batch = 0
        x_vec, y_vec = load_data(start_point, batch_size, vocab_dict)
        round += 1
    model.fit(x_vec, y_vec, batch_size=200, nb_epoch=1000, verbose =3)
    model.save_weights("new_chatbot_model.h5")
    start_point += batch_size
    batch += 1
    print("--------------------------------------------------------------------------")
    print("Round:", round, "Batch:", batch)
    print("--------------------------------------------------------------------------")

Getting data: 216000 1000
(474, 20, 10000)


MemoryError: 

In [59]:
sen = "died"
sen = sen.lower().replace('?', '').replace('!', '').replace( '.', '')
vec = sentence_to_vec([sen], vocab_dict, 'UNK', SENTENCE_LENGTH)
#print(model.predict(vec).shape)
res = model.predict(vec)

vec_y = np.argmax(res, axis=2)
" ".join([idx_dict[x] for x in vec_y[0]])

"    part             don't           if a you him you"

In [35]:
sen = "what do you mean"
sen = sen.lower().replace('?', '').replace('!', '').replace( '.', '')
vec = sentence_to_vec([sen], vocab_dict, 'UNK', SENTENCE_LENGTH)
#print(model.predict(vec).shape)
res = model.predict(vec)

vec_y = np.argmax(res, axis=2)
" ".join([idx_dict[x] for x in vec_y[0] if idx_dict[x]!= 'UNK'])

"                  be     so just and up that if that's please"