In [217]:
import gensim
import os
import numpy as np
import nltk
from keras.models import Sequential
from keras.layers.recurrent import LSTM, SimpleRNN
from keras.layers import Activation, TimeDistributed, Dense, RepeatVector, recurrent, Embedding
from keras.preprocessing.sequence import pad_sequences
import gensim
import json

In [87]:
SENTENCE_LENGTH = 20

In [88]:
def read_lines(file_path, separator, name_idx, content_idx, limit = 10000):
    ln = 1
    prev_name = None
    sentences = []
    all_words = set()
    for line in open(file_path, 'r', encoding="utf-8"):
        #print(ln)
        if ln >= limit:
            break
        ln += 1
        items = line.split(separator)
        name = items[name_idx].lower()
        content = items[content_idx].lower().replace('?', '').replace('!', '').replace( '.', '')
        words = content.split()
        all_words.update(words)
        if prev_name != name:
            sentences.append(words)
        else:
            #print(prev_name, "spoke again")
            sentences[-1].extend([" "] + words )
        prev_name = name
    return sentences, list(all_words)



sentences, all_words = read_lines("movie_lines.txt", " +++$+++ ", 3, 4, 1000)



In [89]:
def build_vocab_dict(all_words, padding_char, unknown_char):
    all_words.insert(0, padding_char)
    all_words.append(unknown_char)
    vocab_dict = {word: id for id, word in enumerate(all_words)}
    return vocab_dict

vocab_dict = build_vocab_dict(all_words, ' ', 'UNK')

In [119]:
idx_dict = {idx:word for word, idx in vocab_dict.items()}

In [219]:
f = open("vocab_dict.json", 'w') 
f.write(json.dumps(idx_dict)) 
f.close() 

In [90]:
x_sentences = [sentences[i] for i in range(len(sentences)) if i % 2 == 0]
y_sentences = [sentences[i] for i in range(len(sentences)) if i % 2 == 1]

In [102]:
def sentence_to_vec(sentences, vocab_dict, unknown_char, sentence_length):
    l = len(sentences)
    vec = []
    unk_idx = vocab_dict[unknown_char]

    for sen in sentences:
        vec.append( [vocab_dict[x] if x in vocab_dict else unk_idx for x in sen ][:20])

    padded = pad_sequences(vec, maxlen=sentence_length, dtype='int32')
    return padded

In [104]:
def to_one_hot(vec, sentence_length, vocab_length):
    res = np.zeros((len(vec), sentence_length, vocab_length))
    for i, sen in enumerate(vec):
        for j, num in enumerate(sen):
            res[i, j, num] = 1
    return res

x_sentences = [sentences[i] for i in range(len(sentences)) if i % 2 == 0]
y_sentences = [sentences[i] for i in range(len(sentences)) if i % 2 == 1]

x_vec = sentence_to_vec(x_sentences, vocab_dict, 'UNK', SENTENCE_LENGTH)
y_vec = sentence_to_vec(y_sentences, vocab_dict, 'UNK', SENTENCE_LENGTH)
y_vec = to_one_hot(y_vec, SENTENCE_LENGTH, len(vocab_dict))

In [95]:
def create_model(x_vocab_len, x_max_len, y_vocab_len, y_max_len, hidden_size, num_layers):
    model = Sequential()

    # Creating encoder network
    model.add(Embedding(x_vocab_len, 1024, input_length=x_max_len, mask_zero=True))
    model.add(LSTM(hidden_size))
    model.add(RepeatVector(y_max_len))

    # Creating decoder network
    for _ in range(num_layers):
        model.add(LSTM(hidden_size, return_sequences=True))
    model.add(TimeDistributed(Dense(y_vocab_len)))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy',
            optimizer='rmsprop',
            metrics=['accuracy'])
    return model
model = create_model(len(vocab_dict), SENTENCE_LENGTH, len(vocab_dict), SENTENCE_LENGTH, 1024, 3)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 20, 1024)          2041856   
_________________________________________________________________
lstm_30 (LSTM)               (None, 1024)              8392704   
_________________________________________________________________
repeat_vector_8 (RepeatVecto (None, 20, 1024)          0         
_________________________________________________________________
lstm_31 (LSTM)               (None, 20, 1024)          8392704   
_________________________________________________________________
lstm_32 (LSTM)               (None, 20, 1024)          8392704   
_________________________________________________________________
lstm_33 (LSTM)               (None, 20, 1024)          8392704   
_________________________________________________________________
time_distributed_8 (TimeDist (None, 20, 1994)          2043850   
__________

In [206]:
model.fit(x_vec, y_vec, batch_size=100, nb_epoch=20)



Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x30415f98>

In [205]:
model.save_weights("chatbot_model.h5")

In [216]:
sen = "Let me see what I can do"
sen = sen.lower().replace('?', '').replace('!', '').replace( '.', '')
vec = sentence_to_vec([sen], vocab_dict, 'UNK', SENTENCE_LENGTH)
#print(model.predict(vec).shape)
res = model.predict(vec)

#print(list(res[19]))
#print(res.shape)
vec_y = np.argmax(res, axis=2)
" ".join([idx_dict[x] for x in vec_y[0]])

'                                  to to me'

'{"class_name": "Sequential", "config": [{"class_name": "Embedding", "config": {"name": "embedding_9", "trainable": true, "batch_input_shape": [null, 20], "dtype": "float32", "input_dim": 1994, "output_dim": 1024, "embeddings_initializer": {"class_name": "RandomUniform", "config": {"minval": -0.05, "maxval": 0.05, "seed": null}}, "embeddings_regularizer": null, "activity_regularizer": null, "embeddings_constraint": null, "mask_zero": true, "input_length": 20}}, {"class_name": "LSTM", "config": {"name": "lstm_30", "trainable": true, "return_sequences": false, "return_state": false, "go_backwards": false, "stateful": false, "unroll": false, "units": 1024, "activation": "tanh", "recurrent_activation": "hard_sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "recurrent_initializer": {"class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}}, "bias_initializer": {