In [1]:
import os
import numpy as np
import nltk
from keras.models import Sequential
from keras.layers.recurrent import LSTM, SimpleRNN
from keras.layers import Activation, TimeDistributed, Dense, RepeatVector, recurrent, Embedding
from keras.preprocessing.sequence import pad_sequences
import json

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def read_lines(file_path, separator, name_idx, content_idx, start_line = 0, limit = 10000):
    ln = 1
    line_added = 0
    prev_name = None
    sentences = []
    for line in open(file_path, 'r', encoding="utf-8"):

        if ln >= start_line + limit:
            if line_added % 2 == 0:
                break
            else:
                limit += 1
        elif ln < start_line:
            ln += 1
            continue
        ln += 1
        items = line.split(separator)
        name = items[name_idx].lower()
        content = items[content_idx].lower().replace('?', '').replace('!', '').replace( '.', '').replace( ',', '').replace( '-', '')
        words = content.split()
        if prev_name != name:
            sentences.append(words)
            line_added +=1
        else:
            #print(prev_name, "spoke again")
            sentences[-1].extend(words )
        prev_name = name
    #print(len(sentences))
    if len(sentences) % 2 != 0:
        sentences.pop();
    return sentences

In [3]:
def exclude_long_sentences(source_path, output_path, word_limit = 20, line_separator = '\n', convo_separator = "\n\n\n\n"):
    wfile = open(output_path, "w")
    rfile = open(source_path, "r")
    content = rfile.read()
    convos = content.split(convo_separator)
    for convo in convos:
        sentences_to_write = []
        sentences = convo.split(line_separator)
        under_limit = True
        for sen in sentences:
            words = sen.split()
            if len(words) > word_limit:
                under_limit = False
                break
            sentences_to_write.append(sen)
        if under_limit:
            sl = len(sentences_to_write)
            for i in range(sl):
                wfile.write(sentences_to_write[i])
                if i == sl - 1:
                    wfile.write(convo_separator)
                else:
                    wfile.write(line_separator)
    wfile.close()
    rfile.close()
def read_sentences(file_path,line_separator = '\n', convo_separator = "\n\n\n\n"):
    sentences = []
    rfile = open(file_path, "r")
    content = rfile.read()
    convos = content.split(convo_separator)
    for convo in convos:
        sens = convo.split(line_separator)
        for sen in sens:
            words = sen.lower().replace('?', '').replace('!', '').replace( '.', '').replace( ',', '').replace( '-', '').replace(';', '').split()
            sentences.append(words)
    return sentences

In [4]:
#flatten_sentences = [word for sentence in sentences for word in sentence ]



In [5]:
def build_vocab_dict(sentences, padding_char, unknown_char):
    word_freq = nltk.FreqDist(np.hstack(sentences))
    word_freq.pop(padding_char, None)
    word_freq.pop(unknown_char, None)
    vocab = word_freq.most_common(VOCAB_SIZE - 2)

    vocab.insert(0, (padding_char,1))
    vocab.append( (unknown_char,1))
    
    vocab_dict = {pair[0]: id for id, pair in enumerate(vocab)}
    
    idx_dict = {idx:word for word, idx in vocab_dict.items()}
    print(idx_dict)
    return vocab_dict, idx_dict


#print(vocab_dict, idx_dict)

In [6]:
def save_vocab(vocab_dict):
    f = open("vocab_dict.json", 'w') 
    f.write(json.dumps(vocab_dict)) 
    f.close() 
def load_vocab():
    f = open("vocab_dict.json", 'r') 
    vocab_dict = json.loads(f.read() )
    f.close() 
    idx_dict = idx_dict = {idx:word for word, idx in vocab_dict.items()}
    return vocab_dict, idx_dict

In [7]:
def sentence_to_vec(sentences, vocab_dict, unknown_char, sentence_length):
    l = len(sentences)
    vec = []
    unk_idx = vocab_dict[unknown_char]

    for sen in sentences:
        vec.append( [vocab_dict[x] if x in vocab_dict else unk_idx for x in sen ][:20])

    padded = pad_sequences(vec, maxlen=sentence_length, dtype='int32')
    return padded

In [8]:
def to_one_hot(vec, sentence_length, vocab_length):
    print((len(vec), sentence_length, vocab_length))
    res = np.zeros((len(vec), sentence_length, vocab_length))
    for i, sen in enumerate(vec):
        for j, num in enumerate(sen):
            res[i, j, num] = 1
    return res

In [9]:
def vectorize_sentence(sentences, vocab_dict, vocab_size, sentence_size):

    x_sentences = [sentences[i] for i in range(len(sentences)) if i % 2 == 0]
    y_sentences = [sentences[i] for i in range(len(sentences)) if i % 2 == 1]
    
    if len(x_sentences) > len(y_sentences):
        x_sentences = x_sentences[:-1]
    elif len(y_sentences) > len(x_sentences):
        y_sentences = y_sentences[:-1]
    #print(x_sentences,x_sentences)
    x_vec = sentence_to_vec(x_sentences, vocab_dict, 'UNK', sentence_size)
    y_vec = sentence_to_vec(y_sentences, vocab_dict, 'UNK', sentence_size)

    y_vec = to_one_hot(y_vec, sentence_size, vocab_size)
    return x_vec, y_vec

In [10]:
def create_model(x_vocab_len, x_max_len, y_vocab_len, y_max_len, hidden_size, num_layers):
    model = Sequential()

    # Creating encoder network
    model.add(Embedding(x_vocab_len, 1024, input_length=x_max_len, mask_zero=True))
    model.add(LSTM(hidden_size))
    model.add(RepeatVector(y_max_len))

    # Creating decoder network
    for _ in range(num_layers):
        model.add(LSTM(hidden_size, return_sequences=True))
    model.add(TimeDistributed(Dense(y_vocab_len)))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy',
            optimizer='rmsprop',
            metrics=['accuracy'])
    return model

In [11]:
SENTENCE_LENGTH = 20
VOCAB_SIZE = 10000
"""
sentences = read_lines("movie_lines.txt", " +++$+++ ", 3, 4, 0, 1000)

vocab_dict, idx_dict = build_vocab_dict(sentences, ' ', 'UNK')
#save_vocab(vocab_dict)
vocab_dict, idx_dict = load_vocab()
#print(vocab_dict, idx_dict)

#sentences=["hi","i'm good","hello","hello","hey","hey there", 'shit','i know right']
#sentences= sentences + sentences + sentences + sentences
#sentences= sentences + sentences + sentences + sentences
#sentences= sentences + sentences + sentences + sentences


x_sentences = [sentences[i] for i in range(len(sentences)) if i % 2 == 0]
y_sentences = [sentences[i] for i in range(len(sentences)) if i % 2 == 1]

x_vec = sentence_to_vec(x_sentences, vocab_dict, 'UNK', SENTENCE_LENGTH)
y_vec = sentence_to_vec(y_sentences, vocab_dict, 'UNK', SENTENCE_LENGTH)
y_vec = to_one_hot(y_vec, SENTENCE_LENGTH, VOCAB_SIZE)

print(x_vec.shape, y_vec.shape)

model = create_model(VOCAB_SIZE, SENTENCE_LENGTH, VOCAB_SIZE, SENTENCE_LENGTH, 1024, 3)
#model.summary()
model.load_weights("new_chatbot_model.h5")
"""

'\nsentences = read_lines("movie_lines.txt", " +++$+++ ", 3, 4, 0, 1000)\n\nvocab_dict, idx_dict = build_vocab_dict(sentences, \' \', \'UNK\')\n#save_vocab(vocab_dict)\nvocab_dict, idx_dict = load_vocab()\n#print(vocab_dict, idx_dict)\n\n#sentences=["hi","i\'m good","hello","hello","hey","hey there", \'shit\',\'i know right\']\n#sentences= sentences + sentences + sentences + sentences\n#sentences= sentences + sentences + sentences + sentences\n#sentences= sentences + sentences + sentences + sentences\n\n\nx_sentences = [sentences[i] for i in range(len(sentences)) if i % 2 == 0]\ny_sentences = [sentences[i] for i in range(len(sentences)) if i % 2 == 1]\n\nx_vec = sentence_to_vec(x_sentences, vocab_dict, \'UNK\', SENTENCE_LENGTH)\ny_vec = sentence_to_vec(y_sentences, vocab_dict, \'UNK\', SENTENCE_LENGTH)\ny_vec = to_one_hot(y_vec, SENTENCE_LENGTH, VOCAB_SIZE)\n\nprint(x_vec.shape, y_vec.shape)\n\nmodel = create_model(VOCAB_SIZE, SENTENCE_LENGTH, VOCAB_SIZE, SENTENCE_LENGTH, 1024, 3)\n#mo

In [12]:
def load_cornell_movie_data(start_line, limit, vocab_dict):
    sentences = read_lines("movie_lines.txt", " +++$+++ ", 3, 4, start_line, limit)
    x_vec, y_vec = vectorize_sentence(sentences, vocab_dict, VOCAB_SIZE, SENTENCE_LENGTH)
    return x_vec, y_vec

def load_twitter_data(start_line, limit, vocab_dict):
    sentences = read_sentences("ShortenTwitterAsciiCorpus.txt")[:1000]
    x_vec, y_vec = vectorize_sentence(sentences, vocab_dict, VOCAB_SIZE, SENTENCE_LENGTH)
    return x_vec, y_vec

In [23]:
model = create_model(VOCAB_SIZE, SENTENCE_LENGTH, VOCAB_SIZE, SENTENCE_LENGTH, 1024, 3)
#model.summary()
model.load_weights("new_chatbot_model.h5")

In [16]:
vocab_dict, idx_dict = load_vocab()

In [24]:
batch_size = 1000
start_point = 0
batch = 0
round = 0
while True:
    print("Getting data:", start_point, batch_size)
    x_vec, y_vec = load_twitter_data(start_point, batch_size, vocab_dict)
    if len(x_vec) == 0:
        start_point = 0
        batch = 0
        x_vec, y_vec = load_twitter_data(start_point, batch_size, vocab_dict)
        round += 1
    model.fit(x_vec, y_vec, batch_size=200, nb_epoch=1 )
    model.save_weights(str(batch%2)+"new_chatbot_model.h5")
    start_point += batch_size
    batch += 1
    print("--------------------------------------------------------------------------")
    print("Round:", round, "Batch:", batch)
    print("--------------------------------------------------------------------------")

Getting data: 0 1000
(500, 20, 10000)




Epoch 1/1
--------------------------------------------------------------------------
Round: 0 Batch: 1
--------------------------------------------------------------------------
Getting data: 1000 1000
(500, 20, 10000)
Epoch 1/1
--------------------------------------------------------------------------
Round: 0 Batch: 2
--------------------------------------------------------------------------
Getting data: 2000 1000
(500, 20, 10000)
Epoch 1/1
--------------------------------------------------------------------------
Round: 0 Batch: 3
--------------------------------------------------------------------------
Getting data: 3000 1000
(500, 20, 10000)
Epoch 1/1
--------------------------------------------------------------------------
Round: 0 Batch: 4
--------------------------------------------------------------------------
Getting data: 4000 1000
(500, 20, 10000)
Epoch 1/1
--------------------------------------------------------------------------
Round: 0 Batch: 5
---------------------

--------------------------------------------------------------------------
Round: 0 Batch: 28
--------------------------------------------------------------------------
Getting data: 28000 1000
(500, 20, 10000)
Epoch 1/1
--------------------------------------------------------------------------
Round: 0 Batch: 29
--------------------------------------------------------------------------
Getting data: 29000 1000
(500, 20, 10000)
Epoch 1/1
--------------------------------------------------------------------------
Round: 0 Batch: 30
--------------------------------------------------------------------------
Getting data: 30000 1000
(500, 20, 10000)
Epoch 1/1
--------------------------------------------------------------------------
Round: 0 Batch: 31
--------------------------------------------------------------------------
Getting data: 31000 1000
(500, 20, 10000)
Epoch 1/1
--------------------------------------------------------------------------
Round: 0 Batch: 32
----------------------

--------------------------------------------------------------------------
Round: 0 Batch: 55
--------------------------------------------------------------------------
Getting data: 55000 1000
(500, 20, 10000)
Epoch 1/1
--------------------------------------------------------------------------
Round: 0 Batch: 56
--------------------------------------------------------------------------
Getting data: 56000 1000
(500, 20, 10000)
Epoch 1/1
--------------------------------------------------------------------------
Round: 0 Batch: 57
--------------------------------------------------------------------------
Getting data: 57000 1000
(500, 20, 10000)
Epoch 1/1
--------------------------------------------------------------------------
Round: 0 Batch: 58
--------------------------------------------------------------------------
Getting data: 58000 1000
(500, 20, 10000)
Epoch 1/1
--------------------------------------------------------------------------
Round: 0 Batch: 59
----------------------

--------------------------------------------------------------------------
Round: 0 Batch: 82
--------------------------------------------------------------------------
Getting data: 82000 1000
(500, 20, 10000)
Epoch 1/1
--------------------------------------------------------------------------
Round: 0 Batch: 83
--------------------------------------------------------------------------
Getting data: 83000 1000
(500, 20, 10000)
Epoch 1/1
--------------------------------------------------------------------------
Round: 0 Batch: 84
--------------------------------------------------------------------------
Getting data: 84000 1000
(500, 20, 10000)
Epoch 1/1
--------------------------------------------------------------------------
Round: 0 Batch: 85
--------------------------------------------------------------------------
Getting data: 85000 1000
(500, 20, 10000)
Epoch 1/1
--------------------------------------------------------------------------
Round: 0 Batch: 86
----------------------

--------------------------------------------------------------------------
Round: 0 Batch: 109
--------------------------------------------------------------------------
Getting data: 109000 1000
(500, 20, 10000)
Epoch 1/1
--------------------------------------------------------------------------
Round: 0 Batch: 110
--------------------------------------------------------------------------
Getting data: 110000 1000
(500, 20, 10000)
Epoch 1/1
--------------------------------------------------------------------------
Round: 0 Batch: 111
--------------------------------------------------------------------------
Getting data: 111000 1000
(500, 20, 10000)
Epoch 1/1
--------------------------------------------------------------------------
Round: 0 Batch: 112
--------------------------------------------------------------------------
Getting data: 112000 1000
(500, 20, 10000)
Epoch 1/1
--------------------------------------------------------------------------
Round: 0 Batch: 113
-------------

KeyboardInterrupt: 

In [29]:
sen = "pretty"
sen = sen.lower().replace('?', '').replace('!', '').replace( '.', '')
vec = sentence_to_vec([sen], vocab_dict, 'UNK', SENTENCE_LENGTH)
#print(model.predict(vec).shape)
res = model.predict(vec)

vec_y = np.argmax(res, axis=2)
" ".join([idx_dict[x] for x in vec_y[0]])

"                            you you   can't head UNK"

In [26]:
sen = "hi"
sen = sen.lower().replace('?', '').replace('!', '').replace( '.', '')
vec = sentence_to_vec([sen], vocab_dict, 'UNK', SENTENCE_LENGTH)
#print(model.predict(vec).shape)
res = model.predict(vec)

vec_y = np.argmax(res, axis=2)
" ".join([idx_dict[x] for x in vec_y[0] if idx_dict[x]!= 'UNK'])

"                            think i'm is is week week"