In [1]:

import helper

data_dir = './data/simpsons/moes_tavern_lines.txt'
text = helper.load_data(data_dir)
# Ignore notice, since we don't use it for analysing the data
text = text[81:]

In [2]:
view_sentence_range = (0, 10)


import numpy as np

print('Dataset Stats')
print('Roughly the number of unique words: {}'.format(len({word: None for word in text.split()})))
scenes = text.split('\n\n')
print('Number of scenes: {}'.format(len(scenes)))
sentence_count_scene = [scene.count('\n') for scene in scenes]
print('Average number of sentences in each scene: {}'.format(np.average(sentence_count_scene)))

sentences = [sentence for scene in scenes for sentence in scene.split('\n')]
print('Number of lines: {}'.format(len(sentences)))
word_count_sentence = [len(sentence.split()) for sentence in sentences]
print('Average number of words in each line: {}'.format(np.average(word_count_sentence)))

print()
print('The sentences {} to {}:'.format(*view_sentence_range))
print('\n'.join(text.split('\n')[view_sentence_range[0]:view_sentence_range[1]]))

Dataset Stats
Roughly the number of unique words: 11492
Number of scenes: 262
Average number of sentences in each scene: 15.248091603053435
Number of lines: 4257
Average number of words in each line: 11.50434578341555

The sentences 0 to 10:
Moe_Szyslak: (INTO PHONE) Moe's Tavern. Where the elite meet to drink.
Bart_Simpson: Eh, yeah, hello, is Mike there? Last name, Rotch.
Moe_Szyslak: (INTO PHONE) Hold on, I'll check. (TO BARFLIES) Mike Rotch. Mike Rotch. Hey, has anybody seen Mike Rotch, lately?
Moe_Szyslak: (INTO PHONE) Listen you little puke. One of these days I'm gonna catch you, and I'm gonna carve my name on your back with an ice pick.
Moe_Szyslak: What's the matter Homer? You're not your normal effervescent self.
Homer_Simpson: I got my problems, Moe. Give me another one.
Moe_Szyslak: Homer, hey, you should not drink to forget your problems.
Barney_Gumble: Yeah, you should only drink to enhance your social skills.




In [1]:
import numpy as np
import problem_unittests as tests

def create_lookup_tables(text):
    """
    Create lookup tables for vocabulary
    :param text: The text of tv scripts split into words
    :return: A tuple of dicts (vocab_to_int, int_to_vocab)
    """
    
    text=set(text)
    vocab_to_int= {}
    int_to_vocab={}
    for i,word in enumerate(text):
        vocab_to_int[word]=i
        int_to_vocab[i]=word
    return (vocab_to_int,int_to_vocab)



In [2]:
def token_lookup():
    """
    Generate a dict to turn punctuation into a token.
    :return: Tokenize dictionary where the key is the punctuation and the value is the token
    """
    token_dict={'.':'||period||',
                ',':'||comma||',
                '"':'||quotation_mark||',
                ';':'||semicolon||',
               '!':'||exclamation_mark||',
                '?':'||question_mark||',
                '(':'||left_parentheses||',
                ')':'||right_parentheses||',
               '--':'||dash||',
               '\n':'||return||'}
    return token_dict



In [5]:
# Preprocess Training, Validation, and Testing Data
helper.preprocess_and_save_data(data_dir, token_lookup, create_lookup_tables)

In [6]:
import helper
import numpy as np
import problem_unittests as tests

int_text, vocab_to_int, int_to_vocab, token_dict = helper.load_preprocess()

In [7]:
from distutils.version import LooseVersion
import warnings
import tensorflow as tf

# Check TensorFlow Version
assert LooseVersion(tf.__version__) >= LooseVersion('1.3'), 'Please use TensorFlow version 1.3 or newer'
print('TensorFlow Version: {}'.format(tf.__version__))

# Check for a GPU
if not tf.test.gpu_device_name():
    warnings.warn('No GPU found. Please use a GPU to train your neural network.')
else:
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))

TensorFlow Version: 1.4.0


  


In [3]:
def get_inputs():
    """
    Create TF Placeholders for input, targets, and learning rate.
    :return: Tuple (input, targets, learning rate)
    """
    
    input_ = tf.placeholder(tf.int32, shape=[None, None], name='input')
    targets_ = tf.placeholder(tf.int32, shape=[None, None], name='targets')
    learning_rate_ = tf.placeholder(tf.float32, name='learning_rate')
    return (input_, targets_, learning_rate_)




In [4]:
def get_init_cell(batch_size, rnn_size):
    """
    Create an RNN Cell and initialize it.
    :param batch_size: Size of batches
    :param rnn_size: Size of RNNs
    :return: Tuple (cell, initialize state)
    """
    # TODO: Implement Function
    
    num_layers = 1
    lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size)
    cell = tf.contrib.rnn.MultiRNNCell([lstm] * num_layers)
    initial_state = cell.zero_state(batch_size, tf.float32)
    initial_state = tf.identity(initial_state, name='initial_state')
    return (cell, initial_state)





In [6]:
def get_embed(input_data, vocab_size, embed_dim):
    """
    Create embedding for <input_data>.
    :param input_data: TF placeholder for text input.
    :param vocab_size: Number of words in vocabulary.
    :param embed_dim: Number of embedding dimensions
    :return: Embedded input.
    """
    #train_graph = tf.Graph()
    #with train_graph.as_default():
    print(vocab_size)
    print(embed_dim)
    print(input_data)
    embedding=tf.Variable(tf.random_uniform((vocab_size,embed_dim),-1,1))
    print(embedding.shape)
    embedd_data=tf.nn.embedding_lookup(embedding,input_data)
    return embedd_data




In [5]:
def build_rnn(cell, inputs):
    """
    Create a RNN using a RNN Cell
    :param cell: RNN Cell
    :param inputs: Input text data
    :return: Tuple (Outputs, Final State)
    """

    outputs,final_state=tf.nn.dynamic_rnn(cell,inputs,dtype=tf.float32)
    finalstate=tf.identity(final_state,name='final_state')
    
    return (outputs,finalstate)




In [7]:
def build_nn(cell, rnn_size, input_data, vocab_size, embed_dim):
    """
    Build part of the neural network
    :param cell: RNN cell
    :param rnn_size: Size of rnns
    :param input_data: Input data
    :param vocab_size: Vocabulary size
    :param embed_dim: Number of embedding dimensions
    :return: Tuple (Logits, FinalState)
    """
    
    embed_data=get_embed(input_data,vocab_size,embed_dim)
    outputs,finalstate=build_rnn(cell,embed_data)
    logits=tf.layers.dense(outputs,vocab_size,activation=None,use_bias=True)
    
    return (logits,finalstate)




In [8]:
def get_batches(int_text, batch_size, seq_length):
    """
    Return batches of input and target
    :param int_text: Text with the words replaced by their ids
    :param batch_size: The size of batch
    :param seq_length: The length of sequence
    :return: Batches as a Numpy array
    """
    
    num_of_batches=len(int_text)//(batch_size*seq_length)
    int_text=np.array(int_text[:len(int_text)-(len(int_text)%(batch_size*seq_length))])
    in_text = int_text.reshape(-1, seq_length)
    # Roll (shift) and reshape to get target sequences (maybe not optimal)
    tar_text = np.roll(int_text, -1).reshape(-1, seq_length)
    output = np.zeros(shape=(num_of_batches, 2, batch_size, seq_length), dtype=np.int)
    # Prepare the output
    for idx in range(0, in_text.shape[0]):
        jj = idx % num_of_batches
        ii = idx // num_of_batches
        output[jj,0,ii,:] = in_text[idx,:]
        output[jj,1,ii,:] = tar_text[idx,:]
    
    return output




In [10]:
# Number of Epochs
num_epochs = 128
# Batch Size
batch_size = 128
# RNN Size
rnn_size = 512
# Embedding Dimension Size
embed_dim = 256
# Sequence Length
seq_length = 16
# Learning Rate
learning_rate = 0.001
# Show stats for every n number of batches
show_every_n_batches = 128


save_dir = './save'

In [18]:

from tensorflow.contrib import seq2seq

train_graph = tf.Graph()
with train_graph.as_default():
    vocab_size = len(int_to_vocab)
    input_text, targets, lr = get_inputs()
    input_data_shape = tf.shape(input_text)
    cell, initial_state = get_init_cell(input_data_shape[0], rnn_size)
    print(type(cell))
    print(type(initial_state))
    logits, final_state = build_nn(cell, rnn_size, input_text, vocab_size, embed_dim)
    print(type('hi'))
    # Probabilities for generating words
    probs = tf.nn.softmax(logits, name='probs')

    # Loss function
    cost = seq2seq.sequence_loss(
        logits,
        targets,
        tf.ones([input_data_shape[0], input_data_shape[1]]))

    # Optimizer
    optimizer = tf.train.AdamOptimizer(lr)

    # Gradient Clipping
    gradients = optimizer.compute_gradients(cost)
    capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
    train_op = optimizer.apply_gradients(capped_gradients)

<class 'tensorflow.python.ops.rnn_cell_impl.MultiRNNCell'>
<class 'tensorflow.python.framework.ops.Tensor'>
6779
256
Tensor("input:0", shape=(?, ?), dtype=int32)
(6779, 256)
<class 'str'>


In [19]:

batches = get_batches(int_text, batch_size, seq_length)

with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())

    for epoch_i in range(num_epochs):
        state = sess.run(initial_state, {input_text: batches[0][0]})

        for batch_i, (x, y) in enumerate(batches):
            feed = {
                input_text: x,
                targets: y,
                initial_state: state,
                lr: learning_rate}
            train_loss, state, _ = sess.run([cost, final_state, train_op], feed)

            
            if (epoch_i * len(batches) + batch_i) % show_every_n_batches == 0:
                print('Epoch {:>3} Batch {:>4}/{}   train_loss = {:.3f}'.format(
                    epoch_i,
                    batch_i,
                    len(batches),
                    train_loss))

    # Save Model
    saver = tf.train.Saver()
    saver.save(sess, save_dir)
    print('Model Trained and Saved')

Epoch   0 Batch    0/33   train_loss = 8.822
Epoch   3 Batch   29/33   train_loss = 5.260
Epoch   7 Batch   25/33   train_loss = 4.612
Epoch  11 Batch   21/33   train_loss = 4.269
Epoch  15 Batch   17/33   train_loss = 3.874
Epoch  19 Batch   13/33   train_loss = 3.592
Epoch  23 Batch    9/33   train_loss = 3.204
Epoch  27 Batch    5/33   train_loss = 2.950
Epoch  31 Batch    1/33   train_loss = 2.572
Epoch  34 Batch   30/33   train_loss = 2.499
Epoch  38 Batch   26/33   train_loss = 2.245
Epoch  42 Batch   22/33   train_loss = 2.032
Epoch  46 Batch   18/33   train_loss = 1.860
Epoch  50 Batch   14/33   train_loss = 1.648
Epoch  54 Batch   10/33   train_loss = 1.521
Epoch  58 Batch    6/33   train_loss = 1.381
Epoch  62 Batch    2/33   train_loss = 1.336
Epoch  65 Batch   31/33   train_loss = 1.160
Epoch  69 Batch   27/33   train_loss = 1.126
Epoch  73 Batch   23/33   train_loss = 0.950
Epoch  77 Batch   19/33   train_loss = 0.836
Epoch  81 Batch   15/33   train_loss = 0.768
Epoch  85 

In [20]:
helper.save_params((seq_length, save_dir))

In [21]:

import tensorflow as tf
import numpy as np
import helper
import problem_unittests as tests

_, vocab_to_int, int_to_vocab, token_dict = helper.load_preprocess()
seq_length, load_dir = helper.load_params()

In [11]:
def get_tensors(loaded_graph):
    """
    Get input, initial state, final state, and probabilities tensor from <loaded_graph>
    :param loaded_graph: TensorFlow graph loaded from file
    :return: Tuple (InputTensor, InitialStateTensor, FinalStateTensor, ProbsTensor)
    """
    # TODO: Implement Function
    inputtensor=loaded_graph.get_tensor_by_name("input:0")
    initialstatetensor=loaded_graph.get_tensor_by_name("initial_state:0")
    finalstatetensor=loaded_graph.get_tensor_by_name("final_state:0")
    probstensor=loaded_graph.get_tensor_by_name("probs:0")
    return (inputtensor,initialstatetensor,finalstatetensor,probstensor)



In [12]:
def pick_word(probabilities, int_to_vocab):
    
    predict = np.random.choice(range(0,len(int_to_vocab)), size=1, p=probabilities)
    return int_to_vocab[predict[0]]




In [24]:
gen_length = 200
# homer_simpson, moe_szyslak, or Barney_Gumble
prime_word = 'moe_szyslak'

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load saved model
    loader = tf.train.import_meta_graph(load_dir + '.meta')
    loader.restore(sess, load_dir)

    # Get Tensors from loaded model
    input_text, initial_state, final_state, probs = get_tensors(loaded_graph)

    # Sentences generation setup
    gen_sentences = [prime_word + ':']
    prev_state = sess.run(initial_state, {input_text: np.array([[1]])})

    # Generate sentences
    for n in range(gen_length):
        # Dynamic Input
        dyn_input = [[vocab_to_int[word] for word in gen_sentences[-seq_length:]]]
        dyn_seq_length = len(dyn_input[0])

        # Get Prediction
        probabilities, prev_state = sess.run(
            [probs, final_state],
            {input_text: dyn_input, initial_state: prev_state})
        
        pred_word = pick_word(probabilities[0][dyn_seq_length-1], int_to_vocab)

        gen_sentences.append(pred_word)
    
    # Remove tokens
    tv_script = ' '.join(gen_sentences)
    for key, token in token_dict.items():
        ending = ' ' if key in ['\n', '(', '"'] else ''
        tv_script = tv_script.replace(' ' + token.lower(), key)
    tv_script = tv_script.replace('\n ', '\n')
    tv_script = tv_script.replace('( ', '(')
        
    print(tv_script)

INFO:tensorflow:Restoring parameters from ./save
moe_szyslak:(stunned) nigeria?
moe_szyslak:(sadly) i guess despite all our so-called!
moe_szyslak: uh, when you think where would come back to drive.
homer_simpson: get 'em now, throw!
moe_szyslak: want you cost ya.
homer_simpson:(gasps) she's nervous, but we've gotta tell ya, i'm miserable there. i'm just saying that i gave you to charm the next to meet him.
barney_gumble: yeah, it's really comin' down.
moe_szyslak:(clears throat) and now, ya see, that's just grand on, bad. you know how to feel like any of you. get your own dreams and they are ready to buy pants a couple.
moe_szyslak: lemme just get right to the fbi.(sighs)
moe_szyslak:(cutting him off) a glove slap. since you have one wish i love if they wanted an accident.
moe_szyslak: that's where i ever mean like the time we could do things as a couple.
carl_carlson: you know, guys, you've got the guts!
moe_szyslak:
