In [2]:
import numpy as np
import tensorflow as tf
import warnings
import helper
warnings.filterwarnings('ignore')

In [9]:
data_dir = 'data/simpsons/moes_tavern_lines.txt'
text = helper.load_data(data_dir)
print(text[:81])
text = text[81:] # Remving the first line

[YEAR DATE 1989] © Twentieth Century Fox Film Corporation. All rights reserved.




In [10]:
view_sentence_range = (0, 10)

print('Dataset Stats')
print('Roughly the number of unique words: {}'.format(len({word: None for word in text.split()})))
scenes = text.split('\n\n')
print('Number of scenes: {}'.format(len(scenes)))
sentence_count_scene = [scene.count('\n') for scene in scenes]
print('Average number of sentences in each scene: {}'.format(np.average(sentence_count_scene)))

sentences = [sentence for scene in scenes for sentence in scene.split('\n')]
print('Number of lines: {}'.format(len(sentences)))
word_count_sentence = [len(sentence.split()) for sentence in sentences]
print('Average number of words in each line: {}'.format(np.average(word_count_sentence)))

print()
print('The sentences {} to {}:'.format(*view_sentence_range))
print('\n'.join(text.split('\n')[view_sentence_range[0]:view_sentence_range[1]]))

Dataset Stats
Roughly the number of unique words: 11492
Number of scenes: 262
Average number of sentences in each scene: 15.248091603053435
Number of lines: 4257
Average number of words in each line: 11.50434578341555

The sentences 0 to 10:
Moe_Szyslak: (INTO PHONE) Moe's Tavern. Where the elite meet to drink.
Bart_Simpson: Eh, yeah, hello, is Mike there? Last name, Rotch.
Moe_Szyslak: (INTO PHONE) Hold on, I'll check. (TO BARFLIES) Mike Rotch. Mike Rotch. Hey, has anybody seen Mike Rotch, lately?
Moe_Szyslak: (INTO PHONE) Listen you little puke. One of these days I'm gonna catch you, and I'm gonna carve my name on your back with an ice pick.
Moe_Szyslak: What's the matter Homer? You're not your normal effervescent self.
Homer_Simpson: I got my problems, Moe. Give me another one.
Moe_Szyslak: Homer, hey, you should not drink to forget your problems.
Barney_Gumble: Yeah, you should only drink to enhance your social skills.




**Implementing Preprocessing Functions** 

The first thing to do to any dataset is preprocessing. Implementing the following preprocessing functions below:
    - Lookup Table
    - Tokenize Punctuation

In [15]:
import problem_unittests as tests

def vocab_lookup_tables(text):
    """
    param: 
     - text: The text of tv scripts split into words
     - return: A tuple of dicts (vocab_to_int, int_to_vocab)
    """
    vocab = set(text)
    vocab_to_int = {v: k for k, v in enumerate(vocab)}
    int_to_vocab = {v: k for k, v in vocab_to_int.items()}
    
    return (vocab_to_int, int_to_vocab)

tests.test_create_lookup_tables(create_lookup_tables)

Tests Passed


**Tokenize Punctuation**

Creating a dictionary for the following symbols where the symbol is the key and value is the token:

Period ( . )

Comma ( , )

Quotation Mark ( " )

Semicolon ( ; )

Exclamation mark ( ! )

Question mark ( ? )

Left Parentheses ( ( )

Right Parentheses ( ) )

Dash ( -- )

Return ( \n )

This dictionary will be used to token the symbols and add the delimiter (space) around it. This separates the symbols as it's own word, making it easier for the neural network to predict on the next word. 

In [17]:
def token_lookup():
    """
    return: Tokenize dictionary where the key is the punctuation and the value is the token
    """
    token_dict = {
        
        ".": "||Period||",
        ",": "||Comma||",
        "\"": "||Quotation||",
        ";": "||Semicolon||",
        "!": "||Exclamation_Mark||",
        "?": "||Question_Mark||",
        "(": "||Left_Paranthesis||",
        ")": "||Right_Paranthesis||",
        "--" : "||Dash||",
        "\n" : "||Return||"
    }
    return token_dict

tests.test_tokenize(token_lookup)

Tests Passed


**Preprocess all the data and save it**

Running the code cell below will preprocess all the data and save it to file.

In [18]:
# Preprocess Training, Validation, and Testing Data
helper.preprocess_and_save_data(data_dir, token_lookup, create_lookup_tables)

In [19]:
# The checkpoint to resume the next time
import helper
import numpy as np
import problem_unittests as tests

int_text, vocab_to_int, int_to_vocab, token_dict = helper.load_preprocess()

**Checking the TensorFlow Version**

In [20]:
from distutils.version import LooseVersion
import warnings
import tensorflow as tf

# Check TensorFlow Version
assert LooseVersion(tf.__version__) >= LooseVersion('1.0'), 'Please use TensorFlow version 1.0 or newer'
print('TensorFlow Version: {}'.format(tf.__version__))

# Check for a GPU
if not tf.test.gpu_device_name():
    warnings.warn('No GPU found. Please use a GPU to train your neural network.')
else:
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))

TensorFlow Version: 1.0.0


**Input**

*Implementing the get_inputs() function to create TF Placeholders for the Neural Network.*

Input text placeholder named "input" using the TF Placeholder name parameter

    :Targets placeholder
    :Learning Rate placeholder
    :Returning the placeholders in the following tuple (Input, Targets, LearningRate)

In [27]:
def get_inputs():
    """
    Create TF Placeholders for input, targets, and learning rate.
    :return: Tuple (input, targets, learning rate)
    """
    inputs = tf.placeholder(tf.int32, shape=(None, None), name="input")
    targets = tf.placeholder(tf.int32, shape=(None, None), name="Target")
    learning_rate = tf.placeholder(tf.float32, name="LearningRate")
    
    return (inputs, targets, learning_rate)

tests.test_get_inputs(get_inputs)

Tests Passed


**Building RNN Cell and Initialize**

- Stack one or more BasicLSTMCells in a MultiRNNCell.

- The Rnn size should be set using rnn_size

- Initalize Cell State using the *MultiRNNCell's* zero_state() function

- Apply the name "initial_state" to the initial state using tf.identity()

- Return the cell and initial state in the following tuple (Cell, InitialState)

In [51]:
def get_init_cell(batch_size, rnn_size):
    """
    :param: 
        - batch_size: Size of batches
        - param rnn_size: Size of RNNs
        - return: Tuple (cell, initialize state)
    """
    lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size)
    
    drop = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=0.5)
    
    cell = tf.contrib.rnn.MultiRNNCell([drop] * 2)
    
    initial_state = tf.identity(cell.zero_state(batch_size, tf.float32), name='initial_state')
    
    return (cell, initial_state)

tests.test_get_init_cell(get_init_cell)

Tests Passed


### Word Embedding
Creating embedding to `input_data` using TensorFlow.  Return the embedded sequence.

In [52]:
def get_embed(input_data, vocab_size, embed_dim):
    """
    :param:
        - input_data: TF placeholder for text input.
        - vocab_size: Number of words in vocabulary.
        - embed_dim: Number of embedding dimensions
    
    """
    embedding = tf.Variable(tf.random_uniform((vocab_size, embed_dim), -1, 1))
    embed = tf.nn.embedding_lookup(embedding, input_data)

    return embed

tests.test_get_embed(get_embed)

Tests Passed


### Building the Neural Network

In [53]:
def build_rnn(cell, inputs):
    """
    Create a RNN using a RNN Cell
    :param cell: RNN Cell
    :param inputs: Input text data
    :return: Tuple (Outputs, Final State)
    """
    output, final_state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
    final_state = tf.identity(final_state, name="final_state")
    
    return (output, final_state)

tests.test_build_rnn(build_rnn)

Tests Passed


In [49]:
def build_nn(cell, rnn_size, input_data, vocab_size, embed_dim):
    """
    Build part of the neural network
    :param:
        - cell: RNN cell
        - rnn_size: Size of rnns
        - input_data: Input data
        - vocab_size: Vocabulary size
        - embed_dim: Number of embedding dimensions
    """
    embed_input = get_embed(input_data, vocab_size, embed_dim)
    
    cell_input, cell_initial_state = get_init_cell(input_data, rnn_size)
    
    output, final_tate = tf.nn.dynamic_rnn(cell=cell_input, inputs=embed_input, initial_state=cell_initial_state)
    
    return (output, final_state)

tests.test_build_nn(build_nn)

Embed:
 Tensor("embedding_lookup:0", shape=(128, 5, 300), dtype=float32)
Cell... <tensorflow.contrib.rnn.python.ops.core_rnn_cell_impl.MultiRNNCell object at 0x11b5bbeb8> Initial State... Tensor("initial_state:0", shape=(128, 5), dtype=int32)


ValueError: Expected state to be a tuple of length 2, but received: Tensor("rnn/while/Identity_2:0", shape=(128, 5), dtype=int32)