# TV Script Generation
Generate [Simpsons](https://en.wikipedia.org/wiki/The_Simpsons) TV scripts using RNNs. The dataset is part of the [Simpsons dataset](https://www.kaggle.com/wcukierski/the-simpsons-by-the-data) of scripts from 27 seasons. The subset used here consists of scenes from Moe's Tavern.

In [1]:

import helper

data_dir = './data/simpsons/moes_tavern_lines.txt'
# Load the raw script and drop its first 81 characters in one step: that
# leading span is the notice, which is not used when analysing the data.
text = helper.load_data(data_dir)[81:]

## Explore the Data

In [2]:
view_sentence_range = (0, 10)

import numpy as np

print('Dataset Stats')
# A set keeps one entry per distinct whitespace-separated token.
unique_words = set(text.split())
print('Roughly the number of unique words: {}'.format(len(unique_words)))

# Scenes are the chunks of text separated by an empty line.
scenes = text.split('\n\n')
print('Number of scenes: {}'.format(len(scenes)))

# NOTE: this counts newline characters, so a scene made of k lines
# contributes k - 1 to the average reported below.
sentence_count_scene = [scene.count('\n') for scene in scenes]
print('Average number of sentences in each scene: {}'.format(np.average(sentence_count_scene)))

# Flatten every scene into its individual lines.
sentences = []
for scene in scenes:
    sentences.extend(scene.split('\n'))
print('Number of lines: {}'.format(len(sentences)))

word_count_sentence = [len(line.split()) for line in sentences]
print('Average number of words in each line: {}'.format(np.average(word_count_sentence)))

# Preview the requested slice of raw lines.
first_line, last_line = view_sentence_range
print()
print('The sentences {} to {}:'.format(first_line, last_line))
print('\n'.join(text.split('\n')[first_line:last_line]))

Dataset Stats
Roughly the number of unique words: 11492
Number of scenes: 262
Average number of sentences in each scene: 15.248091603053435
Number of lines: 4257
Average number of words in each line: 11.50434578341555

The sentences 0 to 10:
Moe_Szyslak: (INTO PHONE) Moe's Tavern. Where the elite meet to drink.
Bart_Simpson: Eh, yeah, hello, is Mike there? Last name, Rotch.
Moe_Szyslak: (INTO PHONE) Hold on, I'll check. (TO BARFLIES) Mike Rotch. Mike Rotch. Hey, has anybody seen Mike Rotch, lately?
Moe_Szyslak: (INTO PHONE) Listen you little puke. One of these days I'm gonna catch you, and I'm gonna carve my name on your back with an ice pick.
Moe_Szyslak: What's the matter Homer? You're not your normal effervescent self.
Homer_Simpson: I got my problems, Moe. Give me another one.
Moe_Szyslak: Homer, hey, you should not drink to forget your problems.
Barney_Gumble: Yeah, you should only drink to enhance your social skills.




## Preprocessing Functions


### Lookup Table
To create a word embedding, two lookup dictionaries are required:
- A dictionary to go from a word to its id: `vocab_to_int`
- A dictionary to go from an id to its word: `int_to_vocab`


In [None]:
import numpy as np
import problem_unittests as tests
from collections import Counter
def create_lookup_tables(text):
    """
    Create lookup tables for vocabulary
    :param text: The text of tv scripts split into words
    :return: A tuple of dicts (vocab_to_int, int_to_vocab)
    """
    # Sort the unique words before numbering them. Iterating a bare set of
    # strings yields a different order from one interpreter run to the next
    # (string hashing is randomised), which would silently change every id
    # if preprocessing were re-run against an already trained model.
    int_to_vocab = dict(enumerate(sorted(set(text))))
    # Invert the id -> word table so both dicts are guaranteed to agree.
    vocab_to_int = {word: word_id for word_id, word in int_to_vocab.items()}
    return vocab_to_int, int_to_vocab



### Tokenize Punctuation

In [None]:
def token_lookup():
    """
    Build the table that maps punctuation to placeholder tokens.
    :return: Dict whose keys are punctuation strings and whose values are
             the tokens that stand in for them
    """
    # One (punctuation, token) pair per row; dict() keeps this order.
    symbol_token_pairs = (
        ('.', '|Period|'),
        (',', '|Comma|'),
        ('"', '|Quotation|'),
        (';', '|Semicolon|'),
        ('!', '|Exclamation|'),
        ('?', '|Question|'),
        ('(', '|leftParanthesis|'),
        (')', '|rightParanthesis|'),
        ('--', '|Dash|'),
        ('\n', '|Return|'),
    )
    return dict(symbol_token_pairs)



## Preprocess all the data and save it
Running the code cell below will preprocess all the data and save it to file.

In [5]:
# Preprocess Training, Validation, and Testing Data
# The helper receives the data path plus the two functions themselves
# (token_lookup and create_lookup_tables are passed uncalled), so it decides
# when to invoke them while preprocessing the data and saving it to file.
helper.preprocess_and_save_data(data_dir, token_lookup, create_lookup_tables)

# Checkpoint

In [6]:

import numpy as np
# TensorFlow is imported at this checkpoint because every network-building
# cell that follows refers to `tf`; without it, restarting from here fails
# with a NameError as soon as the graph is built.
import tensorflow as tf

import helper
import problem_unittests as tests

# Reload what the preprocessing step saved: the text as word ids, both
# vocabulary lookup tables and the punctuation token dictionary.
int_text, vocab_to_int, int_to_vocab, token_dict = helper.load_preprocess()

## Build the Neural Network


### Input

In [None]:
def get_inputs():
    """
    Create the TF placeholders through which the network is fed.
    :return: Tuple (input, targets, learning rate)
    """
    # Input and targets are rank-2 integer placeholders with both
    # dimensions left unspecified.
    id_matrix_shape = [None, None]
    input_ph = tf.placeholder(tf.int32, id_matrix_shape, name='input')
    targets_ph = tf.placeholder(tf.int32, id_matrix_shape, name='targets')
    # The learning rate placeholder is created without a shape or a name.
    lr_ph = tf.placeholder(tf.float32)
    return input_ph, targets_ph, lr_ph



### Build RNN Cell and Initialize

In [None]:
def get_init_cell(batch_size, rnn_size, num_layers=2):
    """
    Create a stacked LSTM cell and its zero initial state.
    :param batch_size: Size of batches
    :param rnn_size: Size of RNNs
    :param num_layers: Number of stacked LSTM layers (defaults to 2, the
                       depth that used to be hard-coded)
    :return: Tuple (cell, initialize state)
    """
    # Build a distinct BasicLSTMCell for every layer. Repeating one object
    # (`[lstm] * n`) places the same cell instance in each layer, which
    # TensorFlow releases after 1.0 either reject or treat as weight sharing
    # between the layers.
    layers = [tf.contrib.rnn.BasicLSTMCell(rnn_size) for _ in range(num_layers)]
    cell = tf.contrib.rnn.MultiRNNCell(layers)
    # Wrap the zero state in a named identity op so it can be looked up as
    # "initial_state" once the graph has been saved and reloaded.
    initial_state = tf.identity(cell.zero_state(batch_size, tf.float32), name="initial_state")
    return (cell, initial_state)




### Word Embedding

In [None]:
def get_embed(input_data, vocab_size, embed_dim):
    """
    Embed <input_data> with tf.contrib.layers.embed_sequence.
    :param input_data: TF placeholder for text input.
    :param vocab_size: Number of words in vocabulary.
    :param embed_dim: Number of embedding dimensions
    :return: Embedded input.
    """
    return tf.contrib.layers.embed_sequence(
        input_data,
        vocab_size=vocab_size,
        embed_dim=embed_dim,
    )




### Build RNN 

In [None]:
def build_rnn(cell, inputs):
    """
    Run <inputs> through <cell> with tf.nn.dynamic_rnn.
    :param cell: RNN Cell
    :param inputs: Input text data
    :return: Tuple (Outputs, Final State)
    """
    # Only a dtype is given to dynamic_rnn here; no initial_state argument
    # is passed.
    rnn_result = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
    outputs = rnn_result[0]
    # Name the final state so it can be fetched as "final_state" later.
    final_state = tf.identity(rnn_result[1], name="final_state")
    return outputs, final_state



### Build the Neural Network

In [None]:
def build_nn(cell, rnn_size, input_data, vocab_size):
    """
    Chain embedding, RNN and output layer into one network.
    :param cell: RNN cell
    :param rnn_size: Size of rnns
    :param input_data: Input data
    :param vocab_size: Vocabulary size
    :return: Tuple (Logits, FinalState)
    """
    # rnn_size doubles as the embedding dimension.
    rnn_outputs, final_state = build_rnn(cell, get_embed(input_data, vocab_size, rnn_size))
    # Linear layer (no activation) producing one score per vocabulary word.
    logits = tf.contrib.layers.fully_connected(rnn_outputs, vocab_size, activation_fn=None)
    return logits, final_state



In [None]:
def get_batches(int_text, batch_size, seq_length):
    """
    Return batches of input and target
    :param int_text: Text with the words replaced by their ids
    :param batch_size: The size of batch
    :param seq_length: The length of sequence
    :return: Batches as a Numpy array of shape
             (number of batches, 2, batch_size, seq_length), where index 0 of
             the second axis holds the inputs and index 1 the targets
    :raises ValueError: If int_text is too short to fill a single batch
    """
    words_per_batch = batch_size * seq_length
    n_batches = len(int_text) // words_per_batch
    if n_batches == 0:
        raise ValueError(
            'int_text has {} words; at least {} are needed for one batch'.format(
                len(int_text), words_per_batch))

    # Words that do not fill a complete batch are dropped.
    n_words = n_batches * words_per_batch
    inputs = np.array(int_text[:n_words])
    # Targets are the inputs shifted one word to the right.
    targets = np.array(int_text[1:n_words + 1])
    if len(targets) < n_words:
        # The text ends exactly on the last input word, so there is no
        # following word to predict; wrap around to the first input word
        # instead of failing on the reshape below.
        targets = np.concatenate((targets, inputs[:1]))

    # The singleton axis lets inputs and targets be joined side by side.
    shape = (n_batches, 1, batch_size, seq_length)
    return np.concatenate((inputs.reshape(shape), targets.reshape(shape)), axis=1)



## Neural Network Training
### Hyperparameters

In [28]:

# Number of passes the training loop makes over the full list of batches
num_epochs = 300

# Number of sequences per batch (passed to get_batches)
batch_size = 256

# Size handed to get_init_cell and build_nn; build_nn also uses it as the
# embedding dimension
rnn_size = 600

# Number of words per training sequence (passed to get_batches and saved
# for script generation)
seq_length = 20

# Value fed to the learning-rate placeholder on every training step
learning_rate = 0.0015

# Print the training loss once every this many batches
show_every_n_batches = 50
# Path given to the Saver when the trained model is written out
save_dir = './save'

### Build the Graph

In [29]:

from tensorflow.contrib import seq2seq

# Assemble the whole training graph: placeholders, network, loss, optimizer.
train_graph = tf.Graph()
with train_graph.as_default():
    vocab_size = len(int_to_vocab)
    input_text, targets, lr = get_inputs()
    # The batch size is read from the fed input at run time, so the same
    # graph accepts inputs of any batch size.
    input_data_shape = tf.shape(input_text)
    cell, initial_state = get_init_cell(input_data_shape[0], rnn_size)
    logits, final_state = build_nn(cell, rnn_size, input_text, vocab_size)

    # Probabilities for generating words
    probs = tf.nn.softmax(logits, name='probs')

    # Loss function (all-ones weights: every position counts equally)
    cost = seq2seq.sequence_loss(
        logits,
        targets,
        tf.ones([input_data_shape[0], input_data_shape[1]]))

    # Optimizer
    optimizer = tf.train.AdamOptimizer(lr)

    # Gradient Clipping
    gradients = optimizer.compute_gradients(cost)
    # compute_gradients may pair a variable with a None gradient when the
    # loss does not depend on it; clip_by_value cannot take None, so such
    # pairs are skipped.
    capped_gradients = [
        (tf.clip_by_value(grad, -1., 1.), var)
        for grad, var in gradients
        if grad is not None]
    train_op = optimizer.apply_gradients(capped_gradients)

## Train

In [30]:
batches = get_batches(int_text, batch_size, seq_length)
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())

    for epoch_i in range(num_epochs):
        # Every epoch starts from the graph's initial state, evaluated with
        # the inputs of the first batch.
        state = sess.run(initial_state, {input_text: batches[0][0]})

        for batch_i, (x, y) in enumerate(batches):
            feed = {input_text: x, targets: y, initial_state: state, lr: learning_rate}
            # One optimisation step; the returned state is fed into the next batch.
            train_loss, state, _ = sess.run([cost, final_state, train_op], feed)

            # Show every <show_every_n_batches> batches, counted across epochs
            batches_seen = epoch_i * len(batches) + batch_i
            if batches_seen % show_every_n_batches != 0:
                continue
            print('Epoch {:>3} Batch {:>4}/{}   train_loss = {:.3f}'.format(
                epoch_i,
                batch_i,
                len(batches),
                train_loss))

    # Save Model
    saver = tf.train.Saver()
    saver.save(sess, save_dir)
    print('Model Trained and Saved')

Epoch   0 Batch    0/13   train_loss = 8.822
Epoch   3 Batch   11/13   train_loss = 6.121
Epoch   7 Batch    9/13   train_loss = 6.044
Epoch  11 Batch    7/13   train_loss = 5.722
Epoch  15 Batch    5/13   train_loss = 5.383
Epoch  19 Batch    3/13   train_loss = 5.330
Epoch  23 Batch    1/13   train_loss = 5.079
Epoch  26 Batch   12/13   train_loss = 5.026
Epoch  30 Batch   10/13   train_loss = 4.861
Epoch  34 Batch    8/13   train_loss = 4.631
Epoch  38 Batch    6/13   train_loss = 4.479
Epoch  42 Batch    4/13   train_loss = 4.271
Epoch  46 Batch    2/13   train_loss = 4.174
Epoch  50 Batch    0/13   train_loss = 4.083
Epoch  53 Batch   11/13   train_loss = 4.071
Epoch  57 Batch    9/13   train_loss = 3.969
Epoch  61 Batch    7/13   train_loss = 3.806
Epoch  65 Batch    5/13   train_loss = 3.719
Epoch  69 Batch    3/13   train_loss = 3.556
Epoch  73 Batch    1/13   train_loss = 3.428
Epoch  76 Batch   12/13   train_loss = 3.351
Epoch  80 Batch   10/13   train_loss = 3.403
Epoch  84 

## Save Parameters
Save `seq_length` and `save_dir` for generating a new TV script.

In [31]:

# Save parameters for checkpoint
# seq_length and save_dir are handed over as a single tuple, so whatever
# reads them back must unpack them in this same order.
helper.save_params((seq_length, save_dir))

# Checkpoint

In [32]:

import tensorflow as tf
import numpy as np
import helper
import problem_unittests as tests

# The first value returned by load_preprocess is discarded here; only the
# two lookup tables and the punctuation token dictionary are kept.
_, vocab_to_int, int_to_vocab, token_dict = helper.load_preprocess()
# Presumably returns the (seq_length, save_dir) tuple stored earlier with
# helper.save_params - confirm against the helper module.
seq_length, load_dir = helper.load_params()

## Implement Generate Functions
### Get Tensors

In [None]:
def get_tensors(loaded_graph):
    """
    Look up the four named tensors needed for generation in <loaded_graph>.
    :param loaded_graph: TensorFlow graph loaded from file
    :return: Tuple (InputTensor, InitialStateTensor, FinalStateTensor, ProbsTensor)
    """
    # Names given to the tensors when the graph was built, in return order.
    tensor_names = ('input:0', 'initial_state:0', 'final_state:0', 'probs:0')
    return tuple(loaded_graph.get_tensor_by_name(name) for name in tensor_names)



### Choose Word


In [None]:
def pick_word(probabilities, int_to_vocab):
    """
    Pick the next word in the generated text
    :param probabilities: Probabilities of the next word, one entry per word
                          id (entry i is the probability of word id i)
    :param int_to_vocab: Dictionary of word ids as the keys and words as the values
    :return: String of the predicted word
    """
    # Sample a position of the probability vector, which is the word id
    # itself. Sampling from list(int_to_vocab) instead would pair each
    # probability with whatever key the dict happens to list at that
    # position, and pick the wrong word whenever the keys are not stored in
    # ascending id order.
    word_id = np.random.choice(len(probabilities), p=probabilities)
    return int_to_vocab[int(word_id)]


## Generate TV Script

In [35]:
# Number of words to generate after the prime word
gen_length = 200
# homer_simpson, moe_szyslak, or Barney_Gumble
# NOTE(review): the tokens are lower-cased when stripped below, so the
# vocabulary is presumably lower case - confirm before using a capitalised name.
prime_word = 'moe_szyslak'


loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load saved model
    loader = tf.train.import_meta_graph(load_dir + '.meta')
    loader.restore(sess, load_dir)

    # Get Tensors from loaded model
    input_text, initial_state, final_state, probs = get_tensors(loaded_graph)
    # Sentences generation setup
    gen_sentences = [prime_word + ':']
    prev_state = sess.run(initial_state, {input_text: np.array([[1]])})

    # Generate sentences
    for n in range(gen_length):
        # Dynamic Input: at most the last seq_length generated words, as a
        # batch holding a single sequence
        dyn_input = [[vocab_to_int[word] for word in gen_sentences[-seq_length:]]]
        dyn_seq_length = len(dyn_input[0])
        # Get Prediction
        probabilities, prev_state = sess.run(
            [probs, final_state],
            {input_text: dyn_input, initial_state: prev_state})

        # probabilities is indexed (batch, position, word id) - the layout
        # produced by dynamic_rnn followed by the per-step output layer.
        # Select the only batch entry first and then its last position;
        # indexing with the position alone would address the batch axis.
        pred_word = pick_word(probabilities[0][dyn_seq_length-1], int_to_vocab)

        gen_sentences.append(pred_word)

    # Remove tokens: turn each placeholder back into its punctuation mark,
    # swallowing the space that precedes the token
    tv_script = ' '.join(gen_sentences)
    for key, token in token_dict.items():
        tv_script = tv_script.replace(' ' + token.lower(), key)
    # Drop the space left behind after a newline or an opening parenthesis
    tv_script = tv_script.replace('\n ', '\n')
    tv_script = tv_script.replace('( ', '(')

    print(tv_script)

moe_szyslak: wait, homer. what the hell was a thing that.
lenny_leonard: i like you all think i gave me out of a new life of my friends. yeah, make your astronaut like to.
homer_simpson: oh, you're upsetting me.
homer_simpson: i just can't talk to the dump?
moe_szyslak: i can't believe you're actually... ah, dad. uh, uh... how uh... in the tap of the finest night?
moe_szyslak: you know what i ever thought that would do all with this of you.
homer_simpson: not at your best tipsy) is, homer, the school's back. and my dad is a ride there.
moe_szyslak: oh what to be back.
moe_szyslak: say, how come you and you say, homer! do you!(suspicious) is that the japanese soundin' bike...
homer_simpson: wait a minute. now who will flaming what i should say so mr. x were here can make you get in charge!
homer_simpson:(whiny) it's
