### Resources:
- [TFLearn Examples](https://github.com/tflearn/tflearn/blob/master/examples/nlp/lstm_generator_shakespeare.py)
- [Victorian Humour Tumblr](http://victorianhumour.tumblr.com/)
- [Victorian Jokes](http://victorianhumour.com/jokedb/)
- [WildML](http://www.wildml.com/2016/08/rnns-in-tensorflow-a-practical-guide-and-undocumented-features/)

# Goal
- Build and train a recurrent neural network on Victorian-style humor, then have it generate new text from what it has learned.

In [1]:
import os
import pickle
from six.moves import urllib
import tflearn
from tflearn.data_utils import *

In [2]:
# Hyperparameters
N_SEQUENCE = 25  # maximum sequence length, in characters; passed as seq_maxlen below
path = "shakespeare_input.txt"  # local file name of the training text (downloaded if missing)


# Data

In [3]:
# Fetch the training text once; later runs reuse the local copy at `path`.
corpus_url = "https://raw.githubusercontent.com/tflearn/tflearn.github.io/master/resources/shakespeare_input.txt"
if not os.path.isfile(path):
    urllib.request.urlretrieve(corpus_url, path)

In [4]:
# Load the character -> index dictionary saved by a previous run, if any.
char_idx_file = 'char_idx.pickle'
# Stays None on a fresh run (no pickle on disk yet), so the inspection lines
# below still work instead of raising NameError.
char_idx = None
if os.path.isfile(char_idx_file):
    print('Loading previous char_idx')
    # NOTE: pickle.load can execute arbitrary code; only load a file that this
    # notebook wrote itself.
    with open(char_idx_file, 'rb') as char_idx_handle:
        char_idx = pickle.load(char_idx_handle)

print(type(char_idx))
print(len(char_idx) if char_idx is not None else 0)
# Preview the first ten entries in sorted key order (empty when nothing was loaded).
{k: char_idx[k] for k in sorted(char_idx or {})[:10]}

Loading previous char_idx
<class 'dict'>
67


{'\n': 4,
 ' ': 60,
 '!': 20,
 '$': 36,
 '&': 21,
 "'": 11,
 ',': 8,
 '-': 0,
 '.': 58,
 '3': 61}

In [5]:
# Recap of the settings used here: path = "shakespeare_input.txt", N_SEQUENCE = 25.
#
# Turn the text file into parsed sequences (X), their targets (Y) and the
# character dictionary that goes with them.
# http://tflearn.org/data_utils/#string_to_semi_redundant_sequences
#
# NOTE(review): char_idx is reassigned here, so any dictionary loaded from the
# pickle earlier is discarded rather than reused. TFLearn's helper accepts a
# pre_defined_char_idx argument for that purpose -- confirm whether it should be passed.
X, Y, char_idx = textfile_to_semi_redundant_sequences(
    path,
    seq_maxlen=N_SEQUENCE,
    redun_step=3,
)

Vectorizing text...
Text total length: 17755
Distinct chars: 58
Total sequences: 5910


In [6]:
# Parsed sequences: show the container type, the shape, and the first sample
print(type(X), X.shape, sep='\n')
X[:1]

<class 'numpy.ndarray'>
(5910, 25, 58)


array([[[False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        ..., 
        [False, False, False, ..., False, False,  True],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False]]], dtype=bool)

In [7]:
# Targets: show the container type, the shape, and the first target row
print(type(Y), Y.shape, sep='\n')
Y[:1]

<class 'numpy.ndarray'>
(5910, 58)


array([[False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False,  True, False]], dtype=bool)

In [8]:
# Dictionary produced by the vectorisation step: type, size, and a preview of
# the first ten entries in sorted key order
print(type(char_idx), len(char_idx), sep='\n')
dict((ch, char_idx[ch]) for ch in sorted(char_idx)[:10])

<class 'dict'>
58


{'\n': 29,
 ' ': 53,
 '!': 9,
 "'": 25,
 ',': 12,
 '-': 10,
 '.': 18,
 ':': 52,
 ';': 20,
 '?': 46}

In [9]:
# Serialize the dict for later runs. The `with` block closes (and flushes) the
# file deterministically instead of leaving an open handle to the garbage collector.
with open(char_idx_file, 'wb') as char_idx_handle:
    pickle.dump(char_idx, char_idx_handle)

# Graph

In [10]:
# Input layer: batches of sequences, N_SEQUENCE steps long, with one feature
# per entry in the character dictionary.
g = tflearn.input_data([None, N_SEQUENCE, len(char_idx)])

# Hidden layers: three stacked LSTMs of 512 units, each followed by dropout(0.5).
# http://tflearn.org/layers/recurrent/#lstm
# http://tflearn.org/layers/core/#dropout
# return_seq: bool. If True, the layer returns the full sequence instead of the
# last sequence output only. The first two LSTMs return full sequences so the
# next LSTM can consume them; the third returns only its last output.
for return_seq in (True, True, False):
    g = tflearn.lstm(g, 512, return_seq=return_seq)
    g = tflearn.dropout(g, 0.5)

# Output layer: fully connected softmax with one unit per dictionary entry.
g = tflearn.fully_connected(g, len(char_idx), activation='softmax')

# Training op.
# http://tflearn.org/layers/estimator/#regression
g = tflearn.regression(
    g,
    optimizer='adam',
    loss='categorical_crossentropy',
    learning_rate=0.001,
)

In [11]:
# Wrap the graph in TFLearn's sequence generator model, using the same
# dictionary and sequence length as the vectorised data.
# http://tflearn.org/models/generator/#sequence-generator-model
m = tflearn.SequenceGenerator(
    g,
    dictionary=char_idx,
    seq_maxlen=N_SEQUENCE,
    clip_gradients=5.0,
    checkpoint_path='model_shakespeare',
)

In [None]:
# Echo the model object; the cell output is its repr.
m

<tflearn.models.generator.SequenceGenerator at 0x11f510470>

In [None]:
# Arguments for each call to fit(), gathered in one place.
fit_options = dict(validation_set=0.1, batch_size=1, n_epoch=1, run_id='shakespeare')

for i in range(1):

    # Pick a random N_SEQUENCE-character excerpt from the text file,
    # e.g. "Now I may,'\nShe that bein". It is only needed by the generation
    # calls below, which are currently disabled.
    seed = random_sequence_from_textfile(path, N_SEQUENCE)

    m.fit(X, Y, **fit_options)

    # Disabled: sample text from the model after each training round.
    # print("-- TESTING...")
    # print("-- Test with temperature of 1.0 --")
    # print(m.generate(600, temperature=1.0, seq_seed=seed))
    # print("-- Test with temperature of 0.5 --")
    # print(m.generate(600, temperature=0.5, seq_seed=seed))

print("done")

Training Step: 63  | total loss: [1m[32m4.09650[0m[0m
[2K| Adam | epoch: 000 | loss: 4.09650 -- iter: 0063/5319
