In [2]:
import string
from pickle import dump
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from numpy import array
from random import randint
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [3]:
# load doc into memory
def load_doc( filename ):
    
    # open the file as read only
    file = open( filename, 'r' )
    # read all text
    text = file.read()
    # close the file
    file.close()
    
    return text

# load document
#in_filename = "../texts/alice-in-wonderland.txt"
in_filename = "../texts/dr-zeuss-compilation.txt"
doc = load_doc( in_filename )
print( doc[ :200 ] )

The Cat in the Hat

By Dr. Seuss

The sun did not shine.
It was too wet to play.
So we sat in the house
All that cold, cold, wet day.

I sat there with Sally.
We sat there, we two.
And I said, "How I 


In [4]:
# turn a doc into clean tokens
def clean_doc( doc, to_lower=True ):
    
    # replace '--' with a space ' '
    doc = doc.replace( '--', ' ' )
    
    # split into tokens by white space
    tokens = doc.split()
    
    # remove punctuation from each token
    table = str.maketrans( '', '', string.punctuation )
    tokens = [ w.translate( table ) for w in tokens ]
    
    # remove remaining tokens that are not alphabetic
    if to_lower:
        tokens = [ word for word in tokens if word.isalpha() ]
    
    # make lower case
    tokens = [ word.lower() for word in tokens ] 
    
    return tokens

In [5]:
# clean document
tokens = clean_doc( doc )
print( tokens[ :200 ] )
print( 'Total Tokens: %d' % len( tokens ) )
print( 'Unique Tokens: %d' % len( set( tokens ) ) )

['the', 'cat', 'in', 'the', 'hat', 'by', 'dr', 'seuss', 'the', 'sun', 'did', 'not', 'shine', 'it', 'was', 'too', 'wet', 'to', 'play', 'so', 'we', 'sat', 'in', 'the', 'house', 'all', 'that', 'cold', 'cold', 'wet', 'day', 'i', 'sat', 'there', 'with', 'sally', 'we', 'sat', 'there', 'we', 'two', 'and', 'i', 'said', 'how', 'i', 'wish', 'we', 'had', 'something', 'to', 'do', 'too', 'wet', 'to', 'go', 'out', 'and', 'too', 'cold', 'to', 'play', 'ball', 'so', 'we', 'sat', 'in', 'the', 'house', 'we', 'did', 'nothing', 'at', 'all', 'so', 'all', 'we', 'could', 'do', 'was', 'to', 'sit', 'sit', 'sit', 'sit', 'and', 'we', 'did', 'not', 'like', 'it', 'not', 'one', 'little', 'bit', 'bump', 'and', 'then', 'something', 'went', 'bump', 'how', 'that', 'bump', 'made', 'us', 'jump', 'we', 'looked', 'then', 'we', 'saw', 'him', 'step', 'in', 'on', 'the', 'mat', 'we', 'looked', 'and', 'we', 'saw', 'him', 'the', 'cat', 'in', 'the', 'hat', 'and', 'he', 'said', 'to', 'us', 'why', 'do', 'you', 'sit', 'there', 'like'

In [6]:
# organize into sequences of tokens
sequence_len = 50 + 1
sequences = list()

for i in range( sequence_len, len( tokens ) ):
    
    # select sequence of tokens
    seq = tokens[ i - sequence_len:i ]
    
    # convert into a line
    line = ' '.join( seq )
    
    # store
    sequences.append( line )
    
print( 'Total Sequences: %d' % len( sequences ) )


Total Sequences: 6239


In [7]:
# save tokens to file, one dialog per line
def save_doc( lines, filename ):
    
    data = '\n'.join( lines )
    file = open( filename, 'w' )
    file.write( data )
    file.close()

In [8]:
# save sequences to file
out_filename = "../texts/dr-zeuss-compilation-sequences.txt"
save_doc( sequences, out_filename )

In [9]:
in_filename = "../texts/dr-zeuss-compilation-sequences.txt"
doc = load_doc( in_filename )
lines = doc.split( '\n' )
lines[ 0:10 ]

['the cat in the hat by dr seuss the sun did not shine it was too wet to play so we sat in the house all that cold cold wet day i sat there with sally we sat there we two and i said how i wish we had something to',
 'cat in the hat by dr seuss the sun did not shine it was too wet to play so we sat in the house all that cold cold wet day i sat there with sally we sat there we two and i said how i wish we had something to do',
 'in the hat by dr seuss the sun did not shine it was too wet to play so we sat in the house all that cold cold wet day i sat there with sally we sat there we two and i said how i wish we had something to do too',
 'the hat by dr seuss the sun did not shine it was too wet to play so we sat in the house all that cold cold wet day i sat there with sally we sat there we two and i said how i wish we had something to do too wet',
 'hat by dr seuss the sun did not shine it was too wet to play so we sat in the house all that cold cold wet day i sat there with sally we sat

## Convert Words to Index Values

In [10]:
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts( lines )
sequences = tokenizer.texts_to_sequences( lines )

In [11]:
len( sequences[ 0 ] ) == sequence_len

True

In [12]:
print( len( tokenizer.word_index ) )
print( type( tokenizer.word_index ) )
print( tokenizer.word_index[ "cat" ] )

855
<class 'dict'>
37


In [13]:
# vocabulary size
vocab_size = len( tokenizer.word_index ) + 1
vocab_size

856

In [14]:
# separate into input and output: for now it's 50 words input and 1 word output
sequences = array( sequences )
X = sequences[ :,:-1 ] # all rows, from word 0 up to, but not including, the last word
y = sequences[ :,-1 ]  # all rows, last word only
y = to_categorical( y, num_classes=vocab_size )
seq_length = X.shape[ 1 ]

In [15]:
seq_length

50

## Define Model

In [16]:
import keras
print( keras.__version__ )

import tensorflow as tf
print( tf.__version__ )

2.1.3
1.4.1


In [17]:
# define model
model = Sequential()
# original embedding size was 50, common values are 50, 100, and 300: dimension used to represent each word.
# we could use pre-calc'd embeddings too: https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/
model.add( Embedding( vocab_size, 300, input_length=seq_length ) )
model.add( LSTM( seq_length * 2, return_sequences=True ) )
model.add( LSTM( seq_length * 2 ) )
model.add( Dense( seq_length * 2, activation='relu' ) )

# fixed TypeError below, downgraded keras from 2.1.5 to 2.1.3: https://github.com/keras-team/keras/issues/9621
# TypeError: softmax() got an unexpected keyword argument 'axis'
model.add( Dense( vocab_size, activation='softmax' ) )

print( model.summary() )

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 300)           256800    
_________________________________________________________________
lstm_1 (LSTM)                (None, 50, 100)           160400    
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 856)               86456     
Total params: 594,156
Trainable params: 594,156
Non-trainable params: 0
_________________________________________________________________
None


## Fit the Model

In [18]:
# calc batch size
print( len( sequences ) / 128 )
print( len( sequences ) / 1028 )
# Was:
# batch_size = 128
batch_size = 1028
# batch_size = 32


48.7421875
6.069066147859922


In [19]:
# compile model
model.compile( loss='categorical_crossentropy', optimizer='adam', metrics=[ 'accuracy' ] )
# fit model
model.fit( X, y, batch_size=batch_size, epochs=300 )

Epoch 1/300


InternalError: CUB segmented reduce errorout of memory
	 [[Node: lstm_1/Sum = Sum[T=DT_FLOAT, Tidx=DT_INT32, keep_dims=false, _device="/job:localhost/replica:0/task:0/device:GPU:0"](lstm_1/zeros_like, lstm_2/Sum/reduction_indices)]]
	 [[Node: metrics/acc/Mean/_149 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_3424_metrics/acc/Mean", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]

Caused by op 'lstm_1/Sum', defined at:
  File "/home/rruiz/anaconda3/envs/py36/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/rruiz/anaconda3/envs/py36/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/rruiz/anaconda3/envs/py36/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/rruiz/anaconda3/envs/py36/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/rruiz/anaconda3/envs/py36/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 486, in start
    self.io_loop.start()
  File "/home/rruiz/anaconda3/envs/py36/lib/python3.6/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/home/rruiz/anaconda3/envs/py36/lib/python3.6/site-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/home/rruiz/anaconda3/envs/py36/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/rruiz/anaconda3/envs/py36/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/home/rruiz/anaconda3/envs/py36/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/rruiz/anaconda3/envs/py36/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/home/rruiz/anaconda3/envs/py36/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/rruiz/anaconda3/envs/py36/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/rruiz/anaconda3/envs/py36/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/rruiz/anaconda3/envs/py36/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/home/rruiz/anaconda3/envs/py36/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/rruiz/anaconda3/envs/py36/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/rruiz/anaconda3/envs/py36/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2728, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/rruiz/anaconda3/envs/py36/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2850, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/rruiz/anaconda3/envs/py36/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-17-ca945c537133>", line 6, in <module>
    model.add( LSTM( seq_length * 2, return_sequences=True ) )
  File "/home/rruiz/anaconda3/envs/py36/lib/python3.6/site-packages/keras/models.py", line 492, in add
    output_tensor = layer(self.outputs[0])
  File "/home/rruiz/anaconda3/envs/py36/lib/python3.6/site-packages/keras/layers/recurrent.py", line 488, in __call__
    return super(RNN, self).__call__(inputs, **kwargs)
  File "/home/rruiz/anaconda3/envs/py36/lib/python3.6/site-packages/keras/engine/topology.py", line 617, in __call__
    output = self.call(inputs, **kwargs)
  File "/home/rruiz/anaconda3/envs/py36/lib/python3.6/site-packages/keras/layers/recurrent.py", line 2032, in call
    initial_state=initial_state)
  File "/home/rruiz/anaconda3/envs/py36/lib/python3.6/site-packages/keras/layers/recurrent.py", line 546, in call
    initial_state = self.get_initial_state(inputs)
  File "/home/rruiz/anaconda3/envs/py36/lib/python3.6/site-packages/keras/layers/recurrent.py", line 475, in get_initial_state
    initial_state = K.sum(initial_state, axis=(1, 2))  # (samples,)
  File "/home/rruiz/anaconda3/envs/py36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py", line 1262, in sum
    return tf.reduce_sum(x, axis, keepdims)
  File "/home/rruiz/anaconda3/envs/py36/lib/python3.6/site-packages/tensorflow/python/ops/math_ops.py", line 1307, in reduce_sum
    name=name)
  File "/home/rruiz/anaconda3/envs/py36/lib/python3.6/site-packages/tensorflow/python/ops/gen_math_ops.py", line 4682, in _sum
    keep_dims=keep_dims, name=name)
  File "/home/rruiz/anaconda3/envs/py36/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/home/rruiz/anaconda3/envs/py36/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 2956, in create_op
    op_def=op_def)
  File "/home/rruiz/anaconda3/envs/py36/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1470, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

InternalError (see above for traceback): CUB segmented reduce errorout of memory
	 [[Node: lstm_1/Sum = Sum[T=DT_FLOAT, Tidx=DT_INT32, keep_dims=false, _device="/job:localhost/replica:0/task:0/device:GPU:0"](lstm_1/zeros_like, lstm_2/Sum/reduction_indices)]]
	 [[Node: metrics/acc/Mean/_149 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_3424_metrics/acc/Mean", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]


In [23]:
# save the model to file
# 00 got ~30% :-(
# 01 got 81% accuracy!
model.save( "models/dr-zeuss-compilation-00.keras" )

# save the tokenizer
dump( tokenizer, open( "tokenizers/dr-zeuss-compilation-00.pkl", 'wb' ) )

## Use The Model to Generate Text

In [24]:
seq_length = len( lines[ 0 ].split() ) - 1
seq_length

50

In [44]:
# # BUG: see shapes below, plus, had to cast explicitly to np array, as opposed to list, 
# # which tokenizer.texts_to_sequences returns

# # get indices for the seed text
# encoded = array( tokenizer.texts_to_sequences( [ seed_text ] )[ 0 ] )[ :-1 ]
# print( type( encoded ) )
# print( encoded.shape )
# print( encoded[ :-1 ].shape )

In [43]:
# # predict probabilities for each word
# yhat = model.predict_classes( encoded, verbose=0 )
# yhat

In [25]:
def generate_seq( model, tokenizer, seq_length, seed_text, n_words ):
    
    result = list()
    in_text = seed_text
    
    # generate a fixed number of words
    for _ in range( n_words ):
        
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences( [ in_text ] )[ 0 ] 
        
        # truncate sequences to a fixed length
        encoded = pad_sequences( [ encoded ], maxlen=seq_length, truncating='pre' ) 
        
        # predict probabilities for each word
        yhat = model.predict_classes( encoded, verbose=0 )
        
        # map predicted word index to word
        out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                break 
                
        # append to input
        in_text += ' ' + out_word
        
        result.append( out_word )
        
    return ' '.join( result )

In [27]:
# select a seed text
seed_text = lines[ randint( 0, len( lines ) ) ]
print( seed_text + '...\n' )
#print( len( seed_text.split( " " ) ) )

# generate new text
generated = generate_seq( model, tokenizer, seq_length, seed_text, 50 )
print( "..." + generated )

beetle noodle bottle paddle battle and now wait a minute mr socks fox when a fox is in the bottle where the tweetle beetles battle with their paddles in a puddle on a noodleeating poodle this is what they call a tweetle beetle noodle poodle bottled paddled muddled duddled fuddled wuddled...

...fox in socks sir thats my tongue isnt my fuddled wuddled make of stop a tweetle beetle bottle thats much fox in this tweetle beetles battle in this tweetle beetle bottle beetle noodle words no say in this tweetle beetles battle in a brothers read a brothers read a tweetle


In [28]:
def generate_seq_word_by_word( model, tokenizer, seq_length, seed_text, n_words ):
    
    print( "...", end='' )
    #result = list()
    in_text = seed_text
    
    # generate a fixed number of words
    for _ in range( n_words ):
        
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences( [ in_text ] )[ 0 ] 
        
        # truncate sequences to a fixed length
        encoded = pad_sequences( [ encoded ], maxlen=seq_length, truncating='pre' ) 
        
        # predict probabilities for each word
        yhat = model.predict_classes( encoded, verbose=0 )
        
        # map predicted word index to word
        #out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                print( word, end=' ' )
                break 
                
        # append to input for next iteration
        in_text += ' ' + out_word

In [29]:
# select a seed text
seed_text = lines[ randint( 0, len( lines ) ) ]
print( seed_text + '...\n' )

# generate new text
generate_seq_word_by_word( model, tokenizer, seq_length, seed_text, 50 )

be in this house make them go they should not be here when your mother is not put them out put them out said the fish in the pot have no fear little fish said the cat in the hat these things are good things and he gave them a pat...

...asked you fear what good fun said the cat in the cat in the box i do not sing they do not like our bike of things is fun at this is good what oh no fear i do not like to go ask no oh no fear i do 