In [304]:
from __future__ import print_function

import matplotlib.pyplot as plt
import numpy as np
import time

import theano
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM

'''
This script generates a sequence of numbers, then passes a portion of them 1 at a time to an LSTM 
which is then trained to guess the next number. The LSTM is then tested on its ability to guess the
remaining numbers. A stateful LSTM network is used, so only the most recent time step needs to be 
passed in order for the network to learn. 
'''


data = [.1 , .1 , .4 , .1 , .2 ]
data = data * 300
numOfPrevSteps = 1 # We will only pass in 1 timestep at a time. The network will guess the next step from the previous step and its internal state.
batchSize = 2 # We are only tracking a single set of features through time per epoch.
featurelen = 1 # Only a single feature is being trained on. If our data was guess a list of numbers instead of 1 number each time, this would be set equal to the length of that list.
testingSize = 100 # 100 data points will be used as a test set
totalTimeSteps = len(data) # Each element in the data represents one timestep of our single feature.



print('Formatting Data')
'''
The data must be converted into a list of matrices to be fed to our network.
In this case, one matrix must be generated for item in the batch. Our batchsize
is 1, so there will only be 1 matrix in this list. The matrix consists of a list
of features. Each row has 1 column per feature. There is 1 column in the matrix 
per timestep.

So the final form of the data will be a list containing a single matrix, which has 
1 row per timestep, and only 1 column because we only have 1 feature. 
'''
X = np.zeros([batchSize, totalTimeSteps , featurelen]) 
for r in range(totalTimeSteps):
    X[0][r] = data[r]
print('Formatted Data ',X)


print('Building model...')
'''
This problem is very simple, so only 2 layers with 10 nodes
each are used. For more complicated data, more numerous and 
larger layers will likely be required. This data is very simple and 
could probably be trained off of only 1 layer. Remember to set 
return_sequences=False for the last hidden layer.
'''
model = Sequential()
model.add(LSTM(10 ,return_sequences=True, batch_input_shape=(batchSize, numOfPrevSteps , featurelen), stateful=True))
model.add(Dropout(0.2))
model.add(LSTM(10 , return_sequences=False,stateful=True))
model.add(Dropout(0.2))
model.add(Dense( featurelen ))
model.add(Activation('linear'))
model.compile(loss='mean_squared_error', optimizer='rmsprop')
model.reset_states()

print('starting training')
num_epochs = 100
for e in range(num_epochs):
    print('epoch - ',e+1)
    for i in range(0,totalTimeSteps-testingSize):
        model.train_on_batch(X[:, numOfPrevSteps*i:(i+1)*numOfPrevSteps, :], np.reshape(X[:, (i+1)*numOfPrevSteps, :], (batchSize, featurelen)) ) # Train on guessing a single element based on the previous element
    model.reset_states()
print('training complete')


print('warming up on training data') # Predict on all training data in order to warm up for testing data
warmupPredictions = []
for i in range(0,totalTimeSteps-testingSize ):
    pred = model.predict(X[:, numOfPrevSteps*i:(i+1)*numOfPrevSteps, :] )
    warmupPredictions.append(pred)


print('testing network')   
predictions = []
testStart = totalTimeSteps-testingSize -1 #We subtract one because we want the last element of the training set to be first element of the testing set
for i in range(testStart,totalTimeSteps-1):
    pred = model.predict(X[:, numOfPrevSteps*i:(i+1)*numOfPrevSteps, :] )
    predictions.append(pred)
    
targets = []   
for o in range(len(predictions)):
    target = X[0][o+testStart+1]
    targets.append(target)
    guess = predictions[o]
    inputs = X[0][o + testStart ]
    print('prediction ',guess,'target ',target,'inputs ',inputs)
    
model.reset_states()

Formatting Data
Formatted Data  [[[ 0.1]
  [ 0.1]
  [ 0.4]
  ..., 
  [ 0.4]
  [ 0.1]
  [ 0.2]]

 [[ 0. ]
  [ 0. ]
  [ 0. ]
  ..., 
  [ 0. ]
  [ 0. ]
  [ 0. ]]]
Building model...
starting training
epoch -  1
epoch -  2
epoch -  3
epoch -  4
epoch -  5
epoch -  6
epoch -  7
epoch -  8
epoch -  9
epoch -  10
epoch -  11
epoch -  12
epoch -  13
epoch -  14
epoch -  15
epoch -  16
epoch -  17
epoch -  18
epoch -  19
epoch -  20
epoch -  21
epoch -  22
epoch -  23
epoch -  24
epoch -  25
epoch -  26
epoch -  27
epoch -  28
epoch -  29
epoch -  30
epoch -  31
epoch -  32
epoch -  33
epoch -  34
epoch -  35
epoch -  36
epoch -  37
epoch -  38
epoch -  39
epoch -  40
epoch -  41
epoch -  42
epoch -  43
epoch -  44
epoch -  45
epoch -  46
epoch -  47
epoch -  48
epoch -  49
epoch -  50
epoch -  51
epoch -  52
epoch -  53
epoch -  54
epoch -  55
epoch -  56
epoch -  57
epoch -  58
epoch -  59
epoch -  60
epoch -  61
epoch -  62
epoch -  63
epoch -  64
epoch -  65
epoch -  66
epoch -  67
epoch -  

#Problem 1
* does the stateful method work for decoding (i.e. does the model start where it left off if sequence is not fed in all at once)
* can we get varying length input, i.e. not just training batch size?

In [331]:
#Try decoding with just the one step as in training
model.reset_states()

In [333]:
model.predict([np.array([[[ 0.2]],[[ 0.1 ]]],[[[ 0.1]],[[ 0.4 ]]],dtype="float")] )

TypeError: Argument given by name ('dtype') and position (2)

In [None]:
model.predict([np.array([[[0.1]]],dtype="float")] )

In [None]:
model.predict([np.array([[[0.4]]],dtype="float")] )

In [None]:
model.predict([np.array([[[0.1]]],dtype="float")] )

In [None]:
#Now try with multple inputs and see if for input lengths >2 the results are the same
model.reset_states()

In [None]:
model.predict([np.array([[[0.2],[0.1]]],dtype="float")] )

In [None]:
model.reset_states()

In [None]:
model.predict([np.array([[[0.2],[0.1],[0.1]]],dtype="float")] )

In [None]:
model.reset_states()

In [None]:
model.predict([np.array([[[0.2],[0.1],[0.1],[0.4]]],dtype="float")] )

In [None]:
model.reset_states()

In [None]:
model.predict([np.array([[[0.2],[0.1],[0.1],[0.4],[0.1]]],dtype="float")] )

#Conclusion 1: 

* As result is the same if fed in one by one or as a sequence, we do get statefulness until the reset_states() is called.
* As a consequence we can use varying length input in prediction.

#Problem 2:
* Can we access the activations/state of the hidden layers output from the network at run time?

In [274]:
print(model.layers)
print(len(model.layers))

[<keras.layers.recurrent.LSTM object at 0x122697850>, <keras.layers.core.Dropout object at 0x11788da50>, <keras.layers.recurrent.LSTM object at 0x12329bb50>, <keras.layers.core.Dropout object at 0x114c469d0>, <keras.layers.core.Dense object at 0x1150867d0>, <keras.layers.core.Activation object at 0x11520d890>]
6


In [283]:
layer_index = 3 #output from the 4th layer Dropout
get_activations = theano.function([model.layers[0].input], model.layers[layer_index].get_output(), allow_input_downcast=True)

In [284]:
model.reset_states()
get_activations([np.array([[0.2],[0.1],[0.1]],dtype="float")] )

array([[ 0.01731116, -0.03213039,  0.02213202, -0.01911084, -0.05958439,
         0.00447506, -0.09207099, -0.02546852,  0.03363676,  0.00022227]], dtype=float32)

In [292]:
layer_index = 5 #equiv to prediction?
get_activations = theano.function([model.layers[0].input], model.layers[layer_index].get_output(), allow_input_downcast=True)

In [293]:
model.reset_states()
get_activations([np.array([[0.2],[0.1],[0.1]],dtype="float")] )

array([[ 0.47800738]], dtype=float32)

#Conclusion 2:
* Yes we can get the internal activations of any layer during run time- though is it as fast as prediction?

#Problem 3:
* speed comparisons, (how much) does stateful slow things down, and is getting the hidden layer activations slower than prediction?

In [300]:
model.reset_states()
tic =  time.clock()
print('warming up on training data') # Predict on all training data in order to warm up for testing data
warmupPredictions = []
warm_up_training = []
for i in range(0,totalTimeSteps-testingSize):
    warm_up_training.append(X[:, numOfPrevSteps*i:(i+1)*numOfPrevSteps, :])
    pred = model.predict(X[:, numOfPrevSteps*i:(i+1)*numOfPrevSteps, :] )
    warmupPredictions.append(pred)
print(len(warmupPredictions))
print(time.clock() - tic)

warming up on training data
1400
0.655632


In [301]:
model.reset_states()
tic =  time.clock()
layer_index = 5 #equiv to prediction?
get_activations = theano.function([model.layers[0].input], model.layers[layer_index].get_output(), allow_input_downcast=True)
print('warming up on training data') # Predict on all training data in order to warm up for testing data
warmupPredictions2 = []
warm_up_training2 = []
for i in range(0,totalTimeSteps-testingSize):
    warm_up_training2.append(X[:, numOfPrevSteps*i:(i+1)*numOfPrevSteps, :])
    pred = get_activations(X[:, numOfPrevSteps*i:(i+1)*numOfPrevSteps, :] )
    warmupPredictions2.append(pred)
print(len(warmupPredictions2))
print(time.clock() - tic)

warming up on training data
1400
2.150498


In [303]:
for a,b,c,d in zip(warm_up_training,warm_up_training2,warmupPredictions,warmupPredictions2):
    print(a,b,a==b,c,d)

[[[ 0.1]]] [[[ 0.1]]] [[[ True]]] [[ 0.64957732]] [[ 0.64957732]]
[[[ 0.1]]] [[[ 0.1]]] [[[ True]]] [[ 0.7496295]] [[ 0.64957732]]
[[[ 0.4]]] [[[ 0.4]]] [[[ True]]] [[ 0.11653391]] [[ 0.09033854]]
[[[ 0.1]]] [[[ 0.1]]] [[[ True]]] [[ 0.23024239]] [[ 0.64957732]]
[[[ 0.2]]] [[[ 0.2]]] [[[ True]]] [[ 0.12047109]] [[ 0.31567895]]
[[[ 0.1]]] [[[ 0.1]]] [[[ True]]] [[ 0.04760765]] [[ 0.64957732]]
[[[ 0.1]]] [[[ 0.1]]] [[[ True]]] [[ 0.12547265]] [[ 0.64957732]]
[[[ 0.4]]] [[[ 0.4]]] [[[ True]]] [[ 0.10521378]] [[ 0.09033854]]
[[[ 0.1]]] [[[ 0.1]]] [[[ True]]] [[ 0.31149739]] [[ 0.64957732]]
[[[ 0.2]]] [[[ 0.2]]] [[[ True]]] [[ 0.16933043]] [[ 0.31567895]]
[[[ 0.1]]] [[[ 0.1]]] [[[ True]]] [[ 0.28938058]] [[ 0.64957732]]
[[[ 0.1]]] [[[ 0.1]]] [[[ True]]] [[ 0.38015276]] [[ 0.64957732]]
[[[ 0.4]]] [[[ 0.4]]] [[[ True]]] [[ 0.11174104]] [[ 0.09033854]]
[[[ 0.1]]] [[[ 0.1]]] [[[ True]]] [[ 0.10516279]] [[ 0.64957732]]
[[[ 0.2]]] [[[ 0.2]]] [[[ True]]] [[ 0.0559531]] [[ 0.31567895]]
[[[ 0.1]]] [

In [None]:
#Conclusion 3:
* getting the internal states is faster than the overall prediction

#Problem 4: 
* can we allow varying batch size in training?
    * or does this not really matter? it does in so far as we'd want to reset-states before consuming a new dialogue and the dialogues won't neccessarily be neatly divisible by the batch size