This notebook is highly inspired from https://github.com/Lasagne/Recipes/blob/master/examples/lstm_text_generation.py
http://lasagne.readthedocs.io/en/latest/modules/layers/recurrent.html

In [31]:
import numpy as np
import theano
import theano.tensor as T
import lasagne

In [32]:
lasagne.random.set_rng(np.random.RandomState(1))

In [33]:
# Sequence Length
SEQ_LENGTH = 20

# Number of units in the two hidden (LSTM) layers
N_HIDDEN = 512

# Optimization learning rate
LEARNING_RATE = .01

# All gradients above this will be clipped
GRAD_CLIP = 100

# How often should we check the output?
PRINT_FREQ = 100

# Number of epochs to train the net
NUM_EPOCHS = 50

# Batch Size
BATCH_SIZE = 128

# Dataset

In [34]:
import urllib.request #For downloading the sample text file. You won't need this if you are providing your own file.
try:
    in_text = urllib.request.urlopen('https://s3.amazonaws.com/text-datasets/nietzsche.txt').read()
    #in_text = open('your_file.txt', 'r').read()
    #in_text = in_text.decode("utf-8-sig").encode("utf-8")
    in_text = in_text.decode("utf-8")
except Exception as e:
    print("Please verify the location of the input file/URL.")
    print("A sample txt file can be downloaded from https://s3.amazonaws.com/text-datasets/nietzsche.txt")
    raise IOError('Unable to Read Text')

The pre-processing consists in retrieving the list of symbols occuring in the text and for each symbol, convert it into an unique index. This index will be used to create an one-hot representation of the symbol that will be the input of the model.

In [36]:
chars = list(set(in_text))
data_size, vocab_size = len(in_text), len(chars)
char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }
print('Number of unique symbols: {}'.format(vocab_size))
print('Number of symbols in the dataset: {}'.format(data_size))

Number of unique symbols: 84
Number of symbols in the dataset: 600893


The following auxiliary function creates a minibatch in a 3D tensor (batch_size,SEQ_LENGTH,vocab_size).
For each datapoint (fixed first coordinate of the 3D matrix), there is a matrix of dimension (SEQ_LENGTH,vocab_size)
where each line contains the one-hot vector representing the character at the associated position. Notice that the sequences have all the same length (SEQ_LENGTH), which can covers many sentences. TODO: verify if it implies truncated backprop

In [39]:
def gen_data(p, batch_size = BATCH_SIZE, data=in_text, return_target=True):
    """Return a minibatch compatible with the input of the model and the associated targets
    

    Keyword arguments:
    p -- The index of the character to begin to read
    batch_size -- The number of datapoints in the current batch
    data -- The whole text
    return_target -- Create the targets (next character) associated to the sequences
    """
    x = np.zeros((batch_size,SEQ_LENGTH,vocab_size))
    y = np.zeros(batch_size)
    
    for n in range(batch_size):
        ptr = n
        for i in range(SEQ_LENGTH):
            x[n,i,char_to_ix[data[p+ptr+i]]] = 1.
        if(return_target):
            y[n] = char_to_ix[data[p+ptr+SEQ_LENGTH]]
    return x, np.array(y,dtype='int32')

# Model definition

Layers to construct recurrent networks. Recurrent layers can be used similarly to feed-forward layers except that the input shape is expected to be (batch_size, sequence_length, num_inputs). By setting the first two dimensions as None, we are allowing them to vary. They correspond to batch size and sequence length, so we will be able to feed in batches of varying size with sequences of varying length. If `only_return_final` is set, it only returns the final sequential output (e.g. for tasks where a single target value for the entire sequence is desired). In this case, Theano makes an optimization which saves memory.

In [40]:
# First, we build an input layer
# Recurrent layers expect input of shape
# (batch size, SEQ_LENGTH, num_features)

l_in = lasagne.layers.InputLayer(shape=(None, None, vocab_size))

# We now build the LSTM layer which takes l_in as the input layer
# We clip the gradients at GRAD_CLIP to prevent the problem of exploding gradients. 

l_forward_1 = lasagne.layers.LSTMLayer(
    l_in, num_units=N_HIDDEN, grad_clipping=GRAD_CLIP,
    nonlinearity=lasagne.nonlinearities.tanh, only_return_final=True)

l_shp = lasagne.layers.ReshapeLayer(l_forward_1, (-1, N_HIDDEN))

# The output of l_forward_2 of shape (batch_size, N_HIDDEN) is then passed through the softmax nonlinearity to 
# create probability distribution of the prediction
# The output of this stage is (batch_size, vocab_size)
#l_out = lasagne.layers.DenseLayer(l_forward_2, num_units=vocab_size, W = lasagne.init.Normal(), nonlinearity=lasagne.nonlinearities.softmax)
l_out = lasagne.layers.DenseLayer(l_shp, num_units=vocab_size, W = lasagne.init.Normal(), nonlinearity=lasagne.nonlinearities.softmax)


In [41]:
print("Building network ...")
   
# First, we build the network, starting with an input layer
# Recurrent layers expect input of shape
# (batch size, SEQ_LENGTH, num_features)

l_in = lasagne.layers.InputLayer(shape=(None, None, vocab_size))

# We now build the LSTM layer which takes l_in as the input layer
# We clip the gradients at GRAD_CLIP to prevent the problem of exploding gradients. 

l_forward_1 = lasagne.layers.LSTMLayer(
    l_in, num_units=N_HIDDEN, grad_clipping=GRAD_CLIP,
    nonlinearity=lasagne.nonlinearities.tanh)

l_forward_2 = lasagne.layers.LSTMLayer(
    l_forward_1, num_units=N_HIDDEN, grad_clipping=GRAD_CLIP,
    nonlinearity=lasagne.nonlinearities.tanh,
    only_return_final=True)

# The output of l_forward_2 of shape (batch_size, N_HIDDEN) is then passed through the softmax nonlinearity to 
# create probability distribution of the prediction
# The output of this stage is (batch_size, vocab_size)
l_out = lasagne.layers.DenseLayer(l_forward_2, num_units=vocab_size, W = lasagne.init.Normal(), nonlinearity=lasagne.nonlinearities.softmax)

Building network ...


In [42]:
# Theano tensor for the targets
target_values = T.ivector('target_output')
    
# lasagne.layers.get_output produces a variable for the output of the net
network_output = lasagne.layers.get_output(l_out)

# The loss function is calculated as the mean of the (categorical) cross-entropy between the prediction and target.
cost = T.nnet.categorical_crossentropy(network_output,target_values).mean()

Define the 
http://lasagne.readthedocs.io/en/latest/modules/updates.html?highlight=update

In [43]:
# Retrieve all parameters from the network
all_params = lasagne.layers.get_all_params(l_out,trainable=True)

# Compute AdaGrad updates for training
print("Computing updates ...")
updates = lasagne.updates.adagrad(cost, all_params, LEARNING_RATE)

Computing updates ...
Compiling functions ...


In [None]:
# Theano functions for training and computing cost
print("Compiling functions ...")
train = theano.function([l_in.input_var, target_values], cost, updates=updates, allow_input_downcast=True)
compute_cost = theano.function([l_in.input_var, target_values], cost, allow_input_downcast=True)

# In order to generate text from the network, we need the probability distribution of the next character given
# the state of the network and the input (a seed).
# In order to produce the probability distribution of the prediction, we compile a function called probs. 
    
probs = theano.function([l_in.input_var],network_output,allow_input_downcast=True)

The next function generates text given a phrase of length at least SEQ_LENGTH.
The phrase is set using the variable generation_phrase.
The optional input "N" is used to set the number of characters of text to predict. 

In [44]:
generation_phrase = "The meaning of life is" #This phrase will be used as seed to generate text.

def predict(N=200):
    """
    Output a sequence of characters of lenght N according to the current model

    Keyword arguments:
    N -- number of characters to output
    """
    assert(len(generation_phrase)>=SEQ_LENGTH)
    sample_ix = []
    x,_ = gen_data(len(generation_phrase)-SEQ_LENGTH, 1, generation_phrase,0)

    for i in range(N):
        # Pick the character that got assigned the highest probability
        ix = np.argmax(probs(x).ravel())
        # Alternatively, to sample from the distribution instead:
        # ix = np.random.choice(np.arange(vocab_size), p=probs(x).ravel())
        sample_ix.append(ix)
        x[:,0:SEQ_LENGTH-1,:] = x[:,1:,:]
        x[:,SEQ_LENGTH-1,:] = 0
        x[0,SEQ_LENGTH-1,sample_ix[-1]] = 1. 

    random_snippet = generation_phrase + ''.join(ix_to_char[ix] for ix in sample_ix)    
    print("----\n %s \n----" % random_snippet)

In [None]:
print("Training ...")
print("Seed used for text generation is: " + generation_phrase)
p = 0
for it in range(int(data_size * NUM_EPOCHS / BATCH_SIZE)):
    try_it_out() # Generate text using the p^th character as the start. 
            
    avg_cost = 0;
    for i in range(PRINT_FREQ):
        x,y = gen_data(p)

        p += SEQ_LENGTH + BATCH_SIZE - 1 
        if(p+BATCH_SIZE+SEQ_LENGTH >= data_size):
            print('Carriage Return')
            p = 0;

        avg_cost += train(x, y)
        print(i)
    print("Epoch {} average loss = {}".format(it*1.0*PRINT_FREQ/data_size*BATCH_SIZE, avg_cost / PRINT_FREQ))


Training ...
Seed used for text generation is: The meaning of life is
----
 The meaning of life is                                                                                                                                                                                                         
----
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
