In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import pylab
import sys

# default matplotlib figure size, as (width, height), for every plot in this notebook
pylab.rcParams['figure.figsize'] = (10.0, 8.0)

In [2]:
from __future__ import print_function, division
import logging
import pprint
import math
import numpy
import numpy as np
import os
import operator
import theano

from theano import tensor

from collections import OrderedDict

from theano import function
from fuel.datasets import IndexableDataset
from fuel.transformers import Mapping, Batch, Padding, Filter
from fuel.schemes import (ConstantScheme, ShuffledScheme,
                          ShuffledExampleScheme, SequentialExampleScheme,
                          BatchSizeScheme)
from fuel.transformers import Flatten
from fuel.streams import DataStream
from blocks.config import config
from blocks.bricks import Tanh, Initializable, Logistic, Identity
from blocks.bricks.base import application
from blocks.bricks import Linear, Rectifier, Softmax
from blocks.graph import ComputationGraph
from blocks.bricks.lookup import LookupTable
from blocks.bricks.recurrent import SimpleRecurrent, GatedRecurrent, LSTM, Bidirectional
from blocks.bricks.parallel import Fork
from blocks.bricks.sequence_generators import (
    SequenceGenerator, Readout, SoftmaxEmitter, LookupFeedback)
from blocks.algorithms import (GradientDescent, Scale,
                               StepClipping, CompositeRule, AdaDelta, Adam)
from blocks.initialization import Orthogonal, IsotropicGaussian, Constant
from blocks.bricks.cost import CategoricalCrossEntropy
from blocks.bricks.cost import SquaredError
from blocks.serialization import load_parameter_values
from blocks.model import Model
from blocks.monitoring import aggregation
from blocks.extensions import FinishAfter, Printing, Timing
from blocks.extensions.saveload import Checkpoint
from blocks.extensions.monitoring import TrainingDataMonitoring
from blocks.main_loop import MainLoop
from blocks.bricks import WEIGHT
from blocks.roles import INPUT
from blocks.filter import VariableFilter
from blocks.graph import apply_dropout
from blocks.utils import named_copy, dict_union, shared_floatx_nans, shared_floatx_zeros, shared_floatx
from blocks.utils import dict_union, shared_floatx_nans, shared_floatx_zeros, shared_floatx
from blocks.bricks.recurrent import BaseRecurrent
from blocks.bricks.wrappers import As2D
from blocks.bricks.base import lazy
from blocks.bricks.recurrent import recurrent
from blocks.roles import add_role, INITIAL_STATE
from blocks.extensions import SimpleExtension
from blocks.bricks import MLP

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# SOME USEFUL STUFF FOR THEANO DEBUGGING

# notebook-level logger; the DEBUG level is what lets the log print in the notebook
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# raise the recursion limit in the Blocks configuration
config.recursion_limit = 100000

# shorthand for the float dtype theano is configured with
floatX = theano.config.floatX

# theano debugging stuff ('None' is another optimizer setting you can try here)
theano.config.optimizer = 'fast_compile'
theano.config.exception_verbosity = 'high'

# compute_test_value is 'off' by default, meaning the feature is inactive.
# Uncomment the next line to activate it:
# theano.config.compute_test_value = 'warn'

In [4]:
# some filter and mapping functions for fuel to use
def _transpose(data):
    """Return a tuple holding the transpose (``.T``) of every array in ``data``."""
    transposed = []
    for source in data:
        transposed.append(source.T)
    return tuple(transposed)

# swap the (batch, time) axes to make the shape (time, batch, ...)
def _swapaxes(data):
    """Return a tuple in which every array of ``data`` has its first two axes exchanged."""
    return tuple(map(lambda source: source.swapaxes(0, 1), data))

def _filter_long(data, max_length=100):
    """Predicate that accepts examples whose first source is short enough.

    Despite the name, this returns ``True`` for the examples to *keep*.

    Parameters
    ----------
    data : sequence
        One example; only the length of its first element is inspected.
    max_length : int, optional
        Largest accepted length (inclusive). Defaults to 100, the cutoff
        that was previously hard-coded.

    Returns
    -------
    bool
        ``True`` if ``len(data[0]) <= max_length``, ``False`` otherwise.

    """
    return len(data[0]) <= max_length

In [5]:
# this is how you could load a bigger slice of the corpus
# from nltk.corpus import brown
# words_by_line, tags_by_line = zip(*[zip(*sen) for sen in list(brown.tagged_sents())[:10000]])
# for these examples, we need to make sure every seq is the same length to avoid padding/mask issues

# a tiny toy dataset for learning the POS of the ambiguous word "calls"
words_by_line = [['she', 'calls', 'me', 'every', 'day', '.'],
                 ['I', 'received', 'two', 'calls', 'yesterday', '.']] * 10

tags_by_line = [[u'PPS', u'VBZ', u'PPO', u'AT', u'NN', u'.'],
                [u'PPSS', u'VBN', u'CD', u'NNS', u'NR', u'.']] * 10

# index <-> symbol maps; the symbols are sorted so that the integer ids do not
# depend on set iteration order and stay the same across kernel restarts
idx2word = dict(enumerate(sorted(set(w for l in words_by_line for w in l))))
word2idx = {v: k for k, v in idx2word.items()}

idx2tag = dict(enumerate(sorted(set(t for l in tags_by_line for t in l))))
tag2idx = {v: k for k, v in idx2tag.items()}

# the corpus as integer ids, one list per sentence
iwords = [[word2idx[w] for w in l] for l in words_by_line]
itags = [[tag2idx[t] for t in l] for l in tags_by_line]

# now create the fuel dataset
qe_dataset = IndexableDataset(
    indexables=OrderedDict([('words', iwords), ('tags', itags)]))

# now we're going to progressively wrap data streams with other streams that transform the stream somehow
qe_dataset.example_iteration_scheme = ShuffledExampleScheme(qe_dataset.num_examples)
data_stream = qe_dataset.get_example_stream()
data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(1))

# add padding and masks to the dataset
# data_stream = Padding(data_stream, mask_sources=('words','tags'))
# NOTE: mask_sources must be a real tuple. ('words') without the trailing comma
# is just the string 'words', so any `source in mask_sources` check against it
# would be a substring test instead of a membership test.
data_stream = Padding(data_stream, mask_sources=('words',))
data_stream = Mapping(data_stream, _swapaxes)

# Example of how the iterator works
# for batch in list(data_stream.get_epoch_iterator()):
#     print([source.shape for source in batch])

In [6]:
# sizes of the output (tag) and input (word) vocabularies, taken from the index maps
tagset_size = len(tag2idx)
vocab_size = len(word2idx)
# width of the word embeddings and of the recurrent hidden state
dimension = 5

class LookupRecurrent(BaseRecurrent, Initializable):
    """A simple recurrent transition that embeds its own inputs.

    At every step the inputs are passed through an internal
    :class:`LookupTable`, the result is added to the previous states
    multiplied by the recurrent matrix ``W``, and the sum goes through
    ``activation``.  The brick also owns a ``state_to_output`` linear
    layer and a softmax wrapper for turning states into tag
    distributions, plus a trainable initial state.

    Parameters
    ----------
    dim : int
        The dimension of the hidden state
    activation : :class:`.Brick`
        The brick to apply as activation.

    Notes
    -----
    See :class:`.Initializable` for initialization parameters.

    The child bricks are sized from the notebook-level globals
    ``vocab_size``, ``dimension`` and ``tagset_size``, not from ``dim``.
    Because the embedding (width ``dimension``) is added to
    ``states . W`` (width ``dim``), the two are expected to be equal.

    """
    @lazy(allocation=['dim'])
    def __init__(self, dim, activation, **kwargs):
        super(LookupRecurrent, self).__init__(**kwargs)
        self.dim = dim
        
        # embedding table for the input word indices; it is configured and
        # initialized right here, using the global vocab_size / dimension
        word_lookup = LookupTable(vocab_size, dimension)
        word_lookup.weights_init = IsotropicGaussian(0.01)
        word_lookup.initialize()
        self.word_lookup = word_lookup

        # maps a hidden state to one score per tag
        # There will be a Softmax on top of this layer
        state_to_output = Linear(name='state_to_output', input_dim=dimension, output_dim=tagset_size)
        state_to_output.weights_init = IsotropicGaussian(0.01)
        state_to_output.biases_init = Constant(0.0)
        state_to_output.initialize()
        self.state_to_output = state_to_output

        # softmax wrapped with As2D, used by get_predictions / get_feedback
        # note - As2D won't work with masks
        nonlinearity = Softmax()
        wrapper_2D = As2D(nonlinearity.apply)
        wrapper_2D.initialize()
        self.wrapper_2D = wrapper_2D
        
        # a non-linear activation (e.g. Sigmoid, Tanh, ReLU, ...)
        self.activation = activation
        
        # NOTE: apply() reaches the activation through self.children[0],
        # so `activation` has to stay first in this list
        self.children = [activation, state_to_output, nonlinearity, 
                         wrapper_2D, word_lookup]

    @property
    def W(self):
        """The recurrent weight matrix (the first allocated parameter)."""
        return self.parameters[0]

    def get_dim(self, name):
        """Return the dimension of the named input/state of `apply`.

        ``'mask'`` gives 0, any other sequence or state of `apply` gives
        ``self.dim``, and every other name is delegated to the parent class.

        """
        if name == 'mask':
            return 0
        if name in (LookupRecurrent.apply.sequences +
                    LookupRecurrent.apply.states):
            return self.dim
        return super(LookupRecurrent, self).get_dim(name)

    # the initial state is the 'original' lookup+feedback
    # the initial state is combined with the first input to produce the first output
    def _allocate(self):
        """Create the brick's own parameters: ``W`` and ``initial_state``."""
        # parameters[0]: the (dim, dim) recurrent matrix, allocated as NaNs
        # and filled in by _initialize
        self.parameters.append(shared_floatx_nans((self.dim, self.dim),
                                                  name="W"))
        add_role(self.parameters[0], WEIGHT)
        
        # parameters[1]: the trainable initial state. Its starting value is
        # drawn here from numpy's global random generator, not from self.rng
        # and not in _initialize.
        # NOTE(review): this makes the value unseeded -- consider self.rng.
        self.parameters.append(shared_floatx(np.random.random(self.dim,), name="initial_state"))
        add_role(self.parameters[1], INITIAL_STATE)
       
    def _initialize(self):
        """Initialize ``W`` with ``weights_init``; nothing else is touched here."""
        self.weights_init.initialize(self.W, self.rng)
    
    def get_predictions(self, inputs):
        """Map ``inputs`` through ``state_to_output`` and the wrapped softmax.

        Presumably ``inputs`` are hidden states (see the TODO on
        `get_feedback`) -- confirm against callers.

        """
        linear_mapping = self.state_to_output.apply(inputs)
        readouts = self.wrapper_2D.apply(linear_mapping)
        return readouts
    
    # TODO: change inputs-->states or something more clear
    def get_feedback(self, inputs):
        """Compute the argmax predictions for ``inputs`` and look up their feedback.

        NOTE(review): ``self.lookup`` is never assigned anywhere in this
        class as shown, so calling this method looks like it would raise
        ``AttributeError``.  Nothing in the visible notebook calls it.

        """
        linear_mapping = self.state_to_output.apply(inputs)
        readouts = self.wrapper_2D.apply(linear_mapping)
        predictions = readouts.argmax(axis=1)
        return self.lookup.feedback(predictions)
    
    @recurrent(sequences=['inputs', 'mask'], states=['states'], outputs=['states'], contexts=[])
    def apply(self, inputs=None, states=None, mask=None):
        """Apply the transition.

        Parameters
        ----------
        inputs : :class:`~tensor.TensorVariable`
            The inputs for this step.  They are fed to ``word_lookup``,
            so they are word indices rather than feature vectors
            (presumably an integer vector of shape (batch,) -- TODO
            confirm).
        states : :class:`~tensor.TensorVariable`
            The 2D states, in the shape (batch, features).
        mask : :class:`~tensor.TensorVariable`
            A 1D binary array in the shape (batch,) which is 1 if
            there is data available, 0 if not. Assumed to be 1-s
            only if not given.

        Returns
        -------
        next_states : :class:`~tensor.TensorVariable`
            The states after this step.

        """
        # first compute the current representation (_not_ state) via the standard recurrent transition
        current_representation = self.word_lookup.apply(inputs) + tensor.dot(states, self.W)
        # children[0] is the activation brick (see the children list in __init__)
        next_states = self.children[0].apply(current_representation)
        
        # where the mask is 0, carry the previous state over unchanged
        if mask:
            next_states = (mask[:, None] * next_states +
                           (1 - mask[:, None]) * states)

        return next_states

    # trainable initial state
    @application(outputs=apply.states)
    def initial_states(self, batch_size, *args, **kwargs):
        """Return the ``initial_state`` parameter repeated ``batch_size`` times along a new first axis."""
        return tensor.repeat(self.parameters[1][None, :], batch_size, 0)
     

In [7]:
# instantiate the recurrent transition defined above, with a Tanh activation;
# nothing is applied to a batch yet -- that happens when the cost graph is built
transition = LookupRecurrent(dim=dimension, activation=Tanh())

In [8]:
# this is the cost function that we'll use to train our model
def get_cost(words, words_mask, targets):
    """Build the symbolic training cost for a batch of tagged sentences.

    This is the cost function that we'll use to train our model.  The
    global ``transition`` is run over ``words``, every state after the
    initial one is mapped to a softmax over tags, and the categorical
    cross-entropy against the flattened ``targets`` is averaged.

    Returns
    -------
    tuple
        ``(final_cost, y_hat, y, costs, final_cost)``: the mean cost, the
        predicted distributions, the flattened targets, the per-position
        costs and, once more, the very same mean cost variable.

    """
    # comment this out if you are using the GatedRecurrent transition
    all_states = transition.apply(
        inputs=words, mask=words_mask, return_initial_states=True)

    # skip the initial state and collapse the two leading axes into one
    hidden = all_states[1:]
    hidden_shape = hidden.shape
    n_rows = hidden_shape[0] * hidden_shape[1]
    n_features = hidden_shape[2]

    softmax = Softmax()
    tag_scores = transition.state_to_output.apply(
        hidden.reshape((n_rows, n_features)))
    y_hat = softmax.apply(tag_scores)

    # try the blocks crossentropy
    y = targets.flatten()
    costs = theano.tensor.nnet.categorical_crossentropy(y_hat, y)
    final_cost = costs.mean()

    return (final_cost, y_hat, y, costs, final_cost)

In [9]:
def get_prediction(words, words_mask):
    """Build the symbolic tag distributions for a batch of sentences.

    The global ``transition`` is run over ``words``; every state after
    the initial one is mapped through ``state_to_output`` and a softmax.
    The result has one row per flattened position of the state sequence.

    """
    all_states = transition.apply(
        inputs=words, mask=words_mask, return_initial_states=True)

    # we only care about the RNN states, which are the first and only output;
    # the leading entry is the initial state, so skip it
    hidden = all_states[1:]
    hidden_shape = hidden.shape
    n_rows = hidden_shape[0] * hidden_shape[1]
    n_features = hidden_shape[2]

    softmax = Softmax()
    tag_scores = transition.state_to_output.apply(
        hidden.reshape((n_rows, n_features)))
    return softmax.apply(tag_scores)

In [10]:
# symbolic inputs of the model
# (presumably matched by name to the sources of the data stream -- confirm)
words=tensor.lmatrix("words")
words_mask=tensor.matrix("words_mask")
targets=tensor.lmatrix("tags")
# targets_mask=tensor.matrix("tags_mask")

# let's get some feedback from the cost function so we can monitor it
# NOTE: get_cost returns the same mean-cost variable in its first and last
# slot, so `cost` and `true_costs` are one and the same variable
cost, yhat, y, raw_costs, true_costs = get_cost(words, words_mask, targets)

yhat.name = 'yyhat'
y.name = 'y_inside'
raw_costs.name = 'raw_costs'
# this name is overwritten below when `cost.name` is set (same variable)
true_costs.name = 'true_costs'
# mlp_cost.name = 'mlp_cost'

 
# can we just get the computation graph directly here? (without Model)
cost_cg = ComputationGraph(cost)
# the WEIGHT-role variables of the cost graph (not used further in the cells shown here)
weights = VariableFilter(roles=[WEIGHT])(cost_cg.variables)

# NOTE: despite the "regularized" in the name, get_cost adds no regularization term
cost.name = "sequence_log_likelihood_cost_regularized"
# callable built from the prediction graph; it is used for manual testing after training
prediction_model = Model(get_prediction(words, words_mask)).get_theano_function()

  from scan_perform.scan_perform import *


In [11]:
# Construct the main loop and start training!

# set the initialization schemes on the transition, then initialize it
transition.weights_init = IsotropicGaussian(0.1)
transition.biases_init = Constant(0.)
transition.initialize()

# Go through this many batches
num_batches = 5000

# the quantity that is optimized: the batch cost wrapped in a mean aggregation
batch_cost = cost
final_cost = aggregation.mean(batch_cost, 1)
final_cost.name = 'final_cost'
test_model = Model(final_cost)

cg = ComputationGraph(final_cost)

# clip the step, then scale it.
# Other step rules you could try: AdaDelta(), Scale(learning_rate=1e-3),
# or CompositeRule([StepClipping(10.0), Adam()])
step_rule = CompositeRule([StepClipping(10.0), Scale(0.01)])

# note that you must explicitly provide the cost function `cost=...`
algorithm = GradientDescent(
    cost=final_cost,
    parameters=cg.parameters,
    step_rule=step_rule)

# log the name and shape of every parameter of the model
parameters = test_model.get_parameter_dict()
parameter_shapes = [(key, value.get_value().shape)
                    for key, value in parameters.items()]
logger.info("Parameters:\n" + pprint.pformat(parameter_shapes, width=120))

observables = [cost]

# Some other things you could observe during training:
#   algorithm.total_step_norm, algorithm.total_gradient_norm,
#   or, for every (name, parameter) in parameters.items():
#     named_copy(parameter.norm(2), name + "_norm")
#     named_copy(algorithm.gradients[parameter].norm(2), name + "_grad_norm")

# this will be the prefix of the saved model and log
save_path = 'test-lookup-recurrent-model'

average_monitoring = TrainingDataMonitoring(
    observables, prefix="average", every_n_batches=1000)

# Optional extras that are not enabled here:
#   - a hook on FinishAfter to stop when a NaN shows up during training:
#       .add_condition(["after_batch"], _is_nan)
#   - a hook to save the model:
#       Checkpoint(save_path, every_n_batches=1000, save_separately=["model", "log"])
extensions = [
    Timing(),
    TrainingDataMonitoring(observables, after_batch=True),
    average_monitoring,
    FinishAfter(after_n_batches=num_batches),
    Printing(every_n_batches=1000, after_epoch=False),
]

main_loop = MainLoop(
    model=test_model,
    data_stream=data_stream,
    algorithm=algorithm,
    extensions=extensions)
main_loop.run()



INFO:__main__:Parameters:
[('/lookuprecurrent/state_to_output.b', (11,)),
 ('/lookuprecurrent.initial_state', (5,)),
 ('/lookuprecurrent/lookuptable.W', (10, 5)),
 ('/lookuprecurrent.W', (5, 5)),
 ('/lookuprecurrent/state_to_output.W', (5, 11))]



-------------------------------------------------------------------------------
BEFORE FIRST EPOCH
-------------------------------------------------------------------------------
Training status:
	 batch_interrupt_received: False
	 epoch_interrupt_received: False
	 epoch_started: True
	 epochs_done: 0
	 iterations_done: 0
	 received_first_batch: False
	 resumed_from: None
	 training_started: True
Log records from the iteration 0:
	 time_initialization: 0.361103773117


-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
Training status:
	 batch_interrupt_received: False
	 epoch_interrupt_received: False
	 epoch_started: True
	 epochs_done: 49
	 iterations_done: 1000
	 received_first_batch: True
	 resumed_from: None
	 training_started: True
Log records from the iteration 1000:
	 average_sequence_log_likelihood_cost_regularized: 2.30121779442
	 sequence_log_likelihood_cost_regular

In [13]:
# manually test some examples to get an idea what your model learned

# another sentence you could try: [['she', 'calls', 'me', 'every', 'day']]
new_ex = [['she', 'received', 'two', 'calls']]
new_ex_int = [[word2idx[token] for token in sentence] for sentence in new_ex]

# swap the first two axes, like the training stream does with its batches
example = np.swapaxes(np.array(new_ex_int), 0, 1)

# every position holds a real word, so the mask is all ones
full_mask = np.ones(example.shape, dtype=theano.config.floatX)
o = prediction_model(example, full_mask)[0]

# most probable tag for each row of the output
predictions = [idx2tag[tag_id] for tag_id in o.argmax(axis=1)]
predictions

[u'PPS', u'VBN', u'CD', u'NNS']

### Challenges:

(1)
How many batches do you need before the model learns something useful? Any ideas on how to make the training faster?

(2) 
Play with the hyperparameters, embedding size, recurrent transition size, etc... -- how does this affect the model's performance?

(3)
- come up with your own small training set of ambiguous examples (maybe in your native language)
- do a qualitative analysis of what the recurrent model learns
- Note: for this notebook, you need to make sure that your examples are all the same length,
    because we haven't implemented masking

    
    