In [1]:
# create a character level Fuel dataset from text file

%matplotlib inline
%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
import pylab
pylab.rcParams['figure.figsize'] = (10.0, 8.0)

In [2]:
import os
import codecs
import subprocess
from subprocess import Popen, PIPE, STDOUT
from toolz import merge

import logging
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logging.debug("test")

import numpy as np
from abc import ABCMeta, abstractmethod
from toolz import merge
import pandas as pd
from six import add_metaclass
from picklable_itertools.extras import equizip
from blocks.bricks.recurrent import (GatedRecurrent, Bidirectional)
from blocks.initialization import IsotropicGaussian, Constant, Orthogonal
import theano
import theano.tensor as T

from blocks.bricks import Initializable
from blocks.bricks.base import application, Brick, lazy
from blocks.bricks import Tanh, Linear, MLP
from blocks.bricks.lookup import LookupTable
from blocks.bricks.parallel import Fork
from blocks.utils import (shared_floatx_nans, dict_union)
from blocks.roles import add_role, WEIGHT
from blocks.bricks import Tanh, Linear, Softmax, MLP


from fuel.datasets import TextFile
from fuel.schemes import ConstantScheme
from fuel.streams import DataStream
from fuel.transformers import (
    Merge, Batch, Filter, Padding, SortMapping, Unpack, Mapping)

DEBUG:root:test


In [20]:
# path to your training data
TRAINING_DATASET = '/home/chris/projects/machine_learning/dcu-character-lms/data/paul_graham_essays.train.txt'
DEV_DATASET = '/home/chris/projects/machine_learning/dcu-character-lms/data/paul_graham_essays.dev.txt'

UNKNOWN_TOKEN = '|'
EOS_TOKEN = '`'

In [31]:
def get_vocab(dataset):
    all_symbols = set()
    with codecs.open(dataset, encoding='utf8') as inp: 
        for l in inp.read().strip().split('\n'):
            all_symbols.update(l)
    return all_symbols
    
    
def get_y_file(dataset):
    
    # a file to hold the Y representation of the dataset
    y_tmp_file = os.path.join(os.path.basename(dataset), dataset+'_temp_y.txt')
    
    with codecs.open(dataset, encoding='utf8') as inp:
        with codecs.open(
            y_tmp_file, 'wb',encoding='utf8') as y_out:
            for l in inp.read().strip().split('\n'):
                y_seq = l[1:] + EOS_TOKEN
                assert len(l) == len(y_seq)
    
                y_out.write(''.join(y_seq) + '\n')

    return y_tmp_file

In [63]:
class CharTextFile(TextFile):
    def __init__(self, *args, **kwargs):
        super(CharTextFile, self).__init__(*args, **kwargs)
    
    # override parent method to only do rstrip instead of strip
    def get_data(self, state=None, request=None):
        if request is not None:
            raise ValueError
        sentence = next(state)
        if self.preprocess is not None:
            sentence = self.preprocess(sentence)
        data = [self.dictionary[self.bos_token]] if self.bos_token else []
        if self.level == 'word':
            data.extend(self.dictionary.get(word,
                                            self.dictionary[self.unk_token])
                        for word in sentence.split())
        else:
            data.extend(self.dictionary.get(char,
                                            self.dictionary[self.unk_token])
                        for char in sentence.rstrip())
        if self.eos_token:
            data.append(self.dictionary[self.eos_token])
        return (data,)

In [64]:
# add an unknown token in case we observe a new character at prediction time
vocab = get_vocab(TRAINING_DATASET)
vocab.update([UNKNOWN_TOKEN, EOS_TOKEN])
word2idx = {v:k for k,v in enumerate(vocab)}

idx2word = {v:k for k,v in word2idx.items()}

In [65]:
from fuel.datasets import TextFile
from fuel.schemes import ConstantScheme
from fuel.streams import DataStream
from fuel.transformers import (
    Merge, Batch, Filter, Padding, SortMapping, Unpack, Mapping)


# axis swapping so that we get (time, batch, features)
def swapaxes(data):
    """Switch the axes in the sequence parts of the data tuple"""
    return tuple(array.swapaxes(0,1) for array in data)

class _too_long(object):
    """Filters sequences longer than given sequence length."""
    def __init__(self, seq_len=50):
        self.seq_len = seq_len

    def __call__(self, sentence_pair):
        return all([len(sentence) <= self.seq_len
                    for sentence in sentence_pair])


def get_stream(vocab, x_file,
               seq_len=50, batch_size=20, sort_k_batches=5, unk_token="`", **kwargs):
    """Prepares the data stream."""
    
    y_file = get_y_file(x_file)

    def _length(item):
        """Item is assumed to be (x,y)"""
        return len(item[0])
    
    # Get text files from both source and target
    X = CharTextFile([x_file], vocab, bos_token=None, eos_token=None,
                 unk_token=unk_token, level='character')

    Y = CharTextFile([y_file], vocab, bos_token=None, eos_token=None,
                 unk_token=unk_token, level='character')

    # Merge them to get x1, x2 pairs
    stream = Merge([X.get_example_stream(),
                    Y.get_example_stream()],
                    ('x', 'y'))

    # Filter sequences that are too long
    stream = Filter(stream, predicate=_too_long(seq_len=seq_len))

    # LOOKAHEAD SORT
    # Build a batched version of stream to read k batches ahead
#     stream = Batch(stream,
#                    iteration_scheme=ConstantScheme(
#                    batch_size*sort_k_batches))
    
    # Sort all samples in the read-ahead batch
#     stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
#     stream = Unpack(stream)
    # END LOOKAHEAD SORT

    # Construct batches from the stream with specified batch size
    stream = Batch(
        stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = Padding(stream)
    
    # transpose tensors to get (time, batch, features)
    masked_stream = Mapping(masked_stream, swapaxes)

    return masked_stream

In [73]:
# def get_stream(vocab, x_file, y_file,
#                seq_len=50, batch_size=20, sort_k_batches=5, unk_token="`", **kwargs):

BATCH_SIZE = 20
MAX_SEQ_LEN = 200

train_stream = get_stream(word2idx, TRAINING_DATASET, batch_size=BATCH_SIZE, seq_len=MAX_SEQ_LEN)
dev_stream = get_stream(word2idx, DEV_DATASET, batch_size=BATCH_SIZE)

In [74]:
j = 0 
for tx, tx_mask, ty, ty_mask in list(train_stream.get_epoch_iterator()):
    l = len([idx2word[i] for i in tx[:, 0]])
    if l == 19:
        print(j)
    j += 1
    print(tx.shape)
    print(ty.shape)
    if tx.shape != ty.shape:
        print('ERROR')
        print(j)
        print(tx.shape)
        print(ty.shape)

(99, 20)
(99, 20)
(96, 20)
(96, 20)
(98, 20)
(98, 20)
(83, 20)
(83, 20)
(97, 20)
(97, 20)
(98, 20)
(98, 20)
(93, 20)
(93, 20)
(91, 20)
(91, 20)
(97, 20)
(97, 20)
(96, 20)
(96, 20)
(97, 20)
(97, 20)
(96, 20)
(96, 20)
(97, 20)
(97, 20)
(93, 20)
(93, 20)
(95, 20)
(95, 20)
(98, 20)
(98, 20)
(89, 20)
(89, 20)
(95, 20)
(95, 20)
(96, 20)
(96, 20)
(97, 20)
(97, 20)
(92, 20)
(92, 20)
(88, 20)
(88, 20)
(92, 20)
(92, 20)
(97, 20)
(97, 20)
(96, 20)
(96, 20)
(98, 20)
(98, 20)
(98, 20)
(98, 20)
(96, 20)
(96, 20)
(97, 20)
(97, 20)
(94, 20)
(94, 20)
(97, 20)
(97, 20)
(98, 20)
(98, 20)
(98, 20)
(98, 20)
(97, 20)
(97, 20)
(97, 20)
(97, 20)
(94, 20)
(94, 20)
(92, 20)
(92, 20)
(96, 20)
(96, 20)
(94, 20)
(94, 20)
(98, 20)
(98, 20)
(96, 20)
(96, 20)
(97, 20)
(97, 20)
(98, 20)
(98, 20)
(95, 20)
(95, 20)
(97, 20)
(97, 20)
(98, 20)
(98, 20)
(96, 20)
(96, 20)
(98, 20)
(98, 20)
(84, 20)
(84, 20)
(95, 20)
(95, 20)
(97, 20)
(97, 20)
(98, 20)
(98, 20)
(99, 20)
(99, 20)
(93, 20)
(93, 20)
(85, 20)
(85, 20)
(95, 20)
(

In [75]:
# Create the Model

class RNNLM(Initializable):
    """Model for character level LM"""

    def __init__(self, vocab_size, embedding_dim, state_dim, output_dim, **kwargs):
        super(RNNLM, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.output_dim = output_dim

        self.lookup = LookupTable(name='embeddings')
        self.transition = GatedRecurrent(activation=Tanh(), dim=state_dim)
        
        self.fork = Fork(
            [name for name in self.transition.apply.sequences
             if name != 'mask'], prototype=Linear(), name='fork')

        # output layer -- this may need to be changed
        self.output_layer = Linear(name='output_layer')
        
        self.children = [self.lookup, self.transition,
                         self.fork, self.output_layer]


    def _push_allocation_config(self):
        self.lookup.length = self.vocab_size
        self.lookup.dim = self.embedding_dim

        self.fork.input_dim = self.embedding_dim
        self.fork.output_dims = [self.transition.get_dim(name)
                                 for name in self.fork.output_names]
        
        self.output_layer.input_dim = self.state_dim
        self.output_layer.output_dim = self.output_dim

    def cost(self, x, x_mask, y, y_mask):
        
        representation = self.lookup.apply(x)
        
        states = self.transition.apply(**merge(self.fork.apply(representation, as_dict=True), {'mask': x_mask}))
        
        # get cost from output layer, transform inputs as necessary
        states_shape = states.shape
        states_dim1 = states_shape[0] * states_shape[1]
        states_dim2 = states_shape[2]

        y_hat = Softmax().apply(
            self.output_layer.apply(states.reshape((states_dim1, states_dim2))))

        y_preds = y_hat.argmax(axis=1)

        y = y.flatten()
        costs = T.nnet.categorical_crossentropy(y_hat, y)

        flat_y_mask = y_mask.flatten()

        # here we are zeroing the fake costs for masked parts of the predictions, and dividing by the number of actual
        # instances
        final_cost = T.sum(costs * flat_y_mask) / T.sum(flat_y_mask)
        
        return final_cost
    
    # TODO: implement predict and sample
        

In [76]:
VOCAB_SIZE = len(word2idx)
STATE_DIM=100
EMBEDDING_DIM=52
OUTPUT_DIM=len(word2idx)

test_rnnlm = RNNLM(VOCAB_SIZE, EMBEDDING_DIM, STATE_DIM, OUTPUT_DIM)

In [77]:
# initialize for testing
test_rnnlm.weights_init = IsotropicGaussian(0.1)
test_rnnlm.biases_init = Constant(0)
test_rnnlm.push_initialization_config()
test_rnnlm.initialize()

In [78]:
# Make symbolic variables
x = T.lmatrix("x")
x_mask = T.matrix("x_mask")
y = T.lmatrix("y")
y_mask = T.matrix("y_mask")

cost = test_rnnlm.cost(x, x_mask, y, y_mask)
cost.name = 'minibatch_cost'

# test_cost_func = theano.function([x, x_mask, y, y_mask], test_val)

In [None]:
import pprint

from blocks.monitoring import aggregation
from blocks.graph import ComputationGraph, apply_noise
from blocks.model import Model
from blocks.algorithms import (GradientDescent, Scale,
                               StepClipping, CompositeRule, AdaDelta, Adam)
from blocks.extensions.monitoring import TrainingDataMonitoring, DataStreamMonitoring
from blocks.main_loop import MainLoop
from blocks.extensions import FinishAfter, Printing, Timing
from blocks.extensions.stopping import FinishIfNoImprovementAfter
from blocks.graph import apply_dropout, apply_noise

# create the Blocks main loop
cost_cg = ComputationGraph(cost)
model = Model(cost)

algorithm = GradientDescent(
    cost=cost, parameters=cost_cg.parameters,
    step_rule=CompositeRule([StepClipping(1.),
                             AdaDelta()])
)

parameters = model.get_parameter_dict()
logger.info("Parameters:\n" +
                pprint.pformat(
                    [(key, value.get_value().shape) for key, value in parameters.items()],
                    width=120))

# Note that observables also get added to the log -- this is useful for post-processing
observables = [cost]


extensions = [
#     Timing(every_n_batches=1),
    TrainingDataMonitoring(observables, after_batch=True),
    DataStreamMonitoring(
        [cost],
        data_stream=dev_stream,
        prefix="valid",
        every_n_batches=10
    ),
    FinishAfter(after_n_batches=10000),
    Printing(every_n_batches=1, after_epoch=False)
]


main_loop = MainLoop(
    model=model,
    data_stream=train_stream,
    algorithm=algorithm,
    extensions=extensions
)

main_loop.run()


INFO:blocks.algorithms:Taking the cost gradient
INFO:blocks.algorithms:The cost gradient computation graph is built
INFO:root:Parameters:
[('/output_layer.b', (52,)),
 ('/gatedrecurrent.initial_state', (100,)),
 ('/fork/fork_gate_inputs.b', (200,)),
 ('/fork/fork_gate_inputs.W', (52, 200)),
 ('/fork/fork_inputs.b', (100,)),
 ('/fork/fork_inputs.W', (52, 100)),
 ('/embeddings.W', (52, 52)),
 ('/gatedrecurrent.state_to_gates', (100, 200)),
 ('/gatedrecurrent.state_to_state', (100, 100)),
 ('/output_layer.W', (100, 52))]
DEBUG:blocks.monitoring.evaluators:variable to evaluate: minibatch_cost
DEBUG:blocks.monitoring.evaluators:Using the default  (average over minibatches) aggregation scheme for minibatch_cost
DEBUG:blocks.monitoring.evaluators:Compiling initialization and readout functions
DEBUG:blocks.monitoring.evaluators:Initialization and readout functions compiled
DEBUG:blocks.monitoring.evaluators:variable to evaluate: minibatch_cost
DEBUG:blocks.monitoring.evaluators:Compiling initi


-------------------------------------------------------------------------------
BEFORE FIRST EPOCH
-------------------------------------------------------------------------------
Training status:
	 batch_interrupt_received: False
	 epoch_interrupt_received: False
	 epoch_started: True
	 epochs_done: 0
	 iterations_done: 0
	 received_first_batch: False
	 resumed_from: None
	 training_started: True
Log records from the iteration 0:
	 valid_minibatch_cost: 3.95420837402


-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
Training status:
	 batch_interrupt_received: False
	 epoch_interrupt_received: False
	 epoch_started: True
	 epochs_done: 0
	 iterations_done: 1
	 received_first_batch: True
	 resumed_from: None
	 training_started: True
Log records from the iteration 1:
	 minibatch_cost: 3.953820467


-------------------------------------------------------------------------------

INFO:blocks.extensions.monitoring:Monitoring on auxiliary data started
INFO:blocks.extensions.monitoring:Monitoring on auxiliary data finished


-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
Training status:
	 batch_interrupt_received: False
	 epoch_interrupt_received: False
	 epoch_started: True
	 epochs_done: 0
	 iterations_done: 9
	 received_first_batch: True
	 resumed_from: None
	 training_started: True
Log records from the iteration 9:
	 minibatch_cost: 3.61965918541


-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
Training status:
	 batch_interrupt_received: False
	 epoch_interrupt_received: False
	 epoch_started: True
	 epochs_done: 0
	 iterations_done: 10
	 received_first_batch: True
	 resumed_from: None
	 training_started: True
Log records from the iteration 10:
	 minibatch_cost: 3.43964910507
	 valid_minibatch_cost: 3.29560899734


----------------------------------------------------------------

INFO:blocks.extensions.monitoring:Monitoring on auxiliary data started
INFO:blocks.extensions.monitoring:Monitoring on auxiliary data finished


-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
Training status:
	 batch_interrupt_received: False
	 epoch_interrupt_received: False
	 epoch_started: True
	 epochs_done: 0
	 iterations_done: 19
	 received_first_batch: True
	 resumed_from: None
	 training_started: True
Log records from the iteration 19:
	 minibatch_cost: 3.03987407684


-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
Training status:
	 batch_interrupt_received: False
	 epoch_interrupt_received: False
	 epoch_started: True
	 epochs_done: 0
	 iterations_done: 20
	 received_first_batch: True
	 resumed_from: None
	 training_started: True
Log records from the iteration 20:
	 minibatch_cost: 3.04270458221
	 valid_minibatch_cost: 3.090726614


----------------------------------------------------------------

INFO:blocks.extensions.monitoring:Monitoring on auxiliary data started
INFO:blocks.extensions.monitoring:Monitoring on auxiliary data finished


-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
Training status:
	 batch_interrupt_received: False
	 epoch_interrupt_received: False
	 epoch_started: True
	 epochs_done: 0
	 iterations_done: 29
	 received_first_batch: True
	 resumed_from: None
	 training_started: True
Log records from the iteration 29:
	 minibatch_cost: 3.04321408272


-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
Training status:
	 batch_interrupt_received: False
	 epoch_interrupt_received: False
	 epoch_started: True
	 epochs_done: 0
	 iterations_done: 30
	 received_first_batch: True
	 resumed_from: None
	 training_started: True
Log records from the iteration 30:
	 minibatch_cost: 3.00761699677
	 valid_minibatch_cost: 3.02150440216


--------------------------------------------------------------

INFO:blocks.extensions.monitoring:Monitoring on auxiliary data started
INFO:blocks.extensions.monitoring:Monitoring on auxiliary data finished


-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
Training status:
	 batch_interrupt_received: False
	 epoch_interrupt_received: False
	 epoch_started: True
	 epochs_done: 0
	 iterations_done: 39
	 received_first_batch: True
	 resumed_from: None
	 training_started: True
Log records from the iteration 39:
	 minibatch_cost: 2.93052244186


-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
Training status:
	 batch_interrupt_received: False
	 epoch_interrupt_received: False
	 epoch_started: True
	 epochs_done: 0
	 iterations_done: 40
	 received_first_batch: True
	 resumed_from: None
	 training_started: True
Log records from the iteration 40:
	 minibatch_cost: 2.95222306252
	 valid_minibatch_cost: 3.0037150383


---------------------------------------------------------------