In [8]:
# create a character level Fuel dataset from text file

%matplotlib inline
%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
import pylab
pylab.rcParams['figure.figsize'] = (10.0, 8.0)

In [53]:
import os
import codecs
import subprocess
from subprocess import Popen, PIPE, STDOUT
from toolz import merge

import logging
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logging.debug("test")

import numpy as np
from abc import ABCMeta, abstractmethod
from toolz import merge
import pandas as pd
from six import add_metaclass
from picklable_itertools.extras import equizip
from blocks.bricks.recurrent import (GatedRecurrent, Bidirectional)
from blocks.initialization import IsotropicGaussian, Constant, Orthogonal
import theano
import theano.tensor as T

from blocks.bricks import Initializable
from blocks.bricks.base import application, Brick, lazy
from blocks.bricks import Tanh, Linear, MLP
from blocks.bricks.lookup import LookupTable
from blocks.bricks.parallel import Fork
from blocks.utils import (shared_floatx_nans, dict_union)
from blocks.roles import add_role, WEIGHT


from fuel.datasets import TextFile
from fuel.schemes import ConstantScheme
from fuel.streams import DataStream
from fuel.transformers import (
    Merge, Batch, Filter, Padding, SortMapping, Unpack, Mapping)

DEBUG:root:test


In [108]:
# Directory that holds the corpus files. Set the DCU_CHARLM_DATA_DIR
# environment variable to point somewhere else; the fallback is the original
# hard-coded location, so existing setups resolve to the same paths as before.
DATA_DIR = os.environ.get(
    'DCU_CHARLM_DATA_DIR',
    '/home/chris/projects/machine_learning/dcu-character-lms/data')

# path to your training data
TRAINING_DATASET = os.path.join(DATA_DIR, 'paul_graham_essays.train.txt')
DEV_DATASET = os.path.join(DATA_DIR, 'paul_graham_essays.dev.txt')

# Single-character symbols reserved for "unknown character" and
# "end of sequence"; both are added to the vocabulary further down.
UNKNOWN_TOKEN = '|'
EOS_TOKEN = '`'

In [109]:
def get_vocab(dataset):
    """Collect the set of distinct characters used in a UTF-8 text file.

    The file content is stripped of leading/trailing whitespace first, and
    the newline character that separates lines is never part of the result.

    Parameters
    ----------
    dataset : str
        Path of the text file to scan.

    Returns
    -------
    set
        Every character that occurs in the stripped text, except ``'\\n'``.
    """
    with codecs.open(dataset, encoding='utf8') as corpus:
        text = corpus.read().strip()

    symbols = set(text)
    # Lines are delimited by '\n', so it is a separator, not a symbol.
    symbols.discard('\n')
    return symbols
    
    
def get_y_file(dataset):
    """Write the next-character target file for ``dataset`` and return its path.

    Every line of the input is shifted left by one character and terminated
    with ``EOS_TOKEN``, so the target line has exactly the same length as the
    input line and position ``t`` of the target holds the character that
    follows position ``t`` of the input.

    Empty lines are written through unchanged (still empty) so that the input
    and target files stay aligned line by line and length by length.

    Parameters
    ----------
    dataset : str
        Path of the UTF-8 input text file (absolute or relative).

    Returns
    -------
    str
        Path of the generated target file: ``dataset + '_temp_y.txt'``,
        i.e. it is created next to the input file.
    """
    # Build the path directly from `dataset`. Joining it onto
    # os.path.basename(dataset) only worked for absolute paths (where
    # os.path.join drops the first component) and produced a bogus location
    # for relative ones.
    y_tmp_file = dataset + '_temp_y.txt'

    with codecs.open(dataset, encoding='utf8') as inp:
        lines = inp.read().strip().split('\n')

    with codecs.open(y_tmp_file, 'wb', encoding='utf8') as y_out:
        for line in lines:
            # Drop the first character and append EOS; an empty line has
            # nothing to predict and must stay empty to keep lengths equal.
            y_seq = line[1:] + EOS_TOKEN if line else line
            y_out.write(y_seq + '\n')

    return y_tmp_file

In [110]:
# add an unknown token in case we observe a new character at prediction time
vocab = get_vocab(TRAINING_DATASET)
vocab.update([UNKNOWN_TOKEN, EOS_TOKEN])

# Number the symbols in sorted order. Iterating over a set directly has no
# guaranteed order, so the symbol -> index mapping could differ between runs
# and make saved parameters or encoded data incompatible.
word2idx = {symbol: index for index, symbol in enumerate(sorted(vocab))}

# Inverse mapping, used to turn predicted indices back into characters.
idx2word = {index: symbol for symbol, index in word2idx.items()}

In [111]:
from fuel.datasets import TextFile
from fuel.schemes import ConstantScheme
from fuel.streams import DataStream
from fuel.transformers import (
    Merge, Batch, Filter, Padding, SortMapping, Unpack, Mapping)


# axis swapping so that we get (time, batch, features)
def swapaxes(data):
    """Exchange the first two axes of every array in ``data``.

    ``data`` is an iterable of array-like objects exposing ``swapaxes``;
    the result is a tuple holding the transposed arrays in the same order.
    """
    swapped = []
    for source in data:
        swapped.append(source.swapaxes(0, 1))
    return tuple(swapped)

class _too_long(object):
    """Filters sequences longer than given sequence length."""
    def __init__(self, seq_len=50):
        self.seq_len = seq_len

    def __call__(self, sentence_pair):
        return all([len(sentence) <= self.seq_len
                    for sentence in sentence_pair])


def get_stream(vocab, x_file,
               seq_len=50, batch_size=20, sort_k_batches=5, unk_token="`", **kwargs):
    """Build the padded, length-sorted mini-batch stream for one text file.

    The target file is generated from ``x_file`` with ``get_y_file``; both
    files are read at character level, merged into ``('x', 'y')`` examples,
    filtered by length, sorted inside a read-ahead window, batched, padded
    with masks and finally transposed with ``swapaxes``.

    Parameters
    ----------
    vocab : dict
        Mapping from character to integer index; must contain ``unk_token``.
    x_file : str
        Path of the input text file.
    seq_len : int
        Examples with any sequence longer than this are dropped.
    batch_size : int
        Number of examples per mini-batch.
    sort_k_batches : int
        Number of batches read ahead and sorted by input length together.
    unk_token : str
        Symbol substituted for characters missing from ``vocab``.
        NOTE(review): the default is the same character as the notebook's
        EOS_TOKEN, while UNKNOWN_TOKEN is '|'. Callers probably want to pass
        ``unk_token=UNKNOWN_TOKEN`` -- confirm before changing the default.
    **kwargs
        Accepted for call-site compatibility; not used.

    Returns
    -------
    The resulting Fuel stream. With masks on both sources it is expected to
    provide ``x``, ``x_mask``, ``y``, ``y_mask`` (in that order).
    """

    y_file = get_y_file(x_file)

    def _length(item):
        """Item is assumed to be (x,y)"""
        return len(item[0])

    # Get text files from both source and target
    X = TextFile([x_file], vocab, bos_token=None, eos_token=None,
                 unk_token=unk_token, level='character')

    Y = TextFile([y_file], vocab, bos_token=None, eos_token=None,
                 unk_token=unk_token, level='character')

    # Merge them to get (x, y) pairs
    stream = Merge([X.get_example_stream(),
                    Y.get_example_stream()],
                   ('x', 'y'))

    # Keep only the examples whose sequences all fit within seq_len
    stream = Filter(stream, predicate=_too_long(seq_len=seq_len))

    # LOOKAHEAD SORT
    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(
                       batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)
    # END LOOKAHEAD SORT

    # Construct batches from the stream with specified batch size
    stream = Batch(
        stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short. The mask sources must be the names given
    # to Merge above; the previous 'x1'/'x2' matched no source, so nothing
    # was padded or masked.
    masked_stream = Padding(stream, mask_sources=['x', 'y'])

    # transpose tensors to get (time, batch, features)
    masked_stream = Mapping(masked_stream, swapaxes)

    return masked_stream

In [112]:
# Build the batch streams for the training and development corpora.
# Both are encoded with the vocabulary derived from the training set.

train_stream = get_stream(word2idx, TRAINING_DATASET)

dev_stream = get_stream(word2idx, DEV_DATASET)

In [84]:
# Create the Model

class RNNLM(Initializable):
    """Character-level recurrent language model.

    Input symbols are embedded with a lookup table, projected by a ``Fork``
    of ``Linear`` bricks onto the inputs of a ``GatedRecurrent`` transition,
    and the recurrent states are mapped by a ``Linear`` output layer to one
    score per output class.

    Parameters
    ----------
    vocab_size : int
        Number of rows of the embedding table.
    embedding_dim : int
        Dimension of the character embeddings.
    state_dim : int
        Dimension of the recurrent state.
    output_dim : int
        Dimension of the output layer. ``cost`` uses the entries of ``y`` as
        class indices into this dimension, so for language modelling it has
        to equal the vocabulary size.
    """

    def __init__(self, vocab_size, embedding_dim, state_dim, output_dim, **kwargs):
        super(RNNLM, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.output_dim = output_dim

        # Child dimensions are left unset here and filled in by
        # _push_allocation_config.
        self.lookup = LookupTable(name='embeddings')
        self.transition = GatedRecurrent(activation=Tanh(), dim=state_dim)

        # One Linear projection per sequence input of the transition,
        # excluding the mask.
        self.fork = Fork(
            [name for name in self.transition.apply.sequences
             if name != 'mask'], prototype=Linear(), name='fork')

        # output layer: recurrent state -> per-class scores
        self.output_layer = Linear(name='output_layer')

        self.children = [self.lookup, self.transition,
                         self.fork, self.output_layer]

    def _push_allocation_config(self):
        """Propagate the configured dimensions to the child bricks."""
        self.lookup.length = self.vocab_size
        self.lookup.dim = self.embedding_dim

        self.fork.input_dim = self.embedding_dim
        self.fork.output_dims = [self.transition.get_dim(name)
                                 for name in self.fork.output_names]

        self.output_layer.input_dim = self.state_dim
        self.output_layer.output_dim = self.output_dim

    def cost(self, x, x_mask, y, y_mask):
        """Mean masked cross-entropy of predicting ``y`` from ``x``.

        Parameters
        ----------
        x : integer matrix
            Input symbol indices; assumed (time, batch) -- the layout the
            notebook's data stream produces after its axis swap.
        x_mask : float matrix
            Mask for ``x``, same layout, passed to the recurrent transition.
        y : integer matrix
            Target symbol indices, same layout as ``x``; each value must be
            a valid index into the output layer (< ``output_dim``).
        y_mask : float matrix
            Mask for ``y``; masked-out positions do not contribute.

        Returns
        -------
        Scalar Theano variable: the summed cross-entropy over unmasked
        target positions divided by the number of unmasked positions.
        """
        representation = self.lookup.apply(x)

        states = self.transition.apply(
            **merge(self.fork.apply(representation, as_dict=True),
                    {'mask': x_mask}))

        # Per-position class scores, one vector of size output_dim for every
        # (time, batch) position.
        energies = self.output_layer.apply(states)

        # T.nnet.softmax works row-wise on a matrix, so collapse the two
        # leading axes; y and y_mask are flattened in the same (row-major)
        # order below, keeping positions aligned.
        flat_energies = energies.reshape(
            (energies.shape[0] * energies.shape[1], energies.shape[2]))
        probabilities = T.nnet.softmax(flat_energies)
        token_costs = T.nnet.categorical_crossentropy(
            probabilities, y.flatten())

        # Zero out padded positions and average over the real ones only.
        flat_mask = y_mask.flatten()
        cost = (token_costs * flat_mask).sum() / flat_mask.sum()
        cost.name = 'cost'
        return cost
        

In [44]:
# Small dimensions: this instance is only used to smoke-test the model.
VOCAB_SIZE = len(word2idx)
STATE_DIM = 10
EMBEDDING_DIM = 4
# A language model's output layer has to produce one score per vocabulary
# symbol, so its size is tied to the vocabulary and not chosen freely.
OUTPUT_DIM = VOCAB_SIZE

test_rnnlm = RNNLM(VOCAB_SIZE, EMBEDDING_DIM, STATE_DIM, OUTPUT_DIM)

In [46]:
# initialize for testing
# Weights: IsotropicGaussian(0.1); biases: Constant(0). The schemes are set
# on the top-level brick, pushed to its children, and then applied.
test_rnnlm.weights_init = IsotropicGaussian(0.1)
test_rnnlm.biases_init = Constant(0)
test_rnnlm.push_initialization_config()
test_rnnlm.initialize()

In [48]:
# test_rnnlm.lookup.W.get_value()

In [50]:
# Make symbolic variables
# x / y are int64 matrices of symbol indices, the masks are floatX matrices.
# Layout is presumably (time, batch), matching the axis swap applied to the
# data stream -- confirm against an actual batch.
x = T.lmatrix("x")
x_mask = T.matrix("x_mask")
y = T.lmatrix("y")
y_mask = T.matrix("y_mask")

# Build the symbolic output of the model for these inputs.

test_val = test_rnnlm.cost(x, x_mask, y, y_mask)

# theano.function rejects any listed input that the output does not depend
# on (UnusedInputError), so all four variables must be used by cost().
test_cost_func = theano.function([x, x_mask, y, y_mask], test_val)

UnusedInputError: theano.function was asked to create a function computing outputs given certain inputs, but the provided input variable at index 1 is not part of the computational graph needed to compute the outputs: x_mask.
To make this error into a warning, you can pass the parameter on_unused_input='warn' to theano.function. To disable it completely, use on_unused_input='ignore'.