In [8]:
# create a character level Fuel dataset from text file

%matplotlib inline
%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
import pylab
pylab.rcParams['figure.figsize'] = (10.0, 8.0)

In [9]:
import os
import codecs
import subprocess
from subprocess import Popen, PIPE, STDOUT

import logging
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logging.debug("test")

import numpy as np
from abc import ABCMeta, abstractmethod
from toolz import merge
import pandas as pd
from six import add_metaclass
from picklable_itertools.extras import equizip
from blocks.bricks.recurrent import (GatedRecurrent, Bidirectional)
from blocks.initialization import IsotropicGaussian, Constant, Orthogonal
import theano
import theano.tensor as T

from blocks.bricks import Initializable
from blocks.bricks.base import application, Brick, lazy
from blocks.bricks import Tanh, Linear, MLP
from blocks.bricks.lookup import LookupTable
from blocks.bricks.parallel import Fork
from blocks.utils import (shared_floatx_nans, dict_union)
from blocks.roles import add_role, WEIGHT


from fuel.datasets import TextFile
from fuel.schemes import ConstantScheme
from fuel.streams import DataStream
from fuel.transformers import (
    Merge, Batch, Filter, Padding, SortMapping, Unpack, Mapping)

DEBUG:root:test


In [19]:
# path to your training data
TRAINING_DATASET = '/home/chris/projects/machine_learning/dcu-character-lms/data/paul_graham_essays.txt'

UNKNOWN_TOKEN = '|'
EOS_TOKEN = '`'

In [24]:
all_symbols = set()

y_data_temp_file = 'temp_y.txt'

# make our dictionaries, and write a temporary file for the 
with codecs.open(TRAINING_DATASET, encoding='utf8') as inp:
    with codecs.open(y_data_temp_file, 'wb', encoding='utf8') as y_out:
        for l in inp.read().strip().split('\n'):
            all_symbols.update(l)
            y_seq = l[1:] + EOS_TOKEN + '\n'
            assert len(l) == len(y_seq)-1

            y_out.write(''.join(y_seq))
        
        
# add an unknown token in case we observe a new character at prediction time
all_symbols.update([UNKNOWN_TOKEN, EOS_TOKEN])
        
len(all_symbols)

52

In [25]:
word2idx = {v:k for k,v in enumerate(all_symbols)}
idx2word = {v:k for k,v in word2idx.items()}


In [30]:
train_X = TextFile([TRAINING_DATASET], word2idx, bos_token=None, eos_token=None,
                   unk_token=UNKNOWN_TOKEN, level='character')

train_Y = TextFile([y_data_temp_file], word2idx, bos_token=None, eos_token=None,
                   unk_token=UNKNOWN_TOKEN, level='character')

# Merge them to get x1, x2 pairs
stream = Merge([train_X.get_example_stream(),
                train_Y.get_example_stream()],
                ('x', 'y'))

In [32]:
idx2word[0]

u' '

In [31]:
# take a look at the stream
sample = list(stream.get_epoch_iterator())[1]
sample

([38,
  43,
  0,
  45,
  38,
  0,
  41,
  44,
  45,
  0,
  32,
  45,
  0,
  36,
  38,
  43,
  28,
  0,
  27,
  43,
  44,
  45,
  24,
  37,
  37,
  48,
  8,
  0,
  16,
  0,
  36,
  38,
  39,
  45,
  33,
  42,
  0,
  27,
  28,
  31,
  38,
  43,
  28,
  0,
  45,
  33,
  28,
  48,
  4,
  43,
  28,
  0,
  38,
  44,
  45,
  0,
  38,
  31,
  0,
  27,
  44,
  42,
  32,
  39,
  28,
  42,
  42],
 [43,
  0,
  45,
  38,
  0,
  41,
  44,
  45,
  0,
  32,
  45,
  0,
  36,
  38,
  43,
  28,
  0,
  27,
  43,
  44,
  45,
  24,
  37,
  37,
  48,
  8,
  0,
  16,
  0,
  36,
  38,
  39,
  45,
  33,
  42,
  0,
  27,
  28,
  31,
  38,
  43,
  28,
  0,
  45,
  33,
  28,
  48,
  4,
  43,
  28,
  0,
  38,
  44,
  45,
  0,
  38,
  31,
  0,
  27,
  44,
  42,
  32,
  39,
  28,
  42,
  42,
  25])

In [None]:
# Create the Model

class RNNLM(Initializable):
    """Model for character level LM"""

    def __init__(self, vocab_size, embedding_dim, state_dim, output_dim, **kwargs):
        super(BidirectionalSemanticEncoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.output_dim = output_dim

        self.lookup = LookupTable(name='embeddings')
        self.transition = GatedRecurrent(activation=Tanh(), dim=state_dim)
        
        self.fork = Fork(
            [name for name in self.transition.apply.sequences
             if name != 'mask'], prototype=Linear(), name='fork')

        # output layer -- this may need to be changed
        self.output_layer = Linear(name='output_layer')
        
        self.children = [self.lookup, self.transition,
                         self.fork, self.output_layer]


    def _push_allocation_config(self):
        self.lookup.length = self.vocab_size
        self.lookup.dim = self.embedding_dim

        self.fork.input_dim = self.embedding_dim
        self.fork.output_dims = [self.transition.get_dim(name)
                                     for name in self.fork.output_names]
        
        self.output_layer.input_dim = self.state_dim
        self.output_layer.output_dim = self.output_dim

    def cost(self, x, x_mask, y, y_mask):
        
        pass
#         return cost
        