In [229]:
import mxnet as mx
import numpy as np
from mxnet import nd, gluon, autograd
from mxnet.gluon import nn, Block

def one_hots(numerical_list, vocab_size=None):
    """Encode a sequence of integer indices as rows of one-hot vectors.

    Parameters
    ----------
    numerical_list : sequence of int
        Class indices; each one selects the column set to 1.0 in its row.
    vocab_size : int, optional
        Width of every one-hot row. When omitted, the module-level
        ``vocab_size`` is looked up at call time.

    Returns
    -------
    NDArray
        Array of shape ``(len(numerical_list), vocab_size)`` allocated on the
        module-level ``ctx``.

    Raises
    ------
    NameError
        If ``vocab_size`` is omitted and no module-level ``vocab_size`` exists.
    """
    if vocab_size is None:
        # Resolve the fallback lazily. Binding the global in the signature
        # would evaluate it when the function is defined, which fails on a
        # fresh kernel and freezes a stale value if the global changes later.
        try:
            vocab_size = globals()['vocab_size']
        except KeyError:
            raise NameError("one_hots: pass vocab_size or define a module-level vocab_size")
    result = nd.zeros((len(numerical_list), vocab_size), ctx=ctx)
    for i, idx in enumerate(numerical_list):
        result[i, idx] = 1.0
    return result

def textify(embedding):
    """Turn rows of scores (or one-hot vectors) back into a string.

    For every row of ``embedding`` the index of the largest entry along axis 1
    is looked up in the module-level ``character_list``; the selected
    characters are concatenated in row order.
    """
    best_per_row = nd.argmax(embedding, axis=1).asnumpy()
    # argmax yields floats after asnumpy(), hence the int() before indexing.
    return "".join(character_list[int(position)] for position in best_per_row)

def load_time_machine(seq_length=64, batch_size=1, path="../../data/timemachine.txt"):
    """Load the text corpus and build one-hot inputs and next-character labels.

    Parameters
    ----------
    seq_length : int
        Number of characters per training sequence.
    batch_size : int
        Number of sequences per batch.
    path : str
        Location of the text file to read.

    Returns
    -------
    (train_data, train_label) : tuple of NDArray
        Both have shape ``(num_batches, seq_length, batch_size, vocab_size)``.
        ``train_label`` is ``train_data`` shifted forward by one character.

    Raises
    ------
    ValueError
        If the text is too short to fill a single batch.
    """
    # loading dataset
    with open(path) as f:
        time_machine = f.read()
    # Drop the trailing 38083 characters, which the original author flagged
    # as non-content. The count is specific to this particular file.
    time_machine = time_machine[:-38083]
    character_dict, vocab_size = get_char_dict(time_machine)
    # Encode the text as integer indices. Previously this name was never
    # defined here and only worked if a stale global happened to exist.
    time_numerical = [character_dict[char] for char in time_machine]

    # -1 here so we have enough characters for labels later
    num_samples = (len(time_numerical) - 1) // seq_length
    num_batches = num_samples // batch_size
    if num_batches == 0:
        raise ValueError("text too short for seq_length=%d, batch_size=%d"
                         % (seq_length, batch_size))
    # Characters that fit into whole batches; the remainder is discarded for
    # data AND labels so both reshape to the same layout.
    num_used = num_batches * batch_size * seq_length

    dataset = one_hots(time_numerical[:num_used], vocab_size)
    train_data = dataset.reshape((batch_size, num_batches, seq_length, vocab_size))

    # swap batch_size and seq_length axis to make later access easier
    train_data = nd.swapaxes(train_data, 0, 1)
    train_data = nd.swapaxes(train_data, 1, 2)
    print('Shape of data set: ', train_data.shape)

    # Labels are the same window shifted by one character.
    labels = one_hots(time_numerical[1:num_used + 1], vocab_size)
    train_label = labels.reshape((batch_size, num_batches, seq_length, vocab_size))
    train_label = nd.swapaxes(train_label, 0, 1)
    train_label = nd.swapaxes(train_label, 1, 2)
    print('Shape of label set: ', train_label.shape)

    return train_data, train_label
    

def get_char_dict(data):
    """Build a character -> index vocabulary for ``data``.

    Parameters
    ----------
    data : str
        Text whose distinct characters form the vocabulary.

    Returns
    -------
    (character_dict, vocab_size) : (dict, int)
        ``character_dict`` maps every distinct character to a unique index in
        ``[0, vocab_size)``; ``vocab_size`` is the number of distinct
        characters. Empty input yields ``({}, 0)``.
    """
    # Sort the distinct characters so the mapping is reproducible; iterating
    # a bare set gives an arbitrary order that can differ between runs.
    character_list = sorted(set(data))
    character_dict = {char: index for index, char in enumerate(character_list)}
    return character_dict, len(character_list)

def rnn_helper(num_hidden, vocab_size):
    """Create freshly initialised parameters for a single-layer vanilla RNN.

    Input and output widths both equal ``vocab_size``. Every parameter is
    drawn from ``nd.random_normal`` on the module-level ``ctx``, scaled by
    0.01, and has a gradient buffer attached.

    Returns
    -------
    list of NDArray
        ``[Wxh, Whh, bh, Why, by]`` in that order.
    """
    # Shapes listed in the order the parameters are returned (and drawn).
    shapes = [
        (vocab_size, num_hidden),   # Wxh: input  -> hidden
        (num_hidden, num_hidden),   # Whh: hidden -> hidden
        num_hidden,                 # bh:  hidden bias
        (num_hidden, vocab_size),   # Why: hidden -> output
        vocab_size,                 # by:  output bias
    ]
    weights = [nd.random_normal(shape=dims, ctx=ctx) * .01 for dims in shapes]
    for weight in weights:
        weight.attach_grad()
    return weights

def decoder(steps, encoder_outputs, state, num_hidden, vocab_size):
    """Unroll an attention-driven RNN decoder for ``steps`` iterations.

    New parameters are created through ``rnn_helper`` on every call. At each
    step the input is the attention context computed from the current hidden
    state and ``encoder_outputs``.

    Returns
    -------
    (outputs, h) : (list of NDArray, NDArray)
        The softmax output of every step, and the final hidden state.
    """
    w_in, w_rec, b_hidden, w_out, b_out = rnn_helper(num_hidden, vocab_size)
    hidden = state
    predictions = []
    for _ in range(steps):
        context = attention(hidden, encoder_outputs)
        pre_activation = nd.dot(context, w_in) + nd.dot(hidden, w_rec) + b_hidden
        hidden = nd.tanh(pre_activation)
        predictions.append(softmax(nd.dot(hidden, w_out) + b_out))
    return (predictions, hidden)
 
def attention(decoder_hidden_t, encoder_output):
    """Dot-product attention of one decoder state over the encoder outputs.

    The decoder state is scored against every row of ``encoder_output``, the
    scores are passed through ``softmax``, and the rows of ``encoder_output``
    are combined with those weights.
    """
    query = decoder_hidden_t
    # A leading axis is added unless the first dimension is already 1.
    if query.shape[0] != 1:
        query = nd.expand_dims(query, axis=0)
    scores = nd.dot(query, encoder_output.T)
    weights = softmax(scores)
    return nd.dot(weights, encoder_output)

def softmax(y_linear):
    """Softmax normalised over every axis except axis 0.

    Parameters
    ----------
    y_linear : NDArray
        Unnormalised scores; axis 0 indexes independent rows.

    Returns
    -------
    NDArray
        Same shape as ``y_linear``; each axis-0 slice sums to 1.
    """
    # Subtracting the global maximum keeps exp() from overflowing and does
    # not change the normalised result.
    exp = nd.exp(y_linear - nd.max(y_linear))
    partition = nd.nansum(exp, axis=0, exclude=True)
    # Give the per-row sums trailing singleton axes so the division lines up
    # row by row. Without this, a (N,) partition is matched against the LAST
    # axis of exp, which is only correct when N == 1. One-dimensional input
    # is left exactly as before (the reshape is a no-op).
    partition = partition.reshape((-1,) + (1,) * (len(exp.shape) - 1))
    return exp / partition

def encoder(steps, input_data, num_hidden, vocab_size, state):
    """Run a vanilla RNN over ``input_data`` and collect its softmax outputs.

    Parameters
    ----------
    steps : int
        Accepted for signature compatibility but not used; the loop always
        covers every entry along axis 0 of ``input_data``.
    input_data : NDArray
        Sequence to encode; ``input_data[i]`` is the input at step ``i``.
    num_hidden : int
        Hidden-state width.
    vocab_size : int
        Input and output width.
    state : NDArray
        Initial hidden state.

    Returns
    -------
    (outputs, h) : (NDArray, NDArray)
        ``outputs`` has shape ``(input_data.shape[0], vocab_size)`` and holds
        the first row of each step's softmax output; ``h`` is the final
        hidden state.
    """
    Wxh, Whh, bh, Why, by = rnn_helper(num_hidden, vocab_size)
    num_steps = input_data.shape[0]
    # Allocate on the same context as the parameters from rnn_helper so the
    # result can be fed to further nd ops without a device mismatch.
    outputs = nd.zeros((num_steps, vocab_size), ctx=ctx)
    h = state
    for i in range(num_steps):
        h_linear = nd.dot(input_data[i], Wxh) + nd.dot(h, Whh) + bh
        h = nd.tanh(h_linear)
        yhat_linear = nd.dot(h, Why) + by
        yhat = softmax(yhat_linear)
        # Only row 0 of the step output is stored.
        outputs[i] = yhat[0]
    return (outputs, h)


class encoder_block(Block):
    """Gluon block: a one-layer ReLU RNN followed by a dense projection.

    Parameters
    ----------
    vocab_size : int
        Width of the RNN input and of the dense output.
    num_hidden : int
        Hidden-state width of the RNN and input width of the dense layer.
    tie_weights : bool
        Accepted for signature compatibility; currently unused.
    """
    def __init__(self, vocab_size, num_hidden, tie_weights=False, **kwargs):
        super(encoder_block, self).__init__(**kwargs)
        # Kept on the instance so forward() does not depend on a global.
        self.num_hidden = num_hidden
        with self.name_scope():
            # `rnn` was never imported in this file; reach it through the
            # already-imported `gluon` package. The hidden size follows
            # num_hidden so it agrees with the dense layer's in_units
            # (it was hard-coded to 100 before).
            self.rnn = gluon.rnn.RNN(num_hidden, 1, activation='relu', input_size=vocab_size)
            self.decoder = nn.Dense(vocab_size, in_units=num_hidden)

    def forward(self, inputs, hidden_state):
        """Return ``(decoded, hidden_state)`` for one pass over ``inputs``.

        The RNN output is flattened to rows of width ``num_hidden`` before
        the dense projection.
        """
        output, hidden_state = self.rnn(inputs, hidden_state)
        decoded = self.decoder(output.reshape((-1, self.num_hidden)))
        return decoded, hidden_state

In [129]:
# Compute context. one_hots and rnn_helper read this module-level `ctx`, so
# this cell has to run before any of them is called.
ctx = mx.cpu()
# Build the one-hot inputs and next-character labels with the defaults
# (seq_length=64, batch_size=1).
data, labels = load_time_machine()

('Shape of data set: ', (2600L, 64L, 1L, 88L))
('Shape of label set: ', (2600L, 64L, 1L, 88L))


In [235]:
# Hidden-state width. attention() multiplies the decoder state by the
# transposed encoder outputs, whose width is the vocabulary size, so this
# value has to equal data.shape[3] (88 in the printed shapes).
num_hidden = 88
# Passed to encoder(), which does not use it and walks every row of its input.
steps = 10
# Encode the first batch, starting from an all-zero hidden state.
output_encoder,hidden_encoder=encoder(steps, data[0], num_hidden, int(data.shape[3]), nd.zeros(num_hidden))

In [236]:
# Unroll the decoder for 10 steps over the encoder outputs, from a zero state.
# NOTE(review): the literal 10 duplicates `steps` from the previous cell.
output_decoder, hidden_state = decoder(10,output_encoder,nd.zeros(num_hidden),num_hidden,int(data.shape[3]))