In [345]:
import mxnet as mx
import numpy as np
from mxnet import nd, gluon, autograd
from mxnet.gluon import nn, Block

def one_hots(numerical_list, vocab_size):
    result = nd.zeros((len(numerical_list), vocab_size), ctx=ctx)
    for i, idx in enumerate(numerical_list):
        result[i, idx] = 1.0
    return result

def textify(embedding):
    result = ""
    indices = nd.argmax(embedding, axis=0).asnumpy()
    for idx in indices:
        result += character_list[int(idx)]
    return result

def load_time_machine(seq_length=64, batch_size=1):
    # loading dataset
    path = "../../data/timemachine.txt"
    with open(path) as f:
        time_machine = f.read()
    time_machine = time_machine[:-38083] #hardcoded to remove crap
    character_dict, vocab_size = get_char_dict(time_machine)
    
    time_numerical = [character_dict[char] for char in time_machine]
    # -1 here so we have enough characters for labels later
    num_samples = (len(time_numerical) - 1) // seq_length
    dataset = one_hots(time_numerical[:seq_length*num_samples],vocab_size).reshape((num_samples, seq_length, vocab_size))
    num_batches = len(dataset) // batch_size
    train_data = dataset[:num_batches*batch_size].reshape((batch_size, num_batches, seq_length, vocab_size))
    
    # swap batch_size and seq_length axis to make later access easier
    train_data = nd.swapaxes(train_data, 0, 1)
    train_data = nd.swapaxes(train_data, 1, 2)
    print('Shape of data set: ', train_data.shape)
    
    labels = one_hots(time_numerical[1:seq_length*num_samples+1], vocab_size)
    train_label = labels.reshape((batch_size, num_batches, seq_length, vocab_size))
    train_label = nd.swapaxes(train_label, 0, 1)
    train_label = nd.swapaxes(train_label, 1, 2)
    print('Shape of label set: ', train_label.shape)
    
    return train_data, train_label

def get_char_dict(data):
    # get character dictionary
    character_list = list(set(data))
    vocab_size = len(character_list)
    # get the character dictionary
    character_dict = {}
    for e, char in enumerate(character_list):
        character_dict[char] = e
    return character_dict, vocab_size

def get_char_dict_builder(data, character_dict):
    # get character dictionary
    print "building dictionary"
    for line in data:
        character_list = list(set(line))
        # get the character dictionary
        for i in range(len(character_list)):
            if(character_list[i] not in character_dict):
                character_dict[character_list[i]] = len(character_dict)
    vocab_size = len(character_dict)
    return character_dict, vocab_size

def SGD(params, lr):    
    for param in params:
        param[:] = param - lr * param.grad
        

def cross_entropy(out, targ):
    return - nd.sum(targ * nd.log(out), axis=0, exclude=True)


def average_ce_loss(outputs, labels):
    assert(len(outputs) == len(labels))
    total_loss = 0.
    for (output, label) in zip(outputs,labels):
        total_loss = total_loss + cross_entropy(output, label)
    return total_loss / len(outputs)

        
def list_to_nd_array(list_of_nd_arrays):
    return nd.concat(*list_of_nd_arrays)

def list_to_nd_array_with_reshaping(list_of_nd_arrays):
    for i in range(len(list_of_nd_arrays)):
        list_of_nd_arrays[i]=list_of_nd_arrays[i].reshape((list_of_nd_arrays[i].shape[0],1))
    return nd.concat(*list_of_nd_arrays)


def translation_numerical(data,character_dict):
    print "turning characters into numerical representation"
    return_list=[]
    for line in data:
        return_list.append([character_dict[char] for char in line])
    return return_list

def numerical_to_nd(one_data,translation_dict):
    one_hot = one_hots(one_data, len(translation_dict))
    temp = one_hot.reshape((1,1,one_hot.shape[0],one_hot.shape[1]))
    temp = nd.swapaxes(temp,0,1)
    temp = nd.swapaxes(temp,1,2)
    return temp

def clean_data(train_data, test_data, threshold_min, threshold_max):
    print "cleaning data"
    train_data_list = []
    test_data_list = []
    for train_line, test_line in zip(train_data,test_data):
            train_line = train_line.lower()
            test_line = test_line.lower()  
            return_train_line = ""
            return_test_line = ""
            
            for i in range(len(train_line)):
                c = train_line[i]
                if((ord(c)==32)or(ord(c)>=97 and ord(c)<=122)):
                    return_train_line = return_train_line + c
                    
            for i in range(len(test_line)):
                c = test_line[i]
                if((ord(c)==32)or(ord(c)>=97 and ord(c)<=122)):
                    return_test_line = return_test_line + c
            
            if(len(return_train_line)>=threshold_min and len(return_train_line)<=threshold_max):
                train_data_list.append(return_train_line)
                test_data_list.append(return_test_line)
    return train_data_list,test_data_list

def pad_zeros(data_numerical):
    print "padding zeros"
    #first, find the maximum length of data.
    max_len = 0
    for line in data_numerical:
        if(len(line)>max_len):
            max_len = len(line)
            
    #iterate through each line and pad with zeros until length equals max_len
    for i in range(len(data_numerical)):
        data_numerical[i] = data_numerical[i] + [0]*(max_len - len(data_numerical[i]))
    
    return data_numerical           


def rnn_helper(num_hidden, num_inputs, num_outputs): 
    Wxh = nd.random_normal(shape=(num_inputs,num_hidden), ctx=ctx) * .01
    Whh = nd.random_normal(shape=(num_hidden,num_hidden), ctx=ctx) * .01
    bh = nd.random_normal(shape=num_hidden, ctx=ctx) * .01
    Why = nd.random_normal(shape=(num_hidden,num_outputs), ctx=ctx) * .01
    by = nd.random_normal(shape=num_outputs, ctx=ctx) * .01
    params = [Wxh, Whh, bh, Why, by]

    for param in params:
        param.attach_grad()
    return params
 
def softmax(y_linear):
    exp = nd.exp(y_linear-nd.max(y_linear))
    partition = nd.nansum(exp, axis=0, exclude=True)
    return exp / partition

def encoder(steps, input_data, num_hidden, vocab_size, state, params):
    Wxh, Whh, bh, Why, by = params
    outputs = []
    h = state
    for i in range(input_data.shape[0]):
        input_temp = nd.dot(input_data[i], Wxh)
        hidden_temp = nd.dot(h, Whh)
        h_linear = input_temp + hidden_temp + bh
        h = nd.tanh(h_linear)
        yhat_linear = nd.dot(h, Why) + by
        outputs.append(nd.expand_dims(yhat_linear[0],axis=1))
    return (outputs, h)


def attention_helper(num_attention, num_hidden_encoder, num_hidden_decoder):
    W = nd.random_normal(shape=(num_attention,num_hidden_decoder), ctx=ctx) * .01
    V = nd.random_normal(shape=(num_attention,num_hidden_encoder), ctx=ctx) * .01
    w =  nd.random_normal(shape=(1,num_attention), ctx=ctx) * .01
    b = nd.random_normal(shape=(num_attention,1), ctx=ctx) * .01
    params = [W,V,w,b]
    for param in params:
        param.attach_grad()
    return params

def attention(decoder_hidden, encoder_output, att_params):
    W, V, w, b = att_params
    decoder_temp = nd.dot(W,decoder_hidden)
    encoder_temp = nd.dot(V,encoder_output)
    net_temp = nd.reshape(decoder_temp,(decoder_temp.shape[0],1))+encoder_temp+b
    return nd.dot(w,nd.tanh(net_temp))
    #return nd.dot(softmax(nd.dot(decoder_hidden_t, encoder_output)) , encoder_output.T)


def decoder(steps, encoder_outputs, state, num_hidden, vocab_size, params, att_params):
    Wxh, Whh, bh, Why, by = params
    outputs = []
    h = state
    # only look at steps long. (consider this 'dynamic')
    for i in range(steps):
        #h=nd.reshape(h,(1,h.size))
        attention_temp = attention(h, encoder_outputs, att_params)
        input_recursive_temp = nd.dot(nd.sum(attention_temp*encoder_outputs,axis=1), Wxh)
        hidden_recursive_temp = nd.dot(h, Whh)
        h = nd.tanh(input_recursive_temp + hidden_recursive_temp + bh)
        net_temp = nd.dot(h, Why) + by
        yhat = nd.softmax(net_temp)
        outputs.append(yhat)
    return (outputs, h)   




In [344]:
# open the datasets
with open("../../data/train.en","rb") as f:
    raw_train_data = f.read().splitlines()
with open("../../data/train.fr","rb") as f:
    raw_train_labels = f.read().splitlines()

#clean data
train_data, train_labels = clean_data(raw_train_data, raw_train_labels, 100,150)

# create dictionary and a character list 
translation_dict = {}
_, num_items = get_char_dict_builder(train_data,translation_dict)
_, num_items = get_char_dict_builder(train_labels, translation_dict)
character_list = list(translation_dict.keys())

# from characters to numerical representations
english_numerical=translation_numerical(train_data,translation_dict)
french_numerical=translation_numerical(train_labels,translation_dict)

# pad zeros
#data = pad_zeros(english_numerical)
#labels = pad_zeros(french_numerical)
data = english_numerical
labels = french_numerical

cleaning data
building dictionary
building dictionary
turning characters into numerical representation
turning characters into numerical representation


In [359]:
ctx = mx.cpu()
num_hidden = 256
learning_rate = 0.001
vocab_size = len(translation_dict)
encoder_input_size = vocab_size

encoder_params = rnn_helper(num_hidden, num_inputs=27, num_outputs=256)
encoder_params_1 = rnn_helper(num_hidden, num_inputs=256, num_outputs=256)
decoder_params = rnn_helper(num_hidden, num_inputs=256, num_outputs=27) #num_inputs -> vocab_size
att_params = attention_helper(num_hidden, num_hidden_encoder=256, num_hidden_decoder=256)

params = decoder_params + encoder_params + att_params +encoder_params_1

In [362]:
for i in range(len(params)):
    print params[i].grad


[[ -2.30064361e-05  -4.51234246e-06   3.19543187e-06 ...,  -9.44812655e-06
    8.25409643e-06   1.66710379e-05]
 [  1.03502511e-03   2.01071729e-04  -1.42313176e-04 ...,   4.24035097e-04
   -3.70312104e-04  -7.49033410e-04]
 [ -2.21199560e-04  -4.29479624e-05   3.03964534e-05 ...,  -9.06097121e-05
    7.91282582e-05   1.60067153e-04]
 ..., 
 [ -6.39082049e-04  -1.24176979e-04   8.78900901e-05 ...,  -2.61835754e-04
    2.28664212e-04   4.62507480e-04]
 [ -2.66740739e-04  -5.18753586e-05   3.67182329e-05 ...,  -1.09309716e-04
    9.54646748e-05   1.93065032e-04]
 [ -8.91173200e-04  -1.73131615e-04   1.22538142e-04 ...,  -3.65104119e-04
    3.18847800e-04   6.44933025e-04]]
<NDArray 256x256 @cpu(0)>

[[ -1.91525993e-04   1.04719320e-05  -6.03472381e-06 ...,  -4.59349249e-05
    3.32382406e-05   1.07237618e-04]
 [ -2.27400946e-04  -1.47922265e-05   8.55781309e-06 ...,  -8.12409053e-05
    7.03833211e-05   1.52819659e-04]
 [  2.27156706e-04   1.58749826e-05  -9.07313006e-06 ...,   8.489470

In [363]:
import matplotlib as mpl
mpl.rcParams['figure.dpi']= 120
import matplotlib.pyplot as plt

loss_tracker = []
counter = 0
vs, sqrs = setup_adam(params)
for epoch in range(100):
    cum_loss = 0
    for i in range(len(data)):
        
        with autograd.record():
            en = numerical_to_nd(data[i],translation_dict)
            fr = numerical_to_nd(labels[i],translation_dict)
            en = en.reshape((en.shape[1],en.shape[2],en.shape[3]))
            fr = fr.reshape((fr.shape[1],fr.shape[2],fr.shape[3]))
            
            output_encoder_layer_0,hidden_encoder_layer_0=encoder(
                en.shape[0], en, num_hidden, int(en.shape[2]), 
                nd.zeros(num_hidden),encoder_params)
            
            temp = nd.concat(*output_encoder_layer_0)
            o_e = nd.reshape(temp,(temp.shape[1],1,temp.shape[0]))
            
            output_encoder_layer_1,hidden_encoder_layer_1=encoder(
                o_e.shape[0], o_e, num_hidden, int(o_e.shape[2]), 
                hidden_encoder_layer_0,encoder_params_1)
       
            out_enc = list_to_nd_array(output_encoder_layer_1)
        
            output_decoder, hidden_state = decoder(
                fr.shape[0],out_enc,nd.reshape(hidden_encoder_layer_1,(num_hidden)),
                num_hidden,int(fr.shape[2]),decoder_params, att_params)
            
            loss = average_ce_loss(output_decoder, nd.reshape(fr,(fr.shape[0],fr.shape[2]))) 

        loss.backward()
        counter += 1
        adam(params, vs, sqrs, learning_rate, 1, counter)
        
        #graphing
        sum_loss = nd.sum(loss)
        cum_loss += sum_loss.asscalar()
        x_axis = range(len(loss_tracker))
        if(i%100==0):
            
            print("pred text: ",textify(list_to_nd_array_with_reshaping(output_decoder)))
            print("cumulative loss: ", cum_loss/100)
            cum_loss = 0
            loss_tracker.append(cum_loss)
    
    plt.semilogy(x_axis, loss_tracker)
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.show()
    

('pred text: ', '                                                                                                                                                                               ')
('cumulative loss: ', 0.02859285831451416)


KeyboardInterrupt: 

In [7]:
def setup_adam(params):
    sqrs = []
    vs = []
    for param in params:
        vs.append(param.zeros_like())
        sqrs.append(param.zeros_like())
    return vs, sqrs

In [216]:
class attention_block(Block):
    def __init__(self, decoder_state_size, encoder_state_size, attention_size, **kwargs):
        super(attention_block, self).__init__(**kwargs)
        with self.name_scope():
            self.decoder_state_size = decoder_state_size
            self.encoder_state_size = encoder_state_size
            self.attention_size = attention_size
            
            self.W = self.params.get('W', init=mx.init.Xavier(magnitude=2.24), 
                                     shape=(self.attention_size,self.decoder_state_size))
            self.V = self.params.get('V', init=mx.init.Xavier(magnitude=2.24), 
                                     shape=(self.attention_size,self.encoder_state_size))
            self.w = self.params.get('w', init=mx.init.Xavier(magnitude=2.24), 
                                     shape=(1,self.attention_size))
            self.b = self.params.get('b', shape=(self.attention_size,1))
    
    def forward(self, decoder_hidden, encoder_output):
        with encoder_output.context:
            decoder_temp = nd.dot(self.W,decoder_hidden)
            encoder_temp = nd.dot(self.V,encoder_output)
            net_temp = nd.reshape(decoder_temp,(decoder_temp.shape[0],1))+encoder_temp+self.b
            return nd.dot(self.w,nd.tanh(net_temp))

In [285]:
params[17].grad


[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]
<NDArray 256x256 @cpu(0)>

In [278]:
len(params)

19


[[  1.30422052e-03  -6.41440973e-03   5.10825589e-03 ...,  -1.34379510e-02
    4.31262924e-05   9.32443980e-03]
 [  1.39874090e-02   1.74041204e-02   1.10777542e-02 ...,   6.40203105e-03
    2.06707995e-02   7.28362380e-03]
 [  1.37236156e-03   5.07817464e-03   2.42010169e-02 ...,  -7.58874975e-03
   -8.40481278e-03   8.14634562e-03]
 ..., 
 [ -1.86159983e-02  -1.86261116e-03   3.53552550e-02 ...,  -1.22878852e-03
    3.85040039e-04  -1.27779124e-02]
 [  1.55838989e-02  -1.56477792e-03  -1.03876069e-02 ...,  -6.73914840e-03
    1.66862477e-02  -2.55577289e-03]
 [  1.31259812e-03   9.93658323e-03  -5.96137485e-03 ...,  -1.68684535e-02
    1.60950851e-02  -2.09686020e-03]]
<NDArray 256x256 @cpu(0)>

In [207]:
from mxnet.gluon import nn, rnn

In [213]:
e = rnn.LSTM(256, 3, input_size=27)
d = rnn.LSTM(256, 3, input_size = 256)

dense = nn.Dense(27, in_units = 256)


TypeError: __init__() takes exactly 5 arguments (1 given)