# Vanilla RNN

reference
* https://mxnet.incubator.apache.org/_modules/mxnet/gluon/rnn/rnn_layer.html
* https://mxnet.incubator.apache.org/api/python/gluon/rnn.html#mxnet.gluon.rnn.RNN

In [45]:
import numpy as np
import mxnet as mx
from mxnet import autograd, nd
from mxnet.gluon import nn, rnn
np.set_printoptions(precision=3)

In [8]:
batch_size = 16
n_hidden_state = 10
embedding_input = 5

In [3]:
# hidden_size, num_layers, activation, layout, dropout, bidirectional, 
# i2h_weight_initializer, h2h_weight_initializer, i2h_bias_initializer, h2h_bias_initializer, input_size
model = rnn.RNN(n_hidden_state, 1, layout = 'NTC', input_size = embedding_input, prefix='mdl_')

In [4]:
model.collect_params().initialize(mx.init.Xavier(), ctx = mx.cpu())

In [5]:
initial_state = model.begin_state(batch_size = 16)

<img src="RNN-rolled.png" alt="drawing" width="200"/>

image by http://colah.github.io/posts/2015-08-Understanding-LSTMs

### Hiddens state size

In [7]:
print(model.params['mdl_l0_i2h_weight'].data().shape)
print(model.params['mdl_l0_i2h_bias'].data().shape)
print(model.params['mdl_l0_h2h_weight'].data().shape)
print(model.params['mdl_l0_h2h_bias'].data().shape)

(10, 5)
(10,)
(10, 10)
(10,)


## step 1

In [11]:
time_step = 1
data = nd.random.normal(shape =(batch_size, time_step, embedding_input))

In [47]:
print(len(data))
print(data[0])
print(data[0][0])

16

[[ 0.823 -1.879  0.886  1.912  0.333]]
<NDArray 1x5 @cpu(0)>

[ 0.823 -1.879  0.886  1.912  0.333]
<NDArray 5 @cpu(0)>


In [17]:
model

RNN(5 -> 10, NTC)

### Inputs

* __data__: input tensor with shape (sequence_length, batch_size, input_size) when layout is “TNC”. For other layouts, dimensions are permuted accordingly using transpose() operator which adds performance overhead. Consider creating batches in TNC layout during data batching step.

* __states__: initial recurrent state tensor with shape (num_layers, batch_size, num_hidden). If bidirectional is True, shape will instead be (2*num_layers, batch_size, num_hidden). If states is None, zeros will be used as default begin states.

In [18]:
out, state = model(data, initial_state)

### outputs

* __out__: output tensor with shape (sequence_length, batch_size, num_hidden) when layout is “TNC”. If bidirectional is True, output shape will instead be (sequence_length, batch_size, 2*num_hidden)

* __out_states__: output recurrent state tensor with the same shape as states. If states is None out_states will not be returned.

In [58]:
print(len(out)) #N: batch_size
print(len(out[0])) #T: sequence_length
print(len(out[0][0])) #C: num_hidden

16
2
10


In [57]:
print(len(state))
print(len(state[0])) #num_layers
print(len(state[0][0])) #batch_size
print(len(state[0][0][0])) #num_hidden

1
1
16
10


In [31]:
out[0][0] == state[0][0][0]


[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
<NDArray 10 @cpu(0)>

In [32]:
h2h_weight = model.params['mdl_l0_h2h_weight'].data()
h2h_bias = model.params['mdl_l0_h2h_bias'].data()
i2h_weight = model.params['mdl_l0_i2h_weight'].data()
i2h_bias = model.params['mdl_l0_i2h_bias'].data()

In [33]:
res = nd.relu(nd.dot(dat[0][0], i2h_weight, transpose_b = True) + i2h_bias + nd.dot(h2h_weight, initial_state[0][0][0]) + h2h_bias)

In [46]:
print(res.asnumpy())
print(out[0][0].asnumpy())
print(state[0][0][0].asnumpy())

[0.652 0.103 0.    1.276 0.682 0.77  0.788 0.994 0.    0.764]
[0.947 0.522 0.032 0.    0.    0.    0.    0.463 1.607 0.   ]
[0.947 0.522 0.032 0.    0.    0.    0.    0.463 1.607 0.   ]


In [35]:
print(len(out))
print(len(out[0]))
print(len(out[0][0]))

16
1
10


In [42]:
print(len(state))
print(len(state[0]))
print(len(state[0][0]))
print(len(state[0][0][0]))

1
1
16
10


## step 2

In [49]:
time_step = 2
data = nd.random.normal(shape =(batch_size, time_step, embedding_input))
out, state = model(data, initial_state)

In [50]:
print(out[0][1].asnumpy())
print(state[0][0][0].asnumpy())

[0.    0.    0.502 0.    0.    0.    0.638 0.    0.    0.   ]
[0.    0.    0.502 0.    0.    0.    0.638 0.    0.    0.   ]


In [51]:
h2h_weight = model.params['mdl_l0_h2h_weight'].data()
h2h_bias = model.params['mdl_l0_h2h_bias'].data()
i2h_weight = model.params['mdl_l0_i2h_weight'].data()
i2h_bias = model.params['mdl_l0_i2h_bias'].data()

In [59]:
out_t1 = nd.relu(nd.dot(data[0][0], i2h_weight, transpose_b = True) + i2h_bias \
      + nd.dot(h2h_weight, initial_state[0][0][0]) + h2h_bias)

In [60]:
print(out_t1.asnumpy())
print(out[0][0].asnumpy())

[0.947 0.522 0.032 0.    0.    0.    0.    0.463 1.607 0.   ]
[0.    0.    0.    1.085 0.    0.841 0.864 0.    0.    0.402]


In [54]:
out_t2 = nd.relu(nd.dot(dat[0][1], i2h_weight, transpose_b = True) + i2h_bias \
      + nd.dot(h2h_weight, out_t1) + h2h_bias)

In [55]:
print(out_t2.asnumpy()) # calculation
print(state[0][0][0].asnumpy()) # State at last time step
print(out[0][1].asnumpy()) # Last time step

[0.    0.    0.502 0.    0.    0.    0.638 0.    0.    0.   ]
[0.    0.    0.502 0.    0.    0.    0.638 0.    0.    0.   ]
[0.    0.    0.502 0.    0.    0.    0.638 0.    0.    0.   ]
