# change your runtime
![](../images/runtime1.png)
![](../images/runtime2.png)

In [None]:
#Please run this block if you would like to use GPU
!apt update -qq;
!wget https://developer.nvidia.com/compute/cuda/10.1/Prod/local_installers/cuda-repo-ubuntu1604-10-1-local-10.1.168-418.67_1.0-1_amd64.deb
!dpkg -i cuda-repo-ubuntu1604-10-1-local-10.1.168-418.67_1.0-1_amd64.deb
!apt-key add /var/cuda-repo-10-1-local-10.1.168-418.67/7fa2af80.pub
!dpkg --configure -a
!apt-get update
!apt-get install cuda
!pip install mxnet-cu100

In [None]:
# or run this for cpu only
!pip install mxnet

# Stacked Bi-directional LSTMs in Gluon

Starting with a plain RNN, we'll look at various modifications and build up to the complete Stacked Bi-directional LSTM model.

* Base: __RNN__
* Modification: __Stacked RNN__
* Modification: __Bi-directional RNN__
* Modification: __LSTM__
* Combined: __Stacked Bi-directional LSTM__

# Base: RNN

### Implicit initial hidden state

In [10]:
import mxnet as mx

In [11]:
sequence_length = 4
batch_size = 5
channels = 3

inputs = mx.nd.random.uniform(shape=(sequence_length, batch_size, channels))
first_input = inputs[0]
first_input


[[0.0939405  0.9211576  0.5759465 ]
 [0.08311249 0.9292962  0.27771857]
 [0.31856894 0.0093567  0.6674104 ]
 [0.8423421  0.13179787 0.6471741 ]
 [0.7163272  0.84138614 0.2894061 ]]
<NDArray 5x3 @cpu(0)>

In [16]:
hid_layers = 1
hid_units = 6

rnn = mx.gluon.rnn.RNN(hidden_size=hid_units, num_layers=hid_layers, layout='TNC')


In [17]:
# lazy initialize weights
rnn.initialize()

In [23]:
# since not provided, will initialize hidden state to zeros of approprate shape
outputs = rnn(inputs)

In [24]:
# for a plain rnn, output is the same as hidden state. get it for every time step.
outputs.shape

(4, 5, 6)

In [25]:
final_output = outputs[-1]
final_output


[[0.         0.06878319 0.02269816 0.         0.00092114 0.        ]
 [0.         0.11525708 0.02774326 0.         0.03441893 0.        ]
 [0.         0.05674308 0.00495926 0.         0.03428157 0.        ]
 [0.         0.06954266 0.02817963 0.         0.         0.        ]
 [0.         0.08359297 0.01891519 0.         0.02883152 0.        ]]
<NDArray 5x6 @cpu(0)>

### Explicit initial hidden state

In [26]:
hid_init = mx.nd.random.uniform(shape=(hid_layers, batch_size, hid_units))

In [27]:
# get tuple returned
outputs, hid_states = rnn(inputs, hid_init)

In [28]:
outputs.shape

(4, 5, 6)

In [29]:
final_output = outputs[-1]
final_output


[[0.         0.06881174 0.02271482 0.         0.00096867 0.        ]
 [0.         0.11529209 0.02777055 0.         0.03446553 0.        ]
 [0.         0.05676252 0.00497272 0.         0.03430802 0.        ]
 [0.         0.06954573 0.02818356 0.         0.         0.        ]
 [0.         0.08362921 0.01894405 0.         0.02887885 0.        ]]
<NDArray 5x6 @cpu(0)>

In [30]:
# single hidden state between blocks for plain rnn
len(hid_states)

1

In [31]:
# only get for last time step
hid_states[0].shape

(1, 5, 6)

In [32]:
# same as final_output
hid_states[0]


[[[0.         0.06881174 0.02271482 0.         0.00096867 0.        ]
  [0.         0.11529209 0.02777055 0.         0.03446553 0.        ]
  [0.         0.05676252 0.00497272 0.         0.03430802 0.        ]
  [0.         0.06954573 0.02818356 0.         0.         0.        ]
  [0.         0.08362921 0.01894405 0.         0.02887885 0.        ]]]
<NDArray 1x5x6 @cpu(0)>

# Modification: Stacked RNN

In [88]:
hid_layers = 2

In [33]:
stack_rnn = mx.gluon.rnn.RNN(hidden_size=hid_units, num_layers=hid_layers, layout='TNC')
stack_rnn.initialize()

In [34]:
hid_init = mx.nd.random.uniform(shape=(hid_layers, batch_size, hid_units))
outputs, hid_states = stack_rnn(inputs, hid_init)

In [35]:
# output unchanged by number of layers. once again, one per time step
outputs.shape

(4, 5, 6)

In [36]:
final_output = outputs[-1]
final_output


[[2.90030371e-02 2.87810881e-02 0.00000000e+00 1.85078196e-02
  0.00000000e+00 6.03503222e-03]
 [2.74311658e-02 5.46624884e-02 0.00000000e+00 4.41584922e-02
  2.15570033e-02 1.07452795e-02]
 [0.00000000e+00 4.73658554e-02 0.00000000e+00 5.04149981e-02
  7.92650972e-05 0.00000000e+00]
 [4.17802446e-02 2.40432117e-02 0.00000000e+00 8.91629327e-03
  0.00000000e+00 9.00023524e-03]
 [3.30022424e-02 2.55878046e-02 0.00000000e+00 2.68065743e-02
  2.50780936e-02 1.74692664e-02]]
<NDArray 5x6 @cpu(0)>

In [37]:
# single hidden state between blocks for plain rnn
len(hid_states)

1

In [38]:
# but now have more hidden states (last step only)
hid_states[0].shape

(1, 5, 6)

In [39]:
# see last element is same as output (first is not part of output)
hid_states[0]


[[[2.90030371e-02 2.87810881e-02 0.00000000e+00 1.85078196e-02
   0.00000000e+00 6.03503222e-03]
  [2.74311658e-02 5.46624884e-02 0.00000000e+00 4.41584922e-02
   2.15570033e-02 1.07452795e-02]
  [0.00000000e+00 4.73658554e-02 0.00000000e+00 5.04149981e-02
   7.92650972e-05 0.00000000e+00]
  [4.17802446e-02 2.40432117e-02 0.00000000e+00 8.91629327e-03
   0.00000000e+00 9.00023524e-03]
  [3.30022424e-02 2.55878046e-02 0.00000000e+00 2.68065743e-02
   2.50780936e-02 1.74692664e-02]]]
<NDArray 1x5x6 @cpu(0)>

# Modification: Bi-directional RNNs

In [40]:
hid_layers = 1
bidirectional = True

In [41]:
bidir_rnn = mx.gluon.rnn.RNN(hidden_size=hid_units, num_layers=hid_layers, layout='TNC', bidirectional=bidirectional)
bidir_rnn.initialize()

In [42]:
# now hid_layers * 2, initial hidden states for forward and backward rnns.
hid_init = mx.nd.random.uniform(shape=(hid_layers * 2, batch_size, hid_units))
outputs, hid_states = bidir_rnn(inputs, hid_init)

In [43]:
# hid_units * 2 channels
# 6 from forward rnn, 6 from backward rnn, concatenated to give 12
outputs.shape

(4, 5, 12)

In [44]:
final_output = outputs[-1]
final_output


[[0.01669596 0.05843757 0.         0.         0.         0.
  0.         0.05363762 0.02028696 0.         0.01502694 0.03977481]
 [0.04652181 0.07698372 0.         0.         0.         0.
  0.         0.05344069 0.04430495 0.         0.08617246 0.        ]
 [0.05922109 0.03268612 0.         0.         0.02816408 0.
  0.         0.00220778 0.00385158 0.         0.         0.04738104]
 [0.00424436 0.06644951 0.         0.         0.         0.
  0.         0.08911546 0.02351735 0.         0.08870225 0.02079489]
 [0.02377889 0.04763812 0.         0.         0.         0.
  0.         0.04784007 0.01109214 0.         0.         0.        ]]
<NDArray 5x12 @cpu(0)>

In [45]:
# from forward rnn
final_output[:,:6]


[[0.01669596 0.05843757 0.         0.         0.         0.        ]
 [0.04652181 0.07698372 0.         0.         0.         0.        ]
 [0.05922109 0.03268612 0.         0.         0.02816408 0.        ]
 [0.00424436 0.06644951 0.         0.         0.         0.        ]
 [0.02377889 0.04763812 0.         0.         0.         0.        ]]
<NDArray 5x6 @cpu(0)>

In [46]:
# single hidden state between blocks for plain rnn
len(hid_states)

1

In [47]:
# forward rnn hidden, then backward rnn hidden
# BUT from different time steps! orward rnn hidden from last time step, backward rnn hidden from first time step.
# useful when feeding a decoder, otherwise backward rnn only seen 1 example by step n.
hid_states[0]


[[[0.01669596 0.05843757 0.         0.         0.         0.        ]
  [0.04652181 0.07698372 0.         0.         0.         0.        ]
  [0.05922109 0.03268612 0.         0.         0.02816408 0.        ]
  [0.00424436 0.06644951 0.         0.         0.         0.        ]
  [0.02377889 0.04763812 0.         0.         0.         0.        ]]

 [[0.         0.         0.05309198 0.07664979 0.01742797 0.        ]
  [0.         0.         0.03257876 0.06573668 0.00515372 0.        ]
  [0.         0.01129739 0.06318042 0.01553865 0.04551434 0.        ]
  [0.         0.03893747 0.08874083 0.01155296 0.07271851 0.        ]
  [0.         0.         0.06543292 0.04324468 0.04356055 0.        ]]]
<NDArray 2x5x6 @cpu(0)>

In [48]:
# same as first 6 channels of output at last stage
hid_states[0][0]


[[0.01669596 0.05843757 0.         0.         0.         0.        ]
 [0.04652181 0.07698372 0.         0.         0.         0.        ]
 [0.05922109 0.03268612 0.         0.         0.02816408 0.        ]
 [0.00424436 0.06644951 0.         0.         0.         0.        ]
 [0.02377889 0.04763812 0.         0.         0.         0.        ]]
<NDArray 5x6 @cpu(0)>

In [49]:
first_output = outputs[0]
first_output


[[0.06437154 0.05151022 0.         0.         0.01762882 0.
  0.         0.         0.05309198 0.07664979 0.01742797 0.        ]
 [0.07620252 0.01413342 0.02534905 0.02782245 0.         0.
  0.         0.         0.03257876 0.06573668 0.00515372 0.        ]
 [0.04417444 0.0314437  0.         0.         0.         0.
  0.         0.01129739 0.06318042 0.01553865 0.04551434 0.        ]
 [0.         0.14102149 0.07606754 0.0224137  0.01174133 0.
  0.         0.03893747 0.08874083 0.01155296 0.07271851 0.        ]
 [0.02028606 0.05943597 0.         0.         0.         0.
  0.         0.         0.06543292 0.04324468 0.04356055 0.        ]]
<NDArray 5x12 @cpu(0)>

In [50]:
# from backward rnn
first_output[:,6:]


[[0.         0.         0.05309198 0.07664979 0.01742797 0.        ]
 [0.         0.         0.03257876 0.06573668 0.00515372 0.        ]
 [0.         0.01129739 0.06318042 0.01553865 0.04551434 0.        ]
 [0.         0.03893747 0.08874083 0.01155296 0.07271851 0.        ]
 [0.         0.         0.06543292 0.04324468 0.04356055 0.        ]]
<NDArray 5x6 @cpu(0)>

In [51]:
# same as last 6 channels of output at first stage
hid_states[0][1]


[[0.         0.         0.05309198 0.07664979 0.01742797 0.        ]
 [0.         0.         0.03257876 0.06573668 0.00515372 0.        ]
 [0.         0.01129739 0.06318042 0.01553865 0.04551434 0.        ]
 [0.         0.03893747 0.08874083 0.01155296 0.07271851 0.        ]
 [0.         0.         0.06543292 0.04324468 0.04356055 0.        ]]
<NDArray 5x6 @cpu(0)>

# Modification: LSTM

In [108]:
hid_layers = 1

In [109]:
lstm = mx.gluon.rnn.LSTM(hidden_size=hid_units, num_layers=hid_layers, layout='TNC')
lstm.initialize()

In [110]:
hid_init_h = mx.nd.random.uniform(shape=(hid_layers, batch_size, hid_units))
hid_init_c = mx.nd.random.uniform(shape=(hid_layers, batch_size, hid_units))
hid_init = [hid_init_h, hid_init_c]
outputs, hid_states = lstm(inputs, hid_init)

In [111]:
# output same as before
outputs.shape

(4, 5, 6)

In [112]:
final_output = outputs[-1]
final_output


[[ 0.01253454 -0.02345554  0.02795083  0.02464588  0.00788901  0.02751796]
 [ 0.01594986 -0.0184415   0.03759725  0.01181254  0.0063988   0.02784991]
 [ 0.0094468  -0.00300365  0.03994624 -0.00055172  0.03623314  0.01056249]
 [ 0.01638241  0.00139618  0.04743286  0.01258996  0.02631953  0.02926661]
 [ 0.00316927 -0.01528073  0.04393305  0.00166157  0.01046809  0.01166981]]
<NDArray 5x6 @cpu(0)>

In [113]:
# now have two cell memory and hidden state
len(hid_states)

2

In [114]:
# hidden state (bottom line in diagram)
hid_states[0].shape

(1, 5, 6)

In [115]:
# cell memory (top line in diagram)
hid_states[1].shape

(1, 5, 6)

In [116]:
# same as the output for uni-directional and non-stacked case
hid_states[0]


[[[ 0.01253454 -0.02345554  0.02795083  0.02464588  0.00788901  0.02751796]
  [ 0.01594986 -0.0184415   0.03759725  0.01181254  0.0063988   0.02784991]
  [ 0.0094468  -0.00300365  0.03994624 -0.00055172  0.03623314  0.01056249]
  [ 0.01638241  0.00139618  0.04743286  0.01258996  0.02631953  0.02926661]
  [ 0.00316927 -0.01528073  0.04393305  0.00166157  0.01046809  0.01166981]]]
<NDArray 1x5x6 @cpu(0)>

# Combined: Stacked Bi-directional LSTM

In [117]:
hid_layers = 2
bidirectional = True

In [118]:
stack_bidir_lstm = mx.gluon.rnn.LSTM(hidden_size=hid_units, num_layers=hid_layers, layout='TNC', bidirectional=bidirectional)
stack_bidir_lstm.initialize()

In [119]:
# 2 * hid_layers (since bi-directional)
hid_init_h = mx.nd.random.uniform(shape=(2*hid_layers, batch_size, hid_units))
hid_init_c = mx.nd.random.uniform(shape=(2*hid_layers, batch_size, hid_units))
hid_init = [hid_init_h, hid_init_c]
outputs, hid_states = stack_bidir_lstm(inputs, hid_init)

In [120]:
# 2 * hid_units = 12 channels since bi-directional
outputs.shape

(4, 5, 12)

In [121]:
final_output = outputs[-1]
final_output


[[ 0.02836313  0.00606763  0.02310923  0.00844923  0.03136069  0.00835426
   0.00834884  0.22160307  0.16497645  0.17498061  0.17497027  0.16968904]
 [ 0.02018425 -0.00205736  0.02368148  0.01517256  0.02936829  0.01105657
  -0.00114159  0.02814094  0.06628538  0.18898237  0.02543253  0.16276605]
 [ 0.02180842  0.02542746  0.04244835 -0.00741896  0.03391297  0.00343686
   0.18560167  0.15380849  0.1863319   0.14486022  0.11376306  0.03071362]
 [ 0.01190888  0.00480322  0.01914669  0.004719    0.01296413 -0.00054099
   0.05539425  0.09306861  0.21310844  0.12720783  0.08961899  0.15656401]
 [ 0.01545303  0.00941906  0.02581433  0.01284648  0.00813204  0.00568986
   0.19248244  0.0284084   0.17106406  0.02503861  0.10314985  0.01844846]]
<NDArray 5x12 @cpu(0)>

In [122]:
# channels from forward rnn in last step of last layer
final_output[:,:6]


[[ 0.02836313  0.00606763  0.02310923  0.00844923  0.03136069  0.00835426]
 [ 0.02018425 -0.00205736  0.02368148  0.01517256  0.02936829  0.01105657]
 [ 0.02180842  0.02542746  0.04244835 -0.00741896  0.03391297  0.00343686]
 [ 0.01190888  0.00480322  0.01914669  0.004719    0.01296413 -0.00054099]
 [ 0.01545303  0.00941906  0.02581433  0.01284648  0.00813204  0.00568986]]
<NDArray 5x6 @cpu(0)>

In [123]:
# channels from backward rnn in last step of last layer
final_output[:,6:]


[[ 0.00834884  0.22160307  0.16497645  0.17498061  0.17497027  0.16968904]
 [-0.00114159  0.02814094  0.06628538  0.18898237  0.02543253  0.16276605]
 [ 0.18560167  0.15380849  0.1863319   0.14486022  0.11376306  0.03071362]
 [ 0.05539425  0.09306861  0.21310844  0.12720783  0.08961899  0.15656401]
 [ 0.19248244  0.0284084   0.17106406  0.02503861  0.10314985  0.01844846]]
<NDArray 5x6 @cpu(0)>

In [124]:
len(hid_states)

2

In [125]:
# hidden state
hid_states[0].shape

(4, 5, 6)

In [126]:
# cell memeory
hid_states[1].shape

(4, 5, 6)

In [127]:
# combined forward and backward, then across stack, e.g.
# [ L1_forward
#   L1_backward,
#   L2_forward,
#   L2_backward ]
hid_states[0]


[[[  1.03355674e-02   1.60647300e-03   1.32582430e-02   1.38146300e-02
    -1.35828340e-02  -1.12173790e-02]
  [  2.33832765e-02   1.31784236e-05   2.16201320e-02   2.42266501e-03
    -9.80209676e-04  -1.19874710e-02]
  [  1.59390830e-02   2.46383925e-03   5.02551533e-03   1.26088131e-02
     4.33453452e-03  -2.34422293e-02]
  [  1.38091547e-02   1.20086502e-02   4.43106666e-02   7.60523602e-03
    -7.71074602e-03  -2.75092013e-02]
  [  3.55932228e-02   8.30993708e-03   2.85629816e-02  -1.24422682e-03
     5.16857579e-03   6.64406968e-03]]

 [[  9.75320395e-03   3.27899903e-02   3.42854112e-02   9.73990746e-03
    -1.82624068e-02  -4.52016518e-02]
  [  2.99503710e-02   4.22814377e-02   1.59972627e-02   3.10577378e-02
     3.01205181e-03   3.97029473e-03]
  [  2.96767391e-02   4.26794328e-02   1.73485801e-02   7.67373433e-03
    -1.50409192e-02  -7.97832385e-03]
  [  1.15076527e-02   4.02115956e-02   2.62511540e-02   3.11808214e-02
     1.00480448e-02  -1.79561898e-02]
  [  2.81295236e

In [128]:
# take last two rows since bi-dir
hid_last = hid_states[0][-2:,:]

In [129]:
# first of row pair, to get forward
hid_last_forward = hid_last[0]

In [130]:
# same as first 6 channels of last step output
hid_last_forward


[[ 0.02836313  0.00606763  0.02310923  0.00844923  0.03136069  0.00835426]
 [ 0.02018425 -0.00205736  0.02368148  0.01517256  0.02936829  0.01105657]
 [ 0.02180842  0.02542746  0.04244835 -0.00741896  0.03391297  0.00343686]
 [ 0.01190888  0.00480322  0.01914669  0.004719    0.01296413 -0.00054099]
 [ 0.01545303  0.00941906  0.02581433  0.01284648  0.00813204  0.00568986]]
<NDArray 5x6 @cpu(0)>

In [131]:
first_output = outputs[0]

In [132]:
# last 6 channels of first step output
first_output[:,6:]


[[-0.00148457  0.0318089   0.01821657  0.0233146   0.02311079  0.02432076]
 [ 0.00166964  0.00372249  0.00529031  0.03002147  0.00336746  0.02585452]
 [ 0.02263732  0.01611603  0.02180129  0.02280513  0.01358894  0.00735507]
 [ 0.00859078  0.01099376  0.0230337   0.02421864  0.00913094  0.03071127]
 [ 0.0328386  -0.00055734  0.01982282  0.01173063  0.00941289  0.01053296]]
<NDArray 5x6 @cpu(0)>

In [133]:
# second of row pair, to get backward
hid_last_backward = hid_last[1]

In [134]:
hid_last_backward


[[-0.00148457  0.0318089   0.01821657  0.0233146   0.02311079  0.02432076]
 [ 0.00166964  0.00372249  0.00529031  0.03002147  0.00336746  0.02585452]
 [ 0.02263732  0.01611603  0.02180129  0.02280513  0.01358894  0.00735507]
 [ 0.00859078  0.01099376  0.0230337   0.02421864  0.00913094  0.03071127]
 [ 0.0328386  -0.00055734  0.01982282  0.01173063  0.00941289  0.01053296]]
<NDArray 5x6 @cpu(0)>