In [None]:
from __future__ import print_function, division, absolute_import

In [None]:
from keras import backend as K

In [None]:
# LSTM is a class exported by keras.layers, not a submodule, so it has to be
# imported with from-import; `import keras.layers.LSTM as LSTM` fails.
from keras.layers import LSTM

From the supplementary material of Ba, Kiros, Hinton, 2016. These are their equations for LSTM and layer normalized LSTM:
(edited a little so they would display in the notebook environment).

This section describes how layer normalization is applied to each of the paper's
experiments. For notational convenience, we define layer normalization as a
function mapping $LN: R^D \to R^D$ with two sets of adaptive parameters, gains
${\bf \alpha }$ and biases ${\bf \beta }$:
\begin{eqnarray}
LN({\bf z} ; \bf{\alpha}, \bf{\beta}) = \frac{({\bf z} - \mu)}{\sigma} \odot \bf
{\alpha} + \bf{\beta}, \\
\mu = \frac{1}{D}\sum_{i=1}^D z_i, \quad \sigma = \sqrt{\frac{1}{D}\sum_{i=1}^D
(z_i-\mu)^2},
\end{eqnarray}
where, $z_i$ is the $i^{th}$ element of the vector ${\bf z}$.



The basic LSTM equations used for these experiments are given by:

\begin{eqnarray}
\begin{pmatrix}{\bf f}_t\\{\bf i}_t\\{\bf o}_t\\{\bf g}_t\end{pmatrix} &=& {\bf
W}_h {\bf h}_{t-1} + {\bf W}_x {\bf x}_t + b \\
{\bf c}_t &=& \sigma({\bf f}_t) \odot {\bf c}_{t-1} + \sigma({\bf i}_t) \odot \tanh({\bf g}_t) \\
{\bf h}_t &=& \sigma({\bf o}_t) \odot \text{tanh}({\bf c}_t)
\end{eqnarray}


The version that incorporates layer normalization is modified as follows:

\begin{eqnarray}
\begin{pmatrix}{\bf f}_t\\{\bf i}_t\\{\bf o}_t\\{\bf g}_t\end{pmatrix} &=& LN({\bf W}_h {\bf h}_{t-1}; \bf{\alpha}_1, \bf{\beta}_1) + LN({\bf W}_x {\bf x}_t; \bf{\alpha}_2, \bf{\beta}_2) + b \\
{\bf c}_t &=& \sigma({\bf f}_t) \odot {\bf c}_{t-1} + \sigma({\bf i}_t) \odot \text{tanh}({\bf g}_t) \\
{\bf h}_t &=& \sigma({\bf o}_t) \odot \text{tanh}(LN({\bf c}_t; \bf{\alpha}_3, \bf{\beta}_3))
\end{eqnarray}

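where $\bf{\alpha}_i, \bf{\beta}_i$ are the multiplicative (gain) and additive (bias) parameters, respectively. Each $\bf{\alpha}_i$ is initialized to a vector of ones and each $\bf{\beta}_i$ is initialized to a vector of zeros, so that $LN$ starts out as a plain normalization. (The roles are stated here so that they agree with the definition of $LN$ above, where $\bf{\alpha}$ multiplies and $\bf{\beta}$ is added.)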


In [1]:
# modeled after ryankiros and gruln code
# a simple sample normalization code, which I've tested with keras
# this does not also apply the parameters like ryankiros's code does

def sample_normalize(x, _eps=1e-5):
    """Normalize each sample of `x` along its last axis.

    The mean over the last axis is subtracted and the result is divided by
    the standard deviation over the same axis, so every sample ends up with
    (approximately) zero mean and unit standard deviation. No gain or bias
    parameters are applied.

    # Arguments
        x: Keras backend tensor to normalize.
        _eps: small constant added to the variance before the square root
            so the division is well defined when a sample has zero variance.

    # Returns
        A tensor with the same shape as `x`.
    """
    # keepdims=True leaves the reduced axis in place with size one, so the
    # statistics broadcast against x in the arithmetic below.
    mean = K.mean(x, axis=-1, keepdims=True)
    # K.std is not used because it has no epsilon for numerical stability.
    # _eps is applied once, inside the square root, matching the Theano ln()
    # reference quoted in this notebook; sqrt(var + _eps) is already strictly
    # positive, so adding _eps to the denominator again only biases the scale.
    std = K.sqrt(K.var(x, axis=-1, keepdims=True) + _eps)
    return (x - mean) / std

Here is what the code for a single step of LSTM looks like in Keras 
(recurrent.py, the LSTM class function)

```python 
    def step(self, x, states):
        h_tm1 = states[0]
        c_tm1 = states[1]
        B_U = states[2]
        B_W = states[3]

        if self.consume_less == 'gpu':
            z = K.dot(x * B_W[0], self.W) + K.dot(h_tm1 * B_U[0], self.U) + self.b

            z0 = z[:, :self.output_dim]
            z1 = z[:, self.output_dim: 2 * self.output_dim]
            z2 = z[:, 2 * self.output_dim: 3 * self.output_dim]
            z3 = z[:, 3 * self.output_dim:]

            i = self.inner_activation(z0)
            f = self.inner_activation(z1)
            c = f * c_tm1 + i * self.activation(z2)
            o = self.inner_activation(z3)
        else:
            if self.consume_less == 'cpu':
                x_i = x[:, :self.output_dim]
                x_f = x[:, self.output_dim: 2 * self.output_dim]
                x_c = x[:, 2 * self.output_dim: 3 * self.output_dim]
                x_o = x[:, 3 * self.output_dim:]
            elif self.consume_less == 'mem':
                x_i = K.dot(x * B_W[0], self.W_i) + self.b_i
                x_f = K.dot(x * B_W[1], self.W_f) + self.b_f
                x_c = K.dot(x * B_W[2], self.W_c) + self.b_c
                x_o = K.dot(x * B_W[3], self.W_o) + self.b_o
            else:
                raise Exception('Unknown `consume_less` mode.')

            i = self.inner_activation(x_i + K.dot(h_tm1 * B_U[0], self.U_i))
            f = self.inner_activation(x_f + K.dot(h_tm1 * B_U[1], self.U_f))
            c = f * c_tm1 + i * self.activation(x_c + K.dot(h_tm1 * B_U[2], self.U_c))
            o = self.inner_activation(x_o + K.dot(h_tm1 * B_U[3], self.U_o))

        h = o * self.activation(c)
        return h, [h, c]
```

Here is the ln() normalization function from ryankiros (written in theano). It normalizes the tensor x, then
multiplies the result by the gain tensor s and adds the bias term b

```python
def ln(x, b, s):
    _eps = 1e-5
    output = (x - x.mean(1)[:,None]) / tensor.sqrt((x.var(1)[:,None] + _eps))
    output = s[None, :] * output + b[None,:]
    return output
```

And here is the _step function:
- he separates the inputs into sbelow (input from below) and sbefore (recurrent input from t-1; sbefore starts at the initial state)
- sbelow and sbefore are each normalized
- the memory cell activity c is normalized as well

```python
# class function for lstm layer normalization



    def _step(mask, sbelow, sbefore, cell_before, *args):
        sbelow_ = ln(sbelow, param('b1'), param('s1'))
        sbefore_ = ln(dot(sbefore, param('U')), param('b2'), param('s2'))

        preact = sbefore_ + sbelow_ + param('b')

        i = Sigmoid(_slice(preact, 0, dim))
        f = Sigmoid(_slice(preact, 1, dim))
        o = Sigmoid(_slice(preact, 2, dim))
        c = Tanh(_slice(preact, 3, dim))

        c = f * cell_before + i * c
        c = mask * c + (1. - mask) * cell_before

        c_ = ln(c, param('b3'), param('s3'))
        h = o * tensor.tanh(c_)
        h = mask * h + (1. - mask) * sbefore


```

In [None]:
class LSTM_LN(LSTM):
    """LSTM whose fused ('gpu') step applies layer normalization.

    In the `consume_less == 'gpu'` mode the input projection and the
    recurrent projection are each normalized with `sample_normalize` before
    they are summed with the bias, and the cell state is normalized before
    the output activation, following the layer-normalized LSTM equations of
    Ba, Kiros & Hinton (2016).

    `sample_normalize` applies no gain/bias parameters, so this corresponds
    to layer normalization with the gains fixed at one and the biases fixed
    at zero; learnable normalization parameters are not implemented here.

    NOTE(review): the 'cpu' and 'mem' modes are still the unmodified Keras
    LSTM computation and are NOT layer normalized.
    """

    def step(self, x, states):
        """Compute one recurrent time step.

        # Arguments
            x: input for the current time step.
            states: list holding, in order, the previous output `h_tm1`,
                the previous cell state `c_tm1`, and the masks `B_U`
                (multiplied into the recurrent input) and `B_W` (multiplied
                into the layer input).

        # Returns
            A tuple `(h, [h, c])` with the new output and the new states.

        # Raises
            Exception: if `self.consume_less` is not 'gpu', 'cpu' or 'mem'.
        """
        h_tm1 = states[0]
        c_tm1 = states[1]
        B_U = states[2]
        B_W = states[3]

        if self.consume_less == 'gpu':
            # Normalize the input from below and the recurrent input from
            # t-1 separately, over the full fused pre-activation vector,
            # then add the ordinary bias back in (it must stay outside the
            # normalization, otherwise it would be centered away).
            z_below = sample_normalize(K.dot(x * B_W[0], self.W))
            z_before = sample_normalize(K.dot(h_tm1 * B_U[0], self.U))
            z = z_below + z_before + self.b

            z0 = z[:, :self.output_dim]                          # input gate
            z1 = z[:, self.output_dim: 2 * self.output_dim]      # forget gate
            z2 = z[:, 2 * self.output_dim: 3 * self.output_dim]  # candidate
            z3 = z[:, 3 * self.output_dim:]                      # output gate

            i = self.inner_activation(z0)
            f = self.inner_activation(z1)
            c = f * c_tm1 + i * self.activation(z2)
            o = self.inner_activation(z3)

            # Only the value fed to the output activation is normalized; the
            # carried cell state `c` itself is returned unnormalized.
            c_out = sample_normalize(c)
        else:
            if self.consume_less == 'cpu':
                x_i = x[:, :self.output_dim]
                x_f = x[:, self.output_dim: 2 * self.output_dim]
                x_c = x[:, 2 * self.output_dim: 3 * self.output_dim]
                x_o = x[:, 3 * self.output_dim:]
            elif self.consume_less == 'mem':
                x_i = K.dot(x * B_W[0], self.W_i) + self.b_i
                x_f = K.dot(x * B_W[1], self.W_f) + self.b_f
                x_c = K.dot(x * B_W[2], self.W_c) + self.b_c
                x_o = K.dot(x * B_W[3], self.W_o) + self.b_o
            else:
                raise Exception('Unknown `consume_less` mode.')

            i = self.inner_activation(x_i + K.dot(h_tm1 * B_U[0], self.U_i))
            f = self.inner_activation(x_f + K.dot(h_tm1 * B_U[1], self.U_f))
            c = f * c_tm1 + i * self.activation(x_c + K.dot(h_tm1 * B_U[2], self.U_c))
            o = self.inner_activation(x_o + K.dot(h_tm1 * B_U[3], self.U_o))

            # These modes are left as the plain LSTM computation.
            c_out = c

        h = o * self.activation(c_out)
        return h, [h, c]