Copyright 2016 Google Inc. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

## Gated Feedback Recurrent Neural Network

This notebook contains a TensorFlow (http://www.tensorflow.org) implementation of the Gated Feedback Recurrent Neural Network (the LSTM variant) from the paper "Gated Feedback Recurrent Neural Networks" (Chung et al., 2015): http://arxiv.org/pdf/1502.02367v4.pdf

In [1]:
import tensorflow as tf
from tensorflow.models.rnn.ptb import reader
import numpy as np

# Load the PTB data from the local `simple-examples/data/` directory.
# NOTE(review): the element types returned by `reader.ptb_raw_data` are not
# visible here -- presumably three word-id sequences plus vocabulary info;
# confirm against the reader module.
(train_data,
 valid_data,
 test_data,
 vocab) = reader.ptb_raw_data('simple-examples/data/')

In [2]:
# Hyperparameters

# -- Batching --
batch_size = 20  # sequences per minibatch
num_steps = 20   # time steps the recurrence is unrolled for

# -- Model dimensions --
vocab_size = 10000
hidden_size = 200
# Note: keeping emb_size equal to hidden_size is kind of a cheat. The model is
# written on the assumption that they match and will *not* work if
# emb_size != hidden_size.
emb_size = 200
num_hidden_layers = 1

# -- Optimisation --
epochs = 2
init_scale = 0.1  # weights are drawn uniformly between -init_scale and init_scale

# Scalar learning rate; a value is fed for it on every training step.
lr = tf.placeholder(tf.float32, shape=[])

In [3]:
## Build Model
session = tf.Session()

# Graph inputs: a batch of word ids and, for every position, the id to predict.
X = tf.placeholder(tf.int32, shape=[batch_size, num_steps])
targets = tf.placeholder(tf.int64, shape=[batch_size, num_steps])


def _uniform_variable(shape, name):
  """Creates a variable drawn uniformly between -init_scale and init_scale."""
  return tf.Variable(
    tf.random_uniform(shape, minval=-init_scale, maxval=init_scale),
    name=name)


def _per_layer_variables(shape, prefix):
  """Creates one uniform variable per hidden layer, named `<prefix>_<layer>`."""
  return [_uniform_variable(shape, "%s_%d" % (prefix, layer))
          for layer in range(num_hidden_layers)]


embedding = _uniform_variable([vocab_size, emb_size], "embedding")

# Shapes shared by all LSTM weight matrices below: W* act on the layer input,
# U* act on the layer's recurrent hidden state.
input_to_hidden = [emb_size, hidden_size]
hidden_to_hidden = [hidden_size, hidden_size]

# For input gate.
Wi = _per_layer_variables(input_to_hidden, "Wi")
Ui = _per_layer_variables(hidden_to_hidden, "Ui")

# For forget gate.
Wf = _per_layer_variables(input_to_hidden, "Wf")
Uf = _per_layer_variables(hidden_to_hidden, "Uf")

# For content -- Quick note: there's no transformation from content -> state.
# They are both the same size.
Wc = _per_layer_variables(input_to_hidden, "Wc")
Uc = _per_layer_variables(hidden_to_hidden, "Uc")

# For hidden state output gate.
Wo = _per_layer_variables(input_to_hidden, "Wo")
Uo = _per_layer_variables(hidden_to_hidden, "Uo")

# For gated feedback gates (i.e. the contribution of the paper). Each gate is a
# single scalar per example, computed from the layer input (Wg) and from the
# concatenation of all layers' hidden states (Ug).
Wg = _per_layer_variables([emb_size, 1], "Wg")
Ug = _per_layer_variables([hidden_size * num_hidden_layers, 1], "Ug")

# For output: projection from the hidden state to vocabulary logits.
output_weights = _uniform_variable([hidden_size, vocab_size], "output_weights")
output_bias = tf.Variable(tf.zeros([vocab_size]), name="output_bias")

# Embed every input word id: [batch_size, num_steps] -> [batch_size, num_steps, emb_size].
X_in = tf.nn.embedding_lookup(embedding, X)

# Layers above the first consume the hidden state of the layer below through
# the same [emb_size, hidden_size] input weights, so the two sizes must agree.
if num_hidden_layers > 1 and emb_size != hidden_size:
  raise ValueError("emb_size must equal hidden_size when num_hidden_layers > 1")

initial_state = tf.zeros([batch_size, hidden_size])
content = initial_state
# Every layer keeps its own memory content and its own hidden state.
contents = [initial_state] * num_hidden_layers
state = [initial_state] * num_hidden_layers
# Hidden states of all layers at the previous time step, concatenated along
# the feature axis (h*_{t-1} in the paper).
prev_concat_h = tf.zeros([batch_size, hidden_size * num_hidden_layers])
loss = tf.zeros([])
for time_step in range(num_steps):
  # `state` is overwritten layer by layer below; keep the t-1 values so that
  # the recurrent and feedback terms of upper layers still see the previous
  # time step of every layer.
  prev_state = list(state)
  # Input to the first layer is the word embedding at this time step.
  h_prev = X_in[:, time_step, :]
  for layer in range(num_hidden_layers):
    input_gate = tf.nn.sigmoid(tf.matmul(h_prev, Wi[layer]) + tf.matmul(prev_state[layer], Ui[layer]))
    forget_gate = tf.nn.sigmoid(tf.matmul(h_prev, Wf[layer]) + tf.matmul(prev_state[layer], Uf[layer]))
    output_gate = tf.nn.sigmoid(tf.matmul(h_prev, Wo[layer]) + tf.matmul(prev_state[layer], Uo[layer]))

    # Main contribution of paper: one scalar gate per source layer i decides
    # how much of layer i's previous hidden state feeds this layer's content.
    # NOTE(review): Wg/Ug/Uc are indexed by source layer only, so they are
    # shared by all target layers; the paper uses separate parameters for every
    # (source -> target) pair. Identical when num_hidden_layers == 1.
    gates = [tf.sigmoid(tf.matmul(h_prev, Wg[i]) + tf.matmul(prev_concat_h, Ug[i])) for i in range(num_hidden_layers)]
    gated_prev_timestep = [gates[i] * tf.matmul(prev_state[i], Uc[i]) for i in range(num_hidden_layers)]
    new_content = tf.nn.tanh(tf.matmul(h_prev, Wc[layer]) + tf.add_n(gated_prev_timestep))

    content = tf.mul(forget_gate, contents[layer]) + tf.mul(input_gate, new_content)
    contents[layer] = content
    state[layer] = tf.mul(output_gate, tf.nn.tanh(content))
    # The next layer up reads this layer's fresh hidden state as its input.
    h_prev = state[layer]

  # Predict the next word from the top layer and accumulate the summed loss.
  logits = tf.nn.bias_add(tf.matmul(state[num_hidden_layers-1], output_weights), output_bias)
  step_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, targets[:, time_step])
  loss += tf.reduce_sum(step_loss)
  prev_concat_h = tf.concat(1, state)

final_state = state
# Loss summed over all time steps, averaged over the batch.
cost = loss / batch_size

# Register `cost` as a scalar summary, merge all registered summaries into one
# op, and open a writer for them that is also given the session's graph.
summary_dir = "summaries/gfrnn"
tf.scalar_summary("cost", cost)
merged = tf.merge_all_summaries()
writer = tf.train.SummaryWriter(summary_dir, session.graph_def)

In [4]:
# Train Model
session.run(tf.initialize_all_variables())
sgd = tf.train.GradientDescentOptimizer(lr).minimize(cost)
costs = 0.0
iters = 0
for i in range(epochs):
  print 'Epoch', i
  for step, (x, y) in enumerate(reader.ptb_iterator(train_data, batch_size, num_steps)):
    result, step_cost, _, = session.run([merged, cost, sgd],
                             {X: x, targets: y, lr: 1.0 / (i + 1)})
    costs += step_cost
    iters += num_steps
    if iters % 1000 == 0:
      print iters, np.exp(costs / iters)
      writer.add_summary(result, iters)
      writer.flush()

Epoch 0
1000 1536.86554276
2000 1075.77593235
3000 845.104395239
4000 718.76732411
5000 641.0083012
6000 593.578288039
7000 552.39057359
8000 518.773597082
9000 490.011530283
10000 469.79929651
11000 445.811142429
12000 427.981149944
13000 413.231142047
14000 399.589796646
15000 387.374589524
16000 375.535316084
17000 364.656415065
18000 357.716949046
19000 350.007875462
20000 340.5189465
21000 334.729726814
22000 328.91687226
23000 323.211649227
24000 315.498545393
25000 309.774060383
26000 303.523051557
27000 297.389077065
28000 292.475912875
29000 287.393987197
30000 283.363092117
31000 278.863735412
32000 275.836836503
33000 272.480957052
34000 269.913912177
35000 266.354806615
36000 263.881540787
37000 260.255694866
38000 255.84195618
39000 253.142906497
40000 250.839347938
41000 247.700747125
42000 244.609508041
43000 241.109089536
44000 238.690587697
45000 236.178669209
46000 234.919347477
Epoch 1
47000 233.529050243
48000 231.346767272
49000 228.853817309
50000 225.70174362
510