In [1]:
from __future__ import print_function
import time, os, json
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from cs231n.coco_utils import load_coco_data, sample_coco_minibatch, decode_captions
from cs231n.image_utils import image_from_url

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

In [2]:
data = load_coco_data()

# Print out all the keys and values from the data dictionary
for k, v in data.items():
    if type(v) == np.ndarray:
        print(k, type(v), v.shape, v.dtype)
    else:
        print(k, type(v), len(v))

train_captions <class 'numpy.ndarray'> (400135, 17) int32
train_image_idxs <class 'numpy.ndarray'> (400135,) int32
val_captions <class 'numpy.ndarray'> (195954, 17) int32
val_image_idxs <class 'numpy.ndarray'> (195954,) int32
train_features <class 'numpy.ndarray'> (82783, 512) float32
val_features <class 'numpy.ndarray'> (40504, 512) float32
idx_to_word <class 'list'> 1004
word_to_idx <class 'dict'> 1004
train_urls <class 'numpy.ndarray'> (82783,) <U63
val_urls <class 'numpy.ndarray'> (40504,) <U63


In [3]:
class LSTM(object):
    def __init__(self, word_to_idx, input_dim=512, wordvec_dim=256,
                 hidden_dim=512, dtype=np.float32):
        """
        Construct a new CaptioningRNN instance.

        Inputs:
        - word_to_idx: A dictionary giving the vocabulary. It contains V entries,
          and maps each string to a unique integer in the range [0, V).
        - input_dim: Dimension D of input image feature vectors.
        - wordvec_dim: Dimension W of word vectors.
        - hidden_dim: Dimension H for the hidden state of the RNN.
        - cell_type: What type of RNN to use; either 'rnn' or 'lstm'.
        - dtype: numpy datatype to use; use float32 for training and float64 for
          numeric gradient checking.
        """

        self.dtype = dtype
        self.word_to_idx = word_to_idx
        self.idx_to_word = {i: w for w, i in word_to_idx.items()}
        self.params = {}

        vocab_size = len(word_to_idx)

        self._null = word_to_idx['<NULL>']
        self._start = word_to_idx.get('<START>', None)
        self._end = word_to_idx.get('<END>', None)

        # Initialize word vectors
        self.params['W_embed'] = tf.Variable(tf.random_normal((vocab_size, wordvec_dim)) / 100.0, name = 'W_embed')

        # Initialize NN -> hidden state projection parameters
        self.params['W_proj'] = tf.Variable(tf.random_normal((input_dim, hidden_dim)) / tf.sqrt(float(input_dim)), name = 'W_proj')
        self.params['b_proj'] = tf.Variable(tf.zeros(hidden_dim), name = 'b_proj')

        # Initialize output to vocab weights
        self.params['W_vocab'] = tf.Variable(tf.random_normal((hidden_dim, vocab_size)) / tf.sqrt(float(hidden_dim)), name = 'W_vocab')
        self.params['b_vocab'] = tf.Variable(tf.zeros(vocab_size), name = 'b_vocab')
            
#         self.model = tf.contrib.cudnn_rnn.CudnnLSTM(
#             num_layers = 1,
#             num_units = hidden_dim,
#             input_size = input_dim
#             )
        self.model = tf.contrib.rnn.BasicLSTMCell(hidden_dim)
    
#         self.input_dim = input_dim
#         self.wordvec_dim = wordvec_dim
#         self.hidden_dim = hidden_dim
#         self.vocab_size = vocab_size
            
    def get_forward_op(self, features, captions_in):
        W_proj, b_proj = self.params['W_proj'], self.params['b_proj']
        # Word embedding matrix
        W_embed = self.params['W_embed']
        
        # Weight and bias for the hidden-to-vocab transformation.
        W_vocab, b_vocab = self.params['W_vocab'], self.params['b_vocab']
        
        h0 = tf.matmul(features, W_proj) + b_proj # W_proj : input x hidden
        
        word_embeds = tf.nn.embedding_lookup(W_embed, captions_in) # W_embed : vocab_size x wordvec_dim
        
        unrolled_output, _ = tf.nn.dynamic_rnn(
            self.model, 
            inputs = word_embeds,
            initial_state = tf.contrib.rnn.LSTMStateTuple( h0, tf.zeros_like(h0) ) # https://github.com/tensorflow/tensorflow/issues/3860
        )
        
        T, D = unrolled_output.get_shape().as_list()[1:]
        M = b_vocab.get_shape().as_list()[0]
        shape = tf.shape(unrolled_output)
        dim = tf.reduce_prod(shape[:2])

        logits = tf.reshape(tf.matmul(
            tf.reshape(unrolled_output, [dim, D]),
            W_vocab
        ), [shape[0], T, M]) + b_vocab
        
        return logits        
    
    def get_train_and_loss_ops(self, x, y, mask):
        shape = tf.shape(x)
        M = x.get_shape().as_list()[2]
        dim = tf.reduce_prod(shape[:2])

        x_flat = tf.reshape(x, [dim, M])
        y_flat = tf.reshape(y, [dim])
        mask_flat = tf.reshape(mask, [dim])

        probs = tf.exp(x_flat - tf.reduce_max(x_flat, axis=1, keep_dims=True))
        probs /= tf.reduce_sum(probs, axis=1, keep_dims=True)
        
        # https://github.com/tensorflow/tensorflow/issues/418
        sel = tf.log(tf.gather(tf.reshape(probs, [-1]), tf.range(dim) * tf.shape(probs)[1] + y_flat))
        
        loss = -tf.reduce_sum(tf.cast(mask_flat, tf.float32) * sel) / tf.cast(shape[0], tf.float32)
        
        train_op = tf.train.AdamOptimizer(5e-3).minimize(loss)

        return train_op, loss
    
    def get_infer_op(self, features, max_length=30):
        N = tf.shape(features)[0]

        # Unpack parameters
        W_proj, b_proj = self.params['W_proj'], self.params['b_proj']
        W_embed = self.params['W_embed']
        W_vocab, b_vocab = self.params['W_vocab'], self.params['b_vocab']
        
        h0 = tf.matmul(features, W_proj) + b_proj
        
        x = tf.cast(tf.ones(shape = (N, )) * self._start, tf.int32)
        c = tf.zeros_like(h0)
                
        state = h0, c
        captions = []
        
        for i in range(max_length):
            output, state = self.model( tf.nn.embedding_lookup(W_embed, x), state )
            
            x = tf.argmax(
                tf.matmul(output, W_vocab) + b_vocab, 
                axis = 1
            )
            captions.append(x)
            
        return np.asarray(captions).T      

In [4]:
batch_size = 128
val_batch_size = 10000
num_epochs = 5

In [5]:
tf.reset_default_graph()

small_lstm_model = LSTM(
          word_to_idx=data['word_to_idx'],
          input_dim=data['train_features'].shape[1],
          hidden_dim=512,
          wordvec_dim=256,
          dtype=np.float32,
        )

num_train = data['train_captions'].shape[0]
iterations_per_epoch = max(num_train // batch_size, 1)
num_iterations = num_epochs * iterations_per_epoch

num_val = data['val_captions'].shape[0]
num_val_batches = max(num_val // val_batch_size, 1)

x_in = tf.placeholder(tf.float32, shape = (None, data['train_features'].shape[1]), name = 'x_in')
captions_in = tf.placeholder(tf.int32, shape = (None, data['train_captions'].shape[1] - 1), name = 'cap_in')
captions_out = tf.placeholder(tf.int32, shape = (None, data['train_captions'].shape[1] - 1), name = 'cap_out')
mask_ph = tf.placeholder(tf.bool, shape = captions_out.shape, name = 'mask_ph')

logits = small_lstm_model.get_forward_op(x_in, captions_in)
train_op, loss_op = small_lstm_model.get_train_and_loss_ops(logits, captions_out, mask_ph)

infer_op = small_lstm_model.get_infer_op(x_in)

saver = tf.train.Saver()

init_op = tf.global_variables_initializer()
with tf.Session() as sess:
    
    file_writer = tf.summary.FileWriter('logs', sess.graph)
    
    sess.run(init_op)
    
    for t in range(num_iterations):
        
        if not t % int(iterations_per_epoch / 10) and t:
            print('epoch {}, train loss {}'.format(int(t / iterations_per_epoch), loss))
            
        if not t % iterations_per_epoch:
            # compute validation loss at end of epoch
            val_loss_all = 0
            for i in range(num_val_batches):
                
                idxs = range(i * val_batch_size, (i + 1) * val_batch_size)
                captions = data['val_captions'][idxs]
                image_idxs = data['val_image_idxs'][idxs]
                features = data['val_features'][image_idxs]

                c_in = captions[:, :-1]
                c_out = captions[:, 1:]
                mask = (c_out != small_lstm_model._null)

                feed_dict = {
                    x_in : features,
                    captions_in : c_in,
                    captions_out : c_out,
                    mask_ph : mask
                }
                val_loss_all += sess.run(loss_op, feed_dict=feed_dict)
                
            print('validation loss {}'.format(val_loss_all / num_val_batches))
            save_path = saver.save(sess, "models/caption_model_epoch", global_step = int(t / iterations_per_epoch))
            print("Model saved in file: %s" % save_path)
            print()

        captions, features, urls = sample_coco_minibatch(data,
                      batch_size=batch_size,
                      split='train')
        
        c_in = captions[:, :-1]
        c_out = captions[:, 1:]
        mask = (c_out != small_lstm_model._null)
        
        feed_dict = {
            x_in : features,
            captions_in : c_in,
            captions_out : c_out,
            mask_ph : mask
        }
        loss, _ = sess.run([loss_op, train_op], feed_dict=feed_dict)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


KeyboardInterrupt: 

In [29]:
tf.reset_default_graph()

model = "caption_model_epoch_1"

p = "models/{}.ckpt".format(model)

saver = tf.train.import_meta_graph(p + ".meta")
with tf.Session() as sess:
    saver.restore(sess, p)
    
    [m for m in tf.globql_variables()]
#     [print(m.values()) for m in ops]
    
#     for split in ['train', 'val']:
#         minibatch = sample_coco_minibatch(data, split=split, batch_size=2)
#         gt_captions, features, urls = minibatch
#         gt_captions = decode_captions(gt_captions, data['idx_to_word'])

#         sample_captions = sess.run(infer_op, feed_dict={x_in: features})
#         sample_captions = decode_captions(sample_captions, data['idx_to_word'])

#         for gt_caption, sample_caption, url in zip(gt_captions, sample_captions, urls):
#             plt.imshow(image_from_url(url))
#             plt.title('%s\n%s\nGT:%s' % (split, sample_caption, gt_caption))
#             plt.axis('off')
#             plt.show()

INFO:tensorflow:Restoring parameters from models/caption_model_epoch_1.ckpt


INFO:tensorflow:Restoring parameters from models/caption_model_epoch_1.ckpt


Instructions for updating:
Please use tf.global_variables instead.


Instructions for updating:
Please use tf.global_variables instead.
