In [31]:
from collections import defaultdict
from datetime import datetime
from itertools import chain
import json
import glob
import tensorflow as tf
from nltk import word_tokenize
import numpy as np
import os

In [23]:
data_path = '/Users/kevinhchon/Documents/11-777/11777-GAN-image-description/attr-net/preprocessing'
weight_path = '/Users/kevinhchon/Documents/11-777/11777-GAN-image-description/attr-net/model.ckpt'
summaries_path = '/Users/kevinhchon/Documents/11-777/11777-GAN-image-description/attr-net/summaries'
visual_concepts_path = '/Users/kevinhchon/Documents/11-777/data/'
annotations_path = visual_concepts_path + 'annotations'

In [3]:
# punctuations to be removed from the sentences, from scripts/script_1.py
punctuations = ["''", "'", "``", "`", "-LRB-", "-RRB-", "-LCB-", "-RCB-",
  ".", "?", "!", ",", ":", "-", "--", "...", ";"] 

def tokenize_caption(caption):
    return [token.lower() for token in word_tokenize(caption) if token not in punctuations]

def load_captions(annotations_path, dataset):
    annotations_json_path = annotations_path + '/captions_{}2014.json'.format(dataset)
    with open(annotations_json_path, 'r') as annotations_file:
        annotations_json = annotations_file.read()
    annotations = json.loads(annotations_json)
    captions = defaultdict(lambda: [])
    for c in annotations['annotations']:
        captions[c['image_id']].append(tokenize_caption(c['caption']))
    return captions

def load_batch(batches_path, captions_dict, batch_id):
    batch_json_path = batches_path + '/batch{}.npz'.format(batch_id)
    npzfile = np.load(batch_json_path)
    ids = sorted(int(key) for key in npzfile.keys())
    data_batch = [npzfile[str(id_)] for id_ in ids]
    npzfile.close()
    captions_batch = [captions_dict[id_] for id_ in ids]
    return data_batch, captions_batch

def load_data(data_path, dataset):
    batches_path = data_path + '/' + dataset
    captions_dict = load_captions(annotations_path, dataset)
    num_batches = len([f for f in os.listdir(batches_path)])
    #num_batches = sum(1 for _ in batches_path.glob('*.npz'))
    data_batches, captions_batches = zip(*[load_batch(batches_path, captions_dict, batch_id)
                                           for batch_id in range(num_batches)])
    data = np.asarray(list(chain.from_iterable((data_batches))))
    captions = list(chain.from_iterable(captions_batches))
    return data, captions

In [4]:
def voc(captions_train):
    vocabulary_set = set()
    for tcaptions_lists in captions_train:
        for tcaption in tcaptions_lists:
            vocabulary_set.update(tcaption)
    vocabulary = sorted(vocabulary_set)

    vocabulary += ['<start>', '<end>', '<UNK>']
    word_to_ix = {word: i for i, word in enumerate(vocabulary)}
    return np.asarray(vocabulary), word_to_ix

def encode_captions(captions, word_to_ix):
    unknown_ix = word_to_ix['<UNK>']
    encoded_captions = np.ones((data_train.shape[0], num_captions_per_example, max_len_caption), dtype=int) * word_to_ix['<end>']
    encoded_captions[:, :, 0] = word_to_ix['<start>']
    caption_lengths = np.zeros((data_train.shape[0], num_captions_per_example), dtype=int)
    for i, tcaption_list in enumerate(captions):
        for j in range(num_captions_per_example):
            tcaption = tcaption_list[j]
            caption_lengths[i, j] = len(tcaption) + 2 # num words + attrs + <start>
            for k, word in enumerate(tcaption):
                encoded_captions[i, j, k+1] = word_to_ix.get(word, unknown_ix)
    return encoded_captions, caption_lengths

In [5]:
num_captions_per_example = 5
max_len_caption = 60

data_train, captions_train = load_data(data_path, 'train')

vocabulary, word_to_ix = voc(captions_train)
enc_captions_train, caption_lengths_train = encode_captions(captions_train, word_to_ix)

# Keep one caption per example for now
enc_captions_train = enc_captions_train[:, 0, :]
caption_lengths_train = caption_lengths_train[:, 0]

max_size_caption = 15
if max_size_caption is not None:
    max_len_caption = max_size_caption
    captions_to_keep = caption_lengths_train <= max_size_caption
    enc_captions_train = enc_captions_train[captions_to_keep][:, :max_size_caption]
    caption_lengths_train = caption_lengths_train[captions_to_keep]
    data_train = data_train[captions_to_keep]

In [7]:
enc_captions_train.shape

(8475, 15)

In [8]:
enc_captions_train.shape

(8475, 15)

In [6]:
embedding_dim = 1024
voc_dim = len(vocabulary)

In [7]:
new_attr_size = None
if new_attr_size is not None:
    data_train = data_train[:, :new_attr_size]

In [11]:
data_train.shape

(8475, 1000)

In [39]:
tf.reset_default_graph()

attrs_dim = data_train.shape[1]
attrs = tf.placeholder(tf.float32, [None, attrs_dim], name='attrs')
Ta = tf.get_variable('Ta',
                     shape=[attrs_dim, embedding_dim],
                     initializer=tf.random_normal_initializer())
xm1 = tf.matmul(attrs, Ta) # x^{-1}

captions = tf.placeholder(tf.int32, [None, max_len_caption], name='training_captions')
caption_1_hot = tf.one_hot(indices=captions,
                           depth=voc_dim,
                           axis=-1)
Ts = tf.get_variable('Ts',
                     shape=[voc_dim, embedding_dim],
                     initializer=tf.random_normal_initializer())

c1hr = tf.reshape(caption_1_hot, [-1, voc_dim])
xr = tf.matmul(c1hr, Ts)
x = tf.reshape(xr, [-1, max_len_caption, embedding_dim])
xs = tf.split(x, max_len_caption, axis=1)
xs = [tf.reshape(xsi, [-1, embedding_dim]) for xsi in xs]

lstm_cell = tf.contrib.rnn.BasicLSTMCell(num_units=voc_dim)

inputs = tf.stack([xm1] + xs[:-1], axis=1)

captions_l = tf.placeholder(tf.int32, [None], name='captions_length')

# def loop_fn(time, cell_output, cell_state, loop_state):
#     emit_output = cell_output
#     if cell_output is None:
#         next_cell_state = cell.zero_state(xm1.shape[0], tf.float32)
#     else:
#         next_cell_state = cell_state
#     elements_finished = (time >= captions_l)
    
    
# Inference
c_state_input = tf.placeholder(tf.float32, [None, lstm_cell.state_size[0]], name='c_state')
m_state_input = tf.placeholder(tf.float32, [None, lstm_cell.state_size[1]], name='m_state')
word_input = tf.placeholder(tf.int32, [None], name='word_input')
# feeding_attrs = tf.placeholder(tf.bool, (), name='feeding_attrs')

word_1_hot = tf.one_hot(indices=word_input,
                        depth=voc_dim,
                        axis=-1)
word_embedding = tf.matmul(word_1_hot, Ts)

# def embedding_feeding_word():
#     word_1_hot = tf.one_hot(indices=word_input,
#                             depth=voc_dim,
#                             axis=-1)
#     return tf.matmul(word_1_hot, Ts)
# embedding = tf.cond(feeding_attrs,
#                     lambda: xm1,
#                     embedding_feeding_word)
embedding = tf.placeholder(tf.float32, [None, embedding_dim], name='embedding')
state_input = (c_state_input, m_state_input)

# LSTM
with tf.variable_scope('lstm') as lstm_scope:
    outputs, _ = tf.nn.dynamic_rnn(lstm_cell,
                                   inputs=inputs,
                                   dtype=tf.float32,
                                   sequence_length=captions_l,
                                   parallel_iterations=1,
                                   swap_memory=True)
    lstm_scope.reuse_variables()
    
    # Inference
    with tf.variable_scope('rnn'):
        output_inf, state_inf = lstm_cell(inputs=embedding,
                                          state=state_input)
outputs_idx = tf.argmax(outputs, axis=-1)
outputs_idx_inf = tf.argmax(output_inf, axis=-1)

mask = tf.sequence_mask(captions_l, max_len_caption)

cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=outputs, labels=captions)
masked_ce = tf.boolean_mask(cross_entropy, mask)
cost = tf.reduce_mean(masked_ce)
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
train_optimizer = optimizer.minimize(cost)

with tf.name_scope('summaries'):
    tf.summary.scalar('cost', cost)
    tf.summary.histogram('Ta', Ta)
    tf.summary.histogram('Ts', Ts)
    tf.summary.histogram('outputs', outputs)
    
merged = tf.summary.merge_all()
saver = tf.train.Saver()

In [44]:
lstm_cell.variable

AttributeError: 'BasicLSTMCell' object has no attribute 'variable'

In [40]:
def train(epoch, batch_size, sess, train_writer):
    _, cost_, summary = sess.run([train_optimizer, cost, merged],
           feed_dict={captions: enc_captions_train[epoch*batch_size:(epoch+1*batch_size)],
                      captions_l: caption_lengths_train[epoch*batch_size:(epoch+1*batch_size)],
                      attrs: data_train[epoch*batch_size:(epoch+1*batch_size)]})
    train_writer.add_summary(summary, epoch)
    train_writer.flush()
    print(cost_)

In [41]:
batch_size=5
num_epochs=10

with tf.Session() as sess:
    init = tf.global_variables_initializer()
    sess.run(init)
    train_writer = tf.summary.FileWriter(summaries_path + '/train/' + str(datetime.now()),
                                         sess.graph)
    for epoch in range(num_epochs):
        train(epoch, batch_size, sess, train_writer)
    saver.save(sess, weight_path)

9.15972
9.16912
nan


KeyboardInterrupt: 

In [None]:
data_val, captions_val = load_data(data_path, 'val')


In [144]:
def test(sess, subset_size):
    word_input_ = [word_to_ix['<start>']] * subset_size
    c_state_input_, m_state_input_ = lstm_cell.zero_state(batch_size=subset_size, dtype=tf.float32)
    c_state_input_ = c_state_input_.eval(session=sess)
    m_state_input_ = m_state_input_.eval(session=sess)
    captions = []
    for _ in range(subset_size):
        captions.append([])
    ended = np.zeros(subset_size, dtype=bool)
    
    # Feed the attributes
    attrs_embedding, = sess.run([xm1], feed_dict={attrs: data_val[:subset_size]})
    state_, = sess.run([state_inf], feed_dict={c_state_input: c_state_input_,
                                               m_state_input: m_state_input_,
                                               embedding: attrs_embedding})
    
    # Generate the words
    for step in range(max_len_caption):
        c_state_input_, m_state_input_ = state_
        word_embedding_, = sess.run([word_embedding],
                                   feed_dict={word_input: word_input_})
        output_, state_ = sess.run([outputs_idx_inf, state_inf],
                                feed_dict={c_state_input: c_state_input_,
                                           m_state_input: m_state_input_,
                                           embedding: word_embedding_})
        ended = np.logical_or(ended, output_ == word_to_ix['<end>'])
        if ended.all():
            break
        for caption, output_id, ended_caption in zip(captions, output_, ended):
            if not ended_caption:
                caption.append(vocabulary[output_id])
        word_input_ = output_
    return [' '.join(caption) for caption in captions]

In [147]:
subset_size = 5
with tf.Session() as sess:
    saver.restore(sess, weigth_path)
    captions = test(sess, subset_size)
print(captions)

[u'assist timey lumber uniquely mull incorporating footage arbor oars buffet rickshaw remote technological frosted doughnut', u"'s angle angle dishware handlebars coming farmers takes ambience desk high-performance defaced settlers direction direction", u'domino dough bald ribbed growth bib doors safeway safeway eighteen advertisements speed traditional step common', u"else blizzard toned toned study arugula all arugula all author 's chaise cape wifit cool", u'assist except brochures peaking turn turn overhang submarine hour pair samsung montage compartmentalized shallows shallows']


In [None]:
np.array(vocabulary)[outputs_idx_]

In [None]:
outputs_idx.shape