In [64]:
from collections import defaultdict
from itertools import chain
import json
import glob
import tensorflow as tf
from nltk import word_tokenize
import numpy as np
import os

In [65]:
# Filesystem locations used by the loaders below.
# NOTE(review): these are absolute, machine-specific paths; anyone else running
# the notebook has to edit them first.
visual_concepts_path = '/Users/kevinhchon/Documents/11-777/data/'
# Directory that holds one sub-directory of batch*.npz files per dataset split.
data_path = '/Users/kevinhchon/Documents/11-777/11777-GAN-image-description/attr-net/preprocessing'
# Caption JSON files live in the "annotations" folder under visual_concepts_path
# (which already ends with a slash).
annotations_path = '{}annotations'.format(visual_concepts_path)

In [66]:
# Tokens that are dropped from every caption: quote marks, bracket placeholders
# such as -LRB-, and sentence punctuation (list taken from scripts/script_1.py).
punctuations = [
    "''", "'", "``", "`",
    "-LRB-", "-RRB-", "-LCB-", "-RCB-",
    ".", "?", "!", ",", ":", "-", "--", "...", ";",
]


def tokenize_caption(caption):
    """Split a caption into lower-cased word tokens, without punctuation.

    The caption is tokenized with ``word_tokenize``; any token that appears
    verbatim in ``punctuations`` is discarded. The comparison happens *before*
    lower-casing, so only the exact spellings listed above are removed.

    Args:
        caption: the raw caption string.

    Returns:
        A list of lower-cased tokens, in their original order.
    """
    kept_tokens = []
    for raw_token in word_tokenize(caption):
        if raw_token in punctuations:
            continue
        kept_tokens.append(raw_token.lower())
    return kept_tokens

def load_captions(annotations_path, dataset):
    """Load and tokenize the captions of one dataset split, grouped by image.

    Reads ``<annotations_path>/captions_<dataset>2014.json`` and expects a
    top-level ``'annotations'`` list whose entries carry an ``'image_id'`` and
    a ``'caption'``.

    Args:
        annotations_path: directory containing the caption JSON files.
        dataset: split name substituted into the file name (e.g. ``'train'``).

    Returns:
        A ``defaultdict(list)`` mapping each image id to the list of its
        tokenized captions (each one the result of ``tokenize_caption``), in
        file order. Looking up an unknown id yields (and stores) an empty list.

    Raises:
        IOError/OSError: if the annotation file cannot be opened.
        ValueError: if the file is not valid JSON.
    """
    # io.open accepts ``encoding`` on both Python 2 and 3 (it is the builtin
    # ``open`` on Python 3).
    import io

    annotations_json_path = annotations_path + '/captions_{}2014.json'.format(dataset)
    # JSON text is UTF-8; decoding with the platform default encoding can
    # corrupt or reject non-ASCII captions, so be explicit.
    with io.open(annotations_json_path, 'r', encoding='utf-8') as annotations_file:
        annotations = json.load(annotations_file)

    captions = defaultdict(list)
    for annotation in annotations['annotations']:
        captions[annotation['image_id']].append(tokenize_caption(annotation['caption']))
    return captions

def load_batch(batches_path, captions_dict, batch_id):
    """Load one ``batch<batch_id>.npz`` archive and the captions of its images.

    Every array in the archive is stored under a key that parses as an
    integer image id. Entries are returned ordered by that numeric id.

    Args:
        batches_path: directory containing the ``batch*.npz`` files.
        captions_dict: mapping from integer image id to that image's captions.
        batch_id: number substituted into the archive file name.

    Returns:
        ``(data_batch, captions_batch)``: two parallel lists holding, for each
        image id in ascending order, the stored array and
        ``captions_dict[image_id]``.

    Raises:
        IOError/OSError: if the archive does not exist.
        ValueError: if an archive key is not an integer.
        KeyError: if ``captions_dict`` is a plain dict lacking one of the ids.
    """
    batch_npz_path = batches_path + '/batch{}.npz'.format(batch_id)
    # The context manager closes the archive even when a key fails to parse or
    # an array fails to load.
    with np.load(batch_npz_path) as npzfile:
        # Sort the original keys numerically and index with them directly, so
        # keys that do not round-trip through int() (e.g. '007') still resolve.
        keys = sorted(npzfile.keys(), key=int)
        data_batch = [npzfile[key] for key in keys]
    ids = [int(key) for key in keys]
    captions_batch = [captions_dict[id_] for id_ in ids]
    return data_batch, captions_batch

def load_data(data_path, dataset):
    """Load every batch of a dataset split together with its captions.

    Batches are read from ``<data_path>/<dataset>/batch0.npz``,
    ``batch1.npz``, ... and the captions from the module-level
    ``annotations_path`` directory (via ``load_captions``).

    Args:
        data_path: directory holding one sub-directory per dataset split.
        dataset: split name (sub-directory name and caption-file name part).

    Returns:
        ``(data, captions)``: ``data`` is a NumPy array stacking the per-image
        arrays of all batches in batch order; ``captions`` is the parallel
        list of per-image caption lists.

    Raises:
        ValueError: if the split directory contains no ``batch*.npz`` file.
    """
    batches_path = data_path + '/' + dataset
    captions_dict = load_captions(annotations_path, dataset)
    # Count only the batch archives: counting every directory entry makes a
    # stray file (e.g. a macOS .DS_Store) trigger a load of a batch that does
    # not exist.
    num_batches = sum(1 for entry in os.listdir(batches_path)
                      if entry.startswith('batch') and entry.endswith('.npz'))
    if num_batches == 0:
        raise ValueError('no batch*.npz files found in {}'.format(batches_path))

    loaded_batches = [load_batch(batches_path, captions_dict, batch_id)
                      for batch_id in range(num_batches)]
    data_batches, captions_batches = zip(*loaded_batches)
    # Flatten the per-batch lists into one per-image sequence each.
    data = np.asarray(list(chain.from_iterable(data_batches)))
    captions = list(chain.from_iterable(captions_batches))
    return data, captions

In [67]:
def voc(captions_train):
    """Build the vocabulary and the word-to-index lookup from training captions.

    Args:
        captions_train: iterable with one entry per example, each entry being
            an iterable of tokenized captions (sequences of words).

    Returns:
        ``(vocabulary, word_to_ix)``: ``vocabulary`` is the alphabetically
        sorted list of distinct words followed by the special tokens
        ``'<start>'``, ``'<end>'`` and ``'<UNK>'``; ``word_to_ix`` maps every
        vocabulary entry to its position in that list.
    """
    distinct_words = {
        word
        for captions_of_example in captions_train
        for caption_tokens in captions_of_example
        for word in caption_tokens
    }
    # The special tokens go last so they receive the highest indices.
    vocabulary = sorted(distinct_words) + ['<start>', '<end>', '<UNK>']
    word_to_ix = dict(zip(vocabulary, range(len(vocabulary))))
    return vocabulary, word_to_ix

def encode_captions(captions, word_to_ix):
    """Encode tokenized captions as fixed-size arrays of vocabulary indices.

    Each encoded caption starts with ``'<start>'``, continues with the word
    indices (``'<UNK>'`` for words missing from ``word_to_ix``) and is padded
    with ``'<end>'``. Only the first ``num_captions_per_example`` captions of
    every example are used.

    Reads the module-level ``num_captions_per_example`` and
    ``max_len_caption``.

    Args:
        captions: sequence with one entry per example, each entry a sequence of
            at least ``num_captions_per_example`` tokenized captions.
        word_to_ix: mapping from word to index; must contain ``'<start>'``,
            ``'<end>'`` and ``'<UNK>'``.

    Returns:
        ``(encoded_captions, caption_lengths)``: integer arrays of shape
        ``(len(captions), num_captions_per_example, max_len_caption)`` and
        ``(len(captions), num_captions_per_example)``. Each length is the
        number of words plus 2.

    Raises:
        IndexError: if an example has fewer than ``num_captions_per_example``
            captions, or a caption has more than ``max_len_caption - 1`` words.
    """
    unknown_ix = word_to_ix['<UNK>']
    # Size the outputs from the argument itself rather than from the global
    # training array, so the function also works for other splits and after
    # data_train has been filtered.
    num_examples = len(captions)
    encoded_captions = np.full((num_examples, num_captions_per_example, max_len_caption),
                               word_to_ix['<end>'], dtype=int)
    encoded_captions[:, :, 0] = word_to_ix['<start>']
    caption_lengths = np.zeros((num_examples, num_captions_per_example), dtype=int)
    for i, tcaption_list in enumerate(captions):
        for j in range(num_captions_per_example):
            tcaption = tcaption_list[j]
            caption_lengths[i, j] = len(tcaption) + 2 # num words + attrs + <start>
            for k, word in enumerate(tcaption):
                # Position 0 is <start>, so word k lands at k + 1.
                encoded_captions[i, j, k+1] = word_to_ix.get(word, unknown_ix)
    return encoded_captions, caption_lengths

In [214]:
# Display the encoded training captions.
# NOTE(review): `enc_captions_train` is only assigned in a later cell, so on a
# fresh top-to-bottom run this cell raises NameError; the stale execution count
# (In [214]) shows it was run out of order. Move it below that cell or drop it.
enc_captions_train

array([[9576, 1725, 5576, ..., 9577, 9577, 9577],
       [9576,   89, 3565, ..., 9577, 9577, 9577],
       [9576,   89, 3252, ..., 9577, 9577, 9577],
       ..., 
       [9576, 1623, 1118, ..., 9577, 9577, 9577],
       [9576,   89, 4609, ...,  780, 4345, 9577],
       [9576,   89, 4490, ..., 9577, 9577, 9577]])

In [68]:
# Encoding parameters read (as globals) by encode_captions.
num_captions_per_example = 5
max_len_caption = 60

# Load the attribute arrays and the tokenized captions of the training split.
data_train, captions_train = load_data(data_path, 'train')

# Vocabulary is built from the training captions only.
vocabulary, word_to_ix = voc(captions_train)
enc_captions_train, caption_lengths_train = encode_captions(captions_train, word_to_ix)

# Keep one caption per example for now
enc_captions_train = enc_captions_train[:, 0, :]
caption_lengths_train = caption_lengths_train[:, 0]

# Optionally keep only short captions: examples whose recorded length
# (number of words + 2) exceeds max_size_caption are dropped and the encoded
# captions are cut to max_size_caption columns. Set to None to skip.
max_size_caption = 15
if max_size_caption is not None:
    # From here on max_len_caption is the truncated length used by the graph cell.
    max_len_caption = max_size_caption
    captions_to_keep = caption_lengths_train <= max_size_caption
    enc_captions_train = enc_captions_train[captions_to_keep][:, :max_size_caption]
    caption_lengths_train = caption_lengths_train[captions_to_keep]
    # Apply the same mask so attributes and captions stay aligned row by row.
    data_train = data_train[captions_to_keep]

In [69]:
# Sanity check: (number of examples kept by the length filter, max_size_caption).
enc_captions_train.shape

(8475, 15)

In [50]:
# NOTE(review): same shape check as the cell directly above, with a stale
# execution count (In [50]) — a leftover that can presumably be deleted.
enc_captions_train.shape

(8475, 15)

In [70]:
# Width of the embedding space shared by the attribute projection (Ta) and the
# word embeddings (Ts) in the graph cell.
embedding_dim = 10
# Vocabulary size, including the <start>, <end> and <UNK> tokens added by voc().
voc_dim = len(vocabulary)

In [71]:
# Optionally keep only the first new_attr_size attribute columns of each
# example; set to None to keep them all.
new_attr_size = 100
if new_attr_size is not None:
    # Overwrites data_train in place; the original columns are gone until the
    # data is reloaded.
    data_train = data_train[:, :new_attr_size]

In [49]:
# Sanity check: (number of examples, number of attribute columns kept).
# NOTE(review): stale execution count (In [49]); re-run after the cell above.
data_train.shape

(8475, 100)

In [72]:
# Build the caption-generation graph (TensorFlow 1.x graph-mode API).
# Resetting first lets this cell be re-run without tf.get_variable colliding
# with the variables created by the previous run.
tf.reset_default_graph()

# --- Attribute branch: project each attribute vector into the embedding space.
attrs_dim = data_train.shape[1]
attrs = tf.placeholder(tf.float32, [None, attrs_dim])
Ta = tf.get_variable('Ta',
                     shape=[attrs_dim, embedding_dim],
                     initializer=tf.random_normal_initializer())
xm1 = tf.matmul(attrs, Ta) # x^{-1}

# --- Word branch: embed every caption token through a one-hot matmul with Ts.
captions = tf.placeholder(tf.int32, [None, max_len_caption])
caption_1_hot = tf.one_hot(indices=captions,
                           depth=voc_dim,
                           axis=-1)
Ts = tf.get_variable('Ts',
                     shape=[voc_dim, embedding_dim],
                     initializer=tf.random_normal_initializer())

# Flatten (batch, time) so the embedding is a single 2-D matmul, then restore
# the time axis and split it into one [batch, embedding_dim] tensor per step.
c1hr = tf.reshape(caption_1_hot, [-1, voc_dim])
xr = tf.matmul(c1hr, Ts)
x = tf.reshape(xr, [-1, max_len_caption, embedding_dim])
xs = tf.split(x, max_len_caption, axis=1)
xs = [tf.reshape(xsi, [-1, embedding_dim]) for xsi in xs]

# NOTE(review): the cell's output size equals the vocabulary size and its
# outputs are used directly as logits below; a smaller hidden state followed by
# a projection layer is the usual design — confirm this is intended.
lstm_cell = tf.contrib.rnn.BasicLSTMCell(num_units=voc_dim)

# RNN input at step 0 is the attribute embedding, followed by the caption
# embeddings shifted right by one, so the output at step t is scored against
# captions[:, t].
inputs = tf.stack([xm1] + xs[:-1], axis=1)

captions_l = tf.placeholder(tf.int32, [None])
outputs, _ = tf.nn.dynamic_rnn(lstm_cell,
                               inputs=inputs,
                               dtype=tf.float32,
                               sequence_length=captions_l,
                               parallel_iterations=1,
                               swap_memory=True)

# Per-token cross-entropy, averaged over real time steps only. Steps past
# captions_l get all-zero outputs from dynamic_rnn; including them (as a plain
# reduce_mean would) adds a constant log(voc_dim) per padded step to the cost
# and scales the gradient by the amount of padding in the batch.
losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=outputs, labels=captions)
loss_mask = tf.sequence_mask(captions_l, maxlen=max_len_caption, dtype=tf.float32)
# tf.maximum guards against division by zero when every fed length is 0.
cost = tf.reduce_sum(losses * loss_mask) / tf.maximum(tf.reduce_sum(loss_mask), 1.0)
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
train = optimizer.minimize(cost)

In [73]:

# Display the RNN outputs fetched by a training step.
# NOTE(review): `outputs_` is first assigned by the session cell that follows,
# so on a fresh top-to-bottom run this cell raises NameError; the output shown
# was presumably left over from an earlier run.
outputs_

array([[[ 0.00191331,  0.00194587,  0.02540742, ...,  0.0072472 ,
          0.0117728 , -0.01441276],
        [ 0.0095245 ,  0.00363831,  0.01355193, ...,  0.00732302,
          0.00975402, -0.012561  ],
        [ 0.00515647, -0.00172455,  0.00491368, ...,  0.00618521,
          0.00993401, -0.01052327],
        ..., 
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]]], dtype=float32)

In [74]:
# Smoke test: initialise the variables and run one gradient-descent step on
# the first `batch_size` training examples.
batch_size=1

# NOTE(review): the session is never closed in the visible cells; it stays
# open for the lifetime of the kernel.
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
# Fetch the training op together with the cost and the raw RNN outputs so they
# can be inspected in the following cell.
_, cost_, outputs_ = sess.run([train, cost, outputs],
               feed_dict={captions: enc_captions_train[:batch_size],
                          captions_l: caption_lengths_train[:batch_size],
                          attrs: data_train[:batch_size]})

In [75]:
# Inspect the RNN outputs returned by the session run above.
# The trailing all-zero rows are presumably the time steps past the fed
# sequence length, which dynamic_rnn zero-fills — TODO confirm.
outputs_

array([[[-0.00259182,  0.00694182, -0.00249476, ...,  0.00126594,
          0.0134656 , -0.01170486],
        [-0.00453151, -0.00622142, -0.00233123, ...,  0.00464376,
          0.00243244, -0.00309067],
        [-0.00330993, -0.00765449, -0.00609488, ..., -0.00359173,
          0.00063763, -0.01337169],
        ..., 
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]]], dtype=float32)