In [30]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import time
import re

from keras.preprocessing import sequence
import matplotlib.pyplot as plt

video_train_feat_path = './rgb_train_features'
video_train_data_path = './data/video_corpus.csv'
video_test_feat_path = './rgb_test_features'
video_test_data_path = './data/video_corpus.csv'

dim_image = 4096
dim_hidden= 256

n_video_lstm_step = 80
n_caption_lstm_step = 20
n_frame_step = 80

n_epochs = 200
batch_size = 50
learning_rate = 0.0001

def get_video_train_data(video_data_path, video_feat_path):
    video_data = pd.read_csv(video_data_path, sep=',')
    video_data = video_data[video_data['Language'] == 'English']
    video_data['video_path'] = video_data.apply(lambda row: row['VideoID']+'_'+str(int(row['Start']))+'_'+str(int(row['End']))+'.avi.npy', axis=1)
    video_data['video_path'] = video_data['video_path'].map(lambda x: os.path.join(video_feat_path, x))
    video_data = video_data[video_data['video_path'].map(lambda x: os.path.exists( x ))]
    video_data = video_data[video_data['Description'].map(lambda x: isinstance(x, str))]

    unique_filenames = sorted(video_data['video_path'].unique())
    train_data = video_data[video_data['video_path'].map(lambda x: x in unique_filenames)]
    return train_data

def get_video_test_data(video_data_path, video_feat_path):
    video_data = pd.read_csv(video_data_path, sep=',')
    video_data = video_data[video_data['Language'] == 'English']
    video_data['video_path'] = video_data.apply(lambda row: row['VideoID']+'_'+str(int(row['Start']))+'_'+str(int(row['End']))+'.avi.npy', axis=1)
    video_data['video_path'] = video_data['video_path'].map(lambda x: os.path.join(video_feat_path, x))
    video_data = video_data[video_data['video_path'].map(lambda x: os.path.exists( x ))]
    video_data = video_data[video_data['Description'].map(lambda x: isinstance(x, str))]

    unique_filenames = sorted(video_data['video_path'].unique())
    test_data = video_data[video_data['video_path'].map(lambda x: x in unique_filenames)]
    return test_data

def preProBuildWordVocab(sentence_iterator, word_count_threshold=5):
    # borrowed this function from NeuralTalk
    print('preprocessing word counts and creating vocab based on word count threshold ', (word_count_threshold))
    word_counts = {}
    nsents = 0
    for sent in sentence_iterator:
        nsents += 1
        for w in sent.lower().split(' '):
            word_counts[w] = word_counts.get(w, 0) + 1
    vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold]
    print('filtered words from', (len(word_counts), ' to ', len(vocab)))

    ixtoword = {}
    ixtoword[0] = '<pad>'
    ixtoword[1] = '<bos>'
    ixtoword[2] = '<eos>'
    ixtoword[3] = '<unk>'

    wordtoix = {}
    wordtoix['<pad>'] = 0
    wordtoix['<bos>'] = 1
    wordtoix['<eos>'] = 2
    wordtoix['<unk>'] = 3

    for idx, w in enumerate(vocab):
        wordtoix[w] = idx+4
        ixtoword[idx+4] = w

    word_counts['<pad>'] = nsents
    word_counts['<bos>'] = nsents
    word_counts['<eos>'] = nsents
    word_counts['<unk>'] = nsents

    bias_init_vector = np.array([1.0 * word_counts[ ixtoword[i] ] for i in ixtoword])
    bias_init_vector /= np.sum(bias_init_vector) # normalize to frequencies
    bias_init_vector = np.log(bias_init_vector)
    bias_init_vector -= np.max(bias_init_vector) # shift to nice numeric range

    return wordtoix, ixtoword, bias_init_vector

Using TensorFlow backend.
  'Matplotlib is building the font cache using fc-list. '


In [2]:
class Video_Caption_Generator():
    def __init__(self, dim_image, n_words, dim_hidden, batch_size, n_lstm_steps, n_video_lstm_step, n_caption_lstm_step, bias_init_vector=None):
        self.dim_image = dim_image
        self.n_words = n_words
        self.dim_hidden = dim_hidden
        self.batch_size = batch_size
        self.n_lstm_steps = n_lstm_steps
        self.n_video_lstm_step=n_video_lstm_step
        self.n_caption_lstm_step=n_caption_lstm_step

        with tf.device("/cpu:0"):
            self.Wemb = tf.Variable(tf.random_uniform([n_words, dim_hidden], -0.1, 0.1), name='Wemb')
        #self.bemb = tf.Variable(tf.zeros([dim_hidden]), name='bemb')

        #self.lstm1 = tf.nn.rnn_cell.BasicLSTMCell(dim_hidden, state_is_tuple=False)
        #self.lstm2 = tf.nn.rnn_cell.BasicLSTMCell(dim_hidden, state_is_tuple=False)

        self.lstm1 = tf.contrib.rnn.BasicLSTMCell(dim_hidden, state_is_tuple=False)
        self.lstm2 = tf.contrib.rnn.BasicLSTMCell(dim_hidden, state_is_tuple=False)

        self.encode_image_W = tf.Variable( tf.random_uniform([dim_image, dim_hidden], -0.1, 0.1), name='encode_image_W')
        self.encode_image_b = tf.Variable( tf.zeros([dim_hidden]), name='encode_image_b')

        self.embed_word_W = tf.Variable(tf.random_uniform([dim_hidden, n_words], -0.1,0.1), name='embed_word_W')
        if bias_init_vector is not None:
            self.embed_word_b = tf.Variable(bias_init_vector.astype(np.float32), name='embed_word_b')
        else:
            self.embed_word_b = tf.Variable(tf.zeros([n_words]), name='embed_word_b')

    def build_model(self):
        video = tf.placeholder(tf.float32, [self.batch_size, self.n_video_lstm_step, self.dim_image])
        video_mask = tf.placeholder(tf.float32, [self.batch_size, self.n_video_lstm_step])

        caption = tf.placeholder(tf.int32, [self.batch_size, self.n_caption_lstm_step+1])
        caption_mask = tf.placeholder(tf.float32, [self.batch_size, self.n_caption_lstm_step+1])

        video_flat = tf.reshape(video, [-1, self.dim_image])
        image_emb = tf.nn.xw_plus_b( video_flat, self.encode_image_W, self.encode_image_b ) # (batch_size*n_lstm_steps, dim_hidden)
        image_emb = tf.reshape(image_emb, [self.batch_size, self.n_lstm_steps, self.dim_hidden])

        state1 = tf.zeros([self.batch_size, self.lstm1.state_size])
        state2 = tf.zeros([self.batch_size, self.lstm2.state_size])
        padding = tf.zeros([self.batch_size, self.dim_hidden])

        probs = []
        loss = 0.0

        ##############################  Encoding Stage ##################################
        for i in range(0, self.n_video_lstm_step):
            if i > 0:
                tf.get_variable_scope().reuse_variables()

            with tf.variable_scope("LSTM1"):
                output1, state1 = self.lstm1(image_emb[:,i,:], state1)

            with tf.variable_scope("LSTM2"):
                output2, state2 = self.lstm2(tf.concat([padding, output1], 1), state2)

        ############################# Decoding Stage ######################################
        for i in range(0, self.n_caption_lstm_step): ## Phase 2 => only generate captions
            #if i == 0:
            #    current_embed = tf.zeros([self.batch_size, self.dim_hidden])
            #else:
            with tf.device("/cpu:0"):
                current_embed = tf.nn.embedding_lookup(self.Wemb, caption[:, i])

            tf.get_variable_scope().reuse_variables()

            with tf.variable_scope("LSTM1"):
                output1, state1 = self.lstm1(padding, state1)

            with tf.variable_scope("LSTM2"):
                output2, state2 = self.lstm2(tf.concat([current_embed, output1], 1), state2)

            labels = tf.expand_dims(caption[:, i+1], 1)
            indices = tf.expand_dims(tf.range(0, self.batch_size, 1), 1)
            concated = tf.concat([indices, labels], 1)
            onehot_labels = tf.sparse_to_dense(concated, tf.stack([self.batch_size, self.n_words]), 1.0, 0.0)

            logit_words = tf.nn.xw_plus_b(output2, self.embed_word_W, self.embed_word_b)
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logit_words, labels=onehot_labels)
            cross_entropy = cross_entropy * caption_mask[:,i]
            probs.append(logit_words)

            current_loss = tf.reduce_sum(cross_entropy)/self.batch_size
            loss = loss + current_loss

        return loss, video, video_mask, caption, caption_mask, probs

In [41]:


def train():
    
#if True:
    # train_data.shape: (59590, 9), train_captions.shape: (59590,)
    train_data = get_video_train_data(video_train_data_path, video_train_feat_path)
    train_captions = train_data['Description'].values
    # test_data.shape: (4051, 9), test_captions.shape: (4051,)
    test_data = get_video_test_data(video_test_data_path, video_test_feat_path)
    test_captions = test_data['Description'].values

    # len(captions_list) = 63641
    captions_list = list(train_captions) + list(test_captions)
    captions = np.asarray(captions_list, dtype=np.object)

    #captions = map(lambda x: x.replace('.', ''), captions)
    #captions = map(lambda x: x.replace(',', ''), captions)
    #captions = map(lambda x: x.replace('"', ''), captions)
    #captions = map(lambda x: x.replace('\n', ''), captions)
    #captions = map(lambda x: x.replace('?', ''), captions)
    #captions = map(lambda x: x.replace('!', ''), captions)
    #captions = map(lambda x: x.replace('\\', ''), captions)
    #captions = map(lambda x: x.replace('/', ''), captions)
    captions = [re.sub('[\",.;?!%”“]\n\\/', '', x.lower()) for x in captions]
    

    # len(wordtoix): 11720, len(ixtoword): 11720, bias_init_vector.shape: (11720,)
    wordtoix, ixtoword, bias_init_vector = preProBuildWordVocab(captions, word_count_threshold=0)
    np.save("./data/wordtoix", wordtoix)
    np.save('./data/ixtoword', ixtoword)
    np.save("./data/bias_init_vector", bias_init_vector)

    model = Video_Caption_Generator(dim_image=dim_image,                     # 4096
                                    n_words=len(wordtoix),                   # 11720
                                    dim_hidden=dim_hidden,                   # 1000
                                    batch_size=batch_size,                   # 50
                                    n_lstm_steps=n_frame_step,               # 80
                                    n_video_lstm_step=n_video_lstm_step,     # 80
                                    n_caption_lstm_step=n_caption_lstm_step, # 20
                                    bias_init_vector=bias_init_vector)       # bias_init_vector.shape: (11720,)
    
    tf_loss, tf_video, tf_video_mask, tf_caption, tf_caption_mask, tf_probs = model.build_model()
    sess = tf.InteractiveSession()

    # my tensorflow version is 0.12.1, I write the saver with version 1.0
    saver = tf.train.Saver(max_to_keep=100, write_version=1)
    with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE) as scope:
        #assert tf.get_variable_scope().reuse == True 
        train_op = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss)
    tf.global_variables_initializer().run()
    loss_fd = open('loss.txt', 'w')
    loss_to_draw = []
    
    #################################
    for epoch in range(0, n_epochs):
        loss_to_draw_epoch = []

        index = list(train_data.index)
        np.random.shuffle(index)
        train_data = train_data.ix[index]

        current_train_data = train_data.groupby('video_path').apply(lambda x: x.iloc[np.random.choice(len(x))])
        current_train_data = current_train_data.reset_index(drop=True)

        for start, end in zip(
                range(0, len(current_train_data), batch_size),
                range(batch_size, len(current_train_data), batch_size)):

            start_time = time.time()

            current_batch = current_train_data[start:end]
            current_videos = current_batch['video_path'].values

            current_feats = np.zeros((batch_size, n_video_lstm_step, dim_image))
            #current_feats_vals = map(lambda vid: np.load(vid), current_videos)
            current_feats_vals = [np.load(vid) for vid in current_videos]

            current_video_masks = np.zeros((batch_size, n_video_lstm_step))

            for ind,feat in enumerate(current_feats_vals):
                
                current_feats[ind][:len(list(current_feats_vals)[ind])] = feat
                current_video_masks[ind][:len(list(current_feats_vals)[ind])] = 1

            current_captions = current_batch['Description'].values
            #current_captions = map(lambda x: '<bos> ' + x, current_captions)
            #current_captions = map(lambda x: x.replace('.', ''), current_captions)
            #current_captions = map(lambda x: x.replace(',', ''), current_captions)
            #current_captions = map(lambda x: x.replace('"', ''), current_captions)
            #current_captions = map(lambda x: x.replace('\n', ''), current_captions)
            #current_captions = map(lambda x: x.replace('?', ''), current_captions)
            #current_captions = map(lambda x: x.replace('!', ''), current_captions)
            #current_captions = map(lambda x: x.replace('\\', ''), current_captions)
            #current_captions = map(lambda x: x.replace('/', ''), current_captions)
 
            current_captions = ['<bos> ' + x for x in current_captions]
            current_captions = [re.sub('[\",.;?!%”“]\n\\/', '', x.lower()) for x in current_captions]
            current_captions = [x.replace(' 2 ', ' two ') for x in current_captions]
            current_captions = [x.replace(' 3 ', ' three ') for x in current_captions]
            current_captions = [x.replace(' 4 ', ' four ') for x in current_captions]
            current_captions = [x.replace(' 5 ', ' five ') for x in current_captions]
            current_captions = [x.replace(' 6 ', ' six ') for x in current_captions]


            for idx, each_cap in enumerate(current_captions):
                word = each_cap.lower().split(' ')
                if len(word) < n_caption_lstm_step:
                    current_captions[idx] = current_captions[idx] + ' <eos>'
                else:
                    new_word = ''
                    for i in range(n_caption_lstm_step-1):
                        new_word = new_word + word[i] + ' '
                    current_captions[idx] = new_word + '<eos>'

            current_caption_ind = []
            for cap in current_captions:
                current_word_ind = []
                for word in cap.lower().split(' '):
                    if word in wordtoix:
                        current_word_ind.append(wordtoix[word])
                    else:
                        current_word_ind.append(wordtoix['<unk>'])
                current_caption_ind.append(current_word_ind)

            current_caption_matrix = sequence.pad_sequences(current_caption_ind, padding='post', maxlen=n_caption_lstm_step)
            current_caption_matrix = np.hstack( [current_caption_matrix, np.zeros( [len(current_caption_matrix), 1] ) ] ).astype(int)
            current_caption_masks = np.zeros( (current_caption_matrix.shape[0], current_caption_matrix.shape[1]) )
            #nonzeros = np.array( map(lambda x: (x != 0).sum() + 1, current_caption_matrix ) )
            nonzeros = np.array( [(x != 0).sum() + 1 for x in current_caption_matrix ])
                                
            for ind, row in enumerate(current_caption_masks):
                row[:nonzeros[ind]] = 1

            probs_val = sess.run(tf_probs, feed_dict={
                tf_video:current_feats,
                tf_caption: current_caption_matrix
                })

            _, loss_val = sess.run(
                    [train_op, tf_loss],
                    feed_dict={
                        tf_video: current_feats,
                        tf_video_mask : current_video_masks,
                        tf_caption: current_caption_matrix,
                        tf_caption_mask: current_caption_masks
                        })
            loss_to_draw_epoch.append(loss_val)

            print('idx: ', start, " Epoch: ", epoch, " loss: ", loss_val, ' Elapsed time: ', str((time.time() - start_time)))
            loss_fd.write('epoch ' + str(epoch) + ' loss ' + str(loss_val) + '\n')

        # draw loss curve every epoch
        loss_to_draw.append(np.mean(loss_to_draw_epoch))
        plt_save_dir = "./loss_imgs"
        plt_save_img_name = str(epoch) + '.png'
        plt.plot(range(len(loss_to_draw)), loss_to_draw, color='g')
        plt.grid(True)
        plt.savefig(os.path.join(plt_save_dir, plt_save_img_name))

        if np.mod(epoch, 10) == 0:
            print("Epoch ", epoch, " is done. Saving the model ...")
            saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch)
    #################################
    loss_fd.close()

In [32]:
def test(model_path='./models/model-100'):
    test_data = get_video_test_data(video_test_data_path, video_test_feat_path)
    #test_videos = test_data['video_path'].unique()
    test_videos = np.array(['./rgb_test_features/klteYv1Uv9A_27_33.avi.npy', 
                        './rgb_test_features/5YJaS2Eswg0_22_26.avi.npy',
                        './rgb_test_features/UbmZAe5u5FI_132_141.avi.npy',
                        './rgb_test_features/JntMAcTlOF0_50_70.avi.npy',
                        './rgb_test_features/tJHUH9tpqPg_113_118.avi.npy',])

    ixtoword = pd.Series(np.load('./data/ixtoword.npy').tolist())

    bias_init_vector = np.load('./data/bias_init_vector.npy')

    model = Video_Caption_Generator(
            dim_image=dim_image,
            n_words=len(ixtoword),
            dim_hidden=dim_hidden,
            batch_size=batch_size,
            n_lstm_steps=n_frame_step,
            n_video_lstm_step=n_video_lstm_step,
            n_caption_lstm_step=n_caption_lstm_step,
            bias_init_vector=bias_init_vector)

    video_tf, video_mask_tf, caption_tf, probs_tf, last_embed_tf = model.build_generator()

    sess = tf.InteractiveSession()

    saver = tf.train.Saver()
    saver.restore(sess, model_path)

    test_output_txt_fd = open('S2VT_results.txt', 'w')
    for idx, video_feat_path in enumerate(test_videos):
        print(idx, video_feat_path)

        video_feat = np.load(video_feat_path)[None,...]
        #video_feat = np.load(video_feat_path)
        #video_mask = np.ones((video_feat.shape[0], video_feat.shape[1]))
        if video_feat.shape[1] == n_frame_step:
            video_mask = np.ones((video_feat.shape[0], video_feat.shape[1]))
        else:
            continue
            #shape_templete = np.zeros(shape=(1, n_frame_step, 4096), dtype=float )
            #shape_templete[:video_feat.shape[0], :video_feat.shape[1], :video_feat.shape[2]] = video_feat
            #video_feat = shape_templete
            #video_mask = np.ones((video_feat.shape[0], n_frame_step))

        generated_word_index = sess.run(caption_tf, feed_dict={video_tf:video_feat, video_mask_tf:video_mask})
        generated_words = ixtoword[generated_word_index]

        punctuation = np.argmax(np.array(generated_words) == '<eos>') + 1
        generated_words = generated_words[:punctuation]

        generated_sentence = ' '.join(generated_words)
        generated_sentence = generated_sentence.replace('<bos> ', '')
        generated_sentence = generated_sentence.replace(' <eos>', '')
        print(generated_sentence,'\n')
        test_output_txt_fd.write(video_feat_path + '\n')
        test_output_txt_fd.write(generated_sentence + '\n\n')

In [42]:
sta = time.time()
train()
end = time.time()
t = end - sta
print('time:', t/3600, 'hour', (t%3600)/60, 'min', (t%3600)%60, 'sec')

preprocessing word counts and creating vocab based on word count threshold  0
filtered words from (15351, ' to ', 15351)


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix


ResourceExhaustedError: OOM when allocating tensor with shape[50,80,256]
	 [[Node: gradients_7/LSTM1_706/strided_slice_grad/StridedSliceGrad = StridedSliceGrad[Index=DT_INT32, T=DT_FLOAT, begin_mask=5, ellipsis_mask=0, end_mask=5, new_axis_mask=0, shrink_axis_mask=2, _device="/job:localhost/replica:0/task:0/device:GPU:0"](gradients_7/LSTM1_706/strided_slice_grad/Shape, LSTM1_706/strided_slice/stack, LSTM1_706/strided_slice/stack_1, LSTM1_706/strided_slice/stack_2, gradients_7/LSTM1_700/LSTM1/basic_lstm_cell/concat_12_grad/tuple/control_dependency)]]

Caused by op 'gradients_7/LSTM1_706/strided_slice_grad/StridedSliceGrad', defined at:
  File "/home/bonzo/anaconda3/envs/keras2.0/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/bonzo/anaconda3/envs/keras2.0/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/bonzo/anaconda3/envs/keras2.0/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/bonzo/anaconda3/envs/keras2.0/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/bonzo/anaconda3/envs/keras2.0/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 477, in start
    ioloop.IOLoop.instance().start()
  File "/home/bonzo/anaconda3/envs/keras2.0/lib/python3.6/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/home/bonzo/anaconda3/envs/keras2.0/lib/python3.6/site-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/home/bonzo/anaconda3/envs/keras2.0/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/bonzo/anaconda3/envs/keras2.0/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/home/bonzo/anaconda3/envs/keras2.0/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/bonzo/anaconda3/envs/keras2.0/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/home/bonzo/anaconda3/envs/keras2.0/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/bonzo/anaconda3/envs/keras2.0/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/bonzo/anaconda3/envs/keras2.0/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 235, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/bonzo/anaconda3/envs/keras2.0/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/home/bonzo/anaconda3/envs/keras2.0/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/bonzo/anaconda3/envs/keras2.0/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/bonzo/anaconda3/envs/keras2.0/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2717, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/bonzo/anaconda3/envs/keras2.0/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2821, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/bonzo/anaconda3/envs/keras2.0/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-42-8eb72a3b23a1>", line 2, in <module>
    train()
  File "<ipython-input-41-8fbd31500797>", line 50, in train
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss)
  File "/home/bonzo/anaconda3/envs/keras2.0/lib/python3.6/site-packages/tensorflow/python/training/optimizer.py", line 343, in minimize
    grad_loss=grad_loss)
  File "/home/bonzo/anaconda3/envs/keras2.0/lib/python3.6/site-packages/tensorflow/python/training/optimizer.py", line 414, in compute_gradients
    colocate_gradients_with_ops=colocate_gradients_with_ops)
  File "/home/bonzo/anaconda3/envs/keras2.0/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py", line 581, in gradients
    grad_scope, op, func_call, lambda: grad_fn(op, *out_grads))
  File "/home/bonzo/anaconda3/envs/keras2.0/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py", line 353, in _MaybeCompile
    return grad_fn()  # Exit early
  File "/home/bonzo/anaconda3/envs/keras2.0/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py", line 581, in <lambda>
    grad_scope, op, func_call, lambda: grad_fn(op, *out_grads))
  File "/home/bonzo/anaconda3/envs/keras2.0/lib/python3.6/site-packages/tensorflow/python/ops/array_grad.py", line 245, in _StridedSliceGrad
    shrink_axis_mask=op.get_attr("shrink_axis_mask")), None, None, None
  File "/home/bonzo/anaconda3/envs/keras2.0/lib/python3.6/site-packages/tensorflow/python/ops/gen_array_ops.py", line 5572, in strided_slice_grad
    shrink_axis_mask=shrink_axis_mask, name=name)
  File "/home/bonzo/anaconda3/envs/keras2.0/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/home/bonzo/anaconda3/envs/keras2.0/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 2956, in create_op
    op_def=op_def)
  File "/home/bonzo/anaconda3/envs/keras2.0/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1470, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

...which was originally created as op 'LSTM1_706/strided_slice', defined at:
  File "/home/bonzo/anaconda3/envs/keras2.0/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
[elided 19 identical lines from previous traceback]
  File "<ipython-input-42-8eb72a3b23a1>", line 2, in <module>
    train()
  File "<ipython-input-41-8fbd31500797>", line 43, in train
    tf_loss, tf_video, tf_video_mask, tf_caption, tf_caption_mask, tf_probs = model.build_model()
  File "<ipython-input-2-f5efa2e25a88>", line 54, in build_model
    output1, state1 = self.lstm1(image_emb[:,i,:], state1)
  File "/home/bonzo/anaconda3/envs/keras2.0/lib/python3.6/site-packages/tensorflow/python/ops/array_ops.py", line 538, in _SliceHelper
    name=name)
  File "/home/bonzo/anaconda3/envs/keras2.0/lib/python3.6/site-packages/tensorflow/python/ops/array_ops.py", line 706, in strided_slice
    shrink_axis_mask=shrink_axis_mask)
  File "/home/bonzo/anaconda3/envs/keras2.0/lib/python3.6/site-packages/tensorflow/python/ops/gen_array_ops.py", line 5430, in strided_slice
    name=name)
  File "/home/bonzo/anaconda3/envs/keras2.0/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/home/bonzo/anaconda3/envs/keras2.0/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 2956, in create_op
    op_def=op_def)
  File "/home/bonzo/anaconda3/envs/keras2.0/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1470, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[50,80,256]
	 [[Node: gradients_7/LSTM1_706/strided_slice_grad/StridedSliceGrad = StridedSliceGrad[Index=DT_INT32, T=DT_FLOAT, begin_mask=5, ellipsis_mask=0, end_mask=5, new_axis_mask=0, shrink_axis_mask=2, _device="/job:localhost/replica:0/task:0/device:GPU:0"](gradients_7/LSTM1_706/strided_slice_grad/Shape, LSTM1_706/strided_slice/stack, LSTM1_706/strided_slice/stack_1, LSTM1_706/strided_slice/stack_2, gradients_7/LSTM1_700/LSTM1/basic_lstm_cell/concat_12_grad/tuple/control_dependency)]]


In [None]:
test()

In [31]:
dim_hidden

256

In [7]:
current_video_masks[ind][:len(list(current_feats_vals)[ind])] = 1

IndexError: list index out of range

In [9]:
len(list(current_feats_vals)[ind])

IndexError: list index out of range

In [10]:
list(current_feats_vals)

[]

In [23]:
current_feats_vals = map(lambda vid: np.load(vid), current_videos)
#current_feats_vals[0]

In [21]:
map(lambda vid: np.load(vid), current_videos)

<map at 0x7fac25e0aac8>

In [26]:
list(current_captions)

['<bos> A person stabs a photo print with a knife',
 '<bos> okra is cooking in boiling water',
 '<bos> a boy playing music',
 '<bos> A female swimmer pushes off the side of pool underwater',
 '<bos> A baby husky is in a cage with a larger husky that is walking around and sniffing it',
 '<bos> A woman plays a flute',
 '<bos> A young woman is exercising on a mat',
 '<bos> the woman is cooking',
 '<bos> A woman slices a zucchini',
 '<bos> A man cuts a cucumber with skin into two and thereafter slices it finely',
 '<bos> A man is cutting a cucumber',
 '<bos> A woman pours a beaten egg mixture into a hot pan and swirls the pan to spread it evenly',
 '<bos> A person takes meat and vegetables out of a pot',
 '<bos> the women is cooking something',
 '<bos> The lady made the sushi look like an octopus',
 '<bos> A woman is slicing some Bentos',
 '<bos> A carrot is chopped up',
 '<bos> the cooking with dog',
 '<bos> A woman mixes a beauty treatment in a glass',
 '<bos> People are running in a rac

In [36]:
current_caption_masks.shape

(50, 21)

In [37]:
row[:nonzeros[ind]]

IndexError: too many indices for array

In [38]:
nonzeros[ind]

IndexError: too many indices for array

In [39]:
nonzeros

array(<map object at 0x7fac0fe55080>, dtype=object)