In [1]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import time
import re

from keras.preprocessing import sequence
import matplotlib.pyplot as plt

video_train_feat_path = './rgb_train_features'
video_train_data_path = './data/video_corpus.csv'
video_test_feat_path = './rgb_test_features'
video_test_data_path = './data/video_corpus.csv'
model_path = './models'

dim_image = 4096
dim_hidden= 128

n_video_lstm_step = 80
n_caption_lstm_step = 40
n_frame_step = 80

n_epochs = 200
#n_epochs = 2

batch_size = 50
learning_rate = 0.0001
"""
def get_video_train_data(video_data_path, video_feat_path):
    video_data = pd.read_csv(video_data_path, sep=',')
    video_data = video_data[video_data['Language'] == 'English']
    video_data['video_path'] = video_data.apply(lambda row: row['VideoID']+'_'+str(int(row['Start']))+'_'+str(int(row['End']))+'.avi.npy', axis=1)
    video_data['video_path'] = video_data['video_path'].map(lambda x: os.path.join(video_feat_path, x))
    video_data = video_data[video_data['video_path'].map(lambda x: os.path.exists( x ))]
    video_data = video_data[video_data['Description'].map(lambda x: isinstance(x, str))]

    unique_filenames = sorted(video_data['video_path'].unique())
    train_data = video_data[video_data['video_path'].map(lambda x: x in unique_filenames)]
    return train_data

def get_video_test_data(video_data_path, video_feat_path):
    video_data = pd.read_csv(video_data_path, sep=',')
    video_data = video_data[video_data['Language'] == 'English']
    video_data['video_path'] = video_data.apply(lambda row: row['VideoID']+'_'+str(int(row['Start']))+'_'+str(int(row['End']))+'.avi.npy', axis=1)
    video_data['video_path'] = video_data['video_path'].map(lambda x: os.path.join(video_feat_path, x))
    video_data = video_data[video_data['video_path'].map(lambda x: os.path.exists( x ))]
    video_data = video_data[video_data['Description'].map(lambda x: isinstance(x, str))]

    unique_filenames = sorted(video_data['video_path'].unique())
    test_data = video_data[video_data['video_path'].map(lambda x: x in unique_filenames)]
    return test_data
"""
def preProBuildWordVocab(sentence_iterator, word_count_threshold=5):
    # borrowed this function from NeuralTalk
    print('preprocessing word counts and creating vocab based on word count threshold ', (word_count_threshold))
    word_counts = {}
    nsents = 0
    for sent in sentence_iterator:
        nsents += 1
        for w in sent.lower().split(' '):
            word_counts[w] = word_counts.get(w, 0) + 1
    vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold]
    print('filtered words from', (len(word_counts), ' to ', len(vocab)))

    ixtoword = {}
    ixtoword[0] = '<pad>'
    ixtoword[1] = '<bos>'
    ixtoword[2] = '<eos>'
    #ixtoword[3] = '<unk>'

    wordtoix = {}
    wordtoix['<pad>'] = 0
    wordtoix['<bos>'] = 1
    wordtoix['<eos>'] = 2
    #wordtoix['<unk>'] = 3

    for idx, w in enumerate(vocab):
        if w not in ['<pad>', '<bos>', '<eos>']:
            wordtoix[w] = idx+3
            ixtoword[idx+3] = w

    word_counts['<pad>'] = nsents
    #word_counts['<bos>'] = nsents
    #word_counts['<eos>'] = nsents
    #word_counts['<unk>'] = nsents

    bias_init_vector = np.array([1.0 * word_counts[ ixtoword[i] ] for i in ixtoword])
    bias_init_vector /= np.sum(bias_init_vector) # normalize to frequencies
    bias_init_vector = np.log(bias_init_vector)
    bias_init_vector -= np.max(bias_init_vector) # shift to nice numeric range

    return wordtoix, ixtoword, bias_init_vector

  return f(*args, **kwds)
Using TensorFlow backend.


In [2]:
class Video_Caption_Generator():
    def __init__(self, dim_image, n_words, dim_hidden, batch_size, n_lstm_steps, n_video_lstm_step, n_caption_lstm_step, bias_init_vector=None):
        self.dim_image = dim_image                     # 4096
        self.n_words = n_words                         # 6900
        self.dim_hidden = dim_hidden                   # 256
        self.batch_size = batch_size                   # 100
        self.n_lstm_steps = n_lstm_steps               # 80
        self.n_video_lstm_step=n_video_lstm_step       # 80
        self.n_caption_lstm_step=n_caption_lstm_step   # 40

        #with tf.device("/cpu:0"):
        with tf.device("/device:GPU:0"):              # 6900 x 256 
            self.Wemb = tf.Variable(tf.random_uniform([n_words, dim_hidden], -0.1, 0.1), name='Wemb')

                                                  # 256
        self.lstm1 = tf.contrib.rnn.BasicLSTMCell(dim_hidden, state_is_tuple=False)
        self.lstm2 = tf.contrib.rnn.BasicLSTMCell(dim_hidden, state_is_tuple=False)
                                                             # 4096 x 256 
        self.encode_image_W = tf.Variable( tf.random_uniform([dim_image, dim_hidden], -0.1, 0.1), name='encode_image_W')
        self.encode_image_b = tf.Variable( tf.zeros([dim_hidden]), name='encode_image_b')
                                                          # 256 x 6900
        self.embed_word_W = tf.Variable(tf.random_uniform([dim_hidden, n_words], -0.1,0.1), name='embed_word_W')
        if bias_init_vector is not None:
            self.embed_word_b = tf.Variable(bias_init_vector.astype(np.float32), name='embed_word_b')
        else:                                        # 6900
            self.embed_word_b = tf.Variable(tf.zeros([n_words]), name='embed_word_b')

    def build_model(self):                 # 50 x 80 x 4096
        video = tf.placeholder(tf.float32, [self.batch_size, self.n_video_lstm_step, self.dim_image])
        video_mask = tf.placeholder(tf.float32, [self.batch_size, self.n_video_lstm_step])
                                           # 50 x 40+1
        caption = tf.placeholder(tf.int32, [self.batch_size, self.n_caption_lstm_step+1])
        caption_mask = tf.placeholder(tf.float32, [self.batch_size, self.n_caption_lstm_step+1])
                                       # (50*80) x 4096
        video_flat = tf.reshape(video, [-1, self.dim_image])
        image_emb = tf.nn.xw_plus_b( video_flat, self.encode_image_W, self.encode_image_b ) # (batch_size*n_lstm_steps, dim_hidden)
                                          # 50
        image_emb = tf.reshape(image_emb, [self.batch_size, self.n_lstm_steps, self.dim_hidden])
                          # 100 x 
        state1 = tf.zeros([self.batch_size, self.lstm1.state_size])
        state2 = tf.zeros([self.batch_size, self.lstm2.state_size])
        padding = tf.zeros([self.batch_size, self.dim_hidden])

        probs = []
        loss = 0.0

        ##############################  Encoding Stage ##################################
        for i in range(0, self.n_video_lstm_step):
            if i > 0:
                tf.get_variable_scope().reuse_variables()

            with tf.variable_scope("LSTM1"):
            #with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
                output1, state1 = self.lstm1(image_emb[:,i,:], state1)

            with tf.variable_scope("LSTM2"):
            #with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
                output2, state2 = self.lstm2(tf.concat([padding, output1], 1), state2)

        ############################# Decoding Stage ######################################
        for i in range(0, self.n_caption_lstm_step): ## Phase 2 => only generate captions
            #if i == 0:
            #    current_embed = tf.zeros([self.batch_size, self.dim_hidden])
            #else:
            #with tf.device("/cpu:0"):
            with tf.device("/device:GPU:0"):
                current_embed = tf.nn.embedding_lookup(self.Wemb, caption[:, i])

            tf.get_variable_scope().reuse_variables()

            with tf.variable_scope("LSTM1"):
            #with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
                output1, state1 = self.lstm1(padding, state1)

            with tf.variable_scope("LSTM2"):
            #with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
                output2, state2 = self.lstm2(tf.concat([current_embed, output1], 1), state2)

            labels = tf.expand_dims(caption[:, i+1], 1)
            indices = tf.expand_dims(tf.range(0, self.batch_size, 1), 1)
            concated = tf.concat([indices, labels], 1)
            onehot_labels = tf.sparse_to_dense(concated, tf.stack([self.batch_size, self.n_words]), 1.0, 0.0)

            logit_words = tf.nn.xw_plus_b(output2, self.embed_word_W, self.embed_word_b)
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logit_words, labels=onehot_labels)
            cross_entropy = cross_entropy * caption_mask[:,i]
            probs.append(logit_words)

            current_loss = tf.reduce_sum(cross_entropy)/self.batch_size
            loss = loss + current_loss

        return loss, video, video_mask, caption, caption_mask, probs
    

    def build_generator(self):
        video = tf.placeholder(tf.float32, [1, self.n_video_lstm_step, self.dim_image])
        video_mask = tf.placeholder(tf.float32, [1, self.n_video_lstm_step])

        video_flat = tf.reshape(video, [-1, self.dim_image])
        image_emb = tf.nn.xw_plus_b(video_flat, self.encode_image_W, self.encode_image_b)
        image_emb = tf.reshape(image_emb, [1, self.n_video_lstm_step, self.dim_hidden])

        state1 = tf.zeros([1, self.lstm1.state_size])
        state2 = tf.zeros([1, self.lstm2.state_size])
        padding = tf.zeros([1, self.dim_hidden])

        generated_words = []

        probs = []
        embeds = []

        for i in range(0, self.n_video_lstm_step):
            if i > 0:
                tf.get_variable_scope().reuse_variables()

            with tf.variable_scope("LSTM1"):
            #with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
                output1, state1 = self.lstm1(image_emb[:, i, :], state1)

            with tf.variable_scope("LSTM2"):
            #with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
                output2, state2 = self.lstm2(tf.concat([padding, output1], 1), state2)

        for i in range(0, self.n_caption_lstm_step):
            tf.get_variable_scope().reuse_variables()
            
            
            print('i=',i)
            if i == 0:
                #with tf.device('/cpu:0'):
                with tf.device("/device:GPU:0"):
                    current_embed = tf.nn.embedding_lookup(self.Wemb, tf.ones([1], dtype=tf.int64))

            with tf.variable_scope("LSTM1"):
            #with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
                output1, state1 = self.lstm1(padding, state1)

            with tf.variable_scope("LSTM2"):
            #with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
                output2, state2 = self.lstm2(tf.concat([current_embed, output1], 1), state2)

            logit_words = tf.nn.xw_plus_b( output2, self.embed_word_W, self.embed_word_b)
            max_prob_index = tf.argmax(logit_words, 1)[0]
            generated_words.append(max_prob_index)
            probs.append(logit_words)

            #with tf.device("/cpu:0"):
            with tf.device("/device:GPU:0"):
                current_embed = tf.nn.embedding_lookup(self.Wemb, max_prob_index)
                current_embed = tf.expand_dims(current_embed, 0)

            embeds.append(current_embed)

        return video, video_mask, generated_words, probs, embeds

In [None]:
var_history = dict()
def print_all_vars(msg):
    global var_history
    #print(msg)
    
    var_history[msg]= tf.global_variables()
    #print('[')
    #for a in tf.global_variables():
        #print(a)
    #print(']')  

## Training

In [None]:
import numpy as np
import pandas as pd
import pickle
import glob
import random

data_dir = './MLDS_hw2_data/'
test_out_dir = './test_out_dir/'
peer_out_dir = './peer_out_dir/'

train_feat_list = pd.Series(glob.glob(data_dir+'training_data/feat/*'))
test_feat_list = pd.Series(glob.glob(data_dir+'testing_data/feat/*'))
print(len(train_feat_list), 'training set,', len(test_feat_list), 'testing set')


import json
print('loading training label ...')
train_label_json = json.load(open(data_dir+'training_label.json'))
print('loading training id ...')
training_id = pd.Series([x.split('/')[-1].replace('.npy','') for x in train_feat_list])

print('loading testing label ...')
test_label_json = json.load(open(data_dir+'testing_label.json'))
print('loading testing id ...')
testing_id = pd.Series([x.split('/')[-1].replace('.npy','') for x in test_feat_list])


    
import re

#caption_words = set()
#caption_dict = {'2':'two', '3':'three', '4':'four', '5':'five', '6':'six'}
train_caption_dict = dict()
test_caption_dict = dict()
captions = list()
#words = set()
for tj in train_label_json:
    tj_caption = list()
    for c in tj['caption']:
        c = '<bos> '+c+' <eos>'
        c = c.replace('one hundred', '100')
        c = c.replace('two hundred', '200')

            
        s = [re.sub('[\",.;?!%”“()]', '', s.lower()) for s in c.split(' ') if re.sub('[\",.;?!%”“]', '', s.lower()) ]
        s = [si.replace('2','two').replace('3','three').replace('4','four').replace('5','five').replace('6','six').replace('three/four','3/4') for si in s]
        #words |= set(s)
        captions.append(' '.join(s))
        tj_caption.append(' '.join(s))
    else:
        train_caption_dict[tj['id']] = tj_caption
        del tj_caption
        
for tj in test_label_json:
    tj_caption = list()
    for c in tj['caption']:
        c = c.replace('one hundred', '100')
        c = c.replace('two hundred', '200')

            
        s = [re.sub('[\",.;?!%”“()]', '', s.lower()) for s in c.split(' ') if re.sub('[\",.;?!%”“]', '', s.lower()) ]
        s = [si.replace('2','two').replace('3','three').replace('4','four').replace('5','five').replace('6','six').replace('three/four','3/4') for si in s]
        #words |= set(s)
        captions.append(' '.join(s))
        tj_caption.append(' '.join(s))
    else:
        test_caption_dict[tj['id']] = tj_caption
        del tj_caption


print('loading training features ...')
train_data = list()
for train_feat in train_feat_list:
    train_data.append(np.load(train_feat))
else:
    train_data = np.array(train_data)
    print('train_data.shape:', train_data.shape, 'train_data.dtype:', train_data.dtype)
    print('pickling train_data')
    pickle.dump(train_data, open('train_data.pkl', 'wb'))
    print('done!')
    
    
print('loading testing features ...')
test_data = list()
for test_feat in test_feat_list:
    test_data.append(np.load(test_feat))
else:
    test_data = np.array(test_data)
    print('test_data.shape:', test_data.shape, 'test_data.dtype:', test_data.dtype)
    print('pickling test_data')
    pickle.dump(test_data, open('test_data.pkl', 'wb'))
    print('done!')

1450 training set, 100 testing set
loading training label ...
loading training id ...
loading testing label ...
loading testing id ...
loading training features ...


In [None]:
#def train():
## training phase
if True:
    print_all_vars('before training')
    
    wordtoix, ixtoword, bias_init_vector = preProBuildWordVocab(captions, word_count_threshold=0)
    np.save("./data/wordtoix", wordtoix)
    np.save('./data/ixtoword', ixtoword)
    np.save("./data/bias_init_vector", bias_init_vector)
    
    model = Video_Caption_Generator(dim_image=dim_image,                     # 4096
                                    n_words=len(wordtoix),                   # 11720
                                    dim_hidden=dim_hidden,                   # 1000
                                    batch_size=batch_size,                   # 50
                                    n_lstm_steps=n_frame_step,               # 80
                                    n_video_lstm_step=n_video_lstm_step,     # 80
                                    n_caption_lstm_step=n_caption_lstm_step, # 20
                                    bias_init_vector=bias_init_vector)       # bias_init_vector.shape: (11720,)
    
    print_all_vars('training phase after Video_Caption_Generator()')
    tf_loss, tf_video, tf_video_mask, tf_caption, tf_caption_mask, tf_probs = model.build_model()
    sess = tf.InteractiveSession()

    saver = tf.train.Saver(max_to_keep=200)
    with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE) as scope:
        train_op = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss)

    init_op = tf.global_variables_initializer()
    sess.run(init_op)
    
    loss_fd = open('loss.txt', 'w')
    loss_to_draw = []
    
    #################################
    for epoch in range(1, n_epochs+1):
        loss_to_draw_epoch = []
        index = list(range(train_data.shape[0]))
        
        np.random.shuffle(index)
        train_data_shuffle = np.copy(train_data[index])
        training_id_shuffle = pd.Series.copy(training_id[index])
        
        caption_shuffle = [random.choice(train_caption_dict[i]) for i in training_id_shuffle ]

        for start in range(0, 1450, batch_size):
            end = start + batch_size

                
            start_time = time.time()
            
            current_feats = train_data_shuffle[start:end]
            current_video_masks = np.ones((batch_size, n_video_lstm_step))
            current_captions = caption_shuffle[start:end]
            current_caption_ind = [[wordtoix[ci] for ci in c.split(' ')] for c in current_captions]

            current_caption_matrix = sequence.pad_sequences(current_caption_ind, padding='post', maxlen=n_caption_lstm_step)
            current_caption_matrix = np.hstack( [current_caption_matrix, np.zeros( [len(current_caption_matrix), 1] ) ] ).astype(int)
            current_caption_masks = np.zeros(current_caption_matrix.shape)
            nonzeros = np.array( [(x != 0).sum() + 1 for x in current_caption_matrix ])
                                
            for ind, row in enumerate(current_caption_masks):
                row[:nonzeros[ind]] = 1

            #print(current_caption_matrix.shape)
            probs_val = sess.run(tf_probs, 
                                 feed_dict={tf_video:current_feats,
                                            tf_caption: current_caption_matrix})
            _, loss_val = sess.run([train_op, tf_loss], 
                                   feed_dict={tf_video: current_feats,
                                              tf_video_mask : current_video_masks,
                                              tf_caption: current_caption_matrix,
                                              tf_caption_mask: current_caption_masks})
            loss_to_draw_epoch.append(loss_val)
            
            print('idx: ', start, " Epoch: ", epoch, " loss: ", loss_val, ' Elapsed time: ', str((time.time() - start_time)))
            loss_fd.write('epoch ' + str(epoch) + ' loss ' + str(loss_val) + '\n')
            
        
        # draw loss curve every epoch
        loss_to_draw.append(np.mean(loss_to_draw_epoch))
        plt_save_dir = "./loss_imgs"
        plt_save_img_name = str(epoch) + '.png'
        plt.plot(range(len(loss_to_draw)), loss_to_draw, color='g')
        plt.grid(True)
        plt.savefig(os.path.join(plt_save_dir, plt_save_img_name))

        print_all_vars('saving to'+model_path+'/model_'+str(epoch)+'.ckpt')
        if np.mean(loss_to_draw_epoch) == min(loss_to_draw):
        #if np.mod(epoch, 20) == 0:
        #if np.mod(epoch, 1) == 0:
            save_path = saver.save(sess, model_path+'/model_'+str(epoch)+'.ckpt')
            print("Epoch ", epoch, " is improved. Saving the model at", save_path, '...')
            #saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch)
            #saver.save(sess, model_path+'/model_'+str(epoch)+'.ckpt')
        
    #################################
    
    loss_fd.close()
    

## Testing

In [6]:
if True:
    model_path='./models/model_1.ckpt'


    ixtoword = pd.Series(np.load('./data/ixtoword.npy').tolist())

    bias_init_vector = np.load('./data/bias_init_vector.npy')
    """
    model = Video_Caption_Generator(dim_image=dim_image,                     # 4096
                                    n_words=len(ixtoword),                   # 11720
                                    dim_hidden=dim_hidden,                   # 1000
                                    batch_size=batch_size,                   # 50
                                    n_lstm_steps=n_frame_step,               # 80
                                    n_video_lstm_step=n_video_lstm_step,     # 80
                                    n_caption_lstm_step=n_caption_lstm_step, # 20
                                    bias_init_vector=bias_init_vector)       # bias_init_vector.shape: (11720,)
    
    """
    video_tf, video_mask_tf, caption_tf, probs_tf, last_embed_tf = model.build_generator()
    """
    sess = tf.InteractiveSession()

    saver = tf.train.Saver() 
    """
    print(model_path)
    print('#############################')
    saver.restore(sess, model_path)
    print('#############################')

    test_output_txt_fd = open('S2VT_results_256_96_0.01.txt', 'w')

i= 0
i= 1
i= 2
i= 3
i= 4
i= 5
i= 6
i= 7
i= 8
i= 9
i= 10
i= 11
i= 12
i= 13
i= 14
i= 15
i= 16
i= 17
i= 18
i= 19
i= 20
i= 21
i= 22
i= 23
i= 24
i= 25
i= 26
i= 27
i= 28
i= 29
i= 30
i= 31
i= 32
i= 33
i= 34
i= 35
i= 36
i= 37
i= 38
i= 39
./models/model_1.ckpt
#############################
INFO:tensorflow:Restoring parameters from ./models/model_1.ckpt
#############################


In [12]:
test_output_txt_fd = open('S2VT_results_512_200.txt', 'w')

In [7]:
#def test(model_path='./models/model_0.ckpt'):
if True:
    """
    model_path='./models/model_990.ckpt'
    #test_data = get_video_test_data(video_test_data_path, video_test_feat_path)
    #test_videos = test_data['video_path'].unique()
    
    #test_videos = np.array(['./rgb_test_features/klteYv1Uv9A_27_33.avi.npy', 
    #                    './rgb_test_features/5YJaS2Eswg0_22_26.avi.npy',
    #                    './rgb_test_features/UbmZAe5u5FI_132_141.avi.npy',
    #                    './rgb_test_features/JntMAcTlOF0_50_70.avi.npy',
    #                    './rgb_test_features/tJHUH9tpqPg_113_118.avi.npy',])

    ixtoword = pd.Series(np.load('./data/ixtoword.npy').tolist())

    bias_init_vector = np.load('./data/bias_init_vector.npy')

    
    
    model = Video_Caption_Generator(dim_image=dim_image,                     # 4096
                                    n_words=len(ixtoword),                   # 11720
                                    dim_hidden=dim_hidden,                   # 1000
                                    batch_size=batch_size,                   # 50
                                    n_lstm_steps=n_frame_step,               # 80
                                    n_video_lstm_step=n_video_lstm_step,     # 80
                                    n_caption_lstm_step=n_caption_lstm_step, # 20
                                    bias_init_vector=bias_init_vector)       # bias_init_vector.shape: (11720,)
    
    
    #print_all_vars('testing phase after Video_Caption_Generator()')
    video_tf, video_mask_tf, caption_tf, probs_tf, last_embed_tf = model.build_generator()
    #print_all_vars('testing phase after model.build_generator()')
    ####sess = tf.InteractiveSession()

    ####saver = tf.train.Saver()
    print(model_path)
    print('#############################')
    saver.restore(sess, model_path)
    print('#############################')
    """
    #test_output_txt_fd = open('S2VT_results.txt', 'w')
    #for idx, video_feat_path in enumerate(test_videos):
    for tid, video_feat in zip(testing_id, test_data):

        #print(idx, video_feat_path)

        video_feat = np.copy(video_feat.reshape(1, n_video_lstm_step ,dim_image))
        print('video_feat.shape', video_feat.shape)
        #video_feat = np.load(video_feat_path)
        #video_mask = np.ones((video_feat.shape[0], video_feat.shape[1]))
        #video_mask = np.ones(video_feat.shape)
        #print('video_feat.shape[1] == n_frame_step:', video_feat.shape[1] == n_frame_step)
        if video_feat.shape[1] == n_frame_step:
            video_mask = np.ones((video_feat.shape[0], video_feat.shape[1]))
        else:
            continue
        #video_feat = video_feat.reshape(1, n_video_lstm_step, dim_image)
            #shape_templete = np.zeros(shape=(1, n_frame_step, 4096), dtype=float )
            #shape_templete[:video_feat.shape[0], :video_feat.shape[1], :video_feat.shape[2]] = video_feat
            #video_feat = shape_templete
            #video_mask = np.ones((video_feat.shape[0], n_frame_step))

        generated_word_index = sess.run(caption_tf, feed_dict={video_tf:video_feat, video_mask_tf:video_mask})
        generated_words = ixtoword[generated_word_index]

        punctuation = np.argmax(np.array(generated_words) == '<eos>') + 1
        generated_words = generated_words[:punctuation]

        generated_sentence = ' '.join(generated_words)
        generated_sentence = generated_sentence.replace('<bos> ', '')
        generated_sentence = generated_sentence.replace(' <eos>', '')
        ## filter
        gsplit = generated_sentence.split(' ')
        for gi, g in enumerate(gsplit):
            if gi == 0:
                g1 = g
            else:
                if g1 == g:
                    gsplit[gi-1] = ''
        else:
            generated_sentence = ' '.join([gs for gs in gsplit if gs])
        
        print(generated_sentence,'\n')
        #test_output_txt_fd.write(video_feat_path + '\n')
        test_output_txt_fd.write(tid+','+generated_sentence + '\n')
    test_output_txt_fd.close()

video_feat.shape (1, 80, 4096)
a man is is 

video_feat.shape (1, 80, 4096)
a man is is 

video_feat.shape (1, 80, 4096)
a man is is 

video_feat.shape (1, 80, 4096)
a man is is 

video_feat.shape (1, 80, 4096)
a man is is 

video_feat.shape (1, 80, 4096)
a man is is 

video_feat.shape (1, 80, 4096)
a man is is 

video_feat.shape (1, 80, 4096)
a man is is 

video_feat.shape (1, 80, 4096)
a man is is 

video_feat.shape (1, 80, 4096)
a man is is 

video_feat.shape (1, 80, 4096)
a man is is 

video_feat.shape (1, 80, 4096)
a man is is 

video_feat.shape (1, 80, 4096)
a man is is 

video_feat.shape (1, 80, 4096)
a man is is 

video_feat.shape (1, 80, 4096)
a man is is 

video_feat.shape (1, 80, 4096)
a man is is 

video_feat.shape (1, 80, 4096)
a man is is 

video_feat.shape (1, 80, 4096)
a man is is 

video_feat.shape (1, 80, 4096)
a man is is 

video_feat.shape (1, 80, 4096)
a man is is 

video_feat.shape (1, 80, 4096)
a man is is 

video_feat.shape (1, 80, 4096)
a man is is 

video_feat