In [None]:
# Mount Google Drive at /content/drive; every dataset, feature and model path
# configured further down in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')
# Printed only after drive.mount() has returned.
print('mount success')

Mounted at /content/drive
mount success


In [None]:
import os
import json

In [None]:
# Test-split directory: `testing_ids.txt` is read from here and the generated
# caption file is written here.
test_path = '/content/drive/Shareddrives/msvd-dataset/test'

# Root of the pre-extracted test features; the loader looks for one `.npy` file
# per video in its `feat` and `yolo-feat` sub-folders.
test_features_path = '/content/drive/Shareddrives/msvd-test-feats/test/custom_feat'

In [None]:
!pip install keras_preprocessing

Collecting keras_preprocessing
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: keras_preprocessing
Successfully installed keras_preprocessing-1.1.2


## MODEL FOR INFERENCE

In [None]:
import os
from keras.layers import Input, LSTM, Dense
from keras.models import Model, load_model
import joblib

In [None]:
# Folder holding the saved tokenizer, the encoder model and the decoder weights.
# NOTE(review): the recorded output of the `print(model_name)` cell shows
# 'without_cust_vocab_att_3700', which does not match this folder name --
# confirm which model directory the saved outputs belong to.
save_model_path = '/content/drive/Shareddrives/FYP-models/without_cust_vocab_3700'
# File read with load_model() to obtain the encoder.
encoder_name = 'encoder_model.h5'
# File read with load_weights() into the decoder graph rebuilt in inference_model().
decoder_name = 'decoder_model_weights.h5'
# Width of one encoder input row (last axis of the encoder input).
# presumably the CNN feature width plus the 30 YOLO columns -- TODO confirm
num_encoder_tokens = 4126
# Width of the decoder's one-hot input/softmax output; also the suffix of the
# tokenizer file name ('tokenizer' + str(num_decoder_tokens)).
num_decoder_tokens = 3700
# Size of the decoder LSTM and of its two state inputs.
latent_dim = 512
# Number of rows per video the encoder input is reshaped to.
time_steps_encoder = 80

In [None]:
def inference_model():
    """Load the tokenizer and assemble the encoder/decoder pair for inference.

    Everything is read from the directory named by the module-level
    ``save_model_path``:

    * the tokenizer, un-pickled with joblib from the file
      ``'tokenizer' + str(num_decoder_tokens)``;
    * the complete encoder model, from ``encoder_name``;
    * the decoder weights, from ``decoder_name``, loaded into a decoder graph
      that is rebuilt here: one LSTM followed by a softmax ``Dense`` layer,
      with explicit inputs for the LSTM's initial hidden and cell state.

    Returns:
        tuple: ``(tokenizer, inf_encoder_model, inf_decoder_model)``.
    """
    model_dir = save_model_path

    # NOTE(review): joblib.load un-pickles the file, so the tokenizer must come
    # from a trusted location.
    tokenizer_file = os.path.join(model_dir, 'tokenizer' + str(num_decoder_tokens))
    with open(tokenizer_file, 'rb') as handle:
        tokenizer = joblib.load(handle)

    # The encoder is stored as a whole model: load it, then compile it by hand.
    encoder = load_model(os.path.join(model_dir, encoder_name))
    encoder.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # Decoder graph. The layers are deliberately created in this fixed order.
    # NOTE(review): Keras derives automatic layer names from creation order, so
    # reorder these lines with care.
    token_input = Input(shape=(None, num_decoder_tokens))
    softmax_head = Dense(num_decoder_tokens, activation='softmax')
    lstm_layer = LSTM(latent_dim, return_sequences=True, return_state=True)
    hidden_state_input = Input(shape=(latent_dim,))
    cell_state_input = Input(shape=(latent_dim,))
    state_inputs = [hidden_state_input, cell_state_input]

    # The LSTM starts from the states supplied by the caller and hands back its
    # new states next to the per-step outputs.
    lstm_sequence, hidden_state, cell_state = lstm_layer(token_input, initial_state=state_inputs)
    token_probabilities = softmax_head(lstm_sequence)

    decoder = Model(
        [token_input] + state_inputs,
        [token_probabilities] + [hidden_state, cell_state])
    decoder.load_weights(os.path.join(model_dir, decoder_name))

    return tokenizer, encoder, decoder


# TESTING


### IMPORTS

In [None]:
!pip install dill
!pip show h5py

Collecting dill
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dill
Successfully installed dill-0.3.8
Name: h5py
Version: 3.9.0
Summary: Read and write HDF5 files from Python
Home-page: 
Author: 
Author-email: Andrew Collette <andrew.collette@gmail.com>
License: BSD-3-Clause
Location: /usr/local/lib/python3.10/dist-packages
Requires: numpy
Required-by: h5netcdf, tensorflow


In [None]:
import functools
import operator
import os
import time
import h5py
import numpy as np

In [None]:
# The last component of the model folder path doubles as the model's name; it
# is used to label the generated caption file.
model_name = save_model_path.rsplit('/', 1)[-1]
print(model_name)

without_cust_vocab_att_3700


## TEST VIDEO CAPTIONING

In [None]:
# File listing the test videos, one entry per line. This cell runs at module
# level, where there is no `self`, so the path is built from the `test_path`
# configured earlier in the notebook.
test_videos_filepath = os.path.join(test_path, "testing_ids.txt")

In [None]:
class VideoDescriptionInference(object):
    """Generate captions for the test videos with the inference models.

    On construction the instance copies the module-level configuration
    (``latent_dim``, ``num_encoder_tokens``, ``num_decoder_tokens``,
    ``time_steps_encoder``, ``save_model_path``, ``test_path``,
    ``test_features_path``) and loads the tokenizer, encoder and decoder via
    ``inference_model()``. ``test()`` then captions every video listed in
    ``testing_ids.txt`` and writes one line per video to a text file.
    """

    # Number of candidate tokens expanded at every beam-search step.
    BEAM_WIDTH = 2
    # A beam-search branch is closed once its depth exceeds this value.
    BEAM_MAX_LEN = 12
    # Column count each YOLO feature file is reshaped to before concatenation.
    YOLO_FEAT_DIM = 30

    def __init__(self, search_type='greedy'):
        """Copy the notebook configuration and load tokenizer and models.

        :param search_type: ``'greedy'`` selects ``greedy_search``; any other
            value makes ``test()`` use the beam search.
        """
        self.latent_dim = latent_dim
        self.num_encoder_tokens = num_encoder_tokens
        self.num_decoder_tokens = num_decoder_tokens
        self.time_steps_encoder = time_steps_encoder
        # Best score / best word path found by the beam search for the current video.
        self.max_probability = -1
        self._best_path = []

        # models
        self.tokenizer, self.inf_encoder_model, self.inf_decoder_model = inference_model()
        self.save_model_path = save_model_path
        self.test_path = test_path
        self.test_features_path = test_features_path
        self.search_type = search_type

    def greedy_search(self, loaded_array, max_len=15):
        """Decode a caption by always taking the most probable next token.

        :param loaded_array: feature array of one video; it is reshaped to
            ``(-1, time_steps_encoder, num_encoder_tokens)`` before encoding
        :param max_len: maximum number of decoder steps
        :return: the predicted words joined by spaces (with a trailing space),
            or ``''`` when nothing was predicted
        """
        inv_map = self.index_to_word()
        encoder_input = loaded_array.reshape(-1, self.time_steps_encoder, self.num_encoder_tokens)
        states_value = self.inf_encoder_model.predict(encoder_input, verbose=0)

        # The decoder is primed with the one-hot 'bos' token.
        target_seq = np.zeros((1, 1, self.num_decoder_tokens))
        target_seq[0, 0, self.tokenizer.word_index['bos']] = 1

        final_sentence = ''
        for _ in range(max_len):
            output_tokens, h, c = self.inf_decoder_model.predict(
                [target_seq] + states_value, verbose=0)
            states_value = [h, c]
            y_hat = int(np.argmax(output_tokens.reshape(self.num_decoder_tokens)))
            if y_hat == 0:
                # Index 0 has no word: keep the previous input token and only
                # advance the LSTM state.
                continue
            # .get() so an index missing from the vocabulary ends the sentence
            # instead of raising KeyError.
            word = inv_map.get(y_hat)
            if word is None or word == 'eos':
                break
            final_sentence += word + ' '
            target_seq = np.zeros((1, 1, self.num_decoder_tokens))
            target_seq[0, 0, y_hat] = 1
        return final_sentence

    def decode_sequence2bs(self, input_seq):
        """Encode ``input_seq`` and return the best word path found by beam search.

        The best-score bookkeeping is reset here, so consecutive calls do not
        influence each other.

        :param input_seq: encoder input, passed to the encoder unchanged
        :return: list of words of the highest-scoring path (may contain ``''``)
        """
        states_value = self.inf_encoder_model.predict(input_seq, verbose=0)
        target_seq = np.zeros((1, 1, self.num_decoder_tokens))
        target_seq[0, 0, self.tokenizer.word_index['bos']] = 1

        self.max_probability = -1
        self._best_path = []
        # Build the index -> word map once instead of once per search node.
        self.beam_search(target_seq, states_value, [], [], 0, self.index_to_word())
        return self._best_path

    def beam_search(self, target_seq, states_value, prob, path, lens, inv_map=None):
        """Recursively expand the ``BEAM_WIDTH`` most probable next tokens.

        A branch is closed when it predicts ``'eos'`` or when ``lens`` exceeds
        ``BEAM_MAX_LEN``; its score is the product of the probabilities along
        the branch. The best score is kept in ``self.max_probability`` and the
        corresponding words in ``self._best_path``.

        :param target_seq: the array that is fed into the model to predict the next word
        :param states_value: previous state that is fed into the lstm cell
        :param prob: probabilities of the words chosen so far on this branch
        :param path: words chosen so far on this branch
        :param lens: number of words chosen so far on this branch
        :param inv_map: optional index -> word dictionary; built from the
            tokenizer when omitted
        :return: None; the result is stored on the instance
        """
        if inv_map is None:
            inv_map = self.index_to_word()

        output_tokens, h, c = self.inf_decoder_model.predict(
            [target_seq] + states_value, verbose=0)
        output_tokens = output_tokens.reshape(self.num_decoder_tokens)
        # Indices of the most probable tokens, best first.
        candidates = output_tokens.argsort()[-self.BEAM_WIDTH:][::-1]
        states_value = [h, c]

        for token_index in candidates:
            token_index = int(token_index)
            # Index 0 has no word; an index missing from the vocabulary is
            # treated the same way instead of raising.
            word = '' if token_index == 0 else inv_map.get(token_index, '')
            p = output_tokens[token_index]

            if word != 'eos' and lens <= self.BEAM_MAX_LEN:
                if word == '':
                    # A word-less step must not lower the branch score.
                    p = 1
                next_seq = np.zeros((1, 1, self.num_decoder_tokens))
                next_seq[0, 0, token_index] = 1.
                self.beam_search(next_seq, states_value,
                                 list(prob) + [p], list(path) + [word],
                                 lens + 1, inv_map)
            else:
                # Branch closed: score it; the closing token itself is not
                # part of the stored path.
                score = functools.reduce(operator.mul, list(prob) + [p], 1)
                if score > self.max_probability:
                    self._best_path = list(path)
                    self.max_probability = score

    def decoded_sentence_tuning(self, decoded_sentence):
        """Clean a decoded word list.

        Drops a word that equals the previously kept word, the ``'bos'`` and
        ``'eos'`` markers, and empty strings.

        :param decoded_sentence: iterable of words
        :return: list of the remaining words, in order
        """
        filter_string = ('bos', 'eos')
        decode_str = []
        previous = None  # None until the first non-marker token has been seen
        for token in decoded_sentence:
            if previous is not None and token == previous:
                continue
            if token in filter_string:
                continue
            if token:
                decode_str.append(token)
            previous = token
        return decode_str

    def index_to_word(self):
        """Return the tokenizer's vocabulary inverted to ``{index: word}``."""
        return {index: word for word, index in self.tokenizer.word_index.items()}

    def get_test_data(self):
        """Load the combined features of every video listed in ``testing_ids.txt``.

        The list is read from ``<test_path>/testing_ids.txt``; the part of each
        line before the first ``'.'`` is the video name. For every video the
        arrays ``feat/<name>.npy`` and ``yolo-feat/<name>.npy`` below
        ``test_features_path`` are loaded, the second one is reshaped to
        ``YOLO_FEAT_DIM`` columns, and both are concatenated along axis 1.

        A video with a missing feature file is reported and skipped, so the
        returned features and names always line up.

        :return: ``(feat_test, filename_test)`` -- array of the combined
            features and the list of video names in the same order
        """
        ids_path = os.path.join(self.test_path, "testing_ids.txt")
        cnn_dir = os.path.join(self.test_features_path, 'feat')
        yolo_dir = os.path.join(self.test_features_path, 'yolo-feat')

        filename_test = []
        feat_test = []
        with open(ids_path) as testing_file:
            for line in testing_file:
                entry = line.strip()
                if not entry:
                    continue  # blank line
                file_name = entry.split(".")[0]

                cnn_path = os.path.join(cnn_dir, file_name + '.npy')
                yolo_path = os.path.join(yolo_dir, file_name + '.npy')
                if not (os.path.exists(cnn_path) and os.path.exists(yolo_path)):
                    # Skip instead of reusing the previous video's arrays.
                    print(f"features not found for {file_name}; skipping")
                    continue

                cnn_f = np.load(cnn_path)
                yolo_f = np.load(yolo_path).reshape(-1, self.YOLO_FEAT_DIM)
                feat_test.append(np.concatenate((cnn_f, yolo_f), axis=1))
                filename_test.append(file_name)
                print(f"{len(feat_test)} feature files loaded")

        return np.array(feat_test), filename_test

    def test(self):
        """Caption every test video and write the results to a text file.

        The file ``test_video_captions_generated_<model_name>_<search_type>.txt``
        is created in ``test_path``; each line holds the video name, the
        caption and the decoding time in seconds, separated by commas.
        """
        feat_test, filename_test = self.get_test_data()

        # generate inference test outputs
        out_name = f'test_video_captions_generated_{model_name}_{self.search_type}.txt'
        with open(os.path.join(self.test_path, out_name), 'w') as file:
            for idx, feat in enumerate(feat_test):
                print(idx)
                file.write(filename_test[idx] + ',')
                encoder_input = feat.reshape(-1, self.time_steps_encoder, self.num_encoder_tokens)
                start = time.time()
                if self.search_type == 'greedy':
                    decoded_sentence = self.greedy_search(encoder_input)
                    file.write(decoded_sentence + ',{:.2f}'.format(time.time() - start))
                else:
                    decoded_sentence = self.decode_sequence2bs(encoder_input)
                    decode_str = self.decoded_sentence_tuning(decoded_sentence)
                    for d in decode_str:
                        file.write(d + ' ')
                    file.write(',{:.2f}'.format(time.time() - start))
                file.write('\n')

                # re-init max prob
                self.max_probability = -1

In [None]:
# Entry point: constructing the object loads the tokenizer and both models via
# inference_model(); test() then captions every listed test video and writes
# the result file into the test directory.
if __name__ == "__main__":
    video_to_text = VideoDescriptionInference()
    video_to_text.test()



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
