In [1]:
from utils import configurations, utils, base, image_sequence, sentence
import tensorflow as tf
import numpy as np

In [2]:
# Start from the project's dataset configuration and override the values used in this run.
configs = configurations.DatasetConfigs()
configs.n_features = 128
configs.batch_size = 16
configs.video.features_folder = "rgb_vgg_fc7_features/"
# Print the resulting configuration so the settings of this run are recorded in the cell output.
configs.describe()

batch_size = 16
clip_length = 150
fps = 30
include_audio_features = False
n_features = 128
sentence
  --- embedding_file = utils/vocabulary/glove.6B.50d.txt
  --- embeddings_dim = 50
  --- embeddings_folder = utils/vocabulary/
  --- n_tokens = 20000
test_info_path = data/test_data.json
train_info_path = data/train_data.json
valid_info_path = data/val_data.json
video
  --- features_folder = rgb_vgg_fc7_features/
  --- files_pattern = datasets/DiDeMo/{}.mp4
  --- max_frames = 900
  --- n_extracted_features = 400
  --- n_splits = 6
  --- size = (224, 224)


In [3]:
from itertools import product

# Candidate moments: every inclusive [start, end] pair of segment indices with
# start <= end. Six segments (matching n_splits = 6 in the configs.describe()
# output) give 21 candidates, listed in lexicographic order.
proposals = [
    [start, end]
    for start, end in product(range(6), repeat=2)
    if end >= start
]

In [4]:
# Build the datasets, the two encoders and the joint model while `configs` is
# active as a context manager.
# NOTE(review): presumably the project helpers below read their settings from
# the active configs — confirm in utils.
with configs:
    train_ds, valid_ds, test_ds, embedding_matrix = base.preprocess_datasets()

    video_layer = image_sequence.VideoLayer()
    sentence_layer = sentence.SentenceLayer(embedding_matrix)
    # Audio branch is disabled in this run (include_audio_features = False in the config output).
    #     audio_layer = get_audio_layer()

    # The model receives both encoders, the candidate proposals and the batch size.
    moment = base.MomentVideo(video_layer, sentence_layer, proposals, configs.batch_size)

In [5]:
# Enable soft device placement: per the TensorFlow docs, ops may then be placed
# on a different device than the one explicitly requested (e.g. when the op has
# no implementation for that device) instead of raising an error.
tf.config.set_soft_device_placement(True)

In [6]:
# Compile with Adam at a learning rate of 5e-4. No loss or metrics are passed here.
# NOTE(review): presumably MomentVideo defines its own loss internally — confirm in utils/base.
moment.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-4))

In [7]:
# Evaluate on the validation split before any training has run; the recorded
# output of this call is an empty list.
moment.evaluate(valid_ds)



[]

In [8]:
# Train with ops requested on the first GPU. The training set is repeated
# indefinitely, so steps_per_epoch=600 is what delimits an epoch.
# The recorded run was stopped by hand (KeyboardInterrupt) at epoch 10 of 30.
with tf.device("GPU:0"):
    moment.fit(train_ds.repeat(), epochs=30, validation_data=valid_ds, steps_per_epoch=600)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30

KeyboardInterrupt: 

In [5]:
# Rank every candidate moment for each test sample.
#
# For each batch the video and sentence encoders are applied, a per-position
# cosine similarity is computed for every sample, and each proposal is scored
# by the sum of the similarities over the positions it covers. Proposals are
# then sorted by descending score.
#
# Results:
#   all_y_true              -- one ground-truth entry per test sample
#   batch_ordered_proposals -- one ranking of `proposals` per test sample

# Positions covered by one segment: clip_length (150) / n_splits (6) in the
# configs.describe() output.
FRAMES_PER_SEGMENT = 25

all_y_true = []
batch_ordered_proposals = []
for videos, sentences, _lengths, y_true in test_ds:
    videos_repr = moment.video_1(videos)
    sentences_repr = moment.sentence_1(sentences)
    all_y_true.extend(y_true)
    # Iterate over the samples actually present in this batch rather than
    # moment.batch_size: a short final batch (or a dataset batch size different
    # from the model's) would otherwise index out of range or leave the
    # rankings misaligned with all_y_true.
    for i in range(y_true.shape[0]):
        similarities = moment.cosine_similarity(videos_repr[i], sentences_repr[i], axis=-1)
        # NOTE(review): summing (rather than averaging) makes a proposal's score
        # grow with its length whenever the similarities are positive — confirm
        # this is the intended scoring.
        scores = [
            tf.reduce_sum(
                similarities[proposal[0] * FRAMES_PER_SEGMENT:(proposal[1] + 1) * FRAMES_PER_SEGMENT]
            )
            for proposal in moment.proposals
        ]

        indices = tf.argsort(scores, axis=-1, direction='DESCENDING')
        batch_ordered_proposals.append(tf.gather(proposals, indices))
    # One dash per processed batch as a lightweight progress indicator.
    print("-", end=" ")

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

In [6]:
# Turn the per-sample Python lists collected by the ranking loop into single
# tensors (stacked along a new leading axis) so they can be exported with .numpy().
batch_ordered_proposals = tf.stack(batch_ordered_proposals)
all_y_true = tf.stack(all_y_true)

In [15]:
def iou(pred, gt):
    """Temporal intersection-over-union of two inclusive index segments.

    Args:
        pred: Predicted segment, indexable as ``(start, end)`` with both
            endpoints inclusive.
        gt: Ground-truth segment in the same format.

    Returns:
        The overlap length divided by the length of the span running from the
        smaller start to the larger end. Disjoint segments give ``0.0``.
    """
    pred_start, pred_end = pred[0], pred[1]
    gt_start, gt_end = gt[0], gt[1]

    # Inclusive endpoints, hence the +1; a negative value means no overlap.
    overlap = min(pred_end, gt_end) + 1 - max(pred_start, gt_start)
    if overlap < 0:
        overlap = 0

    # The enclosing span equals the true union whenever the segments overlap,
    # and the numerator is zero whenever they do not.
    span = max(pred_end, gt_end) + 1 - min(pred_start, gt_start)
    return float(overlap) / span

def rank(pred, gt):
    """Return the 1-based position of the ground-truth segment in a ranking.

    Args:
        pred: Ordered list of segment tuples.
        gt: Ground-truth segment; converted to a tuple before the lookup.

    Returns:
        Position of ``gt`` in ``pred``, counting from 1.

    Raises:
        ValueError: If ``gt`` does not occur in ``pred``.
    """
    target = tuple(gt)
    zero_based = pred.index(target)
    return zero_based + 1

def eval_predictions(segments, batch_y_true, quiet=False):
    """Score ranked proposals against ground-truth segments.

    Args:
        segments: Iterable of per-sample rankings. Each item must provide
            ``.tolist()`` (e.g. a NumPy array) yielding ``[start, end]`` pairs;
            the first pair is treated as the prediction.
        batch_y_true: Iterable of ground-truth ``(start, end)`` segments,
            paired positionally with ``segments``.
        quiet: When true, the metric summary is not printed.

    Returns:
        Tuple ``(rank1, rank5, miou)``: the fraction of samples whose ground
        truth is ranked first / within the first five, and the mean IoU of the
        first-ranked proposal. The rank@3 value is printed but not returned.

    Raises:
        ValueError: Propagated from ``rank`` when a ground-truth segment is
            missing from its ranking.
    """
    gt_positions = []
    top1_ious = []
    for ranking, target in zip(segments, batch_y_true):
        ordered = [tuple(pair) for pair in ranking.tolist()]
        top1_ious.append(iou(ordered[0], target))
        gt_positions.append(rank(ordered, target))

    n_samples = float(len(gt_positions))
    positions = np.array(gt_positions)
    rank1 = np.sum(positions <= 1) / n_samples
    rank3 = np.sum(positions <= 3) / n_samples
    rank5 = np.sum(positions <= 5) / n_samples
    miou = np.mean(top1_ious)

    if not quiet:
        report = (
            ("Average rank@1: %f", rank1),
            ("Average rank@3: %f", rank3),
            ("Average rank@5: %f", rank5),
            ("Average iou: %f", miou),
        )
        for template, value in report:
            print(template % value)
    return rank1, rank5, miou

In [16]:
# Score the test-set rankings; the ground truth is reshaped to one (start, end) row per sample.
eval_predictions(batch_ordered_proposals.numpy(), tf.reshape(all_y_true, [-1, 2]).numpy())

(5, 5) [4 4] 0.0
(0, 1) [2 2] 0.0
(1, 3) [1 1] 0.3333333333333333
(4, 4) [4 5] 0.5
(4, 4) [2 2] 0.0
(0, 5) [1 2] 0.3333333333333333
(0, 0) [2 2] 0.0
(0, 5) [0 0] 0.16666666666666666
(1, 4) [0 1] 0.2
(0, 5) [1 1] 0.16666666666666666
(3, 3) [3 3] 1.0
(0, 5) [1 1] 0.16666666666666666
(0, 5) [3 4] 0.3333333333333333
(0, 4) [3 3] 0.2
(0, 0) [3 3] 0.0
(0, 3) [4 5] 0.0
(5, 5) [3 3] 0.0
(1, 3) [5 5] 0.0
(1, 4) [5 5] 0.0
(0, 0) [2 2] 0.0
(2, 2) [1 1] 0.0
(0, 4) [1 1] 0.2
(0, 5) [3 3] 0.16666666666666666
(0, 5) [2 2] 0.16666666666666666
(0, 1) [2 2] 0.0
(0, 5) [4 4] 0.16666666666666666
(0, 5) [5 5] 0.16666666666666666
(0, 5) [2 2] 0.16666666666666666
(0, 5) [5 5] 0.16666666666666666
(0, 5) [5 5] 0.16666666666666666
(4, 4) [0 0] 0.0
(4, 4) [2 2] 0.0
(1, 1) [3 4] 0.0
(1, 1) [4 4] 0.0
(1, 1) [5 5] 0.0
(5, 5) [3 4] 0.0
(1, 5) [4 4] 0.2
(0, 4) [0 2] 0.6
(0, 4) [1 1] 0.2
(1, 1) [3 3] 0.0
(0, 2) [2 2] 0.3333333333333333
(0, 5) [0 0] 0.16666666666666666
(1, 1) [2 2] 0.0
(0, 4) [3 3] 0.2
(2, 2) [0 0] 0.0

(0.06772908366533864, 0.29731075697211157, 0.18709744355909694)

### random
Average rank@1: 0.067729 <br>
Average rank@3: 0.184512 <br>
Average rank@5: 0.297311 <br>
Average iou: 0.187097

### basic supervised
Average rank@1: 0.134960 <br>
Average rank@3: 0.391434 <br>
Average rank@5: 0.620518 <br>
Average iou: 0.180698

### MCN (Moment Context Network, published DiDeMo results)
Average rank@1: 0.2810<br>
Average rank@3: - (not reported)<br>
Average rank@5: 0.7821<br>
Average iou: 0.4108