In [1]:
from utils import configurations, utils, base, image_sequence, sentence
import tensorflow as tf
import numpy as np

In [2]:
# Build the dataset configuration and override the values used in this run.
configs = configurations.DatasetConfigs()
configs.n_features = 128
configs.batch_size = 12
# Folder holding the video features (name suggests VGG fc7 RGB features — TODO confirm).
configs.video.features_folder = "rgb_vgg_fc7_features/"
# Show the resulting configuration (see the cell output).
configs.describe()

batch_size = 12
clip_length = 150
fps = 30
include_audio_features = False
n_features = 128
sentence
  --- embedding_file = utils/vocabulary/glove.6B.50d.txt
  --- embeddings_dim = 50
  --- embeddings_folder = utils/vocabulary/
  --- n_tokens = 20000
test_info_path = data/test_data.json
train_info_path = data/train_data.json
valid_info_path = data/val_data.json
video
  --- features_folder = rgb_vgg_fc7_features/
  --- files_pattern = datasets/DiDeMo/{}.mp4
  --- max_frames = 900
  --- n_extracted_features = 400
  --- n_splits = 6
  --- size = (224, 224)


In [3]:
# `configs` is used as a context manager while the datasets, layers and model
# are built (presumably so they read their settings from it — TODO confirm).
with configs:
    train_ds, valid_ds, test_ds, embedding_matrix = base.preprocess_datasets()

    video_layer = image_sequence.VideoLayer()
    sentence_layer = sentence.SentenceLayer(embedding_matrix)
    # Audio branch is currently disabled.
    #     audio_layer = get_audio_layer()

    # Joint model built from the video and sentence layers.
    moment = base.MomentVideo(video_layer, sentence_layer, configs.batch_size)

In [4]:
# Enable TensorFlow soft device placement.
tf.config.set_soft_device_placement(True)

In [5]:
# Compile with Adam; no loss is passed here, so the model is expected to
# supply its own (NOTE(review): confirm in `base.MomentVideo`).
moment.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-4))

In [8]:
# Evaluate on the validation split. NOTE(review): the execution counter shows
# this cell was last run after the training cell that follows it.
moment.evaluate(valid_ds)



[]

In [7]:
# Train on the first GPU. The training set is repeated, so the length of an
# epoch is set by `steps_per_epoch` rather than by the dataset size.
with tf.device("GPU:0"):
    moment.fit(train_ds.repeat(), epochs=10, validation_data=valid_ds, steps_per_epoch=600)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
 18/600 [..............................] - ETA: 3:57 - loss: 24.0013

KeyboardInterrupt: 

In [17]:
from utils.metrics import mIOU, prediction_by_moving_average, intersection_over_union

In [7]:
# Pull a single batch from the test split; later cells index into `data`.
data = iter(test_ds).get_next()

In [None]:
# Draft evaluation loop over the test split (this cell has no execution count).
# NOTE(review): `matrix_cosine_similarity` and `get_video_score` are called as
# bare names here; they are not defined in the visible part of this notebook,
# where other cells reach them through `moment`.
mean_15 = 0
mean_7 = 0
mean_30 = 0
count = 0
for data in iter(test_ds):
    videos, sentences, y_true = data

    # Encode both modalities with the stand-alone layers built earlier.
    videos_repr = video_layer(videos)
    sentences_repr = sentence_layer(sentences)

    # Score each video against its own sentence, one batch item at a time.
    scores_videos = []
    for i in range(configs.batch_size):
        coattention_matrix = matrix_cosine_similarity(videos_repr[i], sentences_repr[i])
        scores_video = get_video_score(videos_repr[i], sentences_repr[i], coattention_matrix)
        scores_videos.append(scores_video)

    scores_videos = tf.stack(scores_videos)
    # mIOU is evaluated with three values of its third argument (7, 15, 30);
    # presumably a moving-average window — TODO confirm in utils.metrics.
    miou_score = tf.numpy_function(mIOU, [scores_videos, y_true, 7], tf.float64)
    mean_7 += miou_score
    miou_score = tf.numpy_function(mIOU, [scores_videos, y_true, 15], tf.float64)
    mean_15 += miou_score
    miou_score = tf.numpy_function(mIOU, [scores_videos, y_true, 30], tf.float64)
    mean_30 += miou_score
    # One increment per batch, so the running sums are later averaged per batch.
    count += 1

In [19]:
# Accumulate per-batch mIOU over the training split using the layers owned by
# `moment`. The sums and `count` are read by the printing cell that follows.
mean_15 = 0
mean_7 = 0
mean_30 = 0
count = 0
for data in iter(train_ds):
    videos, sentences, y_true = data

    videos_repr = moment.video_1(videos)
    sentences_repr = moment.sentence_1(sentences)

    # Score each video against its own sentence, one batch item at a time.
    scores_videos = []
    for i in range(moment.batch_size):
        #coattention_matrix = self.matrix_cosine_similarity(videos_repr[i], sentences_repr[i])
        # NOTE(review): called with two arguments here, while another cell
        # passes a third `coattention_matrix` — confirm the current signature.
        scores_video = moment.get_video_score(videos_repr[i], sentences_repr[i])
        scores_videos.append(scores_video)

    scores_videos = tf.stack(scores_videos)
    # mIOU is evaluated with three values of its third argument (7, 15, 30);
    # presumably a moving-average window — TODO confirm in utils.metrics.
    miou_score = tf.numpy_function(mIOU, [scores_videos, y_true, 7], tf.float64)
    mean_7 += miou_score
    miou_score = tf.numpy_function(mIOU, [scores_videos, y_true, 15], tf.float64)
    mean_15 += miou_score
    miou_score = tf.numpy_function(mIOU, [scores_videos, y_true, 30], tf.float64)
    mean_30 += miou_score
    # One increment per batch, so the averages below are per-batch means.
    count += 1

In [20]:
# Per-batch average of the accumulated mIOU for each setting (7, 15, 30).
print(mean_7/count)
print(mean_15/count)
print(mean_30/count)

tf.Tensor(0.19779595959595933, shape=(), dtype=float64)
tf.Tensor(0.19475303030302968, shape=(), dtype=float64)
tf.Tensor(0.2000580808080813, shape=(), dtype=float64)


In [14]:
from itertools import product

In [15]:
def generate_proposals(scores, segment_length=25):
    """Pick, for every row of ``scores``, the best-scoring span of segments.

    Each row is cut into consecutive segments of ``segment_length`` entries;
    trailing entries that do not fill a whole segment are ignored. Every span
    ``(start, end)`` with ``start <= end`` is scored by the mean of the
    entries it covers, and the span with the highest mean is selected. The
    first span encountered wins ties.

    Args:
        scores: 2-D array-like (rows x positions) that ``np.asarray`` can
            convert, e.g. a NumPy array or an eager TensorFlow tensor.
        segment_length: number of consecutive entries that form one segment.

    Returns:
        A list with one ``(start, end)`` tuple of inclusive segment indices
        per row. Rows shorter than one segment yield ``(0, 0)``.
    """
    scores = np.asarray(scores)
    n_segments = scores.shape[1] // segment_length

    y_pred = []
    for row in scores:
        # Start from -inf and reset per row: rows whose means are all <= 0
        # still get their own best span instead of an unbound or stale one.
        best_score = float("-inf")
        best_span = (0, 0)
        for start in range(n_segments):
            for end in range(start, n_segments):
                window = row[start * segment_length:(end + 1) * segment_length]
                span_score = window.mean()
                if span_score > best_score:
                    best_score = span_score
                    best_span = (start, end)

        y_pred.append(best_span)

    return y_pred

In [18]:
# Evaluate span proposals on the test split: sum the per-example
# intersection-over-union between the labels and the proposed spans, and
# count the examples so the next cell can print the mean.
total_scores = 0
count = 0
for data in iter(test_ds):
    videos, sentences, y_true = data

    videos_repr = moment.video_1(videos)
    sentences_repr = moment.sentence_1(sentences)

    # Score each video against its own sentence, one batch item at a time.
    scores_videos = []
    for i in range(moment.batch_size):
        coattention_matrix = moment.matrix_cosine_similarity(videos_repr[i], sentences_repr[i])
        scores_video = moment.get_video_score(videos_repr[i], sentences_repr[i], coattention_matrix)
        scores_videos.append(scores_video)

    scores_videos = tf.stack(scores_videos)

    y_pred = tf.cast(tf.convert_to_tensor(generate_proposals(scores_videos)), tf.int64)
    # Each row handed to intersection_over_union is the label followed by the
    # predicted (start, end) pair.
    scores = np.apply_along_axis(
        intersection_over_union, 1, np.concatenate([y_true, y_pred], axis=-1))
    total_scores += tf.reduce_sum(scores)
    # Count the examples scored in this batch using the same bound as the
    # loop above instead of a hard-coded 12.
    count += moment.batch_size

In [19]:
print(total_scores/count)

tf.Tensor(0.16770729684908797, shape=(), dtype=float64)


In [8]:
from torch import nn
import torch
from torch.autograd import Variable

def xattn_score_t2i(images, captions, cap_lens, opt):
    """Score every (image, caption) pair with text-to-image cross attention.

    For each caption, every word attends over the regions of every image via
    ``func_attention``. The cosine similarity between each word and its
    attended image context is then pooled over the words of the caption
    according to ``opt.agg_func``.

    Args:
        images: (n_image, n_regions, d) tensor of image region features.
        captions: (n_caption, max_n_word, d) tensor of word features.
        cap_lens: (n_caption) sequence of caption lengths; only the first
            ``cap_lens[i]`` words of caption ``i`` are used.
        opt: options object providing ``lambda_softmax`` (attention
            smoothing), ``agg_func`` (one of ``'LogSumExp'``, ``'Max'``,
            ``'Sum'``, ``'Mean'``) and, for ``'LogSumExp'``, ``lambda_lse``.

    Returns:
        (n_image, n_caption) tensor of similarities.

    Raises:
        ValueError: if ``opt.agg_func`` is not one of the supported names.
    """
    similarities = []
    n_image = images.size(0)
    n_caption = captions.size(0)
    for i in range(n_caption):
        # Keep only the real (non-padding) words of the i-th caption.
        n_word = cap_lens[i]
        cap_i = captions[i, :n_word, :].unsqueeze(0).contiguous()
        # --> (n_image, n_word, d): the same caption paired with every image
        cap_i_expand = cap_i.repeat(n_image, 1, 1)

        # query = words (n_image, n_word, d); context = regions
        # (n_image, n_regions, d); weiContext is (n_image, n_word, d).
        # func_attention in this notebook takes (query, context, smooth), so
        # `opt` itself must not be passed positionally.
        weiContext, attn = func_attention(cap_i_expand, images, smooth=opt.lambda_softmax)
        cap_i_expand = cap_i_expand.contiguous()
        weiContext = weiContext.contiguous()

        # (n_image, n_word); the PyTorch helper is required here because the
        # TensorFlow `cosine_similarity` in this notebook has no `dim` argument.
        row_sim = cosine_similarity_pytorch(cap_i_expand, weiContext, dim=2)

        # Pool the per-word similarities --> (n_image, 1)
        if opt.agg_func == 'LogSumExp':
            row_sim = torch.logsumexp(row_sim * opt.lambda_lse, dim=1, keepdim=True) / opt.lambda_lse
        elif opt.agg_func == 'Max':
            row_sim = row_sim.max(dim=1, keepdim=True)[0]
        elif opt.agg_func == 'Sum':
            row_sim = row_sim.sum(dim=1, keepdim=True)
        elif opt.agg_func == 'Mean':
            row_sim = row_sim.mean(dim=1, keepdim=True)
        else:
            raise ValueError("unknown aggfunc: {}".format(opt.agg_func))
        similarities.append(row_sim)

    # (n_image, n_caption)
    similarities = torch.cat(similarities, 1)

    return similarities

class ContrastiveLoss(nn.Module):
    """Hinge-based bidirectional ranking loss over a square score matrix.

    The diagonal of ``scores`` holds the positive pairs; every off-diagonal
    entry is treated as a negative. Each negative is penalised when it comes
    within ``margin`` of the positive of its row and, separately, of the
    positive of its column.

    Args:
        margin: required gap between a positive score and any negative score.
            The default of 1 reproduces the value that was previously
            hard-coded in ``forward``, so ``ContrastiveLoss()`` is unchanged.
        max_violation: when True, keep only the largest violation per row
            (and per column) instead of summing over all negatives.
    """
    def __init__(self, margin=1, max_violation=False):
        super(ContrastiveLoss, self).__init__()
        self.margin = margin
        self.max_violation = max_violation

    def forward(self, scores):
        """Return the scalar loss for a precomputed (n, n) ``scores`` tensor."""
        n = scores.size(0)
        diagonal = scores.diag().view(n, 1)
        # d1[i, j] = scores[i, i]; d2[i, j] = scores[j, j]
        d1 = diagonal.expand_as(scores)
        d2 = diagonal.t().expand_as(scores)

        # violation of each entry against the positive of its row
        cost_s = (self.margin + scores - d1).clamp(min=0)
        # violation of each entry against the positive of its column
        cost_im = (self.margin + scores - d2).clamp(min=0)

        # Clear the diagonal: a positive is never its own negative. The mask
        # is created on the same device as `scores`.
        mask = torch.eye(n, device=scores.device) > .5
        cost_s = cost_s.masked_fill(mask, 0)
        cost_im = cost_im.masked_fill(mask, 0)

        # keep the maximum violating negative for each query
        if self.max_violation:
            cost_s = cost_s.max(1)[0]
            cost_im = cost_im.max(0)[0]
        return cost_s.sum() + cost_im.sum()

In [20]:
def get_ranking_loss(scores, margin, top_k=8):
    """Hinge ranking loss of each positive against its hardest negative.

    The diagonal of ``scores`` holds the positive pairs. For every row, the
    largest off-diagonal score is taken as the hardest negative and the row
    contributes ``max(0, margin - positive + hardest_negative)``.

    Args:
        scores: square 2-D tensor with a statically known shape.
        margin: required gap between a positive and its hardest negative.
        top_k: accepted for interface compatibility but not used; only the
            single hardest negative per row contributes, as before.

    Returns:
        Scalar tensor: the sum of the per-row losses.
    """
    n_rows, n_cols = scores.shape
    scores_positives = tf.linalg.diag_part(scores)

    # Boolean mask that is True everywhere except on the diagonal.
    off_diagonal = tf.linalg.set_diag(tf.ones([n_rows, n_cols]), tf.zeros(n_rows)) == 1
    # Dropping one entry per row leaves (n_rows, n_cols - 1) negatives.
    scores_negatives = tf.reshape(scores[off_diagonal], [n_rows, n_cols - 1])
    # The hardest negative is the row maximum; no full sort is needed.
    hardest_negatives = tf.reduce_max(scores_negatives, axis=1)

    loss = margin - scores_positives + hardest_negatives
    loss = tf.maximum(loss, 0.0)

    return tf.reduce_sum(loss)


def margin_based_ranking_loss(scores_videos, scores_sentences, margin=1, top_k=8):
    """Add the ranking loss of the video scores to that of the sentence scores.

    Both score matrices are passed to ``get_ranking_loss`` with the same
    ``margin`` and ``top_k``; the two results are summed.
    """
    return (
        get_ranking_loss(scores_videos, margin, top_k)
        + get_ranking_loss(scores_sentences, margin, top_k)
    )

In [9]:
# Run the model's `call` directly on the first two elements of the batch held
# in `data` (set by an earlier cell), yielding the two score outputs.
scores_videos, scores_sentences = moment.call((data[0], data[1]))

In [23]:
# PyTorch loss on the same scores, for comparison with the TensorFlow loss in
# the next cell.
ContrastiveLoss()(torch.from_numpy(scores_videos.numpy()))

tensor(262.3871)

In [25]:
# TensorFlow ranking loss on the same scores, with margin 1.
get_ranking_loss(scores_videos, 1)

<tf.Tensor: shape=(), dtype=float32, numpy=12.396402>

In [29]:
# Copy the TensorFlow scores into a PyTorch tensor for manual inspection.
scores = torch.from_numpy(scores_videos.numpy())

In [11]:
# Fetch one test batch and encode its first two elements with the model's
# video and sentence layers.
data = iter(test_ds).get_next()
videos_repr = moment.video_1(data[0])
sentences_repr = moment.sentence_1(data[1])

In [9]:
# Convert the TensorFlow representations to PyTorch tensors for the
# PyTorch scoring code.
images = torch.from_numpy(videos_repr.numpy())
captions = torch.from_numpy(sentences_repr.numpy())
# NOTE(review): `lengths` is only assigned in cells that appear later in the
# notebook, so this line fails on a fresh top-to-bottom run.
cap_lens = torch.from_numpy(lengths.numpy())

In [33]:
def func_attention(query, context, smooth, eps=1e-8):
    """Attend every query vector over the context vectors of its batch item.

    The attention logits are plain dot products between query and context
    vectors (no additional normalisation of the raw logits is applied). They
    are multiplied by ``smooth`` and soft-maxed over the context positions.

    Args:
        query: (batch, queryL, d) tensor.
        context: (batch, sourceL, d) tensor.
        smooth: multiplier applied to the logits before the softmax.
        eps: unused; kept so existing callers that pass it keep working.

    Returns:
        A tuple ``(weightedContext, attnT)``:
        weightedContext is (batch, queryL, d), the attention-weighted sum of
        the context vectors for every query position; attnT is
        (batch, sourceL, queryL), the attention weights, which sum to 1 over
        the sourceL axis.
    """
    # (batch, queryL, d)(batch, d, sourceL)
    # --> (batch, queryL, sourceL)
    logits = torch.bmm(query, torch.transpose(context, 1, 2))

    # Normalise over the context positions. The softmax dimension is given
    # explicitly instead of relying on nn.Softmax()'s deprecated implicit
    # choice, so no flattening to 2-D is needed.
    attn = torch.softmax(logits * smooth, dim=2)

    # (batch, queryL, sourceL)(batch, sourceL, d)
    # --> (batch, queryL, d)
    weightedContext = torch.bmm(attn, context)

    # --> (batch, sourceL, queryL)
    attnT = torch.transpose(attn, 1, 2).contiguous()

    return weightedContext, attnT

In [106]:
# Inlined text-to-image scoring with LogSumExp pooling and both smoothing
# factors fixed to 1; reads the module-level `images`, `captions` and
# `cap_lens` tensors and prints the resulting similarity matrix.
"""
Images: (n_image, n_regions, d) matrix of images
Captions: (n_caption, max_n_word, d) matrix of captions
CapLens: (n_caption) array of caption lengths
"""
similarities = []
n_image = images.size(0)
n_caption = captions.size(0)
for i in range(n_caption):
    # Get the i-th text description
    n_word = cap_lens[i]
    cap_i = captions[i, :n_word, :].unsqueeze(0).contiguous()
    # --> (n_image, n_word, d)
    cap_i_expand = cap_i.repeat(n_image, 1, 1)
    """
        word(query): (n_image, n_word, d)
        image(context): (n_image, n_regions, d)
        weiContext: (n_image, n_word, d)
        attn: (n_image, n_region, n_word)
    """
    weiContext, attn = func_attention(cap_i_expand, images, smooth=1.)
    cap_i_expand = cap_i_expand.contiguous()
    weiContext = weiContext.contiguous()
    # (n_image, n_word)
    row_sim = cosine_similarity_pytorch(cap_i_expand, weiContext, dim=2)
#     print(row_sim)
    # LogSumExp pooling over the words, computed in place: exp, sum, log.
    row_sim.mul_(1.).exp_()
    row_sim = row_sim.sum(dim=1, keepdim=True)
    row_sim = torch.log(row_sim)/1.
#     elif opt.agg_func == 'Max':
#         row_sim = row_sim.max(dim=1, keepdim=True)[0]
#     elif opt.agg_func == 'Sum':
#         row_sim = row_sim.sum(dim=1, keepdim=True)
#     elif opt.agg_func == 'Mean':
#         row_sim = row_sim.mean(dim=1, keepdim=True)
#     else:
#         raise ValueError("unknown aggfunc: {}".format(opt.agg_func))
    similarities.append(row_sim)

# (n_image, n_caption)
similarities = torch.cat(similarities, 1)
print(similarities)

tensor([[1.9431, 1.7703, 2.1892, 2.1531, 2.2121, 1.9591, 2.4017, 1.4632, 1.9711,
         2.7199, 1.9371, 1.7407],
        [2.0120, 1.8523, 2.2039, 2.2174, 2.3001, 1.9903, 2.4443, 1.4378, 2.0332,
         2.7731, 2.0203, 1.8760],
        [1.9344, 1.8250, 2.1832, 2.1645, 2.1897, 1.9731, 2.3717, 1.4712, 1.9309,
         2.6850, 1.9666, 1.7853],
        [1.9939, 1.8438, 2.2456, 2.2681, 2.2908, 2.0080, 2.4695, 1.4769, 2.0152,
         2.7646, 2.0093, 1.8757],
        [2.0942, 1.9954, 2.3747, 2.4108, 2.3422, 2.1091, 2.5893, 1.6596, 2.1577,
         2.8742, 2.1335, 1.9484],
        [2.0105, 1.7982, 2.2342, 2.2858, 2.2660, 1.9675, 2.4553, 1.4045, 1.9990,
         2.7719, 1.9954, 1.8556],
        [1.9837, 1.8423, 2.2284, 2.2312, 2.3162, 2.0031, 2.4751, 1.4330, 2.0578,
         2.8265, 2.0234, 1.8789],
        [1.9276, 1.7240, 2.1259, 2.1465, 2.1960, 1.8922, 2.3520, 1.3686, 1.8991,
         2.6574, 1.9461, 1.7680],
        [1.8869, 1.7628, 2.1234, 2.1250, 2.2119, 1.8845, 2.3515, 1.3353, 1.9227,

In [None]:
# Scratch copy of the second half of the `func_attention` body (this cell has
# no execution count). NOTE(review): `batch_size`, `queryL`, `sourceL`,
# `smooth` and `context` are not defined at module level in the visible part
# of this notebook, so the cell cannot run on its own.
attn = torch.transpose(attn, 1, 2).contiguous()
# --> (batch*queryL, sourceL)
attn = attn.view(batch_size*queryL, sourceL)
attn = nn.Softmax()(attn*smooth)
# --> (batch, queryL, sourceL)
attn = attn.view(batch_size, queryL, sourceL)
# --> (batch, sourceL, queryL)
attnT = torch.transpose(attn, 1, 2).contiguous()

# --> (batch, d, sourceL)
contextT = torch.transpose(context, 1, 2)
# (batch x d x sourceL)(batch x sourceL x queryL)
# --> (batch, d, queryL)
weightedContext = torch.bmm(contextT, attnT)
# --> (batch, queryL, d)
weightedContext = torch.transpose(weightedContext, 1, 2)

In [99]:
def cosine_similarity(tensor1, tensor2, axis=2):
    """Cosine similarity of two tensors along ``axis`` (TensorFlow version).

    The dot product along ``axis`` is divided by the product of the two norms
    along that axis, plus 1e-15 to avoid dividing by zero.
    """
    dot_product = tf.reduce_sum(tensor1 * tensor2, axis=axis)
    norm_product = tf.norm(tensor1, axis=axis) * tf.norm(tensor2, axis=axis)
    return dot_product / (norm_product + 1e-15)

In [100]:
def cosine_similarity_pytorch(x1, x2, dim=1, eps=1e-8):
    """Cosine similarity of ``x1`` and ``x2`` along ``dim`` (PyTorch version).

    The product of the two L2 norms is clamped to at least ``eps`` before the
    division. The result is passed through ``squeeze()``, so every remaining
    size-1 dimension is dropped, not only the reduced one.
    """
    dot_product = (x1 * x2).sum(dim)
    norm_product = x1.norm(2, dim) * x2.norm(2, dim)
    similarity = dot_product / norm_product.clamp(min=eps)
    return similarity.squeeze()

In [77]:
def attention(query, context, smooth=1., axis=2):
    """Attend every query vector over the context vectors of its batch item.

    Dot-product logits between query and context vectors are multiplied by
    ``smooth`` and soft-maxed over the context positions; the result weights
    a sum of the context vectors.

    Args:
        query: (batch, queryL, d) tensor.
        context: (batch, sourceL, d) tensor.
        smooth: multiplier applied to the logits before the softmax.
        axis: unused; kept so existing callers that pass it keep working.

    Returns:
        (batch, queryL, d) tensor of attended context vectors.
    """
    # (batch, queryL, d)(batch, d, sourceL) --> (batch, queryL, sourceL)
    logits = tf.matmul(query, context, transpose_b=True)
    # Softmax over the last axis (context positions). Working on the 3-D
    # tensor directly removes the reshape that needed static Python shapes.
    attn = tf.nn.softmax(logits * smooth, axis=-1)
    # (batch, queryL, sourceL)(batch, sourceL, d) --> (batch, queryL, d)
    weighted_context = tf.matmul(attn, context)

    return weighted_context

In [109]:
# TensorFlow counterpart of the PyTorch text-to-image scoring: one test batch
# is encoded and every sentence is scored against every video.
# NOTE(review): the batch is unpacked into four elements here, while other
# cells unpack three — the dataset structure evidently differed between runs.
videos, sentences, lengths, labels = iter(test_ds).get_next()

videos_repr = moment.video_1(videos)
sentences_repr = moment.sentence_1(sentences)

scores = []

for i in range(moment.batch_size):
    # Keep only the first `lengths[i]` word vectors of the i-th sentence.
    n_word = lengths[i]
    cap_i = sentences_repr[i, :n_word, :]
    # Pair the same sentence with every video in the batch.
    cap_i_expand = tf.broadcast_to(cap_i, [configs.batch_size, n_word, configs.n_features])
    
    attended_videos = attention(cap_i_expand, videos_repr, axis=2)
    similarity = cosine_similarity(attended_videos, cap_i_expand)
#     print(similarity)
    # LogSumExp pooling over the words of the sentence.
    score = tf.reduce_logsumexp(similarity, axis=1)
    scores.append(score)

# Stacking along axis 1 puts the score list for sentence i in column i.
scores = tf.stack(scores, axis=1)
print(scores)

tf.Tensor(
[[1.9430578 1.7702808 2.1892307 2.15313   2.212085  1.9590675 2.4017167
  1.4632156 1.9711251 2.7198946 1.937081  1.7406503]
 [2.011984  1.8523271 2.2039373 2.217395  2.300087  1.9902754 2.4442534
  1.4377826 2.0331593 2.77314   2.0203288 1.8760084]
 [1.9343557 1.8249769 2.1832304 2.164532  2.189677  1.9731306 2.3717
  1.4712443 1.9308549 2.6850007 1.9665947 1.785289 ]
 [1.9938582 1.8438002 2.245632  2.2680688 2.2907639 2.0079536 2.4695222
  1.4768684 2.0151649 2.7645772 2.0092876 1.875711 ]
 [2.0941916 1.9954453 2.374743  2.410759  2.3421936 2.109109  2.5892982
  1.6595821 2.157721  2.8741732 2.1335208 1.9484286]
 [2.010512  1.7982444 2.234227  2.2858405 2.2659795 1.9674993 2.4552782
  1.4045287 1.9990218 2.7719142 1.9954426 1.8555772]
 [1.983668  1.842339  2.2284348 2.2312255 2.3161888 2.0030966 2.475127
  1.4329965 2.0577526 2.826468  2.023432  1.8789468]
 [1.9275945 1.7240446 2.1258788 2.1465027 2.1959543 1.8921885 2.3519516
  1.3685654 1.8990715 2.6574173 1.9461416 1.76

In [5]:
# Score every video of one test batch against every sentence of that batch
# using the model's own scoring helpers.
videos, sentences, lengths, labels = iter(test_ds).get_next()

videos_repr = moment.video_1(videos)
sentences_repr = moment.sentence_1(sentences)

scores_videos = []
for i in range(moment.batch_size):
    # Scores of video i against each sentence j.
    _scores_videos = []
    
    
    for j in range(moment.batch_size):
        coattention_matrix = moment.matrix_cosine_similarity(videos_repr[i], sentences_repr[j])
        scores_video = moment.get_video_score(videos_repr[i], sentences_repr[j], coattention_matrix)

        # Collapse the score tensor of the pair to a single scalar.
        _scores_videos.append(tf.reduce_logsumexp(scores_video))

    scores_videos.append(_scores_videos)

# Row i holds the scores of video i; column j corresponds to sentence j.
scores_videos = tf.stack(scores_videos)

In [6]:
# Display the video-by-sentence score matrix built in the previous cell.
scores_videos

<tf.Tensor: shape=(12, 12), dtype=float32, numpy=
array([[4.8511205, 4.8986025, 4.8925076, 4.874424 , 4.8867793, 4.900085 ,
        4.9110694, 4.9775167, 4.905105 , 4.9022326, 4.9064703, 4.8731217],
       [4.944601 , 4.9338574, 4.8743906, 4.8945284, 4.9631333, 4.922152 ,
        4.9323125, 4.9232564, 4.943099 , 4.952224 , 4.942667 , 4.945979 ],
       [4.853339 , 4.913038 , 4.9157557, 4.860389 , 4.872889 , 4.914023 ,
        4.8827047, 4.960747 , 4.8737316, 4.898944 , 4.892063 , 4.8918996],
       [4.9080496, 4.9252305, 4.9198675, 4.9207854, 4.955092 , 4.946013 ,
        4.927921 , 4.9607573, 4.95114  , 4.9478283, 4.951502 , 4.95385  ],
       [5.010769 , 5.0383425, 5.026829 , 5.0751786, 5.0121384, 5.0301247,
        5.032574 , 5.095777 , 5.0758386, 5.017351 , 5.0531955, 5.0220013],
       [4.936885 , 4.913944 , 4.916531 , 4.9627905, 4.93622  , 4.9168797,
        4.941568 , 4.91589  , 4.921717 , 4.9537983, 4.92158  , 4.9398446],
       [4.88289  , 4.9135957, 4.874546 , 4.8684583, 4.95