# 1. Sentence retrieval

In [1]:
import numpy as np
import torch
from model import SMCN
from didemo import DidemoSMCN, TemporalFeatures

# Keyword arguments used to build the SMCN model.
SMCN_PRM = {
    'visual_size': 2048,
    'lang_size': 300,
    'embedding_size': 100,
    'dropout': 0.3,
    'max_length': 50,
    'visual_hidden': 500,
    'lang_hidden': 1000,
}
# Checkpoint restored into the model (its 'state_dict' entry is used).
PTH_FILE = 'data/interim/smcn_12/a/3_checkpoint.pth.tar'
# Annotation list handed to the DidemoSMCN dataset.
VAL_LIST_PATH = 'data/raw/val_data.json'
# Feature file referenced by the 'rgb' cue below.
RGB_FEAT_PATH = 'data/interim/didemo/resnet152/320x240_max.h5'
# Remaining keyword arguments for the DidemoSMCN dataset.
args = {
    'test': False,
    'context': False,
    'loc': TemporalFeatures.NONE,
    'cues': {'rgb': {'file': RGB_FEAT_PATH}},
}

def load_model(smcn_prm, cuda=False, filename=None):
    """Build an SMCN in evaluation mode, optionally restoring a checkpoint.

    Args:
        smcn_prm (dict): keyword arguments forwarded to ``SMCN``.
        cuda (bool): move the model to the GPU when True.
        filename (str, optional): path of a checkpoint whose ``'state_dict'``
            entry is loaded into the model. Nothing is loaded when None.

    Returns:
        The ``SMCN`` instance, switched to ``eval()`` mode.
    """
    model = SMCN(**smcn_prm)
    model.eval()
    if cuda:
        model.cuda()

    if filename is not None:
        # Deserialize the tensors onto the CPU so a checkpoint written from a
        # GPU run can still be restored on a machine without CUDA;
        # load_state_dict then copies the values into the model's parameters.
        snapshot = torch.load(filename, map_location='cpu')
        model.load_state_dict(snapshot['state_dict'])
    return model

def torchify_and_collate(data, cuda=True):
    """Turn one sample into tensor(s) with a leading axis of size one.

    Args:
        data: a ``np.ndarray``, an ``int``, or a dict whose values are
            themselves supported inputs.
        cuda (bool): move the resulting tensor(s) to the GPU when True.

    Returns:
        For an array, a tensor with an extra leading dimension of size 1.
        For an int, a tensor holding that single value.
        For a dict, a dict with the same keys and converted values.

    Raises:
        TypeError: if ``data`` is of any other type.
    """
    if isinstance(data, dict):
        return {k: torchify_and_collate(v, cuda=cuda)
                for k, v in data.items()}

    if isinstance(data, np.ndarray):
        # unsqueeze_ only edits the tensor's shape metadata; the NumPy array
        # sharing the memory keeps its own shape.
        output = torch.from_numpy(data).unsqueeze_(0)
    elif isinstance(data, int):
        output = torch.tensor([data])
    else:
        # A bare `raise` here has no active exception to re-raise and hides
        # the actual problem, so report the offending type explicitly.
        raise TypeError(
            'Unsupported data type: {}'.format(type(data).__name__))
    return output.cuda() if cuda else output

In [2]:
def honorable_cev(cuda=False, max_moments=10):
    """Rank all validation sentences against the visual features of moments.

    For each processed moment, every sentence in the validation set is scored
    with ``model.predict`` and the position of the moment's own index in the
    sorted scores is recorded.

    Note: this disables autograd globally via ``torch.set_grad_enabled(False)``
    and leaves it disabled after returning.

    Args:
        cuda (bool): run the model and the inputs on the GPU when True.
        max_moments (int, optional): number of moments to process before
            stopping; None processes the whole dataset.

    Returns:
        list: one 0-dim tensor per processed moment holding the 0-based
        position of ``moment_i_ind`` in the ascending ordering of scores
        (assumes the sample's first field is its position in the dataset --
        TODO confirm).

    Raises:
        NotImplementedError: if ``model.predict`` reports that its scores are
            similarities; that ranking is not implemented yet.
    """
    from itertools import islice

    torch.set_grad_enabled(False)
    model = load_model(SMCN_PRM, filename=PTH_FILE, cuda=cuda)
    val_dataset = DidemoSMCN(VAL_LIST_PATH, **args)
    descriptions_rank = []
    # islice stops after `max_moments` samples without fetching an extra one.
    moments = (val_dataset if max_moments is None
               else islice(val_dataset, max_moments))
    for moment_i_data in moments:
        # get visual representation of a moment
        # TODO (critical): make it deterministic
        moment_i_ind = moment_i_data[0]
        moment_i_visual_rep = torchify_and_collate(moment_i_data[4], cuda=cuda)
        score_wrt_all_sentences = []
        for moment_j_data in val_dataset:
            # get text representation of sentence; it must follow the same
            # device choice as the model and the visual features
            sentence_j_rep = torchify_and_collate(moment_j_data[2], cuda=cuda)
            sentence_j_length = torchify_and_collate(moment_j_data[3], cuda=cuda)
            score_j, is_similarity = model.predict(
                sentence_j_rep, sentence_j_length, moment_i_visual_rep)
            score_wrt_all_sentences.append(score_j)

        score_wrt_all_sentences = torch.cat(score_wrt_all_sentences)
        if is_similarity:
            raise NotImplementedError('WIP :P')
        # Scores are not similarities here, so rank them in ascending order.
        _, ranked_ind = score_wrt_all_sentences.sort()
        descriptions_rank.append(ranked_ind.eq(moment_i_ind).nonzero()[0, 0])

    # TODO (critical): compute median and mean rank of description
    return descriptions_rank

# 2. Profiling

In [3]:
# Load the line_profiler IPython extension, which provides the %lprun magic.
%load_ext line_profiler

Where is the bottleneck?

In [6]:
%lprun -s -r -f honorable_cev honorable_cev()

__init__  214.74 s


<line_profiler.LineProfiler at 0x7f7f7734d828>

Timer unit: 1e-06 s

Total time: 527.86 s
File: <ipython-input-2-3011451fa9d9>
Function: honorable_cev at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def honorable_cev():
     2         1         37.0     37.0      0.0      torch.set_grad_enabled(False)
     3         1     150554.0 150554.0      0.0      model = load_model(SMCN_PRM, filename=PTH_FILE)
     4         1  219278103.0 219278103.0     41.5      val_dataset = DidemoSMCN(VAL_LIST_PATH, **args)
     5         1          2.0      2.0      0.0      descriptions_rank = []
     6         1          1.0      1.0      0.0      counter = 0
     7        10       5979.0    597.9      0.0      for moment_i_data in val_dataset:
     8                                                   # get visual representation of a moment
     9                                                   # TODO (critical): make it deterministic
    10        10         14.0      1.4    

Identifying the bottleneck after switching to GPU

In [None]:
# Line-profile honorable_cev with cuda=True (-f: function to profile,
# -s: strip zero-hit entries, -r: return the LineProfiler object).
%lprun -s -r -f honorable_cev honorable_cev(True)

# 3. Batched implementation

TODO. It was cumbersome and we did not have time 😓

# 4. Moment retrieval

In [1]:
import numpy as np
import torch
from model import SMCN
from didemo import DidemoSMCNRetrieval
from didemo import RetrievalMode, TemporalFeatures

# Keyword arguments used to build the SMCN model.
SMCN_PRM = {
    'visual_size': 2048,
    'lang_size': 300,
    'embedding_size': 100,
    'dropout': 0.3,
    'max_length': 50,
    'visual_hidden': 500,
    'lang_hidden': 1000,
}
# Checkpoint restored into the model (its 'state_dict' entry is used).
PTH_FILE = 'data/interim/smcn_12/a/3_checkpoint.pth.tar'
# Annotation list handed to the DidemoSMCNRetrieval dataset.
VAL_LIST_PATH = 'data/raw/val_data_wwa.json'
# Feature file referenced by the 'rgb' cue below.
RGB_FEAT_PATH = 'data/interim/didemo/resnet152/320x240_max.h5'
# Remaining keyword arguments for the DidemoSMCNRetrieval dataset.
DATASET_PRM = {
    'context': False,
    'loc': TemporalFeatures.NONE,
    'cues': {'rgb': {'file': RGB_FEAT_PATH}},
}

def load_model(smcn_prm, cuda=False, filename=None):
    """Build an SMCN in evaluation mode, optionally restoring a checkpoint.

    Args:
        smcn_prm (dict): keyword arguments forwarded to ``SMCN``.
        cuda (bool): move the model to the GPU when True.
        filename (str, optional): path of a checkpoint whose ``'state_dict'``
            entry is loaded into the model. Nothing is loaded when None.

    Returns:
        The ``SMCN`` instance, switched to ``eval()`` mode.
    """
    model = SMCN(**smcn_prm)
    model.eval()
    if cuda:
        model.cuda()

    if filename is not None:
        # Deserialize the tensors onto the CPU so a checkpoint written from a
        # GPU run can still be restored on a machine without CUDA;
        # load_state_dict then copies the values into the model's parameters.
        snapshot = torch.load(filename, map_location='cpu')
        model.load_state_dict(snapshot['state_dict'])
    return model

def torchify_and_collate(data, unsqueeze=False, cuda=False):
    """Convert one sample (array, int or dict of those) into tensor(s).

    Args:
        data: a ``np.ndarray``, an ``int``, or a dict whose values are
            themselves supported inputs.
        unsqueeze (bool): add a leading dimension of size 1 to tensors built
            from arrays. Ints always become a one-element tensor.
        cuda (bool): move the resulting tensor(s) to the GPU when True.

    Returns:
        A tensor, or a dict with the same keys and converted values.

    Raises:
        TypeError: if ``data`` is of any other type.
    """
    if isinstance(data, dict):
        # Forward the options: without them the values of a dict would stay
        # on the CPU (and un-batched) regardless of what the caller asked for.
        return {k: torchify_and_collate(v, unsqueeze, cuda)
                for k, v in data.items()}

    if isinstance(data, np.ndarray):
        output = torch.from_numpy(data)
        if unsqueeze:
            output = output.unsqueeze(0)
    elif isinstance(data, int):
        output = torch.tensor([data])
    else:
        # A bare `raise` here has no active exception to re-raise and hides
        # the actual problem, so report the offending type explicitly.
        raise TypeError(
            'Unsupported data type: {}'.format(type(data).__name__))
    return output.cuda() if cuda else output

In [2]:
def honorable_cev(cuda=False, max_descriptions=10):
    """Score descriptions against the segments of every validation video.

    The dataset is iterated in ``DESCRIPTION_TO_MOMENT`` mode in the outer
    loop and temporarily switched to ``VIDEO_TO_DESCRIPTION`` mode to visit
    all videos for each description. The scores returned by ``model.predict``
    fill one row of the prediction matrix per description.

    Note: this disables autograd globally via ``torch.set_grad_enabled(False)``
    and leaves it disabled after returning.

    Args:
        cuda (bool): run the model and the inputs on the GPU when True.
        max_descriptions (int, optional): number of descriptions to process
            before stopping; None processes the whole dataset.

    Returns:
        torch.Tensor: matrix of shape ``(M_l, N_c)`` with one row per
        description and ``N_s`` columns per video. It is allocated with
        ``torch.empty``, so rows of descriptions that were not processed hold
        uninitialized values.
    """
    from itertools import islice

    torch.set_grad_enabled(False)
    model = load_model(SMCN_PRM, cuda, PTH_FILE)
    val_dataset = DidemoSMCNRetrieval(VAL_LIST_PATH, **DATASET_PRM)
    # Setup prediction matrix
    val_dataset.mode = RetrievalMode.VIDEO_TO_DESCRIPTION
    # TODO (extension): future work once we are set with DiDeMo
    N_s = len(val_dataset.segments)
    N_c = len(val_dataset) * N_s
    val_dataset.mode = RetrievalMode.DESCRIPTION_TO_MOMENT
    M_l = len(val_dataset)
    prediction_matrix = torch.empty(M_l, N_c)

    # islice stops after `max_descriptions` samples without fetching an
    # extra one.
    descriptions = (val_dataset if max_descriptions is None
                    else islice(val_dataset, max_descriptions))
    for moment_i_data in descriptions:
        # get text representation of the query description
        moment_i_ind = moment_i_data[0]
        sentence_i_rep = torchify_and_collate(moment_i_data[1], True, cuda)
        sentence_i_length = torchify_and_collate(moment_i_data[2], False, cuda)

        # Switch mode to iterate over videos
        val_dataset.mode = RetrievalMode.VIDEO_TO_DESCRIPTION
        try:
            for video_j_data in val_dataset:
                # get visual representation of all segments of the video
                video_j_ind = video_j_data[0]
                video_j_visual_rep = torchify_and_collate(
                    video_j_data[1], False, cuda)
                assert N_s == video_j_visual_rep['mask'].shape[0]
                # TODO (debug): double check that predict works here
                # 1st check, apparently we are good to go. let's try out!
                score_ij, _ = model.predict(
                    sentence_i_rep, sentence_i_length, video_j_visual_rep)
                ind_start = video_j_ind * N_s
                ind_end = (video_j_ind + 1) * N_s
                prediction_matrix[moment_i_ind, ind_start:ind_end] = score_ij
                # TODO (critical): block-out segments in videos without
                # visual feature e.g. a video only has 5 chunks, similarity
                # for the 6-th should be 0
                # TODO (debug): hash video-id and collect them
        finally:
            # Restore the outer-loop mode even when scoring fails, so the
            # dataset is never left in the video-iteration mode.
            val_dataset.mode = RetrievalMode.DESCRIPTION_TO_MOMENT

    return prediction_matrix