In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('..')

In [5]:
from pathlib import Path
import h5py
import torch
import numpy as np
from torch.utils.data import DataLoader
from didemo import Didemo
from model import MCN
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


class MCNDebug(MCN):
    """MCN variant that also returns intermediate activations.

    It is used to cross-check the network against reference outputs:
    ``forward`` returns the language and positive visual embeddings plus the
    pooled sentence encoding and the padded LSTM outputs. The constructor is
    inherited unchanged from ``MCN``.
    """

    def forward(self, padded_query, query_length, visual_pos,
                visual_neg_intra=None, visual_neg_inter=None):
        """Embed one batch and expose the intermediate language activations.

        Args:
            padded_query: batch-first padded query sequences.
            query_length: length of each query; it is used both for packing
                and for indexing (``query_length - 1``), so presumably an
                integer tensor -- TODO confirm against the data loader.
            visual_pos: visual input of the positive moment.
            visual_neg_intra: accepted for signature compatibility; ignored.
            visual_neg_inter: accepted for signature compatibility; ignored.

        Returns:
            Tuple ``(l_embedding, v_embedding_pos, last_output, output)``
            where ``output`` is the LSTM output padded to ``self.max_length``
            and ``last_output`` is its slice at each query's last valid step.
        """
        # The negatives are still handed to `_unpack_visual` to keep the call
        # identical to the parent's, but they are never encoded: their
        # embeddings are not part of the returned tuple.
        visual_pos, _, _ = self._unpack_visual(
            visual_pos, visual_neg_intra, visual_neg_inter)
        batch_size = len(padded_query)

        v_embedding_pos = self.img_encoder(visual_pos)

        packed_query = pack_padded_sequence(
            padded_query, query_length, batch_first=True)
        packed_output, _ = self.sentence_encoder(packed_query)
        output, _ = pad_packed_sequence(packed_output, batch_first=True,
                                        total_length=self.max_length)
        # TODO: try max-pooling
        # Pick, for every sequence, the output at its last non-padded step.
        last_output = output[range(batch_size), query_length - 1, :]
        l_embedding = self.lang_encoder(last_output)
        return (l_embedding, v_embedding_pos, last_output, output)


# Caffe blob name -> name of the PyTorch parameter that receives it.
_CAFFE_BLOB_TO_PARAMETER = {
    'InnerProduct1_0': 'img_encoder.0.weight',
    'InnerProduct1_1': 'img_encoder.0.bias',
    'InnerProduct2_0': 'img_encoder.2.weight',
    'InnerProduct2_1': 'img_encoder.2.bias',
    'LSTM1_0': 'sentence_encoder.weight_ih_l0',
    'LSTM1_1': 'sentence_encoder.bias_hh_l0',
    'LSTM1_2': 'sentence_encoder.weight_hh_l0',
    'embedding_text_0': 'lang_encoder.weight',
    'embedding_text_1': 'lang_encoder.bias',
}
# Blobs whose first axis stacks the four LSTM gates.
_CAFFE_LSTM_BLOBS = ('LSTM1_0', 'LSTM1_1', 'LSTM1_2')
# The file provides a single LSTM bias blob while the PyTorch LSTM has two
# bias vectors; the ported bias goes to `bias_hh_l0` and this one is zeroed.
_ZEROED_PARAMETER = 'sentence_encoder.bias_ih_l0'


def _reorder_caffe_lstm_gates(array):
    "Swap the 3rd and 4th gate blocks along axis 0 (Caffe -> PyTorch order)."
    dim = array.shape[0] // 4
    return np.concatenate(
        [array[:2 * dim, ...],
         array[3 * dim:4 * dim, ...],
         array[2 * dim:3 * dim, ...]],
        axis=0)


def _read_caffe_weights(weights_h5):
    "Read an HDF5 weight dump into a {PyTorch parameter name: array} dict."
    ported_weights = {}
    # Read-only: loading weights must never create or modify the file.
    with h5py.File(weights_h5, 'r') as f:
        for blob_name, dataset in f.items():
            if blob_name not in _CAFFE_BLOB_TO_PARAMETER:
                # Same exception type the former bare `raise` produced, but
                # with a message naming the offending blob.
                raise RuntimeError(
                    'Unexpected blob {!r} in {}'.format(blob_name, weights_h5))
            array = dataset[:]
            if blob_name in _CAFFE_LSTM_BLOBS:
                array = _reorder_caffe_lstm_gates(array)
            ported_weights[_CAFFE_BLOB_TO_PARAMETER[blob_name]] = array
    return ported_weights


def _port_caffe_weights(net, weights_h5):
    "Overwrite the parameters of `net` with the weights stored in `weights_h5`."
    ported_weights = _read_caffe_weights(weights_h5)
    for name, parameter in net.named_parameters():
        if name == _ZEROED_PARAMETER:
            parameter.data.zero_()
            continue
        if name not in ported_weights:
            raise KeyError(
                'No Caffe blob available for parameter {!r}'.format(name))
        parameter.data = torch.from_numpy(ported_weights[name])
    return net


def load_mcn_weights_from_caffe(weights_h5, visual_size=8194,
                                text_dim=300, max_length=50):
    """Build an ``MCN`` and initialise it from a Caffe weight dump.

    Args:
        weights_h5: path of the HDF5 file holding one dataset per Caffe blob.
        visual_size: forwarded to ``MCN`` as ``visual_size``.
        text_dim: forwarded to ``MCN`` as ``lang_size``.
        max_length: forwarded to ``MCN`` as ``max_length``.

    Returns:
        The ``MCN`` instance with every parameter replaced by the ported
        weights; ``sentence_encoder.bias_ih_l0`` is set to zero.

    Raises:
        RuntimeError: the file contains a blob with an unknown name.
        KeyError: a network parameter has no corresponding blob.
    """
    net = MCN(visual_size=visual_size,
              lang_size=text_dim,
              max_length=max_length)
    return _port_caffe_weights(net, weights_h5)


def load_mcn_weights_from_caffe_debug(
        weights_h5, visual_size=8194, text_dim=300, max_length=50):
    """Same as ``load_mcn_weights_from_caffe`` but builds an ``MCNDebug``.

    Args:
        weights_h5: path of the HDF5 file holding one dataset per Caffe blob.
        visual_size: forwarded to ``MCNDebug`` as ``visual_size``.
        text_dim: forwarded to ``MCNDebug`` as ``lang_size``.
        max_length: forwarded to ``MCNDebug`` as ``max_length``.

    Returns:
        The ``MCNDebug`` instance with every parameter replaced by the ported
        weights; ``sentence_encoder.bias_ih_l0`` is set to zero.

    Raises:
        RuntimeError: the file contains a blob with an unknown name.
        KeyError: a network parameter has no corresponding blob.
    """
    net = MCNDebug(visual_size=visual_size,
                   lang_size=text_dim,
                   max_length=max_length)
    return _port_caffe_weights(net, weights_h5)

# --- Evaluation inputs -------------------------------------------------------
EVAL_BATCH_SIZE = 1
RAW_PATH = Path('data/raw')

# One feature file per visual cue, plus the validation annotation list.
rgb_feat_path = RAW_PATH.joinpath('average_fc7.h5')
flow_feat_path = RAW_PATH.joinpath('average_global_flow.h5')
val_list_path = RAW_PATH.joinpath('val_data.json')

# Cue descriptions in the form passed to `Didemo(..., cues=...)`.
rgb_cue = dict(rgb=dict(file=rgb_feat_path))
flow_cue = dict(flow=dict(file=flow_feat_path))

# --- Reference material from the original implementation ---------------------
# HDF5 weight dumps, one per stream.
rgb_weights = '../localizing-moments/rgb-weights.hdf5'
flow_weights = '../localizing-moments/flow-weights.hdf5'

# HDF5 files with per-sample blobs (the literals are split only for
# readability; the resulting strings are unchanged).
rgb_data = (
    '../localizing-moments/results/'
    'sample_blobs_rgb_iccv_release_feature_process_context_'
    'recurrent_embedding_lfTrue_dv0.3_dl0.0_nlv2_nlllstm_no_embed_'
    'edl1000-100_edv500-100_pmFalse_losstriplet_lwInter0.2_'
    'iter_30000.caffemodel_val.hdf5'
)
flow_data = (
    '../localizing-moments/results/'
    'sample_blobs_flow_iccv_release_feature_process_context_'
    'recurrent_embedding_lfTrue_dv0.3_dl0.0_nlv2_nlllstm_no_embed_'
    'edl1000-100_edv500-100_pmFalse_losstriplet_lwInter0.2_'
    'iter_30000.caffemodel_val.hdf5'
)

In [3]:
# Validation split, loaded once per visual cue.
val_rgb = Didemo(val_list_path, cues=rgb_cue, test=True)
val_flow = Didemo(val_list_path, cues=flow_cue, test=True)
# val_dataset.collate_test_data

# annotation_id -> position in the dataset; both cues must agree so that a
# single index addresses the same annotation in either dataset.
keys_rgb = dict(
    (sample['annotation_id'], position)
    for position, sample in enumerate(val_rgb.metadata))
keys_flow = dict(
    (sample['annotation_id'], position)
    for position, sample in enumerate(val_flow.metadata))
assert keys_rgb == keys_flow

# Probe the first sample of each dataset to read the feature dimensions.
feat_rgb, feat_flow = val_rgb[0], val_flow[0]
# The language feature size sits one axis further when `val_rgb.eval` is set.
# NOTE(review): the explicit `== True` comparison is kept on purpose; the
# type of `eval` is not visible from this notebook.
text_dim = feat_rgb[0].shape[2 if val_rgb.eval == True else 1]
rgb_dim = feat_rgb[2]['rgb'].shape[0]
flow_dim = feat_flow[2]['flow'].shape[0]
max_length = feat_rgb[0].shape[0]

rgb_setup = {'visual_size': rgb_dim, 'lang_size': text_dim,
             'max_length': max_length}
flow_setup = {'visual_size': flow_dim, 'lang_size': text_dim,
              'max_length': max_length}

# Only the visual size is forwarded; the loader's own defaults are used for
# the language size and the maximum length.
net_rgb = load_mcn_weights_from_caffe_debug(
    rgb_weights, visual_size=rgb_dim)
net_flow = load_mcn_weights_from_caffe_debug(
    flow_weights, visual_size=flow_dim)

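Feed the same data through the ported network and cross-check every intermediate output against the blobs stored from the original run.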

In [6]:
# Reference blobs of the flow stream: {sample id: {blob name: array}}.
# Opened read-only so the reference file can never be created or modified.
with h5py.File(flow_data, 'r') as f:
    original_data = {
        int(sample_id): {blob: values[:] for blob, values in group.items()}
        for sample_id, group in f.items()
    }

# Number of time steps in the reference text/LSTM blobs. The reference keeps
# the valid steps at the END of this window, while the dataset used here keeps
# them at the START, hence the mirrored slices below.
CAFFE_MAX_LENGTH = 50

net_rgb.eval()
net_flow.eval()
# Pure comparison pass: no gradients are needed.
with torch.no_grad():
    # NOTE: the loop variable `i` is read again by a later debugging cell.
    for i in original_data.keys():
        # The flow dataset is indexed with its own key table.
        index = keys_flow[i]
        # pre-processing
        rst = val_flow.collate_test_data([val_flow[index]])

        # visual input
        input_visual = rst[2]['flow'].detach().numpy()
        input_visual_ = original_data[i]['Concat1']
        np.testing.assert_almost_equal(input_visual, input_visual_)

        # text input: swap the first two axes to match the reference layout,
        # then compare the valid steps only (the padding is placed on
        # opposite sides, so a full-array comparison cannot match).
        text_data = rst[0].detach().numpy()
        text_data = text_data.transpose((1, 0, 2))
        text_data_ = original_data[i]['text_data']
        seq_len = rst[1][0].item()
        np.testing.assert_almost_equal(
            text_data[0:seq_len, :, :],
            text_data_[CAFFE_MAX_LENGTH - seq_len:, :, :])

        # visual embedding
        (embedding_text, embedding_visual,
         encoded_sentence, lstm_output) = net_flow(*rst)
        embedding_visual = embedding_visual.detach().numpy()
        embedding_visual_ = original_data[i]['embedding_visual']
        np.testing.assert_almost_equal(embedding_visual, embedding_visual_)

        # lstm output
        lstm_output = lstm_output.detach().numpy()
        lstm_output = lstm_output[:, 0:seq_len, :]
        lstm_output_ = original_data[i]['LSTM1'].transpose((1, 0, 2))
        lstm_output_ = lstm_output_[:, CAFFE_MAX_LENGTH - seq_len:, :]
        # NOTE: reduce tolerance to 6 decimals due to mismatch of 0.02%
        np.testing.assert_almost_equal(lstm_output, lstm_output_, decimal=6)

        # sentence encoded
        encoded_sentence = encoded_sentence.detach().numpy()
        encoded_sentence_ = original_data[i]['Reshape1']
        np.testing.assert_almost_equal(encoded_sentence, encoded_sentence_)

        # language embedding
        embedding_text = embedding_text.detach().numpy()
        embedding_text_ = original_data[i]['embedding_text']
        np.testing.assert_almost_equal(embedding_text, embedding_text_)

[debugging] Force-reload the `model` module

In [5]:
# Debugging aid: re-execute `model.py` so that edits are picked up without
# restarting the kernel.
# Gotcha: `importlib.reload` only rebinds the attributes of the `model` module
# object. Names bound earlier with `from model import MCN` (and classes that
# subclass them) still refer to the previous class objects until the cells
# that created them are run again.
import model
import importlib
importlib.reload(model)

<module 'model' from '/mnt/ilcompf9d1/user/escorcia/moments-retrieval/model.py'>

[debugging] Check blob names

In [None]:
# Relies on `i` left over from the cross-check loop: lists the blob names
# stored for the last sample that loop reached.
original_data[i].keys()