# Exit-poster

## MCN to encode a description
- ~~load MCN weights from Caffe~~
- ~~unit-test of forward-pass~~

In [1]:
import sys
sys.path.append('..')

import h5py
import numpy as np
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

torch.set_grad_enabled(False)

class MCNRetrievalFromCaffe(nn.Module):
    """Language branch of MCN initialised from weights trained with Caffe.

    A padded sequence of word vectors is encoded with a single-layer LSTM and
    the hidden state of the last valid time step of each sequence is projected
    by a linear layer.

    Args:
        lang_size: dimension of each input word vector.
        embedding_size: dimension of the returned embedding.
        lang_hidden: number of hidden units of the LSTM.
        max_length: length to which the unpacked LSTM output is padded.
    """

    # HDF5 dataset name -> (PyTorch parameter name, reorder LSTM gate blocks?).
    # The visual branch ('InnerProduct*' datasets) is deliberately not ported
    # because the visual features are already available in a "database".
    _CAFFE_TO_TORCH = {
        'LSTM1_0': ('sentence_encoder.weight_ih_l0', True),
        'LSTM1_2': ('sentence_encoder.weight_hh_l0', True),
        'LSTM1_1': ('sentence_encoder.bias_hh_l0', True),
        'embedding_text_0': ('lang_encoder.weight', False),
        'embedding_text_1': ('lang_encoder.bias', False),
    }

    def __init__(self, lang_size=300, embedding_size=100, lang_hidden=1000,
                 max_length=50):
        super(MCNRetrievalFromCaffe, self).__init__()
        self.embedding_size = embedding_size
        self.max_length = max_length

        self.sentence_encoder = nn.LSTM(
            lang_size, lang_hidden, batch_first=True)
        self.lang_encoder = nn.Linear(lang_hidden, embedding_size)

    def forward(self, padded_query, query_length):
        """Return the embedding of each query in the batch.

        Args:
            padded_query: batch-first tensor of padded word vectors, i.e.
                (batch, time, lang_size) given ``batch_first=True``.
            query_length: tensor with the number of valid time steps of each
                query. NOTE: with its default arguments,
                ``pack_padded_sequence`` expects these lengths sorted in
                decreasing order.

        Returns:
            Tensor of shape (batch, embedding_size).
        """
        # Keep the same signature but does not use neg inputs
        batch_size = len(padded_query)

        # Packing makes the LSTM skip the padded time steps.
        packed_query = pack_padded_sequence(
            padded_query, query_length, batch_first=True)
        packed_output, _ = self.sentence_encoder(packed_query)
        output, _ = pad_packed_sequence(packed_output, batch_first=True,
                                        total_length=self.max_length)
        # Pick the LSTM output at the last valid step of every sequence.
        last_output = output[range(batch_size), query_length - 1, :]
        return self.lang_encoder(last_output)

    @staticmethod
    def _reorder_caffe_gates(array):
        """Swap the third and fourth quarter of ``array`` along axis 0.

        PyTorch stacks the LSTM gate blocks as (input, forget, cell, output);
        the Caffe weights presumably store the last two blocks the other way
        around, hence the swap.
        """
        dim = array.shape[0] // 4
        return np.concatenate([array[:2 * dim, ...],
                               array[3 * dim:4 * dim, ...],
                               array[2 * dim:3 * dim, ...]], axis=0)

    def load_caffe_weights(self, filename):
        """Overwrite the parameters with the weights stored in an HDF5 file.

        Args:
            filename: path of the HDF5 file with the Caffe weights. The
                datasets read are the keys of ``_CAFFE_TO_TORCH``.

        Raises:
            KeyError: if the file lacks the dataset of any parameter other
                than ``sentence_encoder.bias_ih_l0``.
        """
        ported_weights = {}
        # Open read-only explicitly: loading weights must never create or
        # modify the file, whatever the default mode of the installed h5py.
        with h5py.File(filename, 'r') as f:
            for caffe_name, (name, is_lstm) in self._CAFFE_TO_TORCH.items():
                if caffe_name not in f:
                    continue
                values = f[caffe_name][:]
                if is_lstm:
                    values = self._reorder_caffe_gates(values)
                ported_weights[name] = values

        # Set parameters
        for name, parameter in self.named_parameters():
            if name == 'sentence_encoder.bias_ih_l0':
                # The file provides a single LSTM bias, which is loaded into
                # bias_hh_l0; the second PyTorch bias is zeroed so that the
                # sum of both equals the ported bias.
                parameter.data.zero_()
                continue
            if name not in ported_weights:
                raise KeyError(
                    'No Caffe weights found in {} for parameter {}'.format(
                        filename, name))
            parameter.data = torch.from_numpy(ported_weights[name])
# xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx

In [2]:
from didemo import sentences_to_words
from glove import RecurrentEmbedding

class LanguageRepresentationMCN(object):
    """Turn a query string into a fixed-size, zero-padded array of features.

    Args:
        max_words: number of rows of the returned feature array; longer
            queries are truncated to this many words.
        glove_file: forwarded to ``RecurrentEmbedding``.
        vocab_file: forwarded to ``RecurrentEmbedding`` as ``vocab_file``.
    """

    def __init__(self, max_words, glove_file, vocab_file):
        self.max_words = max_words
        self.rec_embedding = RecurrentEmbedding(
            glove_file, vocab_file=vocab_file)
        # Width of every row of the feature array.
        self.dim = self.rec_embedding.embedding.glove_dim

    def __call__(self, query_str):
        """Return ``(feature, length)`` for ``query_str``.

        ``feature`` is a float32 array of shape (max_words, dim). Row ``i``
        holds the ``vocab_dict`` entry of the i-th word; rows of words missing
        from ``vocab_dict`` and rows past the end of the query stay at zero.
        ``length`` is the number of words, capped at ``max_words``.
        """
        words = sentences_to_words([query_str])
        num_words = min(len(words), self.max_words)
        padded = np.zeros((self.max_words, self.dim), dtype=np.float32)
        # zip stops at the shorter of both, i.e. after ``num_words`` rows;
        # each ``row`` is a view, so writing to it fills ``padded`` in place.
        for row, token in zip(padded, words[:num_words]):
            if token not in self.rec_embedding.vocab_dict:
                continue
            row[:] = self.rec_embedding.vocab_dict[token]
        return padded, num_words

In [None]:
# Encode the description of the first entry of the RGB query file with the
# PyTorch model and show the resulting embedding.
import json

val_json = '../data/raw/val_data.json'
filename = '../data/interim/mcn/queries_val_rgb.hdf5'

# Index the validation annotations by their 'annotation_id'.
with open(val_json, 'r') as f:
    val_data = {i['annotation_id']: i
                for i in json.load(f)}

# Setup language processor
max_words = 50
glove_file = '../data/raw/glove.6B.300d.txt'
glove_vocab = '../data/raw/vocab_glove_complete.txt'
cev = LanguageRepresentationMCN(max_words, glove_file, glove_vocab)
# Setup MCN
rgb_weights = '../../localizing-moments/rgb-weights.hdf5'
aja = MCNRetrievalFromCaffe()
aja.load_caffe_weights(rgb_weights)

with h5py.File(filename, 'r') as f:
    for k, v in f.items():
        # Array stored in the file under this key (not used further here).
        cole = v[:]
        # The key, cast to int, is used as the annotation id in val_data.
        padded_query, len_query = cev(val_data[int(k)]['description'])
        # Add a leading batch dimension of size one.
        padded_query = torch.from_numpy(padded_query).unsqueeze(0)
        len_query = torch.tensor([len_query])
        pls = aja(padded_query, len_query)
        # Only the first entry of the file is processed.
        break
        
# Drop the singleton batch dimension (in place) and display as NumPy array.
pls.squeeze_().numpy()

[debug] Validate that we can recreate the queries with PyTorch

In [3]:
# Check, for both the RGB and the flow weights, that the embedding computed
# with PyTorch for every entry of the query file matches the stored array.
import json

val_json = '../data/raw/val_data.json'
filename = '../data/interim/mcn/queries_val_rgb.hdf5'

# Index the validation annotations by their 'annotation_id'.
with open(val_json, 'r') as f:
    val_data = {i['annotation_id']: i
                for i in json.load(f)}

# Setup language processor
max_words = 50
glove_file = '../data/raw/glove.6B.300d.txt'
glove_vocab = '../data/raw/vocab_glove_complete.txt'
cev = LanguageRepresentationMCN(max_words, glove_file, glove_vocab)
# Setup MCN
rgb_weights = '../../localizing-moments/rgb-weights.hdf5'
aja = MCNRetrievalFromCaffe()
aja.load_caffe_weights(rgb_weights)

# RGB model: compare against every array stored in the RGB query file.
with h5py.File(filename, 'r') as f:
    for k, v in f.items():
        # Stored array used as the reference value.
        cole = v[:]
        # The key, cast to int, is used as the annotation id in val_data.
        padded_query, len_query = cev(val_data[int(k)]['description'])
        # Add a leading batch dimension of size one.
        padded_query = torch.from_numpy(padded_query).unsqueeze(0)
        len_query = torch.tensor([len_query])
        pls = aja(padded_query, len_query)
        # Raises AssertionError on the first entry that does not match.
        np.testing.assert_array_almost_equal(pls[0, :].detach().numpy(), cole)
        
# Flow model: same check with the flow weights and the flow query file; the
# language processor `cev` and the annotations `val_data` are reused.
filename = '../data/interim/mcn/queries_val_flow.hdf5'
flow_weights = '../../localizing-moments/flow-weights.hdf5'
aja = MCNRetrievalFromCaffe()
aja.load_caffe_weights(flow_weights)

with h5py.File(filename, 'r') as f:
    for k, v in f.items():
        cole = v[:]
        padded_query, len_query = cev(val_data[int(k)]['description'])
        padded_query = torch.from_numpy(padded_query).unsqueeze(0)
        len_query = torch.tensor([len_query])
        pls = aja(padded_query, len_query)
        np.testing.assert_array_almost_equal(pls[0, :].detach().numpy(), cole)

Check out the Flask server in `moments_retrieval_demo.py`