In [1]:
%load_ext autoreload
%autoreload 2

# Imports

In [7]:
import os

import math

import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

# from tqdm import tqdm
from tqdm import tqdm_notebook as tqdm
from easydict import EasyDict as edict

from MultiheadAttention import MultiheadAttention


MULTIHEADATTENTION_HEADS = 1

# Modelo

In [3]:
class SE(nn.Module):
    def __init__(self, input_dim, output_dim, r = 16):
        super(SE, self).__init__()

        self.r = r
        self.scale = int(input_dim/self.r)
        self.output_dim = output_dim

        self.fc1 = nn.Linear(input_dim, self.scale)
        self.relu = nn.ReLU(inplace=True)
        self.fc2 = nn.Linear(self.scale, output_dim)
        self.sigmoid = nn.Sigmoid()
        self.norm = nn.InstanceNorm1d(1, affine = False)

        torch.nn.init.xavier_uniform_(self.fc1.weight)
        torch.nn.init.xavier_uniform_(self.fc2.weight)

    def forward(self, x):
        tgt_len, bsz, embed_dim = x.size()
        out = self.relu(self.fc1(x.view(bsz, embed_dim)))
        out = self.fc2(out)
        out = self.norm(out.unsqueeze(1))
        out = self.sigmoid(out)     

        return out, out.view(bsz, 1, self.output_dim)

In [38]:
def squeeze(weights):
    return weights.mean(dim=1)

class SEAttend(nn.Module):
    def __init__(self, in_dim=256, out_dim=256, squeeze_dim=16):
        super().__init__()
        
        self.in_dim = in_dim # Cantidad de neuronas capa anterior
        self.out_dim = out_dim # Cantidad de neuronas capa siguiente (mascara sobre estas)
        
        self.excite = nn.Sequential(
            nn.Linear(out_dim, squeeze_dim),
            nn.ReLU(),
            nn.Linear(squeeze_dim, out_dim),
            nn.Sigmoid(),
        )
        
        self.attend = MultiheadAttention(
            in_dim,
            MULTIHEADATTENTION_HEADS,
            dropout=0.1,
        )
    
    def forward(self, qst, weights):
        bsz = qst.size(0)
        
        scale = self.squeeze(weights) # scale: [out_dim (256)]
        scale = self.excite(scale.unsqueeze(0)) # scale: [1, out_dim (256)]
        weights = weights * scale.t() # weights: [out_dim, in_dim]
        weights = weights.unsqueeze(1).expand(self.out_dim, bsz, self.in_dim) # weights: [out_dim, bsz, in_dim]
        
        _, attn_output_weights = self.attend(qst, weights, weights)
        
        # Retorno None para mantener el formato
        return None, attn_output_weights
    
    @staticmethod
    def squeeze(weights):
        return squeeze(weights)

In [44]:
class ConvInputModel(nn.Module):
    def __init__(self):
        super(ConvInputModel, self).__init__()
        
        self.conv1 = nn.Conv2d(3, 24, 3, stride=2, padding=1)
        self.batchNorm1 = nn.BatchNorm2d(24)
        self.conv2 = nn.Conv2d(24, 24, 3, stride=2, padding=1)
        self.batchNorm2 = nn.BatchNorm2d(24)
        self.conv3 = nn.Conv2d(24, 24, 3, stride=2, padding=1)
        self.batchNorm3 = nn.BatchNorm2d(24)
        self.conv4 = nn.Conv2d(24, 24, 3, stride=2, padding=1)
        self.batchNorm4 = nn.BatchNorm2d(24)
        
    def forward(self, img):
        """convolution"""
        x = self.conv1(img)
        x = self.batchNorm1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = self.batchNorm2(x)
        x = F.relu(x)
        x = self.conv3(x)
        x = self.batchNorm3(x)
        x = F.relu(x)
        x = self.conv4(x)
        x = self.batchNorm4(x)
        x = F.relu(x)
        return x


class QuestionEmbedModel(nn.Module):
    def __init__(self, in_size, embed=32, hidden=128):
        super(QuestionEmbedModel, self).__init__()
        
        self.wembedding = nn.Embedding(in_size + 1, embed)  #word embeddings have size 32
        self.lstm = nn.LSTM(embed, hidden, batch_first=True)  # Input dim is 32, output dim is the question embedding
        self.hidden = hidden
        
    def forward(self, question):
        #calculate question embeddings
        wembed = self.wembedding(question)
        # wembed = wembed.permute(1,0,2) # in lstm minibatches are in the 2-nd dimension
        self.lstm.flatten_parameters()
        _, hidden = self.lstm(wembed) # initial state is set to zeros by default
        qst_emb = hidden[0] # hidden state of the lstm. qst = (B x 128)
        #qst_emb = qst_emb.permute(1,0,2).contiguous()
        #qst_emb = qst_emb.view(-1, self.hidden*2)
        qst_emb = qst_emb[0]
        
        return qst_emb

class RelationalLayerBase(nn.Module):
    def __init__(self, in_size, out_size, qst_size, hyp):
        super().__init__()

        # f_fc1
        self.f_fc1 = nn.Linear(hyp["g_layers"][-1], hyp["f_fc1"])
        self.mha_fc1 = SEAttend(hyp["g_layers"][-1], hyp["f_fc1"])
        self.identity_fc1 = nn.Identity()
        # f_fc2
        self.f_fc2 = nn.Linear(hyp["f_fc1"], hyp["f_fc2"])
        self.mha_fc2 = SEAttend(hyp["f_fc1"], hyp["f_fc2"])
        self.identity_fc2 = nn.Identity()
        # f_fc3
        self.f_fc3 = nn.Linear(hyp["f_fc2"], out_size)
        self.mha_fc3 = SEAttend(hyp["f_fc2"], out_size)
        self.identity_fc3 = nn.Identity()
    
        self.dropout = nn.Dropout(p=hyp["dropout"])
        
        self.on_gpu = False
        self.hyp = hyp
        self.qst_size = qst_size
        self.in_size = in_size
        self.out_size = out_size

    def cuda(self, device=None):
        self.on_gpu = True
        super().cuda(device)
    

class RelationalLayer(RelationalLayerBase):
    def __init__(self, in_size, out_size, qst_size, hyp, extraction=False):
        super().__init__(in_size, out_size, qst_size, hyp)

        self.quest_inject_position = hyp["question_injection_position"]
        self.in_size = in_size

	    #create all g layers
        self.g_layers = []
        self.g_layers_size = hyp["g_layers"]

        #create all multiheadattention layers
        self.mha_layers = []
        self.identity_layers = []

        for idx,g_layer_size in enumerate(hyp["g_layers"]):
            in_s = in_size if idx==0 else hyp["g_layers"][idx-1]
            out_s = g_layer_size
            if idx==self.quest_inject_position:
                #create the h layer. Now, for better code organization, it is part of the g layers pool. 
                l = nn.Linear(in_s+qst_size, out_s)
                mha = SEAttend(in_s+qst_size, out_s)
            else:
                #create a standard g layer.
                l = nn.Linear(in_s, out_s)
                mha = SEAttend(in_s, out_s)
            self.g_layers.append(l)
            self.mha_layers.append(mha)
            self.identity_layers.append(nn.Identity())


        self.g_layers = nn.ModuleList(self.g_layers)
        self.mha_layers = nn.ModuleList(self.mha_layers)
        self.identity_layers = nn.ModuleList(self.identity_layers)
        self.extraction = extraction
    
    def forward(self, x, qst):
        # x = (B x 8*8 x 24)
        # qst = (B x 128)
        """g"""
        b, d, k = x.size()
        qst_size = qst.size()[1]
        l1_reg = 0
        
        # add question everywhere
        qst = torch.unsqueeze(qst, 1)                      # (B x 1 x 128)
        query = qst.clone().transpose(1, 0)
        qst = qst.repeat(1, d, 1)                       # (B x 64 x 128)
        qst = torch.unsqueeze(qst, 2)                      # (B x 64 x 1 x 128)
        
        # cast all pairs against each other
        x_i = torch.unsqueeze(x, 1)                   # (B x 1 x 64 x 26)
        x_i = x_i.repeat(1, d, 1, 1)                    # (B x 64 x 64 x 26)
        x_j = torch.unsqueeze(x, 2)                   # (B x 64 x 1 x 26)
        #x_j = torch.cat([x_j, qst], 3)
        x_j = x_j.repeat(1, 1, d, 1)                    # (B x 64 x 64 x 26)
        
        # concatenate all together
        x_full = torch.cat([x_i, x_j], 3)                  # (B x 64 x 64 x 2*26)
        
        # reshape for passing through network
        x_ = x_full.view(b * d**2, self.in_size)

        #create g and inject the question at the position pointed by quest_inject_position.
        for idx, (g_layer, mha_layer, g_layer_size, identity) in enumerate(zip(self.g_layers, self.mha_layers, self.g_layers_size, self.identity_layers)):
            if idx==self.quest_inject_position:
                in_size = self.in_size if idx==0 else self.g_layers_size[idx-1]

                # questions inserted
                x_img = x_.view(b,d,d,in_size)
                qst = qst.repeat(1,1,d,1)
                x_concat = torch.cat([x_img,qst],3) #(B x 64 x 64 x 128 + 2 * 26)

                # h layer
                x_ = x_concat.view(b*(d**2),in_size+self.qst_size)
                x_ = g_layer(x_)
                x_ = F.relu(x_)
            else:
                x_ = g_layer(x_)
                x_ = F.relu(x_)
                # Pass through multiheadattention layer
                weights = g_layer.weight
                # weights = torch.unsqueeze(g_layer.weight, 0).repeat(b, 1, 1).transpose(1, 0)
                print('wights', weights.size())
                print(query.size())
                _, attn_output_weights = mha_layer(query, weights)
                l1_reg += (attn_output_weights.abs().sum() / (attn_output_weights.size(0) * attn_output_weights.size(2)))
                
                print('attn size', attn_output_weights.size())

                attn_output_weights = attn_output_weights.expand(b, d**2, attn_output_weights.size(-1))
                
                # Apply attn_output_weights to x_
                print('attn size', attn_output_weights.size())
                print('x size', x_.view(b, d**2, g_layer_size).size())
                x_ = x_.view(b, d**2, g_layer_size) * attn_output_weights
                x_ = x_.view(b * (d ** 2), g_layer_size)
            x_ = identity(x_)

        if self.extraction:
            return None
        
        # reshape again and sum
        x_g = x_.view(b, d**2, self.g_layers_size[-1])
        x_g = x_g.sum(1).squeeze(1)
        
        """f"""
        # f_fc1
        x_f = self.f_fc1(x_g)
        x_f = F.relu(x_f)
        weights = self.f_fc1.weight
        # weights = torch.unsqueeze(self.f_fc1.weight, 0).repeat(b, 1, 1).transpose(1, 0)
        _, attn_output_weights = self.mha_fc1(query, weights)
        l1_reg += (attn_output_weights.abs().sum() / (attn_output_weights.size(0) * attn_output_weights.size(2)))
        x_f = x_f * attn_output_weights.squeeze(1)
        x_f = self.identity_fc1(x_f)
        # f_fc2
        x_f = self.f_fc2(x_f)
        x_f = self.dropout(x_f)
        x_f = F.relu(x_f)
        weights = self.f_fc2.weight
        # weights = torch.unsqueeze(self.f_fc2.weight, 0).repeat(b, 1, 1).transpose(1, 0)
        _, attn_output_weights = self.mha_fc2(query, weights)
        l1_reg += (attn_output_weights.abs().sum() / (attn_output_weights.size(0) * attn_output_weights.size(2)))
        x_f = x_f * attn_output_weights.squeeze(1)
        x_f = self.identity_fc2(x_f)
        # f_fc3
        x_f = self.f_fc3(x_f)
        weights = self.f_fc3.weight 
        # weights = torch.unsqueeze(self.f_fc3.weight, 0).repeat(b, 1, 1).transpose(1, 0)
        _, attn_output_weights = self.mha_fc3(query, weights)
        l1_reg += (attn_output_weights.abs().sum() / (attn_output_weights.size(0) * attn_output_weights.size(2)))
        x_f = x_f * attn_output_weights.squeeze(1)
        x_f = self.identity_fc3(x_f)
        return F.log_softmax(x_f, dim=1), l1_reg 

class RN(nn.Module):
    def __init__(self, args, hyp, extraction=False):
        super(RN, self).__init__()
        self.coord_tensor = None
        self.on_gpu = False
        
        # CNN
        self.conv = ConvInputModel()
        self.state_desc = hyp['state_description']            
            
        # LSTM
        hidden_size = hyp["lstm_hidden"]
        self.text = QuestionEmbedModel(args.qdict_size, embed=hyp["lstm_word_emb"], hidden=hidden_size)
        
        # RELATIONAL LAYER
        self.rl_in_size = hyp["rl_in_size"]
        self.rl_out_size = args.adict_size
        self.rl = RelationalLayer(self.rl_in_size, self.rl_out_size, hidden_size, hyp, extraction) 
        if hyp["question_injection_position"] != 0:          
            print('Supposing IR model')
        else:     
            print('Supposing original DeepMind model')

    def forward(self, img, qst_idxs):
        if self.state_desc:
            x = img # (B x 12 x 8)
        else:
            x = self.conv(img)  # (B x 24 x 8 x 8)
            b, k, d, _ = x.size()
            x = x.view(b,k,d*d) # (B x 24 x 8*8)
            
            # add coordinates
            if self.coord_tensor is None or torch.cuda.device_count() == 1:
                self.build_coord_tensor(b, d)                  # (B x 2 x 8 x 8)
                self.coord_tensor = self.coord_tensor.view(b,2,d*d) # (B x 2 x 8*8)
            
            x = torch.cat([x, self.coord_tensor], 1)    # (B x 24+2 x 8*8)
            x = x.permute(0, 2, 1)    # (B x 64 x 24+2)
        
        qst = self.text(qst_idxs)
        y = self.rl(x, qst)
        return y
       
    # prepare coord tensor
    def build_coord_tensor(self, b, d):
        coords = torch.linspace(-d/2., d/2., d)
        x = coords.unsqueeze(0).repeat(d, 1)
        y = coords.unsqueeze(1).repeat(1, d)
        ct = torch.stack((x,y))
        # broadcast to all batches
        # TODO: upgrade pytorch and use broadcasting
        ct = ct.unsqueeze(0).repeat(b, 1, 1, 1)
        self.coord_tensor = Variable(ct, requires_grad=False)
        if self.on_gpu:
            self.coord_tensor = self.coord_tensor.cuda()
    
    def cuda(self, device=None):
        self.on_gpu = True
        self.rl.cuda(device)
        super(RN, self).cuda(device)
        


# Utils

In [14]:
def build_dictionaries(clevr_dir):

    def compute_class(answer):
        for name,values in classes.items():
            if answer in values:
                return name
        
        raise ValueError('Answer {} does not belong to a known class'.format(answer))
        
        
    cached_dictionaries = os.path.join('questions', 'CLEVR_built_dictionaries.pkl')
    if os.path.exists(cached_dictionaries):
        print('==> using cached dictionaries: {}'.format(cached_dictionaries))
        with open(cached_dictionaries, 'rb') as f:
            return pickle.load(f)
            
    quest_to_ix = {}
    answ_to_ix = {}
    answ_ix_to_class = {}
    json_train_filename = os.path.join(clevr_dir, 'questions', 'CLEVR_train_questions.json')
    #load all words from all training data
    with open(json_train_filename, "r") as f:
        questions = json.load(f)['questions']
        for q in tqdm(questions):
            question = tokenize(q['question'])
            answer = q['answer']
            #pdb.set_trace()
            for word in question:
                if word not in quest_to_ix:
                    quest_to_ix[word] = len(quest_to_ix)+1 #one based indexing; zero is reserved for padding
            
            a = answer.lower()
            if a not in answ_to_ix:
                    ix = len(answ_to_ix)+1
                    answ_to_ix[a] = ix
                    answ_ix_to_class[ix] = compute_class(a)

    ret = (quest_to_ix, answ_to_ix, answ_ix_to_class)    
    with open(cached_dictionaries, 'wb') as f:
        pickle.dump(ret, f)

    return ret

In [15]:
def test(data, model, epoch, dictionaries, args):
    model.eval()

    # accuracy for every class
    class_corrects = {}
    # for every class, among all the wrong answers, how much are non pertinent
    class_invalids = {}
    # total number of samples for every class
    class_n_samples = {}
    # initialization
    for c in dictionaries[2].values():
        class_corrects[c] = 0.0
        class_invalids[c] = 0.0
        class_n_samples[c] = 0.0

    corrects = 0.0
    invalids = 0.0
    n_samples = 0

    inverted_answ_dict = {v: k for k,v in dictionaries[1].items()}
    sorted_classes = sorted(dictionaries[2].items(), key=lambda x: hash(x[1]) if x[1]!='number' else int(inverted_answ_dict[x[0]]))
    sorted_classes = [c[0]-1 for c in sorted_classes]

    confusion_matrix_target = []
    confusion_matrix_pred = []

    sorted_labels = sorted(dictionaries[1].items(), key=lambda x: x[1])
    sorted_labels = [c[0] for c in sorted_labels]
    sorted_labels = [sorted_labels[c] for c in sorted_classes]

    avg_loss = 0.0
    progress_bar = tqdm(data)
    with torch.set_grad_enabled(True):
        for batch_idx, sample_batched in enumerate(progress_bar):
            img, qst, label = utils.load_tensor_data(sample_batched, args.cuda, args.invert_questions, volatile=True)

            output, l1_reg = model(img, qst)
            pred = output.data.max(1)[1]

            print(l1_reg)
            print(l1_reg.mean())
            print('item', l1_reg.mean().item())
            loss = F.nll_loss(output, label) + args.l1_lambd * l1_reg.mean()
            print(loss)

            # compute per-class accuracy
            pred_class = [dictionaries[2][o.item()+1] for o in pred]
            real_class = [dictionaries[2][o.item()+1] for o in label.data]
            for idx,rc in enumerate(real_class):
                class_corrects[rc] += (pred[idx] == label.data[idx]).item()
                class_n_samples[rc] += 1

            for pc, rc in zip(pred_class,real_class):
                class_invalids[rc] += (pc != rc)

            for p,l in zip(pred, label.data):
                confusion_matrix_target.append(sorted_classes.index(l))
                confusion_matrix_pred.append(sorted_classes.index(p))

            # compute global accuracy
            corrects += (pred == label.data).sum().item()
            assert corrects == sum(class_corrects.values()), 'Number of correct answers assertion error!'
            invalids = sum(class_invalids.values())
            n_samples += len(label)
            assert n_samples == sum(class_n_samples.values()), 'Number of total answers assertion error!'

            avg_loss += loss.item()

            if batch_idx % args.log_interval == 0:
                accuracy = corrects / n_samples
                invalids_perc = invalids / n_samples
                progress_bar.set_postfix(dict(acc='{:.2%}'.format(accuracy), inv='{:.2%}'.format(invalids_perc)))
    
    avg_loss /= len(data)
    invalids_perc = invalids / n_samples      
    global_accuracy = corrects / n_samples

    print('Test Epoch {}: Accuracy = {:.2%} ({:g}/{}); Invalids = {:.2%} ({:g}/{}); Test loss = {}'.format(epoch, accuracy, corrects, n_samples, invalids_perc, invalids, n_samples, avg_loss))
    for v in class_n_samples.keys():
        accuracy = 0
        invalid = 0
        if class_n_samples[v] != 0:
            accuracy = class_corrects[v] / class_n_samples[v]
            invalid = class_invalids[v] / class_n_samples[v]
        print('{} -- acc: {:.2%} ({}/{}); invalid: {:.2%} ({}/{})'.format(v,accuracy,class_corrects[v],class_n_samples[v],invalid,class_invalids[v],class_n_samples[v]))

    dump_object = {
        'class_corrects':class_corrects,
        'class_invalids':class_invalids,
        'class_total_samples':class_n_samples,
        'confusion_matrix_target':confusion_matrix_target,
        'confusion_matrix_pred':confusion_matrix_pred,
        'confusion_matrix_labels':sorted_labels,
        'global_accuracy':global_accuracy,
        'global_invalids':invalids_perc
    }
    torch.cuda.empty_cache()
    return avg_loss, dump_object

# Config

In [16]:
import utils
from clevr_dataset_connector import ClevrDataset, ClevrDatasetStateDescription

In [17]:
hyp =  {
    "state_description": False,
    "g_layers": [256,256,256,256],
    "question_injection_position": 0,

    "f_fc1": 256,
    "f_fc2": 256,

    "dropout": 0.5,
    "lstm_hidden": 256,
    "lstm_word_emb": 32,
    "rl_in_size": 52
}

args = edict(
    batch_size=220,
    bs_gamma=1,
    bs_max=-1,
    bs_step=20,
    clevr_dir='/Users/sebamenabar/Documents/datasets/CLEVR/CLEVR_v1.0',
    clip_norm=50,
    comet=1,
    config='config.json',
    conv_transfer_learn=None,
    dropout=-1, 
    epochs=400,
    experiment='Norm',
    freeze_RN=False,
    invert_questions=True,
    l1_lambd=0.0,
    log_interval=10,
    lr=5e-06,
    lr_gamma=2,
    lr_max=0.0005,
    lr_step=20,
    model='original-fp',
    no_cuda=False,
    no_invert_questions=False,
    question_injection=-1,
    resume=None,
    resume_comet='',
    resume_optimizer=None,
    seed=42,
    subset=1.0,
    test=False,
    test_batch_size=100,
)

if args.dropout > 0:
    hyp['dropout'] = args.dropout
if args.question_injection >= 0:
    hyp['question_injection_position'] = args.question_injection

args.cuda = not args.no_cuda and torch.cuda.is_available()

In [19]:
dictionaries = utils.build_dictionaries(args.clevr_dir, 'clevr')

==> using cached dictionaries: questions/CLEVR_built_dictionaries.pkl


In [20]:
args.qdict_size = len(dictionaries[0])
args.adict_size = len(dictionaries[1])

In [21]:
import torchvision.transforms as transforms

In [22]:
train_transforms = transforms.Compose([transforms.Resize((128, 128)),
                                   transforms.Pad(8),
                                   transforms.RandomCrop((128, 128)),
                                   transforms.RandomRotation(2.8),  # .05 rad
                                   transforms.ToTensor()])
test_transforms = transforms.Compose([transforms.Resize((128, 128)),
                                  transforms.ToTensor()])

clevr_dataset_train = ClevrDataset(args.clevr_dir, True, dictionaries, train_transforms)
clevr_dataset_test = ClevrDataset(args.clevr_dir, False, dictionaries, test_transforms)

==> using cached questions: questions/CLEVR_train_questions.pkl
==> using cached questions: questions/CLEVR_val_questions.pkl


In [23]:
from torch.utils.data import DataLoader, Subset

In [24]:
use_subset = True
workers = 4
args.bs = 8
subset_size = 128

clevr_subset_test = Subset(clevr_dataset_test, np.arange(subset_size))

if use_subset:
    clevr_test_loader = DataLoader(clevr_subset_test, batch_size=args.bs,
                                   shuffle=False, num_workers=4, collate_fn=utils.collate_samples_from_pixels)  
# clevr_train_loader = DataLoader(clevr_dataset_train, batch_size=args.bs,
#                                 shuffle=False, num_workers=4, collate_fn=utils.collate_samples_from_pixels)
else:
    clevr_test_loader = DataLoader(clevr_dataset_test, batch_size=args.bs,
                                   shuffle=False, num_workers=4, collate_fn=utils.collate_samples_from_pixels)

# Eval

In [46]:
model

RN(
  (conv): ConvInputModel(
    (conv1): Conv2d(3, 24, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (batchNorm1): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (conv2): Conv2d(24, 24, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (batchNorm2): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (conv3): Conv2d(24, 24, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (batchNorm3): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (conv4): Conv2d(24, 24, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (batchNorm4): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (text): QuestionEmbedModel(
    (wembedding): Embedding(83, 32)
    (lstm): LSTM(32, 256, batch_first=True)
  )
  (rl): RelationalLayer(
    (f_fc1): Linear(in_features=256, out_features=256, bias=True)
    (mha_fc1): SEAttend(
      (excite): Sequential(
 

In [45]:
model = RN(args, hyp)
model = nn.DataParallel(model)
# model.load_state_dict(torch.load('./best_weights/SE_norm_0.5_reg.pth', map_location='cpu'))
model = model.module
model;

b = next(iter(clevr_test_loader))
model(b['image'], b['question'])

Supposing original DeepMind model
wights torch.Size([256, 256])
attn size torch.Size([8, 1, 32])
attn size torch.Size([8, 4096, 32])
x size torch.Size([8, 4096, 256])


RuntimeError: The size of tensor a (256) must match the size of tensor b (32) at non-singleton dimension 2

In [35]:
hyp

{'state_description': False,
 'g_layers': [256, 256, 256, 256],
 'question_injection_position': 0,
 'f_fc1': 256,
 'f_fc2': 256,
 'dropout': 0.5,
 'lstm_hidden': 256,
 'lstm_word_emb': 32,
 'rl_in_size': 52}

In [None]:
print(f'masks {attn_output_weights}')
print(f'masks shape {attn_output_weights.shape}')
_m = torch.zeros_like(attn_output_weights)
print(f'zeros shape {_m.shape}')
top_index = (-attn_output_weights).argsort(dim=2)[:, :, :128]
print(f'top index shape {top_index.shape}')
print(f'top index {top_index}')
_m[top_index] = attn_output_weights[top_index]

In [139]:
masks = torch.randn(2, 1, 256)
top_index = (-masks).argsort(dim=2)[:, :, :128]


In [140]:
masks.size(), top_index.size()

(torch.Size([2, 1, 256]), torch.Size([2, 1, 128]))

In [141]:
masks[top_index]

IndexError: index 110 is out of bounds for dimension 0 with size 2

In [193]:
model = RN(args, hyp)
model = nn.DataParallel(model)
model.load_state_dict(torch.load('./best_weights/SE_norm_0.5_reg.pth', map_location='cpu'))
model = model.module
model;
model.coord_tensor = None
algo = test(clevr_test_loader, model, 0, dictionaries, args)

Supposing original DeepMind model


HBox(children=(IntProgress(value=0, max=16), HTML(value='')))

tensor(2.9943, grad_fn=<AddBackward0>)
tensor(2.9943, grad_fn=<MeanBackward0>)
item 2.994332790374756
tensor(0.0235, grad_fn=<AddBackward0>)


KeyboardInterrupt: 

In [180]:
mask = np.random.uniform(0, 1, (2, 1, 5))
mask

array([[[0.22157167, 0.00833366, 0.03545763, 0.11640876, 0.6828096 ]],

       [[0.77533526, 0.29543232, 0.31299817, 0.42692693, 0.84035245]]])

In [183]:
top = np.sort((mask))

In [184]:
t

array([[[0.00833366, 0.03545763, 0.11640876, 0.22157167, 0.6828096 ]],

       [[0.29543232, 0.31299817, 0.42692693, 0.77533526, 0.84035245]]])

In [None]:
top[:,:,2:] = 0

In [172]:
top = (-mask).argsort()
top = (top < 2) * 1
mask * top

array([[[0.        , 0.        , 0.        , 0.45450132, 0.        ]],

       [[0.        , 0.        , 0.        , 0.        , 0.29772767]]])

In [152]:
top.shape, masks.shape

((2, 1, 2), torch.Size([2, 1, 256]))

In [157]:
top

array([[[0, 1]],

       [[0, 3]]])

In [164]:
top.tolist()

[[[0, 1]], [[0, 3]]]

In [163]:
mask[top.tolist()]

SyntaxError: invalid syntax (<ipython-input-163-151313a4468f>, line 1)

In [154]:
_m = np.zeros_like(mask)
_m[top] = mask[top]
_m

  


IndexError: index 3 is out of bounds for axis 0 with size 2

In [109]:
mask[(-mask).argsort()[:2]]

array([0.85873339, 0.75114839])

In [106]:
mask[mask.argsort()]

array([0.15547956, 0.22081248, 0.50479333, 0.75114839, 0.85873339])

In [103]:
np.sort(mask, order=True)

ValueError: Cannot specify order when the array has no fields.

In [118]:
torch.randn(2, 256).argsort(dim=1)

tensor([[ 26, 191,  93, 247, 228,  85,  60,  11,  99, 241,  82, 224, 109, 239,
         201, 119,  20,  19,  61, 194,   6, 161, 152,  44, 125, 219,  84, 123,
          87,  28,  56, 114, 225, 146, 113,  38, 112,  33, 129, 179,  94, 182,
         208, 212, 235,  92, 127, 217, 207,  83, 249,  58, 170,   1, 103, 231,
         132,  13, 237, 227,  15,  72, 117, 160,  96, 234,  95, 177, 138,   0,
          79, 140,  53,  63, 211, 139, 157, 104, 193,  69,  30, 130, 137, 159,
         136, 196, 150,  70, 243, 190, 198,  27, 101,  75,  10,  35, 253,  46,
          66, 163, 111, 248, 105, 154,  36,  98,  40,  51, 216,  41, 134,  43,
         131, 121, 166, 115,   3,  47,  32, 209, 200, 255, 133, 151, 164, 128,
         156, 148, 246,  18,  74,   8, 106,  17, 175, 203,  89,   9,   7, 202,
         176, 206, 195, 199, 210, 141, 172, 197, 186, 110, 189,  14,  65,  86,
         116, 102, 118, 226,  25,  88,  76, 145, 100,  78, 173, 180,  37, 230,
         205,  29,  34, 143, 254, 245,  24,  22, 187

In [99]:
print(mask.argsort())
mask[mask.argsort()]

[0 3 4 2 1]


array([0.50100204, 0.59806581, 0.64691903, 0.71714421, 0.80007776])

In [89]:
mask[(mask).argsort()[0]]

IndexError: index 4 is out of bounds for axis 0 with size 1

In [84]:
mask * ((-mask).argsort() < 2)

array([[0.        , 0.        , 0.44464914, 0.84900637, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ]])