# Attention module

In [45]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import copy

def clones(module, N):
    "Produce N identical layers."
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])



class SummaryAttention(nn.Module):

    def __init__(self, hidden_size, dropout):
        super(SummaryAttention, self).__init__()
        self.mlp = nn.Sequential(
                nn.Linear(hidden_size, hidden_size),
                nn.ReLU(inplace=True),
                nn.Linear(hidden_size, 1)
                )
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):

        pass


class NormalSubLayer(nn.Module):

    def __init__(self, hidden_size, dropout):

        super(NormalSubLayer, self).__init__()
        self.linear = nn.Sequential(nn.Linear(hidden_size*3, hidden_size),
                                    nn.ReLU(inplace=True),
                                    nn.Dropout(p=dropout))

    def forward(self, x):
        """x: shape [batch_size, M, hidden_size*3]"""
        return self.linear(x)


class MultiHeadAttention(nn.Module):

    def __init__(self, hidden_size, num_heads, memory_size=1, dropout=0.0):
        super().__init__()

        self.hidden_size = hidden_size
        self.memory_size = memory_size
        self.num_heads = num_heads
        self.d_h = hidden_size // num_heads
        self.dropout = nn.Dropout(p=dropout)

        self.x_proj_linear = nn.Linear(hidden_size, hidden_size, bias=False)
        self.y_proj_linear = nn.Linear(hidden_size, hidden_size, bias=False)

        self.x_memory = nn.Parameter(nn.init.xavier_uniform_(torch.empty(memory_size, hidden_size)))
        self.y_memory = nn.Parameter(nn.init.xavier_uniform_(torch.empty(memory_size, hidden_size)))

    def project(self, x, x_mem, linear):
        x_mem_size = x.size(0), self.memory_size, self.hidden_size
        x = torch.cat([x_mem.unsqueeze(0).expand(*x_mem_size), x], dim=1)
        x_proj = linear(x)
        x_proj = x.view(x.size(0), x.size(1), self.num_heads, self.d_h)
        return x, x_proj


    def forward(self, X, Y, mask_X, mask_Y):
        """
        X: shape: [batch_size, M, hidden_size]
        Y: shape: [batch_size, N, hidden_size]
        mask_X: shape: [batch_size, M]
        mask_Y: shape: [batch_size, N]
        """
        memory_mask = X.new_ones((X.size(0), self.memory_size)).long()

        mask_X = torch.cat([memory_mask, mask_X], dim=1)
        mask_Y = torch.cat([memory_mask, mask_Y], dim=1)
        M_mem, N_mem = mask_X.size(1), mask_Y.size(1)
        
        mask_X = mask_X[:, None, :, None].repeat(1, self.num_heads, 1, N_mem)
        mask_Y = mask_Y[:, None, None, :].repeat(1, self.num_heads, M_mem, 1)

        print('mask_X', mask_X.shape)
        print('mask_Y', mask_Y.shape)
        X_mem, X_proj = self.project(X, self.x_memory, self.x_proj_linear)
        Y_mem, Y_proj = self.project(Y, self.y_memory, self.y_proj_linear)

        # (1) shape [bs, num_heads, mem_size + M, d_h]
        # (2) shape [bs, num_heads, d_h, mem_size + N]
        X_proj = X_proj.permute(0, 2, 1, 3)
        Y_proj = Y_proj.permute(0, 2, 3, 1)

        # shape: [bs, num_heads, mem_size + M, mem_size + N]
        affinity_matrix = torch.matmul(X_proj, Y_proj)
        affinity_matrix = affinity_matrix.masked_fill(mask_X==0, -1e9)
        affinity_matrix = affinity_matrix.masked_fill(mask_Y==0, -1e9)

        attn_X_guided_by_Y = torch.softmax(affinity_matrix, dim=2)
        attn_Y_guided_by_X = torch.softmax(affinity_matrix, dim=3)

        # (1) shape [bs, mem_size + M, mem_size + N]
        # (2) shape [bs, mem_size + M, mem_size + N]
        attn_X_guided_by_Y = torch.mean(attn_X_guided_by_Y, dim=1)
        attn_Y_guided_by_X = torch.mean(attn_Y_guided_by_X, dim=1)

        # (1) shape: [bs, mem_size + N, hidden_size]
        # (2) shape: [bs, mem_size + M, hidden_size]
        X_attends_in_Y = torch.matmul(attn_X_guided_by_Y.transpose(1, 2), X_mem)
        Y_attends_in_X = torch.matmul(attn_Y_guided_by_X, Y_mem)

        X_attends_in_Y = X_attends_in_Y[:, self.memory_size:, :]
        Y_attends_in_X = Y_attends_in_X[:, self.memory_size:, :]
        return X_attends_in_Y, Y_attends_in_X


class CrossAttentionLayer(nn.Module):

    def __init__(self, hidden_size, num_heads, memory_size=1, dropout=0.0):
        super(CrossAttentionLayer, self).__init__()
        self.mh_attentions  = clones(MultiHeadAttention(hidden_size, num_heads, memory_size, dropout), 3)
        self.norm_sublayers = clones(NormalSubLayer(hidden_size, dropout), 3)


    def forward(self, img, hist, ques, img_mask, hist_mask, ques_mask):
        img_in_hist, hist_in_img = self.mh_attentions[0](img, hist, img_mask, hist_mask)
        img_in_ques, ques_in_img = self.mh_attentions[1](img, ques, img_mask, ques_mask)
        hist_in_ques, ques_in_hist = self.mh_attentions[2](hist, ques, hist_mask, ques_mask)

        img  = self.norm_sublayers[0](torch.cat([img, hist_in_img, ques_in_img], dim=-1))
        ques = self.norm_sublayers[1](torch.cat([ques, hist_in_ques, img_in_ques], dim=-1))
        hist = self.norm_sublayers[2](torch.cat([hist, ques_in_hist, img_in_hist], dim=-1))
        return img, hist, ques

In [46]:
hs = 20
num_heads = 2
x = torch.randn(8, 4, 20)
maskx = torch.randint(0, 2, size=(8, 4))

y = torch.randn(8, 6, 20)
masky = torch.randint(0, 2, size=(8, 6))

z = torch.randn(8, 8, 20)
maskz = torch.randint(0, 2, size=(8, 8))

res = CrossAttentionLayer(hs, num_heads)(x, y, z, maskx, masky, maskz)
for r in res:
    print(r.shape)

mask_X torch.Size([8, 2, 5, 7])
mask_Y torch.Size([8, 2, 5, 7])
mask_X torch.Size([8, 2, 5, 9])
mask_Y torch.Size([8, 2, 5, 9])
mask_X torch.Size([8, 2, 7, 9])
mask_Y torch.Size([8, 2, 7, 9])
torch.Size([8, 4, 20])
torch.Size([8, 6, 20])
torch.Size([8, 8, 20])


# Test Module and Dataset

In [1]:
import torch
from pytorch_pretrained_bert import BertTokenizer
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [2]:
run visdial/data/dataset.py

In [3]:
config = get_config()
dataset, dataloader, batch = test_visdial_dataset(config)
import json
print(json.dumps(config, indent=4))

  4%|▍         | 1735/45238 [00:00<00:02, 17347.87it/s]

[val2018] Tokenizing questions...


100%|██████████| 45238/45238 [00:02<00:00, 17116.80it/s]
  5%|▌         | 1786/34822 [00:00<00:01, 17852.19it/s]

[val2018] Tokenizing answers...


100%|██████████| 34822/34822 [00:01<00:00, 18265.59it/s]
 57%|█████▋    | 1179/2064 [00:00<00:00, 4396.67it/s]

[val2018] Tokenizing captions...


100%|██████████| 2064/2064 [00:00<00:00, 6115.71it/s]


img_ids              torch.Size([])
num_rounds           torch.Size([])
opts                 torch.Size([10, 100, 25])
opts_in              torch.Size([10, 100, 25])
opts_out             torch.Size([10, 100, 25])
opts_len             torch.Size([10, 100])
ans                  torch.Size([10, 25])
ans_in               torch.Size([10, 25])
ans_out              torch.Size([10, 25])
ans_len              torch.Size([10])
ans_ind              torch.Size([10])
gt_relevance         torch.Size([100])
round_id             torch.Size([])
img_feat             torch.Size([36, 2048])
ques_feats           torch.Size([10, 23, 768])
hist_feats           torch.Size([11, 768])
ques_masks           torch.Size([10, 23])
{
    "ROOT": "/home/ubuntu",
    "seed": 0,
    "dataset": {
        "overfit": true,
        "img_norm": 1,
        "concat_history": true,
        "batch_size": 8,
        "cpu_workers": 4,
        "max_seq_len": 25,
        "path": "datasets/visdial",
        "train": {
            "pat

In [4]:
dataloader.batch_size

1

In [None]:
run visdial/encoders/txt_embeddings.py

In [None]:
run visdial/encoders/lf_encoder.py

In [None]:
from visdial.utils import move_to_cuda

In [None]:
batch = move_to_cuda(batch, device='cuda:0')

In [None]:
encoder = LFEncoder(config)
encoder = encoder.to('cuda:0')

In [None]:
img_feats = encoder.img_embeddings(batch)

In [None]:
print(img_feats.size())
print(img_feats.device)


In [None]:
txt_feats = encoder.txt_embeddings(batch, type='lf')

In [None]:
for key in txt_feats:
    print(key, txt_feats[key].size(), txt_feats[key].device)

In [None]:
encoder_output = encoder(batch)

In [None]:
run visdial/decoders/disc.py

In [None]:
decoder = DiscriminativeDecoder(config)
decoder = decoder.to('cuda:0')

In [133]:
a = torch.ones(10)
b = a

In [134]:
a.zero_()

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [155]:
x = torch.ones((10,))
x

tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [168]:
nn.Dropout(p=0.2).eval()(x)

tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [175]:
nn.Dropout(p=0.5).eval()(torch.ones(4, 10))

tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])

In [136]:
a = torch.ones_like(a)
print(a)
print(b)

tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])


In [177]:
x = nn.Sequential(nn.Linear(1, 2), nn.ReLU())
x

Sequential(
  (0): Linear(in_features=1, out_features=2, bias=True)
  (1): ReLU()
)

In [98]:
class MultiHeadAttention(nn.Module):
    
    def __init__(self, hidden_size, num_heads):
        
        super().__init__()
        
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        
        self.x_proj_linear = nn.Linear(hidden_size, hidden_size, bias=False)
        self.y_proj_linear = nn.Linear(hidden_size, hidden_size, bias=False)
        
    
    def forward(self, X, Y, mask_X, mask_Y):
        """
        X: shape: [batch_size, M, hidden_size]
        Y: shape: [batch_size, N, hidden_size]
        mask_X: shape: [batch_size, M]
        mask_Y: shape: [batch_size, N]
        """
        d_h = self.hidden_size // self.num_heads
        
        X_proj = self.x_proj_linear(X)
        Y_proj = self.y_proj_linear(Y)
        
        X_proj = X_proj.view(X.size(0), X.size(1), self.num_heads, d_h)
        Y_proj = Y_proj.view(Y.size(0), Y.size(1), self.num_heads, d_h)
        
        # shape: [bs, num_heads, M, d_h]
        X_proj = X_proj.permute(0, 2, 1, 3)
        # shape: [bs, num_heads, d_h, N]
        Y_proj = Y_proj.permute(0, 2, 3, 1) 
        
        # shape: [bs, num_heads, M, N] 
        affinity_matrix = torch.matmul(X_proj, Y_proj)

        mask_X = mask_X[:, None, :, None].repeat(1, self.num_heads, 1, Y.size(1))
        mask_Y = mask_Y[:, None, None, :].repeat(1, self.num_heads, X.size(1), 1)
        affinity_matrix[~mask_X] = -9999999.0
        affinity_matrix[~mask_Y] = -9999999.0
        print(affinity_matrix)
        
        attn_X_guided_by_Y = torch.softmax(affinity_matrix, dim=2)
        attn_Y_guided_by_X = torch.softmax(affinity_matrix, dim=3)
        
        print(attn_X_guided_by_Y)
        print(attn_Y_guided_by_X)
        # (1) shape: [bs, M, N]
        # (2) shape: [bs, M, N]
        attn_X_guided_by_Y = torch.mean(attn_X_guided_by_Y, dim=1)
        attn_Y_guided_by_X = torch.mean(attn_Y_guided_by_X, dim=1)
        
        # (1) shape: [bs, N, hidden_size]
        # (2) shape: [bs, M, hidden_size]
        X_attends_in_Y = torch.matmul(attn_X_guided_by_Y.transpose(1, 2), X)
        Y_attends_in_X = torch.matmul(attn_Y_guided_by_X, Y)
        
        return X_attends_in_Y, Y_attends_in_X
    
        

In [None]:
x = torch.batch_size

In [99]:
params = {
    'hidden_size': 5, 
    'num_heads' : 1
}

multi_heads_attn = MultiHeadAttention(**params)

In [100]:
multi_heads_attn

MultiHeadAttention(
  (x_proj_linear): Linear(in_features=5, out_features=5, bias=False)
  (y_proj_linear): Linear(in_features=5, out_features=5, bias=False)
)

In [101]:
mask_x = torch.randint(0, 2, size=(2, 3)).byte()
mask_y = torch.randint(0, 2, size=(2, 4)).byte()

In [102]:
X = torch.randn(2, 3, 5)
Y = torch.randn(2, 4, 5)

a, b = multi_heads_attn(X, Y, mask_x, mask_y)
print(a.shape)
print(b.shape)

tensor([[[[-1.0000e+07, -1.0000e+07, -1.0000e+07, -1.0000e+07],
          [-1.0000e+07, -1.0000e+07, -1.0000e+07, -1.0000e+07],
          [-3.6676e-01, -1.0000e+07, -1.0000e+07, -6.3033e-02]]],


        [[[-1.6020e-01,  1.8862e-01,  5.2731e-02, -1.0000e+07],
          [ 1.0865e+00,  1.7230e+00, -4.2912e-01, -1.0000e+07],
          [-6.1388e-01, -1.0570e-01,  2.7913e-01, -1.0000e+07]]]],
       grad_fn=<IndexPutBackward>)
tensor([[[[0.0000, 0.3333, 0.3333, 0.0000],
          [0.0000, 0.3333, 0.3333, 0.0000],
          [1.0000, 0.3333, 0.3333, 1.0000]]],


        [[[0.1955, 0.1566, 0.3482, 0.3333],
          [0.6802, 0.7266, 0.2151, 0.3333],
          [0.1242, 0.1167, 0.4367, 0.3333]]]], grad_fn=<SoftmaxBackward>)
tensor([[[[0.2500, 0.2500, 0.2500, 0.2500],
          [0.2500, 0.2500, 0.2500, 0.2500],
          [0.4246, 0.0000, 0.0000, 0.5754]]],


        [[[0.2736, 0.3878, 0.3386, 0.0000],
          [0.3216, 0.6078, 0.0706, 0.0000],
          [0.1959, 0.3256, 0.4785, 0.0000]]]], grad_

In [84]:
a

tensor([[[ 0.4033,  0.0483,  0.0527,  ..., -0.4376, -0.0761, -0.3726],
         [ 0.6515, -0.7384,  0.0404,  ...,  0.0301, -0.1306, -0.5231],
         [ 0.4033,  0.0483,  0.0527,  ..., -0.4376, -0.0761, -0.3726],
         ...,
         [ 0.1078, -0.1965, -0.4716,  ...,  0.0100, -0.3892, -0.3319],
         [ 0.7828, -0.3128, -0.7584,  ..., -0.3495, -0.1553, -0.0919],
         [ 0.4033,  0.0483,  0.0527,  ..., -0.4376, -0.0761, -0.3726]],

        [[-0.6709, -0.6391,  0.0901,  ..., -0.9314, -0.6749,  0.3258],
         [-0.9335, -0.4887, -0.4550,  ..., -1.0240, -0.3731,  0.3095],
         [-0.3399,  0.0126,  0.0423,  ..., -0.1205,  0.2022,  0.0717],
         ...,
         [-0.6586,  0.2715, -0.0574,  ..., -0.2848,  0.2655, -0.2963],
         [-0.3399,  0.0126,  0.0423,  ..., -0.1205,  0.2022,  0.0717],
         [-0.3399,  0.0126,  0.0423,  ..., -0.1205,  0.2022,  0.0717]],

        [[-0.0742,  0.0781,  0.2334,  ...,  0.0329, -0.1039, -0.3782],
         [-0.0742,  0.0781,  0.2334,  ...,  0

In [83]:
matrix = torch.randn(5, 3, 4)
matrix

tensor([[[-1.0059,  1.7653,  0.3292, -0.4051],
         [-0.6591,  2.1579, -0.2128,  1.1763],
         [-0.8461,  0.5538, -0.9316, -1.5338]],

        [[ 1.1550,  0.6238,  0.7421,  0.5108],
         [-0.2003,  0.4859, -1.5054, -0.8756],
         [-0.2574,  0.3262,  1.3166, -1.3152]],

        [[-0.1423,  0.1998,  0.8951, -1.7911],
         [ 0.1389,  0.8564, -0.4722,  0.6451],
         [-1.1392,  0.0230,  0.4652,  1.3334]],

        [[-0.9671, -0.1166,  0.9532,  0.0507],
         [ 0.7592,  0.5865,  1.4701,  0.1961],
         [ 0.2382, -0.3408,  1.3652, -0.0586]],

        [[ 0.2561, -0.4663, -0.9957, -1.3115],
         [-0.0206,  2.5574,  0.2176,  0.9995],
         [ 0.1746, -0.7608,  0.7546,  0.7625]]])

In [54]:
a = torch.tensor([
    [1, 0, 0], 
    [1, 1, 0], 
    [1, 1, 1],
    [1, 0, 1],
    [1, 0, 0]
]).byte()
a = a[:, :, None]
a = a.repeat(1, 1, 4)
#print(a)
b = torch.tensor([
    [1, 0, 0, 0], 
    [1, 1, 0, 0], 
    [1, 1, 1, 0],
    [1, 1, 1, 1],
    [1, 1, 1, 0]
]).byte()
b = b[:, None, :]
b = b.repeat(1, 3, 1)
print(b)

tensor([[[1, 0, 0, 0],
         [1, 0, 0, 0],
         [1, 0, 0, 0]],

        [[1, 1, 0, 0],
         [1, 1, 0, 0],
         [1, 1, 0, 0]],

        [[1, 1, 1, 0],
         [1, 1, 1, 0],
         [1, 1, 1, 0]],

        [[1, 1, 1, 1],
         [1, 1, 1, 1],
         [1, 1, 1, 1]],

        [[1, 1, 1, 0],
         [1, 1, 1, 0],
         [1, 1, 1, 0]]], dtype=torch.uint8)


In [55]:
import copy
ma = copy.deepcopy(matrix)
ma[~a] = 0
ma[~b] = 0
ma

tensor([[[ 6.5747e-01,  0.0000e+00,  0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00]],

        [[-9.7794e-01,  7.3029e-02,  0.0000e+00,  0.0000e+00],
         [-1.0049e+00, -3.3992e-01,  0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00]],

        [[ 1.1446e+00,  4.7268e-01,  3.2832e-01,  0.0000e+00],
         [-6.1762e-01, -1.6292e+00,  1.9820e+00,  0.0000e+00],
         [-4.7139e-01,  5.3024e-01, -2.9554e-01,  0.0000e+00]],

        [[ 1.1559e-01,  6.6762e-01, -5.9305e-01,  5.8589e-02],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
         [ 4.1537e-01,  1.0254e+00, -4.0526e-01,  6.9145e-02]],

        [[-1.2107e-01, -1.1034e-03, -5.6049e-01,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00]]])

In [13]:
torch.softmax(x, dim=0)

tensor([[4.2010e-02, 3.3531e-04, 4.9832e-01, 2.5950e-01],
        [1.1420e-01, 1.2335e-04, 4.9832e-01, 3.5119e-02],
        [8.4379e-01, 9.9954e-01, 3.3577e-03, 7.0538e-01]])

In [14]:
x

tensor([[5., 1., 6., 2.],
        [6., 0., 6., 0.],
        [8., 9., 1., 3.]])

# Dataset

In [1]:
import torch

In [30]:
run configs/lf_disc_config.py

In [31]:
config = get_config()

In [32]:
config

{'seed': 0,
 'callbacks': {'validate': True,
  'resume': False,
  'comet_project': 'lf-bert-disc',
  'path_pretrained_ckpt': '',
  'path_dir_save_ckpt': '/home/quanguet/checkpoints/visdial/lf_disc/lf_bert_disc'},
 'dataset': {'overfit': False,
  'img_norm': 1,
  'concat_history': True,
  'batch_size': 16,
  'cpu_workers': 4,
  'max_seq_len': 25,
  'is_return_options': True,
  'is_add_boundaries': True,
  'train': {'path_feat_img': '/home/quanguet/datasets/visdial/features_faster_rcnn_x101_train.h5',
   'path_json_dialogs': '/home/quanguet/datasets/visdial/visdial_1.0_train.json',
   'path_feat_history': '/home/quanguet/datasets/visdial/features_bert_train_history.h5',
   'path_feat_answers': '/home/quanguet/datasets/visdial/features_bert_train_answers.h5',
   'path_feat_questions': '/home/quanguet/datasets/visdial/features_bert_train_questions.h5',
   'path_json_dense_dialogs': '/home/quanguet/datasets/visdial/visdial_1.0_word_counts_train.json',
   'path_json_word_count': '/home/quang

In [55]:
run visdial/data/readers.py

[nltk_data] Downloading package punkt to /home/quanguet/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [56]:
reader = DialogsReader(config, 'val')

  6%|▌         | 2495/45238 [00:00<00:03, 12408.33it/s]

[val2018] Tokenizing questions...


100%|██████████| 45238/45238 [00:03<00:00, 11814.86it/s]
  4%|▎         | 1237/34822 [00:00<00:02, 12367.45it/s]

[val2018] Tokenizing answers...


100%|██████████| 34822/34822 [00:02<00:00, 13190.89it/s]
100%|██████████| 2064/2064 [00:00<00:00, 11019.36it/s]

[val2018] Tokenizing captions...





In [57]:
run visdial/data/dataset.py

In [58]:
dataset = VisDialDataset(config, split='val')

KeyError: 'ROOT'

In [45]:
dialreader = DialogsReader(config, split='val')

KeyError: 'ROOT'

In [15]:
from visdial.data.dataset import VisDialDataset

dataset = VisDialDataset(config, split='val')

KeyError: 'dataset'

In [43]:
split='val'
path_json_dialogs = config['dataset'][split]['path_json_dialogs']

with open(path_json_dialogs, "r") as visdial_file:
    visdial_data = json.load(visdial_file)

In [44]:
len(visdial_data['data']['questions'])

45237

In [46]:
visdial_data['split']

'val2018'