In [3]:
# https://pytorch.org/get-started/previous-versions/
# !pip3 install torch==1.5.0 -f https://download.pytorch.org/whl/cu100/stable # CUDA 10.0 build  1.0.1
# !pip3 install torch==1.5.0+cu101 torchvision==0.6.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
!pip3 install -U -q PyDrive

from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:07:16_PDT_2019
Cuda compilation tools, release 10.1, V10.1.243


In [0]:
# !ls "drive/My Drive/Colab Notebooks/ir_data" -l

In [6]:
## LIBRARIES IMPORT
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import os, sys, pickle, shutil#, numbers
import numpy as np
import pandas as pd

import torch
print(torch.__version__)
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable
import torch.backends.cudnn as cudnn

from tqdm import tqdm

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
1.5.0+cu101


In [0]:
## SIMPLE FC NN
class MapperI(nn.Module):
    def __init__(self, feat_dim, args):
        super(MapperI, self).__init__()
        self.fc = nn.Sequential(nn.Linear(int(feat_dim), int(args.dim_embed)),
                                nn.BatchNorm1d(args.dim_embed),
                                nn.ReLU(inplace=True),
                                nn.Dropout(p=0.5),
                                nn.Linear(int(args.dim_embed), int(args.bit)) )
        if args.cuda: self.cuda()
        
    def forward(self, images):
        x = self.fc(images)
        x = nn.functional.normalize(x)  # L2 normalize each feature vector
        return x

class MapperT(nn.Module):
    def __init__(self, feat_dim, vecs, args):
        super(MapperT, self).__init__()
        self.fc = nn.Sequential(nn.Linear(int(feat_dim), int(args.dim_embed)),
                                nn.BatchNorm1d(args.dim_embed),
                                nn.ReLU(inplace=True),
                                nn.Dropout(p=0.5),
                                nn.Linear(int(args.dim_embed), int(args.bit)) )
        n_tokens, token_dim = vecs.shape
        self.vecs = vecs
        self.words = nn.Embedding(n_tokens, token_dim)
        self.words.weight = nn.Parameter(vecs)
        if args.cuda:
            self.cuda()
            self.vecs = vecs.cuda()

    def forward(self, tokens):
        words = self.words(tokens)
        n_words = torch.sum(tokens > 0, 1).float() + 1e-10
        sum_words = words.sum(1).squeeze()
        sentences = sum_words / n_words.unsqueeze(1)
        x = self.fc(sentences)
        x = nn.functional.normalize(x)  # L2 normalize each feature vector
        return x

In [0]:
def calculate_S(label1, label2, args):
    # calculate the similar matrix (S) -- neighbour matrix
    S = (label1.matmul(label2.transpose(0, 1)) > 0).type(torch.FloatTensor)
    return S

def derivative(F, G, B, S, args):
    sigma = torch.sigmoid(torch.matmul(F, G.transpose(0, 1)) / 2.)
    term1 = torch.sum( (torch.matmul(sigma, F) - torch.matmul(S, F)) / 2.)  # similarity
    term2 = 2*args.gamma * torch.sum(F - B)  # preserve similarity
    term3 = 2*args.eta * torch.sum(F)  # preserve balance
    loss = term1 + term2 + term3
    return loss

def compound_loss(F, G, B, S, args):
    theta = torch.matmul(F, G.transpose(0, 1)) / 2.  # D*D resp T*T (D = unique image-caption pairs, T = image-caption pairs in batch)
    term1 = torch.sum(S*theta - torch.log(1. + torch.exp(theta)))  # similarity
    term2 = torch.sum(torch.pow(B - F, 2) + torch.pow(B - G, 2))  # preserve similarity
    term3 = torch.sum(torch.pow(F.sum(dim=0), 2) + torch.pow(G.sum(dim=0), 2))  # preserve balance
    loss = term1 + args.gamma * term2 + args.eta * term3
    return loss

def generate_image_code(mapper_i, x, args):
    img = mapper_i(x)
    return torch.sign(img)

def generate_text_code(mapper_t, y, args):
    text = mapper_t(y)
    return torch.sign(text)

In [0]:
## DATASET LOADER
def load_word_embeddings(word_embedding_filename, embedding_length):
    with open(word_embedding_filename, 'r') as f:
        word_embeddings = {}
        for i, line in enumerate(f):
            line = line.strip()
            if not line:
                continue

            vec = line.split()
            if len(vec) != embedding_length + 1:
                continue
            
            label = vec[0].lower()
            vec = np.array([float(x) for x in vec[1:]], np.float32)            
            assert len(vec) == embedding_length
            word_embeddings[label] = vec
        
        print('Loaded {:d} embedding vectors'.format(i))
    return word_embeddings

def load_flickr_captions(args, split):
    stop_words = set(stopwords.words('english'))
    split_fn = os.path.join(args.feat_path, split + '.txt')
    images = [im.strip() for im in open(split_fn, 'r')]
    im2idx = dict(zip(images, range(len(images))))
    images = set(images)
    caption_fn = os.path.join(args.feat_path, 'captions.token')
    im2captions = {}
    with open(caption_fn, 'r') as f:
        for line in f:
            line = line.strip().lower().split()
            im = line[0].split('.')[0]
            if im in images:
                if im not in im2captions:
                    im2captions[im] = []

                im2captions[im].append([token for token in line[1:-1] if token not in stop_words])  # last token = '.', thus [1:-1]

    assert(len(im2idx) == len(im2captions))
    captions = []
    cap2im = []
    for im, idx in im2idx.items():
        im_captions = im2captions[im]
        captions += im_captions
        cap2im.append(np.ones(len(im_captions), np.int32) * idx)

    cap2im = np.hstack(cap2im)
    return captions, cap2im, im2idx

def load_imfeature_embeddings(args, split):
    split_fn = os.path.join(args.feat_path, split + '.txt')
    images = [im.strip() for im in open(split_fn, 'r')]
    
    im_features = {}
    im_feat_path = os.path.join(args.feat_path, 'image_features.csv')
    with open(im_feat_path, 'r') as f:
        for i, line in enumerate(f):
            tmp = line.split('\n')
            tmp = tmp[0].split(' ', 1)
            if len(tmp) == 2 and tmp[0] in images:
                im_features[str(tmp[0].split('.')[0])] = tmp[1]
    return im_features
            
class DatasetLoader:
    """ Dataset loader class that loads feature matrices from given paths and
        create shuffled batch for training, unshuffled batch for evaluation.
    """
    def __init__(self, args, split='train'):
        self.captions, self.cap2im, self.im2idx = load_flickr_captions(args, split)
        
        if not args.given_im_features:
          # LOAD IMAGE FEATURES USED IN PAPER
          feat_path = os.path.join(args.feat_path, split + '_features.npy')
        else:
          # LOAD GIVEN IMAGE FEATURES
          feat_path = os.path.join(args.feat_path, split + '_im_features.npy')
          
        self.im_feats = np.load(feat_path)
        print('Loading features from', feat_path)

        if split == 'val':
            num_images = 1000
            self.im_feats = self.im_feats[:num_images]
            subset_ims = self.cap2im < num_images
            self.captions = [caption for caption, is_val in zip(self.captions, subset_ims) if is_val]
            self.cap2im = [im for im, is_val in zip(self.cap2im, subset_ims) if is_val]

        assert len(self.cap2im) == len(self.captions)
        if split != 'train':
            self.labels = np.zeros((len(self.cap2im), len(self.im_feats)), np.float)
            self.labels[(range(len(self.cap2im)), self.cap2im)] = 1
        else:
            self.im2cap = {}
            for cap, im in enumerate(self.cap2im):
                if im not in self.im2cap:
                    self.im2cap[im] = []

                self.im2cap[im].append(cap)

        print('Loading complete')
        self.split = split
        self.sample_size = args.sample_size

    def build_vocab(self, cache_filename, word_embeddings_filename=None, embedding_length=300):
        if os.path.exists(cache_filename):
            vocab_data = pickle.load(open(cache_filename, 'rb'))
            self.max_length = vocab_data['max_length']
            self.tok2idx = vocab_data['tok2idx']
            vecs = vocab_data['vecs']
        else:
            assert word_embeddings_filename is not None
            word_embeddings = load_word_embeddings(word_embeddings_filename, embedding_length)
            self.max_length = 0
            vocab = set()
            for caption in self.captions:
                tokens = [token for token in caption if token in word_embeddings]
                vocab.update(tokens)
                self.max_length = max(self.max_length, len(tokens))

            vocab = list(vocab)
            # +1 for a padding vector which *must* be the 0th index
            self.tok2idx = dict(zip(vocab, range(1, len(vocab) + 1)))
            vecs = np.zeros((len(vocab) + 1, embedding_length), np.float32)
            for i, token in enumerate(vocab):
                vecs[i + 1] = word_embeddings[token]
            
            vocab_data = {'max_length' : self.max_length,
                          'tok2idx' : self.tok2idx,
                          'vecs' : vecs}

            pickle.dump(vocab_data, open(cache_filename, 'wb'))

        self.sent_feats = np.zeros((len(self.captions), self.max_length), np.int64)
        for i, caption in enumerate(self.captions):
            tokens = [self.tok2idx[token] for token in caption if token in self.tok2idx]
            self.sent_feats[i, :len(tokens)] = tokens

        return vecs

    def __len__(self):
        return len(self.captions)

    def __getitem__(self, index):
        im = self.cap2im[index]
        im_feat = self.im_feats[self.cap2im[index]]
        sample_index = np.random.choice(
            [i for i in self.im2cap[im] if i != index],
            self.sample_size - 1, replace=False)
        sample_index = sorted(np.append(sample_index, index))
        sent_feat = self.sent_feats[sample_index]
        return im_feat, sent_feat

In [0]:
# GET SENTENCE FEATURES -- INDICES OF TOKENS -- BASED ON CONSTRUCTED VOCABULARY
# converts given sentence to word embeddings
def get_sentence_features(sentences, args):
    if os.path.exists(args.feat_path+'vocab.pkl'):
        vocab_data = pickle.load(open(args.feat_path+'vocab.pkl', 'rb'))
        max_length = vocab_data['max_length']
        tok2idx = vocab_data['tok2idx']
        vecs = vocab_data['vecs']
    else:
        print('could find data/vocab.pkl file!')
    
    sent_feats = np.zeros((len(sentences), max_length), np.int64)
    for i, caption in enumerate(sentences):
        tokens = [tok2idx[token.lower().rstrip(',.!?:')] for token in caption.split() if token.lower().rstrip(',.!?:') in tok2idx]
        sent_feats[i, :len(tokens)] = tokens
    return sent_feats

In [0]:
# GENERATE TRAIN, VAL, TEST SET SPLITS OF GIVEN CSV FILE WITH IMAGE FEATURES AND SAVE IT AS NPY FILE
def generate_npy(filepath, targetFolder):
    splits = ['train', 'test', 'val']
    data = pd.read_csv(filepath, sep=' ', header=None)
    data[0] = data[0].map(lambda x: x.split('.')[0])  # remove .jpg ending
    
    for split in splits:
        split_idx = [im.strip() for im in open(ARGS.feat_path + split + '.txt', 'r')]
        data_split = data.loc[data[0].isin(split_idx)].copy()
        sorterIndex = dict(zip(split_idx, range(len(split_idx))))  # sort by given split
        # Generate a rank column that will be used to sort the dataframe numerically
        data_split['rank'] = data_split[0].map(sorterIndex)
        data_split.sort_values(['rank'], ascending = [True], inplace = True)
        data_split.drop(['rank', 0], 1, inplace = True)  # drop rank and name of image columns
        data_split = data_split.to_numpy(dtype=np.float32)
        np.save(targetFolder + split + '_im_features.npy', data_split)
        print(split, 'saved with shape', data_split.shape)

In [0]:
def train(train_loader, mapper_i, mapper_t, optimizer_i, optimizer_t, epoch, args):
    steps_per_epoch = len(train_loader.dataset) // args.batch_size
    display_interval = int(np.floor(steps_per_epoch * args.display_interval))
    D = len(train_loader.dataset)   

    # Initialize
    # F_tmp = torch.rand((args.batch_size, args.bit))
    # G_tmp = torch.rand((args.batch_size, args.bit))
    # B = torch.sign(F_tmp + G_tmp)
    # if args.cuda:
    #     F_tmp, G_tmp, B = F_tmp.cuda(), G_tmp.cuda(), B.cuda()

    mapper_i.train()
    for batch_idx, (im_feats, sent_feats) in enumerate(train_loader):
        labels = torch.from_numpy(np.eye(im_feats.size(0), dtype=np.float32))
        if args.cuda:
            im_feats, sent_feats, labels = im_feats.cuda(), sent_feats.cuda(), labels.cuda()

        im_feats, labels = Variable(im_feats), Variable(labels)
        sent_feats = sent_feats.view(labels.size(0), -1)

        S = calculate_S(labels, labels, args)
        F = mapper_i(im_feats)
        G = mapper_t(sent_feats)
        B = torch.sign(args.gamma * (F+G))
        if args.cuda:
            F, G, S = F.cuda(), G.cuda(), S.cuda()

        F, G = Variable(F), Variable(G)  #  requires_grad=True
        derivative_F = derivative(F, G, B, S, args)
        # derivative_F /= (args.batch_size * D)

        # compute gradient and do optimizer step
        optimizer_i.zero_grad()
        derivative_F.backward()  #retain_graph=True
        optimizer_i.step()

    mapper_t.train()
    for batch_idx, (im_feats, sent_feats) in enumerate(train_loader):
        labels = torch.from_numpy(np.eye(im_feats.size(0), dtype=np.float32))
        if args.cuda:
            im_feats, sent_feats, labels = im_feats.cuda(), sent_feats.cuda(), labels.cuda()

        sent_feats, labels =  Variable(sent_feats), Variable(labels)
        sent_feats = sent_feats.view(labels.size(0), -1)

        S = calculate_S(labels, labels, args)
        F = mapper_i(im_feats)
        G = mapper_t(sent_feats)
        B = torch.sign(args.gamma * (F+G))
        if args.cuda:
            F, G, S = F.cuda(), G.cuda(), S.cuda()

        F, G = Variable(F), Variable(G)  # requires_grad=True
        derivative_G = derivative(G, F, B, S, args)
        # derivative_G /= (args.batch_size * D)

        # compute gradient and do optimizer step
        optimizer_t.zero_grad()
        derivative_G.backward()  #retain_graph=True
        optimizer_t.step()


    B = torch.sign(args.gamma * (F + G))
    loss = compound_loss(F, G, B, Variable(S), args)
    print('Epoch: {:d} Loss: {:f}, Image Loss: {:f}, Text Loss: {:f} '.format(epoch, loss, 
                                                                              derivative_F,
                                                                              derivative_G))  #i_average_loss.avg()

def test(test_loader, mapper_i, mapper_t, args):
    mapper_i.eval()
    mapper_t.eval()
    sent_feats = torch.from_numpy(test_loader.sent_feats)  # sent_feats are indices
    im_feats = torch.from_numpy(test_loader.im_feats)
    if ARGS.cuda:
        sent_feats, im_feats = sent_feats.cuda(), im_feats.cuda()

    sent_feats, im_feats = Variable(sent_feats), Variable(im_feats)

    i_hash = generate_image_code(mapper_i, im_feats, args)
    t_hash = generate_text_code(mapper_t, sent_feats, args)
    labels = torch.from_numpy(test_loader.labels)
    im2sent = map_k(i_hash, t_hash, labels.t())
    sent2im = map_k(t_hash, i_hash, labels)
    MAP = (im2sent + sent2im)/2
    print('\n{} set im2sent: {:.2f}%, sent2im: {:.2f}%, total: {:.2f}% \n'.format(test_loader.split,
                                                                               im2sent*100, sent2im*100, MAP*100))
    return MAP

def map_k(queries, targets, labels, k=10):
    mAP = 0.
    is_correct_target = torch.repeat_interleave(labels, 5, dim=0)  # [#images*5, #captions] -- img vs. caption
    correct_target_num = (is_correct_target[0].sum()).type(torch.LongTensor).item()
    assert correct_target_num > 0

    hamming_dist = torch.matmul(queries, targets.t())
    _, hd_sorted_idx = hamming_dist.sort(axis=1, descending=True)  # -1*-1=1 and 1*1=1 looking max value = max similarity = min hamm_dist
    total = min(k, correct_target_num)
    count = torch.arange(1, correct_target_num+1)  # calculate the weight for MAP calculation

    for i in range(len(queries)):
        query_result = is_correct_target[i, hd_sorted_idx[i]]    
        tindex = torch.nonzero(query_result)[:total].squeeze() + 1.  # get non zero indices
        mAP += torch.mean(count.type(torch.FloatTensor)/tindex.type(torch.FloatTensor))
    return mAP/len(queries)

def recallAtK(dist_matrix, labels):
    assert len(dist_matrix) == len(labels)
    thresholds = [1, 5, 10]
    successAtK = np.zeros(len(thresholds), np.float32)
    _, indices = dist_matrix.topk(max(thresholds), largest=False)
    for i, k in enumerate(thresholds):
        for sample_indices, sample_labels in zip(indices[:, :k], labels):
            successAtK[i] += sample_labels[sample_indices].max()

    if len(indices) > 0:
        successAtK /= len(indices)

    successAtK = np.round(successAtK*100, 1)
    return successAtK

In [0]:
## ARGUMENTS
class ARGS:
    def __init__(self):
        self.name = 'Two_Branch_Network'
        self.seed = 11
        self.cuda = True
        self.feat_path = "drive/My Drive/Colab Notebooks/ir_data/"
        self.save_dir = 'models/'
        self.resume = 'none'#'models/Two_Branch_Network/checkpoint.pth.tar'
        self.test = ''
        self.display_interval=0.25
        self.given_im_features = False  # use given features else use resnet feat
        self.embedding_length=300
        self.dim_embed=2048
        # LEARNING  
        self.lr=1e-4
        self.text_lr_multi=2.0
        self.batch_size=256
        self.sample_size=1
        self.max_num_epoch=1
        self.no_gain_stop=4
        self.minimum_gain=0.1
        self.num_neg_sample=10
        self.margin=0.05
        
        self.bit=32
        self.gamma=1
        self.eta=1

ARGS = ARGS()

if ARGS.cuda:
    torch.cuda.manual_seed(ARGS.seed)

# choose the features -- one given by TA or the ones used in the paper
# as default used the ones from paper -- better performance
if not (os.path.exists(ARGS.feat_path + 'train_im_features.npy') and os.path.exists(ARGS.feat_path + 'test_im_features.npy') and os.path.exists(ARGS.feat_path + 'val_im_features.npy')):
    generate_npy(ARGS.feat_path+'image_features.csv', ARGS.feat_path)

In [26]:
train_loader = DatasetLoader(ARGS, 'train')

vocab_filename = os.path.join(ARGS.feat_path, 'vocab.pkl')
word_embeddings_filename = os.path.join(ARGS.feat_path, 'mt_grovle.txt')

print('Loading vocab')
vecs = train_loader.build_vocab(vocab_filename, word_embeddings_filename, ARGS.embedding_length)
print('Loading complete')

kwargs = {'num_workers': 8, 'pin_memory': True} if ARGS.cuda else {}
train_loader = torch.utils.data.DataLoader(train_loader, batch_size=ARGS.batch_size, shuffle=True, **kwargs)
test_loader = DatasetLoader(ARGS, 'test')
val_loader = DatasetLoader(ARGS, 'val')

# Assumes the train_loader has already built the vocab and can be loaded from the cached file.
test_loader.build_vocab(vocab_filename)
val_loader.build_vocab(vocab_filename)

image_feature_dim = train_loader.dataset.im_feats.shape[-1]
n_tokens, token_dim = vecs.shape
vecs = torch.from_numpy(vecs)
if ARGS.cuda: vecs = vecs.cuda()

mapper_t = MapperT(token_dim, vecs, ARGS)
mapper_i = MapperI(image_feature_dim, ARGS)


# optionally resume from a checkpoint
start_epoch, best_acc = 1, 0.0  # load_checkpoint(mappe, ARGS.resume)
cudnn.benchmark = True

parameters_i = [{'params' : mapper_i.fc.parameters()}]

parameters_t = [{'params' : mapper_t.words.parameters(), 'weight_decay' : 0.},
                {'params' : mapper_t.fc.parameters(), 
                'lr' : ARGS.lr*ARGS.text_lr_multi}]

optimizer_i = optim.Adam(parameters_i, lr=ARGS.lr, weight_decay=0.001)
scheduler_i = torch.optim.lr_scheduler.ExponentialLR(optimizer_i, gamma = 0.794)
optimizer_t = optim.Adam(parameters_t, lr=ARGS.lr, weight_decay=0.001)
scheduler_t = torch.optim.lr_scheduler.ExponentialLR(optimizer_t, gamma = 0.794)

n_parameters = sum([p.data.nelement() for model in [mapper_i, mapper_t] for p in model.parameters()])
print('  + Number of params: {}'.format(n_parameters))

save_directory = os.path.join(ARGS.save_dir, ARGS.name)
if not os.path.exists(save_directory):
    os.makedirs(save_directory)

Loading features from drive/My Drive/Colab Notebooks/ir_data/train_features.npy
Loading complete
Loading vocab
Loading complete
Loading features from drive/My Drive/Colab Notebooks/ir_data/test_features.npy
Loading complete
Loading features from drive/My Drive/Colab Notebooks/ir_data/val_features.npy
Loading complete
  + Number of params: 9378628


In [27]:
epoch = 1
best_epoch = epoch
for epoch in range(1,5):
    train(train_loader, mapper_i, mapper_t, optimizer_i, optimizer_t, epoch, ARGS)  # train for one epoch
    acc = test(val_loader, mapper_i, mapper_t, ARGS)  # evaluate on validation set

    # update learning rate
    scheduler_i.step()
    scheduler_t.step()

Epoch: 1 Loss: 5124.277344, Image Loss: 4649.031250, Text Loss: -1511.302979 

val set im2sent: 0.35%, sent2im: 0.64%, total: 0.50% 

Epoch: 2 Loss: 20760.554688, Image Loss: 7121.621094, Text Loss: -1092.000000 

val set im2sent: 0.38%, sent2im: 0.66%, total: 0.52% 

Epoch: 3 Loss: -10755.574219, Image Loss: -1421.160645, Text Loss: -1532.000000 

val set im2sent: 2.03%, sent2im: 0.74%, total: 1.38% 

Epoch: 4 Loss: -10753.132812, Image Loss: -1472.000000, Text Loss: -1460.000000 

val set im2sent: 2.03%, sent2im: 0.47%, total: 1.25% 



In [0]:
we = load_word_embeddings(ARGS.feat_path + '/mt_grovle.txt', 300)

Loaded 23375 embedding vectors


In [0]:
captions, cap2im, im2idx = load_flickr_captions(ARGS, 'train')

In [0]:
sys.getsizeof(captions) / (1024*1024)

1.210662841796875

In [0]:
len(im2idx)

29783

In [0]:
print(len(train_loader.dataset))

148915


In [0]:
i_average_loss = RunningAverage()
t_average_loss = RunningAverage()
steps_per_epoch = len(train_loader.dataset) // ARGS.batch_size
display_interval = int(np.floor(steps_per_epoch * ARGS.display_interval))

mapper_i.train()
mapper_t.train()

MapperT(
  (fc): Sequential(
    (0): Linear(in_features=300, out_features=2048, bias=True)
    (1): BatchNorm1d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Dropout(p=0.5, inplace=False)
    (4): Linear(in_features=2048, out_features=32, bias=True)
  )
  (words): Embedding(14755, 300)
)

In [0]:
batch_idx, (im_feats, sent_feats) = next(enumerate(train_loader))

In [0]:
labels = torch.from_numpy(np.eye(im_feats.size(0), dtype=np.float32))
if ARGS.cuda:
    im_feats, sent_feats, labels = im_feats.cuda(), sent_feats.cuda(), labels.cuda()

im_feats, sent_feats, labels = Variable(im_feats), Variable(sent_feats), Variable(labels)
sent_feats = sent_feats.view(labels.size(0), -1)
print(im_feats.shape, sent_feats.shape, labels.shape)

torch.Size([256, 2048]) torch.Size([256, 45]) torch.Size([256, 256])


In [0]:
labels

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 0., 1.]], device='cuda:0')

In [0]:
F = mapper_i(im_feats)
G = mapper_t(sent_feats)
B = ARGS.gamma * torch.sign(F+G)
theta = torch.matmul(F.transpose(0, 1), G) / 2
S = calculate_S(labels, labels, ARGS).cuda()
print(F.shape, G.shape, B.shape, theta.shape, S.shape)

torch.Size([256, 32]) torch.Size([256, 32]) torch.Size([256, 32]) torch.Size([32, 32]) torch.Size([256, 256])


In [0]:
theta = torch.matmul(F, G.transpose(0, 1)) / 2
sigma = torch.sigmoid(theta)
T, D = theta.shape
print(theta.shape, sigma.shape, S.shape)

torch.Size([256, 256]) torch.Size([256, 256]) torch.Size([256, 256])


In [0]:
term1 = torch.sum(torch.matmul(sigma, G) - torch.matmul(S, G))  # similarity
term2 = 2*ARGS.gamma*torch.sum(F - B)  # preserve similarity
term3 = 2*ARGS.eta * torch.sum(F)  # preserve balance
loss = term1 + term2 + term3
print(term1.shape, term2.shape, term3.shape)

torch.Size([]) torch.Size([]) torch.Size([])


In [0]:
loss

tensor(7021.7539, device='cuda:0', grad_fn=<AddBackward0>)

In [0]:
# TESTING - VALIDATING

In [0]:
mapper_i.eval()
mapper_t.eval()
sent_feats = torch.from_numpy(test_loader.sent_feats)  # sent_feats are indices
im_feats = torch.from_numpy(test_loader.im_feats)
if ARGS.cuda:
    sent_feats, im_feats = sent_feats.cuda(), im_feats.cuda()

In [0]:
sent_feats, im_feats = Variable(sent_feats), Variable(im_feats)
F = mapper_i(im_feats)
G = mapper_t(sent_feats)
F = torch.repeat_interleave(F, 5, dim=0)
B = ARGS.gamma * torch.sign(F+G)    
# G, F = G.data, F.data
labels = torch.from_numpy(test_loader.labels).t()

In [0]:
i_hash = generate_image_code(mapper_i, im_feats, ARGS)
t_hash = generate_text_code(mapper_t, sent_feats, ARGS)
i_hash = torch.repeat_interleave(i_hash, 5, dim=0)
# labels = torch.repeat_interleave(labels.t(), 5, dim=0)
print(i_hash.shape, t_hash.shape, labels.shape)

torch.Size([5000, 32]) torch.Size([5000, 32]) torch.Size([1000, 5000])


In [0]:
labels

tensor([[1., 1., 1.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 1., 1., 1.]], dtype=torch.float64)

In [0]:
is_correct_target = torch.repeat_interleave(labels, 5, dim=0)
correct_target_num = (is_correct_target[0].sum()).type(torch.LongTensor).item()
print(is_correct_target.shape)

torch.Size([5000, 5000])


In [0]:
hamming_dist = torch.matmul(i_hash, t_hash.t())

In [0]:
hamming_dist[0]

tensor([ 0., -6., -4.,  ...,  0., -6., -6.], device='cuda:0',
       grad_fn=<SelectBackward>)

In [0]:
_, idx = hamming_dist.sort(axis=1, descending=True)
print(idx.shape, '\n', idx)

torch.Size([5000, 5000]) 
 tensor([[4468,   21,   46,  ..., 2518, 4956, 1419],
        [4468,   21,   46,  ..., 2518, 4956, 1419],
        [4468,   21,   46,  ..., 2518, 4956, 1419],
        ...,
        [1199, 1252, 1662,  ..., 4615, 4791, 4910],
        [1199, 1252, 1662,  ..., 4615, 4791, 4910],
        [1199, 1252, 1662,  ..., 4615, 4791, 4910]], device='cuda:0')


In [0]:
is_correct_target.cuda()*idx.cuda()

tensor([[4468.,   21.,   46.,  ...,    0.,    0.,    0.],
        [4468.,   21.,   46.,  ...,    0.,    0.,    0.],
        [4468.,   21.,   46.,  ...,    0.,    0.,    0.],
        ...,
        [   0.,    0.,    0.,  ..., 4615., 4791., 4910.],
        [   0.,    0.,    0.,  ..., 4615., 4791., 4910.],
        [   0.,    0.,    0.,  ..., 4615., 4791., 4910.]], device='cuda:0',
       dtype=torch.float64)

In [0]:
count = torch.arange(1, correct_target_num+1)
count

tensor([1, 2, 3, 4, 5])

In [0]:
# tindex = torch.nonzero(res)[:10].squeeze() + 1.
tindex

tensor([1., 2., 3., 4.])

In [0]:
query_result = is_correct_target[0, [0,1,2,3,10,10,4]]    
tindex = torch.nonzero(query_result)[:5].squeeze() + 1.  # get non zero indices
mAP = torch.mean(count.type(torch.FloatTensor)/tindex.type(torch.FloatTensor))
print(query_result, '\n', tindex, '\n', mAP)

tensor([1., 1., 1., 1., 0., 0., 1.], dtype=torch.float64) 
 tensor([1., 2., 3., 4., 7.]) 
 tensor(0.9429)


In [0]:
queries, queries_label = i_hash, labels
query_item, query_label = next(zip(queries, queries_label.t()))
print(query_item.shape, query_label.shape)

torch.Size([32]) torch.Size([5000])


In [0]:
print(labels.shape)

torch.Size([5000, 1000])


In [0]:
is_correct_target = (torch.matmul(labels.t(), labels) > 0).type(torch.FloatTensor)
print(is_correct_target)
correct_target_num = is_correct_target.sum().type(torch.LongTensor).item()
print(correct_target_num)

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 0., 1.]])
1000


In [0]:
i_hash.shape

torch.Size([1000, 32])

In [0]:
torch.repeat_interleave(i_hash, 5, dim=0).shape

torch.Size([5000, 32])

In [0]:
print(labels.t().shape, "\n", labels.t())

torch.Size([1000, 5000]) 
 tensor([[1., 1., 1.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 1., 1., 1.]], dtype=torch.float64)


In [0]:
mAP = 0.
is_correct_target = torch.repeat_interleave(labels.t(), 5, dim=0)  # [#images*5, #captions]

correct_target_num = is_correct_target.sum().type(torch.LongTensor).item()
if correct_target_num == 0:
    continue

hamming_dist = torch.matmul(query_item, target_item.t())
_, hd_sorted_idx = hamming_dist.sort(descending=True)  # -1*-1=1 and 1*1=1 looking max value = max similarity = min hammdist
query_result = is_correct_target[hd_sorted_idx]
total = min(k, correct_target_num)

count = torch.arange(1, correct_target_num+1)
tindex = torch.nonzero(query_result)[:total].squeeze() + 1.
mAP += torch.mean(count.type(torch.FloatTensor)/tindex.type(torch.FloatTensor))
mAP/len(queries)

In [0]:
mAP = 0.
query_item, query_label = i_hash, labels.t()
target_item, target_label = t_hash, labels
print(query_item.shape, "\n", target_item.shape)
print(query_label.shape)

torch.Size([1000, 32]) 
 torch.Size([5000, 32])
torch.Size([1000, 5000])


In [0]:
correct_target_num = query_label.sum().type(torch.LongTensor).item()
print(correct_target_num)

5000


In [0]:
print(i_hash[0,:], "\n", t_hash[0,:])

tensor([-1.,  1., -1., -1., -1., -1., -1., -1.,  1., -1.,  1., -1.,  1., -1.,
        -1., -1.,  1.,  1., -1., -1., -1., -1., -1., -1., -1.,  1., -1., -1.,
        -1.,  1.,  1., -1.], device='cuda:0', grad_fn=<SliceBackward>) 
 tensor([ 1., -1., -1.,  1., -1., -1., -1., -1., -1., -1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1., -1., -1., -1., -1., -1.,  1., -1.,  1.,  1., -1.,
         1., -1., -1., -1.], device='cuda:0', grad_fn=<SliceBackward>)


In [0]:
hamming_dist = torch.matmul(i_hash, t_hash.t())
print(hamming_dist)

tensor([[  6.,  10.,   8.,  ...,  -2.,   8.,  -8.],
        [  6.,  10.,   8.,  ...,  -2.,   8.,  -8.],
        [  6.,  10.,   8.,  ...,  -2.,   8.,  -8.],
        ...,
        [  2.,  10.,   4.,  ..., -10.,   0.,  -8.],
        [  2.,  10.,   4.,  ..., -10.,   0.,  -8.],
        [  2.,  10.,   4.,  ..., -10.,   0.,  -8.]], device='cuda:0',
       grad_fn=<MmBackward>)


In [0]:
_, hd_sorted_idx = hamming_dist.sort(descending=True)
print(hd_sorted_idx)

tensor([[2877,  169,  273,  ..., 4434, 3044, 3888],
        [2877,  169,  273,  ..., 4434, 3044, 3888],
        [2877,  169,  273,  ..., 4434, 3044, 3888],
        ...,
        [1158, 1173, 4281,  ..., 4119, 4279, 1714],
        [1158, 1173, 4281,  ..., 4119, 4279, 1714],
        [1158, 1173, 4281,  ..., 4119, 4279, 1714]], device='cuda:0')


In [0]:
query_result = is_correct_target[hd_sorted_idx]
total = min(k, correct_target_num)

In [0]:
total = min(k, correct_target_num)

count = torch.arange(1, correct_target_num+1)
tindex = torch.nonzero(query_result)[:total].squeeze() + 1.
mAP += torch.mean(count.type(torch.FloatTensor)/tindex.type(torch.FloatTensor))
mAP/len(queries)

In [0]:
theta = torch.sigmoid(theta)
term1 = torch.sum(torch.matmul(theta.transpose(0,1), G) - torch.matmul(S.transpose(0,1).type(torch.FloatTensor), G))
term2 = torch.sum(2*ARGS.gamma * (F-B))
term3 = torch.sum(2*ARGS.eta * F.sum(dim=1))
loss = term1 + term2 + term3

In [0]:
loss

In [0]:
F = mapper_i(im_feats)
G = mapper_t(sent_feats)
B = ARGS.gamma * torch.sign(F+G)
S = calculate_S(labels, labels, ARGS)
if ARGS.cuda:
    F, G, B = F.cuda(), G.cuda(), B.cuda()

In [0]:
im_loss = it_loss(F, G, B, S, ARGS)
t_loss = it_loss(G, F, B, S, ARGS)

In [0]:
theta = torch.matmul(F,G.transpose(0,1))
term1 = torch.matmul(theta.transpose(0,1), G)
term1 -= torch.matmul(S.transpose(0,1).cuda(), G)
term1 = torch.sum(term1)
term2 = torch.sum(2*ARGS.gamma * (F-B))
term3 = torch.sum(2*ARGS.eta * F.sum(dim=1))
loss = torch.abs(term1 + term2 + term3)

In [0]:
loss