In [168]:


import torch
import torchtext

# Pretrained 300-d GloVe vectors, "840B" set (Common Crawl, 840B tokens).
# The first run downloads the archive (roughly 2 GB), so expect a wait.
glove = torchtext.vocab.GloVe(name="840B", dim=300)

In [202]:
import numpy as np
from fastdtw import fastdtw


In [185]:
import h5py
import json

# Query word used for the sanity check at the bottom of this cell.
word = 'dog'

# Each embedding row is laid out as [GloVe part | ViCo part]; the first
# `glove_dim` entries are the GloVe component.
glove_dim = 300

model_path = '.vector_cache/pretrained_vico/glove_300_vico_linear_200/'

# The HDF5 handle is left open so f['embeddings'] can still be sliced lazily
# later; call f.close() once only the in-memory copy below is needed.
f = h5py.File(f'{model_path}/visual_word_vecs.h5py','r')

# Read the JSON side files inside `with` blocks so the handles are closed
# right away instead of being leaked.
with open(f'{model_path}/visual_word_vecs_idx.json','r') as idx_file:
    word_to_idx = json.load(idx_file)
with open(f'{model_path}/visual_words.json','r') as words_file:
    visual_words = json.load(words_file)

# `[()]` reads the entire matrix into memory (recommended when words are
# queried frequently). To slice single rows from disk without loading the
# full matrix into RAM, use this instead:
# embed_mat = f['embeddings']
embed_mat = f['embeddings'][()]

if word in word_to_idx:
    word_embed = embed_mat[word_to_idx[word]]
    word_embed_glove = word_embed[:glove_dim] # GloVe component
    word_embed_vico = word_embed[glove_dim:]  # ViCo component

    if word in visual_words:
        print('Word has ViCo component')
    else:
        print('Word is not in the visual word vocabulary. word_embed_vico is set to average ViCo embedding computed across visual words')
else:
    # No row was looked up, so the visual-vocabulary report is skipped: it
    # would describe a word_embed_vico that was never assigned.
    print('Word not in vocabulary')

    

        

Word has ViCo component


In [190]:
def vico(word):
    """Return the ViCo part of the embedding stored for ``word``.

    Rows of ``embed_mat`` are addressed through ``word_to_idx``. The first
    ``glove_dim`` entries of a row are the GloVe component; everything after
    that is the ViCo component, which is what this function returns.

    Args:
        word: Vocabulary key to look up.

    Returns:
        A ``torch.Tensor`` holding the ViCo component. For a word that is not
        in ``word_to_idx`` a message is printed and an all-zero tensor with
        the same shape and dtype as a real ViCo vector is returned.
    """
    if word in word_to_idx:
        return torch.tensor(embed_mat[word_to_idx[word]][glove_dim:])

    print('I don\'t know', word)
    # Build the fallback from an actual row so its length and dtype always
    # agree with real vectors (no hard-coded width, no float32/float64 mix
    # when the results are stacked together).
    return torch.zeros_like(torch.tensor(embed_mat[0][glove_dim:]))

In [205]:
from tslearn.metrics import dtw_path_from_metric

def DTW(seq_a, seq_b, band_width=3, fast=False):
    """Score how well two sequences align using dynamic time warping.

    Both inputs must support ``.detach().cpu().numpy()`` (i.e. torch
    tensors); they are converted to NumPy arrays before alignment.

    Args:
        seq_a: First sequence.
        seq_b: Second sequence.
        band_width: Warping-window radius. Forwarded as
            ``sakoe_chiba_radius`` to tslearn or as ``radius`` to fastdtw.
            ``None`` omits the argument so the backend default applies.
        fast: Backend switch. ``True`` calls tslearn's
            ``dtw_path_from_metric``; ``False`` (the default) calls
            ``fastdtw``.
            NOTE(review): the flag name reads inverted relative to the
            library names, and the two backends may use different default
            point-wise distances -- confirm before comparing scores
            obtained with different ``fast`` settings.

    Returns:
        ``exp(-dist / (len(seq_a) + len(seq_b)))``, where ``dist`` is the
        alignment cost reported by the chosen backend. The alignment path
        itself is discarded.
    """
    arr_a = seq_a.detach().cpu().numpy()
    arr_b = seq_b.detach().cpu().numpy()

    if fast:
        window = {} if band_width is None else {'sakoe_chiba_radius': band_width}
        _, dist = dtw_path_from_metric(arr_a, arr_b, **window)
    else:
        window = {} if band_width is None else {'radius': band_width}
        dist, _ = fastdtw(arr_a, arr_b, **window)

    # Normalise the cost by the combined sequence length before squashing.
    combined_len = len(seq_b) + len(seq_a)
    return np.exp(-dist / combined_len)


In [172]:


# Bag-of-words comparison with GloVe: each word list is collapsed to the sum
# of its word vectors, then the two sums are compared with torch.norm.
x = torch.vstack([glove[w] for w in ('laptop', 'table', 'ron')]).sum(0)
y = torch.vstack([glove[w] for w in ('laptop', 'bench', 'television', 'toy')]).sum(0)

torch.norm(y - x)

tensor(13.1493)

In [173]:
# Same bag-of-words comparison, this time with the ViCo components.
x = torch.vstack([vico(w) for w in ('laptop', 'table', 'television')]).sum(0)
y = torch.vstack([vico(w) for w in ('laptop', 'bench', 'television', 'toy')]).sum(0)

# np.linalg.norm is fed torch tensors here, so the displayed result is a
# NumPy scalar rather than a tensor.
np.linalg.norm(y - x)

10.786944621870715

In [174]:
# GloVe bag-of-words sums for two word lists.
x = torch.vstack([glove[w] for w in ('laptop', 'table', 'tv')]).sum(0)
y = torch.vstack([glove[w] for w in ('laptop', 'bathroom', 'television')]).sum(0)

# Add the 'tv' vector to x a second time before measuring the distance.
x = torch.vstack([x, glove['tv']]).sum(0)
torch.norm(y - x)

tensor(12.2889)

In [175]:
# Distance between a single GloVe word vector and a three-word GloVe sum.
x = torch.vstack([glove[w] for w in ('rob',)]).sum(0)
y = torch.vstack([glove[w] for w in ('laptop', 'bench', 'television')]).sum(0)

# Optional variant (disabled): x += glove['tv']
torch.norm(y - x)

tensor(14.8372)

In [176]:
# DTW on GloVe sequences: two candidate location sequences are scored
# against a reference path. `fast` is left at its default, so DTW() takes
# its fastdtw branch here.
loc1 = torch.stack([
    (glove['computer'] + glove['plant'] + glove['something'])/3,
    (glove['toilet'] + glove['shower']) / 2,
    glove['tv'],
])
# NOTE(review): the second step below is a plain sum of two vectors, whereas
# the multi-word steps of loc1 are averaged -- confirm this is intended.
loc2 = torch.stack([
    glove['plant'],
    glove['tv'] + glove['remote_controller'],
])

path = torch.stack([glove[w] for w in ('computer', 'bathroom', 'television')])

print(DTW(loc1, path, band_width=1))
print(DTW(loc2, path, band_width=1))


0.010925987588726625
0.0005696911484005163


In [210]:
# Sum of the ViCo vectors of a fixed object list (used by later cells).
# NOTE(review): 'fireexinguisher' and 'washing machine' are not in the
# vocabulary (the recorded output prints "I don't know ..." for both), so
# vico() returns its all-zero fallback for them and they add nothing here.
all_objects = torch.stack([vico('window'), vico('chair'), vico('plant'), 
                           vico('couch'), vico('fireexinguisher'), 
                           vico('stair'), vico('lamp'), vico('fireplace'), 
                           vico('bed'), vico('washing machine'), vico('sink'),vico('toilet')]).sum(0)

# DTW
# Candidate paths, each a stack of per-object ViCo vectors. They are scored
# against gt_path below.
bad_path1a = torch.stack([vico('window'), vico('chair')])
bad_path1b = torch.stack([vico('window'), vico('plant')])
bad_path12 = torch.stack([vico('window'), vico('couch')])
# The last step of this path is the all-zero fallback ('fireexinguisher' is unknown).
bad_path12c = torch.stack([vico('window'), vico('couch'), vico('fireexinguisher')])
bad_path12d = torch.stack([vico('window'), vico('couch'), vico('stair')])
# NOTE(review): bad_path123e is bound twice in a row; the first value is
# overwritten immediately, so only the 4-step version (ending in 'lamp') is
# ever scored. One of the two probably needs a different name -- confirm.
bad_path123e = torch.stack([vico('window'), vico('couch'), vico('picture')])
bad_path123e = torch.stack([vico('window'), vico('couch'), vico('picture'), vico('lamp')])
bad_path1234 = torch.stack([vico('window'), vico('couch') ,vico('picture'), vico('fireplace')])
bad_path1234f = torch.stack([vico('window'), vico('couch'), vico('picture'), vico('fireplace'), vico('bed')])
# The last step of this path is the all-zero fallback ('washing machine' is unknown).
bad_path1234g = torch.stack([vico('window'), vico('couch'), vico('picture'), vico('fireplace'), vico('washing machine')])
bad_path1234h = torch.stack([vico('window'), vico('couch'), vico('picture'), vico('fireplace'), vico('bathtub')])
bad_path12345 = torch.stack([vico('window'), vico('couch') ,vico('picture'), vico('fireplace'), vico('sink')])
bad_path12345h = torch.stack([vico('window'), vico('couch') ,vico('picture'), vico('fireplace'), vico('sink'), vico('bathtub')])

bad_path123456 = torch.stack([vico('window'), vico('couch') ,vico('picture'), vico('fireplace'), vico('sink'), vico('toilet')])

# Reference path that every candidate is aligned against.
gt_path = torch.stack([vico('window'), vico('couch'), vico('fireplace'), vico('bathtub')])


# Each line prints a label and the DTW() score scaled by a constant.
# fast=True selects the tslearn branch of DTW(); band_width keeps its
# default of 3.
# NOTE(review): the multipliers (0.001 ... 0.5) are hand-picked constants
# whose meaning is not documented in this notebook -- confirm their origin.
print('1a', DTW(bad_path1a, gt_path, fast=True) * 0.001)
print('1b',DTW(bad_path1b, gt_path, fast=True) * 0.001)
print('12', DTW(bad_path12, gt_path, fast=True) * 0.005)

print('12c', DTW(bad_path12c, gt_path, fast=True) * 0.01)
print('12d', DTW(bad_path12d, gt_path, fast=True) * 0.02)
print('123e', DTW(bad_path123e, gt_path, fast=True) * 0.03)

print('1234', DTW(bad_path1234, gt_path, fast=True) * 0.02)
print('1234f', DTW(bad_path1234f, gt_path, fast=True) * 0.1)
print('1234g', DTW(bad_path1234g, gt_path, fast=True) * 0.5)

# NOTE(review): '1234g' is printed a second time with a different
# multiplier (0.4 here, 0.5 above) -- confirm which one is intended.
print('1234g', DTW(bad_path1234g, gt_path, fast=True) * 0.4)
print('1234h', DTW(bad_path1234h, gt_path, fast=True) * 0.3)
print('12345', DTW(bad_path12345, gt_path, fast=True) * 0.1)

print('12345h', DTW(bad_path12345h, gt_path, fast=True) * 0.02)
print('123456', DTW(bad_path123456, gt_path, fast=True) * 0.5)

I don't know fireexinguisher
I don't know washing machine
I don't know fireexinguisher
I don't know washing machine
1a 1.3552296459938273e-05
1b 7.046801227704599e-06
12 0.0002028168750412643
12c 0.0013251195469679026
12d 0.0013428706298528902
123e 0.002960929169360092
1234 0.0019921283492005553
1234f 0.01304474942397376
1234g 0.08654474225307786
1234g 0.0692357938024623
1234h 0.11153828277854577
12345 0.014956566376992986
12345h 0.003617251522438981
123456 0.04108958079684591


In [242]:
# Reciprocal of the cosine similarity between the 'bathtub' ViCo vector and
# the summed object vector; unsqueeze(0) adds the leading batch dimension.
1/torch.cosine_similarity((vico('bathtub')).unsqueeze(0), all_objects.unsqueeze(0))

tensor([4.4718], dtype=torch.float64)

In [243]:
# Same reciprocal-cosine measure, with the 'bathtub' and 'shower' ViCo
# vectors added together before comparing against the summed object vector.
1/torch.cosine_similarity((vico('bathtub') + vico('shower')).unsqueeze(0), all_objects.unsqueeze(0))

tensor([3.4456], dtype=torch.float64)

In [244]:
# Sanity check: comparing the summed object vector with itself.
1/torch.cosine_similarity(all_objects.unsqueeze(0), all_objects.unsqueeze(0))

tensor([1.0000], dtype=torch.float64)

In [213]:
# Shape of a single ViCo vector as returned by vico().
vico('bathtub').shape

torch.Size([200])