### import everything

In [1]:
from annoy import AnnoyIndex
import random

f = 40
t = AnnoyIndex(f)  # Length of item vector that will be indexed
for i in range(1000):
    v = [random.gauss(0, 1) for z in range(f)]
    t.add_item(i, v)

t.build(10) # 10 trees
t.save('test.ann')

# ...

u = AnnoyIndex(f)
u.load('test.ann') # super fast, will just mmap the file
print(u.get_nns_by_item(0, 2, include_distances=True)) # will find the 1000 nearest neighbors

([0, 914], [0.0, 0.9361092448234558])


In [2]:
import torch
import pickle
import sys

In [3]:
# first we load the data
features_PCA = torch.load('PCA-features.pt')

# load the corresponding captions
with open('captions.pkl', 'rb') as f:
    captions_list = pickle.load(f)
    
# load the raw features
raw_features = torch.load('raw-features.pt')

### compute knn using Annoy

In [4]:
def progress(count, total, suffix=''):
    """ Shows the progress of a given action 
    
    @params:
    - count : the current count of done operations
    - total : the total number of operation to do
    - suffix : a message printed after the progress bar
    """
    
    bar_len = 60
    filled_len = int(round(bar_len * count / float(total)))

    percents = round(100.0 * count / float(total), 1)
    bar = '#' * filled_len + '-' * (bar_len - filled_len)

    sys.stdout.write('[%s] %s%s ... %s\r' % (bar, percents, '%', suffix))
    sys.stdout.flush()

In [5]:
len([random.gauss(0, 1) for z in range(f)])

TypeError: '_io.BufferedReader' object cannot be interpreted as an integer

In [6]:
raw_features[1].tolist()

[0.07474596053361893,
 0.025632264092564583,
 0.22847002744674683,
 0.3650291860103607,
 0.11080736666917801,
 0.41275760531425476,
 0.0,
 0.30871883034706116,
 0.0,
 0.1140320673584938,
 2.2037174701690674,
 0.5302984118461609,
 0.0,
 0.026711639016866684,
 0.31919577717781067,
 0.31540563702583313,
 0.3239627778530121,
 0.464668869972229,
 0.054290771484375,
 0.3654480278491974,
 0.8648094534873962,
 0.06711650639772415,
 1.97968327999115,
 0.20585331320762634,
 0.013152047991752625,
 1.0948580503463745,
 0.005607042461633682,
 1.1144211292266846,
 0.1129646897315979,
 0.022808417677879333,
 0.5697683095932007,
 0.5442680716514587,
 0.0,
 0.3850795328617096,
 0.05760813131928444,
 0.0,
 1.5800180435180664,
 0.4151003360748291,
 1.0882452726364136,
 0.10312871634960175,
 0.23221327364444733,
 0.0,
 0.4055694341659546,
 0.18197110295295715,
 0.36258000135421753,
 0.22064660489559174,
 3.4559807777404785,
 0.44271034002304077,
 0.31133684515953064,
 0.6165153980255127,
 0.04169853031635

In [13]:
def knn_annoy(tensor, k, n_trees=10):
    """
        Compute a knn list with element being tuples (value, list) ordered,
        of the k nearest rows for each row of a tensor using annoy method.
        @params:
            - tensor: 2D tensor on which to perform the algorithm
            - k: the number of nearest neighbor to keep
        @return:
            - knn_list: list of ordered tuples with the value, indices of the k nearest row for each row
    """
    
    knn_list = []

    t = AnnoyIndex(tensor.size()[1], metric='euclidean')
    
    for i in range(len(tensor)):
        t.add_item(i, tensor[i].tolist())
        
    t.build(n_trees)
    
    for i in range(len(tensor)):
        knn_list.append(t.get_nns_by_item(i, k, include_distances=True))
        
    return knn_list

In [10]:
%%time
knn_raw_features_list = knn_annoy(raw_features, 1)

CPU times: user 22.6 s, sys: 1.16 s, total: 23.8 s
Wall time: 23.7 s


In [15]:
%%time
knn_raw_features_list_10 = knn_annoy(raw_features, 2)

CPU times: user 23.5 s, sys: 1.47 s, total: 25 s
Wall time: 24.9 s


In [16]:
%%time
knn_raw_features_list_20 = knn_annoy(raw_features, 2, 20)

CPU times: user 23.7 s, sys: 1.16 s, total: 24.9 s
Wall time: 24.9 s


In [17]:
%%time
knn_raw_features_list_5 = knn_annoy(raw_features, 2, 5)

CPU times: user 23 s, sys: 1.24 s, total: 24.2 s
Wall time: 24.2 s


In [30]:
tensor_1 = torch.cuda.IntTensor([x[0][1] for x in knn_raw_features_list_10])
tensor_2 = torch.cuda.IntTensor([x[0][1] for x in knn_raw_features_list_20])

In [44]:
def compute_sim_percentage(tensor_1, tensor_2):
    """
        Compute the percentage of similarity between 2 1D IntTensor of same lentgh
    """
    return (tensor_1 == tensor_2).sum() / len(tensor_1)

In [45]:
compute_sim_percentage(tensor_1, tensor_2)

0.9034

In [46]:
tensor_1_cpu = torch.IntTensor([x[0][1] for x in knn_raw_features_list_10])
tensor_2_cpu = torch.IntTensor([x[0][1] for x in knn_raw_features_list_20])

In [49]:
%%time
tensor_1_cpu == tensor_2_cpu

CPU times: user 151 µs, sys: 0 ns, total: 151 µs
Wall time: 74.9 µs



 1
 1
 0
⋮ 
 1
 1
 1
[torch.ByteTensor of size 5000]

In [53]:
%%time
tensor_1 == tensor_2

CPU times: user 766 µs, sys: 0 ns, total: 766 µs
Wall time: 296 µs



 1
 1
 0
⋮ 
 1
 1
 1
[torch.cuda.ByteTensor of size 5000 (GPU 0)]