In [50]:
# warning: faiss needs Python 2.7 to work properly
# => run this notebook in a dedicated conda env.

# import dependencies
import torch
import torchvision
import torchvision.datasets as dset
import torchvision.transforms as transforms

# For plotting
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import sys
import cPickle as pickle
import time

# Relative locations of the faiss and COCO API checkouts. Both are appended
# to sys.path so that modules living there become importable.
FAISS_PATH = "../../faiss/"
COCOAPI_PATH = "../cocoapi/PythonAPI"
sys.path.append(FAISS_PATH)
sys.path.append(COCOAPI_PATH)

# Imported only after sys.path has been extended above
# (presumably faiss is resolved from FAISS_PATH -- confirm for your setup).
import faiss

# True when torch reports that CUDA is available.
useGPU = torch.cuda.is_available()

In [45]:
# define useful utility function
def imshow(inp, title=None):
    """ Shows a normalized image tensor in a new matplotlib figure

    The tensor is converted to a numpy array, its axes are reordered with
    transpose((1, 2, 0)), and the normalization done with the mean / std
    below is reverted before plotting.

    @params:
    - inp : a tensor exposing .numpy(), with 3 axes and 3 channels on the
            first one (assumed channel-first image -- confirm with caller)
    - title : optional title drawn above the image
    """
    inp = inp.numpy().transpose((1, 2, 0))
    mean = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])
    inp = std * inp + mean
    # Reverting the normalization can leave values slightly outside [0, 1];
    # clip them explicitly so matplotlib gets a valid float RGB image.
    inp = np.clip(inp, 0, 1)
    plt.figure()
    plt.imshow(inp)
    if title is not None:
        plt.title(title)
    plt.pause(0.001)  # pause a bit so that plots are updated
    
def progress(count, total, suffix=''):
    """ Shows the progress of a given action 
    
    Writes a fixed-width text progress bar to stdout, terminated by a
    carriage return so that the next call overwrites the same line.

    @params:
    - count : the current count of done operations
    - total : the total number of operation to do; when it is 0 the action
              is reported as complete instead of dividing by zero
    - suffix : a message printed after the progress bar
    """
    
    bar_len = 60

    if total == 0:
        # Nothing to do at all: show a full bar rather than raising
        # ZeroDivisionError.
        filled_len = bar_len
        percents = 100.0
    else:
        filled_len = int(round(bar_len * count / float(total)))
        percents = round(100.0 * count / float(total), 1)

    # Keep the bar exactly bar_len characters wide even when count is
    # negative or larger than total.
    filled_len = max(0, min(bar_len, filled_len))
    bar = '#' * filled_len + '-' * (bar_len - filled_len)

    sys.stdout.write('[%s] %s%s ... %s\r' % (bar, percents, '%', suffix))
    sys.stdout.flush()
    
def compute_sim_percentage_arrays(array_1, array_2):
    """
        Compute the similarity between 2 1D IntTensor of same length, as the
        fraction (between 0.0 and 1.0) of positions holding equal values.

        Raises ZeroDivisionError when array_1 is empty.
    """
    matches = (array_1 == array_2).sum()
    # float() forces a true division: with Python 2 integer division the
    # result would be truncated to 0 or 1.
    return float(matches) / len(array_1)

In [2]:
# define our transformation function
# Preprocessing pipeline applied to every image: resize, centre crop,
# conversion to tensor, then per-channel normalization.
NORMALIZE_MEAN = [0.485, 0.456, 0.406]
NORMALIZE_STD = [0.229, 0.224, 0.225]

preprocessing_steps = [
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(NORMALIZE_MEAN, NORMALIZE_STD),
]
centre_crop = transforms.Compose(preprocessing_steps)

# Load the captions dataset through the COCO API; the pipeline above is
# handed over as the image transform.
cap = dset.CocoCaptions(
    root='/home/raille/coco-features/coco-dataset/train2017',
    annFile='/home/raille/coco-features/coco-dataset/annotations/captions_train2017.json',
    transform=centre_crop,
)

loading annotations into memory...
Done (t=3.73s)
creating index...
index created!


In [3]:
# load the raw and PCA features (for faiss we need numpy ndarray)
# Saved feature tensors (raw first, PCA second). Each one is loaded, moved
# to the CPU and converted to a numpy ndarray, which is what faiss consumes.
FEATURE_FILES = (
    '../data/raw-features-scaled.pt',
    '../data/PCA-features-scaled.pt',
)
raw_features, PCA_features = (
    torch.load(feature_file).cpu().numpy() for feature_file in FEATURE_FILES
)

## Perform kNN

In [28]:
# Unpack (number of vectors, dimensionality) for both feature matrices.
# nq ends up holding the row count of PCA_features, which is unpacked last.
raw_shape = raw_features.shape
pca_shape = PCA_features.shape
nq, d_raw = raw_shape
nq, d_pca = pca_shape

# GPU resources object handed to every faiss GPU index created below.
res = faiss.StandardGpuResources()

#### Exact Search

In [19]:
# Configuration for the flat (exact) GPU indexes, pinned to one device.
GPU_DEVICE_ID = 0  # use the titan X
flat_config = faiss.GpuIndexFlatConfig()
flat_config.device = GPU_DEVICE_ID

In [54]:
# Benchmark of the exact (flat L2) GPU search on the raw features.
# One untimed-for-average warm-up pass runs first, then n measured passes.
print("benchmark")

n = 10  # number of measured passes used for the averages

# Warm-up pass: index creation and search are timed together and reported,
# but not included in the averages.
t0 = time.time()
index = faiss.GpuIndexFlatL2(res, d_raw, flat_config)
index.add(raw_features)
D, I = index.search(raw_features, 2)
t1 = time.time()
print("warming up in %.3f s" % (t1 - t0))

creating_index_time = 0
searching_knn_time = 0
total_time = 0

for lk in range(1, n + 1):
    # Phase 1: build the index and add every vector.
    t0 = time.time()
    index = faiss.GpuIndexFlatL2(res, d_raw, flat_config)
    index.add(raw_features)
    t1 = time.time()
    # Phase 2: query the index with the same vectors, 2 neighbours each.
    D, I = index.search(raw_features, 2)
    t2 = time.time()

    build_s, search_s, pass_s = t1 - t0, t2 - t1, t2 - t0
    creating_index_time += build_s
    searching_knn_time += search_s
    total_time += pass_s
    print("test %d: creating index in %.3f s, searching knn in %.3f s, total in %.3f s" % (
        lk, build_s, search_s, pass_s))

print("===========")
print("Average time: creating index in %.3f s, searching knn in %.3f s, total in %.3f s" % (
    creating_index_time / n, searching_knn_time / n, total_time / n))

benchmark
warming up in 1.932 s
test 1: creating index in 0.279 s, searching knn in 1.687 s, total in 1.966 s
test 2: creating index in 0.101 s, searching knn in 1.689 s, total in 1.790 s
test 3: creating index in 0.107 s, searching knn in 1.687 s, total in 1.793 s
test 4: creating index in 0.081 s, searching knn in 1.902 s, total in 1.983 s
test 5: creating index in 0.056 s, searching knn in 1.700 s, total in 1.756 s
test 6: creating index in 0.100 s, searching knn in 1.836 s, total in 1.936 s
test 7: creating index in 0.082 s, searching knn in 1.845 s, total in 1.928 s
test 8: creating index in 0.069 s, searching knn in 1.853 s, total in 1.921 s
test 9: creating index in 0.060 s, searching knn in 1.826 s, total in 1.887 s
test 10: creating index in 0.084 s, searching knn in 1.849 s, total in 1.933 s
Average time: creating index in 0.102 s, searching knn in 1.787 s, total in 1.889 s


In [55]:
# create the match index files
# I comes from the last index.search(raw_features, 2) call above; only the
# second column is stored (presumably column 0 is the query vector itself,
# since the index is queried with its own content -- confirm).
# The with-block guarantees the file is flushed and closed after the dump.
with open("../data/match_index_faiss_exact_raw.pl", "wb") as match_file:
    pickle.dump([x[1] for x in I], match_file)

In [56]:
# Benchmark of the exact (flat L2) GPU search on the PCA features.
# One warm-up pass runs first, then n measured passes.
print("benchmark")

n = 10  # number of measured passes used for the averages

# Warm-up pass: index creation and search are timed together and reported,
# but not included in the averages.
t0 = time.time()
index = faiss.GpuIndexFlatL2(res, d_pca, flat_config)
index.add(PCA_features)
D, I = index.search(PCA_features, 2)
t1 = time.time()
print("warming up in %.3f s" % (t1 - t0))

creating_index_time = 0
searching_knn_time = 0
total_time = 0

for lk in range(1, n + 1):
    # Phase 1: build the index and add every vector.
    t0 = time.time()
    index = faiss.GpuIndexFlatL2(res, d_pca, flat_config)
    index.add(PCA_features)
    t1 = time.time()
    # Phase 2: query the index with the same vectors, 2 neighbours each.
    D, I = index.search(PCA_features, 2)
    t2 = time.time()

    build_s, search_s, pass_s = t1 - t0, t2 - t1, t2 - t0
    creating_index_time += build_s
    searching_knn_time += search_s
    total_time += pass_s
    print("test %d: creating index in %.3f s, searching knn in %.3f s, total in %.3f s" % (
        lk, build_s, search_s, pass_s))

print("===========")
print("Average time: creating index in %.3f s, searching knn in %.3f s, total in %.3f s" % (
    creating_index_time / n, searching_knn_time / n, total_time / n))

benchmark
warming up in 0.760 s
test 1: creating index in 0.026 s, searching knn in 0.636 s, total in 0.662 s
test 2: creating index in 0.014 s, searching knn in 0.619 s, total in 0.633 s
test 3: creating index in 0.011 s, searching knn in 0.637 s, total in 0.648 s
test 4: creating index in 0.029 s, searching knn in 0.671 s, total in 0.699 s
test 5: creating index in 0.008 s, searching knn in 0.637 s, total in 0.645 s
test 6: creating index in 0.008 s, searching knn in 0.625 s, total in 0.633 s
test 7: creating index in 0.018 s, searching knn in 0.629 s, total in 0.647 s
test 8: creating index in 0.012 s, searching knn in 0.631 s, total in 0.643 s
test 9: creating index in 0.027 s, searching knn in 0.639 s, total in 0.665 s
test 10: creating index in 0.026 s, searching knn in 0.664 s, total in 0.690 s
Average time: creating index in 0.018 s, searching knn in 0.639 s, total in 0.657 s


In [57]:
# create the match index files
# I comes from the last index.search(PCA_features, 2) call above; only the
# second column is stored (presumably column 0 is the query vector itself,
# since the index is queried with its own content -- confirm).
# The with-block guarantees the file is flushed and closed after the dump.
with open("../data/match_index_faiss_exact_pca.pl", "wb") as match_file:
    pickle.dump([x[1] for x in I], match_file)

#### Approximate Search

In [41]:
# Benchmark of the approximate ("IVF16384,Flat") search on the raw features.
# The index is built with the factory, cloned to GPU 0, trained and filled.
# One warm-up pass runs first, then n measured passes.
print("benchmark")

n = 10  # number of measured passes used for the averages

# Warm-up pass: creation, training, filling and search are timed together
# and reported, but not included in the averages.
t0 = time.time()
index = faiss.index_factory(d_raw, "IVF16384,Flat")
co = faiss.GpuClonerOptions()
index = faiss.index_cpu_to_gpu(res, 0, index, co)
index.train(raw_features)
index.add(raw_features)
D, I = index.search(raw_features, 2)
t1 = time.time()
print("warming up in %.3f s" % (t1 - t0))

creating_index_time = 0
searching_knn_time = 0
total_time = 0

for lk in range(1, n + 1):
    # Phase 1: create the index, move it to the GPU, train it, add vectors.
    t0 = time.time()
    index = faiss.index_factory(d_raw, "IVF16384,Flat")
    co = faiss.GpuClonerOptions()
    index = faiss.index_cpu_to_gpu(res, 0, index, co)
    index.train(raw_features)
    index.add(raw_features)
    t1 = time.time()
    # Phase 2: query the index with the same vectors, 2 neighbours each.
    D, I = index.search(raw_features, 2)
    t2 = time.time()

    build_s, search_s, pass_s = t1 - t0, t2 - t1, t2 - t0
    creating_index_time += build_s
    searching_knn_time += search_s
    total_time += pass_s
    print("test %d: creating index in %.3f s, searching knn in %.3f s, total in %.3f s" % (
        lk, build_s, search_s, pass_s))

print("===========")
print("Average time: creating index in %.3f s, searching knn in %.3f s, total in %.3f s" % (
    creating_index_time / n, searching_knn_time / n, total_time / n))

benchmark
warming up in 4.633 s
test 1: creating index in 4.136 s, searching knn in 0.284 s, total in 4.420 s
test 2: creating index in 4.151 s, searching knn in 0.282 s, total in 4.433 s
test 3: creating index in 4.669 s, searching knn in 0.286 s, total in 4.955 s
test 4: creating index in 4.402 s, searching knn in 0.369 s, total in 4.771 s
test 5: creating index in 4.584 s, searching knn in 0.288 s, total in 4.871 s
test 6: creating index in 4.535 s, searching knn in 0.302 s, total in 4.836 s
test 7: creating index in 4.228 s, searching knn in 0.308 s, total in 4.536 s
test 8: creating index in 4.140 s, searching knn in 0.287 s, total in 4.428 s
test 9: creating index in 4.245 s, searching knn in 0.312 s, total in 4.557 s
test 10: creating index in 4.329 s, searching knn in 0.304 s, total in 4.633 s
Average time: creating index in 4.342 s, searching knn in 0.302 s, total in 4.644 s


In [42]:
# create the match index files
# I comes from the last index.search(raw_features, 2) call above; only the
# second column is stored.
# NOTE(review): with an approximate index, column 0 is not guaranteed to be
# the query vector itself -- confirm before relying on column 1.
# The with-block guarantees the file is flushed and closed after the dump.
with open("../data/match_index_faiss_approximate_raw.pl", "wb") as match_file:
    pickle.dump([x[1] for x in I], match_file)

In [43]:
# Benchmark of the approximate ("IVF16384,Flat") search on the PCA features.
# The index is built with the factory, cloned to GPU 0, trained and filled.
# One warm-up pass runs first, then n measured passes.
print("benchmark")

n = 10  # number of measured passes used for the averages

# Warm-up pass: creation, training, filling and search are timed together
# and reported, but not included in the averages.
t0 = time.time()
index = faiss.index_factory(d_pca, "IVF16384,Flat")
co = faiss.GpuClonerOptions()
index = faiss.index_cpu_to_gpu(res, 0, index, co)
index.train(PCA_features)
index.add(PCA_features)
D, I = index.search(PCA_features, 2)
t1 = time.time()
print("warming up in %.3f s" % (t1 - t0))

creating_index_time = 0
searching_knn_time = 0
total_time = 0

for lk in range(1, n + 1):
    # Phase 1: create the index, move it to the GPU, train it, add vectors.
    t0 = time.time()
    index = faiss.index_factory(d_pca, "IVF16384,Flat")
    co = faiss.GpuClonerOptions()
    index = faiss.index_cpu_to_gpu(res, 0, index, co)
    index.train(PCA_features)
    index.add(PCA_features)
    t1 = time.time()
    # Phase 2: query the index with the same vectors, 2 neighbours each.
    D, I = index.search(PCA_features, 2)
    t2 = time.time()

    build_s, search_s, pass_s = t1 - t0, t2 - t1, t2 - t0
    creating_index_time += build_s
    searching_knn_time += search_s
    total_time += pass_s
    print("test %d: creating index in %.3f s, searching knn in %.3f s, total in %.3f s" % (
        lk, build_s, search_s, pass_s))

print("===========")
print("Average time: creating index in %.3f s, searching knn in %.3f s, total in %.3f s" % (
    creating_index_time / n, searching_knn_time / n, total_time / n))

benchmark
warming up in 2.210 s
test 1: creating index in 2.019 s, searching knn in 0.113 s, total in 2.132 s
test 2: creating index in 2.005 s, searching knn in 0.109 s, total in 2.114 s
test 3: creating index in 1.882 s, searching knn in 0.110 s, total in 1.992 s
test 4: creating index in 1.848 s, searching knn in 0.113 s, total in 1.961 s
test 5: creating index in 1.970 s, searching knn in 0.109 s, total in 2.079 s
test 6: creating index in 1.927 s, searching knn in 0.121 s, total in 2.047 s
test 7: creating index in 1.962 s, searching knn in 0.118 s, total in 2.080 s
test 8: creating index in 1.990 s, searching knn in 0.120 s, total in 2.110 s
test 9: creating index in 2.052 s, searching knn in 0.116 s, total in 2.168 s
test 10: creating index in 1.966 s, searching knn in 0.121 s, total in 2.088 s
Average time: creating index in 1.962 s, searching knn in 0.115 s, total in 2.077 s


In [44]:
# create the match index files
# I comes from the last index.search(PCA_features, 2) call above; only the
# second column is stored.
# NOTE(review): with an approximate index, column 0 is not guaranteed to be
# the query vector itself -- confirm before relying on column 1.
# The with-block guarantees the file is flushed and closed after the dump.
with open("../data/match_index_faiss_approximate_pca.pl", "wb") as match_file:
    pickle.dump([x[1] for x in I], match_file)