In [16]:
import nmslib
import numpy

# create a random matrix to index; a fixed seed makes a re-run of this cell
# index the same data
data = numpy.random.RandomState(0).randn(10000, 100).astype(numpy.float32)

# initialize a new HNSW index over the 'l2' (Euclidean distance) space
# (note: this is NOT cosine similarity)
index = nmslib.init(method='hnsw', space='l2')
index.addDataPointBatch(data)
index.createIndex({'post': 2}, print_progress=True)

# query for the 10 nearest neighbours of the first datapoint
ids, distances = index.knnQuery(data[0], k=10)

# get the 10 nearest neighbours of every datapoint,
# using a pool of 4 threads to compute
neighbours = index.knnQueryBatch(data, k=10, num_threads=4)

In [35]:
import torch
import pickle
import sys

import numpy as np

from nltk.translate import bleu_score
from nltk import word_tokenize

In [3]:
# Load the PCA features. map_location='cpu' lets the file load on a machine
# without a GPU even if the tensor was saved from one; the cells that consume
# these features call .cpu() on them before use anyway.
features_PCA = torch.load('PCA-features.pt', map_location='cpu')

# Load the corresponding captions; captions_list[i] is iterated further down
# as the collection of caption strings for datapoint i.
# NOTE: pickle.load can execute arbitrary code -- only open files you trust.
with open('captions.pkl', 'rb') as f:
    captions_list = pickle.load(f)

# Load the raw features (same CPU mapping as above).
raw_features = torch.load('raw-features.pt', map_location='cpu')

### Perform knn on the raw features

In [24]:
# Convert the raw feature tensor to a NumPy array on the CPU so it can be
# handed to NMSLIB.
data = raw_features.cpu().numpy()

# initialize a new HNSW index; space='l2' selects L2 (Euclidean) distance,
# not cosine similarity
index = nmslib.init(method='hnsw', space='l2')
index.addDataPointBatch(data)
# build the index with the 'post' index-time parameter set to 2 and print
# progress while building
index.createIndex({'post': 2}, print_progress=True)

In [25]:
%%time
# Get the 2 nearest neighbours of every datapoint in the raw-feature index.
# Every query point is itself stored in the index, so k=2 leaves room for one
# neighbour besides the point itself (presumably the point comes back first --
# the search is approximate, so this is not guaranteed).
# Uses a pool of 4 threads to compute.
neighbours_raw = index.knnQueryBatch(data, k=2, num_threads=4)

CPU times: user 314 ms, sys: 4.43 ms, total: 318 ms
Wall time: 93.6 ms


### Perform knn on PCA features

In [27]:
# Convert the PCA feature tensor to a NumPy array on the CPU so it can be
# handed to NMSLIB. This rebinds `data` and `index` from the raw-feature cells.
data = features_PCA.cpu().numpy()

# initialize a new HNSW index; space='l2' selects L2 (Euclidean) distance,
# not cosine similarity
index = nmslib.init(method='hnsw', space='l2')
index.addDataPointBatch(data)
# build the index with the 'post' index-time parameter set to 2 and print
# progress while building
index.createIndex({'post': 2}, print_progress=True)

In [28]:
%%time
# Get the 2 nearest neighbours of every datapoint in the PCA-feature index.
# Every query point is itself stored in the index, so k=2 leaves room for one
# neighbour besides the point itself (presumably the point comes back first --
# the search is approximate, so this is not guaranteed).
# Uses a pool of 4 threads to compute.
neighbours_pca = index.knnQueryBatch(data, k=2, num_threads=4)

CPU times: user 152 ms, sys: 0 ns, total: 152 ms
Wall time: 44.3 ms


### Assess Performance using captions and BLEU score

In [33]:
# Peek at the result for the first datapoint: print the id held in the second
# slot (position 1) of its neighbour-id array. Prints nothing when there are
# no results at all.
first_result = next(iter(neighbours_pca), None)
if first_result is not None:
    print(first_result[0][1])

1870


In [36]:
# Symmetric BLEU between each datapoint's captions and the captions of its
# nearest neighbour in PCA-feature space. One score per datapoint is appended
# to pca_bleu_score.
#
# NOTE: NLTK warns (see the output below) when an n-gram order has no overlap
# and suggests SmoothingFunction(); no smoothing is applied here so the scoring
# itself is unchanged.
pca_bleu_score = []
weights = [0.25, 0.25, 0.25, 0.25]  # equal weight on 1- to 4-gram precision
for idx, val in enumerate(neighbours_pca):
    # val[0] holds the neighbour ids. The query point is normally returned
    # first, but approximate search or duplicate vectors can reorder or drop
    # it, so take the first id that is not the query itself instead of blindly
    # reading position 1.
    match_index = next((i for i in val[0] if i != idx), None)
    if match_index is None:
        # No neighbour other than the point itself was returned; nothing to score.
        continue

    # Tokenize each caption set once (periods stripped) and reuse the tokens
    # both as candidates and as references.
    own_tokens = [word_tokenize(c.replace('.', '')) for c in captions_list[idx]]
    match_tokens = [word_tokenize(c.replace('.', '')) for c in captions_list[match_index]]

    # Direction 1: this datapoint's captions scored against the neighbour's.
    mean_1 = np.mean([bleu_score.sentence_bleu(match_tokens, candidate, weights)
                      for candidate in own_tokens])
    # Direction 2: the neighbour's captions scored against this datapoint's.
    mean_2 = np.mean([bleu_score.sentence_bleu(own_tokens, candidate, weights)
                      for candidate in match_tokens])

    pca_bleu_score.append(np.mean([mean_1, mean_2]))

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


In [37]:
# Average of the per-datapoint symmetric BLEU scores for the PCA features.
np.asarray(pca_bleu_score).mean()

0.55638710827323334

In [38]:
# Symmetric BLEU between each datapoint's captions and the captions of its
# nearest neighbour in raw-feature space. One score per datapoint is appended
# to raw_bleu_score.
#
# NOTE: NLTK warns (see the output below) when an n-gram order has no overlap
# and suggests SmoothingFunction(); no smoothing is applied here so the scoring
# itself is unchanged.
raw_bleu_score = []
weights = [0.25, 0.25, 0.25, 0.25]  # equal weight on 1- to 4-gram precision
for idx, val in enumerate(neighbours_raw):
    # val[0] holds the neighbour ids. The query point is normally returned
    # first, but approximate search or duplicate vectors can reorder or drop
    # it, so take the first id that is not the query itself instead of blindly
    # reading position 1.
    match_index = next((i for i in val[0] if i != idx), None)
    if match_index is None:
        # No neighbour other than the point itself was returned; nothing to score.
        continue

    # Tokenize each caption set once (periods stripped) and reuse the tokens
    # both as candidates and as references.
    own_tokens = [word_tokenize(c.replace('.', '')) for c in captions_list[idx]]
    match_tokens = [word_tokenize(c.replace('.', '')) for c in captions_list[match_index]]

    # Direction 1: this datapoint's captions scored against the neighbour's.
    mean_1 = np.mean([bleu_score.sentence_bleu(match_tokens, candidate, weights)
                      for candidate in own_tokens])
    # Direction 2: the neighbour's captions scored against this datapoint's.
    mean_2 = np.mean([bleu_score.sentence_bleu(own_tokens, candidate, weights)
                      for candidate in match_tokens])

    raw_bleu_score.append(np.mean([mean_1, mean_2]))

Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


In [39]:
# Average of the per-datapoint symmetric BLEU scores for the raw features.
np.asarray(raw_bleu_score).mean()

0.55565998115402049

Again, the raw features score slightly lower than the PCA features (mean BLEU of about 0.5557 vs. 0.5564). However, both scores are much lower than those obtained with brute-force search.