In [1]:
from Datasets.News.news_data_loader import data_loader
from Datasets.News.news_gen_test import gen_test_data
from utils.cluster import KMEANS
from utils.get_features import SIFT
from utils.find_similar_vectors import KNN
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Load the news dataset. data_loader() returns two values, bound here as the
# images and their names (presumably index-aligned — TODO confirm in loader).
images, image_names = data_loader()

100%|██████████| 41762/41762 [02:06<00:00, 330.23it/s]


In [3]:
# NOTE(review): despite the variable name, this detector is constructed as
# SIFT('sift'), not ORB. The name is kept because later cells refer to it.
orb_detector = SIFT('sift')
# One entry of local descriptors per input image.
features = orb_detector.get_features(images)

# Stack every image's descriptors along axis 0 into a single matrix; this is
# the training set for the visual-vocabulary clustering below.
features_array = np.concatenate(features, axis=0)

100%|██████████| 41762/41762 [21:27<00:00, 32.43it/s]


In [4]:
# Size of the visual vocabulary (number of k-means centroids). Later cells
# reuse this value as the histogram length.
num_clusters = 500

# features_array is 2-D: one row per descriptor, one column per dimension.
num_features, dim = features_array.shape

# Fit the vocabulary on the pooled descriptors of the whole dataset.
kmeans = KMEANS(dim, num_clusters=num_clusters, niter=500)
kmeans.train(features_array)

Sampling a subset of 128000 / 40025973 for training
Clustering 128000 points in 128D to 500 clusters, redo 1 times, 500 iterations
  Preprocessing in 6.44 s
  Iteration 499 (35.42 s, search 31.91 s): objective=9.10152e+09 imbalance=1.075 nsplit=0       


In [5]:
from tqdm import tqdm


# Bag-of-visual-words encoding: one L2-normalised histogram of cluster
# assignments per image, stored as a row of `hists`
# (shape: len(features) x num_clusters).
hists = np.zeros((len(features), num_clusters))

for row, f in enumerate(tqdm(features)):
    # An image without descriptors keeps its all-zero histogram. Normalising
    # it would divide 0 by 0 and put NaNs into the matrix that is indexed
    # for nearest-neighbour search later.
    if len(f) == 0:
        continue

    # Column 0 of the search result is used as the assigned cluster id of
    # each descriptor.
    labels = kmeans.search(f)[:, 0]
    cluster_ids, occurrences = np.unique(labels, return_counts=True)
    hists[row, cluster_ids] = occurrences

    # At least one descriptor was counted, so the norm is strictly positive.
    hists[row] /= np.linalg.norm(hists[row])

100%|██████████| 41762/41762 [00:56<00:00, 743.62it/s] 


In [6]:
# Index every image histogram for nearest-neighbour retrieval.
knn = KNN(num_clusters)
knn.fit(hists)

# Sanity check: query with the first three indexed histograms and display
# the neighbour indices returned for each of them.
knn.findKNearest(hists[:3, :])

array([[    0, 37067, 39414, 40146, 39766,   998, 40258,  4750, 10467,
        40659],
       [    1, 29337, 16279, 39619, 39894, 33187, 18322,  2704, 37769,
          354],
       [    2, 39948, 41430, 41675, 41606, 40258,  5423, 39744, 41758,
        29731]])

# Inference with augmented data

In [7]:
from importlib import reload

from Datasets.News import news_gen_test
# Re-execute the module in this kernel so that any edits made to it on disk
# are picked up without restarting, then re-import gen_test_data so the name
# is bound to the function from the reloaded module. The order of these two
# statements matters.
reload(news_gen_test)
from Datasets.News.news_gen_test import gen_test_data

In [8]:
# Number of neighbours retrieved per query.
K = 10
# precision[k] counts the queries whose source image was found within the
# first k+1 returned neighbours (a cumulative hit count; it is normalised
# by the number of queries in the next cell).
precision = np.zeros(K)

# NOTE(review): `itr` is used below as the ground-truth index of the query,
# which assumes gen_test_data() yields images in the same order in which the
# dataset was indexed — confirm against the generator.
for itr, (image, name) in enumerate(tqdm(gen_test_data())):
    image = image['image']
    # Extract local descriptors for the query image (the detector is SIFT,
    # despite the `orb_detector` variable name).
    f = orb_detector.__get_features__(image)
    # It is possible that no descriptors are obtained. Treat both None and
    # an empty descriptor set as "no features": an empty set would produce an
    # all-zero histogram and a 0/0 division during normalisation.
    if f is None or len(f) == 0:
        print(f'{name}: no features extracted')
        continue

    f = np.array(f)

    # Bag-of-visual-words histogram, built the same way as for the indexed
    # images: count cluster assignments, then L2-normalise.
    labels = kmeans.search(f)[:, 0]
    cluster_ids, occurrences = np.unique(labels, return_counts=True)
    arr = np.zeros(num_clusters)
    arr[cluster_ids] = occurrences
    arr /= np.linalg.norm(arr)

    # Retrieve the K nearest indexed histograms for this single query.
    indices = knn.findKNearest(arr[None, :], K)[0]
    # Find the rank of the correct image among the returned neighbours; the
    # original image may not be among them at all.
    pos = np.where(indices == itr)[0]
    if pos.size:
        # A hit at rank r counts for every cutoff >= r.
        precision[pos[0]:] += 1

100%|██████████| 41762/41762 [1:12:58<00:00,  9.54it/s]
41762it [1:12:58,  9.54it/s]


In [9]:
# Fraction of queries whose source image appears within the top-k results,
# for k = 1..K. `itr` is the zero-based index of the last query produced by
# enumerate(), so the number of queries is itr + 1 (dividing by `itr` alone
# is off by one). Queries skipped for having no features stay in the
# denominator and therefore count as misses.
precision / (itr + 1)

array([0.70939393, 0.74339695, 0.76051819, 0.7729221 , 0.78355403,
       0.79186322, 0.79890328, 0.80477   , 0.81042121, 0.81513853])