In [1]:
from Datasets.News.news_data_loader import data_loader
from Datasets.News.news_gen_test import gen_test_data
from utils.cluster import KMEANS
from utils.get_features import SIFT
from utils.find_similar_vectors import KNN
import matplotlib.pyplot as plt
import numpy as np

In [2]:
images, image_names = data_loader()

  0%|          | 0/41762 [00:00<?, ?it/s]

100%|██████████| 41762/41762 [01:58<00:00, 352.91it/s]


In [3]:
orb_detector = SIFT('orb')
features = orb_detector.get_features(images)

features_array = np.concatenate(features, axis=0)

100%|██████████| 41762/41762 [03:32<00:00, 196.38it/s]


In [4]:
num_features, dim = features_array.shape
num_clusters = 500

kmeans = KMEANS(dim, num_clusters=num_clusters, niter=500)
kmeans.train(features_array)

Sampling a subset of 128000 / 20243502 for training
Clustering 128000 points in 32D to 500 clusters, redo 1 times, 500 iterations
  Preprocessing in 1.26 s
  Iteration 499 (29.82 s, search 27.43 s): objective=1.13964e+10 imbalance=1.031 nsplit=0       


In [5]:
from tqdm import tqdm


hists = []

for f in tqdm(features):
    labels = kmeans.search(f)[:, 0]
    counts = np.unique(labels, return_counts=True)
    arr = np.zeros(num_clusters)
    arr[counts[0]] = counts[1]
    arr /= np.linalg.norm(arr)
    hists.append(arr[None, :]) # dim: 1 x num_clusters

hists = np.concatenate(hists, axis=0)    

100%|██████████| 41762/41762 [00:39<00:00, 1067.13it/s]


In [6]:
knn = KNN(num_clusters)
knn.fit(hists)
knn.findKNearest(hists[:3])

array([[    0, 20261, 18966, 23944, 24203,  5653, 17760,  1336, 17768,
        40842],
       [    1, 18441, 34868, 18229, 27413, 32395, 14757, 21432, 39475,
         6652],
       [    2, 21442, 40447, 15355, 28175,  1816, 32178, 11354,  6023,
        23861]])

# Inferencing with augmented data

In [7]:
from importlib import reload

from Datasets.News import news_gen_test
reload(news_gen_test)
from Datasets.News.news_gen_test import gen_test_data

In [8]:
K = 10
precision = np.zeros(K)

for itr, (image, name) in enumerate(tqdm(gen_test_data())):
    image = image['image']
    # get the orb features
    f = orb_detector.__get_features__(image)
    # it is possible that no descriptors are obtained
    if f is None:
        print(f'{name}: no features extracted')
        continue
        
    f = np.array(f)
    
    # histogram
    labels = kmeans.search(f)[:, 0]        
    counts = np.unique(labels, return_counts=True)
    arr = np.zeros(num_clusters)
    arr[counts[0]] = counts[1]
    arr /= np.linalg.norm(arr)
    
    # finding the nearest neighbours
    indices = knn.findKNearest(arr[None, :], K)[0]
    # finding the position of correct image
    # in returned neighbours
    # the original image may not be there in the image
    pos = np.where(indices==itr)[0]
    if pos.size:
        precision[pos[0]:] += 1

100%|██████████| 41762/41762 [23:11<00:00, 30.00it/s]
41762it [23:11, 30.00it/s]


In [9]:
precision/itr

array([0.99554608, 0.99949714, 0.99985633, 1.        , 1.00002395,
       1.00002395, 1.00002395, 1.00002395, 1.00002395, 1.00002395])