## Import libraries

In [None]:
from google.colab import drive
from pathlib import Path
from matplotlib import pyplot as plt 
import pandas as pd
import numpy as np
import time
import os
import csv
import concurrent.futures
import sklearn
import sklearn.metrics
import heapq
import tqdm

## Utility functions

### Create annot and load descriptors

In [None]:
def create_annot(path):
  """Build an annotation table for the ``.jpg`` files one directory below ``path``.

  Every file matching ``<path>/<sub-directory>/<name>.jpg`` becomes one row,
  labelled with the name of the sub-directory that contains it.

  Returns:
    A pandas DataFrame with the columns 'identity' and 'image_path'.
  """
  image_paths = [jpg for jpg in Path(path).glob('*/*.jpg')]
  # the identity is the name of the directory holding the image
  identities = []
  for jpg in image_paths:
    identities.append(jpg.parent.name)
  return pd.DataFrame({'identity': identities, 'image_path': image_paths})

def concatenate_annots(list_of_paths):
  """Build the annotation table of every directory in ``list_of_paths`` and stack them.

  The per-directory tables are created concurrently with ``create_annot`` and
  joined in the order of ``list_of_paths``; the result gets a fresh 0..N-1 index.

  Args:
    list_of_paths: iterable of dataset root directories.

  Returns:
    A single pandas DataFrame; an empty DataFrame when no path is given.
  """
  with concurrent.futures.ThreadPoolExecutor() as executor:
    # executor.map yields the results in input order, whatever the completion order
    annots = list(executor.map(create_annot, list_of_paths))
  if not annots:
    # pd.concat raises on an empty list; keep returning an empty frame instead
    return pd.DataFrame()
  # one concat instead of DataFrame.append in a loop (append was removed in pandas 2.0)
  return pd.concat(annots, ignore_index=True)

In [None]:
def load_descriptors(path):
  """Return what ``np.load`` reads from the file at ``path``."""
  # opened in binary mode; the handle is closed once the data has been read
  with open(path, 'rb') as handle:
    descriptors = np.load(handle)
  return descriptors

def concatenate_descriptors(list_of_paths):
  """Load every descriptor file in ``list_of_paths`` and stack them along axis 0.

  Files are read concurrently with ``load_descriptors`` and joined in the order
  of ``list_of_paths``.

  Args:
    list_of_paths: iterable of paths accepted by ``load_descriptors``.

  Returns:
    The concatenated NumPy array, or None when no path is given.
  """
  with concurrent.futures.ThreadPoolExecutor() as executor:
    # executor.map yields the results in input order
    arrays = list(executor.map(load_descriptors, list_of_paths))
  if not arrays:
    return None
  # a single concatenate avoids recopying the accumulated array for every file
  return np.concatenate(arrays)

### Distance functions

In [None]:
def cosine_distance(o1, o2):
  """Return one minus the pairwise cosine similarity between ``o1`` and ``o2``.

  Both arguments are passed to ``sklearn.metrics.pairwise.cosine_similarity``
  as X and Y; the result has the same shape as that similarity matrix.
  """
  similarity = sklearn.metrics.pairwise.cosine_similarity(X=o1, Y=o2)
  ones = np.ones(similarity.shape)
  return ones - similarity

def euclidean_distance(o1, o2):
  """Return the pairwise Euclidean distances between ``o1`` and ``o2``.

  Thin wrapper around ``sklearn.metrics.pairwise_distances`` with
  ``metric='euclidean'``.
  """
  distances = sklearn.metrics.pairwise_distances(X=o1, Y=o2, metric='euclidean')
  return distances

### Save test results

In [None]:
def save_results(dir, file_name, results, header=True):
  """Write ``results`` to ``<dir>/<file_name>.csv``, one row per item.

  Args:
    dir: destination directory (must already exist).
    file_name: file name without the ``.csv`` extension.
    results: iterable of rows, each an iterable of values.
    header: when True, first write the row ``K, P@K, R@K, QUERY TIME``.
  """
  destination = os.path.join(dir, file_name + ".csv")
  # newline='' lets the csv module control line endings; without it, platforms
  # that translate newlines end up with a blank line after every row
  with open(destination, 'w', newline='') as f:
    writer = csv.writer(f)
    # write the header
    if header:
      writer.writerow(["K", "P@K", "R@K", "QUERY TIME"])
    # write the data
    writer.writerows(results)

In [None]:
# Mount Google Drive under /content/drive; every dataset path below lives there.
drive.mount(
  '/content/drive',
  force_remount=True,
)

Mounted at /content/drive


### Create annot and load descriptors for the database

In [None]:
# Database annotations: the bird training images followed by the mirflickr25k images.
db_annot = concatenate_annots([
  '/content/drive/MyDrive/CV_Birds/train',
  '/content/drive/MyDrive/CV_Birds/mirflickr25k',
])
db_annot

Unnamed: 0,identity,image_path
0,AFRICAN CROWNED CRANE,/content/drive/MyDrive/CV_Birds/train/AFRICAN ...
1,AFRICAN CROWNED CRANE,/content/drive/MyDrive/CV_Birds/train/AFRICAN ...
2,AFRICAN CROWNED CRANE,/content/drive/MyDrive/CV_Birds/train/AFRICAN ...
3,AFRICAN CROWNED CRANE,/content/drive/MyDrive/CV_Birds/train/AFRICAN ...
4,AFRICAN CROWNED CRANE,/content/drive/MyDrive/CV_Birds/train/AFRICAN ...
...,...,...
72327,mirflickr,/content/drive/MyDrive/CV_Birds/mirflickr25k/m...
72328,mirflickr,/content/drive/MyDrive/CV_Birds/mirflickr25k/m...
72329,mirflickr,/content/drive/MyDrive/CV_Birds/mirflickr25k/m...
72330,mirflickr,/content/drive/MyDrive/CV_Birds/mirflickr25k/m...


In [None]:
# Database descriptors, loaded in the same order as db_annot (training, then distractor)
# so that row i of db_descriptors describes row i of db_annot.
db_descriptors = concatenate_descriptors([
  '/content/drive/MyDrive/CV_Birds/features/training/AutoEncoder/512to128withPace64.npy',
  '/content/drive/MyDrive/CV_Birds/features/distractor/AutoEncoder/512to128withPace64.npy',
])
db_descriptors.shape

(72332, 128)

### Create annot and load descriptors for the test set

In [None]:
# Annotations of the test images, which are used as queries.
query_annot = create_annot(
  '/content/drive/MyDrive/CV_Birds/test'
)
query_annot

Unnamed: 0,identity,image_path
0,AFRICAN CROWNED CRANE,/content/drive/MyDrive/CV_Birds/test/AFRICAN C...
1,AFRICAN CROWNED CRANE,/content/drive/MyDrive/CV_Birds/test/AFRICAN C...
2,AFRICAN CROWNED CRANE,/content/drive/MyDrive/CV_Birds/test/AFRICAN C...
3,AFRICAN CROWNED CRANE,/content/drive/MyDrive/CV_Birds/test/AFRICAN C...
4,AFRICAN CROWNED CRANE,/content/drive/MyDrive/CV_Birds/test/AFRICAN C...
...,...,...
1620,YELLOW HEADED BLACKBIRD,/content/drive/MyDrive/CV_Birds/test/YELLOW HE...
1621,YELLOW HEADED BLACKBIRD,/content/drive/MyDrive/CV_Birds/test/YELLOW HE...
1622,YELLOW HEADED BLACKBIRD,/content/drive/MyDrive/CV_Birds/test/YELLOW HE...
1623,YELLOW HEADED BLACKBIRD,/content/drive/MyDrive/CV_Birds/test/YELLOW HE...


In [None]:
# Descriptors of the test images; presumably row i matches row i of query_annot — verify.
query_descriptors = load_descriptors(
  '/content/drive/MyDrive/CV_Birds/features/test/AutoEncoder/512to128withPace64.npy'
)
query_descriptors.shape

(1625, 128)

To run our tests we select only the first image of each species within the test set. Please note that within the test set we have 5 images per species.

In [None]:
# every 5th index in 0..325*5-1, i.e. the first image of each 5-image species group
queries_indexes = list(range(0, 325*5, 5))

In [None]:
# values of k at which P@k and R@k are evaluated; max(ks) is also the number of
# neighbours requested from the brute-force search
ks = [3, 5, 10, 20, 50]

## Load model

In [None]:
def get_descriptor_from_id(id_object):
  """Return the entry of the global ``db_descriptors`` stored at ``id_object``."""
  descriptor = db_descriptors[id_object]
  return descriptor

In [None]:
# Move into the PP-Index notebook directory and execute PPIndex.ipynb in this kernel;
# the PrefixForest class used below is expected to come from that notebook.
%cd "/content/drive/MyDrive/CV_Birds/Notebooks/PP-Index"
%run PPIndex.ipynb

/content/drive/.shortcut-targets-by-id/1rI5YNBuaSlCB__w522WEkHjw-nFuvIo0/CV_Birds/Notebooks/PP-Index


In [None]:
# Open the stored forest; the meaning of the empty first argument is defined by
# PrefixForest in PPIndex.ipynb — NOTE(review): confirm it means "no new data".
pp_forest = PrefixForest(
  "",
  base_directory='/content/drive/MyDrive/CV_Birds/indexes/fine_tuning/forest/cosine',
  forest_file='forest_structure',
)

Forest retrieved from disk


## Brute force queries

In [None]:
# number of neighbours requested from the brute-force search: the largest k in ks
n = max(ks)
n

50

In [None]:
def k_nn_query(query_index, k, distance_metric = 'cosine'):
  """Exact k-nearest-neighbour search of one query descriptor over the database.

  The distances between ``query_descriptors[query_index]`` and the database
  descriptors are computed in a single vectorised call, instead of one call
  per database row followed by a heap of every row.

  Args:
    query_index: row of ``query_descriptors`` / ``query_annot`` used as query.
    k: number of neighbours to return.
    distance_metric: 'cosine' or 'euclidean'.

  Returns:
    ``(best_indexes, rank_distances)``: two tuples of at most ``k`` items with
    the database row numbers and their distances, nearest first; ties are
    resolved by the smaller row number. Both are empty when ``k <= 0`` or the
    database is empty. ``(None, None)`` is returned for an unknown metric.
  """
  print(query_index//5, query_annot['identity'][query_index])
  if distance_metric == 'euclidean':
    distance_function = euclidean_distance
  elif distance_metric == 'cosine':
    distance_function = cosine_distance
  else:
    return None, None

  query_descriptor = query_descriptors[query_index].reshape(1, -1)
  # only rows that have an annotation are searched, as in the row-by-row version
  database = db_descriptors[:len(db_annot)]
  if k <= 0 or len(database) == 0:
    return (), ()

  # the (1, N) distance matrix is flattened to one distance per database row
  distances = np.asarray(distance_function(query_descriptor, database)).reshape(-1)
  # a stable sort keeps equal distances in increasing row order
  order = np.argsort(distances, kind='stable')[:k]
  best_indexes = tuple(int(index) for index in order)
  rank_distances = tuple(float(distances[index]) for index in order)
  return best_indexes, rank_distances

In [None]:
# Exact top-n neighbour ids of every query, in the order of queries_indexes.
brute_force_results = [k_nn_query(query_index, n, 'cosine')[0] for query_index in queries_indexes]
# show a short preview only: displaying every id list floods the notebook output
brute_force_results[0][:10]

0 AFRICAN CROWNED CRANE
1 AFRICAN FIREFINCH
2 ALBATROSS
3 ALEXANDRINE PARAKEET
4 AMERICAN AVOCET
5 AMERICAN BITTERN
6 AMERICAN COOT
7 AMERICAN GOLDFINCH
8 AMERICAN KESTREL
9 AMERICAN PIPIT
10 AMERICAN REDSTART
11 ANHINGA
12 ANNAS HUMMINGBIRD
13 ANTBIRD
14 ARARIPE MANAKIN
15 ASIAN CRESTED IBIS
16 BALD EAGLE
17 BALD IBIS
18 BALI STARLING
19 BALTIMORE ORIOLE
20 BANANAQUIT
21 BANDED BROADBILL
22 BANDED PITA
23 BAR-TAILED GODWIT
24 BARN OWL
25 BARN SWALLOW
26 BARRED PUFFBIRD
27 BAY-BREASTED WARBLER
28 BEARDED BARBET
29 BEARDED BELLBIRD
30 BEARDED REEDLING
31 BELTED KINGFISHER
32 BIRD OF PARADISE
33 BLACK & YELLOW bROADBILL
34 BLACK BAZA
35 BLACK FRANCOLIN
36 BLACK SKIMMER
37 BLACK SWAN
38 BLACK TAIL CRAKE
39 BLACK THROATED BUSHTIT
40 BLACK THROATED WARBLER
41 BLACK VULTURE
42 BLACK-CAPPED CHICKADEE
43 BLACK-NECKED GREBE
44 BLACK-THROATED SPARROW
45 BLACKBURNIAM WARBLER
46 BLONDE CRESTED WOODPECKER
47 BLUE COAU
48 BLUE GROUSE
49 BLUE HERON
50 BLUE THROATED TOUCANET
51 BOBOLINK
52 BORNEAN BRI

[(27,
  34,
  44,
  41,
  73,
  116,
  57,
  105,
  55,
  110,
  120,
  2,
  29,
  87,
  95,
  70,
  60,
  96,
  130,
  19,
  118,
  13,
  50,
  12,
  46,
  11,
  132,
  67,
  102,
  89,
  135,
  54,
  72,
  64,
  101,
  76,
  53,
  107,
  126,
  133,
  65,
  23,
  56,
  14,
  93,
  127,
  8,
  4,
  69,
  22),
 (156,
  140,
  150,
  193,
  272,
  172,
  180,
  252,
  159,
  153,
  144,
  169,
  176,
  225,
  205,
  238,
  229,
  222,
  189,
  263,
  230,
  242,
  214,
  199,
  148,
  220,
  203,
  248,
  171,
  142,
  273,
  177,
  175,
  226,
  149,
  163,
  208,
  168,
  165,
  251,
  145,
  210,
  158,
  137,
  173,
  241,
  240,
  141,
  260,
  191),
 (293,
  400,
  315,
  294,
  307,
  310,
  292,
  327,
  342,
  407,
  344,
  277,
  282,
  346,
  328,
  359,
  381,
  408,
  365,
  398,
  278,
  392,
  368,
  312,
  379,
  357,
  279,
  297,
  358,
  354,
  314,
  349,
  347,
  300,
  341,
  329,
  369,
  403,
  286,
  374,
  311,
  306,
  391,
  394,
  361,
  409,
  396,
  405,
 

In [None]:
# Sanity checks on the ground truth: one result per query, each with exactly n ids.
# Bare expressions in the middle of a cell are neither shown nor checked, so assert.
assert len(brute_force_results) == len(queries_indexes)
assert all(len(neighbors) == n for neighbors in brute_force_results)
# last neighbour of the last query
brute_force_results[-1][-1]

47297

In [None]:
# Store the ground truth, one query per row; the P@K/R@K header does not apply here.
save_results(
  '/content/drive/MyDrive',
  'brute_force_results',
  brute_force_results,
  header=False,
)

In [None]:
import csv

def read_csv(filename, cast=float):
  """Read a CSV file into a list of rows, converting every field with ``cast``.

  Args:
    filename: path of the CSV file.
    cast: callable applied to each field; defaults to ``float``. Pass ``int``
      to read back stored neighbour ids as integers.

  Returns:
    A list with one list of converted values per non-empty row.
  """
  with open(filename, newline='') as f_input:
    # blank lines come back from csv.reader as empty rows; skip them
    return [[cast(value) for value in row] for row in csv.reader(f_input) if row]

# Read the stored ground truth back from Drive.
brute_force_results2 = read_csv(
  '/content/drive/MyDrive/brute_force_results.csv'
)

## Compute Precision@k and Recall@k

P@k = (number of retrieved images with the same identity as the query) / k

R@k = |retrieved by the index ∩ retrieved by brute force| / k

In [None]:
def compute_p_at_k(query_index, retrieved_ids):
  """Fraction of ``retrieved_ids`` whose identity equals the identity of the query.

  Args:
    query_index: label of the query in ``query_annot['identity']``.
    retrieved_ids: labels of the retrieved items in ``db_annot['identity']``.

  Returns:
    A float in [0, 1]; 0.0 when nothing was retrieved.
  """
  if len(retrieved_ids) == 0:
    # nothing retrieved: avoid dividing by zero
    return 0.0
  query_identity = query_annot['identity'][query_index]
  db_identities = db_annot['identity']
  relevant = sum(1 for retrieved_id in retrieved_ids
                 if db_identities[retrieved_id] == query_identity)
  return relevant/len(retrieved_ids)


def compute_r_at_k(retrieved_ids, true_neighbors):
  """Fraction of ``retrieved_ids`` that also appear in ``true_neighbors``.

  The overlap is computed with ``np.intersect1d``, so each distinct id is
  counted once.

  Args:
    retrieved_ids: ids returned by the index.
    true_neighbors: ids returned by the exact search.

  Returns:
    ``len(intersection) / len(retrieved_ids)``; 0.0 when nothing was retrieved.
  """
  if len(retrieved_ids) == 0:
    # nothing retrieved: avoid dividing by zero
    return 0.0
  intersection = np.intersect1d(retrieved_ids, true_neighbors)  # find common elements
  return len(intersection)/len(retrieved_ids)

In [None]:
def rnd_pivots_queries(query_index, n, true_neighbors):
  """Run one ``n``-nearest-neighbour query on ``pp_forest`` and score it.

  Args:
    query_index: row of ``query_descriptors`` used as query.
    n: number of neighbours requested from the index.
    true_neighbors: exact neighbour ids of the query; assumed to be ordered
      nearest first (as ``k_nn_query`` returns them), possibly longer than ``n``.

  Returns:
    ``(precision, recall, seconds)``: P@n, R@n and the time spent inside
    ``find_nearest_neighbors``.
  """
  # perf_counter is a monotonic high-resolution clock, unlike time.time
  start_time = time.perf_counter()
  ids, distances = pp_forest.find_nearest_neighbors(query_descriptors[query_index], n, perturbations=3)
  elapsed = time.perf_counter() - start_time
  ids = np.asarray(ids).tolist()
  # R@n compares against the exact top-n only; using a longer ground-truth list
  # would count approximate hits that are not among the true n nearest
  top_n_true = true_neighbors[:n]
  return (compute_p_at_k(query_index, ids), compute_r_at_k(ids, top_n_true), elapsed)

In [None]:
# For every k: query the index with each selected test image and average
# precision, recall and query time over all queries.
index_results = []

for k in ks:
  precisions = []
  recalls = []
  times = []
  # a progress bar instead of one printed line per query
  for position, query_index in enumerate(tqdm.tqdm(queries_indexes, desc=f"k={k}")):
    # brute_force_results was built in the order of queries_indexes, so the
    # ground truth of a query sits at the same position
    p, r, t = rnd_pivots_queries(query_index, k, brute_force_results[position])
    precisions.append(p)
    recalls.append(r)
    times.append(t)
  result_tuple = (k, np.mean(precisions), np.mean(recalls), np.mean(times))
  print(result_tuple)
  index_results.append(result_tuple)

k= 3 0 AFRICAN CROWNED CRANE
k= 3 5 AFRICAN FIREFINCH
k= 3 10 ALBATROSS
k= 3 15 ALEXANDRINE PARAKEET
k= 3 20 AMERICAN AVOCET
k= 3 25 AMERICAN BITTERN
k= 3 30 AMERICAN COOT
k= 3 35 AMERICAN GOLDFINCH
k= 3 40 AMERICAN KESTREL
k= 3 45 AMERICAN PIPIT
k= 3 50 AMERICAN REDSTART
k= 3 55 ANHINGA
k= 3 60 ANNAS HUMMINGBIRD
k= 3 65 ANTBIRD
k= 3 70 ARARIPE MANAKIN
k= 3 75 ASIAN CRESTED IBIS
k= 3 80 BALD EAGLE
k= 3 85 BALD IBIS
k= 3 90 BALI STARLING
k= 3 95 BALTIMORE ORIOLE
k= 3 100 BANANAQUIT
k= 3 105 BANDED BROADBILL
k= 3 110 BANDED PITA
k= 3 115 BAR-TAILED GODWIT
k= 3 120 BARN OWL
k= 3 125 BARN SWALLOW
k= 3 130 BARRED PUFFBIRD
k= 3 135 BAY-BREASTED WARBLER
k= 3 140 BEARDED BARBET
k= 3 145 BEARDED BELLBIRD
k= 3 150 BEARDED REEDLING
k= 3 155 BELTED KINGFISHER
k= 3 160 BIRD OF PARADISE
k= 3 165 BLACK & YELLOW bROADBILL
k= 3 170 BLACK BAZA
k= 3 175 BLACK FRANCOLIN
k= 3 180 BLACK SKIMMER
k= 3 185 BLACK SWAN
k= 3 190 BLACK TAIL CRAKE
k= 3 195 BLACK THROATED BUSHTIT
k= 3 200 BLACK THROATED WARBLER
k= 3

## Save and plot results

In [None]:
# one (k, P@k, R@k, mean query time) tuple is expected per value of k
assert len(index_results) == len(ks)
index_results

True

In [None]:
# Store the per-k averages with the default K / P@K / R@K / QUERY TIME header.
save_results(
  '/content/drive/MyDrive/CV_Birds/performance/fine_tuning/index/AutoEncoder/',
  'precision_and_recall',
  index_results,
)

In [None]:
## Plot