## Import libraries

In [1]:
from google.colab import drive
from pathlib import Path
from matplotlib import pyplot as plt 
import pandas as pd
import numpy as np
import time
import os
import csv
import concurrent.futures

## Utility functions

### Create annot and load descriptors

In [2]:
def create_annot(path):
  """Build an annotation table for the ``*.jpg`` files one directory below ``path``.

  The identity of an image is the name of the directory that contains it.

  Args:
    path: root directory; images are looked up with the pattern ``*/*.jpg``.

  Returns:
    A pandas DataFrame with the columns ``identity`` (parent directory name)
    and ``image_path`` (``Path`` of the image), one row per image found.
  """
  # materialise the glob once so both columns are built from the same listing
  image_paths = [*Path(path).glob('*/*.jpg')]
  identities = list(map(lambda image: image.parent.name, image_paths))
  return pd.DataFrame({'identity': identities, 'image_path': image_paths})

def concatenate_annots(list_of_paths):
  """Build the annotation tables of several directories and stack them.

  ``create_annot`` is submitted to a thread pool once per path; the resulting
  tables are stacked in the order of ``list_of_paths`` with a fresh 0..N-1 index.

  Args:
    list_of_paths: iterable of root directories accepted by ``create_annot``.

  Returns:
    A single pandas DataFrame holding the rows of every per-path table, or an
    empty DataFrame when ``list_of_paths`` is empty.
  """
  with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(create_annot, path) for path in list_of_paths]
    # iterate in submission order so the row order follows list_of_paths
    annots = [future.result() for future in futures]
  if not annots:
    # pd.concat raises on an empty list; keep returning an empty frame instead
    return pd.DataFrame()
  # one concat instead of DataFrame.append in a loop (append was removed in pandas 2.0)
  return pd.concat(annots, ignore_index=True)

In [3]:
def load_descriptors(path):
  """Open ``path`` in binary mode and return what ``np.load`` reads from it.

  Args:
    path: location of the file to read.

  Returns:
    The object returned by ``np.load`` for that file.
  """
  handle = open(path, 'rb')
  try:
    descriptors = np.load(handle)
  finally:
    # always release the file handle, even if loading fails
    handle.close()
  return descriptors

def concatenate_descriptors(list_of_paths):
  """Load several descriptor files and join them along the first axis.

  ``load_descriptors`` is submitted to a thread pool once per path; the loaded
  arrays are joined in the order of ``list_of_paths``.

  Args:
    list_of_paths: iterable of file paths accepted by ``load_descriptors``.

  Returns:
    ``None`` when ``list_of_paths`` is empty, the loaded array itself when there
    is exactly one path, otherwise the ``np.concatenate`` of all loaded arrays.
  """
  with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(load_descriptors, path) for path in list_of_paths]
    # iterate in submission order so the row order follows list_of_paths
    loaded = [future.result() for future in futures]
  if not loaded:
    return None
  if len(loaded) == 1:
    return loaded[0]
  # a single concatenate avoids re-copying the accumulated array for every file
  return np.concatenate(loaded)

### Create pivots

In [4]:
def generate_pivots(descriptors, n, strategy="rnd"):
  """Pick ``n`` pivots from ``descriptors``.

  Args:
    descriptors: array of descriptors, indexed along its first axis.
    n: number of pivots to return.
    strategy: ``"kMED"`` fits ``sklearn_extra.cluster.KMedoids`` with ``n``
      clusters and returns its ``cluster_centers_``; ``"rnd"`` (default) draws
      random rows. Any other value prints a notice and falls back to random rows.

  Returns:
    The selected pivots: ``cluster_centers_`` for ``"kMED"``, otherwise
    ``descriptors`` indexed by the randomly drawn row ids.
  """
  if strategy == "kMED":
    # imported here because the notebook's import cell never brings sklearn_extra into scope
    import sklearn_extra.cluster
    kmedoids = sklearn_extra.cluster.KMedoids(n_clusters=n).fit(descriptors)
    return kmedoids.cluster_centers_
  if strategy != "rnd":
    print(strategy, "was not implemented. Random pivots were returned")
  # draw without replacement so the same descriptor is not chosen twice as a pivot;
  # replacement is only allowed when more pivots are requested than descriptors exist
  allow_repeats = n > len(descriptors)
  pivots_id = np.random.choice(len(descriptors), size=n, replace=allow_repeats)
  return descriptors[pivots_id]

def generate_list_of_pivots(descriptors, t, n, strategy="rnd"):
  """Produce ``t`` independent pivot sets by calling ``generate_pivots`` in a thread pool.

  Args:
    descriptors: forwarded to ``generate_pivots``.
    t: number of pivot sets to generate.
    n: number of pivots per set, forwarded to ``generate_pivots``.
    strategy: forwarded to ``generate_pivots``.

  Returns:
    A list with ``t`` pivot sets, collected in the order the jobs complete.
  """
  with concurrent.futures.ThreadPoolExecutor() as pool:
    jobs = []
    for _ in range(t):
      jobs.append(pool.submit(generate_pivots, descriptors, n, strategy))
    # as_completed yields jobs as they finish, not in submission order
    return [job.result() for job in concurrent.futures.as_completed(jobs)]

### Save test results

In [5]:
def save_results(dir, file_name, results):
  """Write the test results to ``<dir>/<file_name>.csv``.

  The file starts with the header row ``CLASS, AP, QUERY TIME`` followed by one
  row per element of ``results``. An existing file is overwritten.

  Args:
    dir: directory in which the file is created.
    file_name: file name without the ``.csv`` extension.
    results: iterable of rows; each row is an iterable of values.
  """
  # newline='' leaves line endings to the csv module, as its documentation requires;
  # without it some platforms write an extra blank line after every row
  with open(os.path.join(dir, file_name + ".csv"), 'w', newline='') as f:
    writer = csv.writer(f)
    # write the header
    writer.writerow(["CLASS", "AP", "QUERY TIME"])
    # write the data
    writer.writerows(results)

## Test Performance

In [6]:
# Mount Google Drive at /content/drive; every path used below lives under this mount point.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


### Create annot and load descriptors for the database

In [8]:
# Database annotations: the rows of the train directory followed by those of mirflickr25k.
db_annot = concatenate_annots([
    '/content/drive/MyDrive/CV_Birds/train',
    '/content/drive/MyDrive/CV_Birds/mirflickr25k',
])
db_annot

Unnamed: 0,identity,image_path
0,AFRICAN CROWNED CRANE,/content/drive/MyDrive/CV_Birds/train/AFRICAN ...
1,AFRICAN CROWNED CRANE,/content/drive/MyDrive/CV_Birds/train/AFRICAN ...
2,AFRICAN CROWNED CRANE,/content/drive/MyDrive/CV_Birds/train/AFRICAN ...
3,AFRICAN CROWNED CRANE,/content/drive/MyDrive/CV_Birds/train/AFRICAN ...
4,AFRICAN CROWNED CRANE,/content/drive/MyDrive/CV_Birds/train/AFRICAN ...
...,...,...
72327,mirflickr,/content/drive/MyDrive/CV_Birds/mirflickr25k/m...
72328,mirflickr,/content/drive/MyDrive/CV_Birds/mirflickr25k/m...
72329,mirflickr,/content/drive/MyDrive/CV_Birds/mirflickr25k/m...
72330,mirflickr,/content/drive/MyDrive/CV_Birds/mirflickr25k/m...


In [9]:
# Database descriptors: the training features followed by the distractor features.
db_descriptors = concatenate_descriptors([
    '/content/drive/MyDrive/CV_Birds/features/training/ResNet152v2/OneDense512_Dropout_fine_tuning.npy',
    '/content/drive/MyDrive/CV_Birds/features/distractor/ResNet152v2/OneDense512_Dropout_fine_tuning.npy',
])
db_descriptors.shape

(72332, 512)

### Create annot and load descriptors for the test set

In [10]:
# Annotations (identity, image_path) for the images of the test directory, used as queries.
query_annot = create_annot('/content/drive/MyDrive/CV_Birds/test')
query_annot

Unnamed: 0,identity,image_path
0,AFRICAN CROWNED CRANE,/content/drive/MyDrive/CV_Birds/test/AFRICAN C...
1,AFRICAN CROWNED CRANE,/content/drive/MyDrive/CV_Birds/test/AFRICAN C...
2,AFRICAN CROWNED CRANE,/content/drive/MyDrive/CV_Birds/test/AFRICAN C...
3,AFRICAN CROWNED CRANE,/content/drive/MyDrive/CV_Birds/test/AFRICAN C...
4,AFRICAN CROWNED CRANE,/content/drive/MyDrive/CV_Birds/test/AFRICAN C...
...,...,...
1620,YELLOW HEADED BLACKBIRD,/content/drive/MyDrive/CV_Birds/test/YELLOW HE...
1621,YELLOW HEADED BLACKBIRD,/content/drive/MyDrive/CV_Birds/test/YELLOW HE...
1622,YELLOW HEADED BLACKBIRD,/content/drive/MyDrive/CV_Birds/test/YELLOW HE...
1623,YELLOW HEADED BLACKBIRD,/content/drive/MyDrive/CV_Birds/test/YELLOW HE...


In [11]:
# Descriptors of the test images.
# NOTE(review): assumes row i corresponds to row i of query_annot - confirm against the feature-extraction notebook.
query_descriptors = load_descriptors('/content/drive/MyDrive/CV_Birds/features/test/ResNet152v2/OneDense512_Dropout_fine_tuning.npy')
query_descriptors.shape

(1625, 512)

To run our tests we select only the first image of each species within the test set. Please note that within the test set we have 5 images per species.

In [12]:
# Indices 0, 5, 10, ...: every fifth position among the 325*5 test images.
queries_indexes = list(range(0, 325*5, 5))

### Create PP-Index

In [13]:
# Remove what is already stored in the FT_forest_512_5rnd_cosine index directory on Drive.
# The first rm has no -r, so it reports "Is a directory" for forest_structure and skips it;
# the second command then removes the contents of forest_structure recursively.
!rm /content/drive/MyDrive/CV_Birds/performance/fine_tuning/index/FT_forest_512_5rnd_cosine/*
!rm -r /content/drive/MyDrive/CV_Birds/performance/fine_tuning/index/FT_forest_512_5rnd_cosine/forest_structure/*

rm: cannot remove '/content/drive/MyDrive/CV_Birds/performance/fine_tuning/index/FT_forest_512_5rnd_cosine/forest_structure': Is a directory


In [14]:
def get_descriptor_from_id(id_object):
  """Return the entry of the module-level ``db_descriptors`` indexed by ``id_object``.

  NOTE(review): nothing in this notebook calls this function directly; presumably
  the PP-Index code loaded with ``%run PPIndex.ipynb`` looks it up - confirm.
  """
  descriptor = db_descriptors[id_object]
  return descriptor

In [15]:
# Change into the PP-Index notebook directory and execute PPIndex.ipynb in this session.
# NOTE(review): PrefixForest, used in the next cell, is presumably defined by that notebook - confirm.
%cd "/content/drive/MyDrive/CV_Birds/Notebooks/PP-Index"
%run PPIndex.ipynb

/content/drive/.shortcut-targets-by-id/1rI5YNBuaSlCB__w522WEkHjw-nFuvIo0/CV_Birds/Notebooks/PP-Index


In [16]:
# Three random pivot sets (t=3) of five pivots each (n=5), one set per tree of the forest.
pivots = generate_list_of_pivots(db_descriptors, t=3, n=5, strategy="rnd")
rnd_pp_forest = PrefixForest(
    pivots,
    length=3,
    distance_metric='cosine',
    base_directory="/content",
    forest_file='forest_structure',
)
# Objects are inserted by their row index in db_descriptors.
rnd_pp_forest.insert_objects_into_forest(range(len(db_descriptors)))
rnd_pp_forest.save()

Inserting objects intoInserting objects into tree2 ...
Inserting objects into  tree3 ...
tree1 ...
--- Insertion into tree tree3 completed: 242.293 seconds ---
--- Insertion into tree tree1 completed: 243.235 seconds ---
--- Insertion into tree tree2 completed: 243.358 seconds ---


### Compute mAP

In [17]:
# Per-species image counts of the database, ignoring the rows labelled 'mirflickr'.
birds_db = db_annot.loc[db_annot['identity'] != 'mirflickr']
counts = birds_db.groupby('identity').count()
# Work on the single count column so min/max/mean are plain scalars;
# calling int()/float() on a one-element Series is deprecated in recent pandas.
images_per_species = counts['image_path']
print("Minimum number of images per species:", int(images_per_species.min()))
print("Maximum number of images per species:", int(images_per_species.max()))
# mean over the species actually present instead of dividing by a hard-coded 325
print("Average number of images:", float(images_per_species.mean()))

Minimum number of images per species: 116
Maximum number of images per species: 249
Average number of images: 145.63692307692307


Since at most we have 249 images per species, we use $n=250$.

In [18]:
# Cut-off n of AP@n; it is handed to find_nearest_neighbors through rnd_pivots_queries.
n = 250

The formula for Average Precision is the following:

> $AP@n=\frac{1}{GTP}\sum_{k=1}^{n}P@k×rel@k$

where $GTP$ refers to the total number of ground truth positives, $n$ refers to the total number of images we are interested in, $P@k$ refers to the precision@k and $rel@k$ is a relevance function. 

The relevance function is an indicator function that equals 1 if the document at rank $k$ is relevant and 0 otherwise.

In [19]:
def compute_ap(query_index, retrieved_ids):
  """Compute the average precision of one ranked result list.

  Reads the module-level ``query_annot`` and ``db_annot`` tables and prints
  ``query_index//5`` together with the query identity.

  Args:
    query_index: index label of the query in ``query_annot``.
    retrieved_ids: ranked iterable of index labels into ``db_annot``, best match first.

  Returns:
    A tuple ``(query_identity, ap)`` where ``ap`` is the sum of precision@k over
    the ranks k holding a result with the query's identity, divided by the number
    of ``db_annot`` rows with that identity (GTP). ``ap`` is ``0.0`` when the
    database contains no row with that identity.
  """
  query_identity = query_annot['identity'][query_index]
  print(query_index//5, query_identity)
  # fetch the identity column once instead of on every loop iteration
  db_identities = db_annot['identity']
  GTP = int((db_identities == query_identity).sum())
  if GTP == 0:
    # no ground-truth positives: avoid dividing by zero
    return (query_identity, 0.0)
  relevant = 0
  precision_summation = 0
  for rank, object_id in enumerate(retrieved_ids, start=1):
    if db_identities[object_id] == query_identity: # relevant result
      relevant += 1
      # precision@k counts the relevant results among the first k
      precision_summation += relevant/rank
  return (query_identity, precision_summation/GTP)

For each query, $Q$, we can calculate a corresponding $AP$. Then, the $mAP$ is simply the mean of the $AP$ values over all the queries that were made.
> $mAP = \frac{1}{N}\sum_{i=1}^{N}AP_i$

In our case, $N=325$ (one query per species)

In [20]:
def rnd_pivots_queries(query_index, n):
  """Run one nearest-neighbour query on ``rnd_pp_forest`` and score it.

  Only the ``find_nearest_neighbors`` call is timed; computing the average
  precision is not included in the measurement.

  Args:
    query_index: position of the query in ``query_descriptors`` (also passed to ``compute_ap``).
    n: number of neighbours requested from the forest.

  Returns:
    The tuple returned by ``compute_ap`` extended with the query time in seconds:
    ``(query_identity, ap, seconds)``.
  """
  # perf_counter is a monotonic high-resolution clock, unlike the wall clock time.time()
  start_time = time.perf_counter()
  ids, _ = rnd_pp_forest.find_nearest_neighbors(query_descriptors[query_index], n)
  elapsed = time.perf_counter() - start_time
  return compute_ap(query_index, ids.tolist()) + (elapsed,)

In [21]:
# One (identity, AP, query time) tuple per selected query, in the order of queries_indexes.
aps = [rnd_pivots_queries(query_index, n) for query_index in queries_indexes]

0 AFRICAN CROWNED CRANE
1 AFRICAN FIREFINCH
2 ALBATROSS
3 ALEXANDRINE PARAKEET
4 AMERICAN AVOCET
5 AMERICAN BITTERN
6 AMERICAN COOT
7 AMERICAN GOLDFINCH
8 AMERICAN KESTREL
9 AMERICAN PIPIT
10 AMERICAN REDSTART
11 ANHINGA
12 ANNAS HUMMINGBIRD
13 ANTBIRD
14 ARARIPE MANAKIN
15 ASIAN CRESTED IBIS
16 BALD EAGLE
17 BALD IBIS
18 BALI STARLING
19 BALTIMORE ORIOLE
20 BANANAQUIT
21 BANDED BROADBILL
22 BANDED PITA
23 BAR-TAILED GODWIT
24 BARN OWL
25 BARN SWALLOW
26 BARRED PUFFBIRD
27 BAY-BREASTED WARBLER
28 BEARDED BARBET
29 BEARDED BELLBIRD
30 BEARDED REEDLING
31 BELTED KINGFISHER
32 BIRD OF PARADISE
33 BLACK & YELLOW bROADBILL
34 BLACK BAZA
35 BLACK FRANCOLIN
36 BLACK SKIMMER
37 BLACK SWAN
38 BLACK TAIL CRAKE
39 BLACK THROATED BUSHTIT
40 BLACK THROATED WARBLER
41 BLACK VULTURE
42 BLACK-CAPPED CHICKADEE
43 BLACK-NECKED GREBE
44 BLACK-THROATED SPARROW
45 BLACKBURNIAM WARBLER
46 BLONDE CRESTED WOODPECKER
47 BLUE COAU
48 BLUE GROUSE
49 BLUE HERON
50 BLUE THROATED TOUCANET
51 BOBOLINK
52 BORNEAN BRI

In [22]:
# Display the collected (identity, AP, query time) tuples.
aps

[('AFRICAN CROWNED CRANE', 0.703296406798262, 2.469438314437866),
 ('AFRICAN FIREFINCH', 0.48158593584499587, 1.9648754596710205),
 ('ALBATROSS', 0.45194645652144977, 1.8388497829437256),
 ('ALEXANDRINE PARAKEET', 0.5225732155714851, 2.23830246925354),
 ('AMERICAN AVOCET', 0.6685465750405687, 2.144314765930176),
 ('AMERICAN BITTERN', 0.4420174590384099, 1.985260009765625),
 ('AMERICAN COOT', 0.828650120071781, 2.087278127670288),
 ('AMERICAN GOLDFINCH', 0.3102478613458765, 2.2525367736816406),
 ('AMERICAN KESTREL', 0.1881063206820266, 2.011359691619873),
 ('AMERICAN PIPIT', 0.5457842780046025, 2.20345401763916),
 ('AMERICAN REDSTART', 0.36325047622974405, 1.8635225296020508),
 ('ANHINGA', 0.24702038073341634, 2.23799467086792),
 ('ANNAS HUMMINGBIRD', 0.3068847187535703, 2.080514430999756),
 ('ANTBIRD', 0.04698882483487931, 2.0518643856048584),
 ('ARARIPE MANAKIN', 0.6169328046519973, 1.6766712665557861),
 ('ASIAN CRESTED IBIS', 0.18308795995762056, 2.077681064605713),
 ('BALD EAGLE', 0

In [23]:
# Split the result tuples into separate arrays: position 1 is the AP, position 2 the query time.
ap_at_n = np.array([result[1] for result in aps])
query_time = np.array([result[2] for result in aps])

In [24]:
# mAP is the mean of the per-query AP values; the query time is averaged the same way.
mAP_at_n = ap_at_n.mean(axis=0)
avg_query_time = query_time.mean(axis=0)
print("mAP:", mAP_at_n)
print("avg. query time: ", avg_query_time)

mAP: 0.40500974486432717
avg. query time:  2.076718615752


In [25]:
# Write the per-query results (class, AP, query time) to FT_forest_512_5rnd_cosine_results.csv in the index directory on Drive.
save_results('/content/drive/MyDrive/CV_Birds/performance/fine_tuning/index/FT_forest_512_5rnd_cosine', 'FT_forest_512_5rnd_cosine_results', aps)

In [26]:
# Copy the files under /content whose names start with "tree" or "forest" into forest_structure on Drive.
# NOTE(review): presumably these are the files written by rnd_pp_forest.save() (base_directory="/content") - confirm.
! cp /content/tree* /content/drive/MyDrive/CV_Birds/performance/fine_tuning/index/FT_forest_512_5rnd_cosine/forest_structure/
! cp /content/forest* /content/drive/MyDrive/CV_Birds/performance/fine_tuning/index/FT_forest_512_5rnd_cosine/forest_structure/