In [1]:
import faiss
import os
import pandas as pd
import numpy 
import glob
import numpy as np
from skimage import measure, transform, feature
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import ml_metrics as metrics
%matplotlib inline

In [2]:
# DB details
NPZ_FEAT_SRC = "/local/cs572/rchan31/codespace/BaselinePrototype/npz_file_saves/COMPACT_NPZ_DELF_FEATS/"
FAISS_INDEX_FILE = "/local/cs572/rchan31/fINALwEEK/FAISSIndex/FAISS_delf_MultiIndex_v3.index" # TODO:
IMG_SRC_PATH = "/local/cs572/cziems/index/"

DB_LOOKUP_CSV = "/local/cs572/rchan31/fINALwEEK/csv_lookups/FAISS_delf_v3_MultiIndex_CSV_lookup.csv" # TODO:
N_INDEX = 952133

In [3]:
cpu_index = faiss.read_index(FAISS_INDEX_FILE)

cpu_index.ntotal

952133

In [4]:
# Query details
Q_NPZ_FEAT_SRC = "/local/cs572/rchan31/fINALwEEK/COMPACT_NPZ_DELF_test_FEATS"
Q_IMG_SRC_PATH = "/local/cs572/cziems/test/"

Q_LOOKUP_CSV = "/local/cs572/rchan31/fINALwEEK/csv_lookups/processed_retrieval_solutions.csv"
Q_N_INDEX = 667

INITIAL_RANKED_LIST_CSV = "/local/cs572/rchan31/fINALwEEK/csv_lookups/initial_retrievals_delf_MultiIndex_v3.csv"

In [5]:
# query-collection
query_df = pd.read_csv(Q_LOOKUP_CSV)
query_image_IDs = query_df["id"]
len(query_image_IDs)

667

In [6]:
quer_vecs = []
quer_ids = {}
for i, fn in enumerate(glob.glob(os.path.join(Q_NPZ_FEAT_SRC, "*.npz"))):
    ID = os.path.basename(fn)[:-4]
    vec = np.load(fn)["descriptors"]
    quer_vecs.append(np.asarray(vec[0, :], dtype=np.float32))
    quer_ids[i] = ID
quer_vecs = np.array(quer_vecs)

quer_vecs.shape

(667, 40)

In [7]:
TOP_K = 100

cpu_index.nprobe = 100

distances, indices = cpu_index.search(quer_vecs, TOP_K)
print(distances.shape)
print(indices.shape)

indices

(667, 100)
(667, 100)


array([[736715, 363666, 948517, ..., 693750, 939159, 765396],
       [501075, 543586, 575723, ..., 428912, 140755, 585678],
       [229099, 775990, 410356, ...,   3485, 524962,   1653],
       ...,
       [255989, 528209, 108180, ..., 245666,  22096, 721689],
       [568700, 641612, 916228, ..., 470368, 876154,  68822],
       [406825, 673593, 472827, ..., 132467,    276, 880866]])

In [8]:
db_df = pd.read_csv(DB_LOOKUP_CSV)
db_df.head()

Unnamed: 0.1,Unnamed: 0,filenames
0,0,00002469b818f290
1,1,0000298d976221f3
2,2,00002e4e382333bf
3,3,000036329c35b65c
4,4,0000394edaaa55b1


In [9]:
def match_initial_retrievals(q_idx, query_id): # per query
    
    db_ret_img_IDs = []
    for topk_idx, index_num in enumerate(indices[q_idx]):
        
        if index_num == -1:
            continue
        
        fileID = db_df.loc[index_num]["filenames"]
        db_ret_img_IDs.append(fileID)
    
    if len(db_ret_img_IDs) != TOP_K:
        db_ret_img_IDs = db_ret_img_IDs + ['None'] * (TOP_K-len(db_ret_img_IDs))

    return db_ret_img_IDs


ranked_q_db_pairs = []
for q_idx, query_row in query_df.iterrows():
    db_ret_img_IDs = match_initial_retrievals(q_idx, query_row["id"])
    ranked_q_db_pairs.append(db_ret_img_IDs)
    
    print("............ Finished retrieving for query : %s.jpg : %d / %d" % (query_row["id"], (q_idx + 1), query_df.shape[0]))
    
        
retrieval_df = pd.DataFrame(ranked_q_db_pairs)
retrieval_df.insert(0, "queryImageID", query_df["id"])

............ Finished retrieving for query : f60a52c609ab0208.jpg : 1 / 667
............ Finished retrieving for query : 4fa678d60ab4bc23.jpg : 2 / 667
............ Finished retrieving for query : 5be9d8002f8dc60e.jpg : 3 / 667
............ Finished retrieving for query : 0810084dfbce8ea8.jpg : 4 / 667
............ Finished retrieving for query : bb8ca55b60e8cff4.jpg : 5 / 667
............ Finished retrieving for query : c0c270330090bab6.jpg : 6 / 667
............ Finished retrieving for query : 6100f8cc6c68c67f.jpg : 7 / 667
............ Finished retrieving for query : 2e77b0bbdd4d2677.jpg : 8 / 667
............ Finished retrieving for query : 1934744456f5f29c.jpg : 9 / 667
............ Finished retrieving for query : 7929c900163556db.jpg : 10 / 667
............ Finished retrieving for query : 42b9fd64aa1b43bc.jpg : 11 / 667
............ Finished retrieving for query : b39ff185206cf76b.jpg : 12 / 667
............ Finished retrieving for query : e7a124993271a25a.jpg : 13 / 667
........

In [10]:
retrieval_df.to_csv(INITIAL_RANKED_LIST_CSV)
retrieval_df.head()

Unnamed: 0,queryImageID,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,f60a52c609ab0208,c62a9ca23e45420f,61f08305fbc9f806,ff039f52ca878504,f258c7eb40fdc8cf,7c9c6f52fd38e29b,5cbb626ced45ba96,9ae068672e7d8270,61e5ac7b27f26e15,c800b2f0e4d23943,...,353cdc982697ebb9,583411310be32c3e,3fe77660e5ba7e8a,46c3ed8ef09b5119,64fdb1d47de1bf07,ddb5178a5397776d,863dc3093a3968c3,ba9627ba978958e9,fc7af51823e9137a,cdd271e5d2ba1d0f
1,4fa678d60ab4bc23,86b37c8c6c559b21,921f02216a181a18,9adc544d34892d6a,720382171af17c2f,8b21751541e80559,e25485a00530097d,5f443f06cae15cd5,3f84e9a136feffa2,d2726ef324ceb7e8,...,6f030f25cab3b68d,3421cd84cfe8f558,8b94757658c5f4cb,fed2969b5278a844,ec7b662bf8a9ce53,caddb647d2cbaf39,47367775e52cd791,735ed8d385f4d91e,25ec89b5e07e16b8,9d8dfe0b2e7d772d
2,5be9d8002f8dc60e,3dcfb68630dae905,d0abe024e8e92459,6e709c5b726a7527,4327234a4d27d181,eebf7fc00415e13e,899fc319f90c23e3,24f05eeecbbfaf9c,ecdf378666f39a84,190b40938e2bd088,...,f02a15cb1d442f0c,72dda67b57a481ed,c6f8390bb06b0aa9,df9fcec52ac9c6fa,e1f6577950ef0462,23edebcf8d2acb97,b0a824cbd08e019e,00eb7b05f3a4903c,8d1a7838e3328321,0070b0960ae92c14
3,0810084dfbce8ea8,70e8730fe483d70c,5114c8e3265ada3d,aeba6ccde4c770ff,3330b87594b7dbfc,972862b79aa07753,0944c9f723b5b4ed,8307c5fff7779d70,5d48cd80a39de57c,fa94b601d2382721,...,ca539c56874075fd,93948001c4a30bdb,e145410873532e33,b6c21875e44aafad,645c68dc24280ff7,a392d62ee37ed23f,1f901aa928f363f9,e763186d9699dfdf,24e70e3c2250dfdf,2caaf7b48d2d2075
4,bb8ca55b60e8cff4,e714bc41d5d7084c,d6433b3ad4f2258f,db518a09119c7b1f,0d37fbeeac8a6534,2214d68952b1c5d1,1e897b3ee5a22f43,738bd050be438d37,71b470bc516392da,4de111f0414cadd0,...,58d9aa0da3d82fb3,dd20d49372c3858f,9155bacc533ff75e,568d026fcd32233d,9edaa79ae1d5fea1,ef6a980a7713e427,17dacfc2e1bdda9a,b0740c7e993512fd,0cd02ad748bc87ca,3e6230a8ffd06118


In [11]:
## Removing first-columns : imageID
# cropping out only the retrieved-db-image IDs
retrieved_solution = retrieval_df.iloc[:, 1:] 
retrieved_solution = retrieved_solution.values.tolist()
print(len(retrieved_solution[0]))
len(retrieved_solution)

100


667

In [12]:
## Removing first-columns : imageID
# cropping out only the q-image IDs
solution_df = pd.read_csv(Q_LOOKUP_CSV)
grountruth_solution = solution_df.iloc[:, 2:]
grountruth_solution = grountruth_solution.values.tolist()
print(len(grountruth_solution[0]))
len(grountruth_solution)

100


667

In [13]:
mAPk_cum = 0
TOP_K = 100
for q in range(len(retrieved_solution)):
    temp_mAP = metrics.mapk(grountruth_solution[q], retrieved_solution[q], TOP_K)
    print("mAP for query index :%s : %s" % (q, temp_mAP))
    mAPk_cum += temp_mAP

mAPk_avg = mAPk_cum/len(retrieved_solution)
print(mAPk_cum)
print("mAP across first all queries : ", mAPk_avg) # 0.1916305682880129

mAP for query index :0 : 0.27031966644466643
mAP for query index :1 : 0.1425612737956488
mAP for query index :2 : 0.22260847399128644
mAP for query index :3 : 0.07955449411699411
mAP for query index :4 : 0.07411582774864026
mAP for query index :5 : 0.22504166840104337
mAP for query index :6 : 0.21839664935758685
mAP for query index :7 : 0.2946118907134532
mAP for query index :8 : 0.2707129008144633
mAP for query index :9 : 0.2658177837787213
mAP for query index :10 : 0.2390033846361971
mAP for query index :11 : 0.1425109551906427
mAP for query index :12 : 0.1775871446262071
mAP for query index :13 : 0.08224039415445666
mAP for query index :14 : 0.10641491581335333
mAP for query index :15 : 0.23781784188034183
mAP for query index :16 : 0.26180608540764794
mAP for query index :17 : 0.12874166545260293
mAP for query index :18 : 0.10339362720612721
mAP for query index :19 : 0.26521115256271505
mAP for query index :20 : 0.07413726898101897
mAP for query index :21 : 0.28339338959651456
mAP f

In [19]:
# # Recall
# from sklearn.metrics import recall_score

# Recall_cum = 0
# TOP_K = 100
# for q in range(len(retrieved_solution)):
#     temp_recall = recall_score(grountruth_solution[q], retrieved_solution[q])
#     print("Recall for query index :%s : %s" % (q, temp_recall))
#     Recall_cum += temp_recall

# Recall_avg = Recall_cum/len(retrieved_solution)
# print(Recall_avg)
# print("mAP across first all queries : ", Recall_avg) # 0.1916305682880129

def Recall_at(retrieved, matches, k=5):
    GTP = len(set(matches)) # Ground-Truth True Positives
    if GTP==0:
        return 0
    TP_seen = 0
    R_sum = 0 # sum of recall
    print()
    for i, r in enumerate(retrieved):
        if r in matches[:k]:
            TP_seen += 1
            print(TP_seen)
#             P_sum += (TP_seen) / (i+1)
    return R_sum/GTP

Recall_at(retrieved_solution[0], grountruth_solution[0], 100)

0.0