In [1]:
import faiss
import os
import pandas as pd
import numpy 
import glob
import numpy as np
from skimage import measure, transform, feature
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import ml_metrics as metrics
%matplotlib inline

In [2]:
# DB details
NPZ_FEAT_SRC = "/local/cs572/rchan31/codespace/BaselinePrototype/npz_file_saves/COMPACT_NPZ_DELF_FEATS/"
FAISS_INDEX_FILE = "/local/cs572/rchan31/fINALwEEK/FAISSIndex/FAISS_delf_v1.index"
IMG_SRC_PATH = "/local/cs572/cziems/index/"

DB_LOOKUP_CSV = "/local/cs572/rchan31/fINALwEEK/csv_lookups/FAISS_delf_v1_CSV_lookup.csv"
N_INDEX = 952133

In [3]:
cpu_index = faiss.read_index(FAISS_INDEX_FILE)

cpu_index.ntotal

952133

In [4]:
# Query details
Q_NPZ_FEAT_SRC = "/local/cs572/rchan31/fINALwEEK/COMPACT_NPZ_DELF_test_FEATS"
Q_IMG_SRC_PATH = "/local/cs572/cziems/test/"

Q_LOOKUP_CSV = "/local/cs572/rchan31/fINALwEEK/csv_lookups/processed_retrieval_solutions.csv"
Q_N_INDEX = 667

RERANKED_LIST_CSV = "/local/cs572/rchan31/fINALwEEK/csv_lookups/reranked_retrievals_delf_v1.csv"

In [5]:
# query-collection
query_df = pd.read_csv(Q_LOOKUP_CSV)
query_image_IDs = query_df["id"]
len(query_image_IDs)

667

In [6]:
quer_vecs = []
quer_ids = {}
for i, fn in enumerate(glob.glob(os.path.join(Q_NPZ_FEAT_SRC, "*.npz"))):
    ID = os.path.basename(fn)[:-4]
    vec = np.load(fn)["descriptors"]
    quer_vecs.append(np.asarray(vec[0, :], dtype=np.float32))
    quer_ids[i] = ID
quer_vecs = np.array(quer_vecs)

quer_vecs.shape

(667, 40)

In [7]:
TOP_K = 100

cpu_index.nprobe = 100

distances, indices = cpu_index.search(quer_vecs, TOP_K)
print(distances.shape)
print(indices.shape)

indices

(667, 100)
(667, 100)


array([[736715, 363666, 948517, ..., 719896,  43536, 851043],
       [501075, 543586, 575723, ..., 623293, 119295, 444459],
       [229099, 775990, 410356, ..., 779317, 594871, 644500],
       ...,
       [255989, 528209, 108180, ..., 543274, 807051, 611768],
       [568700, 641612, 916228, ..., 115030, 130539, 948178],
       [406825, 673593, 472827, ..., 773443, 390467, 884963]])

In [8]:
db_df = pd.read_csv(DB_LOOKUP_CSV)
db_df.head()

Unnamed: 0.1,Unnamed: 0,filenames
0,0,00002469b818f290
1,1,0000298d976221f3
2,2,00002e4e382333bf
3,3,000036329c35b65c
4,4,0000394edaaa55b1


In [10]:
def match_retrievals(q_idx, query_id): # per query
    
    query_npz_path = os.path.join(Q_NPZ_FEAT_SRC, "%s.npz" % query_id)
    
    q_locations_x = np.asarray(np.load(query_npz_path)["locations_x"], np.int32)
    q_locations_y = np.asarray(np.load(query_npz_path)["locations_y"], np.int32)
    q_locations = np.asarray([q_locations_x, q_locations_y]).T
    
    num_features_q = q_locations.shape[0]

    query_db_inliers = {}
    db_ret_img_IDs = []
    for topk_idx, index_num in enumerate(indices[q_idx]):
        
        if index_num == -1:
            continue
        

        fileID = db_df.loc[index_num]["filenames"]
        
        db_npz_filename_path = os.path.join(NPZ_FEAT_SRC, "%s.npz" % fileID)
        insta_imageID = "%s.jpg" % fileID
        npz_data = np.load(db_npz_filename_path)
        
        inst_location_x = np.asarray(npz_data["locations_x"], np.int32)
        inst_location_y = np.asarray(npz_data["locations_y"], np.int32)
        db_locations = np.asarray([inst_location_x, inst_location_y]).T

        num_features_db = db_locations.shape[0]
        
        min_feat_len = min(num_features_q, num_features_db)


        # Perform geometric verification using RANSAC.
        _, inliers = measure.ransac((db_locations[:min_feat_len, :],
                                     q_locations[:min_feat_len, :]),
                                      transform.AffineTransform,
                                      min_samples=3,
                                      residual_threshold=20,
                                      max_trials=1000)

        print('Found %d inliers for image : %s.jpg' % (sum(inliers), fileID))
        query_db_inliers[fileID] = sum(inliers)
        db_ret_img_IDs.append(fileID)
        
#         if topk_idx == 2:
#             break


    db_ret_img_IDs.sort(key=query_db_inliers.get, reverse = True)
#     print(db_ret_img_IDs)
    
    if len(db_ret_img_IDs) != TOP_K:
        db_ret_img_IDs = db_ret_img_IDs + ['None'] * (TOP_K-len(db_ret_img_IDs))

    return db_ret_img_IDs


reranked_IDs = []
for q_idx, query_row in query_df.iterrows():
    sorted_db_ret_img_IDs = match_retrievals(q_idx, query_row["id"])
    reranked_IDs.append(sorted_db_ret_img_IDs)
    
    print("............ Finished reranking for query : %s.jpg : %d / %d" % (query_row["id"], q_idx, query_df.shape[0]))
    
    if q_idx == 2:
        break
        
# print(reranked_IDs)
reranked_df = pd.DataFrame(reranked_IDs)
reranked_df.insert(0, "queryImageID", query_df["id"])

Found 9 inliers for image : c62a9ca23e45420f.jpg
Found 10 inliers for image : 61f08305fbc9f806.jpg
Found 7 inliers for image : ff039f52ca878504.jpg
Found 9 inliers for image : f258c7eb40fdc8cf.jpg
Found 10 inliers for image : 7c9c6f52fd38e29b.jpg
Found 8 inliers for image : 5cbb626ced45ba96.jpg
Found 9 inliers for image : 9ae068672e7d8270.jpg
Found 9 inliers for image : 61e5ac7b27f26e15.jpg
Found 9 inliers for image : c800b2f0e4d23943.jpg
Found 9 inliers for image : d54d45d1931a54c5.jpg
Found 9 inliers for image : 0b222e3a8abd07ca.jpg
Found 8 inliers for image : c604ef43fea8fbd1.jpg
Found 7 inliers for image : 1ab719ee0f390fde.jpg
Found 9 inliers for image : fbd33968c8cc0ffb.jpg
Found 11 inliers for image : 06591603c89ba406.jpg
Found 7 inliers for image : eb31b11020497621.jpg
Found 7 inliers for image : 0177598b0a7f795b.jpg
Found 9 inliers for image : b6218c7ee9d204f5.jpg
Found 10 inliers for image : b26c637f44cfc1f1.jpg
Found 9 inliers for image : d2c236ef72656b67.jpg
Found 8 inliers 

In [11]:
reranked_df.to_csv("./csv_lookups/reranked_retrievals_delf_v1.csv")
reranked_df.head()

Unnamed: 0,queryImageID,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,f60a52c609ab0208,6743b01ed9a80b21,795382a3476eef45,f7516eb6373880a8,5b317bd61482003e,5b66021e346ab3c8,0ac20102062fcfe4,2389c6f3ae0d098c,06591603c89ba406,cb160c4a3ebcf864,...,f87b9cbd29f65b13,fc7af51823e9137a,d9467598a6f2c341,00f76a32545a61fa,3171bbb1efba8ded,5a17c286f55000f8,9543cd918ee87e38,b22b1d598e405a32,4025b17b3449da20,c1a50fc8a5713159
1,4fa678d60ab4bc23,9adc544d34892d6a,a153c9c6daa1c533,9a720ea3a239c7e2,921f02216a181a18,9458354974361f83,094dd3087b8eb2b6,3dfe0a1b754c2b79,bf62dece3bcddfef,201e24b5cea6c46d,...,bc1256effd2d1a3e,f4cc1116e22ccd10,25ec89b5e07e16b8,ff8d7a1abf8b0c53,e19b111562e7ea47,1adfc00c2c6689e6,f904c847e26279b1,58462343843646b0,5f443f06cae15cd5,07b7f7489addcf6e
2,5be9d8002f8dc60e,4f420991b27bebb7,795aecdcd72a2ac0,2a9e1f5ef1a57543,eebf7fc00415e13e,7989bf0178906d80,acc1d2f042303981,b0a824cbd08e019e,d0abe024e8e92459,6e709c5b726a7527,...,47fa19a557300bd8,df9fcec52ac9c6fa,8d1a7838e3328321,a4b6fe7b72005ac6,cdd563a2801777ae,746d033aacdeb8a4,c5755c70ce833453,1efd82154a2556c8,557f33defed30b43,96d55734414382af


In [34]:
def AP_at(retrieved, matches, k=5):
    GTP = len(set(matches)) # Ground-Truth True Positives
    if GTP==0:
        return 0
    TP_seen = 0
    P_sum = 0 # sum of precision
    for i, r in enumerate(retrieved):
        if r in matches:
            TP_seen += 1
            P_sum += (TP_seen) / (i+1)
    print(P_sum)
    return P_sum/GTP

def mAP_at(retrieved_arr, matches_arr, num_query=100):
    assert len(retrieved_arr)==len(matches_arr)
    # assert len(retrieved_arr)>=K
    cumulative_AP = 0
    for i in range(num_query):
        retrieved = retrieved_arr[i]
        matches = matches_arr[i]
        cumulative_AP += AP_at(retrieved, matches, i+1)

    print(cumulative_AP)
    return cumulative_AP / num_query

In [13]:
retrieved_solution = reranked_df.iloc[:, 1:] # cropping out only the retrieved-db-image IDs
retrieved_solution = retrieved_solution.values.tolist()

In [14]:
solution_df = pd.read_csv(Q_LOOKUP_CSV)
grountruth_solution = solution_df.iloc[:, 2:]
grountruth_solution = grountruth_solution.values.tolist()

In [35]:
print(len(retrieved_solution))
print(len(grountruth_solution[:3]))
print(len(retrieved_solution[0]))
print(len(grountruth_solution[:3][0]))
mAP_at(retrieved_solution, grountruth_solution[:3], 3)

3
3
100
100
0
0
0
0.0


0.0

In [42]:
import ml_metrics as metrics
sol1 = metrics.mapk(grountruth_solution[0], retrieved_solution[0], 100)
sol2 = metrics.mapk(grountruth_solution[1], retrieved_solution[1], 100)
sol3 = metrics.mapk(grountruth_solution[2], retrieved_solution[2], 100)
print(sol1)
print(sol2)
print(sol3)

mAPk_cum = 0
TOP_K = 100
for q in range(len(retrieved_solution)):
    mAPk_cum += metrics.mapk(grountruth_solution[q], retrieved_solution[q], TOP_K)

mAPk_avg = mAPk_cum/len(retrieved_solution)
print(mAPk_cum)
print("mAP across first 3 queries : ", mAPk_avg)

0.2653045704295704
0.12397819454850705
0.21415891573704068
0.6034416807151182
mAP across first 3 queries :  0.2011472269050394
