
# Goal:
* Figure out why AP decreases with the number of probes
* Look at the combinations being tested
* Look at the distance values 
* Look at the coarse quantizer
* Look at our reconstruction methods
* Look at baseline methods -- are they sound? Are they making up numbers?

In [1]:
import os
import pickle
import json


import numpy as np
from importlib import reload

In [71]:
import random

import utils
import data
import search_concept
import search_expand

reload(utils)
reload(data)
reload(search_concept)
reload(search_expand)


# Index settings
dim = 512
k_coarse = 512
m = 64
nbits = 8
n_probes_list = [5, 10, 25, 50]
segments_to_test = 50

coco_path = '/data/users/jie/data-slicing/COCO/'
embed_path = '/data/users/jie/data-slicing/COCO/embeds/train2017_fixed_clip_only'
pickle_path = os.path.join(coco_path, 'embeds/coco-2017-train-realclip-realsam.pkl')
embed_dict = utils.create_new_pickle(embed_path, pickle_path)



In [72]:
import utils
import data
import search_concept
import search_expand

reload(utils)
reload(data)
reload(search_concept)
reload(search_expand)

def inner_loop(
        index_pq, index_flat_cpu, packd, img_concept_bitmap, 
        all_images, pqq, kmeans, n_probes, embed_dict, average_embeddings,
        n_segments_list):

    precision_results = {
        'our': [],
        'pq': []
    }
    counter_results = {
        'our': [],
        'pq': []
    }
    print('Number of segments:', n_segments_list)
    for n_segments in n_segments_list:
        random_img = random.choice(all_images)
        start_idx, end_idx, _ = embed_dict['img_to_vec_list'][random_img]
        img_embeddings = average_embeddings[start_idx:end_idx]
        random_embedding_idx = random.sample(range(len(img_embeddings)), min(n_segments, len(img_embeddings)))
        
        features = {f'v_{i}': img_embeddings[idx] for i, idx in enumerate(random_embedding_idx)}

        gt_images, _, gt_counter = search_expand.flat_multisearch(
            index_flat_cpu, features, p_k, vec_to_img, 
            max_search_radius=1e6, exclusive_matching=False)
        if len(gt_images) < p_k:
            continue

        our_images, _, our_counter = search_concept.perform_search(
            features, p_k, kmeans, pqq, packd, img_concept_bitmap, all_images, 
            n_probes=n_probes, exclusive_matching=False)
        pq_images, _, pq_counter = search_expand.flat_multisearch(index_pq, features, p_k, vec_to_img, 
            max_search_radius=1e6, exclusive_matching=False)

        our_ap = utils.calculate_avg_precision(gt_images, our_images)
        pq_ap = utils.calculate_avg_precision(gt_images, pq_images)
        precision_results['our'].append(our_ap)
        precision_results['pq'].append(pq_ap)

        counter_results['our'].append(our_counter)
        counter_results['pq'].append(pq_counter)
    
    return precision_results, counter_results

average_embeddings = embed_dict['average_embeddings']
all_precisions = {}
all_counts = {}
for n_probes in n_probes_list:
    train_sample_size = 10_000
    data_sample_size = 10_000

    index_pq, index_flat_cpu, packd, img_concept_bitmap, all_images, pqq, kmeans, = data.get_indices(
            dim=dim, k_coarse=k_coarse, m=m, cluster_bits=nbits, 
            n_probes=n_probes, embed_dict=embed_dict, use_custom_pq=False,
            random_seed=None, train_sample_size=train_sample_size,
            data_sample_size=data_sample_size
        )

    print(index_pq.nprobe, 'is number of nprobe i configured')

    all_images = list(embed_dict['img_to_vec_list'].keys())
    vec_to_img = embed_dict['vec_to_img']
    n_iterations = 100
    n_segments_list = [2, 4, 6]
    p_k = 10

    all_counts[n_probes] = {
        'our': [],
        'pq': []
    }

    all_precisions[n_probes] = {
        'our': [],
        'pq': []
    }

    for i in range(n_iterations):
        precisions, counts = inner_loop(
            index_pq, index_flat_cpu, packd, img_concept_bitmap, 
            all_images, pqq, kmeans, n_probes, embed_dict, average_embeddings,
            n_segments_list
        )
        all_counts[n_probes]['our'].append(counts['our'])
        all_counts[n_probes]['pq'].append(counts['pq'])

        all_precisions[n_probes]['our'].append(precisions['our'])
        all_precisions[n_probes]['pq'].append(precisions['pq'])

    print('Number of probes:', n_probes)
    print('Our precision:', np.mean(all_precisions[n_probes]['our']))
    print('PQ precision:', np.mean(all_precisions[n_probes]['pq']))
    print('Our counter:', np.mean(all_counts[n_probes]['our']))
    print('PQ counter:', np.mean(all_counts[n_probes]['pq']))
    print('')

for n_probes in n_probes_list:
    print('Number of probes:', n_probes)
    print('Our precision:', np.mean(all_precisions[n_probes]['our']))
    print('PQ precision:', np.mean(all_precisions[n_probes]['pq']))
    print('Our counter:', np.mean(all_counts[n_probes]['our']))
    print('PQ counter:', np.mean(all_counts[n_probes]['pq']))
    print('')


        


Training on subset 0.0009526723793983454
Utilizing subset 0.0009526723793983454
Building IVFPQ index




Building Flat index
Kmeans trained (hax) 512


9491it [00:00, 440550.46it/s]


5 is number of nprobe i configured
Number of segments: [2, 4, 6]


AttributeError: 'NoneType' object has no attribute 'info'

In [130]:
index_pq, index_flat_cpu, packd, img_concept_bitmap, all_images, pqq, kmeans, = data.get_indices(
        dim=dim, k_coarse=k_coarse, m=m, cluster_bits=nbits, 
        n_probes=n_probes, embed_dict=embed_dict, use_custom_pq=False,
        random_seed=None, train_sample_size=train_sample_size,
        data_sample_size=data_sample_size
    )

Training on subset 0.004193854348587396
Building IVFPQ index
Building Flat index




Kmeans trained (hax) 512


118287it [00:17, 6663.89it/s] 


In [131]:
img_concept_bitmap_original = img_concept_bitmap.copy()

In [132]:
img_concept_bitmap.shape

(118287, 512)

In [134]:
spars_values.shape

(118287, 512)

In [141]:
# Check if their elements are the same
# np.all(img_concept_bitmap == spars_values)

# Find the indices where the elements are different
difference = np.where(img_concept_bitmap != spars_values)

# Compare the values at the indices where the elements are different
count = 0
for i, j in zip(*difference):
    print(img_concept_bitmap[i, j], spars_values[i, j])
    count += 1
    if count > 10:
        break


False True
True False
False True
False True
True False
False True
False True
True False
False True
False True
True False


In [142]:
# Test the bitwise or's

queries = np.random.rand(10, 512) > 0.5
print(queries.shape)

(10, 512)


In [166]:
import math
import random

import utils
import data
import search_concept
import search_expand
import numpy as np
import mlog

reload(utils)
reload(data)
reload(search_concept)
reload(search_expand)
reload(mlog)

logger = mlog.SimpleLogger("debug_logs")

logger.info('Setup')
logger.info('K Coarse', k_coarse)
logger.info('M', m)
logger.info('N Bits', nbits)
logger.info('Number of Probes', n_probes_list)

def bitmap_to_sparse(bitmap):
    from scipy import sparse
    import pandas as pd
    sparse_mat = sparse.csr_matrix(bitmap)
    return pd.DataFrame.sparse.from_spmatrix(sparse_mat).values


def inner_loop(
        index_pq, index_flat_cpu, packd, img_concept_bitmap, 
        all_images, pqq, kmeans, n_probes, embed_dict, average_embeddings,
        n_segments_list):

    precision_results = {
        'our': {n: [] for n in n_segments_list},
        'pq': {n: [] for n in n_segments_list}
    }
    counter_results = {
        'our': {n: [] for n in n_segments_list},
        'pq': {n: [] for n in n_segments_list}
    }
    for n_segments in n_segments_list:
        random_img = random.choice(all_images)
        start_idx, end_idx, _ = embed_dict['img_to_vec_list'][random_img]
        img_embeddings = average_embeddings[start_idx:end_idx]
        random_embedding_idx = random.sample(range(len(img_embeddings)), min(n_segments, len(img_embeddings)))
        
        features = {f'v_{i}': img_embeddings[idx] for i, idx in enumerate(random_embedding_idx)}

        gt_images, _, gt_counter = search_expand.flat_multisearch(
            index_flat_cpu, features, p_k, vec_to_img, 
            max_search_radius=1e6, exclusive_matching=False,
            logger=logger, is_gt=True)
        
        if len(gt_images) < p_k:
            continue

        print('@@@@@ US @@@@@')
        our_images, _, our_counter = search_concept.perform_search(
            features, p_k, kmeans, pqq, packd, img_concept_bitmap, all_images, 
            n_probes=n_probes, exclusive_matching=False,
            logger=logger)
        print('@@@@@ PQ! @@@@@')
        pq_images, _, pq_counter = search_expand.flat_multisearch(
            index_pq, features, p_k, vec_to_img, 
            max_search_radius=1e6, exclusive_matching=False,
            logger=logger)
        print('-----------')

        print('GT images', gt_images)
        print('Our images', our_images)
        print('PQ images', pq_images)

        our_ap = utils.calculate_avg_precision(gt_images, our_images)
        pq_ap = utils.calculate_avg_precision(gt_images, pq_images)
        precision_results['our'][n_segments].append(our_ap)
        precision_results['pq'][n_segments].append(pq_ap)

        counter_results['our'][n_segments].append(our_counter)
        counter_results['pq'][n_segments].append(pq_counter)
    
    return precision_results, counter_results

n_probes_list = [25, 50]
average_embeddings = embed_dict['average_embeddings']
all_precisions = {}
all_counts = {}
# n_segments_list = [4, 6]
n_segments_list = [6]

print('Current Settings:')
print('Number of probes:', n_probes_list)
print('Number of segments:', n_segments_list)
print('')
print('-----------')

for n_probes in n_probes_list:
    train_sample_size = int(128 * math.sqrt(len(embed_dict['img_to_vec_list'])))
    print(train_sample_size, 'is train sample size')
    # data_sample_size = 10_000
    # data_sample_size = None
    data_sample_size = None

    index_pq, index_flat_cpu, packd, img_concept_bitmap, all_images, pqq, kmeans = data.get_indices(
            dim=dim, k_coarse=k_coarse, m=m, cluster_bits=nbits, 
            n_probes=n_probes, embed_dict=embed_dict, use_custom_pq=False,
            random_seed=None, train_sample_size=train_sample_size,
            data_sample_size=data_sample_size
        )
    
    img_concept_bitmap_original = bitmap_to_sparse(img_concept_bitmap)

    print(index_pq.nprobe, 'is number of nprobe i configured')
    print('-----------')

    all_images = list(embed_dict['img_to_vec_list'].keys())
    vec_to_img = embed_dict['vec_to_img']
    n_iterations = 15
    p_k = 10

    all_counts[n_probes] = {
        'our': {n: [] for n in n_segments_list},
        'pq': {n: [] for n in n_segments_list}
    }

    all_precisions[n_probes] = {
        'our': {n: [] for n in n_segments_list},
        'pq': {n: [] for n in n_segments_list}
    }

    for i in range(n_iterations):
        precisions, counts = inner_loop(
            index_pq, index_flat_cpu, packd, img_concept_bitmap, 
            all_images, pqq, kmeans, n_probes, embed_dict, average_embeddings,
            n_segments_list
        )
        for n_segments in n_segments_list:
            all_counts[n_probes]['our'][n_segments].extend(counts['our'][n_segments])
            all_counts[n_probes]['pq'][n_segments].extend(counts['pq'][n_segments])
            all_precisions[n_probes]['our'][n_segments].extend(precisions['our'][n_segments])
            all_precisions[n_probes]['pq'][n_segments].extend(precisions['pq'][n_segments])

    print('Number of probes:', n_probes)
    for n_segments in n_segments_list:
        print(f'  Number of segments: {n_segments}')
        print(f'    Our precision: {np.mean(all_precisions[n_probes]["our"][n_segments])}')
        print(f'    PQ precision: {np.mean(all_precisions[n_probes]["pq"][n_segments])}')
        print(f'    Our counter: {np.mean(all_counts[n_probes]["our"][n_segments])}')
        print(f'    PQ counter: {np.mean(all_counts[n_probes]["pq"][n_segments])}')
    print('')

print("Overall results:")
for n_probes in n_probes_list:
    print('Number of probes:', n_probes)
    for n_segments in n_segments_list:
        print(f'  Number of segments: {n_segments}')
        print(f'    Our precision: {np.mean(all_precisions[n_probes]["our"][n_segments])}')
        print(f'    PQ precision: {np.mean(all_precisions[n_probes]["pq"][n_segments])}')
        print(f'    Our counter: {np.mean(all_counts[n_probes]["our"][n_segments])}')
        print(f'    PQ counter: {np.mean(all_counts[n_probes]["pq"][n_segments])}')
    print('')

[2024-10-11 05:40:45] INFO: Setup
[2024-10-11 05:40:45] INFO: K Coarse 512
[2024-10-11 05:40:45] INFO: M 64
[2024-10-11 05:40:45] INFO: N Bits 8
[2024-10-11 05:40:45] INFO: Number of Probes [25, 50]
Current Settings:
Number of probes: [25, 50]
Number of segments: [6]

-----------
44022 is train sample size
Training on subset 0.004193854348587396
Building IVFPQ index
Building Flat index




Kmeans trained (hax) 512


118287it [00:32, 3589.03it/s] 
  return pd.DataFrame.sparse.from_spmatrix(sparse_mat).values


25 is number of nprobe i configured
-----------
[2024-10-11 05:43:35] INFO: (GT) (6, 10496788) is the shape of indices
[2024-10-11 05:43:35] INFO: (GT) 6 is the number of features
[2024-10-11 05:43:35] INFO: (GT) 10496788 is the search radius
[2024-10-11 05:43:38] INFO: (GT) 0| v_0 -> 1
[2024-10-11 05:43:41] INFO: (GT) 1| v_1 -> 1
[2024-10-11 05:43:44] INFO: (GT) 2| v_2 -> 1
[2024-10-11 05:43:47] INFO: (GT) 3| v_3 -> 1
[2024-10-11 05:43:49] INFO: (GT) 4| v_4 -> 1
[2024-10-11 05:43:52] INFO: (GT) 5| v_5 -> 1
[2024-10-11 05:43:52] INFO: (GT) Potential candidates 6
[2024-10-11 05:43:53] INFO: (GT) Number of candidates 118287
[2024-10-11 05:43:53] INFO: (GT) Sorted image scores [('000000251690.jpg', 0.0), ('000000208712.jpg', 134.40242862701416), ('000000320978.jpg', 134.49176216125488), ('000000396030.jpg', 135.5910997390747), ('000000156302.jpg', 135.91495037078857), ('000000574692.jpg', 136.54400539398193), ('000000168405.jpg', 136.97784042358398), ('000000349485.jpg', 137.0153656005859

KeyboardInterrupt: 

In [167]:
print("Overall results:")
for n_probes in n_probes_list:
    print('Number of probes:', n_probes)
    for n_segments in n_segments_list:
        print(f'  Number of segments: {n_segments}')
        print(f'    Our precision: {np.mean(all_precisions[n_probes]["our"][n_segments])}')
        print(f'    PQ precision: {np.mean(all_precisions[n_probes]["pq"][n_segments])}')
        print(f'    Our counter: {np.mean(all_counts[n_probes]["our"][n_segments])}')
        print(f'    PQ counter: {np.mean(all_counts[n_probes]["pq"][n_segments])}')
    print('')

Overall results:
Number of probes: 25
  Number of segments: 6
    Our precision: 0.5668386243386242
    PQ precision: 0.3267222222222222
    Our counter: 2716957.0
    PQ counter: 43745158.0

Number of probes: 50
  Number of segments: 6


KeyError: 50

In [82]:
index_flat_cpu.search(average_embeddings[0:5], len(average_embeddings))

(array([[  0.       ,   6.7542486,  15.997889 , ..., 165.2841   ,
         168.22972  , 168.90839  ],
        [  0.       ,  26.215351 ,  26.65932  , ..., 168.77779  ,
         168.80864  , 170.93292  ],
        [  0.       ,  12.078511 ,  12.441524 , ..., 160.25388  ,
         160.59152  , 166.04483  ],
        [  0.       ,  22.248173 ,  22.590618 , ..., 163.26358  ,
         163.901    , 164.56924  ],
        [  0.       ,  15.343946 ,  21.181234 , ..., 165.8447   ,
         166.39017  , 167.35727  ]], dtype=float32),
 array([[      0,      31, 6992270, ..., 1494383, 3370485, 4363280],
        [      1,      24, 4359709, ..., 8340859, 6368273, 2234141],
        [      2, 1693016, 6081274, ..., 4772084, 4363280, 2817541],
        [      3, 3950297, 9074137, ..., 6323630, 6323620, 6368273],
        [      4,      62,      17, ..., 6323630, 2817541, 6323620]]))

In [148]:

import numpy as np
import pandas as pd
from scipy import sparse
import time
import random

def generate_cluster_ids(num_clusters, max_size=20, num_cols=None):
    return [random.sample(range(num_cols), random.randint(1, max_size)) for _ in range(num_clusters)]

def bitmap_to_sparse_df(bitmap):
    sparse_mat = sparse.csr_matrix(bitmap)
    return pd.DataFrame.sparse.from_spmatrix(sparse_mat)

def benchmark_dense(bitmap, cluster_ids_list):
    start_time = time.time()
    for cluster_ids in cluster_ids_list:
        query_matches = np.any(bitmap[:, cluster_ids], axis=1)
    for cluster_ids in cluster_ids_list:
        query_matches = np.any(bitmap[:, cluster_ids], axis=1)
    end_time = time.time()
    return end_time - start_time

def benchmark_sparse_pandas(sparse_df, cluster_ids_list):
    sparse_matrix = sparse_df.values
    start_time = time.time()
    # sparse_matrix = sparse_df.sparse.to_coo().tocsr()
    for cluster_ids in cluster_ids_list:
        query_matches = np.any(sparse_matrix[:, cluster_ids], axis=1)
    for cluster_ids in cluster_ids_list:
        query_matches = np.any(sparse_matrix[:, cluster_ids], axis=1)
    end_time = time.time()
    return end_time - start_time

def run_benchmark(bitmap, num_queries):
    rows, cols = bitmap.shape
    cluster_ids_list = generate_cluster_ids(num_queries, max_size=10, num_cols=cols)

    dense_time = benchmark_dense(bitmap, cluster_ids_list)
    
    sparse_df = bitmap_to_sparse_df(bitmap)
    sparse_time = benchmark_sparse_pandas(sparse_df, cluster_ids_list)

    print(f"Dense time: {dense_time:.4f} seconds")
    print(f"Sparse Pandas time: {sparse_time:.4f} seconds")
    print(f"Speedup: {dense_time / sparse_time:.2f}x")

print("Running benchmark...")
run_benchmark(img_concept_bitmap, num_queries=100)

Running benchmark...
Dense time: 0.6982 seconds
Sparse Pandas time: 0.0099 seconds
Speedup: 70.82x


  return pd.DataFrame.sparse.from_spmatrix(sparse_mat)


In [120]:
spars_bitmap = pd.DataFrame.sparse.from_spmatrix(sparse.csr_matrix(img_concept_bitmap))
print(spars_bitmap)
spars_values = spars_bitmap.values
print(spars_values)

  spars_bitmap = pd.DataFrame.sparse.from_spmatrix(sparse.csr_matrix(img_concept_bitmap))


        0    1    2    3    4    5    6    7    8    9    ...  502  503  504  \
0         0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
1         0    0    0    0    0    0    0    0    0    0  ...    1    0    1   
2         0    0    1    0    1    0    0    0    0    0  ...    0    0    0   
3         0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
4         0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
...     ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
118282    0    0    0    0    0    1    0    0    0    1  ...    0    0    0   
118283    0    1    0    0    0    0    0    0    0    0  ...    0    0    0   
118284    1    0    0    0    0    0    0    0    0    0  ...    0    0    1   
118285    0    0    0    0    0    0    0    0    0    0  ...    1    0    0   
118286    0    0    0    0    0    0    0    0    0    0  ...    1    0    0   

        505  506  507  508  509  510  5