In [2]:
import os
import math
import hashlib
from utils import create_new_pickle


# Location of the COCO data and of the precomputed embedding files.
coco_path = '/data/users/jie/data-slicing/COCO/'
# Alternative embedding directory, kept for reference:
# embed_path = os.path.join(coco_path, 'embeds/fixed/train2017_vitl_fixed_maskclip/')
embed_path = os.path.join(coco_path, 'embeds/train2017_vitl_fixed_maskclip/')
pickle_path = 'train2017_vitl_fixed_maskclip-fast.pkl'

# Load the embeddings; everything below is derived from the averaged matrix.
embed_dict = create_new_pickle(embed_path, pickle_path)
average_embeddings = embed_dict['average_embeddings']
n_embeddings, dim = average_embeddings.shape[:2]

# Index parameters, all handed to data.get_indices in a later cell.
# NOTE(review): m * nbits / 8 is used further down as the per-vector code size
# in bytes; the exact role of k_coarse and n_probes lives in the data module.
k_coarse, m, nbits, n_probes = 1023, 256, 8, 20

# Training subset grows with the square root of the number of embeddings.
train_sample_size = int(math.sqrt(n_embeddings) * 128)

Running get embeddings call
No fast path sir


In [9]:
# Size of the embedding matrix's data buffer, converted from bytes to GiB.
average_embeddings.nbytes / 1024 / 1024 / 1024

20.157777786254883

In [3]:
# Record the embedding directory on embed_dict, which is later passed to data.get_indices.
embed_dict['embed_path'] = embed_path

In [4]:
import data

# Build every index structure used by the rest of the notebook in one call.
# NOTE(review): cache_enabled=True presumably lets get_indices reuse a saved
# index on later runs — confirm in the data module.
(
    index_pq,
    index_flat_cpu,
    index_ivf_flat_cpu,
    packd,
    img_concept_bitmap,
    all_images,
    pqq,
    kmeans,
) = data.get_indices(
    dim, k_coarse, m, nbits, n_probes, embed_dict,
    train_sample_size=train_sample_size,
    build_ivf_flat=True,
    cache_enabled=True,
)

this is hash 31bd25a
Building new index
Training on subset 0.03937339717978393
Building IVFPQ index
Saved new index to cached_index_c70f56f8a2
Kmeans trained (hax) 1023
Building Flat index
Building IVF-Flat index




This is training embeds (416117, 512)
521 1024


118287it [01:35, 1236.73it/s]


In [5]:
def improved_bitmap_to_pyroaring(bitmap):
    """
    Build one PyRoaring BitMap per column of a 2D NumPy bitmap.

    Each BitMap holds the row indices at which its column is set. After a
    column is converted, the BitMap's cardinality is compared with the
    column's own count so that a lossy conversion fails loudly.

    Args:
        bitmap: A 2D NumPy boolean array

    Returns:
        List of PyRoaring BitMap objects, one for each column

    Raises:
        AssertionError: if a BitMap's cardinality differs from its column's sum.
    """
    import numpy as np
    from pyroaring import BitMap

    # Unpacking the shape also rejects anything that is not 2D.
    _, n_cols = bitmap.shape
    converted = []

    for col_idx in range(n_cols):
        column = bitmap[:, col_idx]

        # Row positions where this column is set.
        set_rows = np.flatnonzero(column)

        # The indices are passed with the dtype NumPy produced; an explicit
        # uint32 cast was tried before and is intentionally not applied.
        col_bitmap = BitMap(set_rows)

        # Verification: the roaring cardinality must equal the column's count.
        expected = np.sum(column)
        actual = len(col_bitmap)
        assert expected == actual, f"Column {col_idx} conversion mismatch! Original: {expected}, Roaring: {actual}"

        converted.append(col_bitmap)

    return converted

In [8]:
import numpy as np
import faiss
from scipy import sparse
from pympler import asizeof
# Memory footprint of our structure. Components:
#   * kmeans centroids
#   * quantizer codebook
#   * bitmap (measured three ways: dense array, scipy CSR, roaring)
#   * packd

# Centroids are counted as float32 values.
centroid_shape = kmeans.centroids.shape
ksize_bytes = centroid_shape[0] * centroid_shape[1] * np.dtype('float32').itemsize

# packd: measured size of the keys plus m codes of nbits bits per embedding.
packd_key_bytes = asizeof.asizeof(packd.keys)
packd_value_bytes = len(average_embeddings) * m * nbits / 8
packd_bytes = packd_key_bytes + packd_value_bytes

# Product-quantizer codebook, reshaped to (M, ksub, dsub).
codebook = faiss.vector_to_array(pqq.centroids).reshape(pqq.M, pqq.ksub, pqq.dsub)
codebook_bytes = codebook.nbytes

# Three alternative encodings of the image/concept bitmap.
bitmap_bytes = img_concept_bitmap.nbytes
sparse_mat_bytes = asizeof.asizeof(sparse.csr_matrix(img_concept_bitmap))
roaring_bytes = asizeof.asizeof(improved_bitmap_to_pyroaring(img_concept_bitmap))

for label, n_bytes in (
    ('kmeans centroids', ksize_bytes),
    ('packd', packd_bytes),
    ('codebook', codebook_bytes),
    ('roaring', roaring_bytes),
    ('bitmap', bitmap_bytes),
    ('sparse', sparse_mat_bytes),
):
    print(f'{label}: {n_bytes / 1024 / 1024} MB')

# Everything except the bitmap is shared by the three totals.
shared_bytes = ksize_bytes + packd_bytes + codebook_bytes
us_total_bytes = shared_bytes + bitmap_bytes
us_total_bytes_sparse = shared_bytes + sparse_mat_bytes
us_total_bytes_roaring = shared_bytes + roaring_bytes

print(f'total non-sparse: {us_total_bytes / 1024 / 1024 / 1024:.3f} GB')
print(f'total sparse: {us_total_bytes_sparse / 1024 / 1024 / 1024:.3f} GB')
print(f'total roaring: {us_total_bytes_roaring / 1024 / 1024 / 1024:.3f} GB')

kmeans centroids: 1.998046875 MB
packd: 2612.1956787109375 MB
codebook: 0.5 MB
bitmap: 115.40184116363525 MB
sparse: 38.57171630859375 MB
total non-sparse: 2.666 GB
total sparse: 2.591 GB
total roaring: 2.561 GB


In [5]:
# Shape of the embedding matrix.
average_embeddings.shape

(10568481, 512)

In [10]:
# PQ memory usage
# centroids 
# codebook
# inverted list
import faiss.contrib.inspect_tools

def get_invlist(invlists, l):
    """Return the content of inverted list ``l`` as ``(list_ids, list_codes)``.

    FAISS exposes the ids of an inverted list as 64-bit integers, so they are
    copied into an int64 buffer first and only then narrowed to int32. Copying
    ``ls * 4`` bytes straight into an int32 buffer would return the low/high
    words of the first half of the list instead of the ids.

    Args:
        invlists: FAISS inverted-lists object (e.g. ``index.invlists``).
        l: Number of the inverted list to read.

    Returns:
        Tuple ``(list_ids, list_codes)``: ``list_ids`` is an int32 array of
        length ``list_size(l)`` and ``list_codes`` is a uint8 array of shape
        ``(list_size(l), invlists.code_size)``.

    Raises:
        OverflowError: if an id does not fit in int32.
    """
    ls = invlists.list_size(l)
    ids64 = np.zeros(ls, dtype='int64')
    list_codes = np.zeros((ls, invlists.code_size), dtype='uint8')

    # Nothing to copy for an empty list, so the native pointers are not touched.
    if ls > 0:
        x = invlists.get_ids(l)
        try:
            faiss.memcpy(faiss.swig_ptr(ids64), x, ids64.nbytes)
        finally:
            # Always hand the pointer back, even if the copy fails.
            invlists.release_ids(l, x)

        x = invlists.get_codes(l)
        try:
            faiss.memcpy(faiss.swig_ptr(list_codes), x, list_codes.nbytes)
        finally:
            invlists.release_codes(l, x)

        int32_range = np.iinfo(np.int32)
        if ids64.min() < int32_range.min or ids64.max() > int32_range.max:
            raise OverflowError(
                f'inverted list {l} contains ids that do not fit in int32'
            )

    # Ids are returned as int32 (4 bytes each), as before, so callers that
    # add up list_ids.nbytes get the same totals.
    list_ids = ids64.astype('int32')
    return list_ids, list_codes


# Coarse centroids, counted as float32 values.
centroid_shape = kmeans.centroids.shape
ksize_bytes = centroid_shape[0] * centroid_shape[1] * np.dtype('float32').itemsize

# PQ codebook as exposed by the faiss inspection helpers.
codebook = faiss.contrib.inspect_tools.get_pq_centroids(index_pq.pq)
codebook_bytes = codebook.nbytes

# Add up the ids and codes of every inverted list. list_ids and list_codes
# stay bound to the last list read, which a later cell inspects.
list_bytes = 0
for i in range(index_pq.invlists.nlist):
    list_ids, list_codes = get_invlist(index_pq.invlists, i)
    list_bytes += list_codes.nbytes + list_ids.nbytes
    # Alternative accounting that leaves the ids out, since we keep an
    # equivalent mapping tracking the vector ids:
    # list_bytes +=  list_codes.nbytes

pq_total_bytes = ksize_bytes + list_bytes + codebook_bytes

print(f'kmeans centroids: {ksize_bytes / 1024 / 1024} MB')
print(f'inverted list: {list_bytes / 1024 / 1024} MB')
print(f'codebook: {codebook_bytes / 1024 / 1024} MB')
print(f'total: {pq_total_bytes / 1024 / 1024 / 1024:.3f} GB')




kmeans centroids: 1.998046875 MB
inverted list: 2620.5111122131348 MB
codebook: 0.5 MB
total: 2.562 GB


In [87]:
# Gap between packd.used and the number of embedding rows (negative when packd.used is smaller).
# NOTE(review): what packd.used counts is not visible in this notebook — confirm in the data module.
packd.used - average_embeddings.shape[0]

-6581531

In [47]:
# Size of the embedding matrix's data buffer in GiB; this is the baseline for the compression ratios below.
average_embeddings.nbytes / 1024 / 1024 / 1024

20.157777786254883

In [103]:
# Fraction of the raw embedding bytes that each compressed layout occupies
# (smaller is better). Uses the totals computed in the two memory cells.
original = average_embeddings.nbytes
compression_ratio_us, compression_ratio_pq = (
    total_bytes / original for total_bytes in (us_total_bytes, pq_total_bytes)
)

print(f'compression ratio us: {compression_ratio_us}')
print(f'compression ratio pq: {compression_ratio_pq}')



compression ratio us: 0.038512046946555634
compression ratio pq: 0.03332414522987031


In [12]:
# Combined byte size of list_ids and list_codes as left bound by the last iteration
# of the inverted-list loop in the PQ memory cell (i.e. one list, not the total).
list_ids.nbytes + list_codes.nbytes

424224

In [2]:
# Check bitmap selectivities

    

NameError: name 'img_concept_bitmap' is not defined