# Setup

In [None]:
# Notebook parameters. Each is left as None here; the next cell substitutes a
# default for every value that is still unset.
features_dir = None   # directory holding the per-part .npy feature files
feature_shape = None  # expected dimensionality of one feature vector
gpu_bin_name = None   # output file name for the GPU-built index
cpu_bin_name = None   # output file name for the CPU-built index

In [None]:
import os

# Paths are resolved relative to the parent of the current working directory.
dir_path = os.getcwd()
parent_dir_path = os.path.dirname(dir_path)

# Any parameter that is still unset (None or otherwise falsy) falls back to
# its default; values supplied by the parameter cell are kept untouched.
# The indexing function replaces feature_shape with the dimension it reads
# from the data, so 512 only serves as the expected value for its warning.
feature_shape = feature_shape or 512
features_dir = features_dir or f'{parent_dir_path}/data_extraction/clip/CLIPv2_features'
cpu_bin_name = cpu_bin_name or 'faiss_clipv2_cosine_cpu.bin'
gpu_bin_name = gpu_bin_name or 'faiss_clipv2_cosine_gpu.bin'

In [None]:
! pip install faiss-gpu faiss-cpu

[0m

In [None]:
import os
import glob
import faiss
import numpy as np
from tqdm import tqdm

# Indexing

In [None]:
def create_faiss_indexes(cpu_bin_name, gpu_bin_name, features_dir, feature_shape, normalize=False):
    """
    Build flat inner-product FAISS indexes from per-part .npy feature files.

    Files are read from <features_dir>/<data_part>/*.npy, data parts in sorted
    order and files in sorted order within each part, so vector ids in the
    index follow that traversal order. Every usable file is added to a CPU
    index and, when FAISS GPU support can be initialised, to a GPU index too.
    The GPU index is converted back to a CPU index before it is written.

    Parameters:
    - cpu_bin_name: Name of the output CPU FAISS index file
    - gpu_bin_name: Name of the output GPU FAISS index file (only written when
      a GPU index could be created)
    - features_dir: Directory whose sub-directories contain the feature files
    - feature_shape: Expected dimensionality of each feature vector. Only used
      for a warning: the dimensionality found in the first feature file is
      what the indexes are built with.
    - normalize: If True, L2-normalise every vector before adding it.
      IndexFlatIP ranks by raw inner product, which equals cosine similarity
      only for unit-length vectors; query vectors must then be normalised the
      same way. Defaults to False (vectors are indexed exactly as stored).

    Returns:
    - None (saves the indexes to disk)

    Raises:
    - ValueError: if no .npy file is found under <features_dir>/*/
    """
    # Sort so the reference file, and therefore the index dimensionality, is
    # deterministic instead of depending on filesystem enumeration order.
    npy_files = sorted(glob.glob(os.path.join(features_dir, '*', '*.npy')))
    if not npy_files:
        raise ValueError(f"No .npy files found in {features_dir}")

    # atleast_2d lets a file holding one 1-D vector count as a (1, dim) matrix
    # rather than failing with an IndexError on shape[1].
    first_feature = np.atleast_2d(np.load(npy_files[0]))
    feature_dim = first_feature.shape[1]

    if feature_dim != feature_shape:
        print(f"Warning: Actual feature dimension {feature_dim} doesn't match expected {feature_shape}")
        print(f"Using actual feature dimension {feature_dim} for index creation")
        feature_shape = feature_dim

    cpu_index = faiss.IndexFlatIP(feature_shape)

    gpu_index = None
    try:
        res = faiss.StandardGpuResources()
        gpu_index = faiss.index_cpu_to_gpu(
            res, 0, faiss.IndexFlatIP(feature_shape))
        print("GPU index creation is available")
        use_gpu = True
    except Exception as e:
        # CPU-only FAISS builds (or machines without a usable GPU) end up here;
        # the CPU index is still produced.
        print(f"GPU index creation not available: {e}")
        use_gpu = False

    # Only directories are data parts; stray files next to them are ignored.
    data_parts = sorted(
        entry for entry in os.listdir(features_dir)
        if os.path.isdir(os.path.join(features_dir, entry))
    )

    indexed_vectors = 0
    indexed_files = 0
    skipped_files = 0

    for data_part in tqdm(data_parts, desc="Processing data parts"):
        part_files = sorted(glob.glob(os.path.join(features_dir, data_part, '*.npy')))
        for feature_path in tqdm(part_files, desc=f"Processing {data_part}"):
            try:
                # FAISS expects row-major float32 input; astype alone would
                # keep a Fortran-ordered layout.
                feats = np.ascontiguousarray(
                    np.atleast_2d(np.load(feature_path)), dtype=np.float32)

                if feats.ndim != 2:
                    print(f"Warning: Unexpected array shape {feats.shape} in {feature_path}. "
                          f"Skipping this file.")
                    skipped_files += 1
                    continue

                if feats.shape[1] != feature_shape:
                    print(f"Warning: Feature dimension mismatch in {feature_path}. "
                          f"Expected {feature_shape}, got {feats.shape[1]}. Skipping this file.")
                    skipped_files += 1
                    continue

                if normalize:
                    # In place; feats is a private array loaded just above.
                    faiss.normalize_L2(feats)

                cpu_index.add(feats)
                if use_gpu:
                    gpu_index.add(feats)

                indexed_vectors += feats.shape[0]
                indexed_files += 1
            except Exception as e:
                # Best effort: one unreadable file must not abort the whole build.
                print(f"Error processing {feature_path}: {e}")
                skipped_files += 1
                continue

    print(f"Indexed {indexed_vectors} vectors from {indexed_files} files "
          f"({skipped_files} files skipped)")
    if indexed_vectors == 0:
        print("Warning: no vectors were indexed; the saved index will be empty")

    faiss.write_index(cpu_index, cpu_bin_name)
    print(f"CPU FAISS index saved to {cpu_bin_name}")

    if use_gpu:
        # GPU indexes cannot be serialised directly; convert back to CPU first.
        gpu_index_cpu = faiss.index_gpu_to_cpu(gpu_index)
        faiss.write_index(gpu_index_cpu, gpu_bin_name)
        print(f"GPU FAISS index saved to {gpu_bin_name}")

In [None]:
# Build and save the indexes using the parameters resolved in the setup cells.
create_faiss_indexes(
    cpu_bin_name=cpu_bin_name,
    gpu_bin_name=gpu_bin_name,
    features_dir=features_dir,
    feature_shape=feature_shape,
)

Using actual feature dimension 768 for index creation
GPU index creation not available: module 'faiss' has no attribute 'StandardGpuResources'


Processing L01: 100%|██████████| 31/31 [00:00<00:00, 93.65it/s] 
Processing data parts: 100%|██████████| 1/1 [00:00<00:00,  2.98it/s]


CPU FAISS index saved to faiss_clipv2_cosine_cpu.bin
