# Setup

In [None]:
# Run parameters. Each one is left as None here; the next cell replaces any
# value that is still unset (falsy) with its default. Assign a value here to
# override that default.

# Expected dimensionality of each feature vector.
feature_shape = None
# Directory that holds the feature files (one sub-directory per data part).
features_dir = None
# Output file name for the CPU FAISS index.
cpu_bin_name = None
# Output file name for the GPU FAISS index.
gpu_bin_name = None

In [None]:
import os

# The default feature location is expressed relative to the parent of the
# directory the notebook is run from.
dir_path = os.getcwd()
parent_dir_path = os.path.dirname(dir_path)

# Fill in a default for every parameter that is still unset. `or` keeps any
# truthy value supplied by the parameter cell and substitutes the default for
# None (or any other falsy value).
# NOTE(review): the recorded run further down reports an actual feature
# dimension of 768, so the 512 default looks out of date - confirm.
feature_shape = feature_shape or 512
features_dir = features_dir or f'{parent_dir_path}/data_extraction/clip/CLIPv2_features'
cpu_bin_name = cpu_bin_name or 'faiss_clipv2_cosine_cpu.bin'
gpu_bin_name = gpu_bin_name or 'faiss_clipv2_cosine_gpu.bin'

In [None]:
! pip install faiss-gpu

Collecting faiss-gpu
  Using cached faiss_gpu-1.7.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
Installing collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2
[0m

In [None]:
import os
import glob
import faiss
import numpy as np
from tqdm import tqdm

In [None]:
import torch
# Print whether PyTorch can see a CUDA device. The result is only displayed,
# not stored: the indexing code below decides on GPU use by trying to create
# FAISS GPU resources itself.
print(torch.cuda.is_available())

True


# Indexing

In [None]:
def create_faiss_indexes(cpu_bin_name, gpu_bin_name, features_dir, feature_shape):
    """
    Build the FAISS indexes from the stored feature files and write them to disk.

    A CPU index is always built; a GPU index is built alongside it when GPU
    resources can be initialised.

    Parameters:
    - cpu_bin_name: Path of the file the CPU index is written to
    - gpu_bin_name: Path of the file the GPU index is written to (only used
      when a GPU index was created)
    - features_dir: Directory whose sub-directories contain the .npy feature files
    - feature_shape: Expected dimensionality of a feature vector; the
      dimensionality actually found on disk takes precedence

    Returns:
    - None (the indexes are saved as a side effect)
    """
    # From here on, work with the dimensionality found on disk.
    dim = validate_and_get_feature_shape(features_dir, feature_shape)

    indexes = initialize_indexes(dim)
    cpu_index, gpu_index, use_gpu = indexes

    process_feature_files(features_dir, cpu_index, gpu_index, use_gpu, dim)
    save_indexes(cpu_index, gpu_index, use_gpu, cpu_bin_name, gpu_bin_name)


def validate_and_get_feature_shape(features_dir, expected_shape):
    """
    Determine the feature dimensionality from a reference .npy file.

    The reference file is the lexicographically first path matching
    ``<features_dir>/*/*.npy``, so the result does not depend on the order in
    which the filesystem lists files.

    Parameters:
    - features_dir: Directory whose sub-directories contain the .npy feature files
    - expected_shape: Dimensionality the caller expects; a mismatch is only
      reported, it does not raise

    Returns:
    - The size of the second axis of the reference array

    Raises:
    - ValueError: if no .npy file is found, or the reference array is not 2-D
    """
    pattern = os.path.join(features_dir, '*', '*.npy')
    # min() over the lazy iterator gives a deterministic choice without
    # materialising the full list of paths.
    reference_path = min(glob.iglob(pattern), default=None)
    if reference_path is None:
        raise ValueError(f"No .npy files found in {features_dir}")

    first_feature = np.load(reference_path)
    # Fail with a clear message instead of an IndexError on shape[1].
    if first_feature.ndim != 2:
        raise ValueError(
            f"Expected a 2-D feature array in {reference_path}, "
            f"got shape {first_feature.shape}")
    actual_shape = first_feature.shape[1]

    if actual_shape != expected_shape:
        print(
            f"Warning: Actual feature dimension {actual_shape} doesn't match expected {expected_shape}")
        print(
            f"Using actual feature dimension {actual_shape} for index creation")

    return actual_shape


def initialize_indexes(feature_shape):
    """
    Create an empty CPU index and, when possible, an empty GPU index.

    Both are flat inner-product indexes (``faiss.IndexFlatIP``) of dimension
    ``feature_shape``. The GPU index is placed on device 0. Any exception
    raised while setting up the GPU side is reported and the function falls
    back to CPU only.

    NOTE(review): the output file names say "cosine"; inner product equals
    cosine similarity only for L2-normalised vectors - confirm the features
    are normalised upstream.

    Parameters:
    - feature_shape: Dimensionality of the vectors to be indexed

    Returns:
    - (cpu_index, gpu_index, use_gpu); gpu_index is None and use_gpu is False
      when the GPU index could not be created
    """
    flat_cpu = faiss.IndexFlatIP(feature_shape)

    try:
        gpu_resources = faiss.StandardGpuResources()
        flat_gpu = faiss.index_cpu_to_gpu(
            gpu_resources, 0, faiss.IndexFlatIP(feature_shape))
        print("GPU index creation is available")
        return flat_cpu, flat_gpu, True
    except Exception as e:
        print(f"GPU index creation not available: {e}")
        print("Proceeding with CPU index only")
        return flat_cpu, None, False


def process_feature_files(features_dir, cpu_index, gpu_index, use_gpu, feature_shape):
    """
    Add the features of every data part under ``features_dir`` to the indexes.

    Every entry of ``features_dir`` is treated as a data part and visited in
    sorted name order, with a progress bar over the parts.

    Parameters:
    - features_dir: Directory whose entries are the data parts
    - cpu_index: CPU index that receives the vectors
    - gpu_index: GPU index that receives the vectors when use_gpu is true
    - use_gpu: Whether gpu_index should be filled as well
    - feature_shape: Dimensionality a feature file must have to be added
    """
    data_parts = sorted(os.listdir(features_dir))
    for part_name in tqdm(data_parts, desc="Processing data parts"):
        process_data_part(features_dir, part_name, cpu_index,
                          gpu_index, use_gpu, feature_shape)


def process_data_part(features_dir, data_part, cpu_index, gpu_index, use_gpu, feature_shape):
    """
    Add every .npy feature file of one data part to the indexes.

    Files are visited in sorted path order. Each file is loaded, converted to
    float32 and added to the CPU index (and to the GPU index when use_gpu is
    true). A file whose second axis does not equal ``feature_shape`` is
    skipped with a warning; any exception raised while handling a file is
    printed and processing continues with the next file.

    Parameters:
    - features_dir: Root directory of the feature files
    - data_part: Name of the sub-directory to process
    - cpu_index: CPU index that receives the vectors
    - gpu_index: GPU index that receives the vectors when use_gpu is true
    - use_gpu: Whether gpu_index should be filled as well
    - feature_shape: Required size of the second axis of each feature array
    """
    npy_paths = glob.glob(os.path.join(features_dir, data_part, '*.npy'))
    npy_paths.sort()

    for npy_path in tqdm(npy_paths, desc=f"Processing {data_part}"):
        try:
            vectors = np.load(npy_path).astype(np.float32)
            if vectors.shape[1] == feature_shape:
                cpu_index.add(vectors)
                if use_gpu:
                    gpu_index.add(vectors)
            else:
                print(
                    f"Warning: Feature dimension mismatch in {npy_path}. Skipping this file.")
        except Exception as e:
            # Best effort: report the failing file and move on to the next one.
            print(f"Error processing {npy_path}: {e}")


def save_indexes(cpu_index, gpu_index, use_gpu, cpu_bin_name, gpu_bin_name):
    """
    Write the CPU index, and the GPU index when one was built, to disk.

    Parameters:
    - cpu_index: CPU index to save
    - gpu_index: GPU index to save; only used when use_gpu is true
    - use_gpu: Whether a GPU index was created
    - cpu_bin_name: Path of the file the CPU index is written to
    - gpu_bin_name: Path of the file the GPU index is written to

    Returns:
    - None
    """
    faiss.write_index(cpu_index, cpu_bin_name)
    print(f"CPU FAISS index saved to {cpu_bin_name}")

    if use_gpu:
        # The GPU index is copied back to a CPU index before it is written.
        gpu_index_cpu = faiss.index_gpu_to_cpu(gpu_index)
        faiss.write_index(gpu_index_cpu, gpu_bin_name)
        print(f"GPU FAISS index saved to {gpu_bin_name}")
    else:
        # Reported once; the original printed this message twice.
        print("GPU index was not created due to unavailability of GPU resources")

In [None]:
# Build and save the indexes using the settings resolved in the setup cells.
create_faiss_indexes(
    cpu_bin_name=cpu_bin_name,
    gpu_bin_name=gpu_bin_name,
    features_dir=features_dir,
    feature_shape=feature_shape,
)

Using actual feature dimension 768 for index creation
GPU index creation is available


Processing L01: 100%|██████████| 31/31 [00:00<00:00, 69.90it/s]
Processing data parts: 100%|██████████| 1/1 [00:00<00:00,  2.24it/s]


CPU FAISS index saved to faiss_clipv2_cosine_cpu.bin
GPU FAISS index saved to faiss_clipv2_cosine_gpu.bin
